/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras ([email protected]).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
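/*
 * Rough C equivalent (an illustrative sketch, not kernel source;
 * fold64to32 stands for the folding done at .Lcsum_finish below):
 *
 *	u64 s = sum;
 *	for (; len >= 8; len -= 8, buff += 8)
 *		s += *(u64 *)buff;	// adde: carries accumulate too
 *	// ... 4-, 2- and 1-byte tails ...
 *	return fold64to32(s);
 */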
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
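	/*
	 * Illustrative example: if buff % 8 == 2 then r6 below is
	 * (buff >> 1) & 3 = 1, so the loop runs 4 - 1 = 3 times and
	 * consumes 6 bytes, leaving buff doubleword aligned.
	 */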
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
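	/*
	 * Worked example (illustrative): len = 200 gives
	 * ctr = (200 >> 6) - 1 = 2 loop iterations; with the 64-byte
	 * entry/exit limb that covers 192 bytes, and the remaining
	 * 200 & 63 = 8 bytes go through the tails below.
	 */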
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
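	/*
	 * That is, the eight adde instructions per iteration serialize
	 * on XER at 2 cycles each (8 x 2 = 16); the loads are interleaved
	 * between them so their latency hides behind the carry chain.
	 */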
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
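	/*
	 * Fold the 64-bit running sum to 32 bits. In C terms (sketch):
	 *
	 *	s += carry;			// addze
	 *	t  = rotl64(s, 32) + s;		// high half += low half
	 *	return (u32)(t >> 32);		// end-around carry included
	 */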
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


	.macro srcnr
100:
	EX_TABLE(100b,.Lsrc_error_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lsrc_error)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Ldest_error_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Ldest_error)
	.endm
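/*
 * Each macro above tags the immediately following load or store with a
 * local label and an exception-table entry, so a fault at that access
 * branches to the named handler. The "nr" (no-restore) variants are for
 * code running without the stack frame set up by the unrolled loop.
 */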

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
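/*
 * Rough C shape (an illustrative sketch only):
 *
 *	u64 s = sum;
 *	for each chunk of src[0..len) {
 *		load chunk;		// a fault goes to .Lsrc_error*
 *		s += chunk;
 *		store chunk to dst;	// a fault goes to .Ldest_error*
 *	}
 *	return fold64to32(s);		// same fold as __csum_partial
 */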
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

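/*
 * Fault handlers. The full variants below first pop the stack frame and
 * restore r14-r16 (live only during the unrolled loop), then fall through
 * to the "nr" variants, which store -EFAULT through the caller-supplied
 * error pointer when it is non-NULL.
 */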
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */
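/*
 * Rough C equivalent (an illustrative sketch only):
 *
 *	u64 extras = (u64)len + proto + sum;	// r5 += r6; r5 += r7
 *	u64 s = saddr64[0] + saddr64[1]		// carrying 64-bit adds
 *	      + daddr64[0] + daddr64[1] + extras;
 *	fold s 64->32, then 32->16, with end-around carry;
 *	return (__sum16)~s;			// low 16 bits, inverted
 */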

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)