]>
Commit | Line | Data |
---|---|---|
0b95ec56 JK |
1 | /* |
2 | * Camellia Cipher Algorithm (x86_64) | |
3 | * | |
4 | * Copyright (C) 2012 Jussi Kivilinna <[email protected]> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
19 | * USA | |
20 | * | |
21 | */ | |
22 | ||
59990684 JK |
23 | #include <linux/linkage.h> |
24 | ||
0b95ec56 JK |
25 | .file "camellia-x86_64-asm_64.S" |
26 | .text | |
27 | ||
/*
 * Eight lookup tables that are declared here but not defined in this file.
 * xor2ror16 indexes them with a single byte scaled by 8, i.e. it reads one
 * 64-bit entry per lookup. The #defines below only give each table a shorter
 * local alias.
 */
28 | .extern camellia_sp10011110; | |
29 | .extern camellia_sp22000222; | |
30 | .extern camellia_sp03303033; | |
31 | .extern camellia_sp00444404; | |
32 | .extern camellia_sp02220222; | |
33 | .extern camellia_sp30333033; | |
34 | .extern camellia_sp44044404; | |
35 | .extern camellia_sp11101110; | |
36 | ||
37 | #define sp10011110 camellia_sp10011110 | |
38 | #define sp22000222 camellia_sp22000222 | |
39 | #define sp03303033 camellia_sp03303033 | |
40 | #define sp00444404 camellia_sp00444404 | |
41 | #define sp02220222 camellia_sp02220222 | |
42 | #define sp30333033 camellia_sp30333033 | |
43 | #define sp44044404 camellia_sp44044404 | |
44 | #define sp11101110 camellia_sp11101110 | |
45 | ||
/*
 * Byte offsets into the context that CTX points at: key_table starts at
 * offset 0 and is read in 8-byte subkey slots; key_length is located
 * CAMELLIA_TABLE_BYTE_LEN (272) bytes in.
 * NOTE(review): these offsets presumably mirror the C-side struct
 * camellia_ctx, whose definition is not visible here - confirm that both
 * sides agree whenever either one changes.
 */
46 | #define CAMELLIA_TABLE_BYTE_LEN 272 | |
47 | ||
48 | /* struct camellia_ctx: */ | |
49 | #define key_table 0 | |
50 | #define key_length CAMELLIA_TABLE_BYTE_LEN | |
51 | ||
/*
 * Register roles used by the macros and functions in this file:
 *   CTX         context pointer, %rdi; it is only ever read through.
 *   RIO         current input/output pointer. It is %rsi, the same register
 *               as RT0, so the round macros destroy it and the functions
 *               reload it from RDST before writing output.
 *   RABn, RCDn  the two 64-bit halves of data block n (n = 0, 1), together
 *               with their 32-bit (d), low-byte (bl) and second-byte (bh)
 *               views. xor2ror16 pastes the bl/bh suffixes onto these names,
 *               so they have to be registers with an addressable second byte.
 *   RT0 - RT2   scratch registers. RT1 is %rbp, which is why each function
 *               parks the caller's %rbp in RRBP.
 *   RXOR        the "xor" flag in the encrypt functions; the decrypt
 *               functions use it as a scratch register instead.
 *   RRBP        copy of the caller's %rbp while a function runs.
 *   RDST        copy of the destination pointer.
 */
52 | /* register macros */ | |
53 | #define CTX %rdi | |
54 | #define RIO %rsi | |
55 | #define RIOd %esi | |
56 | ||
57 | #define RAB0 %rax | |
58 | #define RCD0 %rcx | |
59 | #define RAB1 %rbx | |
60 | #define RCD1 %rdx | |
61 | ||
62 | #define RAB0d %eax | |
63 | #define RCD0d %ecx | |
64 | #define RAB1d %ebx | |
65 | #define RCD1d %edx | |
66 | ||
67 | #define RAB0bl %al | |
68 | #define RCD0bl %cl | |
69 | #define RAB1bl %bl | |
70 | #define RCD1bl %dl | |
71 | ||
72 | #define RAB0bh %ah | |
73 | #define RCD0bh %ch | |
74 | #define RAB1bh %bh | |
75 | #define RCD1bh %dh | |
76 | ||
77 | #define RT0 %rsi | |
78 | #define RT1 %rbp | |
79 | #define RT2 %r8 | |
80 | ||
81 | #define RT0d %esi | |
82 | #define RT1d %ebp | |
83 | #define RT2d %r8d | |
84 | ||
85 | #define RT2bl %r8b | |
86 | ||
87 | #define RXOR %r9 | |
88 | #define RRBP %r10 | |
89 | #define RDST %r11 | |
90 | ||
91 | #define RXORd %r9d | |
92 | #define RXORbl %r9b | |
93 | ||
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Consume the two lowest bytes of 'ab' as table indices:
 *     dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab]      (64-bit table entries)
 * and rotate 'ab' right by 16 bits so that the next invocation sees the next
 * two bytes. Four invocations in a row rotate 'ab' by 64 bits in total, which
 * leaves it unchanged.
 *
 * 'ab', 'tmp1' and 'tmp2' are passed as macro names (e.g. RAB0, RT0) because
 * the bl / bh / d suffixes are pasted onto them. tmp2 receives byte 0 and
 * tmp1 receives byte 1; both are overwritten, and the flags are modified.
 * The tables are addressed absolutely as T(, index, 8), not RIP-relative.
 */
94 | #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \ | |
95 | movzbl ab ## bl, tmp2 ## d; \ | |
96 | movzbl ab ## bh, tmp1 ## d; \ | |
97 | rorq $16, ab; \ | |
98 | xorq T0(, tmp2, 8), dst; \ | |
99 | xorq T1(, tmp1, 8), dst; | |
100 | ||
101 | /********************************************************************** | |
102 | 1-way camellia | |
103 | **********************************************************************/ | |
/*
 * roundsm(ab, subkey, cd) - one round on block 0 (1-way path).
 *
 * Effect: cd0 ^= K[subkey] ^ (XOR of eight table lookups, one per byte of
 * ab0), where K[n] is the 64-bit value at byte offset n * 8 of key_table.
 *
 * The subkey is loaded into RT2. The lookups for bytes 0-1 and 4-5 of ab0 are
 * XORed straight into cd0, those for bytes 2-3 and 6-7 into RT2, and RT2 is
 * folded into cd0 at the end, so the XOR chain is split over two registers.
 * ab0 is rotated 4 x 16 bits and therefore ends up unchanged.
 *
 * 'ab' and 'cd' are name prefixes (RAB / RCD); the block index 0 is pasted on
 * here. Clobbers RT0, RT1, RT2 and the flags.
 */
104 | #define roundsm(ab, subkey, cd) \ | |
105 | movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \ | |
106 | \ | |
107 | xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \ | |
108 | xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \ | |
109 | xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \ | |
110 | xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \ | |
111 | \ | |
112 | xorq RT2, cd ## 0; | |
113 | ||
/*
 * fls(l, r, kl, kr) - key-dependent mixing of the two halves of block 0
 * (presumably Camellia's FL / FL^-1 layer - confirm against the cipher
 * specification).
 *
 * With lo() / hi() the low / high 32 bits of a 64-bit value and K[n] the
 * 64-bit subkey at byte offset n * 8 of key_table, the steps are, in
 * execution order:
 *
 *     hi(l0) ^= rol32(lo(l0) AND lo(K[kl]), 1)
 *     lo(r0) ^= hi(r0) OR hi(K[kr])
 *     lo(l0) ^= hi(l0) OR hi(K[kl])        (uses the updated hi(l0))
 *     hi(r0) ^= rol32(lo(r0) AND lo(K[kr]), 1)   (uses the updated lo(r0))
 *
 * The 32-bit movl loads zero the upper half of the temporary, so the
 * following shlq leaves only the rotated word in bits 32-63; the shrq in the
 * OR steps likewise leaves only bits 0-31 set.
 *
 * 'l' and 'r' are name prefixes (RAB / RCD); the block index 0 is pasted on
 * here. Clobbers RT0, RT1, RT2 and the flags.
 */
114 | #define fls(l, r, kl, kr) \ | |
115 | movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \ | |
116 | andl l ## 0d, RT0d; \ | |
117 | roll $1, RT0d; \ | |
118 | shlq $32, RT0; \ | |
119 | xorq RT0, l ## 0; \ | |
120 | movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \ | |
121 | orq r ## 0, RT1; \ | |
122 | shrq $32, RT1; \ | |
123 | xorq RT1, r ## 0; \ | |
124 | \ | |
125 | movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \ | |
126 | orq l ## 0, RT2; \ | |
127 | shrq $32, RT2; \ | |
128 | xorq RT2, l ## 0; \ | |
129 | movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \ | |
130 | andl r ## 0d, RT0d; \ | |
131 | roll $1, RT0d; \ | |
132 | shlq $32, RT0; \ | |
133 | xorq RT0, r ## 0; | |
134 | ||
/*
 * enc_rounds(i): six 1-way rounds using subkeys i+2 .. i+7 in ascending
 * order. The half used as round input alternates RAB, RCD, RAB, ... and the
 * other half is the one being updated.
 */
135 | #define enc_rounds(i) \ | |
136 | roundsm(RAB, i + 2, RCD); \ | |
137 | roundsm(RCD, i + 3, RAB); \ | |
138 | roundsm(RAB, i + 4, RCD); \ | |
139 | roundsm(RCD, i + 5, RAB); \ | |
140 | roundsm(RAB, i + 6, RCD); \ | |
141 | roundsm(RCD, i + 7, RAB); | |
142 | ||
/* enc_fls(i): the fls layer with kl = subkey i and kr = subkey i+1. */
143 | #define enc_fls(i) \ | |
144 | fls(RAB, RCD, i + 0, i + 1); | |
145 | ||
/*
 * enc_inpack(): load one 16-byte block from (RIO) for encryption.
 *
 * Bytes 0-7 go to RAB0 and bytes 8-15 to RCD0. Each half is byte-swapped and
 * then rotated by 32 bits (rolq and rorq by 32 are the same operation), which
 * leaves the first big-endian 32-bit word of that half in bits 0-31 and the
 * second one in bits 32-63. RAB0 is then XORed with subkey 0; RCD0 receives
 * no key material here.
 */
146 | #define enc_inpack() \ | |
147 | movq (RIO), RAB0; \ | |
148 | bswapq RAB0; \ | |
149 | rolq $32, RAB0; \ | |
150 | movq 4*2(RIO), RCD0; \ | |
151 | bswapq RCD0; \ | |
152 | rorq $32, RCD0; \ | |
153 | xorq key_table(CTX), RAB0; | |
154 | ||
/*
 * enc_outunpack(op, max): final key XOR and output for 1-way encryption.
 *   op   instruction stem; the q suffix is pasted on. The callers pass 'mov'
 *        (overwrite the destination) or 'xor' (XOR into the destination).
 *   max  register holding the index of the last subkey (scaled by 8).
 *
 * RCD0 is XORed with subkey 'max', rotated by 32 bits and byte-swapped (the
 * inverse of the input transform) and written to bytes 0-7 at (RIO). RAB0 is
 * converted the same way, without a key XOR, and written to bytes 8-15. The
 * two halves are therefore emitted in the opposite order from how they were
 * loaded.
 */
155 | #define enc_outunpack(op, max) \ | |
156 | xorq key_table(CTX, max, 8), RCD0; \ | |
157 | rorq $32, RCD0; \ | |
158 | bswapq RCD0; \ | |
159 | op ## q RCD0, (RIO); \ | |
160 | rolq $32, RAB0; \ | |
161 | bswapq RAB0; \ | |
162 | op ## q RAB0, 4*2(RIO); | |
163 | ||
/*
 * dec_rounds(i): six 1-way rounds using subkeys i+7 down to i+2, i.e. the
 * same subkeys as enc_rounds(i) but in descending order.
 */
164 | #define dec_rounds(i) \ | |
165 | roundsm(RAB, i + 7, RCD); \ | |
166 | roundsm(RCD, i + 6, RAB); \ | |
167 | roundsm(RAB, i + 5, RCD); \ | |
168 | roundsm(RCD, i + 4, RAB); \ | |
169 | roundsm(RAB, i + 3, RCD); \ | |
170 | roundsm(RCD, i + 2, RAB); | |
171 | ||
/*
 * dec_fls(i): the fls layer with the two subkeys exchanged relative to
 * enc_fls(i): kl = subkey i+1 and kr = subkey i.
 */
172 | #define dec_fls(i) \ | |
173 | fls(RAB, RCD, i + 1, i + 0); | |
174 | ||
/*
 * dec_inpack(max): load one 16-byte block from (RIO) for decryption.
 *
 * Same load / byte-swap / 32-bit rotate as the encryption input macro, but
 * RAB0 is XORed with the subkey whose index is held in register 'max'
 * (scaled by 8) rather than with subkey 0. 'max' itself is not modified.
 */
175 | #define dec_inpack(max) \ | |
176 | movq (RIO), RAB0; \ | |
177 | bswapq RAB0; \ | |
178 | rolq $32, RAB0; \ | |
179 | movq 4*2(RIO), RCD0; \ | |
180 | bswapq RCD0; \ | |
181 | rorq $32, RCD0; \ | |
182 | xorq key_table(CTX, max, 8), RAB0; | |
183 | ||
/*
 * dec_outunpack(): final key XOR and output for 1-way decryption.
 *
 * RCD0 is XORed with subkey 0, rotated by 32 bits, byte-swapped and stored to
 * bytes 0-7 at (RIO); RAB0 is converted the same way, without a key XOR, and
 * stored to bytes 8-15. Unlike the encryption variant this always overwrites
 * the destination (plain movq).
 */
184 | #define dec_outunpack() \ | |
185 | xorq key_table(CTX), RCD0; \ | |
186 | rorq $32, RCD0; \ | |
187 | bswapq RCD0; \ | |
188 | movq RCD0, (RIO); \ | |
189 | rolq $32, RAB0; \ | |
190 | bswapq RAB0; \ | |
191 | movq RAB0, 4*2(RIO); | |
192 | ||
/*
 * __camellia_enc_blk - encrypt one 16-byte block (1-way).
 *
 * Arguments are listed in the comment below. Only the low byte of the xor
 * argument is tested: when it is non-zero the result is XORed into dst,
 * otherwise dst is overwritten.
 *
 * 18 rounds are always run. If the byte at key_length(CTX) is not 16, one
 * more fls layer and six more rounds follow, and the final subkey index is
 * 32 instead of 24.
 *
 * Modifies rax, rcx, rsi, r8 - r11 and the flags. %rbp is used as scratch
 * (RT1) but restored from RRBP before returning. Nothing is pushed on the
 * stack.
 */
59990684 | 193 | ENTRY(__camellia_enc_blk) |
0b95ec56 JK |
194 | /* input: |
195 | * %rdi: ctx, CTX | |
196 | * %rsi: dst | |
197 | * %rdx: src | |
198 | * %rcx: bool xor | |
199 | */ | |
/* RT1 is %rbp: keep the caller's value in RRBP until the return paths. */
200 | movq %rbp, RRBP; | |
201 | ||
/* %rcx becomes RCD0 and %rsi becomes RT0/RIO, so move the arguments away. */
202 | movq %rcx, RXOR; | |
203 | movq %rsi, RDST; | |
204 | movq %rdx, RIO; | |
205 | ||
206 | enc_inpack(); | |
207 | ||
208 | enc_rounds(0); | |
209 | enc_fls(8); | |
210 | enc_rounds(8); | |
211 | enc_fls(16); | |
212 | enc_rounds(16); | |
213 | movl $24, RT1d; /* max */ | |
214 | ||
/*
 * NOTE(review): this compares a single byte, while the decrypt functions in
 * this file compare 32 bits (cmpl) at the same offset. The two agree only
 * while the stored value fits in the low byte - confirm the field width.
 */
215 | cmpb $16, key_length(CTX); | |
59990684 | 216 | je .L__enc_done; |
0b95ec56 JK |
217 |
/* The extra rounds clobber RT1, so max is set again afterwards. */
218 | enc_fls(24); | |
219 | enc_rounds(24); | |
220 | movl $32, RT1d; /* max */ | |
221 | ||
59990684 | 222 | .L__enc_done: |
0b95ec56 JK |
/* movq leaves the flags alone, so the jnz below still sees the testb result. */
223 | testb RXORbl, RXORbl; |
224 | movq RDST, RIO; | |
225 | ||
59990684 | 226 | jnz .L__enc_xor; |
0b95ec56 JK |
227 |
228 | enc_outunpack(mov, RT1); | |
229 | ||
230 | movq RRBP, %rbp; | |
231 | ret; | |
232 | ||
59990684 | 233 | .L__enc_xor: |
0b95ec56 JK |
234 | enc_outunpack(xor, RT1); |
235 | ||
236 | movq RRBP, %rbp; | |
237 | ret; | |
59990684 | 238 | ENDPROC(__camellia_enc_blk) |
0b95ec56 | 239 | |
/*
 * camellia_dec_blk - decrypt one 16-byte block (1-way).
 *
 * Arguments are listed in the comment below. The index of the first subkey
 * to apply ("max") is 24 when the 32-bit value at key_length(CTX) equals 16
 * and 32 otherwise. In the latter case six extra rounds and one extra fls
 * layer run before the common 18 rounds. The destination is always
 * overwritten.
 *
 * Modifies rax, rcx, rsi, r8 - r11 and the flags. %rbp is used as scratch
 * (RT1) but restored from RRBP before returning. Nothing is pushed on the
 * stack.
 */
59990684 | 240 | ENTRY(camellia_dec_blk) |
0b95ec56 JK |
241 | /* input: |
242 | * %rdi: ctx, CTX | |
243 | * %rsi: dst | |
244 | * %rdx: src | |
245 | */ | |
/*
 * Select max without a branch. The two movl instructions do not change the
 * flags set by cmpl; RXORd only serves as the register source that cmov
 * requires (there is no xor flag in this function).
 */
246 | cmpl $16, key_length(CTX); | |
247 | movl $32, RT2d; | |
248 | movl $24, RXORd; | |
249 | cmovel RXORd, RT2d; /* max */ | |
250 | ||
/* RT1 is %rbp and RT0/RIO is %rsi: save rbp and the dst pointer first. */
251 | movq %rbp, RRBP; | |
252 | movq %rsi, RDST; | |
253 | movq %rdx, RIO; | |
254 | ||
255 | dec_inpack(RT2); | |
256 | ||
/* RT2 still holds max here; it is dead once the first round overwrites it. */
257 | cmpb $24, RT2bl; | |
59990684 | 258 | je .L__dec_rounds16; |
0b95ec56 JK |
259 |
260 | dec_rounds(24); | |
261 | dec_fls(24); | |
262 | ||
59990684 | 263 | .L__dec_rounds16: |
0b95ec56 JK |
264 | dec_rounds(16); |
265 | dec_fls(16); | |
266 | dec_rounds(8); | |
267 | dec_fls(8); | |
268 | dec_rounds(0); | |
269 | ||
/* The rounds destroyed RIO (it aliases RT0); point it at dst for the store. */
270 | movq RDST, RIO; | |
271 | ||
272 | dec_outunpack(); | |
273 | ||
274 | movq RRBP, %rbp; | |
275 | ret; | |
59990684 | 276 | ENDPROC(camellia_dec_blk) |
0b95ec56 JK |
277 | |
278 | /********************************************************************** | |
279 | 2-way camellia | |
280 | **********************************************************************/ | |
/*
 * roundsm2(ab, subkey, cd) - one round applied to both blocks (2-way path).
 *
 * Effect, for n = 0 and 1: cdn ^= K[subkey] ^ (XOR of eight table lookups,
 * one per byte of abn), where K[n] is the 64-bit value at byte offset n * 8
 * of key_table.
 *
 * Block 0 is handled as in the 1-way round: RT2 starts out as the subkey,
 * collects the lookups for bytes 2-3 and 6-7 of ab0, and is XORed into cd0
 * once all block-0 lookups are done (that XOR is placed after the first
 * lookup pair of block 1). Block 1 gets the subkey XORed into cd1 up front
 * and all eight of its lookups go directly into cd1.
 *
 * ab0 and ab1 are each rotated by 64 bits in total and so end up unchanged.
 * Clobbers RT0, RT1, RT2 and the flags.
 */
281 | #define roundsm2(ab, subkey, cd) \ | |
282 | movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \ | |
283 | xorq RT2, cd ## 1; \ | |
284 | \ | |
285 | xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \ | |
286 | xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \ | |
287 | xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \ | |
288 | xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \ | |
289 | \ | |
290 | xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \ | |
291 | xorq RT2, cd ## 0; \ | |
292 | xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \ | |
293 | xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \ | |
294 | xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1); | |
295 | ||
/*
 * fls2(l, r, kl, kr) - the key-dependent half-mixing layer applied to both
 * blocks of the 2-way path (presumably Camellia's FL / FL^-1 layer - confirm
 * against the cipher specification).
 *
 * With lo() / hi() the low / high 32 bits of a 64-bit value and K[n] the
 * 64-bit subkey at byte offset n * 8 of key_table, each block n goes through:
 *
 *     (1) hi(ln) ^= rol32(lo(ln) AND lo(K[kl]), 1)
 *     (2) lo(rn) ^= hi(rn) OR hi(K[kr])
 *     (3) lo(ln) ^= hi(ln) OR hi(K[kl])        (uses the updated hi(ln))
 *     (4) hi(rn) ^= rol32(lo(rn) AND lo(K[kr]), 1)   (uses the updated lo(rn))
 *
 * The groups are emitted as: (1)(2) for block 0, (1)(2) for block 1, (3)(4)
 * for block 0, (3)(4) for block 1, with the temporary rotating through
 * RT0 / RT1 / RT2 from one step to the next.
 *
 * Clobbers RT0, RT1, RT2 and the flags.
 */
296 | #define fls2(l, r, kl, kr) \ | |
297 | movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \ | |
298 | andl l ## 0d, RT0d; \ | |
299 | roll $1, RT0d; \ | |
300 | shlq $32, RT0; \ | |
301 | xorq RT0, l ## 0; \ | |
302 | movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \ | |
303 | orq r ## 0, RT1; \ | |
304 | shrq $32, RT1; \ | |
305 | xorq RT1, r ## 0; \ | |
306 | \ | |
307 | movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \ | |
308 | andl l ## 1d, RT2d; \ | |
309 | roll $1, RT2d; \ | |
310 | shlq $32, RT2; \ | |
311 | xorq RT2, l ## 1; \ | |
312 | movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \ | |
313 | orq r ## 1, RT0; \ | |
314 | shrq $32, RT0; \ | |
315 | xorq RT0, r ## 1; \ | |
316 | \ | |
317 | movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \ | |
318 | orq l ## 0, RT1; \ | |
319 | shrq $32, RT1; \ | |
320 | xorq RT1, l ## 0; \ | |
321 | movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \ | |
322 | andl r ## 0d, RT2d; \ | |
323 | roll $1, RT2d; \ | |
324 | shlq $32, RT2; \ | |
325 | xorq RT2, r ## 0; \ | |
326 | \ | |
327 | movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \ | |
328 | orq l ## 1, RT0; \ | |
329 | shrq $32, RT0; \ | |
330 | xorq RT0, l ## 1; \ | |
331 | movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \ | |
332 | andl r ## 1d, RT1d; \ | |
333 | roll $1, RT1d; \ | |
334 | shlq $32, RT1; \ | |
335 | xorq RT1, r ## 1; | |
336 | ||
/*
 * enc_rounds2(i): six 2-way rounds using subkeys i+2 .. i+7 in ascending
 * order. The half used as round input alternates RAB, RCD, RAB, ... for both
 * blocks at once.
 */
337 | #define enc_rounds2(i) \ | |
338 | roundsm2(RAB, i + 2, RCD); \ | |
339 | roundsm2(RCD, i + 3, RAB); \ | |
340 | roundsm2(RAB, i + 4, RCD); \ | |
341 | roundsm2(RCD, i + 5, RAB); \ | |
342 | roundsm2(RAB, i + 6, RCD); \ | |
343 | roundsm2(RCD, i + 7, RAB); | |
344 | ||
/* enc_fls2(i): the fls2 layer with kl = subkey i and kr = subkey i+1. */
345 | #define enc_fls2(i) \ | |
346 | fls2(RAB, RCD, i + 0, i + 1); | |
347 | ||
/*
 * enc_inpack2(): load two consecutive 16-byte blocks from (RIO) for
 * encryption.
 *
 * Bytes 0-7 / 8-15 go to RAB0 / RCD0 and bytes 16-23 / 24-31 to RAB1 / RCD1.
 * Each 8-byte half is byte-swapped and rotated by 32 bits (rorq and rolq by
 * 32 are the same operation), leaving its first big-endian 32-bit word in
 * bits 0-31. RAB0 and RAB1 are each XORed with subkey 0; the RCD halves
 * receive no key material here.
 */
348 | #define enc_inpack2() \ | |
349 | movq (RIO), RAB0; \ | |
350 | bswapq RAB0; \ | |
351 | rorq $32, RAB0; \ | |
352 | movq 4*2(RIO), RCD0; \ | |
353 | bswapq RCD0; \ | |
354 | rolq $32, RCD0; \ | |
355 | xorq key_table(CTX), RAB0; \ | |
356 | \ | |
357 | movq 8*2(RIO), RAB1; \ | |
358 | bswapq RAB1; \ | |
359 | rorq $32, RAB1; \ | |
360 | movq 12*2(RIO), RCD1; \ | |
361 | bswapq RCD1; \ | |
362 | rolq $32, RCD1; \ | |
363 | xorq key_table(CTX), RAB1; | |
364 | ||
/*
 * enc_outunpack2(op, max): final key XOR and output for 2-way encryption.
 *   op   instruction stem; the q suffix is pasted on. The callers pass 'mov'
 *        (overwrite the destination) or 'xor' (XOR into the destination).
 *   max  register holding the index of the last subkey (scaled by 8).
 *
 * For each block n, RCDn is XORed with subkey 'max', rotated by 32 bits,
 * byte-swapped and written first; RABn is converted the same way, without a
 * key XOR, and written second. Block 0 occupies bytes 0-15 at (RIO) and
 * block 1 bytes 16-31.
 */
365 | #define enc_outunpack2(op, max) \ | |
366 | xorq key_table(CTX, max, 8), RCD0; \ | |
367 | rolq $32, RCD0; \ | |
368 | bswapq RCD0; \ | |
369 | op ## q RCD0, (RIO); \ | |
370 | rorq $32, RAB0; \ | |
371 | bswapq RAB0; \ | |
372 | op ## q RAB0, 4*2(RIO); \ | |
373 | \ | |
374 | xorq key_table(CTX, max, 8), RCD1; \ | |
375 | rolq $32, RCD1; \ | |
376 | bswapq RCD1; \ | |
377 | op ## q RCD1, 8*2(RIO); \ | |
378 | rorq $32, RAB1; \ | |
379 | bswapq RAB1; \ | |
380 | op ## q RAB1, 12*2(RIO); | |
381 | ||
/*
 * dec_rounds2(i): six 2-way rounds using subkeys i+7 down to i+2, i.e. the
 * same subkeys as enc_rounds2(i) but in descending order.
 */
382 | #define dec_rounds2(i) \ | |
383 | roundsm2(RAB, i + 7, RCD); \ | |
384 | roundsm2(RCD, i + 6, RAB); \ | |
385 | roundsm2(RAB, i + 5, RCD); \ | |
386 | roundsm2(RCD, i + 4, RAB); \ | |
387 | roundsm2(RAB, i + 3, RCD); \ | |
388 | roundsm2(RCD, i + 2, RAB); | |
389 | ||
/*
 * dec_fls2(i): the fls2 layer with the two subkeys exchanged relative to
 * enc_fls2(i): kl = subkey i+1 and kr = subkey i.
 */
390 | #define dec_fls2(i) \ | |
391 | fls2(RAB, RCD, i + 1, i + 0); | |
392 | ||
/*
 * dec_inpack2(max): load two consecutive 16-byte blocks from (RIO) for
 * decryption.
 *
 * Same load / byte-swap / 32-bit rotate as the 2-way encryption input macro,
 * but RAB0 and RAB1 are XORed with the subkey whose index is held in
 * register 'max' (scaled by 8) rather than with subkey 0. 'max' itself is
 * not modified.
 */
393 | #define dec_inpack2(max) \ | |
394 | movq (RIO), RAB0; \ | |
395 | bswapq RAB0; \ | |
396 | rorq $32, RAB0; \ | |
397 | movq 4*2(RIO), RCD0; \ | |
398 | bswapq RCD0; \ | |
399 | rolq $32, RCD0; \ | |
400 | xorq key_table(CTX, max, 8), RAB0; \ | |
401 | \ | |
402 | movq 8*2(RIO), RAB1; \ | |
403 | bswapq RAB1; \ | |
404 | rorq $32, RAB1; \ | |
405 | movq 12*2(RIO), RCD1; \ | |
406 | bswapq RCD1; \ | |
407 | rolq $32, RCD1; \ | |
408 | xorq key_table(CTX, max, 8), RAB1; | |
409 | ||
/*
 * dec_outunpack2(): final key XOR and output for 2-way decryption.
 *
 * For each block n, RCDn is XORed with subkey 0, rotated by 32 bits,
 * byte-swapped and stored first; RABn is converted the same way, without a
 * key XOR, and stored second. Block 0 occupies bytes 0-15 at (RIO) and
 * block 1 bytes 16-31. The destination is always overwritten (plain movq).
 */
410 | #define dec_outunpack2() \ | |
411 | xorq key_table(CTX), RCD0; \ | |
412 | rolq $32, RCD0; \ | |
413 | bswapq RCD0; \ | |
414 | movq RCD0, (RIO); \ | |
415 | rorq $32, RAB0; \ | |
416 | bswapq RAB0; \ | |
417 | movq RAB0, 4*2(RIO); \ | |
418 | \ | |
419 | xorq key_table(CTX), RCD1; \ | |
420 | rolq $32, RCD1; \ | |
421 | bswapq RCD1; \ | |
422 | movq RCD1, 8*2(RIO); \ | |
423 | rorq $32, RAB1; \ | |
424 | bswapq RAB1; \ | |
425 | movq RAB1, 12*2(RIO); | |
426 | ||
/*
 * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks.
 *
 * Arguments are listed in the comment below; src and dst are each accessed
 * over 32 bytes. Only the low byte of the xor argument is tested: when it is
 * non-zero the result is XORed into dst, otherwise dst is overwritten.
 *
 * 18 rounds are always run. If the byte at key_length(CTX) is not 16, one
 * more fls2 layer and six more rounds follow, and the final subkey index is
 * 32 instead of 24.
 *
 * %rbx (RAB1) is saved on the stack and %rbp (RT1) in RRBP; both are restored
 * on each return path. Modifies rax, rcx, rdx, rsi, r8 - r11 and the flags.
 */
59990684 | 427 | ENTRY(__camellia_enc_blk_2way) |
0b95ec56 JK |
428 | /* input: |
429 | * %rdi: ctx, CTX | |
430 | * %rsi: dst | |
431 | * %rdx: src | |
432 | * %rcx: bool xor | |
433 | */ | |
/* RAB1 is %rbx, which the caller expects to be preserved. */
434 | pushq %rbx; | |
435 | ||
/* RT1 is %rbp, %rcx becomes RCD0, %rsi becomes RT0/RIO, %rdx becomes RCD1. */
436 | movq %rbp, RRBP; | |
437 | movq %rcx, RXOR; | |
438 | movq %rsi, RDST; | |
439 | movq %rdx, RIO; | |
440 | ||
441 | enc_inpack2(); | |
442 | ||
443 | enc_rounds2(0); | |
444 | enc_fls2(8); | |
445 | enc_rounds2(8); | |
446 | enc_fls2(16); | |
447 | enc_rounds2(16); | |
448 | movl $24, RT2d; /* max */ | |
449 | ||
/*
 * NOTE(review): this compares a single byte, while the decrypt functions in
 * this file compare 32 bits (cmpl) at the same offset. The two agree only
 * while the stored value fits in the low byte - confirm the field width.
 */
450 | cmpb $16, key_length(CTX); | |
59990684 | 451 | je .L__enc2_done; |
0b95ec56 JK |
452 |
/* The extra rounds clobber RT2, so max is set again afterwards. */
453 | enc_fls2(24); | |
454 | enc_rounds2(24); | |
455 | movl $32, RT2d; /* max */ | |
456 | ||
59990684 | 457 | .L__enc2_done: |
0b95ec56 JK |
/*
 * Explicit byte-sized test, spelled the same way as in __camellia_enc_blk.
 * movq leaves the flags alone, so the jnz below still sees this result.
 */
458 | testb RXORbl, RXORbl; |
459 | movq RDST, RIO; | |
59990684 | 460 | jnz .L__enc2_xor; |
0b95ec56 JK |
461 |
462 | enc_outunpack2(mov, RT2); | |
463 | ||
464 | movq RRBP, %rbp; | |
465 | popq %rbx; | |
466 | ret; | |
467 | ||
59990684 | 468 | .L__enc2_xor: |
0b95ec56 JK |
469 | enc_outunpack2(xor, RT2); |
470 | ||
471 | movq RRBP, %rbp; | |
472 | popq %rbx; | |
473 | ret; | |
59990684 | 474 | ENDPROC(__camellia_enc_blk_2way) |
0b95ec56 | 475 | |
/*
 * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks.
 *
 * Arguments are listed in the comment below; src and dst are each accessed
 * over 32 bytes. The index of the first subkey to apply ("max") is 24 when
 * the 32-bit value at key_length(CTX) equals 16 and 32 otherwise. In the
 * latter case six extra rounds and one extra fls2 layer run before the
 * common 18 rounds. The destination is always overwritten.
 *
 * %rbx (RAB1) is kept in RXOR and %rbp (RT1) in RRBP; both are restored
 * before returning, and nothing is pushed on the stack. Modifies rax, rcx,
 * rdx, rsi, r8 - r11 and the flags.
 */
59990684 | 476 | ENTRY(camellia_dec_blk_2way) |
0b95ec56 JK |
477 | /* input: |
478 | * %rdi: ctx, CTX | |
479 | * %rsi: dst | |
480 | * %rdx: src | |
481 | */ | |
/*
 * Select max without a branch. The two movl instructions do not change the
 * flags set by cmpl; RXORd only serves as the register source that cmov
 * requires and is free again right after.
 */
482 | cmpl $16, key_length(CTX); | |
483 | movl $32, RT2d; | |
484 | movl $24, RXORd; | |
485 | cmovel RXORd, RT2d; /* max */ | |
486 | ||
/* There is no xor flag here, so RXOR is reused to hold the caller's %rbx. */
487 | movq %rbx, RXOR; | |
488 | movq %rbp, RRBP; | |
489 | movq %rsi, RDST; | |
490 | movq %rdx, RIO; | |
491 | ||
492 | dec_inpack2(RT2); | |
493 | ||
/* RT2 still holds max here; it is dead once the first round overwrites it. */
494 | cmpb $24, RT2bl; | |
59990684 | 495 | je .L__dec2_rounds16; |
0b95ec56 JK |
496 |
497 | dec_rounds2(24); | |
498 | dec_fls2(24); | |
499 | ||
59990684 | 500 | .L__dec2_rounds16: |
0b95ec56 JK |
501 | dec_rounds2(16); |
502 | dec_fls2(16); | |
503 | dec_rounds2(8); | |
504 | dec_fls2(8); | |
505 | dec_rounds2(0); | |
506 | ||
/* The rounds destroyed RIO (it aliases RT0); point it at dst for the store. */
507 | movq RDST, RIO; | |
508 | ||
509 | dec_outunpack2(); | |
510 | ||
511 | movq RRBP, %rbp; | |
512 | movq RXOR, %rbx; | |
513 | ret; | |
59990684 | 514 | ENDPROC(camellia_dec_blk_2way) |