]>
Commit | Line | Data |
---|---|---|
8280daad JK |
1 | /* |
2 | * Twofish Cipher 3-way parallel algorithm (x86_64) | |
3 | * | |
4 | * Copyright (C) 2011 Jussi Kivilinna <[email protected]> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
19 | * USA | |
20 | * | |
21 | */ | |
22 | ||
d3f5188d JK |
23 | #include <linux/linkage.h> |
24 | ||
8280daad JK |
25 | .file "twofish-x86_64-asm-3way.S" |
26 | .text | |
27 | ||
28 | /* structure of crypto context */ | |
/*
 * Byte displacements applied to CTX.  s0..s3 are the table bases that
 * do16bit_ror indexes with a single byte and a scale of 4; w is read 64 bits
 * at a time by inpack3/outunpack3; k is read 32 bits at a time by
 * enc_round_end/dec_round_end.
 */
29 | #define s0 0 | |
30 | #define s1 1024 | |
31 | #define s2 2048 | |
32 | #define s3 3072 | |
33 | #define w 4096 | |
34 | #define k 4128 | |
35 | ||
36 | /********************************************************************** | |
37 | 3-way twofish | |
38 | **********************************************************************/ | |
/*
 * Register aliases.  The trailing digit 0/1/2 selects one of the three
 * blocks processed in parallel; the macros below build these names by
 * pasting the digit and the d / bh / bl suffix onto RAB, RCD, RX and RY.
 *
 * Only the RAB registers have bh/bl byte aliases.  do16bit_ror extracts its
 * table indices through those aliases, which is why g1g2_3 exchanges each cd
 * register into the matching ab register after using it.
 */
39 | #define CTX %rdi | |
40 | #define RIO %rdx | |
41 | ||
42 | #define RAB0 %rax | |
43 | #define RAB1 %rbx | |
44 | #define RAB2 %rcx | |
45 | ||
46 | #define RAB0d %eax | |
47 | #define RAB1d %ebx | |
48 | #define RAB2d %ecx | |
49 | ||
50 | #define RAB0bh %ah | |
51 | #define RAB1bh %bh | |
52 | #define RAB2bh %ch | |
53 | ||
54 | #define RAB0bl %al | |
55 | #define RAB1bl %bl | |
56 | #define RAB2bl %cl | |
57 | ||
58 | #define RCD0 %r8 | |
59 | #define RCD1 %r9 | |
60 | #define RCD2 %r10 | |
61 | ||
62 | #define RCD0d %r8d | |
63 | #define RCD1d %r9d | |
64 | #define RCD2d %r10d | |
65 | ||
66 | #define RX0 %rbp | |
67 | #define RX1 %r11 | |
68 | #define RX2 %r12 | |
69 | ||
70 | #define RX0d %ebp | |
71 | #define RX1d %r11d | |
72 | #define RX2d %r12d | |
73 | ||
74 | #define RY0 %r13 | |
75 | #define RY1 %r14 | |
76 | #define RY2 %r15 | |
77 | ||
78 | #define RY0d %r13d | |
79 | #define RY1d %r14d | |
80 | #define RY2d %r15d | |
81 | ||
/*
 * Scratch registers.  RT0 is the same register as RIO (%rdx) and RT1 is
 * %rsi, so the I/O pointer held in RIO does not survive a round; the entry
 * points push the dst pointer from %rsi and pop it into RIO afterwards.
 */
82 | #define RT0 %rdx | |
83 | #define RT1 %rsi | |
84 | ||
85 | #define RT0d %edx | |
86 | #define RT1d %esi | |
87 | ||
/*
 * do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst)
 *
 * Consumes the two lowest bytes of ab as table indices:
 *   tmp2 = zero-extended low byte (bl) of ab
 *   tmp1 = zero-extended second byte (bh) of ab
 *   ab is rotated right by `rot` bits (64-bit rotate)
 *   dst(32-bit) = op1(dst, 32-bit load at T0 + CTX + tmp2*4)
 *   dst(32-bit) = op2(dst, 32-bit load at T1 + CTX + tmp1*4)
 * op1/op2 are mnemonic stems (mov or xor in this file); the l suffix is
 * pasted on.  tmp1, tmp2, ab and dst are passed as 64-bit alias names; the
 * d / bl / bh suffixes are pasted on here.  tmp2 may be the same register as
 * dst because its value is consumed by the first load before dst is
 * written.
 */
88 | #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ | |
89 | movzbl ab ## bl, tmp2 ## d; \ | |
90 | movzbl ab ## bh, tmp1 ## d; \ | |
91 | rorq $(rot), ab; \ | |
92 | op1##l T0(CTX, tmp2, 4), dst ## d; \ | |
93 | op2##l T1(CTX, tmp1, 4), dst ## d; | |
94 | ||
95 | /* | |
96 | * Combined G1 & G2 function. Reordered with help of rotates to have moves | |
97 | * at beginning. | |
98 | */ | |
/*
 * g1g2_3(ab, cd, Tx0..Tx3, Ty0..Ty3, x, y)
 *
 * For each of the three blocks (suffix 0/1/2) builds two 32-bit values from
 * table lookups indexed by the bytes of ab<N>:
 *   x<N> = Tx0[byte0] ^ Tx1[byte1] ^ Tx2[byte2] ^ Tx3[byte3]
 *   y<N> = Ty1[byte4] ^ Ty2[byte5] ^ Ty3[byte6] ^ Ty0[byte7]
 * where byte0 is the least significant byte of ab<N>.  The bytes are brought
 * into the bl/bh position by the rotates inside do16bit_ror: 32, 48, 32 and
 * 16 bits per block, 128 in total, so ab<N> is back in its original
 * orientation when it is exchanged with cd<N>.
 *
 * In the first pass the destination register doubles as the low-byte index
 * temporary and RT0 holds the high-byte index; in the second pass RT1 and
 * RT0 are the temporaries.  Clobbers RT0 and RT1.  On exit the registers
 * named by ab hold the former cd values and vice versa.
 */
99 | #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ | |
100 | /* G1,1 && G2,1 */ \ | |
101 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \ | |
102 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \ | |
103 | \ | |
104 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ | |
105 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ | |
106 | \ | |
107 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ | |
108 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ | |
109 | \ | |
110 | /* G1,2 && G2,2 */ \ | |
111 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ | |
112 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ | |
113 | xchgq cd ## 0, ab ## 0; \ | |
114 | \ | |
115 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ | |
116 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ | |
117 | xchgq cd ## 1, ab ## 1; \ | |
118 | \ | |
119 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ | |
120 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ | |
121 | xchgq cd ## 2, ab ## 2; | |
122 | ||
/*
 * enc_round_end(ab, x, y, n)
 *
 * Folds the two 32-bit values x and y into the 64-bit register ab for round
 * n.  With lo/hi the low and high 32 bits of ab on entry:
 *   x = x + y;  y = y + x;            (y uses the already updated x)
 *   x += 32-bit word at k + 4*(2n);   y += 32-bit word at k + 4*(2n+1)
 *   new lo = ror32(x ^ lo, 1)
 *   new hi = rol32(hi, 1) ^ y
 * The final orq relies on the upper 32 bits of x being zero, which holds
 * because the last write to x is a 32-bit operation.  x and y are
 * clobbered.
 */
123 | #define enc_round_end(ab, x, y, n) \ | |
124 | addl y ## d, x ## d; \ | |
125 | addl x ## d, y ## d; \ | |
126 | addl k+4*(2*(n))(CTX), x ## d; \ | |
127 | xorl ab ## d, x ## d; \ | |
128 | addl k+4*(2*(n)+1)(CTX), y ## d; \ | |
129 | shrq $32, ab; \ | |
130 | roll $1, ab ## d; \ | |
131 | xorl y ## d, ab ## d; \ | |
132 | shlq $32, ab; \ | |
133 | rorl $1, x ## d; \ | |
134 | orq x, ab; | |
135 | ||
/*
 * dec_round_end(ba, x, y, n)
 *
 * Decryption counterpart of enc_round_end, with the roles of x and y
 * exchanged in the final mixing.  With lo/hi the low and high 32 bits of ba
 * on entry:
 *   x = x + y;  y = y + x;            (y uses the already updated x)
 *   x += 32-bit word at k + 4*(2n);   y += 32-bit word at k + 4*(2n+1)
 *   new lo = ror32(y ^ lo, 1)
 *   new hi = rol32(hi, 1) ^ x
 * The final orq relies on the upper 32 bits of y being zero, which holds
 * because the last write to y is a 32-bit operation.  x and y are
 * clobbered.
 */
136 | #define dec_round_end(ba, x, y, n) \ | |
137 | addl y ## d, x ## d; \ | |
138 | addl x ## d, y ## d; \ | |
139 | addl k+4*(2*(n))(CTX), x ## d; \ | |
140 | addl k+4*(2*(n)+1)(CTX), y ## d; \ | |
141 | xorl ba ## d, y ## d; \ | |
142 | shrq $32, ba; \ | |
143 | roll $1, ba ## d; \ | |
144 | xorl x ## d, ba ## d; \ | |
145 | shlq $32, ba; \ | |
146 | rorl $1, y ## d; \ | |
147 | orq y, ba; | |
148 | ||
/*
 * encrypt_round3(ab, cd, n): one encryption round, number n, on all three
 * blocks.  g1g2_3 computes RX<N>/RY<N> from ab<N> using the tables in the
 * order s0..s3 for both halves and then exchanges ab<N> with cd<N>, so the
 * enc_round_end calls below update the registers that held cd on entry.
 */
149 | #define encrypt_round3(ab, cd, n) \ | |
150 | g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \ | |
151 | \ | |
152 | enc_round_end(ab ## 0, RX0, RY0, n); \ | |
153 | enc_round_end(ab ## 1, RX1, RY1, n); \ | |
154 | enc_round_end(ab ## 2, RX2, RY2, n); | |
155 | ||
/*
 * decrypt_round3(ba, dc, n): one decryption round, number n, on all three
 * blocks.  Compared with encrypt_round3, g1g2_3 is invoked with RY and RX
 * swapped and with the table lists rotated (s1,s2,s3,s0 and s3,s0,s1,s2);
 * dec_round_end then folds RX<N>/RY<N> into the registers that held dc on
 * entry (g1g2_3 exchanges ba<N> with dc<N> before returning).
 */
156 | #define decrypt_round3(ba, dc, n) \ | |
157 | g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ | |
158 | \ | |
159 | dec_round_end(ba ## 0, RX0, RY0, n); \ | |
160 | dec_round_end(ba ## 1, RX1, RY1, n); \ | |
161 | dec_round_end(ba ## 2, RX2, RY2, n); | |
162 | ||
/*
 * encrypt_cycle3(ab, cd, n): two consecutive encryption rounds, numbered
 * 2*n and 2*n+1.  Each round exchanges ab with cd, so after the pair the
 * register roles are back where they started.  n is parenthesized so that
 * an expression argument is multiplied as a whole.
 */
163 | #define encrypt_cycle3(ab, cd, n) \ | |
164 | encrypt_round3(ab, cd, (n)*2); \ | |
165 | encrypt_round3(ab, cd, ((n)*2)+1); | |
166 | ||
/*
 * decrypt_cycle3(ba, dc, n): two consecutive decryption rounds, numbered
 * 2*n+1 and then 2*n (the reverse of encrypt_cycle3).  Each round exchanges
 * ba with dc, so after the pair the register roles are back where they
 * started.  n is parenthesized so that an expression argument is multiplied
 * as a whole.
 */
167 | #define decrypt_cycle3(ba, dc, n) \ | |
168 | decrypt_round3(ba, dc, ((n)*2)+1); \ | |
169 | decrypt_round3(ba, dc, ((n)*2)); | |
170 | ||
/*
 * inpack3(in, n, xy, m)
 *
 * Loads one 64-bit half of each of the three blocks from `in` at byte
 * offsets 4*n, 4*(4+n) and 4*(8+n) (blocks are 16 bytes apart) into xy0,
 * xy1 and xy2, and XORs each with the 64 bits at byte offset w + 4*m of the
 * context.  m is parenthesized, like n, so that an expression argument is
 * scaled as a whole.
 */
171 | #define inpack3(in, n, xy, m) \ | |
172 | movq 4*(n)(in), xy ## 0; \ | |
173 | xorq w+4*(m)(CTX), xy ## 0; \ | |
174 | \ | |
175 | movq 4*(4+(n))(in), xy ## 1; \ | |
176 | xorq w+4*(m)(CTX), xy ## 1; \ | |
177 | \ | |
178 | movq 4*(8+(n))(in), xy ## 2; \ | |
179 | xorq w+4*(m)(CTX), xy ## 2; | |
180 | ||
/*
 * outunpack3(op, out, n, xy, m)
 *
 * XORs xy0, xy1 and xy2 with the 64 bits at byte offset w + 4*m of the
 * context, then applies op##q (movq or xorq in this file) from each register
 * to `out` at byte offsets 4*n, 4*(4+n) and 4*(8+n).  The xy registers are
 * modified.  m is parenthesized, like n, so that an expression argument is
 * scaled as a whole.
 */
181 | #define outunpack3(op, out, n, xy, m) \ | |
182 | xorq w+4*(m)(CTX), xy ## 0; \ | |
183 | op ## q xy ## 0, 4*(n)(out); \ | |
184 | \ | |
185 | xorq w+4*(m)(CTX), xy ## 1; \ | |
186 | op ## q xy ## 1, 4*(4+(n))(out); \ | |
187 | \ | |
188 | xorq w+4*(m)(CTX), xy ## 2; \ | |
189 | op ## q xy ## 2, 4*(8+(n))(out); | |
190 | ||
/*
 * inpack_enc3(): load the encryption input through RIO.  The first 8 bytes
 * of each block go to RAB<N>, XORed with the 64 bits at w+0; the second
 * 8 bytes go to RCD<N>, XORed with the 64 bits at w+8.
 */
191 | #define inpack_enc3() \ | |
192 | inpack3(RIO, 0, RAB, 0); \ | |
193 | inpack3(RIO, 2, RCD, 2); | |
194 | ||
/*
 * outunpack_enc3(op): emit the encryption result through RIO with op##q
 * (mov to overwrite, xor to combine with what is already there).  The
 * halves are written crosswise relative to inpack_enc3: RAB<N>, XORed with
 * the 64 bits at w+24, goes to the second 8 bytes of each block, and
 * RCD<N>, XORed with the 64 bits at w+16, goes to the first 8 bytes.
 */
195 | #define outunpack_enc3(op) \ | |
196 | outunpack3(op, RIO, 2, RAB, 6); \ | |
197 | outunpack3(op, RIO, 0, RCD, 4); | |
198 | ||
/*
 * inpack_dec3(): load the decryption input through RIO.  The first 8 bytes
 * of each block go to RAB<N>, XORed with the 64 bits at w+16; the second
 * 8 bytes go to RCD<N>, XORed with the 64 bits at w+24.  Every register is
 * then rotated by 32 bits, i.e. its two 32-bit halves are swapped, before
 * the decryption rounds run.
 */
199 | #define inpack_dec3() \ | |
200 | inpack3(RIO, 0, RAB, 4); \ | |
201 | rorq $32, RAB0; \ | |
202 | rorq $32, RAB1; \ | |
203 | rorq $32, RAB2; \ | |
204 | inpack3(RIO, 2, RCD, 6); \ | |
205 | rorq $32, RCD0; \ | |
206 | rorq $32, RCD1; \ | |
207 | rorq $32, RCD2; | |
208 | ||
/*
 * outunpack_dec3(): emit the decryption result through RIO, always with
 * movq.  The 32-bit halves of each register are swapped back first (undoing
 * the rotate in inpack_dec3).  RCD<N>, XORed with the 64 bits at w+0, is
 * stored to the first 8 bytes of each block; RAB<N>, XORed with the 64 bits
 * at w+8, is stored to the second 8 bytes.
 */
209 | #define outunpack_dec3() \ | |
210 | rorq $32, RCD0; \ | |
211 | rorq $32, RCD1; \ | |
212 | rorq $32, RCD2; \ | |
213 | outunpack3(mov, RIO, 0, RCD, 0); \ | |
214 | rorq $32, RAB0; \ | |
215 | rorq $32, RAB1; \ | |
216 | rorq $32, RAB2; \ | |
217 | outunpack3(mov, RIO, 2, RAB, 2); | |
218 | ||
/*
 * __twofish_enc_blk_3way: encrypt three 16-byte blocks (48 bytes) read
 * through RIO (src) and write the result to dst, either overwriting dst or
 * XORing into it.
 *
 * %rbx, %rbp and %r12..%r15 are pushed on entry and popped before each ret
 * because they serve as RAB1, RX0, RX2 and RY0..RY2.  dst (%rsi) and the
 * xor flag (%rcx) are pushed as well, since those registers are reused as
 * RT1 and RAB2 during the rounds.  After the eight encrypt_cycle3
 * expansions (16 rounds) dst is popped into RIO and the flag into %rbp,
 * which is no longer needed as RX0 at that point.  A non-zero low byte of
 * the flag selects the XOR output path at .L__enc_xor3.
 */
d3f5188d | 219 | ENTRY(__twofish_enc_blk_3way) |
8280daad JK |
220 | /* input: |
221 | * %rdi: ctx, CTX | |
222 | * %rsi: dst | |
223 | * %rdx: src, RIO | |
224 | * %rcx: bool, if true: xor output | |
225 | */ | |
226 | pushq %r15; | |
227 | pushq %r14; | |
228 | pushq %r13; | |
229 | pushq %r12; | |
230 | pushq %rbp; | |
231 | pushq %rbx; | |
232 | ||
233 | pushq %rcx; /* bool xor */ | |
234 | pushq %rsi; /* dst */ | |
235 | ||
236 | inpack_enc3(); | |
237 | ||
238 | encrypt_cycle3(RAB, RCD, 0); | |
239 | encrypt_cycle3(RAB, RCD, 1); | |
240 | encrypt_cycle3(RAB, RCD, 2); | |
241 | encrypt_cycle3(RAB, RCD, 3); | |
242 | encrypt_cycle3(RAB, RCD, 4); | |
243 | encrypt_cycle3(RAB, RCD, 5); | |
244 | encrypt_cycle3(RAB, RCD, 6); | |
245 | encrypt_cycle3(RAB, RCD, 7); | |
246 | ||
/* Pops mirror the two pushes above in reverse order: dst first, then flag. */
247 | popq RIO; /* dst */ | |
248 | popq %rbp; /* bool xor */ | |
249 | ||
250 | testb %bpl, %bpl; | |
d3f5188d | 251 | jnz .L__enc_xor3; |
8280daad JK |
252 | 
253 | outunpack_enc3(mov); | |
254 | ||
255 | popq %rbx; | |
256 | popq %rbp; | |
257 | popq %r12; | |
258 | popq %r13; | |
259 | popq %r14; | |
260 | popq %r15; | |
261 | ret; | |
262 | ||
/* Same epilogue as above, but the result is XORed into dst. */
d3f5188d | 263 | .L__enc_xor3: |
8280daad JK |
264 | outunpack_enc3(xor); |
265 | ||
266 | popq %rbx; | |
267 | popq %rbp; | |
268 | popq %r12; | |
269 | popq %r13; | |
270 | popq %r14; | |
271 | popq %r15; | |
272 | ret; | |
d3f5188d | 273 | ENDPROC(__twofish_enc_blk_3way) |
8280daad | 274 | |
/*
 * twofish_dec_blk_3way: decrypt three 16-byte blocks (48 bytes) read
 * through RIO (src) and store the result to dst.
 *
 * %rbx, %rbp and %r12..%r15 are pushed on entry and popped before ret
 * because they serve as RAB1, RX0, RX2 and RY0..RY2.  dst (%rsi) is pushed
 * as well, since %rsi is reused as RT1 during the rounds, and is popped
 * into RIO once the eight decrypt_cycle3 expansions (cycles 7 down to 0)
 * are done.  Output is always written with movq; there is no XOR variant.
 */
d3f5188d | 275 | ENTRY(twofish_dec_blk_3way) |
8280daad JK |
276 | /* input: |
277 | * %rdi: ctx, CTX | |
278 | * %rsi: dst | |
279 | * %rdx: src, RIO | |
280 | */ | |
281 | pushq %r15; | |
282 | pushq %r14; | |
283 | pushq %r13; | |
284 | pushq %r12; | |
285 | pushq %rbp; | |
286 | pushq %rbx; | |
287 | ||
288 | pushq %rsi; /* dst */ | |
289 | ||
290 | inpack_dec3(); | |
291 | ||
292 | decrypt_cycle3(RAB, RCD, 7); | |
293 | decrypt_cycle3(RAB, RCD, 6); | |
294 | decrypt_cycle3(RAB, RCD, 5); | |
295 | decrypt_cycle3(RAB, RCD, 4); | |
296 | decrypt_cycle3(RAB, RCD, 3); | |
297 | decrypt_cycle3(RAB, RCD, 2); | |
298 | decrypt_cycle3(RAB, RCD, 1); | |
299 | decrypt_cycle3(RAB, RCD, 0); | |
300 | ||
301 | popq RIO; /* dst */ | |
302 | ||
303 | outunpack_dec3(); | |
304 | ||
305 | popq %rbx; | |
306 | popq %rbp; | |
307 | popq %r12; | |
308 | popq %r13; | |
309 | popq %r14; | |
310 | popq %r15; | |
311 | ret; | |
d3f5188d | 312 | ENDPROC(twofish_dec_blk_3way) |