/* haraka.c from the haraka-avx512.git repository (text captured from a gitweb blame view). */
/*
The MIT License (MIT)

Copyright (c) 2016 kste

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Optimized Implementations for Haraka256 and Haraka512
*/

#include <stdio.h>
#include "haraka.h"
#include <stdint.h>
30u512 rc[10];\r
31u512 rc0[10] = {0};\r
32u512 MIX_4;\r
33void load_constants() {\r
9c70fd33 34 //MIX_4 = _mm512_set_epi32(3,11,7,15,8,0,12,4,9,1,13,5,2,10,6,14); stupid thing goes the wrong way\r
35 MIX_4 = _mm512_set_epi32(14,6,10,2,5,13,1,9,4,12,0,8,15,7,11,3);\r
23e3fea5 36\r
8453c554 37 rc[0] = _mm512_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79,0x3402de2d,0x53f28498,0xcf029d60,0x9f029114,0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717,0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d);\r
d52fb4a4 38 rc[1] = _mm512_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b,0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b,0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b,0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044);\r
8453c554 39 rc[2] = _mm512_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a,0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800,0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33,0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee);\r
40 rc[3] = _mm512_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec,0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6,0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee,0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4);\r
41 rc[4] = _mm512_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a,0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6,0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b,0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173);\r
42 rc[5] = _mm512_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d,0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1,0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d,0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4);\r
43 rc[6] = _mm512_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9,0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c,0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899,0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e);\r
44 rc[7] = _mm512_set_epi32(0xf0b1a5a1,0x96e90cab,0x80bbbabc,0x63a4a350,0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9,0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1,0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d);\r
45 rc[8] = _mm512_set_epi32(0x26f65241,0xcbe55438,0x43ce5918,0xffbaafde,0x34bb8a5b,0x5f427fd7,0xaeb6b779,0x360a16f6,0x17bb8f38,0xd554a40b,0x8814f3a8,0x2e75b442,0xae3db102,0x5e962988,0xab0dde30,0x938dca39);\r
46 rc[9] = _mm512_set_epi32(0x756acc03,0x02288288,0x4ad6bdfd,0xe9c59da1,0xa0c1613c,0xba7ed22b,0xc173bc0f,0x48a659cf,0xae51a51a,0x1bdff7be,0x40c06e28,0x22901235,0x4ce99a54,0xb9f3026a,0xa2ca9cf7,0x839ec978);\r
47}//duck its endien ness // good programers make programs to make their programs i fixed it\r
48\r
575ac7c2 49\r
/*
 * Known-answer self-test for the 8-way Haraka-512 routine.
 *
 * Builds eight 64-byte input blocks, each holding the byte pattern
 * 0x00..0x3f, loads the round constants, runs haraka512_8x() and compares the
 * whole 8 * 32-byte output buffer with the expected digest. All eight inputs
 * are identical, so every 32-byte result must equal the same test vector;
 * the comparison therefore covers all eight results, not just the first.
 *
 * Returns 0 when every output byte matches, or -1 after printing an error
 * message on the first mismatch.
 */
int test_implementations(void) {
  enum { LANES = 8, BLOCK_BYTES = 64, DIGEST_BYTES = 32 };

  alignas(64) unsigned char in[BLOCK_BYTES * LANES];
  alignas(64) unsigned char out512[DIGEST_BYTES * LANES];
  static const unsigned char testvector512[DIGEST_BYTES] = {
      0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98,
      0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62,
      0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34,
      0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa};
  int i;

  /* Input for the test vector: the same 0..63 ramp in every block. */
  for (i = 0; i < BLOCK_BYTES * LANES; i++) {
    in[i] = (unsigned char)(i % BLOCK_BYTES);
  }

  load_constants();
  haraka512_8x(out512, in);

  /* Verify every output byte against the (repeating) expected digest. */
  for (i = 0; i < DIGEST_BYTES * LANES; i++) {
    if (out512[i] != testvector512[i % DIGEST_BYTES]) {
      printf("Error: testvector incorrect.\n");
      return -1;
    }
  }
  return 0;
}
/*
86void haraka256(unsigned char *out, const unsigned char *in) {\r
87 __m128i s[2], tmp;\r
88\r
89 s[0] = LOAD(in);\r
90 s[1] = LOAD(in + 16);\r
91\r
92 AES2(s[0], s[1], 0);\r
93 MIX2(s[0], s[1]);\r
94\r
95 AES2(s[0], s[1], 4);\r
96 MIX2(s[0], s[1]);\r
97\r
98 AES2(s[0], s[1], 8);\r
99 MIX2(s[0], s[1]);\r
100\r
101 AES2(s[0], s[1], 12);\r
102 MIX2(s[0], s[1]);\r
103\r
104 AES2(s[0], s[1], 16);\r
105 MIX2(s[0], s[1]);\r
106\r
107 s[0] = _mm_xor_si128(s[0], LOAD(in));\r
108 s[1] = _mm_xor_si128(s[1], LOAD(in + 16));\r
109\r
110 STORE(out, s[0]);\r
111 STORE(out + 16, s[1]);\r
112}\r
113\r
114void haraka256_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) {\r
115 __m128i s[2], tmp;\r
116\r
117 s[0] = LOAD(in);\r
118 s[1] = LOAD(in + 16);\r
119\r
120\r
121 s[0] = _mm_xor_si128(s[0], LOAD(in));\r
122 s[1] = _mm_xor_si128(s[1], LOAD(in + 16));\r
123\r
124 STORE(out, s[0]);\r
125 STORE(out + 16, s[1]);\r
126}\r
127\r
128void haraka256_4x(unsigned char *out, const unsigned char *in) {\r
129 __m128i s[4][2], tmp;\r
130\r
131 s[0][0] = LOAD(in);\r
132 s[0][1] = LOAD(in + 16);\r
133 s[1][0] = LOAD(in + 32);\r
134 s[1][1] = LOAD(in + 48);\r
135 s[2][0] = LOAD(in + 64);\r
136 s[2][1] = LOAD(in + 80);\r
137 s[3][0] = LOAD(in + 96);\r
138\r
139\r
140 MIX2(s[0][0], s[0][1]);\r
141 MIX2(s[1][0], s[1][1]);\r
142 MIX2(s[2][0], s[2][1]);\r
143 MIX2(s[3][0], s[3][1]);\r
144\r
145 // Feed Forward\r
146 s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));\r
147 s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));\r
148 s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));\r
149 s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));\r
150 s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));\r
151 s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));\r
152 s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));\r
153 s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));\r
154\r
155 STORE(out, s[0][0]);\r
156 STORE(out + 16, s[0][1]);\r
157 STORE(out + 32, s[1][0]);\r
158 STORE(out + 48, s[1][1]);\r
159 STORE(out + 64, s[2][0]);\r
160 STORE(out + 80, s[2][1]);\r
161 STORE(out + 96, s[3][0]);\r
162 STORE(out + 112, s[3][1]);\r
163}\r
164\r
165void haraka256_8x(unsigned char *out, const unsigned char *in) {\r
166 // This is faster on Skylake, the code below is faster on Haswell.\r
167 haraka256_4x(out, in);\r
168 haraka256_4x(out + 128, in + 128);\r
169 return;\r
170 // __m128i s[8][2], tmp;\r
171 //\r
172 // int i;\r
173 //\r
174 // s[0][0] = LOAD(in);\r
175 // s[0][1] = LOAD(in + 16);\r
176 // s[1][0] = LOAD(in + 32);\r
177 // s[1][1] = LOAD(in + 48);\r
178 // s[2][0] = LOAD(in + 64);\r
179 // s[2][1] = LOAD(in + 80);\r
180 // s[3][0] = LOAD(in + 96);\r
181 // s[3][1] = LOAD(in + 112);\r
182 // s[4][0] = LOAD(in + 128);\r
183 // s[4][1] = LOAD(in + 144);\r
184 // s[5][0] = LOAD(in + 160);\r
185 // s[5][1] = LOAD(in + 176);\r
186 // s[6][0] = LOAD(in + 192);\r
187 // s[6][1] = LOAD(in + 208);\r
188 // s[7][0] = LOAD(in + 224);\r
189 // s[7][1] = LOAD(in + 240);\r
190 //\r
191 // // Round 1\r
192 // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0);\r
193 //\r
194 // MIX2(s[0][0], s[0][1]);\r
195 // MIX2(s[1][0], s[1][1]);\r
196 // MIX2(s[2][0], s[2][1]);\r
197 // MIX2(s[3][0], s[3][1]);\r
198 // MIX2(s[4][0], s[4][1]);\r
199 // MIX2(s[5][0], s[5][1]);\r
200 // MIX2(s[6][0], s[6][1]);\r
201 // MIX2(s[7][0], s[7][1]);\r
202 //\r
203 //\r
204 // // Round 2\r
205 // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 4);\r
206 //\r
207 // MIX2(s[0][0], s[0][1]);\r
208 // MIX2(s[1][0], s[1][1]);\r
209 // MIX2(s[2][0], s[2][1]);\r
210 // MIX2(s[3][0], s[3][1]);\r
211 // MIX2(s[4][0], s[4][1]);\r
212 // MIX2(s[5][0], s[5][1]);\r
213 // MIX2(s[6][0], s[6][1]);\r
214 // MIX2(s[7][0], s[7][1]);\r
215 //\r
216 // // Round 3\r
217 // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8);\r
218 //\r
219 // MIX2(s[0][0], s[0][1]);\r
220 // MIX2(s[1][0], s[1][1]);\r
221 // MIX2(s[2][0], s[2][1]);\r
222 // MIX2(s[3][0], s[3][1]);\r
223 // MIX2(s[4][0], s[4][1]);\r
224 // MIX2(s[5][0], s[5][1]);\r
225 // MIX2(s[6][0], s[6][1]);\r
226 // MIX2(s[7][0], s[7][1]);\r
227 //\r
228 // // Round 4\r
229 // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 12);\r
230 //\r
231 // MIX2(s[0][0], s[0][1]);\r
232 // MIX2(s[1][0], s[1][1]);\r
233 // MIX2(s[2][0], s[2][1]);\r
234 // MIX2(s[3][0], s[3][1]);\r
235 // MIX2(s[4][0], s[4][1]);\r
236 // MIX2(s[5][0], s[5][1]);\r
237 // MIX2(s[6][0], s[6][1]);\r
238 // MIX2(s[7][0], s[7][1]);\r
239 //\r
240 // // Round 5\r
241 // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16);\r
242 //\r
243 // MIX2(s[0][0], s[0][1]);\r
244 // MIX2(s[1][0], s[1][1]);\r
245 // MIX2(s[2][0], s[2][1]);\r
246 // MIX2(s[3][0], s[3][1]);\r
247 // MIX2(s[4][0], s[4][1]);\r
248 // MIX2(s[5][0], s[5][1]);\r
249 // MIX2(s[6][0], s[6][1]);\r
250 // MIX2(s[7][0], s[7][1]);\r
251 //\r
252 // // Feed Forward\r
253 // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));\r
254 // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));\r
255 // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));\r
256 // s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));\r
257 // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));\r
258 // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));\r
259 // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));\r
260 // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));\r
261 // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 128));\r
262 // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 144));\r
263 // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 160));\r
264 // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 176));\r
265 // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 192));\r
266 // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 208));\r
267 // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 224));\r
268 // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 240));\r
269 //\r
270 // STORE(out, s[0][0]);\r
271 // STORE(out + 16, s[0][1]);\r
272 // STORE(out + 32, s[1][0]);\r
273 // STORE(out + 48, s[1][1]);\r
274 // STORE(out + 64, s[2][0]);\r
275 // STORE(out + 80, s[2][1]);\r
276 // STORE(out + 96, s[3][0]);\r
277 // STORE(out + 112, s[3][1]);\r
278 // STORE(out + 128, s[4][0]);\r
279 // STORE(out + 144, s[4][1]);\r
280 // STORE(out + 160, s[5][0]);\r
281 // STORE(out + 176, s[5][1]);\r
282 // STORE(out + 192, s[6][0]);\r
283 // STORE(out + 208, s[6][1]);\r
284 // STORE(out + 224, s[7][0]);\r
285 // STORE(out + 240, s[7][1]);\r
286}\r
*/
/*
 * Debug helper: print the first 64 bytes at `str` to stdout as hexadecimal
 * values, each followed by ", ", and finish with a newline.
 *
 * str  must point to at least 64 readable bytes; it is not modified.
 */
static void phex(const uint8_t *str)
{
  const size_t len = 64;

  for (size_t pos = 0; pos < len; ++pos) {
    printf("%hx, ", str[pos]);
  }
  printf("\n");
}
23e3fea5 297void haraka512(unsigned char *out, const unsigned char *in) {\r
298 u512 s,i;\r
299 s = LOAD (in);\r
300 i = s;\r
b44a5e9b 301\r
7a7754f3 302 AES4(s, 0);\r
80b5b832 303 MIX4(s);\r
23e3fea5 304\r
305 AES4(s, 2);\r
306 MIX4(s);\r
307\r
308 AES4(s, 4);\r
309 MIX4(s);\r
310\r
311 AES4(s, 6);\r
312 MIX4(s);\r
313\r
314 AES4(s, 8);\r
315 MIX4(s);\r
316\r
317 s = _mm512_xor_si512(s, i);\r
23e3fea5 318\r
64a515af 319 TRUNCSTORE(out, s);\r
23e3fea5 320}\r
321\r
322void haraka512_zero(unsigned char *out, const unsigned char *in) {\r
323 u512 s,i;\r
324 s = LOAD (in);\r
1df53243 325 i = s;\r
23e3fea5 326 AES4_zero(s, 0);\r
327 MIX4(s);\r
328\r
329 AES4_zero(s, 2);\r
330 MIX4(s);\r
331\r
332 AES4_zero(s, 4);\r
333 MIX4(s);\r
334\r
335 AES4_zero(s, 6);\r
336 MIX4(s);\r
337\r
338 AES4_zero(s, 8);\r
339 MIX4(s);\r
340\r
341 s = _mm512_xor_si512(s, i);\r
23e3fea5 342\r
1a2c14e5 343 TRUNCSTORE(out, s);\r
23e3fea5 344}\r
23e3fea5 345\r
346void haraka512_4x(unsigned char *out, const unsigned char *in) {\r
1df53243 347 u512 s[4],i[4];\r
23e3fea5 348\r
1df53243 349 s[0] = LOAD(in);\r
350 s[1] = LOAD(in + 64);\r
351 s[2] = LOAD(in + 128);\r
352 s[3] = LOAD(in + 192);\r
353 i[0] = s[0];\r
354 i[1] = s[1];\r
355 i[2] = s[2];\r
356 i[3] = s[3];\r
357\r
358\r
359\r
360 AES4_4x(s, 0);\r
361 MIX4(s[0]);\r
362 MIX4(s[1]);\r
363 MIX4(s[2]);\r
364 MIX4(s[3]);\r
365\r
366 AES4_4x(s, 8);\r
367 MIX4(s[0]);\r
368 MIX4(s[1]);\r
369 MIX4(s[2]);\r
370 MIX4(s[3]);\r
371\r
372 AES4_4x(s, 16);\r
373 MIX4(s[0]);\r
374 MIX4(s[1]);\r
375 MIX4(s[2]);\r
376 MIX4(s[3]);\r
377\r
378 AES4_4x(s, 24);\r
379 MIX4(s[0]);\r
380 MIX4(s[1]);\r
381 MIX4(s[2]);\r
382 MIX4(s[3]);\r
383\r
384 AES4_4x(s, 32);\r
385 MIX4(s[0]);\r
386 MIX4(s[1]);\r
387 MIX4(s[2]);\r
388 MIX4(s[3]);\r
389\r
390\r
391 s[0] = _mm512_xor_si512(s[0], i[0]);\r
392 s[1] = _mm512_xor_si512(s[1], i[1]);\r
393 s[2] = _mm512_xor_si512(s[2], i[2]);\r
394 s[3] = _mm512_xor_si512(s[3], i[3]);\r
395\r
396\r
397 TRUNCSTORE(out, s[0]);\r
398 TRUNCSTORE(out + 32, s[1]);\r
399 TRUNCSTORE(out + 64, s[2]);\r
400 TRUNCSTORE(out + 96, s[3]);\r
23e3fea5 401}\r
402\r
/*
 * Haraka-512 on eight independent blocks, implemented as two 4-way calls.
 *
 * in   eight consecutive 64-byte input blocks (512 bytes in total).
 * out  eight consecutive digests; the second group of four starts at
 *      out + 128, directly after the four 32-byte-spaced results of the first.
 *
 * The original author noted that splitting into two haraka512_4x() calls was
 * the faster choice on Skylake. A commented-out 128-bit interleaved 8-way
 * variant that used to follow was removed: it relied on four-argument
 * MIX4/TRUNCSTORE forms that do not match the one-register macros used by the
 * rest of this file.
 */
void haraka512_8x(unsigned char *out, const unsigned char *in) {
  haraka512_4x(out, in);
  haraka512_4x(out + 128, in + 256);
}
/* gitweb page footer: "This page took 0.082403 seconds and 4 git commands to generate." */