/*
The MIT License (MIT)

Copyright (c) 2016 kste

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/* Optimized Implementations for Haraka256 and Haraka512 */

#if defined(__arm__) || defined(__aarch64__)
#include "crypto/SSE2NEON.h"

/* AESENC replacement built on the ARMv8 Crypto Extensions: AESE computes
   AddRoundKey(zero) + SubBytes + ShiftRows and AESMC computes MixColumns,
   so XORing the round key afterwards matches the x86 AESENC semantics.
   (The tmp1/tmp2 assignments and return were reconstructed with the usual
   SSE2NEON reinterpret macros.) */
__m128i _mm_aesenc_si128 (__m128i a, __m128i RoundKey)
{
  uint8x16_t tmp1, tmp2, tmp3;
  tmp1 = vreinterpretq_u8_m128i(a);
  tmp2 = vreinterpretq_u8_m128i(RoundKey);
  tmp3 = vaesmcq_u8(vaeseq_u8(tmp1, (uint8x16_t){})) ^ tmp2;
  return vreinterpretq_m128i_u8(tmp3);
}
#endif

#include <stdio.h>
#include <stdlib.h>
#include "crypto/haraka.h"
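
/* The LOAD/STORE, TRUNCSTORE, MIX2/MIX4 and AES2/AES4 macros (plus their
   _4x/_8x and _zero forms) used below come from crypto/haraka.h. Each
   AES2/AES4 call applies two AES rounds to each of its two resp. four
   16-byte state columns, consuming four resp. eight round constants starting
   at rc[rci]; that is why the constant index advances by 4 (AES2) or 8
   (AES4) from one call to the next. */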
/* The 40 Haraka round constants; load_constants() must be called once before
   any of the unkeyed hash functions below are used. */
u128 rc[40];

void load_constants() {
  rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d);
  rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717);
  rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114);
  rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79);
  rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044);
  rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b);
  rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b);
  rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b);
  rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee);
  rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33);
  rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800);
  rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a);
  rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4);
  rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee);
  rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6);
  rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec);
  rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173);
  rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b);
  rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6);
  rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a);
  rc[20] = _mm_set_epi32(0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4);
  rc[21] = _mm_set_epi32(0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d);
  rc[22] = _mm_set_epi32(0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1);
  rc[23] = _mm_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d);
  rc[24] = _mm_set_epi32(0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e);
  rc[25] = _mm_set_epi32(0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899);
  rc[26] = _mm_set_epi32(0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c);
  rc[27] = _mm_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9);
  rc[28] = _mm_set_epi32(0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d);
  rc[29] = _mm_set_epi32(0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1);
  rc[30] = _mm_set_epi32(0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9);
  rc[31] = _mm_set_epi32(0xf0b1a5a1,0x96e90cab,0x80bbbabc,0x63a4a350);
  rc[32] = _mm_set_epi32(0xae3db102,0x5e962988,0xab0dde30,0x938dca39);
  rc[33] = _mm_set_epi32(0x17bb8f38,0xd554a40b,0x8814f3a8,0x2e75b442);
  rc[34] = _mm_set_epi32(0x34bb8a5b,0x5f427fd7,0xaeb6b779,0x360a16f6);
  rc[35] = _mm_set_epi32(0x26f65241,0xcbe55438,0x43ce5918,0xffbaafde);
  rc[36] = _mm_set_epi32(0x4ce99a54,0xb9f3026a,0xa2ca9cf7,0x839ec978);
  rc[37] = _mm_set_epi32(0xae51a51a,0x1bdff7be,0x40c06e28,0x22901235);
  rc[38] = _mm_set_epi32(0xa0c1613c,0xba7ed22b,0xc173bc0f,0x48a659cf);
  rc[39] = _mm_set_epi32(0x756acc03,0x02288288,0x4ad6bdfd,0xe9c59da1);
}
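
/* Minimal usage sketch (hypothetical caller, not part of this file):

     unsigned char digest[32];
     load_constants();           // fill rc[] once, before any hashing
     haraka512(digest, block);   // block: pointer to 64 input bytes

   Every unkeyed variant below reads the global rc[] table. */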
void test_implementations() {
  unsigned char *in = (unsigned char *)calloc(64*8, sizeof(unsigned char));
  unsigned char *out256 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  unsigned char *out512 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  unsigned char testvector256[32] = {0x80, 0x27, 0xcc, 0xb8, 0x79, 0x49, 0x77, 0x4b,
                                     0x78, 0xd0, 0x54, 0x5f, 0xb7, 0x2b, 0xf7, 0x0c,
                                     0x69, 0x5c, 0x2a, 0x09, 0x23, 0xcb, 0xd4, 0x7b,
                                     0xba, 0x11, 0x59, 0xef, 0xbf, 0x2b, 0x2c, 0x1c};

  unsigned char testvector512[32] = {0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98,
                                     0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62,
                                     0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34,
                                     0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa};
  int i;

  // Input for testvector: in[i] = i mod 256, so the first lane hashes the
  // byte sequence 0x00..0x3f used by the published Haraka v2 test vectors.
  for(i = 0; i < 512; i++) {
    in[i] = i % 256;
  }

  haraka512_8x(out512, in);

  // Only the first lane's input matches the test vector input, so only the
  // first 32 output bytes are checked.
  for(i = 0; i < 32; i++) {
    if (out512[i] != testvector512[i]) {
      printf("Error: testvector incorrect.\n");
    }
  }

  free(in);
  free(out256);
  free(out512);
}
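
/* Haraka-256: hashes one 32-byte block to a 32-byte digest. Five rounds of
   two AES layers (one AES2 call) followed by the MIX2 word shuffle, then a
   feed-forward XOR with the input block. */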
void haraka256(unsigned char *out, const unsigned char *in) {
  __m128i s[2], tmp;

  s[0] = LOAD(in);
  s[1] = LOAD(in + 16);

  AES2(s[0], s[1], 0);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 4);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 8);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 12);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 16);
  MIX2(s[0], s[1]);

  // Feed forward
  s[0] = _mm_xor_si128(s[0], LOAD(in));
  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));

  STORE(out, s[0]);
  STORE(out + 16, s[1]);
}
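
/* Same round structure as haraka256, but the caller supplies the 40 round
   constants; the AES2 macro picks up the rc parameter instead of the global
   table. */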
void haraka256_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) {
  __m128i s[2], tmp;

  s[0] = LOAD(in);
  s[1] = LOAD(in + 16);

  AES2(s[0], s[1], 0);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 4);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 8);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 12);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 16);
  MIX2(s[0], s[1]);

  // Feed forward
  s[0] = _mm_xor_si128(s[0], LOAD(in));
  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));

  STORE(out, s[0]);
  STORE(out + 16, s[1]);
}
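
/* Four-way Haraka-256: four independent 32-byte inputs (128 bytes in), four
   32-byte digests (128 bytes out). Interleaving four states helps hide the
   latency of the aesenc instructions. */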
void haraka256_4x(unsigned char *out, const unsigned char *in) {
  __m128i s[4][2], tmp;

  s[0][0] = LOAD(in);
  s[0][1] = LOAD(in + 16);
  s[1][0] = LOAD(in + 32);
  s[1][1] = LOAD(in + 48);
  s[2][0] = LOAD(in + 64);
  s[2][1] = LOAD(in + 80);
  s[3][0] = LOAD(in + 96);
  s[3][1] = LOAD(in + 112);

  // Round 1
  AES2_4x(s[0], s[1], s[2], s[3], 0);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  // Round 2
  AES2_4x(s[0], s[1], s[2], s[3], 4);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  // Round 3
  AES2_4x(s[0], s[1], s[2], s[3], 8);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  // Round 4
  AES2_4x(s[0], s[1], s[2], s[3], 12);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  // Round 5
  AES2_4x(s[0], s[1], s[2], s[3], 16);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  // Feed forward
  s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
  s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
  s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
  s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
  s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
  s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));

  STORE(out, s[0][0]);
  STORE(out + 16, s[0][1]);
  STORE(out + 32, s[1][0]);
  STORE(out + 48, s[1][1]);
  STORE(out + 64, s[2][0]);
  STORE(out + 80, s[2][1]);
  STORE(out + 96, s[3][0]);
  STORE(out + 112, s[3][1]);
}

void haraka256_8x(unsigned char *out, const unsigned char *in) {
  // Two haraka256_4x calls are faster on Skylake; the 8-way version kept in
  // comments below is faster on Haswell.
  haraka256_4x(out, in);
  haraka256_4x(out + 128, in + 128);

  // __m128i s[8][2], tmp;
  //
  // s[0][0] = LOAD(in);
  // s[0][1] = LOAD(in + 16);
  // s[1][0] = LOAD(in + 32);
  // s[1][1] = LOAD(in + 48);
  // s[2][0] = LOAD(in + 64);
  // s[2][1] = LOAD(in + 80);
  // s[3][0] = LOAD(in + 96);
  // s[3][1] = LOAD(in + 112);
  // s[4][0] = LOAD(in + 128);
  // s[4][1] = LOAD(in + 144);
  // s[5][0] = LOAD(in + 160);
  // s[5][1] = LOAD(in + 176);
  // s[6][0] = LOAD(in + 192);
  // s[6][1] = LOAD(in + 208);
  // s[7][0] = LOAD(in + 224);
  // s[7][1] = LOAD(in + 240);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 4);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 12);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
  // s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
  // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
  // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
  // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
  // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));
  // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 128));
  // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 144));
  // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 160));
  // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 176));
  // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 192));
  // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 208));
  // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 224));
  // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 240));
  //
  // STORE(out, s[0][0]);
  // STORE(out + 16, s[0][1]);
  // STORE(out + 32, s[1][0]);
  // STORE(out + 48, s[1][1]);
  // STORE(out + 64, s[2][0]);
  // STORE(out + 80, s[2][1]);
  // STORE(out + 96, s[3][0]);
  // STORE(out + 112, s[3][1]);
  // STORE(out + 128, s[4][0]);
  // STORE(out + 144, s[4][1]);
  // STORE(out + 160, s[5][0]);
  // STORE(out + 176, s[5][1]);
  // STORE(out + 192, s[6][0]);
  // STORE(out + 208, s[6][1]);
  // STORE(out + 224, s[7][0]);
  // STORE(out + 240, s[7][1]);
}
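
/* Haraka-512: hashes one 64-byte block to a 32-byte digest. Same five-round
   pattern over four 16-byte state words (AES4/MIX4); after the feed-forward
   XOR, TRUNCSTORE keeps half of each state word to form the 32-byte output. */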
void haraka512(unsigned char *out, const unsigned char *in) {
  u128 s[4], tmp;

  s[0] = LOAD(in);
  s[1] = LOAD(in + 16);
  s[2] = LOAD(in + 32);
  s[3] = LOAD(in + 48);

  AES4(s[0], s[1], s[2], s[3], 0);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 8);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 16);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 24);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 32);
  MIX4(s[0], s[1], s[2], s[3]);

  // Feed forward
  s[0] = _mm_xor_si128(s[0], LOAD(in));
  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
  s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
  s[3] = _mm_xor_si128(s[3], LOAD(in + 48));

  TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
}
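
/* Identical structure to haraka512, but uses the AES4_zero macro in place of
   AES4 (round constants replaced by zero, as the name suggests). */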
void haraka512_zero(unsigned char *out, const unsigned char *in) {
  u128 s[4], tmp;

  s[0] = LOAD(in);
  s[1] = LOAD(in + 16);
  s[2] = LOAD(in + 32);
  s[3] = LOAD(in + 48);

  AES4_zero(s[0], s[1], s[2], s[3], 0);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4_zero(s[0], s[1], s[2], s[3], 8);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4_zero(s[0], s[1], s[2], s[3], 16);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4_zero(s[0], s[1], s[2], s[3], 24);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4_zero(s[0], s[1], s[2], s[3], 32);
  MIX4(s[0], s[1], s[2], s[3]);

  // Feed forward
  s[0] = _mm_xor_si128(s[0], LOAD(in));
  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
  s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
  s[3] = _mm_xor_si128(s[3], LOAD(in + 48));

  TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
}

void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) {
  u128 s[4], tmp;

  s[0] = LOAD(in);
  s[1] = LOAD(in + 16);
  s[2] = LOAD(in + 32);
  s[3] = LOAD(in + 48);

  AES4(s[0], s[1], s[2], s[3], 0);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 8);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 16);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 24);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 32);
  MIX4(s[0], s[1], s[2], s[3]);

  // Feed forward
  s[0] = _mm_xor_si128(s[0], LOAD(in));
  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
  s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
  s[3] = _mm_xor_si128(s[3], LOAD(in + 48));

  TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
}
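
/* Four-way Haraka-512: four independent 64-byte inputs (256 bytes in), four
   32-byte digests (128 bytes out). */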
void haraka512_4x(unsigned char *out, const unsigned char *in) {
  u128 s[4][4], tmp;

  s[0][0] = LOAD(in);
  s[0][1] = LOAD(in + 16);
  s[0][2] = LOAD(in + 32);
  s[0][3] = LOAD(in + 48);
  s[1][0] = LOAD(in + 64);
  s[1][1] = LOAD(in + 80);
  s[1][2] = LOAD(in + 96);
  s[1][3] = LOAD(in + 112);
  s[2][0] = LOAD(in + 128);
  s[2][1] = LOAD(in + 144);
  s[2][2] = LOAD(in + 160);
  s[2][3] = LOAD(in + 176);
  s[3][0] = LOAD(in + 192);
  s[3][1] = LOAD(in + 208);
  s[3][2] = LOAD(in + 224);
  s[3][3] = LOAD(in + 240);

  AES4_4x(s[0], s[1], s[2], s[3], 0);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  AES4_4x(s[0], s[1], s[2], s[3], 8);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  AES4_4x(s[0], s[1], s[2], s[3], 16);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  AES4_4x(s[0], s[1], s[2], s[3], 24);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  AES4_4x(s[0], s[1], s[2], s[3], 32);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  // Feed forward
  s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32));
  s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48));
  s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64));
  s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80));
  s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96));
  s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112));
  s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128));
  s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144));
  s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160));
  s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176));
  s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192));
  s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208));
  s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224));
  s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240));

  TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
  TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]);
  TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]);
  TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]);
}

void haraka512_8x(unsigned char *out, const unsigned char *in) {
  // Two haraka512_4x calls are faster on Skylake; the 8-way version kept in
  // comments below is faster on Haswell.
  haraka512_4x(out, in);
  haraka512_4x(out + 128, in + 256);

  // u128 s[8][4], tmp;
  //
  // s[0][0] = LOAD(in);
  // s[0][1] = LOAD(in + 16);
  // s[0][2] = LOAD(in + 32);
  // s[0][3] = LOAD(in + 48);
  // s[1][0] = LOAD(in + 64);
  // s[1][1] = LOAD(in + 80);
  // s[1][2] = LOAD(in + 96);
  // s[1][3] = LOAD(in + 112);
  // s[2][0] = LOAD(in + 128);
  // s[2][1] = LOAD(in + 144);
  // s[2][2] = LOAD(in + 160);
  // s[2][3] = LOAD(in + 176);
  // s[3][0] = LOAD(in + 192);
  // s[3][1] = LOAD(in + 208);
  // s[3][2] = LOAD(in + 224);
  // s[3][3] = LOAD(in + 240);
  // s[4][0] = LOAD(in + 256);
  // s[4][1] = LOAD(in + 272);
  // s[4][2] = LOAD(in + 288);
  // s[4][3] = LOAD(in + 304);
  // s[5][0] = LOAD(in + 320);
  // s[5][1] = LOAD(in + 336);
  // s[5][2] = LOAD(in + 352);
  // s[5][3] = LOAD(in + 368);
  // s[6][0] = LOAD(in + 384);
  // s[6][1] = LOAD(in + 400);
  // s[6][2] = LOAD(in + 416);
  // s[6][3] = LOAD(in + 432);
  // s[7][0] = LOAD(in + 448);
  // s[7][1] = LOAD(in + 464);
  // s[7][2] = LOAD(in + 480);
  // s[7][3] = LOAD(in + 496);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 24);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 32);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  // s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32));
  // s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48));
  // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64));
  // s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80));
  // s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96));
  // s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112));
  // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128));
  // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144));
  // s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160));
  // s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176));
  // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192));
  // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208));
  // s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224));
  // s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240));
  // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 256));
  // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 272));
  // s[4][2] = _mm_xor_si128(s[4][2], LOAD(in + 288));
  // s[4][3] = _mm_xor_si128(s[4][3], LOAD(in + 304));
  // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 320));
  // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 336));
  // s[5][2] = _mm_xor_si128(s[5][2], LOAD(in + 352));
  // s[5][3] = _mm_xor_si128(s[5][3], LOAD(in + 368));
  // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 384));
  // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 400));
  // s[6][2] = _mm_xor_si128(s[6][2], LOAD(in + 416));
  // s[6][3] = _mm_xor_si128(s[6][3], LOAD(in + 432));
  // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 448));
  // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 464));
  // s[7][2] = _mm_xor_si128(s[7][2], LOAD(in + 480));
  // s[7][3] = _mm_xor_si128(s[7][3], LOAD(in + 496));
  //
  // TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
  // TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]);
  // TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]);
  // TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]);
  // TRUNCSTORE(out + 128, s[4][0], s[4][1], s[4][2], s[4][3]);
  // TRUNCSTORE(out + 160, s[5][0], s[5][1], s[5][2], s[5][3]);
  // TRUNCSTORE(out + 192, s[6][0], s[6][1], s[6][2], s[6][3]);
  // TRUNCSTORE(out + 224, s[7][0], s[7][1], s[7][2], s[7][3]);
}