/*
The MIT License (MIT)

Copyright (c) 2016 kste

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Optimized Implementations for Haraka256 and Haraka512
*/
#include <stdio.h>   /* printf in test_implementations */
#include <stdlib.h>  /* calloc, free in test_implementations */

#include "crypto/haraka.h"

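/* The 40 128-bit round constants of Haraka v2, loaded into rc[] (expected to
   be declared in crypto/haraka.h). Haraka-256 consumes rc[0..19] (four
   constants per round), Haraka-512 consumes rc[0..39] (eight per round).
   load_constants() must run once before any of the hash functions below
   are called. */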
void load_constants() {
  rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d);
  rc[1] = _mm_set_epi32(0x8b66b4e1,0x88f3a06b,0x640f6ba4,0x2f08f717);
  rc[2] = _mm_set_epi32(0x3402de2d,0x53f28498,0xcf029d60,0x9f029114);
  rc[3] = _mm_set_epi32(0x0ed6eae6,0x2e7b4f08,0xbbf3bcaf,0xfd5b4f79);
  rc[4] = _mm_set_epi32(0xcbcfb0cb,0x4872448b,0x79eecd1c,0xbe397044);
  rc[5] = _mm_set_epi32(0x7eeacdee,0x6e9032b7,0x8d5335ed,0x2b8a057b);
  rc[6] = _mm_set_epi32(0x67c28f43,0x5e2e7cd0,0xe2412761,0xda4fef1b);
  rc[7] = _mm_set_epi32(0x2924d9b0,0xafcacc07,0x675ffde2,0x1fc70b3b);
  rc[8] = _mm_set_epi32(0xab4d63f1,0xe6867fe9,0xecdb8fca,0xb9d465ee);
  rc[9] = _mm_set_epi32(0x1c30bf84,0xd4b7cd64,0x5b2a404f,0xad037e33);
  rc[10] = _mm_set_epi32(0xb2cc0bb9,0x941723bf,0x69028b2e,0x8df69800);
  rc[11] = _mm_set_epi32(0xfa0478a6,0xde6f5572,0x4aaa9ec8,0x5c9d2d8a);
  rc[12] = _mm_set_epi32(0xdfb49f2b,0x6b772a12,0x0efa4f2e,0x29129fd4);
  rc[13] = _mm_set_epi32(0x1ea10344,0xf449a236,0x32d611ae,0xbb6a12ee);
  rc[14] = _mm_set_epi32(0xaf044988,0x4b050084,0x5f9600c9,0x9ca8eca6);
  rc[15] = _mm_set_epi32(0x21025ed8,0x9d199c4f,0x78a2c7e3,0x27e593ec);
  rc[16] = _mm_set_epi32(0xbf3aaaf8,0xa759c9b7,0xb9282ecd,0x82d40173);
  rc[17] = _mm_set_epi32(0x6260700d,0x6186b017,0x37f2efd9,0x10307d6b);
  rc[18] = _mm_set_epi32(0x5aca45c2,0x21300443,0x81c29153,0xf6fc9ac6);
  rc[19] = _mm_set_epi32(0x9223973c,0x226b68bb,0x2caf92e8,0x36d1943a);
  rc[20] = _mm_set_epi32(0xd3bf9238,0x225886eb,0x6cbab958,0xe51071b4);
  rc[21] = _mm_set_epi32(0xdb863ce5,0xaef0c677,0x933dfddd,0x24e1128d);
  rc[22] = _mm_set_epi32(0xbb606268,0xffeba09c,0x83e48de3,0xcb2212b1);
  rc[23] = _mm_set_epi32(0x734bd3dc,0xe2e4d19c,0x2db91a4e,0xc72bf77d);
  rc[24] = _mm_set_epi32(0x43bb47c3,0x61301b43,0x4b1415c4,0x2cb3924e);
  rc[25] = _mm_set_epi32(0xdba775a8,0xe707eff6,0x03b231dd,0x16eb6899);
  rc[26] = _mm_set_epi32(0x6df3614b,0x3c755977,0x8e5e2302,0x7eca472c);
  rc[27] = _mm_set_epi32(0xcda75a17,0xd6de7d77,0x6d1be5b9,0xb88617f9);
  rc[28] = _mm_set_epi32(0xec6b43f0,0x6ba8e9aa,0x9d6c069d,0xa946ee5d);
  rc[29] = _mm_set_epi32(0xcb1e6950,0xf957332b,0xa2531159,0x3bf327c1);
  rc[30] = _mm_set_epi32(0x2cee0c75,0x00da619c,0xe4ed0353,0x600ed0d9);
  rc[31] = _mm_set_epi32(0xf0b1a5a1,0x96e90cab,0x80bbbabc,0x63a4a350);
  rc[32] = _mm_set_epi32(0xae3db102,0x5e962988,0xab0dde30,0x938dca39);
  rc[33] = _mm_set_epi32(0x17bb8f38,0xd554a40b,0x8814f3a8,0x2e75b442);
  rc[34] = _mm_set_epi32(0x34bb8a5b,0x5f427fd7,0xaeb6b779,0x360a16f6);
  rc[35] = _mm_set_epi32(0x26f65241,0xcbe55438,0x43ce5918,0xffbaafde);
  rc[36] = _mm_set_epi32(0x4ce99a54,0xb9f3026a,0xa2ca9cf7,0x839ec978);
  rc[37] = _mm_set_epi32(0xae51a51a,0x1bdff7be,0x40c06e28,0x22901235);
  rc[38] = _mm_set_epi32(0xa0c1613c,0xba7ed22b,0xc173bc0f,0x48a659cf);
  rc[39] = _mm_set_epi32(0x756acc03,0x02288288,0x4ad6bdfd,0xe9c59da1);
}

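/* Known-answer test: fills all eight input lanes with the same byte
   sequence and compares lane 0 of each 8x output against the digests
   published for Haraka v2 (input bytes 0x00..0x1f for Haraka-256,
   0x00..0x3f for Haraka-512). */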
void test_implementations() {
  unsigned char *in = (unsigned char *)calloc(64*8, sizeof(unsigned char));
  unsigned char *out256 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  unsigned char *out512 = (unsigned char *)calloc(32*8, sizeof(unsigned char));
  unsigned char testvector256[32] = {0x80, 0x27, 0xcc, 0xb8, 0x79, 0x49, 0x77, 0x4b,
                                     0x78, 0xd0, 0x54, 0x5f, 0xb7, 0x2b, 0xf7, 0x0c,
                                     0x69, 0x5c, 0x2a, 0x09, 0x23, 0xcb, 0xd4, 0x7b,
                                     0xba, 0x11, 0x59, 0xef, 0xbf, 0x2b, 0x2c, 0x1c};

  unsigned char testvector512[32] = {0xbe, 0x7f, 0x72, 0x3b, 0x4e, 0x80, 0xa9, 0x98,
                                     0x13, 0xb2, 0x92, 0x28, 0x7f, 0x30, 0x6f, 0x62,
                                     0x5a, 0x6d, 0x57, 0x33, 0x1c, 0xae, 0x5f, 0x34,
                                     0xdd, 0x92, 0x77, 0xb0, 0x94, 0x5b, 0xe2, 0xaa};
  int i;

  // Input for testvector: each 64-byte lane holds the bytes 0x00..0x3f
  for(i = 0; i < 512; i++) {
    in[i] = i % 64;
  }

  haraka256_8x(out256, in);
  haraka512_8x(out512, in);

  // All lanes received identical input, so checking the first 32-byte
  // digest of each output against its test vector suffices.
  for(i = 0; i < 32; i++) {
    if (out256[i] != testvector256[i] || out512[i] != testvector512[i]) {
      printf("Error: testvector incorrect.\n");
      break;
    }
  }

  free(in);
  free(out256);
  free(out512);
}

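/* Haraka-256: maps a 32-byte input to a 32-byte output. Each of the five
   rounds applies two AES rounds to both 128-bit state words (AES2) followed
   by the MIX2 permutation; the final XOR with the input is a feed-forward
   in the Davies-Meyer style that makes the permutation one-way. */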
void haraka256(unsigned char *out, const unsigned char *in) {
  __m128i s[2], tmp;

  s[0] = LOAD(in);
  s[1] = LOAD(in + 16);

  AES2(s[0], s[1], 0);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 4);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 8);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 12);
  MIX2(s[0], s[1]);

  AES2(s[0], s[1], 16);
  MIX2(s[0], s[1]);

  s[0] = _mm_xor_si128(s[0], LOAD(in));
  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));

  STORE(out, s[0]);
  STORE(out + 16, s[1]);
}

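/* Four-way Haraka-256: hashes four independent 32-byte inputs (128 bytes
   in, 128 bytes out) in one pass, interleaving the AES rounds of all four
   instances to keep the AES-NI pipeline busy. */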
void haraka256_4x(unsigned char *out, const unsigned char *in) {
  __m128i s[4][2], tmp;

  s[0][0] = LOAD(in);
  s[0][1] = LOAD(in + 16);
  s[1][0] = LOAD(in + 32);
  s[1][1] = LOAD(in + 48);
  s[2][0] = LOAD(in + 64);
  s[2][1] = LOAD(in + 80);
  s[3][0] = LOAD(in + 96);
  s[3][1] = LOAD(in + 112);

  AES2_4x(s[0], s[1], s[2], s[3], 0);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  AES2_4x(s[0], s[1], s[2], s[3], 4);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  AES2_4x(s[0], s[1], s[2], s[3], 8);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  AES2_4x(s[0], s[1], s[2], s[3], 12);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  AES2_4x(s[0], s[1], s[2], s[3], 16);

  MIX2(s[0][0], s[0][1]);
  MIX2(s[1][0], s[1][1]);
  MIX2(s[2][0], s[2][1]);
  MIX2(s[3][0], s[3][1]);

  s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
  s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
  s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
  s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
  s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
  s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));

  STORE(out, s[0][0]);
  STORE(out + 16, s[0][1]);
  STORE(out + 32, s[1][0]);
  STORE(out + 48, s[1][1]);
  STORE(out + 64, s[2][0]);
  STORE(out + 80, s[2][1]);
  STORE(out + 96, s[3][0]);
  STORE(out + 112, s[3][1]);
}

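/* Eight-way Haraka-256, dispatched as two four-way calls; the fully
   interleaved eight-way version is kept below, commented out. */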
void haraka256_8x(unsigned char *out, const unsigned char *in) {
  // This is faster on Skylake; the code below is faster on Haswell.
  haraka256_4x(out, in);
  haraka256_4x(out + 128, in + 128);

  // __m128i s[8][2], tmp;
  //
  // s[0][0] = LOAD(in);
  // s[0][1] = LOAD(in + 16);
  // s[1][0] = LOAD(in + 32);
  // s[1][1] = LOAD(in + 48);
  // s[2][0] = LOAD(in + 64);
  // s[2][1] = LOAD(in + 80);
  // s[3][0] = LOAD(in + 96);
  // s[3][1] = LOAD(in + 112);
  // s[4][0] = LOAD(in + 128);
  // s[4][1] = LOAD(in + 144);
  // s[5][0] = LOAD(in + 160);
  // s[5][1] = LOAD(in + 176);
  // s[6][0] = LOAD(in + 192);
  // s[6][1] = LOAD(in + 208);
  // s[7][0] = LOAD(in + 224);
  // s[7][1] = LOAD(in + 240);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 4);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 12);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // AES2_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16);
  //
  // MIX2(s[0][0], s[0][1]);
  // MIX2(s[1][0], s[1][1]);
  // MIX2(s[2][0], s[2][1]);
  // MIX2(s[3][0], s[3][1]);
  // MIX2(s[4][0], s[4][1]);
  // MIX2(s[5][0], s[5][1]);
  // MIX2(s[6][0], s[6][1]);
  // MIX2(s[7][0], s[7][1]);
  //
  // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
  // s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
  // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
  // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
  // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
  // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));
  // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 128));
  // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 144));
  // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 160));
  // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 176));
  // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 192));
  // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 208));
  // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 224));
  // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 240));
  //
  // STORE(out, s[0][0]);
  // STORE(out + 16, s[0][1]);
  // STORE(out + 32, s[1][0]);
  // STORE(out + 48, s[1][1]);
  // STORE(out + 64, s[2][0]);
  // STORE(out + 80, s[2][1]);
  // STORE(out + 96, s[3][0]);
  // STORE(out + 112, s[3][1]);
  // STORE(out + 128, s[4][0]);
  // STORE(out + 144, s[4][1]);
  // STORE(out + 160, s[5][0]);
  // STORE(out + 176, s[5][1]);
  // STORE(out + 192, s[6][0]);
  // STORE(out + 208, s[6][1]);
  // STORE(out + 224, s[7][0]);
  // STORE(out + 240, s[7][1]);
}

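/* Haraka-512: compresses a 64-byte input to a 32-byte output. Five rounds
   of AES4 (two AES rounds on each of the four 128-bit state words) and the
   MIX4 permutation, a feed-forward XOR of the input, then TRUNCSTORE writes
   the 256-bit truncation of the 512-bit state. */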
void haraka512(unsigned char *out, const unsigned char *in) {
  u128 s[4], tmp;

  s[0] = LOAD(in);
  s[1] = LOAD(in + 16);
  s[2] = LOAD(in + 32);
  s[3] = LOAD(in + 48);

  AES4(s[0], s[1], s[2], s[3], 0);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 8);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 16);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 24);
  MIX4(s[0], s[1], s[2], s[3]);

  AES4(s[0], s[1], s[2], s[3], 32);
  MIX4(s[0], s[1], s[2], s[3]);

  s[0] = _mm_xor_si128(s[0], LOAD(in));
  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
  s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
  s[3] = _mm_xor_si128(s[3], LOAD(in + 48));

  TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
}

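/* Four-way Haraka-512: four independent 64-byte inputs (256 bytes in,
   128 bytes out), with the AES rounds of the four instances interleaved. */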
void haraka512_4x(unsigned char *out, const unsigned char *in) {
  u128 s[4][4], tmp;

  s[0][0] = LOAD(in);
  s[0][1] = LOAD(in + 16);
  s[0][2] = LOAD(in + 32);
  s[0][3] = LOAD(in + 48);
  s[1][0] = LOAD(in + 64);
  s[1][1] = LOAD(in + 80);
  s[1][2] = LOAD(in + 96);
  s[1][3] = LOAD(in + 112);
  s[2][0] = LOAD(in + 128);
  s[2][1] = LOAD(in + 144);
  s[2][2] = LOAD(in + 160);
  s[2][3] = LOAD(in + 176);
  s[3][0] = LOAD(in + 192);
  s[3][1] = LOAD(in + 208);
  s[3][2] = LOAD(in + 224);
  s[3][3] = LOAD(in + 240);

  AES4_4x(s[0], s[1], s[2], s[3], 0);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  AES4_4x(s[0], s[1], s[2], s[3], 8);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  AES4_4x(s[0], s[1], s[2], s[3], 16);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  AES4_4x(s[0], s[1], s[2], s[3], 24);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  AES4_4x(s[0], s[1], s[2], s[3], 32);
  MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);

  s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32));
  s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48));
  s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64));
  s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80));
  s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96));
  s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112));
  s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128));
  s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144));
  s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160));
  s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176));
  s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192));
  s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208));
  s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224));
  s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240));

  TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
  TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]);
  TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]);
  TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]);
}

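/* Eight-way Haraka-512, dispatched as two four-way calls; the fully
   interleaved eight-way version is kept below, commented out. */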
void haraka512_8x(unsigned char *out, const unsigned char *in) {
  // This is faster on Skylake; the code below is faster on Haswell.
  haraka512_4x(out, in);
  haraka512_4x(out + 128, in + 256);

  // u128 s[8][4], tmp;
  //
  // s[0][0] = LOAD(in);
  // s[0][1] = LOAD(in + 16);
  // s[0][2] = LOAD(in + 32);
  // s[0][3] = LOAD(in + 48);
  // s[1][0] = LOAD(in + 64);
  // s[1][1] = LOAD(in + 80);
  // s[1][2] = LOAD(in + 96);
  // s[1][3] = LOAD(in + 112);
  // s[2][0] = LOAD(in + 128);
  // s[2][1] = LOAD(in + 144);
  // s[2][2] = LOAD(in + 160);
  // s[2][3] = LOAD(in + 176);
  // s[3][0] = LOAD(in + 192);
  // s[3][1] = LOAD(in + 208);
  // s[3][2] = LOAD(in + 224);
  // s[3][3] = LOAD(in + 240);
  // s[4][0] = LOAD(in + 256);
  // s[4][1] = LOAD(in + 272);
  // s[4][2] = LOAD(in + 288);
  // s[4][3] = LOAD(in + 304);
  // s[5][0] = LOAD(in + 320);
  // s[5][1] = LOAD(in + 336);
  // s[5][2] = LOAD(in + 352);
  // s[5][3] = LOAD(in + 368);
  // s[6][0] = LOAD(in + 384);
  // s[6][1] = LOAD(in + 400);
  // s[6][2] = LOAD(in + 416);
  // s[6][3] = LOAD(in + 432);
  // s[7][0] = LOAD(in + 448);
  // s[7][1] = LOAD(in + 464);
  // s[7][2] = LOAD(in + 480);
  // s[7][3] = LOAD(in + 496);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 24);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // AES4_8x(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 32);
  // MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  // MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  // MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  // MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  // MIX4(s[4][0], s[4][1], s[4][2], s[4][3]);
  // MIX4(s[5][0], s[5][1], s[5][2], s[5][3]);
  // MIX4(s[6][0], s[6][1], s[6][2], s[6][3]);
  // MIX4(s[7][0], s[7][1], s[7][2], s[7][3]);
  //
  // s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  // s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  // s[0][2] = _mm_xor_si128(s[0][2], LOAD(in + 32));
  // s[0][3] = _mm_xor_si128(s[0][3], LOAD(in + 48));
  // s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 64));
  // s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 80));
  // s[1][2] = _mm_xor_si128(s[1][2], LOAD(in + 96));
  // s[1][3] = _mm_xor_si128(s[1][3], LOAD(in + 112));
  // s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 128));
  // s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 144));
  // s[2][2] = _mm_xor_si128(s[2][2], LOAD(in + 160));
  // s[2][3] = _mm_xor_si128(s[2][3], LOAD(in + 176));
  // s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 192));
  // s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 208));
  // s[3][2] = _mm_xor_si128(s[3][2], LOAD(in + 224));
  // s[3][3] = _mm_xor_si128(s[3][3], LOAD(in + 240));
  // s[4][0] = _mm_xor_si128(s[4][0], LOAD(in + 256));
  // s[4][1] = _mm_xor_si128(s[4][1], LOAD(in + 272));
  // s[4][2] = _mm_xor_si128(s[4][2], LOAD(in + 288));
  // s[4][3] = _mm_xor_si128(s[4][3], LOAD(in + 304));
  // s[5][0] = _mm_xor_si128(s[5][0], LOAD(in + 320));
  // s[5][1] = _mm_xor_si128(s[5][1], LOAD(in + 336));
  // s[5][2] = _mm_xor_si128(s[5][2], LOAD(in + 352));
  // s[5][3] = _mm_xor_si128(s[5][3], LOAD(in + 368));
  // s[6][0] = _mm_xor_si128(s[6][0], LOAD(in + 384));
  // s[6][1] = _mm_xor_si128(s[6][1], LOAD(in + 400));
  // s[6][2] = _mm_xor_si128(s[6][2], LOAD(in + 416));
  // s[6][3] = _mm_xor_si128(s[6][3], LOAD(in + 432));
  // s[7][0] = _mm_xor_si128(s[7][0], LOAD(in + 448));
  // s[7][1] = _mm_xor_si128(s[7][1], LOAD(in + 464));
  // s[7][2] = _mm_xor_si128(s[7][2], LOAD(in + 480));
  // s[7][3] = _mm_xor_si128(s[7][3], LOAD(in + 496));
  //
  // TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
  // TRUNCSTORE(out + 32, s[1][0], s[1][1], s[1][2], s[1][3]);
  // TRUNCSTORE(out + 64, s[2][0], s[2][1], s[2][2], s[2][3]);
  // TRUNCSTORE(out + 96, s[3][0], s[3][1], s[3][2], s[3][3]);
  // TRUNCSTORE(out + 128, s[4][0], s[4][1], s[4][2], s[4][3]);
  // TRUNCSTORE(out + 160, s[5][0], s[5][1], s[5][2], s[5][3]);
  // TRUNCSTORE(out + 192, s[6][0], s[6][1], s[6][2], s[6][3]);
  // TRUNCSTORE(out + 224, s[7][0], s[7][1], s[7][2], s[7][3]);
}

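/* Usage sketch (illustrative, not part of the original file): hashing one
   64-byte block with Haraka-512, assuming crypto/haraka.h declares the
   functions used above. The HARAKA_EXAMPLE_MAIN guard is hypothetical;
   compile with AES-NI enabled, e.g. -maes -msse4. */
#ifdef HARAKA_EXAMPLE_MAIN
int main(void) {
  unsigned char in[64], out[32];
  int i;

  /* The published test vector input: bytes 0x00..0x3f. */
  for (i = 0; i < 64; i++) {
    in[i] = (unsigned char)i;
  }

  load_constants();   /* rc[] must be initialized before hashing */
  haraka512(out, in); /* 64-byte input -> 32-byte digest */

  for (i = 0; i < 32; i++) {
    printf("%02x", out[i]);
  }
  printf("\n");

  return 0;
}
#endif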