2 * This uses veriations of the clhash algorithm for Verus Coin, licensed
3 * with the Apache-2.0 open source license.
5 * Copyright (c) 2018 Michael Toutonghi
6 * Distributed under the Apache 2.0 software license, available in the original form for clhash
7 * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a
9 * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser
10 * Faster 64-bit universal hashing
11 * using carry-less multiplications, Journal of Cryptographic Engineering (to appear)
13 * Best used on recent x64 processors (Haswell or better).
15 * This implements an intermediate step in the last part of a Verus block hash. The intent of this step
16 * is to more effectively equalize FPGAs over GPUs and CPUs.
21 #include "verus_hash.h"
25 #include <x86intrin.h>
28 #define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
31 void clmul64(uint64_t a, uint64_t b, uint64_t* r)
33 uint8_t s = 4,i; //window size
34 uint64_t two_s = 1 << s; //2^s
35 uint64_t smask = two_s-1; //s 1 bits
42 for(i = 2 ; i < two_s; i += 2){
43 u[i] = u[i >> 1] << 1; //even indices: left shift
44 u[i + 1] = u[i] ^ b; //odd indices: xor b
47 r[0] = u[a & smask]; //first window only affects lower word
49 for(i = s ; i < 64 ; i += s){
50 tmp = u[a >> i & smask];
52 r[1] ^= tmp >> (64 - i);
55 uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110
56 for(i = 1 ; i < s ; i++){
58 m &= m << 1; //shift mask to exclude all bit j': j' mod s = i
59 ifmask = -((b >> (64-i)) & 1); //if the (64-i)th bit of b is 1
60 r[1] ^= (tmp & ifmask);
64 static u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm)
67 clmul64(*(uint64_t*)&a + (imm & 1), *(uint64_t*)&a + ((imm & 0x10) > 4), result);
68 return *(__m128i *)result;
71 static u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b)
74 uint16_t *a = (uint16_t*)&_a, *b = (uint16_t*)&_b;
75 for (int i = 0; i < 7; i ++)
77 result[i] = (uint16_t)((uint32_t)((((int32_t)a[i] * (int32_t)b[i]) >> 14) + 1) >> 1);
81 static inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
84 ((uint64_t *)&result)[0] = lo;
85 ((uint64_t *)&result)[1] = hi;
89 static inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
92 ((uint64_t *)&result)[0] = lo;
93 ((uint64_t *)&result)[1] = 0;
97 static inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
99 return *(int64_t *)&a;
102 static inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
104 return *(int32_t *)&a;
107 static inline u128 _mm_cvtsi32_si128_emu(uint32_t lo)
110 ((uint32_t *)&result)[0] = lo;
111 ((uint32_t *)&result)[1] = 0;
112 ((uint64_t *)&result)[1] = 0;
116 static u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
119 ((uint8_t *)&result)[0] = c0;
120 ((uint8_t *)&result)[1] = c1;
121 ((uint8_t *)&result)[2] = c2;
122 ((uint8_t *)&result)[3] = c3;
123 ((uint8_t *)&result)[4] = c4;
124 ((uint8_t *)&result)[5] = c5;
125 ((uint8_t *)&result)[6] = c6;
126 ((uint8_t *)&result)[7] = c7;
127 ((uint8_t *)&result)[8] = c8;
128 ((uint8_t *)&result)[9] = c9;
129 ((uint8_t *)&result)[10] = c10;
130 ((uint8_t *)&result)[11] = c11;
131 ((uint8_t *)&result)[12] = c12;
132 ((uint8_t *)&result)[13] = c13;
133 ((uint8_t *)&result)[14] = c14;
134 ((uint8_t *)&result)[15] = c15;
138 static inline __m128i _mm_srli_si128_emu(__m128i a, int imm8)
140 uint8_t shift = imm8 & 0xff;
141 shift = shift > 15 ? 128 : shift << 3;
145 inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b)
150 inline __m128i _mm_load_si128_emu(const void *p)
152 return *(__m128i *)p;
155 inline __m128i _mm_store_si128_emu(void *p, __m128i val)
160 __m128i _mm_shuffle_epi8_emu(__m128i a, __m128i b)
163 for (int i = 0; i < 16; i++)
165 if (((uint8_t *)&b)[i] & 0x80)
167 ((uint8_t *)&result)[i] = ((uint8_t *)&a)[((uint8_t *)&b)[i] & 0xf];
171 ((uint8_t *)&result)[i] = 0;
178 static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) {
179 const __m128i lengthvector = _mm_set_epi64x_emu(keylength,length);
180 const __m128i clprod1 = _mm_clmulepi64_si128_emu( lengthvector, lengthvector, 0x10);
184 // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
185 static inline __m128i precompReduction64_si128_port( __m128i A) {
187 //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
188 const __m128i C = _mm_cvtsi64_si128_emu((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
189 __m128i Q2 = _mm_clmulepi64_si128_emu( A, C, 0x01);
190 __m128i Q3 = _mm_shuffle_epi8_emu(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
191 _mm_srli_si128_emu(Q2,8));
192 __m128i Q4 = _mm_xor_si128_emu(Q2,A);
193 const __m128i final = _mm_xor_si128_emu(Q3,Q4);
194 return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE
197 static inline uint64_t precompReduction64_port( __m128i A) {
198 return _mm_cvtsi128_si64(precompReduction64_si128_port(A));
201 // verus intermediate hash extra
202 static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
206 // divide key mask by 16 from bytes to __m128i
209 // the random buffer must have at least 32 16 byte dwords after the keymask to work with this
210 // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
211 // be used to xor into the accumulator before it is hashed with other values first
212 __m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2));
214 for (int64_t i = 0; i < 32; i++)
216 const uint64_t selector = _mm_cvtsi128_si64_emu(acc);
218 // get two random locations in the key, which will be mutated and swapped
219 __m128i *prand = randomsource + ((selector >> 5) & keyMask);
220 __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
222 // select random start and order of pbuf processing
223 pbuf = buf + (selector & 3);
225 switch (selector & 0x1c)
229 const __m128i temp1 = _mm_load_si128_emu(prandex);
230 const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
231 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
232 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
233 acc = _mm_xor_si128_emu(clprod1, acc);
235 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
236 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
238 const __m128i temp12 = _mm_load_si128_emu(prand);
239 _mm_store_si128_emu(prand, tempa2);
241 const __m128i temp22 = _mm_load_si128_emu(pbuf);
242 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
243 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
244 acc = _mm_xor_si128_emu(clprod12, acc);
246 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
247 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
248 _mm_store_si128_emu(prandex, tempb2);
253 const __m128i temp1 = _mm_load_si128_emu(prand);
254 const __m128i temp2 = _mm_load_si128_emu(pbuf);
255 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
256 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
257 acc = _mm_xor_si128_emu(clprod1, acc);
258 const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
259 acc = _mm_xor_si128_emu(clprod2, acc);
261 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
262 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
264 const __m128i temp12 = _mm_load_si128_emu(prandex);
265 _mm_store_si128_emu(prandex, tempa2);
267 const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
268 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
269 acc = _mm_xor_si128_emu(add12, acc);
271 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
272 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
273 _mm_store_si128_emu(prand, tempb2);
278 const __m128i temp1 = _mm_load_si128_emu(prandex);
279 const __m128i temp2 = _mm_load_si128_emu(pbuf);
280 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
281 acc = _mm_xor_si128_emu(add1, acc);
283 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
284 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
286 const __m128i temp12 = _mm_load_si128_emu(prand);
287 _mm_store_si128_emu(prand, tempa2);
289 const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
290 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
291 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
292 acc = _mm_xor_si128_emu(clprod12, acc);
293 const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
294 acc = _mm_xor_si128_emu(clprod22, acc);
296 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
297 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
298 _mm_store_si128_emu(prandex, tempb2);
303 const __m128i temp1 = _mm_load_si128_emu(prand);
304 const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
305 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
307 // cannot be zero here
308 const int32_t divisor = (uint32_t)selector;
310 acc = _mm_xor_si128(add1, acc);
312 const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
313 const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
314 acc = _mm_xor_si128_emu(modulo, acc);
316 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
317 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
321 const __m128i temp12 = _mm_load_si128_emu(prandex);
322 _mm_store_si128_emu(prandex, tempa2);
324 const __m128i temp22 = _mm_load_si128_emu(pbuf);
325 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
326 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
327 acc = _mm_xor_si128_emu(clprod12, acc);
328 const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
329 acc = _mm_xor_si128_emu(clprod22, acc);
331 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
332 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
333 _mm_store_si128_emu(prand, tempb2);
337 const __m128i tempb3 = _mm_load_si128_emu(prandex);
338 _mm_store_si128_emu(prandex, tempa2);
339 _mm_store_si128_emu(prand, tempb3);
345 // a few AES operations
346 const __m128i *rc = prand;
349 __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
350 __m128i temp2 = _mm_load_si128_emu(pbuf);
352 AES2_EMU(temp1, temp2, 0);
353 MIX2_EMU(temp1, temp2);
355 AES2_EMU(temp1, temp2, 4);
356 MIX2_EMU(temp1, temp2);
358 AES2_EMU(temp1, temp2, 8);
359 MIX2_EMU(temp1, temp2);
361 acc = _mm_xor_si128_emu(temp1, acc);
362 acc = _mm_xor_si128_emu(temp2, acc);
364 const __m128i tempa1 = _mm_load_si128_emu(prand);
365 const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
366 const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
368 const __m128i tempa4 = _mm_load_si128_emu(prandex);
369 _mm_store_si128_emu(prandex, tempa3);
370 _mm_store_si128_emu(prand, tempa4);
375 // we'll just call this one the monkins loop, inspired by Chris
376 const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
377 __m128i tmp; // used by MIX2
379 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
381 uint64_t aesround = 0;
386 if (selector & (0x10000000 << rounds))
388 onekey = _mm_load_si128_emu(rc++);
389 const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
390 const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
391 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
392 acc = _mm_xor_si128_emu(clprod1, acc);
396 onekey = _mm_load_si128_emu(rc++);
397 __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
398 AES2_EMU(onekey, temp2, aesround++ << 2);
399 MIX2_EMU(onekey, temp2);
400 acc = _mm_xor_si128_emu(onekey, acc);
401 acc = _mm_xor_si128_emu(temp2, acc);
405 const __m128i tempa1 = _mm_load_si128_emu(prand);
406 const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
407 const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
409 const __m128i tempa4 = _mm_load_si128_emu(prandex);
410 _mm_store_si128_emu(prandex, tempa3);
411 _mm_store_si128_emu(prand, tempa4);
416 const __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
417 const __m128i temp2 = _mm_load_si128_emu(prand);
418 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
419 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
420 acc = _mm_xor_si128_emu(clprod1, acc);
422 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
423 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
425 const __m128i tempb3 = _mm_load_si128_emu(prandex);
426 _mm_store_si128_emu(prandex, tempa2);
427 _mm_store_si128_emu(prand, tempb3);
432 const __m128i temp1 = _mm_load_si128_emu(pbuf);
433 const __m128i temp2 = _mm_load_si128_emu(prandex);
434 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
435 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
436 acc = _mm_xor_si128_emu(clprod1, acc);
438 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
439 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
441 const __m128i tempa3 = _mm_load_si128_emu(prand);
442 _mm_store_si128_emu(prand, tempa2);
444 acc = _mm_xor_si128_emu(tempa3, acc);
446 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
447 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
448 _mm_store_si128_emu(prandex, tempb2);
456 // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
457 // returning a 64 bit hash value
458 uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask) {
459 const unsigned int m = 128;// we process the data in chunks of 16 cache lines
460 __m128i * rs64 = (__m128i *)random;
461 const __m128i * string = (const __m128i *) buf;
463 __m128i acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask);
464 acc = _mm_xor_si128(acc, lazyLengthHash_port(1024, 64));
465 return precompReduction64_port(acc);