2 * This uses variations of the clhash algorithm for Verus Coin, licensed
3 * with the Apache-2.0 open source license.
5 * Copyright (c) 2018 Michael Toutonghi
6 * Distributed under the Apache 2.0 software license, available in the original form for clhash
7 * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a
9 * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser
10 * Faster 64-bit universal hashing
11 * using carry-less multiplications, Journal of Cryptographic Engineering (to appear)
13 * Best used on recent x64 processors (Haswell or better).
15 * This implements an intermediate step in the last part of a Verus block hash. The intent of this step
16 * is to more effectively equalize FPGAs over GPUs and CPUs.
21 #include "primitives/block.h"
27 #include <sys/types.h>
// Select the SIMD intrinsics header for the target architecture: the x86 intrinsics header on
// x86 targets, the SSE2NEON shim on ARM targets.
// NOTE(review): `#ifdef` accepts exactly one identifier, so `|| __X86_64__` below is not evaluated
// as a logical OR (compilers typically warn about extra tokens and test only __i386__). GCC and
// Clang also spell the 64-bit macro in lowercase (`__x86_64__`). The intended condition is
// presumably `#if defined(__i386__) || defined(__x86_64__)` - confirm and fix.
32 #ifdef __i386__ || __X86_64__
33 #include <x86intrin.h>
34 #elif defined(__arm__) || defined(__aarch64__)
35 #include "crypto/SSE2NEON.h"
39 #pragma warning (disable : 4146)
// Portable carry-less (shift/xor based) multiply of two 64-bit values using a 4-bit window table.
// a, b: the 64-bit operands.
// r:    output array; r[0] receives the low word and r[1] the high word of the product.
// NOTE(review): the declarations of u, tmp and ifmask and some statements inside the loops are
// not visible in this excerpt; the comments below describe only the lines that are shown.
43 void clmul64(uint64_t a, uint64_t b, uint64_t* r)
45 uint8_t s = 4,i; //window size
46 uint64_t two_s = 1 << s; //2^s
47 uint64_t smask = two_s-1; //s 1 bits
// build the window table: each even entry is the half-index entry shifted left by one,
// each odd entry is the preceding even entry xored with b
54 for(i = 2 ; i < two_s; i += 2){
55 u[i] = u[i >> 1] << 1; //even indices: left shift
56 u[i + 1] = u[i] ^ b; //odd indices: xor b
59 r[0] = u[a & smask]; //first window only affects lower word
// process the remaining 4-bit windows of a; the bits of the table entry that would be shifted
// out past bit 63 are folded into the high word
61 for(i = s ; i < 64 ; i += s){
62 tmp = u[a >> i & smask];
64 r[1] ^= tmp >> (64 - i);
// fix-up pass: for i = 1..s-1, tmp is xored into r[1] only when bit (64-i) of b is set
// (ifmask is all ones in that case and zero otherwise)
67 uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110
68 for(i = 1 ; i < s ; i++){
70 m &= m << 1; //shift mask to exclude all bit j': j' mod s = i
71 ifmask = -((b >> (64-i)) & 1); //if the (64-i)th bit of b is 1
72 r[1] ^= (tmp & ifmask);
// Software stand-in for _mm_clmulepi64_si128.
// a, b: 128-bit inputs; imm: bit 0 selects the 64-bit half of a, bit 4 selects the half of b.
// The two selected halves are multiplied with clmul64 into result, which is returned
// reinterpreted as a 128-bit vector.
// The lines between the multiply and the return recompute the product with the hardware intrinsic
// and print whether both agree (presumably compiled only in a verification build - the guarding
// preprocessor lines are not visible in this excerpt).
// NOTE(review): the failure printf passes uint64_t values to %lx, which only matches on
// platforms where long is 64 bits wide - confirm or switch to the <cinttypes> macros.
76 u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm)
79 clmul64(*((uint64_t*)&a + (imm & 1)), *((uint64_t*)&b + ((imm & 0x10) >> 4)), result);
83 const __m128i tmp1 = _mm_load_si128(&a);
84 const __m128i tmp2 = _mm_load_si128(&b);
// the intrinsic needs a compile-time immediate, hence the explicit dispatch on the four used values
86 const __m128i testresult = (imm == 0x10) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x10) : ((imm == 0x01) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x01) : ((imm == 0x00) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x00) : _mm_clmulepi64_si128(tmp1, tmp2, 0x11)));
87 if (!memcmp(&testresult, &result, 16))
89 printf("_mm_clmulepi64_si128_emu: Portable version passed!\n");
93 printf("_mm_clmulepi64_si128_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, imm: %x, emu: %lxh %lxl, intrin: %lxh %lxl\n",
94 *((uint64_t *)&a + 1), *(uint64_t *)&a,
95 *((uint64_t *)&b + 1), *(uint64_t *)&b,
97 *((uint64_t *)result + 1), *(uint64_t *)result,
98 *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
103 return *(__m128i *)result;
// Software stand-in for _mm_mulhrs_epi16: treats both inputs as eight signed 16-bit lanes and,
// per lane, computes (a * b + 0x4000) >> 15 in 32-bit arithmetic, truncated back to 16 bits.
// Returns the eight results reinterpreted as a 128-bit vector.
// The block after the loop compares against the hardware intrinsic and prints the outcome
// (presumably verification-build only - the guard is not visible in this excerpt).
// NOTE(review): in the failure printf, `&a` and `&b` are the addresses of the local int16_t*
// pointer variables, not of the vectors _a/_b, so the message prints pointer bits (and reads
// past the pointer variable for the "+ 1" word). It presumably should use &_a and &_b - confirm.
// NOTE(review): uint64_t values are printed with %lx, which only matches where long is 64 bits.
106 u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b)
109 int16_t *a = (int16_t*)&_a, *b = (int16_t*)&_b;
110 for (int i = 0; i < 8; i ++)
// widen to 32 bits, add the rounding constant, then keep bits 15..30 of the product
112 result[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15);
116 const __m128i testresult = _mm_mulhrs_epi16(_a, _b);
117 if (!memcmp(&testresult, &result, 16))
119 printf("_mm_mulhrs_epi16_emu: Portable version passed!\n");
123 printf("_mm_mulhrs_epi16_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, emu: %lxh %lxl, intrin: %lxh %lxl\n",
124 *((uint64_t *)&a + 1), *(uint64_t *)&a,
125 *((uint64_t *)&b + 1), *(uint64_t *)&b,
126 *((uint64_t *)result + 1), *(uint64_t *)result,
127 *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
131 return *(__m128i *)result;
// Software stand-in for _mm_set_epi64x: builds a 128-bit value whose first 64-bit word
// (index 0) is lo and whose second 64-bit word (index 1) is hi.
// The declaration of result and the return statement are not visible in this excerpt.
134 inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
137 ((uint64_t *)&result)[0] = lo;
138 ((uint64_t *)&result)[1] = hi;
// Software stand-in for _mm_cvtsi64_si128: places lo in the first 64-bit word of the result
// and zeroes the second 64-bit word.
// The declaration of result and the return statement are not visible in this excerpt.
142 inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
145 ((uint64_t *)&result)[0] = lo;
146 ((uint64_t *)&result)[1] = 0;
// Software stand-in for _mm_cvtsi128_si64: returns the first 8 bytes of a read as a signed
// 64-bit integer. The argument is taken by non-const reference but is only read here.
150 inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
152 return *(int64_t *)&a;
// Software stand-in for _mm_cvtsi128_si32: returns the first 4 bytes of a read as a signed
// 32-bit integer. The argument is taken by non-const reference but is only read here.
155 inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
157 return *(int32_t *)&a;
// Software stand-in for _mm_cvtsi32_si128: stores lo in the first 32-bit word of the result and
// zeroes the remaining 12 bytes.
// The trailing lines compare against the hardware intrinsic and print the outcome (presumably
// verification-build only); the declaration of result and the return are not visible here.
160 inline u128 _mm_cvtsi32_si128_emu(uint32_t lo)
163 ((uint32_t *)&result)[0] = lo;
164 ((uint32_t *)&result)[1] = 0;
165 ((uint64_t *)&result)[1] = 0;
168 const __m128i testresult = _mm_cvtsi32_si128(lo);
169 if (!memcmp(&testresult, &result, 16))
171 printf("_mm_cvtsi32_si128_emu: Portable version passed!\n");
175 printf("_mm_cvtsi32_si128_emu: Portable version failed!\n");
// Software stand-in for _mm_setr_epi8: writes the sixteen byte arguments into the result in
// argument order, c0 at byte offset 0 through c15 at byte offset 15.
// The trailing lines compare against the hardware intrinsic and print the outcome (presumably
// verification-build only); the declaration of result and the return are not visible here.
182 u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
185 ((uint8_t *)&result)[0] = c0;
186 ((uint8_t *)&result)[1] = c1;
187 ((uint8_t *)&result)[2] = c2;
188 ((uint8_t *)&result)[3] = c3;
189 ((uint8_t *)&result)[4] = c4;
190 ((uint8_t *)&result)[5] = c5;
191 ((uint8_t *)&result)[6] = c6;
192 ((uint8_t *)&result)[7] = c7;
193 ((uint8_t *)&result)[8] = c8;
194 ((uint8_t *)&result)[9] = c9;
195 ((uint8_t *)&result)[10] = c10;
196 ((uint8_t *)&result)[11] = c11;
197 ((uint8_t *)&result)[12] = c12;
198 ((uint8_t *)&result)[13] = c13;
199 ((uint8_t *)&result)[14] = c14;
200 ((uint8_t *)&result)[15] = c15;
203 const __m128i testresult = _mm_setr_epi8(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15);
204 if (!memcmp(&testresult, &result, 16))
206 printf("_mm_setr_epi8_emu: Portable version passed!\n");
210 printf("_mm_setr_epi8_emu: Portable version failed!\n");
// Software stand-in for _mm_srli_si128 (byte-wise right shift of a 128-bit value).
// a: value to shift; imm8: shift count in bytes, only the low 8 bits are used and any count
// above 15 is treated as 16.
// The visible loop copies bytes shift..15 of a down to positions 0..(15 - shift) of result.
// NOTE(review): the statements that fill the remaining upper bytes of result are not visible in
// this excerpt - presumably they are zeroed; confirm.
// The trailing lines compare against the hardware intrinsic and print the outcome (presumably
// verification-build only). %lx is used there with uint64_t values, which only matches where
// long is 64 bits wide.
217 inline __m128i _mm_srli_si128_emu(__m128i a, int imm8)
219 unsigned char result[16];
220 uint8_t shift = imm8 & 0xff;
221 if (shift > 15) shift = 16;
224 for (i = 0; i < (16 - shift); i++)
226 result[i] = ((unsigned char *)&a)[shift + i];
234 const __m128i tmp1 = _mm_load_si128(&a);
235 __m128i testresult = _mm_srli_si128(tmp1, imm8);
236 if (!memcmp(&testresult, result, 16))
238 printf("_mm_srli_si128_emu: Portable version passed!\n");
242 printf("_mm_srli_si128_emu: Portable version failed! val: %lx%lx imm: %x emu: %lx%lx, intrin: %lx%lx\n",
243 *((uint64_t *)&a + 1), *(uint64_t *)&a,
245 *((uint64_t *)result + 1), *(uint64_t *)result,
246 *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
// NOTE(review): result is an unsigned char array; this cast assumes it is suitably aligned
// for a 128-bit load - confirm.
250 return *(__m128i *)result;
// Software stand-in for _mm_xor_si128: xors a and b as two independent 64-bit words and
// returns the combined 128-bit value. The declaration of result is not visible in this excerpt.
253 inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b)
257 result[0] = *(uint64_t *)&a ^ *(uint64_t *)&b;
258 result[1] = *((uint64_t *)&a + 1) ^ *((uint64_t *)&b + 1);
259 return *(__m128i *)result;
// Software stand-in for _mm_load_si128: returns the 128-bit value stored at p.
// The pointer is dereferenced directly as a __m128i, so p is presumably expected to satisfy
// that type's alignment - confirm against callers.
265 inline __m128i _mm_load_si128_emu(const void *p)
267 return *(__m128i *)p;
// Software stand-in for _mm_store_si128. The body is not visible in this excerpt; judging by
// the name and the call sites in this file it presumably writes val to the 16 bytes at p.
270 inline void _mm_store_si128_emu(void *p, __m128i val)
// Software stand-in for _mm_shuffle_epi8 (byte shuffle of a controlled by b).
// For each of the 16 output bytes: if the high bit of the control byte b[i] is set the output
// byte is 0, otherwise it is the byte of a selected by the low 4 bits of b[i].
// The trailing lines compare against the hardware intrinsic and print the outcome (presumably
// verification-build only); the declaration of result and the return are not visible here.
275 __m128i _mm_shuffle_epi8_emu(__m128i a, __m128i b)
278 for (int i = 0; i < 16; i++)
280 if (((uint8_t *)&b)[i] & 0x80)
282 ((uint8_t *)&result)[i] = 0;
286 ((uint8_t *)&result)[i] = ((uint8_t *)&a)[((uint8_t *)&b)[i] & 0xf];
291 const __m128i tmp1 = _mm_load_si128(&a);
292 const __m128i tmp2 = _mm_load_si128(&b);
293 __m128i testresult = _mm_shuffle_epi8(tmp1, tmp2);
294 if (!memcmp(&testresult, &result, 16))
296 printf("_mm_shuffle_epi8_emu: Portable version passed!\n");
300 printf("_mm_shuffle_epi8_emu: Portable version failed!\n");
// Packs keylength (upper 64-bit word) and length (lower 64-bit word) into one vector and
// carry-less multiplies the vector with itself using selector 0x10 (low half of the first
// operand times high half of the second), i.e. length times keylength.
// The return statement is not visible in this excerpt; presumably clprod1 is returned.
308 static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) {
309 const __m128i lengthvector = _mm_set_epi64x_emu(keylength,length);
310 const __m128i clprod1 = _mm_clmulepi64_si128_emu( lengthvector, lengthvector, 0x10);
314 // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
// A: 128-bit value to reduce. Only the low 64 bits of the returned vector are meaningful.
315 static inline __m128i precompReduction64_si128_port( __m128i A) {
317 //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
// C holds only the low-order terms (4,3,1,0) of the polynomial in its low word; the high word is zero
318 const __m128i C = _mm_cvtsi64_si128_emu((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
// selector 0x01: high 64-bit half of A times low 64-bit half of C
319 __m128i Q2 = _mm_clmulepi64_si128_emu( A, C, 0x01);
// the upper half of Q2 (moved down by 8 bytes) is used as a byte index into a 16-entry table
320 __m128i Q3 = _mm_shuffle_epi8_emu(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
321 _mm_srli_si128_emu(Q2,8));
// combine the table lookup, the partial product and the original value
322 __m128i Q4 = _mm_xor_si128_emu(Q2,A);
323 const __m128i final = _mm_xor_si128_emu(Q3,Q4);
324 return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE
// Reduces the 128-bit value A with precompReduction64_si128_port and returns the low 64 bits
// of that result (the only part the reduction defines).
327 static inline uint64_t precompReduction64_port( __m128i A) {
328 __m128i tmp = precompReduction64_si128_port(A);
329 return _mm_cvtsi128_si64_emu(tmp);
332 // verus intermediate hash extra
// Core mixing loop of the portable Verus CLHash step.
// randomsource: key buffer of 128-bit entries; entries are read and overwritten in place.
// buf:          four 128-bit input words.
// keyMask:      mask applied to selector bits to pick key entries.
// pMoveScratch: output list that receives, per iteration, the two key-entry pointers touched.
// Runs 32 iterations. Each iteration takes the low 64 bits of the accumulator as a selector,
// picks two key entries and a buffer word from it, and switches on (selector & 0x1c) to one of
// several mixing sequences that update the accumulator and write back to the key entries.
// NOTE(review): the case labels, braces, several local declarations (pbuf, onekey, the inner
// loop header) and the return statement are not visible in this excerpt; the groups of
// statements below are presumably one switch case each.
// Throughout, `pbuf - (((selector & 1) << 1) - 1)` evaluates to pbuf + 1 when selector bit 0 is
// clear and pbuf - 1 when it is set, i.e. the buffer word whose index differs from pbuf's in
// its lowest bit.
333 static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch)
338 std::cout << "Random key start: ";
339 std::cout << LEToHex(*randomsource) << ", ";
340 std::cout << LEToHex(*(randomsource + 1));
341 std::cout << std::endl;
344 // divide key mask by 16 from bytes to __m128i
347 // the random buffer must have at least 32 16 byte dwords after the keymask to work with this
348 // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
349 // be used to xor into the accumulator before it is hashed with other values first
350 __m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2));
352 for (int64_t i = 0; i < 32; i++)
354 //std::cout << "LOOP " << i << " acc: " << LEToHex(acc) << std::endl;
// the low 64 bits of the running accumulator drive every choice made in this iteration
356 const uint64_t selector = _mm_cvtsi128_si64_emu(acc);
358 // get two random locations in the key, which will be mutated and swapped
359 __m128i *prand = randomsource + ((selector >> 5) & keyMask);
360 __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
// record both touched key entries for the caller
362 *pMoveScratch++ = prand;
363 *pMoveScratch++ = prandex;
365 // select random start and order of pbuf processing
366 pbuf = buf + (selector & 3);
368 switch (selector & 0x1c)
// --- group: clmul of (prandex ^ neighbour word), rewrite prand, clmul of (old prand ^ pbuf), rewrite prandex
372 const __m128i temp1 = _mm_load_si128_emu(prandex);
373 const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
374 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
375 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
376 acc = _mm_xor_si128_emu(clprod1, acc);
379 std::cout << "temp1: " << LEToHex(temp1) << std::endl;
380 std::cout << "temp2: " << LEToHex(temp2) << std::endl;
381 std::cout << "add1: " << LEToHex(add1) << std::endl;
382 std::cout << "clprod1: " << LEToHex(clprod1) << std::endl;
383 std::cout << "acc: " << LEToHex(acc) << std::endl;
386 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
387 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
// the old prand value is read before it is overwritten
389 const __m128i temp12 = _mm_load_si128_emu(prand);
390 _mm_store_si128_emu(prand, tempa2);
392 const __m128i temp22 = _mm_load_si128_emu(pbuf);
393 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
394 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
395 acc = _mm_xor_si128_emu(clprod12, acc);
397 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
398 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
399 _mm_store_si128_emu(prandex, tempb2);
// --- group: two clmuls on (prand ^ pbuf) and pbuf, rewrite prandex, plain xor of (old prandex ^ neighbour), rewrite prand
404 const __m128i temp1 = _mm_load_si128_emu(prand);
405 const __m128i temp2 = _mm_load_si128_emu(pbuf);
406 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
407 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
408 acc = _mm_xor_si128_emu(clprod1, acc);
409 const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
410 acc = _mm_xor_si128_emu(clprod2, acc);
412 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
413 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
415 const __m128i temp12 = _mm_load_si128_emu(prandex);
416 _mm_store_si128_emu(prandex, tempa2);
418 const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
419 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
420 acc = _mm_xor_si128_emu(add12, acc);
422 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
423 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
424 _mm_store_si128_emu(prand, tempb2);
// --- group: plain xor of (prandex ^ pbuf), rewrite prand, two clmuls on (old prand ^ neighbour) and neighbour, rewrite prandex
429 const __m128i temp1 = _mm_load_si128_emu(prandex);
430 const __m128i temp2 = _mm_load_si128_emu(pbuf);
431 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
432 acc = _mm_xor_si128_emu(add1, acc);
434 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
435 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
437 const __m128i temp12 = _mm_load_si128_emu(prand);
438 _mm_store_si128_emu(prand, tempa2);
440 const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
441 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
442 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
443 acc = _mm_xor_si128_emu(clprod12, acc);
444 const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
445 acc = _mm_xor_si128_emu(clprod22, acc);
447 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
448 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
449 _mm_store_si128_emu(prandex, tempb2);
// --- group: xor plus integer remainder step, followed by conditional clmul or swap statements
454 const __m128i temp1 = _mm_load_si128_emu(prand);
455 const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
456 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
458 // cannot be zero here
// NOTE(review): the guarantee presumably comes from the (not visible) case value having
// non-zero bits within selector's low 32 bits - confirm.
459 const int32_t divisor = (uint32_t)selector;
461 acc = _mm_xor_si128_emu(add1, acc);
// signed 64-bit remainder by the signed 32-bit divisor, truncated to 32 bits and xored into acc
463 const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
464 const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
465 acc = _mm_xor_si128_emu(modulo, acc);
467 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
468 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
// NOTE(review): the condition separating the next two statement runs is not visible in this excerpt
472 const __m128i temp12 = _mm_load_si128_emu(prandex);
473 _mm_store_si128_emu(prandex, tempa2);
475 const __m128i temp22 = _mm_load_si128_emu(pbuf);
476 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
477 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
478 acc = _mm_xor_si128_emu(clprod12, acc);
479 const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
480 acc = _mm_xor_si128_emu(clprod22, acc);
482 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
483 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
484 _mm_store_si128_emu(prand, tempb2);
// alternative run: store the new value in prandex and move the old prandex value to prand
488 const __m128i tempb3 = _mm_load_si128_emu(prandex);
489 _mm_store_si128_emu(prandex, tempa2);
490 _mm_store_si128_emu(prand, tempb3);
// --- group: three AES2_EMU/MIX2_EMU rounds over two buffer words, then swap-style key update
496 // a few AES operations
// round keys are read starting at the prand entry
497 const __m128i *rc = prand;
500 __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
501 __m128i temp2 = _mm_load_si128_emu(pbuf);
503 AES2_EMU(temp1, temp2, 0);
504 MIX2_EMU(temp1, temp2);
506 AES2_EMU(temp1, temp2, 4);
507 MIX2_EMU(temp1, temp2);
509 AES2_EMU(temp1, temp2, 8);
510 MIX2_EMU(temp1, temp2);
512 acc = _mm_xor_si128_emu(temp1, acc);
513 acc = _mm_xor_si128_emu(temp2, acc);
515 const __m128i tempa1 = _mm_load_si128_emu(prand);
516 const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
517 const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
519 const __m128i tempa4 = _mm_load_si128_emu(prandex);
520 _mm_store_si128_emu(prandex, tempa3);
521 _mm_store_si128_emu(prand, tempa4);
// --- group: variable-round loop mixing clmul and AES steps (loop header not visible)
526 // we'll just call this one the monkins loop, inspired by Chris
527 const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
528 __m128i tmp; // used by MIX2
// the top three bits of selector give a value in 0..7
530 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
532 uint64_t aesround = 0;
537 //std::cout << "acc: " << LEToHex(acc) << ", round check: " << LEToHex((selector & (0x10000000 << rounds))) << std::endl;
539 // note that due to compiler and CPUs, we expect this to do:
540 // if (selector & ((0x10000000 << rounds) & 0xffffffff) if rounds != 3 else selector & 0xffffffff80000000):
// NOTE(review): 0x10000000 is an int, so shifting it by 3 or more overflows the int type; the
// two comment lines above describe the behaviour the author relies on - keep as is for
// compatibility and confirm on every target compiler.
541 if (selector & (0x10000000 << rounds))
// clmul path: consume one key entry and one buffer word chosen by the round parity
543 onekey = _mm_load_si128_emu(rc++);
544 const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
545 const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
546 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
547 acc = _mm_xor_si128_emu(clprod1, acc);
// AES path: consume one key entry, use the opposite buffer word, advance the AES round index by 4
551 onekey = _mm_load_si128_emu(rc++);
552 __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
553 const uint64_t roundidx = aesround++ << 2;
554 AES2_EMU(onekey, temp2, roundidx);
// NOTE(review): the lines below repeat the AES rounds step by step with tracing output;
// presumably a disabled or debug-only alternative to the AES2_EMU call above - the guards
// are not visible in this excerpt.
557 std::cout << " onekey1: " << LEToHex(onekey) << std::endl;
558 std::cout << " temp21: " << LEToHex(temp2) << std::endl;
559 std::cout << "roundkey: " << LEToHex(rc[roundidx]) << std::endl;
561 aesenc((unsigned char *)&onekey, (unsigned char *)&(rc[roundidx]));
563 std::cout << "onekey2: " << LEToHex(onekey) << std::endl;
564 std::cout << "roundkey: " << LEToHex(rc[roundidx + 1]) << std::endl;
566 aesenc((unsigned char *)&temp2, (unsigned char *)&(rc[roundidx + 1]));
568 std::cout << " temp22: " << LEToHex(temp2) << std::endl;
569 std::cout << "roundkey: " << LEToHex(rc[roundidx + 2]) << std::endl;
571 aesenc((unsigned char *)&onekey, (unsigned char *)&(rc[roundidx + 2]));
573 std::cout << "onekey2: " << LEToHex(onekey) << std::endl;
575 aesenc((unsigned char *)&temp2, (unsigned char *)&(rc[roundidx + 3]));
577 std::cout << " temp22: " << LEToHex(temp2) << std::endl;
580 MIX2_EMU(onekey, temp2);
583 std::cout << "onekey3: " << LEToHex(onekey) << std::endl;
586 acc = _mm_xor_si128_emu(onekey, acc);
587 acc = _mm_xor_si128_emu(temp2, acc);
// after the rounds: derive a new value from prand, store it in prandex, move old prandex to prand
591 const __m128i tempa1 = _mm_load_si128_emu(prand);
592 const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
593 const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
595 const __m128i tempa4 = _mm_load_si128_emu(prandex);
596 _mm_store_si128_emu(prandex, tempa3);
597 _mm_store_si128_emu(prand, tempa4);
// --- group: single clmul of (neighbour word ^ prand), then swap-style key update
602 const __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
603 const __m128i temp2 = _mm_load_si128_emu(prand);
604 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
605 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
606 acc = _mm_xor_si128_emu(clprod1, acc);
608 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
609 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
611 const __m128i tempb3 = _mm_load_si128_emu(prandex);
612 _mm_store_si128_emu(prandex, tempa2);
613 _mm_store_si128_emu(prand, tempb3);
// --- group: clmul of (pbuf ^ prandex), rewrite prand, xor old prand into acc, rewrite prandex
618 const __m128i temp1 = _mm_load_si128_emu(pbuf);
619 const __m128i temp2 = _mm_load_si128_emu(prandex);
620 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
621 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
622 acc = _mm_xor_si128_emu(clprod1, acc);
624 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
625 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
627 const __m128i tempa3 = _mm_load_si128_emu(prand);
628 _mm_store_si128_emu(prand, tempa2);
630 acc = _mm_xor_si128_emu(tempa3, acc);
632 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
633 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
634 _mm_store_si128_emu(prandex, tempb2);
642 // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
643 // returning a 64 bit hash value
// random:       key buffer, reinterpreted as an array of 128-bit entries (modified by the mixing loop).
// buf:          the 64 input bytes, reinterpreted as four 128-bit words.
// keyMask:      key index mask forwarded to the mixing loop.
// pMoveScratch: receives the pointers to the key entries touched by the mixing loop.
// Returns the 64-bit reduction of the accumulator xored with lazyLengthHash_port(1024, 64).
644 uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch) {
645 __m128i * rs64 = (__m128i *)random;
646 const __m128i * string = (const __m128i *) buf;
648 __m128i acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask, pMoveScratch);
// fold in the fixed length term before the final reduction to 64 bits
649 acc = _mm_xor_si128_emu(acc, lazyLengthHash_port(1024, 64));
650 return precompReduction64_port(acc);
// Portable mining loop for Verus hash v2.
// bh:        block header whose nSolution receives the winning counter value.
// vhw:       hash writer that owns the hasher state and the current buffer.
// finalHash: not referenced in the visible lines - presumably receives the winning hash.
// target:    difficulty target; hashes greater than it are rejected.
// start:     first counter value to try.
// count:     in: number of counter values to try; out: set to (i - start) + 1 on the visible
//            success path.
// NOTE(review): braces, the extra-data update inside the loop, and all return statements are
// not visible in this excerpt, so the return value convention is not documented here.
653 bool mine_verus_v2_port(CBlockHeader &bh, CVerusHashV2bWriter &vhw, uint256 &finalHash, uint256 &target, uint64_t start, uint64_t *count)
655 CVerusHashV2 &vh = vhw.GetState();
656 verusclhasher &vclh = vh.vclh;
658 alignas(32) uint256 curHash;
659 arith_uint256 curTarget = UintToArith256(target);
661 u128 *hashKey = (u128 *)verusclhasher_key.get();
662 verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
663 const uint32_t keysize = pdesc->keySizeInBytes;
// the refresh copy of the key lives directly after the key itself
664 void *hasherrefresh = ((unsigned char *)hashKey) + keysize;
665 __m128i **pMoveScratch = vclh.getpmovescratch(hasherrefresh);
// NOTE(review): the trailing comment says "256 bit blocks" but the value is used below as a
// byte count for memcpy/memset - confirm the unit.
666 const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks
671 int64_t *extraPtr = vhw.xI64p();
672 unsigned char *curBuf = vh.CurBuffer();
674 // skip keygen if it is the current key
675 if (pdesc->seed != *((uint256 *)curBuf))
677 // generate a new key by chain hashing with Haraka256 from the last curbuf
678 // assume 256 bit boundary
// keysize is in bytes; each Haraka256 call handles one 32 byte block
679 int n256blks = keysize >> 5;
680 unsigned char *pkey = ((unsigned char *)hashKey);
681 unsigned char *psrc = curBuf;
682 for (int i = 0; i < n256blks; i++)
684 haraka256_port(pkey, psrc);
// remember which seed the key was generated from, then snapshot the first keyrefreshsize bytes
// of the key into the refresh area and zero the rest of that area
688 pdesc->seed = *((uint256 *)curBuf);
689 memcpy(hasherrefresh, hashKey, keyrefreshsize);
690 memset(((unsigned char *)hasherrefresh) + keyrefreshsize, 0, keysize - keyrefreshsize);
697 // loop the requested number of times or until canceled. determine if we
698 // found a winner, and send all winners found as solutions. count only one hash.
699 // hashrate is determined by multiplying hash by VERUSHASHES_PER_SOLVE, with VerusHash, only
700 // hashrate and sharerate are valid, solutionrate will equal sharerate
701 uint64_t i, end = start + *count;
702 for (i = start; i < end; i++)
706 // prepare the buffer
707 vh.FillExtra((u128 *)curBuf);
709 // run verusclhash on the buffer
710 const uint64_t intermediate = vclh(curBuf, hashKey, pMoveScratch);
712 // prepare the buffer
713 vh.FillExtra(&intermediate);
// final keyed Haraka512 over the buffer, with the key offset derived from the intermediate value
715 (*vh.haraka512KeyedFunction)((unsigned char *)&curHash, curBuf, hashKey + vh.IntermediateTo128Offset(intermediate));
// miss: restore the key entries modified by this attempt (the statement that moves on to the
// next counter value is not visible in this excerpt)
717 if (UintToArith256(curHash) > curTarget)
720 vclh.fixupkey(hashKey, *pdesc);
// hit: write the winning counter value into a copy of the solution and store it in the header
724 std::vector<unsigned char> solution = bh.nSolution;
725 int extraSpace = (solution.size() % 32) + 15;
726 assert(solution.size() > 32);
727 *((int64_t *)&(solution.data()[solution.size() - extraSpace])) = i;
728 bh.nSolution = solution;
// report how many counter values were consumed, including the winning one
730 *count = (i - start) + 1;