src/crypto/verus_clhash_portable.cpp

   1 /*
   2  * This uses variations of the clhash algorithm for Verus Coin, licensed
   3  * with the Apache-2.0 open source license.
   4  *
   5  * Copyright (c) 2018 Michael Toutonghi
   6  * Distributed under the Apache 2.0 software license, available in the original form for clhash
   7  * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a
   8  *
   9  * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser
  10  * Faster 64-bit universal hashing
  11  * using carry-less multiplications, Journal of Cryptographic Engineering (to appear)
  12  *
  13  * Best used on recent x64 processors (Haswell or better).
  14  *
  15  * This implements an intermediate step in the last part of a Verus block hash. The intent of this step
  16  * is to more effectively equalize FPGAs over GPUs and CPUs.
  17  *
  18  **/
  19
  20 #include "hash.h"
  21 #include "primitives/block.h"
  22
  23 #include <assert.h>
  24 #include <string.h>
  25
  26 #ifdef __APPLE__
  27 #include <sys/types.h>
  28 #endif// APPLE
  29
  30 #ifdef __linux__
  31
  32 #ifdef __i386__ || __X86_64__
  33 #include <x86intrin.h>
  34 #elif defined(__arm__)  || defined(__aarch64__)
  35 #include "crypto/SSE2NEON.h"
  36 #endif
  37
  38 #elif _WIN32
  39 #pragma warning (disable : 4146)
  40 #include <intrin.h>
  41 #endif
  42
  43 void clmul64(uint64_t a, uint64_t b, uint64_t* r)
  44 {
  45     uint8_t s = 4,i; //window size
  46     uint64_t two_s = 1 << s; //2^s
  47     uint64_t smask = two_s-1; //s 1 bits
  48     uint64_t u[16];
  49     uint64_t tmp;
  50     uint64_t ifmask;
  51     //Precomputation
  52     u[0] = 0;
  53     u[1] = b;
  54     for(i = 2 ; i < two_s; i += 2){
  55         u[i] = u[i >> 1] << 1; //even indices: left shift
  56         u[i + 1] = u[i] ^ b; //odd indices: xor b
  57     }
  58     //Multiply
  59     r[0] = u[a & smask]; //first window only affects lower word
  60     r[1] = 0;
  61     for(i = s ; i < 64 ; i += s){
  62         tmp = u[a >> i & smask];
  63         r[0] ^= tmp << i;
  64         r[1] ^= tmp >> (64 - i);
  65     }
  66     //Repair
  67     uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110
  68     for(i = 1 ; i < s ; i++){
  69         tmp = ((a & m) >> i);
  70         m &= m << 1; //shift mask to exclude all bit j': j' mod s = i
  71         ifmask = -((b >> (64-i)) & 1); //if the (64-i)th bit of b is 1
  72         r[1] ^= (tmp & ifmask);
  73     }
  74 }
  75
  76 u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm)
  77 {
  78     uint64_t result[2];
  79     clmul64(*((uint64_t*)&a + (imm & 1)), *((uint64_t*)&b + ((imm & 0x10) >> 4)), result);
  80
  81     /*
  82     // TEST
  83     const __m128i tmp1 = _mm_load_si128(&a);
  84     const __m128i tmp2 = _mm_load_si128(&b);
  85     imm = imm & 0x11;
  86     const __m128i testresult = (imm == 0x10) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x10) : ((imm == 0x01) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x01) : ((imm == 0x00) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x00) : _mm_clmulepi64_si128(tmp1, tmp2, 0x11)));
  87     if (!memcmp(&testresult, &result, 16))
  88     {
  89         printf("_mm_clmulepi64_si128_emu: Portable version passed!\n");
  90     }
  91     else
  92     {
  93         printf("_mm_clmulepi64_si128_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, imm: %x, emu: %lxh %lxl, intrin: %lxh %lxl\n",
  94                *((uint64_t *)&a + 1), *(uint64_t *)&a,
  95                *((uint64_t *)&b + 1), *(uint64_t *)&b,
  96                imm,
  97                *((uint64_t *)result + 1), *(uint64_t *)result,
  98                *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
  99         return testresult;
 100     }
 101     */
 102
 103     return *(__m128i *)result;
 104 }
 105
 106 u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b)
 107 {
 108     int16_t result[8];
 109     int16_t *a = (int16_t*)&_a, *b = (int16_t*)&_b;
 110     for (int i = 0; i < 8; i ++)
 111     {
 112         result[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15);
 113     }
 114
 115     /*
 116     const __m128i testresult = _mm_mulhrs_epi16(_a, _b);
 117     if (!memcmp(&testresult, &result, 16))
 118     {
 119         printf("_mm_mulhrs_epi16_emu: Portable version passed!\n");
 120     }
 121     else
 122     {
 123         printf("_mm_mulhrs_epi16_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, emu: %lxh %lxl, intrin: %lxh %lxl\n",
 124                *((uint64_t *)&a + 1), *(uint64_t *)&a,
 125                *((uint64_t *)&b + 1), *(uint64_t *)&b,
 126                *((uint64_t *)result + 1), *(uint64_t *)result,
 127                *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
 128     }
 129     */
 130
 131     return *(__m128i *)result;
 132 }
 133
 134 inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
 135 {
 136     __m128i result;
 137     ((uint64_t *)&result)[0] = lo;
 138     ((uint64_t *)&result)[1] = hi;
 139     return result;
 140 }
 141
 142 inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
 143 {
 144     __m128i result;
 145     ((uint64_t *)&result)[0] = lo;
 146     ((uint64_t *)&result)[1] = 0;
 147     return result;
 148 }
 149
 150 inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
 151 {
 152     return *(int64_t *)&a;
 153 }
 154
 155 inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
 156 {
 157     return *(int32_t *)&a;
 158 }
 159
 160 inline u128 _mm_cvtsi32_si128_emu(uint32_t lo)
 161 {
 162     __m128i result;
 163     ((uint32_t *)&result)[0] = lo;
 164     ((uint32_t *)&result)[1] = 0;
 165     ((uint64_t *)&result)[1] = 0;
 166
 167     /*
 168     const __m128i testresult = _mm_cvtsi32_si128(lo);
 169     if (!memcmp(&testresult, &result, 16))
 170     {
 171         printf("_mm_cvtsi32_si128_emu: Portable version passed!\n");
 172     }
 173     else
 174     {
 175         printf("_mm_cvtsi32_si128_emu: Portable version failed!\n");
 176     }
 177     */
 178
 179     return result;
 180 }
 181
 182 u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
 183 {
 184     __m128i result;
 185     ((uint8_t *)&result)[0] = c0;
 186     ((uint8_t *)&result)[1] = c1;
 187     ((uint8_t *)&result)[2] = c2;
 188     ((uint8_t *)&result)[3] = c3;
 189     ((uint8_t *)&result)[4] = c4;
 190     ((uint8_t *)&result)[5] = c5;
 191     ((uint8_t *)&result)[6] = c6;
 192     ((uint8_t *)&result)[7] = c7;
 193     ((uint8_t *)&result)[8] = c8;
 194     ((uint8_t *)&result)[9] = c9;
 195     ((uint8_t *)&result)[10] = c10;
 196     ((uint8_t *)&result)[11] = c11;
 197     ((uint8_t *)&result)[12] = c12;
 198     ((uint8_t *)&result)[13] = c13;
 199     ((uint8_t *)&result)[14] = c14;
 200     ((uint8_t *)&result)[15] = c15;
 201
 202     /*
 203     const __m128i testresult = _mm_setr_epi8(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15);
 204     if (!memcmp(&testresult, &result, 16))
 205     {
 206         printf("_mm_setr_epi8_emu: Portable version passed!\n");
 207     }
 208     else
 209     {
 210         printf("_mm_setr_epi8_emu: Portable version failed!\n");
 211     }
 212     */
 213
 214     return result;
 215 }
 216
 217 inline __m128i _mm_srli_si128_emu(__m128i a, int imm8)
 218 {
 219     unsigned char result[16];
 220     uint8_t shift = imm8 & 0xff;
 221     if (shift > 15) shift = 16;
 222
 223     int i;
 224     for (i = 0; i < (16 - shift); i++)
 225     {
 226         result[i] = ((unsigned char *)&a)[shift + i];
 227     }
 228     for ( ; i < 16; i++)
 229     {
 230         result[i] = 0;
 231     }
 232
 233     /*
 234     const __m128i tmp1 = _mm_load_si128(&a);
 235     __m128i testresult = _mm_srli_si128(tmp1, imm8);
 236     if (!memcmp(&testresult, result, 16))
 237     {
 238         printf("_mm_srli_si128_emu: Portable version passed!\n");
 239     }
 240     else
 241     {
 242         printf("_mm_srli_si128_emu: Portable version failed! val: %lx%lx imm: %x emu: %lx%lx, intrin: %lx%lx\n",
 243                *((uint64_t *)&a + 1), *(uint64_t *)&a,
 244                imm8,
 245                *((uint64_t *)result + 1), *(uint64_t *)result,
 246                *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
 247     }
 248     */
 249
 250     return *(__m128i *)result;
 251 }
 252
 253 inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b)
 254 {
 255 #ifdef _WIN32
 256     uint64_t result[2];
 257     result[0] = *(uint64_t *)&a ^ *(uint64_t *)&b;
 258     result[1] = *((uint64_t *)&a + 1) ^ *((uint64_t *)&b + 1);
 259     return *(__m128i *)result;
 260 #else
 261     return a ^ b;
 262 #endif
 263 }
 264
 265 inline __m128i _mm_load_si128_emu(const void *p)
 266 {
 267     return *(__m128i *)p;
 268 }
 269
 270 inline void _mm_store_si128_emu(void *p, __m128i val)
 271 {
 272     *(__m128i *)p = val;
 273 }
 274
 275 __m128i _mm_shuffle_epi8_emu(__m128i a, __m128i b)
 276 {
 277     __m128i result;
 278     for (int i = 0; i < 16; i++)
 279     {
 280         if (((uint8_t *)&b)[i] & 0x80)
 281         {
 282             ((uint8_t *)&result)[i] = 0;
 283         }
 284         else
 285         {
 286             ((uint8_t *)&result)[i] = ((uint8_t *)&a)[((uint8_t *)&b)[i] & 0xf];
 287         }
 288     }
 289
 290     /*
 291     const __m128i tmp1 = _mm_load_si128(&a);
 292     const __m128i tmp2 = _mm_load_si128(&b);
 293     __m128i testresult = _mm_shuffle_epi8(tmp1, tmp2);
 294     if (!memcmp(&testresult, &result, 16))
 295     {
 296         printf("_mm_shuffle_epi8_emu: Portable version passed!\n");
 297     }
 298     else
 299     {
 300         printf("_mm_shuffle_epi8_emu: Portable version failed!\n");
 301     }
 302     */
 303
 304     return result;
 305 }
 306
 307 // portable
 308 static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) {
 309     const __m128i lengthvector = _mm_set_epi64x_emu(keylength,length);
 310     const __m128i clprod1 = _mm_clmulepi64_si128_emu( lengthvector, lengthvector, 0x10);
 311     return clprod1;
 312 }
 313
 314 // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
 315 static inline __m128i precompReduction64_si128_port( __m128i A) {
 316
 317     //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
 318     const __m128i C = _mm_cvtsi64_si128_emu((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
 319     __m128i Q2 = _mm_clmulepi64_si128_emu( A, C, 0x01);
 320     __m128i Q3 = _mm_shuffle_epi8_emu(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
 321                                   _mm_srli_si128_emu(Q2,8));
 322     __m128i Q4 = _mm_xor_si128_emu(Q2,A);
 323     const __m128i final = _mm_xor_si128_emu(Q3,Q4);
 324     return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE
 325 }
 326
 327 static inline uint64_t precompReduction64_port( __m128i A) {
 328     __m128i tmp = precompReduction64_si128_port(A);
 329     return _mm_cvtsi128_si64_emu(tmp);
 330 }
 331
// verus intermediate hash extra
//
// Runs 32 mixing rounds over a 4-element (64 byte) input buffer and a mutable key.
//
//   randomsource  key, addressed as an array of 128-bit entries; entries are read AND
//                 overwritten by every round
//   buf           the four 128-bit input words
//   keyMask       mask in bytes; it is shifted right by 4 below so that it masks
//                 128-bit entry indices
//   pMoveScratch  output array; two key-entry pointers (prand, prandex) are appended per
//                 round, so 64 pointers are written in total
//
// Returns the 128-bit accumulator after the last round. No modular reduction is done here.
//
// Each round takes the low 64 bits of the accumulator as "selector". Bits of selector pick
// the two key entries, the starting buffer word, and (via selector & 0x1c) one of eight
// mixing variants in the switch below.
static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch)
{
    __m128i const *pbuf;

    /*
    std::cout << "Random key start: ";
    std::cout << LEToHex(*randomsource) << ", ";
    std::cout << LEToHex(*(randomsource + 1));
    std::cout << std::endl;
    */

    // divide key mask by 16 from bytes to __m128i
    keyMask >>= 4;

    // the random buffer must have at least 32 16 byte dwords after the keymask to work with this
    // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
    // be used to xor into the accumulator before it is hashed with other values first
    __m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2));

    for (int64_t i = 0; i < 32; i++)
    {
        //std::cout << "LOOP " << i << " acc: " << LEToHex(acc) << std::endl;

        // low 64 bits of the accumulator drive every choice made in this round
        const uint64_t selector = _mm_cvtsi128_si64_emu(acc);

        // get two random locations in the key, which will be mutated and swapped
        __m128i *prand = randomsource + ((selector >> 5) & keyMask);
        __m128i *prandex = randomsource + ((selector >> 32) & keyMask);

        // record which key entries this round touches
        *pMoveScratch++ = prand;
        *pMoveScratch++ = prandex;

        // select random start and order of pbuf processing
        pbuf = buf + (selector & 3);

        // Throughout the switch, "pbuf - (((selector & 1) << 1) - 1)" is the neighbouring
        // buffer word: pbuf - 1 when selector is odd, and pbuf + 1 when selector is even
        // (the unsigned subtraction wraps). Because pbuf's index shares bit 0 with selector,
        // the neighbour index stays within 0..3.
        switch (selector & 0x1c)
        {
            case 0:
            {
                // clmul-mix prandex with the neighbour word, then prand with pbuf; both key
                // entries are rewritten and end up exchanged
                const __m128i temp1 = _mm_load_si128_emu(prandex);
                const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
                const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
                acc = _mm_xor_si128_emu(clprod1, acc);

                /*
                std::cout << "temp1: " << LEToHex(temp1) << std::endl;
                std::cout << "temp2: " << LEToHex(temp2) << std::endl;
                std::cout << "add1: " << LEToHex(add1) << std::endl;
                std::cout << "clprod1: " << LEToHex(clprod1) << std::endl;
                std::cout << "acc: " << LEToHex(acc) << std::endl;
                */

                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);

                // read prand before it is overwritten with the value derived from prandex
                const __m128i temp12 = _mm_load_si128_emu(prand);
                _mm_store_si128_emu(prand, tempa2);

                const __m128i temp22 = _mm_load_si128_emu(pbuf);
                const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
                const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
                acc = _mm_xor_si128_emu(clprod12, acc);

                const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
                const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
                _mm_store_si128_emu(prandex, tempb2);
                break;
            }
            case 4:
            {
                // two clmuls on the first half (key^buf and buf alone), plain xor on the second
                const __m128i temp1 = _mm_load_si128_emu(prand);
                const __m128i temp2 = _mm_load_si128_emu(pbuf);
                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
                const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
                acc = _mm_xor_si128_emu(clprod1, acc);
                const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
                acc = _mm_xor_si128_emu(clprod2, acc);

                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);

                const __m128i temp12 = _mm_load_si128_emu(prandex);
                _mm_store_si128_emu(prandex, tempa2);

                const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
                const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
                acc = _mm_xor_si128_emu(add12, acc);

                const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
                const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
                _mm_store_si128_emu(prand, tempb2);
                break;
            }
            case 8:
            {
                // plain xor on the first half, two clmuls on the second half
                const __m128i temp1 = _mm_load_si128_emu(prandex);
                const __m128i temp2 = _mm_load_si128_emu(pbuf);
                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
                acc = _mm_xor_si128_emu(add1, acc);

                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);

                const __m128i temp12 = _mm_load_si128_emu(prand);
                _mm_store_si128_emu(prand, tempa2);

                const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
                const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
                const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
                acc = _mm_xor_si128_emu(clprod12, acc);
                const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
                acc = _mm_xor_si128_emu(clprod22, acc);

                const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
                const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
                _mm_store_si128_emu(prandex, tempb2);
                break;
            }
            case 0xc:
            {
                // integer modulo step; the low bit of the dividend then decides whether a
                // second clmul pass runs or the key entries are simply exchanged
                const __m128i temp1 = _mm_load_si128_emu(prand);
                const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);

                // cannot be zero here
                // (this case requires selector & 0x1c == 0xc, so bits 2 and 3 of the low word are set)
                const int32_t divisor = (uint32_t)selector;

                acc = _mm_xor_si128_emu(add1, acc);

                const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
                const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
                acc = _mm_xor_si128_emu(modulo, acc);

                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);

                if (dividend & 1)
                {
                    const __m128i temp12 = _mm_load_si128_emu(prandex);
                    _mm_store_si128_emu(prandex, tempa2);

                    const __m128i temp22 = _mm_load_si128_emu(pbuf);
                    const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
                    const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
                    acc = _mm_xor_si128_emu(clprod12, acc);
                    const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
                    acc = _mm_xor_si128_emu(clprod22, acc);

                    const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
                    const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
                    _mm_store_si128_emu(prand, tempb2);
                }
                else
                {
                    // no second pass: old prandex moves to prand, mutated prand goes to prandex
                    const __m128i tempb3 = _mm_load_si128_emu(prandex);
                    _mm_store_si128_emu(prandex, tempa2);
                    _mm_store_si128_emu(prand, tempb3);
                }
                break;
            }
            case 0x10:
            {
                // a few AES operations
                // NOTE(review): AES2_EMU and MIX2_EMU are macros defined outside this view;
                // rc and tmp are not referenced by name here, so presumably the macros use
                // them implicitly — do not rename or remove without checking the macros.
                const __m128i *rc = prand;
                __m128i tmp;

                __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
                __m128i temp2 = _mm_load_si128_emu(pbuf);

                AES2_EMU(temp1, temp2, 0);
                MIX2_EMU(temp1, temp2);

                AES2_EMU(temp1, temp2, 4);
                MIX2_EMU(temp1, temp2);

                AES2_EMU(temp1, temp2, 8);
                MIX2_EMU(temp1, temp2);

                acc = _mm_xor_si128_emu(temp1, acc);
                acc = _mm_xor_si128_emu(temp2, acc);

                const __m128i tempa1 = _mm_load_si128_emu(prand);
                const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
                const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);

                // old prandex moves to prand, mutated prand goes to prandex
                const __m128i tempa4 = _mm_load_si128_emu(prandex);
                _mm_store_si128_emu(prandex, tempa3);
                _mm_store_si128_emu(prand, tempa4);
                break;
            }
            case 0x14:
            {
                // we'll just call this one the monkins loop, inspired by Chris
                const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
                __m128i tmp; // used by MIX2

                // top three bits of selector; the do/while below runs rounds + 1 times
                uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
                __m128i *rc = prand;
                uint64_t aesround = 0;
                __m128i onekey;

                do
                {
                    //std::cout << "acc: " << LEToHex(acc) << ", round check: " << LEToHex((selector & (0x10000000 << rounds))) << std::endl;

                    // note that due to compiler and CPUs, we expect this to do:
                    // if (selector & ((0x10000000 << rounds) & 0xffffffff) if rounds != 3 else selector & 0xffffffff80000000):
                    // NOTE(review): 0x10000000 is an int, so this shift is evaluated in int
                    // width; the comment above records the result the hash relies on. Changing
                    // the operand type would change which branch is taken.
                    if (selector & (0x10000000 << rounds))
                    {
                        // clmul branch: consume the next key entry and one buffer word
                        onekey = _mm_load_si128_emu(rc++);
                        const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
                        const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
                        const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
                        acc = _mm_xor_si128_emu(clprod1, acc);
                    }
                    else
                    {
                        // AES branch: consume the next key entry and the other buffer word
                        onekey = _mm_load_si128_emu(rc++);
                        __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
                        const uint64_t roundidx = aesround++ << 2;
                        AES2_EMU(onekey, temp2, roundidx);

                        /*
                        std::cout << " onekey1: " << LEToHex(onekey) << std::endl;
                        std::cout << "  temp21: " << LEToHex(temp2) << std::endl;
                        std::cout << "roundkey: " << LEToHex(rc[roundidx]) << std::endl;

                        aesenc((unsigned char *)&onekey, (unsigned char *)&(rc[roundidx]));

                        std::cout << "onekey2: " << LEToHex(onekey) << std::endl;
                        std::cout << "roundkey: " << LEToHex(rc[roundidx + 1]) << std::endl;

                        aesenc((unsigned char *)&temp2, (unsigned char *)&(rc[roundidx + 1]));

                        std::cout << " temp22: " << LEToHex(temp2) << std::endl;
                        std::cout << "roundkey: " << LEToHex(rc[roundidx + 2]) << std::endl;

                        aesenc((unsigned char *)&onekey, (unsigned char *)&(rc[roundidx + 2]));

                        std::cout << "onekey2: " << LEToHex(onekey) << std::endl;

                        aesenc((unsigned char *)&temp2, (unsigned char *)&(rc[roundidx + 3]));

                        std::cout << " temp22: " << LEToHex(temp2) << std::endl;
                        */

                        MIX2_EMU(onekey, temp2);

                        /*
                        std::cout << "onekey3: " << LEToHex(onekey) << std::endl;
                        */

                        acc = _mm_xor_si128_emu(onekey, acc);
                        acc = _mm_xor_si128_emu(temp2, acc);
                    }
                } while (rounds--);

                const __m128i tempa1 = _mm_load_si128_emu(prand);
                const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
                const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);

                // old prandex moves to prand, mutated prand goes to prandex
                const __m128i tempa4 = _mm_load_si128_emu(prandex);
                _mm_store_si128_emu(prandex, tempa3);
                _mm_store_si128_emu(prand, tempa4);
                break;
            }
            case 0x18:
            {
                // single clmul of (neighbour word ^ prand), then exchange the key entries
                const __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
                const __m128i temp2 = _mm_load_si128_emu(prand);
                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
                const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
                acc = _mm_xor_si128_emu(clprod1, acc);

                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);

                const __m128i tempb3 = _mm_load_si128_emu(prandex);
                _mm_store_si128_emu(prandex, tempa2);
                _mm_store_si128_emu(prand, tempb3);
                break;
            }
            case 0x1c:
            {
                // clmul of (pbuf ^ prandex), then xor the old prand straight into acc
                const __m128i temp1 = _mm_load_si128_emu(pbuf);
                const __m128i temp2 = _mm_load_si128_emu(prandex);
                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
                const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
                acc = _mm_xor_si128_emu(clprod1, acc);

                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);

                const __m128i tempa3 = _mm_load_si128_emu(prand);
                _mm_store_si128_emu(prand, tempa2);

                acc = _mm_xor_si128_emu(tempa3, acc);

                const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
                const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
                _mm_store_si128_emu(prandex, tempb2);
                break;
            }
        }
    }
    return acc;
}
 641
 642 // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
 643 // returning a 64 bit hash value
 644 uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch) {
 645     __m128i * rs64 = (__m128i *)random;
 646     const __m128i * string = (const __m128i *) buf;
 647
 648     __m128i  acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask, pMoveScratch);
 649     acc = _mm_xor_si128_emu(acc, lazyLengthHash_port(1024, 64));
 650     return precompReduction64_port(acc);
 651 }
 652
 653 bool mine_verus_v2_port(CBlockHeader &bh, CVerusHashV2bWriter &vhw, uint256 &finalHash, uint256 &target, uint64_t start, uint64_t *count)
 654 {
 655         CVerusHashV2 &vh = vhw.GetState();
 656     verusclhasher &vclh = vh.vclh;
 657
 658         alignas(32) uint256 curHash;
 659     arith_uint256 curTarget = UintToArith256(target);
 660
 661     u128 *hashKey = (u128 *)verusclhasher_key.get();
 662     verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
 663     const uint32_t keysize = pdesc->keySizeInBytes;
 664     void *hasherrefresh = ((unsigned char *)hashKey) + keysize;
 665         __m128i **pMoveScratch = vclh.getpmovescratch(hasherrefresh);
 666     const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks
 667
 668     vhw.Reset();
 669         vhw << bh;
 670
 671         int64_t *extraPtr = vhw.xI64p();
 672         unsigned char *curBuf = vh.CurBuffer();
 673
 674     // skip keygen if it is the current key
 675     if (pdesc->seed != *((uint256 *)curBuf))
 676     {
 677         // generate a new key by chain hashing with Haraka256 from the last curbuf
 678         // assume 256 bit boundary
 679         int n256blks = keysize >> 5;
 680         unsigned char *pkey = ((unsigned char *)hashKey);
 681         unsigned char *psrc = curBuf;
 682         for (int i = 0; i < n256blks; i++)
 683         {
 684             haraka256_port(pkey, psrc);
 685             psrc = pkey;
 686             pkey += 32;
 687         }
 688         pdesc->seed = *((uint256 *)curBuf);
 689         memcpy(hasherrefresh, hashKey, keyrefreshsize);
 690         memset(((unsigned char *)hasherrefresh) + keyrefreshsize, 0, keysize - keyrefreshsize);
 691     }
 692     else
 693     {
 694         vclh.gethashkey();
 695     }
 696
 697         // loop the requested number of times or until canceled. determine if we
 698         // found a winner, and send all winners found as solutions. count only one hash.
 699         // hashrate is determined by multiplying hash by VERUSHASHES_PER_SOLVE, with VerusHash, only
 700         // hashrate and sharerate are valid, solutionrate will equal sharerate
 701     uint64_t i, end = start + *count;
 702         for (i = start; i < end; i++)
 703         {
 704                 *extraPtr = i;
 705
 706                 // prepare the buffer
 707         vh.FillExtra((u128 *)curBuf);
 708
 709                 // run verusclhash on the buffer
 710         const uint64_t intermediate = vclh(curBuf, hashKey, pMoveScratch);
 711
 712                 // prepare the buffer
 713         vh.FillExtra(&intermediate);
 714
 715                 (*vh.haraka512KeyedFunction)((unsigned char *)&curHash, curBuf, hashKey + vh.IntermediateTo128Offset(intermediate));
 716
 717         if (UintToArith256(curHash) > curTarget)
 718         {
 719             // refresh the key
 720             vclh.fixupkey(hashKey, *pdesc);
 721                         continue;
 722         }
 723
 724         std::vector<unsigned char> solution = bh.nSolution;
 725                 int extraSpace = (solution.size() % 32) + 15;
 726                 assert(solution.size() > 32);
 727                 *((int64_t *)&(solution.data()[solution.size() - extraSpace])) = i;
 728         bh.nSolution = solution;
 729         finalHash = curHash;
 730         *count = (i - start) + 1;
 731         return true;
 732         }
 733         return false;
 734 }