src/crypto/verus_clhash_portable.cpp

   1 /*
   2  * This uses veriations of the clhash algorithm for Verus Coin, licensed
   3  * with the Apache-2.0 open source license.
   4  *
   5  * Copyright (c) 2018 Michael Toutonghi
   6  * Distributed under the Apache 2.0 software license, available in the original form for clhash
   7  * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a
   8  *
   9  * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser
  10  * Faster 64-bit universal hashing
  11  * using carry-less multiplications, Journal of Cryptographic Engineering (to appear)
  12  *
  13  * Best used on recent x64 processors (Haswell or better).
  14  *
  15  * This implements an intermediate step in the last part of a Verus block hash. The intent of this step
  16  * is to more effectively equalize FPGAs over GPUs and CPUs.
  17  *
  18  **/
  19
  20
  21 #include "verus_hash.h"
  22
  23 #include <assert.h>
  24 #include <string.h>
  25 #include <x86intrin.h>
  26
  27 void clmul64(uint64_t a, uint64_t b, uint64_t* r)
  28 {
  29     uint8_t s = 4,i; //window size
  30     uint64_t two_s = 1 << s; //2^s
  31     uint64_t smask = two_s-1; //s 1 bits
  32     uint64_t u[16];
  33     uint64_t tmp;
  34     uint64_t ifmask;
  35     //Precomputation
  36     u[0] = 0;
  37     u[1] = b;
  38     for(i = 2 ; i < two_s; i += 2){
  39         u[i] = u[i >> 1] << 1; //even indices: left shift
  40         u[i + 1] = u[i] ^ b; //odd indices: xor b
  41     }
  42     //Multiply
  43     r[0] = u[a & smask]; //first window only affects lower word
  44     r[1] = 0;
  45     for(i = s ; i < 64 ; i += s){
  46         tmp = u[a >> i & smask];
  47         r[0] ^= tmp << i;
  48         r[1] ^= tmp >> (64 - i);
  49     }
  50     //Repair
  51     uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110
  52     for(i = 1 ; i < s ; i++){
  53         tmp = ((a & m) >> i);
  54         m &= m << 1; //shift mask to exclude all bit j': j' mod s = i
  55         ifmask = -((b >> (64-i)) & 1); //if the (64-i)th bit of b is 1
  56         r[1] ^= (tmp & ifmask);
  57     }
  58 }
  59
  60 u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm)
  61 {
  62     uint64_t result[2];
  63     clmul64(*((uint64_t*)&a + (imm & 1)), *((uint64_t*)&b + ((imm & 0x10) >> 4)), result);
  64
  65     /*
  66     const __m128i tmp1 = _mm_load_si128(&a);
  67     const __m128i tmp2 = _mm_load_si128(&b);
  68     const __m128i testresult = _mm_clmulepi64_si128(tmp1, tmp2, 0x10);
  69     if (!memcmp(&testresult, &result, 16))
  70     {
  71         printf("_mm_clmulepi64_si128_emu: Portable version passed!\n");
  72     }
  73     else
  74     {
  75         printf("_mm_clmulepi64_si128_emu: Portable version failed!\n");
  76     }
  77     */
  78
  79     return *(__m128i *)result;
  80 }
  81
  82 u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b)
  83 {
  84     uint16_t result[8];
  85     uint16_t *a = (uint16_t*)&_a, *b = (uint16_t*)&_b;
  86     for (int i = 0; i < 7; i ++)
  87     {
  88         result[i] = (uint16_t)((uint32_t)((((int32_t)a[i] * (int32_t)b[i]) >> 14) + 1) >> 1);
  89     }
  90 }
  91
  92 inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
  93 {
  94     __m128i result;
  95     ((uint64_t *)&result)[0] = lo;
  96     ((uint64_t *)&result)[1] = hi;
  97     return result;
  98 }
  99
 100 inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
 101 {
 102     __m128i result;
 103     ((uint64_t *)&result)[0] = lo;
 104     ((uint64_t *)&result)[1] = 0;
 105     return result;
 106 }
 107
 108 inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
 109 {
 110     return *(int64_t *)&a;
 111 }
 112
 113 inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
 114 {
 115     return *(int32_t *)&a;
 116 }
 117
 118 inline u128 _mm_cvtsi32_si128_emu(uint32_t lo)
 119 {
 120     __m128i result;
 121     ((uint32_t *)&result)[0] = lo;
 122     ((uint32_t *)&result)[1] = 0;
 123     ((uint64_t *)&result)[1] = 0;
 124
 125     /*
 126     const __m128i testresult = _mm_cvtsi32_si128(lo);
 127     if (!memcmp(&testresult, &result, 16))
 128     {
 129         printf("_mm_cvtsi32_si128_emu: Portable version passed!\n");
 130     }
 131     else
 132     {
 133         printf("_mm_cvtsi32_si128_emu: Portable version failed!\n");
 134     }
 135     */
 136
 137     return result;
 138 }
 139
 140 u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
 141 {
 142     __m128i result;
 143     ((uint8_t *)&result)[0] = c0;
 144     ((uint8_t *)&result)[1] = c1;
 145     ((uint8_t *)&result)[2] = c2;
 146     ((uint8_t *)&result)[3] = c3;
 147     ((uint8_t *)&result)[4] = c4;
 148     ((uint8_t *)&result)[5] = c5;
 149     ((uint8_t *)&result)[6] = c6;
 150     ((uint8_t *)&result)[7] = c7;
 151     ((uint8_t *)&result)[8] = c8;
 152     ((uint8_t *)&result)[9] = c9;
 153     ((uint8_t *)&result)[10] = c10;
 154     ((uint8_t *)&result)[11] = c11;
 155     ((uint8_t *)&result)[12] = c12;
 156     ((uint8_t *)&result)[13] = c13;
 157     ((uint8_t *)&result)[14] = c14;
 158     ((uint8_t *)&result)[15] = c15;
 159
 160     /*
 161     const __m128i testresult = _mm_setr_epi8(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15);
 162     if (!memcmp(&testresult, &result, 16))
 163     {
 164         printf("_mm_setr_epi8_emu: Portable version passed!\n");
 165     }
 166     else
 167     {
 168         printf("_mm_setr_epi8_emu: Portable version failed!\n");
 169     }
 170     */
 171
 172     return result;
 173 }
 174
 175 inline __m128i _mm_srli_si128_emu(__m128i a, int imm8)
 176 {
 177     unsigned char result[16];
 178     uint8_t shift = imm8 & 0xff;
 179     if (shift > 15) shift = 16;
 180
 181     int i;
 182     for (i = 0; i < (16 - shift); i++)
 183     {
 184         result[i] = ((unsigned char *)&a)[shift + i];
 185     }
 186     for ( ; i < 16; i++)
 187     {
 188         result[i] = 0;
 189     }
 190
 191     /*
 192     const __m128i tmp1 = _mm_load_si128(&a);
 193     __m128i testresult = _mm_srli_si128(tmp1, imm8);
 194     if (!memcmp(&testresult, result, 16))
 195     {
 196         printf("_mm_srli_si128_emu: Portable version passed!\n");
 197     }
 198     else
 199     {
 200         printf("_mm_srli_si128_emu: Portable version failed! val: %lx%lx imm: %x emu: %lx%lx, intrin: %lx%lx\n",
 201                *((uint64_t *)&a + 1), *(uint64_t *)&a,
 202                imm8,
 203                *((uint64_t *)result + 1), *(uint64_t *)result,
 204                *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
 205     }
 206     */
 207
 208     return *(__m128i *)result;
 209 }
 210
 211 inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b)
 212 {
 213     return a ^ b;
 214 }
 215
 216 inline __m128i _mm_load_si128_emu(const void *p)
 217 {
 218     return *(__m128i *)p;
 219 }
 220
 221 inline void _mm_store_si128_emu(void *p, __m128i val)
 222 {
 223     *(__m128i *)p = val;
 224 }
 225
 226 __m128i _mm_shuffle_epi8_emu(__m128i a, __m128i b)
 227 {
 228     __m128i result;
 229     for (int i = 0; i < 16; i++)
 230     {
 231         if (((uint8_t *)&b)[i] & 0x80)
 232         {
 233             ((uint8_t *)&result)[i] = 0;
 234         }
 235         else
 236         {
 237             ((uint8_t *)&result)[i] = ((uint8_t *)&a)[((uint8_t *)&b)[i] & 0xf];
 238         }
 239     }
 240
 241     /*
 242     const __m128i tmp1 = _mm_load_si128(&a);
 243     const __m128i tmp2 = _mm_load_si128(&b);
 244     __m128i testresult = _mm_shuffle_epi8(tmp1, tmp2);
 245     if (!memcmp(&testresult, &result, 16))
 246     {
 247         printf("_mm_shuffle_epi8_emu: Portable version passed!\n");
 248     }
 249     else
 250     {
 251         printf("_mm_shuffle_epi8_emu: Portable version failed!\n");
 252     }
 253     */
 254
 255     return result;
 256 }
 257
 258 // portable
 259 static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) {
 260     const __m128i lengthvector = _mm_set_epi64x_emu(keylength,length);
 261     const __m128i clprod1 = _mm_clmulepi64_si128_emu( lengthvector, lengthvector, 0x10);
 262     return clprod1;
 263 }
 264
 265 // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
 266 static inline __m128i precompReduction64_si128_port( __m128i A) {
 267
 268     //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
 269     const __m128i C = _mm_cvtsi64_si128_emu((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
 270     __m128i Q2 = _mm_clmulepi64_si128_emu( A, C, 0x01);
 271     __m128i Q3 = _mm_shuffle_epi8_emu(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
 272                                   _mm_srli_si128_emu(Q2,8));
 273     __m128i Q4 = _mm_xor_si128_emu(Q2,A);
 274     const __m128i final = _mm_xor_si128_emu(Q3,Q4);
 275     return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE
 276 }
 277
 278 static inline uint64_t precompReduction64_port( __m128i A) {
 279     return _mm_cvtsi128_si64(precompReduction64_si128_port(A));
 280 }
 281
 282 // verus intermediate hash extra
 283 static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
 284 {
 285     __m128i const *pbuf;
 286
 287     // divide key mask by 16 from bytes to __m128i
 288     keyMask >>= 4;
 289
 290     // the random buffer must have at least 32 16 byte dwords after the keymask to work with this
 291     // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
 292     // be used to xor into the accumulator before it is hashed with other values first
 293     __m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2));
 294
 295     for (int64_t i = 0; i < 32; i++)
 296     {
 297         const uint64_t selector = _mm_cvtsi128_si64_emu(acc);
 298
 299         // get two random locations in the key, which will be mutated and swapped
 300         __m128i *prand = randomsource + ((selector >> 5) & keyMask);
 301         __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
 302
 303         // select random start and order of pbuf processing
 304         pbuf = buf + (selector & 3);
 305
 306         switch (selector & 0x1c)
 307         {
 308             case 0:
 309             {
 310                 const __m128i temp1 = _mm_load_si128_emu(prandex);
 311                 const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 312                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 313                 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 314                 acc = _mm_xor_si128_emu(clprod1, acc);
 315
 316                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
 317                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
 318
 319                 const __m128i temp12 = _mm_load_si128_emu(prand);
 320                 _mm_store_si128_emu(prand, tempa2);
 321
 322                 const __m128i temp22 = _mm_load_si128_emu(pbuf);
 323                 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
 324                 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
 325                 acc = _mm_xor_si128_emu(clprod12, acc);
 326
 327                 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
 328                 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
 329                 _mm_store_si128_emu(prandex, tempb2);
 330                 break;
 331             }
 332             case 4:
 333             {
 334                 const __m128i temp1 = _mm_load_si128_emu(prand);
 335                 const __m128i temp2 = _mm_load_si128_emu(pbuf);
 336                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 337                 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 338                 acc = _mm_xor_si128_emu(clprod1, acc);
 339                 const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
 340                 acc = _mm_xor_si128_emu(clprod2, acc);
 341
 342                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
 343                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
 344
 345                 const __m128i temp12 = _mm_load_si128_emu(prandex);
 346                 _mm_store_si128_emu(prandex, tempa2);
 347
 348                 const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 349                 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
 350                 acc = _mm_xor_si128_emu(add12, acc);
 351
 352                 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
 353                 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
 354                 _mm_store_si128_emu(prand, tempb2);
 355                 break;
 356             }
 357             case 8:
 358             {
 359                 const __m128i temp1 = _mm_load_si128_emu(prandex);
 360                 const __m128i temp2 = _mm_load_si128_emu(pbuf);
 361                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 362                 acc = _mm_xor_si128_emu(add1, acc);
 363
 364                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
 365                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
 366
 367                 const __m128i temp12 = _mm_load_si128_emu(prand);
 368                 _mm_store_si128_emu(prand, tempa2);
 369
 370                 const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 371                 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
 372                 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
 373                 acc = _mm_xor_si128_emu(clprod12, acc);
 374                 const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
 375                 acc = _mm_xor_si128_emu(clprod22, acc);
 376
 377                 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
 378                 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
 379                 _mm_store_si128_emu(prandex, tempb2);
 380                 break;
 381             }
 382             case 0x0c:
 383             {
 384                 const __m128i temp1 = _mm_load_si128_emu(prand);
 385                 const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 386                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 387
 388                 // cannot be zero here
 389                 const int32_t divisor = (uint32_t)selector;
 390
 391                 acc = _mm_xor_si128(add1, acc);
 392
 393                 const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
 394                 const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
 395                 acc = _mm_xor_si128_emu(modulo, acc);
 396
 397                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
 398                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
 399
 400                 if (dividend & 1)
 401                 {
 402                     const __m128i temp12 = _mm_load_si128_emu(prandex);
 403                     _mm_store_si128_emu(prandex, tempa2);
 404
 405                     const __m128i temp22 = _mm_load_si128_emu(pbuf);
 406                     const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
 407                     const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
 408                     acc = _mm_xor_si128_emu(clprod12, acc);
 409                     const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
 410                     acc = _mm_xor_si128_emu(clprod22, acc);
 411
 412                     const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
 413                     const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
 414                     _mm_store_si128_emu(prand, tempb2);
 415                 }
 416                 else
 417                 {
 418                     const __m128i tempb3 = _mm_load_si128_emu(prandex);
 419                     _mm_store_si128_emu(prandex, tempa2);
 420                     _mm_store_si128_emu(prand, tempb3);
 421                 }
 422                 break;
 423             }
 424             case 0x10:
 425             {
 426                 // a few AES operations
 427                 const __m128i *rc = prand;
 428                 __m128i tmp;
 429
 430                 __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 431                 __m128i temp2 = _mm_load_si128_emu(pbuf);
 432
 433                 AES2_EMU(temp1, temp2, 0);
 434                 MIX2_EMU(temp1, temp2);
 435
 436                 AES2_EMU(temp1, temp2, 4);
 437                 MIX2_EMU(temp1, temp2);
 438
 439                 AES2_EMU(temp1, temp2, 8);
 440                 MIX2_EMU(temp1, temp2);
 441
 442                 acc = _mm_xor_si128_emu(temp1, acc);
 443                 acc = _mm_xor_si128_emu(temp2, acc);
 444
 445                 const __m128i tempa1 = _mm_load_si128_emu(prand);
 446                 const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
 447                 const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
 448
 449                 const __m128i tempa4 = _mm_load_si128_emu(prandex);
 450                 _mm_store_si128_emu(prandex, tempa3);
 451                 _mm_store_si128_emu(prand, tempa4);
 452                 break;
 453             }
 454             case 0x14:
 455             {
 456                 // we'll just call this one the monkins loop, inspired by Chris
 457                 const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
 458                 __m128i tmp; // used by MIX2
 459
 460                 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
 461                 __m128i *rc = prand;
 462                 uint64_t aesround = 0;
 463                 __m128i onekey;
 464
 465                 do
 466                 {
 467                     if (selector & (0x10000000 << rounds))
 468                     {
 469                         onekey = _mm_load_si128_emu(rc++);
 470                         const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
 471                         const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
 472                         const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 473                         acc = _mm_xor_si128_emu(clprod1, acc);
 474                     }
 475                     else
 476                     {
 477                         onekey = _mm_load_si128_emu(rc++);
 478                         __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
 479                         const uint64_t roundidx = aesround++ << 2;
 480                         AES2_EMU(onekey, temp2, roundidx);
 481                         MIX2_EMU(onekey, temp2);
 482                         acc = _mm_xor_si128_emu(onekey, acc);
 483                         acc = _mm_xor_si128_emu(temp2, acc);
 484                     }
 485                 } while (rounds--);
 486
 487                 const __m128i tempa1 = _mm_load_si128_emu(prand);
 488                 const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
 489                 const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
 490
 491                 const __m128i tempa4 = _mm_load_si128_emu(prandex);
 492                 _mm_store_si128_emu(prandex, tempa3);
 493                 _mm_store_si128_emu(prand, tempa4);
 494                 break;
 495             }
 496             case 0x18:
 497             {
 498                 const __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 499                 const __m128i temp2 = _mm_load_si128_emu(prand);
 500                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 501                 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 502                 acc = _mm_xor_si128_emu(clprod1, acc);
 503
 504                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
 505                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
 506
 507                 const __m128i tempb3 = _mm_load_si128_emu(prandex);
 508                 _mm_store_si128_emu(prandex, tempa2);
 509                 _mm_store_si128_emu(prand, tempb3);
 510                 break;
 511             }
 512             case 0x1c:
 513             {
 514                 const __m128i temp1 = _mm_load_si128_emu(pbuf);
 515                 const __m128i temp2 = _mm_load_si128_emu(prandex);
 516                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 517                 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 518                 acc = _mm_xor_si128_emu(clprod1, acc);
 519
 520                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
 521                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
 522
 523                 const __m128i tempa3 = _mm_load_si128_emu(prand);
 524                 _mm_store_si128_emu(prand, tempa2);
 525
 526                 acc = _mm_xor_si128_emu(tempa3, acc);
 527
 528                 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
 529                 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
 530                 _mm_store_si128_emu(prandex, tempb2);
 531                 break;
 532             }
 533         }
 534     }
 535     return acc;
 536 }
 537
 538 // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
 539 // returning a 64 bit hash value
 540 uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask) {
 541     const unsigned int  m = 128;// we process the data in chunks of 16 cache lines
 542     __m128i * rs64 = (__m128i *)random;
 543     const __m128i * string = (const __m128i *) buf;
 544
 545     __m128i  acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask);
 546     acc = _mm_xor_si128(acc, lazyLengthHash_port(1024, 64));
 547     return precompReduction64_port(acc);
 548 }