src/crypto/verus_clhash_portable.cpp

   1 /*
   2  * This uses veriations of the clhash algorithm for Verus Coin, licensed
   3  * with the Apache-2.0 open source license.
   4  *
   5  * Copyright (c) 2018 Michael Toutonghi
   6  * Distributed under the Apache 2.0 software license, available in the original form for clhash
   7  * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a
   8  *
   9  * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser
  10  * Faster 64-bit universal hashing
  11  * using carry-less multiplications, Journal of Cryptographic Engineering (to appear)
  12  *
  13  * Best used on recent x64 processors (Haswell or better).
  14  *
  15  * This implements an intermediate step in the last part of a Verus block hash. The intent of this step
  16  * is to more effectively equalize FPGAs over GPUs and CPUs.
  17  *
  18  **/
  19
  20
  21 #include "verus_hash.h"
  22
  23 #include <assert.h>
  24 #include <string.h>
  25 #include <x86intrin.h>
  26
  27 #ifdef __WIN32
  28 #define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
  29 #endif
  30
  31 void clmul64(uint64_t a, uint64_t b, uint64_t* r)
  32 {
  33     uint8_t s = 4,i; //window size
  34     uint64_t two_s = 1 << s; //2^s
  35     uint64_t smask = two_s-1; //s 1 bits
  36     uint64_t u[16];
  37     uint64_t tmp;
  38     uint64_t ifmask;
  39     //Precomputation
  40     u[0] = 0;
  41     u[1] = b;
  42     for(i = 2 ; i < two_s; i += 2){
  43         u[i] = u[i >> 1] << 1; //even indices: left shift
  44         u[i + 1] = u[i] ^ b; //odd indices: xor b
  45     }
  46     //Multiply
  47     r[0] = u[a & smask]; //first window only affects lower word
  48     r[1] = 0;
  49     for(i = s ; i < 64 ; i += s){
  50         tmp = u[a >> i & smask];
  51         r[0] ^= tmp << i;
  52         r[1] ^= tmp >> (64 - i);
  53     }
  54     //Repair
  55     uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110
  56     for(i = 1 ; i < s ; i++){
  57         tmp = ((a & m) >> i);
  58         m &= m << 1; //shift mask to exclude all bit j': j' mod s = i
  59         ifmask = -((b >> (64-i)) & 1); //if the (64-i)th bit of b is 1
  60         r[1] ^= (tmp & ifmask);
  61     }
  62 }
  63
  64 static u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm)
  65 {
  66     uint64_t result[2];
  67     clmul64(*(uint64_t*)&a + (imm & 1), *(uint64_t*)&a + ((imm & 0x10) > 4), result);
  68     return *(__m128i *)result;
  69 }
  70
  71 static u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b)
  72 {
  73     uint16_t result[8];
  74     uint16_t *a = (uint16_t*)&_a, *b = (uint16_t*)&_b;
  75     for (int i = 0; i < 7; i ++)
  76     {
  77         result[i] = (uint16_t)((uint32_t)((((int32_t)a[i] * (int32_t)b[i]) >> 14) + 1) >> 1);
  78     }
  79 }
  80
  81 static inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
  82 {
  83     __m128i result;
  84     ((uint64_t *)&result)[0] = lo;
  85     ((uint64_t *)&result)[1] = hi;
  86     return result;
  87 }
  88
  89 static inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
  90 {
  91     __m128i result;
  92     ((uint64_t *)&result)[0] = lo;
  93     ((uint64_t *)&result)[1] = 0;
  94     return result;
  95 }
  96
  97 static inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
  98 {
  99     return *(int64_t *)&a;
 100 }
 101
 102 static inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
 103 {
 104     return *(int32_t *)&a;
 105 }
 106
 107 static inline u128 _mm_cvtsi32_si128_emu(uint32_t lo)
 108 {
 109     __m128i result;
 110     ((uint32_t *)&result)[0] = lo;
 111     ((uint32_t *)&result)[1] = 0;
 112     ((uint64_t *)&result)[1] = 0;
 113     return result;
 114 }
 115
 116 static u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
 117 {
 118     __m128i result;
 119     ((uint8_t *)&result)[0] = c0;
 120     ((uint8_t *)&result)[1] = c1;
 121     ((uint8_t *)&result)[2] = c2;
 122     ((uint8_t *)&result)[3] = c3;
 123     ((uint8_t *)&result)[4] = c4;
 124     ((uint8_t *)&result)[5] = c5;
 125     ((uint8_t *)&result)[6] = c6;
 126     ((uint8_t *)&result)[7] = c7;
 127     ((uint8_t *)&result)[8] = c8;
 128     ((uint8_t *)&result)[9] = c9;
 129     ((uint8_t *)&result)[10] = c10;
 130     ((uint8_t *)&result)[11] = c11;
 131     ((uint8_t *)&result)[12] = c12;
 132     ((uint8_t *)&result)[13] = c13;
 133     ((uint8_t *)&result)[14] = c14;
 134     ((uint8_t *)&result)[15] = c15;
 135     return result;
 136 }
 137
 138 static inline __m128i _mm_srli_si128_emu(__m128i a, int imm8)
 139 {
 140     uint8_t shift = imm8 & 0xff;
 141     shift = shift > 15 ? 128 : shift << 3;
 142     return a >> shift;
 143 }
 144
 145 inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b)
 146 {
 147     return a ^ b;
 148 }
 149
 150 inline __m128i _mm_load_si128_emu(const void *p)
 151 {
 152     return *(__m128i *)p;
 153 }
 154
 155 inline __m128i _mm_store_si128_emu(void *p, __m128i val)
 156 {
 157     *(__m128i *)p = val;
 158 }
 159
 160 __m128i _mm_shuffle_epi8_emu(__m128i a, __m128i b)
 161 {
 162     __m128i result;
 163     for (int i = 0; i < 16; i++)
 164     {
 165         if (((uint8_t *)&b)[i] & 0x80)
 166         {
 167             ((uint8_t *)&result)[i] = ((uint8_t *)&a)[((uint8_t *)&b)[i] & 0xf];
 168         }
 169         else
 170         {
 171             ((uint8_t *)&result)[i] = 0;
 172         }
 173     }
 174     return result;
 175 }
 176
 177 // portable
 178 static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) {
 179     const __m128i lengthvector = _mm_set_epi64x_emu(keylength,length);
 180     const __m128i clprod1 = _mm_clmulepi64_si128_emu( lengthvector, lengthvector, 0x10);
 181     return clprod1;
 182 }
 183
 184 // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
 185 static inline __m128i precompReduction64_si128_port( __m128i A) {
 186
 187     //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
 188     const __m128i C = _mm_cvtsi64_si128_emu((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
 189     __m128i Q2 = _mm_clmulepi64_si128_emu( A, C, 0x01);
 190     __m128i Q3 = _mm_shuffle_epi8_emu(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
 191                                   _mm_srli_si128_emu(Q2,8));
 192     __m128i Q4 = _mm_xor_si128_emu(Q2,A);
 193     const __m128i final = _mm_xor_si128_emu(Q3,Q4);
 194     return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE
 195 }
 196
 197 static inline uint64_t precompReduction64_port( __m128i A) {
 198     return _mm_cvtsi128_si64(precompReduction64_si128_port(A));
 199 }
 200
 201 // verus intermediate hash extra
 202 static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
 203 {
 204     __m128i const *pbuf;
 205
 206     // divide key mask by 16 from bytes to __m128i
 207     keyMask >>= 4;
 208
 209     // the random buffer must have at least 32 16 byte dwords after the keymask to work with this
 210     // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
 211     // be used to xor into the accumulator before it is hashed with other values first
 212     __m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2));
 213
 214     for (int64_t i = 0; i < 32; i++)
 215     {
 216         const uint64_t selector = _mm_cvtsi128_si64_emu(acc);
 217
 218         // get two random locations in the key, which will be mutated and swapped
 219         __m128i *prand = randomsource + ((selector >> 5) & keyMask);
 220         __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
 221
 222         // select random start and order of pbuf processing
 223         pbuf = buf + (selector & 3);
 224
 225         switch (selector & 0x1c)
 226         {
 227             case 0:
 228             {
 229                 const __m128i temp1 = _mm_load_si128_emu(prandex);
 230                 const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 231                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 232                 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 233                 acc = _mm_xor_si128_emu(clprod1, acc);
 234
 235                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
 236                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
 237
 238                 const __m128i temp12 = _mm_load_si128_emu(prand);
 239                 _mm_store_si128_emu(prand, tempa2);
 240
 241                 const __m128i temp22 = _mm_load_si128_emu(pbuf);
 242                 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
 243                 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
 244                 acc = _mm_xor_si128_emu(clprod12, acc);
 245
 246                 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
 247                 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
 248                 _mm_store_si128_emu(prandex, tempb2);
 249                 break;
 250             }
 251             case 4:
 252             {
 253                 const __m128i temp1 = _mm_load_si128_emu(prand);
 254                 const __m128i temp2 = _mm_load_si128_emu(pbuf);
 255                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 256                 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 257                 acc = _mm_xor_si128_emu(clprod1, acc);
 258                 const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
 259                 acc = _mm_xor_si128_emu(clprod2, acc);
 260
 261                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
 262                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
 263
 264                 const __m128i temp12 = _mm_load_si128_emu(prandex);
 265                 _mm_store_si128_emu(prandex, tempa2);
 266
 267                 const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 268                 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
 269                 acc = _mm_xor_si128_emu(add12, acc);
 270
 271                 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
 272                 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
 273                 _mm_store_si128_emu(prand, tempb2);
 274                 break;
 275             }
 276             case 8:
 277             {
 278                 const __m128i temp1 = _mm_load_si128_emu(prandex);
 279                 const __m128i temp2 = _mm_load_si128_emu(pbuf);
 280                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 281                 acc = _mm_xor_si128_emu(add1, acc);
 282
 283                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
 284                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
 285
 286                 const __m128i temp12 = _mm_load_si128_emu(prand);
 287                 _mm_store_si128_emu(prand, tempa2);
 288
 289                 const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 290                 const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
 291                 const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
 292                 acc = _mm_xor_si128_emu(clprod12, acc);
 293                 const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
 294                 acc = _mm_xor_si128_emu(clprod22, acc);
 295
 296                 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
 297                 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
 298                 _mm_store_si128_emu(prandex, tempb2);
 299                 break;
 300             }
 301             case 0x0c:
 302             {
 303                 const __m128i temp1 = _mm_load_si128_emu(prand);
 304                 const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 305                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 306
 307                 // cannot be zero here
 308                 const int32_t divisor = (uint32_t)selector;
 309
 310                 acc = _mm_xor_si128(add1, acc);
 311
 312                 const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
 313                 const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
 314                 acc = _mm_xor_si128_emu(modulo, acc);
 315
 316                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
 317                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
 318
 319                 if (dividend & 1)
 320                 {
 321                     const __m128i temp12 = _mm_load_si128_emu(prandex);
 322                     _mm_store_si128_emu(prandex, tempa2);
 323
 324                     const __m128i temp22 = _mm_load_si128_emu(pbuf);
 325                     const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
 326                     const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
 327                     acc = _mm_xor_si128_emu(clprod12, acc);
 328                     const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
 329                     acc = _mm_xor_si128_emu(clprod22, acc);
 330
 331                     const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
 332                     const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
 333                     _mm_store_si128_emu(prand, tempb2);
 334                 }
 335                 else
 336                 {
 337                     const __m128i tempb3 = _mm_load_si128_emu(prandex);
 338                     _mm_store_si128_emu(prandex, tempa2);
 339                     _mm_store_si128_emu(prand, tempb3);
 340                 }
 341                 break;
 342             }
 343             case 0x10:
 344             {
 345                 // a few AES operations
 346                 const __m128i *rc = prand;
 347                 __m128i tmp;
 348
 349                 __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 350                 __m128i temp2 = _mm_load_si128_emu(pbuf);
 351
 352                 AES2_EMU(temp1, temp2, 0);
 353                 MIX2_EMU(temp1, temp2);
 354
 355                 AES2_EMU(temp1, temp2, 4);
 356                 MIX2_EMU(temp1, temp2);
 357
 358                 AES2_EMU(temp1, temp2, 8);
 359                 MIX2_EMU(temp1, temp2);
 360
 361                 acc = _mm_xor_si128_emu(temp1, acc);
 362                 acc = _mm_xor_si128_emu(temp2, acc);
 363
 364                 const __m128i tempa1 = _mm_load_si128_emu(prand);
 365                 const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
 366                 const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
 367
 368                 const __m128i tempa4 = _mm_load_si128_emu(prandex);
 369                 _mm_store_si128_emu(prandex, tempa3);
 370                 _mm_store_si128_emu(prand, tempa4);
 371                 break;
 372             }
 373             case 0x14:
 374             {
 375                 // we'll just call this one the monkins loop, inspired by Chris
 376                 const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
 377                 __m128i tmp; // used by MIX2
 378
 379                 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
 380                 __m128i *rc = prand;
 381                 uint64_t aesround = 0;
 382                 __m128i onekey;
 383
 384                 do
 385                 {
 386                     if (selector & (0x10000000 << rounds))
 387                     {
 388                         onekey = _mm_load_si128_emu(rc++);
 389                         const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
 390                         const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
 391                         const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 392                         acc = _mm_xor_si128_emu(clprod1, acc);
 393                     }
 394                     else
 395                     {
 396                         onekey = _mm_load_si128_emu(rc++);
 397                         __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
 398                         AES2_EMU(onekey, temp2, aesround++ << 2);
 399                         MIX2_EMU(onekey, temp2);
 400                         acc = _mm_xor_si128_emu(onekey, acc);
 401                         acc = _mm_xor_si128_emu(temp2, acc);
 402                     }
 403                 } while (rounds--);
 404
 405                 const __m128i tempa1 = _mm_load_si128_emu(prand);
 406                 const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
 407                 const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
 408
 409                 const __m128i tempa4 = _mm_load_si128_emu(prandex);
 410                 _mm_store_si128_emu(prandex, tempa3);
 411                 _mm_store_si128_emu(prand, tempa4);
 412                 break;
 413             }
 414             case 0x18:
 415             {
 416                 const __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
 417                 const __m128i temp2 = _mm_load_si128_emu(prand);
 418                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 419                 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 420                 acc = _mm_xor_si128_emu(clprod1, acc);
 421
 422                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
 423                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
 424
 425                 const __m128i tempb3 = _mm_load_si128_emu(prandex);
 426                 _mm_store_si128_emu(prandex, tempa2);
 427                 _mm_store_si128_emu(prand, tempb3);
 428                 break;
 429             }
 430             case 0x1c:
 431             {
 432                 const __m128i temp1 = _mm_load_si128_emu(pbuf);
 433                 const __m128i temp2 = _mm_load_si128_emu(prandex);
 434                 const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
 435                 const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
 436                 acc = _mm_xor_si128_emu(clprod1, acc);
 437
 438                 const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
 439                 const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
 440
 441                 const __m128i tempa3 = _mm_load_si128_emu(prand);
 442                 _mm_store_si128_emu(prand, tempa2);
 443
 444                 acc = _mm_xor_si128_emu(tempa3, acc);
 445
 446                 const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
 447                 const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
 448                 _mm_store_si128_emu(prandex, tempb2);
 449                 break;
 450             }
 451         }
 452     }
 453     return acc;
 454 }
 455
 456 // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
 457 // returning a 64 bit hash value
 458 uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask) {
 459     const unsigned int  m = 128;// we process the data in chunks of 16 cache lines
 460     __m128i * rs64 = (__m128i *)random;
 461     const __m128i * string = (const __m128i *) buf;
 462
 463     __m128i  acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask);
 464     acc = _mm_xor_si128(acc, lazyLengthHash_port(1024, 64));
 465     return precompReduction64_port(acc);
 466 }
 467