// Per-thread size, in bytes, of the Verus CLHash key buffer.
// NOTE(review): initialized by hasher setup code not visible in this chunk — confirm.
thread_local int64_t verusclhasher_keySizeInBytes;
// Per-thread seed from which the hashing key is generated.
thread_local uint256 verusclhasher_seed;
+class uint128 : public base_blob<128> {
+public:
+ uint128() {}
+ uint128(const base_blob<128>& b) : base_blob<128>(b) {}
+ explicit uint128(const std::vector<unsigned char>& vch) : base_blob<128>(vch) {}
+ uint128(__m128i &a) { }
+
+ operator __m128i&() { return *(__m128i *)this; }
+ uint128 &operator()(__m128i &a) { return *(uint128 *)&a; }
+};
+
// Carryless (GF(2) polynomial) multiply of two 64-bit operands, producing a
// 128-bit product in r[0] (low word) and r[1] (high word). Uses the classic
// 4-bit-window precomputation method with a final repair pass.
void clmul64(uint64_t a, uint64_t b, uint64_t* r)
{
    const uint8_t window = 4;                    // window size in bits
    const uint64_t table_size = 1 << window;     // 2^4 precomputed products
    const uint64_t window_mask = table_size - 1; // low 4 bits
    uint64_t table[16];
    uint8_t j;

    // Precompute table[i] = (polynomial i) * b over GF(2).
    table[0] = 0;
    table[1] = b;
    for (j = 2; j < table_size; j += 2)
    {
        table[j] = table[j >> 1] << 1; // even index: shift the half entry
        table[j + 1] = table[j] ^ b;   // odd index: add (xor) b
    }

    // Multiply window by window; the first window only touches the low word.
    r[0] = table[a & window_mask];
    r[1] = 0;
    for (j = window; j < 64; j += window)
    {
        const uint64_t partial = table[a >> j & window_mask];
        r[0] ^= partial << j;
        r[1] ^= partial >> (64 - j);
    }

    // Repair pass: table entries silently dropped bits shifted past bit 63 of
    // b's contribution; fold those back into the high word.
    uint64_t repair_mask = 0xEEEEEEEEEEEEEEEE; // every bit position p with p % 4 != 0
    for (j = 1; j < window; j++)
    {
        const uint64_t shifted = ((a & repair_mask) >> j);
        repair_mask &= repair_mask << 1; // drop positions p with p % 4 == j
        const uint64_t all_ones_if_set = -((b >> (64 - j)) & 1); // mask iff bit (64-j) of b set
        r[1] ^= (shifted & all_ones_if_set);
    }
}
+
+static u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm)
+{
+ uint64_t result[2];
+ clmul64(*(uint64_t*)&a + (imm & 1), *(uint64_t*)&a + ((imm & 0x10) > 4), result);
+ return *(__m128i *)result;
+}
+
+static u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b)
+{
+ uint16_t result[8];
+ uint16_t *a = (uint16_t*)&_a, *b = (uint16_t*)&_b;
+ for (int i = 0; i < 7; i ++)
+ {
+ result[i] = (uint16_t)((uint32_t)((((int32_t)a[i] * (int32_t)b[i]) >> 14) + 1) >> 1);
+ }
+}
+
+static inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
+{
+ __m128i result;
+ ((uint64_t *)&result)[0] = lo;
+ ((uint64_t *)&result)[1] = hi;
+ return result;
+}
+
+static inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
+{
+ __m128i result;
+ ((uint64_t *)&result)[0] = lo;
+ ((uint64_t *)&result)[1] = 0;
+ return result;
+}
+
// Emulates _mm_cvtsi128_si64: return the low 64 bits of a as a signed value.
static inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
{
    return *(int64_t *)&a;
}
+
// Emulates _mm_cvtsi128_si32: return the low 32 bits of a as a signed value.
static inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
{
    return *(int32_t *)&a;
}
+
+static inline u128 _mm_cvtsi32_si128_emu(uint32_t lo)
+{
+ __m128i result;
+ ((uint32_t *)&result)[0] = lo;
+ ((uint32_t *)&result)[1] = 0;
+ ((uint64_t *)&result)[1] = 0;
+ return result;
+}
+
+static u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
+{
+ __m128i result;
+ ((uint8_t *)&result)[0] = c0;
+ ((uint8_t *)&result)[1] = c1;
+ ((uint8_t *)&result)[2] = c2;
+ ((uint8_t *)&result)[3] = c3;
+ ((uint8_t *)&result)[4] = c4;
+ ((uint8_t *)&result)[5] = c5;
+ ((uint8_t *)&result)[6] = c6;
+ ((uint8_t *)&result)[7] = c7;
+ ((uint8_t *)&result)[8] = c8;
+ ((uint8_t *)&result)[9] = c9;
+ ((uint8_t *)&result)[10] = c10;
+ ((uint8_t *)&result)[11] = c11;
+ ((uint8_t *)&result)[12] = c12;
+ ((uint8_t *)&result)[13] = c13;
+ ((uint8_t *)&result)[14] = c14;
+ ((uint8_t *)&result)[15] = c15;
+ return result;
+}
+
+static inline __m128i _mm_srli_si128_emu(__m128i a, int imm8)
+{
+ uint8_t shift = (uint32_t)imm8 & 0xff;
+ shift = shift > 15 ? 128 : shift << 3;
+ return a >> shift;
+}
+
// Emulates _mm_xor_si128: bitwise xor of two 128-bit values.
// NOTE(review): relies on __m128i supporting operator^ in this build
// (vector extension or 128-bit integer typedef) — confirm the typedef.
inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b)
{
    return a ^ b;
}
+
// Emulates _mm_load_si128: read a 16-byte value from p.
// NOTE(review): assumes p is adequately aligned for __m128i — confirm callers.
inline __m128i _mm_load_si128_emu(const void *p)
{
    return *(__m128i *)p;
}
+
+inline __m128i _mm_store_si128_emu(void *p, __m128i val)
+{
+ *(__m128i *)p = val;
+}
+
+// portable
+static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) {
+ const __m128i lengthvector = _mm_set_epi64x_emu(keylength,length);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu( lengthvector, lengthvector, 0x10);
+ return clprod1;
+}
+
+// modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
+static inline __m128i precompReduction64_si128_port( __m128i A) {
+
+ //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
+ const __m128i C = _mm_cvtsi64_si128_emu((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
+ __m128i Q2 = _mm_clmulepi64_si128_emu( A, C, 0x01);
+ __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
+ _mm_srli_si128_emu(Q2,8));
+ __m128i Q4 = _mm_xor_si128_emu(Q2,A);
+ const __m128i final = _mm_xor_si128_emu(Q3,Q4);
+ return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE
+}
+
+static inline uint64_t precompReduction64_port( __m128i A) {
+ return _mm_cvtsi128_si64(precompReduction64_si128_port(A));
+}
+
+// verus intermediate hash extra
+static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
+{
+ __m128i acc = _mm_cvtsi64_si128_emu(0);
+
+ __m128i const *pbuf = buf;
+
+ // divide key mask by 32 from bytes to __m128i
+ keyMask >>= 5;
+
+ for (int64_t i = 0; i < 32; i++)
+ {
+ const uint64_t selector = _mm_cvtsi128_si64_emu(acc);
+
+ // get two random locations in the key, which will be mutated and swapped
+ __m128i *prand = randomsource + ((selector >> 5) & keyMask);
+ __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
+
+ // select random start and order of pbuf processing
+ pbuf = buf + (selector & 3);
+
+ switch (selector & 0x1c)
+ {
+ case 0:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(prandex);
+ const __m128i temp2 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+ const __m128i temp12 = _mm_load_si128_emu(prand);
+ _mm_store_si128_emu(prand, tempa2);
+
+ const __m128i temp22 = _mm_load_si128_emu(pbuf);
+ const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+ const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+ acc = _mm_xor_si128_emu(clprod12, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+ _mm_store_si128_emu(prandex, tempb2);
+ break;
+ }
+ case 4:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(prand);
+ const __m128i temp2 = _mm_load_si128_emu(pbuf);
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+ const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
+ acc = _mm_xor_si128_emu(clprod2, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+ const __m128i temp12 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa2);
+
+ const __m128i temp22 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+ acc = _mm_xor_si128_emu(add12, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+ _mm_store_si128_emu(prand, tempb2);
+ break;
+ }
+ case 8:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(prandex);
+ const __m128i temp2 = _mm_load_si128_emu(pbuf);
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ acc = _mm_xor_si128_emu(add1, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+ const __m128i temp12 = _mm_load_si128_emu(prand);
+ _mm_store_si128_emu(prand, tempa2);
+
+ const __m128i temp22 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+ const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+ acc = _mm_xor_si128_emu(clprod12, acc);
+ const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
+ acc = _mm_xor_si128_emu(clprod22, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+ _mm_store_si128_emu(prandex, tempb2);
+ break;
+ }
+ case 0x0c:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(prand);
+ const __m128i temp2 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+
+ // cannot be zero here
+ const int32_t divisor = _mm_cvtsi128_si32_emu(acc);
+
+ acc = _mm_xor_si128(add1, acc);
+
+ const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
+ const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
+ acc = _mm_xor_si128_emu(modulo, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+ if (dividend & 1)
+ {
+ const __m128i temp12 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa2);
+
+ const __m128i temp22 = _mm_load_si128_emu(pbuf);
+ const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+ const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+ acc = _mm_xor_si128_emu(clprod12, acc);
+ const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
+ acc = _mm_xor_si128_emu(clprod22, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+ _mm_store_si128_emu(prand, tempb2);
+ }
+ else
+ {
+ const __m128i tempb3 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa2);
+ _mm_store_si128_emu(prand, tempb3);
+ }
+ break;
+ }
+ case 0x10:
+ {
+ // a few AES operations
+ const __m128i *rc = prand;
+ __m128i tmp;
+
+ __m128i temp1 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ __m128i temp2 = _mm_load_si128_emu(pbuf);
+
+ AES2_EMU(temp1, temp2, 0);
+ MIX2_EMU(temp1, temp2);
+
+ AES2_EMU(temp1, temp2, 4);
+ MIX2_EMU(temp1, temp2);
+
+ AES2_EMU(temp1, temp2, 8);
+ MIX2_EMU(temp1, temp2);
+
+ acc = _mm_xor_si128_emu(temp1, acc);
+ acc = _mm_xor_si128_emu(temp2, acc);
+
+ const __m128i tempa1 = _mm_load_si128_emu(prand);
+ const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
+ const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
+
+ const __m128i tempa4 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa3);
+ _mm_store_si128_emu(prand, tempa4);
+ break;
+ }
+ case 0x14:
+ {
+ // we'll just call this one the monkins loop, inspired by Chris
+ const __m128i *buftmp = pbuf + (((selector & 1) << 1) - 1);
+ __m128i tmp; // used by MIX2
+
+ uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
+ __m128i *pkey = prand;
+ uint64_t aesround = 0;
+ __m128i onekey;
+
+ do
+ {
+ if (selector & (0x10000000 << rounds))
+ {
+ onekey = _mm_load_si128_emu(pkey++);
+ const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
+ const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+ }
+ else
+ {
+ const __m128i *rc = pkey++;
+ onekey = _mm_load_si128_emu(rc);
+ __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
+ AES2_EMU(onekey, temp2, aesround++ << 2);
+ MIX2_EMU(onekey, temp2);
+ acc = _mm_xor_si128_emu(onekey, acc);
+ acc = _mm_xor_si128_emu(temp2, acc);
+ }
+ } while (rounds--);
+
+ const __m128i tempa1 = _mm_load_si128_emu(prand);
+ const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
+ const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
+
+ const __m128i tempa4 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa3);
+ _mm_store_si128_emu(prand, tempa4);
+ break;
+ }
+ case 0x18:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i temp2 = _mm_load_si128_emu(prand);
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
+
+ const __m128i tempb3 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa2);
+ _mm_store_si128_emu(prand, tempb3);
+ break;
+ }
+ case 0x1c:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(pbuf);
+ const __m128i temp2 = _mm_load_si128_emu(prandex);
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
+
+ const __m128i tempa3 = _mm_load_si128_emu(prand);
+ _mm_store_si128_emu(prand, tempa2);
+
+ acc = _mm_xor_si128_emu(tempa3, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
+ _mm_store_si128_emu(prandex, tempb2);
+ break;
+ }
+ }
+ }
+ return acc;
+}
+
+// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
+// returning a 64 bit hash value
+uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask) {
+ const unsigned int m = 128;// we process the data in chunks of 16 cache lines
+ __m128i * rs64 = (__m128i *)random;
+ const __m128i * string = (const __m128i *) buf;
+
+ __m128i acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask);
+ acc = _mm_xor_si128(acc, lazyLengthHash_port(1024, 64));
+ return precompReduction64_port(acc);
+}
+
// multiply the length and the some key, no modulo
static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) {
const __m128i lengthvector = _mm_set_epi64x(keylength,length);
//const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
__m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01);
- __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
- _mm_srli_si128(Q2,8));
+ __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
+ _mm_srli_si128_emu(Q2,8));
__m128i Q4 = _mm_xor_si128(Q2,A);
const __m128i final = _mm_xor_si128(Q3,Q4);
return final;/// WARNING: HIGH 64 BITS CONTAIN GARBAGE