// Per-thread size, in bytes, of the Verus CLHash key buffer.
// NOTE(review): initialized by hasher setup code not visible in this chunk — confirm.
thread_local int64_t verusclhasher_keySizeInBytes;
// Per-thread seed from which the hashing key is generated.
thread_local uint256 verusclhasher_seed;
+class uint128 : public base_blob<128> {
+public:
+ uint128() {}
+ uint128(const base_blob<128>& b) : base_blob<128>(b) {}
+ explicit uint128(const std::vector<unsigned char>& vch) : base_blob<128>(vch) {}
+ uint128(__m128i &a) { }
+
+ operator __m128i&() { return *(__m128i *)this; }
+ uint128 &operator()(__m128i &a) { return *(uint128 *)&a; }
+};
+
// Carryless (GF(2) polynomial) multiply of two 64-bit operands, producing a
// 128-bit product in r[0] (low word) and r[1] (high word). Uses the classic
// 4-bit-window precomputation method with a final repair pass.
void clmul64(uint64_t a, uint64_t b, uint64_t* r)
{
    const uint8_t window = 4;                    // window size in bits
    const uint64_t table_size = 1 << window;     // 2^4 precomputed products
    const uint64_t window_mask = table_size - 1; // low 4 bits
    uint64_t table[16];
    uint8_t j;

    // Precompute table[i] = (polynomial i) * b over GF(2).
    table[0] = 0;
    table[1] = b;
    for (j = 2; j < table_size; j += 2)
    {
        table[j] = table[j >> 1] << 1; // even index: shift the half entry
        table[j + 1] = table[j] ^ b;   // odd index: add (xor) b
    }

    // Multiply window by window; the first window only touches the low word.
    r[0] = table[a & window_mask];
    r[1] = 0;
    for (j = window; j < 64; j += window)
    {
        const uint64_t partial = table[a >> j & window_mask];
        r[0] ^= partial << j;
        r[1] ^= partial >> (64 - j);
    }

    // Repair pass: table entries silently dropped bits shifted past bit 63 of
    // b's contribution; fold those back into the high word.
    uint64_t repair_mask = 0xEEEEEEEEEEEEEEEE; // every bit position p with p % 4 != 0
    for (j = 1; j < window; j++)
    {
        const uint64_t shifted = ((a & repair_mask) >> j);
        repair_mask &= repair_mask << 1; // drop positions p with p % 4 == j
        const uint64_t all_ones_if_set = -((b >> (64 - j)) & 1); // mask iff bit (64-j) of b set
        r[1] ^= (shifted & all_ones_if_set);
    }
}
+
+static u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm)
+{
+ uint64_t result[2];
+ clmul64(*(uint64_t*)&a + (imm & 1), *(uint64_t*)&a + ((imm & 0x10) > 4), result);
+ return *(__m128i *)result;
+}
+
+static u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b)
+{
+ uint16_t result[8];
+ uint16_t *a = (uint16_t*)&_a, *b = (uint16_t*)&_b;
+ for (int i = 0; i < 7; i ++)
+ {
+ result[i] = (uint16_t)((uint32_t)((((int32_t)a[i] * (int32_t)b[i]) >> 14) + 1) >> 1);
+ }
+}
+
+static inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
+{
+ __m128i result;
+ ((uint64_t *)&result)[0] = lo;
+ ((uint64_t *)&result)[1] = hi;
+ return result;
+}
+
+static inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
+{
+ __m128i result;
+ ((uint64_t *)&result)[0] = lo;
+ ((uint64_t *)&result)[1] = 0;
+ return result;
+}
+
// Emulates _mm_cvtsi128_si64: return the low 64 bits of a as a signed value.
static inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
{
    return *(int64_t *)&a;
}
+
// Emulates _mm_cvtsi128_si32: return the low 32 bits of a as a signed value.
static inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
{
    return *(int32_t *)&a;
}
+
+static inline u128 _mm_cvtsi32_si128_emu(uint32_t lo)
+{
+ __m128i result;
+ ((uint32_t *)&result)[0] = lo;
+ ((uint32_t *)&result)[1] = 0;
+ ((uint64_t *)&result)[1] = 0;
+ return result;
+}
+
+static u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
+{
+ __m128i result;
+ ((uint8_t *)&result)[0] = c0;
+ ((uint8_t *)&result)[1] = c1;
+ ((uint8_t *)&result)[2] = c2;
+ ((uint8_t *)&result)[3] = c3;
+ ((uint8_t *)&result)[4] = c4;
+ ((uint8_t *)&result)[5] = c5;
+ ((uint8_t *)&result)[6] = c6;
+ ((uint8_t *)&result)[7] = c7;
+ ((uint8_t *)&result)[8] = c8;
+ ((uint8_t *)&result)[9] = c9;
+ ((uint8_t *)&result)[10] = c10;
+ ((uint8_t *)&result)[11] = c11;
+ ((uint8_t *)&result)[12] = c12;
+ ((uint8_t *)&result)[13] = c13;
+ ((uint8_t *)&result)[14] = c14;
+ ((uint8_t *)&result)[15] = c15;
+ return result;
+}
+
+static inline __m128i _mm_srli_si128_emu(__m128i a, int imm8)
+{
+ uint8_t shift = (uint32_t)imm8 & 0xff;
+ shift = shift > 15 ? 128 : shift << 3;
+ return a >> shift;
+}
+
// Emulates _mm_xor_si128: bitwise xor of two 128-bit values.
// NOTE(review): relies on __m128i supporting operator^ in this build
// (vector extension or 128-bit integer typedef) — confirm the typedef.
inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b)
{
    return a ^ b;
}
+
// Emulates _mm_load_si128: read a 16-byte value from p.
// NOTE(review): assumes p is adequately aligned for __m128i — confirm callers.
inline __m128i _mm_load_si128_emu(const void *p)
{
    return *(__m128i *)p;
}
+
+inline __m128i _mm_store_si128_emu(void *p, __m128i val)
+{
+ *(__m128i *)p = val;
+}
+
+// portable
+static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) {
+ const __m128i lengthvector = _mm_set_epi64x_emu(keylength,length);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu( lengthvector, lengthvector, 0x10);
+ return clprod1;
+}
+
+// modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
+static inline __m128i precompReduction64_si128_port( __m128i A) {
+
+ //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
+ const __m128i C = _mm_cvtsi64_si128_emu((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
+ __m128i Q2 = _mm_clmulepi64_si128_emu( A, C, 0x01);
+ __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
+ _mm_srli_si128_emu(Q2,8));
+ __m128i Q4 = _mm_xor_si128_emu(Q2,A);
+ const __m128i final = _mm_xor_si128_emu(Q3,Q4);
+ return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE
+}
+
+static inline uint64_t precompReduction64_port( __m128i A) {
+ return _mm_cvtsi128_si64(precompReduction64_si128_port(A));
+}
+
+// verus intermediate hash extra
+static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
+{
+ __m128i acc = _mm_cvtsi64_si128_emu(0);
+
+ __m128i const *pbuf = buf;
+
+ // divide key mask by 32 from bytes to __m128i
+ keyMask >>= 5;
+
+ for (int64_t i = 0; i < 32; i++)
+ {
+ const uint64_t selector = _mm_cvtsi128_si64_emu(acc);
+
+ // get two random locations in the key, which will be mutated and swapped
+ __m128i *prand = randomsource + ((selector >> 5) & keyMask);
+ __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
+
+ // select random start and order of pbuf processing
+ pbuf = buf + (selector & 3);
+
+ switch (selector & 0x1c)
+ {
+ case 0:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(prandex);
+ const __m128i temp2 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+ const __m128i temp12 = _mm_load_si128_emu(prand);
+ _mm_store_si128_emu(prand, tempa2);
+
+ const __m128i temp22 = _mm_load_si128_emu(pbuf);
+ const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+ const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+ acc = _mm_xor_si128_emu(clprod12, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+ _mm_store_si128_emu(prandex, tempb2);
+ break;
+ }
+ case 4:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(prand);
+ const __m128i temp2 = _mm_load_si128_emu(pbuf);
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+ const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
+ acc = _mm_xor_si128_emu(clprod2, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+ const __m128i temp12 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa2);
+
+ const __m128i temp22 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+ acc = _mm_xor_si128_emu(add12, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+ _mm_store_si128_emu(prand, tempb2);
+ break;
+ }
+ case 8:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(prandex);
+ const __m128i temp2 = _mm_load_si128_emu(pbuf);
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ acc = _mm_xor_si128_emu(add1, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+ const __m128i temp12 = _mm_load_si128_emu(prand);
+ _mm_store_si128_emu(prand, tempa2);
+
+ const __m128i temp22 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+ const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+ acc = _mm_xor_si128_emu(clprod12, acc);
+ const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
+ acc = _mm_xor_si128_emu(clprod22, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+ _mm_store_si128_emu(prandex, tempb2);
+ break;
+ }
+ case 0x0c:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(prand);
+ const __m128i temp2 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+
+ // cannot be zero here
+ const int32_t divisor = _mm_cvtsi128_si32_emu(acc);
+
+ acc = _mm_xor_si128(add1, acc);
+
+ const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
+ const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
+ acc = _mm_xor_si128_emu(modulo, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+ if (dividend & 1)
+ {
+ const __m128i temp12 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa2);
+
+ const __m128i temp22 = _mm_load_si128_emu(pbuf);
+ const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+ const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+ acc = _mm_xor_si128_emu(clprod12, acc);
+ const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
+ acc = _mm_xor_si128_emu(clprod22, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+ _mm_store_si128_emu(prand, tempb2);
+ }
+ else
+ {
+ const __m128i tempb3 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa2);
+ _mm_store_si128_emu(prand, tempb3);
+ }
+ break;
+ }
+ case 0x10:
+ {
+ // a few AES operations
+ const __m128i *rc = prand;
+ __m128i tmp;
+
+ __m128i temp1 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ __m128i temp2 = _mm_load_si128_emu(pbuf);
+
+ AES2_EMU(temp1, temp2, 0);
+ MIX2_EMU(temp1, temp2);
+
+ AES2_EMU(temp1, temp2, 4);
+ MIX2_EMU(temp1, temp2);
+
+ AES2_EMU(temp1, temp2, 8);
+ MIX2_EMU(temp1, temp2);
+
+ acc = _mm_xor_si128_emu(temp1, acc);
+ acc = _mm_xor_si128_emu(temp2, acc);
+
+ const __m128i tempa1 = _mm_load_si128_emu(prand);
+ const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
+ const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
+
+ const __m128i tempa4 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa3);
+ _mm_store_si128_emu(prand, tempa4);
+ break;
+ }
+ case 0x14:
+ {
+ // we'll just call this one the monkins loop, inspired by Chris
+ const __m128i *buftmp = pbuf + (((selector & 1) << 1) - 1);
+ __m128i tmp; // used by MIX2
+
+ uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
+ __m128i *pkey = prand;
+ uint64_t aesround = 0;
+ __m128i onekey;
+
+ do
+ {
+ if (selector & (0x10000000 << rounds))
+ {
+ onekey = _mm_load_si128_emu(pkey++);
+ const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
+ const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+ }
+ else
+ {
+ const __m128i *rc = pkey++;
+ onekey = _mm_load_si128_emu(rc);
+ __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
+ AES2_EMU(onekey, temp2, aesround++ << 2);
+ MIX2_EMU(onekey, temp2);
+ acc = _mm_xor_si128_emu(onekey, acc);
+ acc = _mm_xor_si128_emu(temp2, acc);
+ }
+ } while (rounds--);
+
+ const __m128i tempa1 = _mm_load_si128_emu(prand);
+ const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
+ const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
+
+ const __m128i tempa4 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa3);
+ _mm_store_si128_emu(prand, tempa4);
+ break;
+ }
+ case 0x18:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(pbuf + (((selector & 1) << 1) - 1));
+ const __m128i temp2 = _mm_load_si128_emu(prand);
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
+
+ const __m128i tempb3 = _mm_load_si128_emu(prandex);
+ _mm_store_si128_emu(prandex, tempa2);
+ _mm_store_si128_emu(prand, tempb3);
+ break;
+ }
+ case 0x1c:
+ {
+ const __m128i temp1 = _mm_load_si128_emu(pbuf);
+ const __m128i temp2 = _mm_load_si128_emu(prandex);
+ const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+ const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+ acc = _mm_xor_si128_emu(clprod1, acc);
+
+ const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
+ const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
+
+ const __m128i tempa3 = _mm_load_si128_emu(prand);
+ _mm_store_si128_emu(prand, tempa2);
+
+ acc = _mm_xor_si128_emu(tempa3, acc);
+
+ const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
+ const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
+ _mm_store_si128_emu(prandex, tempb2);
+ break;
+ }
+ }
+ }
+ return acc;
+}
+
+// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
+// returning a 64 bit hash value
+uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask) {
+ const unsigned int m = 128;// we process the data in chunks of 16 cache lines
+ __m128i * rs64 = (__m128i *)random;
+ const __m128i * string = (const __m128i *) buf;
+
+ __m128i acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask);
+ acc = _mm_xor_si128(acc, lazyLengthHash_port(1024, 64));
+ return precompReduction64_port(acc);
+}
+
// multiply the length and the some key, no modulo
static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) {
const __m128i lengthvector = _mm_set_epi64x(keylength,length);
//const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
__m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01);
- __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
- _mm_srli_si128(Q2,8));
+ __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153),
+ _mm_srli_si128_emu(Q2,8));
__m128i Q4 = _mm_xor_si128(Q2,A);
const __m128i final = _mm_xor_si128(Q3,Q4);
return final;/// WARNING: HIGH 64 BITS CONTAIN GARBAGE