Git Repo - cpuminer-multi.git/commitdiff
Scrypt-Jane algorithm from nicehash
author nicehashdev <[email protected]>
Sat, 17 Oct 2015 11:36:02 +0000 (13:36 +0200)
committer Tanguy Pruvot <[email protected]>
Mon, 7 Dec 2015 16:42:37 +0000 (17:42 +0100)
30 files changed:
Makefile.am
algo/scrypt-jane.c [new file with mode: 0644]
cpu-miner.c
cpuminer.vcxproj
cpuminer.vcxproj.filters
miner.h
scryptjane/scrypt-conf.h [new file with mode: 0644]
scryptjane/scrypt-jane-chacha.h
scryptjane/scrypt-jane-hash_blake256.h [new file with mode: 0644]
scryptjane/scrypt-jane-hash_blake512.h [new file with mode: 0644]
scryptjane/scrypt-jane-hash_sha512.h [new file with mode: 0644]
scryptjane/scrypt-jane-hash_skein512.h [new file with mode: 0644]
scryptjane/scrypt-jane-mix_chacha-avx.h
scryptjane/scrypt-jane-mix_chacha-sse2.h
scryptjane/scrypt-jane-mix_chacha-ssse3.h
scryptjane/scrypt-jane-mix_salsa-avx.h
scryptjane/scrypt-jane-mix_salsa-sse2.h
scryptjane/scrypt-jane-mix_salsa64-avx.h [new file with mode: 0644]
scryptjane/scrypt-jane-mix_salsa64-sse2.h [new file with mode: 0644]
scryptjane/scrypt-jane-mix_salsa64-ssse3.h [new file with mode: 0644]
scryptjane/scrypt-jane-mix_salsa64.h [new file with mode: 0644]
scryptjane/scrypt-jane-pbkdf2.h
scryptjane/scrypt-jane-portable-x86.h
scryptjane/scrypt-jane-portable.h
scryptjane/scrypt-jane-romix-basic.h
scryptjane/scrypt-jane-romix-template.h
scryptjane/scrypt-jane-salsa.h
scryptjane/scrypt-jane-salsa64.h [new file with mode: 0644]
scryptjane/scrypt-jane-test-vectors.h
util.c
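
For context on how the new algorithm is exposed: the cpu-miner.c changes below register it as "scryptjane:N", where the N suffix is the Nfactor (scanhash_scryptjane computes the actual scrypt N as 1 << (Nfactor + 1)) and defaults to 5 when omitted. An illustrative invocation, assuming the usual -a/--algo switch and placeholder pool/credentials, might look like:

        ./cpuminer -a scryptjane:10 -o stratum+tcp://pool.example.invalid:3333 -u worker -p x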

diff --git a/Makefile.am b/Makefile.am
index df19e35d144685cc08c7bfa7be025a1d4a73c966..296af68472f171d84004e8288314f7693adf246c 100644 (file)
@@ -77,6 +77,7 @@ cpuminer_SOURCES = \
   algo/pluck.c \
   algo/qubit.c \
   algo/scrypt.c \
+  algo/scrypt-jane.c \
   algo/sha2.c \
   algo/sibcoin.c \
   algo/skein.c \
@@ -117,6 +118,7 @@ cpuminer_CFLAGS   = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
 
 if HAVE_WINDOWS
 cpuminer_CFLAGS += -Wl,--stack,10485760
+cpuminer_LDADD += -lcrypt32 -lgdi32
 endif
 
 if HAVE_WINDOWS
diff --git a/algo/scrypt-jane.c b/algo/scrypt-jane.c
new file mode 100644 (file)
index 0000000..acdd7b7
--- /dev/null
@@ -0,0 +1,190 @@
+#include "miner.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "inttypes.h"
+
+/* Hard-coded scrypt parameters r and p - mikaelh */
+#define SCRYPT_R 1
+#define SCRYPT_P 1
+
+/* Only the intrinsics versions are optimized for hard-coded values - mikaelh */
+#define CPU_X86_FORCE_INTRINSICS
+#define SCRYPT_KECCAK512
+#define SCRYPT_CHACHA
+#define SCRYPT_CHOOSE_COMPILETIME
+
+//#include "scrypt-jane.h"
+#include "../scryptjane/scrypt-jane-portable.h"
+#include "../scryptjane/scrypt-jane-hash.h"
+#include "../scryptjane/scrypt-jane-romix.h"
+#include "../scryptjane/scrypt-jane-test-vectors.h"
+
+
+#define scrypt_maxN 30  /* (1 << (30 + 1)) = ~2 billion */
+#if (SCRYPT_BLOCK_BYTES == 64)
+#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 128)
+#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 256)
+#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 512)
+#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
+#endif
+#define scrypt_maxr scrypt_r_32kb /* 32kb */
+#define scrypt_maxp 25  /* (1 << 25) = ~33 million */
+
+typedef struct scrypt_aligned_alloc_t {
+       uint8_t *mem, *ptr;
+} scrypt_aligned_alloc;
+
+static int
+scrypt_alloc(uint64_t size, scrypt_aligned_alloc *aa) {
+       static const size_t max_alloc = (size_t)-1;
+       size += (SCRYPT_BLOCK_BYTES - 1);
+       if (size > max_alloc)
+               return 0; // scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
+       aa->mem = (uint8_t *)malloc((size_t)size);
+       aa->ptr = (uint8_t *)(((size_t)aa->mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+       if (!aa->mem)
+               return 0; // scrypt_fatal_error("scrypt: out of memory");
+       return 1;
+}
+
+static void
+scrypt_free(scrypt_aligned_alloc *aa) {
+       free(aa->mem);
+}
+
+void
+scrypt_N_1_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) {
+       uint32_t chunk_bytes, i;
+       const uint32_t r = SCRYPT_R;
+       const uint32_t p = SCRYPT_P;
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+       scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
+#endif
+
+       chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
+
+       /* 1: X = PBKDF2(password, salt) */
+       scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, chunk_bytes * p);
+
+       /* 2: X = ROMix(X) */
+       for (i = 0; i < p; i++)
+               scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N);
+
+       /* 3: Out = PBKDF2(password, X) */
+       scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes);
+
+#ifdef SCRYPT_PREVENT_STATE_LEAK
+       /* This is an unnecessary security feature - mikaelh */
+       scrypt_ensure_zero(Y, (p + 1) * chunk_bytes);
+#endif
+}
+
+
+//  increasing Nfactor gradually
+const unsigned char minNfactor = 4;
+const unsigned char maxNfactor = 30;
+
+unsigned char GetNfactor(unsigned int nTimestamp, unsigned int ntime) {
+       int l = 0;
+       unsigned long int s;
+       int n;
+       unsigned char N;
+
+       if (nTimestamp <= ntime)
+               return 4;
+
+       s = nTimestamp - ntime;
+       while ((s >> 1) > 3) {
+               l += 1;
+               s >>= 1;
+       }
+
+       s &= 3;
+
+       n = (l * 170 + s * 25 - 2320) / 100;
+
+       if (n < 0) n = 0;
+
+       if (n > 255) {
+               n = 255;
+               // printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n);
+       }
+
+       N = (unsigned char)n;
+       //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfactor), maxNfactor));
+
+       if (N<minNfactor) return minNfactor;
+       if (N>maxNfactor) return maxNfactor;
+       return N;
+}
+
+
+int scanhash_scryptjane(int Nfactor, int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+       uint32_t max_nonce, uint64_t *hashes_done)
+{
+       scrypt_aligned_alloc YX, V;
+       uint8_t *X, *Y;
+       uint32_t N, chunk_bytes;
+       const uint32_t r = SCRYPT_R;
+       const uint32_t p = SCRYPT_P;
+
+       uint32_t _ALIGN(64) endiandata[20];
+       const uint32_t first_nonce = pdata[19];
+       uint32_t nonce = first_nonce;
+
+       if (opt_benchmark)
+               ((uint32_t*)ptarget)[7] = 0x0000ff;
+
+       for (int k = 0; k < 20; k++)
+               be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+
+       //Nfactor = GetNfactor(data[17], ntime);
+       //if (Nfactor > scrypt_maxN) {
+       //      return 1;
+       //      //scrypt_fatal_error("scrypt: N out of range");
+       //}
+
+       N = (1 << (Nfactor + 1));
+
+       chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
+       if (!scrypt_alloc((uint64_t)N * chunk_bytes, &V)) return 1;
+       if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) {
+               scrypt_free(&V);
+               return 1;
+       }
+
+       Y = YX.ptr;
+       X = Y + chunk_bytes;
+
+       do {
+               const uint32_t Htarg = ptarget[7];
+               uint32_t hash[8];
+               be32enc(&endiandata[19], nonce);
+
+               scrypt_N_1_1((unsigned char *)endiandata, 80,
+                       (unsigned char *)endiandata, 80,
+                       N, (unsigned char *)hash, 32, X, Y, V.ptr);
+
+               if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+                       pdata[19] = nonce;
+                       *hashes_done = pdata[19] - first_nonce;
+                       scrypt_free(&V);
+                       scrypt_free(&YX);
+                       return 1;
+               }
+               nonce++;
+
+       } while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+       pdata[19] = nonce;
+       *hashes_done = pdata[19] - first_nonce + 1;
+
+       scrypt_free(&V);
+       scrypt_free(&YX);
+       return 0;
+}
\ No newline at end of file
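
For a rough sense of the per-thread memory footprint, the allocation sizes follow directly from scanhash_scryptjane above. The sketch below is not part of the commit; it assumes the ChaCha mixer's 64-byte SCRYPT_BLOCK_BYTES and the hard-coded r = p = 1:

        #include <stdint.h>
        #include <stdio.h>

        /* Sketch only: mirrors the scrypt_alloc() sizes used by scanhash_scryptjane,
           assuming SCRYPT_BLOCK_BYTES == 64 (ChaCha) and r == p == 1. */
        int main(void) {
                const unsigned long long r = 1, p = 1, block_bytes = 64;
                const unsigned long long chunk_bytes = block_bytes * r * 2;  /* 128 bytes */
                for (int nfactor = 4; nfactor <= 14; nfactor++) {
                        unsigned long long N = 1ULL << (nfactor + 1);
                        unsigned long long v_bytes  = N * chunk_bytes;       /* ROMix scratchpad V */
                        unsigned long long yx_bytes = (p + 1) * chunk_bytes; /* Y and X buffers */
                        printf("Nfactor %2d: N = %7llu, V = %9llu bytes, YX = %llu bytes\n",
                               nfactor, N, v_bytes, yx_bytes);
                }
                return 0;
        }

With the default Nfactor of 5 the V scratchpad is only 8 KiB per call; Nfactor 14 already needs 4 MiB.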
diff --git a/cpu-miner.c b/cpu-miner.c
index bab51cd5f50f671ff80ac9b0fc0371b186538303..a088ef912195ad711ef689995406d545adf05dfd 100644 (file)
@@ -110,6 +110,7 @@ enum algos {
        ALGO_X15,         /* X15 Whirlpool */
        ALGO_YESCRYPT,
        ALGO_ZR5,
+       ALGO_SCRYPTJANE,
        ALGO_COUNT
 };
 
@@ -152,6 +153,7 @@ static const char *algo_names[] = {
        "x15",
        "yescrypt",
        "zr5",
+       "scryptjane",
        "\0"
 };
 
@@ -292,6 +294,7 @@ Options:\n\
                           x15          X15\n\
                           yescrypt     Yescrypt\n\
                           zr5          ZR5\n\
+                          scryptjane:N\n\
   -o, --url=URL         URL of mining server\n\
   -O, --userpass=U:P    username:password pair for mining server\n\
   -u, --user=USERNAME   username for mining server\n\
@@ -460,7 +463,10 @@ static void affine_to_cpu_mask(int id, unsigned long mask) { }
 
 void get_currentalgo(char* buf, int sz)
 {
-       snprintf(buf, sz, "%s", algo_names[opt_algo]);
+       if (opt_algo == ALGO_SCRYPTJANE)
+               snprintf(buf, sz, "%s:%d", algo_names[opt_algo], opt_scrypt_n);
+       else
+               snprintf(buf, sz, "%s", algo_names[opt_algo]);
 }
 
 void proper_exit(int reason)
@@ -941,6 +947,7 @@ static int share_result(int result, struct work *work, const char *reason)
        case ALGO_CRYPTOLIGHT:
        case ALGO_CRYPTONIGHT:
        case ALGO_PLUCK:
+       case ALGO_SCRYPTJANE:
                sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate);
                applog(LOG_NOTICE, "accepted: %lu/%lu (%.2f%%), %s H/s %s",
                        accepted_count, accepted_count + rejected_count,
@@ -1603,6 +1610,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
                switch (opt_algo) {
                        case ALGO_DROP:
                        case ALGO_SCRYPT:
+                       case ALGO_SCRYPTJANE:
                        case ALGO_NEOSCRYPT:
                        case ALGO_PLUCK:
                        case ALGO_YESCRYPT:
@@ -1886,6 +1894,7 @@ static void *miner_thread(void *userdata)
                        case ALGO_AXIOM:
                        case ALGO_CRYPTOLIGHT:
                        case ALGO_CRYPTONIGHT:
+                       case ALGO_SCRYPTJANE:
                                max64 = 0x40LL;
                                break;
                        case ALGO_DROP:
@@ -2080,6 +2089,9 @@ static void *miner_thread(void *userdata)
                        rc = scanhash_pentablake(thr_id, work.data, work.target, max_nonce,
                                        &hashes_done);
                        break;
+               case ALGO_SCRYPTJANE:
+                       rc = scanhash_scryptjane(opt_scrypt_n, thr_id, work.data, work.target, max_nonce, &hashes_done);
+                       break;
                default:
                        /* should never happen */
                        goto out;
@@ -2100,6 +2112,7 @@ static void *miner_thread(void *userdata)
                        case ALGO_CRYPTOLIGHT:
                        case ALGO_CRYPTONIGHT:
                        case ALGO_PLUCK:
+                       case ALGO_SCRYPTJANE:
                                applog(LOG_INFO, "CPU #%d: %.2f H/s", thr_id, thr_hashrates[thr_id]);
                                break;
                        default:
@@ -2117,6 +2130,8 @@ static void *miner_thread(void *userdata)
                                switch(opt_algo) {
                                case ALGO_CRYPTOLIGHT:
                                case ALGO_CRYPTONIGHT:
+                               case ALGO_AXIOM:
+                               case ALGO_SCRYPTJANE:
                                        sprintf(s, "%.3f", hashrate);
                                        applog(LOG_NOTICE, "Total: %s H/s", s);
                                        break;
@@ -2536,7 +2551,7 @@ void parse_arg(int key, char *arg)
                                if (arg[v] == ':') {
                                        char *ep;
                                        v = strtol(arg+v+1, &ep, 10);
-                                       if (*ep || v & (v-1) || v < 2)
+                                       if (*ep || (i == ALGO_SCRYPT && v & (v-1)) || v < 2)
                                                continue;
                                        opt_algo = (enum algos) i;
                                        opt_scrypt_n = v;
@@ -2572,6 +2587,8 @@ void parse_arg(int key, char *arg)
                }
                if (!opt_nfactor && opt_algo == ALGO_SCRYPT)
                        opt_nfactor = 9;
+               if (opt_algo == ALGO_SCRYPTJANE && opt_scrypt_n == 0)
+                       opt_scrypt_n = 5;
                break;
        case 'b':
                p = strstr(arg, ":");
diff --git a/cpuminer.vcxproj b/cpuminer.vcxproj
index f1cb17fdd9a6594ad70901c916cf65a449170337..e06c5ec91876ba7f520c1401b8a029b6934ad05a 100644 (file)
     <ClCompile Include="algo\scrypt.c">
       <Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
     </ClCompile>
+    <ClCompile Include="algo\scrypt-jane.c" />
     <ClCompile Include="algo\sibcoin.c" />
     <ClCompile Include="algo\skein.c" />
     <ClCompile Include="algo\skein2.c" />
diff --git a/cpuminer.vcxproj.filters b/cpuminer.vcxproj.filters
index d4c996a13a537756d504a1e4979bd29676cce5d1..cf55e4aaf1adc2cacb35c78fa22084fab898f239 100644 (file)
     <ClCompile Include="algo\axiom.c">
       <Filter>algo</Filter>
     </ClCompile>
+    <ClCompile Include="algo\scrypt-jane.c">
+      <Filter>algo</Filter>
+    </ClCompile>
     <ClCompile Include="algo\sibcoin.c">
       <Filter>algo</Filter>
     </ClCompile>
diff --git a/miner.h b/miner.h
index d88bc64d985cd65c1c4e75fd6871ac2c4264cdf1..15d79037a844834b4c7bf60f05d9103c81f51b5f 100644 (file)
--- a/miner.h
+++ b/miner.h
@@ -302,6 +302,9 @@ int scanhash_cryptolight(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 int scanhash_cryptonight(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
                             uint32_t max_nonce, uint64_t *hashes_done);
 
+int scanhash_scryptjane(int Nfactor, int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+       uint32_t max_nonce, uint64_t *hashes_done);
+
 /* api related */
 void *api_thread(void *userdata);
 
diff --git a/scryptjane/scrypt-conf.h b/scryptjane/scrypt-conf.h
new file mode 100644 (file)
index 0000000..46685a5
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+       pick the best algo at runtime or compile time?
+       ----------------------------------------------
+       SCRYPT_CHOOSE_COMPILETIME (gcc only!)
+       SCRYPT_CHOOSE_RUNTIME
+*/
+#define SCRYPT_CHOOSE_RUNTIME
+
+
+/*
+       hash function to use
+       -------------------------------
+       SCRYPT_BLAKE256
+       SCRYPT_BLAKE512
+       SCRYPT_SHA256
+       SCRYPT_SHA512
+       SCRYPT_SKEIN512
+*/
+//#define SCRYPT_SHA256
+
+
+/*
+       block mixer to use
+       -----------------------------
+       SCRYPT_CHACHA
+       SCRYPT_SALSA
+*/
+//#define SCRYPT_SALSA
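
Worth noting: this header offers runtime selection and several hash/mixer combinations, but the new algo/scrypt-jane.c above does not use it; that file defines SCRYPT_KECCAK512, SCRYPT_CHACHA and SCRYPT_CHOOSE_COMPILETIME itself before including the scryptjane headers. Purely as a hypothetical illustration (not part of this commit), a build pinned to a different compile-time combination would look like:

        /* Hypothetical alternative configuration, for illustration only. */
        #define SCRYPT_CHOOSE_COMPILETIME
        #define SCRYPT_BLAKE512   /* hash function */
        #define SCRYPT_SALSA      /* block mixer   */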
diff --git a/scryptjane/scrypt-jane-chacha.h b/scryptjane/scrypt-jane-chacha.h
index 41d96e5ee890ee60e1b1ef867979cb1ea41a5aa5..c4d44c24b4c1ab9683c1c5853f78c93f678a94f4 100644 (file)
@@ -18,6 +18,10 @@ typedef uint32_t scrypt_mix_word_t;
 
 #if defined(SCRYPT_CHACHA_AVX)
        #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
+       #if defined(X86_INTRINSIC_AVX)
+               #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_avx_1
+               #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_avx_1_xor
+       #endif
        #define SCRYPT_ROMIX_FN scrypt_ROMix_avx
        #define SCRYPT_MIX_FN chacha_core_avx
        #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
@@ -27,6 +31,10 @@ typedef uint32_t scrypt_mix_word_t;
 
 #if defined(SCRYPT_CHACHA_SSSE3)
        #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
+       #if defined(X86_INTRINSIC_SSSE3)
+               #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_ssse3_1
+               #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_ssse3_1_xor
+       #endif
        #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
        #define SCRYPT_MIX_FN chacha_core_ssse3
        #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
@@ -36,6 +44,10 @@ typedef uint32_t scrypt_mix_word_t;
 
 #if defined(SCRYPT_CHACHA_SSE2)
        #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
+       #if defined(X86_INTRINSIC_SSE2)
+               #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_sse2_1
+               #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_sse2_1_xor
+       #endif
        #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
        #define SCRYPT_MIX_FN chacha_core_sse2
        #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
@@ -81,17 +93,21 @@ scrypt_getROMix() {
 #if defined(SCRYPT_TEST_SPEED)
 static size_t
 available_implementations() {
+       size_t cpuflags = detect_cpu();
        size_t flags = 0;
 
 #if defined(SCRYPT_CHACHA_AVX)
-       flags |= cpu_avx;
+       if (cpuflags & cpu_avx)
+               flags |= cpu_avx;
 #endif
 
 #if defined(SCRYPT_CHACHA_SSSE3)
-       flags |= cpu_ssse3;
+       if (cpuflags & cpu_ssse3)
+               flags |= cpu_ssse3;
 #endif
 
 #if defined(SCRYPT_CHACHA_SSE2)
+       if (cpuflags & cpu_sse2)
                flags |= cpu_sse2;
 #endif
 
diff --git a/scryptjane/scrypt-jane-hash_blake256.h b/scryptjane/scrypt-jane-hash_blake256.h
new file mode 100644 (file)
index 0000000..dee9013
--- /dev/null
@@ -0,0 +1,177 @@
+#define SCRYPT_HASH "BLAKE-256"
+#define SCRYPT_HASH_BLOCK_SIZE 64
+#define SCRYPT_HASH_DIGEST_SIZE 32
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+const uint8_t blake256_sigma[] = {
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+       14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3,
+       11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4,
+        7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8,
+        9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13,
+        2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9,
+       12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11,
+       13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10,
+        6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5,
+       10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0,
+};
+
+const uint32_t blake256_constants[16] = {
+       0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
+       0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
+};
+
+typedef struct scrypt_hash_state_t {
+       uint32_t H[8], T[2];
+       uint32_t leftover;
+       uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+static void
+blake256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
+       const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16);
+       uint32_t m[16], v[16], h[8], t[2];
+       uint32_t i;
+
+       for (i = 0; i < 8; i++) h[i] = S->H[i];
+       for (i = 0; i < 2; i++) t[i] = S->T[i];
+
+       while (blocks--) {
+               t[0] += 512;
+               t[1] += (t[0] < 512) ? 1 : 0;
+
+               for (i = 0; i <  8; i++) v[i     ] = h[i];
+               for (i = 0; i <  4; i++) v[i +  8] = blake256_constants[i];
+               for (i = 0; i <  2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0];
+               for (i = 0; i <  2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1];
+
+               for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]);
+               in += 64;
+
+               #define G(a,b,c,d,e)                                                 \
+                       v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \
+                       v[d] = ROTR32(v[d] ^ v[a],16);                                   \
+                       v[c] += v[d];                                                    \
+                       v[b] = ROTR32(v[b] ^ v[c],12);                                   \
+                       v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \
+                       v[d] = ROTR32(v[d] ^ v[a], 8);                                   \
+                       v[c] += v[d];                                                    \
+                       v[b] = ROTR32(v[b] ^ v[c], 7);
+
+               for (i = 0, sigma = blake256_sigma; i < 14; i++) {
+                       G(0, 4, 8,12, 0);
+                       G(1, 5, 9,13, 2);
+                       G(2, 6,10,14, 4);
+                       G(3, 7,11,15, 6);
+
+                       G(0, 5,10,15, 8);
+                       G(1, 6,11,12,10);
+                       G(2, 7, 8,13,12);
+                       G(3, 4, 9,14,14);
+
+                       sigma += 16;
+                       if (sigma == sigma_end)
+                               sigma = blake256_sigma;
+               }
+
+               #undef G
+
+               for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]);
+       }
+
+       for (i = 0; i < 8; i++) S->H[i] = h[i];
+       for (i = 0; i < 2; i++) S->T[i] = t[i];
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+       S->H[0] = 0x6a09e667ULL;
+       S->H[1] = 0xbb67ae85ULL;
+       S->H[2] = 0x3c6ef372ULL;
+       S->H[3] = 0xa54ff53aULL;
+       S->H[4] = 0x510e527fULL;
+       S->H[5] = 0x9b05688cULL;
+       S->H[6] = 0x1f83d9abULL;
+       S->H[7] = 0x5be0cd19ULL;
+       S->T[0] = 0;
+       S->T[1] = 0;
+       S->leftover = 0;
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+       size_t blocks, want;
+
+       /* handle the previous data */
+       if (S->leftover) {
+               want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+               want = (want < inlen) ? want : inlen;
+               memcpy(S->buffer + S->leftover, in, want);
+               S->leftover += (uint32_t)want;
+               if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+                       return;
+               in += want;
+               inlen -= want;
+               blake256_blocks(S, S->buffer, 1);
+       }
+
+       /* handle the current data */
+       blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+       S->leftover = (uint32_t)(inlen - blocks);
+       if (blocks) {
+               blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
+               in += blocks;
+       }
+
+       /* handle leftover data */
+       if (S->leftover)
+               memcpy(S->buffer, in, S->leftover);
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+       uint32_t th, tl, bits;
+
+       bits = (S->leftover << 3);
+       tl = S->T[0] + bits;
+       th = S->T[1];
+       if (S->leftover == 0) {
+               S->T[0] = (uint32_t)0 - (uint32_t)512;
+               S->T[1] = (uint32_t)0 - (uint32_t)1;
+       } else if (S->T[0] == 0) {
+               S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits;
+               S->T[1] = S->T[1] - 1;
+       } else {
+               S->T[0] -= (512 - bits);
+       }
+
+       S->buffer[S->leftover] = 0x80;
+       if (S->leftover <= 55) {
+               memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
+       } else {
+               memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
+               blake256_blocks(S, S->buffer, 1);
+               S->T[0] = (uint32_t)0 - (uint32_t)512;
+               S->T[1] = (uint32_t)0 - (uint32_t)1;
+               memset(S->buffer, 0, 56);
+       }
+       S->buffer[55] |= 1;
+       U32TO8_BE(S->buffer + 56, th);
+       U32TO8_BE(S->buffer + 60, tl);
+       blake256_blocks(S, S->buffer, 1);
+
+       U32TO8_BE(&hash[ 0], S->H[0]);
+       U32TO8_BE(&hash[ 4], S->H[1]);
+       U32TO8_BE(&hash[ 8], S->H[2]);
+       U32TO8_BE(&hash[12], S->H[3]);
+       U32TO8_BE(&hash[16], S->H[4]);
+       U32TO8_BE(&hash[20], S->H[5]);
+       U32TO8_BE(&hash[24], S->H[6]);
+       U32TO8_BE(&hash[28], S->H[7]);
+}
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+       0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20,
+       0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8,
+};
diff --git a/scryptjane/scrypt-jane-hash_blake512.h b/scryptjane/scrypt-jane-hash_blake512.h
new file mode 100644 (file)
index 0000000..ea2a583
--- /dev/null
@@ -0,0 +1,181 @@
+#define SCRYPT_HASH "BLAKE-512"
+#define SCRYPT_HASH_BLOCK_SIZE 128
+#define SCRYPT_HASH_DIGEST_SIZE 64
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+const uint8_t blake512_sigma[] = {
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+       14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3,
+       11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4,
+        7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8,
+        9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13,
+        2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9,
+       12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11,
+       13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10,
+        6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5,
+       10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0,
+};
+
+const uint64_t blake512_constants[16] = {
+       0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
+       0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
+       0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
+       0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
+};
+
+typedef struct scrypt_hash_state_t {
+       uint64_t H[8], T[2];
+       uint32_t leftover;
+       uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+static void
+blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
+       const uint8_t *sigma, *sigma_end = blake512_sigma + (10 * 16);
+       uint64_t m[16], v[16], h[8], t[2];
+       uint32_t i;
+
+       for (i = 0; i < 8; i++) h[i] = S->H[i];
+       for (i = 0; i < 2; i++) t[i] = S->T[i];
+
+       while (blocks--) {
+               t[0] += 1024;
+               t[1] += (t[0] < 1024) ? 1 : 0;
+
+               for (i = 0; i <  8; i++) v[i     ] = h[i];
+               for (i = 0; i <  4; i++) v[i +  8] = blake512_constants[i];
+               for (i = 0; i <  2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0];
+               for (i = 0; i <  2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1];
+
+               for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]);
+               in += 128;
+
+               #define G(a,b,c,d,e)                                                 \
+                       v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \
+                       v[d] = ROTR64(v[d] ^ v[a],32);                                   \
+                       v[c] += v[d];                                                    \
+                       v[b] = ROTR64(v[b] ^ v[c],25);                                   \
+                       v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \
+                       v[d] = ROTR64(v[d] ^ v[a],16);                                   \
+                       v[c] += v[d];                                                    \
+                       v[b] = ROTR64(v[b] ^ v[c],11);
+
+               for (i = 0, sigma = blake512_sigma; i < 16; i++) {
+                       G(0, 4, 8,12, 0);
+                       G(1, 5, 9,13, 2);
+                       G(2, 6,10,14, 4);
+                       G(3, 7,11,15, 6);
+                       G(0, 5,10,15, 8);
+                       G(1, 6,11,12,10);
+                       G(2, 7, 8,13,12);
+                       G(3, 4, 9,14,14);
+
+                       sigma += 16;
+                       if (sigma == sigma_end)
+                               sigma = blake512_sigma;
+               }
+
+               #undef G
+
+               for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]);
+       }
+
+       for (i = 0; i < 8; i++) S->H[i] = h[i];
+       for (i = 0; i < 2; i++) S->T[i] = t[i];
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+       S->H[0] = 0x6a09e667f3bcc908ULL;
+       S->H[1] = 0xbb67ae8584caa73bULL;
+       S->H[2] = 0x3c6ef372fe94f82bULL;
+       S->H[3] = 0xa54ff53a5f1d36f1ULL;
+       S->H[4] = 0x510e527fade682d1ULL;
+       S->H[5] = 0x9b05688c2b3e6c1fULL;
+       S->H[6] = 0x1f83d9abfb41bd6bULL;
+       S->H[7] = 0x5be0cd19137e2179ULL;
+       S->T[0] = 0;
+       S->T[1] = 0;
+       S->leftover = 0;
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+       size_t blocks, want;
+
+       /* handle the previous data */
+       if (S->leftover) {
+               want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+               want = (want < inlen) ? want : inlen;
+               memcpy(S->buffer + S->leftover, in, want);
+               S->leftover += (uint32_t)want;
+               if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+                       return;
+               in += want;
+               inlen -= want;
+               blake512_blocks(S, S->buffer, 1);
+       }
+
+       /* handle the current data */
+       blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+       S->leftover = (uint32_t)(inlen - blocks);
+       if (blocks) {
+               blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
+               in += blocks;
+       }
+
+       /* handle leftover data */
+       if (S->leftover)
+               memcpy(S->buffer, in, S->leftover);
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+       uint64_t th, tl;
+       size_t bits;
+
+       bits = (S->leftover << 3);
+       tl = S->T[0] + bits;
+       th = S->T[1];
+       if (S->leftover == 0) {
+               S->T[0] = (uint64_t)0 - (uint64_t)1024;
+               S->T[1] = (uint64_t)0 - (uint64_t)1;
+       } else if (S->T[0] == 0) {
+               S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits;
+               S->T[1] = S->T[1] - 1;
+       } else {
+               S->T[0] -= (1024 - bits);
+       }
+
+       S->buffer[S->leftover] = 0x80;
+       if (S->leftover <= 111) {
+               memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover);
+       } else {
+               memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover);
+               blake512_blocks(S, S->buffer, 1);
+               S->T[0] = (uint64_t)0 - (uint64_t)1024;
+               S->T[1] = (uint64_t)0 - (uint64_t)1;
+               memset(S->buffer, 0, 112);
+       }
+       S->buffer[111] |= 1;
+       U64TO8_BE(S->buffer + 112, th);
+       U64TO8_BE(S->buffer + 120, tl);
+       blake512_blocks(S, S->buffer, 1);
+
+       U64TO8_BE(&hash[ 0], S->H[0]);
+       U64TO8_BE(&hash[ 8], S->H[1]);
+       U64TO8_BE(&hash[16], S->H[2]);
+       U64TO8_BE(&hash[24], S->H[3]);
+       U64TO8_BE(&hash[32], S->H[4]);
+       U64TO8_BE(&hash[40], S->H[5]);
+       U64TO8_BE(&hash[48], S->H[6]);
+       U64TO8_BE(&hash[56], S->H[7]);
+}
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+       0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f,
+       0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83,
+       0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a,
+       0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac
+};
diff --git a/scryptjane/scrypt-jane-hash_sha512.h b/scryptjane/scrypt-jane-hash_sha512.h
new file mode 100644 (file)
index 0000000..3e3997d
--- /dev/null
@@ -0,0 +1,152 @@
+#define SCRYPT_HASH "SHA-2-512"
+#define SCRYPT_HASH_BLOCK_SIZE 128
+#define SCRYPT_HASH_DIGEST_SIZE 64
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+       uint64_t H[8];
+       uint64_t T[2];
+       uint32_t leftover;
+       uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+static const uint64_t sha512_constants[80] = {
+       0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull,
+       0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull,
+       0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull,
+       0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull,
+       0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull,
+       0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull,
+       0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull,
+       0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull,
+       0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 0x53380d139d95b3dfull,
+       0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull,
+       0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull,
+       0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull,
+       0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull,
+       0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull,
+       0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull,
+       0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull,
+       0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull,
+       0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull,
+       0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull,
+       0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull
+};
+
+#define Ch(x,y,z)  (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) (((x | y) & z) | (x & y))
+#define S0(x)      (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
+#define S1(x)      (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
+#define G0(x)      (ROTR64(x,  1) ^ ROTR64(x,  8) ^ (x >>  7))
+#define G1(x)      (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >>  6))
+#define W0(in,i)   (U8TO64_BE(&in[i * 8]))
+#define W1(i)      (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])
+#define STEP(i) \
+       t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
+       t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[i] + w[i]; \
+       r[7] = r[6]; \
+       r[6] = r[5]; \
+       r[5] = r[4]; \
+       r[4] = r[3] + t0; \
+       r[3] = r[2]; \
+       r[2] = r[1]; \
+       r[1] = r[0]; \
+       r[0] = t0 + t1;
+
+static void
+sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
+       uint64_t r[8], w[80], t0, t1;
+       size_t i;
+
+       for (i = 0; i < 8; i++) r[i] = S->H[i];
+
+       while (blocks--) {
+               for (i =  0; i < 16; i++) { w[i] = W0(in, i); }
+               for (i = 16; i < 80; i++) { w[i] = W1(i); }
+               for (i =  0; i < 80; i++) { STEP(i); }
+               for (i =  0; i <  8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; }
+               S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8;
+               S->T[1] += (!S->T[0]) ? 1 : 0;
+               in += SCRYPT_HASH_BLOCK_SIZE;
+       }
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+       S->H[0] = 0x6a09e667f3bcc908ull;
+       S->H[1] = 0xbb67ae8584caa73bull;
+       S->H[2] = 0x3c6ef372fe94f82bull;
+       S->H[3] = 0xa54ff53a5f1d36f1ull;
+       S->H[4] = 0x510e527fade682d1ull;
+       S->H[5] = 0x9b05688c2b3e6c1full;
+       S->H[6] = 0x1f83d9abfb41bd6bull;
+       S->H[7] = 0x5be0cd19137e2179ull;
+       S->T[0] = 0;
+       S->T[1] = 0;
+       S->leftover = 0;
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+       size_t blocks, want;
+
+       /* handle the previous data */
+       if (S->leftover) {
+               want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+               want = (want < inlen) ? want : inlen;
+               memcpy(S->buffer + S->leftover, in, want);
+               S->leftover += (uint32_t)want;
+               if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+                       return;
+               in += want;
+               inlen -= want;
+               sha512_blocks(S, S->buffer, 1);
+       }
+
+       /* handle the current data */
+       blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+       S->leftover = (uint32_t)(inlen - blocks);
+       if (blocks) {
+               sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
+               in += blocks;
+       }
+
+       /* handle leftover data */
+       if (S->leftover)
+               memcpy(S->buffer, in, S->leftover);
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+       uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1];
+
+       S->buffer[S->leftover] = 0x80;
+       if (S->leftover <= 111) {
+               memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover);
+       } else {
+               memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover);
+               sha512_blocks(S, S->buffer, 1);
+               memset(S->buffer, 0, 112);
+       }
+
+       U64TO8_BE(S->buffer + 112, t1);
+       U64TO8_BE(S->buffer + 120, t0);
+       sha512_blocks(S, S->buffer, 1);
+
+       U64TO8_BE(&hash[ 0], S->H[0]);
+       U64TO8_BE(&hash[ 8], S->H[1]);
+       U64TO8_BE(&hash[16], S->H[2]);
+       U64TO8_BE(&hash[24], S->H[3]);
+       U64TO8_BE(&hash[32], S->H[4]);
+       U64TO8_BE(&hash[40], S->H[5]);
+       U64TO8_BE(&hash[48], S->H[6]);
+       U64TO8_BE(&hash[56], S->H[7]);
+}
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+       0xba,0xc3,0x80,0x2b,0x24,0x56,0x95,0x1f,0x19,0x7c,0xa2,0xd3,0x72,0x7c,0x9a,0x4d,
+       0x1d,0x50,0x3a,0xa9,0x12,0x27,0xd8,0xe1,0xbe,0x76,0x53,0x87,0x5a,0x1e,0x82,0xec,
+       0xc8,0xe1,0x6b,0x87,0xd0,0xb5,0x25,0x7e,0xe8,0x1e,0xd7,0x58,0xc6,0x2d,0xc2,0x9c,
+       0x06,0x31,0x8f,0x5b,0x57,0x8e,0x76,0xba,0xd5,0xf6,0xec,0xfe,0x85,0x1f,0x34,0x0c,
+};
diff --git a/scryptjane/scrypt-jane-hash_skein512.h b/scryptjane/scrypt-jane-hash_skein512.h
new file mode 100644 (file)
index 0000000..a95d46b
--- /dev/null
@@ -0,0 +1,188 @@
+#define SCRYPT_HASH "Skein-512"
+#define SCRYPT_HASH_BLOCK_SIZE 64
+#define SCRYPT_HASH_DIGEST_SIZE 64
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+       uint64_t X[8], T[2];
+       uint32_t leftover;
+       uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+#include <stdio.h>
+
+static void
+skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
+       uint64_t X[8], key[8], Xt[9+18], T[3+1];
+       size_t r;
+
+       while (blocks--) {
+               T[0] = S->T[0] + add;
+               T[1] = S->T[1];
+               T[2] = T[0] ^ T[1];
+               key[0] = U8TO64_LE(in +  0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
+               key[1] = U8TO64_LE(in +  8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
+               key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
+               key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
+               key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
+               key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
+               key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
+               key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
+               Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
+               in += SCRYPT_HASH_BLOCK_SIZE;
+
+               for (r = 0; r < 18; r++)
+                       Xt[r + 9] = Xt[r + 0];
+
+               for (r = 0; r < 18; r += 2) {
+                       X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
+                       X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
+                       X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
+                       X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
+                       X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
+                       X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
+                       X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
+                       X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
+                       X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
+                       X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
+                       X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
+                       X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
+                       X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
+                       X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
+                       X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
+                       X[0] += X[7]; X[7] = ROTL64(X[7],  9) ^ X[0];
+
+                       X[0] += Xt[r + 1];
+                       X[1] += Xt[r + 2];
+                       X[2] += Xt[r + 3];
+                       X[3] += Xt[r + 4];
+                       X[4] += Xt[r + 5];
+                       X[5] += Xt[r + 6] + T[1];
+                       X[6] += Xt[r + 7] + T[2];
+                       X[7] += Xt[r + 8] + r + 1;
+
+                       T[3] = T[0];
+                       T[0] = T[1];
+                       T[1] = T[2];
+                       T[2] = T[3];
+
+                       X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
+                       X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
+                       X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
+                       X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
+                       X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
+                       X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
+                       X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
+                       X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
+                       X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
+                       X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
+                       X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
+                       X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
+                       X[6] += X[1]; X[1] = ROTL64(X[1],  8) ^ X[6];
+                       X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
+                       X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
+                       X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];
+
+                       X[0] += Xt[r + 2];
+                       X[1] += Xt[r + 3];
+                       X[2] += Xt[r + 4];
+                       X[3] += Xt[r + 5];
+                       X[4] += Xt[r + 6];
+                       X[5] += Xt[r + 7] + T[1];
+                       X[6] += Xt[r + 8] + T[2];
+                       X[7] += Xt[r + 9] + r + 2;
+
+                       T[3] = T[0];
+                       T[0] = T[1];
+                       T[1] = T[2];
+                       T[2] = T[3];
+               }
+
+               S->X[0] = key[0] ^ X[0];
+               S->X[1] = key[1] ^ X[1];
+               S->X[2] = key[2] ^ X[2];
+               S->X[3] = key[3] ^ X[3];
+               S->X[4] = key[4] ^ X[4];
+               S->X[5] = key[5] ^ X[5];
+               S->X[6] = key[6] ^ X[6];
+               S->X[7] = key[7] ^ X[7];
+
+               S->T[0] = T[0];
+               S->T[1] = T[1] & ~0x4000000000000000ull;
+       }
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+       S->X[0] = 0x4903ADFF749C51CEull;
+       S->X[1] = 0x0D95DE399746DF03ull;
+       S->X[2] = 0x8FD1934127C79BCEull;
+       S->X[3] = 0x9A255629FF352CB1ull;
+       S->X[4] = 0x5DB62599DF6CA7B0ull;
+       S->X[5] = 0xEABE394CA9D5C3F4ull;
+       S->X[6] = 0x991112C71A75B523ull;
+       S->X[7] = 0xAE18A40B660FCC33ull;
+       S->T[0] = 0x0000000000000000ull;
+       S->T[1] = 0x7000000000000000ull;
+       S->leftover = 0;
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+       size_t blocks, want;
+
+       /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
+       if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
+               /* handle the previous data, we know there is enough for at least one block */
+               if (S->leftover) {
+                       want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+                       memcpy(S->buffer + S->leftover, in, want);
+                       in += want;
+                       inlen -= want;
+                       S->leftover = 0;
+                       skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
+               }
+
+               /* handle the current data if there's more than one block */
+               if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
+                       blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+                       skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
+                       inlen -= blocks;
+                       in += blocks;
+               }
+       }
+
+       /* handle leftover data */
+       memcpy(S->buffer + S->leftover, in, inlen);
+       S->leftover += inlen;
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+       memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+       S->T[1] |= 0x8000000000000000ull;
+       skein512_blocks(S, S->buffer, 1, S->leftover);
+
+       memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
+       S->T[0] = 0;
+       S->T[1] = 0xff00000000000000ull;
+       skein512_blocks(S, S->buffer, 1, 8);
+
+       U64TO8_LE(&hash[ 0], S->X[0]);
+       U64TO8_LE(&hash[ 8], S->X[1]);
+       U64TO8_LE(&hash[16], S->X[2]);
+       U64TO8_LE(&hash[24], S->X[3]);
+       U64TO8_LE(&hash[32], S->X[4]);
+       U64TO8_LE(&hash[40], S->X[5]);
+       U64TO8_LE(&hash[48], S->X[6]);
+       U64TO8_LE(&hash[56], S->X[7]);
+}
+
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+       0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
+       0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
+       0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
+       0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
+};
diff --git a/scryptjane/scrypt-jane-mix_chacha-avx.h b/scryptjane/scrypt-jane-mix_chacha-avx.h
index 50d6e2d2a1c1c50e479bbba17794247ed0b5e70e..17559d88acb271f5e962316d1cee42d4abb83136 100644 (file)
@@ -20,8 +20,28 @@ asm_naked_fn(scrypt_ChunkMix_avx)
        a2(shl edx,6)
        a2(lea ecx,[edx-64])
        a2(and eax, eax)
-       a2(vmovdqa xmm4,[ssse3_rotl16_32bit])
-       a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
+       a2(mov ebx, 0x01000302)
+       a2(vmovd xmm4, ebx)
+       a2(mov ebx, 0x05040706)
+       a2(vmovd xmm0, ebx)
+       a2(mov ebx, 0x09080b0a)
+       a2(vmovd xmm1, ebx)
+       a2(mov ebx, 0x0d0c0f0e)
+       a2(vmovd xmm2, ebx)
+       a2(mov ebx, 0x02010003)
+       a2(vmovd xmm5, ebx)
+       a2(mov ebx, 0x06050407)
+       a2(vmovd xmm3, ebx)
+       a2(mov ebx, 0x0a09080b)
+       a2(vmovd xmm6, ebx)
+       a2(mov ebx, 0x0e0d0c0f)
+       a2(vmovd xmm7, ebx)
+       a3(vpunpckldq xmm4, xmm4, xmm0)
+       a3(vpunpckldq xmm5, xmm5, xmm3)
+       a3(vpunpckldq xmm1, xmm1, xmm2)
+       a3(vpunpckldq xmm6, xmm6, xmm7)
+       a3(vpunpcklqdq xmm4, xmm4, xmm1)
+       a3(vpunpcklqdq xmm5, xmm5, xmm6)
        a2(vmovdqa xmm0,[ecx+esi+0])
        a2(vmovdqa xmm1,[ecx+esi+16])
        a2(vmovdqa xmm2,[ecx+esi+32])
@@ -114,7 +134,7 @@ asm_naked_fn(scrypt_ChunkMix_avx)
        a1(pop esi)
        a1(pop edi)
        a1(pop ebx)
-       a1(ret 16)
+       aret(16)
 asm_naked_fn_end(scrypt_ChunkMix_avx)
 
 #endif
@@ -134,12 +154,20 @@ asm_naked_fn(scrypt_ChunkMix_avx)
        a2(lea rax,[rsi+r9])
        a2(lea r9,[rdx+r9])
        a2(and rdx, rdx)
-       a2(vmovdqa xmm4,[ssse3_rotl16_32bit])
-       a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
        a2(vmovdqa xmm0,[rax+0])
        a2(vmovdqa xmm1,[rax+16])
        a2(vmovdqa xmm2,[rax+32])
        a2(vmovdqa xmm3,[rax+48])
+       a2(mov r8, 0x0504070601000302)
+       a2(mov rax, 0x0d0c0f0e09080b0a)
+       a2(movq xmm4, r8)
+       a2(movq xmm6, rax)
+       a2(mov r8, 0x0605040702010003)
+       a2(mov rax, 0x0e0d0c0f0a09080b)
+       a2(movq xmm5, r8)
+       a2(movq xmm7, rax)
+       a3(vpunpcklqdq xmm4, xmm4, xmm6)
+       a3(vpunpcklqdq xmm5, xmm5, xmm7)
        a1(jz scrypt_ChunkMix_avx_no_xor1)
        a3(vpxor xmm0,xmm0,[r9+0])
        a3(vpxor xmm1,xmm1,[r9+16])
@@ -283,8 +311,9 @@ scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*
                        x3 = _mm_shuffle_epi8(x3, x4);
                        x2 = _mm_add_epi32(x2, x3);
                        x1 = _mm_xor_si128(x1, x2);
-                       x6 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+                       x6 = _mm_srli_epi32(x1, 20);
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, x6);
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x3 = _mm_shuffle_epi8(x3, x5);
@@ -293,15 +322,17 @@ scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*
                        x3 = _mm_shuffle_epi32(x3, 0x4e);
                        x1 = _mm_xor_si128(x1, x2);
                        x2 = _mm_shuffle_epi32(x2, 0x39);
-                       x6 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+                       x6 = _mm_srli_epi32(x1, 25);
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, x6);
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x3 = _mm_shuffle_epi8(x3, x4);
                        x2 = _mm_add_epi32(x2, x3);
                        x1 = _mm_xor_si128(x1, x2);
-                       x6 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+                       x6 = _mm_srli_epi32(x1, 20);
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, x6);
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x3 = _mm_shuffle_epi8(x3, x5);
@@ -310,8 +341,201 @@ scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*
                        x3 = _mm_shuffle_epi32(x3, 0x4e);
                        x1 = _mm_xor_si128(x1, x2);
                        x2 = _mm_shuffle_epi32(x2, 0x93);
-                       x6 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+                       x6 = _mm_srli_epi32(x1, 25);
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, x6);
+               }
+
+               x0 = _mm_add_epi32(x0, t0);
+               x1 = _mm_add_epi32(x1, t1);
+               x2 = _mm_add_epi32(x2, t2);
+               x3 = _mm_add_epi32(x3, t3);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+       }
+}
+
+/*
+ * Special version with r = 1 and no XORing
+ *  - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
+       const uint32_t r = 1;
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+       const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x4);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x6 = _mm_srli_epi32(x1, 20);
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, x6);
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x5);
+                       x0 = _mm_shuffle_epi32(x0, 0x93);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x39);
+                       x6 = _mm_srli_epi32(x1, 25);
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, x6);
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x4);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x6 = _mm_srli_epi32(x1, 20);
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, x6);
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x5);
+                       x0 = _mm_shuffle_epi32(x0, 0x39);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x93);
+                       x6 = _mm_srli_epi32(x1, 25);
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, x6);
+               }
+
+               x0 = _mm_add_epi32(x0, t0);
+               x1 = _mm_add_epi32(x1, t1);
+               x2 = _mm_add_epi32(x2, t2);
+               x3 = _mm_add_epi32(x3, t3);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+       }
+}
+
+/*
+ * Special version with r = 1 and unconditional XORing
+ *  - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
+       const uint32_t r = 1;
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+       const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+
+       xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+       x0 = _mm_xor_si128(x0, xmmp[0]);
+       x1 = _mm_xor_si128(x1, xmmp[1]);
+       x2 = _mm_xor_si128(x2, xmmp[2]);
+       x3 = _mm_xor_si128(x3, xmmp[3]);
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               xmmp = (xmmi *)scrypt_block(Bxor, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x4);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x6 = _mm_srli_epi32(x1, 20);
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, x6);
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x5);
+                       x0 = _mm_shuffle_epi32(x0, 0x93);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x39);
+                       x6 = _mm_srli_epi32(x1, 25);
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, x6);
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x4);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x6 = _mm_srli_epi32(x1, 20);
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, x6);
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x5);
+                       x0 = _mm_shuffle_epi32(x0, 0x39);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x93);
+                       x6 = _mm_srli_epi32(x1, 25);
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, x6);
                }
 
                x0 = _mm_add_epi32(x0, t0);
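
Note (illustration, not part of the commit): the SIMD double-rounds in the chacha mix headers above vectorize the standard ChaCha quarter-round across four 32-bit lanes at once; rotations by 16 and 8 go through the ssse3_rotl16_32bit/ssse3_rotl8_32bit byte shuffles, while 12 and 7 use a shift/or pair, and the `(i / 2) + half` store index is what interleaves even blocks into B'[0..r-1] and odd blocks into B'[r..2r-1]. A minimal scalar sketch of that quarter-round, for reference only:

#include <stdint.h>

/* Scalar ChaCha quarter-round; each SIMD round above applies this to four
 * lanes in parallel. Rotation counts 16/12/8/7 match the vector code. */
static inline uint32_t rotl32(uint32_t x, unsigned n) {
	return (x << n) | (x >> (32 - n));
}

static inline void chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
	*a += *b; *d ^= *a; *d = rotl32(*d, 16);
	*c += *d; *b ^= *c; *b = rotl32(*b, 12);
	*a += *b; *d ^= *a; *d = rotl32(*d, 8);
	*c += *d; *b ^= *c; *b = rotl32(*b, 7);
}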
index d2192c8f987f1e569d74cbcece15c664019b8ece..8f79decde5f1105e1bb510ade9a7110a3b00c0bf 100644 (file)
@@ -128,7 +128,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
        a1(pop esi)
        a1(pop edi)
        a1(pop ebx)
-       a1(ret 16)
+       aret(16)
 asm_naked_fn_end(scrypt_ChunkMix_sse2)
 
 #endif
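
Note (illustration, not part of the commit): the repeated ret 16 → aret(16) substitutions route the x86 epilogue through a macro so that the 16 bytes of stack arguments are popped by the callee only under calling conventions that require it; the real definition lives in scrypt-jane-portable-x86.h and is not shown in this diff. A purely hypothetical sketch of its shape:

/* Hypothetical sketch of aret(); the actual macro may differ. */
#if defined(ASM_CALLING_CONVENTION_STDCALL)   /* assumed guard name */
	#define aret(n) a1(ret n)   /* callee pops n bytes of arguments */
#else
	#define aret(n) a1(ret)     /* caller cleans up (e.g. cdecl, x64 ABIs) */
#endif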
@@ -308,41 +308,255 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x4 = x3;
-                       x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
+                       x3 = _mm_slli_epi32(x3, 16);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
                        x2 = _mm_add_epi32(x2, x3);
                        x1 = _mm_xor_si128(x1, x2);
                        x4 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20));
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x4 = x3;
-                       x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24));
+                       x3 = _mm_slli_epi32(x3, 8);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
                        x0 = _mm_shuffle_epi32(x0, 0x93);
                        x2 = _mm_add_epi32(x2, x3);
                        x3 = _mm_shuffle_epi32(x3, 0x4e);
                        x1 = _mm_xor_si128(x1, x2);
                        x2 = _mm_shuffle_epi32(x2, 0x39);
                        x4 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25));
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x4 = x3;
-                       x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
+                       x3 = _mm_slli_epi32(x3, 16);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
                        x2 = _mm_add_epi32(x2, x3);
                        x1 = _mm_xor_si128(x1, x2);
                        x4 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20));
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x4 = x3;
-                       x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24));
+                       x3 = _mm_slli_epi32(x3, 8);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
                        x0 = _mm_shuffle_epi32(x0, 0x39);
                        x2 = _mm_add_epi32(x2, x3);
                        x3 = _mm_shuffle_epi32(x3, 0x4e);
                        x1 = _mm_xor_si128(x1, x2);
                        x2 = _mm_shuffle_epi32(x2, 0x93);
                        x4 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25));
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
+               }
+
+               x0 = _mm_add_epi32(x0, t0);
+               x1 = _mm_add_epi32(x1, t1);
+               x2 = _mm_add_epi32(x2, t2);
+               x3 = _mm_add_epi32(x3, t3);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+       }
+}
+
+/*
+ * Special version with r = 1 and no XORing
+ *  - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_sse2_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
+       const uint32_t r = 1;
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x4 = x3;
+                       x3 = _mm_slli_epi32(x3, 16);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x4 = x1;
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x4 = x3;
+                       x3 = _mm_slli_epi32(x3, 8);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
+                       x0 = _mm_shuffle_epi32(x0, 0x93);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x39);
+                       x4 = x1;
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x4 = x3;
+                       x3 = _mm_slli_epi32(x3, 16);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x4 = x1;
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x4 = x3;
+                       x3 = _mm_slli_epi32(x3, 8);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
+                       x0 = _mm_shuffle_epi32(x0, 0x39);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x93);
+                       x4 = x1;
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
+               }
+
+               x0 = _mm_add_epi32(x0, t0);
+               x1 = _mm_add_epi32(x1, t1);
+               x2 = _mm_add_epi32(x2, t2);
+               x3 = _mm_add_epi32(x3, t3);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+       }
+}
+
+/*
+ * Special version with r = 1 and unconditional XORing
+ *  - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_sse2_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
+       const uint32_t r = 1;
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+
+       xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+       x0 = _mm_xor_si128(x0, xmmp[0]);
+       x1 = _mm_xor_si128(x1, xmmp[1]);
+       x2 = _mm_xor_si128(x2, xmmp[2]);
+       x3 = _mm_xor_si128(x3, xmmp[3]);
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               xmmp = (xmmi *)scrypt_block(Bxor, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x4 = x3;
+                       x3 = _mm_slli_epi32(x3, 16);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x4 = x1;
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x4 = x3;
+                       x3 = _mm_slli_epi32(x3, 8);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
+                       x0 = _mm_shuffle_epi32(x0, 0x93);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x39);
+                       x4 = x1;
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x4 = x3;
+                       x3 = _mm_slli_epi32(x3, 16);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x4 = x1;
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x4 = x3;
+                       x3 = _mm_slli_epi32(x3, 8);
+                       x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
+                       x0 = _mm_shuffle_epi32(x0, 0x39);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x93);
+                       x4 = x1;
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
                }
 
                x0 = _mm_add_epi32(x0, t0);
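
Note (illustration, not part of the commit): the SSE2-only chacha path cannot use pshufb, so every rotation is emulated with a shift-left/shift-right pair ORed together; the hunks above merely split the former one-line expressions into two statements, leaving the computed values unchanged. The same idiom as a standalone macro, as a hedged sketch:

#include <emmintrin.h>

/* Rotate each 32-bit lane of v left by k bits using only SSE2, matching the
 * x4 scratch-register sequence used above. */
#define ROTL32_SSE2(v, k) \
	_mm_or_si128(_mm_slli_epi32((v), (k)), _mm_srli_epi32((v), 32 - (k)))

/* e.g. x3 = ROTL32_SSE2(x3, 16); stands in for the slli/srli/or triple */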
index b25e356729a9ee2cd0f67a0b1fe5fdcb7d887428..6a80cac5b23631dea799c1bcac0634269f204c21 100644 (file)
@@ -20,8 +20,28 @@ asm_naked_fn(scrypt_ChunkMix_ssse3)
        a2(shl edx,6)
        a2(lea ecx,[edx-64])
        a2(and eax, eax)
-       a2(movdqa xmm4,[ssse3_rotl16_32bit])
-       a2(movdqa xmm5,[ssse3_rotl8_32bit])
+       a2(mov ebx, 0x01000302)
+       a2(movd xmm4, ebx)
+       a2(mov ebx, 0x05040706)
+       a2(movd xmm0, ebx)
+       a2(mov ebx, 0x09080b0a)
+       a2(movd xmm1, ebx)
+       a2(mov ebx, 0x0d0c0f0e)
+       a2(movd xmm2, ebx)
+       a2(mov ebx, 0x02010003)
+       a2(movd xmm5, ebx)
+       a2(mov ebx, 0x06050407)
+       a2(movd xmm3, ebx)
+       a2(mov ebx, 0x0a09080b)
+       a2(movd xmm6, ebx)
+       a2(mov ebx, 0x0e0d0c0f)
+       a2(movd xmm7, ebx)
+       a2(punpckldq xmm4, xmm0)
+       a2(punpckldq xmm5, xmm3)
+       a2(punpckldq xmm1, xmm2)
+       a2(punpckldq xmm6, xmm7)
+       a2(punpcklqdq xmm4, xmm1)
+       a2(punpcklqdq xmm5, xmm6)
        a2(movdqa xmm0,[ecx+esi+0])
        a2(movdqa xmm1,[ecx+esi+16])
        a2(movdqa xmm2,[ecx+esi+32])
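
Note (illustration, not part of the commit): the 32-bit SSSE3 path above now materializes the rotl16/rotl8 pshufb masks in registers from immediates (movd plus punpckldq/punpcklqdq) instead of loading ssse3_rotl16_32bit/ssse3_rotl8_32bit from memory, apparently to avoid referencing those data symbols from naked inline asm; the x64 hunk further down does the same with two movq loads. Each mask byte i names the source byte for destination byte i, so the per-lane pattern 02 03 00 01 is a rotate-left by 16 and 03 00 01 02 a rotate-left by 8. The same constants written with intrinsics, as a hedged sketch:

#include <tmmintrin.h>

/* Byte-shuffle rotations per 32-bit lane; _mm_set_epi8 takes the high byte
 * first, so these encode the masks built from the immediates above. */
static inline __m128i rotl16_lanes(__m128i v) {
	const __m128i m = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
	return _mm_shuffle_epi8(v, m);
}

static inline __m128i rotl8_lanes(__m128i v) {
	const __m128i m = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
	return _mm_shuffle_epi8(v, m);
}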
@@ -118,7 +138,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3)
        a1(pop esi)
        a1(pop edi)
        a1(pop ebx)
-       a1(ret 16)
+       aret(16)
 asm_naked_fn_end(scrypt_ChunkMix_ssse3)
 
 #endif
@@ -138,12 +158,20 @@ asm_naked_fn(scrypt_ChunkMix_ssse3)
        a2(lea rax,[rsi+r9])
        a2(lea r9,[rdx+r9])
        a2(and rdx, rdx)
-       a2(movdqa xmm4,[ssse3_rotl16_32bit])
-       a2(movdqa xmm5,[ssse3_rotl8_32bit])
        a2(movdqa xmm0,[rax+0])
        a2(movdqa xmm1,[rax+16])
        a2(movdqa xmm2,[rax+32])
        a2(movdqa xmm3,[rax+48])
+       a2(mov r8, 0x0504070601000302)
+       a2(mov rax, 0x0d0c0f0e09080b0a)
+       a2(movq xmm4, r8)
+       a2(movq xmm6, rax)
+       a2(mov r8, 0x0605040702010003)
+       a2(mov rax, 0x0e0d0c0f0a09080b)
+       a2(movq xmm5, r8)
+       a2(movq xmm7, rax)
+       a2(punpcklqdq xmm4, xmm6)
+       a2(punpcklqdq xmm5, xmm7)
        a1(jz scrypt_ChunkMix_ssse3_no_xor1)
        a2(pxor xmm0,[r9+0])
        a2(pxor xmm1,[r9+16])
@@ -292,7 +320,8 @@ scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes
                        x2 = _mm_add_epi32(x2, x3);
                        x1 = _mm_xor_si128(x1, x2);
                        x6 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x3 = _mm_shuffle_epi8(x3, x5);
@@ -302,14 +331,16 @@ scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes
                        x1 = _mm_xor_si128(x1, x2);
                        x2 = _mm_shuffle_epi32(x2, 0x39);
                        x6 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x3 = _mm_shuffle_epi8(x3, x4);
                        x2 = _mm_add_epi32(x2, x3);
                        x1 = _mm_xor_si128(x1, x2);
                        x6 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
                        x0 = _mm_add_epi32(x0, x1);
                        x3 = _mm_xor_si128(x3, x0);
                        x3 = _mm_shuffle_epi8(x3, x5);
@@ -319,7 +350,200 @@ scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes
                        x1 = _mm_xor_si128(x1, x2);
                        x2 = _mm_shuffle_epi32(x2, 0x93);
                        x6 = x1;
-                       x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
+               }
+
+               x0 = _mm_add_epi32(x0, t0);
+               x1 = _mm_add_epi32(x1, t1);
+               x2 = _mm_add_epi32(x2, t2);
+               x3 = _mm_add_epi32(x3, t3);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+       }
+}
+
+/*
+ * Special version with r = 1 and no XORing
+ *  - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_ssse3_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
+       const uint32_t r = 1;
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+       const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x4);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x6 = x1;
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x5);
+                       x0 = _mm_shuffle_epi32(x0, 0x93);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x39);
+                       x6 = x1;
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x4);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x6 = x1;
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x5);
+                       x0 = _mm_shuffle_epi32(x0, 0x39);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x93);
+                       x6 = x1;
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
+               }
+
+               x0 = _mm_add_epi32(x0, t0);
+               x1 = _mm_add_epi32(x1, t1);
+               x2 = _mm_add_epi32(x2, t2);
+               x3 = _mm_add_epi32(x3, t3);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+       }
+}
+
+/*
+ * Special version with r = 1 and unconditional XORing
+ *  - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_ssse3_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
+       const uint32_t r = 1;
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+       const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+
+       xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+       x0 = _mm_xor_si128(x0, xmmp[0]);
+       x1 = _mm_xor_si128(x1, xmmp[1]);
+       x2 = _mm_xor_si128(x2, xmmp[2]);
+       x3 = _mm_xor_si128(x3, xmmp[3]);
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               xmmp = (xmmi *)scrypt_block(Bxor, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x4);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x6 = x1;
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x5);
+                       x0 = _mm_shuffle_epi32(x0, 0x93);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x39);
+                       x6 = x1;
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x4);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x6 = x1;
+                       x1 = _mm_slli_epi32(x1, 12);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
+                       x0 = _mm_add_epi32(x0, x1);
+                       x3 = _mm_xor_si128(x3, x0);
+                       x3 = _mm_shuffle_epi8(x3, x5);
+                       x0 = _mm_shuffle_epi32(x0, 0x39);
+                       x2 = _mm_add_epi32(x2, x3);
+                       x3 = _mm_shuffle_epi32(x3, 0x4e);
+                       x1 = _mm_xor_si128(x1, x2);
+                       x2 = _mm_shuffle_epi32(x2, 0x93);
+                       x6 = x1;
+                       x1 = _mm_slli_epi32(x1, 7);
+                       x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
                }
 
                x0 = _mm_add_epi32(x0, t0);
index 15fb48e39d8b5e67f83221498e17e3ba30601440..1ca90b5fa943588ead680abbe2a9cf796e089b6d 100644 (file)
@@ -120,7 +120,7 @@ asm_naked_fn(scrypt_ChunkMix_avx)
        a1(pop esi)
        a1(pop edi)
        a1(pop ebx)
-       a1(ret 16)
+       aret(16)
 asm_naked_fn_end(scrypt_ChunkMix_avx)
 
 #endif
index 4898659e64edac33ab86b7edd9fc155420c92635..ecc5f0f8d337b9040d5f418fee83193aa55d0274 100644 (file)
@@ -136,7 +136,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
        a1(pop esi)
        a1(pop edi)
        a1(pop ebx)
-       a1(ret 16)
+       aret(16)
 asm_naked_fn_end(scrypt_ChunkMix_sse2)
 
 #endif
@@ -426,7 +426,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]
                 4  9 14  3
        */
 
-       static void STDCALL
+       static void asm_calling_convention
        salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
                uint32_t t;
                while (count--) {
diff --git a/scryptjane/scrypt-jane-mix_salsa64-avx.h b/scryptjane/scrypt-jane-mix_salsa64-avx.h
new file mode 100644 (file)
index 0000000..50c9902
--- /dev/null
@@ -0,0 +1,367 @@
+/* x64 */
+#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_AVX
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_avx)
+       a1(push rbp)
+       a2(mov rbp, rsp)
+       a2(and rsp, ~63)
+       a2(sub rsp, 128)
+       a2(lea rcx,[rcx*2])
+       a2(shl rcx,7)
+       a2(lea r9,[rcx-128])
+       a2(lea rax,[rsi+r9])
+       a2(lea r9,[rdx+r9])
+       a2(and rdx, rdx)
+       a2(vmovdqa xmm0,[rax+0])
+       a2(vmovdqa xmm1,[rax+16])
+       a2(vmovdqa xmm2,[rax+32])
+       a2(vmovdqa xmm3,[rax+48])
+       a2(vmovdqa xmm4,[rax+64])
+       a2(vmovdqa xmm5,[rax+80])
+       a2(vmovdqa xmm6,[rax+96])
+       a2(vmovdqa xmm7,[rax+112])
+       a1(jz scrypt_ChunkMix_avx_no_xor1)
+       a3(vpxor xmm0,xmm0,[r9+0])
+       a3(vpxor xmm1,xmm1,[r9+16])
+       a3(vpxor xmm2,xmm2,[r9+32])
+       a3(vpxor xmm3,xmm3,[r9+48])
+       a3(vpxor xmm4,xmm4,[r9+64])
+       a3(vpxor xmm5,xmm5,[r9+80])
+       a3(vpxor xmm6,xmm6,[r9+96])
+       a3(vpxor xmm7,xmm7,[r9+112])
+       a1(scrypt_ChunkMix_avx_no_xor1:)
+       a2(xor r9,r9)
+       a2(xor r8,r8)
+       a1(scrypt_ChunkMix_avx_loop:)
+               a2(and rdx, rdx)
+               a3(vpxor xmm0,xmm0,[rsi+r9+0])
+               a3(vpxor xmm1,xmm1,[rsi+r9+16])
+               a3(vpxor xmm2,xmm2,[rsi+r9+32])
+               a3(vpxor xmm3,xmm3,[rsi+r9+48])
+               a3(vpxor xmm4,xmm4,[rsi+r9+64])
+               a3(vpxor xmm5,xmm5,[rsi+r9+80])
+               a3(vpxor xmm6,xmm6,[rsi+r9+96])
+               a3(vpxor xmm7,xmm7,[rsi+r9+112])
+               a1(jz scrypt_ChunkMix_avx_no_xor2)
+               a3(vpxor xmm0,xmm0,[rdx+r9+0])
+               a3(vpxor xmm1,xmm1,[rdx+r9+16])
+               a3(vpxor xmm2,xmm2,[rdx+r9+32])
+               a3(vpxor xmm3,xmm3,[rdx+r9+48])
+               a3(vpxor xmm4,xmm4,[rdx+r9+64])
+               a3(vpxor xmm5,xmm5,[rdx+r9+80])
+               a3(vpxor xmm6,xmm6,[rdx+r9+96])
+               a3(vpxor xmm7,xmm7,[rdx+r9+112])
+               a1(scrypt_ChunkMix_avx_no_xor2:)
+               a2(vmovdqa [rsp+0],xmm0)
+               a2(vmovdqa [rsp+16],xmm1)
+               a2(vmovdqa [rsp+32],xmm2)
+               a2(vmovdqa [rsp+48],xmm3)
+               a2(vmovdqa [rsp+64],xmm4)
+               a2(vmovdqa [rsp+80],xmm5)
+               a2(vmovdqa [rsp+96],xmm6)
+               a2(vmovdqa [rsp+112],xmm7)
+               a2(mov rax,8)
+               a1(scrypt_salsa64_avx_loop: )
+                       a3(vpaddq xmm8, xmm0, xmm2)
+                       a3(vpaddq xmm9, xmm1, xmm3)
+                       a3(vpshufd xmm8, xmm8, 0xb1)
+                       a3(vpshufd xmm9, xmm9, 0xb1)
+                       a3(vpxor xmm6, xmm6, xmm8)
+                       a3(vpxor xmm7, xmm7, xmm9)
+                       a3(vpaddq xmm10, xmm0, xmm6)
+                       a3(vpaddq xmm11, xmm1, xmm7)
+                       a3(vpsrlq xmm8, xmm10, 51)
+                       a3(vpsrlq xmm9, xmm11, 51)
+                       a3(vpsllq xmm10, xmm10, 13)
+                       a3(vpsllq xmm11, xmm11, 13)
+                       a3(vpxor xmm4, xmm4, xmm8)
+                       a3(vpxor xmm5, xmm5, xmm9)
+                       a3(vpxor xmm4, xmm4, xmm10)
+                       a3(vpxor xmm5, xmm5, xmm11)
+                       a3(vpaddq xmm8, xmm6, xmm4)
+                       a3(vpaddq xmm9, xmm7, xmm5)
+                       a3(vpsrlq xmm10, xmm8, 25)
+                       a3(vpsrlq xmm11, xmm9, 25)
+                       a3(vpsllq xmm8, xmm8, 39)
+                       a3(vpsllq xmm9, xmm9, 39)
+                       a3(vpxor xmm2, xmm2, xmm10)
+                       a3(vpxor xmm3, xmm3, xmm11)
+                       a3(vpxor xmm2, xmm2, xmm8)
+                       a3(vpxor xmm3, xmm3, xmm9)
+                       a3(vpaddq xmm10, xmm4, xmm2)
+                       a3(vpaddq xmm11, xmm5, xmm3)
+                       a3(vpshufd xmm10, xmm10, 0xb1)
+                       a3(vpshufd xmm11, xmm11, 0xb1)
+                       a3(vpxor xmm0, xmm0, xmm10)
+                       a3(vpxor xmm1, xmm1, xmm11)
+                       a2(vmovdqa xmm8, xmm2)
+                       a2(vmovdqa xmm9, xmm3)
+                       a4(vpalignr xmm2, xmm6, xmm7, 8)
+                       a4(vpalignr xmm3, xmm7, xmm6, 8)
+                       a4(vpalignr xmm6, xmm9, xmm8, 8)
+                       a4(vpalignr xmm7, xmm8, xmm9, 8)
+                       a2(sub rax, 2)
+                       a3(vpaddq xmm10, xmm0, xmm2)
+                       a3(vpaddq xmm11, xmm1, xmm3)
+                       a3(vpshufd xmm10, xmm10, 0xb1)
+                       a3(vpshufd xmm11, xmm11, 0xb1)
+                       a3(vpxor xmm6, xmm6, xmm10)
+                       a3(vpxor xmm7, xmm7, xmm11)
+                       a3(vpaddq xmm8, xmm0, xmm6)
+                       a3(vpaddq xmm9, xmm1, xmm7)
+                       a3(vpsrlq xmm10, xmm8, 51)
+                       a3(vpsrlq xmm11, xmm9, 51)
+                       a3(vpsllq xmm8, xmm8, 13)
+                       a3(vpsllq xmm9, xmm9, 13)
+                       a3(vpxor xmm5, xmm5, xmm10)
+                       a3(vpxor xmm4, xmm4, xmm11)
+                       a3(vpxor xmm5, xmm5, xmm8)
+                       a3(vpxor xmm4, xmm4, xmm9)
+                       a3(vpaddq xmm10, xmm6, xmm5)
+                       a3(vpaddq xmm11, xmm7, xmm4)
+                       a3(vpsrlq xmm8, xmm10, 25)
+                       a3(vpsrlq xmm9, xmm11, 25)
+                       a3(vpsllq xmm10, xmm10, 39)
+                       a3(vpsllq xmm11, xmm11, 39)
+                       a3(vpxor xmm2, xmm2, xmm8)
+                       a3(vpxor xmm3, xmm3, xmm9)
+                       a3(vpxor xmm2, xmm2, xmm10)
+                       a3(vpxor xmm3, xmm3, xmm11)
+                       a3(vpaddq xmm8, xmm5, xmm2)
+                       a3(vpaddq xmm9, xmm4, xmm3)
+                       a3(vpshufd xmm8, xmm8, 0xb1)
+                       a3(vpshufd xmm9, xmm9, 0xb1)
+                       a3(vpxor xmm0, xmm0, xmm8)
+                       a3(vpxor xmm1, xmm1, xmm9)
+                       a2(vmovdqa xmm10, xmm2)
+                       a2(vmovdqa xmm11, xmm3)
+                       a4(vpalignr xmm2, xmm6, xmm7, 8)
+                       a4(vpalignr xmm3, xmm7, xmm6, 8)
+                       a4(vpalignr xmm6, xmm11, xmm10, 8)
+                       a4(vpalignr xmm7, xmm10, xmm11, 8)
+                       a1(ja scrypt_salsa64_avx_loop)
+               a3(vpaddq xmm0,xmm0,[rsp+0])
+               a3(vpaddq xmm1,xmm1,[rsp+16])
+               a3(vpaddq xmm2,xmm2,[rsp+32])
+               a3(vpaddq xmm3,xmm3,[rsp+48])
+               a3(vpaddq xmm4,xmm4,[rsp+64])
+               a3(vpaddq xmm5,xmm5,[rsp+80])
+               a3(vpaddq xmm6,xmm6,[rsp+96])
+               a3(vpaddq xmm7,xmm7,[rsp+112])
+               a2(lea rax,[r8+r9])
+               a2(xor r8,rcx)
+               a2(and rax,~0xff)
+               a2(add r9,128)
+               a2(shr rax,1)
+               a2(add rax, rdi)
+               a2(cmp r9,rcx)
+               a2(vmovdqa [rax+0],xmm0)
+               a2(vmovdqa [rax+16],xmm1)
+               a2(vmovdqa [rax+32],xmm2)
+               a2(vmovdqa [rax+48],xmm3)
+               a2(vmovdqa [rax+64],xmm4)
+               a2(vmovdqa [rax+80],xmm5)
+               a2(vmovdqa [rax+96],xmm6)
+               a2(vmovdqa [rax+112],xmm7)
+               a1(jne scrypt_ChunkMix_avx_loop)
+       a2(mov rsp, rbp)
+       a1(pop rbp)
+       a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_avx)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX)
+
+#define SCRYPT_SALSA64_AVX
+
+static void asm_calling_convention
+scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+       x4 = xmmp[4];
+       x5 = xmmp[5];
+       x6 = xmmp[6];
+       x7 = xmmp[7];
+
+       if (Bxor) {
+               xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+               x4 = _mm_xor_si128(x4, xmmp[4]);
+               x5 = _mm_xor_si128(x5, xmmp[5]);
+               x6 = _mm_xor_si128(x6, xmmp[6]);
+               x7 = _mm_xor_si128(x7, xmmp[7]);
+       }
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+               x4 = _mm_xor_si128(x4, xmmp[4]);
+               x5 = _mm_xor_si128(x5, xmmp[5]);
+               x6 = _mm_xor_si128(x6, xmmp[6]);
+               x7 = _mm_xor_si128(x7, xmmp[7]);
+
+               if (Bxor) {
+                       xmmp = (xmmi *)scrypt_block(Bxor, i);
+                       x0 = _mm_xor_si128(x0, xmmp[0]);
+                       x1 = _mm_xor_si128(x1, xmmp[1]);
+                       x2 = _mm_xor_si128(x2, xmmp[2]);
+                       x3 = _mm_xor_si128(x3, xmmp[3]);
+                       x4 = _mm_xor_si128(x4, xmmp[4]);
+                       x5 = _mm_xor_si128(x5, xmmp[5]);
+                       x6 = _mm_xor_si128(x6, xmmp[6]);
+                       x7 = _mm_xor_si128(x7, xmmp[7]);
+               }
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+               t4 = x4;
+               t5 = x5;
+               t6 = x6;
+               t7 = x7;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       z0 = _mm_add_epi64(x0, x2);
+                       z1 = _mm_add_epi64(x1, x3);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x6 = _mm_xor_si128(x6, z0);
+                       x7 = _mm_xor_si128(x7, z1);
+
+                       z0 = _mm_add_epi64(x6, x0);
+                       z1 = _mm_add_epi64(x7, x1);
+                       z2 = _mm_srli_epi64(z0, 64-13);
+                       z3 = _mm_srli_epi64(z1, 64-13);
+                       z0 = _mm_slli_epi64(z0, 13);
+                       z1 = _mm_slli_epi64(z1, 13);
+                       x4 = _mm_xor_si128(x4, z2);
+                       x5 = _mm_xor_si128(x5, z3);
+                       x4 = _mm_xor_si128(x4, z0);
+                       x5 = _mm_xor_si128(x5, z1);
+
+                       z0 = _mm_add_epi64(x4, x6);
+                       z1 = _mm_add_epi64(x5, x7);
+                       z2 = _mm_srli_epi64(z0, 64-39);
+                       z3 = _mm_srli_epi64(z1, 64-39);
+                       z0 = _mm_slli_epi64(z0, 39);
+                       z1 = _mm_slli_epi64(z1, 39);
+                       x2 = _mm_xor_si128(x2, z2);
+                       x3 = _mm_xor_si128(x3, z3);
+                       x2 = _mm_xor_si128(x2, z0);
+                       x3 = _mm_xor_si128(x3, z1);
+
+                       z0 = _mm_add_epi64(x2, x4);
+                       z1 = _mm_add_epi64(x3, x5);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x0 = _mm_xor_si128(x0, z0);
+                       x1 = _mm_xor_si128(x1, z1);
+
+                       z0 = x2;
+                       z1 = x3;
+                       x2 = _mm_alignr_epi8(x6, x7, 8);
+                       x3 = _mm_alignr_epi8(x7, x6, 8);
+                       x6 = _mm_alignr_epi8(z1, z0, 8);
+                       x7 = _mm_alignr_epi8(z0, z1, 8);
+
+                       z0 = _mm_add_epi64(x0, x2);
+                       z1 = _mm_add_epi64(x1, x3);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x6 = _mm_xor_si128(x6, z0);
+                       x7 = _mm_xor_si128(x7, z1);
+
+                       z0 = _mm_add_epi64(x6, x0);
+                       z1 = _mm_add_epi64(x7, x1);
+                       z2 = _mm_srli_epi64(z0, 64-13);
+                       z3 = _mm_srli_epi64(z1, 64-13);
+                       z0 = _mm_slli_epi64(z0, 13);
+                       z1 = _mm_slli_epi64(z1, 13);
+                       x5 = _mm_xor_si128(x5, z2);
+                       x4 = _mm_xor_si128(x4, z3);
+                       x5 = _mm_xor_si128(x5, z0);
+                       x4 = _mm_xor_si128(x4, z1);
+
+                       z0 = _mm_add_epi64(x5, x6);
+                       z1 = _mm_add_epi64(x4, x7);
+                       z2 = _mm_srli_epi64(z0, 64-39);
+                       z3 = _mm_srli_epi64(z1, 64-39);
+                       z0 = _mm_slli_epi64(z0, 39);
+                       z1 = _mm_slli_epi64(z1, 39);
+                       x2 = _mm_xor_si128(x2, z2);
+                       x3 = _mm_xor_si128(x3, z3);
+                       x2 = _mm_xor_si128(x2, z0);
+                       x3 = _mm_xor_si128(x3, z1);
+
+                       z0 = _mm_add_epi64(x2, x5);
+                       z1 = _mm_add_epi64(x3, x4);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x0 = _mm_xor_si128(x0, z0);
+                       x1 = _mm_xor_si128(x1, z1);
+
+                       z0 = x2;
+                       z1 = x3;
+                       x2 = _mm_alignr_epi8(x6, x7, 8);
+                       x3 = _mm_alignr_epi8(x7, x6, 8);
+                       x6 = _mm_alignr_epi8(z1, z0, 8);
+                       x7 = _mm_alignr_epi8(z0, z1, 8);
+               }
+
+               x0 = _mm_add_epi64(x0, t0);
+               x1 = _mm_add_epi64(x1, t1);
+               x2 = _mm_add_epi64(x2, t2);
+               x3 = _mm_add_epi64(x3, t3);
+               x4 = _mm_add_epi64(x4, t4);
+               x5 = _mm_add_epi64(x5, t5);
+               x6 = _mm_add_epi64(x6, t6);
+               x7 = _mm_add_epi64(x7, t7);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+               xmmp[4] = x4;
+               xmmp[5] = x5;
+               xmmp[6] = x6;
+               xmmp[7] = x7;
+       }
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+       /* uses salsa64_core_tangle_sse2 */
+       
+       #undef SCRYPT_MIX
+       #define SCRYPT_MIX "Salsa64/8-AVX"
+       #undef SCRYPT_SALSA64_INCLUDED
+       #define SCRYPT_SALSA64_INCLUDED
+#endif
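
Note (illustration, not part of the commit): the new Salsa64/8 mix works on 64-bit words, so rotations by 32 are expressed as a 32-bit word swap (pshufd / _mm_shuffle_epi32 with 0xb1) and rotations by 13 and 39 as a shift-left plus a shift-right by (64 - k), each XORed in separately. Transcribing one per-lane column step of the vector code above into scalar form (variable names are mine):

#include <stdint.h>

static inline uint64_t rotl64(uint64_t x, unsigned n) {
	return (x << n) | (x >> (64 - n));
}

/* One half-round column step, following the vector pattern with the mapping
 * a = x0 lane, c = x2 lane, b = x4 lane, d = x6 lane. */
static inline void salsa64_column(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d) {
	*d ^= rotl64(*a + *c, 32);
	*b ^= rotl64(*d + *a, 13);
	*c ^= rotl64(*b + *d, 39);
	*a ^= rotl64(*c + *b, 32);
}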
diff --git a/scryptjane/scrypt-jane-mix_salsa64-sse2.h b/scryptjane/scrypt-jane-mix_salsa64-sse2.h
new file mode 100644 (file)
index 0000000..f8d9574
--- /dev/null
@@ -0,0 +1,449 @@
+/* x64 */
+#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_SSE2
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_sse2)
+       a1(push rbp)
+       a2(mov rbp, rsp)
+       a2(and rsp, ~63)
+       a2(sub rsp, 128)
+       a2(lea rcx,[rcx*2])
+       a2(shl rcx,7)
+       a2(lea r9,[rcx-128])
+       a2(lea rax,[rsi+r9])
+       a2(lea r9,[rdx+r9])
+       a2(and rdx, rdx)
+       a2(movdqa xmm0,[rax+0])
+       a2(movdqa xmm1,[rax+16])
+       a2(movdqa xmm2,[rax+32])
+       a2(movdqa xmm3,[rax+48])
+       a2(movdqa xmm4,[rax+64])
+       a2(movdqa xmm5,[rax+80])
+       a2(movdqa xmm6,[rax+96])
+       a2(movdqa xmm7,[rax+112])
+       a1(jz scrypt_ChunkMix_sse2_no_xor1)
+       a2(pxor xmm0,[r9+0])
+       a2(pxor xmm1,[r9+16])
+       a2(pxor xmm2,[r9+32])
+       a2(pxor xmm3,[r9+48])
+       a2(pxor xmm4,[r9+64])
+       a2(pxor xmm5,[r9+80])
+       a2(pxor xmm6,[r9+96])
+       a2(pxor xmm7,[r9+112])
+       a1(scrypt_ChunkMix_sse2_no_xor1:)
+       a2(xor r9,r9)
+       a2(xor r8,r8)
+       a1(scrypt_ChunkMix_sse2_loop:)
+               a2(and rdx, rdx)
+               a2(pxor xmm0,[rsi+r9+0])
+               a2(pxor xmm1,[rsi+r9+16])
+               a2(pxor xmm2,[rsi+r9+32])
+               a2(pxor xmm3,[rsi+r9+48])
+               a2(pxor xmm4,[rsi+r9+64])
+               a2(pxor xmm5,[rsi+r9+80])
+               a2(pxor xmm6,[rsi+r9+96])
+               a2(pxor xmm7,[rsi+r9+112])
+               a1(jz scrypt_ChunkMix_sse2_no_xor2)
+               a2(pxor xmm0,[rdx+r9+0])
+               a2(pxor xmm1,[rdx+r9+16])
+               a2(pxor xmm2,[rdx+r9+32])
+               a2(pxor xmm3,[rdx+r9+48])
+               a2(pxor xmm4,[rdx+r9+64])
+               a2(pxor xmm5,[rdx+r9+80])
+               a2(pxor xmm6,[rdx+r9+96])
+               a2(pxor xmm7,[rdx+r9+112])
+               a1(scrypt_ChunkMix_sse2_no_xor2:)
+               a2(movdqa [rsp+0],xmm0)
+               a2(movdqa [rsp+16],xmm1)
+               a2(movdqa [rsp+32],xmm2)
+               a2(movdqa [rsp+48],xmm3)
+               a2(movdqa [rsp+64],xmm4)
+               a2(movdqa [rsp+80],xmm5)
+               a2(movdqa [rsp+96],xmm6)
+               a2(movdqa [rsp+112],xmm7)
+               a2(mov rax,8)
+               a1(scrypt_salsa64_sse2_loop: )
+                       a2(movdqa xmm8, xmm0)
+                       a2(movdqa xmm9, xmm1)
+                       a2(paddq xmm8, xmm2)
+                       a2(paddq xmm9, xmm3)
+                       a3(pshufd xmm8, xmm8, 0xb1)
+                       a3(pshufd xmm9, xmm9, 0xb1)
+                       a2(pxor xmm6, xmm8)
+                       a2(pxor xmm7, xmm9)
+                       a2(movdqa xmm10, xmm0)
+                       a2(movdqa xmm11, xmm1)
+                       a2(paddq xmm10, xmm6)
+                       a2(paddq xmm11, xmm7)
+                       a2(movdqa xmm8, xmm10)
+                       a2(movdqa xmm9, xmm11)
+                       a2(psrlq xmm10, 51)
+                       a2(psrlq xmm11, 51)
+                       a2(psllq xmm8, 13)
+                       a2(psllq xmm9, 13)
+                       a2(pxor xmm4, xmm10)
+                       a2(pxor xmm5, xmm11)
+                       a2(pxor xmm4, xmm8)
+                       a2(pxor xmm5, xmm9)
+                       a2(movdqa xmm10, xmm6)
+                       a2(movdqa xmm11, xmm7)
+                       a2(paddq xmm10, xmm4)
+                       a2(paddq xmm11, xmm5)
+                       a2(movdqa xmm8, xmm10)
+                       a2(movdqa xmm9, xmm11)
+                       a2(psrlq xmm10, 25)
+                       a2(psrlq xmm11, 25)
+                       a2(psllq xmm8, 39)
+                       a2(psllq xmm9, 39)
+                       a2(pxor xmm2, xmm10)
+                       a2(pxor xmm3, xmm11)
+                       a2(pxor xmm2, xmm8)
+                       a2(pxor xmm3, xmm9)
+                       a2(movdqa xmm8, xmm4)
+                       a2(movdqa xmm9, xmm5)
+                       a2(paddq xmm8, xmm2)
+                       a2(paddq xmm9, xmm3)
+                       a3(pshufd xmm8, xmm8, 0xb1)
+                       a3(pshufd xmm9, xmm9, 0xb1)
+                       a2(pxor xmm0, xmm8)
+                       a2(pxor xmm1, xmm9)
+                       a2(movdqa xmm8, xmm2)
+                       a2(movdqa xmm9, xmm3)
+                       a2(movdqa xmm10, xmm6)
+                       a2(movdqa xmm11, xmm7)
+                       a2(movdqa xmm2, xmm7)
+                       a2(movdqa xmm3, xmm6)
+                       a2(punpcklqdq xmm10, xmm6)
+                       a2(punpcklqdq xmm11, xmm7)
+                       a2(movdqa xmm6, xmm8)
+                       a2(movdqa xmm7, xmm9)
+                       a2(punpcklqdq xmm9, xmm9)
+                       a2(punpcklqdq xmm8, xmm8)
+                       a2(punpckhqdq xmm2, xmm10)
+                       a2(punpckhqdq xmm3, xmm11)
+                       a2(punpckhqdq xmm6, xmm9)
+                       a2(punpckhqdq xmm7, xmm8)
+                       a2(sub rax, 2)
+                       a2(movdqa xmm8, xmm0)
+                       a2(movdqa xmm9, xmm1)
+                       a2(paddq xmm8, xmm2)
+                       a2(paddq xmm9, xmm3)
+                       a3(pshufd xmm8, xmm8, 0xb1)
+                       a3(pshufd xmm9, xmm9, 0xb1)
+                       a2(pxor xmm6, xmm8)
+                       a2(pxor xmm7, xmm9)
+                       a2(movdqa xmm10, xmm0)
+                       a2(movdqa xmm11, xmm1)
+                       a2(paddq xmm10, xmm6)
+                       a2(paddq xmm11, xmm7)
+                       a2(movdqa xmm8, xmm10)
+                       a2(movdqa xmm9, xmm11)
+                       a2(psrlq xmm10, 51)
+                       a2(psrlq xmm11, 51)
+                       a2(psllq xmm8, 13)
+                       a2(psllq xmm9, 13)
+                       a2(pxor xmm5, xmm10)
+                       a2(pxor xmm4, xmm11)
+                       a2(pxor xmm5, xmm8)
+                       a2(pxor xmm4, xmm9)
+                       a2(movdqa xmm10, xmm6)
+                       a2(movdqa xmm11, xmm7)
+                       a2(paddq xmm10, xmm5)
+                       a2(paddq xmm11, xmm4)
+                       a2(movdqa xmm8, xmm10)
+                       a2(movdqa xmm9, xmm11)
+                       a2(psrlq xmm10, 25)
+                       a2(psrlq xmm11, 25)
+                       a2(psllq xmm8, 39)
+                       a2(psllq xmm9, 39)
+                       a2(pxor xmm2, xmm10)
+                       a2(pxor xmm3, xmm11)
+                       a2(pxor xmm2, xmm8)
+                       a2(pxor xmm3, xmm9)
+                       a2(movdqa xmm8, xmm5)
+                       a2(movdqa xmm9, xmm4)
+                       a2(paddq xmm8, xmm2)
+                       a2(paddq xmm9, xmm3)
+                       a3(pshufd xmm8, xmm8, 0xb1)
+                       a3(pshufd xmm9, xmm9, 0xb1)
+                       a2(pxor xmm0, xmm8)
+                       a2(pxor xmm1, xmm9)
+                       a2(movdqa xmm8, xmm2)
+                       a2(movdqa xmm9, xmm3)
+                       a2(movdqa xmm10, xmm6)
+                       a2(movdqa xmm11, xmm7)
+                       a2(movdqa xmm2, xmm7)
+                       a2(movdqa xmm3, xmm6)
+                       a2(punpcklqdq xmm10, xmm6)
+                       a2(punpcklqdq xmm11, xmm7)
+                       a2(movdqa xmm6, xmm8)
+                       a2(movdqa xmm7, xmm9)
+                       a2(punpcklqdq xmm9, xmm9)
+                       a2(punpcklqdq xmm8, xmm8)
+                       a2(punpckhqdq xmm2, xmm10)
+                       a2(punpckhqdq xmm3, xmm11)
+                       a2(punpckhqdq xmm6, xmm9)
+                       a2(punpckhqdq xmm7, xmm8)
+                       a1(ja scrypt_salsa64_sse2_loop)
+               a2(paddq xmm0,[rsp+0])
+               a2(paddq xmm1,[rsp+16])
+               a2(paddq xmm2,[rsp+32])
+               a2(paddq xmm3,[rsp+48])
+               a2(paddq xmm4,[rsp+64])
+               a2(paddq xmm5,[rsp+80])
+               a2(paddq xmm6,[rsp+96])
+               a2(paddq xmm7,[rsp+112])
+               a2(lea rax,[r8+r9])
+               a2(xor r8,rcx)
+               a2(and rax,~0xff)
+               a2(add r9,128)
+               a2(shr rax,1)
+               a2(add rax, rdi)
+               a2(cmp r9,rcx)
+               a2(movdqa [rax+0],xmm0)
+               a2(movdqa [rax+16],xmm1)
+               a2(movdqa [rax+32],xmm2)
+               a2(movdqa [rax+48],xmm3)
+               a2(movdqa [rax+64],xmm4)
+               a2(movdqa [rax+80],xmm5)
+               a2(movdqa [rax+96],xmm6)
+               a2(movdqa [rax+112],xmm7)
+               a1(jne scrypt_ChunkMix_sse2_loop)
+       a2(mov rsp, rbp)
+       a1(pop rbp)
+       a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_sse2)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2)
+
+#define SCRYPT_SALSA64_SSE2
+
+static void asm_calling_convention
+scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+       x4 = xmmp[4];
+       x5 = xmmp[5];
+       x6 = xmmp[6];
+       x7 = xmmp[7];
+
+       if (Bxor) {
+               xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+               x4 = _mm_xor_si128(x4, xmmp[4]);
+               x5 = _mm_xor_si128(x5, xmmp[5]);
+               x6 = _mm_xor_si128(x6, xmmp[6]);
+               x7 = _mm_xor_si128(x7, xmmp[7]);
+       }
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+               x4 = _mm_xor_si128(x4, xmmp[4]);
+               x5 = _mm_xor_si128(x5, xmmp[5]);
+               x6 = _mm_xor_si128(x6, xmmp[6]);
+               x7 = _mm_xor_si128(x7, xmmp[7]);
+
+               if (Bxor) {
+                       xmmp = (xmmi *)scrypt_block(Bxor, i);
+                       x0 = _mm_xor_si128(x0, xmmp[0]);
+                       x1 = _mm_xor_si128(x1, xmmp[1]);
+                       x2 = _mm_xor_si128(x2, xmmp[2]);
+                       x3 = _mm_xor_si128(x3, xmmp[3]);
+                       x4 = _mm_xor_si128(x4, xmmp[4]);
+                       x5 = _mm_xor_si128(x5, xmmp[5]);
+                       x6 = _mm_xor_si128(x6, xmmp[6]);
+                       x7 = _mm_xor_si128(x7, xmmp[7]);
+               }
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+               t4 = x4;
+               t5 = x5;
+               t6 = x6;
+               t7 = x7;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       z0 = _mm_add_epi64(x0, x2);
+                       z1 = _mm_add_epi64(x1, x3);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x6 = _mm_xor_si128(x6, z0);
+                       x7 = _mm_xor_si128(x7, z1);
+
+                       z0 = _mm_add_epi64(x6, x0);
+                       z1 = _mm_add_epi64(x7, x1);
+                       z2 = _mm_srli_epi64(z0, 64-13);
+                       z3 = _mm_srli_epi64(z1, 64-13);
+                       z0 = _mm_slli_epi64(z0, 13);
+                       z1 = _mm_slli_epi64(z1, 13);
+                       x4 = _mm_xor_si128(x4, z2);
+                       x5 = _mm_xor_si128(x5, z3);
+                       x4 = _mm_xor_si128(x4, z0);
+                       x5 = _mm_xor_si128(x5, z1);
+
+                       z0 = _mm_add_epi64(x4, x6);
+                       z1 = _mm_add_epi64(x5, x7);
+                       z2 = _mm_srli_epi64(z0, 64-39);
+                       z3 = _mm_srli_epi64(z1, 64-39);
+                       z0 = _mm_slli_epi64(z0, 39);
+                       z1 = _mm_slli_epi64(z1, 39);
+                       x2 = _mm_xor_si128(x2, z2);
+                       x3 = _mm_xor_si128(x3, z3);
+                       x2 = _mm_xor_si128(x2, z0);
+                       x3 = _mm_xor_si128(x3, z1);
+
+                       z0 = _mm_add_epi64(x2, x4);
+                       z1 = _mm_add_epi64(x3, x5);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x0 = _mm_xor_si128(x0, z0);
+                       x1 = _mm_xor_si128(x1, z1);
+
+                       z0 = x4;
+                       z1 = x5;
+                       z2 = x2;
+                       z3 = x3;
+                       x4 = z1;
+                       x5 = z0;
+                       x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
+                       x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
+                       x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
+                       x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
+
+                       z0 = _mm_add_epi64(x0, x2);
+                       z1 = _mm_add_epi64(x1, x3);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x6 = _mm_xor_si128(x6, z0);
+                       x7 = _mm_xor_si128(x7, z1);
+
+                       z0 = _mm_add_epi64(x6, x0);
+                       z1 = _mm_add_epi64(x7, x1);
+                       z2 = _mm_srli_epi64(z0, 64-13);
+                       z3 = _mm_srli_epi64(z1, 64-13);
+                       z0 = _mm_slli_epi64(z0, 13);
+                       z1 = _mm_slli_epi64(z1, 13);
+                       x4 = _mm_xor_si128(x4, z2);
+                       x5 = _mm_xor_si128(x5, z3);
+                       x4 = _mm_xor_si128(x4, z0);
+                       x5 = _mm_xor_si128(x5, z1);
+
+                       z0 = _mm_add_epi64(x4, x6);
+                       z1 = _mm_add_epi64(x5, x7);
+                       z2 = _mm_srli_epi64(z0, 64-39);
+                       z3 = _mm_srli_epi64(z1, 64-39);
+                       z0 = _mm_slli_epi64(z0, 39);
+                       z1 = _mm_slli_epi64(z1, 39);
+                       x2 = _mm_xor_si128(x2, z2);
+                       x3 = _mm_xor_si128(x3, z3);
+                       x2 = _mm_xor_si128(x2, z0);
+                       x3 = _mm_xor_si128(x3, z1);
+
+                       z0 = _mm_add_epi64(x2, x4);
+                       z1 = _mm_add_epi64(x3, x5);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x0 = _mm_xor_si128(x0, z0);
+                       x1 = _mm_xor_si128(x1, z1);
+
+                       z0 = x4;
+                       z1 = x5;
+                       z2 = x2;
+                       z3 = x3;
+                       x4 = z1;
+                       x5 = z0;
+                       x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
+                       x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
+                       x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
+                       x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
+               }
+
+               x0 = _mm_add_epi64(x0, t0);
+               x1 = _mm_add_epi64(x1, t1);
+               x2 = _mm_add_epi64(x2, t2);
+               x3 = _mm_add_epi64(x3, t3);
+               x4 = _mm_add_epi64(x4, t4);
+               x5 = _mm_add_epi64(x5, t5);
+               x6 = _mm_add_epi64(x6, t6);
+               x7 = _mm_add_epi64(x7, t7);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+               xmmp[4] = x4;
+               xmmp[5] = x5;
+               xmmp[6] = x6;
+               xmmp[7] = x7;
+       }
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+       #undef SCRYPT_MIX
+       #define SCRYPT_MIX "Salsa64/8-SSE2"
+       #undef SCRYPT_SALSA64_INCLUDED
+       #define SCRYPT_SALSA64_INCLUDED
+#endif
+
+/* sse3/avx use this as well */
+#if defined(SCRYPT_SALSA64_INCLUDED)
+       /*
+               Default layout:
+                0  1  2  3
+                4  5  6  7
+                8  9 10 11
+               12 13 14 15
+
+               SSE2 layout:
+                0  5 10 15
+               12  1  6 11
+                8 13  2  7
+                4  9 14  3
+       */
+
+
+       static void asm_calling_convention
+       salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
+               uint64_t t;
+               while (count--) {
+                       t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
+                       t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
+                       t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
+                       t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
+                       t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
+                       t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
+                       blocks += 16;
+               }
+       }
+#endif
\ No newline at end of file
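Note: salsa64_core_tangle_sse2 above is built entirely from disjoint swaps, so applying it twice restores the default layout; that appears to be why scrypt-jane-salsa64.h later registers the same function as both SCRYPT_ROMIX_TANGLE_FN and SCRYPT_ROMIX_UNTANGLE_FN. A minimal standalone check of that involution property (tangle_one_block is an illustrative copy for a single 16-word block, not code from this patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same swaps as salsa64_core_tangle_sse2, applied to one block. */
static void tangle_one_block(uint64_t b[16]) {
	uint64_t t;
	t = b[1]; b[1] = b[5];  b[5]  = t;
	t = b[2]; b[2] = b[10]; b[10] = t;
	t = b[3]; b[3] = b[15]; b[15] = t;
	t = b[4]; b[4] = b[12]; b[12] = t;
	t = b[7]; b[7] = b[11]; b[11] = t;
	t = b[9]; b[9] = b[13]; b[13] = t;
}

int main(void) {
	uint64_t block[16], copy[16];
	int i;
	for (i = 0; i < 16; i++) block[i] = copy[i] = (uint64_t)i;
	tangle_one_block(block); /* default layout -> SSE2 layout */
	tangle_one_block(block); /* SSE2 layout -> default layout */
	printf("involution: %s\n", memcmp(block, copy, sizeof block) == 0 ? "yes" : "no");
	return 0;
}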
diff --git a/scryptjane/scrypt-jane-mix_salsa64-ssse3.h b/scryptjane/scrypt-jane-mix_salsa64-ssse3.h
new file mode 100644 (file)
index 0000000..bebfe5c
--- /dev/null
@@ -0,0 +1,399 @@
+/* x64 */
+#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_SSSE3
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_ssse3)
+       a1(push rbp)
+       a2(mov rbp, rsp)
+       a2(and rsp, ~63)
+       a2(sub rsp, 128)
+       a2(lea rcx,[rcx*2])
+       a2(shl rcx,7)
+       a2(lea r9,[rcx-128])
+       a2(lea rax,[rsi+r9])
+       a2(lea r9,[rdx+r9])
+       a2(and rdx, rdx)
+       a2(movdqa xmm0,[rax+0])
+       a2(movdqa xmm1,[rax+16])
+       a2(movdqa xmm2,[rax+32])
+       a2(movdqa xmm3,[rax+48])
+       a2(movdqa xmm4,[rax+64])
+       a2(movdqa xmm5,[rax+80])
+       a2(movdqa xmm6,[rax+96])
+       a2(movdqa xmm7,[rax+112])
+       a1(jz scrypt_ChunkMix_ssse3_no_xor1)
+       a2(pxor xmm0,[r9+0])
+       a2(pxor xmm1,[r9+16])
+       a2(pxor xmm2,[r9+32])
+       a2(pxor xmm3,[r9+48])
+       a2(pxor xmm4,[r9+64])
+       a2(pxor xmm5,[r9+80])
+       a2(pxor xmm6,[r9+96])
+       a2(pxor xmm7,[r9+112])
+       a1(scrypt_ChunkMix_ssse3_no_xor1:)
+       a2(xor r9,r9)
+       a2(xor r8,r8)
+       a1(scrypt_ChunkMix_ssse3_loop:)
+               a2(and rdx, rdx)
+               a2(pxor xmm0,[rsi+r9+0])
+               a2(pxor xmm1,[rsi+r9+16])
+               a2(pxor xmm2,[rsi+r9+32])
+               a2(pxor xmm3,[rsi+r9+48])
+               a2(pxor xmm4,[rsi+r9+64])
+               a2(pxor xmm5,[rsi+r9+80])
+               a2(pxor xmm6,[rsi+r9+96])
+               a2(pxor xmm7,[rsi+r9+112])
+               a1(jz scrypt_ChunkMix_ssse3_no_xor2)
+               a2(pxor xmm0,[rdx+r9+0])
+               a2(pxor xmm1,[rdx+r9+16])
+               a2(pxor xmm2,[rdx+r9+32])
+               a2(pxor xmm3,[rdx+r9+48])
+               a2(pxor xmm4,[rdx+r9+64])
+               a2(pxor xmm5,[rdx+r9+80])
+               a2(pxor xmm6,[rdx+r9+96])
+               a2(pxor xmm7,[rdx+r9+112])
+               a1(scrypt_ChunkMix_ssse3_no_xor2:)
+               a2(movdqa [rsp+0],xmm0)
+               a2(movdqa [rsp+16],xmm1)
+               a2(movdqa [rsp+32],xmm2)
+               a2(movdqa [rsp+48],xmm3)
+               a2(movdqa [rsp+64],xmm4)
+               a2(movdqa [rsp+80],xmm5)
+               a2(movdqa [rsp+96],xmm6)
+               a2(movdqa [rsp+112],xmm7)
+               a2(mov rax,8)
+               a1(scrypt_salsa64_ssse3_loop: )
+                       a2(movdqa xmm8, xmm0)
+                       a2(movdqa xmm9, xmm1)
+                       a2(paddq xmm8, xmm2)
+                       a2(paddq xmm9, xmm3)
+                       a3(pshufd xmm8, xmm8, 0xb1)
+                       a3(pshufd xmm9, xmm9, 0xb1)
+                       a2(pxor xmm6, xmm8)
+                       a2(pxor xmm7, xmm9)
+                       a2(movdqa xmm10, xmm0)
+                       a2(movdqa xmm11, xmm1)
+                       a2(paddq xmm10, xmm6)
+                       a2(paddq xmm11, xmm7)
+                       a2(movdqa xmm8, xmm10)
+                       a2(movdqa xmm9, xmm11)
+                       a2(psrlq xmm10, 51)
+                       a2(psrlq xmm11, 51)
+                       a2(psllq xmm8, 13)
+                       a2(psllq xmm9, 13)
+                       a2(pxor xmm4, xmm10)
+                       a2(pxor xmm5, xmm11)
+                       a2(pxor xmm4, xmm8)
+                       a2(pxor xmm5, xmm9)
+                       a2(movdqa xmm10, xmm6)
+                       a2(movdqa xmm11, xmm7)
+                       a2(paddq xmm10, xmm4)
+                       a2(paddq xmm11, xmm5)
+                       a2(movdqa xmm8, xmm10)
+                       a2(movdqa xmm9, xmm11)
+                       a2(psrlq xmm10, 25)
+                       a2(psrlq xmm11, 25)
+                       a2(psllq xmm8, 39)
+                       a2(psllq xmm9, 39)
+                       a2(pxor xmm2, xmm10)
+                       a2(pxor xmm3, xmm11)
+                       a2(pxor xmm2, xmm8)
+                       a2(pxor xmm3, xmm9)
+                       a2(movdqa xmm8, xmm4)
+                       a2(movdqa xmm9, xmm5)
+                       a2(paddq xmm8, xmm2)
+                       a2(paddq xmm9, xmm3)
+                       a3(pshufd xmm8, xmm8, 0xb1)
+                       a3(pshufd xmm9, xmm9, 0xb1)
+                       a2(pxor xmm0, xmm8)
+                       a2(pxor xmm1, xmm9)
+                       a2(movdqa xmm10, xmm2)
+                       a2(movdqa xmm11, xmm3)
+                       a2(movdqa xmm2, xmm6)
+                       a2(movdqa xmm3, xmm7)
+                       a3(palignr xmm2, xmm7, 8)
+                       a3(palignr xmm3, xmm6, 8)
+                       a2(movdqa xmm6, xmm11)
+                       a2(movdqa xmm7, xmm10)
+                       a3(palignr xmm6, xmm10, 8)
+                       a3(palignr xmm7, xmm11, 8)
+                       a2(sub rax, 2)
+                       a2(movdqa xmm8, xmm0)
+                       a2(movdqa xmm9, xmm1)
+                       a2(paddq xmm8, xmm2)
+                       a2(paddq xmm9, xmm3)
+                       a3(pshufd xmm8, xmm8, 0xb1)
+                       a3(pshufd xmm9, xmm9, 0xb1)
+                       a2(pxor xmm6, xmm8)
+                       a2(pxor xmm7, xmm9)
+                       a2(movdqa xmm10, xmm0)
+                       a2(movdqa xmm11, xmm1)
+                       a2(paddq xmm10, xmm6)
+                       a2(paddq xmm11, xmm7)
+                       a2(movdqa xmm8, xmm10)
+                       a2(movdqa xmm9, xmm11)
+                       a2(psrlq xmm10, 51)
+                       a2(psrlq xmm11, 51)
+                       a2(psllq xmm8, 13)
+                       a2(psllq xmm9, 13)
+                       a2(pxor xmm5, xmm10)
+                       a2(pxor xmm4, xmm11)
+                       a2(pxor xmm5, xmm8)
+                       a2(pxor xmm4, xmm9)
+                       a2(movdqa xmm10, xmm6)
+                       a2(movdqa xmm11, xmm7)
+                       a2(paddq xmm10, xmm5)
+                       a2(paddq xmm11, xmm4)
+                       a2(movdqa xmm8, xmm10)
+                       a2(movdqa xmm9, xmm11)
+                       a2(psrlq xmm10, 25)
+                       a2(psrlq xmm11, 25)
+                       a2(psllq xmm8, 39)
+                       a2(psllq xmm9, 39)
+                       a2(pxor xmm2, xmm10)
+                       a2(pxor xmm3, xmm11)
+                       a2(pxor xmm2, xmm8)
+                       a2(pxor xmm3, xmm9)
+                       a2(movdqa xmm8, xmm5)
+                       a2(movdqa xmm9, xmm4)
+                       a2(paddq xmm8, xmm2)
+                       a2(paddq xmm9, xmm3)
+                       a3(pshufd xmm8, xmm8, 0xb1)
+                       a3(pshufd xmm9, xmm9, 0xb1)
+                       a2(pxor xmm0, xmm8)
+                       a2(pxor xmm1, xmm9)
+                       a2(movdqa xmm10, xmm2)
+                       a2(movdqa xmm11, xmm3)
+                       a2(movdqa xmm2, xmm6)
+                       a2(movdqa xmm3, xmm7)
+                       a3(palignr xmm2, xmm7, 8)
+                       a3(palignr xmm3, xmm6, 8)
+                       a2(movdqa xmm6, xmm11)
+                       a2(movdqa xmm7, xmm10)
+                       a3(palignr xmm6, xmm10, 8)
+                       a3(palignr xmm7, xmm11, 8)
+                       a1(ja scrypt_salsa64_ssse3_loop)
+               a2(paddq xmm0,[rsp+0])
+               a2(paddq xmm1,[rsp+16])
+               a2(paddq xmm2,[rsp+32])
+               a2(paddq xmm3,[rsp+48])
+               a2(paddq xmm4,[rsp+64])
+               a2(paddq xmm5,[rsp+80])
+               a2(paddq xmm6,[rsp+96])
+               a2(paddq xmm7,[rsp+112])
+               a2(lea rax,[r8+r9])
+               a2(xor r8,rcx)
+               a2(and rax,~0xff)
+               a2(add r9,128)
+               a2(shr rax,1)
+               a2(add rax, rdi)
+               a2(cmp r9,rcx)
+               a2(movdqa [rax+0],xmm0)
+               a2(movdqa [rax+16],xmm1)
+               a2(movdqa [rax+32],xmm2)
+               a2(movdqa [rax+48],xmm3)
+               a2(movdqa [rax+64],xmm4)
+               a2(movdqa [rax+80],xmm5)
+               a2(movdqa [rax+96],xmm6)
+               a2(movdqa [rax+112],xmm7)
+               a1(jne scrypt_ChunkMix_ssse3_loop)
+       a2(mov rsp, rbp)
+       a1(pop rbp)
+       a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_ssse3)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3)
+
+#define SCRYPT_SALSA64_SSSE3
+
+static void asm_calling_convention
+scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+       uint32_t i, blocksPerChunk = r * 2, half = 0;
+       xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+       size_t rounds;
+
+       /* 1: X = B_{2r - 1} */
+       xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+       x0 = xmmp[0];
+       x1 = xmmp[1];
+       x2 = xmmp[2];
+       x3 = xmmp[3];
+       x4 = xmmp[4];
+       x5 = xmmp[5];
+       x6 = xmmp[6];
+       x7 = xmmp[7];
+
+       if (Bxor) {
+               xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+               x4 = _mm_xor_si128(x4, xmmp[4]);
+               x5 = _mm_xor_si128(x5, xmmp[5]);
+               x6 = _mm_xor_si128(x6, xmmp[6]);
+               x7 = _mm_xor_si128(x7, xmmp[7]);
+       }
+
+       /* 2: for i = 0 to 2r - 1 do */
+       for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+               /* 3: X = H(X ^ B_i) */
+               xmmp = (xmmi *)scrypt_block(Bin, i);
+               x0 = _mm_xor_si128(x0, xmmp[0]);
+               x1 = _mm_xor_si128(x1, xmmp[1]);
+               x2 = _mm_xor_si128(x2, xmmp[2]);
+               x3 = _mm_xor_si128(x3, xmmp[3]);
+               x4 = _mm_xor_si128(x4, xmmp[4]);
+               x5 = _mm_xor_si128(x5, xmmp[5]);
+               x6 = _mm_xor_si128(x6, xmmp[6]);
+               x7 = _mm_xor_si128(x7, xmmp[7]);
+
+               if (Bxor) {
+                       xmmp = (xmmi *)scrypt_block(Bxor, i);
+                       x0 = _mm_xor_si128(x0, xmmp[0]);
+                       x1 = _mm_xor_si128(x1, xmmp[1]);
+                       x2 = _mm_xor_si128(x2, xmmp[2]);
+                       x3 = _mm_xor_si128(x3, xmmp[3]);
+                       x4 = _mm_xor_si128(x4, xmmp[4]);
+                       x5 = _mm_xor_si128(x5, xmmp[5]);
+                       x6 = _mm_xor_si128(x6, xmmp[6]);
+                       x7 = _mm_xor_si128(x7, xmmp[7]);
+               }
+
+               t0 = x0;
+               t1 = x1;
+               t2 = x2;
+               t3 = x3;
+               t4 = x4;
+               t5 = x5;
+               t6 = x6;
+               t7 = x7;
+
+               for (rounds = 8; rounds; rounds -= 2) {
+                       z0 = _mm_add_epi64(x0, x2);
+                       z1 = _mm_add_epi64(x1, x3);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x6 = _mm_xor_si128(x6, z0);
+                       x7 = _mm_xor_si128(x7, z1);
+
+                       z0 = _mm_add_epi64(x6, x0);
+                       z1 = _mm_add_epi64(x7, x1);
+                       z2 = _mm_srli_epi64(z0, 64-13);
+                       z3 = _mm_srli_epi64(z1, 64-13);
+                       z0 = _mm_slli_epi64(z0, 13);
+                       z1 = _mm_slli_epi64(z1, 13);
+                       x4 = _mm_xor_si128(x4, z2);
+                       x5 = _mm_xor_si128(x5, z3);
+                       x4 = _mm_xor_si128(x4, z0);
+                       x5 = _mm_xor_si128(x5, z1);
+
+                       z0 = _mm_add_epi64(x4, x6);
+                       z1 = _mm_add_epi64(x5, x7);
+                       z2 = _mm_srli_epi64(z0, 64-39);
+                       z3 = _mm_srli_epi64(z1, 64-39);
+                       z0 = _mm_slli_epi64(z0, 39);
+                       z1 = _mm_slli_epi64(z1, 39);
+                       x2 = _mm_xor_si128(x2, z2);
+                       x3 = _mm_xor_si128(x3, z3);
+                       x2 = _mm_xor_si128(x2, z0);
+                       x3 = _mm_xor_si128(x3, z1);
+
+                       z0 = _mm_add_epi64(x2, x4);
+                       z1 = _mm_add_epi64(x3, x5);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x0 = _mm_xor_si128(x0, z0);
+                       x1 = _mm_xor_si128(x1, z1);
+
+                       z0 = x2;
+                       z1 = x3;
+                       x2 = _mm_alignr_epi8(x6, x7, 8);
+                       x3 = _mm_alignr_epi8(x7, x6, 8);
+                       x6 = _mm_alignr_epi8(z1, z0, 8);
+                       x7 = _mm_alignr_epi8(z0, z1, 8);
+
+                       z0 = _mm_add_epi64(x0, x2);
+                       z1 = _mm_add_epi64(x1, x3);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x6 = _mm_xor_si128(x6, z0);
+                       x7 = _mm_xor_si128(x7, z1);
+
+                       z0 = _mm_add_epi64(x6, x0);
+                       z1 = _mm_add_epi64(x7, x1);
+                       z2 = _mm_srli_epi64(z0, 64-13);
+                       z3 = _mm_srli_epi64(z1, 64-13);
+                       z0 = _mm_slli_epi64(z0, 13);
+                       z1 = _mm_slli_epi64(z1, 13);
+                       x5 = _mm_xor_si128(x5, z2);
+                       x4 = _mm_xor_si128(x4, z3);
+                       x5 = _mm_xor_si128(x5, z0);
+                       x4 = _mm_xor_si128(x4, z1);
+
+                       z0 = _mm_add_epi64(x5, x6);
+                       z1 = _mm_add_epi64(x4, x7);
+                       z2 = _mm_srli_epi64(z0, 64-39);
+                       z3 = _mm_srli_epi64(z1, 64-39);
+                       z0 = _mm_slli_epi64(z0, 39);
+                       z1 = _mm_slli_epi64(z1, 39);
+                       x2 = _mm_xor_si128(x2, z2);
+                       x3 = _mm_xor_si128(x3, z3);
+                       x2 = _mm_xor_si128(x2, z0);
+                       x3 = _mm_xor_si128(x3, z1);
+
+                       z0 = _mm_add_epi64(x2, x5);
+                       z1 = _mm_add_epi64(x3, x4);
+                       z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+                       z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+                       x0 = _mm_xor_si128(x0, z0);
+                       x1 = _mm_xor_si128(x1, z1);
+
+                       z0 = x2;
+                       z1 = x3;
+                       x2 = _mm_alignr_epi8(x6, x7, 8);
+                       x3 = _mm_alignr_epi8(x7, x6, 8);
+                       x6 = _mm_alignr_epi8(z1, z0, 8);
+                       x7 = _mm_alignr_epi8(z0, z1, 8);
+               }
+
+               x0 = _mm_add_epi64(x0, t0);
+               x1 = _mm_add_epi64(x1, t1);
+               x2 = _mm_add_epi64(x2, t2);
+               x3 = _mm_add_epi64(x3, t3);
+               x4 = _mm_add_epi64(x4, t4);
+               x5 = _mm_add_epi64(x5, t5);
+               x6 = _mm_add_epi64(x6, t6);
+               x7 = _mm_add_epi64(x7, t7);
+
+               /* 4: Y_i = X */
+               /* 6: B'[0..r-1] = Y_even */
+               /* 6: B'[r..2r-1] = Y_odd */
+               xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+               xmmp[0] = x0;
+               xmmp[1] = x1;
+               xmmp[2] = x2;
+               xmmp[3] = x3;
+               xmmp[4] = x4;
+               xmmp[5] = x5;
+               xmmp[6] = x6;
+               xmmp[7] = x7;
+       }
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+       /* uses salsa64_core_tangle_sse2 */
+
+       #undef SCRYPT_MIX
+       #define SCRYPT_MIX "Salsa64/8-SSSE3"
+       #undef SCRYPT_SALSA64_INCLUDED
+       #define SCRYPT_SALSA64_INCLUDED
+#endif
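Note: the lane rotation at the end of each SSSE3 half-round uses _mm_alignr_epi8(a, b, 8), which returns the high 64-bit lane of b followed by the low lane of a; the SSE2 intrinsic version above builds the same value from _mm_unpackhi_epi64 over an _mm_unpacklo_epi64 pair. A small equivalence check (assumes an SSSE3-capable build, e.g. gcc -mssse3; the variable names are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h> /* SSSE3: _mm_alignr_epi8; pulls in the SSE2 intrinsics too */

int main(void) {
	const uint64_t a[2] = { 0x1111111111111111ull, 0x2222222222222222ull }; /* plays x6 */
	const uint64_t b[2] = { 0x3333333333333333ull, 0x4444444444444444ull }; /* plays x7 */
	__m128i x6 = _mm_loadu_si128((const __m128i *)a);
	__m128i x7 = _mm_loadu_si128((const __m128i *)b);

	__m128i ssse3 = _mm_alignr_epi8(x6, x7, 8);                         /* { hi(x7), lo(x6) } */
	__m128i sse2  = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); /* same result */

	uint64_t r0[2], r1[2];
	_mm_storeu_si128((__m128i *)r0, ssse3);
	_mm_storeu_si128((__m128i *)r1, sse2);
	printf("match: %s\n", (r0[0] == r1[0] && r0[1] == r1[1]) ? "yes" : "no");
	return 0;
}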
diff --git a/scryptjane/scrypt-jane-mix_salsa64.h b/scryptjane/scrypt-jane-mix_salsa64.h
new file mode 100644 (file)
index 0000000..2aec04f
--- /dev/null
@@ -0,0 +1,41 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)
+
+#undef SCRYPT_MIX
+#define SCRYPT_MIX "Salsa64/8 Ref"
+
+#undef SCRYPT_SALSA64_INCLUDED
+#define SCRYPT_SALSA64_INCLUDED
+#define SCRYPT_SALSA64_BASIC
+
+static void
+salsa64_core_basic(uint64_t state[16]) {
+       const size_t rounds = 8;
+       uint64_t v[16], t;
+       size_t i;
+
+       for (i = 0; i < 16; i++) v[i] = state[i];
+
+       #define G(a,b,c,d) \
+               t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \
+               t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \
+               t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \
+               t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \
+
+       for (i = 0; i < rounds; i += 2) {
+               G( 0, 4, 8,12);
+               G( 5, 9,13, 1);
+               G(10,14, 2, 6);
+               G(15, 3, 7,11);
+               G( 0, 1, 2, 3);
+               G( 5, 6, 7, 4);
+               G(10,11, 8, 9);
+               G(15,12,13,14);
+       }
+
+       for (i = 0; i < 16; i++) state[i] += v[i];
+
+       #undef G
+}
+
+#endif
+
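Note: G() above is the 64-bit analogue of the Salsa20 quarter-round, with rotation distances 32, 13, 39 and 32. The SIMD versions earlier in this diff have no 64-bit rotate instruction, so they emulate the 13- and 39-bit rotates with a shift-left/shift-right pair and replace the 32-bit rotates with a 32-bit lane swap (pshufd 0xb1 / _MM_SHUFFLE(2,3,0,1)). A self-contained sketch of one double round; rotl64 and g stand in for the ROTL64 and G macros and are not part of the patch:

#include <stdint.h>
#include <stdio.h>

static uint64_t rotl64(uint64_t x, int n) { return (x << n) | (x >> (64 - n)); }

static void g(uint64_t v[16], int a, int b, int c, int d) {
	uint64_t t;
	t = v[a] + v[d]; v[b] ^= rotl64(t, 32);
	t = v[b] + v[a]; v[c] ^= rotl64(t, 13);
	t = v[c] + v[b]; v[d] ^= rotl64(t, 39);
	t = v[d] + v[c]; v[a] ^= rotl64(t, 32);
}

int main(void) {
	uint64_t v[16];
	int i;
	for (i = 0; i < 16; i++) v[i] = (uint64_t)i;
	/* one column pass + one row pass, mirroring the loop body of salsa64_core_basic */
	g(v,  0,  4,  8, 12); g(v,  5,  9, 13,  1); g(v, 10, 14,  2,  6); g(v, 15,  3,  7, 11);
	g(v,  0,  1,  2,  3); g(v,  5,  6,  7,  4); g(v, 10, 11,  8,  9); g(v, 15, 12, 13, 14);
	printf("v[0] after one double round: %016llx\n", (unsigned long long)v[0]);
	return 0;
}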
diff --git a/scryptjane/scrypt-jane-pbkdf2.h b/scryptjane/scrypt-jane-pbkdf2.h
index 711e3d633293aa44823ef64e63dfc71d71cd4517..761b812c56433f0221dacc1e035ea2040fbfb4d2 100644 (file)
@@ -40,7 +40,9 @@ scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
                pad[i] ^= (0x5c ^ 0x36);
        scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
 
+#ifdef SCRYPT_PREVENT_STATE_LEAK
        scrypt_ensure_zero(pad, sizeof(pad));
+#endif
 }
 
 static void
@@ -59,7 +61,9 @@ scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
        scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
        scrypt_hash_finish(&st->outer, mac);
 
+#ifdef SCRYPT_PREVENT_STATE_LEAK
        scrypt_ensure_zero(st, sizeof(*st));
+#endif
 }
 
 static void
@@ -69,7 +73,7 @@ scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt,
        uint8_t be[4];
        uint32_t i, j, blocks;
        uint64_t c;
-       
+
        /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
 
        /* hmac(password, ...) */
@@ -105,8 +109,53 @@ scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt,
                bytes -= SCRYPT_HASH_DIGEST_SIZE;
        }
 
+#ifdef SCRYPT_PREVENT_STATE_LEAK
+       scrypt_ensure_zero(ti, sizeof(ti));
+       scrypt_ensure_zero(u, sizeof(u));
+       scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
+       scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
+#endif
+}
+
+/*
+ * Special version where N = 1
+ *  - mikaelh
+ */
+static void
+scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) {
+       scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
+       scrypt_hash_digest ti, u;
+       uint8_t be[4];
+       uint32_t i, /*j,*/ blocks;
+       //uint64_t c;
+
+       /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
+
+       /* hmac(password, ...) */
+       scrypt_hmac_init(&hmac_pw, password, password_len);
+
+       /* hmac(password, salt...) */
+       hmac_pw_salt = hmac_pw;
+       scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
+
+       blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
+       for (i = 1; i <= blocks; i++) {
+               /* U1 = hmac(password, salt || be(i)) */
+               U32TO8_BE(be, i);
+               work = hmac_pw_salt;
+               scrypt_hmac_update(&work, be, 4);
+               scrypt_hmac_finish(&work, ti);
+               memcpy(u, ti, sizeof(u));
+
+               memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
+               out += SCRYPT_HASH_DIGEST_SIZE;
+               bytes -= SCRYPT_HASH_DIGEST_SIZE;
+       }
+
+#ifdef SCRYPT_PREVENT_STATE_LEAK
        scrypt_ensure_zero(ti, sizeof(ti));
        scrypt_ensure_zero(u, sizeof(u));
        scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
        scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
+#endif
 }
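Note: scrypt_pbkdf2_1 above can drop the inner j/c loop because scrypt only ever invokes PBKDF2 with an iteration count of 1: each output block T_i is then just U_1 = HMAC(password, salt || BE32(i)), and the XOR accumulation over U_2..U_c never contributes. The sketch below demonstrates that collapse with a toy PRF; toy_prf, pbkdf2_block and the 8-byte digest are stand-ins for illustration, not the real scrypt_hmac_* code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-in for HMAC(password, data); only shows the control flow, NOT a real PRF. */
static void toy_prf(const uint8_t *pw, size_t pwlen,
                    const uint8_t *data, size_t datalen, uint8_t out[8]) {
	uint64_t h = 0x9e3779b97f4a7c15ull;
	size_t i;
	for (i = 0; i < pwlen; i++)   h = (h ^ pw[i])   * 0x100000001b3ull;
	for (i = 0; i < datalen; i++) h = (h ^ data[i]) * 0x100000001b3ull;
	for (i = 0; i < 8; i++) out[i] = (uint8_t)(h >> (8 * i));
}

/* One PBKDF2 block: T_i = U_1 ^ U_2 ^ ... ^ U_c.  With c == 1 the loop never runs. */
static void pbkdf2_block(const uint8_t *pw, size_t pwlen,
                         const uint8_t *salt, size_t saltlen,
                         uint64_t c, uint32_t i, uint8_t ti[8]) {
	uint8_t msg[64], u[8];
	size_t mlen = saltlen;
	uint64_t j;
	int k;
	memcpy(msg, salt, saltlen);
	msg[mlen++] = (uint8_t)(i >> 24); msg[mlen++] = (uint8_t)(i >> 16);
	msg[mlen++] = (uint8_t)(i >> 8);  msg[mlen++] = (uint8_t)i;
	toy_prf(pw, pwlen, msg, mlen, u);            /* U_1 */
	memcpy(ti, u, 8);
	for (j = 1; j < c; j++) {                    /* skipped entirely when c == 1 */
		toy_prf(pw, pwlen, u, 8, u);         /* U_{j+1} = PRF(pw, U_j) */
		for (k = 0; k < 8; k++) ti[k] ^= u[k];
	}
}

int main(void) {
	uint8_t t1[8], t2[8];
	uint8_t msg[8] = { 'N', 'a', 'C', 'l', 0, 0, 0, 1 };
	pbkdf2_block((const uint8_t *)"password", 8, (const uint8_t *)"NaCl", 4, 1, 1, t1);
	toy_prf((const uint8_t *)"password", 8, msg, 8, t2);   /* single PRF over salt || BE32(1) */
	printf("c=1 block == single PRF call: %s\n", memcmp(t1, t2, 8) == 0 ? "yes" : "no");
	return 0;
}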
diff --git a/scryptjane/scrypt-jane-portable-x86.h b/scryptjane/scrypt-jane-portable-x86.h
index 03282fa8a1780312670fb91f9a2165efe8bb362f..d8325f0596c9d21328e899137c6742247f03e35d 100644 (file)
        #define X86_64USE_INTRINSIC
 #endif
 
+#ifdef __AVX__
+#define X86_INTRINSIC_AVX
+#endif
+
 #if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
        #define X86_INTRINSIC
        #if defined(__SSE__)
        #if defined(__AVX__)
                #define X86_INTRINSIC_AVX
        #endif
+
+       /* HACK - I want to use CPU_X86_FORCE_INTRINSICS with mingw64 so these need to be undefined - mikaelh */
+       #undef X86_64ASM_SSSE3
+       #undef X86_64ASM_AVX
+       #undef X86_64ASM_SSE2
+       #undef X86ASM_AVX
+       #undef X86ASM_SSSE3
+       #undef X86ASM_SSE2
+       #undef X86ASM_SSE
 #endif
 
 /* only use simd on windows (or SSE2 on gcc)! */
                #define X86_INTRINSIC_SSSE3
                #include <tmmintrin.h>
        #endif
+       #if defined (X86_INTRINSIC_AVX)
+               #define X86_INTRINSIC_AVX
+               #include <immintrin.h>
+       #endif
 #endif
 
 
        } packedelem64;
 #endif
 
-#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3)
-       const packedelem8 MM16 ssse3_rotr16_64bit      = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}};
-       const packedelem8 MM16 ssse3_rotl16_32bit      = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
-       const packedelem8 MM16 ssse3_rotl8_32bit       = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
-       const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}};
+#if defined(X86_INTRINSIC_SSSE3)
+       static const packedelem8 MM16 ssse3_rotl16_32bit      = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
+       static const packedelem8 MM16 ssse3_rotl8_32bit       = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
 #endif
 
 /*
                a1(..)
                a2(.., ..)
                a3(.., .., ..)
-               a1(ret)
+		64bit OR 0 parameters: a1(ret)
+		32bit AND n parameters: aret(4n), e.g. aret(16) for 4 parameters
        asm_naked_fn_end(name)
 */
 
        #define asm_align8 a1(ALIGN 8)
        #define asm_align16 a1(ALIGN 16)
 
-       #define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn
+       #define asm_calling_convention STDCALL
+       #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
        #define asm_naked_fn(fn) {
        #define asm_naked_fn_end(fn) }
 #elif defined(COMPILER_GCC)
        #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
        #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
        #define GNU_ASL(x) "\n" #x ":\n"
+       #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
        #define GNU_ASJ(x, y, z) #x " " #y #z ";"
 
        #define a1(x) GNU_AS1(x)
        #define asm_align8 a1(.align 8)
        #define asm_align16 a1(.align 16)
 
-       #define asm_naked_fn_proto(type, fn) extern type STDCALL fn
-       #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn)
-       #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type  " #fn ",@function\n.size " #fn ",.-" #fn "\n" );
+       #if defined(OS_WINDOWS)
+               #define asm_calling_convention CDECL
+               #define aret(n) a1(ret)
+               #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
+       #else
+               #define asm_calling_convention STDCALL
+               #define aret(n) a1(ret n)
+               #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type  " #fn ",@function\n.size " #fn ",.-" #fn "\n" );
+       #endif
+       #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
+       #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
+
        #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
        #define asm_gcc_parms() ".att_syntax prefix;"
        #define asm_gcc_trashed() __asm__ __volatile__("" :::
@@ -361,4 +388,4 @@ get_top_cpuflag_desc(size_t flag) {
        #endif
 #endif
 
-#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
\ No newline at end of file
+#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
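Note: the GCC branch above assembles its naked functions from one big Intel-syntax string: a1/a2/a3 simply stringize their arguments, and the new GNU_ASFN emits both an underscore-prefixed label and a plain one, presumably so the symbol resolves whether or not the target prepends an underscore to C names (MinGW versus ELF). A small illustration of the stringizing mechanism; the DEMO_* macros are local stand-ins, not the patch's macros:

#include <stdio.h>

#define DEMO_AS1(x)    #x ";\n"
#define DEMO_AS2(x, y) #x ", " #y ";\n"
#define DEMO_ASFN(x)   "\n_" #x ":\n" #x ":\n"

int main(void) {
	/* adjacent string literals concatenate into one assembly fragment */
	const char *fragment =
		DEMO_ASFN(scrypt_ChunkMix_sse2)
		DEMO_AS1(push rbp)
		DEMO_AS2(mov rbp, rsp)
		DEMO_AS2(and rsp, ~63);
	fputs(fragment, stdout);
	return 0;
}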
diff --git a/scryptjane/scrypt-jane-portable.h b/scryptjane/scrypt-jane-portable.h
index 33c8c2cade965252d91f603abd038f66f4983df0..939fc9851034103c8a9f227d8763c26b40e17a17 100644 (file)
@@ -47,7 +47,6 @@
        #pragma warning(disable : 4127) /* conditional expression is constant */
        #pragma warning(disable : 4100) /* unreferenced formal parameter */
        
-       #define _CRT_SECURE_NO_WARNINGS 
        #include <float.h>
        #include <stdlib.h> /* _rotl */
        #include <intrin.h>
diff --git a/scryptjane/scrypt-jane-romix-basic.h b/scryptjane/scrypt-jane-romix-basic.h
index ca1df02d531f2e5726b178e42a3cf0652722c10b..9e27a0d7ab524bd6f0cf3209f4980bb32f28e9f6 100644 (file)
@@ -4,12 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc
 #endif
 
 /* romix pre/post nop function */
-static void STDCALL
+//static void asm_calling_convention
 scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
 }
 
 /* romix pre/post endian conversion function */
-static void STDCALL
+//static void asm_calling_convention
 scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
 #if !defined(CPU_LE)
        static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
@@ -24,8 +24,8 @@ scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
 }
 
 /* chunkmix test function */
-typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
-typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
+typedef void (*chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
+typedef void (*blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
 
 static int
 scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
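Note: scrypt_romix_convert_endian above only byte-swaps when the build is not already known to be little-endian, deciding at runtime with the classic two-byte union probe: storing {1,0} and reading it back as a 16-bit word yields 1 exactly on little-endian machines. A standalone version of that probe (illustrative only):

#include <stdint.h>
#include <stdio.h>

int main(void) {
	const union { uint8_t b[2]; uint16_t w; } endian_test = {{1, 0}};
	printf("little-endian: %s\n", endian_test.w == 1 ? "yes" : "no");
	return 0;
}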
diff --git a/scryptjane/scrypt-jane-romix-template.h b/scryptjane/scrypt-jane-romix-template.h
index 2fd7674eca94a8e17252f5b4a169f92f9cc22bf6..f7dda63b5968e91150df0b5475994158b49992be 100644 (file)
@@ -17,7 +17,7 @@
 
        2*r: number of blocks in the chunk
 */
-static void STDCALL
+//static void asm_calling_convention
 SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
        scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block;
        uint32_t i, j, blocksPerChunk = r * 2, half = 0;
@@ -107,6 +107,67 @@ SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chu
        SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
 }
 
+/*
+ * Special version with hard-coded r = 1
+ *  - mikaelh
+ */
+static void NOINLINE FASTCALL
+scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) {
+       const uint32_t r = 1;
+       uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
+       scrypt_mix_word_t *block = V;
+
+       SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
+
+       /* 1: X = B */
+       /* implicit */
+
+       /* 2: for i = 0 to N - 1 do */
+       memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
+       for (i = 0; i < N - 1; i++, block += chunkWords) {
+               /* 3: V_i = X */
+               /* 4: X = H(X) */
+#ifdef SCRYPT_CHUNKMIX_1_FN
+               SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block);
+#else
+               SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
+#endif
+       }
+#ifdef SCRYPT_CHUNKMIX_1_FN
+       SCRYPT_CHUNKMIX_1_FN(X, block);
+#else
+       SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
+#endif
+
+       /* 6: for i = 0 to N - 1 do */
+       for (i = 0; i < N; i += 2) {
+               /* 7: j = Integerify(X) % N */
+               j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
+
+               /* 8: X = H(Y ^ V_j) */
+#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
+               SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords));
+#else
+               SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
+#endif
+
+               /* 7: j = Integerify(Y) % N */
+               j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
+
+               /* 8: X = H(Y ^ V_j) */
+#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
+               SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords));
+#else
+               SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
+#endif
+       }
+
+       /* 10: B' = X */
+       /* implicit */
+
+       SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
+}
+
 #endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
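Note: the index computation in scrypt_ROMix_1 above, j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1), implements step 7's Integerify(X) mod N as a mask over the first word of the last block. That is only a genuine modulo when N is a power of two, which holds here because the wrapper derives N as 1 << (Nfactor + 1) (see the scrypt_maxN comment in algo/scrypt-jane.c). A tiny check of the mask-versus-modulo equivalence:

#include <stdint.h>
#include <stdio.h>

int main(void) {
	uint64_t x = 0x0123456789abcdefull;   /* stand-in for X[chunkWords - SCRYPT_BLOCK_WORDS] */
	uint32_t N = 1u << 4;                 /* e.g. Nfactor = 3 gives N = 1 << (3 + 1) = 16 */
	printf("mask: %llu  mod: %llu\n",
	       (unsigned long long)(x & (N - 1)),
	       (unsigned long long)(x % N));
	return 0;
}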
 
 
diff --git a/scryptjane/scrypt-jane-salsa.h b/scryptjane/scrypt-jane-salsa.h
index 0c1604bad2e09428f371b892633230c692143c4c..76f3da6305737b7106ea48f7e8572456d17f3803 100644 (file)
@@ -64,13 +64,16 @@ scrypt_getROMix() {
 #if defined(SCRYPT_TEST_SPEED)
 static size_t
 available_implementations() {
+       size_t cpuflags = detect_cpu();
        size_t flags = 0;
 
 #if defined(SCRYPT_SALSA_AVX)
+       if (cpuflags & cpu_avx)
                flags |= cpu_avx;
 #endif
 
 #if defined(SCRYPT_SALSA_SSE2)
+       if (cpuflags & cpu_sse2)
                flags |= cpu_sse2;
 #endif
 
diff --git a/scryptjane/scrypt-jane-salsa64.h b/scryptjane/scrypt-jane-salsa64.h
new file mode 100644 (file)
index 0000000..ecc87f5
--- /dev/null
@@ -0,0 +1,133 @@
+#define SCRYPT_MIX_BASE "Salsa64/8"
+
+typedef uint64_t scrypt_mix_word_t;
+
+#define SCRYPT_WORDTO8_LE U64TO8_LE
+#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
+
+#define SCRYPT_BLOCK_BYTES 128
+#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+
+/* must have these here in case block bytes is ever != 64 */
+#include "scrypt-jane-romix-basic.h"
+
+#include "scrypt-jane-mix_salsa64-avx.h"
+#include "scrypt-jane-mix_salsa64-ssse3.h"
+#include "scrypt-jane-mix_salsa64-sse2.h"
+#include "scrypt-jane-mix_salsa64.h"
+
+#if defined(SCRYPT_SALSA64_AVX)
+       #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
+       #define SCRYPT_ROMIX_FN scrypt_ROMix_avx
+       #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+       #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+       #include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+       #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
+       #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
+       #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+       #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+       #include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+       #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
+       #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
+       #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+       #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+       #include "scrypt-jane-romix-template.h"
+#endif
+
+/* cpu agnostic */
+#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
+#define SCRYPT_MIX_FN salsa64_core_basic
+#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
+#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
+#include "scrypt-jane-romix-template.h"
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+static scrypt_ROMixfn
+scrypt_getROMix() {
+       size_t cpuflags = detect_cpu();
+
+#if defined(SCRYPT_SALSA64_AVX)
+       if (cpuflags & cpu_avx)
+               return scrypt_ROMix_avx;
+       else
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+       if (cpuflags & cpu_ssse3)
+               return scrypt_ROMix_ssse3;
+       else
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+       if (cpuflags & cpu_sse2)
+               return scrypt_ROMix_sse2;
+       else
+#endif
+
+       return scrypt_ROMix_basic;
+}
+#endif
+
+
+#if defined(SCRYPT_TEST_SPEED)
+static size_t
+available_implementations() {
+       size_t cpuflags = detect_cpu();
+       size_t flags = 0;
+
+#if defined(SCRYPT_SALSA64_AVX)
+       if (cpuflags & cpu_avx)
+               flags |= cpu_avx;
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+       if (cpuflags & cpu_ssse3)
+               flags |= cpu_ssse3;
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+       if (cpuflags & cpu_sse2)
+               flags |= cpu_sse2;
+#endif
+
+       return flags;
+}
+#endif
+
+static int
+scrypt_test_mix() {
+       static const uint8_t expected[16] = {
+               0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
+       };
+
+       int ret = 1;
+       size_t cpuflags = detect_cpu();
+
+#if defined(SCRYPT_SALSA64_AVX)
+       if (cpuflags & cpu_avx)
+               ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+       if (cpuflags & cpu_ssse3)
+               ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+       if (cpuflags & cpu_sse2)
+               ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_BASIC)
+       ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
+#endif
+
+       return ret;
+}
+
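Note: scrypt_getROMix above resolves the fastest compiled-in ROMix at runtime from detect_cpu(), falling through from AVX to SSSE3 to SSE2 to the portable routine, and the change to scrypt-jane-salsa.h earlier in this diff applies the same cpuflags filtering to available_implementations(). A minimal sketch of that dispatch pattern; detect_flags and the romix_* stubs are stand-ins, not the real functions:

#include <stdio.h>

enum { CPU_SSE2 = 1 << 0, CPU_SSSE3 = 1 << 1, CPU_AVX = 1 << 2 };

typedef void (*romix_fn)(void);
static void romix_avx(void)   { puts("Salsa64/8-AVX"); }
static void romix_ssse3(void) { puts("Salsa64/8-SSSE3"); }
static void romix_sse2(void)  { puts("Salsa64/8-SSE2"); }
static void romix_basic(void) { puts("Salsa64/8 Ref"); }

static unsigned detect_flags(void) { return CPU_SSE2 | CPU_SSSE3; } /* pretend CPU */

static romix_fn get_romix(void) {
	unsigned flags = detect_flags();
	if (flags & CPU_AVX)   return romix_avx;
	if (flags & CPU_SSSE3) return romix_ssse3;
	if (flags & CPU_SSE2)  return romix_sse2;
	return romix_basic;
}

int main(void) { get_romix()(); return 0; }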
diff --git a/scryptjane/scrypt-jane-test-vectors.h b/scryptjane/scrypt-jane-test-vectors.h
index a1e4c619a0a2c7c4d5ed3303ff7ca462ead2a3dc..d7740917e4b83e34385c9c6f1ba679a86ae99623 100644 (file)
@@ -3,9 +3,14 @@ typedef struct scrypt_test_setting_t {
        uint8_t Nfactor, rfactor, pfactor;
 } scrypt_test_setting;
 
+/*
+ * I'm hardcoding the values of p and r, which means they can't be tested
+ * anymore. A new test case with a different value for N should maybe be added.
+ *  - mikaelh
+ */
 static const scrypt_test_setting post_settings[] = {
        {"", "", 3, 0, 0},
-       {"password", "NaCl", 9, 3, 4},
+//     {"password", "NaCl", 9, 3, 4},
        {0}
 };
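Note: the factor fields above appear to expand as N = 1 << (Nfactor + 1), r = 1 << rfactor and p = 1 << pfactor, judging by the scrypt_maxN/scrypt_maxr/scrypt_maxp comments in algo/scrypt-jane.c; with r and p now hard-coded to 1, the removed "NaCl" vector (rfactor 3, pfactor 4) could no longer pass, which is what the comment above is pointing at. A small sketch of the surviving test case under that assumption:

#include <stdint.h>
#include <stdio.h>

int main(void) {
	uint8_t Nfactor = 3, rfactor = 0, pfactor = 0;   /* the remaining {"", "", 3, 0, 0} entry */
	uint64_t N = 1ull << (Nfactor + 1);
	uint32_t r = 1u << rfactor;
	uint32_t p = 1u << pfactor;
	printf("N=%llu r=%u p=%u\n", (unsigned long long)N, r, p);   /* expected: N=16 r=1 p=1 */
	return 0;
}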
 
diff --git a/util.c b/util.c
index 7f0ae051d9c5a6e12d986e368466901c35ef57a8..ad12706ca2cae08df4c84c91fa2baf54840b0232 100644 (file)
--- a/util.c
+++ b/util.c
@@ -456,6 +456,7 @@ json_t *json_rpc_call(CURL *curl, const char *url,
        curl_easy_setopt(curl, CURLOPT_URL, url);
        if (opt_cert)
                curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
+       curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_easy_setopt(curl, CURLOPT_ENCODING, "");
        curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
        curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
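Note: the added CURLOPT_SSL_VERIFYPEER line switches off TLS peer-certificate verification for every pool connection. A more conservative arrangement, sketched here under the assumption that verification should stay on whenever the existing opt_cert CA-file option is set (this is not what the commit does), would look like:

#include <curl/curl.h>

static const char *opt_cert; /* mirrors the global CA-file option used in util.c */

static void setup_tls(CURL *curl) {
	if (opt_cert) {
		curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
		curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
	} else {
		curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
	}
}

int main(void) {
	CURL *curl;
	curl_global_init(CURL_GLOBAL_DEFAULT);
	curl = curl_easy_init();
	if (curl) {
		setup_tls(curl);
		curl_easy_cleanup(curl);
	}
	curl_global_cleanup();
	return 0;
}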