algo/pluck.c \
algo/qubit.c \
algo/scrypt.c \
+ algo/scrypt-jane.c \
algo/sha2.c \
algo/sibcoin.c \
algo/skein.c \
if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
+cpuminer_LDADD += -lcrypt32 -lgdi32
endif
if HAVE_WINDOWS
--- /dev/null
+#include "miner.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include "inttypes.h"
+
+/* Hard-coded scrypt parameters r and p - mikaelh */
+#define SCRYPT_R 1
+#define SCRYPT_P 1
+
+/* Only the intrinsics versions are optimized for hard-coded values - mikaelh */
+#define CPU_X86_FORCE_INTRINSICS
+#define SCRYPT_KECCAK512
+#define SCRYPT_CHACHA
+#define SCRYPT_CHOOSE_COMPILETIME
+
+//#include "scrypt-jane.h"
+#include "../scryptjane/scrypt-jane-portable.h"
+#include "../scryptjane/scrypt-jane-hash.h"
+#include "../scryptjane/scrypt-jane-romix.h"
+#include "../scryptjane/scrypt-jane-test-vectors.h"
+
+
+#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */
+#if (SCRYPT_BLOCK_BYTES == 64)
+#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 128)
+#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 256)
+#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 512)
+#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
+#endif
+#define scrypt_maxr scrypt_r_32kb /* 32kb */
+#define scrypt_maxp 25 /* (1 << 25) = ~33 million */
+
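+/* Aligned allocation helper: mem holds the raw malloc() pointer, ptr is mem
+   rounded up to SCRYPT_BLOCK_BYTES alignment. */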
+typedef struct scrypt_aligned_alloc_t {
+ uint8_t *mem, *ptr;
+} scrypt_aligned_alloc;
+
+static int
+scrypt_alloc(uint64_t size, scrypt_aligned_alloc *aa) {
+ static const size_t max_alloc = (size_t)-1;
+ size += (SCRYPT_BLOCK_BYTES - 1);
+ if (size > max_alloc)
+ return 0; // scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
+	aa->mem = (uint8_t *)malloc((size_t)size);
+	if (!aa->mem)
+		return 0; // scrypt_fatal_error("scrypt: out of memory");
+	aa->ptr = (uint8_t *)(((size_t)aa->mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+	return 1;
+}
+
+static void
+scrypt_free(scrypt_aligned_alloc *aa) {
+ free(aa->mem);
+}
+
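+/* scrypt with hard-coded r = 1, p = 1. The caller supplies the working
+   buffers: X (p * chunk_bytes), Y (scratch) and V (N chunks), so they can be
+   allocated once and reused for every nonce. */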
+void
+scrypt_N_1_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) {
+ uint32_t chunk_bytes, i;
+ const uint32_t r = SCRYPT_R;
+ const uint32_t p = SCRYPT_P;
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+ scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
+#endif
+
+ chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
+
+ /* 1: X = PBKDF2(password, salt) */
+ scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, chunk_bytes * p);
+
+ /* 2: X = ROMix(X) */
+ for (i = 0; i < p; i++)
+ scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N);
+
+ /* 3: Out = PBKDF2(password, X) */
+ scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes);
+
+#ifdef SCRYPT_PREVENT_STATE_LEAK
+ /* This is an unnecessary security feature - mikaelh */
+ scrypt_ensure_zero(Y, (p + 1) * chunk_bytes);
+#endif
+}
+
+
+// increasing Nfactor gradually
+const unsigned char minNfactor = 4;
+const unsigned char maxNfactor = 30;
+
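+/* Derive the memory-cost exponent (Nfactor) from the time elapsed since the
+   chain start time, clamped to [minNfactor, maxNfactor]. */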
+unsigned char GetNfactor(unsigned int nTimestamp, unsigned int ntime) {
+ int l = 0;
+ unsigned long int s;
+ int n;
+ unsigned char N;
+
+ if (nTimestamp <= ntime)
+ return 4;
+
+ s = nTimestamp - ntime;
+ while ((s >> 1) > 3) {
+ l += 1;
+ s >>= 1;
+ }
+
+ s &= 3;
+
+ n = (l * 170 + s * 25 - 2320) / 100;
+
+ if (n < 0) n = 0;
+
+ if (n > 255) {
+ n = 255;
+ // printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n);
+ }
+
+ N = (unsigned char)n;
+ //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfactor), maxNfactor));
+
+ if (N<minNfactor) return minNfactor;
+ if (N>maxNfactor) return maxNfactor;
+ return N;
+}
+
+
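+/* Scan nonces with scrypt-jane using N = 2^(Nfactor + 1) and r = p = 1.
+   Returns 1 if a hash below the target was found (or on allocation failure),
+   0 when the nonce range is exhausted or a restart is requested. */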
+int scanhash_scryptjane(int Nfactor, int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+ uint32_t max_nonce, uint64_t *hashes_done)
+{
+ scrypt_aligned_alloc YX, V;
+ uint8_t *X, *Y;
+ uint32_t N, chunk_bytes;
+ const uint32_t r = SCRYPT_R;
+ const uint32_t p = SCRYPT_P;
+
+ uint32_t _ALIGN(64) endiandata[20];
+ const uint32_t first_nonce = pdata[19];
+ uint32_t nonce = first_nonce;
+
+ if (opt_benchmark)
+ ((uint32_t*)ptarget)[7] = 0x0000ff;
+
+ for (int k = 0; k < 20; k++)
+ be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+
+ //Nfactor = GetNfactor(data[17], ntime);
+ //if (Nfactor > scrypt_maxN) {
+ // return 1;
+ // //scrypt_fatal_error("scrypt: N out of range");
+ //}
+
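+	/* scrypt-jane memory cost: N = 2^(Nfactor + 1) */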
+	N = ((uint32_t)1 << (Nfactor + 1)); /* unsigned shift: 1 << 31 would overflow a signed int */
+
+ chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
+ if (!scrypt_alloc((uint64_t)N * chunk_bytes, &V)) return 1;
+ if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) {
+ scrypt_free(&V);
+ return 1;
+ }
+
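+	/* YX.ptr holds the ROMix scratch block Y followed by the p-chunk buffer X */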
+ Y = YX.ptr;
+ X = Y + chunk_bytes;
+
+ do {
+ const uint32_t Htarg = ptarget[7];
+ uint32_t hash[8];
+ be32enc(&endiandata[19], nonce);
+
+ scrypt_N_1_1((unsigned char *)endiandata, 80,
+ (unsigned char *)endiandata, 80,
+ N, (unsigned char *)hash, 32, X, Y, V.ptr);
+
+ if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+ pdata[19] = nonce;
+			*hashes_done = pdata[19] - first_nonce + 1;
+ scrypt_free(&V);
+ scrypt_free(&YX);
+ return 1;
+ }
+ nonce++;
+
+ } while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+ pdata[19] = nonce;
+ *hashes_done = pdata[19] - first_nonce + 1;
+
+ scrypt_free(&V);
+ scrypt_free(&YX);
+ return 0;
+}
\ No newline at end of file
ALGO_X15, /* X15 Whirlpool */
ALGO_YESCRYPT,
ALGO_ZR5,
+ ALGO_SCRYPTJANE,
ALGO_COUNT
};
"x15",
"yescrypt",
"zr5",
+ "scryptjane",
"\0"
};
x15 X15\n\
yescrypt Yescrypt\n\
zr5 ZR5\n\
+ scryptjane:N\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\
void get_currentalgo(char* buf, int sz)
{
- snprintf(buf, sz, "%s", algo_names[opt_algo]);
+ if (opt_algo == ALGO_SCRYPTJANE)
+ snprintf(buf, sz, "%s:%d", algo_names[opt_algo], opt_scrypt_n);
+ else
+ snprintf(buf, sz, "%s", algo_names[opt_algo]);
}
void proper_exit(int reason)
case ALGO_CRYPTOLIGHT:
case ALGO_CRYPTONIGHT:
case ALGO_PLUCK:
+ case ALGO_SCRYPTJANE:
sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate);
applog(LOG_NOTICE, "accepted: %lu/%lu (%.2f%%), %s H/s %s",
accepted_count, accepted_count + rejected_count,
switch (opt_algo) {
case ALGO_DROP:
case ALGO_SCRYPT:
+ case ALGO_SCRYPTJANE:
case ALGO_NEOSCRYPT:
case ALGO_PLUCK:
case ALGO_YESCRYPT:
case ALGO_AXIOM:
case ALGO_CRYPTOLIGHT:
case ALGO_CRYPTONIGHT:
+ case ALGO_SCRYPTJANE:
max64 = 0x40LL;
break;
case ALGO_DROP:
rc = scanhash_pentablake(thr_id, work.data, work.target, max_nonce,
&hashes_done);
break;
+ case ALGO_SCRYPTJANE:
+ rc = scanhash_scryptjane(opt_scrypt_n, thr_id, work.data, work.target, max_nonce, &hashes_done);
+ break;
default:
/* should never happen */
goto out;
case ALGO_CRYPTOLIGHT:
case ALGO_CRYPTONIGHT:
case ALGO_PLUCK:
+ case ALGO_SCRYPTJANE:
applog(LOG_INFO, "CPU #%d: %.2f H/s", thr_id, thr_hashrates[thr_id]);
break;
default:
switch(opt_algo) {
case ALGO_CRYPTOLIGHT:
case ALGO_CRYPTONIGHT:
+ case ALGO_AXIOM:
+ case ALGO_SCRYPTJANE:
sprintf(s, "%.3f", hashrate);
applog(LOG_NOTICE, "Total: %s H/s", s);
break;
if (arg[v] == ':') {
char *ep;
v = strtol(arg+v+1, &ep, 10);
- if (*ep || v & (v-1) || v < 2)
+ if (*ep || (i == ALGO_SCRYPT && v & (v-1)) || v < 2)
continue;
opt_algo = (enum algos) i;
opt_scrypt_n = v;
}
if (!opt_nfactor && opt_algo == ALGO_SCRYPT)
opt_nfactor = 9;
+ if (opt_algo == ALGO_SCRYPTJANE && opt_scrypt_n == 0)
+ opt_scrypt_n = 5;
break;
case 'b':
p = strstr(arg, ":");
<ClCompile Include="algo\scrypt.c">
<Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
</ClCompile>
+ <ClCompile Include="algo\scrypt-jane.c" />
<ClCompile Include="algo\sibcoin.c" />
<ClCompile Include="algo\skein.c" />
<ClCompile Include="algo\skein2.c" />
<ClCompile Include="algo\axiom.c">
<Filter>algo</Filter>
</ClCompile>
+ <ClCompile Include="algo\scrypt-jane.c">
+ <Filter>algo</Filter>
+ </ClCompile>
<ClCompile Include="algo\sibcoin.c">
<Filter>algo</Filter>
</ClCompile>
int scanhash_cryptonight(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, uint64_t *hashes_done);
+int scanhash_scryptjane(int Nfactor, int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+ uint32_t max_nonce, uint64_t *hashes_done);
+
/* api related */
void *api_thread(void *userdata);
--- /dev/null
+/*
+ pick the best algo at runtime or compile time?
+ ----------------------------------------------
+ SCRYPT_CHOOSE_COMPILETIME (gcc only!)
+ SCRYPT_CHOOSE_RUNTIME
+*/
+#define SCRYPT_CHOOSE_RUNTIME
+
+
+/*
+ hash function to use
+ -------------------------------
+ SCRYPT_BLAKE256
+ SCRYPT_BLAKE512
+ SCRYPT_SHA256
+ SCRYPT_SHA512
+ SCRYPT_SKEIN512
+*/
+//#define SCRYPT_SHA256
+
+
+/*
+ block mixer to use
+ -----------------------------
+ SCRYPT_CHACHA
+ SCRYPT_SALSA
+*/
+//#define SCRYPT_SALSA
#if defined(SCRYPT_CHACHA_AVX)
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
+ #if defined(X86_INTRINSIC_AVX)
+ #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_avx_1
+ #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_avx_1_xor
+ #endif
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
#define SCRYPT_MIX_FN chacha_core_avx
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
#if defined(SCRYPT_CHACHA_SSSE3)
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
+ #if defined(X86_INTRINSIC_SSSE3)
+ #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_ssse3_1
+ #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_ssse3_1_xor
+ #endif
#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
#define SCRYPT_MIX_FN chacha_core_ssse3
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
#if defined(SCRYPT_CHACHA_SSE2)
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
+ #if defined(X86_INTRINSIC_SSE2)
+ #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_sse2_1
+ #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_sse2_1_xor
+ #endif
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
#define SCRYPT_MIX_FN chacha_core_sse2
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
#if defined(SCRYPT_TEST_SPEED)
static size_t
available_implementations() {
+ size_t cpuflags = detect_cpu();
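+	/* only report implementations that are both compiled in and supported by the CPU */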
size_t flags = 0;
#if defined(SCRYPT_CHACHA_AVX)
- flags |= cpu_avx;
+ if (cpuflags & cpu_avx)
+ flags |= cpu_avx;
#endif
#if defined(SCRYPT_CHACHA_SSSE3)
- flags |= cpu_ssse3;
+ if (cpuflags & cpu_ssse3)
+ flags |= cpu_ssse3;
#endif
#if defined(SCRYPT_CHACHA_SSE2)
+ if (cpuflags & cpu_sse2)
flags |= cpu_sse2;
#endif
--- /dev/null
+#define SCRYPT_HASH "BLAKE-256"
+#define SCRYPT_HASH_BLOCK_SIZE 64
+#define SCRYPT_HASH_DIGEST_SIZE 32
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+const uint8_t blake256_sigma[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+ 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3,
+ 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4,
+ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8,
+ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13,
+ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9,
+ 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11,
+ 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10,
+ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5,
+	10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0,
+};
+
+const uint32_t blake256_constants[16] = {
+ 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
+ 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
+};
+
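+/* Incremental BLAKE-256 state: H is the chain value, T the 64-bit message bit
+   counter (as two 32-bit words), buffer/leftover a pending partial block. */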
+typedef struct scrypt_hash_state_t {
+ uint32_t H[8], T[2];
+ uint32_t leftover;
+ uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+static void
+blake256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
+ const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16);
+ uint32_t m[16], v[16], h[8], t[2];
+ uint32_t i;
+
+ for (i = 0; i < 8; i++) h[i] = S->H[i];
+ for (i = 0; i < 2; i++) t[i] = S->T[i];
+
+ while (blocks--) {
+ t[0] += 512;
+ t[1] += (t[0] < 512) ? 1 : 0;
+
+ for (i = 0; i < 8; i++) v[i ] = h[i];
+ for (i = 0; i < 4; i++) v[i + 8] = blake256_constants[i];
+ for (i = 0; i < 2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0];
+ for (i = 0; i < 2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1];
+
+ for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]);
+ in += 64;
+
+ #define G(a,b,c,d,e) \
+ v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \
+ v[d] = ROTR32(v[d] ^ v[a],16); \
+ v[c] += v[d]; \
+ v[b] = ROTR32(v[b] ^ v[c],12); \
+ v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \
+ v[d] = ROTR32(v[d] ^ v[a], 8); \
+ v[c] += v[d]; \
+ v[b] = ROTR32(v[b] ^ v[c], 7);
+
+ for (i = 0, sigma = blake256_sigma; i < 14; i++) {
+ G(0, 4, 8,12, 0);
+ G(1, 5, 9,13, 2);
+ G(2, 6,10,14, 4);
+ G(3, 7,11,15, 6);
+
+ G(0, 5,10,15, 8);
+ G(1, 6,11,12,10);
+ G(2, 7, 8,13,12);
+ G(3, 4, 9,14,14);
+
+ sigma += 16;
+ if (sigma == sigma_end)
+ sigma = blake256_sigma;
+ }
+
+ #undef G
+
+ for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]);
+ }
+
+ for (i = 0; i < 8; i++) S->H[i] = h[i];
+ for (i = 0; i < 2; i++) S->T[i] = t[i];
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+ S->H[0] = 0x6a09e667ULL;
+ S->H[1] = 0xbb67ae85ULL;
+ S->H[2] = 0x3c6ef372ULL;
+ S->H[3] = 0xa54ff53aULL;
+ S->H[4] = 0x510e527fULL;
+ S->H[5] = 0x9b05688cULL;
+ S->H[6] = 0x1f83d9abULL;
+ S->H[7] = 0x5be0cd19ULL;
+ S->T[0] = 0;
+ S->T[1] = 0;
+ S->leftover = 0;
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+ size_t blocks, want;
+
+ /* handle the previous data */
+ if (S->leftover) {
+ want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+ want = (want < inlen) ? want : inlen;
+ memcpy(S->buffer + S->leftover, in, want);
+ S->leftover += (uint32_t)want;
+ if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+ return;
+ in += want;
+ inlen -= want;
+ blake256_blocks(S, S->buffer, 1);
+ }
+
+ /* handle the current data */
+ blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+ S->leftover = (uint32_t)(inlen - blocks);
+ if (blocks) {
+ blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
+ in += blocks;
+ }
+
+ /* handle leftover data */
+ if (S->leftover)
+ memcpy(S->buffer, in, S->leftover);
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+ uint32_t th, tl, bits;
+
+ bits = (S->leftover << 3);
+ tl = S->T[0] + bits;
+ th = S->T[1];
+ if (S->leftover == 0) {
+ S->T[0] = (uint32_t)0 - (uint32_t)512;
+ S->T[1] = (uint32_t)0 - (uint32_t)1;
+ } else if (S->T[0] == 0) {
+ S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits;
+ S->T[1] = S->T[1] - 1;
+ } else {
+ S->T[0] -= (512 - bits);
+ }
+
+ S->buffer[S->leftover] = 0x80;
+ if (S->leftover <= 55) {
+ memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
+ } else {
+ memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
+ blake256_blocks(S, S->buffer, 1);
+ S->T[0] = (uint32_t)0 - (uint32_t)512;
+ S->T[1] = (uint32_t)0 - (uint32_t)1;
+ memset(S->buffer, 0, 56);
+ }
+ S->buffer[55] |= 1;
+ U32TO8_BE(S->buffer + 56, th);
+ U32TO8_BE(S->buffer + 60, tl);
+ blake256_blocks(S, S->buffer, 1);
+
+ U32TO8_BE(&hash[ 0], S->H[0]);
+ U32TO8_BE(&hash[ 4], S->H[1]);
+ U32TO8_BE(&hash[ 8], S->H[2]);
+ U32TO8_BE(&hash[12], S->H[3]);
+ U32TO8_BE(&hash[16], S->H[4]);
+ U32TO8_BE(&hash[20], S->H[5]);
+ U32TO8_BE(&hash[24], S->H[6]);
+ U32TO8_BE(&hash[28], S->H[7]);
+}
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+ 0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20,
+ 0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8,
+};
--- /dev/null
+#define SCRYPT_HASH "BLAKE-512"
+#define SCRYPT_HASH_BLOCK_SIZE 128
+#define SCRYPT_HASH_DIGEST_SIZE 64
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+const uint8_t blake512_sigma[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+ 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3,
+ 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4,
+ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8,
+ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13,
+ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9,
+ 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11,
+ 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10,
+ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5,
+	10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0,
+};
+
+const uint64_t blake512_constants[16] = {
+ 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
+ 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
+ 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
+ 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
+};
+
+typedef struct scrypt_hash_state_t {
+ uint64_t H[8], T[2];
+ uint32_t leftover;
+ uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+static void
+blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
+ const uint8_t *sigma, *sigma_end = blake512_sigma + (10 * 16);
+ uint64_t m[16], v[16], h[8], t[2];
+ uint32_t i;
+
+ for (i = 0; i < 8; i++) h[i] = S->H[i];
+ for (i = 0; i < 2; i++) t[i] = S->T[i];
+
+ while (blocks--) {
+ t[0] += 1024;
+ t[1] += (t[0] < 1024) ? 1 : 0;
+
+ for (i = 0; i < 8; i++) v[i ] = h[i];
+ for (i = 0; i < 4; i++) v[i + 8] = blake512_constants[i];
+ for (i = 0; i < 2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0];
+ for (i = 0; i < 2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1];
+
+ for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]);
+ in += 128;
+
+ #define G(a,b,c,d,e) \
+ v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \
+ v[d] = ROTR64(v[d] ^ v[a],32); \
+ v[c] += v[d]; \
+ v[b] = ROTR64(v[b] ^ v[c],25); \
+ v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \
+ v[d] = ROTR64(v[d] ^ v[a],16); \
+ v[c] += v[d]; \
+ v[b] = ROTR64(v[b] ^ v[c],11);
+
+ for (i = 0, sigma = blake512_sigma; i < 16; i++) {
+ G(0, 4, 8,12, 0);
+ G(1, 5, 9,13, 2);
+ G(2, 6,10,14, 4);
+ G(3, 7,11,15, 6);
+ G(0, 5,10,15, 8);
+ G(1, 6,11,12,10);
+ G(2, 7, 8,13,12);
+ G(3, 4, 9,14,14);
+
+ sigma += 16;
+ if (sigma == sigma_end)
+ sigma = blake512_sigma;
+ }
+
+ #undef G
+
+ for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]);
+ }
+
+ for (i = 0; i < 8; i++) S->H[i] = h[i];
+ for (i = 0; i < 2; i++) S->T[i] = t[i];
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+ S->H[0] = 0x6a09e667f3bcc908ULL;
+ S->H[1] = 0xbb67ae8584caa73bULL;
+ S->H[2] = 0x3c6ef372fe94f82bULL;
+ S->H[3] = 0xa54ff53a5f1d36f1ULL;
+ S->H[4] = 0x510e527fade682d1ULL;
+ S->H[5] = 0x9b05688c2b3e6c1fULL;
+ S->H[6] = 0x1f83d9abfb41bd6bULL;
+ S->H[7] = 0x5be0cd19137e2179ULL;
+ S->T[0] = 0;
+ S->T[1] = 0;
+ S->leftover = 0;
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+ size_t blocks, want;
+
+ /* handle the previous data */
+ if (S->leftover) {
+ want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+ want = (want < inlen) ? want : inlen;
+ memcpy(S->buffer + S->leftover, in, want);
+ S->leftover += (uint32_t)want;
+ if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+ return;
+ in += want;
+ inlen -= want;
+ blake512_blocks(S, S->buffer, 1);
+ }
+
+ /* handle the current data */
+ blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+ S->leftover = (uint32_t)(inlen - blocks);
+ if (blocks) {
+ blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
+ in += blocks;
+ }
+
+ /* handle leftover data */
+ if (S->leftover)
+ memcpy(S->buffer, in, S->leftover);
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+ uint64_t th, tl;
+ size_t bits;
+
+ bits = (S->leftover << 3);
+ tl = S->T[0] + bits;
+ th = S->T[1];
+ if (S->leftover == 0) {
+ S->T[0] = (uint64_t)0 - (uint64_t)1024;
+ S->T[1] = (uint64_t)0 - (uint64_t)1;
+ } else if (S->T[0] == 0) {
+ S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits;
+ S->T[1] = S->T[1] - 1;
+ } else {
+ S->T[0] -= (1024 - bits);
+ }
+
+ S->buffer[S->leftover] = 0x80;
+ if (S->leftover <= 111) {
+ memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover);
+ } else {
+ memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover);
+ blake512_blocks(S, S->buffer, 1);
+ S->T[0] = (uint64_t)0 - (uint64_t)1024;
+ S->T[1] = (uint64_t)0 - (uint64_t)1;
+ memset(S->buffer, 0, 112);
+ }
+ S->buffer[111] |= 1;
+ U64TO8_BE(S->buffer + 112, th);
+ U64TO8_BE(S->buffer + 120, tl);
+ blake512_blocks(S, S->buffer, 1);
+
+ U64TO8_BE(&hash[ 0], S->H[0]);
+ U64TO8_BE(&hash[ 8], S->H[1]);
+ U64TO8_BE(&hash[16], S->H[2]);
+ U64TO8_BE(&hash[24], S->H[3]);
+ U64TO8_BE(&hash[32], S->H[4]);
+ U64TO8_BE(&hash[40], S->H[5]);
+ U64TO8_BE(&hash[48], S->H[6]);
+ U64TO8_BE(&hash[56], S->H[7]);
+}
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+ 0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f,
+ 0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83,
+ 0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a,
+ 0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac
+};
--- /dev/null
+#define SCRYPT_HASH "SHA-2-512"
+#define SCRYPT_HASH_BLOCK_SIZE 128
+#define SCRYPT_HASH_DIGEST_SIZE 64
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+ uint64_t H[8];
+ uint64_t T[2];
+ uint32_t leftover;
+ uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+static const uint64_t sha512_constants[80] = {
+ 0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull,
+ 0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull,
+ 0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull,
+ 0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull,
+ 0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull,
+ 0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull,
+ 0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull,
+ 0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull,
+ 0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 0x53380d139d95b3dfull,
+ 0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull,
+ 0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull,
+ 0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull,
+ 0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull,
+ 0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull,
+ 0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull,
+ 0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull,
+ 0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull,
+ 0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull,
+ 0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull,
+ 0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull
+};
+
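+/* Standard SHA-512 round functions: choice/majority (Ch/Maj), the big sigmas
+   (S0/S1), the small sigmas (G0/G1) and the message schedule (W0/W1). */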
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) (((x | y) & z) | (x & y))
+#define S0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
+#define S1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
+#define G0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7))
+#define G1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6))
+#define W0(in,i) (U8TO64_BE(&in[i * 8]))
+#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])
+#define STEP(i) \
+ t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
+ t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[i] + w[i]; \
+ r[7] = r[6]; \
+ r[6] = r[5]; \
+ r[5] = r[4]; \
+ r[4] = r[3] + t0; \
+ r[3] = r[2]; \
+ r[2] = r[1]; \
+ r[1] = r[0]; \
+ r[0] = t0 + t1;
+
+static void
+sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
+ uint64_t r[8], w[80], t0, t1;
+ size_t i;
+
+ for (i = 0; i < 8; i++) r[i] = S->H[i];
+
+ while (blocks--) {
+ for (i = 0; i < 16; i++) { w[i] = W0(in, i); }
+ for (i = 16; i < 80; i++) { w[i] = W1(i); }
+ for (i = 0; i < 80; i++) { STEP(i); }
+ for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; }
+ S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8;
+ S->T[1] += (!S->T[0]) ? 1 : 0;
+ in += SCRYPT_HASH_BLOCK_SIZE;
+ }
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+ S->H[0] = 0x6a09e667f3bcc908ull;
+ S->H[1] = 0xbb67ae8584caa73bull;
+ S->H[2] = 0x3c6ef372fe94f82bull;
+ S->H[3] = 0xa54ff53a5f1d36f1ull;
+ S->H[4] = 0x510e527fade682d1ull;
+ S->H[5] = 0x9b05688c2b3e6c1full;
+ S->H[6] = 0x1f83d9abfb41bd6bull;
+ S->H[7] = 0x5be0cd19137e2179ull;
+ S->T[0] = 0;
+ S->T[1] = 0;
+ S->leftover = 0;
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+ size_t blocks, want;
+
+ /* handle the previous data */
+ if (S->leftover) {
+ want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+ want = (want < inlen) ? want : inlen;
+ memcpy(S->buffer + S->leftover, in, want);
+ S->leftover += (uint32_t)want;
+ if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+ return;
+ in += want;
+ inlen -= want;
+ sha512_blocks(S, S->buffer, 1);
+ }
+
+ /* handle the current data */
+ blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+ S->leftover = (uint32_t)(inlen - blocks);
+ if (blocks) {
+ sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
+ in += blocks;
+ }
+
+ /* handle leftover data */
+ if (S->leftover)
+ memcpy(S->buffer, in, S->leftover);
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+ uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1];
+
+ S->buffer[S->leftover] = 0x80;
+ if (S->leftover <= 111) {
+ memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover);
+ } else {
+ memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover);
+ sha512_blocks(S, S->buffer, 1);
+ memset(S->buffer, 0, 112);
+ }
+
+ U64TO8_BE(S->buffer + 112, t1);
+ U64TO8_BE(S->buffer + 120, t0);
+ sha512_blocks(S, S->buffer, 1);
+
+ U64TO8_BE(&hash[ 0], S->H[0]);
+ U64TO8_BE(&hash[ 8], S->H[1]);
+ U64TO8_BE(&hash[16], S->H[2]);
+ U64TO8_BE(&hash[24], S->H[3]);
+ U64TO8_BE(&hash[32], S->H[4]);
+ U64TO8_BE(&hash[40], S->H[5]);
+ U64TO8_BE(&hash[48], S->H[6]);
+ U64TO8_BE(&hash[56], S->H[7]);
+}
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+ 0xba,0xc3,0x80,0x2b,0x24,0x56,0x95,0x1f,0x19,0x7c,0xa2,0xd3,0x72,0x7c,0x9a,0x4d,
+ 0x1d,0x50,0x3a,0xa9,0x12,0x27,0xd8,0xe1,0xbe,0x76,0x53,0x87,0x5a,0x1e,0x82,0xec,
+ 0xc8,0xe1,0x6b,0x87,0xd0,0xb5,0x25,0x7e,0xe8,0x1e,0xd7,0x58,0xc6,0x2d,0xc2,0x9c,
+ 0x06,0x31,0x8f,0x5b,0x57,0x8e,0x76,0xba,0xd5,0xf6,0xec,0xfe,0x85,0x1f,0x34,0x0c,
+};
--- /dev/null
+#define SCRYPT_HASH "Skein-512"
+#define SCRYPT_HASH_BLOCK_SIZE 64
+#define SCRYPT_HASH_DIGEST_SIZE 64
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+ uint64_t X[8], T[2];
+ uint32_t leftover;
+ uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+#include <stdio.h>
+
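+/* Threefish-512 based UBI block function. 'add' is the number of message
+   bytes credited to the tweak counter T[0]: 64 for full blocks, the leftover
+   count for the final block. */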
+static void
+skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
+ uint64_t X[8], key[8], Xt[9+18], T[3+1];
+ size_t r;
+
+ while (blocks--) {
+ T[0] = S->T[0] + add;
+ T[1] = S->T[1];
+ T[2] = T[0] ^ T[1];
+ key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
+ key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
+ key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
+ key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
+ key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
+ key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
+ key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
+ key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
+ Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
+ in += SCRYPT_HASH_BLOCK_SIZE;
+
+ for (r = 0; r < 18; r++)
+ Xt[r + 9] = Xt[r + 0];
+
+ for (r = 0; r < 18; r += 2) {
+ X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
+ X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
+ X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
+ X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
+ X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
+ X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
+ X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
+ X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
+ X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
+ X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
+ X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
+ X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
+ X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
+ X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
+ X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
+ X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0];
+
+ X[0] += Xt[r + 1];
+ X[1] += Xt[r + 2];
+ X[2] += Xt[r + 3];
+ X[3] += Xt[r + 4];
+ X[4] += Xt[r + 5];
+ X[5] += Xt[r + 6] + T[1];
+ X[6] += Xt[r + 7] + T[2];
+ X[7] += Xt[r + 8] + r + 1;
+
+ T[3] = T[0];
+ T[0] = T[1];
+ T[1] = T[2];
+ T[2] = T[3];
+
+ X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
+ X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
+ X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
+ X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
+ X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
+ X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
+ X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
+ X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
+ X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
+ X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
+ X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
+ X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
+ X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6];
+ X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
+ X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
+ X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];
+
+ X[0] += Xt[r + 2];
+ X[1] += Xt[r + 3];
+ X[2] += Xt[r + 4];
+ X[3] += Xt[r + 5];
+ X[4] += Xt[r + 6];
+ X[5] += Xt[r + 7] + T[1];
+ X[6] += Xt[r + 8] + T[2];
+ X[7] += Xt[r + 9] + r + 2;
+
+ T[3] = T[0];
+ T[0] = T[1];
+ T[1] = T[2];
+ T[2] = T[3];
+ }
+
+ S->X[0] = key[0] ^ X[0];
+ S->X[1] = key[1] ^ X[1];
+ S->X[2] = key[2] ^ X[2];
+ S->X[3] = key[3] ^ X[3];
+ S->X[4] = key[4] ^ X[4];
+ S->X[5] = key[5] ^ X[5];
+ S->X[6] = key[6] ^ X[6];
+ S->X[7] = key[7] ^ X[7];
+
+ S->T[0] = T[0];
+ S->T[1] = T[1] & ~0x4000000000000000ull;
+ }
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+ S->X[0] = 0x4903ADFF749C51CEull;
+ S->X[1] = 0x0D95DE399746DF03ull;
+ S->X[2] = 0x8FD1934127C79BCEull;
+ S->X[3] = 0x9A255629FF352CB1ull;
+ S->X[4] = 0x5DB62599DF6CA7B0ull;
+ S->X[5] = 0xEABE394CA9D5C3F4ull;
+ S->X[6] = 0x991112C71A75B523ull;
+ S->X[7] = 0xAE18A40B660FCC33ull;
+ S->T[0] = 0x0000000000000000ull;
+ S->T[1] = 0x7000000000000000ull;
+ S->leftover = 0;
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+ size_t blocks, want;
+
+ /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
+ if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
+ /* handle the previous data, we know there is enough for at least one block */
+ if (S->leftover) {
+ want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+ memcpy(S->buffer + S->leftover, in, want);
+ in += want;
+ inlen -= want;
+ S->leftover = 0;
+ skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
+ }
+
+ /* handle the current data if there's more than one block */
+ if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
+ blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+ skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
+ inlen -= blocks;
+ in += blocks;
+ }
+ }
+
+ /* handle leftover data */
+ memcpy(S->buffer + S->leftover, in, inlen);
+ S->leftover += inlen;
+}
+
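+/* Pad and process the final message block with the 'final' tweak bit set,
+   then run Skein's output transform (type OUT, 8-byte length) to produce the digest. */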
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+ memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+ S->T[1] |= 0x8000000000000000ull;
+ skein512_blocks(S, S->buffer, 1, S->leftover);
+
+ memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
+ S->T[0] = 0;
+ S->T[1] = 0xff00000000000000ull;
+ skein512_blocks(S, S->buffer, 1, 8);
+
+ U64TO8_LE(&hash[ 0], S->X[0]);
+ U64TO8_LE(&hash[ 8], S->X[1]);
+ U64TO8_LE(&hash[16], S->X[2]);
+ U64TO8_LE(&hash[24], S->X[3]);
+ U64TO8_LE(&hash[32], S->X[4]);
+ U64TO8_LE(&hash[40], S->X[5]);
+ U64TO8_LE(&hash[48], S->X[6]);
+ U64TO8_LE(&hash[56], S->X[7]);
+}
+
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+ 0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
+ 0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
+ 0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
+ 0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
+};
a2(shl edx,6)
a2(lea ecx,[edx-64])
a2(and eax, eax)
- a2(vmovdqa xmm4,[ssse3_rotl16_32bit])
- a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
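+	/* materialize the rotl16/rotl8 byte-shuffle masks in registers instead of
+	   loading them from the ssse3_rotl16_32bit/ssse3_rotl8_32bit memory constants */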
+ a2(mov ebx, 0x01000302)
+ a2(vmovd xmm4, ebx)
+ a2(mov ebx, 0x05040706)
+ a2(vmovd xmm0, ebx)
+ a2(mov ebx, 0x09080b0a)
+ a2(vmovd xmm1, ebx)
+ a2(mov ebx, 0x0d0c0f0e)
+ a2(vmovd xmm2, ebx)
+ a2(mov ebx, 0x02010003)
+ a2(vmovd xmm5, ebx)
+ a2(mov ebx, 0x06050407)
+ a2(vmovd xmm3, ebx)
+ a2(mov ebx, 0x0a09080b)
+ a2(vmovd xmm6, ebx)
+ a2(mov ebx, 0x0e0d0c0f)
+ a2(vmovd xmm7, ebx)
+ a3(vpunpckldq xmm4, xmm4, xmm0)
+ a3(vpunpckldq xmm5, xmm5, xmm3)
+ a3(vpunpckldq xmm1, xmm1, xmm2)
+ a3(vpunpckldq xmm6, xmm6, xmm7)
+ a3(vpunpcklqdq xmm4, xmm4, xmm1)
+ a3(vpunpcklqdq xmm5, xmm5, xmm6)
a2(vmovdqa xmm0,[ecx+esi+0])
a2(vmovdqa xmm1,[ecx+esi+16])
a2(vmovdqa xmm2,[ecx+esi+32])
a1(pop esi)
a1(pop edi)
a1(pop ebx)
- a1(ret 16)
+ aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)
#endif
a2(lea rax,[rsi+r9])
a2(lea r9,[rdx+r9])
a2(and rdx, rdx)
- a2(vmovdqa xmm4,[ssse3_rotl16_32bit])
- a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
a2(vmovdqa xmm0,[rax+0])
a2(vmovdqa xmm1,[rax+16])
a2(vmovdqa xmm2,[rax+32])
a2(vmovdqa xmm3,[rax+48])
+ a2(mov r8, 0x0504070601000302)
+ a2(mov rax, 0x0d0c0f0e09080b0a)
+ a2(movq xmm4, r8)
+ a2(movq xmm6, rax)
+ a2(mov r8, 0x0605040702010003)
+ a2(mov rax, 0x0e0d0c0f0a09080b)
+ a2(movq xmm5, r8)
+ a2(movq xmm7, rax)
+ a3(vpunpcklqdq xmm4, xmm4, xmm6)
+ a3(vpunpcklqdq xmm5, xmm5, xmm7)
a1(jz scrypt_ChunkMix_avx_no_xor1)
a3(vpxor xmm0,xmm0,[r9+0])
a3(vpxor xmm1,xmm1,[r9+16])
x3 = _mm_shuffle_epi8(x3, x4);
x2 = _mm_add_epi32(x2, x3);
x1 = _mm_xor_si128(x1, x2);
- x6 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+ x6 = _mm_srli_epi32(x1, 20);
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, x6);
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x3 = _mm_shuffle_epi8(x3, x5);
x3 = _mm_shuffle_epi32(x3, 0x4e);
x1 = _mm_xor_si128(x1, x2);
x2 = _mm_shuffle_epi32(x2, 0x39);
- x6 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+ x6 = _mm_srli_epi32(x1, 25);
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, x6);
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x3 = _mm_shuffle_epi8(x3, x4);
x2 = _mm_add_epi32(x2, x3);
x1 = _mm_xor_si128(x1, x2);
- x6 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+ x6 = _mm_srli_epi32(x1, 20);
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, x6);
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x3 = _mm_shuffle_epi8(x3, x5);
x3 = _mm_shuffle_epi32(x3, 0x4e);
x1 = _mm_xor_si128(x1, x2);
x2 = _mm_shuffle_epi32(x2, 0x93);
- x6 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+ x6 = _mm_srli_epi32(x1, 25);
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, x6);
+ }
+
+ x0 = _mm_add_epi32(x0, t0);
+ x1 = _mm_add_epi32(x1, t1);
+ x2 = _mm_add_epi32(x2, t2);
+ x3 = _mm_add_epi32(x3, t3);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ }
+}
+
+/*
+ * Special version with r = 1 and no XORing
+ * - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
+ const uint32_t r = 1;
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+ const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+
+ for (rounds = 8; rounds; rounds -= 2) {
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x4);
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x6 = _mm_srli_epi32(x1, 20);
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, x6);
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x5);
+ x0 = _mm_shuffle_epi32(x0, 0x93);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x39);
+ x6 = _mm_srli_epi32(x1, 25);
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, x6);
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x4);
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x6 = _mm_srli_epi32(x1, 20);
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, x6);
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x5);
+ x0 = _mm_shuffle_epi32(x0, 0x39);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x93);
+ x6 = _mm_srli_epi32(x1, 25);
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, x6);
+ }
+
+ x0 = _mm_add_epi32(x0, t0);
+ x1 = _mm_add_epi32(x1, t1);
+ x2 = _mm_add_epi32(x2, t2);
+ x3 = _mm_add_epi32(x3, t3);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ }
+}
+
+/*
+ * Special version with r = 1 and unconditional XORing
+ * - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
+ const uint32_t r = 1;
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+ const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+
+ xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ xmmp = (xmmi *)scrypt_block(Bxor, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+
+ for (rounds = 8; rounds; rounds -= 2) {
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x4);
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x6 = _mm_srli_epi32(x1, 20);
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, x6);
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x5);
+ x0 = _mm_shuffle_epi32(x0, 0x93);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x39);
+ x6 = _mm_srli_epi32(x1, 25);
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, x6);
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x4);
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x6 = _mm_srli_epi32(x1, 20);
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, x6);
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x5);
+ x0 = _mm_shuffle_epi32(x0, 0x39);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x93);
+ x6 = _mm_srli_epi32(x1, 25);
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, x6);
}
x0 = _mm_add_epi32(x0, t0);
a1(pop esi)
a1(pop edi)
a1(pop ebx)
- a1(ret 16)
+ aret(16)
asm_naked_fn_end(scrypt_ChunkMix_sse2)
#endif
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x4 = x3;
- x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
+ x3 = _mm_slli_epi32(x3, 16);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
x2 = _mm_add_epi32(x2, x3);
x1 = _mm_xor_si128(x1, x2);
x4 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20));
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x4 = x3;
- x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24));
+ x3 = _mm_slli_epi32(x3, 8);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
x0 = _mm_shuffle_epi32(x0, 0x93);
x2 = _mm_add_epi32(x2, x3);
x3 = _mm_shuffle_epi32(x3, 0x4e);
x1 = _mm_xor_si128(x1, x2);
x2 = _mm_shuffle_epi32(x2, 0x39);
x4 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25));
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x4 = x3;
- x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
+ x3 = _mm_slli_epi32(x3, 16);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
x2 = _mm_add_epi32(x2, x3);
x1 = _mm_xor_si128(x1, x2);
x4 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20));
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x4 = x3;
- x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24));
+ x3 = _mm_slli_epi32(x3, 8);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
x0 = _mm_shuffle_epi32(x0, 0x39);
x2 = _mm_add_epi32(x2, x3);
x3 = _mm_shuffle_epi32(x3, 0x4e);
x1 = _mm_xor_si128(x1, x2);
x2 = _mm_shuffle_epi32(x2, 0x93);
x4 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25));
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
+ }
+
+ x0 = _mm_add_epi32(x0, t0);
+ x1 = _mm_add_epi32(x1, t1);
+ x2 = _mm_add_epi32(x2, t2);
+ x3 = _mm_add_epi32(x3, t3);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ }
+}
+
+/*
+ * Special version with r = 1 and no XORing
+ * - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_sse2_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
+ const uint32_t r = 1;
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+
+ for (rounds = 8; rounds; rounds -= 2) {
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x4 = x3;
+ x3 = _mm_slli_epi32(x3, 16);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x4 = x1;
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x4 = x3;
+ x3 = _mm_slli_epi32(x3, 8);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
+ x0 = _mm_shuffle_epi32(x0, 0x93);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x39);
+ x4 = x1;
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x4 = x3;
+ x3 = _mm_slli_epi32(x3, 16);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x4 = x1;
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x4 = x3;
+ x3 = _mm_slli_epi32(x3, 8);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
+ x0 = _mm_shuffle_epi32(x0, 0x39);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x93);
+ x4 = x1;
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
+ }
+
+ x0 = _mm_add_epi32(x0, t0);
+ x1 = _mm_add_epi32(x1, t1);
+ x2 = _mm_add_epi32(x2, t2);
+ x3 = _mm_add_epi32(x3, t3);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ }
+}
+
+/*
+ * Special version with r = 1 and unconditional XORing
+ * - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_sse2_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
+ const uint32_t r = 1;
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+
+ xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ xmmp = (xmmi *)scrypt_block(Bxor, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+
+ for (rounds = 8; rounds; rounds -= 2) {
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x4 = x3;
+ x3 = _mm_slli_epi32(x3, 16);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x4 = x1;
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x4 = x3;
+ x3 = _mm_slli_epi32(x3, 8);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
+ x0 = _mm_shuffle_epi32(x0, 0x93);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x39);
+ x4 = x1;
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x4 = x3;
+ x3 = _mm_slli_epi32(x3, 16);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x4 = x1;
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x4 = x3;
+ x3 = _mm_slli_epi32(x3, 8);
+ x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
+ x0 = _mm_shuffle_epi32(x0, 0x39);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x93);
+ x4 = x1;
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
}
x0 = _mm_add_epi32(x0, t0);
a2(shl edx,6)
a2(lea ecx,[edx-64])
a2(and eax, eax)
- a2(movdqa xmm4,[ssse3_rotl16_32bit])
- a2(movdqa xmm5,[ssse3_rotl8_32bit])
+ a2(mov ebx, 0x01000302)
+ a2(movd xmm4, ebx)
+ a2(mov ebx, 0x05040706)
+ a2(movd xmm0, ebx)
+ a2(mov ebx, 0x09080b0a)
+ a2(movd xmm1, ebx)
+ a2(mov ebx, 0x0d0c0f0e)
+ a2(movd xmm2, ebx)
+ a2(mov ebx, 0x02010003)
+ a2(movd xmm5, ebx)
+ a2(mov ebx, 0x06050407)
+ a2(movd xmm3, ebx)
+ a2(mov ebx, 0x0a09080b)
+ a2(movd xmm6, ebx)
+ a2(mov ebx, 0x0e0d0c0f)
+ a2(movd xmm7, ebx)
+ a2(punpckldq xmm4, xmm0)
+ a2(punpckldq xmm5, xmm3)
+ a2(punpckldq xmm1, xmm2)
+ a2(punpckldq xmm6, xmm7)
+ a2(punpcklqdq xmm4, xmm1)
+ a2(punpcklqdq xmm5, xmm6)
a2(movdqa xmm0,[ecx+esi+0])
a2(movdqa xmm1,[ecx+esi+16])
a2(movdqa xmm2,[ecx+esi+32])
a1(pop esi)
a1(pop edi)
a1(pop ebx)
- a1(ret 16)
+ aret(16)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
#endif
a2(lea rax,[rsi+r9])
a2(lea r9,[rdx+r9])
a2(and rdx, rdx)
- a2(movdqa xmm4,[ssse3_rotl16_32bit])
- a2(movdqa xmm5,[ssse3_rotl8_32bit])
a2(movdqa xmm0,[rax+0])
a2(movdqa xmm1,[rax+16])
a2(movdqa xmm2,[rax+32])
a2(movdqa xmm3,[rax+48])
+ a2(mov r8, 0x0504070601000302)
+ a2(mov rax, 0x0d0c0f0e09080b0a)
+ a2(movq xmm4, r8)
+ a2(movq xmm6, rax)
+ a2(mov r8, 0x0605040702010003)
+ a2(mov rax, 0x0e0d0c0f0a09080b)
+ a2(movq xmm5, r8)
+ a2(movq xmm7, rax)
+ a2(punpcklqdq xmm4, xmm6)
+ a2(punpcklqdq xmm5, xmm7)
a1(jz scrypt_ChunkMix_ssse3_no_xor1)
a2(pxor xmm0,[r9+0])
a2(pxor xmm1,[r9+16])
x2 = _mm_add_epi32(x2, x3);
x1 = _mm_xor_si128(x1, x2);
x6 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x3 = _mm_shuffle_epi8(x3, x5);
x1 = _mm_xor_si128(x1, x2);
x2 = _mm_shuffle_epi32(x2, 0x39);
x6 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x3 = _mm_shuffle_epi8(x3, x4);
x2 = _mm_add_epi32(x2, x3);
x1 = _mm_xor_si128(x1, x2);
x6 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
x0 = _mm_add_epi32(x0, x1);
x3 = _mm_xor_si128(x3, x0);
x3 = _mm_shuffle_epi8(x3, x5);
x1 = _mm_xor_si128(x1, x2);
x2 = _mm_shuffle_epi32(x2, 0x93);
x6 = x1;
- x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
+ }
+
+ x0 = _mm_add_epi32(x0, t0);
+ x1 = _mm_add_epi32(x1, t1);
+ x2 = _mm_add_epi32(x2, t2);
+ x3 = _mm_add_epi32(x3, t3);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ }
+}
+
+/*
+ * Special version with r = 1 and no XORing
+ * - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_ssse3_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
+ const uint32_t r = 1;
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+ const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+
+ for (rounds = 8; rounds; rounds -= 2) {
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x4);
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x6 = x1;
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x5);
+ x0 = _mm_shuffle_epi32(x0, 0x93);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x39);
+ x6 = x1;
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x4);
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x6 = x1;
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x5);
+ x0 = _mm_shuffle_epi32(x0, 0x39);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x93);
+ x6 = x1;
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
+ }
+
+ x0 = _mm_add_epi32(x0, t0);
+ x1 = _mm_add_epi32(x1, t1);
+ x2 = _mm_add_epi32(x2, t2);
+ x3 = _mm_add_epi32(x3, t3);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ }
+}
+
+/*
+ * Special version with r = 1 and unconditional XORing
+ * - mikaelh
+ */
+static void NOINLINE
+scrypt_ChunkMix_ssse3_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
+ const uint32_t r = 1;
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+ const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+
+ xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ xmmp = (xmmi *)scrypt_block(Bxor, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+
+ for (rounds = 8; rounds; rounds -= 2) {
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x4);
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x6 = x1;
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x5);
+ x0 = _mm_shuffle_epi32(x0, 0x93);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x39);
+ x6 = x1;
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x4);
+ x2 = _mm_add_epi32(x2, x3);
+ x1 = _mm_xor_si128(x1, x2);
+ x6 = x1;
+ x1 = _mm_slli_epi32(x1, 12);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
+ x0 = _mm_add_epi32(x0, x1);
+ x3 = _mm_xor_si128(x3, x0);
+ x3 = _mm_shuffle_epi8(x3, x5);
+ x0 = _mm_shuffle_epi32(x0, 0x39);
+ x2 = _mm_add_epi32(x2, x3);
+ x3 = _mm_shuffle_epi32(x3, 0x4e);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_shuffle_epi32(x2, 0x93);
+ x6 = x1;
+ x1 = _mm_slli_epi32(x1, 7);
+ x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
}
x0 = _mm_add_epi32(x0, t0);
a1(pop esi)
a1(pop edi)
a1(pop ebx)
- a1(ret 16)
+ aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)
#endif
a1(pop esi)
a1(pop edi)
a1(pop ebx)
- a1(ret 16)
+ aret(16)
asm_naked_fn_end(scrypt_ChunkMix_sse2)
#endif
4 9 14 3
*/
- static void STDCALL
+ static void asm_calling_convention
salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
uint32_t t;
while (count--) {
--- /dev/null
+/* x64 */
+#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_AVX
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_avx)
+ a1(push rbp)
+ a2(mov rbp, rsp)
+ a2(and rsp, ~63)
+ a2(sub rsp, 128)
+ a2(lea rcx,[rcx*2])
+ a2(shl rcx,7)
+ a2(lea r9,[rcx-128])
+ a2(lea rax,[rsi+r9])
+ a2(lea r9,[rdx+r9])
+ a2(and rdx, rdx)
+ a2(vmovdqa xmm0,[rax+0])
+ a2(vmovdqa xmm1,[rax+16])
+ a2(vmovdqa xmm2,[rax+32])
+ a2(vmovdqa xmm3,[rax+48])
+ a2(vmovdqa xmm4,[rax+64])
+ a2(vmovdqa xmm5,[rax+80])
+ a2(vmovdqa xmm6,[rax+96])
+ a2(vmovdqa xmm7,[rax+112])
+ a1(jz scrypt_ChunkMix_avx_no_xor1)
+ a3(vpxor xmm0,xmm0,[r9+0])
+ a3(vpxor xmm1,xmm1,[r9+16])
+ a3(vpxor xmm2,xmm2,[r9+32])
+ a3(vpxor xmm3,xmm3,[r9+48])
+ a3(vpxor xmm4,xmm4,[r9+64])
+ a3(vpxor xmm5,xmm5,[r9+80])
+ a3(vpxor xmm6,xmm6,[r9+96])
+ a3(vpxor xmm7,xmm7,[r9+112])
+ a1(scrypt_ChunkMix_avx_no_xor1:)
+ a2(xor r9,r9)
+ a2(xor r8,r8)
+ a1(scrypt_ChunkMix_avx_loop:)
+ a2(and rdx, rdx)
+ a3(vpxor xmm0,xmm0,[rsi+r9+0])
+ a3(vpxor xmm1,xmm1,[rsi+r9+16])
+ a3(vpxor xmm2,xmm2,[rsi+r9+32])
+ a3(vpxor xmm3,xmm3,[rsi+r9+48])
+ a3(vpxor xmm4,xmm4,[rsi+r9+64])
+ a3(vpxor xmm5,xmm5,[rsi+r9+80])
+ a3(vpxor xmm6,xmm6,[rsi+r9+96])
+ a3(vpxor xmm7,xmm7,[rsi+r9+112])
+ a1(jz scrypt_ChunkMix_avx_no_xor2)
+ a3(vpxor xmm0,xmm0,[rdx+r9+0])
+ a3(vpxor xmm1,xmm1,[rdx+r9+16])
+ a3(vpxor xmm2,xmm2,[rdx+r9+32])
+ a3(vpxor xmm3,xmm3,[rdx+r9+48])
+ a3(vpxor xmm4,xmm4,[rdx+r9+64])
+ a3(vpxor xmm5,xmm5,[rdx+r9+80])
+ a3(vpxor xmm6,xmm6,[rdx+r9+96])
+ a3(vpxor xmm7,xmm7,[rdx+r9+112])
+ a1(scrypt_ChunkMix_avx_no_xor2:)
+ a2(vmovdqa [rsp+0],xmm0)
+ a2(vmovdqa [rsp+16],xmm1)
+ a2(vmovdqa [rsp+32],xmm2)
+ a2(vmovdqa [rsp+48],xmm3)
+ a2(vmovdqa [rsp+64],xmm4)
+ a2(vmovdqa [rsp+80],xmm5)
+ a2(vmovdqa [rsp+96],xmm6)
+ a2(vmovdqa [rsp+112],xmm7)
+ a2(mov rax,8)
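+	/* 8 rounds of the 64-bit Salsa core, two per pass: pshufd 0xb1 swaps the
+	 * 32-bit halves of each quadword (a rotation by 32), the 13/51 and 39/25
+	 * shift pairs give rotations by 13 and 39, and vpalignr re-diagonalizes
+	 * the state between half-rounds */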
+ a1(scrypt_salsa64_avx_loop: )
+ a3(vpaddq xmm8, xmm0, xmm2)
+ a3(vpaddq xmm9, xmm1, xmm3)
+ a3(vpshufd xmm8, xmm8, 0xb1)
+ a3(vpshufd xmm9, xmm9, 0xb1)
+ a3(vpxor xmm6, xmm6, xmm8)
+ a3(vpxor xmm7, xmm7, xmm9)
+ a3(vpaddq xmm10, xmm0, xmm6)
+ a3(vpaddq xmm11, xmm1, xmm7)
+ a3(vpsrlq xmm8, xmm10, 51)
+ a3(vpsrlq xmm9, xmm11, 51)
+ a3(vpsllq xmm10, xmm10, 13)
+ a3(vpsllq xmm11, xmm11, 13)
+ a3(vpxor xmm4, xmm4, xmm8)
+ a3(vpxor xmm5, xmm5, xmm9)
+ a3(vpxor xmm4, xmm4, xmm10)
+ a3(vpxor xmm5, xmm5, xmm11)
+ a3(vpaddq xmm8, xmm6, xmm4)
+ a3(vpaddq xmm9, xmm7, xmm5)
+ a3(vpsrlq xmm10, xmm8, 25)
+ a3(vpsrlq xmm11, xmm9, 25)
+ a3(vpsllq xmm8, xmm8, 39)
+ a3(vpsllq xmm9, xmm9, 39)
+ a3(vpxor xmm2, xmm2, xmm10)
+ a3(vpxor xmm3, xmm3, xmm11)
+ a3(vpxor xmm2, xmm2, xmm8)
+ a3(vpxor xmm3, xmm3, xmm9)
+ a3(vpaddq xmm10, xmm4, xmm2)
+ a3(vpaddq xmm11, xmm5, xmm3)
+ a3(vpshufd xmm10, xmm10, 0xb1)
+ a3(vpshufd xmm11, xmm11, 0xb1)
+ a3(vpxor xmm0, xmm0, xmm10)
+ a3(vpxor xmm1, xmm1, xmm11)
+ a2(vmovdqa xmm8, xmm2)
+ a2(vmovdqa xmm9, xmm3)
+ a4(vpalignr xmm2, xmm6, xmm7, 8)
+ a4(vpalignr xmm3, xmm7, xmm6, 8)
+ a4(vpalignr xmm6, xmm9, xmm8, 8)
+ a4(vpalignr xmm7, xmm8, xmm9, 8)
+ a2(sub rax, 2)
+ a3(vpaddq xmm10, xmm0, xmm2)
+ a3(vpaddq xmm11, xmm1, xmm3)
+ a3(vpshufd xmm10, xmm10, 0xb1)
+ a3(vpshufd xmm11, xmm11, 0xb1)
+ a3(vpxor xmm6, xmm6, xmm10)
+ a3(vpxor xmm7, xmm7, xmm11)
+ a3(vpaddq xmm8, xmm0, xmm6)
+ a3(vpaddq xmm9, xmm1, xmm7)
+ a3(vpsrlq xmm10, xmm8, 51)
+ a3(vpsrlq xmm11, xmm9, 51)
+ a3(vpsllq xmm8, xmm8, 13)
+ a3(vpsllq xmm9, xmm9, 13)
+ a3(vpxor xmm5, xmm5, xmm10)
+ a3(vpxor xmm4, xmm4, xmm11)
+ a3(vpxor xmm5, xmm5, xmm8)
+ a3(vpxor xmm4, xmm4, xmm9)
+ a3(vpaddq xmm10, xmm6, xmm5)
+ a3(vpaddq xmm11, xmm7, xmm4)
+ a3(vpsrlq xmm8, xmm10, 25)
+ a3(vpsrlq xmm9, xmm11, 25)
+ a3(vpsllq xmm10, xmm10, 39)
+ a3(vpsllq xmm11, xmm11, 39)
+ a3(vpxor xmm2, xmm2, xmm8)
+ a3(vpxor xmm3, xmm3, xmm9)
+ a3(vpxor xmm2, xmm2, xmm10)
+ a3(vpxor xmm3, xmm3, xmm11)
+ a3(vpaddq xmm8, xmm5, xmm2)
+ a3(vpaddq xmm9, xmm4, xmm3)
+ a3(vpshufd xmm8, xmm8, 0xb1)
+ a3(vpshufd xmm9, xmm9, 0xb1)
+ a3(vpxor xmm0, xmm0, xmm8)
+ a3(vpxor xmm1, xmm1, xmm9)
+ a2(vmovdqa xmm10, xmm2)
+ a2(vmovdqa xmm11, xmm3)
+ a4(vpalignr xmm2, xmm6, xmm7, 8)
+ a4(vpalignr xmm3, xmm7, xmm6, 8)
+ a4(vpalignr xmm6, xmm11, xmm10, 8)
+ a4(vpalignr xmm7, xmm10, xmm11, 8)
+ a1(ja scrypt_salsa64_avx_loop)
+ a3(vpaddq xmm0,xmm0,[rsp+0])
+ a3(vpaddq xmm1,xmm1,[rsp+16])
+ a3(vpaddq xmm2,xmm2,[rsp+32])
+ a3(vpaddq xmm3,xmm3,[rsp+48])
+ a3(vpaddq xmm4,xmm4,[rsp+64])
+ a3(vpaddq xmm5,xmm5,[rsp+80])
+ a3(vpaddq xmm6,xmm6,[rsp+96])
+ a3(vpaddq xmm7,xmm7,[rsp+112])
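+	/* interleave the output: block i of the chunk is stored to Bout[i/2] when
+	 * i is even and to Bout[r + i/2] when i is odd (steps 4 and 6 of BlockMix) */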
+ a2(lea rax,[r8+r9])
+ a2(xor r8,rcx)
+ a2(and rax,~0xff)
+ a2(add r9,128)
+ a2(shr rax,1)
+ a2(add rax, rdi)
+ a2(cmp r9,rcx)
+ a2(vmovdqa [rax+0],xmm0)
+ a2(vmovdqa [rax+16],xmm1)
+ a2(vmovdqa [rax+32],xmm2)
+ a2(vmovdqa [rax+48],xmm3)
+ a2(vmovdqa [rax+64],xmm4)
+ a2(vmovdqa [rax+80],xmm5)
+ a2(vmovdqa [rax+96],xmm6)
+ a2(vmovdqa [rax+112],xmm7)
+ a1(jne scrypt_ChunkMix_avx_loop)
+ a2(mov rsp, rbp)
+ a1(pop rbp)
+ a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_avx)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX)
+
+#define SCRYPT_SALSA64_AVX
+
+static void asm_calling_convention
+scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+ x4 = xmmp[4];
+ x5 = xmmp[5];
+ x6 = xmmp[6];
+ x7 = xmmp[7];
+
+ if (Bxor) {
+ xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+ }
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+
+ if (Bxor) {
+ xmmp = (xmmi *)scrypt_block(Bxor, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+ }
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+ t4 = x4;
+ t5 = x5;
+ t6 = x6;
+ t7 = x7;
+
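+		/* x0..x7 hold the 16 64-bit state words, two per register, in the tangled
+		 * SSE2 layout; each pass performs two Salsa64 rounds, using a 32-bit
+		 * shuffle for the rotation by 32 and shift pairs for 13 and 39 */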
+ for (rounds = 8; rounds; rounds -= 2) {
+ z0 = _mm_add_epi64(x0, x2);
+ z1 = _mm_add_epi64(x1, x3);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x6 = _mm_xor_si128(x6, z0);
+ x7 = _mm_xor_si128(x7, z1);
+
+ z0 = _mm_add_epi64(x6, x0);
+ z1 = _mm_add_epi64(x7, x1);
+ z2 = _mm_srli_epi64(z0, 64-13);
+ z3 = _mm_srli_epi64(z1, 64-13);
+ z0 = _mm_slli_epi64(z0, 13);
+ z1 = _mm_slli_epi64(z1, 13);
+ x4 = _mm_xor_si128(x4, z2);
+ x5 = _mm_xor_si128(x5, z3);
+ x4 = _mm_xor_si128(x4, z0);
+ x5 = _mm_xor_si128(x5, z1);
+
+ z0 = _mm_add_epi64(x4, x6);
+ z1 = _mm_add_epi64(x5, x7);
+ z2 = _mm_srli_epi64(z0, 64-39);
+ z3 = _mm_srli_epi64(z1, 64-39);
+ z0 = _mm_slli_epi64(z0, 39);
+ z1 = _mm_slli_epi64(z1, 39);
+ x2 = _mm_xor_si128(x2, z2);
+ x3 = _mm_xor_si128(x3, z3);
+ x2 = _mm_xor_si128(x2, z0);
+ x3 = _mm_xor_si128(x3, z1);
+
+ z0 = _mm_add_epi64(x2, x4);
+ z1 = _mm_add_epi64(x3, x5);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x0 = _mm_xor_si128(x0, z0);
+ x1 = _mm_xor_si128(x1, z1);
+
+ z0 = x2;
+ z1 = x3;
+ x2 = _mm_alignr_epi8(x6, x7, 8);
+ x3 = _mm_alignr_epi8(x7, x6, 8);
+ x6 = _mm_alignr_epi8(z1, z0, 8);
+ x7 = _mm_alignr_epi8(z0, z1, 8);
+
+ z0 = _mm_add_epi64(x0, x2);
+ z1 = _mm_add_epi64(x1, x3);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x6 = _mm_xor_si128(x6, z0);
+ x7 = _mm_xor_si128(x7, z1);
+
+ z0 = _mm_add_epi64(x6, x0);
+ z1 = _mm_add_epi64(x7, x1);
+ z2 = _mm_srli_epi64(z0, 64-13);
+ z3 = _mm_srli_epi64(z1, 64-13);
+ z0 = _mm_slli_epi64(z0, 13);
+ z1 = _mm_slli_epi64(z1, 13);
+ x5 = _mm_xor_si128(x5, z2);
+ x4 = _mm_xor_si128(x4, z3);
+ x5 = _mm_xor_si128(x5, z0);
+ x4 = _mm_xor_si128(x4, z1);
+
+ z0 = _mm_add_epi64(x5, x6);
+ z1 = _mm_add_epi64(x4, x7);
+ z2 = _mm_srli_epi64(z0, 64-39);
+ z3 = _mm_srli_epi64(z1, 64-39);
+ z0 = _mm_slli_epi64(z0, 39);
+ z1 = _mm_slli_epi64(z1, 39);
+ x2 = _mm_xor_si128(x2, z2);
+ x3 = _mm_xor_si128(x3, z3);
+ x2 = _mm_xor_si128(x2, z0);
+ x3 = _mm_xor_si128(x3, z1);
+
+ z0 = _mm_add_epi64(x2, x5);
+ z1 = _mm_add_epi64(x3, x4);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x0 = _mm_xor_si128(x0, z0);
+ x1 = _mm_xor_si128(x1, z1);
+
+ z0 = x2;
+ z1 = x3;
+ x2 = _mm_alignr_epi8(x6, x7, 8);
+ x3 = _mm_alignr_epi8(x7, x6, 8);
+ x6 = _mm_alignr_epi8(z1, z0, 8);
+ x7 = _mm_alignr_epi8(z0, z1, 8);
+ }
+
+ x0 = _mm_add_epi64(x0, t0);
+ x1 = _mm_add_epi64(x1, t1);
+ x2 = _mm_add_epi64(x2, t2);
+ x3 = _mm_add_epi64(x3, t3);
+ x4 = _mm_add_epi64(x4, t4);
+ x5 = _mm_add_epi64(x5, t5);
+ x6 = _mm_add_epi64(x6, t6);
+ x7 = _mm_add_epi64(x7, t7);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ xmmp[4] = x4;
+ xmmp[5] = x5;
+ xmmp[6] = x6;
+ xmmp[7] = x7;
+ }
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+ /* uses salsa64_core_tangle_sse2 */
+
+ #undef SCRYPT_MIX
+ #define SCRYPT_MIX "Salsa64/8-AVX"
+ #undef SCRYPT_SALSA64_INCLUDED
+ #define SCRYPT_SALSA64_INCLUDED
+#endif
--- /dev/null
+/* x64 */
+#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_SSE2
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_sse2)
+ a1(push rbp)
+ a2(mov rbp, rsp)
+ a2(and rsp, ~63)
+ a2(sub rsp, 128)
+ a2(lea rcx,[rcx*2])
+ a2(shl rcx,7)
+ a2(lea r9,[rcx-128])
+ a2(lea rax,[rsi+r9])
+ a2(lea r9,[rdx+r9])
+ a2(and rdx, rdx)
+ a2(movdqa xmm0,[rax+0])
+ a2(movdqa xmm1,[rax+16])
+ a2(movdqa xmm2,[rax+32])
+ a2(movdqa xmm3,[rax+48])
+ a2(movdqa xmm4,[rax+64])
+ a2(movdqa xmm5,[rax+80])
+ a2(movdqa xmm6,[rax+96])
+ a2(movdqa xmm7,[rax+112])
+ a1(jz scrypt_ChunkMix_sse2_no_xor1)
+ a2(pxor xmm0,[r9+0])
+ a2(pxor xmm1,[r9+16])
+ a2(pxor xmm2,[r9+32])
+ a2(pxor xmm3,[r9+48])
+ a2(pxor xmm4,[r9+64])
+ a2(pxor xmm5,[r9+80])
+ a2(pxor xmm6,[r9+96])
+ a2(pxor xmm7,[r9+112])
+ a1(scrypt_ChunkMix_sse2_no_xor1:)
+ a2(xor r9,r9)
+ a2(xor r8,r8)
+ a1(scrypt_ChunkMix_sse2_loop:)
+ a2(and rdx, rdx)
+ a2(pxor xmm0,[rsi+r9+0])
+ a2(pxor xmm1,[rsi+r9+16])
+ a2(pxor xmm2,[rsi+r9+32])
+ a2(pxor xmm3,[rsi+r9+48])
+ a2(pxor xmm4,[rsi+r9+64])
+ a2(pxor xmm5,[rsi+r9+80])
+ a2(pxor xmm6,[rsi+r9+96])
+ a2(pxor xmm7,[rsi+r9+112])
+ a1(jz scrypt_ChunkMix_sse2_no_xor2)
+ a2(pxor xmm0,[rdx+r9+0])
+ a2(pxor xmm1,[rdx+r9+16])
+ a2(pxor xmm2,[rdx+r9+32])
+ a2(pxor xmm3,[rdx+r9+48])
+ a2(pxor xmm4,[rdx+r9+64])
+ a2(pxor xmm5,[rdx+r9+80])
+ a2(pxor xmm6,[rdx+r9+96])
+ a2(pxor xmm7,[rdx+r9+112])
+ a1(scrypt_ChunkMix_sse2_no_xor2:)
+ a2(movdqa [rsp+0],xmm0)
+ a2(movdqa [rsp+16],xmm1)
+ a2(movdqa [rsp+32],xmm2)
+ a2(movdqa [rsp+48],xmm3)
+ a2(movdqa [rsp+64],xmm4)
+ a2(movdqa [rsp+80],xmm5)
+ a2(movdqa [rsp+96],xmm6)
+ a2(movdqa [rsp+112],xmm7)
+ a2(mov rax,8)
+ a1(scrypt_salsa64_sse2_loop: )
+ a2(movdqa xmm8, xmm0)
+ a2(movdqa xmm9, xmm1)
+ a2(paddq xmm8, xmm2)
+ a2(paddq xmm9, xmm3)
+ a3(pshufd xmm8, xmm8, 0xb1)
+ a3(pshufd xmm9, xmm9, 0xb1)
+ a2(pxor xmm6, xmm8)
+ a2(pxor xmm7, xmm9)
+ a2(movdqa xmm10, xmm0)
+ a2(movdqa xmm11, xmm1)
+ a2(paddq xmm10, xmm6)
+ a2(paddq xmm11, xmm7)
+ a2(movdqa xmm8, xmm10)
+ a2(movdqa xmm9, xmm11)
+ a2(psrlq xmm10, 51)
+ a2(psrlq xmm11, 51)
+ a2(psllq xmm8, 13)
+ a2(psllq xmm9, 13)
+ a2(pxor xmm4, xmm10)
+ a2(pxor xmm5, xmm11)
+ a2(pxor xmm4, xmm8)
+ a2(pxor xmm5, xmm9)
+ a2(movdqa xmm10, xmm6)
+ a2(movdqa xmm11, xmm7)
+ a2(paddq xmm10, xmm4)
+ a2(paddq xmm11, xmm5)
+ a2(movdqa xmm8, xmm10)
+ a2(movdqa xmm9, xmm11)
+ a2(psrlq xmm10, 25)
+ a2(psrlq xmm11, 25)
+ a2(psllq xmm8, 39)
+ a2(psllq xmm9, 39)
+ a2(pxor xmm2, xmm10)
+ a2(pxor xmm3, xmm11)
+ a2(pxor xmm2, xmm8)
+ a2(pxor xmm3, xmm9)
+ a2(movdqa xmm8, xmm4)
+ a2(movdqa xmm9, xmm5)
+ a2(paddq xmm8, xmm2)
+ a2(paddq xmm9, xmm3)
+ a3(pshufd xmm8, xmm8, 0xb1)
+ a3(pshufd xmm9, xmm9, 0xb1)
+ a2(pxor xmm0, xmm8)
+ a2(pxor xmm1, xmm9)
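+	/* SSE2 has no palignr, so the diagonal re-arrangement of the lanes is done
+	 * with punpcklqdq/punpckhqdq pairs instead */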
+ a2(movdqa xmm8, xmm2)
+ a2(movdqa xmm9, xmm3)
+ a2(movdqa xmm10, xmm6)
+ a2(movdqa xmm11, xmm7)
+ a2(movdqa xmm2, xmm7)
+ a2(movdqa xmm3, xmm6)
+ a2(punpcklqdq xmm10, xmm6)
+ a2(punpcklqdq xmm11, xmm7)
+ a2(movdqa xmm6, xmm8)
+ a2(movdqa xmm7, xmm9)
+ a2(punpcklqdq xmm9, xmm9)
+ a2(punpcklqdq xmm8, xmm8)
+ a2(punpckhqdq xmm2, xmm10)
+ a2(punpckhqdq xmm3, xmm11)
+ a2(punpckhqdq xmm6, xmm9)
+ a2(punpckhqdq xmm7, xmm8)
+ a2(sub rax, 2)
+ a2(movdqa xmm8, xmm0)
+ a2(movdqa xmm9, xmm1)
+ a2(paddq xmm8, xmm2)
+ a2(paddq xmm9, xmm3)
+ a3(pshufd xmm8, xmm8, 0xb1)
+ a3(pshufd xmm9, xmm9, 0xb1)
+ a2(pxor xmm6, xmm8)
+ a2(pxor xmm7, xmm9)
+ a2(movdqa xmm10, xmm0)
+ a2(movdqa xmm11, xmm1)
+ a2(paddq xmm10, xmm6)
+ a2(paddq xmm11, xmm7)
+ a2(movdqa xmm8, xmm10)
+ a2(movdqa xmm9, xmm11)
+ a2(psrlq xmm10, 51)
+ a2(psrlq xmm11, 51)
+ a2(psllq xmm8, 13)
+ a2(psllq xmm9, 13)
+ a2(pxor xmm5, xmm10)
+ a2(pxor xmm4, xmm11)
+ a2(pxor xmm5, xmm8)
+ a2(pxor xmm4, xmm9)
+ a2(movdqa xmm10, xmm6)
+ a2(movdqa xmm11, xmm7)
+ a2(paddq xmm10, xmm5)
+ a2(paddq xmm11, xmm4)
+ a2(movdqa xmm8, xmm10)
+ a2(movdqa xmm9, xmm11)
+ a2(psrlq xmm10, 25)
+ a2(psrlq xmm11, 25)
+ a2(psllq xmm8, 39)
+ a2(psllq xmm9, 39)
+ a2(pxor xmm2, xmm10)
+ a2(pxor xmm3, xmm11)
+ a2(pxor xmm2, xmm8)
+ a2(pxor xmm3, xmm9)
+ a2(movdqa xmm8, xmm5)
+ a2(movdqa xmm9, xmm4)
+ a2(paddq xmm8, xmm2)
+ a2(paddq xmm9, xmm3)
+ a3(pshufd xmm8, xmm8, 0xb1)
+ a3(pshufd xmm9, xmm9, 0xb1)
+ a2(pxor xmm0, xmm8)
+ a2(pxor xmm1, xmm9)
+ a2(movdqa xmm8, xmm2)
+ a2(movdqa xmm9, xmm3)
+ a2(movdqa xmm10, xmm6)
+ a2(movdqa xmm11, xmm7)
+ a2(movdqa xmm2, xmm7)
+ a2(movdqa xmm3, xmm6)
+ a2(punpcklqdq xmm10, xmm6)
+ a2(punpcklqdq xmm11, xmm7)
+ a2(movdqa xmm6, xmm8)
+ a2(movdqa xmm7, xmm9)
+ a2(punpcklqdq xmm9, xmm9)
+ a2(punpcklqdq xmm8, xmm8)
+ a2(punpckhqdq xmm2, xmm10)
+ a2(punpckhqdq xmm3, xmm11)
+ a2(punpckhqdq xmm6, xmm9)
+ a2(punpckhqdq xmm7, xmm8)
+ a1(ja scrypt_salsa64_sse2_loop)
+ a2(paddq xmm0,[rsp+0])
+ a2(paddq xmm1,[rsp+16])
+ a2(paddq xmm2,[rsp+32])
+ a2(paddq xmm3,[rsp+48])
+ a2(paddq xmm4,[rsp+64])
+ a2(paddq xmm5,[rsp+80])
+ a2(paddq xmm6,[rsp+96])
+ a2(paddq xmm7,[rsp+112])
+ a2(lea rax,[r8+r9])
+ a2(xor r8,rcx)
+ a2(and rax,~0xff)
+ a2(add r9,128)
+ a2(shr rax,1)
+ a2(add rax, rdi)
+ a2(cmp r9,rcx)
+ a2(movdqa [rax+0],xmm0)
+ a2(movdqa [rax+16],xmm1)
+ a2(movdqa [rax+32],xmm2)
+ a2(movdqa [rax+48],xmm3)
+ a2(movdqa [rax+64],xmm4)
+ a2(movdqa [rax+80],xmm5)
+ a2(movdqa [rax+96],xmm6)
+ a2(movdqa [rax+112],xmm7)
+ a1(jne scrypt_ChunkMix_sse2_loop)
+ a2(mov rsp, rbp)
+ a1(pop rbp)
+ a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_sse2)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2)
+
+#define SCRYPT_SALSA64_SSE2
+
+static void asm_calling_convention
+scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+ x4 = xmmp[4];
+ x5 = xmmp[5];
+ x6 = xmmp[6];
+ x7 = xmmp[7];
+
+ if (Bxor) {
+ xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+ }
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+
+ if (Bxor) {
+ xmmp = (xmmi *)scrypt_block(Bxor, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+ }
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+ t4 = x4;
+ t5 = x5;
+ t6 = x6;
+ t7 = x7;
+
+ for (rounds = 8; rounds; rounds -= 2) {
+ z0 = _mm_add_epi64(x0, x2);
+ z1 = _mm_add_epi64(x1, x3);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x6 = _mm_xor_si128(x6, z0);
+ x7 = _mm_xor_si128(x7, z1);
+
+ z0 = _mm_add_epi64(x6, x0);
+ z1 = _mm_add_epi64(x7, x1);
+ z2 = _mm_srli_epi64(z0, 64-13);
+ z3 = _mm_srli_epi64(z1, 64-13);
+ z0 = _mm_slli_epi64(z0, 13);
+ z1 = _mm_slli_epi64(z1, 13);
+ x4 = _mm_xor_si128(x4, z2);
+ x5 = _mm_xor_si128(x5, z3);
+ x4 = _mm_xor_si128(x4, z0);
+ x5 = _mm_xor_si128(x5, z1);
+
+ z0 = _mm_add_epi64(x4, x6);
+ z1 = _mm_add_epi64(x5, x7);
+ z2 = _mm_srli_epi64(z0, 64-39);
+ z3 = _mm_srli_epi64(z1, 64-39);
+ z0 = _mm_slli_epi64(z0, 39);
+ z1 = _mm_slli_epi64(z1, 39);
+ x2 = _mm_xor_si128(x2, z2);
+ x3 = _mm_xor_si128(x3, z3);
+ x2 = _mm_xor_si128(x2, z0);
+ x3 = _mm_xor_si128(x3, z1);
+
+ z0 = _mm_add_epi64(x2, x4);
+ z1 = _mm_add_epi64(x3, x5);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x0 = _mm_xor_si128(x0, z0);
+ x1 = _mm_xor_si128(x1, z1);
+
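+			/* swap x4/x5 and rebuild x2,x3,x6,x7 with unpack operations; this is
+			 * the same diagonal permutation the ssse3/avx versions get from alignr */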
+ z0 = x4;
+ z1 = x5;
+ z2 = x2;
+ z3 = x3;
+ x4 = z1;
+ x5 = z0;
+ x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
+ x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
+ x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
+ x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
+
+ z0 = _mm_add_epi64(x0, x2);
+ z1 = _mm_add_epi64(x1, x3);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x6 = _mm_xor_si128(x6, z0);
+ x7 = _mm_xor_si128(x7, z1);
+
+ z0 = _mm_add_epi64(x6, x0);
+ z1 = _mm_add_epi64(x7, x1);
+ z2 = _mm_srli_epi64(z0, 64-13);
+ z3 = _mm_srli_epi64(z1, 64-13);
+ z0 = _mm_slli_epi64(z0, 13);
+ z1 = _mm_slli_epi64(z1, 13);
+ x4 = _mm_xor_si128(x4, z2);
+ x5 = _mm_xor_si128(x5, z3);
+ x4 = _mm_xor_si128(x4, z0);
+ x5 = _mm_xor_si128(x5, z1);
+
+ z0 = _mm_add_epi64(x4, x6);
+ z1 = _mm_add_epi64(x5, x7);
+ z2 = _mm_srli_epi64(z0, 64-39);
+ z3 = _mm_srli_epi64(z1, 64-39);
+ z0 = _mm_slli_epi64(z0, 39);
+ z1 = _mm_slli_epi64(z1, 39);
+ x2 = _mm_xor_si128(x2, z2);
+ x3 = _mm_xor_si128(x3, z3);
+ x2 = _mm_xor_si128(x2, z0);
+ x3 = _mm_xor_si128(x3, z1);
+
+ z0 = _mm_add_epi64(x2, x4);
+ z1 = _mm_add_epi64(x3, x5);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x0 = _mm_xor_si128(x0, z0);
+ x1 = _mm_xor_si128(x1, z1);
+
+ z0 = x4;
+ z1 = x5;
+ z2 = x2;
+ z3 = x3;
+ x4 = z1;
+ x5 = z0;
+ x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
+ x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
+ x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
+ x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
+ }
+
+ x0 = _mm_add_epi64(x0, t0);
+ x1 = _mm_add_epi64(x1, t1);
+ x2 = _mm_add_epi64(x2, t2);
+ x3 = _mm_add_epi64(x3, t3);
+ x4 = _mm_add_epi64(x4, t4);
+ x5 = _mm_add_epi64(x5, t5);
+ x6 = _mm_add_epi64(x6, t6);
+ x7 = _mm_add_epi64(x7, t7);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ xmmp[4] = x4;
+ xmmp[5] = x5;
+ xmmp[6] = x6;
+ xmmp[7] = x7;
+ }
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+ #undef SCRYPT_MIX
+ #define SCRYPT_MIX "Salsa64/8-SSE2"
+ #undef SCRYPT_SALSA64_INCLUDED
+ #define SCRYPT_SALSA64_INCLUDED
+#endif
+
+/* ssse3/avx use this as well */
+#if defined(SCRYPT_SALSA64_INCLUDED)
+ /*
+ Default layout:
+ 0 1 2 3
+ 4 5 6 7
+ 8 9 10 11
+ 12 13 14 15
+
+ SSE2 layout:
+ 0 5 10 15
+ 12 1 6 11
+ 8 13 2 7
+ 4 9 14 3
+ */
+
+
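+	/* swap words between the two layouts above; the permutation is its own
+	 * inverse, so the same function is used for both tangle and untangle */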
+ static void asm_calling_convention
+ salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
+ uint64_t t;
+ while (count--) {
+ t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
+ t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
+ t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
+ t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
+ t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
+ t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
+ blocks += 16;
+ }
+ }
+#endif
\ No newline at end of file
--- /dev/null
+/* x64 */
+#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_SSSE3
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_ssse3)
+ a1(push rbp)
+ a2(mov rbp, rsp)
+ a2(and rsp, ~63)
+ a2(sub rsp, 128)
+ a2(lea rcx,[rcx*2])
+ a2(shl rcx,7)
+ a2(lea r9,[rcx-128])
+ a2(lea rax,[rsi+r9])
+ a2(lea r9,[rdx+r9])
+ a2(and rdx, rdx)
+ a2(movdqa xmm0,[rax+0])
+ a2(movdqa xmm1,[rax+16])
+ a2(movdqa xmm2,[rax+32])
+ a2(movdqa xmm3,[rax+48])
+ a2(movdqa xmm4,[rax+64])
+ a2(movdqa xmm5,[rax+80])
+ a2(movdqa xmm6,[rax+96])
+ a2(movdqa xmm7,[rax+112])
+ a1(jz scrypt_ChunkMix_ssse3_no_xor1)
+ a2(pxor xmm0,[r9+0])
+ a2(pxor xmm1,[r9+16])
+ a2(pxor xmm2,[r9+32])
+ a2(pxor xmm3,[r9+48])
+ a2(pxor xmm4,[r9+64])
+ a2(pxor xmm5,[r9+80])
+ a2(pxor xmm6,[r9+96])
+ a2(pxor xmm7,[r9+112])
+ a1(scrypt_ChunkMix_ssse3_no_xor1:)
+ a2(xor r9,r9)
+ a2(xor r8,r8)
+ a1(scrypt_ChunkMix_ssse3_loop:)
+ a2(and rdx, rdx)
+ a2(pxor xmm0,[rsi+r9+0])
+ a2(pxor xmm1,[rsi+r9+16])
+ a2(pxor xmm2,[rsi+r9+32])
+ a2(pxor xmm3,[rsi+r9+48])
+ a2(pxor xmm4,[rsi+r9+64])
+ a2(pxor xmm5,[rsi+r9+80])
+ a2(pxor xmm6,[rsi+r9+96])
+ a2(pxor xmm7,[rsi+r9+112])
+ a1(jz scrypt_ChunkMix_ssse3_no_xor2)
+ a2(pxor xmm0,[rdx+r9+0])
+ a2(pxor xmm1,[rdx+r9+16])
+ a2(pxor xmm2,[rdx+r9+32])
+ a2(pxor xmm3,[rdx+r9+48])
+ a2(pxor xmm4,[rdx+r9+64])
+ a2(pxor xmm5,[rdx+r9+80])
+ a2(pxor xmm6,[rdx+r9+96])
+ a2(pxor xmm7,[rdx+r9+112])
+ a1(scrypt_ChunkMix_ssse3_no_xor2:)
+ a2(movdqa [rsp+0],xmm0)
+ a2(movdqa [rsp+16],xmm1)
+ a2(movdqa [rsp+32],xmm2)
+ a2(movdqa [rsp+48],xmm3)
+ a2(movdqa [rsp+64],xmm4)
+ a2(movdqa [rsp+80],xmm5)
+ a2(movdqa [rsp+96],xmm6)
+ a2(movdqa [rsp+112],xmm7)
+ a2(mov rax,8)
+ a1(scrypt_salsa64_ssse3_loop: )
+ a2(movdqa xmm8, xmm0)
+ a2(movdqa xmm9, xmm1)
+ a2(paddq xmm8, xmm2)
+ a2(paddq xmm9, xmm3)
+ a3(pshufd xmm8, xmm8, 0xb1)
+ a3(pshufd xmm9, xmm9, 0xb1)
+ a2(pxor xmm6, xmm8)
+ a2(pxor xmm7, xmm9)
+ a2(movdqa xmm10, xmm0)
+ a2(movdqa xmm11, xmm1)
+ a2(paddq xmm10, xmm6)
+ a2(paddq xmm11, xmm7)
+ a2(movdqa xmm8, xmm10)
+ a2(movdqa xmm9, xmm11)
+ a2(psrlq xmm10, 51)
+ a2(psrlq xmm11, 51)
+ a2(psllq xmm8, 13)
+ a2(psllq xmm9, 13)
+ a2(pxor xmm4, xmm10)
+ a2(pxor xmm5, xmm11)
+ a2(pxor xmm4, xmm8)
+ a2(pxor xmm5, xmm9)
+ a2(movdqa xmm10, xmm6)
+ a2(movdqa xmm11, xmm7)
+ a2(paddq xmm10, xmm4)
+ a2(paddq xmm11, xmm5)
+ a2(movdqa xmm8, xmm10)
+ a2(movdqa xmm9, xmm11)
+ a2(psrlq xmm10, 25)
+ a2(psrlq xmm11, 25)
+ a2(psllq xmm8, 39)
+ a2(psllq xmm9, 39)
+ a2(pxor xmm2, xmm10)
+ a2(pxor xmm3, xmm11)
+ a2(pxor xmm2, xmm8)
+ a2(pxor xmm3, xmm9)
+ a2(movdqa xmm8, xmm4)
+ a2(movdqa xmm9, xmm5)
+ a2(paddq xmm8, xmm2)
+ a2(paddq xmm9, xmm3)
+ a3(pshufd xmm8, xmm8, 0xb1)
+ a3(pshufd xmm9, xmm9, 0xb1)
+ a2(pxor xmm0, xmm8)
+ a2(pxor xmm1, xmm9)
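+	/* diagonalize with palignr (the plain SSE2 version needs punpck pairs for this) */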
+ a2(movdqa xmm10, xmm2)
+ a2(movdqa xmm11, xmm3)
+ a2(movdqa xmm2, xmm6)
+ a2(movdqa xmm3, xmm7)
+ a3(palignr xmm2, xmm7, 8)
+ a3(palignr xmm3, xmm6, 8)
+ a2(movdqa xmm6, xmm11)
+ a2(movdqa xmm7, xmm10)
+ a3(palignr xmm6, xmm10, 8)
+ a3(palignr xmm7, xmm11, 8)
+ a2(sub rax, 2)
+ a2(movdqa xmm8, xmm0)
+ a2(movdqa xmm9, xmm1)
+ a2(paddq xmm8, xmm2)
+ a2(paddq xmm9, xmm3)
+ a3(pshufd xmm8, xmm8, 0xb1)
+ a3(pshufd xmm9, xmm9, 0xb1)
+ a2(pxor xmm6, xmm8)
+ a2(pxor xmm7, xmm9)
+ a2(movdqa xmm10, xmm0)
+ a2(movdqa xmm11, xmm1)
+ a2(paddq xmm10, xmm6)
+ a2(paddq xmm11, xmm7)
+ a2(movdqa xmm8, xmm10)
+ a2(movdqa xmm9, xmm11)
+ a2(psrlq xmm10, 51)
+ a2(psrlq xmm11, 51)
+ a2(psllq xmm8, 13)
+ a2(psllq xmm9, 13)
+ a2(pxor xmm5, xmm10)
+ a2(pxor xmm4, xmm11)
+ a2(pxor xmm5, xmm8)
+ a2(pxor xmm4, xmm9)
+ a2(movdqa xmm10, xmm6)
+ a2(movdqa xmm11, xmm7)
+ a2(paddq xmm10, xmm5)
+ a2(paddq xmm11, xmm4)
+ a2(movdqa xmm8, xmm10)
+ a2(movdqa xmm9, xmm11)
+ a2(psrlq xmm10, 25)
+ a2(psrlq xmm11, 25)
+ a2(psllq xmm8, 39)
+ a2(psllq xmm9, 39)
+ a2(pxor xmm2, xmm10)
+ a2(pxor xmm3, xmm11)
+ a2(pxor xmm2, xmm8)
+ a2(pxor xmm3, xmm9)
+ a2(movdqa xmm8, xmm5)
+ a2(movdqa xmm9, xmm4)
+ a2(paddq xmm8, xmm2)
+ a2(paddq xmm9, xmm3)
+ a3(pshufd xmm8, xmm8, 0xb1)
+ a3(pshufd xmm9, xmm9, 0xb1)
+ a2(pxor xmm0, xmm8)
+ a2(pxor xmm1, xmm9)
+ a2(movdqa xmm10, xmm2)
+ a2(movdqa xmm11, xmm3)
+ a2(movdqa xmm2, xmm6)
+ a2(movdqa xmm3, xmm7)
+ a3(palignr xmm2, xmm7, 8)
+ a3(palignr xmm3, xmm6, 8)
+ a2(movdqa xmm6, xmm11)
+ a2(movdqa xmm7, xmm10)
+ a3(palignr xmm6, xmm10, 8)
+ a3(palignr xmm7, xmm11, 8)
+ a1(ja scrypt_salsa64_ssse3_loop)
+ a2(paddq xmm0,[rsp+0])
+ a2(paddq xmm1,[rsp+16])
+ a2(paddq xmm2,[rsp+32])
+ a2(paddq xmm3,[rsp+48])
+ a2(paddq xmm4,[rsp+64])
+ a2(paddq xmm5,[rsp+80])
+ a2(paddq xmm6,[rsp+96])
+ a2(paddq xmm7,[rsp+112])
+ a2(lea rax,[r8+r9])
+ a2(xor r8,rcx)
+ a2(and rax,~0xff)
+ a2(add r9,128)
+ a2(shr rax,1)
+ a2(add rax, rdi)
+ a2(cmp r9,rcx)
+ a2(movdqa [rax+0],xmm0)
+ a2(movdqa [rax+16],xmm1)
+ a2(movdqa [rax+32],xmm2)
+ a2(movdqa [rax+48],xmm3)
+ a2(movdqa [rax+64],xmm4)
+ a2(movdqa [rax+80],xmm5)
+ a2(movdqa [rax+96],xmm6)
+ a2(movdqa [rax+112],xmm7)
+ a1(jne scrypt_ChunkMix_ssse3_loop)
+ a2(mov rsp, rbp)
+ a1(pop rbp)
+ a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_ssse3)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3)
+
+#define SCRYPT_SALSA64_SSSE3
+
+static void asm_calling_convention
+scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+ uint32_t i, blocksPerChunk = r * 2, half = 0;
+ xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+ size_t rounds;
+
+ /* 1: X = B_{2r - 1} */
+ xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+ x0 = xmmp[0];
+ x1 = xmmp[1];
+ x2 = xmmp[2];
+ x3 = xmmp[3];
+ x4 = xmmp[4];
+ x5 = xmmp[5];
+ x6 = xmmp[6];
+ x7 = xmmp[7];
+
+ if (Bxor) {
+ xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+ }
+
+ /* 2: for i = 0 to 2r - 1 do */
+ for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+ /* 3: X = H(X ^ B_i) */
+ xmmp = (xmmi *)scrypt_block(Bin, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+
+ if (Bxor) {
+ xmmp = (xmmi *)scrypt_block(Bxor, i);
+ x0 = _mm_xor_si128(x0, xmmp[0]);
+ x1 = _mm_xor_si128(x1, xmmp[1]);
+ x2 = _mm_xor_si128(x2, xmmp[2]);
+ x3 = _mm_xor_si128(x3, xmmp[3]);
+ x4 = _mm_xor_si128(x4, xmmp[4]);
+ x5 = _mm_xor_si128(x5, xmmp[5]);
+ x6 = _mm_xor_si128(x6, xmmp[6]);
+ x7 = _mm_xor_si128(x7, xmmp[7]);
+ }
+
+ t0 = x0;
+ t1 = x1;
+ t2 = x2;
+ t3 = x3;
+ t4 = x4;
+ t5 = x5;
+ t6 = x6;
+ t7 = x7;
+
+ for (rounds = 8; rounds; rounds -= 2) {
+ z0 = _mm_add_epi64(x0, x2);
+ z1 = _mm_add_epi64(x1, x3);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x6 = _mm_xor_si128(x6, z0);
+ x7 = _mm_xor_si128(x7, z1);
+
+ z0 = _mm_add_epi64(x6, x0);
+ z1 = _mm_add_epi64(x7, x1);
+ z2 = _mm_srli_epi64(z0, 64-13);
+ z3 = _mm_srli_epi64(z1, 64-13);
+ z0 = _mm_slli_epi64(z0, 13);
+ z1 = _mm_slli_epi64(z1, 13);
+ x4 = _mm_xor_si128(x4, z2);
+ x5 = _mm_xor_si128(x5, z3);
+ x4 = _mm_xor_si128(x4, z0);
+ x5 = _mm_xor_si128(x5, z1);
+
+ z0 = _mm_add_epi64(x4, x6);
+ z1 = _mm_add_epi64(x5, x7);
+ z2 = _mm_srli_epi64(z0, 64-39);
+ z3 = _mm_srli_epi64(z1, 64-39);
+ z0 = _mm_slli_epi64(z0, 39);
+ z1 = _mm_slli_epi64(z1, 39);
+ x2 = _mm_xor_si128(x2, z2);
+ x3 = _mm_xor_si128(x3, z3);
+ x2 = _mm_xor_si128(x2, z0);
+ x3 = _mm_xor_si128(x3, z1);
+
+ z0 = _mm_add_epi64(x2, x4);
+ z1 = _mm_add_epi64(x3, x5);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x0 = _mm_xor_si128(x0, z0);
+ x1 = _mm_xor_si128(x1, z1);
+
+ z0 = x2;
+ z1 = x3;
+ x2 = _mm_alignr_epi8(x6, x7, 8);
+ x3 = _mm_alignr_epi8(x7, x6, 8);
+ x6 = _mm_alignr_epi8(z1, z0, 8);
+ x7 = _mm_alignr_epi8(z0, z1, 8);
+
+ z0 = _mm_add_epi64(x0, x2);
+ z1 = _mm_add_epi64(x1, x3);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x6 = _mm_xor_si128(x6, z0);
+ x7 = _mm_xor_si128(x7, z1);
+
+ z0 = _mm_add_epi64(x6, x0);
+ z1 = _mm_add_epi64(x7, x1);
+ z2 = _mm_srli_epi64(z0, 64-13);
+ z3 = _mm_srli_epi64(z1, 64-13);
+ z0 = _mm_slli_epi64(z0, 13);
+ z1 = _mm_slli_epi64(z1, 13);
+ x5 = _mm_xor_si128(x5, z2);
+ x4 = _mm_xor_si128(x4, z3);
+ x5 = _mm_xor_si128(x5, z0);
+ x4 = _mm_xor_si128(x4, z1);
+
+ z0 = _mm_add_epi64(x5, x6);
+ z1 = _mm_add_epi64(x4, x7);
+ z2 = _mm_srli_epi64(z0, 64-39);
+ z3 = _mm_srli_epi64(z1, 64-39);
+ z0 = _mm_slli_epi64(z0, 39);
+ z1 = _mm_slli_epi64(z1, 39);
+ x2 = _mm_xor_si128(x2, z2);
+ x3 = _mm_xor_si128(x3, z3);
+ x2 = _mm_xor_si128(x2, z0);
+ x3 = _mm_xor_si128(x3, z1);
+
+ z0 = _mm_add_epi64(x2, x5);
+ z1 = _mm_add_epi64(x3, x4);
+ z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+ z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+ x0 = _mm_xor_si128(x0, z0);
+ x1 = _mm_xor_si128(x1, z1);
+
+ z0 = x2;
+ z1 = x3;
+ x2 = _mm_alignr_epi8(x6, x7, 8);
+ x3 = _mm_alignr_epi8(x7, x6, 8);
+ x6 = _mm_alignr_epi8(z1, z0, 8);
+ x7 = _mm_alignr_epi8(z0, z1, 8);
+ }
+
+ x0 = _mm_add_epi64(x0, t0);
+ x1 = _mm_add_epi64(x1, t1);
+ x2 = _mm_add_epi64(x2, t2);
+ x3 = _mm_add_epi64(x3, t3);
+ x4 = _mm_add_epi64(x4, t4);
+ x5 = _mm_add_epi64(x5, t5);
+ x6 = _mm_add_epi64(x6, t6);
+ x7 = _mm_add_epi64(x7, t7);
+
+ /* 4: Y_i = X */
+ /* 6: B'[0..r-1] = Y_even */
+ /* 6: B'[r..2r-1] = Y_odd */
+ xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+ xmmp[0] = x0;
+ xmmp[1] = x1;
+ xmmp[2] = x2;
+ xmmp[3] = x3;
+ xmmp[4] = x4;
+ xmmp[5] = x5;
+ xmmp[6] = x6;
+ xmmp[7] = x7;
+ }
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+ /* uses salsa64_core_tangle_sse2 */
+
+ #undef SCRYPT_MIX
+ #define SCRYPT_MIX "Salsa64/8-SSSE3"
+ #undef SCRYPT_SALSA64_INCLUDED
+ #define SCRYPT_SALSA64_INCLUDED
+#endif
--- /dev/null
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)
+
+#undef SCRYPT_MIX
+#define SCRYPT_MIX "Salsa64/8 Ref"
+
+#undef SCRYPT_SALSA64_INCLUDED
+#define SCRYPT_SALSA64_INCLUDED
+#define SCRYPT_SALSA64_BASIC
+
+static void
+salsa64_core_basic(uint64_t state[16]) {
+ const size_t rounds = 8;
+ uint64_t v[16], t;
+ size_t i;
+
+ for (i = 0; i < 16; i++) v[i] = state[i];
+
+ #define G(a,b,c,d) \
+ t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \
+ t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \
+ t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \
+ t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \
+
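+	/* G is the Salsa64 quarter-round: add, rotate by 32, 13, 39 and 32 bits, and
+	 * xor into b, c, d and a in turn. Each loop pass does one double round:
+	 * four column quarter-rounds followed by four row quarter-rounds. */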
+ for (i = 0; i < rounds; i += 2) {
+ G( 0, 4, 8,12);
+ G( 5, 9,13, 1);
+ G(10,14, 2, 6);
+ G(15, 3, 7,11);
+ G( 0, 1, 2, 3);
+ G( 5, 6, 7, 4);
+ G(10,11, 8, 9);
+ G(15,12,13,14);
+ }
+
+ for (i = 0; i < 16; i++) state[i] += v[i];
+
+ #undef G
+}
+
+#endif
+
pad[i] ^= (0x5c ^ 0x36);
scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
+#ifdef SCRYPT_PREVENT_STATE_LEAK
scrypt_ensure_zero(pad, sizeof(pad));
+#endif
}
static void
scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
scrypt_hash_finish(&st->outer, mac);
+#ifdef SCRYPT_PREVENT_STATE_LEAK
scrypt_ensure_zero(st, sizeof(*st));
+#endif
}
static void
uint8_t be[4];
uint32_t i, j, blocks;
uint64_t c;
-
+
/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
/* hmac(password, ...) */
bytes -= SCRYPT_HASH_DIGEST_SIZE;
}
+#ifdef SCRYPT_PREVENT_STATE_LEAK
+ scrypt_ensure_zero(ti, sizeof(ti));
+ scrypt_ensure_zero(u, sizeof(u));
+ scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
+ scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
+#endif
+}
+
+/*
+ * Special version of PBKDF2 with the iteration count (the N parameter) hard-coded to 1
+ * - mikaelh
+ */
+static void
+scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) {
+ scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
+ scrypt_hash_digest ti, u;
+ uint8_t be[4];
+ uint32_t i, /*j,*/ blocks;
+ //uint64_t c;
+
+ /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
+
+ /* hmac(password, ...) */
+ scrypt_hmac_init(&hmac_pw, password, password_len);
+
+ /* hmac(password, salt...) */
+ hmac_pw_salt = hmac_pw;
+ scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
+
+ blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
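+	/* with the iteration count fixed at 1, U_1 == T_i, so the inner XOR
+	 * accumulation loop of generic PBKDF2 disappears (hence j and c are unused) */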
+ for (i = 1; i <= blocks; i++) {
+ /* U1 = hmac(password, salt || be(i)) */
+ U32TO8_BE(be, i);
+ work = hmac_pw_salt;
+ scrypt_hmac_update(&work, be, 4);
+ scrypt_hmac_finish(&work, ti);
+ memcpy(u, ti, sizeof(u));
+
+ memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
+ out += SCRYPT_HASH_DIGEST_SIZE;
+ bytes -= SCRYPT_HASH_DIGEST_SIZE;
+ }
+
+#ifdef SCRYPT_PREVENT_STATE_LEAK
scrypt_ensure_zero(ti, sizeof(ti));
scrypt_ensure_zero(u, sizeof(u));
scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
+#endif
}
#define X86_64USE_INTRINSIC
#endif
+#ifdef __AVX__
+#define X86_INTRINSIC_AVX
+#endif
+
#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
#define X86_INTRINSIC
#if defined(__SSE__)
#if defined(__AVX__)
#define X86_INTRINSIC_AVX
#endif
+
+	/* HACK - I want to use CPU_X86_FORCE_INTRINSICS with mingw64, so these assembler defines need to be undefined - mikaelh */
+ #undef X86_64ASM_SSSE3
+ #undef X86_64ASM_AVX
+ #undef X86_64ASM_SSE2
+ #undef X86ASM_AVX
+ #undef X86ASM_SSSE3
+ #undef X86ASM_SSE2
+ #undef X86ASM_SSE
#endif
/* only use simd on windows (or SSE2 on gcc)! */
#define X86_INTRINSIC_SSSE3
#include <tmmintrin.h>
#endif
+ #if defined (X86_INTRINSIC_AVX)
+ #define X86_INTRINSIC_AVX
+ #include <immintrin.h>
+ #endif
#endif
} packedelem64;
#endif
-#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3)
- const packedelem8 MM16 ssse3_rotr16_64bit = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}};
- const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
- const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
- const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}};
+#if defined(X86_INTRINSIC_SSSE3)
+ static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
+ static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
#endif
/*
a1(..)
a2(.., ..)
a3(.., .., ..)
- a1(ret)
+	64bit OR 0 parameters: a1(ret)
+	32bit AND n parameters: aret(4n), e.g. aret(16) for 4 parameters
asm_naked_fn_end(name)
*/
#define asm_align8 a1(ALIGN 8)
#define asm_align16 a1(ALIGN 16)
- #define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn
+ #define asm_calling_convention STDCALL
+ #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
#define asm_naked_fn(fn) {
#define asm_naked_fn_end(fn) }
#elif defined(COMPILER_GCC)
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
#define GNU_ASL(x) "\n" #x ":\n"
+ #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
#define GNU_ASJ(x, y, z) #x " " #y #z ";"
#define a1(x) GNU_AS1(x)
#define asm_align8 a1(.align 8)
#define asm_align16 a1(.align 16)
- #define asm_naked_fn_proto(type, fn) extern type STDCALL fn
- #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn)
- #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" );
+ #if defined(OS_WINDOWS)
+ #define asm_calling_convention CDECL
+ #define aret(n) a1(ret)
+ #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
+ #else
+ #define asm_calling_convention STDCALL
+ #define aret(n) a1(ret n)
+ #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" );
+ #endif
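+	/* mingw builds use cdecl, so the caller cleans the stack and aret(n) is a
+	 * plain ret; elsewhere stdcall is kept and the callee pops n bytes */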
+ #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
+ #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
+
#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
#define asm_gcc_parms() ".att_syntax prefix;"
#define asm_gcc_trashed() __asm__ __volatile__("" :::
#endif
#endif
-#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
\ No newline at end of file
+#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
#pragma warning(disable : 4127) /* conditional expression is constant */
#pragma warning(disable : 4100) /* unreferenced formal parameter */
- #define _CRT_SECURE_NO_WARNINGS
#include <float.h>
#include <stdlib.h> /* _rotl */
#include <intrin.h>
#endif
/* romix pre/post nop function */
-static void STDCALL
+//static void asm_calling_convention
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
}
/* romix pre/post endian conversion function */
-static void STDCALL
+//static void asm_calling_convention
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
#if !defined(CPU_LE)
static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
}
/* chunkmix test function */
-typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
-typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
+typedef void (*chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
+typedef void (*blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
static int
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
2*r: number of blocks in the chunk
*/
-static void STDCALL
+//static void asm_calling_convention
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block;
uint32_t i, j, blocksPerChunk = r * 2, half = 0;
SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
}
+/*
+ * Special version with hard-coded r = 1
+ * - mikaelh
+ */
+static void NOINLINE FASTCALL
+scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) {
+ const uint32_t r = 1;
+ uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
+ scrypt_mix_word_t *block = V;
+
+ SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
+
+ /* 1: X = B */
+ /* implicit */
+
+ /* 2: for i = 0 to N - 1 do */
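+	/* V is filled forward in place: V_0 = X and V_{i+1} = H(V_i) are computed
+	 * directly in V, avoiding a copy per iteration; the last H writes X back */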
+ memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
+ for (i = 0; i < N - 1; i++, block += chunkWords) {
+ /* 3: V_i = X */
+ /* 4: X = H(X) */
+#ifdef SCRYPT_CHUNKMIX_1_FN
+ SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block);
+#else
+ SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
+#endif
+ }
+#ifdef SCRYPT_CHUNKMIX_1_FN
+ SCRYPT_CHUNKMIX_1_FN(X, block);
+#else
+ SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
+#endif
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < N; i += 2) {
+ /* 7: j = Integerify(X) % N */
+ j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
+
+ /* 8: X = H(Y ^ V_j) */
+#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
+ SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords));
+#else
+ SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
+#endif
+
+ /* 7: j = Integerify(Y) % N */
+ j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
+
+ /* 8: X = H(Y ^ V_j) */
+#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
+ SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords));
+#else
+ SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
+#endif
+ }
+
+ /* 10: B' = X */
+ /* implicit */
+
+ SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
+}
+
#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
#if defined(SCRYPT_TEST_SPEED)
static size_t
available_implementations() {
+ size_t cpuflags = detect_cpu();
size_t flags = 0;
#if defined(SCRYPT_SALSA_AVX)
+ if (cpuflags & cpu_avx)
flags |= cpu_avx;
#endif
#if defined(SCRYPT_SALSA_SSE2)
+ if (cpuflags & cpu_sse2)
flags |= cpu_sse2;
#endif
--- /dev/null
+#define SCRYPT_MIX_BASE "Salsa64/8"
+
+typedef uint64_t scrypt_mix_word_t;
+
+#define SCRYPT_WORDTO8_LE U64TO8_LE
+#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
+
+#define SCRYPT_BLOCK_BYTES 128
+#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+
+/* must have these here in case block bytes is ever != 64 */
+#include "scrypt-jane-romix-basic.h"
+
+#include "scrypt-jane-mix_salsa64-avx.h"
+#include "scrypt-jane-mix_salsa64-ssse3.h"
+#include "scrypt-jane-mix_salsa64-sse2.h"
+#include "scrypt-jane-mix_salsa64.h"
+
+#if defined(SCRYPT_SALSA64_AVX)
+ #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
+ #define SCRYPT_ROMIX_FN scrypt_ROMix_avx
+ #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+ #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+ #include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+ #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
+ #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
+ #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+ #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+ #include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+ #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
+ #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
+ #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+ #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+ #include "scrypt-jane-romix-template.h"
+#endif
+
+/* cpu agnostic */
+#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
+#define SCRYPT_MIX_FN salsa64_core_basic
+#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
+#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
+#include "scrypt-jane-romix-template.h"
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+static scrypt_ROMixfn
+scrypt_getROMix() {
+ size_t cpuflags = detect_cpu();
+
+#if defined(SCRYPT_SALSA64_AVX)
+ if (cpuflags & cpu_avx)
+ return scrypt_ROMix_avx;
+ else
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+ if (cpuflags & cpu_ssse3)
+ return scrypt_ROMix_ssse3;
+ else
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+ if (cpuflags & cpu_sse2)
+ return scrypt_ROMix_sse2;
+ else
+#endif
+
+ return scrypt_ROMix_basic;
+}
+#endif
+
+
+#if defined(SCRYPT_TEST_SPEED)
+static size_t
+available_implementations() {
+ size_t cpuflags = detect_cpu();
+ size_t flags = 0;
+
+#if defined(SCRYPT_SALSA64_AVX)
+ if (cpuflags & cpu_avx)
+ flags |= cpu_avx;
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+ if (cpuflags & cpu_ssse3)
+ flags |= cpu_ssse3;
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+ if (cpuflags & cpu_sse2)
+ flags |= cpu_sse2;
+#endif
+
+ return flags;
+}
+#endif
+
+static int
+scrypt_test_mix() {
+ static const uint8_t expected[16] = {
+ 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
+ };
+
+ int ret = 1;
+ size_t cpuflags = detect_cpu();
+
+#if defined(SCRYPT_SALSA64_AVX)
+ if (cpuflags & cpu_avx)
+ ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+ if (cpuflags & cpu_ssse3)
+ ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+ if (cpuflags & cpu_sse2)
+ ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_BASIC)
+ ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
+#endif
+
+ return ret;
+}
+
uint8_t Nfactor, rfactor, pfactor;
} scrypt_test_setting;
+/*
+ * I'm hardcoding the values of p and r, so test vectors that use other values
+ * can no longer be run. A new test case with a different value of N should
+ * probably be added.
+ * - mikaelh
+ */
static const scrypt_test_setting post_settings[] = {
{"", "", 3, 0, 0},
- {"password", "NaCl", 9, 3, 4},
+// {"password", "NaCl", 9, 3, 4},
{0}
};
curl_easy_setopt(curl, CURLOPT_URL, url);
if (opt_cert)
curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
+ curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
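+	/* note: this disables verification of the server's TLS certificate */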
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);