/* x64 */
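/*
	scrypt-jane ChunkMix (the scrypt BlockMix step) for the Salsa64/8 mix
	function, SSSE3 versions: a hand-written x86-64 assembly routine and an
	intrinsic fallback, each compiled in only when the corresponding
	X86_64ASM_SSSE3 / X86_INTRINSIC_SSSE3 macro is defined and a Salsa64
	implementation has not already been selected.
*/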
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_SSSE3

asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
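/* the register comments below assume the System V AMD64 calling convention
   (rdi = Bout, rsi = Bin, rdx = Bxor, rcx = r); the prologue turns rcx into
   the chunk size in bytes (r * 2 blocks of 128 bytes) */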
asm_naked_fn(scrypt_ChunkMix_ssse3)
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)
	a2(sub rsp, 128)
	a2(lea rcx,[rcx*2])
	a2(shl rcx,7)
	a2(lea r9,[rcx-128])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	a2(and rdx, rdx)
	a2(movdqa xmm0,[rax+0])
	a2(movdqa xmm1,[rax+16])
	a2(movdqa xmm2,[rax+32])
	a2(movdqa xmm3,[rax+48])
	a2(movdqa xmm4,[rax+64])
	a2(movdqa xmm5,[rax+80])
	a2(movdqa xmm6,[rax+96])
	a2(movdqa xmm7,[rax+112])
	a1(jz scrypt_ChunkMix_ssse3_no_xor1)
	a2(pxor xmm0,[r9+0])
	a2(pxor xmm1,[r9+16])
	a2(pxor xmm2,[r9+32])
	a2(pxor xmm3,[r9+48])
	a2(pxor xmm4,[r9+64])
	a2(pxor xmm5,[r9+80])
	a2(pxor xmm6,[r9+96])
	a2(pxor xmm7,[r9+112])
	a1(scrypt_ChunkMix_ssse3_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
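	/* r9 = byte offset of the current 128-byte block; r8 toggles between 0
	   and the chunk size so even-numbered blocks land in the first half of
	   Bout and odd-numbered blocks in the second half */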
	a1(scrypt_ChunkMix_ssse3_loop:)
	a2(and rdx, rdx)
	a2(pxor xmm0,[rsi+r9+0])
	a2(pxor xmm1,[rsi+r9+16])
	a2(pxor xmm2,[rsi+r9+32])
	a2(pxor xmm3,[rsi+r9+48])
	a2(pxor xmm4,[rsi+r9+64])
	a2(pxor xmm5,[rsi+r9+80])
	a2(pxor xmm6,[rsi+r9+96])
	a2(pxor xmm7,[rsi+r9+112])
	a1(jz scrypt_ChunkMix_ssse3_no_xor2)
	a2(pxor xmm0,[rdx+r9+0])
	a2(pxor xmm1,[rdx+r9+16])
	a2(pxor xmm2,[rdx+r9+32])
	a2(pxor xmm3,[rdx+r9+48])
	a2(pxor xmm4,[rdx+r9+64])
	a2(pxor xmm5,[rdx+r9+80])
	a2(pxor xmm6,[rdx+r9+96])
	a2(pxor xmm7,[rdx+r9+112])
	a1(scrypt_ChunkMix_ssse3_no_xor2:)
	a2(movdqa [rsp+0],xmm0)
	a2(movdqa [rsp+16],xmm1)
	a2(movdqa [rsp+32],xmm2)
	a2(movdqa [rsp+48],xmm3)
	a2(movdqa [rsp+64],xmm4)
	a2(movdqa [rsp+80],xmm5)
	a2(movdqa [rsp+96],xmm6)
	a2(movdqa [rsp+112],xmm7)
	a2(mov rax,8)
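	/* 8 rounds of the Salsa64 core, two per pass: pshufd 0xb1 swaps the
	   32-bit halves of each 64-bit lane (a rotate by 32), and each
	   psllq/psrlq pair implements a 64-bit rotate by 13 or by 39 */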
	a1(scrypt_salsa64_ssse3_loop: )
	a2(movdqa xmm8, xmm0)
	a2(movdqa xmm9, xmm1)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm6, xmm8)
	a2(pxor xmm7, xmm9)
	a2(movdqa xmm10, xmm0)
	a2(movdqa xmm11, xmm1)
	a2(paddq xmm10, xmm6)
	a2(paddq xmm11, xmm7)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 51)
	a2(psrlq xmm11, 51)
	a2(psllq xmm8, 13)
	a2(psllq xmm9, 13)
	a2(pxor xmm4, xmm10)
	a2(pxor xmm5, xmm11)
	a2(pxor xmm4, xmm8)
	a2(pxor xmm5, xmm9)
	a2(movdqa xmm10, xmm6)
	a2(movdqa xmm11, xmm7)
	a2(paddq xmm10, xmm4)
	a2(paddq xmm11, xmm5)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 25)
	a2(psrlq xmm11, 25)
	a2(psllq xmm8, 39)
	a2(psllq xmm9, 39)
	a2(pxor xmm2, xmm10)
	a2(pxor xmm3, xmm11)
	a2(pxor xmm2, xmm8)
	a2(pxor xmm3, xmm9)
	a2(movdqa xmm8, xmm4)
	a2(movdqa xmm9, xmm5)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm0, xmm8)
	a2(pxor xmm1, xmm9)
	a2(movdqa xmm10, xmm2)
	a2(movdqa xmm11, xmm3)
	a2(movdqa xmm2, xmm6)
	a2(movdqa xmm3, xmm7)
	a3(palignr xmm2, xmm7, 8)
	a3(palignr xmm3, xmm6, 8)
	a2(movdqa xmm6, xmm11)
	a2(movdqa xmm7, xmm10)
	a3(palignr xmm6, xmm10, 8)
	a3(palignr xmm7, xmm11, 8)
	a2(sub rax, 2)
	a2(movdqa xmm8, xmm0)
	a2(movdqa xmm9, xmm1)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm6, xmm8)
	a2(pxor xmm7, xmm9)
	a2(movdqa xmm10, xmm0)
	a2(movdqa xmm11, xmm1)
	a2(paddq xmm10, xmm6)
	a2(paddq xmm11, xmm7)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 51)
	a2(psrlq xmm11, 51)
	a2(psllq xmm8, 13)
	a2(psllq xmm9, 13)
	a2(pxor xmm5, xmm10)
	a2(pxor xmm4, xmm11)
	a2(pxor xmm5, xmm8)
	a2(pxor xmm4, xmm9)
	a2(movdqa xmm10, xmm6)
	a2(movdqa xmm11, xmm7)
	a2(paddq xmm10, xmm5)
	a2(paddq xmm11, xmm4)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 25)
	a2(psrlq xmm11, 25)
	a2(psllq xmm8, 39)
	a2(psllq xmm9, 39)
	a2(pxor xmm2, xmm10)
	a2(pxor xmm3, xmm11)
	a2(pxor xmm2, xmm8)
	a2(pxor xmm3, xmm9)
	a2(movdqa xmm8, xmm5)
	a2(movdqa xmm9, xmm4)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm0, xmm8)
	a2(pxor xmm1, xmm9)
	a2(movdqa xmm10, xmm2)
	a2(movdqa xmm11, xmm3)
	a2(movdqa xmm2, xmm6)
	a2(movdqa xmm3, xmm7)
	a3(palignr xmm2, xmm7, 8)
	a3(palignr xmm3, xmm6, 8)
	a2(movdqa xmm6, xmm11)
	a2(movdqa xmm7, xmm10)
	a3(palignr xmm6, xmm10, 8)
	a3(palignr xmm7, xmm11, 8)
	a1(ja scrypt_salsa64_ssse3_loop)
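	/* feed-forward: add the block state saved to the stack before the rounds */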
	a2(paddq xmm0,[rsp+0])
	a2(paddq xmm1,[rsp+16])
	a2(paddq xmm2,[rsp+32])
	a2(paddq xmm3,[rsp+48])
	a2(paddq xmm4,[rsp+64])
	a2(paddq xmm5,[rsp+80])
	a2(paddq xmm6,[rsp+96])
	a2(paddq xmm7,[rsp+112])
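	/* interleave the output: with i = r9/128, the store address works out to
	   Bout + (((r8 + i*128) & ~255) >> 1), i.e. block i/2 for even i and
	   block r + i/2 for odd i */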
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0xff)
	a2(add r9,128)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(movdqa [rax+0],xmm0)
	a2(movdqa [rax+16],xmm1)
	a2(movdqa [rax+32],xmm2)
	a2(movdqa [rax+48],xmm3)
	a2(movdqa [rax+64],xmm4)
	a2(movdqa [rax+80],xmm5)
	a2(movdqa [rax+96],xmm6)
	a2(movdqa [rax+112],xmm7)
	a1(jne scrypt_ChunkMix_ssse3_loop)
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3)

#define SCRYPT_SALSA64_SSSE3

static void asm_calling_convention
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
	size_t rounds;
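	/* half alternates between 0 and r so that block i of the result is
	   written to Bout block i/2 for even i and block r + i/2 for odd i */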

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];
	x4 = xmmp[4];
	x5 = xmmp[5];
	x6 = xmmp[6];
	x7 = xmmp[7];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
			x4 = _mm_xor_si128(x4, xmmp[4]);
			x5 = _mm_xor_si128(x5, xmmp[5]);
			x6 = _mm_xor_si128(x6, xmmp[6]);
			x7 = _mm_xor_si128(x7, xmmp[7]);
		}

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;
		t4 = x4;
		t5 = x5;
		t6 = x6;
		t7 = x7;

		for (rounds = 8; rounds; rounds -= 2) {
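			/* two Salsa64 rounds per pass; the epi32 shuffles rotate each
			   64-bit lane by 32, the shift pairs rotate by 13 and 39 */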
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x4 = _mm_xor_si128(x4, z2);
			x5 = _mm_xor_si128(x5, z3);
			x4 = _mm_xor_si128(x4, z0);
			x5 = _mm_xor_si128(x5, z1);

			z0 = _mm_add_epi64(x4, x6);
			z1 = _mm_add_epi64(x5, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x4);
			z1 = _mm_add_epi64(x3, x5);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);

			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x5 = _mm_xor_si128(x5, z2);
			x4 = _mm_xor_si128(x4, z3);
			x5 = _mm_xor_si128(x5, z0);
			x4 = _mm_xor_si128(x4, z1);

			z0 = _mm_add_epi64(x5, x6);
			z1 = _mm_add_epi64(x4, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x5);
			z1 = _mm_add_epi64(x3, x4);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);
		}

		x0 = _mm_add_epi64(x0, t0);
		x1 = _mm_add_epi64(x1, t1);
		x2 = _mm_add_epi64(x2, t2);
		x3 = _mm_add_epi64(x3, t3);
		x4 = _mm_add_epi64(x4, t4);
		x5 = _mm_add_epi64(x5, t5);
		x6 = _mm_add_epi64(x6, t6);
		x7 = _mm_add_epi64(x7, t7);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
		xmmp[4] = x4;
		xmmp[5] = x5;
		xmmp[6] = x6;
		xmmp[7] = x7;
	}
}

#endif
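
/*
	Usage sketch (illustrative only; the real call sites live in the
	scrypt-jane ROMix code, and the names X, Y and Vj below are hypothetical):

		scrypt_ChunkMix_ssse3(Y, X, NULL, r);    Y = BlockMix(X)
		scrypt_ChunkMix_ssse3(Y, X, Vj, r);      Y = BlockMix(X ^ V[j])

	Each pointer refers to a chunk of 2*r blocks of 128 bytes (2*r*16
	uint64_t), and the chunks must be 16-byte aligned to satisfy the
	movdqa / aligned xmmi accesses above.
*/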

#if defined(SCRYPT_SALSA64_SSSE3)
	/* uses salsa64_core_tangle_sse2 */

	#undef SCRYPT_MIX
	#define SCRYPT_MIX "Salsa64/8-SSSE3"
	#undef SCRYPT_SALSA64_INCLUDED
	#define SCRYPT_SALSA64_INCLUDED
#endif