/* x64 */
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_SSSE3

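/*
	scrypt ChunkMix using the Salsa64/8 core on 128-byte blocks; mirrors the
	intrinsic version further down in this file. Bxor may be NULL. Argument
	registers as used below: rdi = Bout, rsi = Bin, rdx = Bxor, rcx = r
	(System V AMD64 convention).
*/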
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_ssse3)
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)
	a2(sub rsp, 128)
	a2(lea rcx,[rcx*2])
	a2(shl rcx,7)
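	/* rcx = chunk size in bytes: 2*r*128 */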
	a2(lea r9,[rcx-128])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	a2(and rdx, rdx)
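	/* load X = last 128-byte block of Bin; ZF from the test of rdx above lets
	   the jz below skip the optional XOR with the last block of Bxor */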
	a2(movdqa xmm0,[rax+0])
	a2(movdqa xmm1,[rax+16])
	a2(movdqa xmm2,[rax+32])
	a2(movdqa xmm3,[rax+48])
	a2(movdqa xmm4,[rax+64])
	a2(movdqa xmm5,[rax+80])
	a2(movdqa xmm6,[rax+96])
	a2(movdqa xmm7,[rax+112])
	a1(jz scrypt_ChunkMix_ssse3_no_xor1)
	a2(pxor xmm0,[r9+0])
	a2(pxor xmm1,[r9+16])
	a2(pxor xmm2,[r9+32])
	a2(pxor xmm3,[r9+48])
	a2(pxor xmm4,[r9+64])
	a2(pxor xmm5,[r9+80])
	a2(pxor xmm6,[r9+96])
	a2(pxor xmm7,[r9+112])
	a1(scrypt_ChunkMix_ssse3_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
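	/* r9 = byte offset of the current input block; r8 toggles between 0 and the
	   chunk size so even blocks land in the first half of Bout, odd in the second */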
	a1(scrypt_ChunkMix_ssse3_loop:)
	a2(and rdx, rdx)
	a2(pxor xmm0,[rsi+r9+0])
	a2(pxor xmm1,[rsi+r9+16])
	a2(pxor xmm2,[rsi+r9+32])
	a2(pxor xmm3,[rsi+r9+48])
	a2(pxor xmm4,[rsi+r9+64])
	a2(pxor xmm5,[rsi+r9+80])
	a2(pxor xmm6,[rsi+r9+96])
	a2(pxor xmm7,[rsi+r9+112])
	a1(jz scrypt_ChunkMix_ssse3_no_xor2)
	a2(pxor xmm0,[rdx+r9+0])
	a2(pxor xmm1,[rdx+r9+16])
	a2(pxor xmm2,[rdx+r9+32])
	a2(pxor xmm3,[rdx+r9+48])
	a2(pxor xmm4,[rdx+r9+64])
	a2(pxor xmm5,[rdx+r9+80])
	a2(pxor xmm6,[rdx+r9+96])
	a2(pxor xmm7,[rdx+r9+112])
	a1(scrypt_ChunkMix_ssse3_no_xor2:)
	a2(movdqa [rsp+0],xmm0)
	a2(movdqa [rsp+16],xmm1)
	a2(movdqa [rsp+32],xmm2)
	a2(movdqa [rsp+48],xmm3)
	a2(movdqa [rsp+64],xmm4)
	a2(movdqa [rsp+80],xmm5)
	a2(movdqa [rsp+96],xmm6)
	a2(movdqa [rsp+112],xmm7)
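	/* the current block X (after the XORs) is saved on the stack for the
	   feed-forward add once the 8 Salsa64 rounds below are done */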
	a2(mov rax,8)
	a1(scrypt_salsa64_ssse3_loop: )
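	/* two Salsa64 rounds per pass; 64-bit lane rotations are 32 (pshufd dword
	   swap), 13, 39, 32 */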
	a2(movdqa xmm8, xmm0)
	a2(movdqa xmm9, xmm1)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm6, xmm8)
	a2(pxor xmm7, xmm9)
	a2(movdqa xmm10, xmm0)
	a2(movdqa xmm11, xmm1)
	a2(paddq xmm10, xmm6)
	a2(paddq xmm11, xmm7)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 51)
	a2(psrlq xmm11, 51)
	a2(psllq xmm8, 13)
	a2(psllq xmm9, 13)
	a2(pxor xmm4, xmm10)
	a2(pxor xmm5, xmm11)
	a2(pxor xmm4, xmm8)
	a2(pxor xmm5, xmm9)
	a2(movdqa xmm10, xmm6)
	a2(movdqa xmm11, xmm7)
	a2(paddq xmm10, xmm4)
	a2(paddq xmm11, xmm5)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 25)
	a2(psrlq xmm11, 25)
	a2(psllq xmm8, 39)
	a2(psllq xmm9, 39)
	a2(pxor xmm2, xmm10)
	a2(pxor xmm3, xmm11)
	a2(pxor xmm2, xmm8)
	a2(pxor xmm3, xmm9)
	a2(movdqa xmm8, xmm4)
	a2(movdqa xmm9, xmm5)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm0, xmm8)
	a2(pxor xmm1, xmm9)
	a2(movdqa xmm10, xmm2)
	a2(movdqa xmm11, xmm3)
	a2(movdqa xmm2, xmm6)
	a2(movdqa xmm3, xmm7)
	a3(palignr xmm2, xmm7, 8)
	a3(palignr xmm3, xmm6, 8)
	a2(movdqa xmm6, xmm11)
	a2(movdqa xmm7, xmm10)
	a3(palignr xmm6, xmm10, 8)
	a3(palignr xmm7, xmm11, 8)
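	/* second half-round: same pattern on the re-paired lanes, with the xmm4/xmm5
	   roles swapped (see the intrinsic version below) */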
	a2(sub rax, 2)
	a2(movdqa xmm8, xmm0)
	a2(movdqa xmm9, xmm1)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm6, xmm8)
	a2(pxor xmm7, xmm9)
	a2(movdqa xmm10, xmm0)
	a2(movdqa xmm11, xmm1)
	a2(paddq xmm10, xmm6)
	a2(paddq xmm11, xmm7)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 51)
	a2(psrlq xmm11, 51)
	a2(psllq xmm8, 13)
	a2(psllq xmm9, 13)
	a2(pxor xmm5, xmm10)
	a2(pxor xmm4, xmm11)
	a2(pxor xmm5, xmm8)
	a2(pxor xmm4, xmm9)
	a2(movdqa xmm10, xmm6)
	a2(movdqa xmm11, xmm7)
	a2(paddq xmm10, xmm5)
	a2(paddq xmm11, xmm4)
	a2(movdqa xmm8, xmm10)
	a2(movdqa xmm9, xmm11)
	a2(psrlq xmm10, 25)
	a2(psrlq xmm11, 25)
	a2(psllq xmm8, 39)
	a2(psllq xmm9, 39)
	a2(pxor xmm2, xmm10)
	a2(pxor xmm3, xmm11)
	a2(pxor xmm2, xmm8)
	a2(pxor xmm3, xmm9)
	a2(movdqa xmm8, xmm5)
	a2(movdqa xmm9, xmm4)
	a2(paddq xmm8, xmm2)
	a2(paddq xmm9, xmm3)
	a3(pshufd xmm8, xmm8, 0xb1)
	a3(pshufd xmm9, xmm9, 0xb1)
	a2(pxor xmm0, xmm8)
	a2(pxor xmm1, xmm9)
	a2(movdqa xmm10, xmm2)
	a2(movdqa xmm11, xmm3)
	a2(movdqa xmm2, xmm6)
	a2(movdqa xmm3, xmm7)
	a3(palignr xmm2, xmm7, 8)
	a3(palignr xmm3, xmm6, 8)
	a2(movdqa xmm6, xmm11)
	a2(movdqa xmm7, xmm10)
	a3(palignr xmm6, xmm10, 8)
	a3(palignr xmm7, xmm11, 8)
	a1(ja scrypt_salsa64_ssse3_loop)
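	/* Salsa64/8 done: feed-forward add of the block saved on the stack */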
	a2(paddq xmm0,[rsp+0])
	a2(paddq xmm1,[rsp+16])
	a2(paddq xmm2,[rsp+32])
	a2(paddq xmm3,[rsp+48])
	a2(paddq xmm4,[rsp+64])
	a2(paddq xmm5,[rsp+80])
	a2(paddq xmm6,[rsp+96])
	a2(paddq xmm7,[rsp+112])
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0xff)
	a2(add r9,128)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
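	/* rax = Bout + (i/2)*128 + (i odd ? chunkBytes/2 : 0): even blocks go to the
	   first half of the output chunk, odd blocks to the second half */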
	a2(movdqa [rax+0],xmm0)
	a2(movdqa [rax+16],xmm1)
	a2(movdqa [rax+32],xmm2)
	a2(movdqa [rax+48],xmm3)
	a2(movdqa [rax+64],xmm4)
	a2(movdqa [rax+80],xmm5)
	a2(movdqa [rax+96],xmm6)
	a2(movdqa [rax+112],xmm7)
	a1(jne scrypt_ChunkMix_ssse3_loop)
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)

#endif

/* intrinsic */
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3)

#define SCRYPT_SALSA64_SSSE3

static void asm_calling_convention
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];
	x4 = xmmp[4];
	x5 = xmmp[5];
	x6 = xmmp[6];
	x7 = xmmp[7];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
			x4 = _mm_xor_si128(x4, xmmp[4]);
			x5 = _mm_xor_si128(x5, xmmp[5]);
			x6 = _mm_xor_si128(x6, xmmp[6]);
			x7 = _mm_xor_si128(x7, xmmp[7]);
		}

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;
		t4 = x4;
		t5 = x5;
		t6 = x6;
		t7 = x7;

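		/* Salsa64/8: 8 rounds, two per pass; per-lane rotations are 32, 13, 39, 32 */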
		for (rounds = 8; rounds; rounds -= 2) {
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x4 = _mm_xor_si128(x4, z2);
			x5 = _mm_xor_si128(x5, z3);
			x4 = _mm_xor_si128(x4, z0);
			x5 = _mm_xor_si128(x5, z1);

			z0 = _mm_add_epi64(x4, x6);
			z1 = _mm_add_epi64(x5, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x4);
			z1 = _mm_add_epi64(x3, x5);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);
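			/* _mm_alignr_epi8 re-pairs the 64-bit lanes across registers so the
			   second half-round mixes the other grouping of the state */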

			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x5 = _mm_xor_si128(x5, z2);
			x4 = _mm_xor_si128(x4, z3);
			x5 = _mm_xor_si128(x5, z0);
			x4 = _mm_xor_si128(x4, z1);

			z0 = _mm_add_epi64(x5, x6);
			z1 = _mm_add_epi64(x4, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x5);
			z1 = _mm_add_epi64(x3, x4);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);
		}

		x0 = _mm_add_epi64(x0, t0);
		x1 = _mm_add_epi64(x1, t1);
		x2 = _mm_add_epi64(x2, t2);
		x3 = _mm_add_epi64(x3, t3);
		x4 = _mm_add_epi64(x4, t4);
		x5 = _mm_add_epi64(x5, t5);
		x6 = _mm_add_epi64(x6, t6);
		x7 = _mm_add_epi64(x7, t7);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
		xmmp[4] = x4;
		xmmp[5] = x5;
		xmmp[6] = x6;
		xmmp[7] = x7;
	}
}

#endif

#if defined(SCRYPT_SALSA64_SSSE3)
	/* uses salsa64_core_tangle_sse2 */

	#undef SCRYPT_MIX
	#define SCRYPT_MIX "Salsa64/8-SSSE3"
	#undef SCRYPT_SALSA64_INCLUDED
	#define SCRYPT_SALSA64_INCLUDED
#endif