1 /**********************************************************************
2 * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille *
3 * Distributed under the MIT software license, see the accompanying *
4 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
5 **********************************************************************/
9 * - March 2013, Diederik Huys: original version
10 * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
11 * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
14 #ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
15 #define _SECP256K1_FIELD_INNER5X52_IMPL_H_
/**
 * secp256k1_fe_mul_inner: 5x52-limb field multiplication, writing the
 * product limbs of a * b to r[0..4] (x86_64 inline assembly version of
 * Peter Dettman's parallel multiplication algorithm; see file header).
 *
 * Conventions visible in this routine:
 *  - a is bound to %rsi (the "+S" output constraint below) and its five
 *    limbs are loaded into %r10..%r14; result limbs are stored to
 *    0..32(%rdi), i.e. r[0..4].
 *  - b is read through %rbx; the operand constraint line binding b to
 *    %rbx is not visible in this excerpt -- TODO confirm against upstream.
 *  - $0xfffffffffffff is the 52-bit limb mask M; $0x1000003d10 and
 *    $0x1000003d1 are the reduction constants R and R >> 4 referenced by
 *    the inline "d += (c & M) * R" / "c += u0 * (R >> 4)" comments.
 *  - tmp1/tmp2/tmp3 are memory spill slots holding t3, t4 and tx
 *    (see the "=m" output constraints and the inline comments).
 *
 * NOTE(review): this view of the file is a sampled excerpt -- the
 * mulq/addq/adcq chains between the loads shown here are not visible,
 * so only the visible lines are annotated.
 */
17 SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
19 * Registers: rdx:rax = multiplication accumulator
27 uint64_t tmp1, tmp2, tmp3;
/* Load the five limbs a0..a4 from (%rsi) into %r10..%r14. */
29 "movq 0(%%rsi),%%r10\n"
30 "movq 8(%%rsi),%%r11\n"
31 "movq 16(%%rsi),%%r12\n"
32 "movq 24(%%rsi),%%r13\n"
33 "movq 32(%%rsi),%%r14\n"
/* First output column: fetch b0..b4 in turn into %rax (the multiply/add
 * instructions between these loads are not visible in this excerpt). */
36 "movq 0(%%rbx),%%rax\n"
41 "movq 8(%%rbx),%%rax\n"
46 "movq 16(%%rbx),%%rax\n"
51 "movq 24(%%rbx),%%rax\n"
56 "movq 32(%%rbx),%%rax\n"
/* Fold the low 52 bits of c back into d using the reduction constant R. */
60 /* d += (c & M) * R */
61 "movq $0xfffffffffffff,%%rdx\n"
63 "movq $0x1000003d10,%%rdx\n"
67 /* c >>= 52 (%%r8 only) */
68 "shrdq $52,%%r9,%%r8\n"
69 /* t3 (tmp1) = d & M */
71 "movq $0xfffffffffffff,%%rdx\n"
/* d >>= 52: d is held in %rcx (low) : %r15 (high) in this routine. */
75 "shrdq $52,%%r15,%%rcx\n"
/* Next output column: fetch b0..b4 again. */
78 "movq 0(%%rbx),%%rax\n"
83 "movq 8(%%rbx),%%rax\n"
88 "movq 16(%%rbx),%%rax\n"
93 "movq 24(%%rbx),%%rax\n"
98 "movq 32(%%rbx),%%rax\n"
104 "movq $0x1000003d10,%%rdx\n"
108 /* t4 = d & M (%%rsi) */
110 "movq $0xfffffffffffff,%%rdx\n"
113 "shrdq $52,%%r15,%%rcx\n"
115 /* tx = t4 >> 48 (tmp3) */
/* 0xffffffffffff is M >> 4, a 48-bit mask (top limb is only 48 bits). */
119 /* t4 &= (M >> 4) (tmp2) */
120 "movq $0xffffffffffff,%%rax\n"
/* Next output column: fetch b0..b4 again. */
124 "movq 0(%%rbx),%%rax\n"
129 "movq 8(%%rbx),%%rax\n"
134 "movq 16(%%rbx),%%rax\n"
139 "movq 24(%%rbx),%%rax\n"
144 "movq 32(%%rbx),%%rax\n"
148 /* u0 = d & M (%%rsi) */
150 "movq $0xfffffffffffff,%%rdx\n"
153 "shrdq $52,%%r15,%%rcx\n"
155 /* u0 = (u0 << 4) | tx (%%rsi) */
/* 0x1000003d1 = R >> 4 compensates for the 4-bit shift folded into u0. */
159 /* c += u0 * (R >> 4) */
160 "movq $0x1000003d1,%%rax\n"
/* Store result limb 0. */
166 "movq $0xfffffffffffff,%%rdx\n"
168 "movq %%rax,0(%%rdi)\n"
170 "shrdq $52,%%r9,%%r8\n"
/* Next output column: fetch b0..b4 again. */
173 "movq 0(%%rbx),%%rax\n"
178 "movq 8(%%rbx),%%rax\n"
183 "movq 16(%%rbx),%%rax\n"
188 "movq 24(%%rbx),%%rax\n"
193 "movq 32(%%rbx),%%rax\n"
197 /* c += (d & M) * R */
199 "movq $0xfffffffffffff,%%rdx\n"
201 "movq $0x1000003d10,%%rdx\n"
206 "shrdq $52,%%r15,%%rcx\n"
/* Store result limb 1. */
210 "movq $0xfffffffffffff,%%rdx\n"
212 "movq %%rax,8(%%rdi)\n"
214 "shrdq $52,%%r9,%%r8\n"
/* Final columns: b limbs fetched once more; %r10 (= a0) is dead after
 * its last product and is reused to hold t3 (see comment below). */
217 "movq 0(%%rbx),%%rax\n"
222 "movq 8(%%rbx),%%rax\n"
226 /* c += a0 * b2 (last use of %%r10 = a0) */
227 "movq 16(%%rbx),%%rax\n"
231 /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
235 "movq 24(%%rbx),%%rax\n"
240 "movq 32(%%rbx),%%rax\n"
244 /* c += (d & M) * R */
246 "movq $0xfffffffffffff,%%rdx\n"
248 "movq $0x1000003d10,%%rdx\n"
252 /* d >>= 52 (%%rcx only) */
253 "shrdq $52,%%r15,%%rcx\n"
/* Store result limb 2. */
256 "movq $0xfffffffffffff,%%rdx\n"
258 "movq %%rax,16(%%rdi)\n"
260 "shrdq $52,%%r9,%%r8\n"
266 "movq $0x1000003d10,%%rdx\n"
/* Store result limb 3. */
272 "movq $0xfffffffffffff,%%rdx\n"
274 "movq %%rax,24(%%rdi)\n"
275 /* c >>= 52 (%%r8 only) */
276 "shrdq $52,%%r9,%%r8\n"
277 /* c += t4 (%%r8 only) */
/* Store result limb 4 (at most 48+ bits; no further reduction here). */
280 "movq %%r8,32(%%rdi)\n"
/* Outputs: a is consumed in %rsi ("+S"); tmp1..tmp3 are spill slots. */
281 : "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
/* NOTE(review): %rbx is absent from this clobber list, consistent with b
 * being an input operand bound to %rbx on a line not visible here. */
283 : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
/**
 * secp256k1_fe_sqr_inner: 5x52-limb field squaring, writing the limbs of
 * a * a to r[0..4] (x86_64 inline assembly; same mask/reduction constants
 * as secp256k1_fe_mul_inner above).
 *
 * Differences from the mul routine that are visible here:
 *  - the 52-bit mask M = 0xfffffffffffff is cached in %r15 for the whole
 *    routine instead of being reloaded into %rdx at each use;
 *  - the accumulator d is kept in %rbx/%rcx (see "d >>= 52 (%%rbx only)"
 *    and the shrdq $52,%%rcx,%%rbx shifts), which is why %rbx appears in
 *    this clobber list;
 *  - off-diagonal products are doubled up front with
 *    "leaq (reg,reg,1),%%rax" (i.e. rax = 2*limb), exploiting symmetry
 *    of the square (see the "(a0*2) * a3" style comments).
 *
 * a is bound to %rsi ("+S") and loaded into %r10..%r14; tmp1..tmp3 are
 * memory spill slots for t3, t4 and tx.
 *
 * NOTE(review): sampled excerpt -- most mulq/addq/adcq lines between the
 * visible instructions are missing; only visible lines are annotated.
 */
287 SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
289 * Registers: rdx:rax = multiplication accumulator
293 * r15 = M (0xfffffffffffff)
297 uint64_t tmp1, tmp2, tmp3;
298 __asm__ __volatile__(
/* Load the five limbs a0..a4 from (%rsi) into %r10..%r14. */
299 "movq 0(%%rsi),%%r10\n"
300 "movq 8(%%rsi),%%r11\n"
301 "movq 16(%%rsi),%%r12\n"
302 "movq 24(%%rsi),%%r13\n"
303 "movq 32(%%rsi),%%r14\n"
/* Keep the 52-bit limb mask resident in %r15 for the whole routine. */
304 "movq $0xfffffffffffff,%%r15\n"
306 /* d = (a0*2) * a3 */
307 "leaq (%%r10,%%r10,1),%%rax\n"
311 /* d += (a1*2) * a2 */
312 "leaq (%%r11,%%r11,1),%%rax\n"
/* Fold the low 52 bits of c back into d via R = 0x1000003d10. */
321 /* d += (c & M) * R */
323 "movq $0x1000003d10,%%rdx\n"
327 /* c >>= 52 (%%r8 only) */
328 "shrdq $52,%%r9,%%r8\n"
329 /* t3 (tmp1) = d & M */
/* d >>= 52: d is held in %rbx (low) : %rcx (high) in this routine. */
334 "shrdq $52,%%rcx,%%rbx\n"
343 /* d+= (a1*2) * a3 */
344 "leaq (%%r11,%%r11,1),%%rax\n"
355 "movq $0x1000003d10,%%rdx\n"
359 /* t4 = d & M (%%rsi) */
363 "shrdq $52,%%rcx,%%rbx\n"
365 /* tx = t4 >> 48 (tmp3) */
/* 0xffffffffffff is M >> 4, a 48-bit mask (top limb is only 48 bits). */
369 /* t4 &= (M >> 4) (tmp2) */
370 "movq $0xffffffffffff,%%rax\n"
383 /* d += (a2*2) * a3 */
384 "leaq (%%r12,%%r12,1),%%rax\n"
388 /* u0 = d & M (%%rsi) */
392 "shrdq $52,%%rcx,%%rbx\n"
394 /* u0 = (u0 << 4) | tx (%%rsi) */
/* 0x1000003d1 = R >> 4 compensates for the 4-bit shift folded into u0. */
398 /* c += u0 * (R >> 4) */
399 "movq $0x1000003d1,%%rax\n"
/* Store result limb 0. */
406 "movq %%rax,0(%%rdi)\n"
408 "shrdq $52,%%r9,%%r8\n"
427 /* c += (d & M) * R */
430 "movq $0x1000003d10,%%rdx\n"
435 "shrdq $52,%%rcx,%%rbx\n"
/* Store result limb 1. */
440 "movq %%rax,8(%%rdi)\n"
442 "shrdq $52,%%r9,%%r8\n"
/* %r10 (= a0) is dead after its last product and is reused for t3. */
444 /* c += a0 * a2 (last use of %%r10) */
449 /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
462 /* c += (d & M) * R */
465 "movq $0x1000003d10,%%rdx\n"
469 /* d >>= 52 (%%rbx only) */
470 "shrdq $52,%%rcx,%%rbx\n"
/* Store result limb 2. */
474 "movq %%rax,16(%%rdi)\n"
476 "shrdq $52,%%r9,%%r8\n"
482 "movq $0x1000003d10,%%rdx\n"
/* Store result limb 3. */
489 "movq %%rax,24(%%rdi)\n"
490 /* c >>= 52 (%%r8 only) */
491 "shrdq $52,%%r9,%%r8\n"
492 /* c += t4 (%%r8 only) */
/* Store result limb 4 (at most 48+ bits; no further reduction here). */
495 "movq %%r8,32(%%rdi)\n"
/* Outputs: a is consumed in %rsi ("+S"); tmp1..tmp3 are spill slots. */
496 : "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
/* %rbx is clobbered here (d accumulator), unlike in the mul routine. */
498 : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"