/**********************************************************************
 * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille               *
 * Distributed under the MIT software license, see the accompanying   *
 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
 **********************************************************************/

/**
 * Changelog:
 * - March 2013, Diederik Huys:    original version
 * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
 * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
 */

#ifndef SECP256K1_FIELD_INNER5X52_IMPL_H
#define SECP256K1_FIELD_INNER5X52_IMPL_H
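/* Throughout this file the field is represented in 5x52 form: five 64-bit
 * limbs of 52 bits each, masked by M = 0xfffffffffffff. The secp256k1 prime
 * is p = 2^256 - 0x1000003d1, so 2^256 == 0x1000003d1 (mod p) and hence
 * 2^260 == 0x1000003d10 (mod p). The constant R = 0x1000003d10 therefore
 * lets the "d += (c & M) * R" steps fold a carry above the fifth limb back
 * into the low limbs; 0x1000003d1 (R >> 4) appears where the folded value
 * already carries a factor 16 (u0 is built as "(d & M) << 4 | tx"). Below is
 * a minimal portable-C sketch of one such fold, for illustration only (not
 * compiled, not part of the library; it assumes GCC's unsigned __int128):
 */
#if 0
static void fold_sketch(unsigned __int128 *c, unsigned __int128 *d) {
    const uint64_t M = 0xfffffffffffffULL; /* 52-bit limb mask */
    const uint64_t R = 0x1000003d10ULL;    /* 2^260 mod p */
    /* Mirrors the "d += (c & M) * R" / "c >>= 52" pattern in the asm. */
    *d += (unsigned __int128)((uint64_t)*c & M) * R;
    *c >>= 52;
}
#endif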
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
/**
 * Registers: rdx:rax = multiplication accumulator
 *            r9:r8   = c
 *            r15:rcx = d
 *            r10-r14 = a0-a4
 *            rbx     = b
 *            rdi     = r
 *            rsi     = a / t?
 */
  uint64_t tmp1, tmp2, tmp3;
__asm__ __volatile__(
    "movq 0(%%rsi),%%r10\n"
    "movq 8(%%rsi),%%r11\n"
    "movq 16(%%rsi),%%r12\n"
    "movq 24(%%rsi),%%r13\n"
    "movq 32(%%rsi),%%r14\n"

    /* d = a3 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "movq %%rax,%%rcx\n"
    "movq %%rdx,%%r15\n"
    /* d += a2 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a0 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c = a4 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += (c & M) * R */
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* t3 (tmp1) = d & M */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    "movq %%rsi,%q1\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* d += a4 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a0 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += c * R */
    "movq %%r8,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* t4 = d & M (%%rsi) */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* tx = t4 >> 48 (tmp3) */
    "movq %%rsi,%%rax\n"
    "shrq $48,%%rax\n"
    "movq %%rax,%q3\n"
    /* t4 &= (M >> 4) (tmp2) */
    "movq $0xffffffffffff,%%rax\n"
    "andq %%rax,%%rsi\n"
    "movq %%rsi,%q2\n"
    /* c = a0 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += a4 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* u0 = d & M (%%rsi) */
    "movq %%rcx,%%rsi\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* u0 = (u0 << 4) | tx (%%rsi) */
    "shlq $4,%%rsi\n"
    "movq %q3,%%rax\n"
    "orq %%rax,%%rsi\n"
    /* c += u0 * (R >> 4) */
    "movq $0x1000003d1,%%rax\n"
    "mulq %%rsi\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[0] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,0(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a1 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a0 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a4 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c += (d & M) * R */
    "movq %%rcx,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* r[1] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,8(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a2 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a1 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a0 * b2 (last use of %%r10 = a0) */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
    "movq %q2,%%rsi\n"
    "movq %q1,%%r10\n"
    /* d += a4 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c += (d & M) * R */
    "movq %%rcx,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 (%%rcx only) */
    "shrdq $52,%%r15,%%rcx\n"
    /* r[2] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,16(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += t3 */
    "addq %%r10,%%r8\n"
    /* c += d * R */
    "movq %%rcx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[3] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,24(%%rdi)\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* c += t4 (%%r8 only) */
    "addq %%rsi,%%r8\n"
    /* r[4] = c */
    "movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
: "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
);
}

SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
/**
 * Registers: rdx:rax = multiplication accumulator
 *            r9:r8   = c
 *            rcx:rbx = d
 *            r10-r14 = a0-a4
 *            r15     = M (0xfffffffffffff)
 *            rdi     = r
 *            rsi     = a / t?
 */
  uint64_t tmp1, tmp2, tmp3;
__asm__ __volatile__(
    "movq 0(%%rsi),%%r10\n"
    "movq 8(%%rsi),%%r11\n"
    "movq 16(%%rsi),%%r12\n"
    "movq 24(%%rsi),%%r13\n"
    "movq 32(%%rsi),%%r14\n"
    "movq $0xfffffffffffff,%%r15\n"

    /* d = (a0*2) * a3 */
    "leaq (%%r10,%%r10,1),%%rax\n"
    "mulq %%r13\n"
    "movq %%rax,%%rbx\n"
    "movq %%rdx,%%rcx\n"
    /* d += (a1*2) * a2 */
    "leaq (%%r11,%%r11,1),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c = a4 * a4 */
    "movq %%r14,%%rax\n"
    "mulq %%r14\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += (c & M) * R */
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* t3 (tmp1) = d & M */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    "movq %%rsi,%q1\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* a4 *= 2 */
    "addq %%r14,%%r14\n"
    /* d += a0 * a4 */
    "movq %%r10,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += (a1*2) * a3 */
    "leaq (%%r11,%%r11,1),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += a2 * a2 */
    "movq %%r12,%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += c * R */
    "movq %%r8,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* t4 = d & M (%%rsi) */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* tx = t4 >> 48 (tmp3) */
    "movq %%rsi,%%rax\n"
    "shrq $48,%%rax\n"
    "movq %%rax,%q3\n"
    /* t4 &= (M >> 4) (tmp2) */
    "movq $0xffffffffffff,%%rax\n"
    "andq %%rax,%%rsi\n"
    "movq %%rsi,%q2\n"
    /* c = a0 * a0 */
    "movq %%r10,%%rax\n"
    "mulq %%r10\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += a1 * a4 */
    "movq %%r11,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += (a2*2) * a3 */
    "leaq (%%r12,%%r12,1),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* u0 = d & M (%%rsi) */
    "movq %%rbx,%%rsi\n"
    "andq %%r15,%%rsi\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* u0 = (u0 << 4) | tx (%%rsi) */
    "shlq $4,%%rsi\n"
    "movq %q3,%%rax\n"
    "orq %%rax,%%rsi\n"
    /* c += u0 * (R >> 4) */
    "movq $0x1000003d1,%%rax\n"
    "mulq %%rsi\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[0] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,0(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* a0 *= 2 */
    "addq %%r10,%%r10\n"
    /* c += a0 * a1 */
    "movq %%r10,%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a2 * a4 */
    "movq %%r12,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += a3 * a3 */
    "movq %%r13,%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c += (d & M) * R */
    "movq %%rbx,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* r[1] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,8(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a0 * a2 (last use of %%r10) */
    "movq %%r10,%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
    "movq %q2,%%rsi\n"
    "movq %q1,%%r10\n"
    /* c += a1 * a1 */
    "movq %%r11,%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a3 * a4 */
    "movq %%r13,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c += (d & M) * R */
    "movq %%rbx,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 (%%rbx only) */
    "shrdq $52,%%rcx,%%rbx\n"
    /* r[2] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,16(%%rdi)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += t3 */
    "addq %%r10,%%r8\n"
    /* c += d * R */
    "movq %%rbx,%%rax\n"
    "movq $0x1000003d10,%%rdx\n"
    "mulq %%rdx\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[3] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,24(%%rdi)\n"
    /* c >>= 52 (%%r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* c += t4 (%%r8 only) */
    "addq %%rsi,%%r8\n"
    /* r[4] = c */
    "movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
: "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
);
}

#endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */