/**********************************************************************
 * Copyright (c) 2013, 2014 Pieter Wuille                             *
 * Distributed under the MIT software license, see the accompanying  *
 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
 **********************************************************************/

#ifndef SECP256K1_SCALAR_REPR_IMPL_H
#define SECP256K1_SCALAR_REPR_IMPL_H

/* Limbs of the secp256k1 order. */
#define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL)
#define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL)
#define SECP256K1_N_2 ((uint64_t)0xFFFFFFFFFFFFFFFEULL)
#define SECP256K1_N_3 ((uint64_t)0xFFFFFFFFFFFFFFFFULL)
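/* Together, these four little-endian 64-bit limbs encode the group order
 *   n = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141,
 * with SECP256K1_N_0 holding the least significant limb. */
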
/* Limbs of 2^256 minus the secp256k1 order. */
#define SECP256K1_N_C_0 (~SECP256K1_N_0 + 1)
#define SECP256K1_N_C_1 (~SECP256K1_N_1)
#define SECP256K1_N_C_2 (1)
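/* 2^256 - n is only 129 bits long, so three limbs suffice: the third limb is
 * exactly 1 and the (implicit) fourth limb is 0. */
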
/* Limbs of half the secp256k1 order. */
#define SECP256K1_N_H_0 ((uint64_t)0xDFE92F46681B20A0ULL)
#define SECP256K1_N_H_1 ((uint64_t)0x5D576E7357A4501DULL)
#define SECP256K1_N_H_2 ((uint64_t)0xFFFFFFFFFFFFFFFFULL)
#define SECP256K1_N_H_3 ((uint64_t)0x7FFFFFFFFFFFFFFFULL)
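/* These are the limbs of floor(n / 2); secp256k1_scalar_is_high below reports
 * whether a scalar is strictly greater than this value. */
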
SECP256K1_INLINE static void secp256k1_scalar_clear(secp256k1_scalar *r) {
    r->d[0] = 0;
    r->d[1] = 0;
    r->d[2] = 0;
    r->d[3] = 0;
}

SECP256K1_INLINE static void secp256k1_scalar_set_int(secp256k1_scalar *r, unsigned int v) {
    r->d[0] = v;
    r->d[1] = 0;
    r->d[2] = 0;
    r->d[3] = 0;
}

SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits(const secp256k1_scalar *a, unsigned int offset, unsigned int count) {
    VERIFY_CHECK((offset + count - 1) >> 6 == offset >> 6);
    return (a->d[offset >> 6] >> (offset & 0x3F)) & ((((uint64_t)1) << count) - 1);
}

SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits_var(const secp256k1_scalar *a, unsigned int offset, unsigned int count) {
    VERIFY_CHECK(count < 32);
    VERIFY_CHECK(offset + count <= 256);
    if ((offset + count - 1) >> 6 == offset >> 6) {
        return secp256k1_scalar_get_bits(a, offset, count);
    } else {
        VERIFY_CHECK((offset >> 6) + 1 < 4);
        return ((a->d[offset >> 6] >> (offset & 0x3F)) | (a->d[(offset >> 6) + 1] << (64 - (offset & 0x3F)))) & ((((uint64_t)1) << count) - 1);
    }
}

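/* Constant-time test for a >= n: the limbs are compared from most significant
 * to least significant, with the `yes`/`no` masks recording whether the outcome
 * has already been decided, so no secret-dependent branches are taken. */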
SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scalar *a) {
    int yes = 0;
    int no = 0;
    no |= (a->d[3] < SECP256K1_N_3); /* No need for a > check. */
    no |= (a->d[2] < SECP256K1_N_2);
    yes |= (a->d[2] > SECP256K1_N_2) & ~no;
    no |= (a->d[1] < SECP256K1_N_1);
    yes |= (a->d[1] > SECP256K1_N_1) & ~no;
    yes |= (a->d[0] >= SECP256K1_N_0) & ~no;
    return yes;
}

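/* Conditionally subtract n: adding overflow * (2^256 - n) and discarding any
 * carry out of the top limb is equivalent to subtracting overflow * n mod 2^256. */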
SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar *r, unsigned int overflow) {
    uint128_t t;
    VERIFY_CHECK(overflow <= 1);
    t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0;
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[1] + overflow * SECP256K1_N_C_1;
    r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[2] + overflow * SECP256K1_N_C_2;
    r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint64_t)r->d[3];
    r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL;
    return overflow;
}

static int secp256k1_scalar_add(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) {
    int overflow;
    uint128_t t = (uint128_t)a->d[0] + b->d[0];
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)a->d[1] + b->d[1];
    r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)a->d[2] + b->d[2];
    r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)a->d[3] + b->d[3];
    r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    overflow = t + secp256k1_scalar_check_overflow(r);
    VERIFY_CHECK(overflow == 0 || overflow == 1);
    secp256k1_scalar_reduce(r, overflow);
    return overflow;
}

static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int flag) {
    uint128_t t;
    VERIFY_CHECK(bit < 256);
    bit += ((uint32_t) flag - 1) & 0x100;  /* forcing (bit >> 6) > 3 makes this a noop */
    t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[1] + (((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F));
    r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[2] + (((uint64_t)((bit >> 6) == 2)) << (bit & 0x3F));
    r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[3] + (((uint64_t)((bit >> 6) == 3)) << (bit & 0x3F));
    r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL;
#ifdef VERIFY
    VERIFY_CHECK((t >> 64) == 0);
    VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
#endif
}

static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) {
    int over;
    r->d[0] = (uint64_t)b32[31] | (uint64_t)b32[30] << 8 | (uint64_t)b32[29] << 16 | (uint64_t)b32[28] << 24 | (uint64_t)b32[27] << 32 | (uint64_t)b32[26] << 40 | (uint64_t)b32[25] << 48 | (uint64_t)b32[24] << 56;
    r->d[1] = (uint64_t)b32[23] | (uint64_t)b32[22] << 8 | (uint64_t)b32[21] << 16 | (uint64_t)b32[20] << 24 | (uint64_t)b32[19] << 32 | (uint64_t)b32[18] << 40 | (uint64_t)b32[17] << 48 | (uint64_t)b32[16] << 56;
    r->d[2] = (uint64_t)b32[15] | (uint64_t)b32[14] << 8 | (uint64_t)b32[13] << 16 | (uint64_t)b32[12] << 24 | (uint64_t)b32[11] << 32 | (uint64_t)b32[10] << 40 | (uint64_t)b32[9] << 48 | (uint64_t)b32[8] << 56;
    r->d[3] = (uint64_t)b32[7] | (uint64_t)b32[6] << 8 | (uint64_t)b32[5] << 16 | (uint64_t)b32[4] << 24 | (uint64_t)b32[3] << 32 | (uint64_t)b32[2] << 40 | (uint64_t)b32[1] << 48 | (uint64_t)b32[0] << 56;
    over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
    if (overflow) {
        *overflow = over;
    }
}

static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar* a) {
    bin[0] = a->d[3] >> 56; bin[1] = a->d[3] >> 48; bin[2] = a->d[3] >> 40; bin[3] = a->d[3] >> 32; bin[4] = a->d[3] >> 24; bin[5] = a->d[3] >> 16; bin[6] = a->d[3] >> 8; bin[7] = a->d[3];
    bin[8] = a->d[2] >> 56; bin[9] = a->d[2] >> 48; bin[10] = a->d[2] >> 40; bin[11] = a->d[2] >> 32; bin[12] = a->d[2] >> 24; bin[13] = a->d[2] >> 16; bin[14] = a->d[2] >> 8; bin[15] = a->d[2];
    bin[16] = a->d[1] >> 56; bin[17] = a->d[1] >> 48; bin[18] = a->d[1] >> 40; bin[19] = a->d[1] >> 32; bin[20] = a->d[1] >> 24; bin[21] = a->d[1] >> 16; bin[22] = a->d[1] >> 8; bin[23] = a->d[1];
    bin[24] = a->d[0] >> 56; bin[25] = a->d[0] >> 48; bin[26] = a->d[0] >> 40; bin[27] = a->d[0] >> 32; bin[28] = a->d[0] >> 24; bin[29] = a->d[0] >> 16; bin[30] = a->d[0] >> 8; bin[31] = a->d[0];
}

SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a) {
    return (a->d[0] | a->d[1] | a->d[2] | a->d[3]) == 0;
}

static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar *a) {
    uint64_t nonzero = 0xFFFFFFFFFFFFFFFFULL * (secp256k1_scalar_is_zero(a) == 0);
    uint128_t t = (uint128_t)(~a->d[0]) + SECP256K1_N_0 + 1;
    r->d[0] = t & nonzero; t >>= 64;
    t += (uint128_t)(~a->d[1]) + SECP256K1_N_1;
    r->d[1] = t & nonzero; t >>= 64;
    t += (uint128_t)(~a->d[2]) + SECP256K1_N_2;
    r->d[2] = t & nonzero; t >>= 64;
    t += (uint128_t)(~a->d[3]) + SECP256K1_N_3;
    r->d[3] = t & nonzero;
}

SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) {
    return ((a->d[0] ^ 1) | a->d[1] | a->d[2] | a->d[3]) == 0;
}

static int secp256k1_scalar_is_high(const secp256k1_scalar *a) {
    int yes = 0;
    int no = 0;
    no |= (a->d[3] < SECP256K1_N_H_3);
    yes |= (a->d[3] > SECP256K1_N_H_3) & ~no;
    no |= (a->d[2] < SECP256K1_N_H_2) & ~yes; /* No need for a > check. */
    no |= (a->d[1] < SECP256K1_N_H_1) & ~yes;
    yes |= (a->d[1] > SECP256K1_N_H_1) & ~no;
    yes |= (a->d[0] > SECP256K1_N_H_0) & ~no;
    return yes;
}

static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
    /* If flag = 0, mask = 00...00 and this is a no-op;
     * if flag = 1, mask = 11...11 and this is identical to secp256k1_scalar_negate. */
    uint64_t mask = !flag - 1;
    uint64_t nonzero = (secp256k1_scalar_is_zero(r) != 0) - 1;
    uint128_t t = (uint128_t)(r->d[0] ^ mask) + ((SECP256K1_N_0 + 1) & mask);
    r->d[0] = t & nonzero; t >>= 64;
    t += (uint128_t)(r->d[1] ^ mask) + (SECP256K1_N_1 & mask);
    r->d[1] = t & nonzero; t >>= 64;
    t += (uint128_t)(r->d[2] ^ mask) + (SECP256K1_N_2 & mask);
    r->d[2] = t & nonzero; t >>= 64;
    t += (uint128_t)(r->d[3] ^ mask) + (SECP256K1_N_3 & mask);
    r->d[3] = t & nonzero;
    return 2 * (mask == 0) - 1;
}

/* Inspired by the macros in OpenSSL's crypto/bn/asm/x86_64-gcc.c. */

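/* The macros below operate on an implicit accumulator (c0,c1,c2) holding the
 * value c0 + c1*2^64 + c2*2^128; extract()/extract_fast() pull out the lowest
 * 64 bits and shift the accumulator down by one limb. */
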
/** Add a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
#define muladd(a,b) { \
    uint64_t tl, th; \
    { \
        uint128_t t = (uint128_t)a * b; \
        th = t >> 64;         /* at most 0xFFFFFFFFFFFFFFFE */ \
        tl = t; \
    } \
    c0 += tl;                 /* overflow is handled on the next line */ \
    th += (c0 < tl);          /* at most 0xFFFFFFFFFFFFFFFF */ \
    c1 += th;                 /* overflow is handled on the next line */ \
    c2 += (c1 < th);          /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK((c1 >= th) || (c2 != 0)); \
}

/** Add a*b to the number defined by (c0,c1). c1 must never overflow. */
#define muladd_fast(a,b) { \
    uint64_t tl, th; \
    { \
        uint128_t t = (uint128_t)a * b; \
        th = t >> 64;         /* at most 0xFFFFFFFFFFFFFFFE */ \
        tl = t; \
    } \
    c0 += tl;                 /* overflow is handled on the next line */ \
    th += (c0 < tl);          /* at most 0xFFFFFFFFFFFFFFFF */ \
    c1 += th;                 /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK(c1 >= th); \
}

/** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
#define muladd2(a,b) { \
    uint64_t tl, th, th2, tl2; \
    { \
        uint128_t t = (uint128_t)a * b; \
        th = t >> 64;               /* at most 0xFFFFFFFFFFFFFFFE */ \
        tl = t; \
    } \
    th2 = th + th;                  /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \
    c2 += (th2 < th);               /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK((th2 >= th) || (c2 != 0)); \
    tl2 = tl + tl;                  /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \
    th2 += (tl2 < tl);              /* at most 0xFFFFFFFFFFFFFFFF */ \
    c0 += tl2;                      /* overflow is handled on the next line */ \
    th2 += (c0 < tl2);              /* second overflow is handled on the next line */ \
    c2 += (c0 < tl2) & (th2 == 0);  /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \
    c1 += th2;                      /* overflow is handled on the next line */ \
    c2 += (c1 < th2);               /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \
}

/** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
#define sumadd(a) { \
    unsigned int over; \
    c0 += (a);              /* overflow is handled on the next line */ \
    over = (c0 < (a)); \
    c1 += over;             /* overflow is handled on the next line */ \
    c2 += (c1 < over);      /* never overflows by contract */ \
}

/** Add a to the number defined by (c0,c1). c1 must never overflow, c2 must be zero. */
#define sumadd_fast(a) { \
    c0 += (a);              /* overflow is handled on the next line */ \
    c1 += (c0 < (a));       /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK((c1 != 0) | (c0 >= (a))); \
    VERIFY_CHECK(c2 == 0); \
}

/** Extract the lowest 64 bits of (c0,c1,c2) into n, and left shift the number 64 bits. */
#define extract(n) { \
    (n) = c0; \
    c0 = c1; \
    c1 = c2; \
    c2 = 0; \
}

/** Extract the lowest 64 bits of (c0,c1,c2) into n, and left shift the number 64 bits. c2 is required to be zero. */
#define extract_fast(n) { \
    (n) = c0; \
    c0 = c1; \
    c1 = 0; \
    VERIFY_CHECK(c2 == 0); \
}

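/* The reduction below relies on 2^256 mod n = 2^256 - n = SECP256K1_N_C: the
 * limbs above bit 256 are multiplied by SECP256K1_N_C and folded back into the
 * low limbs, shrinking the intermediate from 512 to 385 to 258 to 256 bits,
 * after which a single conditional subtraction of n suffices. */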
static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint64_t *l) {
#ifdef USE_ASM_X86_64
    /* Reduce 512 bits into 385. */
    uint64_t m0, m1, m2, m3, m4, m5, m6;
    uint64_t p0, p1, p2, p3, p4;
    uint64_t c;

    __asm__ __volatile__(
    /* Preload. */
    "movq 32(%%rsi), %%r11\n"
    "movq 40(%%rsi), %%r12\n"
    "movq 48(%%rsi), %%r13\n"
    "movq 56(%%rsi), %%r14\n"
    /* Initialize r8,r9,r10 */
    "movq 0(%%rsi), %%r8\n"
    "xorq %%r9, %%r9\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9) += n0 * c0 */
    "movq %8, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    /* extract m0 */
    "movq %%r8, %q0\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10) += l1 */
    "addq 8(%%rsi), %%r9\n"
    "adcq $0, %%r10\n"
    /* (r9,r10,r8) += n1 * c0 */
    "movq %8, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += n0 * c1 */
    "movq %9, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* extract m1 */
    "movq %%r9, %q1\n"
    "xorq %%r9, %%r9\n"
    /* (r10,r8,r9) += l2 */
    "addq 16(%%rsi), %%r10\n"
    "adcq $0, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += n2 * c0 */
    "movq %8, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += n1 * c1 */
    "movq %9, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += n0 */
    "addq %%r11, %%r10\n"
    "adcq $0, %%r8\n"
    "adcq $0, %%r9\n"
    /* extract m2 */
    "movq %%r10, %q2\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += l3 */
    "addq 24(%%rsi), %%r8\n"
    "adcq $0, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += n3 * c0 */
    "movq %8, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += n2 * c1 */
    "movq %9, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += n1 */
    "addq %%r12, %%r8\n"
    "adcq $0, %%r9\n"
    "adcq $0, %%r10\n"
    /* extract m3 */
    "movq %%r8, %q3\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += n3 * c1 */
    "movq %9, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += n2 */
    "addq %%r13, %%r9\n"
    "adcq $0, %%r10\n"
    "adcq $0, %%r8\n"
    /* extract m4 */
    "movq %%r9, %q4\n"
    /* (r10,r8) += n3 */
    "addq %%r14, %%r10\n"
    "adcq $0, %%r8\n"
    /* extract m5 */
    "movq %%r10, %q5\n"
    /* extract m6 */
    "movq %%r8, %q6\n"
    : "=g"(m0), "=g"(m1), "=g"(m2), "=g"(m3), "=g"(m4), "=g"(m5), "=g"(m6)
    : "S"(l), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1)
    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc");

    /* Reduce 385 bits into 258. */
    __asm__ __volatile__(
    /* Preload */
    "movq %q9, %%r11\n"
    "movq %q10, %%r12\n"
    "movq %q11, %%r13\n"
    /* Initialize (r8,r9,r10) */
    "movq %q5, %%r8\n"
    "xorq %%r9, %%r9\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9) += m4 * c0 */
    "movq %12, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    /* extract p0 */
    "movq %%r8, %q0\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10) += m1 */
    "addq %q6, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r9,r10,r8) += m5 * c0 */
    "movq %12, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += m4 * c1 */
    "movq %13, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* extract p1 */
    "movq %%r9, %q1\n"
    "xorq %%r9, %%r9\n"
    /* (r10,r8,r9) += m2 */
    "addq %q7, %%r10\n"
    "adcq $0, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += m6 * c0 */
    "movq %12, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += m5 * c1 */
    "movq %13, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += m4 */
    "addq %%r11, %%r10\n"
    "adcq $0, %%r8\n"
    "adcq $0, %%r9\n"
    /* extract p2 */
    "movq %%r10, %q2\n"
    /* (r8,r9) += m3 */
    "addq %q8, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r8,r9) += m6 * c1 */
    "movq %13, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    /* (r8,r9) += m5 */
    "addq %%r12, %%r8\n"
    "adcq $0, %%r9\n"
    /* extract p3 */
    "movq %%r8, %q3\n"
    /* (r9) += m6 */
    "addq %%r13, %%r9\n"
    /* extract p4 */
    "movq %%r9, %q4\n"
    : "=&g"(p0), "=&g"(p1), "=&g"(p2), "=g"(p3), "=g"(p4)
    : "g"(m0), "g"(m1), "g"(m2), "g"(m3), "g"(m4), "g"(m5), "g"(m6), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1)
    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "cc");

    /* Reduce 258 bits into 256. */
    __asm__ __volatile__(
    /* Preload */
    "movq %q5, %%r10\n"
    /* (rax,rdx) = p4 * c0 */
    "movq %7, %%rax\n"
    "mulq %%r10\n"
    /* (rax,rdx) += p0 */
    "addq %q1, %%rax\n"
    "adcq $0, %%rdx\n"
    /* extract r0 */
    "movq %%rax, 0(%q6)\n"
    /* Move to (r8,r9) */
    "movq %%rdx, %%r8\n"
    "xorq %%r9, %%r9\n"
    /* (r8,r9) += p1 */
    "addq %q2, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r8,r9) += p4 * c1 */
    "movq %8, %%rax\n"
    "mulq %%r10\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    /* Extract r1 */
    "movq %%r8, 8(%q6)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r8) += p4 */
    "addq %%r10, %%r9\n"
    "adcq $0, %%r8\n"
    /* (r9,r8) += p2 */
    "addq %q3, %%r9\n"
    "adcq $0, %%r8\n"
    /* Extract r2 */
    "movq %%r9, 16(%q6)\n"
    "xorq %%r9, %%r9\n"
    /* (r8,r9) += p3 */
    "addq %q4, %%r8\n"
    "adcq $0, %%r9\n"
    /* Extract r3 */
    "movq %%r8, 24(%q6)\n"
    /* Extract c */
    "movq %%r9, %q0\n"
    : "=g"(c)
    : "g"(p0), "g"(p1), "g"(p2), "g"(p3), "g"(p4), "D"(r), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1)
    : "rax", "rdx", "r8", "r9", "r10", "cc", "memory");
#else
    uint128_t c;
    uint64_t c0, c1, c2;
    uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7];
    uint64_t m0, m1, m2, m3, m4, m5;
    uint32_t m6;
    uint64_t p0, p1, p2, p3;
    uint32_t p4;

    /* Reduce 512 bits into 385. */
    /* m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C. */
    c0 = l[0]; c1 = 0; c2 = 0;
    muladd_fast(n0, SECP256K1_N_C_0);
    extract_fast(m0);
    sumadd_fast(l[1]);
    muladd(n1, SECP256K1_N_C_0);
    muladd(n0, SECP256K1_N_C_1);
    extract(m1);
    sumadd(l[2]);
    muladd(n2, SECP256K1_N_C_0);
    muladd(n1, SECP256K1_N_C_1);
    sumadd(n0);
    extract(m2);
    sumadd(l[3]);
    muladd(n3, SECP256K1_N_C_0);
    muladd(n2, SECP256K1_N_C_1);
    sumadd(n1);
    extract(m3);
    muladd(n3, SECP256K1_N_C_1);
    sumadd(n2);
    extract(m4);
    sumadd_fast(n3);
    extract_fast(m5);
    VERIFY_CHECK(c0 <= 1);
    m6 = c0;

    /* Reduce 385 bits into 258. */
    /* p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C. */
    c0 = m0; c1 = 0; c2 = 0;
    muladd_fast(m4, SECP256K1_N_C_0);
    extract_fast(p0);
    sumadd_fast(m1);
    muladd(m5, SECP256K1_N_C_0);
    muladd(m4, SECP256K1_N_C_1);
    extract(p1);
    sumadd(m2);
    muladd(m6, SECP256K1_N_C_0);
    muladd(m5, SECP256K1_N_C_1);
    sumadd(m4);
    extract(p2);
    sumadd_fast(m3);
    muladd_fast(m6, SECP256K1_N_C_1);
    sumadd_fast(m5);
    extract_fast(p3);
    p4 = c0 + m6;
    VERIFY_CHECK(p4 <= 2);

    /* Reduce 258 bits into 256. */
    /* r[0..3] = p[0..3] + p[4] * SECP256K1_N_C. */
    c = p0 + (uint128_t)SECP256K1_N_C_0 * p4;
    r->d[0] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
    c += p1 + (uint128_t)SECP256K1_N_C_1 * p4;
    r->d[1] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
    c += p2 + (uint128_t)p4;
    r->d[2] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
    c += p3;
    r->d[3] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
#endif

    /* Final reduction of r. */
    secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r));
}

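/* The C fallback below is a Comba-style (column-wise) schoolbook multiplication:
 * the partial products for each output limb are accumulated in (c0,c1,c2) and
 * the completed 64-bit columns are written to l[0..7]. */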
static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar *a, const secp256k1_scalar *b) {
#ifdef USE_ASM_X86_64
    const uint64_t *pb = b->d;
    __asm__ __volatile__(
    /* Preload */
    "movq 0(%%rdi), %%r15\n"
    "movq 8(%%rdi), %%rbx\n"
    "movq 16(%%rdi), %%rcx\n"
    "movq 0(%%rdx), %%r11\n"
    "movq 8(%%rdx), %%r12\n"
    "movq 16(%%rdx), %%r13\n"
    "movq 24(%%rdx), %%r14\n"
    /* (rax,rdx) = a0 * b0 */
    "movq %%r15, %%rax\n"
    "mulq %%r11\n"
    /* Extract l0 */
    "movq %%rax, 0(%%rsi)\n"
    /* (r8,r9,r10) = (rdx) */
    "movq %%rdx, %%r8\n"
    "xorq %%r9, %%r9\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += a0 * b1 */
    "movq %%r15, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a1 * b0 */
    "movq %%rbx, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l1 */
    "movq %%r8, 8(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += a0 * b2 */
    "movq %%r15, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a1 * b1 */
    "movq %%rbx, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a2 * b0 */
    "movq %%rcx, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l2 */
    "movq %%r9, 16(%%rsi)\n"
    "xorq %%r9, %%r9\n"
    /* (r10,r8,r9) += a0 * b3 */
    "movq %%r15, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* Preload a3 */
    "movq 24(%%rdi), %%r15\n"
    /* (r10,r8,r9) += a1 * b2 */
    "movq %%rbx, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += a2 * b1 */
    "movq %%rcx, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += a3 * b0 */
    "movq %%r15, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* Extract l3 */
    "movq %%r10, 24(%%rsi)\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += a1 * b3 */
    "movq %%rbx, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a2 * b2 */
    "movq %%rcx, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a3 * b1 */
    "movq %%r15, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l4 */
    "movq %%r8, 32(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += a2 * b3 */
    "movq %%rcx, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a3 * b2 */
    "movq %%r15, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l5 */
    "movq %%r9, 40(%%rsi)\n"
    /* (r10,r8) += a3 * b3 */
    "movq %%r15, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    /* Extract l6 */
    "movq %%r10, 48(%%rsi)\n"
    /* Extract l7 */
    "movq %%r8, 56(%%rsi)\n"
    : "+d"(pb)
    : "S"(l), "D"(a->d)
    : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
#else
    /* 160 bit accumulator. */
    uint64_t c0 = 0, c1 = 0;
    uint32_t c2 = 0;

    /* l[0..7] = a[0..3] * b[0..3]. */
    muladd_fast(a->d[0], b->d[0]);
    extract_fast(l[0]);
    muladd(a->d[0], b->d[1]);
    muladd(a->d[1], b->d[0]);
    extract(l[1]);
    muladd(a->d[0], b->d[2]);
    muladd(a->d[1], b->d[1]);
    muladd(a->d[2], b->d[0]);
    extract(l[2]);
    muladd(a->d[0], b->d[3]);
    muladd(a->d[1], b->d[2]);
    muladd(a->d[2], b->d[1]);
    muladd(a->d[3], b->d[0]);
    extract(l[3]);
    muladd(a->d[1], b->d[3]);
    muladd(a->d[2], b->d[2]);
    muladd(a->d[3], b->d[1]);
    extract(l[4]);
    muladd(a->d[2], b->d[3]);
    muladd(a->d[3], b->d[2]);
    extract(l[5]);
    muladd_fast(a->d[3], b->d[3]);
    extract_fast(l[6]);
    VERIFY_CHECK(c1 == 0);
    l[7] = c0;
#endif
}

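/* Squaring follows the same column-wise pattern as the multiplication above,
 * but uses muladd2 so each off-diagonal product a[i]*a[j] (i != j) is computed
 * only once and then doubled. */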
static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar *a) {
#ifdef USE_ASM_X86_64
    __asm__ __volatile__(
    /* Preload */
    "movq 0(%%rdi), %%r11\n"
    "movq 8(%%rdi), %%r12\n"
    "movq 16(%%rdi), %%r13\n"
    "movq 24(%%rdi), %%r14\n"
    /* (rax,rdx) = a0 * a0 */
    "movq %%r11, %%rax\n"
    "mulq %%r11\n"
    /* Extract l0 */
    "movq %%rax, 0(%%rsi)\n"
    /* (r8,r9,r10) = (rdx,0) */
    "movq %%rdx, %%r8\n"
    "xorq %%r9, %%r9\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += 2 * a0 * a1 */
    "movq %%r11, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l1 */
    "movq %%r8, 8(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += 2 * a0 * a2 */
    "movq %%r11, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a1 * a1 */
    "movq %%r12, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l2 */
    "movq %%r9, 16(%%rsi)\n"
    "xorq %%r9, %%r9\n"
    /* (r10,r8,r9) += 2 * a0 * a3 */
    "movq %%r11, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += 2 * a1 * a2 */
    "movq %%r12, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* Extract l3 */
    "movq %%r10, 24(%%rsi)\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += 2 * a1 * a3 */
    "movq %%r12, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a2 * a2 */
    "movq %%r13, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l4 */
    "movq %%r8, 32(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += 2 * a2 * a3 */
    "movq %%r13, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l5 */
    "movq %%r9, 40(%%rsi)\n"
    /* (r10,r8) += a3 * a3 */
    "movq %%r14, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    /* Extract l6 */
    "movq %%r10, 48(%%rsi)\n"
    /* Extract l7 */
    "movq %%r8, 56(%%rsi)\n"
    :
    : "S"(l), "D"(a->d)
    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory");
#else
    /* 160 bit accumulator. */
    uint64_t c0 = 0, c1 = 0;
    uint32_t c2 = 0;

    /* l[0..7] = a[0..3] * a[0..3]. */
    muladd_fast(a->d[0], a->d[0]);
    extract_fast(l[0]);
    muladd2(a->d[0], a->d[1]);
    extract(l[1]);
    muladd2(a->d[0], a->d[2]);
    muladd(a->d[1], a->d[1]);
    extract(l[2]);
    muladd2(a->d[0], a->d[3]);
    muladd2(a->d[1], a->d[2]);
    extract(l[3]);
    muladd2(a->d[1], a->d[3]);
    muladd(a->d[2], a->d[2]);
    extract(l[4]);
    muladd2(a->d[2], a->d[3]);
    extract(l[5]);
    muladd_fast(a->d[3], a->d[3]);
    extract_fast(l[6]);
    VERIFY_CHECK(c1 == 0);
    l[7] = c0;
#endif
}

#undef sumadd
#undef sumadd_fast
#undef muladd
#undef muladd_fast
#undef muladd2
#undef extract
#undef extract_fast

static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) {
    uint64_t l[8];
    secp256k1_scalar_mul_512(l, a, b);
    secp256k1_scalar_reduce_512(r, l);
}

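/* Shift r right by n bits (0 < n < 16) and return the low n bits that were
 * shifted out. */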
static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
    int ret;
    VERIFY_CHECK(n > 0);
    VERIFY_CHECK(n < 16);
    ret = r->d[0] & ((1 << n) - 1);
    r->d[0] = (r->d[0] >> n) + (r->d[1] << (64 - n));
    r->d[1] = (r->d[1] >> n) + (r->d[2] << (64 - n));
    r->d[2] = (r->d[2] >> n) + (r->d[3] << (64 - n));
    r->d[3] = (r->d[3] >> n);
    return ret;
}

static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) {
    uint64_t l[8];
    secp256k1_scalar_sqr_512(l, a);
    secp256k1_scalar_reduce_512(r, l);
}

#ifdef USE_ENDOMORPHISM
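/* Write the low 128 bits of a to r1 and the high 128 bits to r2, so that
 * a = r1 + r2 * 2^128. */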
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *a) {
    r1->d[0] = a->d[0];
    r1->d[1] = a->d[1];
    r1->d[2] = 0;
    r1->d[3] = 0;
    r2->d[0] = a->d[2];
    r2->d[1] = a->d[3];
    r2->d[2] = 0;
    r2->d[3] = 0;
}
#endif

SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b) {
    return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3])) == 0;
}

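/* Compute r = (a * b) >> shift for shift >= 256, rounded: the bit just below
 * the cut-off point is added back via secp256k1_scalar_cadd_bit. */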
SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b, unsigned int shift) {
    uint64_t l[8];
    unsigned int shiftlimbs;
    unsigned int shiftlow;
    unsigned int shifthigh;
    VERIFY_CHECK(shift >= 256);
    secp256k1_scalar_mul_512(l, a, b);
    shiftlimbs = shift >> 6;
    shiftlow = shift & 0x3F;
    shifthigh = 64 - shiftlow;
    r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[3] = shift < 320 ? (l[3 + shiftlimbs] >> shiftlow) : 0;
    secp256k1_scalar_cadd_bit(r, 0, (l[(shift - 1) >> 6] >> ((shift - 1) & 0x3f)) & 1);
}

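/* Constant-time conditional move: when flag is 0, mask0 is all ones and r is
 * left unchanged; when flag is 1, mask1 is all ones and r is overwritten with a. */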
static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const secp256k1_scalar *a, int flag) {
    uint64_t mask0, mask1;
    VG_CHECK_VERIFY(r->d, sizeof(r->d));
    mask0 = flag + ~((uint64_t)0);
    mask1 = ~mask0;
    r->d[0] = (r->d[0] & mask0) | (a->d[0] & mask1);
    r->d[1] = (r->d[1] & mask0) | (a->d[1] & mask1);
    r->d[2] = (r->d[2] & mask0) | (a->d[2] & mask1);
    r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1);
}

#endif /* SECP256K1_SCALAR_REPR_IMPL_H */