| 1 | /********************************************************************** |
| 2 | * Copyright (c) 2013, 2014 Pieter Wuille * |
| 3 | * Distributed under the MIT software license, see the accompanying * |
| 4 | * file COPYING or http://www.opensource.org/licenses/mit-license.php.* |
| 5 | **********************************************************************/ |
| 6 | |
| 7 | #ifndef _SECP256K1_SCALAR_REPR_IMPL_H_ |
| 8 | #define _SECP256K1_SCALAR_REPR_IMPL_H_ |
| 9 | |
| 10 | /* Limbs of the secp256k1 order. */ |
| 11 | #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL) |
| 12 | #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL) |
| 13 | #define SECP256K1_N_2 ((uint64_t)0xFFFFFFFFFFFFFFFEULL) |
| 14 | #define SECP256K1_N_3 ((uint64_t)0xFFFFFFFFFFFFFFFFULL) |
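| | /* Together these four limbs encode the group order |
| |  * n = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141, |
| |  * stored least-significant limb (SECP256K1_N_0) first. */ |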
| 15 | |
| 16 | /* Limbs of 2^256 minus the secp256k1 order. */ |
| 17 | #define SECP256K1_N_C_0 (~SECP256K1_N_0 + 1) |
| 18 | #define SECP256K1_N_C_1 (~SECP256K1_N_1) |
| 19 | #define SECP256K1_N_C_2 (1) |
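| | /* Since the top 128 bits of n are all ones except for a single bit, the |
| |  * complement 2^256 - n is a 129-bit value and only three limbs are needed |
| |  * (the fourth would be zero). The reduction code below relies on the |
| |  * identity 2^256 == 2^256 - n (mod n) to fold high limbs into low ones. */ |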
| 20 | |
| 21 | /* Limbs of half the secp256k1 order. */ |
| 22 | #define SECP256K1_N_H_0 ((uint64_t)0xDFE92F46681B20A0ULL) |
| 23 | #define SECP256K1_N_H_1 ((uint64_t)0x5D576E7357A4501DULL) |
| 24 | #define SECP256K1_N_H_2 ((uint64_t)0xFFFFFFFFFFFFFFFFULL) |
| 25 | #define SECP256K1_N_H_3 ((uint64_t)0x7FFFFFFFFFFFFFFFULL) |
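| | /* These are the limbs of n >> 1, i.e. (n - 1) / 2; they are compared against |
| |  * in secp256k1_scalar_is_high below. */ |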
| 26 | |
| 27 | SECP256K1_INLINE static void secp256k1_scalar_clear(secp256k1_scalar_t *r) { |
| 28 | r->d[0] = 0; |
| 29 | r->d[1] = 0; |
| 30 | r->d[2] = 0; |
| 31 | r->d[3] = 0; |
| 32 | } |
| 33 | |
| 34 | SECP256K1_INLINE static void secp256k1_scalar_set_int(secp256k1_scalar_t *r, unsigned int v) { |
| 35 | r->d[0] = v; |
| 36 | r->d[1] = 0; |
| 37 | r->d[2] = 0; |
| 38 | r->d[3] = 0; |
| 39 | } |
| 40 | |
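| | /** Return `count` bits of a, starting at bit `offset`. All requested bits must |
| |  * lie within a single 64-bit limb; for example, offset = 64 and count = 4 |
| |  * yields a->d[1] & 0xF. Reads that may straddle a limb boundary should use |
| |  * secp256k1_scalar_get_bits_var instead. */ |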
| 41 | SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits(const secp256k1_scalar_t *a, unsigned int offset, unsigned int count) { |
| 42 | VERIFY_CHECK((offset + count - 1) >> 6 == offset >> 6); |
| 43 | return (a->d[offset >> 6] >> (offset & 0x3F)) & ((((uint64_t)1) << count) - 1); |
| 44 | } |
| 45 | |
| 46 | SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits_var(const secp256k1_scalar_t *a, unsigned int offset, unsigned int count) { |
| 47 | VERIFY_CHECK(count < 32); |
| 48 | VERIFY_CHECK(offset + count <= 256); |
| 49 | if ((offset + count - 1) >> 6 == offset >> 6) { |
| 50 | return secp256k1_scalar_get_bits(a, offset, count); |
| 51 | } else { |
| 52 | VERIFY_CHECK((offset >> 6) + 1 < 4); |
| 53 | return ((a->d[offset >> 6] >> (offset & 0x3F)) | (a->d[(offset >> 6) + 1] << (64 - (offset & 0x3F)))) & ((((uint64_t)1) << count) - 1); |
| 54 | } |
| 55 | } |
| 56 | |
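| | /** Return 1 if a >= n, 0 otherwise. The limbs are compared from most to least |
| |  * significant using masks rather than early returns, so the comparison |
| |  * involves no data-dependent branches. */ |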
| 57 | SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scalar_t *a) { |
| 58 | int yes = 0; |
| 59 | int no = 0; |
| 60 | no |= (a->d[3] < SECP256K1_N_3); /* No need for a > check. */ |
| 61 | no |= (a->d[2] < SECP256K1_N_2); |
| 62 | yes |= (a->d[2] > SECP256K1_N_2) & ~no; |
| 63 | no |= (a->d[1] < SECP256K1_N_1); |
| 64 | yes |= (a->d[1] > SECP256K1_N_1) & ~no; |
| 65 | yes |= (a->d[0] >= SECP256K1_N_0) & ~no; |
| 66 | return yes; |
| 67 | } |
| 68 | |
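| | /** Conditionally subtract n from r: when overflow is 1, 2^256 - n (the N_C |
| |  * limbs) is added, which modulo 2^256 is the same as subtracting n; when |
| |  * overflow is 0 nothing is added. Returns the overflow argument unchanged. */ |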
| 69 | SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, unsigned int overflow) { |
| 70 | uint128_t t; |
| 71 | VERIFY_CHECK(overflow <= 1); |
| 72 | t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0; |
| 73 | r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 74 | t += (uint128_t)r->d[1] + overflow * SECP256K1_N_C_1; |
| 75 | r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 76 | t += (uint128_t)r->d[2] + overflow * SECP256K1_N_C_2; |
| 77 | r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 78 | t += (uint64_t)r->d[3]; |
| 79 | r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; |
| 80 | return overflow; |
| 81 | } |
| 82 | |
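| | /** r = (a + b) mod n. Both inputs are below n, so the sum is below 2n and a |
| |  * single conditional subtraction suffices; `overflow` combines the carry out |
| |  * of 2^256 with the >= n test and is therefore 0 or 1. Returns whether that |
| |  * reduction took place. */ |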
| 83 | static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) { |
| 84 | int overflow; |
| 85 | uint128_t t = (uint128_t)a->d[0] + b->d[0]; |
| 86 | r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 87 | t += (uint128_t)a->d[1] + b->d[1]; |
| 88 | r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 89 | t += (uint128_t)a->d[2] + b->d[2]; |
| 90 | r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 91 | t += (uint128_t)a->d[3] + b->d[3]; |
| 92 | r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 93 | overflow = t + secp256k1_scalar_check_overflow(r); |
| 94 | VERIFY_CHECK(overflow == 0 || overflow == 1); |
| 95 | secp256k1_scalar_reduce(r, overflow); |
| 96 | return overflow; |
| 97 | } |
| 98 | |
| 99 | static void secp256k1_scalar_cadd_bit(secp256k1_scalar_t *r, unsigned int bit, int flag) { |
| 100 | uint128_t t; |
| 101 | VERIFY_CHECK(bit < 256); |
| 102 | bit += ((uint32_t) flag - 1) & 0x100; /* forcing (bit >> 6) > 3 makes this a noop */ |
| 103 | t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F)); |
| 104 | r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 105 | t += (uint128_t)r->d[1] + (((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F)); |
| 106 | r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 107 | t += (uint128_t)r->d[2] + (((uint64_t)((bit >> 6) == 2)) << (bit & 0x3F)); |
| 108 | r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; |
| 109 | t += (uint128_t)r->d[3] + (((uint64_t)((bit >> 6) == 3)) << (bit & 0x3F)); |
| 110 | r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; |
| 111 | #ifdef VERIFY |
| 112 | VERIFY_CHECK((t >> 64) == 0); |
| 113 | VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0); |
| 114 | #endif |
| 115 | } |
| 116 | |
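| | /** Parse a 32-byte big-endian number into r, reducing it modulo n. If |
| |  * `overflow` is non-NULL, it is set to 1 when the input was >= n. */ |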
| 117 | static void secp256k1_scalar_set_b32(secp256k1_scalar_t *r, const unsigned char *b32, int *overflow) { |
| 118 | int over; |
| 119 | r->d[0] = (uint64_t)b32[31] | (uint64_t)b32[30] << 8 | (uint64_t)b32[29] << 16 | (uint64_t)b32[28] << 24 | (uint64_t)b32[27] << 32 | (uint64_t)b32[26] << 40 | (uint64_t)b32[25] << 48 | (uint64_t)b32[24] << 56; |
| 120 | r->d[1] = (uint64_t)b32[23] | (uint64_t)b32[22] << 8 | (uint64_t)b32[21] << 16 | (uint64_t)b32[20] << 24 | (uint64_t)b32[19] << 32 | (uint64_t)b32[18] << 40 | (uint64_t)b32[17] << 48 | (uint64_t)b32[16] << 56; |
| 121 | r->d[2] = (uint64_t)b32[15] | (uint64_t)b32[14] << 8 | (uint64_t)b32[13] << 16 | (uint64_t)b32[12] << 24 | (uint64_t)b32[11] << 32 | (uint64_t)b32[10] << 40 | (uint64_t)b32[9] << 48 | (uint64_t)b32[8] << 56; |
| 122 | r->d[3] = (uint64_t)b32[7] | (uint64_t)b32[6] << 8 | (uint64_t)b32[5] << 16 | (uint64_t)b32[4] << 24 | (uint64_t)b32[3] << 32 | (uint64_t)b32[2] << 40 | (uint64_t)b32[1] << 48 | (uint64_t)b32[0] << 56; |
| 123 | over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); |
| 124 | if (overflow) { |
| 125 | *overflow = over; |
| 126 | } |
| 127 | } |
| 128 | |
| 129 | static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar_t* a) { |
| 130 | bin[0] = a->d[3] >> 56; bin[1] = a->d[3] >> 48; bin[2] = a->d[3] >> 40; bin[3] = a->d[3] >> 32; bin[4] = a->d[3] >> 24; bin[5] = a->d[3] >> 16; bin[6] = a->d[3] >> 8; bin[7] = a->d[3]; |
| 131 | bin[8] = a->d[2] >> 56; bin[9] = a->d[2] >> 48; bin[10] = a->d[2] >> 40; bin[11] = a->d[2] >> 32; bin[12] = a->d[2] >> 24; bin[13] = a->d[2] >> 16; bin[14] = a->d[2] >> 8; bin[15] = a->d[2]; |
| 132 | bin[16] = a->d[1] >> 56; bin[17] = a->d[1] >> 48; bin[18] = a->d[1] >> 40; bin[19] = a->d[1] >> 32; bin[20] = a->d[1] >> 24; bin[21] = a->d[1] >> 16; bin[22] = a->d[1] >> 8; bin[23] = a->d[1]; |
| 133 | bin[24] = a->d[0] >> 56; bin[25] = a->d[0] >> 48; bin[26] = a->d[0] >> 40; bin[27] = a->d[0] >> 32; bin[28] = a->d[0] >> 24; bin[29] = a->d[0] >> 16; bin[30] = a->d[0] >> 8; bin[31] = a->d[0]; |
| 134 | } |
| 135 | |
| 136 | SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar_t *a) { |
| 137 | return (a->d[0] | a->d[1] | a->d[2] | a->d[3]) == 0; |
| 138 | } |
| 139 | |
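| | /** r = -a (mod n), using the identity (~a) + n + 1 == n - a (mod 2^256), |
| |  * evaluated limb by limb with a carry. The `nonzero` mask forces the result |
| |  * to zero when a is zero, where n - 0 == n would otherwise be left |
| |  * unreduced. */ |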
| 140 | static void secp256k1_scalar_negate(secp256k1_scalar_t *r, const secp256k1_scalar_t *a) { |
| 141 | uint64_t nonzero = 0xFFFFFFFFFFFFFFFFULL * (secp256k1_scalar_is_zero(a) == 0); |
| 142 | uint128_t t = (uint128_t)(~a->d[0]) + SECP256K1_N_0 + 1; |
| 143 | r->d[0] = t & nonzero; t >>= 64; |
| 144 | t += (uint128_t)(~a->d[1]) + SECP256K1_N_1; |
| 145 | r->d[1] = t & nonzero; t >>= 64; |
| 146 | t += (uint128_t)(~a->d[2]) + SECP256K1_N_2; |
| 147 | r->d[2] = t & nonzero; t >>= 64; |
| 148 | t += (uint128_t)(~a->d[3]) + SECP256K1_N_3; |
| 149 | r->d[3] = t & nonzero; |
| 150 | } |
| 151 | |
| 152 | SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar_t *a) { |
| 153 | return ((a->d[0] ^ 1) | a->d[1] | a->d[2] | a->d[3]) == 0; |
| 154 | } |
| 155 | |
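| | /** Return 1 if a is larger than n/2 (i.e. exceeds the N_H constants above), |
| |  * 0 otherwise, with the same branchless limb-by-limb comparison used in |
| |  * secp256k1_scalar_check_overflow. */ |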
| 156 | static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) { |
| 157 | int yes = 0; |
| 158 | int no = 0; |
| 159 | no |= (a->d[3] < SECP256K1_N_H_3); |
| 160 | yes |= (a->d[3] > SECP256K1_N_H_3) & ~no; |
| 161 | no |= (a->d[2] < SECP256K1_N_H_2) & ~yes; /* No need for a > check. */ |
| 162 | no |= (a->d[1] < SECP256K1_N_H_1) & ~yes; |
| 163 | yes |= (a->d[1] > SECP256K1_N_H_1) & ~no; |
| 164 | yes |= (a->d[0] > SECP256K1_N_H_0) & ~no; |
| 165 | return yes; |
| 166 | } |
| 167 | |
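| | /** Negate r modulo n if flag is nonzero, leave it unchanged otherwise. `mask` |
| |  * is all-ones exactly when negating, so r ^ mask equals ~r and the masked N |
| |  * limbs reproduce secp256k1_scalar_negate; with mask = 0 the additions |
| |  * contribute nothing. Returns -1 if r was negated and 1 if it was not. */ |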
| 168 | static int secp256k1_scalar_cond_negate(secp256k1_scalar_t *r, int flag) { |
| 169 | /* If flag = 0, mask = 00...00 and this is a no-op; |
| 170 | * if flag = 1, mask = 11...11 and this is identical to secp256k1_scalar_negate. */ |
| 171 | uint64_t mask = !flag - 1; |
| 172 | uint64_t nonzero = (secp256k1_scalar_is_zero(r) != 0) - 1; |
| 173 | uint128_t t = (uint128_t)(r->d[0] ^ mask) + ((SECP256K1_N_0 + 1) & mask); |
| 174 | r->d[0] = t & nonzero; t >>= 64; |
| 175 | t += (uint128_t)(r->d[1] ^ mask) + (SECP256K1_N_1 & mask); |
| 176 | r->d[1] = t & nonzero; t >>= 64; |
| 177 | t += (uint128_t)(r->d[2] ^ mask) + (SECP256K1_N_2 & mask); |
| 178 | r->d[2] = t & nonzero; t >>= 64; |
| 179 | t += (uint128_t)(r->d[3] ^ mask) + (SECP256K1_N_3 & mask); |
| 180 | r->d[3] = t & nonzero; |
| 181 | return 2 * (mask == 0) - 1; |
| 182 | } |
| 183 | |
| 184 | /* Inspired by the macros in OpenSSL's crypto/bn/asm/x86_64-gcc.c. */ |
| 185 | |
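| | /* The macros below maintain a running sum in (c0,c1,c2), representing the |
| |  * value c0 + 2^64*c1 + 2^128*c2 with c0 the least significant part. Each |
| |  * `extract` emits c0 as the next output limb and shifts the accumulator down |
| |  * by 64 bits. */ |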
| 186 | /** Add a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ |
| 187 | #define muladd(a,b) { \ |
| 188 | uint64_t tl, th; \ |
| 189 | { \ |
| 190 | uint128_t t = (uint128_t)a * b; \ |
| 191 | th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ |
| 192 | tl = t; \ |
| 193 | } \ |
| 194 | c0 += tl; /* overflow is handled on the next line */ \ |
| 195 | th += (c0 < tl) ? 1 : 0; /* at most 0xFFFFFFFFFFFFFFFF */ \ |
| 196 | c1 += th; /* overflow is handled on the next line */ \ |
| 197 | c2 += (c1 < th) ? 1 : 0; /* never overflows by contract (verified in the next line) */ \ |
| 198 | VERIFY_CHECK((c1 >= th) || (c2 != 0)); \ |
| 199 | } |
| 200 | |
| 201 | /** Add a*b to the number defined by (c0,c1). c1 must never overflow. */ |
| 202 | #define muladd_fast(a,b) { \ |
| 203 | uint64_t tl, th; \ |
| 204 | { \ |
| 205 | uint128_t t = (uint128_t)a * b; \ |
| 206 | th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ |
| 207 | tl = t; \ |
| 208 | } \ |
| 209 | c0 += tl; /* overflow is handled on the next line */ \ |
| 210 | th += (c0 < tl) ? 1 : 0; /* at most 0xFFFFFFFFFFFFFFFF */ \ |
| 211 | c1 += th; /* never overflows by contract (verified in the next line) */ \ |
| 212 | VERIFY_CHECK(c1 >= th); \ |
| 213 | } |
| 214 | |
| 215 | /** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ |
| 216 | #define muladd2(a,b) { \ |
| 217 | uint64_t tl, th, th2, tl2; \ |
| 218 | { \ |
| 219 | uint128_t t = (uint128_t)a * b; \ |
| 220 | th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ |
| 221 | tl = t; \ |
| 222 | } \ |
| 223 | th2 = th + th; /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \ |
| 224 | c2 += (th2 < th) ? 1 : 0; /* never overflows by contract (verified the next line) */ \ |
| 225 | VERIFY_CHECK((th2 >= th) || (c2 != 0)); \ |
| 226 | tl2 = tl + tl; /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \ |
| 227 | th2 += (tl2 < tl) ? 1 : 0; /* at most 0xFFFFFFFFFFFFFFFF */ \ |
| 228 | c0 += tl2; /* overflow is handled on the next line */ \ |
| 229 | th2 += (c0 < tl2) ? 1 : 0; /* second overflow is handled on the next line */ \ |
| 230 | c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \ |
| 231 | VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \ |
| 232 | c1 += th2; /* overflow is handled on the next line */ \ |
| 233 | c2 += (c1 < th2) ? 1 : 0; /* never overflows by contract (verified the next line) */ \ |
| 234 | VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \ |
| 235 | } |
| 236 | |
| 237 | /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */ |
| 238 | #define sumadd(a) { \ |
| 239 | unsigned int over; \ |
| 240 | c0 += (a); /* overflow is handled on the next line */ \ |
| 241 | over = (c0 < (a)) ? 1 : 0; \ |
| 242 | c1 += over; /* overflow is handled on the next line */ \ |
| 243 | c2 += (c1 < over) ? 1 : 0; /* never overflows by contract */ \ |
| 244 | } |
| 245 | |
| 246 | /** Add a to the number defined by (c0,c1). c1 must never overflow, c2 must be zero. */ |
| 247 | #define sumadd_fast(a) { \ |
| 248 | c0 += (a); /* overflow is handled on the next line */ \ |
| 249 | c1 += (c0 < (a)) ? 1 : 0; /* never overflows by contract (verified the next line) */ \ |
| 250 | VERIFY_CHECK((c1 != 0) | (c0 >= (a))); \ |
| 251 | VERIFY_CHECK(c2 == 0); \ |
| 252 | } |
| 253 | |
| 254 | /** Extract the lowest 64 bits of (c0,c1,c2) into n, and left shift the number 64 bits. */ |
| 255 | #define extract(n) { \ |
| 256 | (n) = c0; \ |
| 257 | c0 = c1; \ |
| 258 | c1 = c2; \ |
| 259 | c2 = 0; \ |
| 260 | } |
| 261 | |
| 262 | /** Extract the lowest 64 bits of (c0,c1,c2) into n, and left shift the number 64 bits. c2 is required to be zero. */ |
| 263 | #define extract_fast(n) { \ |
| 264 | (n) = c0; \ |
| 265 | c0 = c1; \ |
| 266 | c1 = 0; \ |
| 267 | VERIFY_CHECK(c2 == 0); \ |
| 268 | } |
| 269 | |
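| | /** Reduce a 512-bit number l (8 limbs, least significant first) modulo n and |
| |  * write the result to r. Writing l = low + 2^256*high and using |
| |  * 2^256 == N_C (mod n), where N_C = 2^256 - n is a 129-bit value, the number |
| |  * is repeatedly folded as low + N_C*high: from 512 bits down to 385, then to |
| |  * 258, then to 256, followed by a final conditional subtraction of n. */ |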
| 270 | static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint64_t *l) { |
| 271 | #ifdef USE_ASM_X86_64 |
| 272 | /* Reduce 512 bits into 385. */ |
| 273 | uint64_t m0, m1, m2, m3, m4, m5, m6; |
| 274 | uint64_t p0, p1, p2, p3, p4; |
| 275 | uint64_t c; |
| 276 | |
| 277 | __asm__ __volatile__( |
| 278 | /* Preload. */ |
| 279 | "movq 32(%%rsi), %%r11\n" |
| 280 | "movq 40(%%rsi), %%r12\n" |
| 281 | "movq 48(%%rsi), %%r13\n" |
| 282 | "movq 56(%%rsi), %%r14\n" |
| 283 | /* Initialize r8,r9,r10 */ |
| 284 | "movq 0(%%rsi), %%r8\n" |
| 285 | "movq $0, %%r9\n" |
| 286 | "movq $0, %%r10\n" |
| 287 | /* (r8,r9) += n0 * c0 */ |
| 288 | "movq %8, %%rax\n" |
| 289 | "mulq %%r11\n" |
| 290 | "addq %%rax, %%r8\n" |
| 291 | "adcq %%rdx, %%r9\n" |
| 292 | /* extract m0 */ |
| 293 | "movq %%r8, %q0\n" |
| 294 | "movq $0, %%r8\n" |
| 295 | /* (r9,r10) += l1 */ |
| 296 | "addq 8(%%rsi), %%r9\n" |
| 297 | "adcq $0, %%r10\n" |
| 298 | /* (r9,r10,r8) += n1 * c0 */ |
| 299 | "movq %8, %%rax\n" |
| 300 | "mulq %%r12\n" |
| 301 | "addq %%rax, %%r9\n" |
| 302 | "adcq %%rdx, %%r10\n" |
| 303 | "adcq $0, %%r8\n" |
| 304 | /* (r9,r10,r8) += n0 * c1 */ |
| 305 | "movq %9, %%rax\n" |
| 306 | "mulq %%r11\n" |
| 307 | "addq %%rax, %%r9\n" |
| 308 | "adcq %%rdx, %%r10\n" |
| 309 | "adcq $0, %%r8\n" |
| 310 | /* extract m1 */ |
| 311 | "movq %%r9, %q1\n" |
| 312 | "movq $0, %%r9\n" |
| 313 | /* (r10,r8,r9) += l2 */ |
| 314 | "addq 16(%%rsi), %%r10\n" |
| 315 | "adcq $0, %%r8\n" |
| 316 | "adcq $0, %%r9\n" |
| 317 | /* (r10,r8,r9) += n2 * c0 */ |
| 318 | "movq %8, %%rax\n" |
| 319 | "mulq %%r13\n" |
| 320 | "addq %%rax, %%r10\n" |
| 321 | "adcq %%rdx, %%r8\n" |
| 322 | "adcq $0, %%r9\n" |
| 323 | /* (r10,r8,r9) += n1 * c1 */ |
| 324 | "movq %9, %%rax\n" |
| 325 | "mulq %%r12\n" |
| 326 | "addq %%rax, %%r10\n" |
| 327 | "adcq %%rdx, %%r8\n" |
| 328 | "adcq $0, %%r9\n" |
| 329 | /* (r10,r8,r9) += n0 */ |
| 330 | "addq %%r11, %%r10\n" |
| 331 | "adcq $0, %%r8\n" |
| 332 | "adcq $0, %%r9\n" |
| 333 | /* extract m2 */ |
| 334 | "movq %%r10, %q2\n" |
| 335 | "movq $0, %%r10\n" |
| 336 | /* (r8,r9,r10) += l3 */ |
| 337 | "addq 24(%%rsi), %%r8\n" |
| 338 | "adcq $0, %%r9\n" |
| 339 | "adcq $0, %%r10\n" |
| 340 | /* (r8,r9,r10) += n3 * c0 */ |
| 341 | "movq %8, %%rax\n" |
| 342 | "mulq %%r14\n" |
| 343 | "addq %%rax, %%r8\n" |
| 344 | "adcq %%rdx, %%r9\n" |
| 345 | "adcq $0, %%r10\n" |
| 346 | /* (r8,r9,r10) += n2 * c1 */ |
| 347 | "movq %9, %%rax\n" |
| 348 | "mulq %%r13\n" |
| 349 | "addq %%rax, %%r8\n" |
| 350 | "adcq %%rdx, %%r9\n" |
| 351 | "adcq $0, %%r10\n" |
| 352 | /* (r8,r9,r10) += n1 */ |
| 353 | "addq %%r12, %%r8\n" |
| 354 | "adcq $0, %%r9\n" |
| 355 | "adcq $0, %%r10\n" |
| 356 | /* extract m3 */ |
| 357 | "movq %%r8, %q3\n" |
| 358 | "movq $0, %%r8\n" |
| 359 | /* (r9,r10,r8) += n3 * c1 */ |
| 360 | "movq %9, %%rax\n" |
| 361 | "mulq %%r14\n" |
| 362 | "addq %%rax, %%r9\n" |
| 363 | "adcq %%rdx, %%r10\n" |
| 364 | "adcq $0, %%r8\n" |
| 365 | /* (r9,r10,r8) += n2 */ |
| 366 | "addq %%r13, %%r9\n" |
| 367 | "adcq $0, %%r10\n" |
| 368 | "adcq $0, %%r8\n" |
| 369 | /* extract m4 */ |
| 370 | "movq %%r9, %q4\n" |
| 371 | /* (r10,r8) += n3 */ |
| 372 | "addq %%r14, %%r10\n" |
| 373 | "adcq $0, %%r8\n" |
| 374 | /* extract m5 */ |
| 375 | "movq %%r10, %q5\n" |
| 376 | /* extract m6 */ |
| 377 | "movq %%r8, %q6\n" |
| 378 | : "=g"(m0), "=g"(m1), "=g"(m2), "=g"(m3), "=g"(m4), "=g"(m5), "=g"(m6) |
| 379 | : "S"(l), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1) |
| 380 | : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc"); |
| 381 | |
| 382 | /* Reduce 385 bits into 258. */ |
| 383 | __asm__ __volatile__( |
| 384 | /* Preload */ |
| 385 | "movq %q9, %%r11\n" |
| 386 | "movq %q10, %%r12\n" |
| 387 | "movq %q11, %%r13\n" |
| 388 | /* Initialize (r8,r9,r10) */ |
| 389 | "movq %q5, %%r8\n" |
| 390 | "movq $0, %%r9\n" |
| 391 | "movq $0, %%r10\n" |
| 392 | /* (r8,r9) += m4 * c0 */ |
| 393 | "movq %12, %%rax\n" |
| 394 | "mulq %%r11\n" |
| 395 | "addq %%rax, %%r8\n" |
| 396 | "adcq %%rdx, %%r9\n" |
| 397 | /* extract p0 */ |
| 398 | "movq %%r8, %q0\n" |
| 399 | "movq $0, %%r8\n" |
| 400 | /* (r9,r10) += m1 */ |
| 401 | "addq %q6, %%r9\n" |
| 402 | "adcq $0, %%r10\n" |
| 403 | /* (r9,r10,r8) += m5 * c0 */ |
| 404 | "movq %12, %%rax\n" |
| 405 | "mulq %%r12\n" |
| 406 | "addq %%rax, %%r9\n" |
| 407 | "adcq %%rdx, %%r10\n" |
| 408 | "adcq $0, %%r8\n" |
| 409 | /* (r9,r10,r8) += m4 * c1 */ |
| 410 | "movq %13, %%rax\n" |
| 411 | "mulq %%r11\n" |
| 412 | "addq %%rax, %%r9\n" |
| 413 | "adcq %%rdx, %%r10\n" |
| 414 | "adcq $0, %%r8\n" |
| 415 | /* extract p1 */ |
| 416 | "movq %%r9, %q1\n" |
| 417 | "movq $0, %%r9\n" |
| 418 | /* (r10,r8,r9) += m2 */ |
| 419 | "addq %q7, %%r10\n" |
| 420 | "adcq $0, %%r8\n" |
| 421 | "adcq $0, %%r9\n" |
| 422 | /* (r10,r8,r9) += m6 * c0 */ |
| 423 | "movq %12, %%rax\n" |
| 424 | "mulq %%r13\n" |
| 425 | "addq %%rax, %%r10\n" |
| 426 | "adcq %%rdx, %%r8\n" |
| 427 | "adcq $0, %%r9\n" |
| 428 | /* (r10,r8,r9) += m5 * c1 */ |
| 429 | "movq %13, %%rax\n" |
| 430 | "mulq %%r12\n" |
| 431 | "addq %%rax, %%r10\n" |
| 432 | "adcq %%rdx, %%r8\n" |
| 433 | "adcq $0, %%r9\n" |
| 434 | /* (r10,r8,r9) += m4 */ |
| 435 | "addq %%r11, %%r10\n" |
| 436 | "adcq $0, %%r8\n" |
| 437 | "adcq $0, %%r9\n" |
| 438 | /* extract p2 */ |
| 439 | "movq %%r10, %q2\n" |
| 440 | /* (r8,r9) += m3 */ |
| 441 | "addq %q8, %%r8\n" |
| 442 | "adcq $0, %%r9\n" |
| 443 | /* (r8,r9) += m6 * c1 */ |
| 444 | "movq %13, %%rax\n" |
| 445 | "mulq %%r13\n" |
| 446 | "addq %%rax, %%r8\n" |
| 447 | "adcq %%rdx, %%r9\n" |
| 448 | /* (r8,r9) += m5 */ |
| 449 | "addq %%r12, %%r8\n" |
| 450 | "adcq $0, %%r9\n" |
| 451 | /* extract p3 */ |
| 452 | "movq %%r8, %q3\n" |
| 453 | /* (r9) += m6 */ |
| 454 | "addq %%r13, %%r9\n" |
| 455 | /* extract p4 */ |
| 456 | "movq %%r9, %q4\n" |
| 457 | : "=&g"(p0), "=&g"(p1), "=&g"(p2), "=g"(p3), "=g"(p4) |
| 458 | : "g"(m0), "g"(m1), "g"(m2), "g"(m3), "g"(m4), "g"(m5), "g"(m6), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1) |
| 459 | : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "cc"); |
| 460 | |
| 461 | /* Reduce 258 bits into 256. */ |
| 462 | __asm__ __volatile__( |
| 463 | /* Preload */ |
| 464 | "movq %q5, %%r10\n" |
| 465 | /* (rax,rdx) = p4 * c0 */ |
| 466 | "movq %7, %%rax\n" |
| 467 | "mulq %%r10\n" |
| 468 | /* (rax,rdx) += p0 */ |
| 469 | "addq %q1, %%rax\n" |
| 470 | "adcq $0, %%rdx\n" |
| 471 | /* extract r0 */ |
| 472 | "movq %%rax, 0(%q6)\n" |
| 473 | /* Move to (r8,r9) */ |
| 474 | "movq %%rdx, %%r8\n" |
| 475 | "movq $0, %%r9\n" |
| 476 | /* (r8,r9) += p1 */ |
| 477 | "addq %q2, %%r8\n" |
| 478 | "adcq $0, %%r9\n" |
| 479 | /* (r8,r9) += p4 * c1 */ |
| 480 | "movq %8, %%rax\n" |
| 481 | "mulq %%r10\n" |
| 482 | "addq %%rax, %%r8\n" |
| 483 | "adcq %%rdx, %%r9\n" |
| 484 | /* Extract r1 */ |
| 485 | "movq %%r8, 8(%q6)\n" |
| 486 | "movq $0, %%r8\n" |
| 487 | /* (r9,r8) += p4 */ |
| 488 | "addq %%r10, %%r9\n" |
| 489 | "adcq $0, %%r8\n" |
| 490 | /* (r9,r8) += p2 */ |
| 491 | "addq %q3, %%r9\n" |
| 492 | "adcq $0, %%r8\n" |
| 493 | /* Extract r2 */ |
| 494 | "movq %%r9, 16(%q6)\n" |
| 495 | "movq $0, %%r9\n" |
| 496 | /* (r8,r9) += p3 */ |
| 497 | "addq %q4, %%r8\n" |
| 498 | "adcq $0, %%r9\n" |
| 499 | /* Extract r3 */ |
| 500 | "movq %%r8, 24(%q6)\n" |
| 501 | /* Extract c */ |
| 502 | "movq %%r9, %q0\n" |
| 503 | : "=g"(c) |
| 504 | : "g"(p0), "g"(p1), "g"(p2), "g"(p3), "g"(p4), "D"(r), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1) |
| 505 | : "rax", "rdx", "r8", "r9", "r10", "cc", "memory"); |
| 506 | #else |
| 507 | uint128_t c; |
| 508 | uint64_t c0, c1, c2; |
| 509 | uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7]; |
| 510 | uint64_t m0, m1, m2, m3, m4, m5; |
| 511 | uint32_t m6; |
| 512 | uint64_t p0, p1, p2, p3; |
| 513 | uint32_t p4; |
| 514 | |
| 515 | /* Reduce 512 bits into 385. */ |
| 516 | /* m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C. */ |
| 517 | c0 = l[0]; c1 = 0; c2 = 0; |
| 518 | muladd_fast(n0, SECP256K1_N_C_0); |
| 519 | extract_fast(m0); |
| 520 | sumadd_fast(l[1]); |
| 521 | muladd(n1, SECP256K1_N_C_0); |
| 522 | muladd(n0, SECP256K1_N_C_1); |
| 523 | extract(m1); |
| 524 | sumadd(l[2]); |
| 525 | muladd(n2, SECP256K1_N_C_0); |
| 526 | muladd(n1, SECP256K1_N_C_1); |
| 527 | sumadd(n0); |
| 528 | extract(m2); |
| 529 | sumadd(l[3]); |
| 530 | muladd(n3, SECP256K1_N_C_0); |
| 531 | muladd(n2, SECP256K1_N_C_1); |
| 532 | sumadd(n1); |
| 533 | extract(m3); |
| 534 | muladd(n3, SECP256K1_N_C_1); |
| 535 | sumadd(n2); |
| 536 | extract(m4); |
| 537 | sumadd_fast(n3); |
| 538 | extract_fast(m5); |
| 539 | VERIFY_CHECK(c0 <= 1); |
| 540 | m6 = c0; |
| 541 | |
| 542 | /* Reduce 385 bits into 258. */ |
| 543 | /* p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C. */ |
| 544 | c0 = m0; c1 = 0; c2 = 0; |
| 545 | muladd_fast(m4, SECP256K1_N_C_0); |
| 546 | extract_fast(p0); |
| 547 | sumadd_fast(m1); |
| 548 | muladd(m5, SECP256K1_N_C_0); |
| 549 | muladd(m4, SECP256K1_N_C_1); |
| 550 | extract(p1); |
| 551 | sumadd(m2); |
| 552 | muladd(m6, SECP256K1_N_C_0); |
| 553 | muladd(m5, SECP256K1_N_C_1); |
| 554 | sumadd(m4); |
| 555 | extract(p2); |
| 556 | sumadd_fast(m3); |
| 557 | muladd_fast(m6, SECP256K1_N_C_1); |
| 558 | sumadd_fast(m5); |
| 559 | extract_fast(p3); |
| 560 | p4 = c0 + m6; |
| 561 | VERIFY_CHECK(p4 <= 2); |
| 562 | |
| 563 | /* Reduce 258 bits into 256. */ |
| 564 | /* r[0..3] = p[0..3] + p[4] * SECP256K1_N_C. */ |
| 565 | c = p0 + (uint128_t)SECP256K1_N_C_0 * p4; |
| 566 | r->d[0] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64; |
| 567 | c += p1 + (uint128_t)SECP256K1_N_C_1 * p4; |
| 568 | r->d[1] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64; |
| 569 | c += p2 + (uint128_t)p4; |
| 570 | r->d[2] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64; |
| 571 | c += p3; |
| 572 | r->d[3] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64; |
| 573 | #endif |
| 574 | |
| 575 | /* Final reduction of r. */ |
| 576 | secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r)); |
| 577 | } |
| 578 | |
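| | /** Multiply two scalars into a full 512-bit product l[0..7] (least significant |
| |  * limb first) without modular reduction. The C fallback works column by |
| |  * column: for output limb k it accumulates every partial product |
| |  * a->d[i]*b->d[j] with i + j == k into (c0,c1,c2) before extracting. */ |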
| 579 | static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) { |
| 580 | #ifdef USE_ASM_X86_64 |
| 581 | const uint64_t *pb = b->d; |
| 582 | __asm__ __volatile__( |
| 583 | /* Preload */ |
| 584 | "movq 0(%%rdi), %%r15\n" |
| 585 | "movq 8(%%rdi), %%rbx\n" |
| 586 | "movq 16(%%rdi), %%rcx\n" |
| 587 | "movq 0(%%rdx), %%r11\n" |
| 588 | "movq 8(%%rdx), %%r12\n" |
| 589 | "movq 16(%%rdx), %%r13\n" |
| 590 | "movq 24(%%rdx), %%r14\n" |
| 591 | /* (rax,rdx) = a0 * b0 */ |
| 592 | "movq %%r15, %%rax\n" |
| 593 | "mulq %%r11\n" |
| 594 | /* Extract l0 */ |
| 595 | "movq %%rax, 0(%%rsi)\n" |
| 596 | /* (r8,r9,r10) = (rdx) */ |
| 597 | "movq %%rdx, %%r8\n" |
| 598 | "xorq %%r9, %%r9\n" |
| 599 | "xorq %%r10, %%r10\n" |
| 600 | /* (r8,r9,r10) += a0 * b1 */ |
| 601 | "movq %%r15, %%rax\n" |
| 602 | "mulq %%r12\n" |
| 603 | "addq %%rax, %%r8\n" |
| 604 | "adcq %%rdx, %%r9\n" |
| 605 | "adcq $0, %%r10\n" |
| 606 | /* (r8,r9,r10) += a1 * b0 */ |
| 607 | "movq %%rbx, %%rax\n" |
| 608 | "mulq %%r11\n" |
| 609 | "addq %%rax, %%r8\n" |
| 610 | "adcq %%rdx, %%r9\n" |
| 611 | "adcq $0, %%r10\n" |
| 612 | /* Extract l1 */ |
| 613 | "movq %%r8, 8(%%rsi)\n" |
| 614 | "xorq %%r8, %%r8\n" |
| 615 | /* (r9,r10,r8) += a0 * b2 */ |
| 616 | "movq %%r15, %%rax\n" |
| 617 | "mulq %%r13\n" |
| 618 | "addq %%rax, %%r9\n" |
| 619 | "adcq %%rdx, %%r10\n" |
| 620 | "adcq $0, %%r8\n" |
| 621 | /* (r9,r10,r8) += a1 * b1 */ |
| 622 | "movq %%rbx, %%rax\n" |
| 623 | "mulq %%r12\n" |
| 624 | "addq %%rax, %%r9\n" |
| 625 | "adcq %%rdx, %%r10\n" |
| 626 | "adcq $0, %%r8\n" |
| 627 | /* (r9,r10,r8) += a2 * b0 */ |
| 628 | "movq %%rcx, %%rax\n" |
| 629 | "mulq %%r11\n" |
| 630 | "addq %%rax, %%r9\n" |
| 631 | "adcq %%rdx, %%r10\n" |
| 632 | "adcq $0, %%r8\n" |
| 633 | /* Extract l2 */ |
| 634 | "movq %%r9, 16(%%rsi)\n" |
| 635 | "xorq %%r9, %%r9\n" |
| 636 | /* (r10,r8,r9) += a0 * b3 */ |
| 637 | "movq %%r15, %%rax\n" |
| 638 | "mulq %%r14\n" |
| 639 | "addq %%rax, %%r10\n" |
| 640 | "adcq %%rdx, %%r8\n" |
| 641 | "adcq $0, %%r9\n" |
| 642 | /* Preload a3 */ |
| 643 | "movq 24(%%rdi), %%r15\n" |
| 644 | /* (r10,r8,r9) += a1 * b2 */ |
| 645 | "movq %%rbx, %%rax\n" |
| 646 | "mulq %%r13\n" |
| 647 | "addq %%rax, %%r10\n" |
| 648 | "adcq %%rdx, %%r8\n" |
| 649 | "adcq $0, %%r9\n" |
| 650 | /* (r10,r8,r9) += a2 * b1 */ |
| 651 | "movq %%rcx, %%rax\n" |
| 652 | "mulq %%r12\n" |
| 653 | "addq %%rax, %%r10\n" |
| 654 | "adcq %%rdx, %%r8\n" |
| 655 | "adcq $0, %%r9\n" |
| 656 | /* (r10,r8,r9) += a3 * b0 */ |
| 657 | "movq %%r15, %%rax\n" |
| 658 | "mulq %%r11\n" |
| 659 | "addq %%rax, %%r10\n" |
| 660 | "adcq %%rdx, %%r8\n" |
| 661 | "adcq $0, %%r9\n" |
| 662 | /* Extract l3 */ |
| 663 | "movq %%r10, 24(%%rsi)\n" |
| 664 | "xorq %%r10, %%r10\n" |
| 665 | /* (r8,r9,r10) += a1 * b3 */ |
| 666 | "movq %%rbx, %%rax\n" |
| 667 | "mulq %%r14\n" |
| 668 | "addq %%rax, %%r8\n" |
| 669 | "adcq %%rdx, %%r9\n" |
| 670 | "adcq $0, %%r10\n" |
| 671 | /* (r8,r9,r10) += a2 * b2 */ |
| 672 | "movq %%rcx, %%rax\n" |
| 673 | "mulq %%r13\n" |
| 674 | "addq %%rax, %%r8\n" |
| 675 | "adcq %%rdx, %%r9\n" |
| 676 | "adcq $0, %%r10\n" |
| 677 | /* (r8,r9,r10) += a3 * b1 */ |
| 678 | "movq %%r15, %%rax\n" |
| 679 | "mulq %%r12\n" |
| 680 | "addq %%rax, %%r8\n" |
| 681 | "adcq %%rdx, %%r9\n" |
| 682 | "adcq $0, %%r10\n" |
| 683 | /* Extract l4 */ |
| 684 | "movq %%r8, 32(%%rsi)\n" |
| 685 | "xorq %%r8, %%r8\n" |
| 686 | /* (r9,r10,r8) += a2 * b3 */ |
| 687 | "movq %%rcx, %%rax\n" |
| 688 | "mulq %%r14\n" |
| 689 | "addq %%rax, %%r9\n" |
| 690 | "adcq %%rdx, %%r10\n" |
| 691 | "adcq $0, %%r8\n" |
| 692 | /* (r9,r10,r8) += a3 * b2 */ |
| 693 | "movq %%r15, %%rax\n" |
| 694 | "mulq %%r13\n" |
| 695 | "addq %%rax, %%r9\n" |
| 696 | "adcq %%rdx, %%r10\n" |
| 697 | "adcq $0, %%r8\n" |
| 698 | /* Extract l5 */ |
| 699 | "movq %%r9, 40(%%rsi)\n" |
| 700 | /* (r10,r8) += a3 * b3 */ |
| 701 | "movq %%r15, %%rax\n" |
| 702 | "mulq %%r14\n" |
| 703 | "addq %%rax, %%r10\n" |
| 704 | "adcq %%rdx, %%r8\n" |
| 705 | /* Extract l6 */ |
| 706 | "movq %%r10, 48(%%rsi)\n" |
| 707 | /* Extract l7 */ |
| 708 | "movq %%r8, 56(%%rsi)\n" |
| 709 | : "+d"(pb) |
| 710 | : "S"(l), "D"(a->d) |
| 711 | : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory"); |
| 712 | #else |
| 713 | /* 160-bit accumulator. */ |
| 714 | uint64_t c0 = 0, c1 = 0; |
| 715 | uint32_t c2 = 0; |
| 716 | |
| 717 | /* l[0..7] = a[0..3] * b[0..3]. */ |
| 718 | muladd_fast(a->d[0], b->d[0]); |
| 719 | extract_fast(l[0]); |
| 720 | muladd(a->d[0], b->d[1]); |
| 721 | muladd(a->d[1], b->d[0]); |
| 722 | extract(l[1]); |
| 723 | muladd(a->d[0], b->d[2]); |
| 724 | muladd(a->d[1], b->d[1]); |
| 725 | muladd(a->d[2], b->d[0]); |
| 726 | extract(l[2]); |
| 727 | muladd(a->d[0], b->d[3]); |
| 728 | muladd(a->d[1], b->d[2]); |
| 729 | muladd(a->d[2], b->d[1]); |
| 730 | muladd(a->d[3], b->d[0]); |
| 731 | extract(l[3]); |
| 732 | muladd(a->d[1], b->d[3]); |
| 733 | muladd(a->d[2], b->d[2]); |
| 734 | muladd(a->d[3], b->d[1]); |
| 735 | extract(l[4]); |
| 736 | muladd(a->d[2], b->d[3]); |
| 737 | muladd(a->d[3], b->d[2]); |
| 738 | extract(l[5]); |
| 739 | muladd_fast(a->d[3], b->d[3]); |
| 740 | extract_fast(l[6]); |
| 741 | VERIFY_CHECK(c1 == 0); |
| 742 | l[7] = c0; |
| 743 | #endif |
| 744 | } |
| 745 | |
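| | /** Square a scalar into a 512-bit result l[0..7] without modular reduction. |
| |  * Same column-wise scheme as secp256k1_scalar_mul_512, except that each |
| |  * off-diagonal product a->d[i]*a->d[j] (i != j) occurs twice, so the C |
| |  * fallback accumulates it once via muladd2 instead of two muladd calls. */ |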
| 746 | static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar_t *a) { |
| 747 | #ifdef USE_ASM_X86_64 |
| 748 | __asm__ __volatile__( |
| 749 | /* Preload */ |
| 750 | "movq 0(%%rdi), %%r11\n" |
| 751 | "movq 8(%%rdi), %%r12\n" |
| 752 | "movq 16(%%rdi), %%r13\n" |
| 753 | "movq 24(%%rdi), %%r14\n" |
| 754 | /* (rax,rdx) = a0 * a0 */ |
| 755 | "movq %%r11, %%rax\n" |
| 756 | "mulq %%r11\n" |
| 757 | /* Extract l0 */ |
| 758 | "movq %%rax, 0(%%rsi)\n" |
| 759 | /* (r8,r9,r10) = (rdx,0) */ |
| 760 | "movq %%rdx, %%r8\n" |
| 761 | "xorq %%r9, %%r9\n" |
| 762 | "xorq %%r10, %%r10\n" |
| 763 | /* (r8,r9,r10) += 2 * a0 * a1 */ |
| 764 | "movq %%r11, %%rax\n" |
| 765 | "mulq %%r12\n" |
| 766 | "addq %%rax, %%r8\n" |
| 767 | "adcq %%rdx, %%r9\n" |
| 768 | "adcq $0, %%r10\n" |
| 769 | "addq %%rax, %%r8\n" |
| 770 | "adcq %%rdx, %%r9\n" |
| 771 | "adcq $0, %%r10\n" |
| 772 | /* Extract l1 */ |
| 773 | "movq %%r8, 8(%%rsi)\n" |
| 774 | "xorq %%r8, %%r8\n" |
| 775 | /* (r9,r10,r8) += 2 * a0 * a2 */ |
| 776 | "movq %%r11, %%rax\n" |
| 777 | "mulq %%r13\n" |
| 778 | "addq %%rax, %%r9\n" |
| 779 | "adcq %%rdx, %%r10\n" |
| 780 | "adcq $0, %%r8\n" |
| 781 | "addq %%rax, %%r9\n" |
| 782 | "adcq %%rdx, %%r10\n" |
| 783 | "adcq $0, %%r8\n" |
| 784 | /* (r9,r10,r8) += a1 * a1 */ |
| 785 | "movq %%r12, %%rax\n" |
| 786 | "mulq %%r12\n" |
| 787 | "addq %%rax, %%r9\n" |
| 788 | "adcq %%rdx, %%r10\n" |
| 789 | "adcq $0, %%r8\n" |
| 790 | /* Extract l2 */ |
| 791 | "movq %%r9, 16(%%rsi)\n" |
| 792 | "xorq %%r9, %%r9\n" |
| 793 | /* (r10,r8,r9) += 2 * a0 * a3 */ |
| 794 | "movq %%r11, %%rax\n" |
| 795 | "mulq %%r14\n" |
| 796 | "addq %%rax, %%r10\n" |
| 797 | "adcq %%rdx, %%r8\n" |
| 798 | "adcq $0, %%r9\n" |
| 799 | "addq %%rax, %%r10\n" |
| 800 | "adcq %%rdx, %%r8\n" |
| 801 | "adcq $0, %%r9\n" |
| 802 | /* (r10,r8,r9) += 2 * a1 * a2 */ |
| 803 | "movq %%r12, %%rax\n" |
| 804 | "mulq %%r13\n" |
| 805 | "addq %%rax, %%r10\n" |
| 806 | "adcq %%rdx, %%r8\n" |
| 807 | "adcq $0, %%r9\n" |
| 808 | "addq %%rax, %%r10\n" |
| 809 | "adcq %%rdx, %%r8\n" |
| 810 | "adcq $0, %%r9\n" |
| 811 | /* Extract l3 */ |
| 812 | "movq %%r10, 24(%%rsi)\n" |
| 813 | "xorq %%r10, %%r10\n" |
| 814 | /* (r8,r9,r10) += 2 * a1 * a3 */ |
| 815 | "movq %%r12, %%rax\n" |
| 816 | "mulq %%r14\n" |
| 817 | "addq %%rax, %%r8\n" |
| 818 | "adcq %%rdx, %%r9\n" |
| 819 | "adcq $0, %%r10\n" |
| 820 | "addq %%rax, %%r8\n" |
| 821 | "adcq %%rdx, %%r9\n" |
| 822 | "adcq $0, %%r10\n" |
| 823 | /* (r8,r9,r10) += a2 * a2 */ |
| 824 | "movq %%r13, %%rax\n" |
| 825 | "mulq %%r13\n" |
| 826 | "addq %%rax, %%r8\n" |
| 827 | "adcq %%rdx, %%r9\n" |
| 828 | "adcq $0, %%r10\n" |
| 829 | /* Extract l4 */ |
| 830 | "movq %%r8, 32(%%rsi)\n" |
| 831 | "xorq %%r8, %%r8\n" |
| 832 | /* (r9,r10,r8) += 2 * a2 * a3 */ |
| 833 | "movq %%r13, %%rax\n" |
| 834 | "mulq %%r14\n" |
| 835 | "addq %%rax, %%r9\n" |
| 836 | "adcq %%rdx, %%r10\n" |
| 837 | "adcq $0, %%r8\n" |
| 838 | "addq %%rax, %%r9\n" |
| 839 | "adcq %%rdx, %%r10\n" |
| 840 | "adcq $0, %%r8\n" |
| 841 | /* Extract l5 */ |
| 842 | "movq %%r9, 40(%%rsi)\n" |
| 843 | /* (r10,r8) += a3 * a3 */ |
| 844 | "movq %%r14, %%rax\n" |
| 845 | "mulq %%r14\n" |
| 846 | "addq %%rax, %%r10\n" |
| 847 | "adcq %%rdx, %%r8\n" |
| 848 | /* Extract l6 */ |
| 849 | "movq %%r10, 48(%%rsi)\n" |
| 850 | /* Extract l7 */ |
| 851 | "movq %%r8, 56(%%rsi)\n" |
| 852 | : |
| 853 | : "S"(l), "D"(a->d) |
| 854 | : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory"); |
| 855 | #else |
| 856 | /* 160-bit accumulator. */ |
| 857 | uint64_t c0 = 0, c1 = 0; |
| 858 | uint32_t c2 = 0; |
| 859 | |
| 860 | /* l[0..7] = a[0..3] * a[0..3]. */ |
| 861 | muladd_fast(a->d[0], a->d[0]); |
| 862 | extract_fast(l[0]); |
| 863 | muladd2(a->d[0], a->d[1]); |
| 864 | extract(l[1]); |
| 865 | muladd2(a->d[0], a->d[2]); |
| 866 | muladd(a->d[1], a->d[1]); |
| 867 | extract(l[2]); |
| 868 | muladd2(a->d[0], a->d[3]); |
| 869 | muladd2(a->d[1], a->d[2]); |
| 870 | extract(l[3]); |
| 871 | muladd2(a->d[1], a->d[3]); |
| 872 | muladd(a->d[2], a->d[2]); |
| 873 | extract(l[4]); |
| 874 | muladd2(a->d[2], a->d[3]); |
| 875 | extract(l[5]); |
| 876 | muladd_fast(a->d[3], a->d[3]); |
| 877 | extract_fast(l[6]); |
| 878 | VERIFY_CHECK(c1 == 0); |
| 879 | l[7] = c0; |
| 880 | #endif |
| 881 | } |
| 882 | |
| 883 | #undef sumadd |
| 884 | #undef sumadd_fast |
| 885 | #undef muladd |
| 886 | #undef muladd_fast |
| 887 | #undef muladd2 |
| 888 | #undef extract |
| 889 | #undef extract_fast |
| 890 | |
| 891 | static void secp256k1_scalar_mul(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) { |
| 892 | uint64_t l[8]; |
| 893 | secp256k1_scalar_mul_512(l, a, b); |
| 894 | secp256k1_scalar_reduce_512(r, l); |
| 895 | } |
| 896 | |
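| | /** Shift r right by n bits (1 <= n <= 15) and return the bits shifted out, |
| |  * i.e. the old value of r->d[0] & ((1 << n) - 1). */ |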
| 897 | static int secp256k1_scalar_shr_int(secp256k1_scalar_t *r, int n) { |
| 898 | int ret; |
| 899 | VERIFY_CHECK(n > 0); |
| 900 | VERIFY_CHECK(n < 16); |
| 901 | ret = r->d[0] & ((1 << n) - 1); |
| 902 | r->d[0] = (r->d[0] >> n) + (r->d[1] << (64 - n)); |
| 903 | r->d[1] = (r->d[1] >> n) + (r->d[2] << (64 - n)); |
| 904 | r->d[2] = (r->d[2] >> n) + (r->d[3] << (64 - n)); |
| 905 | r->d[3] = (r->d[3] >> n); |
| 906 | return ret; |
| 907 | } |
| 908 | |
| 909 | static void secp256k1_scalar_sqr(secp256k1_scalar_t *r, const secp256k1_scalar_t *a) { |
| 910 | uint64_t l[8]; |
| 911 | secp256k1_scalar_sqr_512(l, a); |
| 912 | secp256k1_scalar_reduce_512(r, l); |
| 913 | } |
| 914 | |
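| | /** Split a into its low and high 128-bit halves, so that |
| |  * a == r1 + 2^128 * r2 holds as an integer identity (no reduction mod n). */ |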
| 915 | static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a) { |
| 916 | r1->d[0] = a->d[0]; |
| 917 | r1->d[1] = a->d[1]; |
| 918 | r1->d[2] = 0; |
| 919 | r1->d[3] = 0; |
| 920 | r2->d[0] = a->d[2]; |
| 921 | r2->d[1] = a->d[3]; |
| 922 | r2->d[2] = 0; |
| 923 | r2->d[3] = 0; |
| 924 | } |
| 925 | |
| 926 | SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) { |
| 927 | return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3])) == 0; |
| 928 | } |
| 929 | |
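| | /** r = (a * b) >> shift, for shift >= 256 so that the result fits in 256 bits. |
| |  * The 512-bit product is shifted right by `shift`, and the final cadd_bit |
| |  * adds bit (shift - 1) of the product so the quotient is rounded rather than |
| |  * truncated. The _var suffix presumably marks this as a variable-time |
| |  * function, in line with the naming used elsewhere in the library. */ |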
| 930 | SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b, unsigned int shift) { |
| 931 | uint64_t l[8]; |
| 932 | unsigned int shiftlimbs; |
| 933 | unsigned int shiftlow; |
| 934 | unsigned int shifthigh; |
| 935 | VERIFY_CHECK(shift >= 256); |
| 936 | secp256k1_scalar_mul_512(l, a, b); |
| 937 | shiftlimbs = shift >> 6; |
| 938 | shiftlow = shift & 0x3F; |
| 939 | shifthigh = 64 - shiftlow; |
| 940 | r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0; |
| 941 | r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0; |
| 942 | r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0; |
| 943 | r->d[3] = shift < 320 ? (l[3 + shiftlimbs] >> shiftlow) : 0; |
| 944 | secp256k1_scalar_cadd_bit(r, 0, (l[(shift - 1) >> 6] >> ((shift - 1) & 0x3f)) & 1); |
| 945 | } |
| 946 | |
| 947 | #endif |