1 /***********************************************************************
2 * Copyright (c) 2020 Peter Dettman *
3 * Distributed under the MIT software license, see the accompanying *
4 * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
5 **********************************************************************/
7 #ifndef SECP256K1_MODINV32_IMPL_H
8 #define SECP256K1_MODINV32_IMPL_H
16 /* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
17 * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
19 * For an explanation of the algorithm, see doc/safegcd_implementation.md. This file contains an
20 * implementation for N=30, using 30-bit signed limbs represented as int32_t.
23 /* Take as input a signed30 number in range (-2*modulus,modulus), and add a multiple of the modulus
24 * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the
25 * process. The input must have limbs in range (-2^30,2^30). The output will have limbs in range
27 static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int32_t sign, const secp256k1_modinv32_modinfo *modinfo) {
28 const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
29 int32_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4],
30 r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8];
31 int32_t cond_add, cond_negate;
33 /* In a first step, add the modulus if the input is negative, and then negate if requested.
34 * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input
35 * limbs are in range (-2^30,2^30), this cannot overflow an int32_t. Note that the right
36 * shifts below are signed sign-extending shifts (see assumptions.h for tests that that is
37 * indeed the behavior of the right shift operator). */
39 r0 += modinfo->modulus.v[0] & cond_add;
40 r1 += modinfo->modulus.v[1] & cond_add;
41 r2 += modinfo->modulus.v[2] & cond_add;
42 r3 += modinfo->modulus.v[3] & cond_add;
43 r4 += modinfo->modulus.v[4] & cond_add;
44 r5 += modinfo->modulus.v[5] & cond_add;
45 r6 += modinfo->modulus.v[6] & cond_add;
46 r7 += modinfo->modulus.v[7] & cond_add;
47 r8 += modinfo->modulus.v[8] & cond_add;
48 cond_negate = sign >> 31;
49 r0 = (r0 ^ cond_negate) - cond_negate;
50 r1 = (r1 ^ cond_negate) - cond_negate;
51 r2 = (r2 ^ cond_negate) - cond_negate;
52 r3 = (r3 ^ cond_negate) - cond_negate;
53 r4 = (r4 ^ cond_negate) - cond_negate;
54 r5 = (r5 ^ cond_negate) - cond_negate;
55 r6 = (r6 ^ cond_negate) - cond_negate;
56 r7 = (r7 ^ cond_negate) - cond_negate;
57 r8 = (r8 ^ cond_negate) - cond_negate;
58 /* Propagate the top bits, to bring limbs back to range (-2^30,2^30). */
59 r1 += r0 >> 30; r0 &= M30;
60 r2 += r1 >> 30; r1 &= M30;
61 r3 += r2 >> 30; r2 &= M30;
62 r4 += r3 >> 30; r3 &= M30;
63 r5 += r4 >> 30; r4 &= M30;
64 r6 += r5 >> 30; r5 &= M30;
65 r7 += r6 >> 30; r6 &= M30;
66 r8 += r7 >> 30; r7 &= M30;
68 /* In a second step add the modulus again if the result is still negative, bringing r to range
71 r0 += modinfo->modulus.v[0] & cond_add;
72 r1 += modinfo->modulus.v[1] & cond_add;
73 r2 += modinfo->modulus.v[2] & cond_add;
74 r3 += modinfo->modulus.v[3] & cond_add;
75 r4 += modinfo->modulus.v[4] & cond_add;
76 r5 += modinfo->modulus.v[5] & cond_add;
77 r6 += modinfo->modulus.v[6] & cond_add;
78 r7 += modinfo->modulus.v[7] & cond_add;
79 r8 += modinfo->modulus.v[8] & cond_add;
80 /* And propagate again. */
81 r1 += r0 >> 30; r0 &= M30;
82 r2 += r1 >> 30; r1 &= M30;
83 r3 += r2 >> 30; r2 &= M30;
84 r4 += r3 >> 30; r3 &= M30;
85 r5 += r4 >> 30; r4 &= M30;
86 r6 += r5 >> 30; r5 &= M30;
87 r7 += r6 >> 30; r6 &= M30;
88 r8 += r7 >> 30; r7 &= M30;
101 /* Data type for transition matrices (see section 3 of explanation).
108 } secp256k1_modinv32_trans2x2;
110 /* Compute the transition matrix and eta for 30 divsteps.
112 * Input: eta: initial eta
113 * f0: bottom limb of initial f
114 * g0: bottom limb of initial g
115 * Output: t: transition matrix
118 * Implements the divsteps_n_matrix function from the explanation.
120 static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) {
121 /* u,v,q,r are the elements of the transformation matrix being built up,
122 * starting with the identity matrix. Semantically they are signed integers
123 * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This
124 * permits left shifting (which is UB for negative numbers). The range
125 * being inside [-2^31,2^31) means that casting to signed works correctly.
127 uint32_t u = 1, v = 0, q = 0, r = 1;
128 uint32_t c1, c2, f = f0, g = g0, x, y, z;
131 for (i = 0; i < 30; ++i) {
132 VERIFY_CHECK((f & 1) == 1); /* f must always be odd */
133 VERIFY_CHECK((u * f0 + v * g0) == f << i);
134 VERIFY_CHECK((q * f0 + r * g0) == g << i);
135 /* Compute conditional masks for (eta < 0) and for (g & 1). */
138 /* Compute x,y,z, conditionally negated versions of f,u,v. */
142 /* Conditionally add x,y,z to g,q,r. */
146 /* In what follows, c1 is a condition mask for (eta < 0) and (g & 1). */
148 /* Conditionally negate eta, and unconditionally subtract 1. */
149 eta = (eta ^ c1) - (c1 + 1);
150 /* Conditionally add g,q,r to f,u,v. */
159 /* Return data in t and return value. */
167 /* Compute the transition matrix and eta for 30 divsteps (variable time).
169 * Input: eta: initial eta
170 * f0: bottom limb of initial f
171 * g0: bottom limb of initial g
172 * Output: t: transition matrix
175 * Implements the divsteps_n_matrix_var function from the explanation.
177 static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) {
178 /* inv256[i] = -(2*i+1)^-1 (mod 256) */
179 static const uint8_t inv256[128] = {
180 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59,
181 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31,
182 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89,
183 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61,
184 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9,
185 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91,
186 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9,
187 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1,
188 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19,
189 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1,
190 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01
193 /* Transformation matrix; see comments in secp256k1_modinv32_divsteps_30. */
194 uint32_t u = 1, v = 0, q = 0, r = 1;
195 uint32_t f = f0, g = g0, m;
197 int i = 30, limit, zeros;
200 /* Use a sentinel bit to count zeros only up to i. */
201 zeros = secp256k1_ctz32_var(g | (UINT32_MAX << i));
202 /* Perform zeros divsteps at once; they all just divide g by two. */
208 /* We're done once we've done 30 divsteps. */
210 VERIFY_CHECK((f & 1) == 1);
211 VERIFY_CHECK((g & 1) == 1);
212 VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i));
213 VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i));
214 /* If eta is negative, negate it and replace f,g with g,-f. */
218 tmp = f; f = g; g = -tmp;
219 tmp = u; u = q; q = -tmp;
220 tmp = v; v = r; r = -tmp;
222 /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more
223 * than i can be cancelled out (as we'd be done before that point), and no more than eta+1
224 * can be done as its sign will flip once that happens. */
225 limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
226 /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */
227 m = (UINT32_MAX >> (32 - limit)) & 255U;
228 /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */
229 w = (g * inv256[(f >> 1) & 127]) & m;
234 VERIFY_CHECK((g & m) == 0);
236 /* Return data in t and return value. */
244 /* Compute (t/2^30) * [d, e] mod modulus, where t is a transition matrix for 30 divsteps.
246 * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range
249 * This implements the update_de function from the explanation.
251 static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp256k1_modinv32_signed30 *e, const secp256k1_modinv32_trans2x2 *t, const secp256k1_modinv32_modinfo* modinfo) {
252 const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
253 const int32_t u = t->u, v = t->v, q = t->q, r = t->r;
254 int32_t di, ei, md, me, sd, se;
257 /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */
260 md = (u & sd) + (v & se);
261 me = (q & sd) + (r & se);
262 /* Begin computing t*[d,e]. */
265 cd = (int64_t)u * di + (int64_t)v * ei;
266 ce = (int64_t)q * di + (int64_t)r * ei;
267 /* Correct md,me so that t*[d,e]+modulus*[md,me] has 30 zero bottom bits. */
268 md -= (modinfo->modulus_inv30 * (uint32_t)cd + md) & M30;
269 me -= (modinfo->modulus_inv30 * (uint32_t)ce + me) & M30;
270 /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */
271 cd += (int64_t)modinfo->modulus.v[0] * md;
272 ce += (int64_t)modinfo->modulus.v[0] * me;
273 /* Verify that the low 30 bits of the computation are indeed zero, and then throw them away. */
274 VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30;
275 VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30;
276 /* Now iteratively compute limb i=1..8 of t*[d,e]+modulus*[md,me], and store them in output
277 * limb i-1 (shifting down by 30 bits). */
278 for (i = 1; i < 9; ++i) {
281 cd += (int64_t)u * di + (int64_t)v * ei;
282 ce += (int64_t)q * di + (int64_t)r * ei;
283 cd += (int64_t)modinfo->modulus.v[i] * md;
284 ce += (int64_t)modinfo->modulus.v[i] * me;
285 d->v[i - 1] = (int32_t)cd & M30; cd >>= 30;
286 e->v[i - 1] = (int32_t)ce & M30; ce >>= 30;
288 /* What remains is limb 9 of t*[d,e]+modulus*[md,me]; store it as output limb 8. */
289 d->v[8] = (int32_t)cd;
290 e->v[8] = (int32_t)ce;
293 /* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps.
295 * This implements the update_fg function from the explanation.
297 static void secp256k1_modinv32_update_fg_30(secp256k1_modinv32_signed30 *f, secp256k1_modinv32_signed30 *g, const secp256k1_modinv32_trans2x2 *t) {
298 const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
299 const int32_t u = t->u, v = t->v, q = t->q, r = t->r;
303 /* Start computing t*[f,g]. */
306 cf = (int64_t)u * fi + (int64_t)v * gi;
307 cg = (int64_t)q * fi + (int64_t)r * gi;
308 /* Verify that the bottom 30 bits of the result are zero, and then throw them away. */
309 VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30;
310 VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30;
311 /* Now iteratively compute limb i=1..8 of t*[f,g], and store them in output limb i-1 (shifting
312 * down by 30 bits). */
313 for (i = 1; i < 9; ++i) {
316 cf += (int64_t)u * fi + (int64_t)v * gi;
317 cg += (int64_t)q * fi + (int64_t)r * gi;
318 f->v[i - 1] = (int32_t)cf & M30; cf >>= 30;
319 g->v[i - 1] = (int32_t)cg & M30; cg >>= 30;
321 /* What remains is limb 9 of t*[f,g]; store it as output limb 8. */
322 f->v[8] = (int32_t)cf;
323 g->v[8] = (int32_t)cg;
326 /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */
327 static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) {
328 /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */
329 secp256k1_modinv32_signed30 d = {{0}};
330 secp256k1_modinv32_signed30 e = {{1}};
331 secp256k1_modinv32_signed30 f = modinfo->modulus;
332 secp256k1_modinv32_signed30 g = *x;
336 /* Do 25 iterations of 30 divsteps each = 750 divsteps. 724 suffices for 256-bit inputs. */
337 for (i = 0; i < 25; ++i) {
338 /* Compute transition matrix and new eta after 30 divsteps. */
339 secp256k1_modinv32_trans2x2 t;
340 eta = secp256k1_modinv32_divsteps_30(eta, f.v[0], g.v[0], &t);
341 /* Update d,e using that transition matrix. */
342 secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo);
343 /* Update f,g using that transition matrix. */
344 secp256k1_modinv32_update_fg_30(&f, &g, &t);
347 /* At this point sufficient iterations have been performed that g must have reached 0
348 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g
349 * values i.e. +/- 1, and d now contains +/- the modular inverse. */
350 VERIFY_CHECK((g.v[0] | g.v[1] | g.v[2] | g.v[3] | g.v[4] | g.v[5] | g.v[6] | g.v[7] | g.v[8]) == 0);
352 /* Optionally negate d, normalize to [0,modulus), and return it. */
353 secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo);
357 /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (variable time). */
358 static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) {
359 /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */
360 secp256k1_modinv32_signed30 d = {{0, 0, 0, 0, 0, 0, 0, 0, 0}};
361 secp256k1_modinv32_signed30 e = {{1, 0, 0, 0, 0, 0, 0, 0, 0}};
362 secp256k1_modinv32_signed30 f = modinfo->modulus;
363 secp256k1_modinv32_signed30 g = *x;
368 /* Do iterations of 30 divsteps each until g=0. */
370 /* Compute transition matrix and new eta after 30 divsteps. */
371 secp256k1_modinv32_trans2x2 t;
372 eta = secp256k1_modinv32_divsteps_30_var(eta, f.v[0], g.v[0], &t);
373 /* Update d,e using that transition matrix. */
374 secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo);
375 /* Update f,g using that transition matrix. */
376 secp256k1_modinv32_update_fg_30(&f, &g, &t);
377 /* If the bottom limb of g is 0, there is a chance g=0. */
380 /* Check if the other limbs are also 0. */
381 for (j = 1; j < 9; ++j) {
384 /* If so, we're done. */
385 if (cond == 0) break;
389 /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of
390 * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */
392 /* Optionally negate d, normalize to [0,modulus), and return it. */
393 secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo);
397 #endif /* SECP256K1_MODINV32_IMPL_H */