1 @ vim: set tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab syntax=armasm:
2 /**********************************************************************
3 * Copyright (c) 2014 Wladimir J. van der Laan *
4 * Distributed under the MIT software license, see the accompanying *
5 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
6 **********************************************************************/
/*
ARM implementation of field_10x26 inner loops.

Note:

- To avoid unnecessary loads and make use of available registers, two
  'passes' are always interleaved, with the odd passes accumulating c' and d',
  which are then added to c and d respectively in the even passes.

*/
20 @ eabi attributes - see readelf -A
21 .eabi_attribute 8, 1 @ Tag_ARM_ISA_use = yes
22 .eabi_attribute 9, 0 @ Tag_Thumb_ISA_use = no
23 .eabi_attribute 10, 0 @ Tag_FP_arch = none
24 .eabi_attribute 24, 1 @ Tag_ABI_align_needed = 8-byte
25 .eabi_attribute 25, 1 @ Tag_ABI_align_preserved = 8-byte, except leaf SP
26 .eabi_attribute 30, 2 @ Tag_ABI_optimization_goals = Agressive Speed
27 .eabi_attribute 34, 1 @ Tag_CPU_unaligned_access = v6
33 .set field_not_M, 0xfc000000 @ ~M = ~0x3ffffff
36 .global secp256k1_fe_mul_inner
37 .type secp256k1_fe_mul_inner, %function
39 @ r0 r Restrict: can overlap with a, not with b
42 @ Stack (total 4+10*4 = 44)
43 @ sp + #0 saved 'r' pointer
44 @ sp + #4 + 4*X t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
45 secp256k1_fe_mul_inner:
46 stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
47 sub sp, sp, #48 @ frame=44 + alignment
48 str r0, [sp, #0] @ save result address, we need it only at the end
50 /******************************************
51 * Main computation code.
52 ******************************************
63 Note: do not write to r[] here, it may overlap with a[]
66 /* A - interleaved with B */
67 ldr r7, [r1, #0*4] @ a[0]
68 ldr r8, [r2, #9*4] @ b[9]
69 ldr r0, [r1, #1*4] @ a[1]
70 umull r5, r6, r7, r8 @ d = a[0] * b[9]
71 ldr r14, [r2, #8*4] @ b[8]
72 umull r9, r10, r0, r8 @ d' = a[1] * b[9]
73 ldr r7, [r1, #2*4] @ a[2]
74 umlal r5, r6, r0, r14 @ d += a[1] * b[8]
75 ldr r8, [r2, #7*4] @ b[7]
76 umlal r9, r10, r7, r14 @ d' += a[2] * b[8]
77 ldr r0, [r1, #3*4] @ a[3]
78 umlal r5, r6, r7, r8 @ d += a[2] * b[7]
79 ldr r14, [r2, #6*4] @ b[6]
80 umlal r9, r10, r0, r8 @ d' += a[3] * b[7]
81 ldr r7, [r1, #4*4] @ a[4]
82 umlal r5, r6, r0, r14 @ d += a[3] * b[6]
83 ldr r8, [r2, #5*4] @ b[5]
84 umlal r9, r10, r7, r14 @ d' += a[4] * b[6]
85 ldr r0, [r1, #5*4] @ a[5]
86 umlal r5, r6, r7, r8 @ d += a[4] * b[5]
87 ldr r14, [r2, #4*4] @ b[4]
88 umlal r9, r10, r0, r8 @ d' += a[5] * b[5]
89 ldr r7, [r1, #6*4] @ a[6]
90 umlal r5, r6, r0, r14 @ d += a[5] * b[4]
91 ldr r8, [r2, #3*4] @ b[3]
92 umlal r9, r10, r7, r14 @ d' += a[6] * b[4]
93 ldr r0, [r1, #7*4] @ a[7]
94 umlal r5, r6, r7, r8 @ d += a[6] * b[3]
95 ldr r14, [r2, #2*4] @ b[2]
96 umlal r9, r10, r0, r8 @ d' += a[7] * b[3]
97 ldr r7, [r1, #8*4] @ a[8]
98 umlal r5, r6, r0, r14 @ d += a[7] * b[2]
99 ldr r8, [r2, #1*4] @ b[1]
100 umlal r9, r10, r7, r14 @ d' += a[8] * b[2]
101 ldr r0, [r1, #9*4] @ a[9]
102 umlal r5, r6, r7, r8 @ d += a[8] * b[1]
103 ldr r14, [r2, #0*4] @ b[0]
104 umlal r9, r10, r0, r8 @ d' += a[9] * b[1]
105 ldr r7, [r1, #0*4] @ a[0]
106 umlal r5, r6, r0, r14 @ d += a[9] * b[0]
109 bic r0, r5, field_not_M @ t9 = d & M
110 str r0, [sp, #4 + 4*9]
111 mov r5, r5, lsr #26 @ d >>= 26
112 orr r5, r5, r6, asl #6
116 umull r3, r4, r7, r14 @ c = a[0] * b[0]
117 adds r5, r5, r9 @ d += d'
120 bic r0, r5, field_not_M @ u0 = d & M
121 mov r5, r5, lsr #26 @ d >>= 26
122 orr r5, r5, r6, asl #6
124 movw r14, field_R0 @ c += u0 * R0
125 umlal r3, r4, r0, r14
127 bic r14, r3, field_not_M @ t0 = c & M
128 str r14, [sp, #4 + 0*4]
129 mov r3, r3, lsr #26 @ c >>= 26
130 orr r3, r3, r4, asl #6
132 mov r14, field_R1 @ c += u0 * R1
133 umlal r3, r4, r0, r14
135 /* C - interleaved with D */
136 ldr r7, [r1, #0*4] @ a[0]
137 ldr r8, [r2, #2*4] @ b[2]
138 ldr r14, [r2, #1*4] @ b[1]
139 umull r11, r12, r7, r8 @ c' = a[0] * b[2]
140 ldr r0, [r1, #1*4] @ a[1]
141 umlal r3, r4, r7, r14 @ c += a[0] * b[1]
142 ldr r8, [r2, #0*4] @ b[0]
143 umlal r11, r12, r0, r14 @ c' += a[1] * b[1]
144 ldr r7, [r1, #2*4] @ a[2]
145 umlal r3, r4, r0, r8 @ c += a[1] * b[0]
146 ldr r14, [r2, #9*4] @ b[9]
147 umlal r11, r12, r7, r8 @ c' += a[2] * b[0]
148 ldr r0, [r1, #3*4] @ a[3]
149 umlal r5, r6, r7, r14 @ d += a[2] * b[9]
150 ldr r8, [r2, #8*4] @ b[8]
151 umull r9, r10, r0, r14 @ d' = a[3] * b[9]
152 ldr r7, [r1, #4*4] @ a[4]
153 umlal r5, r6, r0, r8 @ d += a[3] * b[8]
154 ldr r14, [r2, #7*4] @ b[7]
155 umlal r9, r10, r7, r8 @ d' += a[4] * b[8]
156 ldr r0, [r1, #5*4] @ a[5]
157 umlal r5, r6, r7, r14 @ d += a[4] * b[7]
158 ldr r8, [r2, #6*4] @ b[6]
159 umlal r9, r10, r0, r14 @ d' += a[5] * b[7]
160 ldr r7, [r1, #6*4] @ a[6]
161 umlal r5, r6, r0, r8 @ d += a[5] * b[6]
162 ldr r14, [r2, #5*4] @ b[5]
163 umlal r9, r10, r7, r8 @ d' += a[6] * b[6]
164 ldr r0, [r1, #7*4] @ a[7]
165 umlal r5, r6, r7, r14 @ d += a[6] * b[5]
166 ldr r8, [r2, #4*4] @ b[4]
167 umlal r9, r10, r0, r14 @ d' += a[7] * b[5]
168 ldr r7, [r1, #8*4] @ a[8]
169 umlal r5, r6, r0, r8 @ d += a[7] * b[4]
170 ldr r14, [r2, #3*4] @ b[3]
171 umlal r9, r10, r7, r8 @ d' += a[8] * b[4]
172 ldr r0, [r1, #9*4] @ a[9]
173 umlal r5, r6, r7, r14 @ d += a[8] * b[3]
174 ldr r8, [r2, #2*4] @ b[2]
175 umlal r9, r10, r0, r14 @ d' += a[9] * b[3]
176 umlal r5, r6, r0, r8 @ d += a[9] * b[2]
178 bic r0, r5, field_not_M @ u1 = d & M
179 mov r5, r5, lsr #26 @ d >>= 26
180 orr r5, r5, r6, asl #6
182 movw r14, field_R0 @ c += u1 * R0
183 umlal r3, r4, r0, r14
185 bic r14, r3, field_not_M @ t1 = c & M
186 str r14, [sp, #4 + 1*4]
187 mov r3, r3, lsr #26 @ c >>= 26
188 orr r3, r3, r4, asl #6
190 mov r14, field_R1 @ c += u1 * R1
191 umlal r3, r4, r0, r14
194 adds r3, r3, r11 @ c += c'
196 adds r5, r5, r9 @ d += d'
199 bic r0, r5, field_not_M @ u2 = d & M
200 mov r5, r5, lsr #26 @ d >>= 26
201 orr r5, r5, r6, asl #6
203 movw r14, field_R0 @ c += u2 * R0
204 umlal r3, r4, r0, r14
206 bic r14, r3, field_not_M @ t2 = c & M
207 str r14, [sp, #4 + 2*4]
208 mov r3, r3, lsr #26 @ c >>= 26
209 orr r3, r3, r4, asl #6
211 mov r14, field_R1 @ c += u2 * R1
212 umlal r3, r4, r0, r14
214 /* E - interleaved with F */
215 ldr r7, [r1, #0*4] @ a[0]
216 ldr r8, [r2, #4*4] @ b[4]
217 umull r11, r12, r7, r8 @ c' = a[0] * b[4]
218 ldr r8, [r2, #3*4] @ b[3]
219 umlal r3, r4, r7, r8 @ c += a[0] * b[3]
220 ldr r7, [r1, #1*4] @ a[1]
221 umlal r11, r12, r7, r8 @ c' += a[1] * b[3]
222 ldr r8, [r2, #2*4] @ b[2]
223 umlal r3, r4, r7, r8 @ c += a[1] * b[2]
224 ldr r7, [r1, #2*4] @ a[2]
225 umlal r11, r12, r7, r8 @ c' += a[2] * b[2]
226 ldr r8, [r2, #1*4] @ b[1]
227 umlal r3, r4, r7, r8 @ c += a[2] * b[1]
228 ldr r7, [r1, #3*4] @ a[3]
229 umlal r11, r12, r7, r8 @ c' += a[3] * b[1]
230 ldr r8, [r2, #0*4] @ b[0]
231 umlal r3, r4, r7, r8 @ c += a[3] * b[0]
232 ldr r7, [r1, #4*4] @ a[4]
233 umlal r11, r12, r7, r8 @ c' += a[4] * b[0]
234 ldr r8, [r2, #9*4] @ b[9]
235 umlal r5, r6, r7, r8 @ d += a[4] * b[9]
236 ldr r7, [r1, #5*4] @ a[5]
237 umull r9, r10, r7, r8 @ d' = a[5] * b[9]
238 ldr r8, [r2, #8*4] @ b[8]
239 umlal r5, r6, r7, r8 @ d += a[5] * b[8]
240 ldr r7, [r1, #6*4] @ a[6]
241 umlal r9, r10, r7, r8 @ d' += a[6] * b[8]
242 ldr r8, [r2, #7*4] @ b[7]
243 umlal r5, r6, r7, r8 @ d += a[6] * b[7]
244 ldr r7, [r1, #7*4] @ a[7]
245 umlal r9, r10, r7, r8 @ d' += a[7] * b[7]
246 ldr r8, [r2, #6*4] @ b[6]
247 umlal r5, r6, r7, r8 @ d += a[7] * b[6]
248 ldr r7, [r1, #8*4] @ a[8]
249 umlal r9, r10, r7, r8 @ d' += a[8] * b[6]
250 ldr r8, [r2, #5*4] @ b[5]
251 umlal r5, r6, r7, r8 @ d += a[8] * b[5]
252 ldr r7, [r1, #9*4] @ a[9]
253 umlal r9, r10, r7, r8 @ d' += a[9] * b[5]
254 ldr r8, [r2, #4*4] @ b[4]
255 umlal r5, r6, r7, r8 @ d += a[9] * b[4]
257 bic r0, r5, field_not_M @ u3 = d & M
258 mov r5, r5, lsr #26 @ d >>= 26
259 orr r5, r5, r6, asl #6
261 movw r14, field_R0 @ c += u3 * R0
262 umlal r3, r4, r0, r14
264 bic r14, r3, field_not_M @ t3 = c & M
265 str r14, [sp, #4 + 3*4]
266 mov r3, r3, lsr #26 @ c >>= 26
267 orr r3, r3, r4, asl #6
269 mov r14, field_R1 @ c += u3 * R1
270 umlal r3, r4, r0, r14
273 adds r3, r3, r11 @ c += c'
275 adds r5, r5, r9 @ d += d'
278 bic r0, r5, field_not_M @ u4 = d & M
279 mov r5, r5, lsr #26 @ d >>= 26
280 orr r5, r5, r6, asl #6
282 movw r14, field_R0 @ c += u4 * R0
283 umlal r3, r4, r0, r14
285 bic r14, r3, field_not_M @ t4 = c & M
286 str r14, [sp, #4 + 4*4]
287 mov r3, r3, lsr #26 @ c >>= 26
288 orr r3, r3, r4, asl #6
290 mov r14, field_R1 @ c += u4 * R1
291 umlal r3, r4, r0, r14
293 /* G - interleaved with H */
294 ldr r7, [r1, #0*4] @ a[0]
295 ldr r8, [r2, #6*4] @ b[6]
296 ldr r14, [r2, #5*4] @ b[5]
297 umull r11, r12, r7, r8 @ c' = a[0] * b[6]
298 ldr r0, [r1, #1*4] @ a[1]
299 umlal r3, r4, r7, r14 @ c += a[0] * b[5]
300 ldr r8, [r2, #4*4] @ b[4]
301 umlal r11, r12, r0, r14 @ c' += a[1] * b[5]
302 ldr r7, [r1, #2*4] @ a[2]
303 umlal r3, r4, r0, r8 @ c += a[1] * b[4]
304 ldr r14, [r2, #3*4] @ b[3]
305 umlal r11, r12, r7, r8 @ c' += a[2] * b[4]
306 ldr r0, [r1, #3*4] @ a[3]
307 umlal r3, r4, r7, r14 @ c += a[2] * b[3]
308 ldr r8, [r2, #2*4] @ b[2]
309 umlal r11, r12, r0, r14 @ c' += a[3] * b[3]
310 ldr r7, [r1, #4*4] @ a[4]
311 umlal r3, r4, r0, r8 @ c += a[3] * b[2]
312 ldr r14, [r2, #1*4] @ b[1]
313 umlal r11, r12, r7, r8 @ c' += a[4] * b[2]
314 ldr r0, [r1, #5*4] @ a[5]
315 umlal r3, r4, r7, r14 @ c += a[4] * b[1]
316 ldr r8, [r2, #0*4] @ b[0]
317 umlal r11, r12, r0, r14 @ c' += a[5] * b[1]
318 ldr r7, [r1, #6*4] @ a[6]
319 umlal r3, r4, r0, r8 @ c += a[5] * b[0]
320 ldr r14, [r2, #9*4] @ b[9]
321 umlal r11, r12, r7, r8 @ c' += a[6] * b[0]
322 ldr r0, [r1, #7*4] @ a[7]
323 umlal r5, r6, r7, r14 @ d += a[6] * b[9]
324 ldr r8, [r2, #8*4] @ b[8]
325 umull r9, r10, r0, r14 @ d' = a[7] * b[9]
326 ldr r7, [r1, #8*4] @ a[8]
327 umlal r5, r6, r0, r8 @ d += a[7] * b[8]
328 ldr r14, [r2, #7*4] @ b[7]
329 umlal r9, r10, r7, r8 @ d' += a[8] * b[8]
330 ldr r0, [r1, #9*4] @ a[9]
331 umlal r5, r6, r7, r14 @ d += a[8] * b[7]
332 ldr r8, [r2, #6*4] @ b[6]
333 umlal r9, r10, r0, r14 @ d' += a[9] * b[7]
334 umlal r5, r6, r0, r8 @ d += a[9] * b[6]
336 bic r0, r5, field_not_M @ u5 = d & M
337 mov r5, r5, lsr #26 @ d >>= 26
338 orr r5, r5, r6, asl #6
340 movw r14, field_R0 @ c += u5 * R0
341 umlal r3, r4, r0, r14
343 bic r14, r3, field_not_M @ t5 = c & M
344 str r14, [sp, #4 + 5*4]
345 mov r3, r3, lsr #26 @ c >>= 26
346 orr r3, r3, r4, asl #6
348 mov r14, field_R1 @ c += u5 * R1
349 umlal r3, r4, r0, r14
352 adds r3, r3, r11 @ c += c'
354 adds r5, r5, r9 @ d += d'
357 bic r0, r5, field_not_M @ u6 = d & M
358 mov r5, r5, lsr #26 @ d >>= 26
359 orr r5, r5, r6, asl #6
361 movw r14, field_R0 @ c += u6 * R0
362 umlal r3, r4, r0, r14
364 bic r14, r3, field_not_M @ t6 = c & M
365 str r14, [sp, #4 + 6*4]
366 mov r3, r3, lsr #26 @ c >>= 26
367 orr r3, r3, r4, asl #6
369 mov r14, field_R1 @ c += u6 * R1
370 umlal r3, r4, r0, r14
372 /* I - interleaved with J */
373 ldr r8, [r2, #8*4] @ b[8]
374 ldr r7, [r1, #0*4] @ a[0]
375 ldr r14, [r2, #7*4] @ b[7]
376 umull r11, r12, r7, r8 @ c' = a[0] * b[8]
377 ldr r0, [r1, #1*4] @ a[1]
378 umlal r3, r4, r7, r14 @ c += a[0] * b[7]
379 ldr r8, [r2, #6*4] @ b[6]
380 umlal r11, r12, r0, r14 @ c' += a[1] * b[7]
381 ldr r7, [r1, #2*4] @ a[2]
382 umlal r3, r4, r0, r8 @ c += a[1] * b[6]
383 ldr r14, [r2, #5*4] @ b[5]
384 umlal r11, r12, r7, r8 @ c' += a[2] * b[6]
385 ldr r0, [r1, #3*4] @ a[3]
386 umlal r3, r4, r7, r14 @ c += a[2] * b[5]
387 ldr r8, [r2, #4*4] @ b[4]
388 umlal r11, r12, r0, r14 @ c' += a[3] * b[5]
389 ldr r7, [r1, #4*4] @ a[4]
390 umlal r3, r4, r0, r8 @ c += a[3] * b[4]
391 ldr r14, [r2, #3*4] @ b[3]
392 umlal r11, r12, r7, r8 @ c' += a[4] * b[4]
393 ldr r0, [r1, #5*4] @ a[5]
394 umlal r3, r4, r7, r14 @ c += a[4] * b[3]
395 ldr r8, [r2, #2*4] @ b[2]
396 umlal r11, r12, r0, r14 @ c' += a[5] * b[3]
397 ldr r7, [r1, #6*4] @ a[6]
398 umlal r3, r4, r0, r8 @ c += a[5] * b[2]
399 ldr r14, [r2, #1*4] @ b[1]
400 umlal r11, r12, r7, r8 @ c' += a[6] * b[2]
401 ldr r0, [r1, #7*4] @ a[7]
402 umlal r3, r4, r7, r14 @ c += a[6] * b[1]
403 ldr r8, [r2, #0*4] @ b[0]
404 umlal r11, r12, r0, r14 @ c' += a[7] * b[1]
405 ldr r7, [r1, #8*4] @ a[8]
406 umlal r3, r4, r0, r8 @ c += a[7] * b[0]
407 ldr r14, [r2, #9*4] @ b[9]
408 umlal r11, r12, r7, r8 @ c' += a[8] * b[0]
409 ldr r0, [r1, #9*4] @ a[9]
410 umlal r5, r6, r7, r14 @ d += a[8] * b[9]
411 ldr r8, [r2, #8*4] @ b[8]
412 umull r9, r10, r0, r14 @ d' = a[9] * b[9]
413 umlal r5, r6, r0, r8 @ d += a[9] * b[8]
415 bic r0, r5, field_not_M @ u7 = d & M
416 mov r5, r5, lsr #26 @ d >>= 26
417 orr r5, r5, r6, asl #6
419 movw r14, field_R0 @ c += u7 * R0
420 umlal r3, r4, r0, r14
422 bic r14, r3, field_not_M @ t7 = c & M
423 str r14, [sp, #4 + 7*4]
424 mov r3, r3, lsr #26 @ c >>= 26
425 orr r3, r3, r4, asl #6
427 mov r14, field_R1 @ c += u7 * R1
428 umlal r3, r4, r0, r14
431 adds r3, r3, r11 @ c += c'
433 adds r5, r5, r9 @ d += d'
436 bic r0, r5, field_not_M @ u8 = d & M
437 str r0, [sp, #4 + 8*4]
438 mov r5, r5, lsr #26 @ d >>= 26
439 orr r5, r5, r6, asl #6
441 movw r14, field_R0 @ c += u8 * R0
442 umlal r3, r4, r0, r14
444 /******************************************
445 * compute and write back result
446 ******************************************
456 r1,r2,r10,r14 scratch
458 Note: do not read from a[] after here, it may overlap with r[]
461 add r1, sp, #4 + 3*4 @ r[3..7] = t3..7, r11=u8, r12=t9
462 ldmia r1, {r2,r7,r8,r9,r10,r11,r12}
464 stmia r1, {r2,r7,r8,r9,r10}
466 bic r2, r3, field_not_M @ r[8] = c & M
468 mov r3, r3, lsr #26 @ c >>= 26
469 orr r3, r3, r4, asl #6
471 mov r14, field_R1 @ c += u8 * R1
472 umlal r3, r4, r11, r14
473 movw r14, field_R0 @ c += d * R0
474 umlal r3, r4, r5, r14
475 adds r3, r3, r12 @ c += t9
478 add r1, sp, #4 + 0*4 @ r7,r8,r9 = t0,t1,t2
481 ubfx r2, r3, #0, #22 @ r[9] = c & (M >> 4)
483 mov r3, r3, lsr #22 @ c >>= 22
484 orr r3, r3, r4, asl #10
486 movw r14, field_R1 << 4 @ c += d * (R1 << 4)
487 umlal r3, r4, r5, r14
489 movw r14, field_R0 >> 4 @ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
490 umull r5, r6, r3, r14 @ d = c.lo * (R0 >> 4)
491 adds r5, r5, r7 @ d.lo += t0
492 mla r6, r14, r4, r6 @ d.hi += c.hi * (R0 >> 4)
493 adc r6, r6, 0 @ d.hi += carry
495 bic r2, r5, field_not_M @ r[0] = d & M
498 mov r5, r5, lsr #26 @ d >>= 26
499 orr r5, r5, r6, asl #6
502 movw r14, field_R1 >> 4 @ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
503 umull r1, r2, r3, r14 @ tmp = c.lo * (R1 >> 4)
504 adds r5, r5, r8 @ d.lo += t1
505 adc r6, r6, #0 @ d.hi += carry
506 adds r5, r5, r1 @ d.lo += tmp.lo
507 mla r2, r14, r4, r2 @ tmp.hi += c.hi * (R1 >> 4)
508 adc r6, r6, r2 @ d.hi += carry + tmp.hi
510 bic r2, r5, field_not_M @ r[1] = d & M
512 mov r5, r5, lsr #26 @ d >>= 26 (ignore hi)
513 orr r5, r5, r6, asl #6
515 add r5, r5, r9 @ d += t2
516 str r5, [r0, #2*4] @ r[2] = d
519 ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
520 .size secp256k1_fe_mul_inner, .-secp256k1_fe_mul_inner
523 .global secp256k1_fe_sqr_inner
524 .type secp256k1_fe_sqr_inner, %function
526 @ r0 r Can overlap with a
528 @ Stack (total 4+10*4 = 44)
529 @ sp + #0 saved 'r' pointer
530 @ sp + #4 + 4*X t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
531 secp256k1_fe_sqr_inner:
532 stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
533 sub sp, sp, #48 @ frame=44 + alignment
534 str r0, [sp, #0] @ save result address, we need it only at the end
535 /******************************************
536 * Main computation code.
537 ******************************************
540 r0,r14,r2,r7,r8 scratch
547 Note: do not write to r[] here, it may overlap with a[]
549 /* A interleaved with B */
550 ldr r0, [r1, #1*4] @ a[1]*2
551 ldr r7, [r1, #0*4] @ a[0]
553 ldr r14, [r1, #9*4] @ a[9]
554 umull r3, r4, r7, r7 @ c = a[0] * a[0]
555 ldr r8, [r1, #8*4] @ a[8]
557 umull r5, r6, r7, r14 @ d = a[0]*2 * a[9]
558 ldr r7, [r1, #2*4] @ a[2]*2
559 umull r9, r10, r0, r14 @ d' = a[1]*2 * a[9]
560 ldr r14, [r1, #7*4] @ a[7]
561 umlal r5, r6, r0, r8 @ d += a[1]*2 * a[8]
563 ldr r0, [r1, #3*4] @ a[3]*2
564 umlal r9, r10, r7, r8 @ d' += a[2]*2 * a[8]
565 ldr r8, [r1, #6*4] @ a[6]
566 umlal r5, r6, r7, r14 @ d += a[2]*2 * a[7]
568 ldr r7, [r1, #4*4] @ a[4]*2
569 umlal r9, r10, r0, r14 @ d' += a[3]*2 * a[7]
570 ldr r14, [r1, #5*4] @ a[5]
572 umlal r5, r6, r0, r8 @ d += a[3]*2 * a[6]
573 umlal r9, r10, r7, r8 @ d' += a[4]*2 * a[6]
574 umlal r5, r6, r7, r14 @ d += a[4]*2 * a[5]
575 umlal r9, r10, r14, r14 @ d' += a[5] * a[5]
577 bic r0, r5, field_not_M @ t9 = d & M
578 str r0, [sp, #4 + 9*4]
579 mov r5, r5, lsr #26 @ d >>= 26
580 orr r5, r5, r6, asl #6
584 adds r5, r5, r9 @ d += d'
587 bic r0, r5, field_not_M @ u0 = d & M
588 mov r5, r5, lsr #26 @ d >>= 26
589 orr r5, r5, r6, asl #6
591 movw r14, field_R0 @ c += u0 * R0
592 umlal r3, r4, r0, r14
593 bic r14, r3, field_not_M @ t0 = c & M
594 str r14, [sp, #4 + 0*4]
595 mov r3, r3, lsr #26 @ c >>= 26
596 orr r3, r3, r4, asl #6
598 mov r14, field_R1 @ c += u0 * R1
599 umlal r3, r4, r0, r14
601 /* C interleaved with D */
602 ldr r0, [r1, #0*4] @ a[0]*2
603 ldr r14, [r1, #1*4] @ a[1]
605 ldr r8, [r1, #2*4] @ a[2]
606 umlal r3, r4, r0, r14 @ c += a[0]*2 * a[1]
607 mov r7, r8, asl #1 @ a[2]*2
608 umull r11, r12, r14, r14 @ c' = a[1] * a[1]
609 ldr r14, [r1, #9*4] @ a[9]
610 umlal r11, r12, r0, r8 @ c' += a[0]*2 * a[2]
611 ldr r0, [r1, #3*4] @ a[3]*2
612 ldr r8, [r1, #8*4] @ a[8]
613 umlal r5, r6, r7, r14 @ d += a[2]*2 * a[9]
615 ldr r7, [r1, #4*4] @ a[4]*2
616 umull r9, r10, r0, r14 @ d' = a[3]*2 * a[9]
617 ldr r14, [r1, #7*4] @ a[7]
618 umlal r5, r6, r0, r8 @ d += a[3]*2 * a[8]
620 ldr r0, [r1, #5*4] @ a[5]*2
621 umlal r9, r10, r7, r8 @ d' += a[4]*2 * a[8]
622 ldr r8, [r1, #6*4] @ a[6]
624 umlal r5, r6, r7, r14 @ d += a[4]*2 * a[7]
625 umlal r9, r10, r0, r14 @ d' += a[5]*2 * a[7]
626 umlal r5, r6, r0, r8 @ d += a[5]*2 * a[6]
627 umlal r9, r10, r8, r8 @ d' += a[6] * a[6]
629 bic r0, r5, field_not_M @ u1 = d & M
630 mov r5, r5, lsr #26 @ d >>= 26
631 orr r5, r5, r6, asl #6
633 movw r14, field_R0 @ c += u1 * R0
634 umlal r3, r4, r0, r14
635 bic r14, r3, field_not_M @ t1 = c & M
636 str r14, [sp, #4 + 1*4]
637 mov r3, r3, lsr #26 @ c >>= 26
638 orr r3, r3, r4, asl #6
640 mov r14, field_R1 @ c += u1 * R1
641 umlal r3, r4, r0, r14
644 adds r3, r3, r11 @ c += c'
646 adds r5, r5, r9 @ d += d'
649 bic r0, r5, field_not_M @ u2 = d & M
650 mov r5, r5, lsr #26 @ d >>= 26
651 orr r5, r5, r6, asl #6
653 movw r14, field_R0 @ c += u2 * R0
654 umlal r3, r4, r0, r14
655 bic r14, r3, field_not_M @ t2 = c & M
656 str r14, [sp, #4 + 2*4]
657 mov r3, r3, lsr #26 @ c >>= 26
658 orr r3, r3, r4, asl #6
660 mov r14, field_R1 @ c += u2 * R1
661 umlal r3, r4, r0, r14
663 /* E interleaved with F */
664 ldr r7, [r1, #0*4] @ a[0]*2
665 ldr r0, [r1, #1*4] @ a[1]*2
666 ldr r14, [r1, #2*4] @ a[2]
668 ldr r8, [r1, #3*4] @ a[3]
670 umlal r3, r4, r7, r8 @ c += a[0]*2 * a[3]
672 umull r11, r12, r7, r2 @ c' = a[0]*2 * a[4]
673 mov r2, r2, asl #1 @ a[4]*2
674 umlal r11, r12, r0, r8 @ c' += a[1]*2 * a[3]
675 ldr r8, [r1, #9*4] @ a[9]
676 umlal r3, r4, r0, r14 @ c += a[1]*2 * a[2]
677 ldr r0, [r1, #5*4] @ a[5]*2
678 umlal r11, r12, r14, r14 @ c' += a[2] * a[2]
679 ldr r14, [r1, #8*4] @ a[8]
681 umlal r5, r6, r2, r8 @ d += a[4]*2 * a[9]
682 ldr r7, [r1, #6*4] @ a[6]*2
683 umull r9, r10, r0, r8 @ d' = a[5]*2 * a[9]
685 ldr r8, [r1, #7*4] @ a[7]
686 umlal r5, r6, r0, r14 @ d += a[5]*2 * a[8]
687 umlal r9, r10, r7, r14 @ d' += a[6]*2 * a[8]
688 umlal r5, r6, r7, r8 @ d += a[6]*2 * a[7]
689 umlal r9, r10, r8, r8 @ d' += a[7] * a[7]
691 bic r0, r5, field_not_M @ u3 = d & M
692 mov r5, r5, lsr #26 @ d >>= 26
693 orr r5, r5, r6, asl #6
695 movw r14, field_R0 @ c += u3 * R0
696 umlal r3, r4, r0, r14
697 bic r14, r3, field_not_M @ t3 = c & M
698 str r14, [sp, #4 + 3*4]
699 mov r3, r3, lsr #26 @ c >>= 26
700 orr r3, r3, r4, asl #6
702 mov r14, field_R1 @ c += u3 * R1
703 umlal r3, r4, r0, r14
706 adds r3, r3, r11 @ c += c'
708 adds r5, r5, r9 @ d += d'
711 bic r0, r5, field_not_M @ u4 = d & M
712 mov r5, r5, lsr #26 @ d >>= 26
713 orr r5, r5, r6, asl #6
715 movw r14, field_R0 @ c += u4 * R0
716 umlal r3, r4, r0, r14
717 bic r14, r3, field_not_M @ t4 = c & M
718 str r14, [sp, #4 + 4*4]
719 mov r3, r3, lsr #26 @ c >>= 26
720 orr r3, r3, r4, asl #6
722 mov r14, field_R1 @ c += u4 * R1
723 umlal r3, r4, r0, r14
725 /* G interleaved with H */
726 ldr r7, [r1, #0*4] @ a[0]*2
727 ldr r0, [r1, #1*4] @ a[1]*2
729 ldr r8, [r1, #5*4] @ a[5]
730 ldr r2, [r1, #6*4] @ a[6]
731 umlal r3, r4, r7, r8 @ c += a[0]*2 * a[5]
732 ldr r14, [r1, #4*4] @ a[4]
734 umull r11, r12, r7, r2 @ c' = a[0]*2 * a[6]
735 ldr r7, [r1, #2*4] @ a[2]*2
736 umlal r11, r12, r0, r8 @ c' += a[1]*2 * a[5]
738 ldr r8, [r1, #3*4] @ a[3]
739 umlal r3, r4, r0, r14 @ c += a[1]*2 * a[4]
740 mov r0, r2, asl #1 @ a[6]*2
741 umlal r11, r12, r7, r14 @ c' += a[2]*2 * a[4]
742 ldr r14, [r1, #9*4] @ a[9]
743 umlal r3, r4, r7, r8 @ c += a[2]*2 * a[3]
744 ldr r7, [r1, #7*4] @ a[7]*2
745 umlal r11, r12, r8, r8 @ c' += a[3] * a[3]
747 ldr r8, [r1, #8*4] @ a[8]
748 umlal r5, r6, r0, r14 @ d += a[6]*2 * a[9]
749 umull r9, r10, r7, r14 @ d' = a[7]*2 * a[9]
750 umlal r5, r6, r7, r8 @ d += a[7]*2 * a[8]
751 umlal r9, r10, r8, r8 @ d' += a[8] * a[8]
753 bic r0, r5, field_not_M @ u5 = d & M
754 mov r5, r5, lsr #26 @ d >>= 26
755 orr r5, r5, r6, asl #6
757 movw r14, field_R0 @ c += u5 * R0
758 umlal r3, r4, r0, r14
759 bic r14, r3, field_not_M @ t5 = c & M
760 str r14, [sp, #4 + 5*4]
761 mov r3, r3, lsr #26 @ c >>= 26
762 orr r3, r3, r4, asl #6
764 mov r14, field_R1 @ c += u5 * R1
765 umlal r3, r4, r0, r14
768 adds r3, r3, r11 @ c += c'
770 adds r5, r5, r9 @ d += d'
773 bic r0, r5, field_not_M @ u6 = d & M
774 mov r5, r5, lsr #26 @ d >>= 26
775 orr r5, r5, r6, asl #6
777 movw r14, field_R0 @ c += u6 * R0
778 umlal r3, r4, r0, r14
779 bic r14, r3, field_not_M @ t6 = c & M
780 str r14, [sp, #4 + 6*4]
781 mov r3, r3, lsr #26 @ c >>= 26
782 orr r3, r3, r4, asl #6
784 mov r14, field_R1 @ c += u6 * R1
785 umlal r3, r4, r0, r14
787 /* I interleaved with J */
788 ldr r7, [r1, #0*4] @ a[0]*2
789 ldr r0, [r1, #1*4] @ a[1]*2
791 ldr r8, [r1, #7*4] @ a[7]
792 ldr r2, [r1, #8*4] @ a[8]
793 umlal r3, r4, r7, r8 @ c += a[0]*2 * a[7]
794 ldr r14, [r1, #6*4] @ a[6]
796 umull r11, r12, r7, r2 @ c' = a[0]*2 * a[8]
797 ldr r7, [r1, #2*4] @ a[2]*2
798 umlal r11, r12, r0, r8 @ c' += a[1]*2 * a[7]
799 ldr r8, [r1, #5*4] @ a[5]
800 umlal r3, r4, r0, r14 @ c += a[1]*2 * a[6]
801 ldr r0, [r1, #3*4] @ a[3]*2
803 umlal r11, r12, r7, r14 @ c' += a[2]*2 * a[6]
804 ldr r14, [r1, #4*4] @ a[4]
806 umlal r3, r4, r7, r8 @ c += a[2]*2 * a[5]
807 mov r2, r2, asl #1 @ a[8]*2
808 umlal r11, r12, r0, r8 @ c' += a[3]*2 * a[5]
809 umlal r3, r4, r0, r14 @ c += a[3]*2 * a[4]
810 umlal r11, r12, r14, r14 @ c' += a[4] * a[4]
811 ldr r8, [r1, #9*4] @ a[9]
812 umlal r5, r6, r2, r8 @ d += a[8]*2 * a[9]
813 @ r8 will be used in J
815 bic r0, r5, field_not_M @ u7 = d & M
816 mov r5, r5, lsr #26 @ d >>= 26
817 orr r5, r5, r6, asl #6
819 movw r14, field_R0 @ c += u7 * R0
820 umlal r3, r4, r0, r14
821 bic r14, r3, field_not_M @ t7 = c & M
822 str r14, [sp, #4 + 7*4]
823 mov r3, r3, lsr #26 @ c >>= 26
824 orr r3, r3, r4, asl #6
826 mov r14, field_R1 @ c += u7 * R1
827 umlal r3, r4, r0, r14
830 adds r3, r3, r11 @ c += c'
832 umlal r5, r6, r8, r8 @ d += a[9] * a[9]
834 bic r0, r5, field_not_M @ u8 = d & M
835 str r0, [sp, #4 + 8*4]
836 mov r5, r5, lsr #26 @ d >>= 26
837 orr r5, r5, r6, asl #6
839 movw r14, field_R0 @ c += u8 * R0
840 umlal r3, r4, r0, r14
842 /******************************************
843 * compute and write back result
844 ******************************************
854 r1,r2,r10,r14 scratch
856 Note: do not read from a[] after here, it may overlap with r[]
859 add r1, sp, #4 + 3*4 @ r[3..7] = t3..7, r11=u8, r12=t9
860 ldmia r1, {r2,r7,r8,r9,r10,r11,r12}
862 stmia r1, {r2,r7,r8,r9,r10}
864 bic r2, r3, field_not_M @ r[8] = c & M
866 mov r3, r3, lsr #26 @ c >>= 26
867 orr r3, r3, r4, asl #6
869 mov r14, field_R1 @ c += u8 * R1
870 umlal r3, r4, r11, r14
871 movw r14, field_R0 @ c += d * R0
872 umlal r3, r4, r5, r14
873 adds r3, r3, r12 @ c += t9
876 add r1, sp, #4 + 0*4 @ r7,r8,r9 = t0,t1,t2
879 ubfx r2, r3, #0, #22 @ r[9] = c & (M >> 4)
881 mov r3, r3, lsr #22 @ c >>= 22
882 orr r3, r3, r4, asl #10
884 movw r14, field_R1 << 4 @ c += d * (R1 << 4)
885 umlal r3, r4, r5, r14
887 movw r14, field_R0 >> 4 @ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
888 umull r5, r6, r3, r14 @ d = c.lo * (R0 >> 4)
889 adds r5, r5, r7 @ d.lo += t0
890 mla r6, r14, r4, r6 @ d.hi += c.hi * (R0 >> 4)
891 adc r6, r6, 0 @ d.hi += carry
893 bic r2, r5, field_not_M @ r[0] = d & M
896 mov r5, r5, lsr #26 @ d >>= 26
897 orr r5, r5, r6, asl #6
900 movw r14, field_R1 >> 4 @ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
901 umull r1, r2, r3, r14 @ tmp = c.lo * (R1 >> 4)
902 adds r5, r5, r8 @ d.lo += t1
903 adc r6, r6, #0 @ d.hi += carry
904 adds r5, r5, r1 @ d.lo += tmp.lo
905 mla r2, r14, r4, r2 @ tmp.hi += c.hi * (R1 >> 4)
906 adc r6, r6, r2 @ d.hi += carry + tmp.hi
908 bic r2, r5, field_not_M @ r[1] = d & M
910 mov r5, r5, lsr #26 @ d >>= 26 (ignore hi)
911 orr r5, r5, r6, asl #6
913 add r5, r5, r9 @ d += t2
914 str r5, [r0, #2*4] @ r[2] = d
917 ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
918 .size secp256k1_fe_sqr_inner, .-secp256k1_fe_sqr_inner