]> Git Repo - secp256k1.git/blame - lin64.asm
Comments
[secp256k1.git] / lin64.asm
CommitLineData
d9137bb2
DH
1 ;; Added by Diederik Huys, March 2013
2 ;;
3 ;; Provided public procedures:
4 ;; ExSetMult
5 ;; ExSetSquare
6 ;;
7 ;; Needed tools: JWASM (http://www.japheth.de/JWasm.html)
8 ;;
9 ;; !!! WARNING !!! !!! WARNING !!! !!! WARNING !!!
10 ;;
11 ;; Please note that recompiling this binary (jwasm) under a 64-bit OS
12 ;; may yield unexpected results and create a corrupted ELF64 header.
13 ;;
14 ;;
15
16 .x64 ; select the 64-bit instruction set (JWasm/MASM-style CPU directive)
37c4ab15
DH
17QTEST EQU 1 ; NOTE(review): QTEST is not referenced anywhere in the visible source - confirm it is still needed
18 .code ; everything below goes into the code segment
d9137bb2 19
37c4ab15
DH
;; ExSetMult: multiplies the five-limb operands a.n and b.n and stores a
;; five-limb result through the third pointer argument.
;; Phase 1 (up to common_exit_norm) builds the ten column sums t0..t9 of the
;; schoolbook product; each column is split with the 52-bit mask
;; 0FFFFFFFFFFFFFh and the rest is carried into the next column by a 52-bit shift.
;; Phase 2 (common_exit_norm, shared with ExSetSquare) folds t5..t9 back onto
;; t0..t4 with the multipliers 01000003D10h / 01000003D1h and writes the result.
;; NOTE(review): presumably this is reduction modulo the secp256k1 field prime
;; with 5x52-bit limbs - confirm against the C reference implementation.
;; The argument registers (rdi, rsi, rdx) and the USES list match the System V
;; AMD64 convention - presumably the only supported ABI; confirm with callers.
20 ;; Register Layout:
21 ;; INPUT: rdi = a.n
22 ;; rsi = b.n
23 ;; rdx = this.n (result pointer; pushed on entry, popped into rbx later)
d9137bb2 24 ;;
37c4ab15 25 ;; INTERNAL: rdx:rax = multiplication accumulator
d9137bb2
DH
26 ;; r9:r8 = c (running column sum / carry)
27 ;; r10-r13 = t0-t3
28 ;; r14 = b.n[0] / t4
29 ;; r15 = b.n[1] / t5
30 ;; rbx = b.n[2] / t6 (and the result pointer after the pop)
31 ;; rcx = b.n[3] / t7
32 ;; rbp = 52-bit mask 0FFFFFFFFFFFFFh / t8
33 ;; rsi = b.n / b.n[4] / t9
37c4ab15
DH
34ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
35 push rdx ; park the result pointer; common_exit_norm pops it into rbx
d9137bb2
DH
36 mov r14,[rsi+8*0] ; r14 = b.n[0]; it stays there until the last
37 ; product that uses b.n[0], after which r14 is
38 ; reused as t4
37c4ab15 39 ;; c=a.n[0] * b.n[0]
d9137bb2 40 mov rax,[rdi+0*8] ; load a.n[0]
37c4ab15 41 mov rbp,0FFFFFFFFFFFFFh ; rbp = 52-bit limb mask
d9137bb2 42 mul r14 ; rdx:rax=a.n[0]*b.n[0]
37c4ab15 43 mov r15,[rsi+1*8] ; preload b.n[1]
d9137bb2 44 mov r10,rbp ; copy the 52-bit mask into the t0 register
37c4ab15 45 mov r8,rax
d9137bb2 46 and r10,rax ; t0 = low 52 bits of the product
37c4ab15 47 shrd r8,rdx,52 ; c = product >> 52 (low qword)
d9137bb2 48 xor r9,r9 ; high qword of c = 0; assumes product >> 52 fits in 64 bits - TODO confirm limb bounds
37c4ab15
DH
49
50 ;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
51 mov rax,[rdi+0*8]
d9137bb2 52 mul r15
37c4ab15
DH
53 add r8,rax ; 128-bit accumulate: low qword ...
54 adc r9,rdx ; ... high qword with carry
55
56 mov rax,[rdi+1*8]
d9137bb2 57 mul r14
37c4ab15
DH
58 mov r11,rbp ; mask for t1
59 mov rbx,[rsi+2*8] ; preload b.n[2]
60 add r8,rax
61 adc r9,rdx
62 and r11,r8 ; t1 = c & mask
63 shrd r8,r9,52 ; c >>= 52 (low qword)
64 xor r9,r9 ; drop what is left in r9; assumes r9 < 2^52 here - TODO confirm
65
66 ;; c+=a.n[0 1 2] * b.n[2 1 0]
67 mov rax,[rdi+0*8]
d9137bb2 68 mul rbx
37c4ab15
DH
69 add r8,rax
70 adc r9,rdx
71
72 mov rax,[rdi+1*8]
d9137bb2 73 mul r15
37c4ab15
DH
74 add r8,rax
75 adc r9,rdx
76
77 mov rax,[rdi+2*8]
78 mul r14
d9137bb2 79 mov r12,rbp ; mask for t2
37c4ab15
DH
80 mov rcx,[rsi+3*8] ; preload b.n[3]
81 add r8,rax
82 adc r9,rdx
d9137bb2 83 and r12,r8 ; t2 = c & mask
37c4ab15
DH
84 shrd r8,r9,52
85 xor r9,r9
86
87 ;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
88 mov rax,[rdi+0*8]
d9137bb2 89 mul rcx
37c4ab15
DH
90 add r8,rax
91 adc r9,rdx
92
93 mov rax,[rdi+1*8]
d9137bb2 94 mul rbx
37c4ab15
DH
95 add r8,rax
96 adc r9,rdx
97
98 mov rax,[rdi+2*8]
d9137bb2 99 mul r15
37c4ab15
DH
100 add r8,rax
101 adc r9,rdx
102
103 mov rax,[rdi+3*8]
d9137bb2
DH
104 mul r14
105 mov r13,rbp ; mask for t3
37c4ab15
DH
106 mov rsi,[rsi+4*8] ; rsi = b.n[4]; the b.n pointer is overwritten (no longer needed)
107 add r8,rax
108 adc r9,rdx
109 and r13,r8 ; t3 = c & mask
110
111 shrd r8,r9,52
112 xor r9,r9
113
114
115 ;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
116 mov rax,[rdi+0*8]
117 mul rsi
118 add r8,rax
119 adc r9,rdx
120
121 mov rax,[rdi+1*8]
122 mul rcx
123 add r8,rax
124 adc r9,rdx
125
126 mov rax,[rdi+2*8]
d9137bb2 127 mul rbx
37c4ab15
DH
128 add r8,rax
129 adc r9,rdx
130
131 mov rax,[rdi+3*8]
d9137bb2 132 mul r15
37c4ab15
DH
133 add r8,rax
134 adc r9,rdx
135
136 mov rax,[rdi+4*8]
d9137bb2
DH
137 mul r14 ; last product that uses b.n[0]
138 mov r14,rbp ; r14 becomes t4: load the mask, overwriting b.n[0]
37c4ab15
DH
139 add r8,rax
140 adc r9,rdx
141 and r14,r8 ; t4 = c & mask
142 shrd r8,r9,52
143 xor r9,r9
144
145 ;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
146 mov rax,[rdi+1*8]
147 mul rsi
148 add r8,rax
149 adc r9,rdx
150
151 mov rax,[rdi+2*8]
152 mul rcx
153 add r8,rax
154 adc r9,rdx
155
156 mov rax,[rdi+3*8]
157 mul rbx
158 add r8,rax
159 adc r9,rdx
160
161 mov rax,[rdi+4*8]
162 mul r15 ; last product that uses b.n[1]
d9137bb2 163 mov r15,rbp ; r15 becomes t5: load the mask
37c4ab15
DH
164 add r8,rax
165 adc r9,rdx
166
167 and r15,r8 ; t5 = c & mask
168 shrd r8,r9,52
169 xor r9,r9
170
171 ;; c+=a.n[2 3 4] * b.n[4 3 2]
172 mov rax,[rdi+2*8]
173 mul rsi
174 add r8,rax
175 adc r9,rdx
176
177 mov rax,[rdi+3*8]
178 mul rcx
179 add r8,rax
180 adc r9,rdx
181
182 mov rax,[rdi+4*8]
183 mul rbx ; last product that uses b.n[2]
d9137bb2 184 mov rbx,rbp ; rbx becomes t6: load the mask
37c4ab15
DH
185 add r8,rax
186 adc r9,rdx
187
d9137bb2 188 and rbx,r8 ; t6 = c & mask
37c4ab15
DH
189 shrd r8,r9,52
190 xor r9,r9
191
192 ;; c+=a.n[3 4] * b.n[4 3]
193 mov rax,[rdi+3*8]
194 mul rsi
195 add r8,rax
196 adc r9,rdx
197
198 mov rax,[rdi+4*8]
199 mul rcx ; last product that uses b.n[3]
d9137bb2 200 mov rcx,rbp ; rcx becomes t7: load the mask
37c4ab15
DH
201 add r8,rax
202 adc r9,rdx
d9137bb2 203 and rcx,r8 ; t7 = c & mask
37c4ab15
DH
204 shrd r8,r9,52
205 xor r9,r9
206
207 ;; c+=a.n[4] * b.n[4]
208 mov rax,[rdi+4*8]
209 mul rsi
210 ;; no mask copy needed here: rbp already holds the mask and becomes t8 itself
211 add r8,rax
212 adc r9,rdx
213 and rbp,r8 ; t8 = c & mask (the mask constant in rbp is gone after this)
214 shrd r8,r9,52
215 xor r9,r9
216
d9137bb2 217 mov rsi,r8 ; t9 = remaining c; overwrites b.n[4]
37c4ab15
DH
218
219 ;; *******************************************************
;; Shared tail, also entered by the jmp at the end of ExSetSquare.
;; Expects on entry: r10-r14 = t0-t4, r15 = t5, rbx = t6, rcx = t7,
;; rbp = t8, rsi = t9, and the result pointer on top of the stack.
;; Each step computes c += t(i+5)*01000003D10h + t(i), keeps the masked low
;; bits as limb i and shifts c right for the next limb.
220common_exit_norm::
d9137bb2 221 mov rdi,01000003D10h ; rdi = multiplier for t5..t9 (= 01000003D1h * 16); the a.n pointer is overwritten
37c4ab15
DH
222
223 mov rax,r15 ; get t5
224 mul rdi
225 add rax,r10 ; +t0
226 adc rdx,0
d9137bb2 227 mov r10,0FFFFFFFFFFFFFh ; reload the 52-bit mask as an immediate: rbp now holds t8
37c4ab15
DH
228 mov r8,rax ; c = t5*multiplier + t0 (low qword)
229 and r10,rax ; new t0 = low 52 bits
230 shrd r8,rdx,52 ; c >>= 52
231 xor r9,r9
232
233 mov rax,rbx ; get t6
234 mul rdi
235 add rax,r11 ; +t1
236 adc rdx,0
237 mov r11,0FFFFFFFFFFFFFh ; 52-bit mask
238 add r8,rax ; +c
239 adc r9,rdx
240 and r11,r8 ; new t1
241 shrd r8,r9,52
242 xor r9,r9
243
244 mov rax,rcx ; get t7
245 mul rdi
246 add rax,r12 ; +t2
247 adc rdx,0
d9137bb2 248 pop rbx ; retrieve pointer to this.n (t6 in rbx was consumed above)
37c4ab15
DH
249 mov r12,0FFFFFFFFFFFFFh ; 52-bit mask
250 add r8,rax ; +c
251 adc r9,rdx
252 and r12,r8
d9137bb2 253 mov [rbx+2*8],r12 ; -> this.n[2]
37c4ab15
DH
254 shrd r8,r9,52
255 xor r9,r9
256
257 mov rax,rbp ; get t8
258 mul rdi
259 add rax,r13 ; +t3
260 adc rdx,0
261 mov r13,0FFFFFFFFFFFFFh ; 52-bit mask
262 add r8,rax ; +c
263 adc r9,rdx
264 and r13,r8
d9137bb2 265 mov [rbx+3*8],r13 ; -> this.n[3]
37c4ab15
DH
266 shrd r8,r9,52
267 xor r9,r9
268
269 mov rax,rsi ; get t9
270 mul rdi
271 add rax,r14 ; +t4
272 adc rdx,0
273 mov r14,0FFFFFFFFFFFFh ; 48-bit mask, not 52: the top limb keeps only 48 bits
274 add r8,rax ; +c
275 adc r9,rdx
276 and r14,r8
d9137bb2
DH
277 mov [rbx+4*8],r14 ; -> this.n[4]
278 shrd r8,r9,48 ; shift by 48 to match the 48-bit top limb
37c4ab15
DH
279 xor r9,r9
280
;; fold what overflowed the top limb back into limb 0
d9137bb2 281 mov rax,01000003D1h ; unshifted multiplier (the t5..t9 steps used 16x this value)
37c4ab15
DH
282 mul r8 ; rdx:rax = c * 01000003D1h
283 add rax,r10 ; + t0
284 adc rdx,0
285 mov r10,0FFFFFFFFFFFFFh ; 52-bit mask
286 mov r8,rax
287 and rax,r10 ; limb 0 = low 52 bits
288 shrd r8,rdx,52 ; carry out of limb 0
d9137bb2 289 mov [rbx+0*8],rax ; -> this.n[0]
37c4ab15 290 add r8,r11 ; limb 1 = t1 + carry; stored without masking
d9137bb2 291 mov [rbx+1*8],r8 ; -> this.n[1]
37c4ab15
DH
292 ret ; NOTE(review): presumably expanded by the assembler into the USES epilogue - confirm in the listing
293ExSetMult ENDP
294
295
;; ExSetSquare: squares the five-limb operand a.n and stores a five-limb
;; result through the second pointer argument.
;; Builds the column sums t0..t9 like ExSetMult, but each cross product is
;; computed once against a doubled limb (2*a.n[i]) instead of twice.
;; It then jumps into common_exit_norm (inside ExSetMult) for the final fold,
;; the stores and the return.
37c4ab15 296 ;; Register Layout:
d9137bb2
DH
297 ;; INPUT: rdi = a.n
298 ;; rsi = this.n (result pointer; pushed on entry, popped in common_exit_norm)
37c4ab15 299 ;; INTERNAL: rdx:rax = multiplication accumulator
d9137bb2
DH
300 ;; r9:r8 = c (running column sum / carry)
301 ;; r10-r13 = t0-t3
302 ;; r14 = a.n[0], then 2*a.n[0] / t4
303 ;; r15 = a.n[1], then 2*a.n[1] / t5
304 ;; rbx = a.n[2], then 2*a.n[2] / t6
305 ;; rcx = a.n[3] / t7
306 ;; rbp = 52-bit mask 0FFFFFFFFFFFFFh / t8
307 ;; rsi = this.n / a.n[4] / t9
37c4ab15
DH
308ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
309 push rsi ; park the result pointer; common_exit_norm pops it into rbx
1d8e4308 310 mov rbp,0FFFFFFFFFFFFFh ; rbp = 52-bit limb mask
37c4ab15
DH
311
312 ;; c=a.n[0] * a.n[0]
1d8e4308
DH
313 mov r14,[rdi+0*8] ; r14=a.n[0]
314 mov r10,rbp ; mask for t0
315 mov rax,r14
316 mul rax ; rdx:rax = a.n[0]^2
317 mov r15,[rdi+1*8] ; r15=a.n[1]
318 add r14,r14 ; r14=2*a.n[0]
37c4ab15
DH
319 mov r8,rax
320 and r10,rax ; t0 = low 52 bits of the product
321 shrd r8,rdx,52 ; c = product >> 52 (low qword)
322 xor r9,r9 ; high qword of c = 0; assumes product >> 52 fits in 64 bits - TODO confirm limb bounds
323
324 ;; c+=2*a.n[0] * a.n[1]
1d8e4308
DH
325 mov rax,r14 ; rax=2*a.n[0]
326 mul r15
327 mov rbx,[rdi+2*8] ; rbx=a.n[2]
328 mov r11,rbp ; mask for t1
37c4ab15
DH
329 add r8,rax
330 adc r9,rdx
331 and r11,r8 ; t1 = c & mask
332 shrd r8,r9,52
333 xor r9,r9
334
335 ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
1d8e4308
DH
336 mov rax,r14
337 mul rbx
37c4ab15
DH
338 add r8,rax
339 adc r9,rdx
340
1d8e4308
DH
341 mov rax,r15
342 mov r12,rbp ; mask for t2
37c4ab15 343 mul rax ; rdx:rax = a.n[1]^2 (r15 is doubled only after this)
1d8e4308
DH
344 mov rcx,[rdi+3*8] ; rcx=a.n[3]
345 add r15,r15 ; r15=a.n[1]*2
37c4ab15
DH
346 add r8,rax
347 adc r9,rdx
37c4ab15
DH
348 and r12,r8 ; t2 = low 52 bits of c
349 shrd r8,r9,52
350 xor r9,r9
351
352 ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
1d8e4308
DH
353 mov rax,r14
354 mul rcx
37c4ab15
DH
355 add r8,rax
356 adc r9,rdx
357
1d8e4308
DH
358 mov rax,r15 ; rax=2*a.n[1]
359 mov r13,rbp ; mask for t3
360 mul rbx
361 mov rsi,[rdi+4*8] ; rsi=a.n[4]; the result pointer stays safe on the stack
37c4ab15
DH
362 add r8,rax
363 adc r9,rdx
364 and r13,r8 ; t3 = c & mask
365 shrd r8,r9,52
366 xor r9,r9
367
368 ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
1d8e4308 369 mov rax,r14 ; last time we need 2*a.n[0]
37c4ab15
DH
370 mul rsi
371 add r8,rax
372 adc r9,rdx
373
1d8e4308
DH
374 mov rax,r15
375 mul rcx
376 mov r14,rbp ; r14 becomes t4: load the mask
37c4ab15
DH
377 add r8,rax
378 adc r9,rdx
379
1d8e4308 380 mov rax,rbx
37c4ab15 381 mul rax ; rdx:rax = a.n[2]^2 (rbx is doubled only after this)
1d8e4308 382 add rbx,rbx ; rbx=2*a.n[2]
37c4ab15
DH
383 add r8,rax
384 adc r9,rdx
385 and r14,r8 ; t4 = c & mask
386 shrd r8,r9,52
387 xor r9,r9
388
389 ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
1d8e4308 390 mov rax,r15 ; last time we need 2*a.n[1]
37c4ab15
DH
391 mul rsi
392 add r8,rax
393 adc r9,rdx
394
1d8e4308
DH
395 mov rax,rbx
396 mul rcx
397 mov r15,rbp ; r15 becomes t5: load the mask
37c4ab15
DH
398 add r8,rax
399 adc r9,rdx
400 and r15,r8 ; t5 = c & mask
401 shrd r8,r9,52
402 xor r9,r9
403
404 ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
1d8e4308 405 mov rax,rbx ; last time we need 2*a.n[2]
37c4ab15
DH
406 mul rsi
407 add r8,rax
408 adc r9,rdx
409
1d8e4308 410 mov rax,rcx ; a.n[3]
37c4ab15 411 mul rax
1d8e4308 412 mov rbx,rbp ; rbx becomes t6: load the mask
37c4ab15
DH
413 add r8,rax
414 adc r9,rdx
415 and rbx,r8 ; t6 = low 52 bits of c
1d8e4308 416 lea rax,[2*rcx] ; rax = 2*a.n[3], consumed by the "mul rsi" of the next column
37c4ab15
DH
417 shrd r8,r9,52
418 xor r9,r9
419
420 ;; c+=2*a.n[3]*a.n[4]
421 mul rsi ; rax was set to 2*a.n[3] by the lea above
1d8e4308 422 mov rcx,rbp ; rcx becomes t7: load the mask
37c4ab15
DH
423 add r8,rax
424 adc r9,rdx
425 and rcx,r8 ; t7 = low 52 bits of c
426 shrd r8,r9,52
427 xor r9,r9
428
429 ;; c+=a.n[4]*a.n[4]
430 mov rax,rsi
431 mul rax
1d8e4308 432 ;; no mask copy needed here: rbp already holds the mask and becomes t8 itself
37c4ab15
DH
433 add r8,rax
434 adc r9,rdx
435 and rbp,r8 ; t8 = c & mask
436 shrd r8,r9,52
437 xor r9,r9
438
439 mov rsi,r8 ; t9 = remaining c; overwrites a.n[4]
440
441 ;; *******************************************************
;; Registers and stack now match what common_exit_norm expects (t0-t9 in
;; r10-r15/rbx/rcx/rbp/rsi, result pointer on top of the stack).
;; NOTE(review): the return executes inside ExSetMult, so it relies on both
;; procedures having the same USES list (they do here) - keep them in sync.
442 jmp common_exit_norm
443ExSetSquare ENDP
444 end
445
446
This page took 0.075566 seconds and 4 git commands to generate.