]>
Commit | Line | Data |
---|---|---|
d9137bb2 DH |
1 | ;; Added by Diederik Huys, March 2013 |
2 | ;; | |
3 | ;; Provided public procedures: | |
4 | ;; ExSetMult | |
5 | ;; ExSetSquare | |
6 | ;; | |
7 | ;; Needed tools: JWASM (http://www.japheth.de/JWasm.html) | |
8 | ;; | |
9 | ;; !!! WARNING !!! !!! WARNING !!! !!! WARNING !!! | |
10 | ;; | |
11 | ;; Please note that recompiling this binary (jwasm) under a 64-bit OS | |
12 | ;; may yield unexpected results and create a corrupted ELF64 header. | |
13 | ;; | |
14 | ;; | |
15 | ||
16 | .x64 | |
37c4ab15 DH |
17 | QTEST EQU 1 |
18 | .code | |
d9137bb2 | 19 | |
37c4ab15 DH |
20 | ;; ExSetMult: multiply the five qword limbs at a.n by the five at b.n and store five reduced limbs through the pointer passed in rdx. Register layout: |
21 | ;; INPUT: rdi = a.n | |
22 | ;; rsi = b.n | |
23 | ;; rdx = this.n (result pointer; pushed on entry, popped into rbx in common_exit_norm) | |
d9137bb2 | 24 | ;; |
37c4ab15 | 25 | ;; INTERNAL: rdx:rax = 128-bit result of each mul |
d9137bb2 DH |
26 | ;; r9:r8 = c (128-bit running column sum / carry) |
27 | ;; r10-r13 = t0-t3 (low 52 bits of columns 0-3) | |
28 | ;; r14 = b.n[0] / t4 | |
29 | ;; r15 = b.n[1] / t5 | |
30 | ;; rbx = b.n[2] / t6 | |
31 | ;; rcx = b.n[3] / t7 | |
32 | ;; rbp = 52-bit limb mask 0FFFFFFFFFFFFFh / t8 | |
33 | ;; rsi = b.n / b.n[4] / t9 | |
37c4ab15 DH |
34 | ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 |
35 | push rdx | |
d9137bb2 DH |
36 | mov r14,[rsi+8*0] ; preload b.n[0]; r14 holds it until b.n[0] is |
37 | ; no longer needed, after which r14 is reassigned | |
38 | ; to hold t4 | |
37c4ab15 | 39 | ;; c=a.n[0] * b.n[0] |
d9137bb2 | 40 | mov rax,[rdi+0*8] ; load a.n[0] |
37c4ab15 | 41 | mov rbp,0FFFFFFFFFFFFFh |
d9137bb2 | 42 | mul r14 ; rdx:rax=a.n[0]*b.n[0] |
37c4ab15 | 43 | mov r15,[rsi+1*8] |
d9137bb2 | 44 | mov r10,rbp ; load 52-bit mask into the target register for t0 |
37c4ab15 | 45 | mov r8,rax |
d9137bb2 | 46 | and r10,rax ; t0 = low 52 bits of c (only the lower qword is needed) |
37c4ab15 | 47 | shrd r8,rdx,52 |
d9137bb2 | 48 | xor r9,r9 ; c>>52 is assumed to fit in 64 bits, so the HO qword is cleared (relies on bounded input limbs - TODO confirm bound) |
37c4ab15 DH |
49 | |
50 | ;; c+=a.n[0]*b.n[1] + a.n[1]*b.n[0] | |
51 | mov rax,[rdi+0*8] | |
d9137bb2 | 52 | mul r15 |
37c4ab15 DH |
53 | add r8,rax |
54 | adc r9,rdx | |
55 | ||
56 | mov rax,[rdi+1*8] | |
d9137bb2 | 57 | mul r14 |
37c4ab15 DH |
58 | mov r11,rbp |
59 | mov rbx,[rsi+2*8] | |
60 | add r8,rax | |
61 | adc r9,rdx | |
62 | and r11,r8 | |
63 | shrd r8,r9,52 | |
64 | xor r9,r9 | |
65 | ||
66 | ;; c+=a.n[0]*b.n[2] + a.n[1]*b.n[1] + a.n[2]*b.n[0] | |
67 | mov rax,[rdi+0*8] | |
d9137bb2 | 68 | mul rbx |
37c4ab15 DH |
69 | add r8,rax |
70 | adc r9,rdx | |
71 | ||
72 | mov rax,[rdi+1*8] | |
d9137bb2 | 73 | mul r15 |
37c4ab15 DH |
74 | add r8,rax |
75 | adc r9,rdx | |
76 | ||
77 | mov rax,[rdi+2*8] | |
78 | mul r14 | |
d9137bb2 | 79 | mov r12,rbp |
37c4ab15 DH |
80 | mov rcx,[rsi+3*8] |
81 | add r8,rax | |
82 | adc r9,rdx | |
d9137bb2 | 83 | and r12,r8 |
37c4ab15 DH |
84 | shrd r8,r9,52 |
85 | xor r9,r9 | |
86 | ||
87 | ;; c+=sum of a.n[i]*b.n[3-i] for i=0..3 | |
88 | mov rax,[rdi+0*8] | |
d9137bb2 | 89 | mul rcx |
37c4ab15 DH |
90 | add r8,rax |
91 | adc r9,rdx | |
92 | ||
93 | mov rax,[rdi+1*8] | |
d9137bb2 | 94 | mul rbx |
37c4ab15 DH |
95 | add r8,rax |
96 | adc r9,rdx | |
97 | ||
98 | mov rax,[rdi+2*8] | |
d9137bb2 | 99 | mul r15 |
37c4ab15 DH |
100 | add r8,rax |
101 | adc r9,rdx | |
102 | ||
103 | mov rax,[rdi+3*8] | |
d9137bb2 DH |
104 | mul r14 |
105 | mov r13,rbp | |
37c4ab15 DH |
106 | mov rsi,[rsi+4*8] ; load b.n[4] and destroy the b.n pointer |
107 | add r8,rax | |
108 | adc r9,rdx | |
109 | and r13,r8 | |
110 | ||
111 | shrd r8,r9,52 | |
112 | xor r9,r9 | |
113 | ||
114 | ||
115 | ;; c+=sum of a.n[i]*b.n[4-i] for i=0..4 | |
116 | mov rax,[rdi+0*8] | |
117 | mul rsi | |
118 | add r8,rax | |
119 | adc r9,rdx | |
120 | ||
121 | mov rax,[rdi+1*8] | |
122 | mul rcx | |
123 | add r8,rax | |
124 | adc r9,rdx | |
125 | ||
126 | mov rax,[rdi+2*8] | |
d9137bb2 | 127 | mul rbx |
37c4ab15 DH |
128 | add r8,rax |
129 | adc r9,rdx | |
130 | ||
131 | mov rax,[rdi+3*8] | |
d9137bb2 | 132 | mul r15 |
37c4ab15 DH |
133 | add r8,rax |
134 | adc r9,rdx | |
135 | ||
136 | mov rax,[rdi+4*8] | |
d9137bb2 DH |
137 | mul r14 |
138 | mov r14,rbp ; load 52-bit mask into t4 and destroy b.n[0] (last use was the mul above) | |
37c4ab15 DH |
139 | add r8,rax |
140 | adc r9,rdx | |
141 | and r14,r8 | |
142 | shrd r8,r9,52 | |
143 | xor r9,r9 | |
144 | ||
145 | ;; c+=sum of a.n[i]*b.n[5-i] for i=1..4 | |
146 | mov rax,[rdi+1*8] | |
147 | mul rsi | |
148 | add r8,rax | |
149 | adc r9,rdx | |
150 | ||
151 | mov rax,[rdi+2*8] | |
152 | mul rcx | |
153 | add r8,rax | |
154 | adc r9,rdx | |
155 | ||
156 | mov rax,[rdi+3*8] | |
157 | mul rbx | |
158 | add r8,rax | |
159 | adc r9,rdx | |
160 | ||
161 | mov rax,[rdi+4*8] | |
162 | mul r15 | |
d9137bb2 | 163 | mov r15,rbp |
37c4ab15 DH |
164 | add r8,rax |
165 | adc r9,rdx | |
166 | ||
167 | and r15,r8 | |
168 | shrd r8,r9,52 | |
169 | xor r9,r9 | |
170 | ||
171 | ;; c+=sum of a.n[i]*b.n[6-i] for i=2..4 | |
172 | mov rax,[rdi+2*8] | |
173 | mul rsi | |
174 | add r8,rax | |
175 | adc r9,rdx | |
176 | ||
177 | mov rax,[rdi+3*8] | |
178 | mul rcx | |
179 | add r8,rax | |
180 | adc r9,rdx | |
181 | ||
182 | mov rax,[rdi+4*8] | |
183 | mul rbx | |
d9137bb2 | 184 | mov rbx,rbp |
37c4ab15 DH |
185 | add r8,rax |
186 | adc r9,rdx | |
187 | ||
d9137bb2 | 188 | and rbx,r8 |
37c4ab15 DH |
189 | shrd r8,r9,52 |
190 | xor r9,r9 | |
191 | ||
192 | ;; c+=a.n[3]*b.n[4] + a.n[4]*b.n[3] | |
193 | mov rax,[rdi+3*8] | |
194 | mul rsi | |
195 | add r8,rax | |
196 | adc r9,rdx | |
197 | ||
198 | mov rax,[rdi+4*8] | |
199 | mul rcx | |
d9137bb2 | 200 | mov rcx,rbp |
37c4ab15 DH |
201 | add r8,rax |
202 | adc r9,rdx | |
d9137bb2 | 203 | and rcx,r8 |
37c4ab15 DH |
204 | shrd r8,r9,52 |
205 | xor r9,r9 | |
206 | ||
207 | ;; c+=a.n[4] * b.n[4] | |
208 | mov rax,[rdi+4*8] | |
209 | mul rsi | |
210 | ;; mov rbp,rbp ; 52-bit mask is already in rbp, which becomes t8 | |
211 | add r8,rax | |
212 | adc r9,rdx | |
213 | and rbp,r8 | |
214 | shrd r8,r9,52 | |
215 | xor r9,r9 | |
216 | ||
d9137bb2 | 217 | mov rsi,r8 ; load the remaining c into t9 and destroy b.n[4] |
37c4ab15 DH |
218 | |
219 | ;; ***** shared reduction tail; ExSetSquare also enters here via jmp with t0-t9 in the same registers ***** | |
220 | common_exit_norm:: | |
d9137bb2 | 221 | mov rdi,01000003D10h ; reduction constant (01000003D1h shifted left by 4); destroys the a.n pointer |
37c4ab15 DH |
222 | |
223 | mov rax,r15 ; get t5 | |
224 | mul rdi | |
225 | add rax,r10 ; +t0 | |
226 | adc rdx,0 | |
d9137bb2 | 227 | mov r10,0FFFFFFFFFFFFFh ; 52-bit mask. Sadly, we ran out of registers! |
37c4ab15 DH |
228 | mov r8,rax ; c = t5*constant + t0 (starts the carry chain of the reduction) |
229 | and r10,rax | |
230 | shrd r8,rdx,52 | |
231 | xor r9,r9 | |
232 | ||
233 | mov rax,rbx ; get t6 | |
234 | mul rdi | |
235 | add rax,r11 ; +t1 | |
236 | adc rdx,0 | |
237 | mov r11,0FFFFFFFFFFFFFh ; 52-bit mask | |
238 | add r8,rax ; +c | |
239 | adc r9,rdx | |
240 | and r11,r8 | |
241 | shrd r8,r9,52 | |
242 | xor r9,r9 | |
243 | ||
244 | mov rax,rcx ; get t7 | |
245 | mul rdi | |
246 | add rax,r12 ; +t2 | |
247 | adc rdx,0 | |
d9137bb2 | 248 | pop rbx ; retrieve the pointer to this.n pushed at procedure entry (rdx in ExSetMult, rsi in ExSetSquare) |
37c4ab15 DH |
249 | mov r12,0FFFFFFFFFFFFFh ; 52-bit mask |
250 | add r8,rax ; +c | |
251 | adc r9,rdx | |
252 | and r12,r8 | |
d9137bb2 | 253 | mov [rbx+2*8],r12 ; mov into this.n[2] |
37c4ab15 DH |
254 | shrd r8,r9,52 |
255 | xor r9,r9 | |
256 | ||
257 | mov rax,rbp ; get t8 | |
258 | mul rdi | |
259 | add rax,r13 ; +t3 | |
260 | adc rdx,0 | |
261 | mov r13,0FFFFFFFFFFFFFh ; 52-bit mask | |
262 | add r8,rax ; +c | |
263 | adc r9,rdx | |
264 | and r13,r8 | |
d9137bb2 | 265 | mov [rbx+3*8],r13 ; -> this.n[3] |
37c4ab15 DH |
266 | shrd r8,r9,52 |
267 | xor r9,r9 | |
268 | ||
269 | mov rax,rsi ; get t9 | |
270 | mul rdi | |
271 | add rax,r14 ; +t4 | |
272 | adc rdx,0 | |
273 | mov r14,0FFFFFFFFFFFFh ; !!! 48-bit mask here, not 52: the top limb keeps only 48 bits | |
274 | add r8,rax ; +c | |
275 | adc r9,rdx | |
276 | and r14,r8 | |
d9137bb2 DH |
277 | mov [rbx+4*8],r14 ; -> this.n[4] |
278 | shrd r8,r9,48 ; !!! shift by 48 to match the 48-bit top limb |
37c4ab15 DH |
279 | xor r9,r9 |
280 | ||
d9137bb2 | 281 | mov rax,01000003D1h |
37c4ab15 DH |
282 | mul r8 |
283 | add rax,r10 | |
284 | adc rdx,0 | |
285 | mov r10,0FFFFFFFFFFFFFh ; 52-bit mask | |
286 | mov r8,rax | |
287 | and rax,r10 | |
288 | shrd r8,rdx,52 | |
d9137bb2 | 289 | mov [rbx+0*8],rax ; -> this.n[0] |
37c4ab15 | 290 | add r8,r11 |
d9137bb2 | 291 | mov [rbx+1*8],r8 ; -> this.n[1] (stored without masking to 52 bits) |
37c4ab15 DH |
292 | ret |
293 | ExSetMult ENDP | |
294 | ||
295 | ||
37c4ab15 | 296 | ;; ExSetSquare: square the five qword limbs at a.n, then jump to common_exit_norm, which reduces and stores the result. Register layout: |
d9137bb2 DH |
297 | ;; INPUT: rdi = a.n |
298 | ;; rsi = this.a (result pointer; pushed on entry, popped by common_exit_norm) | |
37c4ab15 | 299 | ;; INTERNAL: rdx:rax = 128-bit result of each mul |
d9137bb2 DH |
300 | ;; r9:r8 = c (128-bit running column sum / carry) |
301 | ;; r10-r13 = t0-t3 (low 52 bits of columns 0-3) | |
302 | ;; r14 = a.n[0], then 2*a.n[0] / t4 | |
303 | ;; r15 = a.n[1], then 2*a.n[1] / t5 | |
304 | ;; rbx = a.n[2], then 2*a.n[2] / t6 | |
305 | ;; rcx = a.n[3] / t7 | |
306 | ;; rbp = 52-bit limb mask 0FFFFFFFFFFFFFh / t8 | |
307 | ;; rsi = this.a / a.n[4] / t9 | |
37c4ab15 DH |
308 | ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 |
309 | push rsi | |
1d8e4308 | 310 | mov rbp,0FFFFFFFFFFFFFh |
37c4ab15 DH |
311 | |
312 | ;; c=a.n[0] * a.n[0] | |
1d8e4308 DH |
313 | mov r14,[rdi+0*8] ; r14=a.n[0] |
314 | mov r10,rbp ; 52-bit mask | |
315 | mov rax,r14 | |
316 | mul rax | |
317 | mov r15,[rdi+1*8] ; r15=a.n[1] | |
318 | add r14,r14 ; r14=2*a.n[0], reused for every cross term with a.n[0] | |
37c4ab15 DH |
319 | mov r8,rax |
320 | and r10,rax ; t0 = low 52 bits (only the lower qword is needed) | |
321 | shrd r8,rdx,52 | |
322 | xor r9,r9 | |
323 | ||
324 | ;; c+=2*a.n[0] * a.n[1] | |
1d8e4308 DH |
325 | mov rax,r14 ; r14=2*a.n[0] |
326 | mul r15 | |
327 | mov rbx,[rdi+2*8] ; rbx=a.n[2] | |
328 | mov r11,rbp ; 52-bit mask | |
37c4ab15 DH |
329 | add r8,rax |
330 | adc r9,rdx | |
331 | and r11,r8 | |
332 | shrd r8,r9,52 | |
333 | xor r9,r9 | |
334 | ||
335 | ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1] | |
1d8e4308 DH |
336 | mov rax,r14 |
337 | mul rbx | |
37c4ab15 DH |
338 | add r8,rax |
339 | adc r9,rdx | |
340 | ||
1d8e4308 DH |
341 | mov rax,r15 |
342 | mov r12,rbp ; 52-bit mask | |
37c4ab15 | 343 | mul rax |
1d8e4308 DH |
344 | mov rcx,[rdi+3*8] ; rcx=a.n[3] |
345 | add r15,r15 ; r15=a.n[1]*2 (doubled only after the a.n[1]^2 mul above) | |
37c4ab15 DH |
346 | add r8,rax |
347 | adc r9,rdx | |
37c4ab15 DH |
348 | and r12,r8 ; t2 = low 52 bits of c |
349 | shrd r8,r9,52 | |
350 | xor r9,r9 | |
351 | ||
352 | ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2] | |
1d8e4308 DH |
353 | mov rax,r14 |
354 | mul rcx | |
37c4ab15 DH |
355 | add r8,rax |
356 | adc r9,rdx | |
357 | ||
1d8e4308 DH |
358 | mov rax,r15 ; rax=2*a.n[1] |
359 | mov r13,rbp ; 52-bit mask | |
360 | mul rbx | |
361 | mov rsi,[rdi+4*8] ; rsi=a.n[4] (the result pointer was already pushed on entry) | |
37c4ab15 DH |
362 | add r8,rax |
363 | adc r9,rdx | |
364 | and r13,r8 | |
365 | shrd r8,r9,52 | |
366 | xor r9,r9 | |
367 | ||
368 | ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2] | |
1d8e4308 | 369 | mov rax,r14 ; last time we need 2*a.n[0] |
37c4ab15 DH |
370 | mul rsi |
371 | add r8,rax | |
372 | adc r9,rdx | |
373 | ||
1d8e4308 DH |
374 | mov rax,r15 |
375 | mul rcx | |
376 | mov r14,rbp ; 52-bit mask; r14 becomes t4 | |
37c4ab15 DH |
377 | add r8,rax |
378 | adc r9,rdx | |
379 | ||
1d8e4308 | 380 | mov rax,rbx |
37c4ab15 | 381 | mul rax |
1d8e4308 | 382 | add rbx,rbx ; rbx=2*a.n[2] |
37c4ab15 DH |
383 | add r8,rax |
384 | adc r9,rdx | |
385 | and r14,r8 | |
386 | shrd r8,r9,52 | |
387 | xor r9,r9 | |
388 | ||
389 | ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3] | |
1d8e4308 | 390 | mov rax,r15 ; last time we need 2*a.n[1] |
37c4ab15 DH |
391 | mul rsi |
392 | add r8,rax | |
393 | adc r9,rdx | |
394 | ||
1d8e4308 DH |
395 | mov rax,rbx |
396 | mul rcx | |
397 | mov r15,rbp ; 52-bit mask; r15 becomes t5 | |
37c4ab15 DH |
398 | add r8,rax |
399 | adc r9,rdx | |
400 | and r15,r8 | |
401 | shrd r8,r9,52 | |
402 | xor r9,r9 | |
403 | ||
404 | ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3] | |
1d8e4308 | 405 | mov rax,rbx ; last time we need 2*a.n[2] |
37c4ab15 DH |
406 | mul rsi |
407 | add r8,rax | |
408 | adc r9,rdx | |
409 | ||
1d8e4308 | 410 | mov rax,rcx ; a.n[3] |
37c4ab15 | 411 | mul rax |
1d8e4308 | 412 | mov rbx,rbp ; 52-bit mask; rbx becomes t6 |
37c4ab15 DH |
413 | add r8,rax |
414 | adc r9,rdx | |
415 | and rbx,r8 ; t6 = low 52 bits of c | |
1d8e4308 | 416 | lea rax,[2*rcx] |
37c4ab15 DH |
417 | shrd r8,r9,52 |
418 | xor r9,r9 | |
419 | ||
420 | ;; c+=2*a.n[3]*a.n[4] (rax=2*a.n[3] was already set by the lea above) | |
421 | mul rsi | |
1d8e4308 | 422 | mov rcx,rbp ; 52-bit mask; rcx becomes t7 |
37c4ab15 DH |
423 | add r8,rax |
424 | adc r9,rdx | |
425 | and rcx,r8 ; t7 = low 52 bits of c | |
426 | shrd r8,r9,52 | |
427 | xor r9,r9 | |
428 | ||
429 | ;; c+=a.n[4]*a.n[4] | |
430 | mov rax,rsi | |
431 | mul rax | |
1d8e4308 | 432 | ;; mov rbp,rbp ; 52-bit mask is already in rbp, which becomes t8 |
37c4ab15 DH |
433 | add r8,rax |
434 | adc r9,rdx | |
435 | and rbp,r8 | |
436 | shrd r8,r9,52 | |
437 | xor r9,r9 | |
438 | ||
439 | mov rsi,r8 | |
440 | ||
441 | ;; ***** continue in the shared reduction tail (label common_exit_norm inside ExSetMult), which pops the pushed pointer and returns ***** | |
442 | jmp common_exit_norm | |
443 | ExSetSquare ENDP | |
444 | end | |
445 | ||
446 |