4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version. See COPYING for more details.
10 #include "cpuminer-config.h"
12 #if defined(__linux__) && defined(__ELF__)
13 .section .note.GNU-stack,"",%progbits
16 #if defined(USE_ASM) && defined(__x86_64__)
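/*
 * 4-way constant tables.  Each 32-bit value is repeated four times so a
 * single 128-bit load yields the same constant in all four SIMD lanes.  The
 * table below (referenced later as sha256_4h) holds the SHA-256 initial hash
 * values H0..H7.
 */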
21 .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
22 .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
23 .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
24 .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
25 .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
26 .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
27 .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
28 .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
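/*
 * The 64 SHA-256 round constants K[0..63], again replicated across the four
 * lanes (referenced later as sha256_4k).
 */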
33 .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
34 .long 0x71374491, 0x71374491, 0x71374491, 0x71374491
35 .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
36 .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
37 .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
38 .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
39 .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
40 .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
41 .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
42 .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
43 .long 0x243185be, 0x243185be, 0x243185be, 0x243185be
44 .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
45 .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
46 .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
47 .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
48 .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
49 .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
50 .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
51 .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
52 .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
53 .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
54 .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
55 .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
56 .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
57 .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
58 .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
59 .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
60 .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
61 .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
62 .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
63 .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
64 .long 0x14292967, 0x14292967, 0x14292967, 0x14292967
65 .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
66 .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
67 .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
68 .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
69 .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
70 .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
71 .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
72 .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
73 .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
74 .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
75 .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
76 .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
77 .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
78 .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
79 .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
80 .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
81 .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
82 .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
83 .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
84 .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
85 .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
86 .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
87 .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
88 .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
89 .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
90 .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
91 .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
92 .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
93 .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
94 .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
95 .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
96 .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
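/*
 * The next four constants are referenced below as sha256d_4preext2_17, _23,
 * _24 and _30.  They are folded into the message-schedule extension of
 * sha256d_ms_4way and appear to pre-compute the contribution of the fixed
 * padding words of the block being hashed, so those terms need not be
 * recomputed for every nonce.
 */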
101 .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
103 .long 0x11002000, 0x11002000, 0x11002000, 0x11002000
105 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
107 .long 0x00400022, 0x00400022, 0x00400022, 0x00400022
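/*
 * 8-lane (AVX2) copies of the same tables, with each value repeated eight
 * times for the eight 32-bit lanes of a %ymm register; the round-constant
 * table is referenced below as sha256_8k.
 */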
115 .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
116 .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
117 .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
118 .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
119 .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
120 .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
121 .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
122 .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
127 .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
128 .long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491
129 .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
130 .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
131 .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
132 .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
133 .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
134 .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
135 .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
136 .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
137 .long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be
138 .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
139 .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
140 .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
141 .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
142 .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
143 .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
144 .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
145 .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
146 .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
147 .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
148 .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
149 .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
150 .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
151 .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
152 .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
153 .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
154 .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
155 .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
156 .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
157 .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
158 .long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967
159 .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
160 .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
161 .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
162 .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
163 .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
164 .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
165 .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
166 .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
167 .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
168 .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
169 .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
170 .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
171 .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
172 .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
173 .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
174 .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
175 .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
176 .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
177 .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
178 .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
179 .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
180 .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
181 .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
182 .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
183 .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
184 .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
185 .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
186 .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
187 .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
188 .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
189 .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
190 .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
195 .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
197 .long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000
199 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
201 .long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022
203 #endif /* USE_AVX2 */
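/*
 * sha256_init_4way: copy the 4-way initial hash values from sha256_4h into
 * the 128-byte state block addressed by %rdi.  Unaligned stores are used, so
 * the caller's buffer need not be 16-byte aligned.
 */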
208 .globl sha256_init_4way
209 .globl _sha256_init_4way
212 #if defined(_WIN64) || defined(__CYGWIN__)
216 movdqa sha256_4h+0(%rip), %xmm0
217 movdqa sha256_4h+16(%rip), %xmm1
218 movdqa sha256_4h+32(%rip), %xmm2
219 movdqa sha256_4h+48(%rip), %xmm3
220 movdqu %xmm0, 0(%rdi)
221 movdqu %xmm1, 16(%rdi)
222 movdqu %xmm2, 32(%rdi)
223 movdqu %xmm3, 48(%rdi)
224 movdqa sha256_4h+64(%rip), %xmm0
225 movdqa sha256_4h+80(%rip), %xmm1
226 movdqa sha256_4h+96(%rip), %xmm2
227 movdqa sha256_4h+112(%rip), %xmm3
228 movdqu %xmm0, 64(%rdi)
229 movdqu %xmm1, 80(%rdi)
230 movdqu %xmm2, 96(%rdi)
231 movdqu %xmm3, 112(%rdi)
232 #if defined(_WIN64) || defined(__CYGWIN__)
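/*
 * sha256_init_8way: broadcast each initial hash word from sha256_4h across
 * all eight AVX2 lanes and store the resulting 256-byte state at %rdi.
 */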
241 .globl sha256_init_8way
242 .globl _sha256_init_8way
245 #if defined(_WIN64) || defined(__CYGWIN__)
249 vpbroadcastd sha256_4h+0(%rip), %ymm0
250 vpbroadcastd sha256_4h+16(%rip), %ymm1
251 vpbroadcastd sha256_4h+32(%rip), %ymm2
252 vpbroadcastd sha256_4h+48(%rip), %ymm3
253 vmovdqu %ymm0, 0*32(%rdi)
254 vmovdqu %ymm1, 1*32(%rdi)
255 vmovdqu %ymm2, 2*32(%rdi)
256 vmovdqu %ymm3, 3*32(%rdi)
257 vpbroadcastd sha256_4h+64(%rip), %ymm0
258 vpbroadcastd sha256_4h+80(%rip), %ymm1
259 vpbroadcastd sha256_4h+96(%rip), %ymm2
260 vpbroadcastd sha256_4h+112(%rip), %ymm3
261 vmovdqu %ymm0, 4*32(%rdi)
262 vmovdqu %ymm1, 5*32(%rdi)
263 vmovdqu %ymm2, 6*32(%rdi)
264 vmovdqu %ymm3, 7*32(%rdi)
265 #if defined(_WIN64) || defined(__CYGWIN__)
269 #endif /* USE_AVX2 */
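/*
 * Message-schedule extension, SSE2 version.  For reference, the per-lane
 * operation being vectorized is the standard SHA-256 schedule:
 *
 *   s0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3)
 *   s1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
 *   W[i]  = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2])
 *
 * %rax points at W[0] (16 bytes per four-way word).  The doubleround form
 * produces W[i] and W[i+1] together; %xmm3 and %xmm7 carry the two most
 * recently produced words into the next round so s1() needs no reload.
 */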
272 .macro sha256_sse2_extend_round i
273 movdqa (\i-15)*16(%rax), %xmm0
285 paddd (\i-16)*16(%rax), %xmm0
286 paddd (\i-7)*16(%rax), %xmm0
300 movdqa %xmm3, \i*16(%rax)
303 .macro sha256_sse2_extend_doubleround i
304 movdqa (\i-15)*16(%rax), %xmm0
305 movdqa (\i-14)*16(%rax), %xmm4
329 paddd (\i-16)*16(%rax), %xmm0
330 paddd (\i-15)*16(%rax), %xmm4
343 paddd (\i-7)*16(%rax), %xmm0
344 paddd (\i-6)*16(%rax), %xmm4
361 movdqa %xmm3, \i*16(%rax)
362 movdqa %xmm7, (\i+1)*16(%rax)
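/*
 * Main compression round, SSE2 version.  Per lane this is the standard
 * SHA-256 round:
 *
 *   S1(e) = (e >>> 6) ^ (e >>> 11) ^ (e >>> 25)
 *   S0(a) = (a >>> 2) ^ (a >>> 13) ^ (a >>> 22)
 *   Ch(e,f,g)  = (e & f) ^ (~e & g)
 *   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
 *   t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i]
 *   t2 = S0(a) + Maj(a,b,c)
 *   new e = d + t1, new a = t1 + t2 (the other words shift down by one)
 *
 * W[i] is taken from 16*i(%rax) and K[i] from 16*i(%rcx); three state words
 * live in the 0/16/32(%rsp) slots, which are rotated every round.
 */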
365 .macro sha256_sse2_main_round i
366 movdqa 16*(\i)(%rax), %xmm6
369 movdqa 16(%rsp), %xmm2
371 paddd 32(%rsp), %xmm6
373 movdqa %xmm2, 32(%rsp)
374 movdqa 0(%rsp), %xmm2
375 movdqa %xmm2, 16(%rsp)
379 movdqa %xmm0, 0(%rsp)
385 paddd 16*(\i)(%rcx), %xmm6
429 .macro sha256_sse2_main_quadround i
430 sha256_sse2_main_round \i+0
431 sha256_sse2_main_round \i+1
432 sha256_sse2_main_round \i+2
433 sha256_sse2_main_round \i+3
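/*
 * AVX versions of the extension and main-round macros: the same computation
 * in non-destructive three-operand form, with the whole state held in %xmm
 * registers.  The rotations are still expressed as paired shifts; in the
 * extend round below the five XORed terms x>>3, x>>7, x>>18, x<<14 and x<<25
 * combine to (x >>> 7) ^ (x >>> 18) ^ (x >> 3) = s0(x).
 */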
439 .macro sha256_avx_extend_round i
440 vmovdqa (\i-15)*16(%rax), %xmm0
441 vpslld $14, %xmm0, %xmm2
442 vpsrld $3, %xmm0, %xmm0
443 vpsrld $4, %xmm0, %xmm1
444 vpxor %xmm1, %xmm0, %xmm0
445 vpxor %xmm2, %xmm0, %xmm0
446 vpsrld $11, %xmm1, %xmm1
447 vpslld $11, %xmm2, %xmm2
448 vpxor %xmm1, %xmm0, %xmm0
449 vpxor %xmm2, %xmm0, %xmm0
450 vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
451 vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
453 vpslld $13, %xmm3, %xmm2
454 vpsrld $10, %xmm3, %xmm3
455 vpsrld $7, %xmm3, %xmm1
456 vpxor %xmm1, %xmm3, %xmm3
457 vpxor %xmm2, %xmm3, %xmm3
458 vpsrld $2, %xmm1, %xmm1
459 vpslld $2, %xmm2, %xmm2
460 vpxor %xmm1, %xmm3, %xmm3
461 vpxor %xmm2, %xmm3, %xmm3
462 vpaddd %xmm0, %xmm3, %xmm3
463 vmovdqa %xmm3, \i*16(%rax)
466 .macro sha256_avx_extend_doubleround i
467 vmovdqa (\i-15)*16(%rax), %xmm0
468 vmovdqa (\i-14)*16(%rax), %xmm4
469 vpslld $14, %xmm0, %xmm2
470 vpslld $14, %xmm4, %xmm6
471 vpsrld $3, %xmm0, %xmm8
472 vpsrld $3, %xmm4, %xmm4
473 vpsrld $7, %xmm0, %xmm1
474 vpsrld $4, %xmm4, %xmm5
475 vpxor %xmm1, %xmm8, %xmm8
476 vpxor %xmm5, %xmm4, %xmm4
477 vpsrld $11, %xmm1, %xmm1
478 vpsrld $11, %xmm5, %xmm5
479 vpxor %xmm2, %xmm8, %xmm8
480 vpxor %xmm6, %xmm4, %xmm4
481 vpslld $11, %xmm2, %xmm2
482 vpslld $11, %xmm6, %xmm6
483 vpxor %xmm1, %xmm8, %xmm8
484 vpxor %xmm5, %xmm4, %xmm4
485 vpxor %xmm2, %xmm8, %xmm8
486 vpxor %xmm6, %xmm4, %xmm4
488 vpaddd %xmm0, %xmm4, %xmm4
489 vpaddd (\i-16)*16(%rax), %xmm8, %xmm0
491 vpslld $13, %xmm3, %xmm2
492 vpslld $13, %xmm7, %xmm6
493 vpsrld $10, %xmm3, %xmm3
494 vpsrld $10, %xmm7, %xmm7
496 vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
497 vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
499 vpsrld $7, %xmm3, %xmm1
500 vpsrld $7, %xmm7, %xmm5
501 vpxor %xmm1, %xmm3, %xmm3
502 vpxor %xmm5, %xmm7, %xmm7
503 vpsrld $2, %xmm1, %xmm1
504 vpsrld $2, %xmm5, %xmm5
505 vpxor %xmm2, %xmm3, %xmm3
506 vpxor %xmm6, %xmm7, %xmm7
507 vpslld $2, %xmm2, %xmm2
508 vpslld $2, %xmm6, %xmm6
509 vpxor %xmm1, %xmm3, %xmm3
510 vpxor %xmm5, %xmm7, %xmm7
511 vpxor %xmm2, %xmm3, %xmm3
512 vpxor %xmm6, %xmm7, %xmm7
514 vpaddd %xmm0, %xmm3, %xmm3
515 vpaddd %xmm4, %xmm7, %xmm7
516 vmovdqa %xmm3, \i*16(%rax)
517 vmovdqa %xmm7, (\i+1)*16(%rax)
520 .macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
521 vpaddd 16*(\i)(%rax), \r0, %xmm6
522 vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
524 vpandn \r1, \r3, %xmm1
525 vpand \r3, \r2, %xmm2
526 vpxor %xmm2, %xmm1, %xmm1
527 vpaddd %xmm1, %xmm6, %xmm6
529 vpslld $7, \r3, %xmm1
531 vpsrld $5, \r0, %xmm2
532 vpxor %xmm1, \r0, \r0
533 vpxor %xmm2, \r0, \r0
534 vpslld $14, %xmm1, %xmm1
535 vpsrld $14, %xmm2, %xmm2
536 vpxor %xmm1, \r0, \r0
537 vpxor %xmm2, \r0, \r0
538 vpslld $5, %xmm1, %xmm1
539 vpxor %xmm1, \r0, \r0
540 vpaddd \r0, %xmm6, %xmm6
541 vpaddd %xmm6, \r4, \r0
543 vpand \r6, \r5, %xmm2
545 vpand \r7, \r6, %xmm1
546 vpxor \r4, %xmm1, %xmm1
547 vpxor %xmm2, %xmm1, %xmm1
548 vpaddd %xmm1, %xmm6, %xmm6
550 vpslld $10, \r7, %xmm2
552 vpsrld $11, \r4, %xmm1
553 vpxor %xmm2, \r4, \r4
554 vpxor %xmm1, \r4, \r4
555 vpslld $9, %xmm2, %xmm2
556 vpsrld $9, %xmm1, %xmm1
557 vpxor %xmm2, \r4, \r4
558 vpxor %xmm1, \r4, \r4
559 vpslld $11, %xmm2, %xmm2
560 vpxor %xmm2, \r4, \r4
561 vpaddd %xmm6, \r4, \r4
564 .macro sha256_avx_main_quadround i
565 sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
566 sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
567 sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
568 sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
574 #if defined(USE_AVX2)
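/*
 * AVX2 versions of the same macros: identical computation, but on eight
 * messages at once in %ymm registers, with 32-byte strides into the schedule
 * (%rax) and round-constant (%rcx) arrays.
 */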
576 .macro sha256_avx2_extend_round i
577 vmovdqa (\i-15)*32(%rax), %ymm0
578 vpslld $14, %ymm0, %ymm2
579 vpsrld $3, %ymm0, %ymm0
580 vpsrld $4, %ymm0, %ymm1
581 vpxor %ymm1, %ymm0, %ymm0
582 vpxor %ymm2, %ymm0, %ymm0
583 vpsrld $11, %ymm1, %ymm1
584 vpslld $11, %ymm2, %ymm2
585 vpxor %ymm1, %ymm0, %ymm0
586 vpxor %ymm2, %ymm0, %ymm0
587 vpaddd (\i-16)*32(%rax), %ymm0, %ymm0
588 vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
590 vpslld $13, %ymm3, %ymm2
591 vpsrld $10, %ymm3, %ymm3
592 vpsrld $7, %ymm3, %ymm1
593 vpxor %ymm1, %ymm3, %ymm3
594 vpxor %ymm2, %ymm3, %ymm3
595 vpsrld $2, %ymm1, %ymm1
596 vpslld $2, %ymm2, %ymm2
597 vpxor %ymm1, %ymm3, %ymm3
598 vpxor %ymm2, %ymm3, %ymm3
599 vpaddd %ymm0, %ymm3, %ymm3
600 vmovdqa %ymm3, \i*32(%rax)
603 .macro sha256_avx2_extend_doubleround i
604 vmovdqa (\i-15)*32(%rax), %ymm0
605 vmovdqa (\i-14)*32(%rax), %ymm4
606 vpslld $14, %ymm0, %ymm2
607 vpslld $14, %ymm4, %ymm6
608 vpsrld $3, %ymm0, %ymm8
609 vpsrld $3, %ymm4, %ymm4
610 vpsrld $7, %ymm0, %ymm1
611 vpsrld $4, %ymm4, %ymm5
612 vpxor %ymm1, %ymm8, %ymm8
613 vpxor %ymm5, %ymm4, %ymm4
614 vpsrld $11, %ymm1, %ymm1
615 vpsrld $11, %ymm5, %ymm5
616 vpxor %ymm2, %ymm8, %ymm8
617 vpxor %ymm6, %ymm4, %ymm4
618 vpslld $11, %ymm2, %ymm2
619 vpslld $11, %ymm6, %ymm6
620 vpxor %ymm1, %ymm8, %ymm8
621 vpxor %ymm5, %ymm4, %ymm4
622 vpxor %ymm2, %ymm8, %ymm8
623 vpxor %ymm6, %ymm4, %ymm4
625 vpaddd %ymm0, %ymm4, %ymm4
626 vpaddd (\i-16)*32(%rax), %ymm8, %ymm0
628 vpslld $13, %ymm3, %ymm2
629 vpslld $13, %ymm7, %ymm6
630 vpsrld $10, %ymm3, %ymm3
631 vpsrld $10, %ymm7, %ymm7
633 vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
634 vpaddd (\i-6)*32(%rax), %ymm4, %ymm4
636 vpsrld $7, %ymm3, %ymm1
637 vpsrld $7, %ymm7, %ymm5
638 vpxor %ymm1, %ymm3, %ymm3
639 vpxor %ymm5, %ymm7, %ymm7
640 vpsrld $2, %ymm1, %ymm1
641 vpsrld $2, %ymm5, %ymm5
642 vpxor %ymm2, %ymm3, %ymm3
643 vpxor %ymm6, %ymm7, %ymm7
644 vpslld $2, %ymm2, %ymm2
645 vpslld $2, %ymm6, %ymm6
646 vpxor %ymm1, %ymm3, %ymm3
647 vpxor %ymm5, %ymm7, %ymm7
648 vpxor %ymm2, %ymm3, %ymm3
649 vpxor %ymm6, %ymm7, %ymm7
651 vpaddd %ymm0, %ymm3, %ymm3
652 vpaddd %ymm4, %ymm7, %ymm7
653 vmovdqa %ymm3, \i*32(%rax)
654 vmovdqa %ymm7, (\i+1)*32(%rax)
657 .macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
658 vpaddd 32*(\i)(%rax), \r0, %ymm6
659 vpaddd 32*(\i)(%rcx), %ymm6, %ymm6
661 vpandn \r1, \r3, %ymm1
662 vpand \r3, \r2, %ymm2
663 vpxor %ymm2, %ymm1, %ymm1
664 vpaddd %ymm1, %ymm6, %ymm6
666 vpslld $7, \r3, %ymm1
668 vpsrld $5, \r0, %ymm2
669 vpxor %ymm1, \r0, \r0
670 vpxor %ymm2, \r0, \r0
671 vpslld $14, %ymm1, %ymm1
672 vpsrld $14, %ymm2, %ymm2
673 vpxor %ymm1, \r0, \r0
674 vpxor %ymm2, \r0, \r0
675 vpslld $5, %ymm1, %ymm1
676 vpxor %ymm1, \r0, \r0
677 vpaddd \r0, %ymm6, %ymm6
678 vpaddd %ymm6, \r4, \r0
680 vpand \r6, \r5, %ymm2
682 vpand \r7, \r6, %ymm1
683 vpxor \r4, %ymm1, %ymm1
684 vpxor %ymm2, %ymm1, %ymm1
685 vpaddd %ymm1, %ymm6, %ymm6
687 vpslld $10, \r7, %ymm2
689 vpsrld $11, \r4, %ymm1
690 vpxor %ymm2, \r4, \r4
691 vpxor %ymm1, \r4, \r4
692 vpslld $9, %ymm2, %ymm2
693 vpsrld $9, %ymm1, %ymm1
694 vpxor %ymm2, \r4, \r4
695 vpxor %ymm1, \r4, \r4
696 vpslld $11, %ymm2, %ymm2
697 vpxor %ymm2, \r4, \r4
698 vpaddd %ymm6, \r4, \r4
701 .macro sha256_avx2_main_quadround i
702 sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
703 sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
704 sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
705 sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
708 #endif /* USE_AVX2 */
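/*
 * XOP versions: vprotd (rotate by immediate) lets the sigma functions be
 * built directly from rotations, e.g. vprotd $25 and $14 yield x >>> 7 and
 * x >>> 18, so s0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3) takes three
 * instructions plus two XORs; S0/S1 in the main round are formed the same
 * way.
 */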
713 .macro sha256_xop_extend_round i
714 vmovdqa (\i-15)*16(%rax), %xmm0
715 vprotd $25, %xmm0, %xmm1
716 vprotd $14, %xmm0, %xmm2
717 vpsrld $3, %xmm0, %xmm0
718 vpxor %xmm1, %xmm2, %xmm2
719 vpxor %xmm2, %xmm0, %xmm0
721 vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
722 vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
724 vprotd $15, %xmm3, %xmm1
725 vprotd $13, %xmm3, %xmm2
726 vpsrld $10, %xmm3, %xmm3
727 vpxor %xmm1, %xmm2, %xmm2
728 vpxor %xmm2, %xmm3, %xmm3
729 vpaddd %xmm0, %xmm3, %xmm3
730 vmovdqa %xmm3, \i*16(%rax)
733 .macro sha256_xop_extend_doubleround i
734 vmovdqa (\i-15)*16(%rax), %xmm0
735 vmovdqa (\i-14)*16(%rax), %xmm4
736 vprotd $25, %xmm0, %xmm1
737 vprotd $25, %xmm4, %xmm5
738 vprotd $14, %xmm0, %xmm2
739 vprotd $14, %xmm4, %xmm6
740 vpxor %xmm1, %xmm2, %xmm2
741 vpxor %xmm5, %xmm6, %xmm6
742 vpsrld $3, %xmm0, %xmm0
743 vpsrld $3, %xmm4, %xmm4
744 vpxor %xmm2, %xmm0, %xmm0
745 vpxor %xmm6, %xmm4, %xmm4
747 vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
748 vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
750 vprotd $15, %xmm3, %xmm1
751 vprotd $15, %xmm7, %xmm5
752 vprotd $13, %xmm3, %xmm2
753 vprotd $13, %xmm7, %xmm6
754 vpxor %xmm1, %xmm2, %xmm2
755 vpxor %xmm5, %xmm6, %xmm6
757 vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
758 vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
760 vpsrld $10, %xmm3, %xmm3
761 vpsrld $10, %xmm7, %xmm7
762 vpxor %xmm2, %xmm3, %xmm3
763 vpxor %xmm6, %xmm7, %xmm7
765 vpaddd %xmm0, %xmm3, %xmm3
766 vpaddd %xmm4, %xmm7, %xmm7
767 vmovdqa %xmm3, \i*16(%rax)
768 vmovdqa %xmm7, (\i+1)*16(%rax)
771 .macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
772 vpaddd 16*(\i)(%rax), \r0, %xmm6
773 vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
775 vpandn \r1, \r3, %xmm1
776 vpand \r3, \r2, %xmm2
777 vpxor %xmm2, %xmm1, %xmm1
778 vpaddd %xmm1, %xmm6, %xmm6
780 vprotd $26, \r3, %xmm1
781 vprotd $21, \r3, %xmm2
782 vpxor %xmm1, %xmm2, %xmm2
784 vpxor %xmm2, \r0, \r0
785 vpaddd \r0, %xmm6, %xmm6
786 vpaddd %xmm6, \r4, \r0
788 vpand \r6, \r5, %xmm2
790 vpand \r7, \r6, %xmm1
791 vpxor \r4, %xmm1, %xmm1
792 vpxor %xmm2, %xmm1, %xmm1
793 vpaddd %xmm1, %xmm6, %xmm6
795 vprotd $30, \r7, %xmm1
796 vprotd $19, \r7, %xmm2
797 vpxor %xmm1, %xmm2, %xmm2
799 vpxor %xmm2, \r4, \r4
800 vpaddd %xmm6, \r4, \r4
803 .macro sha256_xop_main_quadround i
804 sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
805 sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
806 sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
807 sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
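/*
 * Transform cores.  Each core expects the 16 four-way input words at %rsp,
 * extends them to the full 64-word schedule, then runs the 64 rounds against
 * the state at %rdi with the round constants taken from sha256_4k via %rcx.
 * The SSE2 core loops over the extension and main rounds; the AVX and XOP
 * cores are fully unrolled via the macros above.  sha256_transform_4way
 * reaches one of these through sha256_transform_4way_core_addr.
 */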
815 sha256_transform_4way_core_sse2:
817 leaq 48*16(%rcx), %rax
818 movdqa -2*16(%rcx), %xmm3
819 movdqa -1*16(%rcx), %xmm7
820 sha256_transform_4way_sse2_extend_loop:
821 movdqa -15*16(%rcx), %xmm0
822 movdqa -14*16(%rcx), %xmm4
846 paddd -16*16(%rcx), %xmm0
847 paddd -15*16(%rcx), %xmm4
860 paddd -7*16(%rcx), %xmm0
861 paddd -6*16(%rcx), %xmm4
879 movdqa %xmm7, 16(%rcx)
882 jne sha256_transform_4way_sse2_extend_loop
884 movdqu 0(%rdi), %xmm7
885 movdqu 16(%rdi), %xmm5
886 movdqu 32(%rdi), %xmm4
887 movdqu 48(%rdi), %xmm3
888 movdqu 64(%rdi), %xmm0
889 movdqu 80(%rdi), %xmm8
890 movdqu 96(%rdi), %xmm9
891 movdqu 112(%rdi), %xmm10
893 leaq sha256_4k(%rip), %rcx
895 sha256_transform_4way_sse2_main_loop:
896 movdqa (%rsp, %rax), %xmm6
897 paddd (%rcx, %rax), %xmm6
961 jne sha256_transform_4way_sse2_main_loop
962 jmp sha256_transform_4way_finish
968 sha256_transform_4way_core_avx:
970 movdqa -2*16(%rax), %xmm3
971 movdqa -1*16(%rax), %xmm7
972 sha256_avx_extend_doubleround 0
973 sha256_avx_extend_doubleround 2
974 sha256_avx_extend_doubleround 4
975 sha256_avx_extend_doubleround 6
976 sha256_avx_extend_doubleround 8
977 sha256_avx_extend_doubleround 10
978 sha256_avx_extend_doubleround 12
979 sha256_avx_extend_doubleround 14
980 sha256_avx_extend_doubleround 16
981 sha256_avx_extend_doubleround 18
982 sha256_avx_extend_doubleround 20
983 sha256_avx_extend_doubleround 22
984 sha256_avx_extend_doubleround 24
985 sha256_avx_extend_doubleround 26
986 sha256_avx_extend_doubleround 28
987 sha256_avx_extend_doubleround 30
988 sha256_avx_extend_doubleround 32
989 sha256_avx_extend_doubleround 34
990 sha256_avx_extend_doubleround 36
991 sha256_avx_extend_doubleround 38
992 sha256_avx_extend_doubleround 40
993 sha256_avx_extend_doubleround 42
994 sha256_avx_extend_doubleround 44
995 sha256_avx_extend_doubleround 46
996 movdqu 0(%rdi), %xmm7
997 movdqu 16(%rdi), %xmm5
998 movdqu 32(%rdi), %xmm4
999 movdqu 48(%rdi), %xmm3
1000 movdqu 64(%rdi), %xmm0
1001 movdqu 80(%rdi), %xmm8
1002 movdqu 96(%rdi), %xmm9
1003 movdqu 112(%rdi), %xmm10
1005 leaq sha256_4k(%rip), %rcx
1006 sha256_avx_main_quadround 0
1007 sha256_avx_main_quadround 4
1008 sha256_avx_main_quadround 8
1009 sha256_avx_main_quadround 12
1010 sha256_avx_main_quadround 16
1011 sha256_avx_main_quadround 20
1012 sha256_avx_main_quadround 24
1013 sha256_avx_main_quadround 28
1014 sha256_avx_main_quadround 32
1015 sha256_avx_main_quadround 36
1016 sha256_avx_main_quadround 40
1017 sha256_avx_main_quadround 44
1018 sha256_avx_main_quadround 48
1019 sha256_avx_main_quadround 52
1020 sha256_avx_main_quadround 56
1021 sha256_avx_main_quadround 60
1022 jmp sha256_transform_4way_finish
1023 #endif /* USE_AVX */
1026 #if defined(USE_XOP)
1029 sha256_transform_4way_core_xop:
1030 leaq 256(%rsp), %rax
1031 movdqa -2*16(%rax), %xmm3
1032 movdqa -1*16(%rax), %xmm7
1033 sha256_xop_extend_doubleround 0
1034 sha256_xop_extend_doubleround 2
1035 sha256_xop_extend_doubleround 4
1036 sha256_xop_extend_doubleround 6
1037 sha256_xop_extend_doubleround 8
1038 sha256_xop_extend_doubleround 10
1039 sha256_xop_extend_doubleround 12
1040 sha256_xop_extend_doubleround 14
1041 sha256_xop_extend_doubleround 16
1042 sha256_xop_extend_doubleround 18
1043 sha256_xop_extend_doubleround 20
1044 sha256_xop_extend_doubleround 22
1045 sha256_xop_extend_doubleround 24
1046 sha256_xop_extend_doubleround 26
1047 sha256_xop_extend_doubleround 28
1048 sha256_xop_extend_doubleround 30
1049 sha256_xop_extend_doubleround 32
1050 sha256_xop_extend_doubleround 34
1051 sha256_xop_extend_doubleround 36
1052 sha256_xop_extend_doubleround 38
1053 sha256_xop_extend_doubleround 40
1054 sha256_xop_extend_doubleround 42
1055 sha256_xop_extend_doubleround 44
1056 sha256_xop_extend_doubleround 46
1057 movdqu 0(%rdi), %xmm7
1058 movdqu 16(%rdi), %xmm5
1059 movdqu 32(%rdi), %xmm4
1060 movdqu 48(%rdi), %xmm3
1061 movdqu 64(%rdi), %xmm0
1062 movdqu 80(%rdi), %xmm8
1063 movdqu 96(%rdi), %xmm9
1064 movdqu 112(%rdi), %xmm10
1066 leaq sha256_4k(%rip), %rcx
1067 sha256_xop_main_quadround 0
1068 sha256_xop_main_quadround 4
1069 sha256_xop_main_quadround 8
1070 sha256_xop_main_quadround 12
1071 sha256_xop_main_quadround 16
1072 sha256_xop_main_quadround 20
1073 sha256_xop_main_quadround 24
1074 sha256_xop_main_quadround 28
1075 sha256_xop_main_quadround 32
1076 sha256_xop_main_quadround 36
1077 sha256_xop_main_quadround 40
1078 sha256_xop_main_quadround 44
1079 sha256_xop_main_quadround 48
1080 sha256_xop_main_quadround 52
1081 sha256_xop_main_quadround 56
1082 sha256_xop_main_quadround 60
1083 jmp sha256_transform_4way_finish
1084 #endif /* USE_XOP */
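/*
 * Run-time dispatch pointer: sha256_transform_4way jumps through this to
 * whichever of the cores above was selected for the host CPU.
 */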
1089 sha256_transform_4way_core_addr:
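/*
 * p2bswap_rsi_rsp: load two 16-byte rows of the input from %rsi, byte-swap
 * every 32-bit word (pshuflw/pshufhw exchange the 16-bit halves; a shift/xor
 * pair exchanges the bytes within each half) and store the result into the
 * schedule area at %rsp.
 */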
1092 .macro p2bswap_rsi_rsp i
1093 movdqu \i*16(%rsi), %xmm0
1094 movdqu (\i+1)*16(%rsi), %xmm2
1095 pshuflw $0xb1, %xmm0, %xmm0
1096 pshuflw $0xb1, %xmm2, %xmm2
1097 pshufhw $0xb1, %xmm0, %xmm0
1098 pshufhw $0xb1, %xmm2, %xmm2
1107 movdqa %xmm0, \i*16(%rsp)
1108 movdqa %xmm2, (\i+1)*16(%rsp)
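/*
 * sha256_transform_4way: copy the 16 four-way input words from %rsi into the
 * schedule area on the stack, byte-swapping them with p2bswap_rsi_rsp when
 * the swap path is taken (jnz sha256_transform_4way_swap), then jump to the
 * selected core.  On Win64/Cygwin the callee-saved %xmm6..%xmm11 are spilled
 * first and restored in the finish path.
 */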
1113 .globl sha256_transform_4way
1114 .globl _sha256_transform_4way
1115 sha256_transform_4way:
1116 _sha256_transform_4way:
1117 #if defined(_WIN64) || defined(__CYGWIN__)
1120 movdqa %xmm6, 0(%rsp)
1121 movdqa %xmm7, 16(%rsp)
1122 movdqa %xmm8, 32(%rsp)
1123 movdqa %xmm9, 48(%rsp)
1124 movdqa %xmm10, 64(%rsp)
1125 movdqa %xmm11, 80(%rsp)
1136 jnz sha256_transform_4way_swap
1138 movdqu 0*16(%rsi), %xmm0
1139 movdqu 1*16(%rsi), %xmm1
1140 movdqu 2*16(%rsi), %xmm2
1141 movdqu 3*16(%rsi), %xmm3
1142 movdqu 4*16(%rsi), %xmm4
1143 movdqu 5*16(%rsi), %xmm5
1144 movdqu 6*16(%rsi), %xmm6
1145 movdqu 7*16(%rsi), %xmm7
1146 movdqa %xmm0, 0*16(%rsp)
1147 movdqa %xmm1, 1*16(%rsp)
1148 movdqa %xmm2, 2*16(%rsp)
1149 movdqa %xmm3, 3*16(%rsp)
1150 movdqa %xmm4, 4*16(%rsp)
1151 movdqa %xmm5, 5*16(%rsp)
1152 movdqa %xmm6, 6*16(%rsp)
1153 movdqa %xmm7, 7*16(%rsp)
1154 movdqu 8*16(%rsi), %xmm0
1155 movdqu 9*16(%rsi), %xmm1
1156 movdqu 10*16(%rsi), %xmm2
1157 movdqu 11*16(%rsi), %xmm3
1158 movdqu 12*16(%rsi), %xmm4
1159 movdqu 13*16(%rsi), %xmm5
1160 movdqu 14*16(%rsi), %xmm6
1161 movdqu 15*16(%rsi), %xmm7
1162 movdqa %xmm0, 8*16(%rsp)
1163 movdqa %xmm1, 9*16(%rsp)
1164 movdqa %xmm2, 10*16(%rsp)
1165 movdqa %xmm3, 11*16(%rsp)
1166 movdqa %xmm4, 12*16(%rsp)
1167 movdqa %xmm5, 13*16(%rsp)
1168 movdqa %xmm6, 14*16(%rsp)
1169 movdqa %xmm7, 15*16(%rsp)
1170 jmp *sha256_transform_4way_core_addr(%rip)
1173 sha256_transform_4way_swap:
1182 jmp *sha256_transform_4way_core_addr(%rip)
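/*
 * Common finish for the 4-way cores: add the previous state from %rdi to the
 * working variables and write the updated 4-way state back with unaligned
 * stores.
 */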
1185 sha256_transform_4way_finish:
1186 movdqu 0(%rdi), %xmm2
1187 movdqu 16(%rdi), %xmm6
1188 movdqu 32(%rdi), %xmm11
1189 movdqu 48(%rdi), %xmm1
1194 movdqu 64(%rdi), %xmm2
1195 movdqu 80(%rdi), %xmm6
1196 movdqu 96(%rdi), %xmm11
1197 movdqu 112(%rdi), %xmm1
1203 movdqu %xmm7, 0(%rdi)
1204 movdqu %xmm5, 16(%rdi)
1205 movdqu %xmm4, 32(%rdi)
1206 movdqu %xmm3, 48(%rdi)
1207 movdqu %xmm0, 64(%rdi)
1208 movdqu %xmm8, 80(%rdi)
1209 movdqu %xmm9, 96(%rdi)
1210 movdqu %xmm10, 112(%rdi)
1213 #if defined(_WIN64) || defined(__CYGWIN__)
1215 movdqa 0(%rsp), %xmm6
1216 movdqa 16(%rsp), %xmm7
1217 movdqa 32(%rsp), %xmm8
1218 movdqa 48(%rsp), %xmm9
1219 movdqa 64(%rsp), %xmm10
1220 movdqa 80(%rsp), %xmm11
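/*
 * 8-way AVX2 transform core: fully unrolled schedule extension and 64 main
 * rounds over eight interleaved messages, with the schedule at %rsp (32 bytes
 * per word), the state at %rdi and the round constants taken from sha256_8k
 * via %rcx.
 */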
1231 sha256_transform_8way_core_avx2:
1232 leaq 8*64(%rsp), %rax
1233 vmovdqa -2*32(%rax), %ymm3
1234 vmovdqa -1*32(%rax), %ymm7
1235 sha256_avx2_extend_doubleround 0
1236 sha256_avx2_extend_doubleround 2
1237 sha256_avx2_extend_doubleround 4
1238 sha256_avx2_extend_doubleround 6
1239 sha256_avx2_extend_doubleround 8
1240 sha256_avx2_extend_doubleround 10
1241 sha256_avx2_extend_doubleround 12
1242 sha256_avx2_extend_doubleround 14
1243 sha256_avx2_extend_doubleround 16
1244 sha256_avx2_extend_doubleround 18
1245 sha256_avx2_extend_doubleround 20
1246 sha256_avx2_extend_doubleround 22
1247 sha256_avx2_extend_doubleround 24
1248 sha256_avx2_extend_doubleround 26
1249 sha256_avx2_extend_doubleround 28
1250 sha256_avx2_extend_doubleround 30
1251 sha256_avx2_extend_doubleround 32
1252 sha256_avx2_extend_doubleround 34
1253 sha256_avx2_extend_doubleround 36
1254 sha256_avx2_extend_doubleround 38
1255 sha256_avx2_extend_doubleround 40
1256 sha256_avx2_extend_doubleround 42
1257 sha256_avx2_extend_doubleround 44
1258 sha256_avx2_extend_doubleround 46
1259 vmovdqu 0*32(%rdi), %ymm7
1260 vmovdqu 1*32(%rdi), %ymm5
1261 vmovdqu 2*32(%rdi), %ymm4
1262 vmovdqu 3*32(%rdi), %ymm3
1263 vmovdqu 4*32(%rdi), %ymm0
1264 vmovdqu 5*32(%rdi), %ymm8
1265 vmovdqu 6*32(%rdi), %ymm9
1266 vmovdqu 7*32(%rdi), %ymm10
1268 leaq sha256_8k(%rip), %rcx
1269 sha256_avx2_main_quadround 0
1270 sha256_avx2_main_quadround 4
1271 sha256_avx2_main_quadround 8
1272 sha256_avx2_main_quadround 12
1273 sha256_avx2_main_quadround 16
1274 sha256_avx2_main_quadround 20
1275 sha256_avx2_main_quadround 24
1276 sha256_avx2_main_quadround 28
1277 sha256_avx2_main_quadround 32
1278 sha256_avx2_main_quadround 36
1279 sha256_avx2_main_quadround 40
1280 sha256_avx2_main_quadround 44
1281 sha256_avx2_main_quadround 48
1282 sha256_avx2_main_quadround 52
1283 sha256_avx2_main_quadround 56
1284 sha256_avx2_main_quadround 60
1285 jmp sha256_transform_8way_finish
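/*
 * p2bswap_avx2_rsi_rsp: AVX2 version of the input byte swap, converting two
 * 32-byte rows per invocation in the same way (16-bit halves swapped by
 * vpshuflw/vpshufhw, bytes within each half swapped by the shift/xor pair).
 */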
1287 .macro p2bswap_avx2_rsi_rsp i
1288 vmovdqu \i*32(%rsi), %ymm0
1289 vmovdqu (\i+1)*32(%rsi), %ymm2
1290 vpshuflw $0xb1, %ymm0, %ymm0
1291 vpshuflw $0xb1, %ymm2, %ymm2
1292 vpshufhw $0xb1, %ymm0, %ymm0
1293 vpshufhw $0xb1, %ymm2, %ymm2
1294 vpsrlw $8, %ymm0, %ymm1
1295 vpsrlw $8, %ymm2, %ymm3
1296 vpsllw $8, %ymm0, %ymm0
1297 vpsllw $8, %ymm2, %ymm2
1298 vpxor %ymm1, %ymm0, %ymm0
1299 vpxor %ymm3, %ymm2, %ymm2
1300 vmovdqa %ymm0, \i*32(%rsp)
1301 vmovdqa %ymm2, (\i+1)*32(%rsp)
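/*
 * sha256_transform_8way: AVX2 analogue of sha256_transform_4way for eight
 * interleaved messages; the input is copied (optionally byte-swapped) from
 * %rsi to the stack and the AVX2 core is entered directly.  The finish adds
 * the previous state from %rdi and writes the new 8-way state back.
 */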
1306 .globl sha256_transform_8way
1307 .globl _sha256_transform_8way
1308 sha256_transform_8way:
1309 _sha256_transform_8way:
1310 #if defined(_WIN64) || defined(__CYGWIN__)
1313 vmovdqa %xmm6, 0(%rsp)
1314 vmovdqa %xmm7, 16(%rsp)
1315 vmovdqa %xmm8, 32(%rsp)
1316 vmovdqa %xmm9, 48(%rsp)
1317 vmovdqa %xmm10, 64(%rsp)
1318 vmovdqa %xmm11, 80(%rsp)
1329 jnz sha256_transform_8way_swap
1331 vmovdqu 0*32(%rsi), %ymm0
1332 vmovdqu 1*32(%rsi), %ymm1
1333 vmovdqu 2*32(%rsi), %ymm2
1334 vmovdqu 3*32(%rsi), %ymm3
1335 vmovdqu 4*32(%rsi), %ymm4
1336 vmovdqu 5*32(%rsi), %ymm5
1337 vmovdqu 6*32(%rsi), %ymm6
1338 vmovdqu 7*32(%rsi), %ymm7
1339 vmovdqa %ymm0, 0*32(%rsp)
1340 vmovdqa %ymm1, 1*32(%rsp)
1341 vmovdqa %ymm2, 2*32(%rsp)
1342 vmovdqa %ymm3, 3*32(%rsp)
1343 vmovdqa %ymm4, 4*32(%rsp)
1344 vmovdqa %ymm5, 5*32(%rsp)
1345 vmovdqa %ymm6, 6*32(%rsp)
1346 vmovdqa %ymm7, 7*32(%rsp)
1347 vmovdqu 8*32(%rsi), %ymm0
1348 vmovdqu 9*32(%rsi), %ymm1
1349 vmovdqu 10*32(%rsi), %ymm2
1350 vmovdqu 11*32(%rsi), %ymm3
1351 vmovdqu 12*32(%rsi), %ymm4
1352 vmovdqu 13*32(%rsi), %ymm5
1353 vmovdqu 14*32(%rsi), %ymm6
1354 vmovdqu 15*32(%rsi), %ymm7
1355 vmovdqa %ymm0, 8*32(%rsp)
1356 vmovdqa %ymm1, 9*32(%rsp)
1357 vmovdqa %ymm2, 10*32(%rsp)
1358 vmovdqa %ymm3, 11*32(%rsp)
1359 vmovdqa %ymm4, 12*32(%rsp)
1360 vmovdqa %ymm5, 13*32(%rsp)
1361 vmovdqa %ymm6, 14*32(%rsp)
1362 vmovdqa %ymm7, 15*32(%rsp)
1363 jmp sha256_transform_8way_core_avx2
1366 sha256_transform_8way_swap:
1367 p2bswap_avx2_rsi_rsp 0
1368 p2bswap_avx2_rsi_rsp 2
1369 p2bswap_avx2_rsi_rsp 4
1370 p2bswap_avx2_rsi_rsp 6
1371 p2bswap_avx2_rsi_rsp 8
1372 p2bswap_avx2_rsi_rsp 10
1373 p2bswap_avx2_rsi_rsp 12
1374 p2bswap_avx2_rsi_rsp 14
1375 jmp sha256_transform_8way_core_avx2
1378 sha256_transform_8way_finish:
1379 vmovdqu 0*32(%rdi), %ymm2
1380 vmovdqu 1*32(%rdi), %ymm6
1381 vmovdqu 2*32(%rdi), %ymm11
1382 vmovdqu 3*32(%rdi), %ymm1
1383 vpaddd %ymm2, %ymm7, %ymm7
1384 vpaddd %ymm6, %ymm5, %ymm5
1385 vpaddd %ymm11, %ymm4, %ymm4
1386 vpaddd %ymm1, %ymm3, %ymm3
1387 vmovdqu 4*32(%rdi), %ymm2
1388 vmovdqu 5*32(%rdi), %ymm6
1389 vmovdqu 6*32(%rdi), %ymm11
1390 vmovdqu 7*32(%rdi), %ymm1
1391 vpaddd %ymm2, %ymm0, %ymm0
1392 vpaddd %ymm6, %ymm8, %ymm8
1393 vpaddd %ymm11, %ymm9, %ymm9
1394 vpaddd %ymm1, %ymm10, %ymm10
1396 vmovdqu %ymm7, 0*32(%rdi)
1397 vmovdqu %ymm5, 1*32(%rdi)
1398 vmovdqu %ymm4, 2*32(%rdi)
1399 vmovdqu %ymm3, 3*32(%rdi)
1400 vmovdqu %ymm0, 4*32(%rdi)
1401 vmovdqu %ymm8, 5*32(%rdi)
1402 vmovdqu %ymm9, 6*32(%rdi)
1403 vmovdqu %ymm10, 7*32(%rdi)
1406 #if defined(_WIN64) || defined(__CYGWIN__)
1408 vmovdqa 0(%rsp), %xmm6
1409 vmovdqa 16(%rsp), %xmm7
1410 vmovdqa 32(%rsp), %xmm8
1411 vmovdqa 48(%rsp), %xmm9
1412 vmovdqa 64(%rsp), %xmm10
1413 vmovdqa 80(%rsp), %xmm11
1419 #endif /* USE_AVX2 */
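/*
 * sha256d_ms_4way: specialized 4-way double SHA-256 with most of the first
 * transform precomputed.  As used below, %rdi receives the output, %rsi is
 * the caller's 4-way message/schedule buffer, %rcx supplies the working state
 * the first transform resumes from, and the values at %rdx are added to the
 * state once the first block's rounds complete (apparently a
 * hash / data / midstate / prehash style interface).  Dispatch goes through
 * sha256d_ms_4way_addr to the SSE2, AVX or XOP implementation.
 */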
1424 sha256d_ms_4way_addr:
1429 .globl sha256d_ms_4way
1430 .globl _sha256d_ms_4way
1433 jmp *sha256d_ms_4way_addr(%rip)
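/*
 * SSE2 implementation.  The first loop extends only the schedule words that
 * change between calls; several intermediate words are parked on the stack
 * and copied back into the caller's buffer after the first main loop,
 * apparently so subsequent calls can reuse them.  The jz branches to
 * *_extend_coda2 and *_finish separate the two hashes: the first pass resumes
 * from the state in %rcx and then builds the second input block, while the
 * second pass starts from sha256_4h and ends with reduced rounds.
 */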
1437 sha256d_ms_4way_sse2:
1438 #if defined(_WIN64) || defined(__CYGWIN__)
1441 movdqa %xmm6, 0(%rsp)
1442 movdqa %xmm7, 16(%rsp)
1451 leaq 256(%rsi), %rax
1453 sha256d_ms_4way_sse2_extend_loop1:
1454 movdqa 3*16(%rsi), %xmm0
1455 movdqa 2*16(%rax), %xmm3
1456 movdqa 3*16(%rax), %xmm7
1457 movdqa %xmm3, 5*16(%rsp)
1458 movdqa %xmm7, 6*16(%rsp)
1472 movdqa %xmm3, 2*16(%rax)
1473 movdqa %xmm7, 3*16(%rax)
1475 movdqa 4*16(%rax), %xmm0
1476 movdqa %xmm0, 7*16(%rsp)
1500 movdqa %xmm3, 4*16(%rax)
1501 movdqa %xmm7, 5*16(%rax)
1503 movdqa 6*16(%rax), %xmm0
1504 movdqa 7*16(%rax), %xmm4
1505 movdqa %xmm0, 9*16(%rsp)
1506 movdqa %xmm4, 10*16(%rsp)
1531 movdqa %xmm3, 6*16(%rax)
1532 movdqa %xmm7, 7*16(%rax)
1534 movdqa 8*16(%rax), %xmm0
1535 movdqa 2*16(%rax), %xmm4
1536 movdqa %xmm0, 11*16(%rsp)
1561 movdqa %xmm3, 8*16(%rax)
1562 movdqa %xmm7, 9*16(%rax)
1586 paddd 3*16(%rax), %xmm3
1587 paddd 4*16(%rax), %xmm7
1588 movdqa %xmm3, 10*16(%rax)
1589 movdqa %xmm7, 11*16(%rax)
1613 paddd 5*16(%rax), %xmm3
1614 paddd 6*16(%rax), %xmm7
1615 movdqa %xmm3, 12*16(%rax)
1616 movdqa %xmm7, 13*16(%rax)
1618 movdqa 14*16(%rax), %xmm0
1619 movdqa 15*16(%rax), %xmm4
1620 movdqa %xmm0, 17*16(%rsp)
1621 movdqa %xmm4, 18*16(%rsp)
1628 paddd 7*16(%rax), %xmm0
1629 paddd 8*16(%rax), %xmm4
1648 movdqa %xmm3, 14*16(%rax)
1649 movdqa %xmm7, 15*16(%rax)
1651 sha256d_ms_4way_sse2_extend_loop2:
1652 sha256_sse2_extend_doubleround 16
1653 sha256_sse2_extend_doubleround 18
1654 sha256_sse2_extend_doubleround 20
1655 sha256_sse2_extend_doubleround 22
1656 sha256_sse2_extend_doubleround 24
1657 sha256_sse2_extend_doubleround 26
1658 sha256_sse2_extend_doubleround 28
1659 sha256_sse2_extend_doubleround 30
1660 sha256_sse2_extend_doubleround 32
1661 sha256_sse2_extend_doubleround 34
1662 sha256_sse2_extend_doubleround 36
1663 sha256_sse2_extend_doubleround 38
1664 sha256_sse2_extend_doubleround 40
1665 sha256_sse2_extend_doubleround 42
1666 jz sha256d_ms_4way_sse2_extend_coda2
1667 sha256_sse2_extend_doubleround 44
1668 sha256_sse2_extend_doubleround 46
1670 movdqa 0(%rcx), %xmm3
1671 movdqa 16(%rcx), %xmm0
1672 movdqa 32(%rcx), %xmm1
1673 movdqa 48(%rcx), %xmm2
1674 movdqa 64(%rcx), %xmm6
1675 movdqa 80(%rcx), %xmm7
1676 movdqa 96(%rcx), %xmm5
1677 movdqa 112(%rcx), %xmm4
1678 movdqa %xmm1, 0(%rsp)
1679 movdqa %xmm2, 16(%rsp)
1680 movdqa %xmm6, 32(%rsp)
1683 leaq sha256_4k(%rip), %rcx
1684 jmp sha256d_ms_4way_sse2_main_loop1
1686 sha256d_ms_4way_sse2_main_loop2:
1687 sha256_sse2_main_round 0
1688 sha256_sse2_main_round 1
1689 sha256_sse2_main_round 2
1690 sha256d_ms_4way_sse2_main_loop1:
1691 sha256_sse2_main_round 3
1692 sha256_sse2_main_quadround 4
1693 sha256_sse2_main_quadround 8
1694 sha256_sse2_main_quadround 12
1695 sha256_sse2_main_quadround 16
1696 sha256_sse2_main_quadround 20
1697 sha256_sse2_main_quadround 24
1698 sha256_sse2_main_quadround 28
1699 sha256_sse2_main_quadround 32
1700 sha256_sse2_main_quadround 36
1701 sha256_sse2_main_quadround 40
1702 sha256_sse2_main_quadround 44
1703 sha256_sse2_main_quadround 48
1704 sha256_sse2_main_quadround 52
1705 sha256_sse2_main_round 56
1706 jz sha256d_ms_4way_sse2_finish
1707 sha256_sse2_main_round 57
1708 sha256_sse2_main_round 58
1709 sha256_sse2_main_round 59
1710 sha256_sse2_main_quadround 60
1712 movdqa 5*16(%rsp), %xmm1
1713 movdqa 6*16(%rsp), %xmm2
1714 movdqa 7*16(%rsp), %xmm6
1715 movdqa %xmm1, 18*16(%rsi)
1716 movdqa %xmm2, 19*16(%rsi)
1717 movdqa %xmm6, 20*16(%rsi)
1718 movdqa 9*16(%rsp), %xmm1
1719 movdqa 10*16(%rsp), %xmm2
1720 movdqa 11*16(%rsp), %xmm6
1721 movdqa %xmm1, 22*16(%rsi)
1722 movdqa %xmm2, 23*16(%rsi)
1723 movdqa %xmm6, 24*16(%rsi)
1724 movdqa 17*16(%rsp), %xmm1
1725 movdqa 18*16(%rsp), %xmm2
1726 movdqa %xmm1, 30*16(%rsi)
1727 movdqa %xmm2, 31*16(%rsi)
1729 movdqa 0(%rsp), %xmm1
1730 movdqa 16(%rsp), %xmm2
1731 movdqa 32(%rsp), %xmm6
1732 paddd 0(%rdx), %xmm7
1733 paddd 16(%rdx), %xmm5
1734 paddd 32(%rdx), %xmm4
1735 paddd 48(%rdx), %xmm3
1736 paddd 64(%rdx), %xmm0
1737 paddd 80(%rdx), %xmm1
1738 paddd 96(%rdx), %xmm2
1739 paddd 112(%rdx), %xmm6
1741 movdqa %xmm7, 48+0(%rsp)
1742 movdqa %xmm5, 48+16(%rsp)
1743 movdqa %xmm4, 48+32(%rsp)
1744 movdqa %xmm3, 48+48(%rsp)
1745 movdqa %xmm0, 48+64(%rsp)
1746 movdqa %xmm1, 48+80(%rsp)
1747 movdqa %xmm2, 48+96(%rsp)
1748 movdqa %xmm6, 48+112(%rsp)
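/*
 * Set up the second SHA-256 input block: the first hash was just stored at
 * 48(%rsp) as words 0..7, and the fixed padding follows.  The constant
 * 0x8000000000000100 apparently supplies both non-zero padding words: its
 * high dword 0x80000000 (the padding bit) is broadcast into word 8 and its
 * low dword 0x00000100 = 256 (the bit length of the 32-byte message) into
 * word 15, with words 9..14 filled from %xmm0.
 */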
1751 movq $0x8000000000000100, %rax
1753 pshufd $0x55, %xmm1, %xmm2
1754 pshufd $0x00, %xmm1, %xmm1
1755 movdqa %xmm2, 48+128(%rsp)
1756 movdqa %xmm0, 48+144(%rsp)
1757 movdqa %xmm0, 48+160(%rsp)
1758 movdqa %xmm0, 48+176(%rsp)
1759 movdqa %xmm0, 48+192(%rsp)
1760 movdqa %xmm0, 48+208(%rsp)
1761 movdqa %xmm0, 48+224(%rsp)
1762 movdqa %xmm1, 48+240(%rsp)
1764 leaq 19*16(%rsp), %rax
1767 movdqa -15*16(%rax), %xmm0
1768 movdqa -14*16(%rax), %xmm4
1791 paddd -16*16(%rax), %xmm0
1792 paddd -15*16(%rax), %xmm4
1793 paddd sha256d_4preext2_17(%rip), %xmm4
1796 movdqa %xmm3, 0*16(%rax)
1797 movdqa %xmm7, 1*16(%rax)
1799 sha256_sse2_extend_doubleround 2
1800 sha256_sse2_extend_doubleround 4
1802 movdqa -9*16(%rax), %xmm0
1803 movdqa sha256d_4preext2_23(%rip), %xmm4
1815 paddd -10*16(%rax), %xmm0
1816 paddd -9*16(%rax), %xmm4
1823 paddd -1*16(%rax), %xmm0
1828 paddd 0*16(%rax), %xmm4
1843 movdqa %xmm3, 6*16(%rax)
1844 movdqa %xmm7, 7*16(%rax)
1846 movdqa sha256d_4preext2_24(%rip), %xmm0
1853 paddd 1*16(%rax), %xmm0
1871 paddd 2*16(%rax), %xmm7
1872 movdqa %xmm3, 8*16(%rax)
1873 movdqa %xmm7, 9*16(%rax)
1897 paddd 3*16(%rax), %xmm3
1898 paddd 4*16(%rax), %xmm7
1899 movdqa %xmm3, 10*16(%rax)
1900 movdqa %xmm7, 11*16(%rax)
1924 paddd 5*16(%rax), %xmm3
1925 paddd 6*16(%rax), %xmm7
1926 movdqa %xmm3, 12*16(%rax)
1927 movdqa %xmm7, 13*16(%rax)
1929 movdqa sha256d_4preext2_30(%rip), %xmm0
1930 movdqa 0*16(%rax), %xmm4
1942 paddd -1*16(%rax), %xmm4
1949 paddd 7*16(%rax), %xmm0
1954 paddd 8*16(%rax), %xmm4
1969 movdqa %xmm3, 14*16(%rax)
1970 movdqa %xmm7, 15*16(%rax)
1972 jmp sha256d_ms_4way_sse2_extend_loop2
1974 sha256d_ms_4way_sse2_extend_coda2:
1975 sha256_sse2_extend_round 44
1977 movdqa sha256_4h+0(%rip), %xmm7
1978 movdqa sha256_4h+16(%rip), %xmm5
1979 movdqa sha256_4h+32(%rip), %xmm4
1980 movdqa sha256_4h+48(%rip), %xmm3
1981 movdqa sha256_4h+64(%rip), %xmm0
1982 movdqa sha256_4h+80(%rip), %xmm1
1983 movdqa sha256_4h+96(%rip), %xmm2
1984 movdqa sha256_4h+112(%rip), %xmm6
1985 movdqa %xmm1, 0(%rsp)
1986 movdqa %xmm2, 16(%rsp)
1987 movdqa %xmm6, 32(%rsp)
1990 leaq sha256_4k(%rip), %rcx
1991 jmp sha256d_ms_4way_sse2_main_loop2
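/*
 * Reduced main round: only the half of the round that produces the new 'e'
 * value (d plus t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i]) is computed, which
 * is enough for the last output word.  The finish below runs four of these,
 * adds H7 from sha256_4h and stores only offset 112 of %rdi, i.e. word 7 of
 * each lane's hash.
 */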
1993 .macro sha256_sse2_main_round_red i, r7
1994 movdqa 16*\i(%rax), %xmm6
1995 paddd 16*\i(%rcx), %xmm6
1996 paddd 32(%rsp), %xmm6
1998 movdqa 16(%rsp), %xmm2
2001 movdqa %xmm2, 32(%rsp)
2002 movdqa 0(%rsp), %xmm2
2003 movdqa %xmm2, 16(%rsp)
2006 movdqa %xmm0, 0(%rsp)
2024 sha256d_ms_4way_sse2_finish:
2025 sha256_sse2_main_round_red 57, %xmm3
2026 sha256_sse2_main_round_red 58, %xmm4
2027 sha256_sse2_main_round_red 59, %xmm5
2028 sha256_sse2_main_round_red 60, %xmm7
2030 paddd sha256_4h+112(%rip), %xmm0
2031 movdqa %xmm0, 112(%rdi)
2034 #if defined(_WIN64) || defined(__CYGWIN__)
2036 movdqa 0(%rsp), %xmm6
2037 movdqa 16(%rsp), %xmm7
2044 #if defined(USE_AVX)
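/*
 * AVX implementation of sha256d_ms_4way: same overall structure as the SSE2
 * version above, but with the special-cased schedule rounds written in
 * three-operand AVX form and the whole working state kept in %xmm registers
 * (%xmm8..%xmm10 included), so no state words need to live on the stack
 * during the main rounds.
 */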
2047 sha256d_ms_4way_avx:
2048 #if defined(_WIN64) || defined(__CYGWIN__)
2051 movdqa %xmm6, 0(%rsp)
2052 movdqa %xmm7, 16(%rsp)
2053 movdqa %xmm8, 32(%rsp)
2054 movdqa %xmm9, 48(%rsp)
2055 movdqa %xmm10, 64(%rsp)
2064 leaq 256(%rsi), %rax
2066 sha256d_ms_4way_avx_extend_loop1:
2067 vmovdqa 3*16(%rsi), %xmm0
2068 vmovdqa 2*16(%rax), %xmm3
2069 vmovdqa 3*16(%rax), %xmm7
2070 vmovdqa %xmm3, 2*16(%rsp)
2071 vmovdqa %xmm7, 3*16(%rsp)
2072 vpaddd %xmm0, %xmm7, %xmm7
2073 vpslld $14, %xmm0, %xmm2
2074 vpsrld $3, %xmm0, %xmm0
2075 vpsrld $4, %xmm0, %xmm1
2076 vpxor %xmm1, %xmm0, %xmm0
2077 vpxor %xmm2, %xmm0, %xmm0
2078 vpsrld $11, %xmm1, %xmm1
2079 vpslld $11, %xmm2, %xmm2
2080 vpxor %xmm1, %xmm0, %xmm0
2081 vpxor %xmm2, %xmm0, %xmm0
2082 vpaddd %xmm0, %xmm3, %xmm3
2083 vmovdqa %xmm3, 2*16(%rax)
2084 vmovdqa %xmm7, 3*16(%rax)
2086 vmovdqa 4*16(%rax), %xmm0
2087 vmovdqa %xmm0, 4*16(%rsp)
2088 vpslld $13, %xmm3, %xmm2
2089 vpslld $13, %xmm7, %xmm6
2090 vpsrld $10, %xmm3, %xmm3
2091 vpsrld $10, %xmm7, %xmm7
2092 vpsrld $7, %xmm3, %xmm1
2093 vpsrld $7, %xmm7, %xmm5
2094 vpxor %xmm1, %xmm3, %xmm3
2095 vpxor %xmm5, %xmm7, %xmm7
2096 vpsrld $2, %xmm1, %xmm1
2097 vpsrld $2, %xmm5, %xmm5
2098 vpxor %xmm2, %xmm3, %xmm3
2099 vpxor %xmm6, %xmm7, %xmm7
2100 vpslld $2, %xmm2, %xmm2
2101 vpslld $2, %xmm6, %xmm6
2102 vpxor %xmm1, %xmm3, %xmm3
2103 vpxor %xmm5, %xmm7, %xmm7
2104 vpxor %xmm2, %xmm3, %xmm3
2105 vpxor %xmm6, %xmm7, %xmm7
2106 vpaddd %xmm0, %xmm3, %xmm3
2107 vmovdqa %xmm3, 4*16(%rax)
2108 vmovdqa %xmm7, 5*16(%rax)
2110 vmovdqa 6*16(%rax), %xmm0
2111 vmovdqa 7*16(%rax), %xmm4
2112 vmovdqa %xmm0, 6*16(%rsp)
2113 vmovdqa %xmm4, 7*16(%rsp)
2114 vpslld $13, %xmm3, %xmm2
2115 vpslld $13, %xmm7, %xmm6
2116 vpsrld $10, %xmm3, %xmm3
2117 vpsrld $10, %xmm7, %xmm7
2118 vpsrld $7, %xmm3, %xmm1
2119 vpsrld $7, %xmm7, %xmm5
2120 vpxor %xmm1, %xmm3, %xmm3
2121 vpxor %xmm5, %xmm7, %xmm7
2122 vpsrld $2, %xmm1, %xmm1
2123 vpsrld $2, %xmm5, %xmm5
2124 vpxor %xmm2, %xmm3, %xmm3
2125 vpxor %xmm6, %xmm7, %xmm7
2126 vpslld $2, %xmm2, %xmm2
2127 vpslld $2, %xmm6, %xmm6
2128 vpxor %xmm1, %xmm3, %xmm3
2129 vpxor %xmm5, %xmm7, %xmm7
2130 vpxor %xmm2, %xmm3, %xmm3
2131 vpxor %xmm6, %xmm7, %xmm7
2132 vpaddd %xmm0, %xmm3, %xmm3
2133 vpaddd %xmm4, %xmm7, %xmm7
2134 vmovdqa %xmm3, 6*16(%rax)
2135 vmovdqa %xmm7, 7*16(%rax)
2137 vmovdqa 8*16(%rax), %xmm0
2138 vmovdqa 2*16(%rax), %xmm4
2139 vmovdqa %xmm0, 8*16(%rsp)
2140 vpslld $13, %xmm3, %xmm2
2141 vpslld $13, %xmm7, %xmm6
2142 vpsrld $10, %xmm3, %xmm3
2143 vpsrld $10, %xmm7, %xmm7
2144 vpsrld $7, %xmm3, %xmm1
2145 vpsrld $7, %xmm7, %xmm5
2146 vpxor %xmm1, %xmm3, %xmm3
2147 vpxor %xmm5, %xmm7, %xmm7
2148 vpsrld $2, %xmm1, %xmm1
2149 vpsrld $2, %xmm5, %xmm5
2150 vpxor %xmm2, %xmm3, %xmm3
2151 vpxor %xmm6, %xmm7, %xmm7
2152 vpslld $2, %xmm2, %xmm2
2153 vpslld $2, %xmm6, %xmm6
2154 vpxor %xmm1, %xmm3, %xmm3
2155 vpxor %xmm5, %xmm7, %xmm7
2156 vpxor %xmm2, %xmm3, %xmm3
2157 vpxor %xmm6, %xmm7, %xmm7
2158 vpaddd %xmm0, %xmm3, %xmm3
2159 vpaddd %xmm4, %xmm7, %xmm7
2160 vmovdqa %xmm3, 8*16(%rax)
2161 vmovdqa %xmm7, 9*16(%rax)
2163 vpslld $13, %xmm3, %xmm2
2164 vpslld $13, %xmm7, %xmm6
2165 vpsrld $10, %xmm3, %xmm3
2166 vpsrld $10, %xmm7, %xmm7
2167 vpsrld $7, %xmm3, %xmm1
2168 vpsrld $7, %xmm7, %xmm5
2169 vpxor %xmm1, %xmm3, %xmm3
2170 vpxor %xmm5, %xmm7, %xmm7
2171 vpsrld $2, %xmm1, %xmm1
2172 vpsrld $2, %xmm5, %xmm5
2173 vpxor %xmm2, %xmm3, %xmm3
2174 vpxor %xmm6, %xmm7, %xmm7
2175 vpslld $2, %xmm2, %xmm2
2176 vpslld $2, %xmm6, %xmm6
2177 vpxor %xmm1, %xmm3, %xmm3
2178 vpxor %xmm5, %xmm7, %xmm7
2179 vpxor %xmm2, %xmm3, %xmm3
2180 vpxor %xmm6, %xmm7, %xmm7
2181 vpaddd 3*16(%rax), %xmm3, %xmm3
2182 vpaddd 4*16(%rax), %xmm7, %xmm7
2183 vmovdqa %xmm3, 10*16(%rax)
2184 vmovdqa %xmm7, 11*16(%rax)
2186 vpslld $13, %xmm3, %xmm2
2187 vpslld $13, %xmm7, %xmm6
2188 vpsrld $10, %xmm3, %xmm3
2189 vpsrld $10, %xmm7, %xmm7
2190 vpsrld $7, %xmm3, %xmm1
2191 vpsrld $7, %xmm7, %xmm5
2192 vpxor %xmm1, %xmm3, %xmm3
2193 vpxor %xmm5, %xmm7, %xmm7
2194 vpsrld $2, %xmm1, %xmm1
2195 vpsrld $2, %xmm5, %xmm5
2196 vpxor %xmm2, %xmm3, %xmm3
2197 vpxor %xmm6, %xmm7, %xmm7
2198 vpslld $2, %xmm2, %xmm2
2199 vpslld $2, %xmm6, %xmm6
2200 vpxor %xmm1, %xmm3, %xmm3
2201 vpxor %xmm5, %xmm7, %xmm7
2202 vpxor %xmm2, %xmm3, %xmm3
2203 vpxor %xmm6, %xmm7, %xmm7
2204 vpaddd 5*16(%rax), %xmm3, %xmm3
2205 vpaddd 6*16(%rax), %xmm7, %xmm7
2206 vmovdqa %xmm3, 12*16(%rax)
2207 vmovdqa %xmm7, 13*16(%rax)
2209 vmovdqa 14*16(%rax), %xmm0
2210 vmovdqa 15*16(%rax), %xmm4
2211 vmovdqa %xmm0, 14*16(%rsp)
2212 vmovdqa %xmm4, 15*16(%rsp)
2213 vpslld $13, %xmm3, %xmm2
2214 vpslld $13, %xmm7, %xmm6
2215 vpsrld $10, %xmm3, %xmm3
2216 vpsrld $10, %xmm7, %xmm7
2217 vpaddd 7*16(%rax), %xmm0, %xmm0
2218 vpaddd 8*16(%rax), %xmm4, %xmm4
2219 vpsrld $7, %xmm3, %xmm1
2220 vpsrld $7, %xmm7, %xmm5
2221 vpxor %xmm1, %xmm3, %xmm3
2222 vpxor %xmm5, %xmm7, %xmm7
2223 vpsrld $2, %xmm1, %xmm1
2224 vpsrld $2, %xmm5, %xmm5
2225 vpxor %xmm2, %xmm3, %xmm3
2226 vpxor %xmm6, %xmm7, %xmm7
2227 vpslld $2, %xmm2, %xmm2
2228 vpslld $2, %xmm6, %xmm6
2229 vpxor %xmm1, %xmm3, %xmm3
2230 vpxor %xmm5, %xmm7, %xmm7
2231 vpxor %xmm2, %xmm3, %xmm3
2232 vpxor %xmm6, %xmm7, %xmm7
2233 vpaddd %xmm0, %xmm3, %xmm3
2234 vpaddd %xmm4, %xmm7, %xmm7
2235 vmovdqa %xmm3, 14*16(%rax)
2236 vmovdqa %xmm7, 15*16(%rax)
2238 sha256d_ms_4way_avx_extend_loop2:
2239 sha256_avx_extend_doubleround 16
2240 sha256_avx_extend_doubleround 18
2241 sha256_avx_extend_doubleround 20
2242 sha256_avx_extend_doubleround 22
2243 sha256_avx_extend_doubleround 24
2244 sha256_avx_extend_doubleround 26
2245 sha256_avx_extend_doubleround 28
2246 sha256_avx_extend_doubleround 30
2247 sha256_avx_extend_doubleround 32
2248 sha256_avx_extend_doubleround 34
2249 sha256_avx_extend_doubleround 36
2250 sha256_avx_extend_doubleround 38
2251 sha256_avx_extend_doubleround 40
2252 sha256_avx_extend_doubleround 42
2253 jz sha256d_ms_4way_avx_extend_coda2
2254 sha256_avx_extend_doubleround 44
2255 sha256_avx_extend_doubleround 46
2257 movdqa 0(%rcx), %xmm7
2258 movdqa 16(%rcx), %xmm8
2259 movdqa 32(%rcx), %xmm9
2260 movdqa 48(%rcx), %xmm10
2261 movdqa 64(%rcx), %xmm0
2262 movdqa 80(%rcx), %xmm5
2263 movdqa 96(%rcx), %xmm4
2264 movdqa 112(%rcx), %xmm3
2267 leaq sha256_4k(%rip), %rcx
2268 jmp sha256d_ms_4way_avx_main_loop1
2270 sha256d_ms_4way_avx_main_loop2:
2271 sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
2272 sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
2273 sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
2274 sha256d_ms_4way_avx_main_loop1:
2275 sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
2276 sha256_avx_main_quadround 4
2277 sha256_avx_main_quadround 8
2278 sha256_avx_main_quadround 12
2279 sha256_avx_main_quadround 16
2280 sha256_avx_main_quadround 20
2281 sha256_avx_main_quadround 24
2282 sha256_avx_main_quadround 28
2283 sha256_avx_main_quadround 32
2284 sha256_avx_main_quadround 36
2285 sha256_avx_main_quadround 40
2286 sha256_avx_main_quadround 44
2287 sha256_avx_main_quadround 48
2288 sha256_avx_main_quadround 52
2289 sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
2290 jz sha256d_ms_4way_avx_finish
2291 sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
2292 sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
2293 sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
2294 sha256_avx_main_quadround 60
2296 movdqa 2*16(%rsp), %xmm1
2297 movdqa 3*16(%rsp), %xmm2
2298 movdqa 4*16(%rsp), %xmm6
2299 movdqa %xmm1, 18*16(%rsi)
2300 movdqa %xmm2, 19*16(%rsi)
2301 movdqa %xmm6, 20*16(%rsi)
2302 movdqa 6*16(%rsp), %xmm1
2303 movdqa 7*16(%rsp), %xmm2
2304 movdqa 8*16(%rsp), %xmm6
2305 movdqa %xmm1, 22*16(%rsi)
2306 movdqa %xmm2, 23*16(%rsi)
2307 movdqa %xmm6, 24*16(%rsi)
2308 movdqa 14*16(%rsp), %xmm1
2309 movdqa 15*16(%rsp), %xmm2
2310 movdqa %xmm1, 30*16(%rsi)
2311 movdqa %xmm2, 31*16(%rsi)
2313 paddd 0(%rdx), %xmm7
2314 paddd 16(%rdx), %xmm5
2315 paddd 32(%rdx), %xmm4
2316 paddd 48(%rdx), %xmm3
2317 paddd 64(%rdx), %xmm0
2318 paddd 80(%rdx), %xmm8
2319 paddd 96(%rdx), %xmm9
2320 paddd 112(%rdx), %xmm10
2322 movdqa %xmm7, 0(%rsp)
2323 movdqa %xmm5, 16(%rsp)
2324 movdqa %xmm4, 32(%rsp)
2325 movdqa %xmm3, 48(%rsp)
2326 movdqa %xmm0, 64(%rsp)
2327 movdqa %xmm8, 80(%rsp)
2328 movdqa %xmm9, 96(%rsp)
2329 movdqa %xmm10, 112(%rsp)
2332 movq $0x8000000000000100, %rax
2334 pshufd $0x55, %xmm1, %xmm2
2335 pshufd $0x00, %xmm1, %xmm1
2336 movdqa %xmm2, 128(%rsp)
2337 movdqa %xmm0, 144(%rsp)
2338 movdqa %xmm0, 160(%rsp)
2339 movdqa %xmm0, 176(%rsp)
2340 movdqa %xmm0, 192(%rsp)
2341 movdqa %xmm0, 208(%rsp)
2342 movdqa %xmm0, 224(%rsp)
2343 movdqa %xmm1, 240(%rsp)
2345 leaq 256(%rsp), %rax
2348 vmovdqa -15*16(%rax), %xmm0
2349 vmovdqa -14*16(%rax), %xmm4
2350 vpslld $14, %xmm0, %xmm2
2351 vpslld $14, %xmm4, %xmm6
2352 vpsrld $3, %xmm0, %xmm8
2353 vpsrld $3, %xmm4, %xmm4
2354 vpsrld $7, %xmm0, %xmm1
2355 vpsrld $4, %xmm4, %xmm5
2356 vpxor %xmm1, %xmm8, %xmm8
2357 vpxor %xmm5, %xmm4, %xmm4
2358 vpsrld $11, %xmm1, %xmm1
2359 vpsrld $11, %xmm5, %xmm5
2360 vpxor %xmm2, %xmm8, %xmm8
2361 vpxor %xmm6, %xmm4, %xmm4
2362 vpslld $11, %xmm2, %xmm2
2363 vpslld $11, %xmm6, %xmm6
2364 vpxor %xmm1, %xmm8, %xmm8
2365 vpxor %xmm5, %xmm4, %xmm4
2366 vpxor %xmm2, %xmm8, %xmm8
2367 vpxor %xmm6, %xmm4, %xmm4
2368 vpaddd %xmm0, %xmm4, %xmm4
2369 vpaddd -16*16(%rax), %xmm8, %xmm3
2370 vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
2371 vmovdqa %xmm3, 0*16(%rax)
2372 vmovdqa %xmm7, 1*16(%rax)
2374 sha256_avx_extend_doubleround 2
2375 sha256_avx_extend_doubleround 4
2377 vmovdqa -9*16(%rax), %xmm0
2378 vpslld $14, %xmm0, %xmm2
2379 vpsrld $3, %xmm0, %xmm8
2380 vpsrld $7, %xmm0, %xmm1
2381 vpxor %xmm1, %xmm8, %xmm8
2382 vpxor %xmm2, %xmm8, %xmm8
2383 vpsrld $11, %xmm1, %xmm1
2384 vpslld $11, %xmm2, %xmm2
2385 vpxor %xmm1, %xmm8, %xmm8
2386 vpxor %xmm2, %xmm8, %xmm8
2387 vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
2388 vpaddd -10*16(%rax), %xmm8, %xmm0
2389 vpslld $13, %xmm3, %xmm2
2390 vpslld $13, %xmm7, %xmm6
2391 vpsrld $10, %xmm3, %xmm3
2392 vpsrld $10, %xmm7, %xmm7
2393 vpaddd -1*16(%rax), %xmm0, %xmm0
2394 vpaddd 0*16(%rax), %xmm4, %xmm4
2395 vpsrld $7, %xmm3, %xmm1
2396 vpsrld $7, %xmm7, %xmm5
2397 vpxor %xmm1, %xmm3, %xmm3
2398 vpxor %xmm5, %xmm7, %xmm7
2399 vpsrld $2, %xmm1, %xmm1
2400 vpsrld $2, %xmm5, %xmm5
2401 vpxor %xmm2, %xmm3, %xmm3
2402 vpxor %xmm6, %xmm7, %xmm7
2403 vpslld $2, %xmm2, %xmm2
2404 vpslld $2, %xmm6, %xmm6
2405 vpxor %xmm1, %xmm3, %xmm3
2406 vpxor %xmm5, %xmm7, %xmm7
2407 vpxor %xmm2, %xmm3, %xmm3
2408 vpxor %xmm6, %xmm7, %xmm7
2409 vpaddd %xmm0, %xmm3, %xmm3
2410 vpaddd %xmm4, %xmm7, %xmm7
2411 vmovdqa %xmm3, 6*16(%rax)
2412 vmovdqa %xmm7, 7*16(%rax)
2414 vpslld $13, %xmm3, %xmm2
2415 vpslld $13, %xmm7, %xmm6
2416 vpsrld $10, %xmm3, %xmm3
2417 vpsrld $10, %xmm7, %xmm7
2418 vpsrld $7, %xmm3, %xmm1
2419 vpsrld $7, %xmm7, %xmm5
2420 vpxor %xmm1, %xmm3, %xmm3
2421 vpxor %xmm5, %xmm7, %xmm7
2422 vpsrld $2, %xmm1, %xmm1
2423 vpsrld $2, %xmm5, %xmm5
2424 vpxor %xmm2, %xmm3, %xmm3
2425 vpxor %xmm6, %xmm7, %xmm7
2426 vpslld $2, %xmm2, %xmm2
2427 vpslld $2, %xmm6, %xmm6
2428 vpxor %xmm1, %xmm3, %xmm3
2429 vpxor %xmm5, %xmm7, %xmm7
2430 vpxor %xmm2, %xmm3, %xmm3
2431 vpxor %xmm6, %xmm7, %xmm7
2432 vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
2433 vpaddd 1*16(%rax), %xmm3, %xmm3
2434 vpaddd 2*16(%rax), %xmm7, %xmm7
2435 vmovdqa %xmm3, 8*16(%rax)
2436 vmovdqa %xmm7, 9*16(%rax)
2438 vpslld $13, %xmm3, %xmm2
2439 vpslld $13, %xmm7, %xmm6
2440 vpsrld $10, %xmm3, %xmm3
2441 vpsrld $10, %xmm7, %xmm7
2442 vpsrld $7, %xmm3, %xmm1
2443 vpsrld $7, %xmm7, %xmm5
2444 vpxor %xmm1, %xmm3, %xmm3
2445 vpxor %xmm5, %xmm7, %xmm7
2446 vpsrld $2, %xmm1, %xmm1
2447 vpsrld $2, %xmm5, %xmm5
2448 vpxor %xmm2, %xmm3, %xmm3
2449 vpxor %xmm6, %xmm7, %xmm7
2450 vpslld $2, %xmm2, %xmm2
2451 vpslld $2, %xmm6, %xmm6
2452 vpxor %xmm1, %xmm3, %xmm3
2453 vpxor %xmm5, %xmm7, %xmm7
2454 vpxor %xmm2, %xmm3, %xmm3
2455 vpxor %xmm6, %xmm7, %xmm7
2456 vpaddd 3*16(%rax), %xmm3, %xmm3
2457 vpaddd 4*16(%rax), %xmm7, %xmm7
2458 vmovdqa %xmm3, 10*16(%rax)
2459 vmovdqa %xmm7, 11*16(%rax)
2461 vpslld $13, %xmm3, %xmm2
2462 vpslld $13, %xmm7, %xmm6
2463 vpsrld $10, %xmm3, %xmm3
2464 vpsrld $10, %xmm7, %xmm7
2465 vpsrld $7, %xmm3, %xmm1
2466 vpsrld $7, %xmm7, %xmm5
2467 vpxor %xmm1, %xmm3, %xmm3
2468 vpxor %xmm5, %xmm7, %xmm7
2469 vpsrld $2, %xmm1, %xmm1
2470 vpsrld $2, %xmm5, %xmm5
2471 vpxor %xmm2, %xmm3, %xmm3
2472 vpxor %xmm6, %xmm7, %xmm7
2473 vpslld $2, %xmm2, %xmm2
2474 vpslld $2, %xmm6, %xmm6
2475 vpxor %xmm1, %xmm3, %xmm3
2476 vpxor %xmm5, %xmm7, %xmm7
2477 vpxor %xmm2, %xmm3, %xmm3
2478 vpxor %xmm6, %xmm7, %xmm7
2479 vpaddd 5*16(%rax), %xmm3, %xmm3
2480 vpaddd 6*16(%rax), %xmm7, %xmm7
2481 vmovdqa %xmm3, 12*16(%rax)
2482 vmovdqa %xmm7, 13*16(%rax)
2484 vmovdqa sha256d_4preext2_30(%rip), %xmm0
2485 vmovdqa 0*16(%rax), %xmm4
2486 vpslld $14, %xmm4, %xmm6
2487 vpsrld $3, %xmm4, %xmm4
2488 vpsrld $4, %xmm4, %xmm5
2489 vpxor %xmm5, %xmm4, %xmm4
2490 vpxor %xmm6, %xmm4, %xmm4
2491 vpsrld $11, %xmm5, %xmm5
2492 vpslld $11, %xmm6, %xmm6
2493 vpxor %xmm5, %xmm4, %xmm4
2494 vpxor %xmm6, %xmm4, %xmm4
2495 vpaddd -1*16(%rax), %xmm4, %xmm4
2496 vpslld $13, %xmm3, %xmm2
2497 vpslld $13, %xmm7, %xmm6
2498 vpsrld $10, %xmm3, %xmm3
2499 vpsrld $10, %xmm7, %xmm7
2500 vpaddd 7*16(%rax), %xmm0, %xmm0
2501 vpaddd 8*16(%rax), %xmm4, %xmm4
2502 vpsrld $7, %xmm3, %xmm1
2503 vpsrld $7, %xmm7, %xmm5
2504 vpxor %xmm1, %xmm3, %xmm3
2505 vpxor %xmm5, %xmm7, %xmm7
2506 vpsrld $2, %xmm1, %xmm1
2507 vpsrld $2, %xmm5, %xmm5
2508 vpxor %xmm2, %xmm3, %xmm3
2509 vpxor %xmm6, %xmm7, %xmm7
2510 vpslld $2, %xmm2, %xmm2
2511 vpslld $2, %xmm6, %xmm6
2512 vpxor %xmm1, %xmm3, %xmm3
2513 vpxor %xmm5, %xmm7, %xmm7
2514 vpxor %xmm2, %xmm3, %xmm3
2515 vpxor %xmm6, %xmm7, %xmm7
2516 vpaddd %xmm0, %xmm3, %xmm3
2517 vpaddd %xmm4, %xmm7, %xmm7
2518 vmovdqa %xmm3, 14*16(%rax)
2519 vmovdqa %xmm7, 15*16(%rax)
2521 jmp sha256d_ms_4way_avx_extend_loop2
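/*
 * Truncated tail used for the second hash: the extension loop branches here
 * after doubleround 42, and a single extend round appears to supply the one
 * remaining schedule word needed by the shortened final rounds.
 */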
2523 sha256d_ms_4way_avx_extend_coda2:
2524 sha256_avx_extend_round 44
2526 movdqa sha256_4h+0(%rip), %xmm7
2527 movdqa sha256_4h+16(%rip), %xmm5
2528 movdqa sha256_4h+32(%rip), %xmm4
2529 movdqa sha256_4h+48(%rip), %xmm3
2530 movdqa sha256_4h+64(%rip), %xmm0
2531 movdqa sha256_4h+80(%rip), %xmm8
2532 movdqa sha256_4h+96(%rip), %xmm9
2533 movdqa sha256_4h+112(%rip), %xmm10
2536 leaq sha256_4k(%rip), %rcx
2537 jmp sha256d_ms_4way_avx_main_loop2
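/*
 * Reduced main round for the tail of the second hash: only the e-chain is
 * updated (h + K[i] + W[i] + Ch(e,f,g) + Sigma1(e) is added to d to form the
 * new e, with Sigma1 built from shifts and xors); the a/Sigma0/Maj half is
 * omitted because only one word of the final state is produced.
 */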
2539 .macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4
2540 vpaddd 16*\i(%rax), \r0, %xmm6
2541 vpaddd 16*\i(%rcx), %xmm6, %xmm6
2542 vpandn \r1, \r3, %xmm1
2543 vpand \r3, \r2, %xmm2
2544 vpxor %xmm2, %xmm1, %xmm1
2545 vpaddd %xmm1, %xmm6, %xmm6
2546 vpslld $7, \r3, %xmm1
2547 vpsrld $6, \r3, \r0
2548 vpsrld $5, \r0, %xmm2
2549 vpxor %xmm1, \r0, \r0
2550 vpxor %xmm2, \r0, \r0
2551 vpslld $14, %xmm1, %xmm1
2552 vpsrld $14, %xmm2, %xmm2
2553 vpxor %xmm1, \r0, \r0
2554 vpxor %xmm2, \r0, \r0
2555 vpslld $5, %xmm1, %xmm1
2556 vpxor %xmm1, \r0, \r0
2557 vpaddd \r0, %xmm6, %xmm6
2558 vpaddd %xmm6, \r4, \r0
2561 sha256d_ms_4way_avx_finish:
2562 sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
2563 sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
2564 sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
2565 sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
2567 paddd sha256_4h+112(%rip), %xmm10
2568 movdqa %xmm10, 112(%rdi)
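/* Only the last state word is finalized: it is added to its IV and stored at
 * 112(%rdi); no other output words are produced on this path. */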
2571 #if defined(_WIN64) || defined(__CYGWIN__)
2573 movdqa 0(%rsp), %xmm6
2574 movdqa 16(%rsp), %xmm7
2575 movdqa 32(%rsp), %xmm8
2576 movdqa 48(%rsp), %xmm9
2577 movdqa 64(%rsp), %xmm10
2583 #endif /* USE_AVX */
2586 #if defined(USE_XOP)
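/*
 * XOP variant of sha256d_ms_4way.  It mirrors the AVX path above, but XOP's
 * vprotd (rotate left) replaces the shift/xor emulation of the rotates:
 *   sigma0(x) = ROTR7 ^ ROTR18 ^ SHR3   ->  vprotd $25, vprotd $14, vpsrld $3
 *   sigma1(x) = ROTR17 ^ ROTR19 ^ SHR10 ->  vprotd $15, vprotd $13, vpsrld $10
 * Schedule words that the extension overwrites are first saved to the stack
 * so they can be written back after the main rounds.
 */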
2589 sha256d_ms_4way_xop:
2590 #if defined(_WIN64) || defined(__CYGWIN__)
2593 movdqa %xmm6, 0(%rsp)
2594 movdqa %xmm7, 16(%rsp)
2595 movdqa %xmm8, 32(%rsp)
2596 movdqa %xmm9, 48(%rsp)
2597 movdqa %xmm10, 64(%rsp)
2606 leaq 256(%rsi), %rax
2608 sha256d_ms_4way_xop_extend_loop1:
2609 vmovdqa 3*16(%rsi), %xmm0
2610 vmovdqa 2*16(%rax), %xmm3
2611 vmovdqa 3*16(%rax), %xmm7
2612 vmovdqa %xmm3, 2*16(%rsp)
2613 vmovdqa %xmm7, 3*16(%rsp)
2614 vpaddd %xmm0, %xmm7, %xmm7
2615 vprotd $25, %xmm0, %xmm1
2616 vprotd $14, %xmm0, %xmm2
2617 vpsrld $3, %xmm0, %xmm0
2618 vpxor %xmm1, %xmm2, %xmm2
2619 vpxor %xmm2, %xmm0, %xmm0
2620 vpaddd %xmm0, %xmm3, %xmm3
2621 vmovdqa %xmm3, 2*16(%rax)
2622 vmovdqa %xmm7, 3*16(%rax)
2624 vmovdqa 4*16(%rax), %xmm0
2625 vmovdqa %xmm0, 4*16(%rsp)
2626 vprotd $15, %xmm3, %xmm1
2627 vprotd $15, %xmm7, %xmm5
2628 vprotd $13, %xmm3, %xmm2
2629 vprotd $13, %xmm7, %xmm6
2630 vpxor %xmm1, %xmm2, %xmm2
2631 vpxor %xmm5, %xmm6, %xmm6
2632 vpsrld $10, %xmm3, %xmm3
2633 vpsrld $10, %xmm7, %xmm7
2634 vpxor %xmm2, %xmm3, %xmm3
2635 vpxor %xmm6, %xmm7, %xmm7
2636 vpaddd %xmm0, %xmm3, %xmm3
2637 vmovdqa %xmm3, 4*16(%rax)
2638 vmovdqa %xmm7, 5*16(%rax)
2640 vmovdqa 6*16(%rax), %xmm0
2641 vmovdqa 7*16(%rax), %xmm4
2642 vmovdqa %xmm0, 6*16(%rsp)
2643 vmovdqa %xmm4, 7*16(%rsp)
2644 vprotd $15, %xmm3, %xmm1
2645 vprotd $15, %xmm7, %xmm5
2646 vprotd $13, %xmm3, %xmm2
2647 vprotd $13, %xmm7, %xmm6
2648 vpxor %xmm1, %xmm2, %xmm2
2649 vpxor %xmm5, %xmm6, %xmm6
2650 vpsrld $10, %xmm3, %xmm3
2651 vpsrld $10, %xmm7, %xmm7
2652 vpxor %xmm2, %xmm3, %xmm3
2653 vpxor %xmm6, %xmm7, %xmm7
2654 vpaddd %xmm0, %xmm3, %xmm3
2655 vpaddd %xmm4, %xmm7, %xmm7
2656 vmovdqa %xmm3, 6*16(%rax)
2657 vmovdqa %xmm7, 7*16(%rax)
2659 vmovdqa 8*16(%rax), %xmm0
2660 vmovdqa 2*16(%rax), %xmm4
2661 vmovdqa %xmm0, 8*16(%rsp)
2662 vprotd $15, %xmm3, %xmm1
2663 vprotd $15, %xmm7, %xmm5
2664 vprotd $13, %xmm3, %xmm2
2665 vprotd $13, %xmm7, %xmm6
2666 vpxor %xmm1, %xmm2, %xmm2
2667 vpxor %xmm5, %xmm6, %xmm6
2668 vpsrld $10, %xmm3, %xmm3
2669 vpsrld $10, %xmm7, %xmm7
2670 vpxor %xmm2, %xmm3, %xmm3
2671 vpxor %xmm6, %xmm7, %xmm7
2672 vpaddd %xmm0, %xmm3, %xmm3
2673 vpaddd %xmm4, %xmm7, %xmm7
2674 vmovdqa %xmm3, 8*16(%rax)
2675 vmovdqa %xmm7, 9*16(%rax)
2677 vprotd $15, %xmm3, %xmm1
2678 vprotd $15, %xmm7, %xmm5
2679 vprotd $13, %xmm3, %xmm2
2680 vprotd $13, %xmm7, %xmm6
2681 vpxor %xmm1, %xmm2, %xmm2
2682 vpxor %xmm5, %xmm6, %xmm6
2683 vpsrld $10, %xmm3, %xmm3
2684 vpsrld $10, %xmm7, %xmm7
2685 vpxor %xmm2, %xmm3, %xmm3
2686 vpxor %xmm6, %xmm7, %xmm7
2687 vpaddd 3*16(%rax), %xmm3, %xmm3
2688 vpaddd 4*16(%rax), %xmm7, %xmm7
2689 vmovdqa %xmm3, 10*16(%rax)
2690 vmovdqa %xmm7, 11*16(%rax)
2692 vprotd $15, %xmm3, %xmm1
2693 vprotd $15, %xmm7, %xmm5
2694 vprotd $13, %xmm3, %xmm2
2695 vprotd $13, %xmm7, %xmm6
2696 vpxor %xmm1, %xmm2, %xmm2
2697 vpxor %xmm5, %xmm6, %xmm6
2698 vpsrld $10, %xmm3, %xmm3
2699 vpsrld $10, %xmm7, %xmm7
2700 vpxor %xmm2, %xmm3, %xmm3
2701 vpxor %xmm6, %xmm7, %xmm7
2702 vpaddd 5*16(%rax), %xmm3, %xmm3
2703 vpaddd 6*16(%rax), %xmm7, %xmm7
2704 vmovdqa %xmm3, 12*16(%rax)
2705 vmovdqa %xmm7, 13*16(%rax)
2707 vmovdqa 14*16(%rax), %xmm0
2708 vmovdqa 15*16(%rax), %xmm4
2709 vmovdqa %xmm0, 14*16(%rsp)
2710 vmovdqa %xmm4, 15*16(%rsp)
2711 vprotd $15, %xmm3, %xmm1
2712 vprotd $15, %xmm7, %xmm5
2713 vprotd $13, %xmm3, %xmm2
2714 vprotd $13, %xmm7, %xmm6
2715 vpxor %xmm1, %xmm2, %xmm2
2716 vpxor %xmm5, %xmm6, %xmm6
2717 vpaddd 7*16(%rax), %xmm0, %xmm0
2718 vpaddd 8*16(%rax), %xmm4, %xmm4
2719 vpsrld $10, %xmm3, %xmm3
2720 vpsrld $10, %xmm7, %xmm7
2721 vpxor %xmm2, %xmm3, %xmm3
2722 vpxor %xmm6, %xmm7, %xmm7
2723 vpaddd %xmm0, %xmm3, %xmm3
2724 vpaddd %xmm4, %xmm7, %xmm7
2725 vmovdqa %xmm3, 14*16(%rax)
2726 vmovdqa %xmm7, 15*16(%rax)
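/* Shared extension loop: doubleround i computes W[i+16] and W[i+17].  The jz
 * after doubleround 42 takes the shorter schedule used for the second hash. */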
2728 sha256d_ms_4way_xop_extend_loop2:
2729 sha256_xop_extend_doubleround 16
2730 sha256_xop_extend_doubleround 18
2731 sha256_xop_extend_doubleround 20
2732 sha256_xop_extend_doubleround 22
2733 sha256_xop_extend_doubleround 24
2734 sha256_xop_extend_doubleround 26
2735 sha256_xop_extend_doubleround 28
2736 sha256_xop_extend_doubleround 30
2737 sha256_xop_extend_doubleround 32
2738 sha256_xop_extend_doubleround 34
2739 sha256_xop_extend_doubleround 36
2740 sha256_xop_extend_doubleround 38
2741 sha256_xop_extend_doubleround 40
2742 sha256_xop_extend_doubleround 42
2743 jz sha256d_ms_4way_xop_extend_coda2
2744 sha256_xop_extend_doubleround 44
2745 sha256_xop_extend_doubleround 46
2747 movdqa 0(%rcx), %xmm7
2748 movdqa 16(%rcx), %xmm8
2749 movdqa 32(%rcx), %xmm9
2750 movdqa 48(%rcx), %xmm10
2751 movdqa 64(%rcx), %xmm0
2752 movdqa 80(%rcx), %xmm5
2753 movdqa 96(%rcx), %xmm4
2754 movdqa 112(%rcx), %xmm3
2757 leaq sha256_4k(%rip), %rcx
2758 jmp sha256d_ms_4way_xop_main_loop1
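/* Main rounds.  The eight working variables live in %xmm0, %xmm3-%xmm5 and
 * %xmm7-%xmm10; the quadround macros rotate them by renaming the register
 * arguments rather than by moving data. */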
2760 sha256d_ms_4way_xop_main_loop2:
2761 sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
2762 sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
2763 sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
2764 sha256d_ms_4way_xop_main_loop1:
2765 sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
2766 sha256_xop_main_quadround 4
2767 sha256_xop_main_quadround 8
2768 sha256_xop_main_quadround 12
2769 sha256_xop_main_quadround 16
2770 sha256_xop_main_quadround 20
2771 sha256_xop_main_quadround 24
2772 sha256_xop_main_quadround 28
2773 sha256_xop_main_quadround 32
2774 sha256_xop_main_quadround 36
2775 sha256_xop_main_quadround 40
2776 sha256_xop_main_quadround 44
2777 sha256_xop_main_quadround 48
2778 sha256_xop_main_quadround 52
2779 sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
2780 jz sha256d_ms_4way_xop_finish
2781 sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
2782 sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
2783 sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
2784 sha256_xop_main_quadround 60
2786 movdqa 2*16(%rsp), %xmm1
2787 movdqa 3*16(%rsp), %xmm2
2788 movdqa 4*16(%rsp), %xmm6
2789 movdqa %xmm1, 18*16(%rsi)
2790 movdqa %xmm2, 19*16(%rsi)
2791 movdqa %xmm6, 20*16(%rsi)
2792 movdqa 6*16(%rsp), %xmm1
2793 movdqa 7*16(%rsp), %xmm2
2794 movdqa 8*16(%rsp), %xmm6
2795 movdqa %xmm1, 22*16(%rsi)
2796 movdqa %xmm2, 23*16(%rsi)
2797 movdqa %xmm6, 24*16(%rsi)
2798 movdqa 14*16(%rsp), %xmm1
2799 movdqa 15*16(%rsp), %xmm2
2800 movdqa %xmm1, 30*16(%rsi)
2801 movdqa %xmm2, 31*16(%rsi)
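/* Write the schedule words saved on the stack during the extension back into
 * the corresponding slots of the caller's buffer (18*16(%rsi) .. 31*16(%rsi)). */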
2803 paddd 0(%rdx), %xmm7
2804 paddd 16(%rdx), %xmm5
2805 paddd 32(%rdx), %xmm4
2806 paddd 48(%rdx), %xmm3
2807 paddd 64(%rdx), %xmm0
2808 paddd 80(%rdx), %xmm8
2809 paddd 96(%rdx), %xmm9
2810 paddd 112(%rdx), %xmm10
2812 movdqa %xmm7, 0(%rsp)
2813 movdqa %xmm5, 16(%rsp)
2814 movdqa %xmm4, 32(%rsp)
2815 movdqa %xmm3, 48(%rsp)
2816 movdqa %xmm0, 64(%rsp)
2817 movdqa %xmm8, 80(%rsp)
2818 movdqa %xmm9, 96(%rsp)
2819 movdqa %xmm10, 112(%rsp)
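/* First hash done: add the state passed in %rdx and store the eight resulting
 * words at 0..112(%rsp), where they serve as the message block of the second
 * SHA-256. */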
2822 movq $0x8000000000000100, %rax
2824 pshufd $0x55, %xmm1, %xmm2
2825 pshufd $0x00, %xmm1, %xmm1
2826 movdqa %xmm2, 128(%rsp)
2827 movdqa %xmm0, 144(%rsp)
2828 movdqa %xmm0, 160(%rsp)
2829 movdqa %xmm0, 176(%rsp)
2830 movdqa %xmm0, 192(%rsp)
2831 movdqa %xmm0, 208(%rsp)
2832 movdqa %xmm0, 224(%rsp)
2833 movdqa %xmm1, 240(%rsp)
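/* Padding for the 256-bit second-hash message: word 8 (128(%rsp)) receives the
 * 0x80000000 end-of-message marker and word 15 (240(%rsp)) the bit length
 * 0x100, both extracted from the 64-bit constant in %rax via pshufd. */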
2835 leaq 256(%rsp), %rax
2838 vmovdqa -15*16(%rax), %xmm0
2839 vmovdqa -14*16(%rax), %xmm4
2840 vprotd $25, %xmm0, %xmm1
2841 vprotd $25, %xmm4, %xmm5
2842 vprotd $14, %xmm0, %xmm2
2843 vprotd $14, %xmm4, %xmm6
2844 vpxor %xmm1, %xmm2, %xmm2
2845 vpxor %xmm5, %xmm6, %xmm6
2846 vpsrld $3, %xmm0, %xmm8
2847 vpsrld $3, %xmm4, %xmm4
2848 vpxor %xmm2, %xmm8, %xmm8
2849 vpxor %xmm6, %xmm4, %xmm4
2850 vpaddd %xmm0, %xmm4, %xmm4
2851 vpaddd -16*16(%rax), %xmm8, %xmm3
2852 vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
2853 vmovdqa %xmm3, 0*16(%rax)
2854 vmovdqa %xmm7, 1*16(%rax)
2856 sha256_xop_extend_doubleround 2
2857 sha256_xop_extend_doubleround 4
2859 vmovdqa -9*16(%rax), %xmm0
2860 vprotd $25, %xmm0, %xmm1
2861 vprotd $14, %xmm0, %xmm2
2862 vpsrld $3, %xmm0, %xmm8
2863 vpxor %xmm1, %xmm2, %xmm2
2864 vpxor %xmm2, %xmm8, %xmm8
2865 vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
2866 vpaddd -10*16(%rax), %xmm8, %xmm0
2867 vprotd $15, %xmm3, %xmm1
2868 vprotd $15, %xmm7, %xmm5
2869 vprotd $13, %xmm3, %xmm2
2870 vprotd $13, %xmm7, %xmm6
2871 vpxor %xmm1, %xmm2, %xmm2
2872 vpxor %xmm5, %xmm6, %xmm6
2873 vpaddd -1*16(%rax), %xmm0, %xmm0
2874 vpaddd 0*16(%rax), %xmm4, %xmm4
2875 vpsrld $10, %xmm3, %xmm3
2876 vpsrld $10, %xmm7, %xmm7
2877 vpxor %xmm2, %xmm3, %xmm3
2878 vpxor %xmm6, %xmm7, %xmm7
2879 vpaddd %xmm0, %xmm3, %xmm3
2880 vpaddd %xmm4, %xmm7, %xmm7
2881 vmovdqa %xmm3, 6*16(%rax)
2882 vmovdqa %xmm7, 7*16(%rax)
2884 vprotd $15, %xmm3, %xmm1
2885 vprotd $15, %xmm7, %xmm5
2886 vprotd $13, %xmm3, %xmm2
2887 vprotd $13, %xmm7, %xmm6
2888 vpxor %xmm1, %xmm2, %xmm2
2889 vpxor %xmm5, %xmm6, %xmm6
2890 vpsrld $10, %xmm3, %xmm3
2891 vpsrld $10, %xmm7, %xmm7
2892 vpxor %xmm2, %xmm3, %xmm3
2893 vpxor %xmm6, %xmm7, %xmm7
2894 vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
2895 vpaddd 1*16(%rax), %xmm3, %xmm3
2896 vpaddd 2*16(%rax), %xmm7, %xmm7
2897 vmovdqa %xmm3, 8*16(%rax)
2898 vmovdqa %xmm7, 9*16(%rax)
2900 vprotd $15, %xmm3, %xmm1
2901 vprotd $15, %xmm7, %xmm5
2902 vprotd $13, %xmm3, %xmm2
2903 vprotd $13, %xmm7, %xmm6
2904 vpxor %xmm1, %xmm2, %xmm2
2905 vpxor %xmm5, %xmm6, %xmm6
2906 vpsrld $10, %xmm3, %xmm3
2907 vpsrld $10, %xmm7, %xmm7
2908 vpxor %xmm2, %xmm3, %xmm3
2909 vpxor %xmm6, %xmm7, %xmm7
2910 vpaddd 3*16(%rax), %xmm3, %xmm3
2911 vpaddd 4*16(%rax), %xmm7, %xmm7
2912 vmovdqa %xmm3, 10*16(%rax)
2913 vmovdqa %xmm7, 11*16(%rax)
2915 vprotd $15, %xmm3, %xmm1
2916 vprotd $15, %xmm7, %xmm5
2917 vprotd $13, %xmm3, %xmm2
2918 vprotd $13, %xmm7, %xmm6
2919 vpxor %xmm1, %xmm2, %xmm2
2920 vpxor %xmm5, %xmm6, %xmm6
2921 vpsrld $10, %xmm3, %xmm3
2922 vpsrld $10, %xmm7, %xmm7
2923 vpxor %xmm2, %xmm3, %xmm3
2924 vpxor %xmm6, %xmm7, %xmm7
2925 vpaddd 5*16(%rax), %xmm3, %xmm3
2926 vpaddd 6*16(%rax), %xmm7, %xmm7
2927 vmovdqa %xmm3, 12*16(%rax)
2928 vmovdqa %xmm7, 13*16(%rax)
2930 vmovdqa sha256d_4preext2_30(%rip), %xmm0
2931 vmovdqa 0*16(%rax), %xmm4
2932 vprotd $25, %xmm4, %xmm5
2933 vprotd $14, %xmm4, %xmm6
2934 vpxor %xmm5, %xmm6, %xmm6
2935 vpsrld $3, %xmm4, %xmm4
2936 vpxor %xmm6, %xmm4, %xmm4
2937 vpaddd -1*16(%rax), %xmm4, %xmm4
2938 vprotd $15, %xmm3, %xmm1
2939 vprotd $15, %xmm7, %xmm5
2940 vprotd $13, %xmm3, %xmm2
2941 vprotd $13, %xmm7, %xmm6
2942 vpxor %xmm1, %xmm2, %xmm2
2943 vpxor %xmm5, %xmm6, %xmm6
2944 vpaddd 7*16(%rax), %xmm0, %xmm0
2945 vpaddd 8*16(%rax), %xmm4, %xmm4
2946 vpsrld $10, %xmm3, %xmm3
2947 vpsrld $10, %xmm7, %xmm7
2948 vpxor %xmm2, %xmm3, %xmm3
2949 vpxor %xmm6, %xmm7, %xmm7
2950 vpaddd %xmm0, %xmm3, %xmm3
2951 vpaddd %xmm4, %xmm7, %xmm7
2952 vmovdqa %xmm3, 14*16(%rax)
2953 vmovdqa %xmm7, 15*16(%rax)
2955 jmp sha256d_ms_4way_xop_extend_loop2
2957 sha256d_ms_4way_xop_extend_coda2:
2958 sha256_xop_extend_round 44
2960 movdqa sha256_4h+0(%rip), %xmm7
2961 movdqa sha256_4h+16(%rip), %xmm5
2962 movdqa sha256_4h+32(%rip), %xmm4
2963 movdqa sha256_4h+48(%rip), %xmm3
2964 movdqa sha256_4h+64(%rip), %xmm0
2965 movdqa sha256_4h+80(%rip), %xmm8
2966 movdqa sha256_4h+96(%rip), %xmm9
2967 movdqa sha256_4h+112(%rip), %xmm10
2970 leaq sha256_4k(%rip), %rcx
2971 jmp sha256d_ms_4way_xop_main_loop2
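/* XOP reduced round: the same e-only update as the AVX version, with
 * Sigma1(e) = ROTR6 ^ ROTR11 ^ ROTR25 obtained from vprotd $26, $21 and $7. */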
2973 .macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4
2974 vpaddd 16*\i(%rax), \r0, %xmm6
2975 vpaddd 16*\i(%rcx), %xmm6, %xmm6
2976 vpandn \r1, \r3, %xmm1
2977 vpand \r3, \r2, %xmm2
2978 vpxor %xmm2, %xmm1, %xmm1
2979 vpaddd %xmm1, %xmm6, %xmm6
2980 vprotd $26, \r3, %xmm1
2981 vprotd $21, \r3, %xmm2
2982 vpxor %xmm1, %xmm2, %xmm2
2983 vprotd $7, \r3, \r0
2984 vpxor %xmm2, \r0, \r0
2985 vpaddd \r0, %xmm6, %xmm6
2986 vpaddd %xmm6, \r4, \r0
2989 sha256d_ms_4way_xop_finish:
2990 sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
2991 sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
2992 sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
2993 sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
2995 paddd sha256_4h+112(%rip), %xmm10
2996 movdqa %xmm10, 112(%rdi)
2999 #if defined(_WIN64) || defined(__CYGWIN__)
3001 movdqa 0(%rsp), %xmm6
3002 movdqa 16(%rsp), %xmm7
3003 movdqa 32(%rsp), %xmm8
3004 movdqa 48(%rsp), %xmm9
3005 movdqa 64(%rsp), %xmm10
3011 #endif /* USE_XOP */
3016 .globl sha256_use_4way
3017 .globl _sha256_use_4way
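/*
 * Runtime selection of the 4-way kernel: the CPUID/XGETBV checks below choose
 * the XOP, AVX or SSE2 implementation and publish the chosen entry points via
 * sha256d_ms_4way_addr and sha256_transform_4way_core_addr.
 */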
3024 #if defined(USE_AVX)
3025 /* Check for AVX and OSXSAVE support */
3028 andl $0x18000000, %ecx
3029 cmpl $0x18000000, %ecx
3030 jne sha256_use_4way_base
3031 /* Check for XMM and YMM state support */
3034 andl $0x00000006, %eax
3035 cmpl $0x00000006, %eax
3036 jne sha256_use_4way_base
3037 #if defined(USE_XOP)
3038 /* Check for XOP support */
3039 movl $0x80000001, %eax
3041 andl $0x00000800, %ecx
3042 jz sha256_use_4way_avx
3044 sha256_use_4way_xop:
3045 leaq sha256d_ms_4way_xop(%rip), %rcx
3046 leaq sha256_transform_4way_core_xop(%rip), %rdx
3047 jmp sha256_use_4way_done
3048 #endif /* USE_XOP */
3050 sha256_use_4way_avx:
3051 leaq sha256d_ms_4way_avx(%rip), %rcx
3052 leaq sha256_transform_4way_core_avx(%rip), %rdx
3053 jmp sha256_use_4way_done
3054 #endif /* USE_AVX */
3056 sha256_use_4way_base:
3057 leaq sha256d_ms_4way_sse2(%rip), %rcx
3058 leaq sha256_transform_4way_core_sse2(%rip), %rdx
3060 sha256_use_4way_done:
3061 movq %rcx, sha256d_ms_4way_addr(%rip)
3062 movq %rdx, sha256_transform_4way_core_addr(%rip)
3070 #if defined(USE_AVX2)
3074 .globl sha256d_ms_8way
3075 .globl _sha256d_ms_8way
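/*
 * 8-way AVX2 variant: the same algorithm as the 4-way AVX code, widened to
 * ymm registers with a 32-byte stride per schedule word and constants taken
 * from sha256_8h, sha256_8k and the sha256d_8preext2_* tables.
 */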
3078 sha256d_ms_8way_avx2:
3079 #if defined(_WIN64) || defined(__CYGWIN__)
3082 vmovdqa %xmm6, 0(%rsp)
3083 vmovdqa %xmm7, 16(%rsp)
3084 vmovdqa %xmm8, 32(%rsp)
3085 vmovdqa %xmm9, 48(%rsp)
3086 vmovdqa %xmm10, 64(%rsp)
3098 leaq 16*32(%rsi), %rax
3100 sha256d_ms_8way_avx2_extend_loop1:
3101 vmovdqa 3*32(%rsi), %ymm0
3102 vmovdqa 2*32(%rax), %ymm3
3103 vmovdqa 3*32(%rax), %ymm7
3104 vmovdqa %ymm3, 2*32(%rsp)
3105 vmovdqa %ymm7, 3*32(%rsp)
3106 vpaddd %ymm0, %ymm7, %ymm7
3107 vpslld $14, %ymm0, %ymm2
3108 vpsrld $3, %ymm0, %ymm0
3109 vpsrld $4, %ymm0, %ymm1
3110 vpxor %ymm1, %ymm0, %ymm0
3111 vpxor %ymm2, %ymm0, %ymm0
3112 vpsrld $11, %ymm1, %ymm1
3113 vpslld $11, %ymm2, %ymm2
3114 vpxor %ymm1, %ymm0, %ymm0
3115 vpxor %ymm2, %ymm0, %ymm0
3116 vpaddd %ymm0, %ymm3, %ymm3
3117 vmovdqa %ymm3, 2*32(%rax)
3118 vmovdqa %ymm7, 3*32(%rax)
3120 vmovdqa 4*32(%rax), %ymm0
3121 vmovdqa %ymm0, 4*32(%rsp)
3122 vpslld $13, %ymm3, %ymm2
3123 vpslld $13, %ymm7, %ymm6
3124 vpsrld $10, %ymm3, %ymm3
3125 vpsrld $10, %ymm7, %ymm7
3126 vpsrld $7, %ymm3, %ymm1
3127 vpsrld $7, %ymm7, %ymm5
3128 vpxor %ymm1, %ymm3, %ymm3
3129 vpxor %ymm5, %ymm7, %ymm7
3130 vpsrld $2, %ymm1, %ymm1
3131 vpsrld $2, %ymm5, %ymm5
3132 vpxor %ymm2, %ymm3, %ymm3
3133 vpxor %ymm6, %ymm7, %ymm7
3134 vpslld $2, %ymm2, %ymm2
3135 vpslld $2, %ymm6, %ymm6
3136 vpxor %ymm1, %ymm3, %ymm3
3137 vpxor %ymm5, %ymm7, %ymm7
3138 vpxor %ymm2, %ymm3, %ymm3
3139 vpxor %ymm6, %ymm7, %ymm7
3140 vpaddd %ymm0, %ymm3, %ymm3
3141 vmovdqa %ymm3, 4*32(%rax)
3142 vmovdqa %ymm7, 5*32(%rax)
3144 vmovdqa 6*32(%rax), %ymm0
3145 vmovdqa 7*32(%rax), %ymm4
3146 vmovdqa %ymm0, 6*32(%rsp)
3147 vmovdqa %ymm4, 7*32(%rsp)
3148 vpslld $13, %ymm3, %ymm2
3149 vpslld $13, %ymm7, %ymm6
3150 vpsrld $10, %ymm3, %ymm3
3151 vpsrld $10, %ymm7, %ymm7
3152 vpsrld $7, %ymm3, %ymm1
3153 vpsrld $7, %ymm7, %ymm5
3154 vpxor %ymm1, %ymm3, %ymm3
3155 vpxor %ymm5, %ymm7, %ymm7
3156 vpsrld $2, %ymm1, %ymm1
3157 vpsrld $2, %ymm5, %ymm5
3158 vpxor %ymm2, %ymm3, %ymm3
3159 vpxor %ymm6, %ymm7, %ymm7
3160 vpslld $2, %ymm2, %ymm2
3161 vpslld $2, %ymm6, %ymm6
3162 vpxor %ymm1, %ymm3, %ymm3
3163 vpxor %ymm5, %ymm7, %ymm7
3164 vpxor %ymm2, %ymm3, %ymm3
3165 vpxor %ymm6, %ymm7, %ymm7
3166 vpaddd %ymm0, %ymm3, %ymm3
3167 vpaddd %ymm4, %ymm7, %ymm7
3168 vmovdqa %ymm3, 6*32(%rax)
3169 vmovdqa %ymm7, 7*32(%rax)
3171 vmovdqa 8*32(%rax), %ymm0
3172 vmovdqa 2*32(%rax), %ymm4
3173 vmovdqa %ymm0, 8*32(%rsp)
3174 vpslld $13, %ymm3, %ymm2
3175 vpslld $13, %ymm7, %ymm6
3176 vpsrld $10, %ymm3, %ymm3
3177 vpsrld $10, %ymm7, %ymm7
3178 vpsrld $7, %ymm3, %ymm1
3179 vpsrld $7, %ymm7, %ymm5
3180 vpxor %ymm1, %ymm3, %ymm3
3181 vpxor %ymm5, %ymm7, %ymm7
3182 vpsrld $2, %ymm1, %ymm1
3183 vpsrld $2, %ymm5, %ymm5
3184 vpxor %ymm2, %ymm3, %ymm3
3185 vpxor %ymm6, %ymm7, %ymm7
3186 vpslld $2, %ymm2, %ymm2
3187 vpslld $2, %ymm6, %ymm6
3188 vpxor %ymm1, %ymm3, %ymm3
3189 vpxor %ymm5, %ymm7, %ymm7
3190 vpxor %ymm2, %ymm3, %ymm3
3191 vpxor %ymm6, %ymm7, %ymm7
3192 vpaddd %ymm0, %ymm3, %ymm3
3193 vpaddd %ymm4, %ymm7, %ymm7
3194 vmovdqa %ymm3, 8*32(%rax)
3195 vmovdqa %ymm7, 9*32(%rax)
3197 vpslld $13, %ymm3, %ymm2
3198 vpslld $13, %ymm7, %ymm6
3199 vpsrld $10, %ymm3, %ymm3
3200 vpsrld $10, %ymm7, %ymm7
3201 vpsrld $7, %ymm3, %ymm1
3202 vpsrld $7, %ymm7, %ymm5
3203 vpxor %ymm1, %ymm3, %ymm3
3204 vpxor %ymm5, %ymm7, %ymm7
3205 vpsrld $2, %ymm1, %ymm1
3206 vpsrld $2, %ymm5, %ymm5
3207 vpxor %ymm2, %ymm3, %ymm3
3208 vpxor %ymm6, %ymm7, %ymm7
3209 vpslld $2, %ymm2, %ymm2
3210 vpslld $2, %ymm6, %ymm6
3211 vpxor %ymm1, %ymm3, %ymm3
3212 vpxor %ymm5, %ymm7, %ymm7
3213 vpxor %ymm2, %ymm3, %ymm3
3214 vpxor %ymm6, %ymm7, %ymm7
3215 vpaddd 3*32(%rax), %ymm3, %ymm3
3216 vpaddd 4*32(%rax), %ymm7, %ymm7
3217 vmovdqa %ymm3, 10*32(%rax)
3218 vmovdqa %ymm7, 11*32(%rax)
3220 vpslld $13, %ymm3, %ymm2
3221 vpslld $13, %ymm7, %ymm6
3222 vpsrld $10, %ymm3, %ymm3
3223 vpsrld $10, %ymm7, %ymm7
3224 vpsrld $7, %ymm3, %ymm1
3225 vpsrld $7, %ymm7, %ymm5
3226 vpxor %ymm1, %ymm3, %ymm3
3227 vpxor %ymm5, %ymm7, %ymm7
3228 vpsrld $2, %ymm1, %ymm1
3229 vpsrld $2, %ymm5, %ymm5
3230 vpxor %ymm2, %ymm3, %ymm3
3231 vpxor %ymm6, %ymm7, %ymm7
3232 vpslld $2, %ymm2, %ymm2
3233 vpslld $2, %ymm6, %ymm6
3234 vpxor %ymm1, %ymm3, %ymm3
3235 vpxor %ymm5, %ymm7, %ymm7
3236 vpxor %ymm2, %ymm3, %ymm3
3237 vpxor %ymm6, %ymm7, %ymm7
3238 vpaddd 5*32(%rax), %ymm3, %ymm3
3239 vpaddd 6*32(%rax), %ymm7, %ymm7
3240 vmovdqa %ymm3, 12*32(%rax)
3241 vmovdqa %ymm7, 13*32(%rax)
3243 vmovdqa 14*32(%rax), %ymm0
3244 vmovdqa 15*32(%rax), %ymm4
3245 vmovdqa %ymm0, 14*32(%rsp)
3246 vmovdqa %ymm4, 15*32(%rsp)
3247 vpslld $13, %ymm3, %ymm2
3248 vpslld $13, %ymm7, %ymm6
3249 vpsrld $10, %ymm3, %ymm3
3250 vpsrld $10, %ymm7, %ymm7
3251 vpaddd 7*32(%rax), %ymm0, %ymm0
3252 vpaddd 8*32(%rax), %ymm4, %ymm4
3253 vpsrld $7, %ymm3, %ymm1
3254 vpsrld $7, %ymm7, %ymm5
3255 vpxor %ymm1, %ymm3, %ymm3
3256 vpxor %ymm5, %ymm7, %ymm7
3257 vpsrld $2, %ymm1, %ymm1
3258 vpsrld $2, %ymm5, %ymm5
3259 vpxor %ymm2, %ymm3, %ymm3
3260 vpxor %ymm6, %ymm7, %ymm7
3261 vpslld $2, %ymm2, %ymm2
3262 vpslld $2, %ymm6, %ymm6
3263 vpxor %ymm1, %ymm3, %ymm3
3264 vpxor %ymm5, %ymm7, %ymm7
3265 vpxor %ymm2, %ymm3, %ymm3
3266 vpxor %ymm6, %ymm7, %ymm7
3267 vpaddd %ymm0, %ymm3, %ymm3
3268 vpaddd %ymm4, %ymm7, %ymm7
3269 vmovdqa %ymm3, 14*32(%rax)
3270 vmovdqa %ymm7, 15*32(%rax)
3272 sha256d_ms_8way_avx2_extend_loop2:
3273 sha256_avx2_extend_doubleround 16
3274 sha256_avx2_extend_doubleround 18
3275 sha256_avx2_extend_doubleround 20
3276 sha256_avx2_extend_doubleround 22
3277 sha256_avx2_extend_doubleround 24
3278 sha256_avx2_extend_doubleround 26
3279 sha256_avx2_extend_doubleround 28
3280 sha256_avx2_extend_doubleround 30
3281 sha256_avx2_extend_doubleround 32
3282 sha256_avx2_extend_doubleround 34
3283 sha256_avx2_extend_doubleround 36
3284 sha256_avx2_extend_doubleround 38
3285 sha256_avx2_extend_doubleround 40
3286 sha256_avx2_extend_doubleround 42
3287 jz sha256d_ms_8way_avx2_extend_coda2
3288 sha256_avx2_extend_doubleround 44
3289 sha256_avx2_extend_doubleround 46
3291 vmovdqa 0(%rcx), %ymm7
3292 vmovdqa 32(%rcx), %ymm8
3293 vmovdqa 64(%rcx), %ymm9
3294 vmovdqa 96(%rcx), %ymm10
3295 vmovdqa 128(%rcx), %ymm0
3296 vmovdqa 160(%rcx), %ymm5
3297 vmovdqa 192(%rcx), %ymm4
3298 vmovdqa 224(%rcx), %ymm3
3301 leaq sha256_8k(%rip), %rcx
3302 jmp sha256d_ms_8way_avx2_main_loop1
3304 sha256d_ms_8way_avx2_main_loop2:
3305 sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
3306 sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
3307 sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
3308 sha256d_ms_8way_avx2_main_loop1:
3309 sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
3310 sha256_avx2_main_quadround 4
3311 sha256_avx2_main_quadround 8
3312 sha256_avx2_main_quadround 12
3313 sha256_avx2_main_quadround 16
3314 sha256_avx2_main_quadround 20
3315 sha256_avx2_main_quadround 24
3316 sha256_avx2_main_quadround 28
3317 sha256_avx2_main_quadround 32
3318 sha256_avx2_main_quadround 36
3319 sha256_avx2_main_quadround 40
3320 sha256_avx2_main_quadround 44
3321 sha256_avx2_main_quadround 48
3322 sha256_avx2_main_quadround 52
3323 sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
3324 jz sha256d_ms_8way_avx2_finish
3325 sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
3326 sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
3327 sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
3328 sha256_avx2_main_quadround 60
3330 vmovdqa 2*32(%rsp), %ymm1
3331 vmovdqa 3*32(%rsp), %ymm2
3332 vmovdqa 4*32(%rsp), %ymm6
3333 vmovdqa %ymm1, 18*32(%rsi)
3334 vmovdqa %ymm2, 19*32(%rsi)
3335 vmovdqa %ymm6, 20*32(%rsi)
3336 vmovdqa 6*32(%rsp), %ymm1
3337 vmovdqa 7*32(%rsp), %ymm2
3338 vmovdqa 8*32(%rsp), %ymm6
3339 vmovdqa %ymm1, 22*32(%rsi)
3340 vmovdqa %ymm2, 23*32(%rsi)
3341 vmovdqa %ymm6, 24*32(%rsi)
3342 vmovdqa 14*32(%rsp), %ymm1
3343 vmovdqa 15*32(%rsp), %ymm2
3344 vmovdqa %ymm1, 30*32(%rsi)
3345 vmovdqa %ymm2, 31*32(%rsi)
3347 vpaddd 0(%rdx), %ymm7, %ymm7
3348 vpaddd 32(%rdx), %ymm5, %ymm5
3349 vpaddd 64(%rdx), %ymm4, %ymm4
3350 vpaddd 96(%rdx), %ymm3, %ymm3
3351 vpaddd 128(%rdx), %ymm0, %ymm0
3352 vpaddd 160(%rdx), %ymm8, %ymm8
3353 vpaddd 192(%rdx), %ymm9, %ymm9
3354 vpaddd 224(%rdx), %ymm10, %ymm10
3356 vmovdqa %ymm7, 0(%rsp)
3357 vmovdqa %ymm5, 32(%rsp)
3358 vmovdqa %ymm4, 64(%rsp)
3359 vmovdqa %ymm3, 96(%rsp)
3360 vmovdqa %ymm0, 128(%rsp)
3361 vmovdqa %ymm8, 160(%rsp)
3362 vmovdqa %ymm9, 192(%rsp)
3363 vmovdqa %ymm10, 224(%rsp)
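/* Padding for the 8-way second hash: as in the 4-way case, word 8 holds the
 * 0x80000000 marker and word 15 the bit length 0x100; vinserti128 copies the
 * constant into both 128-bit halves and words 9-14 are cleared from the
 * zeroed %ymm0. */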
3365 vpxor %ymm0, %ymm0, %ymm0
3366 movq $0x8000000000000100, %rax
3368 vinserti128 $1, %xmm1, %ymm1, %ymm1
3369 vpshufd $0x55, %ymm1, %ymm2
3370 vpshufd $0x00, %ymm1, %ymm1
3371 vmovdqa %ymm2, 8*32(%rsp)
3372 vmovdqa %ymm0, 9*32(%rsp)
3373 vmovdqa %ymm0, 10*32(%rsp)
3374 vmovdqa %ymm0, 11*32(%rsp)
3375 vmovdqa %ymm0, 12*32(%rsp)
3376 vmovdqa %ymm0, 13*32(%rsp)
3377 vmovdqa %ymm0, 14*32(%rsp)
3378 vmovdqa %ymm1, 15*32(%rsp)
3380 leaq 16*32(%rsp), %rax
3383 vmovdqa -15*32(%rax), %ymm0
3384 vmovdqa -14*32(%rax), %ymm4
3385 vpslld $14, %ymm0, %ymm2
3386 vpslld $14, %ymm4, %ymm6
3387 vpsrld $3, %ymm0, %ymm8
3388 vpsrld $3, %ymm4, %ymm4
3389 vpsrld $7, %ymm0, %ymm1
3390 vpsrld $4, %ymm4, %ymm5
3391 vpxor %ymm1, %ymm8, %ymm8
3392 vpxor %ymm5, %ymm4, %ymm4
3393 vpsrld $11, %ymm1, %ymm1
3394 vpsrld $11, %ymm5, %ymm5
3395 vpxor %ymm2, %ymm8, %ymm8
3396 vpxor %ymm6, %ymm4, %ymm4
3397 vpslld $11, %ymm2, %ymm2
3398 vpslld $11, %ymm6, %ymm6
3399 vpxor %ymm1, %ymm8, %ymm8
3400 vpxor %ymm5, %ymm4, %ymm4
3401 vpxor %ymm2, %ymm8, %ymm8
3402 vpxor %ymm6, %ymm4, %ymm4
3403 vpaddd %ymm0, %ymm4, %ymm4
3404 vpaddd -16*32(%rax), %ymm8, %ymm3
3405 vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7
3406 vmovdqa %ymm3, 0*32(%rax)
3407 vmovdqa %ymm7, 1*32(%rax)
3409 sha256_avx2_extend_doubleround 2
3410 sha256_avx2_extend_doubleround 4
3412 vmovdqa -9*32(%rax), %ymm0
3413 vpslld $14, %ymm0, %ymm2
3414 vpsrld $3, %ymm0, %ymm8
3415 vpsrld $7, %ymm0, %ymm1
3416 vpxor %ymm1, %ymm8, %ymm8
3417 vpxor %ymm2, %ymm8, %ymm8
3418 vpsrld $11, %ymm1, %ymm1
3419 vpslld $11, %ymm2, %ymm2
3420 vpxor %ymm1, %ymm8, %ymm8
3421 vpxor %ymm2, %ymm8, %ymm8
3422 vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4
3423 vpaddd -10*32(%rax), %ymm8, %ymm0
3424 vpslld $13, %ymm3, %ymm2
3425 vpslld $13, %ymm7, %ymm6
3426 vpsrld $10, %ymm3, %ymm3
3427 vpsrld $10, %ymm7, %ymm7
3428 vpaddd -1*32(%rax), %ymm0, %ymm0
3429 vpaddd 0*32(%rax), %ymm4, %ymm4
3430 vpsrld $7, %ymm3, %ymm1
3431 vpsrld $7, %ymm7, %ymm5
3432 vpxor %ymm1, %ymm3, %ymm3
3433 vpxor %ymm5, %ymm7, %ymm7
3434 vpsrld $2, %ymm1, %ymm1
3435 vpsrld $2, %ymm5, %ymm5
3436 vpxor %ymm2, %ymm3, %ymm3
3437 vpxor %ymm6, %ymm7, %ymm7
3438 vpslld $2, %ymm2, %ymm2
3439 vpslld $2, %ymm6, %ymm6
3440 vpxor %ymm1, %ymm3, %ymm3
3441 vpxor %ymm5, %ymm7, %ymm7
3442 vpxor %ymm2, %ymm3, %ymm3
3443 vpxor %ymm6, %ymm7, %ymm7
3444 vpaddd %ymm0, %ymm3, %ymm3
3445 vpaddd %ymm4, %ymm7, %ymm7
3446 vmovdqa %ymm3, 6*32(%rax)
3447 vmovdqa %ymm7, 7*32(%rax)
3449 vpslld $13, %ymm3, %ymm2
3450 vpslld $13, %ymm7, %ymm6
3451 vpsrld $10, %ymm3, %ymm3
3452 vpsrld $10, %ymm7, %ymm7
3453 vpsrld $7, %ymm3, %ymm1
3454 vpsrld $7, %ymm7, %ymm5
3455 vpxor %ymm1, %ymm3, %ymm3
3456 vpxor %ymm5, %ymm7, %ymm7
3457 vpsrld $2, %ymm1, %ymm1
3458 vpsrld $2, %ymm5, %ymm5
3459 vpxor %ymm2, %ymm3, %ymm3
3460 vpxor %ymm6, %ymm7, %ymm7
3461 vpslld $2, %ymm2, %ymm2
3462 vpslld $2, %ymm6, %ymm6
3463 vpxor %ymm1, %ymm3, %ymm3
3464 vpxor %ymm5, %ymm7, %ymm7
3465 vpxor %ymm2, %ymm3, %ymm3
3466 vpxor %ymm6, %ymm7, %ymm7
3467 vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3
3468 vpaddd 1*32(%rax), %ymm3, %ymm3
3469 vpaddd 2*32(%rax), %ymm7, %ymm7
3470 vmovdqa %ymm3, 8*32(%rax)
3471 vmovdqa %ymm7, 9*32(%rax)
3473 vpslld $13, %ymm3, %ymm2
3474 vpslld $13, %ymm7, %ymm6
3475 vpsrld $10, %ymm3, %ymm3
3476 vpsrld $10, %ymm7, %ymm7
3477 vpsrld $7, %ymm3, %ymm1
3478 vpsrld $7, %ymm7, %ymm5
3479 vpxor %ymm1, %ymm3, %ymm3
3480 vpxor %ymm5, %ymm7, %ymm7
3481 vpsrld $2, %ymm1, %ymm1
3482 vpsrld $2, %ymm5, %ymm5
3483 vpxor %ymm2, %ymm3, %ymm3
3484 vpxor %ymm6, %ymm7, %ymm7
3485 vpslld $2, %ymm2, %ymm2
3486 vpslld $2, %ymm6, %ymm6
3487 vpxor %ymm1, %ymm3, %ymm3
3488 vpxor %ymm5, %ymm7, %ymm7
3489 vpxor %ymm2, %ymm3, %ymm3
3490 vpxor %ymm6, %ymm7, %ymm7
3491 vpaddd 3*32(%rax), %ymm3, %ymm3
3492 vpaddd 4*32(%rax), %ymm7, %ymm7
3493 vmovdqa %ymm3, 10*32(%rax)
3494 vmovdqa %ymm7, 11*32(%rax)
3496 vpslld $13, %ymm3, %ymm2
3497 vpslld $13, %ymm7, %ymm6
3498 vpsrld $10, %ymm3, %ymm3
3499 vpsrld $10, %ymm7, %ymm7
3500 vpsrld $7, %ymm3, %ymm1
3501 vpsrld $7, %ymm7, %ymm5
3502 vpxor %ymm1, %ymm3, %ymm3
3503 vpxor %ymm5, %ymm7, %ymm7
3504 vpsrld $2, %ymm1, %ymm1
3505 vpsrld $2, %ymm5, %ymm5
3506 vpxor %ymm2, %ymm3, %ymm3
3507 vpxor %ymm6, %ymm7, %ymm7
3508 vpslld $2, %ymm2, %ymm2
3509 vpslld $2, %ymm6, %ymm6
3510 vpxor %ymm1, %ymm3, %ymm3
3511 vpxor %ymm5, %ymm7, %ymm7
3512 vpxor %ymm2, %ymm3, %ymm3
3513 vpxor %ymm6, %ymm7, %ymm7
3514 vpaddd 5*32(%rax), %ymm3, %ymm3
3515 vpaddd 6*32(%rax), %ymm7, %ymm7
3516 vmovdqa %ymm3, 12*32(%rax)
3517 vmovdqa %ymm7, 13*32(%rax)
3519 vmovdqa sha256d_8preext2_30(%rip), %ymm0
3520 vmovdqa 0*32(%rax), %ymm4
3521 vpslld $14, %ymm4, %ymm6
3522 vpsrld $3, %ymm4, %ymm4
3523 vpsrld $4, %ymm4, %ymm5
3524 vpxor %ymm5, %ymm4, %ymm4
3525 vpxor %ymm6, %ymm4, %ymm4
3526 vpsrld $11, %ymm5, %ymm5
3527 vpslld $11, %ymm6, %ymm6
3528 vpxor %ymm5, %ymm4, %ymm4
3529 vpxor %ymm6, %ymm4, %ymm4
3530 vpaddd -1*32(%rax), %ymm4, %ymm4
3531 vpslld $13, %ymm3, %ymm2
3532 vpslld $13, %ymm7, %ymm6
3533 vpsrld $10, %ymm3, %ymm3
3534 vpsrld $10, %ymm7, %ymm7
3535 vpaddd 7*32(%rax), %ymm0, %ymm0
3536 vpaddd 8*32(%rax), %ymm4, %ymm4
3537 vpsrld $7, %ymm3, %ymm1
3538 vpsrld $7, %ymm7, %ymm5
3539 vpxor %ymm1, %ymm3, %ymm3
3540 vpxor %ymm5, %ymm7, %ymm7
3541 vpsrld $2, %ymm1, %ymm1
3542 vpsrld $2, %ymm5, %ymm5
3543 vpxor %ymm2, %ymm3, %ymm3
3544 vpxor %ymm6, %ymm7, %ymm7
3545 vpslld $2, %ymm2, %ymm2
3546 vpslld $2, %ymm6, %ymm6
3547 vpxor %ymm1, %ymm3, %ymm3
3548 vpxor %ymm5, %ymm7, %ymm7
3549 vpxor %ymm2, %ymm3, %ymm3
3550 vpxor %ymm6, %ymm7, %ymm7
3551 vpaddd %ymm0, %ymm3, %ymm3
3552 vpaddd %ymm4, %ymm7, %ymm7
3553 vmovdqa %ymm3, 14*32(%rax)
3554 vmovdqa %ymm7, 15*32(%rax)
3556 jmp sha256d_ms_8way_avx2_extend_loop2
3558 sha256d_ms_8way_avx2_extend_coda2:
3559 sha256_avx2_extend_round 44
3561 vmovdqa sha256_8h+0(%rip), %ymm7
3562 vmovdqa sha256_8h+32(%rip), %ymm5
3563 vmovdqa sha256_8h+64(%rip), %ymm4
3564 vmovdqa sha256_8h+96(%rip), %ymm3
3565 vmovdqa sha256_8h+128(%rip), %ymm0
3566 vmovdqa sha256_8h+160(%rip), %ymm8
3567 vmovdqa sha256_8h+192(%rip), %ymm9
3568 vmovdqa sha256_8h+224(%rip), %ymm10
3571 leaq sha256_8k(%rip), %rcx
3572 jmp sha256d_ms_8way_avx2_main_loop2
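/* Reduced e-only rounds for the 8-way path, structurally identical to the
 * 4-way AVX macro above. */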
3574 .macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4
3575 vpaddd 32*\i(%rax), \r0, %ymm6
3576 vpaddd 32*\i(%rcx), %ymm6, %ymm6
3577 vpandn \r1, \r3, %ymm1
3578 vpand \r3, \r2, %ymm2
3579 vpxor %ymm2, %ymm1, %ymm1
3580 vpaddd %ymm1, %ymm6, %ymm6
3581 vpslld $7, \r3, %ymm1
3582 vpsrld $6, \r3, \r0
3583 vpsrld $5, \r0, %ymm2
3584 vpxor %ymm1, \r0, \r0
3585 vpxor %ymm2, \r0, \r0
3586 vpslld $14, %ymm1, %ymm1
3587 vpsrld $14, %ymm2, %ymm2
3588 vpxor %ymm1, \r0, \r0
3589 vpxor %ymm2, \r0, \r0
3590 vpslld $5, %ymm1, %ymm1
3591 vpxor %ymm1, \r0, \r0
3592 vpaddd \r0, %ymm6, %ymm6
3593 vpaddd %ymm6, \r4, \r0
3596 sha256d_ms_8way_avx2_finish:
3597 sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4
3598 sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5
3599 sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7
3600 sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3
3602 vpaddd sha256_8h+224(%rip), %ymm10, %ymm10
3603 vmovdqa %ymm10, 224(%rdi)
3607 #if defined(_WIN64) || defined(__CYGWIN__)
3609 vmovdqa 0(%rsp), %xmm6
3610 vmovdqa 16(%rsp), %xmm7
3611 vmovdqa 32(%rsp), %xmm8
3612 vmovdqa 48(%rsp), %xmm9
3613 vmovdqa 64(%rsp), %xmm10
3622 .globl sha256_use_8way
3623 .globl _sha256_use_8way
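/* Runtime check for the 8-way kernel: requires OSXSAVE+AVX, the AVX2 feature
 * bit and OS-enabled XMM/YMM state, as tested below. */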
3628 /* Check for AVX and OSXSAVE support */
3631 andl $0x18000000, %ecx
3632 cmpl $0x18000000, %ecx
3633 jne sha256_use_8way_no
3634 /* Check for AVX2 support */
3638 andl $0x00000020, %ebx
3639 cmpl $0x00000020, %ebx
3640 jne sha256_use_8way_no
3641 /* Check for XMM and YMM state support */
3644 andl $0x00000006, %eax
3645 cmpl $0x00000006, %eax
3646 jne sha256_use_8way_no
3648 sha256_use_8way_yes:
3650 jmp sha256_use_8way_done
3655 sha256_use_8way_done:
3659 #endif /* USE_AVX2 */