/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup. */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
#else
#define H1(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif
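/*
 * As an illustration: on a big-endian host, H2(0) == 3, so the
 * architecturally lowest 16-bit element of a 64-bit chunk is found at
 * index 3 of the host uint16_t array (byte offset 6), which is where a
 * big-endian uint64_t keeps its least significant halfword.
 */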

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static int16_t inl_qrdmlah_s16(int16_t src1, int16_t src2,
                               int16_t src3, uint32_t *sat)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
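/*
 * Worked example: src1 = src2 = INT16_MIN (0x8000) with src3 = 0 gives
 * ret = 0x40000000 + 0x4000 = 0x40004000, and 0x40004000 >> 15 = 0x8000,
 * which does not fit in int16_t, so the result saturates to 0x7fff and
 * the saturation flag is set.
 */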

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = inl_qrdmlah_s16(src1, src2, src3, sat);
    uint16_t e2 = inl_qrdmlah_s16(src1 >> 16, src2 >> 16, src3 >> 16, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
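/*
 * In the gvec helpers, vq is expected to point at the cumulative
 * saturation (QC) word that the translator passes alongside the vector
 * operands, and clear_tail() (from vec_internal.h) is assumed to zero
 * the bytes between the operation size and the maximum vector size.
 */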

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static int16_t inl_qrdmlsh_s16(int16_t src1, int16_t src2,
                               int16_t src3, uint32_t *sat)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = inl_qrdmlsh_s16(src1, src2, src3, sat);
    uint16_t e2 = inl_qrdmlsh_s16(src1 >> 16, src2 >> 16, src3 >> 16, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
static int32_t inl_qrdmlah_s32(int32_t src1, int32_t src2,
                               int32_t src3, uint32_t *sat)
{
    /* Simplify similarly to inl_qrdmlah_s16 above. */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return inl_qrdmlah_s32(src1, src2, src3, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = inl_qrdmlah_s32(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
static int32_t inl_qrdmlsh_s32(int32_t src1, int32_t src2,
                               int32_t src3, uint32_t *sat)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above. */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return inl_qrdmlsh_s32(src1, src2, src3, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = inl_qrdmlsh_s32(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the 64-bit lanes.
 * All elements are treated equally, no matter where they are.
 */

void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    int8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    uint8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    int16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    uint16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
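/*
 * The 8-bit forms can rely on ordinary int arithmetic: the sum of four
 * byte-by-byte products is well within the range of int32_t.  The 16-bit
 * forms widen each product to 64 bits before accumulating into the
 * 64-bit destination lanes.
 */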

void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    int8_t *n = vn;
    int8_t *m_indexed = (int8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        int8_t m0 = m_indexed[i * 4 + 0];
        int8_t m1 = m_indexed[i * 4 + 1];
        int8_t m2 = m_indexed[i * 4 + 2];
        int8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    uint8_t *n = vn;
    uint8_t *m_indexed = (uint8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        uint8_t m0 = m_indexed[i * 4 + 0];
        uint8_t m1 = m_indexed[i * 4 + 1];
        uint8_t m2 = m_indexed[i * 4 + 2];
        uint8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}
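/*
 * In the indexed byte forms above, the outer loop reloads m0..m3 from
 * the selected 32-bit group of each 128-bit segment; the inner loop then
 * covers the (up to) four 32-bit lanes of that segment.  Starting with
 * segend = MIN(4, opr_sz_4) lets the 64-bit AdvSIMD case (opr_sz == 8,
 * i.e. only two lanes) fall out of the same loop structure.
 */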

void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    int16_t *n = vn;
    int16_t *m_indexed = (int16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
        d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    uint16_t *n = vn;
    uint16_t *m_indexed = (uint16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
        d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

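/*
 * FCADD: for each complex-number pair (even lane = real, odd lane =
 * imaginary), one of the two addends taken from m is negated before the
 * add, i.e. d.real = n.real -/+ m.imag and d.imag = n.imag +/- m.real.
 * The single descriptor bit selects which of the two rotations (90 or
 * 270 degrees) is performed; negation is done by xor-ing the sign bit,
 * so NaNs and infinities pass through to the adder unchanged.
 */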
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

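/*
 * FCMLA: the two descriptor bits encode the rotation.  "flip" selects
 * whether the real or imaginary part of n is used for both products,
 * and neg_real/neg_imag select which product is negated; together these
 * cover the four rotations (0, 90, 180, 270 degrees).  The indexed
 * variants broadcast one complex element of m per 128-bit segment.
 */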
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_2OP(NAME, FUNC, TYPE)                                  \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

#undef DO_2OP

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

#define DO_3OP(NAME, FUNC, TYPE)                                            \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc)  \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc);                                   \
    TYPE *d = vd, *n = vn, *m = vm;                                         \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                            \
        d[i] = FUNC(n[i], m[i], stat);                                      \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

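/*
 * For reference, DO_3OP(gvec_fadd_h, float16_add, float16) expands to a
 * helper that loops over oprsz / 2 half-precision elements, computing
 * d[i] = float16_add(n[i], m[i], stat) and then clearing the tail --
 * the same shape as every instantiation below.
 */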
DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_s, float32_abd, float32)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H)                                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc)  \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);   \
    intptr_t idx = simd_data(desc);                                         \
    TYPE *d = vd, *n = vn, *m = vm;                                         \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                   \
        TYPE mm = m[H(i + idx)];                                            \
        for (j = 0; j < segment; j++) {                                     \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                      \
        }                                                                   \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_MUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_MUL_IDX
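/*
 * For example, gvec_fmul_idx_s with idx == 1 on a 32-byte SVE vector
 * multiplies lanes 0..3 by m[1] and lanes 4..7 by m[5]: the index is
 * re-applied within each 16-byte segment.
 */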

#define DO_FMLA_IDX(NAME, TYPE, H)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                                \
{                                                                           \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);   \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                     \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                           \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                     \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                   \
        TYPE mm = m[H(i + idx)];                                            \
        for (j = 0; j < segment; j++) {                                     \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                    \
                                     mm, a[i + j], 0, stat);                \
        }                                                                   \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX)                     \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)    \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc);                                   \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                  \
    bool q = false;                                                         \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                           \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                     \
        if (dd < MIN) {                                                     \
            dd = MIN;                                                       \
            q = true;                                                       \
        } else if (dd > MAX) {                                              \
            dd = MAX;                                                       \
            q = true;                                                       \
        }                                                                   \
        d[i] = dd;                                                          \
    }                                                                       \
    if (q) {                                                                \
        uint32_t *qc = vq;                                                  \
        qc[0] = 1;                                                          \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT
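/*
 * WTYPE is chosen wide enough to hold the unsaturated result: plain int
 * suffices for the 8- and 16-bit elements, int64_t for the 32-bit ones.
 * No such wider type exists for 64-bit elements, which is why the
 * 64-bit saturating helpers below are written out with explicit
 * overflow checks instead.
 */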

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
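/*
 * The signed overflow test above reads: overflow happened iff the
 * operands had the same sign ( ~(nn ^ mm) ) and the result's sign
 * differs from them ( dd ^ nn ).  In that case (nn >> 63) ^ ~INT64_MIN
 * yields INT64_MAX for a positive overflow and INT64_MIN for a negative
 * one.  The subtraction case below is the same with the operand-sign
 * condition inverted.
 */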

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}


#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR
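/*
 * Rounding right shift: shifting by (shift - 1) first keeps the last
 * bit shifted out as bit 0 of tmp, which is then added back as the
 * rounding increment.  E.g. srshr by 2 maps 5 -> 1 and 6 -> 2.
 */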

#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI
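/*
 * SRI and SLI are insertions rather than plain shifts: deposit64()
 * overwrites only the destination bit-field, so SRI keeps the top
 * 'shift' bits of d[i] and SLI keeps the bottom 'shift' bits, with the
 * shifted n[i] dropped into the remaining bits.
 */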

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
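/*
 * For example, f16 0x3c00 (1.0) has exp = 15, so exp becomes
 * 15 + (127 - 15) = 127 and the result is 0x3f800000, the float32
 * encoding of 1.0, without any softfloat rounding or exception state
 * being touched.
 */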

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
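/*
 * In these register-shift helpers a negative shift count means shift
 * right.  Shifts of element-width or more produce 0, except that a
 * signed right shift is clamped to (width - 1) and therefore returns
 * the sign-fill (0 or -1) rather than 0.
 */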

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        uint64_t rr = 0;

        for (j = 0; j < 8; ++j) {
            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
            rr ^= mm & mask;
            mm = (mm << 1) & 0xfefefefefefefefeull;
            nn >>= 1;
        }
        d[i] = rr;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
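/*
 * All eight byte lanes of a 64-bit chunk are processed at once: the
 * mask expands bit 0 of each byte of nn to a full byte, and the shift
 * of mm is masked so that partial products never cross a byte lane.
 * As a small check, 0x03 * 0x03 = 0x05 in this arithmetic, i.e.
 * (x + 1)^2 = x^2 + 1.
 */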

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result.  */
        if (nn & 1) {
            rlo = mm;
        }

        for (j = 1; j < 64; ++j) {
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }
        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

static uint64_t expand_byte_to_half(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}
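/*
 * e.g. expand_byte_to_half(0x11223344) == 0x0011002200330044, giving
 * each byte its own 16-bit lane so that pmull_h() below can form four
 * 8x8->16 products in parallel without them interfering.
 */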

static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;

        d[i] = pmull_h(nn, mm);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0
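/*
 * The comparison (nn OP 0) yields 0 or 1; negating it produces the
 * all-zeros or all-ones element that the architecture expects from a
 * compare-against-zero.
 */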

#define DO_ABD(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD

#define DO_ABA(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA