/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that needs a host-endian fixup. */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
#else
#define H1(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif
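
/*
 * Illustrative example (editorial addition, not used by the code):
 * on a big-endian host the four uint16_t lanes of a 64-bit chunk are
 * stored in reverse order, so H2() flips the low index bits:
 *     H2(0) == 3, H2(1) == 2, H2(2) == 1, H2(3) == 0
 * On a little-endian host H2() is the identity, so m[H2(i)] names the
 * same architectural element regardless of host byte order.
 */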

static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static int16_t inl_qrdmlah_s16(int16_t src1, int16_t src2,
                               int16_t src3, uint32_t *sat)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
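
/*
 * Worked example of the simplification above (editorial addition):
 * with src1 = src2 = 0x4000 (0.5 in Q15) and src3 = 0x2000 (0.25),
 * ret = 0x10000000, then ret = 0x10000000 + 0x10000000 + 0x4000,
 * and ret >> 15 = 0x4000, i.e. 0.25 + 0.5 * 0.5 = 0.5 with no
 * saturation.  With src1 = src2 = 0x8000 (-1.0) and src3 = 0, the
 * doubled product alone is +1.0, which overflows Q15, so the result
 * is clamped to 0x7fff and *sat is set.
 */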

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = inl_qrdmlah_s16(src1, src2, src3, sat);
    uint16_t e2 = inl_qrdmlah_s16(src1 >> 16, src2 >> 16, src3 >> 16, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static int16_t inl_qrdmlsh_s16(int16_t src1, int16_t src2,
                               int16_t src3, uint32_t *sat)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = inl_qrdmlsh_s16(src1, src2, src3, sat);
    uint16_t e2 = inl_qrdmlsh_s16(src1 >> 16, src2 >> 16, src3 >> 16, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
static int32_t inl_qrdmlah_s32(int32_t src1, int32_t src2,
                               int32_t src3, uint32_t *sat)
{
    /* Simplify similarly to inl_qrdmlah_s16 above. */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return inl_qrdmlah_s32(src1, src2, src3, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = inl_qrdmlah_s32(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
static int32_t inl_qrdmlsh_s32(int32_t src1, int32_t src2,
                               int32_t src3, uint32_t *sat)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above. */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return inl_qrdmlsh_s32(src1, src2, src3, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = inl_qrdmlsh_s32(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the 64-bit lanes.
 * All elements are treated equally, no matter where they are.
 */
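
/*
 * Worked example (editorial addition): for gvec_sdot_b below, each
 * 32-bit lane of the destination accumulates the dot product of the
 * four signed bytes occupying the same lane of the sources, e.g. with
 * n = {1, 2, 3, 4} and m = {5, 6, 7, 8} in lane 0:
 *     d[0] += 1*5 + 2*6 + 3*7 + 4*8 = 70
 * The unsigned variants differ only in zero- vs sign-extension.
 */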

void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    int8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    uint8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    int16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    uint16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    int8_t *n = vn;
    int8_t *m_indexed = (int8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        int8_t m0 = m_indexed[i * 4 + 0];
        int8_t m1 = m_indexed[i * 4 + 1];
        int8_t m2 = m_indexed[i * 4 + 2];
        int8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    uint8_t *n = vn;
    uint8_t *m_indexed = (uint8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        uint8_t m0 = m_indexed[i * 4 + 0];
        uint8_t m1 = m_indexed[i * 4 + 1];
        uint8_t m2 = m_indexed[i * 4 + 2];
        uint8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    int16_t *n = vn;
    int16_t *m_indexed = (int16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    uint16_t *n = vn;
    uint16_t *m_indexed = (uint16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
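
/*
 * Illustrative note (editorial addition): FCADD negates one operand of
 * each complex pair simply by xoring its sign bit.  For example, when
 * neg_imag is set and neg_real is clear, the computation above is
 *     d.real = n.real - m.imag
 *     d.imag = n.imag + m.real
 * which is the #90 rotation, i.e. adding i * m to n.
 */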

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_2OP(NAME, FUNC, TYPE)                                  \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

#undef DO_2OP

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

#define DO_3OP(NAME, FUNC, TYPE)                                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_s, float32_abd, float32)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
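
/*
 * Worked example (editorial addition, assuming a 256-bit SVE vector):
 * for gvec_fmul_idx_s, segment == 4, so with idx == 1 the first four
 * float32 lanes are multiplied by m[1] and the next four by m[5]:
 * the index selects the same position within each 128-bit segment
 * rather than a single element of the whole vector.
 */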

#define DO_MUL_IDX(NAME, TYPE, H)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                     \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_MUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_MUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX)                    \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
    bool q = false;                                                        \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
        if (dd < MIN) {                                                    \
            dd = MIN;                                                      \
            q = true;                                                      \
        } else if (dd > MAX) {                                             \
            dd = MAX;                                                      \
            q = true;                                                      \
        }                                                                  \
        d[i] = dd;                                                         \
    }                                                                      \
    if (q) {                                                               \
        uint32_t *qc = vq;                                                 \
        qc[0] = 1;                                                         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT
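
/*
 * Note (editorial addition): DO_SAT works by doing the arithmetic in a
 * wider type (WTYPE) and range-checking the result, so the 64-bit
 * variants below are written out by hand with explicit carry and
 * sign-overflow tests instead, since no wider integer type is used.
 */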

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
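
/*
 * Note (editorial addition): signed overflow of nn + mm can only occur
 * when both operands have the same sign and the sum's sign differs,
 * which is exactly what ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN tests.
 * On overflow, (nn >> 63) ^ ~INT64_MIN yields INT64_MAX for positive
 * operands and INT64_MIN for negative ones.  The subtraction case
 * below uses the same trick with the operand sign test inverted.
 */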

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}


#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}
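
/*
 * Worked example (editorial addition): the two-step shift implements a
 * rounding shift right without a wider intermediate.  For int8_t n = 7
 * and shift = 2:
 *     tmp = 7 >> 1 = 3;  d = (3 >> 1) + (3 & 1) = 2
 * which matches (7 + 2) >> 2; the half-bit is added back in after the
 * shift instead of before it, so the rounding addition can never
 * overflow the element type.
 */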

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR

#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE)                                                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                        \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc);                                   \
    int shift = simd_data(desc);                                            \
    TYPE *d = vd, *n = vn;                                                  \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                            \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]);  \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
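
/*
 * Worked example (editorial addition): float16 1.0 is 0x3c00 (sign 0,
 * exp 15, frac 0).  Rebasing the exponent by f32_bias - f16_bias = 112
 * gives exp 127, and the fraction moves up by 13 bits, producing
 * 0x3f800000, float32 1.0.  NaN payloads are shifted the same way, so
 * an f16 SNaN remains an SNaN rather than being quieted.
 */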

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once. */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once. */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        uint64_t rr = 0;

        for (j = 0; j < 8; ++j) {
            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
            rr ^= mm & mask;
            mm = (mm << 1) & 0xfefefefefefefefeull;
            nn >>= 1;
        }
        d[i] = rr;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
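
/*
 * Worked example (editorial addition): carry-less multiplication of
 * 0x03 by 0x03 XORs the partial products 0x03 and 0x06, giving 0x05,
 * where integer multiplication would give 0x09.  Each iteration of the
 * inner loop above handles one bit of nn for all eight byte lanes of
 * the 64-bit chunk at once.
 */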

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result. */
        if (nn & 1) {
            rlo = mm;
        }

        for (j = 1; j < 64; ++j) {
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }
        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

static uint64_t expand_byte_to_half(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}
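
/*
 * Worked example (editorial addition):
 *     expand_byte_to_half(0xaabbccdd) == 0x00aa00bb00cc00dd
 * Each input byte becomes the low half of a 16-bit lane, leaving room
 * for the 8x8->16 products computed by pmull_h() below.
 */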

static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;

        d[i] = pmull_h(nn, mm);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0

#define DO_ABD(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD

#define DO_ABA(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA