/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that needs a host-endian fixup.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif

/* Set the cumulative saturation (QC) flag in the FPSCR. */
#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

/* Zero the bytes of the destination between the operation size and
 * the maximum vector size, i.e. the part of the register the
 * operation did not write.
 */
static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlah_s16 above. */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above. */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the 64-bit lanes.
 * All elements are treated equally, no matter where they are.
 */

void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    int8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    uint8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    int16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    uint16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    int8_t *n = vn;
    int8_t *m_indexed = (int8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        int8_t m0 = m_indexed[i * 4 + 0];
        int8_t m1 = m_indexed[i * 4 + 1];
        int8_t m2 = m_indexed[i * 4 + 2];
        int8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    uint8_t *n = vn;
    uint8_t *m_indexed = (uint8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        uint8_t m0 = m_indexed[i * 4 + 0];
        uint8_t m1 = m_indexed[i * 4 + 1];
        uint8_t m2 = m_indexed[i * 4 + 2];
        uint8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    int16_t *n = vn;
    int16_t *m_indexed = (int16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    uint16_t *n = vn;
    uint16_t *m_indexed = (uint16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

#undef DO_2OP

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                     \
        }                                                                  \
    }                                                                      \
}

DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_MUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_MUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX
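
/* Design note (our gloss, not part of the original source): op1_neg
 * holds the sign bit of TYPE when the low descriptor bit is set, and
 * xor-ing it into n[i + j] flips the sign of the first operand, so
 * the same macro body serves both multiply-accumulate (bit clear) and
 * multiply-subtract (bit set) without a separate subtraction path.
 */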