4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that needs a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
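/* Illustrative note on the fixup: within one 64-bit chunk, H1 converts a
 * little-endian byte index into the corresponding big-endian host index,
 * e.g. H1(0) == 7, H1(1) == 6, ..., H1(7) == 0, and likewise
 * H1_2(0) == 6, H1_2(2) == 4 for 16-bit units.  On little-endian hosts
 * the corresponding definitions are simply the identity.
 */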
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
55 /* This is an iterative function, called for each Pd and Pg word
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
64 flags |= ((d & (g & -g)) != 0) << 31;
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
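/* Worked example: for a single word with D = 0b0101 and G = 0b1111,
 * iter_predtest_fwd(D, G, PREDTEST_INIT) returns 0x80000007 -- bit 31 (N)
 * set because the first active element is true, bit 1 set because some
 * active element is true (Z clear), bit 0 (C) set because the last active
 * element is false, and bit 2 merely recording that a G bit was seen.
 */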
77 /* This is an iterative function, called for each Pd and Pg word
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
83 /* Compute C from first (i.e last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
93 /* Compute N from last (i.e first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
127 * printf("0x%016lx,\n", m);
130 static inline uint64_t expand_pred_b(uint8_t byte)
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 static inline uint64_t expand_pred_h(uint8_t byte)
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
249 return word[byte & 0x55];
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
260 return word[byte & 0x11];
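/* Illustrative example: for halfword and word elements only every second
 * or fourth predicate bit is significant, hence the "& 0x55" and "& 0x11"
 * masks above.  A predicate byte of 0x17 masks to 0x15 and expands to
 * 0x0000ffffffffffff (halfword elements 0, 1 and 2 active), while
 * expand_pred_s(0x13) masks to 0x11 and expands to 0xffffffffffffffff.
 */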
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
272 uint64_t m = 0x0000ffff0000ffffull;
274 return ((h & m) << 16) | ((h >> 16) & m);
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
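/* Illustrative examples: hswap32(0x01234567) == 0x45670123,
 * hswap64(0x0123456789abcdef) == 0xcdef89ab45670123 and
 * wswap64(0x0123456789abcdef) == 0x89abcdef01234567.
 */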
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
375 * We must avoid the C undefined behaviour cases: division by
376 * zero and signed division of INT_MIN by -1. Both of these
377 * have architecturally defined required results for Arm.
378 * We special case all signed divisions by -1 to avoid having
379 * to deduce the minimum integer for the type involved.
381 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
382 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
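/* For example, SDIV of the minimum signed integer by -1 is architecturally
 * required to return the minimum integer again, and any division by zero
 * returns 0.  Special-casing M == -1 also means the macro never evaluates
 * INT_MIN / -1, the one quotient C leaves undefined.
 */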
384 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
385 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
386 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
387 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
389 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
390 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
391 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
392 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
394 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
395 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
396 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
397 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
399 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
400 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
401 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
402 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
404 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
405 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
406 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
407 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
409 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
410 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
411 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
412 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
414 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
415 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
416 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
417 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
419 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
420 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
422 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
424 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
425 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
426 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
427 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
429 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
430 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
432 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
434 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
435 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
436 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
437 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
439 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
440 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
442 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
444 /* Because the computation type is at least twice as large as required,
445 these work for both signed and unsigned source types. */
446 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
451 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
453 return (n * m) >> 16;
456 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
458 return (n * m) >> 32;
461 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
464 muls64(&lo, &hi, n, m);
468 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
471 mulu64(&lo, &hi, n, m);
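/* Worked example: in the byte-sized case both operands are widened to
 * int32_t, so for the signed helper -1 * -1 gives 1 and (1 >> 8) == 0,
 * while the unsigned helper sees 255 * 255 == 65025 and yields 254 --
 * each the correct high half, which is why one helper serves both
 * signednesses.
 */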
475 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
476 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
477 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
478 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
480 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
481 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
482 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
483 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
485 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
486 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
487 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
488 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
490 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
491 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
493 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
494 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
496 /* Note that all bits of the shift are significant
497 and not modulo the element size. */
498 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
499 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
500 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
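/* For example, DO_LSR on a uint8_t element with a shift count of 8 or
 * more yields 0 rather than the count-modulo-width result a bare C shift
 * could give, and DO_ASR clamps the count to 7 so that over-shifting a
 * negative int8_t still produces the all-ones sign fill.
 */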
502 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
503 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
504 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
506 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
507 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
508 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
510 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
511 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
512 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
514 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
515 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
516 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
521 /* Three-operand expander, controlled by a predicate, in which the
522 * third operand is "wide". That is, for D = N op M, the same 64-bit
523 * value of M is used with all of the narrower values of N.
525 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
526 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
528 intptr_t i, opr_sz = simd_oprsz(desc); \
529 for (i = 0; i < opr_sz; ) { \
530 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
531 TYPEW mm = *(TYPEW *)(vm + i); \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
542 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
543 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
544 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
546 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
547 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
548 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
550 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
551 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
552 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
556 /* Fully general two-operand expander, controlled by a predicate.
558 #define DO_ZPZ(NAME, TYPE, H, OP) \
559 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
561 intptr_t i, opr_sz = simd_oprsz(desc); \
562 for (i = 0; i < opr_sz; ) { \
563 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
566 TYPE nn = *(TYPE *)(vn + H(i)); \
567 *(TYPE *)(vd + H(i)) = OP(nn); \
569 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
574 /* Similarly, specialized for 64-bit operands. */
575 #define DO_ZPZ_D(NAME, TYPE, OP) \
576 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
578 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
579 TYPE *d = vd, *n = vn; \
581 for (i = 0; i < opr_sz; i += 1) { \
582 if (pg[H1(i)] & 1) { \
589 #define DO_CLS_B(N) (clrsb32(N) - 24)
590 #define DO_CLS_H(N) (clrsb32(N) - 16)
592 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
593 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
594 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
595 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
597 #define DO_CLZ_B(N) (clz32(N) - 24)
598 #define DO_CLZ_H(N) (clz32(N) - 16)
600 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
601 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
602 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
603 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
605 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
606 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
607 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
608 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
610 #define DO_CNOT(N) (N == 0)
612 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
613 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
614 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
615 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
617 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
619 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
620 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
621 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
623 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
625 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
626 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
627 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
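/* These operate on the raw bit pattern rather than going through
 * softfloat: ((__typeof(N))-1 >> 1) is the all-ones value shifted right
 * once, i.e. everything but the sign bit (0x7fff for uint16_t,
 * 0x7fffffff for uint32_t), so DO_FABS clears the sign bit and DO_FNEG
 * toggles it.
 */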
629 #define DO_NOT(N) (~N)
631 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
632 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
633 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
634 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
636 #define DO_SXTB(N) ((int8_t)N)
637 #define DO_SXTH(N) ((int16_t)N)
638 #define DO_SXTS(N) ((int32_t)N)
639 #define DO_UXTB(N) ((uint8_t)N)
640 #define DO_UXTH(N) ((uint16_t)N)
641 #define DO_UXTS(N) ((uint32_t)N)
643 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
644 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
645 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
646 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
647 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
648 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
650 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
651 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
652 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
653 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
654 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
655 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
657 #define DO_ABS(N) (N < 0 ? -N : N)
659 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
660 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
661 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
662 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
664 #define DO_NEG(N) (-N)
666 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
667 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
668 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
669 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
671 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
672 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
673 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
675 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
676 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
678 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
680 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
681 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
682 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
683 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
685 /* Three-operand expander, unpredicated, in which the third operand is "wide".
687 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
688 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
690 intptr_t i, opr_sz = simd_oprsz(desc); \
691 for (i = 0; i < opr_sz; ) { \
692 TYPEW mm = *(TYPEW *)(vm + i); \
694 TYPE nn = *(TYPE *)(vn + H(i)); \
695 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
701 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
702 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
703 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
705 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
706 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
707 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
709 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
710 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
711 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
727 /* Two-operand reduction expander, controlled by a predicate.
728 * The difference between TYPERED and TYPERET has to do with
729 * sign-extension. E.g. for SMAX, TYPERED must be signed,
730 * but TYPERET must be unsigned so that e.g. a 32-bit value
731 * is not sign-extended to the ABI uint64_t return type.
733 /* ??? If we were to vectorize this by hand the reduction ordering
734 * would change. For integer operands, this is perfectly fine.
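/* Concrete example of the TYPERED/TYPERET distinction: sve_smaxv_s below
 * reduces in int32_t so that the comparison is signed, but returns
 * through uint32_t; a result of -1 therefore reaches the caller as
 * 0x00000000ffffffff rather than being sign-extended across the uint64_t
 * ABI return value.
 */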
736 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
737 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
739 intptr_t i, opr_sz = simd_oprsz(desc); \
740 TYPERED ret = INIT; \
741 for (i = 0; i < opr_sz; ) { \
742 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
745 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
748 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
751 return (TYPERET)ret; \
754 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
755 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
757 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
761 for (i = 0; i < opr_sz; i += 1) { \
762 if (pg[H1(i)] & 1) { \
770 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
771 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
772 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
773 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
775 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
776 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
777 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
778 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
780 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
781 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
782 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
783 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
785 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
786 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
787 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
789 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
790 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
792 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
794 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
795 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
796 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
797 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
799 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
800 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
801 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
802 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
804 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
805 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
806 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
807 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
809 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
810 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
811 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
812 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
817 /* Two vector operand, one scalar operand, unpredicated. */
818 #define DO_ZZI(NAME, TYPE, OP) \
819 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
821 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
822 TYPE s = s64, *d = vd, *n = vn; \
823 for (i = 0; i < opr_sz; ++i) { \
824 d[i] = OP(n[i], s); \
828 #define DO_SUBR(X, Y) (Y - X)
830 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
831 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
832 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
833 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
835 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
836 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
837 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
838 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
840 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
841 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
842 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
843 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
845 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
846 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
847 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
848 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
850 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
851 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
852 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
853 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
873 /* Similar to the ARM LastActiveElement pseudocode function, except the
874 result is multiplied by the element size. This includes the not found
875 indication; e.g. not found for esz=3 is -8. */
876 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
878 uint64_t mask = pred_esz_masks[esz];
882 uint64_t this_g = g[--i] & mask;
884 return i * 64 + (63 - clz64(this_g));
887 return (intptr_t)-1 << esz;
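/* Worked example: with esz = 2 (word elements) and the last active
 * element at index 5, the return value is 5 << 2 == 20, the bit offset of
 * that element within the predicate; an all-false predicate returns
 * -1 << 2 == -4, the "not found" indication scaled the same way.
 */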
890 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
892 uint32_t flags = PREDTEST_INIT;
893 uint64_t *d = vd, *g = vg;
897 uint64_t this_d = d[i];
898 uint64_t this_g = g[i];
902 /* Set in D the first bit of G. */
903 this_d |= this_g & -this_g;
906 flags = iter_predtest_fwd(this_d, this_g, flags);
908 } while (++i < words);
913 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
915 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
916 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
917 uint32_t flags = PREDTEST_INIT;
918 uint64_t *d = vd, *g = vg, esz_mask;
921 next = last_active_element(vd, words, esz) + (1 << esz);
922 esz_mask = pred_esz_masks[esz];
924 /* Similar to the pseudocode for pnext, but scaled by ESZ
925 so that we find the correct bit. */
926 if (next < words * 64) {
930 mask = ~((1ull << (next & 63)) - 1);
934 uint64_t this_g = g[next / 64] & esz_mask & mask;
936 next = (next & -64) + ctz64(this_g);
941 } while (next < words * 64);
947 if (i == next / 64) {
948 this_d = 1ull << (next & 63);
951 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
952 } while (++i < words);
957 /* Store zero into every active element of Zd. We will use this for two
958 * and three-operand predicated instructions for which logic dictates a
959 * zero result. In particular, logical shift by element size, which is
960 * otherwise undefined on the host.
962 * For element sizes smaller than uint64_t, we use tables to expand
963 * the N bits of the controlling predicate to a byte mask, and clear
966 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_b(pg[H1(i)]);
976 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_h(pg[H1(i)]);
986 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
991 for (i = 0; i < opr_sz; i += 1) {
992 d[i] &= ~expand_pred_s(pg[H1(i)]);
996 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
998 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1001 for (i = 0; i < opr_sz; i += 1) {
1002 if (pg[H1(i)] & 1) {
1008 /* Copy Zn into Zd, and store zero into inactive elements. */
1009 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1012 uint64_t *d = vd, *n = vn;
1014 for (i = 0; i < opr_sz; i += 1) {
1015 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1019 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1022 uint64_t *d = vd, *n = vn;
1024 for (i = 0; i < opr_sz; i += 1) {
1025 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1029 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1031 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1032 uint64_t *d = vd, *n = vn;
1034 for (i = 0; i < opr_sz; i += 1) {
1035 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1039 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1041 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1042 uint64_t *d = vd, *n = vn;
1044 for (i = 0; i < opr_sz; i += 1) {
1045 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1049 /* Three-operand expander, immediate operand, controlled by a predicate.
1051 #define DO_ZPZI(NAME, TYPE, H, OP) \
1052 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1054 intptr_t i, opr_sz = simd_oprsz(desc); \
1055 TYPE imm = simd_data(desc); \
1056 for (i = 0; i < opr_sz; ) { \
1057 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1060 TYPE nn = *(TYPE *)(vn + H(i)); \
1061 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1063 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1068 /* Similarly, specialized for 64-bit operands. */
1069 #define DO_ZPZI_D(NAME, TYPE, OP) \
1070 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1072 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1073 TYPE *d = vd, *n = vn; \
1074 TYPE imm = simd_data(desc); \
1076 for (i = 0; i < opr_sz; i += 1) { \
1077 if (pg[H1(i)] & 1) { \
1079 d[i] = OP(nn, imm); \
1084 #define DO_SHR(N, M) (N >> M)
1085 #define DO_SHL(N, M) (N << M)
1087 /* Arithmetic shift right for division. This rounds negative numbers
1088 toward zero as per signed division. Therefore before shifting,
1089 when N is negative, add 2**M-1. */
1090 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
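/* Worked example: with N = -7 and M = 2 the bias (1 << 2) - 1 == 3 is
 * added first, so the result is -4 >> 2 == -1, matching -7 / 4 truncated
 * toward zero; a plain arithmetic shift would have given -2.
 */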
1092 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1093 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1094 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1095 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1097 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1098 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1099 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1100 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1102 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1103 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1104 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1105 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1107 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1108 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1109 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1110 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1118 /* Fully general four-operand expander, controlled by a predicate.
1120 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1121 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1122 void *vg, uint32_t desc) \
1124 intptr_t i, opr_sz = simd_oprsz(desc); \
1125 for (i = 0; i < opr_sz; ) { \
1126 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1129 TYPE nn = *(TYPE *)(vn + H(i)); \
1130 TYPE mm = *(TYPE *)(vm + H(i)); \
1131 TYPE aa = *(TYPE *)(va + H(i)); \
1132 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1134 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1139 /* Similarly, specialized for 64-bit operands. */
1140 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1141 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1142 void *vg, uint32_t desc) \
1144 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1145 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1147 for (i = 0; i < opr_sz; i += 1) { \
1148 if (pg[H1(i)] & 1) { \
1149 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1150 d[i] = OP(aa, nn, mm); \
1155 #define DO_MLA(A, N, M) (A + N * M)
1156 #define DO_MLS(A, N, M) (A - N * M)
1158 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1159 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1161 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1162 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1164 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1165 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1167 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1168 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1175 void HELPER(sve_index_b)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1178 intptr_t i, opr_sz = simd_oprsz(desc);
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H1(i)] = start + i * incr;
1185 void HELPER(sve_index_h)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H2(i)] = start + i * incr;
1195 void HELPER(sve_index_s)(void *vd, uint32_t start,
1196 uint32_t incr, uint32_t desc)
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[H4(i)] = start + i * incr;
1205 void HELPER(sve_index_d)(void *vd, uint64_t start,
1206 uint64_t incr, uint32_t desc)
1208 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = start + i * incr;
1215 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218 uint32_t sh = simd_data(desc);
1219 uint32_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1225 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + (m[i] << sh);
1235 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1245 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248 uint64_t sh = simd_data(desc);
1249 uint64_t *d = vd, *n = vn, *m = vm;
1250 for (i = 0; i < opr_sz; i += 1) {
1251 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1255 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1257 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1258 static const uint16_t coeff[] = {
1259 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1260 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1261 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1262 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1264 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1265 uint16_t *d = vd, *n = vn;
1267 for (i = 0; i < opr_sz; i++) {
1269 intptr_t idx = extract32(nn, 0, 5);
1270 uint16_t exp = extract32(nn, 5, 5);
1271 d[i] = coeff[idx] | (exp << 10);
1275 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1277 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1278 static const uint32_t coeff[] = {
1279 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1280 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1281 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1282 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1283 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1284 0x1ef532, 0x20b051, 0x227043, 0x243516,
1285 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1286 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1287 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1288 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1289 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1290 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1291 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1292 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1293 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1294 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1297 uint32_t *d = vd, *n = vn;
1299 for (i = 0; i < opr_sz; i++) {
1301 intptr_t idx = extract32(nn, 0, 6);
1302 uint32_t exp = extract32(nn, 6, 8);
1303 d[i] = coeff[idx] | (exp << 23);
1307 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1309 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1310 static const uint64_t coeff[] = {
1311 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1312 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1313 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1314 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1315 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1316 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1317 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1318 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1319 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1320 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1321 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1322 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1323 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1324 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1325 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1326 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1327 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1328 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1329 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1330 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1331 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1334 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1335 uint64_t *d = vd, *n = vn;
1337 for (i = 0; i < opr_sz; i++) {
1339 intptr_t idx = extract32(nn, 0, 6);
1340 uint64_t exp = extract32(nn, 6, 11);
1341 d[i] = coeff[idx] | (exp << 52);
1345 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1347 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1348 uint16_t *d = vd, *n = vn, *m = vm;
1349 for (i = 0; i < opr_sz; i += 1) {
1355 d[i] = nn ^ (mm & 2) << 14;
1359 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1362 uint32_t *d = vd, *n = vn, *m = vm;
1363 for (i = 0; i < opr_sz; i += 1) {
1369 d[i] = nn ^ (mm & 2) << 30;
1373 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1375 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1376 uint64_t *d = vd, *n = vn, *m = vm;
1377 for (i = 0; i < opr_sz; i += 1) {
1383 d[i] = nn ^ (mm & 2) << 62;
1388 * Signed saturating addition with scalar operand.
1391 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1393 intptr_t i, oprsz = simd_oprsz(desc);
1395 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396 int r = *(int8_t *)(a + i) + b;
1399 } else if (r < INT8_MIN) {
1402 *(int8_t *)(d + i) = r;
1406 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1408 intptr_t i, oprsz = simd_oprsz(desc);
1410 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411 int r = *(int16_t *)(a + i) + b;
1412 if (r > INT16_MAX) {
1414 } else if (r < INT16_MIN) {
1417 *(int16_t *)(d + i) = r;
1421 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1423 intptr_t i, oprsz = simd_oprsz(desc);
1425 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426 int64_t r = *(int32_t *)(a + i) + b;
1427 if (r > INT32_MAX) {
1429 } else if (r < INT32_MIN) {
1432 *(int32_t *)(d + i) = r;
1436 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1438 intptr_t i, oprsz = simd_oprsz(desc);
1440 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1441 int64_t ai = *(int64_t *)(a + i);
1443 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1444 /* Signed overflow. */
1445 r = (r < 0 ? INT64_MAX : INT64_MIN);
1447 *(int64_t *)(d + i) = r;
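/* The overflow test above is the usual two's-complement identity: signed
 * overflow occurred iff the addends have the same sign and the result's
 * sign differs, i.e. (r ^ ai) is negative while (ai ^ b) is not.  For
 * example ai == b == INT64_MAX wraps to a negative r and saturates to
 * INT64_MAX.
 */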
1452 * Unsigned saturating addition with scalar operand.
1455 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1457 intptr_t i, oprsz = simd_oprsz(desc);
1459 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460 int r = *(uint8_t *)(a + i) + b;
1461 if (r > UINT8_MAX) {
1466 *(uint8_t *)(d + i) = r;
1470 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1472 intptr_t i, oprsz = simd_oprsz(desc);
1474 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475 int r = *(uint16_t *)(a + i) + b;
1476 if (r > UINT16_MAX) {
1481 *(uint16_t *)(d + i) = r;
1485 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1487 intptr_t i, oprsz = simd_oprsz(desc);
1489 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490 int64_t r = *(uint32_t *)(a + i) + b;
1491 if (r > UINT32_MAX) {
1496 *(uint32_t *)(d + i) = r;
1500 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1502 intptr_t i, oprsz = simd_oprsz(desc);
1504 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1505 uint64_t r = *(uint64_t *)(a + i) + b;
1509 *(uint64_t *)(d + i) = r;
1513 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1515 intptr_t i, oprsz = simd_oprsz(desc);
1517 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518 uint64_t ai = *(uint64_t *)(a + i);
1519 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1523 /* Two operand predicated copy immediate with merge. All valid immediates
1524 * can fit within 17 signed bits in the simd_data field.
1526 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1527 uint64_t mm, uint32_t desc)
1529 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1530 uint64_t *d = vd, *n = vn;
1533 mm = dup_const(MO_8, mm);
1534 for (i = 0; i < opr_sz; i += 1) {
1536 uint64_t pp = expand_pred_b(pg[H1(i)]);
1537 d[i] = (mm & pp) | (nn & ~pp);
1541 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1542 uint64_t mm, uint32_t desc)
1544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1545 uint64_t *d = vd, *n = vn;
1548 mm = dup_const(MO_16, mm);
1549 for (i = 0; i < opr_sz; i += 1) {
1551 uint64_t pp = expand_pred_h(pg[H1(i)]);
1552 d[i] = (mm & pp) | (nn & ~pp);
1556 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1557 uint64_t mm, uint32_t desc)
1559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560 uint64_t *d = vd, *n = vn;
1563 mm = dup_const(MO_32, mm);
1564 for (i = 0; i < opr_sz; i += 1) {
1566 uint64_t pp = expand_pred_s(pg[H1(i)]);
1567 d[i] = (mm & pp) | (nn & ~pp);
1571 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572 uint64_t mm, uint32_t desc)
1574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575 uint64_t *d = vd, *n = vn;
1578 for (i = 0; i < opr_sz; i += 1) {
1580 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1584 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1590 val = dup_const(MO_8, val);
1591 for (i = 0; i < opr_sz; i += 1) {
1592 d[i] = val & expand_pred_b(pg[H1(i)]);
1596 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1598 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1602 val = dup_const(MO_16, val);
1603 for (i = 0; i < opr_sz; i += 1) {
1604 d[i] = val & expand_pred_h(pg[H1(i)]);
1608 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1610 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1614 val = dup_const(MO_32, val);
1615 for (i = 0; i < opr_sz; i += 1) {
1616 d[i] = val & expand_pred_s(pg[H1(i)]);
1620 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1626 for (i = 0; i < opr_sz; i += 1) {
1627 d[i] = (pg[H1(i)] & 1 ? val : 0);
1631 /* Big-endian hosts need to frob the byte indices. If the copy
1632 * happens to be 8-byte aligned, then no frobbing necessary.
1634 static void swap_memmove(void *vd, void *vs, size_t n)
1636 uintptr_t d = (uintptr_t)vd;
1637 uintptr_t s = (uintptr_t)vs;
1638 uintptr_t o = (d | s | n) & 7;
1641 #ifndef HOST_WORDS_BIGENDIAN
1650 if (d < s || d >= s + n) {
1651 for (i = 0; i < n; i += 4) {
1652 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1655 for (i = n; i > 0; ) {
1657 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1664 if (d < s || d >= s + n) {
1665 for (i = 0; i < n; i += 2) {
1666 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1669 for (i = n; i > 0; ) {
1671 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1677 if (d < s || d >= s + n) {
1678 for (i = 0; i < n; i++) {
1679 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1682 for (i = n; i > 0; ) {
1684 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1691 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1693 intptr_t opr_sz = simd_oprsz(desc);
1694 size_t n_ofs = simd_data(desc);
1695 size_t n_siz = opr_sz - n_ofs;
1698 swap_memmove(vd, vn + n_ofs, n_siz);
1699 swap_memmove(vd + n_siz, vm, n_ofs);
1700 } else if (vd != vn) {
1701 swap_memmove(vd + n_siz, vd, n_ofs);
1702 swap_memmove(vd, vn + n_ofs, n_siz);
1704 /* vd == vn == vm. Need temp space. */
1706 swap_memmove(&tmp, vm, n_ofs);
1707 swap_memmove(vd, vd + n_ofs, n_siz);
1708 memcpy(vd + n_siz, &tmp, n_ofs);
1712 #define DO_INSR(NAME, TYPE, H) \
1713 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1715 intptr_t opr_sz = simd_oprsz(desc); \
1716 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1717 *(TYPE *)(vd + H(0)) = val; \
1720 DO_INSR(sve_insr_b, uint8_t, H1)
1721 DO_INSR(sve_insr_h, uint16_t, H1_2)
1722 DO_INSR(sve_insr_s, uint32_t, H1_4)
1723 DO_INSR(sve_insr_d, uint64_t, )
1727 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1729 intptr_t i, j, opr_sz = simd_oprsz(desc);
1730 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1731 uint64_t f = *(uint64_t *)(vn + i);
1732 uint64_t b = *(uint64_t *)(vn + j);
1733 *(uint64_t *)(vd + i) = bswap64(b);
1734 *(uint64_t *)(vd + j) = bswap64(f);
1738 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1740 intptr_t i, j, opr_sz = simd_oprsz(desc);
1741 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1742 uint64_t f = *(uint64_t *)(vn + i);
1743 uint64_t b = *(uint64_t *)(vn + j);
1744 *(uint64_t *)(vd + i) = hswap64(b);
1745 *(uint64_t *)(vd + j) = hswap64(f);
1749 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1751 intptr_t i, j, opr_sz = simd_oprsz(desc);
1752 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1753 uint64_t f = *(uint64_t *)(vn + i);
1754 uint64_t b = *(uint64_t *)(vn + j);
1755 *(uint64_t *)(vd + i) = rol64(b, 32);
1756 *(uint64_t *)(vd + j) = rol64(f, 32);
1760 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1762 intptr_t i, j, opr_sz = simd_oprsz(desc);
1763 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1764 uint64_t f = *(uint64_t *)(vn + i);
1765 uint64_t b = *(uint64_t *)(vn + j);
1766 *(uint64_t *)(vd + i) = b;
1767 *(uint64_t *)(vd + j) = f;
1771 #define DO_TBL(NAME, TYPE, H) \
1772 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1774 intptr_t i, opr_sz = simd_oprsz(desc); \
1775 uintptr_t elem = opr_sz / sizeof(TYPE); \
1776 TYPE *d = vd, *n = vn, *m = vm; \
1778 if (unlikely(vd == vn)) { \
1779 n = memcpy(&tmp, vn, opr_sz); \
1781 for (i = 0; i < elem; i++) { \
1783 d[H(i)] = j < elem ? n[H(j)] : 0; \
1787 DO_TBL(sve_tbl_b, uint8_t, H1)
1788 DO_TBL(sve_tbl_h, uint16_t, H2)
1789 DO_TBL(sve_tbl_s, uint32_t, H4)
1790 DO_TBL(sve_tbl_d, uint64_t, )
1794 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1795 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1797 intptr_t i, opr_sz = simd_oprsz(desc); \
1801 if (unlikely(vn - vd < opr_sz)) { \
1802 n = memcpy(&tmp, n, opr_sz / 2); \
1804 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1805 d[HD(i)] = n[HS(i)]; \
1809 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1810 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1811 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1813 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1814 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1815 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1819 /* Mask of bits included in the even numbered predicates of width esz.
1820 * We also use this for expand_bits/compress_bits, and so extend the
1821 * same pattern out to 16-bit units.
1823 static const uint64_t even_bit_esz_masks[5] = {
1824 0x5555555555555555ull,
1825 0x3333333333333333ull,
1826 0x0f0f0f0f0f0f0f0full,
1827 0x00ff00ff00ff00ffull,
1828 0x0000ffff0000ffffull,
1831 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1832 * For N==0, this corresponds to the operation that in qemu/bitops.h
1833 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1834 * section 7-2 Shuffling Bits.
1836 static uint64_t expand_bits(uint64_t x, int n)
1841 for (i = 4; i >= n; i--) {
1843 x = ((x << sh) | x) & even_bit_esz_masks[i];
1848 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1849 * For N==0, this corresponds to the operation that in qemu/bitops.h
1850 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1851 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1853 static uint64_t compress_bits(uint64_t x, int n)
1857 for (i = n; i <= 4; i++) {
1859 x &= even_bit_esz_masks[i];
1862 return x & 0xffffffffu;
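/* Illustrative example for N == 0: expand_bits(0b1011, 0) spreads each
 * bit into a 2-bit unit, giving 0x45 (0b01000101), and
 * compress_bits(0x45, 0) recovers 0b1011.  Larger N start the shuffle at
 * coarser unit sizes, matching the even_bit_esz_masks table above.
 */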
1865 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1867 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1868 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1869 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1874 uint64_t nn = *(uint64_t *)vn;
1875 uint64_t mm = *(uint64_t *)vm;
1876 int half = 4 * oprsz;
1878 nn = extract64(nn, high * half, half);
1879 mm = extract64(mm, high * half, half);
1880 nn = expand_bits(nn, esz);
1881 mm = expand_bits(mm, esz);
1882 d[0] = nn + (mm << (1 << esz));
1884 ARMPredicateReg tmp_n, tmp_m;
1886 /* We produce output faster than we consume input.
1887 Therefore we must be mindful of possible overlap. */
1888 if ((vn - vd) < (uintptr_t)oprsz) {
1889 vn = memcpy(&tmp_n, vn, oprsz);
1891 if ((vm - vd) < (uintptr_t)oprsz) {
1892 vm = memcpy(&tmp_m, vm, oprsz);
1898 if ((high & 3) == 0) {
1899 uint32_t *n = vn, *m = vm;
1902 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1903 uint64_t nn = n[H4(high + i)];
1904 uint64_t mm = m[H4(high + i)];
1906 nn = expand_bits(nn, esz);
1907 mm = expand_bits(mm, esz);
1908 d[i] = nn + (mm << (1 << esz));
1911 uint8_t *n = vn, *m = vm;
1914 for (i = 0; i < oprsz / 2; i++) {
1915 uint16_t nn = n[H1(high + i)];
1916 uint16_t mm = m[H1(high + i)];
1918 nn = expand_bits(nn, esz);
1919 mm = expand_bits(mm, esz);
1920 d16[H2(i)] = nn + (mm << (1 << esz));
1926 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1928 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1929 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1930 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1931 uint64_t *d = vd, *n = vn, *m = vm;
1936 l = compress_bits(n[0] >> odd, esz);
1937 h = compress_bits(m[0] >> odd, esz);
1938 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1940 ARMPredicateReg tmp_m;
1941 intptr_t oprsz_16 = oprsz / 16;
1943 if ((vm - vd) < (uintptr_t)oprsz) {
1944 m = memcpy(&tmp_m, vm, oprsz);
1947 for (i = 0; i < oprsz_16; i++) {
1950 l = compress_bits(l >> odd, esz);
1951 h = compress_bits(h >> odd, esz);
1952 d[i] = l + (h << 32);
1955 /* For VL which is not a power of 2, the results from M do not
1956 align nicely with the uint64_t for D. Put the aligned results
1957 from M into TMP_M and then copy it into place afterward. */
1959 d[i] = compress_bits(n[2 * i] >> odd, esz);
1961 for (i = 0; i < oprsz_16; i++) {
1964 l = compress_bits(l >> odd, esz);
1965 h = compress_bits(h >> odd, esz);
1966 tmp_m.p[i] = l + (h << 32);
1968 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1970 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1972 for (i = 0; i < oprsz_16; i++) {
1975 l = compress_bits(l >> odd, esz);
1976 h = compress_bits(h >> odd, esz);
1977 d[oprsz_16 + i] = l + (h << 32);
1983 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1985 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1986 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1987 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1988 uint64_t *d = vd, *n = vn, *m = vm;
1995 mask = even_bit_esz_masks[esz];
2002 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2003 uint64_t nn = (n[i] & mask) >> shr;
2004 uint64_t mm = (m[i] & mask) << shl;
2009 /* Reverse units of 2**N bits. */
2010 static uint64_t reverse_bits_64(uint64_t x, int n)
2015 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2016 uint64_t mask = even_bit_esz_masks[i];
2017 x = ((x & mask) << sh) | ((x >> sh) & mask);
2022 static uint8_t reverse_bits_8(uint8_t x, int n)
2024 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2027 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2028 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2033 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2035 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2036 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2037 intptr_t i, oprsz_2 = oprsz / 2;
2040 uint64_t l = *(uint64_t *)vn;
2041 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2042 *(uint64_t *)vd = l;
2043 } else if ((oprsz & 15) == 0) {
2044 for (i = 0; i < oprsz_2; i += 8) {
2045 intptr_t ih = oprsz - 8 - i;
2046 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2047 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2048 *(uint64_t *)(vd + i) = h;
2049 *(uint64_t *)(vd + ih) = l;
2052 for (i = 0; i < oprsz_2; i += 1) {
2053 intptr_t il = H1(i);
2054 intptr_t ih = H1(oprsz - 1 - i);
2055 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2056 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2057 *(uint8_t *)(vd + il) = h;
2058 *(uint8_t *)(vd + ih) = l;
2063 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2065 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2066 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2071 uint64_t nn = *(uint64_t *)vn;
2072 int half = 4 * oprsz;
2074 nn = extract64(nn, high * half, half);
2075 nn = expand_bits(nn, 0);
2078 ARMPredicateReg tmp_n;
2080 /* We produce output faster than we consume input.
2081 Therefore we must be mindful of possible overlap. */
2082 if ((vn - vd) < (uintptr_t)oprsz) {
2083 vn = memcpy(&tmp_n, vn, oprsz);
2089 if ((high & 3) == 0) {
2093 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2094 uint64_t nn = n[H4(high + i)];
2095 d[i] = expand_bits(nn, 0);
2101 for (i = 0; i < oprsz / 2; i++) {
2102 uint16_t nn = n[H1(high + i)];
2103 d16[H2(i)] = expand_bits(nn, 0);
2109 #define DO_ZIP(NAME, TYPE, H) \
2110 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2112 intptr_t oprsz = simd_oprsz(desc); \
2113 intptr_t i, oprsz_2 = oprsz / 2; \
2114 ARMVectorReg tmp_n, tmp_m; \
2115 /* We produce output faster than we consume input. \
2116 Therefore we must be mindful of possible overlap. */ \
2117 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2118 vn = memcpy(&tmp_n, vn, oprsz_2); \
2120 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2121 vm = memcpy(&tmp_m, vm, oprsz_2); \
2123 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2124 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2125 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2129 DO_ZIP(sve_zip_b, uint8_t, H1)
2130 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2131 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2132 DO_ZIP(sve_zip_d, uint64_t, )
2134 #define DO_UZP(NAME, TYPE, H) \
2135 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2137 intptr_t oprsz = simd_oprsz(desc); \
2138 intptr_t oprsz_2 = oprsz / 2; \
2139 intptr_t odd_ofs = simd_data(desc); \
2141 ARMVectorReg tmp_m; \
2142 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2143 vm = memcpy(&tmp_m, vm, oprsz); \
2145 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2146 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2148 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2149 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2153 DO_UZP(sve_uzp_b, uint8_t, H1)
2154 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2155 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2156 DO_UZP(sve_uzp_d, uint64_t, )
2158 #define DO_TRN(NAME, TYPE, H) \
2159 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2161 intptr_t oprsz = simd_oprsz(desc); \
2162 intptr_t odd_ofs = simd_data(desc); \
2164 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2165 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2166 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2167 *(TYPE *)(vd + H(i + 0)) = ae; \
2168 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2172 DO_TRN(sve_trn_b, uint8_t, H1)
2173 DO_TRN(sve_trn_h, uint16_t, H1_2)
2174 DO_TRN(sve_trn_s, uint32_t, H1_4)
2175 DO_TRN(sve_trn_d, uint64_t, )
2181 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2183 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2184 uint32_t *d = vd, *n = vn;
2187 for (i = j = 0; i < opr_sz; i++) {
2188 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2189 d[H4(j)] = n[H4(i)];
2193 for (; j < opr_sz; j++) {
2198 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2200 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2201 uint64_t *d = vd, *n = vn;
2204 for (i = j = 0; i < opr_sz; i++) {
2205 if (pg[H1(i)] & 1) {
2210 for (; j < opr_sz; j++) {
2215 /* Similar to the ARM LastActiveElement pseudocode function, except the
2216 * result is multiplied by the element size. This includes the not found
2217 * indication; e.g. not found for esz=3 is -8.
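 * For example, with esz=2 (.S elements) and the last active element at
 * index 3, the value returned is 3 << 2 = 12, the byte offset of that
 * element within the vector.
 */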
2219 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2221 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2222 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2224 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2227 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2229 intptr_t opr_sz = simd_oprsz(desc) / 8;
2230 int esz = simd_data(desc);
2231 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2232 intptr_t i, first_i, last_i;
2235 first_i = last_i = 0;
2236 first_g = last_g = 0;
2238 /* Find the extent of the active elements within VG. */
2239 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2240 pg = *(uint64_t *)(vg + i) & mask;
2253 first_i = first_i * 8 + ctz64(first_g);
2254 last_i = last_i * 8 + 63 - clz64(last_g);
2255 len = last_i - first_i + (1 << esz);
2257 vm = memcpy(&tmp, vm, opr_sz * 8);
2259 swap_memmove(vd, vn + first_i, len);
2261 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2264 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2265 void *vg, uint32_t desc)
2267 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2268 uint64_t *d = vd, *n = vn, *m = vm;
2271 for (i = 0; i < opr_sz; i += 1) {
2272 uint64_t nn = n[i], mm = m[i];
2273 uint64_t pp = expand_pred_b(pg[H1(i)]);
2274 d[i] = (nn & pp) | (mm & ~pp);
2278 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2279 void *vg, uint32_t desc)
2281 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2282 uint64_t *d = vd, *n = vn, *m = vm;
2285 for (i = 0; i < opr_sz; i += 1) {
2286 uint64_t nn = n[i], mm = m[i];
2287 uint64_t pp = expand_pred_h(pg[H1(i)]);
2288 d[i] = (nn & pp) | (mm & ~pp);
2292 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2293 void *vg, uint32_t desc)
2295 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2296 uint64_t *d = vd, *n = vn, *m = vm;
2299 for (i = 0; i < opr_sz; i += 1) {
2300 uint64_t nn = n[i], mm = m[i];
2301 uint64_t pp = expand_pred_s(pg[H1(i)]);
2302 d[i] = (nn & pp) | (mm & ~pp);
2306 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2307 void *vg, uint32_t desc)
2309 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310 uint64_t *d = vd, *n = vn, *m = vm;
2313 for (i = 0; i < opr_sz; i += 1) {
2314 uint64_t nn = n[i], mm = m[i];
2315 d[i] = (pg[H1(i)] & 1 ? nn : mm);
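/* Note for the SEL helpers above: expand_pred_b/h/s widen each governing
 * predicate bit into a full byte/halfword/word mask so the select can be
 * done with bitwise operations on whole 64-bit chunks.  For .D elements a
 * complete 64-bit lane is chosen at once, so only bit 0 of each predicate
 * byte needs testing.
 */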
2319 /* Two operand comparison controlled by a predicate.
2320 * ??? It is very tempting to want to be able to expand this inline
2321 * with x86 instructions, e.g.
2323 * vcmpeqw zm, zn, %ymm0
2324 * vpmovmskb %ymm0, %eax
2328 * or even aarch64, e.g.
2330 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2331 * cmeq v0.8h, zn, zm
2332 * and v0.8h, v0.8h, mask
2336 * However, coming up with an abstraction that allows vector inputs and
2337 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2338 * scalar outputs, is tricky.
2340 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2341 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2343 intptr_t opr_sz = simd_oprsz(desc); \
2344 uint32_t flags = PREDTEST_INIT; \
2345 intptr_t i = opr_sz; \
2347 uint64_t out = 0, pg; \
2349 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2350 TYPE nn = *(TYPE *)(vn + H(i)); \
2351 TYPE mm = *(TYPE *)(vm + H(i)); \
2354 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2356 *(uint64_t *)(vd + (i >> 3)) = out; \
2357 flags = iter_predtest_bwd(out, pg, flags); \
2362 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2363 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2364 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2365 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2366 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2367 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2368 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2369 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
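/* The MASK constants reflect the SVE predicate encoding of one bit per
 * vector byte, with an element controlled by the lowest bit of its span:
 * every bit is significant for .B, every 2nd bit for .H, every 4th for .S
 * and every 8th for .D.  The same masks are used by the PPZW and PPZI
 * variants below.
 */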
2371 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2372 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2373 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2374 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2376 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2377 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2378 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2379 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2381 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2382 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2383 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2384 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2386 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2387 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2388 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2389 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2391 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2392 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2393 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2394 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2396 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2397 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2398 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2399 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2401 #undef DO_CMP_PPZZ_B
2402 #undef DO_CMP_PPZZ_H
2403 #undef DO_CMP_PPZZ_S
2404 #undef DO_CMP_PPZZ_D
2407 /* Similar, but the second source is "wide". */
2408 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2409 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2411 intptr_t opr_sz = simd_oprsz(desc); \
2412 uint32_t flags = PREDTEST_INIT; \
2413 intptr_t i = opr_sz; \
2415 uint64_t out = 0, pg; \
2417 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2419 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2420 TYPE nn = *(TYPE *)(vn + H(i)); \
2424 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2426 *(uint64_t *)(vd + (i >> 3)) = out; \
2427 flags = iter_predtest_bwd(out, pg, flags); \
2432 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2433 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2434 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2435 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2436 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2437 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2439 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
2440 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
2441 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
2443 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
2444 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
2445 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
2447 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2448 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2449 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2451 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2452 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2453 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2455 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2456 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2457 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2459 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2460 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2461 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2463 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2464 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2465 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2467 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2468 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2469 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2471 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2472 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2473 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2475 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2476 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2477 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2479 #undef DO_CMP_PPZW_B
2480 #undef DO_CMP_PPZW_H
2481 #undef DO_CMP_PPZW_S
2484 /* Similar, but the second source is immediate. */
2485 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2486 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2488 intptr_t opr_sz = simd_oprsz(desc); \
2489 uint32_t flags = PREDTEST_INIT; \
2490 TYPE mm = simd_data(desc); \
2491 intptr_t i = opr_sz; \
2493 uint64_t out = 0, pg; \
2495 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2496 TYPE nn = *(TYPE *)(vn + H(i)); \
2499 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2501 *(uint64_t *)(vd + (i >> 3)) = out; \
2502 flags = iter_predtest_bwd(out, pg, flags); \
2507 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2508 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2509 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2510 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2511 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2512 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2513 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2514 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2516 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2517 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2518 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2519 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2521 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2522 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2523 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2524 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2526 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2527 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2528 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2529 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2531 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2532 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2533 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2534 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2536 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2537 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2538 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2539 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2541 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2542 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2543 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2544 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2546 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2547 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2548 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2549 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2551 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2552 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2553 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2554 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2556 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2557 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2558 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2559 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2561 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2562 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2563 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2564 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2566 #undef DO_CMP_PPZI_B
2567 #undef DO_CMP_PPZI_H
2568 #undef DO_CMP_PPZI_S
2569 #undef DO_CMP_PPZI_D
2572 /* Similar to the ARM LastActive pseudocode function. */
2573 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2577 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2578 uint64_t pg = *(uint64_t *)(vg + i);
2580 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2586 /* Compute a mask into RETB that is true for all G, up to and including
2587 * (if after) or excluding (if !after) the first G & N.
2588 * Return true if BRK found.
2590 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2591 bool brk, bool after)
2597 } else if ((g & n) == 0) {
2598 /* For all G, no N are set; break not found. */
2601 /* Break somewhere in N. Locate it. */
2602 b = g & n; /* guard true, pred true */
2603 b = b & -b; /* first such */
2605 b = b | (b - 1); /* break after same */
2607 b = b - 1; /* break before same */
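/* Worked example: with g = 0xff and n = 0x10, the first active true
 * element is bit 4, so the mask computed above is 0x1f for break-after
 * and 0x0f for break-before.
 */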
2616 /* Compute a zeroing BRK. */
2617 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2618 intptr_t oprsz, bool after)
2623 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2624 uint64_t this_b, this_g = g[i];
2626 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2627 d[i] = this_b & this_g;
2631 /* Likewise, but also compute flags. */
2632 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2633 intptr_t oprsz, bool after)
2635 uint32_t flags = PREDTEST_INIT;
2639 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2640 uint64_t this_b, this_d, this_g = g[i];
2642 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2643 d[i] = this_d = this_b & this_g;
2644 flags = iter_predtest_fwd(this_d, this_g, flags);
2649 /* Compute a merging BRK. */
2650 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2651 intptr_t oprsz, bool after)
2656 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2657 uint64_t this_b, this_g = g[i];
2659 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2660 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2664 /* Likewise, but also compute flags. */
2665 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2666 intptr_t oprsz, bool after)
2668 uint32_t flags = PREDTEST_INIT;
2672 for (i = 0; i < oprsz / 8; ++i) {
2673 uint64_t this_b, this_d = d[i], this_g = g[i];
2675 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2676 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2677 flags = iter_predtest_fwd(this_d, this_g, flags);
2682 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2684 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2685 * The compiler should turn this into 4 64-bit integer stores.
2687 memset(d, 0, sizeof(ARMPredicateReg));
2688 return PREDTEST_INIT;
2691 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2694 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2695 if (last_active_pred(vn, vg, oprsz)) {
2696 compute_brk_z(vd, vm, vg, oprsz, true);
2702 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2705 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2706 if (last_active_pred(vn, vg, oprsz)) {
2707 return compute_brks_z(vd, vm, vg, oprsz, true);
2709 return do_zero(vd, oprsz);
2713 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2716 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2717 if (last_active_pred(vn, vg, oprsz)) {
2718 compute_brk_z(vd, vm, vg, oprsz, false);
2724 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2727 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2728 if (last_active_pred(vn, vg, oprsz)) {
2729 return compute_brks_z(vd, vm, vg, oprsz, false);
2731 return do_zero(vd, oprsz);
2735 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2737 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738 compute_brk_z(vd, vn, vg, oprsz, true);
2741 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2743 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2744 return compute_brks_z(vd, vn, vg, oprsz, true);
2747 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2749 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2750 compute_brk_z(vd, vn, vg, oprsz, false);
2753 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2755 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2756 return compute_brks_z(vd, vn, vg, oprsz, false);
2759 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2761 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2762 compute_brk_m(vd, vn, vg, oprsz, true);
2765 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2767 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2768 return compute_brks_m(vd, vn, vg, oprsz, true);
2771 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2773 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2774 compute_brk_m(vd, vn, vg, oprsz, false);
2777 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2779 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2780 return compute_brks_m(vd, vn, vg, oprsz, false);
2783 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2785 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2787 if (!last_active_pred(vn, vg, oprsz)) {
2792 /* As if PredTest(Ones(PL), D, esz). */
2793 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2796 uint32_t flags = PREDTEST_INIT;
2799 for (i = 0; i < oprsz / 8; i++) {
2800 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2803 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2804 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2809 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2811 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2813 if (last_active_pred(vn, vg, oprsz)) {
2814 return predtest_ones(vd, oprsz, -1);
2816 return do_zero(vd, oprsz);
2820 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2822 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2824 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2827 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2828 uint64_t t = n[i] & g[i] & mask;
2834 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2836 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2837 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2838 uint64_t esz_mask = pred_esz_masks[esz];
2839 ARMPredicateReg *d = vd;
2843 /* Begin with a zero predicate register. */
2844 flags = do_zero(d, oprsz);
2849 /* Scale from predicate element count to bits. */
2850 count <<= esz;
2851 /* Bound to the bits in the predicate. */
2852 count = MIN(count, oprsz * 8);
2854 /* Set all of the requested bits. */
2855 for (i = 0; i < count / 64; ++i) {
2859 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2862 return predtest_ones(d, oprsz, esz_mask);
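/* Worked example for the WHILE helper above: count = 3 with esz = 1 (.H)
 * scales to 6 predicate bits, so d->p[0] = MAKE_64BIT_MASK(0, 6) & esz_mask
 * sets bits 0, 2 and 4, the controlling bits of the first three halfword
 * elements.
 */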
2865 /* Recursive reduction on a function;
2866 * C.f. the ARM ARM function ReducePredicated.
2868 * While it would be possible to write this without the DATA temporary,
2869 * it is much simpler to process the predicate register this way.
2870 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2871 * little to gain with a more complex non-recursive form.
2873 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2874 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2879 uintptr_t half = n / 2; \
2880 TYPE lo = NAME##_reduce(data, status, half); \
2881 TYPE hi = NAME##_reduce(data + half, status, half); \
2882 return TYPE##_##FUNC(lo, hi, status); \
2885 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2887 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2888 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2889 for (i = 0; i < oprsz; ) { \
2890 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2892 TYPE nn = *(TYPE *)(vn + H(i)); \
2893 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2894 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2897 for (; i < maxsz; i += sizeof(TYPE)) { \
2898 *(TYPE *)((void *)data + i) = IDENT; \
2900 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2903 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2904 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2905 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2907 /* Identity is floatN_default_nan, without the function call. */
2908 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2909 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2910 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2912 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2913 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2914 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2916 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2917 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2918 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2920 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2921 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2922 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
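/* The IDENT argument is the identity element used to pad inactive and
 * trailing lanes: +0.0 for FADDV, the default NaN for FMINNMV/FMAXNMV
 * (minnum/maxnum ignore a quiet NaN operand), +Inf for FMINV and
 * -Inf for FMAXV.
 */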
2926 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2927 void *status, uint32_t desc)
2929 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2930 float16 result = nn;
2933 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2936 float16 mm = *(float16 *)(vm + H1_2(i));
2937 result = float16_add(result, mm, status);
2939 i += sizeof(float16), pg >>= sizeof(float16);
2941 } while (i < opr_sz);
2946 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2947 void *status, uint32_t desc)
2949 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2950 float32 result = nn;
2953 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2956 float32 mm = *(float32 *)(vm + H1_2(i));
2957 result = float32_add(result, mm, status);
2959 i += sizeof(float32), pg >>= sizeof(float32);
2961 } while (i < opr_sz);
2966 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2967 void *status, uint32_t desc)
2969 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2973 for (i = 0; i < opr_sz; i++) {
2974 if (pg[H1(i)] & 1) {
2975 nn = float64_add(nn, m[i], status);
2982 /* Fully general three-operand expander, controlled by a predicate,
2983 * with the extra float_status parameter.
2985 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2986 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2987 void *status, uint32_t desc) \
2989 intptr_t i = simd_oprsz(desc); \
2992 uint64_t pg = g[(i - 1) >> 6]; \
2994 i -= sizeof(TYPE); \
2995 if (likely((pg >> (i & 63)) & 1)) { \
2996 TYPE nn = *(TYPE *)(vn + H(i)); \
2997 TYPE mm = *(TYPE *)(vm + H(i)); \
2998 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3004 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3005 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3006 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3008 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3009 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3010 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3012 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3013 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3014 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3016 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3017 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3018 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3020 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3021 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3022 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3024 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3025 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3026 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3028 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3029 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3030 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3032 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3033 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3034 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3036 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3038 return float16_abs(float16_sub(a, b, s));
3041 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3043 return float32_abs(float32_sub(a, b, s));
3046 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3048 return float64_abs(float64_sub(a, b, s));
3051 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3052 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3053 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3055 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3057 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3058 return float64_scalbn(a, b_int, s);
3061 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3062 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3063 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3065 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3066 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3067 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3071 /* Three-operand expander, with one scalar operand, controlled by
3072 * a predicate, with the extra float_status parameter.
3074 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3075 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3076 void *status, uint32_t desc) \
3078 intptr_t i = simd_oprsz(desc); \
3082 uint64_t pg = g[(i - 1) >> 6]; \
3084 i -= sizeof(TYPE); \
3085 if (likely((pg >> (i & 63)) & 1)) { \
3086 TYPE nn = *(TYPE *)(vn + H(i)); \
3087 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3093 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3094 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3095 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3097 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3098 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3099 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3101 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3102 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3103 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3105 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3107 return float16_sub(b, a, s);
3110 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3112 return float32_sub(b, a, s);
3115 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3117 return float64_sub(b, a, s);
3120 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3121 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3122 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3124 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3125 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3126 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3128 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3129 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3130 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3132 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3133 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3134 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3136 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3137 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3138 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3140 /* Fully general two-operand expander, controlled by a predicate,
3141 * with the extra float_status parameter.
3143 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3144 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3146 intptr_t i = simd_oprsz(desc); \
3149 uint64_t pg = g[(i - 1) >> 6]; \
3151 i -= sizeof(TYPE); \
3152 if (likely((pg >> (i & 63)) & 1)) { \
3153 TYPE nn = *(TYPE *)(vn + H(i)); \
3154 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3160 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3161 * FZ16. When converting from fp16, this affects flushing input denormals;
3162 * when converting to fp16, this affects flushing output denormals.
3164 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3166 flag save = get_flush_inputs_to_zero(fpst);
3169 set_flush_inputs_to_zero(false, fpst);
3170 ret = float16_to_float32(f, true, fpst);
3171 set_flush_inputs_to_zero(save, fpst);
3175 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3177 flag save = get_flush_inputs_to_zero(fpst);
3180 set_flush_inputs_to_zero(false, fpst);
3181 ret = float16_to_float64(f, true, fpst);
3182 set_flush_inputs_to_zero(save, fpst);
3186 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3188 flag save = get_flush_to_zero(fpst);
3191 set_flush_to_zero(false, fpst);
3192 ret = float32_to_float16(f, true, fpst);
3193 set_flush_to_zero(save, fpst);
3197 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3199 flag save = get_flush_to_zero(fpst);
3202 set_flush_to_zero(false, fpst);
3203 ret = float64_to_float16(f, true, fpst);
3204 set_flush_to_zero(save, fpst);
3208 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3210 if (float16_is_any_nan(f)) {
3211 float_raise(float_flag_invalid, s);
3214 return float16_to_int16_round_to_zero(f, s);
3217 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3219 if (float16_is_any_nan(f)) {
3220 float_raise(float_flag_invalid, s);
3223 return float16_to_int64_round_to_zero(f, s);
3226 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3228 if (float32_is_any_nan(f)) {
3229 float_raise(float_flag_invalid, s);
3232 return float32_to_int64_round_to_zero(f, s);
3235 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3237 if (float64_is_any_nan(f)) {
3238 float_raise(float_flag_invalid, s);
3241 return float64_to_int64_round_to_zero(f, s);
3244 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3246 if (float16_is_any_nan(f)) {
3247 float_raise(float_flag_invalid, s);
3250 return float16_to_uint16_round_to_zero(f, s);
3253 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3255 if (float16_is_any_nan(f)) {
3256 float_raise(float_flag_invalid, s);
3259 return float16_to_uint64_round_to_zero(f, s);
3262 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3264 if (float32_is_any_nan(f)) {
3265 float_raise(float_flag_invalid, s);
3268 return float32_to_uint64_round_to_zero(f, s);
3271 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3273 if (float64_is_any_nan(f)) {
3274 float_raise(float_flag_invalid, s);
3277 return float64_to_uint64_round_to_zero(f, s);
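/* The explicit NaN checks in the helpers above exist because the ARM
 * FPToFixed pseudocode converts a NaN input to 0 while raising Invalid
 * Operation, which is not what the underlying softfloat round-to-zero
 * conversions produce for NaN on their own.
 */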
3280 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3281 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3282 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3283 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3284 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3285 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3287 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3288 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3289 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3290 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3291 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3292 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3293 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3295 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3296 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3297 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3298 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3299 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3300 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3301 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3303 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3304 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3305 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3307 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3308 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3309 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3311 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3312 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3313 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3315 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3316 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3317 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3319 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3320 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3321 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3322 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3323 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3324 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3325 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3327 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3328 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3329 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3330 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3331 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3332 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3333 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3337 /* 4-operand predicated multiply-add. This requires 7 operands to pass
3338 * "properly", so we need to encode some of the registers into DESC.
3340 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
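/* Four 5-bit register numbers are packed into DESC: rd at SIMD_DATA_SHIFT,
 * rn at +5, rm at +10 and ra at +15, 20 bits in all, which the build-time
 * assert above verifies will fit.  The NEG1/NEG3 arguments below are
 * sign-bit XOR masks: FMLS negates the multiplicand, FNMLS negates the
 * addend, and FNMLA negates both.
 */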
3342 static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3343 uint16_t neg1, uint16_t neg3)
3345 intptr_t i = simd_oprsz(desc);
3346 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3347 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3348 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3349 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3350 void *vd = &env->vfp.zregs[rd];
3351 void *vn = &env->vfp.zregs[rn];
3352 void *vm = &env->vfp.zregs[rm];
3353 void *va = &env->vfp.zregs[ra];
3357 uint64_t pg = g[(i - 1) >> 6];
3360 if (likely((pg >> (i & 63)) & 1)) {
3361 float16 e1, e2, e3, r;
3363 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3364 e2 = *(uint16_t *)(vm + H1_2(i));
3365 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3366 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3367 *(uint16_t *)(vd + H1_2(i)) = r;
3373 void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3375 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3378 void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3380 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3383 void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3385 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3388 void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3390 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3393 static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3394 uint32_t neg1, uint32_t neg3)
3396 intptr_t i = simd_oprsz(desc);
3397 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3398 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3399 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3400 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3401 void *vd = &env->vfp.zregs[rd];
3402 void *vn = &env->vfp.zregs[rn];
3403 void *vm = &env->vfp.zregs[rm];
3404 void *va = &env->vfp.zregs[ra];
3408 uint64_t pg = g[(i - 1) >> 6];
3411 if (likely((pg >> (i & 63)) & 1)) {
3412 float32 e1, e2, e3, r;
3414 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3415 e2 = *(uint32_t *)(vm + H1_4(i));
3416 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3417 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3418 *(uint32_t *)(vd + H1_4(i)) = r;
3424 void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3426 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3429 void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3431 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3434 void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3436 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3439 void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3441 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3444 static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3445 uint64_t neg1, uint64_t neg3)
3447 intptr_t i = simd_oprsz(desc);
3448 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3449 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3450 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3451 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3452 void *vd = &env->vfp.zregs[rd];
3453 void *vn = &env->vfp.zregs[rn];
3454 void *vm = &env->vfp.zregs[rm];
3455 void *va = &env->vfp.zregs[ra];
3459 uint64_t pg = g[(i - 1) >> 6];
3462 if (likely((pg >> (i & 63)) & 1)) {
3463 float64 e1, e2, e3, r;
3465 e1 = *(uint64_t *)(vn + i) ^ neg1;
3466 e2 = *(uint64_t *)(vm + i);
3467 e3 = *(uint64_t *)(va + i) ^ neg3;
3468 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3469 *(uint64_t *)(vd + i) = r;
3475 void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3477 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3480 void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3482 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3485 void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3487 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3490 void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3492 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3495 /* Two operand floating-point comparison controlled by a predicate.
3496 * Unlike the integer version, we are not allowed to optimistically
3497 * compare operands, since the comparison may have side effects wrt
3498 * the FPSR.
3500 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3501 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3502 void *status, uint32_t desc) \
3504 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3505 uint64_t *d = vd, *g = vg; \
3507 uint64_t out = 0, pg = g[j]; \
3509 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3510 if (likely((pg >> (i & 63)) & 1)) { \
3511 TYPE nn = *(TYPE *)(vn + H(i)); \
3512 TYPE mm = *(TYPE *)(vm + H(i)); \
3513 out |= OP(TYPE, nn, mm, status); \
3520 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3521 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3522 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3523 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3524 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3525 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3527 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3528 DO_FPCMP_PPZZ_H(NAME, OP) \
3529 DO_FPCMP_PPZZ_S(NAME, OP) \
3530 DO_FPCMP_PPZZ_D(NAME, OP)
3532 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3533 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3534 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3535 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3536 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3537 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3538 #define DO_FCMUO(TYPE, X, Y, ST) \
3539 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3540 #define DO_FACGE(TYPE, X, Y, ST) \
3541 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3542 #define DO_FACGT(TYPE, X, Y, ST) \
3543 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3545 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3546 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3547 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3548 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3549 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3550 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3551 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3553 #undef DO_FPCMP_PPZZ_ALL
3554 #undef DO_FPCMP_PPZZ_D
3555 #undef DO_FPCMP_PPZZ_S
3556 #undef DO_FPCMP_PPZZ_H
3557 #undef DO_FPCMP_PPZZ
3559 /* One operand floating-point comparison against zero, controlled
3560 * by a predicate.
3562 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3563 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3564 void *status, uint32_t desc) \
3566 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3567 uint64_t *d = vd, *g = vg; \
3569 uint64_t out = 0, pg = g[j]; \
3571 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3572 if ((pg >> (i & 63)) & 1) { \
3573 TYPE nn = *(TYPE *)(vn + H(i)); \
3574 out |= OP(TYPE, nn, 0, status); \
3581 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3582 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3583 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3584 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3585 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3586 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3588 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3589 DO_FPCMP_PPZ0_H(NAME, OP) \
3590 DO_FPCMP_PPZ0_S(NAME, OP) \
3591 DO_FPCMP_PPZ0_D(NAME, OP)
3593 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3594 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3595 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3596 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3597 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3598 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3600 /* FP Trig Multiply-Add. */
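/* The coefficient tables below appear to hold, in rows of eight, the
 * polynomial coefficients from the ARM FPTrigMAdd pseudocode: the first
 * row is the sine series (1, -1/3!, 1/5!, ...), the second the cosine
 * series (1, -1/2!, 1/4!, ...), padded with zeros.  A negative
 * multiplicand selects the second row before the fused multiply-add.
 */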
3602 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3604 static const float16 coeff[16] = {
3605 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3606 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3608 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3609 intptr_t x = simd_data(desc);
3610 float16 *d = vd, *n = vn, *m = vm;
3611 for (i = 0; i < opr_sz; i++) {
3614 if (float16_is_neg(mm)) {
3615 mm = float16_abs(mm);
3618 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3622 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3624 static const float32 coeff[16] = {
3625 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3626 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3627 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3628 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3630 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3631 intptr_t x = simd_data(desc);
3632 float32 *d = vd, *n = vn, *m = vm;
3633 for (i = 0; i < opr_sz; i++) {
3636 if (float32_is_neg(mm)) {
3637 mm = float32_abs(mm);
3640 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3644 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3646 static const float64 coeff[16] = {
3647 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3648 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3649 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3650 0x3de5d8408868552full, 0x0000000000000000ull,
3651 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3652 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3653 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3654 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3656 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3657 intptr_t x = simd_data(desc);
3658 float64 *d = vd, *n = vn, *m = vm;
3659 for (i = 0; i < opr_sz; i++) {
3662 if (float64_is_neg(mm)) {
3663 mm = float64_abs(mm);
3666 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3674 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3675 void *vs, uint32_t desc)
3677 intptr_t j, i = simd_oprsz(desc);
3679 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3680 float16 neg_real = float16_chs(neg_imag);
3683 uint64_t pg = g[(i - 1) >> 6];
3685 float16 e0, e1, e2, e3;
3687 /* I holds the real index; J holds the imag index. */
3688 j = i - sizeof(float16);
3689 i -= 2 * sizeof(float16);
3691 e0 = *(float16 *)(vn + H1_2(i));
3692 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3693 e2 = *(float16 *)(vn + H1_2(j));
3694 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3696 if (likely((pg >> (i & 63)) & 1)) {
3697 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3699 if (likely((pg >> (j & 63)) & 1)) {
3700 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3706 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3707 void *vs, uint32_t desc)
3709 intptr_t j, i = simd_oprsz(desc);
3711 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3712 float32 neg_real = float32_chs(neg_imag);
3715 uint64_t pg = g[(i - 1) >> 6];
3717 float32 e0, e1, e2, e3;
3719 /* I holds the real index; J holds the imag index. */
3720 j = i - sizeof(float32);
3721 i -= 2 * sizeof(float32);
3723 e0 = *(float32 *)(vn + H1_2(i));
3724 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3725 e2 = *(float32 *)(vn + H1_2(j));
3726 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3728 if (likely((pg >> (i & 63)) & 1)) {
3729 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3731 if (likely((pg >> (j & 63)) & 1)) {
3732 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3738 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3739 void *vs, uint32_t desc)
3741 intptr_t j, i = simd_oprsz(desc);
3743 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3744 float64 neg_real = float64_chs(neg_imag);
3747 uint64_t pg = g[(i - 1) >> 6];
3749 float64 e0, e1, e2, e3;
3751 /* I holds the real index; J holds the imag index. */
3752 j = i - sizeof(float64);
3753 i -= 2 * sizeof(float64);
3755 e0 = *(float64 *)(vn + H1_2(i));
3756 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3757 e2 = *(float64 *)(vn + H1_2(j));
3758 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3760 if (likely((pg >> (i & 63)) & 1)) {
3761 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3763 if (likely((pg >> (j & 63)) & 1)) {
3764 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3771 * FP Complex Multiply
3774 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
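/* As with the FMLA group above, four 5-bit register numbers are encoded
 * in DESC, followed here by a 2-bit rotation, 22 bits in all.  Bit 0 of
 * the rotation (FLIP) chooses whether the real or imaginary half of the
 * first operand feeds each product; the two bits together select the
 * NEG_REAL/NEG_IMAG sign masks applied to the second operand.
 */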
3776 void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3778 intptr_t j, i = simd_oprsz(desc);
3779 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3780 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3781 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3782 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3783 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3784 bool flip = rot & 1;
3785 float16 neg_imag, neg_real;
3786 void *vd = &env->vfp.zregs[rd];
3787 void *vn = &env->vfp.zregs[rn];
3788 void *vm = &env->vfp.zregs[rm];
3789 void *va = &env->vfp.zregs[ra];
3792 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3793 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3796 uint64_t pg = g[(i - 1) >> 6];
3798 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3800 /* I holds the real index; J holds the imag index. */
3801 j = i - sizeof(float16);
3802 i -= 2 * sizeof(float16);
3804 nr = *(float16 *)(vn + H1_2(i));
3805 ni = *(float16 *)(vn + H1_2(j));
3806 mr = *(float16 *)(vm + H1_2(i));
3807 mi = *(float16 *)(vm + H1_2(j));
3809 e2 = (flip ? ni : nr);
3810 e1 = (flip ? mi : mr) ^ neg_real;
3811 e4 = e2;
3812 e3 = (flip ? mr : mi) ^ neg_imag;
3814 if (likely((pg >> (i & 63)) & 1)) {
3815 d = *(float16 *)(va + H1_2(i));
3816 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3817 *(float16 *)(vd + H1_2(i)) = d;
3819 if (likely((pg >> (j & 63)) & 1)) {
3820 d = *(float16 *)(va + H1_2(j));
3821 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3822 *(float16 *)(vd + H1_2(j)) = d;
3828 void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3830 intptr_t j, i = simd_oprsz(desc);
3831 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3832 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3833 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3834 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3835 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3836 bool flip = rot & 1;
3837 float32 neg_imag, neg_real;
3838 void *vd = &env->vfp.zregs[rd];
3839 void *vn = &env->vfp.zregs[rn];
3840 void *vm = &env->vfp.zregs[rm];
3841 void *va = &env->vfp.zregs[ra];
3844 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3845 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3848 uint64_t pg = g[(i - 1) >> 6];
3850 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3852 /* I holds the real index; J holds the imag index. */
3853 j = i - sizeof(float32);
3854 i -= 2 * sizeof(float32);
3856 nr = *(float32 *)(vn + H1_2(i));
3857 ni = *(float32 *)(vn + H1_2(j));
3858 mr = *(float32 *)(vm + H1_2(i));
3859 mi = *(float32 *)(vm + H1_2(j));
3861 e2 = (flip ? ni : nr);
3862 e1 = (flip ? mi : mr) ^ neg_real;
3863 e4 = e2;
3864 e3 = (flip ? mr : mi) ^ neg_imag;
3866 if (likely((pg >> (i & 63)) & 1)) {
3867 d = *(float32 *)(va + H1_2(i));
3868 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3869 *(float32 *)(vd + H1_2(i)) = d;
3871 if (likely((pg >> (j & 63)) & 1)) {
3872 d = *(float32 *)(va + H1_2(j));
3873 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3874 *(float32 *)(vd + H1_2(j)) = d;
3880 void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3882 intptr_t j, i = simd_oprsz(desc);
3883 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3884 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3885 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3886 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3887 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3888 bool flip = rot & 1;
3889 float64 neg_imag, neg_real;
3890 void *vd = &env->vfp.zregs[rd];
3891 void *vn = &env->vfp.zregs[rn];
3892 void *vm = &env->vfp.zregs[rm];
3893 void *va = &env->vfp.zregs[ra];
3896 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3897 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3900 uint64_t pg = g[(i - 1) >> 6];
3902 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3904 /* I holds the real index; J holds the imag index. */
3905 j = i - sizeof(float64);
3906 i -= 2 * sizeof(float64);
3908 nr = *(float64 *)(vn + H1_2(i));
3909 ni = *(float64 *)(vn + H1_2(j));
3910 mr = *(float64 *)(vm + H1_2(i));
3911 mi = *(float64 *)(vm + H1_2(j));
3913 e2 = (flip ? ni : nr);
3914 e1 = (flip ? mi : mr) ^ neg_real;
3915 e4 = e2;
3916 e3 = (flip ? mr : mi) ^ neg_imag;
3918 if (likely((pg >> (i & 63)) & 1)) {
3919 d = *(float64 *)(va + H1_2(i));
3920 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3921 *(float64 *)(vd + H1_2(i)) = d;
3923 if (likely((pg >> (j & 63)) & 1)) {
3924 d = *(float64 *)(va + H1_2(j));
3925 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3926 *(float64 *)(vd + H1_2(j)) = d;
3933 * Load contiguous data, protected by a governing predicate.
3935 #define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
3936 static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
3937 target_ulong addr, intptr_t oprsz, \
3942 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3946 m = FN(env, addr, ra); \
3948 *(TYPEE *)(vd + H(i)) = m; \
3949 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3950 addr += sizeof(TYPEM); \
3952 } while (i < oprsz); \
3954 void HELPER(NAME)(CPUARMState *env, void *vg, \
3955 target_ulong addr, uint32_t desc) \
3957 do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
3958 addr, simd_oprsz(desc), GETPC()); \
3961 #define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
3962 void HELPER(NAME)(CPUARMState *env, void *vg, \
3963 target_ulong addr, uint32_t desc) \
3965 intptr_t i, oprsz = simd_oprsz(desc); \
3966 intptr_t ra = GETPC(); \
3967 unsigned rd = simd_data(desc); \
3968 void *d1 = &env->vfp.zregs[rd]; \
3969 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3970 for (i = 0; i < oprsz; ) { \
3971 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3973 TYPEM m1 = 0, m2 = 0; \
3975 m1 = FN(env, addr, ra); \
3976 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3978 *(TYPEE *)(d1 + H(i)) = m1; \
3979 *(TYPEE *)(d2 + H(i)) = m2; \
3980 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3981 addr += 2 * sizeof(TYPEM); \
3986 #define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
3987 void HELPER(NAME)(CPUARMState *env, void *vg, \
3988 target_ulong addr, uint32_t desc) \
3990 intptr_t i, oprsz = simd_oprsz(desc); \
3991 intptr_t ra = GETPC(); \
3992 unsigned rd = simd_data(desc); \
3993 void *d1 = &env->vfp.zregs[rd]; \
3994 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3995 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3996 for (i = 0; i < oprsz; ) { \
3997 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3999 TYPEM m1 = 0, m2 = 0, m3 = 0; \
4001 m1 = FN(env, addr, ra); \
4002 m2 = FN(env, addr + sizeof(TYPEM), ra); \
4003 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
4005 *(TYPEE *)(d1 + H(i)) = m1; \
4006 *(TYPEE *)(d2 + H(i)) = m2; \
4007 *(TYPEE *)(d3 + H(i)) = m3; \
4008 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4009 addr += 3 * sizeof(TYPEM); \
4014 #define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
4015 void HELPER(NAME)(CPUARMState *env, void *vg, \
4016 target_ulong addr, uint32_t desc) \
4018 intptr_t i, oprsz = simd_oprsz(desc); \
4019 intptr_t ra = GETPC(); \
4020 unsigned rd = simd_data(desc); \
4021 void *d1 = &env->vfp.zregs[rd]; \
4022 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4023 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4024 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
4025 for (i = 0; i < oprsz; ) { \
4026 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4028 TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
4030 m1 = FN(env, addr, ra); \
4031 m2 = FN(env, addr + sizeof(TYPEM), ra); \
4032 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
4033 m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
4035 *(TYPEE *)(d1 + H(i)) = m1; \
4036 *(TYPEE *)(d2 + H(i)) = m2; \
4037 *(TYPEE *)(d3 + H(i)) = m3; \
4038 *(TYPEE *)(d4 + H(i)) = m4; \
4039 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4040 addr += 4 * sizeof(TYPEM); \
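/* Illustrative note: the multi-register forms above read interleaved data,
 * so for an active element at index I the DO_LD4 expansion consumes four
 * consecutive TYPEM values and spreads them across four registers, wrapping
 * at z31:
 *
 *     memory:  ... n0 n1 n2 n3 | n0' n1' n2' n3' ...
 *     result:  n0 -> zregs[rd][I]
 *              n1 -> zregs[(rd + 1) & 31][I]
 *              n2 -> zregs[(rd + 2) & 31][I]
 *              n3 -> zregs[(rd + 3) & 31][I]
 *
 * The address advances by 4 * sizeof(TYPEM) even for inactive elements,
 * which keeps the interleave aligned with the element index.
 */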
4045 DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4046 DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4047 DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4048 DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4049 DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4050 DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4052 DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4053 DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
4054 DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4055 DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4057 DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4058 DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4060 DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4061 DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4062 DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4063 DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4065 DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4066 DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4067 DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4068 DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4070 DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4071 DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4072 DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4073 DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4075 DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4076 DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4077 DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4078 DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
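/* As a concrete, hand-expanded (approximate) example of the widening forms
 * instantiated above, DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t,
 * uint8_t, H1_2) produces a helper whose inner step is essentially:
 *
 *     uint8_t m = 0;
 *     if (pg & 1) {
 *         m = cpu_ldub_data_ra(env, addr, ra);  // one byte from memory
 *     }
 *     *(uint16_t *)(vd + H1_2(i)) = m;          // zero-extended to 16 bits
 *     i += 2, pg >>= 2;                         // element stride
 *     addr += 1;                                // memory stride
 *
 * so the memory footprint is oprsz / 2 bytes while the register footprint
 * is oprsz bytes.
 */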
4086 * Load contiguous data, first-fault and no-fault.
4089 #ifdef CONFIG_USER_ONLY
4091 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4092 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4093 * option, which leaves subsequent data unchanged.
4095 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4097 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4100 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4101 i = ROUND_UP(i, 64);
4103 for (; i < oprsz; i += 64) {
4108 /* Hold the mmap lock during the operation so that there is no race
4109 * between page_check_range and the load operation. We expect the
4110 * usual case to have no faults at all, so we check the whole range
4111 * first and if successful defer to the normal load operation.
4113 * TODO: Change mmap_lock to a rwlock so that multiple readers
4114 * can run simultaneously. This will probably help other uses
4115 * within QEMU as well.
4117 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
4118 static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
4119 target_ulong addr, intptr_t oprsz, \
4120 bool first, uintptr_t ra) \
4124 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4129 unlikely(page_check_range(addr, sizeof(TYPEM), \
4131 record_fault(env, i, oprsz); \
4134 m = FN(env, addr, ra); \
4137 *(TYPEE *)(vd + H(i)) = m; \
4138 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4139 addr += sizeof(TYPEM); \
4141 } while (i < oprsz); \
4143 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
4144 target_ulong addr, uint32_t desc) \
4146 intptr_t oprsz = simd_oprsz(desc); \
4147 unsigned rd = simd_data(desc); \
4148 void *vd = &env->vfp.zregs[rd]; \
4150 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
4151 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
4153 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
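/* Worked example (illustrative only) of the first-fault behaviour above:
 * take sve_ldff1dd_r with four active 64-bit elements, where the page
 * backing element 2 is unmapped.
 *
 *     element 0: loaded normally; a fault here is delivered for real,
 *                since the first active element never gets the no-fault
 *                treatment
 *     element 1: loaded normally
 *     element 2: page_check_range() fails, record_fault() clears FFR from
 *                bit 16 upward, and the helper returns
 *     element 3: destination left unchanged (the MERGE choice)
 *
 * Bit 16 because FFR, like any predicate, holds one bit per vector byte,
 * and element 2 of a 64-bit vector starts at byte offset 16.
 */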
4158 /* No-fault loads are like first-fault loads, but without the special
4159 * case for the first active element.
4161 #define DO_LDNF1(PART) \
4162 void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
4163 target_ulong addr, uint32_t desc) \
4165 intptr_t oprsz = simd_oprsz(desc); \
4166 unsigned rd = simd_data(desc); \
4167 void *vd = &env->vfp.zregs[rd]; \
4169 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
4170 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
4172 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
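/* The no-fault path differs from the first-fault path only in the FIRST
 * argument passed to the common worker: with false, even the first active
 * element goes through page_check_range(), so a no-fault load never raises
 * a guest fault at all -- it only clears FFR and merges.  Schematically:
 *
 *     ldff1:  do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true,  GETPC());
 *     ldnf1:  do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC());
 */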
4179 /* TODO: System mode is not yet supported.
4180 * This would probably use tlb_vaddr_to_host.
4182 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
4183 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
4184 target_ulong addr, uint32_t desc) \
4186 g_assert_not_reached(); \
4189 #define DO_LDNF1(PART) \
4190 void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
4191 target_ulong addr, uint32_t desc) \
4193 g_assert_not_reached(); \
4198 DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4199 DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4200 DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4201 DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4202 DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4203 DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4204 DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4206 DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4207 DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4208 DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
4209 DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4210 DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4212 DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4213 DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4214 DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4216 DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4243 * Store contiguous data, protected by a governing predicate.
4245 #define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
4246 void HELPER(NAME)(CPUARMState *env, void *vg, \
4247 target_ulong addr, uint32_t desc) \
4249 intptr_t i, oprsz = simd_oprsz(desc); \
4250 intptr_t ra = GETPC(); \
4251 unsigned rd = simd_data(desc); \
4252 void *vd = &env->vfp.zregs[rd]; \
4253 for (i = 0; i < oprsz; ) { \
4254 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4257 TYPEM m = *(TYPEE *)(vd + H(i)); \
4258 FN(env, addr, m, ra); \
4260 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4261 addr += sizeof(TYPEM); \
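/* As a concrete, hand-expanded (approximate) example of the truncating
 * forms, DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
 * produces a helper whose inner step is essentially:
 *
 *     if (pg & 1) {
 *         uint8_t m = *(uint32_t *)(vd + H1_4(i));  // low byte of element
 *         cpu_stb_data_ra(env, addr, m, ra);
 *     }
 *     i += 4, pg >>= 4;
 *     addr += 1;
 *
 * Unlike the loads, inactive elements write nothing at all, but the memory
 * address still advances, preserving the packed layout.
 */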
4266 #define DO_ST1_D(NAME, FN, TYPEM) \
4267 void HELPER(NAME)(CPUARMState *env, void *vg, \
4268 target_ulong addr, uint32_t desc) \
4270 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4271 intptr_t ra = GETPC(); \
4272 unsigned rd = simd_data(desc); \
4273 uint64_t *d = &env->vfp.zregs[rd].d[0]; \
4275 for (i = 0; i < oprsz; i += 1) { \
4276 if (pg[H1(i)] & 1) { \
4277 FN(env, addr, d[i], ra); \
4279 addr += sizeof(TYPEM); \
4283 #define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
4284 void HELPER(NAME)(CPUARMState *env, void *vg, \
4285 target_ulong addr, uint32_t desc) \
4287 intptr_t i, oprsz = simd_oprsz(desc); \
4288 intptr_t ra = GETPC(); \
4289 unsigned rd = simd_data(desc); \
4290 void *d1 = &env->vfp.zregs[rd]; \
4291 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4292 for (i = 0; i < oprsz; ) { \
4293 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4296 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4297 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4298 FN(env, addr, m1, ra); \
4299 FN(env, addr + sizeof(TYPEM), m2, ra); \
4301 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4302 addr += 2 * sizeof(TYPEM); \
4307 #define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \
4308 void HELPER(NAME)(CPUARMState *env, void *vg, \
4309 target_ulong addr, uint32_t desc) \
4311 intptr_t i, oprsz = simd_oprsz(desc); \
4312 intptr_t ra = GETPC(); \
4313 unsigned rd = simd_data(desc); \
4314 void *d1 = &env->vfp.zregs[rd]; \
4315 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4316 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4317 for (i = 0; i < oprsz; ) { \
4318 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4321 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4322 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4323 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
4324 FN(env, addr, m1, ra); \
4325 FN(env, addr + sizeof(TYPEM), m2, ra); \
4326 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4328 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4329 addr += 3 * sizeof(TYPEM); \
4334 #define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \
4335 void HELPER(NAME)(CPUARMState *env, void *vg, \
4336 target_ulong addr, uint32_t desc) \
4338 intptr_t i, oprsz = simd_oprsz(desc); \
4339 intptr_t ra = GETPC(); \
4340 unsigned rd = simd_data(desc); \
4341 void *d1 = &env->vfp.zregs[rd]; \
4342 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4343 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4344 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
4345 for (i = 0; i < oprsz; ) { \
4346 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4349 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4350 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4351 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
4352 TYPEM m4 = *(TYPEE *)(d4 + H(i)); \
4353 FN(env, addr, m1, ra); \
4354 FN(env, addr + sizeof(TYPEM), m2, ra); \
4355 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4356 FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
4358 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4359 addr += 4 * sizeof(TYPEM); \
4364 DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
4365 DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
4366 DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
4368 DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
4369 DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
4371 DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
4373 DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4374 DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4375 DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4376 DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4378 DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4379 DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4380 DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4381 DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4383 DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4384 DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4385 DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4386 DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4388 DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
4390 void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
4391 target_ulong addr, uint32_t desc)
4393 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4394 intptr_t ra = GETPC();
4395 unsigned rd = simd_data(desc);
4396 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4397 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4400 for (i = 0; i < oprsz; i += 1) {
4401 if (pg[H1(i)] & 1) {
4402 cpu_stq_data_ra(env, addr, d1[i], ra);
4403 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4409 void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
4410 target_ulong addr, uint32_t desc)
4412 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4413 intptr_t ra = GETPC();
4414 unsigned rd = simd_data(desc);
4415 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4416 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4417 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4420 for (i = 0; i < oprsz; i += 1) {
4421 if (pg[H1(i)] & 1) {
4422 cpu_stq_data_ra(env, addr, d1[i], ra);
4423 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4424 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4430 void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
4431 target_ulong addr, uint32_t desc)
4433 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4434 intptr_t ra = GETPC();
4435 unsigned rd = simd_data(desc);
4436 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4437 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4438 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4439 uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
4442 for (i = 0; i < oprsz; i += 1) {
4443 if (pg[H1(i)] & 1) {
4444 cpu_stq_data_ra(env, addr, d1[i], ra);
4445 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4446 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4447 cpu_stq_data_ra(env, addr + 24, d4[i], ra);
4453 /* Loads with a vector index. */
4455 #define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4456 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4457 target_ulong base, uint32_t desc) \
4459 intptr_t i, oprsz = simd_oprsz(desc); \
4460 unsigned scale = simd_data(desc); \
4461 uintptr_t ra = GETPC(); \
4462 for (i = 0; i < oprsz; ) { \
4463 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4467 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4468 m = FN(env, base + (off << scale), ra); \
4470 *(uint32_t *)(vd + H1_4(i)) = m; \
4476 #define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4477 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4478 target_ulong base, uint32_t desc) \
4480 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4481 unsigned scale = simd_data(desc); \
4482 uintptr_t ra = GETPC(); \
4483 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4484 for (i = 0; i < oprsz; i++) { \
4486 if (pg[H1(i)] & 1) { \
4487 target_ulong off = (TYPEI)m[i]; \
4488 mm = FN(env, base + (off << scale), ra); \
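/* Illustrative example of the gather address computation used above:
 * the offset element is reinterpreted through TYPEI to get the required
 * zero- or sign-extension before scaling.  With a _zss (signed 32-bit
 * offset) form, base = 0x1000, scale = 2 and an offset element holding
 * 0xfffffffe:
 *
 *     target_ulong off = (int32_t)0xfffffffe;   // sign-extends to -2
 *     addr = base + (off << 2);                 // 0x1000 - 8 = 0xff8
 *
 * whereas a _zsu form would treat the same bits as 4294967294 and compute
 * a very different address.
 */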
4494 DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4495 DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4496 DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4497 DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4498 DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4500 DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4501 DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4502 DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4503 DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4504 DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4506 DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4507 DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4508 DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4509 DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4510 DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4511 DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4512 DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4514 DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4515 DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4516 DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4517 DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4518 DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4519 DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4520 DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4522 DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4523 DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4524 DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4525 DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4526 DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4527 DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4528 DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4530 /* First fault loads with a vector index. */
4532 #ifdef CONFIG_USER_ONLY
4534 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4535 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4536 target_ulong base, uint32_t desc) \
4538 intptr_t i, oprsz = simd_oprsz(desc); \
4539 unsigned scale = simd_data(desc); \
4540 uintptr_t ra = GETPC(); \
4541 bool first = true; \
4543 for (i = 0; i < oprsz; ) { \
4544 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4548 target_ulong off = *(TYPEI *)(vm + H(i)); \
4549 target_ulong addr = base + (off << scale); \
4551 page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
4552 record_fault(env, i, oprsz); \
4555 m = FN(env, addr, ra); \
4558 *(TYPEE *)(vd + H(i)) = m; \
4559 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4568 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4569 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4570 target_ulong base, uint32_t desc) \
4572 g_assert_not_reached(); \
4577 #define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4578 DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
4579 #define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4580 DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
4582 DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4583 DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4584 DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4585 DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4586 DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4588 DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4589 DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4590 DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4591 DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4592 DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4594 DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4595 DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4596 DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4597 DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4598 DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4599 DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4600 DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4602 DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4603 DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4604 DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4605 DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4606 DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4607 DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4608 DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4610 DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4611 DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4612 DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4613 DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4614 DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4615 DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4616 DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4618 /* Stores with a vector index. */
4620 #define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
4621 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4622 target_ulong base, uint32_t desc) \
4624 intptr_t i, oprsz = simd_oprsz(desc); \
4625 unsigned scale = simd_data(desc); \
4626 uintptr_t ra = GETPC(); \
4627 for (i = 0; i < oprsz; ) { \
4628 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4630 if (likely(pg & 1)) { \
4631 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4632 uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
4633 FN(env, base + (off << scale), d, ra); \
4635 i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
4640 #define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
4641 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4642 target_ulong base, uint32_t desc) \
4644 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4645 unsigned scale = simd_data(desc); \
4646 uintptr_t ra = GETPC(); \
4647 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4648 for (i = 0; i < oprsz; i++) { \
4649 if (likely(pg[H1(i)] & 1)) { \
4650 target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
4651 FN(env, base + off, d[i], ra); \
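/* Illustrative note: the scatter forms reuse the gather addressing above;
 * on the data side the whole 64-bit (or 32-bit) element is handed to the
 * store function, which truncates as needed.  E.g. for sve_stbd_zd the
 * active-element step is essentially:
 *
 *     target_ulong off = (target_ulong)(uint64_t)m[i] << scale;
 *     cpu_stb_data_ra(env, base + off, d[i], ra);   // low byte of d[i]
 *
 * Nothing is written for inactive elements, and FFR is not involved:
 * SVE has no first-fault store forms.
 */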
4656 DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
4657 DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
4658 DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
4660 DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
4661 DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
4662 DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
4664 DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
4665 DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
4666 DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
4667 DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
4669 DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
4670 DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
4671 DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
4672 DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
4674 DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
4675 DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
4676 DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
4677 DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)