4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
31 /* Note that vector data is stored in host-endian 64-bit chunks,
32 so addressing units smaller than that need a host-endian fixup. */
33 #ifdef HOST_WORDS_BIGENDIAN
34 #define H1(x) ((x) ^ 7)
35 #define H1_2(x) ((x) ^ 6)
36 #define H1_4(x) ((x) ^ 4)
37 #define H2(x) ((x) ^ 3)
38 #define H4(x) ((x) ^ 1)
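/* E.g. on a big-endian host the byte for element 0 of a 64-bit chunk lives
 * at host offset 7, so H1(0) == 7, and the 16-bit element at logical byte
 * offset 2 is loaded from host offset H1_2(2) == 4. On little-endian hosts
 * the corresponding #else definitions (not shown here) are the identity.
 */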
47 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
50 * and bit 0 set if C is set. Compare the definitions of these variables
51 * within CPUARMState.
52 */
54 /* For no G bits set, NZCV = C. */
55 #define PREDTEST_INIT 1
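/* E.g. with a Pg of all-true byte predicates and Pd == Pg, the first and
 * last active elements are both set, so the result has N set (bit 31),
 * Z clear (bit 1 set) and C clear (bit 0 clear). With Pd == 0, N is clear,
 * Z is set (bit 1 clear) and C is set (bit 0 set).
 */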
57 /* This is an iterative function, called for each Pd and Pg word
58 * moving forward.
59 */
60 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
63 /* Compute N from first D & G.
64 Use bit 2 to signal first G bit seen. */
66 flags |= ((d & (g & -g)) != 0) << 31;
70 /* Accumulate Z from each D & G. */
71 flags |= ((d & g) != 0) << 1;
73 /* Compute C from last !(D & G). Replace previous. */
74 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
79 /* This is an iterative function, called for each Pd and Pg word
80 * moving backward.
81 */
82 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
85 /* Compute C from first (i.e last) !(D & G).
86 Use bit 2 to signal first G bit seen. */
88 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
89 flags |= (d & pow2floor(g)) == 0;
92 /* Accumulate Z from each D & G. */
93 flags |= ((d & g) != 0) << 1;
95 /* Compute N from last (i.e first) D & G. Replace previous. */
96 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
101 /* The same for a single word predicate. */
102 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
104 return iter_predtest_fwd(d, g, PREDTEST_INIT);
107 /* The same for a multi-word predicate. */
108 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
110 uint32_t flags = PREDTEST_INIT;
111 uint64_t *d = vd, *g = vg;
115 flags = iter_predtest_fwd(d[i], g[i], flags);
116 } while (++i < words);
121 /* Expand active predicate bits to bytes, for byte elements.
122 * for (i = 0; i < 256; ++i) {
123 * unsigned long m = 0;
124 * for (j = 0; j < 8; j++) {
125 * if ((i >> j) & 1) {
126 * m |= 0xfful << (j << 3);
129 * printf("0x%016lx,\n", m);
132 static inline uint64_t expand_pred_b(uint8_t byte)
134 static const uint64_t word[256] = {
135 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
136 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
137 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
138 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
139 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
140 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
141 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
142 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
143 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
144 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
145 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
146 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
147 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
148 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
149 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
150 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
151 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
152 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
153 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
154 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
155 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
156 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
157 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
158 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
159 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
160 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
161 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
162 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
163 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
164 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
165 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
166 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
167 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
168 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
169 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
170 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
171 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
172 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
173 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
174 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
175 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
176 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
177 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
178 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
179 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
180 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
181 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
182 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
183 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
184 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
185 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
186 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
187 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
188 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
189 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
190 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
191 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
192 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
193 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
194 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
195 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
196 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
197 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
198 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
199 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
200 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
201 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
202 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
203 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
204 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
205 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
206 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
207 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
208 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
209 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
210 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
211 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
212 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
213 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
214 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
215 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
216 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
217 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
218 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
219 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
225 /* Similarly for half-word elements.
226 * for (i = 0; i < 256; ++i) {
227 * unsigned long m = 0;
231 * for (j = 0; j < 8; j += 2) {
232 * if ((i >> j) & 1) {
233 * m |= 0xfffful << (j << 3);
236 * printf("[0x%x] = 0x%016lx,\n", i, m);
239 static inline uint64_t expand_pred_h(uint8_t byte)
241 static const uint64_t word[] = {
242 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
243 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
244 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
245 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
246 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
247 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
248 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
249 [0x55] = 0xffffffffffffffff,
251 return word[byte & 0x55];
254 /* Similarly for single word elements. */
255 static inline uint64_t expand_pred_s(uint8_t byte)
257 static const uint64_t word[] = {
258 [0x01] = 0x00000000ffffffffull,
259 [0x10] = 0xffffffff00000000ull,
260 [0x11] = 0xffffffffffffffffull,
262 return word[byte & 0x11];
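/* E.g. expand_pred_b(0x05) == 0x0000000000ff00ff. For wider elements only
 * the lowest bit of each element's group of predicate bits is significant,
 * hence the "& 0x55" and "& 0x11" masks above: expand_pred_h(0x05) ==
 * 0x00000000ffffffff and expand_pred_s(0x11) == 0xffffffffffffffff.
 */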
265 /* Swap 16-bit words within a 32-bit word. */
266 static inline uint32_t hswap32(uint32_t h)
271 /* Swap 16-bit words within a 64-bit word. */
272 static inline uint64_t hswap64(uint64_t h)
274 uint64_t m = 0x0000ffff0000ffffull;
275 h = rol64(h, 32);
276 return ((h & m) << 16) | ((h >> 16) & m);
279 /* Swap 32-bit words within a 64-bit word. */
280 static inline uint64_t wswap64(uint64_t h)
285 #define LOGICAL_PPPP(NAME, FUNC) \
286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
288 uintptr_t opr_sz = simd_oprsz(desc); \
289 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
291 for (i = 0; i < opr_sz / 8; ++i) { \
292 d[i] = FUNC(n[i], m[i], g[i]); \
296 #define DO_AND(N, M, G) (((N) & (M)) & (G))
297 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
298 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
299 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
300 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
301 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
302 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
303 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
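/* E.g. DO_SEL takes bits of N where the governing predicate G is true and
 * bits of M elsewhere: DO_SEL(0xff, 0x33, 0x0f) == 0x3f.
 */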
305 LOGICAL_PPPP(sve_and_pppp, DO_AND)
306 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
307 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
308 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
309 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
310 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
311 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
312 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
324 /* Fully general three-operand expander, controlled by a predicate.
325 * This is complicated by the host-endian storage of the register file.
327 /* ??? I don't expect the compiler could ever vectorize this itself.
328 * With some tables we can convert bit masks to byte masks, and with
329 * extra care wrt byte/word ordering we could use gcc generic vectors
330 * and do 16 bytes at a time.
332 #define DO_ZPZZ(NAME, TYPE, H, OP) \
333 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
335 intptr_t i, opr_sz = simd_oprsz(desc); \
336 for (i = 0; i < opr_sz; ) { \
337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
340 TYPE nn = *(TYPE *)(vn + H(i)); \
341 TYPE mm = *(TYPE *)(vm + H(i)); \
342 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
349 /* Similarly, specialized for 64-bit operands. */
350 #define DO_ZPZZ_D(NAME, TYPE, OP) \
351 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
354 TYPE *d = vd, *n = vn, *m = vm; \
356 for (i = 0; i < opr_sz; i += 1) { \
357 if (pg[H1(i)] & 1) { \
358 TYPE nn = n[i], mm = m[i]; \
364 #define DO_AND(N, M) (N & M)
365 #define DO_EOR(N, M) (N ^ M)
366 #define DO_ORR(N, M) (N | M)
367 #define DO_BIC(N, M) (N & ~M)
368 #define DO_ADD(N, M) (N + M)
369 #define DO_SUB(N, M) (N - M)
370 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
371 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
372 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
373 #define DO_MUL(N, M) (N * M)
377 * We must avoid the C undefined behaviour cases: division by
378 * zero and signed division of INT_MIN by -1. Both of these
379 * have architecturally defined required results for Arm.
380 * We special case all signed divisions by -1 to avoid having
381 * to deduce the minimum integer for the type involved.
383 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
384 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
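/* E.g. any division by zero yields 0, and DO_SDIV(INT64_MIN, -1) yields
 * INT64_MIN, which are the results the architecture requires.
 */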
386 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
387 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
389 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
391 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
392 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
394 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
396 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
397 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
399 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
401 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
402 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
404 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
406 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
407 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
409 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
411 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
412 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
414 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
416 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
417 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
419 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
424 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
426 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
427 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
429 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
434 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
436 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
437 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
439 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
444 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
446 /* Because the computation type is at least twice as large as required,
447 these work for both signed and unsigned source types. */
448 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
453 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
455 return (n * m) >> 16;
458 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
460 return (n * m) >> 32;
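/* E.g. for byte elements, do_mulh_b(0xff, 0xff) == 0xfe (unsigned view:
 * 0xff * 0xff == 0xfe01), while do_mulh_b(-1, -1) == 0 (signed view),
 * since the 32-bit intermediate product is exact in both cases.
 */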
463 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
466 muls64(&lo, &hi, n, m);
470 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
473 mulu64(&lo, &hi, n, m);
477 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
478 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
480 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
482 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
483 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
484 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
485 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
487 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
488 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
489 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
490 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
492 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
493 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
495 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
496 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
498 /* Note that all bits of the shift are significant
499 and not modulo the element size. */
500 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
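/* E.g. for byte elements DO_LSR(nn, 8) is 0 rather than nn, since the
 * shift is not taken modulo the element size, while DO_ASR clamps the
 * shift to 7 so that the sign is replicated: DO_ASR(-1, 100) == -1.
 */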
504 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
505 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
506 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
508 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
509 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
510 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
512 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
513 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
514 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
516 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
523 /* Three-operand expander, controlled by a predicate, in which the
524 * third operand is "wide". That is, for D = N op M, the same 64-bit
525 * value of M is used with all of the narrower values of N.
527 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
530 intptr_t i, opr_sz = simd_oprsz(desc); \
531 for (i = 0; i < opr_sz; ) { \
532 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
533 TYPEW mm = *(TYPEW *)(vm + i); \
536 TYPE nn = *(TYPE *)(vn + H(i)); \
537 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
539 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
544 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
545 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
546 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
548 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
549 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
550 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
552 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
553 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
554 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
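/* There is no _d form: the wide-element shifts are only defined for byte,
 * halfword and word elements, since the shift operand is already 64 bits.
 */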
558 /* Fully general two-operand expander, controlled by a predicate.
560 #define DO_ZPZ(NAME, TYPE, H, OP) \
561 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
563 intptr_t i, opr_sz = simd_oprsz(desc); \
564 for (i = 0; i < opr_sz; ) { \
565 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
568 TYPE nn = *(TYPE *)(vn + H(i)); \
569 *(TYPE *)(vd + H(i)) = OP(nn); \
571 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
576 /* Similarly, specialized for 64-bit operands. */
577 #define DO_ZPZ_D(NAME, TYPE, OP) \
578 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
581 TYPE *d = vd, *n = vn; \
583 for (i = 0; i < opr_sz; i += 1) { \
584 if (pg[H1(i)] & 1) { \
591 #define DO_CLS_B(N) (clrsb32(N) - 24)
592 #define DO_CLS_H(N) (clrsb32(N) - 16)
594 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
595 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
596 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
597 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
599 #define DO_CLZ_B(N) (clz32(N) - 24)
600 #define DO_CLZ_H(N) (clz32(N) - 16)
602 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
603 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
604 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
605 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
607 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
608 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
609 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
610 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
612 #define DO_CNOT(N) (N == 0)
614 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
615 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
616 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
617 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
619 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
621 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
622 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
623 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
625 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
627 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
628 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
629 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
631 #define DO_NOT(N) (~N)
633 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
634 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
635 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
636 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
638 #define DO_SXTB(N) ((int8_t)N)
639 #define DO_SXTH(N) ((int16_t)N)
640 #define DO_SXTS(N) ((int32_t)N)
641 #define DO_UXTB(N) ((uint8_t)N)
642 #define DO_UXTH(N) ((uint16_t)N)
643 #define DO_UXTS(N) ((uint32_t)N)
645 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
646 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
647 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
648 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
649 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
650 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
652 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
653 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
654 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
655 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
656 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
657 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
659 #define DO_ABS(N) (N < 0 ? -N : N)
661 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
662 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
663 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
664 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
666 #define DO_NEG(N) (-N)
668 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
669 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
670 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
671 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
673 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
674 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
675 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
677 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
678 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
680 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
682 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
683 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
684 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
685 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
687 /* Three-operand expander, unpredicated, in which the third operand is "wide".
689 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
692 intptr_t i, opr_sz = simd_oprsz(desc); \
693 for (i = 0; i < opr_sz; ) { \
694 TYPEW mm = *(TYPEW *)(vm + i); \
696 TYPE nn = *(TYPE *)(vn + H(i)); \
697 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
703 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
704 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
705 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
707 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
708 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
709 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
711 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
712 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
713 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
729 /* Two-operand reduction expander, controlled by a predicate.
730 * The difference between TYPERED and TYPERET has to do with
731 * sign-extension. E.g. for SMAX, TYPERED must be signed,
732 * but TYPERET must be unsigned so that e.g. a 32-bit value
733 * is not sign-extended to the ABI uint64_t return type.
735 /* ??? If we were to vectorize this by hand the reduction ordering
736 * would change. For integer operands, this is perfectly fine.
738 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
739 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
741 intptr_t i, opr_sz = simd_oprsz(desc); \
742 TYPERED ret = INIT; \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
747 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
750 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
753 return (TYPERET)ret; \
756 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
757 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
759 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
763 for (i = 0; i < opr_sz; i += 1) { \
764 if (pg[H1(i)] & 1) { \
772 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
773 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
774 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
775 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
777 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
778 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
779 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
780 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
782 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
783 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
784 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
785 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
787 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
788 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
789 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
792 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
793 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
794 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
796 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
797 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
798 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
799 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
801 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
802 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
803 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
804 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
806 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
807 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
808 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
809 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
811 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
812 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
813 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
814 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
819 /* Two vector operand, one scalar operand, unpredicated. */
820 #define DO_ZZI(NAME, TYPE, OP) \
821 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
823 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
824 TYPE s = s64, *d = vd, *n = vn; \
825 for (i = 0; i < opr_sz; ++i) { \
826 d[i] = OP(n[i], s); \
830 #define DO_SUBR(X, Y) (Y - X)
832 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
833 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
834 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
835 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
837 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
838 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
839 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
840 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
842 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
843 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
844 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
845 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
847 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
848 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
849 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
850 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
852 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
853 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
854 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
855 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
875 /* Similar to the ARM LastActiveElement pseudocode function, except the
876 result is multiplied by the element size. This includes the not found
877 indication; e.g. not found for esz=3 is -8. */
878 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
880 uint64_t mask = pred_esz_masks[esz];
884 uint64_t this_g = g[--i] & mask;
886 return i * 64 + (63 - clz64(this_g));
889 return (intptr_t)-1 << esz;
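/* E.g. with esz == 2 (word elements) and only bit 4 of the predicate set,
 * the result is 4, i.e. element index 1 times 4 bytes; with no active
 * bits set the result is -4.
 */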
892 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
894 uint32_t flags = PREDTEST_INIT;
895 uint64_t *d = vd, *g = vg;
899 uint64_t this_d = d[i];
900 uint64_t this_g = g[i];
904 /* Set in D the first bit of G. */
905 this_d |= this_g & -this_g;
908 flags = iter_predtest_fwd(this_d, this_g, flags);
910 } while (++i < words);
915 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
917 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
918 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
919 uint32_t flags = PREDTEST_INIT;
920 uint64_t *d = vd, *g = vg, esz_mask;
923 next = last_active_element(vd, words, esz) + (1 << esz);
924 esz_mask = pred_esz_masks[esz];
926 /* Similar to the pseudocode for pnext, but scaled by ESZ
927 so that we find the correct bit. */
928 if (next < words * 64) {
932 mask = ~((1ull << (next & 63)) - 1);
936 uint64_t this_g = g[next / 64] & esz_mask & mask;
938 next = (next & -64) + ctz64(this_g);
943 } while (next < words * 64);
949 if (i == next / 64) {
950 this_d = 1ull << (next & 63);
953 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
954 } while (++i < words);
959 /* Store zero into every active element of Zd. We will use this for two
960 * and three-operand predicated instructions for which logic dictates a
961 * zero result. In particular, logical shift by element size, which is
962 * otherwise undefined on the host.
964 * For element sizes smaller than uint64_t, we use tables to expand
965 * the N bits of the controlling predicate to a byte mask, and clear
966 * those bytes.
967 */
968 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
970 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
973 for (i = 0; i < opr_sz; i += 1) {
974 d[i] &= ~expand_pred_b(pg[H1(i)]);
978 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
980 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
983 for (i = 0; i < opr_sz; i += 1) {
984 d[i] &= ~expand_pred_h(pg[H1(i)]);
988 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
990 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
993 for (i = 0; i < opr_sz; i += 1) {
994 d[i] &= ~expand_pred_s(pg[H1(i)]);
998 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
1000 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1003 for (i = 0; i < opr_sz; i += 1) {
1004 if (pg[H1(i)] & 1) {
1010 /* Copy Zn into Zd, and store zero into inactive elements. */
1011 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1013 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1014 uint64_t *d = vd, *n = vn;
1016 for (i = 0; i < opr_sz; i += 1) {
1017 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1021 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1023 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1024 uint64_t *d = vd, *n = vn;
1026 for (i = 0; i < opr_sz; i += 1) {
1027 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1031 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1033 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1034 uint64_t *d = vd, *n = vn;
1036 for (i = 0; i < opr_sz; i += 1) {
1037 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1041 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1043 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1044 uint64_t *d = vd, *n = vn;
1046 for (i = 0; i < opr_sz; i += 1) {
1047 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1051 /* Three-operand expander, immediate operand, controlled by a predicate.
1053 #define DO_ZPZI(NAME, TYPE, H, OP) \
1054 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1056 intptr_t i, opr_sz = simd_oprsz(desc); \
1057 TYPE imm = simd_data(desc); \
1058 for (i = 0; i < opr_sz; ) { \
1059 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1062 TYPE nn = *(TYPE *)(vn + H(i)); \
1063 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1065 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1070 /* Similarly, specialized for 64-bit operands. */
1071 #define DO_ZPZI_D(NAME, TYPE, OP) \
1072 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1074 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1075 TYPE *d = vd, *n = vn; \
1076 TYPE imm = simd_data(desc); \
1078 for (i = 0; i < opr_sz; i += 1) { \
1079 if (pg[H1(i)] & 1) { \
1081 d[i] = OP(nn, imm); \
1086 #define DO_SHR(N, M) (N >> M)
1087 #define DO_SHL(N, M) (N << M)
1089 /* Arithmetic shift right for division. This rounds negative numbers
1090 toward zero as per signed division. Therefore before shifting,
1091 when N is negative, add 2**M-1. */
1092 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
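/* E.g. DO_ASRD(-7, 2) == -1 and DO_ASRD(7, 2) == 1, i.e. -7/4 and 7/4
 * rounded toward zero.
 */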
1094 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1095 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1096 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1097 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1099 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1100 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1101 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1102 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1104 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1105 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1106 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1107 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1109 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1110 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1111 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1112 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1120 /* Fully general four-operand expander, controlled by a predicate.
1122 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1123 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1124 void *vg, uint32_t desc) \
1126 intptr_t i, opr_sz = simd_oprsz(desc); \
1127 for (i = 0; i < opr_sz; ) { \
1128 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1131 TYPE nn = *(TYPE *)(vn + H(i)); \
1132 TYPE mm = *(TYPE *)(vm + H(i)); \
1133 TYPE aa = *(TYPE *)(va + H(i)); \
1134 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1136 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1141 /* Similarly, specialized for 64-bit operands. */
1142 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1143 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1144 void *vg, uint32_t desc) \
1146 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1147 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1149 for (i = 0; i < opr_sz; i += 1) { \
1150 if (pg[H1(i)] & 1) { \
1151 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1152 d[i] = OP(aa, nn, mm); \
1157 #define DO_MLA(A, N, M) (A + N * M)
1158 #define DO_MLS(A, N, M) (A - N * M)
1160 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1161 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1163 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1164 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1166 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1167 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1169 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1170 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1177 void HELPER(sve_index_b)(void *vd, uint32_t start,
1178 uint32_t incr, uint32_t desc)
1180 intptr_t i, opr_sz = simd_oprsz(desc);
1182 for (i = 0; i < opr_sz; i += 1) {
1183 d[H1(i)] = start + i * incr;
1187 void HELPER(sve_index_h)(void *vd, uint32_t start,
1188 uint32_t incr, uint32_t desc)
1190 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1192 for (i = 0; i < opr_sz; i += 1) {
1193 d[H2(i)] = start + i * incr;
1197 void HELPER(sve_index_s)(void *vd, uint32_t start,
1198 uint32_t incr, uint32_t desc)
1200 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1202 for (i = 0; i < opr_sz; i += 1) {
1203 d[H4(i)] = start + i * incr;
1207 void HELPER(sve_index_d)(void *vd, uint64_t start,
1208 uint64_t incr, uint32_t desc)
1210 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1212 for (i = 0; i < opr_sz; i += 1) {
1213 d[i] = start + i * incr;
1217 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1219 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1220 uint32_t sh = simd_data(desc);
1221 uint32_t *d = vd, *n = vn, *m = vm;
1222 for (i = 0; i < opr_sz; i += 1) {
1223 d[i] = n[i] + (m[i] << sh);
1227 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1229 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1230 uint64_t sh = simd_data(desc);
1231 uint64_t *d = vd, *n = vn, *m = vm;
1232 for (i = 0; i < opr_sz; i += 1) {
1233 d[i] = n[i] + (m[i] << sh);
1237 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1239 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1240 uint64_t sh = simd_data(desc);
1241 uint64_t *d = vd, *n = vn, *m = vm;
1242 for (i = 0; i < opr_sz; i += 1) {
1243 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1247 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1249 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1250 uint64_t sh = simd_data(desc);
1251 uint64_t *d = vd, *n = vn, *m = vm;
1252 for (i = 0; i < opr_sz; i += 1) {
1253 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
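/* The four ADR helpers differ only in how the index is treated: _p32 and
 * _p64 add same-sized indices, while _s32 and _u32 sign- or zero-extend
 * the low 32 bits of each 64-bit index before applying the shift.
 */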
1257 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1259 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1260 static const uint16_t coeff[] = {
1261 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1262 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1263 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1264 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1266 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1267 uint16_t *d = vd, *n = vn;
1269 for (i = 0; i < opr_sz; i++) {
1271 intptr_t idx = extract32(nn, 0, 5);
1272 uint16_t exp = extract32(nn, 5, 5);
1273 d[i] = coeff[idx] | (exp << 10);
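/* The coeff[] table above holds the fraction field of 2^(idx/32) for
 * float16, e.g. coeff[16] == 0x1a8, the 10 fraction bits of sqrt(2);
 * the exponent extracted from the source is placed directly above it.
 */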
1277 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1279 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1280 static const uint32_t coeff[] = {
1281 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1282 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1283 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1284 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1285 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1286 0x1ef532, 0x20b051, 0x227043, 0x243516,
1287 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1288 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1289 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1290 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1291 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1292 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1293 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1294 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1295 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1296 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1298 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1299 uint32_t *d = vd, *n = vn;
1301 for (i = 0; i < opr_sz; i++) {
1303 intptr_t idx = extract32(nn, 0, 6);
1304 uint32_t exp = extract32(nn, 6, 8);
1305 d[i] = coeff[idx] | (exp << 23);
1309 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1311 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1312 static const uint64_t coeff[] = {
1313 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1314 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1315 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1316 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1317 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1318 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1319 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1320 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1321 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1322 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1323 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1324 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1325 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1326 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1327 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1328 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1329 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1330 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1331 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1332 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1333 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1336 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1337 uint64_t *d = vd, *n = vn;
1339 for (i = 0; i < opr_sz; i++) {
1341 intptr_t idx = extract32(nn, 0, 6);
1342 uint64_t exp = extract32(nn, 6, 11);
1343 d[i] = coeff[idx] | (exp << 52);
1347 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1349 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1350 uint16_t *d = vd, *n = vn, *m = vm;
1351 for (i = 0; i < opr_sz; i += 1) {
1357 d[i] = nn ^ (mm & 2) << 14;
1361 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1363 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1364 uint32_t *d = vd, *n = vn, *m = vm;
1365 for (i = 0; i < opr_sz; i += 1) {
1371 d[i] = nn ^ (mm & 2) << 30;
1375 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1377 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1378 uint64_t *d = vd, *n = vn, *m = vm;
1379 for (i = 0; i < opr_sz; i += 1) {
1385 d[i] = nn ^ (mm & 2) << 62;
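/* FTSSEL: bit 0 of the control element selects the constant 1.0 in place
 * of the source element, and bit 1 (moved into the sign bit above)
 * negates the result.
 */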
1390 * Signed saturating addition with scalar operand.
1393 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1395 intptr_t i, oprsz = simd_oprsz(desc);
1397 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1398 int r = *(int8_t *)(a + i) + b;
1401 } else if (r < INT8_MIN) {
1404 *(int8_t *)(d + i) = r;
1408 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1410 intptr_t i, oprsz = simd_oprsz(desc);
1412 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1413 int r = *(int16_t *)(a + i) + b;
1414 if (r > INT16_MAX) {
1416 } else if (r < INT16_MIN) {
1419 *(int16_t *)(d + i) = r;
1423 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1425 intptr_t i, oprsz = simd_oprsz(desc);
1427 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1428 int64_t r = *(int32_t *)(a + i) + b;
1429 if (r > INT32_MAX) {
1431 } else if (r < INT32_MIN) {
1434 *(int32_t *)(d + i) = r;
1438 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1440 intptr_t i, oprsz = simd_oprsz(desc);
1442 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1443 int64_t ai = *(int64_t *)(a + i);
1445 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1446 /* Signed overflow. */
1447 r = (r < 0 ? INT64_MAX : INT64_MIN);
1449 *(int64_t *)(d + i) = r;
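/* In the 64-bit case above there is no wider type, so overflow is instead
 * detected from the signs: it occurred iff the addends have the same sign
 * and the sum's sign differs, i.e. ((r ^ ai) & ~(ai ^ b)) < 0.
 */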
1454 * Unsigned saturating addition with scalar operand.
1457 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1459 intptr_t i, oprsz = simd_oprsz(desc);
1461 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1462 int r = *(uint8_t *)(a + i) + b;
1463 if (r > UINT8_MAX) {
1468 *(uint8_t *)(d + i) = r;
1472 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1474 intptr_t i, oprsz = simd_oprsz(desc);
1476 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1477 int r = *(uint16_t *)(a + i) + b;
1478 if (r > UINT16_MAX) {
1483 *(uint16_t *)(d + i) = r;
1487 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1489 intptr_t i, oprsz = simd_oprsz(desc);
1491 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1492 int64_t r = *(uint32_t *)(a + i) + b;
1493 if (r > UINT32_MAX) {
1498 *(uint32_t *)(d + i) = r;
1502 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1504 intptr_t i, oprsz = simd_oprsz(desc);
1506 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1507 uint64_t r = *(uint64_t *)(a + i) + b;
1511 *(uint64_t *)(d + i) = r;
1515 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1517 intptr_t i, oprsz = simd_oprsz(desc);
1519 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1520 uint64_t ai = *(uint64_t *)(a + i);
1521 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1525 /* Two operand predicated copy immediate with merge. All valid immediates
1526 * can fit within 17 signed bits in the simd_data field.
1528 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1529 uint64_t mm, uint32_t desc)
1531 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1532 uint64_t *d = vd, *n = vn;
1535 mm = dup_const(MO_8, mm);
1536 for (i = 0; i < opr_sz; i += 1) {
1538 uint64_t pp = expand_pred_b(pg[H1(i)]);
1539 d[i] = (mm & pp) | (nn & ~pp);
1543 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1544 uint64_t mm, uint32_t desc)
1546 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1547 uint64_t *d = vd, *n = vn;
1550 mm = dup_const(MO_16, mm);
1551 for (i = 0; i < opr_sz; i += 1) {
1553 uint64_t pp = expand_pred_h(pg[H1(i)]);
1554 d[i] = (mm & pp) | (nn & ~pp);
1558 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1559 uint64_t mm, uint32_t desc)
1561 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1562 uint64_t *d = vd, *n = vn;
1565 mm = dup_const(MO_32, mm);
1566 for (i = 0; i < opr_sz; i += 1) {
1568 uint64_t pp = expand_pred_s(pg[H1(i)]);
1569 d[i] = (mm & pp) | (nn & ~pp);
1573 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1574 uint64_t mm, uint32_t desc)
1576 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1577 uint64_t *d = vd, *n = vn;
1580 for (i = 0; i < opr_sz; i += 1) {
1582 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1586 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1588 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1592 val = dup_const(MO_8, val);
1593 for (i = 0; i < opr_sz; i += 1) {
1594 d[i] = val & expand_pred_b(pg[H1(i)]);
1598 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1600 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1604 val = dup_const(MO_16, val);
1605 for (i = 0; i < opr_sz; i += 1) {
1606 d[i] = val & expand_pred_h(pg[H1(i)]);
1610 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1612 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1616 val = dup_const(MO_32, val);
1617 for (i = 0; i < opr_sz; i += 1) {
1618 d[i] = val & expand_pred_s(pg[H1(i)]);
1622 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1624 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1628 for (i = 0; i < opr_sz; i += 1) {
1629 d[i] = (pg[H1(i)] & 1 ? val : 0);
1633 /* Big-endian hosts need to frob the byte indices. If the copy
1634 * happens to be 8-byte aligned, then no frobbing is necessary.
1635 */
1636 static void swap_memmove(void *vd, void *vs, size_t n)
1638 uintptr_t d = (uintptr_t)vd;
1639 uintptr_t s = (uintptr_t)vs;
1640 uintptr_t o = (d | s | n) & 7;
1643 #ifndef HOST_WORDS_BIGENDIAN
1652 if (d < s || d >= s + n) {
1653 for (i = 0; i < n; i += 4) {
1654 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1657 for (i = n; i > 0; ) {
1659 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1666 if (d < s || d >= s + n) {
1667 for (i = 0; i < n; i += 2) {
1668 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1671 for (i = n; i > 0; ) {
1673 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1679 if (d < s || d >= s + n) {
1680 for (i = 0; i < n; i++) {
1681 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1684 for (i = n; i > 0; ) {
1686 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1693 /* Similarly for memset of 0. */
1694 static void swap_memzero(void *vd, size_t n)
1696 uintptr_t d = (uintptr_t)vd;
1697 uintptr_t o = (d | n) & 7;
1700 /* Usually, the first bit of a predicate is set, so N is 0. */
1701 if (likely(n == 0)) {
1705 #ifndef HOST_WORDS_BIGENDIAN
1714 for (i = 0; i < n; i += 4) {
1715 *(uint32_t *)H1_4(d + i) = 0;
1721 for (i = 0; i < n; i += 2) {
1722 *(uint16_t *)H1_2(d + i) = 0;
1727 for (i = 0; i < n; i++) {
1728 *(uint8_t *)H1(d + i) = 0;
1734 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1736 intptr_t opr_sz = simd_oprsz(desc);
1737 size_t n_ofs = simd_data(desc);
1738 size_t n_siz = opr_sz - n_ofs;
1741 swap_memmove(vd, vn + n_ofs, n_siz);
1742 swap_memmove(vd + n_siz, vm, n_ofs);
1743 } else if (vd != vn) {
1744 swap_memmove(vd + n_siz, vd, n_ofs);
1745 swap_memmove(vd, vn + n_ofs, n_siz);
1747 /* vd == vn == vm. Need temp space. */
1749 swap_memmove(&tmp, vm, n_ofs);
1750 swap_memmove(vd, vd + n_ofs, n_siz);
1751 memcpy(vd + n_siz, &tmp, n_ofs);
1755 #define DO_INSR(NAME, TYPE, H) \
1756 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1758 intptr_t opr_sz = simd_oprsz(desc); \
1759 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1760 *(TYPE *)(vd + H(0)) = val; \
1763 DO_INSR(sve_insr_b, uint8_t, H1)
1764 DO_INSR(sve_insr_h, uint16_t, H1_2)
1765 DO_INSR(sve_insr_s, uint32_t, H1_4)
1766 DO_INSR(sve_insr_d, uint64_t, )
1770 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1772 intptr_t i, j, opr_sz = simd_oprsz(desc);
1773 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1774 uint64_t f = *(uint64_t *)(vn + i);
1775 uint64_t b = *(uint64_t *)(vn + j);
1776 *(uint64_t *)(vd + i) = bswap64(b);
1777 *(uint64_t *)(vd + j) = bswap64(f);
1781 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1783 intptr_t i, j, opr_sz = simd_oprsz(desc);
1784 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1785 uint64_t f = *(uint64_t *)(vn + i);
1786 uint64_t b = *(uint64_t *)(vn + j);
1787 *(uint64_t *)(vd + i) = hswap64(b);
1788 *(uint64_t *)(vd + j) = hswap64(f);
1792 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1794 intptr_t i, j, opr_sz = simd_oprsz(desc);
1795 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1796 uint64_t f = *(uint64_t *)(vn + i);
1797 uint64_t b = *(uint64_t *)(vn + j);
1798 *(uint64_t *)(vd + i) = rol64(b, 32);
1799 *(uint64_t *)(vd + j) = rol64(f, 32);
1803 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1805 intptr_t i, j, opr_sz = simd_oprsz(desc);
1806 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1807 uint64_t f = *(uint64_t *)(vn + i);
1808 uint64_t b = *(uint64_t *)(vn + j);
1809 *(uint64_t *)(vd + i) = b;
1810 *(uint64_t *)(vd + j) = f;
1814 #define DO_TBL(NAME, TYPE, H) \
1815 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1817 intptr_t i, opr_sz = simd_oprsz(desc); \
1818 uintptr_t elem = opr_sz / sizeof(TYPE); \
1819 TYPE *d = vd, *n = vn, *m = vm; \
1821 if (unlikely(vd == vn)) { \
1822 n = memcpy(&tmp, vn, opr_sz); \
1824 for (i = 0; i < elem; i++) { \
1826 d[H(i)] = j < elem ? n[H(j)] : 0; \
1830 DO_TBL(sve_tbl_b, uint8_t, H1)
1831 DO_TBL(sve_tbl_h, uint16_t, H2)
1832 DO_TBL(sve_tbl_s, uint32_t, H4)
1833 DO_TBL(sve_tbl_d, uint64_t, )
1837 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1838 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1840 intptr_t i, opr_sz = simd_oprsz(desc); \
1844 if (unlikely(vn - vd < opr_sz)) { \
1845 n = memcpy(&tmp, n, opr_sz / 2); \
1847 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1848 d[HD(i)] = n[HS(i)]; \
1852 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1853 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1854 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1856 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1857 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1858 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1862 /* Mask of bits included in the even numbered predicates of width esz.
1863 * We also use this for expand_bits/compress_bits, and so extend the
1864 * same pattern out to 16-bit units.
1866 static const uint64_t even_bit_esz_masks[5] = {
1867 0x5555555555555555ull,
1868 0x3333333333333333ull,
1869 0x0f0f0f0f0f0f0f0full,
1870 0x00ff00ff00ff00ffull,
1871 0x0000ffff0000ffffull,
1874 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1875 * For N==0, this corresponds to the operation that in qemu/bitops.h
1876 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1877 * section 7-2 Shuffling Bits.
1879 static uint64_t expand_bits(uint64_t x, int n)
1884 for (i = 4; i >= n; i--) {
1886 x = ((x << sh) | x) & even_bit_esz_masks[i];
1891 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1892 * For N==0, this corresponds to the operation that in qemu/bitops.h
1893 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1894 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1896 static uint64_t compress_bits(uint64_t x, int n)
1900 for (i = n; i <= 4; i++) {
1902 x &= even_bit_esz_masks[i];
1905 return x & 0xffffffffu;
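/* E.g. expand_bits(0xb, 0) == 0x45, spreading each input bit out to the
 * even bit positions, and compress_bits(0x45, 0) == 0xb undoes it.
 */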
1908 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1910 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1911 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1912 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1917 uint64_t nn = *(uint64_t *)vn;
1918 uint64_t mm = *(uint64_t *)vm;
1919 int half = 4 * oprsz;
1921 nn = extract64(nn, high * half, half);
1922 mm = extract64(mm, high * half, half);
1923 nn = expand_bits(nn, esz);
1924 mm = expand_bits(mm, esz);
1925 d[0] = nn + (mm << (1 << esz));
1927 ARMPredicateReg tmp_n, tmp_m;
1929 /* We produce output faster than we consume input.
1930 Therefore we must be mindful of possible overlap. */
1931 if ((vn - vd) < (uintptr_t)oprsz) {
1932 vn = memcpy(&tmp_n, vn, oprsz);
1934 if ((vm - vd) < (uintptr_t)oprsz) {
1935 vm = memcpy(&tmp_m, vm, oprsz);
1941 if ((high & 3) == 0) {
1942 uint32_t *n = vn, *m = vm;
1945 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1946 uint64_t nn = n[H4(high + i)];
1947 uint64_t mm = m[H4(high + i)];
1949 nn = expand_bits(nn, esz);
1950 mm = expand_bits(mm, esz);
1951 d[i] = nn + (mm << (1 << esz));
1954 uint8_t *n = vn, *m = vm;
1957 for (i = 0; i < oprsz / 2; i++) {
1958 uint16_t nn = n[H1(high + i)];
1959 uint16_t mm = m[H1(high + i)];
1961 nn = expand_bits(nn, esz);
1962 mm = expand_bits(mm, esz);
1963 d16[H2(i)] = nn + (mm << (1 << esz));
1969 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1971 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1972 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1973 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1974 uint64_t *d = vd, *n = vn, *m = vm;
1979 l = compress_bits(n[0] >> odd, esz);
1980 h = compress_bits(m[0] >> odd, esz);
1981 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1983 ARMPredicateReg tmp_m;
1984 intptr_t oprsz_16 = oprsz / 16;
1986 if ((vm - vd) < (uintptr_t)oprsz) {
1987 m = memcpy(&tmp_m, vm, oprsz);
1990 for (i = 0; i < oprsz_16; i++) {
1993 l = compress_bits(l >> odd, esz);
1994 h = compress_bits(h >> odd, esz);
1995 d[i] = l + (h << 32);
1998 /* For VL which is not a power of 2, the results from M do not
1999 align nicely with the uint64_t for D. Put the aligned results
2000 from M into TMP_M and then copy it into place afterward. */
2002 d[i] = compress_bits(n[2 * i] >> odd, esz);
2004 for (i = 0; i < oprsz_16; i++) {
2007 l = compress_bits(l >> odd, esz);
2008 h = compress_bits(h >> odd, esz);
2009 tmp_m.p[i] = l + (h << 32);
2011 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2013 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2015 for (i = 0; i < oprsz_16; i++) {
2018 l = compress_bits(l >> odd, esz);
2019 h = compress_bits(h >> odd, esz);
2020 d[oprsz_16 + i] = l + (h << 32);
2026 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2028 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2029 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2030 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2031 uint64_t *d = vd, *n = vn, *m = vm;
2038 mask = even_bit_esz_masks[esz];
2045 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2046 uint64_t nn = (n[i] & mask) >> shr;
2047 uint64_t mm = (m[i] & mask) << shl;
2052 /* Reverse units of 2**N bits. */
2053 static uint64_t reverse_bits_64(uint64_t x, int n)
2058 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2059 uint64_t mask = even_bit_esz_masks[i];
2060 x = ((x & mask) << sh) | ((x >> sh) & mask);
2065 static uint8_t reverse_bits_8(uint8_t x, int n)
2067 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2070 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2071 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2076 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2078 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2079 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2080 intptr_t i, oprsz_2 = oprsz / 2;
2083 uint64_t l = *(uint64_t *)vn;
2084 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2085 *(uint64_t *)vd = l;
2086 } else if ((oprsz & 15) == 0) {
2087 for (i = 0; i < oprsz_2; i += 8) {
2088 intptr_t ih = oprsz - 8 - i;
2089 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2090 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2091 *(uint64_t *)(vd + i) = h;
2092 *(uint64_t *)(vd + ih) = l;
2095 for (i = 0; i < oprsz_2; i += 1) {
2096 intptr_t il = H1(i);
2097 intptr_t ih = H1(oprsz - 1 - i);
2098 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2099 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2100 *(uint8_t *)(vd + il) = h;
2101 *(uint8_t *)(vd + ih) = l;
2106 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2108 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2109 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2114 uint64_t nn = *(uint64_t *)vn;
2115 int half = 4 * oprsz;
2117 nn = extract64(nn, high * half, half);
2118 nn = expand_bits(nn, 0);
2121 ARMPredicateReg tmp_n;
2123 /* We produce output faster than we consume input.
2124 Therefore we must be mindful of possible overlap. */
2125 if ((vn - vd) < (uintptr_t)oprsz) {
2126 vn = memcpy(&tmp_n, vn, oprsz);
2132 if ((high & 3) == 0) {
2136 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2137 uint64_t nn = n[H4(high + i)];
2138 d[i] = expand_bits(nn, 0);
2144 for (i = 0; i < oprsz / 2; i++) {
2145 uint16_t nn = n[H1(high + i)];
2146 d16[H2(i)] = expand_bits(nn, 0);
2152 #define DO_ZIP(NAME, TYPE, H) \
2153 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2155 intptr_t oprsz = simd_oprsz(desc); \
2156 intptr_t i, oprsz_2 = oprsz / 2; \
2157 ARMVectorReg tmp_n, tmp_m; \
2158 /* We produce output faster than we consume input. \
2159 Therefore we must be mindful of possible overlap. */ \
2160 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2161 vn = memcpy(&tmp_n, vn, oprsz_2); \
2163 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2164 vm = memcpy(&tmp_m, vm, oprsz_2); \
2166 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2167 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2168 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2172 DO_ZIP(sve_zip_b, uint8_t, H1)
2173 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2174 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2175 DO_ZIP(sve_zip_d, uint64_t, )
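/*
 * A minimal stand-alone sketch of the ZIP data movement produced by the
 * DO_ZIP expansions above, for byte elements, without the overlap handling
 * or the H() host-endian swizzling.  Purely illustrative; D is assumed not
 * to alias N or M here.
 */
static void zip_bytes_sketch(uint8_t *d, const uint8_t *n, const uint8_t *m,
                             size_t oprsz)
{
    size_t i, half = oprsz / 2;

    for (i = 0; i < half; ++i) {
        d[2 * i + 0] = n[i];   /* even result elements come from N */
        d[2 * i + 1] = m[i];   /* odd result elements come from M */
    }
}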
2177 #define DO_UZP(NAME, TYPE, H) \
2178 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2180 intptr_t oprsz = simd_oprsz(desc); \
2181 intptr_t oprsz_2 = oprsz / 2; \
2182 intptr_t odd_ofs = simd_data(desc); \
2184 ARMVectorReg tmp_m; \
2185 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2186 vm = memcpy(&tmp_m, vm, oprsz); \
2188 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2189 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2191 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2192 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2196 DO_UZP(sve_uzp_b, uint8_t, H1)
2197 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2198 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2199 DO_UZP(sve_uzp_d, uint64_t, )
2201 #define DO_TRN(NAME, TYPE, H) \
2202 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2204 intptr_t oprsz = simd_oprsz(desc); \
2205 intptr_t odd_ofs = simd_data(desc); \
2207 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2208 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2209 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2210 *(TYPE *)(vd + H(i + 0)) = ae; \
2211 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2215 DO_TRN(sve_trn_b, uint8_t, H1)
2216 DO_TRN(sve_trn_h, uint16_t, H1_2)
2217 DO_TRN(sve_trn_s, uint32_t, H1_4)
2218 DO_TRN(sve_trn_d, uint64_t, )
2224 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2226 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2227 uint32_t *d = vd, *n = vn;
2230 for (i = j = 0; i < opr_sz; i++) {
2231 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2232 d[H4(j)] = n[H4(i)];
2236 for (; j < opr_sz; j++) {
2241 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2243 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2244 uint64_t *d = vd, *n = vn;
2247 for (i = j = 0; i < opr_sz; i++) {
2248 if (pg[H1(i)] & 1) {
2253 for (; j < opr_sz; j++) {
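/*
 * Stand-alone sketch of the COMPACT operation implemented above: active
 * elements are packed towards the low end and the remainder is zeroed.
 * The predicate is modelled here as one bool per element rather than the
 * packed ARMPredicateReg layout used by the real helpers; that and the
 * missing H() swizzling are assumptions of the sketch.
 */
static void compact_u32_sketch(uint32_t *d, const uint32_t *n,
                               const bool *active, size_t nelem)
{
    size_t i, j = 0;

    for (i = 0; i < nelem; ++i) {
        if (active[i]) {
            d[j++] = n[i];     /* pack active elements towards index 0 */
        }
    }
    for (; j < nelem; ++j) {
        d[j] = 0;              /* remaining elements are zeroed */
    }
}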
2258 /* Similar to the ARM LastActiveElement pseudocode function, except the
2259 * result is multiplied by the element size. This includes the not found
2260 * indication; e.g. not found for esz=3 is -8.
2262 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2264 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2265 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2267 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2270 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2272 intptr_t opr_sz = simd_oprsz(desc) / 8;
2273 int esz = simd_data(desc);
2274 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2275 intptr_t i, first_i, last_i;
2278 first_i = last_i = 0;
2279 first_g = last_g = 0;
2281 /* Find the extent of the active elements within VG. */
2282 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2283 pg = *(uint64_t *)(vg + i) & mask;
2296 first_i = first_i * 8 + ctz64(first_g);
2297 last_i = last_i * 8 + 63 - clz64(last_g);
2298 len = last_i - first_i + (1 << esz);
2300 vm = memcpy(&tmp, vm, opr_sz * 8);
2302 swap_memmove(vd, vn + first_i, len);
2304 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2307 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2308 void *vg, uint32_t desc)
2310 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2311 uint64_t *d = vd, *n = vn, *m = vm;
2314 for (i = 0; i < opr_sz; i += 1) {
2315 uint64_t nn = n[i], mm = m[i];
2316 uint64_t pp = expand_pred_b(pg[H1(i)]);
2317 d[i] = (nn & pp) | (mm & ~pp);
2321 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2322 void *vg, uint32_t desc)
2324 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2325 uint64_t *d = vd, *n = vn, *m = vm;
2328 for (i = 0; i < opr_sz; i += 1) {
2329 uint64_t nn = n[i], mm = m[i];
2330 uint64_t pp = expand_pred_h(pg[H1(i)]);
2331 d[i] = (nn & pp) | (mm & ~pp);
2335 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2336 void *vg, uint32_t desc)
2338 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2339 uint64_t *d = vd, *n = vn, *m = vm;
2342 for (i = 0; i < opr_sz; i += 1) {
2343 uint64_t nn = n[i], mm = m[i];
2344 uint64_t pp = expand_pred_s(pg[H1(i)]);
2345 d[i] = (nn & pp) | (mm & ~pp);
2349 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2350 void *vg, uint32_t desc)
2352 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2353 uint64_t *d = vd, *n = vn, *m = vm;
2356 for (i = 0; i < opr_sz; i += 1) {
2357 uint64_t nn = n[i], mm = m[i];
2358 d[i] = (pg[H1(i)] & 1 ? nn : mm);
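/*
 * Illustrative sketch of the bitwise SEL trick used above for sub-64-bit
 * element sizes: one predicate bit per element is expanded into a full
 * element mask, then N and M are merged with a single and/or per 64-bit
 * chunk.  expand_pred_b_sketch() computes arithmetically the same value
 * that expand_pred_b() looks up in its table; the names are invented for
 * this sketch.
 */
static uint64_t expand_pred_b_sketch(uint8_t byte)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        if (byte & (1 << i)) {
            r |= 0xffull << (i * 8);
        }
    }
    return r;
}

static uint64_t sel_bytes_sketch(uint64_t nn, uint64_t mm, uint8_t pg)
{
    uint64_t pp = expand_pred_b_sketch(pg);
    return (nn & pp) | (mm & ~pp);   /* active bytes from NN, others from MM */
}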
2362 /* Two operand comparison controlled by a predicate.
2363  * ??? It is very tempting to expand this inline
2364  * with x86 instructions, e.g.
2366 * vcmpeqw zm, zn, %ymm0
2367 * vpmovmskb %ymm0, %eax
2371 * or even aarch64, e.g.
2373 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2374 * cmeq v0.8h, zn, zm
2375 * and v0.8h, v0.8h, mask
2379 * However, coming up with an abstraction that allows vector inputs and
2380 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2381 * scalar outputs, is tricky.
2383 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2384 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2386 intptr_t opr_sz = simd_oprsz(desc); \
2387 uint32_t flags = PREDTEST_INIT; \
2388 intptr_t i = opr_sz; \
2390 uint64_t out = 0, pg; \
2392 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2393 TYPE nn = *(TYPE *)(vn + H(i)); \
2394 TYPE mm = *(TYPE *)(vm + H(i)); \
2397 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2399 *(uint64_t *)(vd + (i >> 3)) = out; \
2400 flags = iter_predtest_bwd(out, pg, flags); \
2405 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2406 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2407 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2408 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2409 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2410 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2411 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2412 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2414 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2415 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2416 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2417 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2419 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2420 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2421 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2422 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2424 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2425 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2426 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2427 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2429 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2430 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2431 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2432 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2434 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2435 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2436 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2437 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2439 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2440 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2441 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2442 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2444 #undef DO_CMP_PPZZ_B
2445 #undef DO_CMP_PPZZ_H
2446 #undef DO_CMP_PPZZ_S
2447 #undef DO_CMP_PPZZ_D
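/*
 * Reduced sketch of the DO_CMP_PPZZ pattern for one 64-byte chunk of byte
 * elements: the result predicate word is accumulated from the high element
 * downwards, one bit per element, and then masked by the governing
 * predicate.  The flags accumulation and the H() swizzling are omitted.
 */
static uint64_t cmpeq_bytes_sketch(const uint8_t *n, const uint8_t *m,
                                   uint64_t pg)
{
    uint64_t out = 0;
    int i;

    for (i = 63; i >= 0; --i) {
        out <<= 1;                        /* make room for element I */
        out |= (uint64_t)(n[i] == m[i]);  /* one predicate bit per byte */
    }
    return out & pg;                      /* inactive elements produce 0 */
}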
2450 /* Similar, but the second source is "wide". */
2451 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2452 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2454 intptr_t opr_sz = simd_oprsz(desc); \
2455 uint32_t flags = PREDTEST_INIT; \
2456 intptr_t i = opr_sz; \
2458 uint64_t out = 0, pg; \
2460 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2462 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2463 TYPE nn = *(TYPE *)(vn + H(i)); \
2467 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2469 *(uint64_t *)(vd + (i >> 3)) = out; \
2470 flags = iter_predtest_bwd(out, pg, flags); \
2475 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2476 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2477 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2478 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2479 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2480 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2482 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2483 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2484 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2486 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2487 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2488 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2490 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2491 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2492 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2494 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2495 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2496 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2498 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2499 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2500 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2502 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2503 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2504 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2506 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2507 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2508 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2510 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2511 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2512 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2514 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2515 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2516 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2518 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2519 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2520 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2522 #undef DO_CMP_PPZW_B
2523 #undef DO_CMP_PPZW_H
2524 #undef DO_CMP_PPZW_S
2527 /* Similar, but the second source is immediate. */
2528 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2529 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2531 intptr_t opr_sz = simd_oprsz(desc); \
2532 uint32_t flags = PREDTEST_INIT; \
2533 TYPE mm = simd_data(desc); \
2534 intptr_t i = opr_sz; \
2536 uint64_t out = 0, pg; \
2538 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2539 TYPE nn = *(TYPE *)(vn + H(i)); \
2542 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2544 *(uint64_t *)(vd + (i >> 3)) = out; \
2545 flags = iter_predtest_bwd(out, pg, flags); \
2550 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2551 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2552 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2553 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2554 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2555 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2556 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2557 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2559 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2560 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2561 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2562 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2564 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2565 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2566 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2567 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2569 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2570 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2571 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2572 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2574 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2575 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2576 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2577 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2579 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2580 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2581 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2582 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2584 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2585 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2586 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2587 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2589 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2590 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2591 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2592 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2594 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2595 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2596 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2597 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2599 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2600 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2601 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2602 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2604 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2605 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2606 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2607 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2609 #undef DO_CMP_PPZI_B
2610 #undef DO_CMP_PPZI_H
2611 #undef DO_CMP_PPZI_S
2612 #undef DO_CMP_PPZI_D
2615 /* Similar to the ARM LastActive pseudocode function. */
2616 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2620 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2621 uint64_t pg = *(uint64_t *)(vg + i);
2623 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2629 /* Compute a mask into RETB that is true for all G, up to and including
2630 * (if after) or excluding (if !after) the first G & N.
2631 * Return true if BRK found.
2633 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2634 bool brk, bool after)
2640 } else if ((g & n) == 0) {
2641 /* For all G, no N are set; break not found. */
2644 /* Break somewhere in N. Locate it. */
2645 b = g & n; /* guard true, pred true */
2646 b = b & -b; /* first such */
2648 b = b | (b - 1); /* break after same */
2650 b = b - 1; /* break before same */
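/*
 * Worked sketch of the break-mask computation above, specialised to the
 * "break after" case with no carried-in break state.  Given a guard G and a
 * condition N, the result is true up to and including the first set bit of
 * G & N; when no break is found, the guard itself is returned, since the
 * callers AND the result with G anyway.
 */
static uint64_t brka_mask_sketch(uint64_t n, uint64_t g)
{
    uint64_t b = g & n;          /* guard true and condition true */

    if (b == 0) {
        return g;                /* no break: every guarded bit stays true */
    }
    b &= -b;                     /* isolate the first such bit */
    return b | (b - 1);          /* true up to and including that bit */
}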
2659 /* Compute a zeroing BRK. */
2660 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2661 intptr_t oprsz, bool after)
2666 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2667 uint64_t this_b, this_g = g[i];
2669 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2670 d[i] = this_b & this_g;
2674 /* Likewise, but also compute flags. */
2675 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2676 intptr_t oprsz, bool after)
2678 uint32_t flags = PREDTEST_INIT;
2682 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2683 uint64_t this_b, this_d, this_g = g[i];
2685 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2686 d[i] = this_d = this_b & this_g;
2687 flags = iter_predtest_fwd(this_d, this_g, flags);
2692 /* Compute a merging BRK. */
2693 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2694 intptr_t oprsz, bool after)
2699 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2700 uint64_t this_b, this_g = g[i];
2702 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2703 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2707 /* Likewise, but also compute flags. */
2708 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2709 intptr_t oprsz, bool after)
2711 uint32_t flags = PREDTEST_INIT;
2715 for (i = 0; i < oprsz / 8; ++i) {
2716 uint64_t this_b, this_d = d[i], this_g = g[i];
2718 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2719 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2720 flags = iter_predtest_fwd(this_d, this_g, flags);
2725 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2727 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2728 * The compiler should turn this into 4 64-bit integer stores.
2730 memset(d, 0, sizeof(ARMPredicateReg));
2731 return PREDTEST_INIT;
2734 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2737 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738 if (last_active_pred(vn, vg, oprsz)) {
2739 compute_brk_z(vd, vm, vg, oprsz, true);
2745 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2748 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2749 if (last_active_pred(vn, vg, oprsz)) {
2750 return compute_brks_z(vd, vm, vg, oprsz, true);
2752 return do_zero(vd, oprsz);
2756 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2759 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2760 if (last_active_pred(vn, vg, oprsz)) {
2761 compute_brk_z(vd, vm, vg, oprsz, false);
2767 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2770 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2771 if (last_active_pred(vn, vg, oprsz)) {
2772 return compute_brks_z(vd, vm, vg, oprsz, false);
2774 return do_zero(vd, oprsz);
2778 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2780 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2781 compute_brk_z(vd, vn, vg, oprsz, true);
2784 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2786 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2787 return compute_brks_z(vd, vn, vg, oprsz, true);
2790 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2792 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2793 compute_brk_z(vd, vn, vg, oprsz, false);
2796 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2798 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2799 return compute_brks_z(vd, vn, vg, oprsz, false);
2802 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2804 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2805 compute_brk_m(vd, vn, vg, oprsz, true);
2808 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2810 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2811 return compute_brks_m(vd, vn, vg, oprsz, true);
2814 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2816 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2817 compute_brk_m(vd, vn, vg, oprsz, false);
2820 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2822 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823 return compute_brks_m(vd, vn, vg, oprsz, false);
2826 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2828 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2830 if (!last_active_pred(vn, vg, oprsz)) {
2835 /* As if PredTest(Ones(PL), D, esz). */
2836 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2839 uint32_t flags = PREDTEST_INIT;
2842 for (i = 0; i < oprsz / 8; i++) {
2843 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2846 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2847 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2852 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2854 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2856 if (last_active_pred(vn, vg, oprsz)) {
2857 return predtest_ones(vd, oprsz, -1);
2859 return do_zero(vd, oprsz);
2863 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2865 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2866 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2867 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2870 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2871 uint64_t t = n[i] & g[i] & mask;
2877 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2879 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2880 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2881 uint64_t esz_mask = pred_esz_masks[esz];
2882 ARMPredicateReg *d = vd;
2886 /* Begin with a zero predicate register. */
2887 flags = do_zero(d, oprsz);
2892 /* Set all of the requested bits. */
2893 for (i = 0; i < count / 64; ++i) {
2897 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2900 return predtest_ones(d, oprsz, esz_mask);
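/*
 * Sketch of the predicate word produced by the WHILE helper above for a
 * single 64-bit chunk, assuming NELEM elements (fewer than a full chunk)
 * are to be made active.  Each element occupies (1 << esz) predicate bits
 * and only the lowest bit of each slot survives the element-size mask.
 * pred_esz_mask_sketch() is a local stand-in for the pred_esz_masks[] table.
 */
static uint64_t pred_esz_mask_sketch(int esz)
{
    static const uint64_t masks[4] = {
        0xffffffffffffffffull,   /* esz 0: bytes,   every bit      */
        0x5555555555555555ull,   /* esz 1: halves,  every 2nd bit  */
        0x1111111111111111ull,   /* esz 2: words,   every 4th bit  */
        0x0101010101010101ull,   /* esz 3: doubles, every 8th bit  */
    };
    return masks[esz];
}

static uint64_t while_pred_word_sketch(unsigned nelem, int esz)
{
    unsigned bits = nelem << esz;            /* one predicate bit per byte */
    uint64_t all = bits >= 64 ? -1ull : (1ull << bits) - 1;

    return all & pred_esz_mask_sketch(esz);  /* keep only element-low bits */
}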
2903 /* Recursive reduction on a function;
2904 * C.f. the ARM ARM function ReducePredicated.
2906 * While it would be possible to write this without the DATA temporary,
2907 * it is much simpler to process the predicate register this way.
2908 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2909 * little to gain with a more complex non-recursive form.
2911 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2912 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2917 uintptr_t half = n / 2; \
2918 TYPE lo = NAME##_reduce(data, status, half); \
2919 TYPE hi = NAME##_reduce(data + half, status, half); \
2920 return TYPE##_##FUNC(lo, hi, status); \
2923 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2925 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2926 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2927 for (i = 0; i < oprsz; ) { \
2928 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2930 TYPE nn = *(TYPE *)(vn + H(i)); \
2931 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2932 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2935 for (; i < maxsz; i += sizeof(TYPE)) { \
2936 *(TYPE *)((void *)data + i) = IDENT; \
2938 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2941 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2942 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2943 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2945 /* Identity is floatN_default_nan, without the function call. */
2946 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2947 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2948 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2950 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2951 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2952 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2954 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2955 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2956 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2958 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2959 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2960 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
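/*
 * Stand-alone sketch of the recursive pairwise reduction generated by
 * DO_REDUCE above, using plain floats and C addition instead of softfloat;
 * the real helpers first pad inactive and trailing elements with the
 * operation's identity value.  N is assumed to be a power of two, as it is
 * for the padded DATA array.
 */
static float reduce_add_sketch(const float *data, size_t n)
{
    size_t half;

    if (n == 1) {
        return data[0];
    }
    half = n / 2;
    return reduce_add_sketch(data, half)
         + reduce_add_sketch(data + half, half);
}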
2964 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2965 void *status, uint32_t desc)
2967 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2968 float16 result = nn;
2971 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2974 float16 mm = *(float16 *)(vm + H1_2(i));
2975 result = float16_add(result, mm, status);
2977 i += sizeof(float16), pg >>= sizeof(float16);
2979 } while (i < opr_sz);
2984 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2985 void *status, uint32_t desc)
2987 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2988 float32 result = nn;
2991 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2994 float32 mm = *(float32 *)(vm + H1_2(i));
2995 result = float32_add(result, mm, status);
2997 i += sizeof(float32), pg >>= sizeof(float32);
2999 } while (i < opr_sz);
3004 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3005 void *status, uint32_t desc)
3007 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3011 for (i = 0; i < opr_sz; i++) {
3012 if (pg[H1(i)] & 1) {
3013 nn = float64_add(nn, m[i], status);
3020 /* Fully general three-operand expander, controlled by a predicate,
3021  * with the extra float_status parameter.
3023 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3024 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3025 void *status, uint32_t desc) \
3027 intptr_t i = simd_oprsz(desc); \
3030 uint64_t pg = g[(i - 1) >> 6]; \
3032 i -= sizeof(TYPE); \
3033 if (likely((pg >> (i & 63)) & 1)) { \
3034 TYPE nn = *(TYPE *)(vn + H(i)); \
3035 TYPE mm = *(TYPE *)(vm + H(i)); \
3036 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3042 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3043 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3044 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3046 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3047 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3048 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3050 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3051 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3052 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3054 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3055 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3056 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3058 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3059 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3060 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3062 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3063 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3064 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3066 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3067 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3068 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3070 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3071 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3072 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3074 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3076 return float16_abs(float16_sub(a, b, s));
3079 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3081 return float32_abs(float32_sub(a, b, s));
3084 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3086 return float64_abs(float64_sub(a, b, s));
3089 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3090 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3091 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3093 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3095 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3096 return float64_scalbn(a, b_int, s);
3099 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3100 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3101 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3103 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3104 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3105 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
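/*
 * Plain-C sketch of the loop shape shared by the DO_ZPZZ_FP expansions
 * above, for 4-byte elements: the vector is walked from the top down, one
 * 64-byte chunk of the predicate per iteration, and only elements whose
 * predicate bit is set are written.  Softfloat, float_status and the H()
 * swizzling are omitted; OPRSZ is assumed non-zero and a multiple of 4.
 */
static void zpzz_fadd_loop_sketch(float *d, const float *n, const float *m,
                                  const uint64_t *g, intptr_t oprsz)
{
    intptr_t i = oprsz;                     /* byte size of the vector */

    do {
        uint64_t pg = g[(i - 1) >> 6];      /* predicate word for this chunk */
        do {
            i -= sizeof(float);
            if ((pg >> (i & 63)) & 1) {     /* one predicate bit per byte */
                d[i / 4] = n[i / 4] + m[i / 4];
            }
        } while (i & 63);
    } while (i != 0);
}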
3109 /* Three-operand expander, with one scalar operand, controlled by
3110 * a predicate, with the extra float_status parameter.
3112 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3113 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3114 void *status, uint32_t desc) \
3116 intptr_t i = simd_oprsz(desc); \
3120 uint64_t pg = g[(i - 1) >> 6]; \
3122 i -= sizeof(TYPE); \
3123 if (likely((pg >> (i & 63)) & 1)) { \
3124 TYPE nn = *(TYPE *)(vn + H(i)); \
3125 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3131 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3132 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3133 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3135 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3136 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3137 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3139 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3140 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3141 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3143 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3145 return float16_sub(b, a, s);
3148 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3150 return float32_sub(b, a, s);
3153 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3155 return float64_sub(b, a, s);
3158 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3159 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3160 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3162 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3163 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3164 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3166 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3167 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3168 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3170 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3171 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3172 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3174 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3175 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3176 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3178 /* Fully general two-operand expander, controlled by a predicate,
3179  * with the extra float_status parameter.
3181 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3182 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3184 intptr_t i = simd_oprsz(desc); \
3187 uint64_t pg = g[(i - 1) >> 6]; \
3189 i -= sizeof(TYPE); \
3190 if (likely((pg >> (i & 63)) & 1)) { \
3191 TYPE nn = *(TYPE *)(vn + H(i)); \
3192 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3198 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3199 * FZ16. When converting from fp16, this affects flushing input denormals;
3200 * when converting to fp16, this affects flushing output denormals.
3202 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3204 flag save = get_flush_inputs_to_zero(fpst);
3207 set_flush_inputs_to_zero(false, fpst);
3208 ret = float16_to_float32(f, true, fpst);
3209 set_flush_inputs_to_zero(save, fpst);
3213 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3215 flag save = get_flush_inputs_to_zero(fpst);
3218 set_flush_inputs_to_zero(false, fpst);
3219 ret = float16_to_float64(f, true, fpst);
3220 set_flush_inputs_to_zero(save, fpst);
3224 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3226 flag save = get_flush_to_zero(fpst);
3229 set_flush_to_zero(false, fpst);
3230 ret = float32_to_float16(f, true, fpst);
3231 set_flush_to_zero(save, fpst);
3235 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3237 flag save = get_flush_to_zero(fpst);
3240 set_flush_to_zero(false, fpst);
3241 ret = float64_to_float16(f, true, fpst);
3242 set_flush_to_zero(save, fpst);
3246 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3248 if (float16_is_any_nan(f)) {
3249 float_raise(float_flag_invalid, s);
3252 return float16_to_int16_round_to_zero(f, s);
3255 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3257 if (float16_is_any_nan(f)) {
3258 float_raise(float_flag_invalid, s);
3261 return float16_to_int64_round_to_zero(f, s);
3264 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3266 if (float32_is_any_nan(f)) {
3267 float_raise(float_flag_invalid, s);
3270 return float32_to_int64_round_to_zero(f, s);
3273 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3275 if (float64_is_any_nan(f)) {
3276 float_raise(float_flag_invalid, s);
3279 return float64_to_int64_round_to_zero(f, s);
3282 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3284 if (float16_is_any_nan(f)) {
3285 float_raise(float_flag_invalid, s);
3288 return float16_to_uint16_round_to_zero(f, s);
3291 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3293 if (float16_is_any_nan(f)) {
3294 float_raise(float_flag_invalid, s);
3297 return float16_to_uint64_round_to_zero(f, s);
3300 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3302 if (float32_is_any_nan(f)) {
3303 float_raise(float_flag_invalid, s);
3306 return float32_to_uint64_round_to_zero(f, s);
3309 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3311 if (float64_is_any_nan(f)) {
3312 float_raise(float_flag_invalid, s);
3315 return float64_to_uint64_round_to_zero(f, s);
3318 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3319 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3320 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3321 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3322 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3323 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3325 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3326 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3327 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3328 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3329 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3330 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3331 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3333 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3334 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3335 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3336 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3337 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3338 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3339 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3341 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3342 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3343 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3345 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3346 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3347 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3349 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3350 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3351 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3353 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3354 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3355 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3357 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3358 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3359 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3360 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3361 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3362 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3363 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3365 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3366 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3367 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3368 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3369 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3370 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3371 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3375 /* 4-operand predicated multiply-add. This requires 7 operands to pass
3376 * "properly", so we need to encode some of the registers into DESC.
3378 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
3380 static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3381 uint16_t neg1, uint16_t neg3)
3383 intptr_t i = simd_oprsz(desc);
3384 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3385 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3386 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3387 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3388 void *vd = &env->vfp.zregs[rd];
3389 void *vn = &env->vfp.zregs[rn];
3390 void *vm = &env->vfp.zregs[rm];
3391 void *va = &env->vfp.zregs[ra];
3395 uint64_t pg = g[(i - 1) >> 6];
3398 if (likely((pg >> (i & 63)) & 1)) {
3399 float16 e1, e2, e3, r;
3401 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3402 e2 = *(uint16_t *)(vm + H1_2(i));
3403 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3404 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3405 *(uint16_t *)(vd + H1_2(i)) = r;
3411 void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3413 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3416 void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3418 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3421 void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3423 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3426 void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3428 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3431 static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3432 uint32_t neg1, uint32_t neg3)
3434 intptr_t i = simd_oprsz(desc);
3435 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3436 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3437 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3438 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3439 void *vd = &env->vfp.zregs[rd];
3440 void *vn = &env->vfp.zregs[rn];
3441 void *vm = &env->vfp.zregs[rm];
3442 void *va = &env->vfp.zregs[ra];
3446 uint64_t pg = g[(i - 1) >> 6];
3449 if (likely((pg >> (i & 63)) & 1)) {
3450 float32 e1, e2, e3, r;
3452 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3453 e2 = *(uint32_t *)(vm + H1_4(i));
3454 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3455 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3456 *(uint32_t *)(vd + H1_4(i)) = r;
3462 void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3464 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3467 void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3469 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3472 void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3474 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3477 void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3479 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3482 static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3483 uint64_t neg1, uint64_t neg3)
3485 intptr_t i = simd_oprsz(desc);
3486 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3487 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3488 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3489 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3490 void *vd = &env->vfp.zregs[rd];
3491 void *vn = &env->vfp.zregs[rn];
3492 void *vm = &env->vfp.zregs[rm];
3493 void *va = &env->vfp.zregs[ra];
3497 uint64_t pg = g[(i - 1) >> 6];
3500 if (likely((pg >> (i & 63)) & 1)) {
3501 float64 e1, e2, e3, r;
3503 e1 = *(uint64_t *)(vn + i) ^ neg1;
3504 e2 = *(uint64_t *)(vm + i);
3505 e3 = *(uint64_t *)(va + i) ^ neg3;
3506 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3507 *(uint64_t *)(vd + i) = r;
3513 void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3515 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3518 void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3520 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3523 void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3525 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3528 void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3530 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
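/*
 * The NEG1/NEG3 parameters above negate operands by XOR-ing the IEEE sign
 * bit rather than calling a softfloat negate; a quick stand-alone
 * illustration for double precision.  INT64_MIN is the 0x8000000000000000
 * sign-bit constant, as used by the _d helpers above.
 */
static double xor_negate_sketch(double x)
{
    union { double f; uint64_t i; } u = { .f = x };

    u.i ^= (uint64_t)INT64_MIN;   /* flip only bit 63, the sign bit */
    return u.f;
}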
3533 /* Two operand floating-point comparison controlled by a predicate.
3534 * Unlike the integer version, we are not allowed to optimistically
3535  * compare operands, since the comparison may have side effects wrt the FPSR.
3538 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3539 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3540 void *status, uint32_t desc) \
3542 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3543 uint64_t *d = vd, *g = vg; \
3545 uint64_t out = 0, pg = g[j]; \
3547 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3548 if (likely((pg >> (i & 63)) & 1)) { \
3549 TYPE nn = *(TYPE *)(vn + H(i)); \
3550 TYPE mm = *(TYPE *)(vm + H(i)); \
3551 out |= OP(TYPE, nn, mm, status); \
3558 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3559 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3560 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3561 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3562 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3563 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3565 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3566 DO_FPCMP_PPZZ_H(NAME, OP) \
3567 DO_FPCMP_PPZZ_S(NAME, OP) \
3568 DO_FPCMP_PPZZ_D(NAME, OP)
3570 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3571 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3572 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3573 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3574 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3575 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3576 #define DO_FCMUO(TYPE, X, Y, ST) \
3577 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3578 #define DO_FACGE(TYPE, X, Y, ST) \
3579 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3580 #define DO_FACGT(TYPE, X, Y, ST) \
3581 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3583 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3584 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3585 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3586 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3587 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3588 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3589 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3591 #undef DO_FPCMP_PPZZ_ALL
3592 #undef DO_FPCMP_PPZZ_D
3593 #undef DO_FPCMP_PPZZ_S
3594 #undef DO_FPCMP_PPZZ_H
3595 #undef DO_FPCMP_PPZZ
3597 /* One operand floating-point comparison against zero, controlled by a predicate. */
3600 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3601 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3602 void *status, uint32_t desc) \
3604 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3605 uint64_t *d = vd, *g = vg; \
3607 uint64_t out = 0, pg = g[j]; \
3609 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3610 if ((pg >> (i & 63)) & 1) { \
3611 TYPE nn = *(TYPE *)(vn + H(i)); \
3612 out |= OP(TYPE, nn, 0, status); \
3619 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3620 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3621 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3622 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3623 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3624 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3626 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3627 DO_FPCMP_PPZ0_H(NAME, OP) \
3628 DO_FPCMP_PPZ0_S(NAME, OP) \
3629 DO_FPCMP_PPZ0_D(NAME, OP)
3631 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3632 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3633 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3634 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3635 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3636 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3638 /* FP Trig Multiply-Add. */
3640 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3642 static const float16 coeff[16] = {
3643 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3644 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3646 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3647 intptr_t x = simd_data(desc);
3648 float16 *d = vd, *n = vn, *m = vm;
3649 for (i = 0; i < opr_sz; i++) {
3652 if (float16_is_neg(mm)) {
3653 mm = float16_abs(mm);
3656 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3660 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3662 static const float32 coeff[16] = {
3663 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3664 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3665 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3666 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3668 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3669 intptr_t x = simd_data(desc);
3670 float32 *d = vd, *n = vn, *m = vm;
3671 for (i = 0; i < opr_sz; i++) {
3674 if (float32_is_neg(mm)) {
3675 mm = float32_abs(mm);
3678 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3682 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3684 static const float64 coeff[16] = {
3685 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3686 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3687 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3688 0x3de5d8408868552full, 0x0000000000000000ull,
3689 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3690 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3691 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3692 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3694 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3695 intptr_t x = simd_data(desc);
3696 float64 *d = vd, *n = vn, *m = vm;
3697 for (i = 0; i < opr_sz; i++) {
3700 if (float64_is_neg(mm)) {
3701 mm = float64_abs(mm);
3704 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
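/*
 * Hedged sketch of how one FTMAD term appears to be formed from the 16-entry
 * coefficient table above: the 3-bit immediate X indexes one half of the
 * table, a negative multiplicand selects the other half and contributes its
 * magnitude.  The real helpers test the IEEE sign bit and use a fused
 * softfloat muladd; the plain "< 0" test and C arithmetic here are
 * simplifications of this sketch only.
 */
static double ftmad_term_sketch(double n, double m, unsigned x,
                                const double *coeff /* 16 entries */)
{
    unsigned xx = x;            /* X is the immediate from DESC, 0..7 */

    if (m < 0) {
        m = -m;                 /* use the magnitude ...                  */
        xx += 8;                /* ... and the second half of the table   */
    }
    return n * m + coeff[xx];   /* fused via float*_muladd in the helpers */
}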
3712 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3713 void *vs, uint32_t desc)
3715 intptr_t j, i = simd_oprsz(desc);
3717 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3718 float16 neg_real = float16_chs(neg_imag);
3721 uint64_t pg = g[(i - 1) >> 6];
3723 float16 e0, e1, e2, e3;
3725 /* I holds the real index; J holds the imag index. */
3726 j = i - sizeof(float16);
3727 i -= 2 * sizeof(float16);
3729 e0 = *(float16 *)(vn + H1_2(i));
3730 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3731 e2 = *(float16 *)(vn + H1_2(j));
3732 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3734 if (likely((pg >> (i & 63)) & 1)) {
3735 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3737 if (likely((pg >> (j & 63)) & 1)) {
3738 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3744 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3745 void *vs, uint32_t desc)
3747 intptr_t j, i = simd_oprsz(desc);
3749 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3750 float32 neg_real = float32_chs(neg_imag);
3753 uint64_t pg = g[(i - 1) >> 6];
3755 float32 e0, e1, e2, e3;
3757 /* I holds the real index; J holds the imag index. */
3758 j = i - sizeof(float32);
3759 i -= 2 * sizeof(float32);
3761 e0 = *(float32 *)(vn + H1_2(i));
3762 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3763 e2 = *(float32 *)(vn + H1_2(j));
3764 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3766 if (likely((pg >> (i & 63)) & 1)) {
3767 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3769 if (likely((pg >> (j & 63)) & 1)) {
3770 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3776 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3777 void *vs, uint32_t desc)
3779 intptr_t j, i = simd_oprsz(desc);
3781 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3782 float64 neg_real = float64_chs(neg_imag);
3785 uint64_t pg = g[(i - 1) >> 6];
3787 float64 e0, e1, e2, e3;
3789 /* I holds the real index; J holds the imag index. */
3790 j = i - sizeof(float64);
3791 i -= 2 * sizeof(float64);
3793 e0 = *(float64 *)(vn + H1_2(i));
3794 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3795 e2 = *(float64 *)(vn + H1_2(j));
3796 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3798 if (likely((pg >> (i & 63)) & 1)) {
3799 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3801 if (likely((pg >> (j & 63)) & 1)) {
3802 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
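/*
 * Stand-alone sketch of the FCADD arithmetic above for one complex pair:
 * the rotation selects whether the second operand is multiplied by +i or -i
 * before the add.  The helpers implement the same thing with sign-bit XORs
 * on packed lanes; plain C doubles are an assumption of this sketch.
 */
static void fcadd_pair_sketch(double *dr, double *di,
                              double nr, double ni,
                              double mr, double mi, int rot_is_270)
{
    if (rot_is_270) {
        *dr = nr + mi;           /* n + m * -i */
        *di = ni - mr;
    } else {
        *dr = nr - mi;           /* n + m * +i */
        *di = ni + mr;
    }
}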
3809 * FP Complex Multiply
3812 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3814 void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3816 intptr_t j, i = simd_oprsz(desc);
3817 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3818 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3819 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3820 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3821 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3822 bool flip = rot & 1;
3823 float16 neg_imag, neg_real;
3824 void *vd = &env->vfp.zregs[rd];
3825 void *vn = &env->vfp.zregs[rn];
3826 void *vm = &env->vfp.zregs[rm];
3827 void *va = &env->vfp.zregs[ra];
3830 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3831 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3834 uint64_t pg = g[(i - 1) >> 6];
3836 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3838 /* I holds the real index; J holds the imag index. */
3839 j = i - sizeof(float16);
3840 i -= 2 * sizeof(float16);
3842 nr = *(float16 *)(vn + H1_2(i));
3843 ni = *(float16 *)(vn + H1_2(j));
3844 mr = *(float16 *)(vm + H1_2(i));
3845 mi = *(float16 *)(vm + H1_2(j));
3847 e2 = (flip ? ni : nr);
3848 e1 = (flip ? mi : mr) ^ neg_real;
3850 e3 = (flip ? mr : mi) ^ neg_imag;
3852 if (likely((pg >> (i & 63)) & 1)) {
3853 d = *(float16 *)(va + H1_2(i));
3854 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3855 *(float16 *)(vd + H1_2(i)) = d;
3857 if (likely((pg >> (j & 63)) & 1)) {
3858 d = *(float16 *)(va + H1_2(j));
3859 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3860 *(float16 *)(vd + H1_2(j)) = d;
3866 void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3868 intptr_t j, i = simd_oprsz(desc);
3869 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3870 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3871 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3872 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3873 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3874 bool flip = rot & 1;
3875 float32 neg_imag, neg_real;
3876 void *vd = &env->vfp.zregs[rd];
3877 void *vn = &env->vfp.zregs[rn];
3878 void *vm = &env->vfp.zregs[rm];
3879 void *va = &env->vfp.zregs[ra];
3882 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3883 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3886 uint64_t pg = g[(i - 1) >> 6];
3888 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3890 /* I holds the real index; J holds the imag index. */
3891 j = i - sizeof(float32);
3892 i -= 2 * sizeof(float32);
3894 nr = *(float32 *)(vn + H1_2(i));
3895 ni = *(float32 *)(vn + H1_2(j));
3896 mr = *(float32 *)(vm + H1_2(i));
3897 mi = *(float32 *)(vm + H1_2(j));
3899 e2 = (flip ? ni : nr);
3900 e1 = (flip ? mi : mr) ^ neg_real;
3902 e3 = (flip ? mr : mi) ^ neg_imag;
3904 if (likely((pg >> (i & 63)) & 1)) {
3905 d = *(float32 *)(va + H1_2(i));
3906 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3907 *(float32 *)(vd + H1_2(i)) = d;
3909 if (likely((pg >> (j & 63)) & 1)) {
3910 d = *(float32 *)(va + H1_2(j));
3911 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3912 *(float32 *)(vd + H1_2(j)) = d;
3918 void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3920 intptr_t j, i = simd_oprsz(desc);
3921 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3922 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3923 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3924 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3925 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3926 bool flip = rot & 1;
3927 float64 neg_imag, neg_real;
3928 void *vd = &env->vfp.zregs[rd];
3929 void *vn = &env->vfp.zregs[rn];
3930 void *vm = &env->vfp.zregs[rm];
3931 void *va = &env->vfp.zregs[ra];
3934 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3935 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3938 uint64_t pg = g[(i - 1) >> 6];
3940 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3942 /* I holds the real index; J holds the imag index. */
3943 j = i - sizeof(float64);
3944 i -= 2 * sizeof(float64);
3946 nr = *(float64 *)(vn + H1_2(i));
3947 ni = *(float64 *)(vn + H1_2(j));
3948 mr = *(float64 *)(vm + H1_2(i));
3949 mi = *(float64 *)(vm + H1_2(j));
3951 e2 = (flip ? ni : nr);
3952 e1 = (flip ? mi : mr) ^ neg_real;
3954 e3 = (flip ? mr : mi) ^ neg_imag;
3956 if (likely((pg >> (i & 63)) & 1)) {
3957 d = *(float64 *)(va + H1_2(i));
3958 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3959 *(float64 *)(vd + H1_2(i)) = d;
3961 if (likely((pg >> (j & 63)) & 1)) {
3962 d = *(float64 *)(va + H1_2(j));
3963 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3964 *(float64 *)(vd + H1_2(j)) = d;
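/*
 * Reference sketch of the per-rotation arithmetic performed by the FCMLA
 * helpers above, written with an explicit complex pair instead of packed
 * lanes and with plain C multiplies standing in for the fused softfloat
 * muladd.  Issuing rot=0 then rot=1 (or rot=2 then rot=3) on the same
 * accumulator yields a full complex multiply-accumulate.
 */
static void fcmla_pair_sketch(double *dr, double *di,
                              double nr, double ni,
                              double mr, double mi, unsigned rot)
{
    switch (rot & 3) {
    case 0:
        *dr += nr * mr;
        *di += nr * mi;
        break;
    case 1:                      /* 90 degrees */
        *dr += -ni * mi;
        *di += ni * mr;
        break;
    case 2:                      /* 180 degrees */
        *dr += -nr * mr;
        *di += -nr * mi;
        break;
    case 3:                      /* 270 degrees */
        *dr += ni * mi;
        *di += -ni * mr;
        break;
    }
}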
3971 * Load contiguous data, protected by a governing predicate.
3975 * Load elements into @vd, controlled by @vg, from @host + @mem_ofs.
3976 * Memory is valid through @host + @mem_max. The register element
3977  * indices are inferred from @mem_ofs, as modified by the types for
3978 * which the helper is built. Return the @mem_ofs of the first element
3979 * not loaded (which is @mem_max if they are all loaded).
3981 * For softmmu, we have fully validated the guest page. For user-only,
3982 * we cannot fully validate without taking the mmap lock, but since we
3983  * know the access is within one host page; if any access is valid, they
3984 * all must be valid. However, when @vg is all false, it may be that
3985 * no access is valid.
3987 typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
3988 intptr_t mem_ofs, intptr_t mem_max);
3991 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3992 * The controlling predicate is known to be true.
3994 typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3995 target_ulong vaddr, TCGMemOpIdx oi, uintptr_t ra);
3996 typedef sve_ld1_tlb_fn sve_st1_tlb_fn;
3999 * Generate the above primitives.
4002 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4003 static intptr_t sve_##NAME##_host(void *vd, void *vg, void *host, \
4004 intptr_t mem_off, const intptr_t mem_max) \
4006 intptr_t reg_off = mem_off * (sizeof(TYPEE) / sizeof(TYPEM)); \
4007 uint64_t *pg = vg; \
4008 while (mem_off + sizeof(TYPEM) <= mem_max) { \
4010 if (likely((pg[reg_off >> 6] >> (reg_off & 63)) & 1)) { \
4011 val = HOST(host + mem_off); \
4013 *(TYPEE *)(vd + H(reg_off)) = val; \
4014 mem_off += sizeof(TYPEM), reg_off += sizeof(TYPEE); \
4019 #ifdef CONFIG_SOFTMMU
4020 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4021 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4022 target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
4024 TYPEM val = TLB(env, addr, oi, ra); \
4025 *(TYPEE *)(vd + H(reg_off)) = val; \
4028 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4029 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4030 target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
4032 TYPEM val = HOST(g2h(addr)); \
4033 *(TYPEE *)(vd + H(reg_off)) = val; \
4037 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
4038 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
4039 DO_LD_TLB(NAME, H, TE, TM, ldub_p, 0, helper_ret_ldub_mmu)
4041 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4042 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4043 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4044 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4045 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4046 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4047 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
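/*
 * Editorial aid, not part of the original file: roughly what the
 * DO_LD_HOST half of DO_LD_PRIM_1(ld1bhu, ...) above expands to.  Each
 * active 8-bit memory element is zero-extended into a 16-bit register
 * element; inactive elements are written as zero.  Kept under #if 0 so
 * it does not duplicate the generated function.
 */
#if 0
static intptr_t sve_ld1bhu_host_expanded(void *vd, void *vg, void *host,
                                         intptr_t mem_off, const intptr_t mem_max)
{
    intptr_t reg_off = mem_off * (sizeof(uint16_t) / sizeof(uint8_t));
    uint64_t *pg = vg;
    while (mem_off + sizeof(uint8_t) <= mem_max) {
        uint8_t val = 0;
        if (likely((pg[reg_off >> 6] >> (reg_off & 63)) & 1)) {
            val = ldub_p(host + mem_off);
        }
        *(uint16_t *)(vd + H1_2(reg_off)) = val;
        mem_off += sizeof(uint8_t), reg_off += sizeof(uint16_t);
    }
    return mem_off;
}
#endif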
4049 #define DO_LD_PRIM_2(NAME, end, MOEND, H, TE, TM, PH, PT) \
4050 DO_LD_HOST(NAME##_##end, H, TE, TM, PH##_##end##_p) \
4051 DO_LD_TLB(NAME##_##end, H, TE, TM, PH##_##end##_p, \
4052 MOEND, helper_##end##_##PT##_mmu)
4054 DO_LD_PRIM_2(ld1hh, le, MO_LE, H1_2, uint16_t, uint16_t, lduw, lduw)
4055 DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
4056 DO_LD_PRIM_2(ld1hss, le, MO_LE, H1_4, uint32_t, int16_t, lduw, lduw)
4057 DO_LD_PRIM_2(ld1hdu, le, MO_LE, , uint64_t, uint16_t, lduw, lduw)
4058 DO_LD_PRIM_2(ld1hds, le, MO_LE, , uint64_t, int16_t, lduw, lduw)
4060 DO_LD_PRIM_2(ld1ss, le, MO_LE, H1_4, uint32_t, uint32_t, ldl, ldul)
4061 DO_LD_PRIM_2(ld1sdu, le, MO_LE, , uint64_t, uint32_t, ldl, ldul)
4062 DO_LD_PRIM_2(ld1sds, le, MO_LE, , uint64_t, int32_t, ldl, ldul)
4064 DO_LD_PRIM_2(ld1dd, le, MO_LE, , uint64_t, uint64_t, ldq, ldq)
4066 DO_LD_PRIM_2(ld1hh, be, MO_BE, H1_2, uint16_t, uint16_t, lduw, lduw)
4067 DO_LD_PRIM_2(ld1hsu, be, MO_BE, H1_4, uint32_t, uint16_t, lduw, lduw)
4068 DO_LD_PRIM_2(ld1hss, be, MO_BE, H1_4, uint32_t, int16_t, lduw, lduw)
4069 DO_LD_PRIM_2(ld1hdu, be, MO_BE, , uint64_t, uint16_t, lduw, lduw)
4070 DO_LD_PRIM_2(ld1hds, be, MO_BE, , uint64_t, int16_t, lduw, lduw)
4072 DO_LD_PRIM_2(ld1ss, be, MO_BE, H1_4, uint32_t, uint32_t, ldl, ldul)
4073 DO_LD_PRIM_2(ld1sdu, be, MO_BE, , uint64_t, uint32_t, ldl, ldul)
4074 DO_LD_PRIM_2(ld1sds, be, MO_BE, , uint64_t, int32_t, ldl, ldul)
4076 DO_LD_PRIM_2(ld1dd, be, MO_BE, , uint64_t, uint64_t, ldq, ldq)
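/*
 * Editorial note, not part of the original file: as a concrete example,
 * DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
 * above generates sve_ld1hsu_le_host() (reading with lduw_le_p) and
 * sve_ld1hsu_le_tlb() (reading with helper_le_lduw_mmu), each widening
 * a 16-bit memory element into a 32-bit register element.
 */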
4084 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4085 * beginning at @reg_off bounded by @reg_max. Return the offset of the first
4086 * active element >= @reg_off, or @reg_max if there are no active elements at all.
4088 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4089 intptr_t reg_max, int esz)
4091 uint64_t pg_mask = pred_esz_masks[esz];
4092 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4094 /* In normal usage, the first element is active. */
4095 if (likely(pg & 1)) {
4103 if (unlikely(reg_off >= reg_max)) {
4104 /* The entire predicate was false. */
4107 pg = vg[reg_off >> 6] & pg_mask;
4110 reg_off += ctz64(pg);
4112 /* We should never see an out of range predicate bit set. */
4113 tcg_debug_assert(reg_off < reg_max);
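/*
 * Editorial sketch, not part of the original file: how the scan above
 * lands on the first active element.  Assumes pred_esz_masks[MO_32] ==
 * 0x1111111111111111ull, i.e. one predicate bit per 4-byte element.
 */
#if 0
static void find_next_active_example(void)
{
    uint64_t vg0 = 1ull << 16;              /* only the element at byte offset 16 is active */
    uint64_t pg = vg0 & 0x1111111111111111ull;
    intptr_t reg_off = ctz64(pg);           /* == 16, matching find_next_active(vg, 0, ..., MO_32) */
}
#endif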
4118 * Return the maximum offset <= @mem_max which is still within the page
4119 * referenced by @base + @mem_off.
4121 static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
4124 target_ulong addr = base + mem_off;
4125 intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
4126 return MIN(split, mem_max - mem_off) + mem_off;
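/*
 * Editorial sketch, not part of the original file: the page-split
 * arithmetic above, worked for an assumed 4KiB page size.  Negating
 * (addr | TARGET_PAGE_MASK) yields the number of bytes from addr to
 * the end of its page.
 */
#if 0
static void max_for_page_example(void)
{
    target_ulong base = 0x40000ff8;         /* hypothetical address, 8 bytes below a page end */
    intptr_t mem_off = 0, mem_max = 16;
    /* ~(target_ulong)0xfff stands in for TARGET_PAGE_MASK with 4KiB pages. */
    intptr_t split = -(intptr_t)(base | ~(target_ulong)0xfff);   /* == 8 */
    intptr_t ret = MIN(split, mem_max - mem_off) + mem_off;      /* == 8, not 16 */
}
#endif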
4129 #ifndef CONFIG_USER_ONLY
4130 /* These are normally defined only for CONFIG_USER_ONLY in <exec/cpu_ldst.h> */
4131 static inline void set_helper_retaddr(uintptr_t ra) { }
4132 static inline void clear_helper_retaddr(void) { }
4133 #endif
4136 * The result of tlb_vaddr_to_host for user-only is just g2h(x),
4137 * which is always non-null. Elide the useless test.
4139 static inline bool test_host_page(void *host)
4141 #ifdef CONFIG_USER_ONLY
4142 return true;
4143 #else
4144 return likely(host != NULL);
4145 #endif
4149 * Common helper for all contiguous one-register predicated loads.
4151 static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
4152 uint32_t desc, const uintptr_t retaddr,
4153 const int esz, const int msz,
4154 sve_ld1_host_fn *host_fn,
4155 sve_ld1_tlb_fn *tlb_fn)
4157 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4158 const int mmu_idx = get_mmuidx(oi);
4159 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4160 void *vd = &env->vfp.zregs[rd];
4161 const int diffsz = esz - msz;
4162 const intptr_t reg_max = simd_oprsz(desc);
4163 const intptr_t mem_max = reg_max >> diffsz;
4164 ARMVectorReg scratch;
4166 intptr_t split, reg_off, mem_off;
4168 /* Find the first active element. */
4169 reg_off = find_next_active(vg, 0, reg_max, esz);
4170 if (unlikely(reg_off == reg_max)) {
4171 /* The entire predicate was false; no load occurs. */
4172 memset(vd, 0, reg_max);
4173 return;
4175 mem_off = reg_off >> diffsz;
4176 set_helper_retaddr(retaddr);
4179 * If the (remaining) load is entirely within a single page, then:
4180 * For softmmu, if the TLB hits, then no faults will occur;
4181 * For user-only, either the first load will fault or none will.
4182 * We can thus perform the load directly to the destination and
4183 * Vd will be unmodified on any exception path.
4185 split = max_for_page(addr, mem_off, mem_max);
4186 if (likely(split == mem_max)) {
4187 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4188 if (test_host_page(host)) {
4189 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4190 tcg_debug_assert(mem_off == mem_max);
4191 clear_helper_retaddr();
4192 /* After having taken any fault, zero leading inactive elements. */
4193 swap_memzero(vd, reg_off);
4194 return;
4199 * Perform the predicated read into a temporary, thus ensuring
4200 * if the load of the last element faults, Vd is not modified.
4202 #ifdef CONFIG_USER_ONLY
4203 swap_memzero(&scratch, reg_off);
4204 host_fn(&scratch, vg, g2h(addr), mem_off, mem_max);
4206 memset(&scratch, 0, reg_max);
4209 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4210 if (reg_off >= reg_max) {
4213 mem_off = reg_off >> diffsz;
4214 split = max_for_page(addr, mem_off, mem_max);
4217 if (split - mem_off >= (1 << msz)) {
4218 /* At least one whole element on this page. */
4219 host = tlb_vaddr_to_host(env, addr + mem_off,
4220 MMU_DATA_LOAD, mmu_idx);
4222 mem_off = host_fn(&scratch, vg, host - mem_off,
4224 reg_off = mem_off << diffsz;
4230 * Perform one normal read. This may fault, longjmping out to the
4231 * main loop in order to raise an exception. It may succeed, and
4232 * as a side-effect load the TLB entry for the next round. Finally,
4233 * in the extremely unlikely case we're performing this operation
4234 * on I/O memory, it may succeed but not bring in the TLB entry.
4235 * But even then we have still made forward progress.
4237 tlb_fn(env, &scratch, reg_off, addr + mem_off, oi, retaddr);
4238 reg_off += 1 << esz;
4242 clear_helper_retaddr();
4243 memcpy(vd, &scratch, reg_max);
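/*
 * Editorial sketch, not part of the original file: the condition for
 * the direct-to-Vd fast path above.  It holds only when the remaining
 * memory footprint fits entirely on one page; e.g. with 4KiB pages, a
 * 16-byte load starting 8 bytes below a page boundary gives split == 8
 * != mem_max == 16, so the scratch-register path is taken instead.
 */
#if 0
static bool sve_ld1_r_direct_path_example(target_ulong addr,
                                          intptr_t mem_off, intptr_t mem_max)
{
    return max_for_page(addr, mem_off, mem_max) == mem_max;
}
#endif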
4246 #define DO_LD1_1(NAME, ESZ) \
4247 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4248 target_ulong addr, uint32_t desc) \
4250 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4251 sve_##NAME##_host, sve_##NAME##_tlb); \
4254 #define DO_LD1_2(NAME, ESZ, MSZ) \
4255 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4256 target_ulong addr, uint32_t desc) \
4258 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4259 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4261 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4262 target_ulong addr, uint32_t desc) \
4264 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4265 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4276 DO_LD1_2(ld1hh, 1, 1)
4277 DO_LD1_2(ld1hsu, 2, 1)
4278 DO_LD1_2(ld1hss, 2, 1)
4279 DO_LD1_2(ld1hdu, 3, 1)
4280 DO_LD1_2(ld1hds, 3, 1)
4282 DO_LD1_2(ld1ss, 2, 2)
4283 DO_LD1_2(ld1sdu, 3, 2)
4284 DO_LD1_2(ld1sds, 3, 2)
4286 DO_LD1_2(ld1dd, 3, 3)
4292 * Common helpers for all contiguous 2,3,4-register predicated loads.
4294 static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
4295 uint32_t desc, int size, uintptr_t ra,
4296 sve_ld1_tlb_fn *tlb_fn)
4298 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4299 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4300 intptr_t i, oprsz = simd_oprsz(desc);
4301 ARMVectorReg scratch[2] = { };
4303 set_helper_retaddr(ra);
4304 for (i = 0; i < oprsz; ) {
4305 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4308 tlb_fn(env, &scratch[0], i, addr, oi, ra);
4309 tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
4311 i += size, pg >>= size;
4312 addr += 2 * size;
4315 clear_helper_retaddr();
4317 /* Wait until all exceptions have been raised to write back. */
4318 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4319 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
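/*
 * Editorial note, not part of the original file: for byte elements
 * (size == 1) and an all-true predicate, the interleaved loop above
 * leaves the even-indexed memory bytes in zregs[rd] and the odd-indexed
 * bytes in zregs[rd + 1], which is LD2's de-interleaving behaviour.
 */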
4322 static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
4323 uint32_t desc, int size, uintptr_t ra,
4324 sve_ld1_tlb_fn *tlb_fn)
4326 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4327 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4328 intptr_t i, oprsz = simd_oprsz(desc);
4329 ARMVectorReg scratch[3] = { };
4331 set_helper_retaddr(ra);
4332 for (i = 0; i < oprsz; ) {
4333 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4336 tlb_fn(env, &scratch[0], i, addr, oi, ra);
4337 tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
4338 tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
4340 i += size, pg >>= size;
4341 addr += 3 * size;
4344 clear_helper_retaddr();
4346 /* Wait until all exceptions have been raised to write back. */
4347 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4348 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4349 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4352 static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
4353 uint32_t desc, int size, uintptr_t ra,
4354 sve_ld1_tlb_fn *tlb_fn)
4356 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4357 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4358 intptr_t i, oprsz = simd_oprsz(desc);
4359 ARMVectorReg scratch[4] = { };
4361 set_helper_retaddr(ra);
4362 for (i = 0; i < oprsz; ) {
4363 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4366 tlb_fn(env, &scratch[0], i, addr, oi, ra);
4367 tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
4368 tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
4369 tlb_fn(env, &scratch[3], i, addr + 3 * size, oi, ra);
4371 i += size, pg >>= size;
4372 addr += 4 * size;
4375 clear_helper_retaddr();
4377 /* Wait until all exceptions have been raised to write back. */
4378 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4379 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4380 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4381 memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
4384 #define DO_LDN_1(N) \
4385 void QEMU_FLATTEN HELPER(sve_ld##N##bb_r) \
4386 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4388 sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \
4391 #define DO_LDN_2(N, SUFF, SIZE) \
4392 void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_le_r) \
4393 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4395 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4396 sve_ld1##SUFF##_le_tlb); \
4398 void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_be_r) \
4399 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4401 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4402 sve_ld1##SUFF##_be_tlb); \
4425 * Load contiguous data, first-fault and no-fault.
4427 * For user-only, one could argue that we should hold the mmap_lock during
4428 * the operation so that there is no race between page_check_range and the
4429 * load operation. However, unmapping pages out from under a running thread
4430 * is extraordinarily unlikely. This theoretical race condition also affects
4431 * linux-user/ in its get_user/put_user macros.
4433 * TODO: Construct some helpers, written in assembly, that interact with
4434 * handle_cpu_signal to produce memory ops which can properly report errors
4435 * without racing.
4438 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4439 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4440 * option, which leaves subsequent data unchanged.
4442 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4444 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4446 if (i & 63) {
4447 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4448 i = ROUND_UP(i, 64);
4450 for (; i < oprsz; i += 64) {
4451 ffr[i / 64] = 0;
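/*
 * Editorial sketch, not part of the original file: a fault reported at
 * byte offset 20 of a 32-byte (256-bit vector) operation keeps FFR bits
 * 0..19 and clears bit 20 upward; the whole-word loop above only runs
 * for vectors longer than 512 bits.
 */
#if 0
static void record_fault_example(void)
{
    uint64_t ffr0 = ~0ull;
    ffr0 &= MAKE_64BIT_MASK(0, 20);    /* ffr0 == 0x00000000000fffffull */
}
#endif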
4456 * Common helper for all contiguous first-fault loads.
4458 static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4459 uint32_t desc, const uintptr_t retaddr,
4460 const int esz, const int msz,
4461 sve_ld1_host_fn *host_fn,
4462 sve_ld1_tlb_fn *tlb_fn)
4464 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4465 const int mmu_idx = get_mmuidx(oi);
4466 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4467 void *vd = &env->vfp.zregs[rd];
4468 const int diffsz = esz - msz;
4469 const intptr_t reg_max = simd_oprsz(desc);
4470 const intptr_t mem_max = reg_max >> diffsz;
4471 intptr_t split, reg_off, mem_off;
4474 /* Skip to the first active element. */
4475 reg_off = find_next_active(vg, 0, reg_max, esz);
4476 if (unlikely(reg_off == reg_max)) {
4477 /* The entire predicate was false; no load occurs. */
4478 memset(vd, 0, reg_max);
4479 return;
4481 mem_off = reg_off >> diffsz;
4482 set_helper_retaddr(retaddr);
4485 * If the (remaining) load is entirely within a single page, then:
4486 * For softmmu, if the TLB hits, then no faults will occur;
4487 * For user-only, either the first load will fault or none will.
4488 * We can thus perform the load directly to the destination and
4489 * Vd will be unmodified on any exception path.
4491 split = max_for_page(addr, mem_off, mem_max);
4492 if (likely(split == mem_max)) {
4493 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4494 if (test_host_page(host)) {
4495 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4496 tcg_debug_assert(mem_off == mem_max);
4497 clear_helper_retaddr();
4498 /* After any fault, zero any leading inactive elements. */
4499 swap_memzero(vd, reg_off);
4500 return;
4504 #ifdef CONFIG_USER_ONLY
4506 * The page(s) containing this first element at ADDR+MEM_OFF must
4507 * be valid. Considering that this first element may be misaligned
4508 * and cross a page boundary itself, take the rest of the page from
4509 * the last byte of the element.
4511 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4512 mem_off = host_fn(vd, vg, g2h(addr), mem_off, split);
4514 /* After any fault, zero any leading inactive elements. */
4515 swap_memzero(vd, reg_off);
4516 reg_off = mem_off << diffsz;
4519 * Perform one normal read, which will fault or not.
4520 * But it is likely to bring the page into the tlb.
4522 tlb_fn(env, vd, reg_off, addr + mem_off, oi, retaddr);
4524 /* After any fault, zero any leading predicated false elts. */
4525 swap_memzero(vd, reg_off);
4526 mem_off += 1 << msz;
4527 reg_off += 1 << esz;
4529 /* Try again to read the balance of the page. */
4530 split = max_for_page(addr, mem_off - 1, mem_max);
4531 if (split >= (1 << msz)) {
4532 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4534 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4535 reg_off = mem_off << diffsz;
4540 clear_helper_retaddr();
4541 record_fault(env, reg_off, reg_max);
4545 * Common helper for all contiguous no-fault loads.
4547 static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
4548 uint32_t desc, const int esz, const int msz,
4549 sve_ld1_host_fn *host_fn)
4551 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4552 void *vd = &env->vfp.zregs[rd];
4553 const int diffsz = esz - msz;
4554 const intptr_t reg_max = simd_oprsz(desc);
4555 const intptr_t mem_max = reg_max >> diffsz;
4556 const int mmu_idx = cpu_mmu_index(env, false);
4557 intptr_t split, reg_off, mem_off;
4560 #ifdef CONFIG_USER_ONLY
4561 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
4562 if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
4563 /* The entire operation is valid and will not fault. */
4564 host_fn(vd, vg, host, 0, mem_max);
4565 return;
4569 /* There will be no fault, so we may modify in advance. */
4570 memset(vd, 0, reg_max);
4572 /* Skip to the first active element. */
4573 reg_off = find_next_active(vg, 0, reg_max, esz);
4574 if (unlikely(reg_off == reg_max)) {
4575 /* The entire predicate was false; no load occurs. */
4576 return;
4578 mem_off = reg_off >> diffsz;
4580 #ifdef CONFIG_USER_ONLY
4581 if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
4582 /* At least one load is valid; take the rest of the page. */
4583 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4584 mem_off = host_fn(vd, vg, host, mem_off, split);
4585 reg_off = mem_off << diffsz;
4589 * If the address is not in the TLB, we have no way to bring the
4590 * entry into the TLB without also risking a fault. Note that
4591 * the corollary is that we never load from an address not in RAM.
4593 * This last is out of spec, in a weird corner case.
4594 * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
4595 * must not actually hit the bus -- it returns UNKNOWN data instead.
4596 * But if you map non-RAM with Normal memory attributes and do a NF
4597 * load then it should access the bus. (Nobody ought actually do this
4598 * in the real world, obviously.)
4600 * Then there are the annoying special cases with watchpoints...
4601 * TODO: Add a form of non-faulting loads using cc->tlb_fill(probe=true).
4603 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4604 split = max_for_page(addr, mem_off, mem_max);
4605 if (host && split >= (1 << msz)) {
4606 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4607 reg_off = mem_off << diffsz;
4611 record_fault(env, reg_off, reg_max);
4614 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4615 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4616 target_ulong addr, uint32_t desc) \
4618 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4619 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4621 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4622 target_ulong addr, uint32_t desc) \
4624 sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host); \
4627 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4628 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4629 target_ulong addr, uint32_t desc) \
4631 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4632 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
4634 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
4635 target_ulong addr, uint32_t desc) \
4637 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
4639 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
4640 target_ulong addr, uint32_t desc) \
4642 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4643 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
4645 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
4646 target_ulong addr, uint32_t desc) \
4648 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
4651 DO_LDFF1_LDNF1_1(bb, 0)
4652 DO_LDFF1_LDNF1_1(bhu, 1)
4653 DO_LDFF1_LDNF1_1(bhs, 1)
4654 DO_LDFF1_LDNF1_1(bsu, 2)
4655 DO_LDFF1_LDNF1_1(bss, 2)
4656 DO_LDFF1_LDNF1_1(bdu, 3)
4657 DO_LDFF1_LDNF1_1(bds, 3)
4659 DO_LDFF1_LDNF1_2(hh, 1, 1)
4660 DO_LDFF1_LDNF1_2(hsu, 2, 1)
4661 DO_LDFF1_LDNF1_2(hss, 2, 1)
4662 DO_LDFF1_LDNF1_2(hdu, 3, 1)
4663 DO_LDFF1_LDNF1_2(hds, 3, 1)
4665 DO_LDFF1_LDNF1_2(ss, 2, 2)
4666 DO_LDFF1_LDNF1_2(sdu, 3, 2)
4667 DO_LDFF1_LDNF1_2(sds, 3, 2)
4669 DO_LDFF1_LDNF1_2(dd, 3, 3)
4671 #undef DO_LDFF1_LDNF1_1
4672 #undef DO_LDFF1_LDNF1_2
4675 * Store contiguous data, protected by a governing predicate.
4678 #ifdef CONFIG_SOFTMMU
4679 #define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4680 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4681 target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
4683 TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra); \
4685 #else
4686 #define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4687 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4688 target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
4690 HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off))); \
4692 #endif
4694 DO_ST_TLB(st1bb, H1, uint8_t, stb_p, 0, helper_ret_stb_mmu)
4695 DO_ST_TLB(st1bh, H1_2, uint16_t, stb_p, 0, helper_ret_stb_mmu)
4696 DO_ST_TLB(st1bs, H1_4, uint32_t, stb_p, 0, helper_ret_stb_mmu)
4697 DO_ST_TLB(st1bd, , uint64_t, stb_p, 0, helper_ret_stb_mmu)
4699 DO_ST_TLB(st1hh_le, H1_2, uint16_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4700 DO_ST_TLB(st1hs_le, H1_4, uint32_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4701 DO_ST_TLB(st1hd_le, , uint64_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4703 DO_ST_TLB(st1ss_le, H1_4, uint32_t, stl_le_p, MO_LE, helper_le_stl_mmu)
4704 DO_ST_TLB(st1sd_le, , uint64_t, stl_le_p, MO_LE, helper_le_stl_mmu)
4706 DO_ST_TLB(st1dd_le, , uint64_t, stq_le_p, MO_LE, helper_le_stq_mmu)
4708 DO_ST_TLB(st1hh_be, H1_2, uint16_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4709 DO_ST_TLB(st1hs_be, H1_4, uint32_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4710 DO_ST_TLB(st1hd_be, , uint64_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4712 DO_ST_TLB(st1ss_be, H1_4, uint32_t, stl_be_p, MO_BE, helper_be_stl_mmu)
4713 DO_ST_TLB(st1sd_be, , uint64_t, stl_be_p, MO_BE, helper_be_stl_mmu)
4715 DO_ST_TLB(st1dd_be, , uint64_t, stq_be_p, MO_BE, helper_be_stq_mmu)
4720 * Common helpers for all contiguous 1,2,3,4-register predicated stores.
4722 static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
4723 uint32_t desc, const uintptr_t ra,
4724 const int esize, const int msize,
4725 sve_st1_tlb_fn *tlb_fn)
4727 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4728 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4729 intptr_t i, oprsz = simd_oprsz(desc);
4730 void *vd = &env->vfp.zregs[rd];
4732 set_helper_retaddr(ra);
4733 for (i = 0; i < oprsz; ) {
4734 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4737 tlb_fn(env, vd, i, addr, oi, ra);
4739 i += esize, pg >>= esize;
4740 addr += msize;
4743 clear_helper_retaddr();
4746 static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
4747 uint32_t desc, const uintptr_t ra,
4748 const int esize, const int msize,
4749 sve_st1_tlb_fn *tlb_fn)
4751 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4752 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4753 intptr_t i, oprsz = simd_oprsz(desc);
4754 void *d1 = &env->vfp.zregs[rd];
4755 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4757 set_helper_retaddr(ra);
4758 for (i = 0; i < oprsz; ) {
4759 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4762 tlb_fn(env, d1, i, addr, oi, ra);
4763 tlb_fn(env, d2, i, addr + msize, oi, ra);
4765 i += esize, pg >>= esize;
4766 addr += 2 * msize;
4769 clear_helper_retaddr();
4772 static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
4773 uint32_t desc, const uintptr_t ra,
4774 const int esize, const int msize,
4775 sve_st1_tlb_fn *tlb_fn)
4777 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4778 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4779 intptr_t i, oprsz = simd_oprsz(desc);
4780 void *d1 = &env->vfp.zregs[rd];
4781 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4782 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
4784 set_helper_retaddr(ra);
4785 for (i = 0; i < oprsz; ) {
4786 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4789 tlb_fn(env, d1, i, addr, oi, ra);
4790 tlb_fn(env, d2, i, addr + msize, oi, ra);
4791 tlb_fn(env, d3, i, addr + 2 * msize, oi, ra);
4793 i += esize, pg >>= esize;
4794 addr += 3 * msize;
4797 clear_helper_retaddr();
4800 static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
4801 uint32_t desc, const uintptr_t ra,
4802 const int esize, const int msize,
4803 sve_st1_tlb_fn *tlb_fn)
4805 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4806 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4807 intptr_t i, oprsz = simd_oprsz(desc);
4808 void *d1 = &env->vfp.zregs[rd];
4809 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4810 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
4811 void *d4 = &env->vfp.zregs[(rd + 3) & 31];
4813 set_helper_retaddr(ra);
4814 for (i = 0; i < oprsz; ) {
4815 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4818 tlb_fn(env, d1, i, addr, oi, ra);
4819 tlb_fn(env, d2, i, addr + msize, oi, ra);
4820 tlb_fn(env, d3, i, addr + 2 * msize, oi, ra);
4821 tlb_fn(env, d4, i, addr + 3 * msize, oi, ra);
4823 i += esize, pg >>= esize;
4824 addr += 4 * msize;
4827 clear_helper_retaddr();
4830 #define DO_STN_1(N, NAME, ESIZE) \
4831 void QEMU_FLATTEN HELPER(sve_st##N##NAME##_r) \
4832 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4834 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1, \
4835 sve_st1##NAME##_tlb); \
4838 #define DO_STN_2(N, NAME, ESIZE, MSIZE) \
4839 void QEMU_FLATTEN HELPER(sve_st##N##NAME##_le_r) \
4840 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4842 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
4843 sve_st1##NAME##_le_tlb); \
4845 void QEMU_FLATTEN HELPER(sve_st##N##NAME##_be_r) \
4846 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4848 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
4849 sve_st1##NAME##_be_tlb); \
4860 DO_STN_2(1, hh, 2, 2)
4861 DO_STN_2(1, hs, 4, 2)
4862 DO_STN_2(1, hd, 8, 2)
4863 DO_STN_2(2, hh, 2, 2)
4864 DO_STN_2(3, hh, 2, 2)
4865 DO_STN_2(4, hh, 2, 2)
4867 DO_STN_2(1, ss, 4, 4)
4868 DO_STN_2(1, sd, 8, 4)
4869 DO_STN_2(2, ss, 4, 4)
4870 DO_STN_2(3, ss, 4, 4)
4871 DO_STN_2(4, ss, 4, 4)
4873 DO_STN_2(1, dd, 8, 8)
4874 DO_STN_2(2, dd, 8, 8)
4875 DO_STN_2(3, dd, 8, 8)
4876 DO_STN_2(4, dd, 8, 8)
4882 * Loads with a vector index.
4886 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
4888 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
4890 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
4892 return *(uint32_t *)(reg + H1_4(reg_ofs));
4895 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
4897 return *(int32_t *)(reg + H1_4(reg_ofs));
4900 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
4902 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
4905 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
4907 return (int32_t)*(uint64_t *)(reg + reg_ofs);
4910 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
4912 return *(uint64_t *)(reg + reg_ofs);
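/*
 * Editorial sketch, not part of the original file: the difference
 * between the unsigned (zsu) and signed (zss) 32-bit offset extractors
 * above, for a lane holding 0xfffffff0.  Assumes a 64-bit target_ulong.
 */
#if 0
static void zreg_off_example(void)
{
    uint64_t zm0 = 0xfffffff0u;                /* 32-bit lane 0 of a vector chunk */
    target_ulong u = off_zsu_s(&zm0, 0);       /* zero-extended: 0x00000000fffffff0 */
    target_ulong s = off_zss_s(&zm0, 0);       /* sign-extended: (target_ulong)-16 */
}
#endif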
4915 static void sve_ld1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
4916 target_ulong base, uint32_t desc, uintptr_t ra,
4917 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
4919 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4920 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
4921 intptr_t i, oprsz = simd_oprsz(desc);
4922 ARMVectorReg scratch = { };
4924 set_helper_retaddr(ra);
4925 for (i = 0; i < oprsz; ) {
4926 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4928 if (likely(pg & 1)) {
4929 target_ulong off = off_fn(vm, i);
4930 tlb_fn(env, &scratch, i, base + (off << scale), oi, ra);
4932 i += 4, pg >>= 4;
4935 clear_helper_retaddr();
4937 /* Wait until all exceptions have been raised to write back. */
4938 memcpy(vd, &scratch, oprsz);
4941 static void sve_ld1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
4942 target_ulong base, uint32_t desc, uintptr_t ra,
4943 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
4945 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4946 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
4947 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4948 ARMVectorReg scratch = { };
4950 set_helper_retaddr(ra);
4951 for (i = 0; i < oprsz; i++) {
4952 uint8_t pg = *(uint8_t *)(vg + H1(i));
4953 if (likely(pg & 1)) {
4954 target_ulong off = off_fn(vm, i * 8);
4955 tlb_fn(env, &scratch, i * 8, base + (off << scale), oi, ra);
4958 clear_helper_retaddr();
4960 /* Wait until all exceptions have been raised to write back. */
4961 memcpy(vd, &scratch, oprsz * 8);
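/*
 * Editorial sketch, not part of the original file: the address
 * computation used by the gather loops above.  For a hypothetical
 * scaled 64-bit gather with base 0x1000, an offset lane holding 5, and
 * scale == 3 (log2 of the 8-byte element size), the element is loaded
 * from 0x1000 + (5 << 3) == 0x1028.
 */
#if 0
static target_ulong gather_addr_example(void)
{
    target_ulong base = 0x1000, off = 5;
    int scale = 3;
    return base + (off << scale);              /* == 0x1028 */
}
#endif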
4964 #define DO_LD1_ZPZ_S(MEM, OFS) \
4965 void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS) \
4966 (CPUARMState *env, void *vd, void *vg, void *vm, \
4967 target_ulong base, uint32_t desc) \
4969 sve_ld1_zs(env, vd, vg, vm, base, desc, GETPC(), \
4970 off_##OFS##_s, sve_ld1##MEM##_tlb); \
4973 #define DO_LD1_ZPZ_D(MEM, OFS) \
4974 void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS) \
4975 (CPUARMState *env, void *vd, void *vg, void *vm, \
4976 target_ulong base, uint32_t desc) \
4978 sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(), \
4979 off_##OFS##_d, sve_ld1##MEM##_tlb); \
4982 DO_LD1_ZPZ_S(bsu, zsu)
4983 DO_LD1_ZPZ_S(bsu, zss)
4984 DO_LD1_ZPZ_D(bdu, zsu)
4985 DO_LD1_ZPZ_D(bdu, zss)
4986 DO_LD1_ZPZ_D(bdu, zd)
4988 DO_LD1_ZPZ_S(bss, zsu)
4989 DO_LD1_ZPZ_S(bss, zss)
4990 DO_LD1_ZPZ_D(bds, zsu)
4991 DO_LD1_ZPZ_D(bds, zss)
4992 DO_LD1_ZPZ_D(bds, zd)
4994 DO_LD1_ZPZ_S(hsu_le, zsu)
4995 DO_LD1_ZPZ_S(hsu_le, zss)
4996 DO_LD1_ZPZ_D(hdu_le, zsu)
4997 DO_LD1_ZPZ_D(hdu_le, zss)
4998 DO_LD1_ZPZ_D(hdu_le, zd)
5000 DO_LD1_ZPZ_S(hsu_be, zsu)
5001 DO_LD1_ZPZ_S(hsu_be, zss)
5002 DO_LD1_ZPZ_D(hdu_be, zsu)
5003 DO_LD1_ZPZ_D(hdu_be, zss)
5004 DO_LD1_ZPZ_D(hdu_be, zd)
5006 DO_LD1_ZPZ_S(hss_le, zsu)
5007 DO_LD1_ZPZ_S(hss_le, zss)
5008 DO_LD1_ZPZ_D(hds_le, zsu)
5009 DO_LD1_ZPZ_D(hds_le, zss)
5010 DO_LD1_ZPZ_D(hds_le, zd)
5012 DO_LD1_ZPZ_S(hss_be, zsu)
5013 DO_LD1_ZPZ_S(hss_be, zss)
5014 DO_LD1_ZPZ_D(hds_be, zsu)
5015 DO_LD1_ZPZ_D(hds_be, zss)
5016 DO_LD1_ZPZ_D(hds_be, zd)
5018 DO_LD1_ZPZ_S(ss_le, zsu)
5019 DO_LD1_ZPZ_S(ss_le, zss)
5020 DO_LD1_ZPZ_D(sdu_le, zsu)
5021 DO_LD1_ZPZ_D(sdu_le, zss)
5022 DO_LD1_ZPZ_D(sdu_le, zd)
5024 DO_LD1_ZPZ_S(ss_be, zsu)
5025 DO_LD1_ZPZ_S(ss_be, zss)
5026 DO_LD1_ZPZ_D(sdu_be, zsu)
5027 DO_LD1_ZPZ_D(sdu_be, zss)
5028 DO_LD1_ZPZ_D(sdu_be, zd)
5030 DO_LD1_ZPZ_D(sds_le, zsu)
5031 DO_LD1_ZPZ_D(sds_le, zss)
5032 DO_LD1_ZPZ_D(sds_le, zd)
5034 DO_LD1_ZPZ_D(sds_be, zsu)
5035 DO_LD1_ZPZ_D(sds_be, zss)
5036 DO_LD1_ZPZ_D(sds_be, zd)
5038 DO_LD1_ZPZ_D(dd_le, zsu)
5039 DO_LD1_ZPZ_D(dd_le, zss)
5040 DO_LD1_ZPZ_D(dd_le, zd)
5042 DO_LD1_ZPZ_D(dd_be, zsu)
5043 DO_LD1_ZPZ_D(dd_be, zss)
5044 DO_LD1_ZPZ_D(dd_be, zd)
5049 /* First fault loads with a vector index. */
5051 /* Load one element into VD+REG_OFF from (ENV,VADDR) without faulting.
5052 * The controlling predicate is known to be true. Return true if the
5053 * load was successful.
5055 typedef bool sve_ld1_nf_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5056 target_ulong vaddr, int mmu_idx);
5058 #ifdef CONFIG_SOFTMMU
5059 #define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
5060 static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off, \
5061 target_ulong addr, int mmu_idx) \
5063 target_ulong next_page = -(addr | TARGET_PAGE_MASK); \
5064 if (likely(next_page - addr >= sizeof(TYPEM))) { \
5065 void *host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx); \
5066 if (likely(host)) { \
5067 TYPEM val = HOST(host); \
5068 *(TYPEE *)(vd + H(reg_off)) = val; \
5074 #else
5075 #define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
5076 static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off, \
5077 target_ulong addr, int mmu_idx) \
5079 if (likely(page_check_range(addr, sizeof(TYPEM), PAGE_READ))) { \
5080 TYPEM val = HOST(g2h(addr)); \
5081 *(TYPEE *)(vd + H(reg_off)) = val; \
5086 #endif
5088 DO_LD_NF(bsu, H1_4, uint32_t, uint8_t, ldub_p)
5089 DO_LD_NF(bss, H1_4, uint32_t, int8_t, ldsb_p)
5090 DO_LD_NF(bdu, , uint64_t, uint8_t, ldub_p)
5091 DO_LD_NF(bds, , uint64_t, int8_t, ldsb_p)
5093 DO_LD_NF(hsu_le, H1_4, uint32_t, uint16_t, lduw_le_p)
5094 DO_LD_NF(hss_le, H1_4, uint32_t, int16_t, ldsw_le_p)
5095 DO_LD_NF(hsu_be, H1_4, uint32_t, uint16_t, lduw_be_p)
5096 DO_LD_NF(hss_be, H1_4, uint32_t, int16_t, ldsw_be_p)
5097 DO_LD_NF(hdu_le, , uint64_t, uint16_t, lduw_le_p)
5098 DO_LD_NF(hds_le, , uint64_t, int16_t, ldsw_le_p)
5099 DO_LD_NF(hdu_be, , uint64_t, uint16_t, lduw_be_p)
5100 DO_LD_NF(hds_be, , uint64_t, int16_t, ldsw_be_p)
5102 DO_LD_NF(ss_le, H1_4, uint32_t, uint32_t, ldl_le_p)
5103 DO_LD_NF(ss_be, H1_4, uint32_t, uint32_t, ldl_be_p)
5104 DO_LD_NF(sdu_le, , uint64_t, uint32_t, ldl_le_p)
5105 DO_LD_NF(sds_le, , uint64_t, int32_t, ldl_le_p)
5106 DO_LD_NF(sdu_be, , uint64_t, uint32_t, ldl_be_p)
5107 DO_LD_NF(sds_be, , uint64_t, int32_t, ldl_be_p)
5109 DO_LD_NF(dd_le, , uint64_t, uint64_t, ldq_le_p)
5110 DO_LD_NF(dd_be, , uint64_t, uint64_t, ldq_be_p)
5113 * Common helper for all gather first-faulting loads.
5115 static inline void sve_ldff1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
5116 target_ulong base, uint32_t desc, uintptr_t ra,
5117 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn,
5118 sve_ld1_nf_fn *nonfault_fn)
5120 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
5121 const int mmu_idx = get_mmuidx(oi);
5122 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
5123 intptr_t reg_off, reg_max = simd_oprsz(desc);
5126 /* Skip to the first true predicate. */
5127 reg_off = find_next_active(vg, 0, reg_max, MO_32);
5128 if (likely(reg_off < reg_max)) {
5129 /* Perform one normal read, which will fault or not. */
5130 set_helper_retaddr(ra);
5131 addr = off_fn(vm, reg_off);
5132 addr = base + (addr << scale);
5133 tlb_fn(env, vd, reg_off, addr, oi, ra);
5135 /* The rest of the reads will be non-faulting. */
5136 clear_helper_retaddr();
5139 /* After any fault, zero the leading predicated false elements. */
5140 swap_memzero(vd, reg_off);
5142 while (likely((reg_off += 4) < reg_max)) {
5143 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 6) * 8);
5144 if (likely((pg >> (reg_off & 63)) & 1)) {
5145 addr = off_fn(vm, reg_off);
5146 addr = base + (addr << scale);
5147 if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
5148 record_fault(env, reg_off, reg_max);
5149 break;
5151 } else {
5152 *(uint32_t *)(vd + H1_4(reg_off)) = 0;
5157 static inline void sve_ldff1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
5158 target_ulong base, uint32_t desc, uintptr_t ra,
5159 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn,
5160 sve_ld1_nf_fn *nonfault_fn)
5162 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
5163 const int mmu_idx = get_mmuidx(oi);
5164 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
5165 intptr_t reg_off, reg_max = simd_oprsz(desc);
5168 /* Skip to the first true predicate. */
5169 reg_off = find_next_active(vg, 0, reg_max, MO_64);
5170 if (likely(reg_off < reg_max)) {
5171 /* Perform one normal read, which will fault or not. */
5172 set_helper_retaddr(ra);
5173 addr = off_fn(vm, reg_off);
5174 addr = base + (addr << scale);
5175 tlb_fn(env, vd, reg_off, addr, oi, ra);
5177 /* The rest of the reads will be non-faulting. */
5178 clear_helper_retaddr();
5181 /* After any fault, zero the leading predicated false elements. */
5182 swap_memzero(vd, reg_off);
5184 while (likely((reg_off += 8) < reg_max)) {
5185 uint8_t pg = *(uint8_t *)(vg + H1(reg_off >> 3));
5186 if (likely(pg & 1)) {
5187 addr = off_fn(vm, reg_off);
5188 addr = base + (addr << scale);
5189 if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
5190 record_fault(env, reg_off, reg_max);
5191 break;
5193 } else {
5194 *(uint64_t *)(vd + reg_off) = 0;
5199 #define DO_LDFF1_ZPZ_S(MEM, OFS) \
5200 void HELPER(sve_ldff##MEM##_##OFS) \
5201 (CPUARMState *env, void *vd, void *vg, void *vm, \
5202 target_ulong base, uint32_t desc) \
5204 sve_ldff1_zs(env, vd, vg, vm, base, desc, GETPC(), \
5205 off_##OFS##_s, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf); \
5208 #define DO_LDFF1_ZPZ_D(MEM, OFS) \
5209 void HELPER(sve_ldff##MEM##_##OFS) \
5210 (CPUARMState *env, void *vd, void *vg, void *vm, \
5211 target_ulong base, uint32_t desc) \
5213 sve_ldff1_zd(env, vd, vg, vm, base, desc, GETPC(), \
5214 off_##OFS##_d, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf); \
5217 DO_LDFF1_ZPZ_S(bsu, zsu)
5218 DO_LDFF1_ZPZ_S(bsu, zss)
5219 DO_LDFF1_ZPZ_D(bdu, zsu)
5220 DO_LDFF1_ZPZ_D(bdu, zss)
5221 DO_LDFF1_ZPZ_D(bdu, zd)
5223 DO_LDFF1_ZPZ_S(bss, zsu)
5224 DO_LDFF1_ZPZ_S(bss, zss)
5225 DO_LDFF1_ZPZ_D(bds, zsu)
5226 DO_LDFF1_ZPZ_D(bds, zss)
5227 DO_LDFF1_ZPZ_D(bds, zd)
5229 DO_LDFF1_ZPZ_S(hsu_le, zsu)
5230 DO_LDFF1_ZPZ_S(hsu_le, zss)
5231 DO_LDFF1_ZPZ_D(hdu_le, zsu)
5232 DO_LDFF1_ZPZ_D(hdu_le, zss)
5233 DO_LDFF1_ZPZ_D(hdu_le, zd)
5235 DO_LDFF1_ZPZ_S(hsu_be, zsu)
5236 DO_LDFF1_ZPZ_S(hsu_be, zss)
5237 DO_LDFF1_ZPZ_D(hdu_be, zsu)
5238 DO_LDFF1_ZPZ_D(hdu_be, zss)
5239 DO_LDFF1_ZPZ_D(hdu_be, zd)
5241 DO_LDFF1_ZPZ_S(hss_le, zsu)
5242 DO_LDFF1_ZPZ_S(hss_le, zss)
5243 DO_LDFF1_ZPZ_D(hds_le, zsu)
5244 DO_LDFF1_ZPZ_D(hds_le, zss)
5245 DO_LDFF1_ZPZ_D(hds_le, zd)
5247 DO_LDFF1_ZPZ_S(hss_be, zsu)
5248 DO_LDFF1_ZPZ_S(hss_be, zss)
5249 DO_LDFF1_ZPZ_D(hds_be, zsu)
5250 DO_LDFF1_ZPZ_D(hds_be, zss)
5251 DO_LDFF1_ZPZ_D(hds_be, zd)
5253 DO_LDFF1_ZPZ_S(ss_le, zsu)
5254 DO_LDFF1_ZPZ_S(ss_le, zss)
5255 DO_LDFF1_ZPZ_D(sdu_le, zsu)
5256 DO_LDFF1_ZPZ_D(sdu_le, zss)
5257 DO_LDFF1_ZPZ_D(sdu_le, zd)
5259 DO_LDFF1_ZPZ_S(ss_be, zsu)
5260 DO_LDFF1_ZPZ_S(ss_be, zss)
5261 DO_LDFF1_ZPZ_D(sdu_be, zsu)
5262 DO_LDFF1_ZPZ_D(sdu_be, zss)
5263 DO_LDFF1_ZPZ_D(sdu_be, zd)
5265 DO_LDFF1_ZPZ_D(sds_le, zsu)
5266 DO_LDFF1_ZPZ_D(sds_le, zss)
5267 DO_LDFF1_ZPZ_D(sds_le, zd)
5269 DO_LDFF1_ZPZ_D(sds_be, zsu)
5270 DO_LDFF1_ZPZ_D(sds_be, zss)
5271 DO_LDFF1_ZPZ_D(sds_be, zd)
5273 DO_LDFF1_ZPZ_D(dd_le, zsu)
5274 DO_LDFF1_ZPZ_D(dd_le, zss)
5275 DO_LDFF1_ZPZ_D(dd_le, zd)
5277 DO_LDFF1_ZPZ_D(dd_be, zsu)
5278 DO_LDFF1_ZPZ_D(dd_be, zss)
5279 DO_LDFF1_ZPZ_D(dd_be, zd)
5281 /* Stores with a vector index. */
5283 static void sve_st1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
5284 target_ulong base, uint32_t desc, uintptr_t ra,
5285 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
5287 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
5288 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
5289 intptr_t i, oprsz = simd_oprsz(desc);
5291 set_helper_retaddr(ra);
5292 for (i = 0; i < oprsz; ) {
5293 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
5295 if (likely(pg & 1)) {
5296 target_ulong off = off_fn(vm, i);
5297 tlb_fn(env, vd, i, base + (off << scale), oi, ra);
5299 i += 4, pg >>= 4;
5302 clear_helper_retaddr();
5305 static void sve_st1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
5306 target_ulong base, uint32_t desc, uintptr_t ra,
5307 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
5309 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
5310 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
5311 intptr_t i, oprsz = simd_oprsz(desc) / 8;
5313 set_helper_retaddr(ra);
5314 for (i = 0; i < oprsz; i++) {
5315 uint8_t pg = *(uint8_t *)(vg + H1(i));
5316 if (likely(pg & 1)) {
5317 target_ulong off = off_fn(vm, i * 8);
5318 tlb_fn(env, vd, i * 8, base + (off << scale), oi, ra);
5321 clear_helper_retaddr();
5324 #define DO_ST1_ZPZ_S(MEM, OFS) \
5325 void QEMU_FLATTEN HELPER(sve_st##MEM##_##OFS) \
5326 (CPUARMState *env, void *vd, void *vg, void *vm, \
5327 target_ulong base, uint32_t desc) \
5329 sve_st1_zs(env, vd, vg, vm, base, desc, GETPC(), \
5330 off_##OFS##_s, sve_st1##MEM##_tlb); \
5333 #define DO_ST1_ZPZ_D(MEM, OFS) \
5334 void QEMU_FLATTEN HELPER(sve_st##MEM##_##OFS) \
5335 (CPUARMState *env, void *vd, void *vg, void *vm, \
5336 target_ulong base, uint32_t desc) \
5338 sve_st1_zd(env, vd, vg, vm, base, desc, GETPC(), \
5339 off_##OFS##_d, sve_st1##MEM##_tlb); \
5342 DO_ST1_ZPZ_S(bs, zsu)
5343 DO_ST1_ZPZ_S(hs_le, zsu)
5344 DO_ST1_ZPZ_S(hs_be, zsu)
5345 DO_ST1_ZPZ_S(ss_le, zsu)
5346 DO_ST1_ZPZ_S(ss_be, zsu)
5348 DO_ST1_ZPZ_S(bs, zss)
5349 DO_ST1_ZPZ_S(hs_le, zss)
5350 DO_ST1_ZPZ_S(hs_be, zss)
5351 DO_ST1_ZPZ_S(ss_le, zss)
5352 DO_ST1_ZPZ_S(ss_be, zss)
5354 DO_ST1_ZPZ_D(bd, zsu)
5355 DO_ST1_ZPZ_D(hd_le, zsu)
5356 DO_ST1_ZPZ_D(hd_be, zsu)
5357 DO_ST1_ZPZ_D(sd_le, zsu)
5358 DO_ST1_ZPZ_D(sd_be, zsu)
5359 DO_ST1_ZPZ_D(dd_le, zsu)
5360 DO_ST1_ZPZ_D(dd_be, zsu)
5362 DO_ST1_ZPZ_D(bd, zss)
5363 DO_ST1_ZPZ_D(hd_le, zss)
5364 DO_ST1_ZPZ_D(hd_be, zss)
5365 DO_ST1_ZPZ_D(sd_le, zss)
5366 DO_ST1_ZPZ_D(sd_be, zss)
5367 DO_ST1_ZPZ_D(dd_le, zss)
5368 DO_ST1_ZPZ_D(dd_be, zss)
5370 DO_ST1_ZPZ_D(bd, zd)
5371 DO_ST1_ZPZ_D(hd_le, zd)
5372 DO_ST1_ZPZ_D(hd_be, zd)
5373 DO_ST1_ZPZ_D(sd_le, zd)
5374 DO_ST1_ZPZ_D(sd_be, zd)
5375 DO_ST1_ZPZ_D(dd_le, zd)
5376 DO_ST1_ZPZ_D(dd_be, zd)