4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
31 /* Note that vector data is stored in host-endian 64-bit chunks,
32 so addressing units smaller than that needs a host-endian fixup. */
33 #ifdef HOST_WORDS_BIGENDIAN
34 #define H1(x) ((x) ^ 7)
35 #define H1_2(x) ((x) ^ 6)
36 #define H1_4(x) ((x) ^ 4)
37 #define H2(x) ((x) ^ 3)
38 #define H4(x) ((x) ^ 1)
47 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
50 * and bit 0 set if C is set. Compare the definitions of these variables within CPUARMState. */
54 /* For no G bits set, NZCV = C. */
55 #define PREDTEST_INIT 1
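/*
 * Worked example (illustrative, not part of the original helpers): with a
 * single predicate word where G = 1 (one governed element) and D = 0 (that
 * element false), iter_predtest_fwd(0, 1, PREDTEST_INIT) returns 0x5:
 * bit 31 clear (N = 0), bit 1 clear (Z = 1), bit 0 set (C = 1).  Bit 2 is
 * only the internal "first G bit seen" marker carried between iterations.
 */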
57 /* This is an iterative function, called for each Pd and Pg word moving forward. */
60 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
63 /* Compute N from first D & G.
64 Use bit 2 to signal first G bit seen. */
66 flags |= ((d & (g & -g)) != 0) << 31;
70 /* Accumulate Z from each D & G. */
71 flags |= ((d & g) != 0) << 1;
73 /* Compute C from last !(D & G). Replace previous. */
74 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
79 /* This is an iterative function, called for each Pd and Pg word moving backward. */
82 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
85 /* Compute C from first (i.e last) !(D & G).
86 Use bit 2 to signal first G bit seen. */
88 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
89 flags |= (d & pow2floor(g)) == 0;
92 /* Accumulate Z from each D & G. */
93 flags |= ((d & g) != 0) << 1;
95 /* Compute N from last (i.e first) D & G. Replace previous. */
96 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
101 /* The same for a single word predicate. */
102 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
104 return iter_predtest_fwd(d, g, PREDTEST_INIT);
107 /* The same for a multi-word predicate. */
108 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
110 uint32_t flags = PREDTEST_INIT;
111 uint64_t *d = vd, *g = vg;
115 flags = iter_predtest_fwd(d[i], g[i], flags);
116 } while (++i < words);
121 /* Expand active predicate bits to bytes, for byte elements.
122 * for (i = 0; i < 256; ++i) {
123 * unsigned long m = 0;
124 * for (j = 0; j < 8; j++) {
125 * if ((i >> j) & 1) {
126 * m |= 0xfful << (j << 3);
129 * printf("0x%016lx,\n", m);
132 static inline uint64_t expand_pred_b(uint8_t byte)
134 static const uint64_t word[256] = {
135 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
136 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
137 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
138 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
139 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
140 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
141 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
142 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
143 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
144 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
145 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
146 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
147 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
148 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
149 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
150 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
151 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
152 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
153 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
154 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
155 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
156 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
157 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
158 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
159 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
160 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
161 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
162 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
163 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
164 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
165 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
166 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
167 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
168 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
169 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
170 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
171 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
172 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
173 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
174 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
175 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
176 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
177 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
178 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
179 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
180 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
181 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
182 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
183 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
184 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
185 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
186 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
187 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
188 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
189 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
190 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
191 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
192 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
193 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
194 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
195 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
196 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
197 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
198 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
199 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
200 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
201 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
202 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
203 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
204 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
205 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
206 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
207 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
208 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
209 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
210 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
211 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
212 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
213 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
214 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
215 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
216 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
217 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
218 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
219 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
225 /* Similarly for half-word elements.
226 * for (i = 0; i < 256; ++i) {
227 * unsigned long m = 0;
231 * for (j = 0; j < 8; j += 2) {
232 * if ((i >> j) & 1) {
233 * m |= 0xfffful << (j << 3);
236 * printf("[0x%x] = 0x%016lx,\n", i, m);
239 static inline uint64_t expand_pred_h(uint8_t byte)
241 static const uint64_t word[] = {
242 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
243 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
244 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
245 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
246 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
247 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
248 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
249 [0x55] = 0xffffffffffffffff,
251 return word[byte & 0x55];
254 /* Similarly for single word elements. */
255 static inline uint64_t expand_pred_s(uint8_t byte)
257 static const uint64_t word[] = {
258 [0x01] = 0x00000000ffffffffull,
259 [0x10] = 0xffffffff00000000ull,
260 [0x11] = 0xffffffffffffffffull,
262 return word[byte & 0x11];
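/*
 * Illustrative values (not from the original source), checkable against the
 * tables above: expand_pred_b(0x0f) == 0x00000000ffffffff,
 * expand_pred_h(0x05) == 0x00000000ffffffff and
 * expand_pred_s(0x11) == 0xffffffffffffffff.  Each predicate bit that
 * governs an element expands to an all-ones lane of that element's width.
 */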
265 /* Swap 16-bit words within a 32-bit word. */
266 static inline uint32_t hswap32(uint32_t h)
271 /* Swap 16-bit words within a 64-bit word. */
272 static inline uint64_t hswap64(uint64_t h)
274 uint64_t m = 0x0000ffff0000ffffull;
276 return ((h & m) << 16) | ((h >> 16) & m);
279 /* Swap 32-bit words within a 64-bit word. */
280 static inline uint64_t wswap64(uint64_t h)
285 #define LOGICAL_PPPP(NAME, FUNC) \
286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
288 uintptr_t opr_sz = simd_oprsz(desc); \
289 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
291 for (i = 0; i < opr_sz / 8; ++i) { \
292 d[i] = FUNC(n[i], m[i], g[i]); \
296 #define DO_AND(N, M, G) (((N) & (M)) & (G))
297 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
298 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
299 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
300 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
301 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
302 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
303 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
305 LOGICAL_PPPP(sve_and_pppp, DO_AND)
306 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
307 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
308 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
309 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
310 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
311 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
312 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
324 /* Fully general three-operand expander, controlled by a predicate.
325 * This is complicated by the host-endian storage of the register file.
327 /* ??? I don't expect the compiler could ever vectorize this itself.
328 * With some tables we can convert bit masks to byte masks, and with
329 * extra care wrt byte/word ordering we could use gcc generic vectors
330 * and do 16 bytes at a time.
332 #define DO_ZPZZ(NAME, TYPE, H, OP) \
333 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
335 intptr_t i, opr_sz = simd_oprsz(desc); \
336 for (i = 0; i < opr_sz; ) { \
337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
340 TYPE nn = *(TYPE *)(vn + H(i)); \
341 TYPE mm = *(TYPE *)(vm + H(i)); \
342 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
349 /* Similarly, specialized for 64-bit operands. */
350 #define DO_ZPZZ_D(NAME, TYPE, OP) \
351 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
354 TYPE *d = vd, *n = vn, *m = vm; \
356 for (i = 0; i < opr_sz; i += 1) { \
357 if (pg[H1(i)] & 1) { \
358 TYPE nn = n[i], mm = m[i]; \
364 #define DO_AND(N, M) (N & M)
365 #define DO_EOR(N, M) (N ^ M)
366 #define DO_ORR(N, M) (N | M)
367 #define DO_BIC(N, M) (N & ~M)
368 #define DO_ADD(N, M) (N + M)
369 #define DO_SUB(N, M) (N - M)
370 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
371 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
372 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
373 #define DO_MUL(N, M) (N * M)
377 * We must avoid the C undefined behaviour cases: division by
378 * zero and signed division of INT_MIN by -1. Both of these
379 * have architecturally defined required results for Arm.
380 * We special case all signed divisions by -1 to avoid having
381 * to deduce the minimum integer for the type involved.
383 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
384 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
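/*
 * Illustrative consequences of the special cases above (not original text):
 * DO_SDIV(n, 0) and DO_UDIV(n, 0) both evaluate to 0, the Arm-defined
 * result for division by zero, and DO_SDIV(n, -1) takes the -N branch so
 * that N / M is never evaluated for the INT_MIN / -1 case.
 */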
386 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
387 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
389 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
391 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
392 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
394 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
396 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
397 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
399 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
401 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
402 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
404 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
406 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
407 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
409 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
411 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
412 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
414 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
416 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
417 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
419 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
424 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
426 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
427 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
429 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
434 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
436 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
437 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
439 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
444 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
446 /* Because the computation type is at least twice as large as required,
447 these work for both signed and unsigned source types. */
448 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
453 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
455 return (n * m) >> 16;
458 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
460 return (n * m) >> 32;
463 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
466 muls64(&lo, &hi, n, m);
470 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
473 mulu64(&lo, &hi, n, m);
477 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
478 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
480 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
482 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
483 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
484 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
485 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
487 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
488 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
489 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
490 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
492 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
493 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
495 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
496 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
498 /* Note that all bits of the shift are significant
499 and not modulo the element size. */
500 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
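/*
 * Worked example (illustrative): with 8-bit elements, DO_LSR(n, 8) and
 * DO_LSL(n, 8) are 0 because the count is out of range, while
 * DO_ASR(n, 200) clamps the count to 7 and still replicates the sign bit.
 */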
504 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
505 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
506 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
508 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
509 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
510 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
512 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
513 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
514 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
516 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
523 /* Three-operand expander, controlled by a predicate, in which the
524 * third operand is "wide". That is, for D = N op M, the same 64-bit
525 * value of M is used with all of the narrower values of N.
527 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
530 intptr_t i, opr_sz = simd_oprsz(desc); \
531 for (i = 0; i < opr_sz; ) { \
532 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
533 TYPEW mm = *(TYPEW *)(vm + i); \
536 TYPE nn = *(TYPE *)(vn + H(i)); \
537 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
539 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
544 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
545 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
546 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
548 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
549 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
550 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
552 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
553 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
554 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
558 /* Fully general two-operand expander, controlled by a predicate.
560 #define DO_ZPZ(NAME, TYPE, H, OP) \
561 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
563 intptr_t i, opr_sz = simd_oprsz(desc); \
564 for (i = 0; i < opr_sz; ) { \
565 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
568 TYPE nn = *(TYPE *)(vn + H(i)); \
569 *(TYPE *)(vd + H(i)) = OP(nn); \
571 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
576 /* Similarly, specialized for 64-bit operands. */
577 #define DO_ZPZ_D(NAME, TYPE, OP) \
578 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
581 TYPE *d = vd, *n = vn; \
583 for (i = 0; i < opr_sz; i += 1) { \
584 if (pg[H1(i)] & 1) { \
591 #define DO_CLS_B(N) (clrsb32(N) - 24)
592 #define DO_CLS_H(N) (clrsb32(N) - 16)
594 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
595 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
596 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
597 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
599 #define DO_CLZ_B(N) (clz32(N) - 24)
600 #define DO_CLZ_H(N) (clz32(N) - 16)
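/*
 * The -24/-16 bias compensates for doing the count in 32 bits (illustrative
 * note, not original text): e.g. DO_CLZ_B(1) == clz32(1) - 24 == 7 and
 * DO_CLS_B(0) == clrsb32(0) - 24 == 7, the correct results for an 8-bit
 * element.
 */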
602 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
603 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
604 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
605 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
607 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
608 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
609 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
610 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
612 #define DO_CNOT(N) (N == 0)
614 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
615 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
616 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
617 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
619 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
621 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
622 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
623 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
625 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
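/*
 * Illustrative expansion (not original text): for uint32_t operands,
 * DO_FABS(n) is n & 0x7fffffff (clear the sign bit) and DO_FNEG(n) is
 * n ^ 0x80000000 (flip the sign bit); the 16- and 64-bit forms follow the
 * same pattern via __typeof.
 */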
627 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
628 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
629 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
631 #define DO_NOT(N) (~N)
633 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
634 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
635 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
636 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
638 #define DO_SXTB(N) ((int8_t)N)
639 #define DO_SXTH(N) ((int16_t)N)
640 #define DO_SXTS(N) ((int32_t)N)
641 #define DO_UXTB(N) ((uint8_t)N)
642 #define DO_UXTH(N) ((uint16_t)N)
643 #define DO_UXTS(N) ((uint32_t)N)
645 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
646 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
647 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
648 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
649 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
650 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
652 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
653 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
654 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
655 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
656 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
657 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
659 #define DO_ABS(N) (N < 0 ? -N : N)
661 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
662 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
663 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
664 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
666 #define DO_NEG(N) (-N)
668 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
669 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
670 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
671 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
673 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
674 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
675 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
677 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
678 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
680 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
682 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
683 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
684 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
685 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
687 /* Three-operand expander, unpredicated, in which the third operand is "wide".
689 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
692 intptr_t i, opr_sz = simd_oprsz(desc); \
693 for (i = 0; i < opr_sz; ) { \
694 TYPEW mm = *(TYPEW *)(vm + i); \
696 TYPE nn = *(TYPE *)(vn + H(i)); \
697 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
703 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
704 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
705 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
707 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
708 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
709 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
711 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
712 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
713 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
729 /* Two-operand reduction expander, controlled by a predicate.
730 * The difference between TYPERED and TYPERET has to do with
731 * sign-extension. E.g. for SMAX, TYPERED must be signed,
732 * but TYPERET must be unsigned so that e.g. a 32-bit value
733 * is not sign-extended to the ABI uint64_t return type.
735 /* ??? If we were to vectorize this by hand the reduction ordering
736 * would change. For integer operands, this is perfectly fine.
738 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
739 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
741 intptr_t i, opr_sz = simd_oprsz(desc); \
742 TYPERED ret = INIT; \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
747 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
750 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
753 return (TYPERET)ret; \
756 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
757 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
759 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
763 for (i = 0; i < opr_sz; i += 1) { \
764 if (pg[H1(i)] & 1) { \
772 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
773 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
774 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
775 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
777 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
778 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
779 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
780 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
782 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
783 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
784 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
785 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
787 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
788 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
789 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
792 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
793 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
794 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
796 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
797 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
798 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
799 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
801 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
802 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
803 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
804 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
806 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
807 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
808 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
809 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
811 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
812 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
813 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
814 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
819 /* Two vector operand, one scalar operand, unpredicated. */
820 #define DO_ZZI(NAME, TYPE, OP) \
821 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
823 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
824 TYPE s = s64, *d = vd, *n = vn; \
825 for (i = 0; i < opr_sz; ++i) { \
826 d[i] = OP(n[i], s); \
830 #define DO_SUBR(X, Y) (Y - X)
832 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
833 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
834 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
835 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
837 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
838 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
839 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
840 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
842 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
843 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
844 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
845 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
847 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
848 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
849 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
850 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
852 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
853 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
854 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
855 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
875 /* Similar to the ARM LastActiveElement pseudocode function, except the
876 result is multiplied by the element size. This includes the not found
877 indication; e.g. not found for esz=3 is -8. */
878 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
880 uint64_t mask = pred_esz_masks[esz];
884 uint64_t this_g = g[--i] & mask;
886 return i * 64 + (63 - clz64(this_g));
889 return (intptr_t)-1 << esz;
892 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
894 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
895 uint32_t flags = PREDTEST_INIT;
896 uint64_t *d = vd, *g = vg;
900 uint64_t this_d = d[i];
901 uint64_t this_g = g[i];
905 /* Set in D the first bit of G. */
906 this_d |= this_g & -this_g;
909 flags = iter_predtest_fwd(this_d, this_g, flags);
911 } while (++i < words);
916 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
918 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
919 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
920 uint32_t flags = PREDTEST_INIT;
921 uint64_t *d = vd, *g = vg, esz_mask;
924 next = last_active_element(vd, words, esz) + (1 << esz);
925 esz_mask = pred_esz_masks[esz];
927 /* Similar to the pseudocode for pnext, but scaled by ESZ
928 so that we find the correct bit. */
929 if (next < words * 64) {
933 mask = ~((1ull << (next & 63)) - 1);
937 uint64_t this_g = g[next / 64] & esz_mask & mask;
939 next = (next & -64) + ctz64(this_g);
944 } while (next < words * 64);
950 if (i == next / 64) {
951 this_d = 1ull << (next & 63);
954 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
955 } while (++i < words);
961 * Copy Zn into Zd, and store zero into inactive elements.
962 * If inv, store zeros into the active elements.
964 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
966 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
967 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
968 uint64_t *d = vd, *n = vn;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
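/*
 * Sketch of the inv trick above (not original text): when simd_data(desc)
 * is 0, inv is 0 and the expanded predicate keeps only the active bytes of
 * n[i]; when it is 1, inv is all-ones, the mask is complemented, and the
 * active bytes are zeroed instead.
 */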
976 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
980 uint64_t *d = vd, *n = vn;
983 for (i = 0; i < opr_sz; i += 1) {
984 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
988 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
990 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
991 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
992 uint64_t *d = vd, *n = vn;
995 for (i = 0; i < opr_sz; i += 1) {
996 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1000 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1002 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1003 uint64_t *d = vd, *n = vn;
1005 uint8_t inv = simd_data(desc);
1007 for (i = 0; i < opr_sz; i += 1) {
1008 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1012 /* Three-operand expander, immediate operand, controlled by a predicate.
1014 #define DO_ZPZI(NAME, TYPE, H, OP) \
1015 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1017 intptr_t i, opr_sz = simd_oprsz(desc); \
1018 TYPE imm = simd_data(desc); \
1019 for (i = 0; i < opr_sz; ) { \
1020 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1023 TYPE nn = *(TYPE *)(vn + H(i)); \
1024 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1026 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1031 /* Similarly, specialized for 64-bit operands. */
1032 #define DO_ZPZI_D(NAME, TYPE, OP) \
1033 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1035 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1036 TYPE *d = vd, *n = vn; \
1037 TYPE imm = simd_data(desc); \
1039 for (i = 0; i < opr_sz; i += 1) { \
1040 if (pg[H1(i)] & 1) { \
1042 d[i] = OP(nn, imm); \
1047 #define DO_SHR(N, M) (N >> M)
1048 #define DO_SHL(N, M) (N << M)
1050 /* Arithmetic shift right for division. This rounds negative numbers
1051 toward zero as per signed division. Therefore before shifting,
1052 when N is negative, add 2**M-1. */
1053 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
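/*
 * Worked example (illustrative): DO_ASRD(-7, 2) first adds 2**2 - 1 = 3,
 * giving -4, then shifts right to obtain -1 == trunc(-7 / 4), whereas a
 * plain arithmetic shift of -7 by 2 would round down to -2.
 */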
1055 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1056 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1057 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1058 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1060 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1061 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1062 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1063 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1065 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1066 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1067 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1068 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1070 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1071 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1072 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1073 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1081 /* Fully general four-operand expander, controlled by a predicate.
1083 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1084 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1085 void *vg, uint32_t desc) \
1087 intptr_t i, opr_sz = simd_oprsz(desc); \
1088 for (i = 0; i < opr_sz; ) { \
1089 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1092 TYPE nn = *(TYPE *)(vn + H(i)); \
1093 TYPE mm = *(TYPE *)(vm + H(i)); \
1094 TYPE aa = *(TYPE *)(va + H(i)); \
1095 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1097 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1102 /* Similarly, specialized for 64-bit operands. */
1103 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1104 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1105 void *vg, uint32_t desc) \
1107 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1108 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1110 for (i = 0; i < opr_sz; i += 1) { \
1111 if (pg[H1(i)] & 1) { \
1112 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1113 d[i] = OP(aa, nn, mm); \
1118 #define DO_MLA(A, N, M) (A + N * M)
1119 #define DO_MLS(A, N, M) (A - N * M)
1121 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1122 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1124 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1125 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1127 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1128 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1130 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1131 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1138 void HELPER(sve_index_b)(void *vd, uint32_t start,
1139 uint32_t incr, uint32_t desc)
1141 intptr_t i, opr_sz = simd_oprsz(desc);
1143 for (i = 0; i < opr_sz; i += 1) {
1144 d[H1(i)] = start + i * incr;
1148 void HELPER(sve_index_h)(void *vd, uint32_t start,
1149 uint32_t incr, uint32_t desc)
1151 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1153 for (i = 0; i < opr_sz; i += 1) {
1154 d[H2(i)] = start + i * incr;
1158 void HELPER(sve_index_s)(void *vd, uint32_t start,
1159 uint32_t incr, uint32_t desc)
1161 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1163 for (i = 0; i < opr_sz; i += 1) {
1164 d[H4(i)] = start + i * incr;
1168 void HELPER(sve_index_d)(void *vd, uint64_t start,
1169 uint64_t incr, uint32_t desc)
1171 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1173 for (i = 0; i < opr_sz; i += 1) {
1174 d[i] = start + i * incr;
1178 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1180 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1181 uint32_t sh = simd_data(desc);
1182 uint32_t *d = vd, *n = vn, *m = vm;
1183 for (i = 0; i < opr_sz; i += 1) {
1184 d[i] = n[i] + (m[i] << sh);
1188 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1190 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1191 uint64_t sh = simd_data(desc);
1192 uint64_t *d = vd, *n = vn, *m = vm;
1193 for (i = 0; i < opr_sz; i += 1) {
1194 d[i] = n[i] + (m[i] << sh);
1198 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1200 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1201 uint64_t sh = simd_data(desc);
1202 uint64_t *d = vd, *n = vn, *m = vm;
1203 for (i = 0; i < opr_sz; i += 1) {
1204 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1208 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1210 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1211 uint64_t sh = simd_data(desc);
1212 uint64_t *d = vd, *n = vn, *m = vm;
1213 for (i = 0; i < opr_sz; i += 1) {
1214 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1218 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1220 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1221 static const uint16_t coeff[] = {
1222 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1223 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1224 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1225 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1228 uint16_t *d = vd, *n = vn;
1230 for (i = 0; i < opr_sz; i++) {
1232 intptr_t idx = extract32(nn, 0, 5);
1233 uint16_t exp = extract32(nn, 5, 5);
1234 d[i] = coeff[idx] | (exp << 10);
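/*
 * Illustrative example (not from the original source): an input of 0x0200
 * has fraction index 0 and exponent field 16, so the result is
 * coeff[0] | (16 << 10) == 0x4000, the float16 encoding of 2.0.
 */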
1238 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1240 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1241 static const uint32_t coeff[] = {
1242 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1243 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1244 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1245 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1246 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1247 0x1ef532, 0x20b051, 0x227043, 0x243516,
1248 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1249 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1250 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1251 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1252 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1253 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1254 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1255 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1256 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1257 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1259 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1260 uint32_t *d = vd, *n = vn;
1262 for (i = 0; i < opr_sz; i++) {
1264 intptr_t idx = extract32(nn, 0, 6);
1265 uint32_t exp = extract32(nn, 6, 8);
1266 d[i] = coeff[idx] | (exp << 23);
1270 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1272 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1273 static const uint64_t coeff[] = {
1274 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1275 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1276 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1277 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1278 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1279 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1280 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1281 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1282 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1283 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1284 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1285 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1286 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1287 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1288 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1289 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1290 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1291 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1292 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1293 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1294 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1297 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1298 uint64_t *d = vd, *n = vn;
1300 for (i = 0; i < opr_sz; i++) {
1302 intptr_t idx = extract32(nn, 0, 6);
1303 uint64_t exp = extract32(nn, 6, 11);
1304 d[i] = coeff[idx] | (exp << 52);
1308 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1310 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1311 uint16_t *d = vd, *n = vn, *m = vm;
1312 for (i = 0; i < opr_sz; i += 1) {
1318 d[i] = nn ^ (mm & 2) << 14;
1322 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1324 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1325 uint32_t *d = vd, *n = vn, *m = vm;
1326 for (i = 0; i < opr_sz; i += 1) {
1332 d[i] = nn ^ (mm & 2) << 30;
1336 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1338 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1339 uint64_t *d = vd, *n = vn, *m = vm;
1340 for (i = 0; i < opr_sz; i += 1) {
1346 d[i] = nn ^ (mm & 2) << 62;
1351 * Signed saturating addition with scalar operand.
1354 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1356 intptr_t i, oprsz = simd_oprsz(desc);
1358 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1359 int r = *(int8_t *)(a + i) + b;
1362 } else if (r < INT8_MIN) {
1365 *(int8_t *)(d + i) = r;
1369 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1371 intptr_t i, oprsz = simd_oprsz(desc);
1373 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1374 int r = *(int16_t *)(a + i) + b;
1375 if (r > INT16_MAX) {
1377 } else if (r < INT16_MIN) {
1380 *(int16_t *)(d + i) = r;
1384 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1386 intptr_t i, oprsz = simd_oprsz(desc);
1388 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1389 int64_t r = *(int32_t *)(a + i) + b;
1390 if (r > INT32_MAX) {
1392 } else if (r < INT32_MIN) {
1395 *(int32_t *)(d + i) = r;
1399 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1401 intptr_t i, oprsz = simd_oprsz(desc);
1403 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1404 int64_t ai = *(int64_t *)(a + i);
1406 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1407 /* Signed overflow. */
1408 r = (r < 0 ? INT64_MAX : INT64_MIN);
1410 *(int64_t *)(d + i) = r;
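/*
 * The test above is the usual two's-complement overflow check (illustrative
 * note): overflow can only occur when ai and b have the same sign, i.e.
 * ~(ai ^ b) has bit 63 set, and r has the opposite sign to ai, i.e.
 * (r ^ ai) has bit 63 set.  E.g. ai = INT64_MAX, b = 1 wraps r negative,
 * the test fires, and r saturates to INT64_MAX.
 */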
1415 * Unsigned saturating addition with scalar operand.
1418 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1420 intptr_t i, oprsz = simd_oprsz(desc);
1422 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1423 int r = *(uint8_t *)(a + i) + b;
1424 if (r > UINT8_MAX) {
1429 *(uint8_t *)(d + i) = r;
1433 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1435 intptr_t i, oprsz = simd_oprsz(desc);
1437 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1438 int r = *(uint16_t *)(a + i) + b;
1439 if (r > UINT16_MAX) {
1444 *(uint16_t *)(d + i) = r;
1448 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1450 intptr_t i, oprsz = simd_oprsz(desc);
1452 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1453 int64_t r = *(uint32_t *)(a + i) + b;
1454 if (r > UINT32_MAX) {
1459 *(uint32_t *)(d + i) = r;
1463 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1465 intptr_t i, oprsz = simd_oprsz(desc);
1467 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1468 uint64_t r = *(uint64_t *)(a + i) + b;
1472 *(uint64_t *)(d + i) = r;
1476 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1478 intptr_t i, oprsz = simd_oprsz(desc);
1480 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1481 uint64_t ai = *(uint64_t *)(a + i);
1482 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1486 /* Two operand predicated copy immediate with merge. All valid immediates
1487 * can fit within 17 signed bits in the simd_data field.
1489 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1490 uint64_t mm, uint32_t desc)
1492 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1493 uint64_t *d = vd, *n = vn;
1496 mm = dup_const(MO_8, mm);
1497 for (i = 0; i < opr_sz; i += 1) {
1499 uint64_t pp = expand_pred_b(pg[H1(i)]);
1500 d[i] = (mm & pp) | (nn & ~pp);
1504 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1505 uint64_t mm, uint32_t desc)
1507 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1508 uint64_t *d = vd, *n = vn;
1511 mm = dup_const(MO_16, mm);
1512 for (i = 0; i < opr_sz; i += 1) {
1514 uint64_t pp = expand_pred_h(pg[H1(i)]);
1515 d[i] = (mm & pp) | (nn & ~pp);
1519 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1520 uint64_t mm, uint32_t desc)
1522 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1523 uint64_t *d = vd, *n = vn;
1526 mm = dup_const(MO_32, mm);
1527 for (i = 0; i < opr_sz; i += 1) {
1529 uint64_t pp = expand_pred_s(pg[H1(i)]);
1530 d[i] = (mm & pp) | (nn & ~pp);
1534 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1535 uint64_t mm, uint32_t desc)
1537 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1538 uint64_t *d = vd, *n = vn;
1541 for (i = 0; i < opr_sz; i += 1) {
1543 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1547 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1549 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1553 val = dup_const(MO_8, val);
1554 for (i = 0; i < opr_sz; i += 1) {
1555 d[i] = val & expand_pred_b(pg[H1(i)]);
1559 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1561 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1565 val = dup_const(MO_16, val);
1566 for (i = 0; i < opr_sz; i += 1) {
1567 d[i] = val & expand_pred_h(pg[H1(i)]);
1571 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1573 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1577 val = dup_const(MO_32, val);
1578 for (i = 0; i < opr_sz; i += 1) {
1579 d[i] = val & expand_pred_s(pg[H1(i)]);
1583 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1585 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1589 for (i = 0; i < opr_sz; i += 1) {
1590 d[i] = (pg[H1(i)] & 1 ? val : 0);
1594 /* Big-endian hosts need to frob the byte indices. If the copy
1595 * happens to be 8-byte aligned, then no frobbing necessary.
1597 static void swap_memmove(void *vd, void *vs, size_t n)
1599 uintptr_t d = (uintptr_t)vd;
1600 uintptr_t s = (uintptr_t)vs;
1601 uintptr_t o = (d | s | n) & 7;
1604 #ifndef HOST_WORDS_BIGENDIAN
1613 if (d < s || d >= s + n) {
1614 for (i = 0; i < n; i += 4) {
1615 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1618 for (i = n; i > 0; ) {
1620 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1627 if (d < s || d >= s + n) {
1628 for (i = 0; i < n; i += 2) {
1629 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1632 for (i = n; i > 0; ) {
1634 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1640 if (d < s || d >= s + n) {
1641 for (i = 0; i < n; i++) {
1642 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1645 for (i = n; i > 0; ) {
1647 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1654 /* Similarly for memset of 0. */
1655 static void swap_memzero(void *vd, size_t n)
1657 uintptr_t d = (uintptr_t)vd;
1658 uintptr_t o = (d | n) & 7;
1661 /* Usually, the first bit of a predicate is set, so N is 0. */
1662 if (likely(n == 0)) {
1666 #ifndef HOST_WORDS_BIGENDIAN
1675 for (i = 0; i < n; i += 4) {
1676 *(uint32_t *)H1_4(d + i) = 0;
1682 for (i = 0; i < n; i += 2) {
1683 *(uint16_t *)H1_2(d + i) = 0;
1688 for (i = 0; i < n; i++) {
1689 *(uint8_t *)H1(d + i) = 0;
1695 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1697 intptr_t opr_sz = simd_oprsz(desc);
1698 size_t n_ofs = simd_data(desc);
1699 size_t n_siz = opr_sz - n_ofs;
1702 swap_memmove(vd, vn + n_ofs, n_siz);
1703 swap_memmove(vd + n_siz, vm, n_ofs);
1704 } else if (vd != vn) {
1705 swap_memmove(vd + n_siz, vd, n_ofs);
1706 swap_memmove(vd, vn + n_ofs, n_siz);
1708 /* vd == vn == vm. Need temp space. */
1710 swap_memmove(&tmp, vm, n_ofs);
1711 swap_memmove(vd, vd + n_ofs, n_siz);
1712 memcpy(vd + n_siz, &tmp, n_ofs);
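/*
 * Worked example for sve_ext (illustrative): with opr_sz == 32 and
 * n_ofs == 5, bytes 0..26 of the result come from vn[5..31] and bytes
 * 27..31 come from vm[0..4], so byte n_ofs of Zn becomes byte 0 of Zd.
 */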
1716 #define DO_INSR(NAME, TYPE, H) \
1717 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1719 intptr_t opr_sz = simd_oprsz(desc); \
1720 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1721 *(TYPE *)(vd + H(0)) = val; \
1724 DO_INSR(sve_insr_b, uint8_t, H1)
1725 DO_INSR(sve_insr_h, uint16_t, H1_2)
1726 DO_INSR(sve_insr_s, uint32_t, H1_4)
1727 DO_INSR(sve_insr_d, uint64_t, )
1731 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1733 intptr_t i, j, opr_sz = simd_oprsz(desc);
1734 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1735 uint64_t f = *(uint64_t *)(vn + i);
1736 uint64_t b = *(uint64_t *)(vn + j);
1737 *(uint64_t *)(vd + i) = bswap64(b);
1738 *(uint64_t *)(vd + j) = bswap64(f);
1742 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1744 intptr_t i, j, opr_sz = simd_oprsz(desc);
1745 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1746 uint64_t f = *(uint64_t *)(vn + i);
1747 uint64_t b = *(uint64_t *)(vn + j);
1748 *(uint64_t *)(vd + i) = hswap64(b);
1749 *(uint64_t *)(vd + j) = hswap64(f);
1753 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1755 intptr_t i, j, opr_sz = simd_oprsz(desc);
1756 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1757 uint64_t f = *(uint64_t *)(vn + i);
1758 uint64_t b = *(uint64_t *)(vn + j);
1759 *(uint64_t *)(vd + i) = rol64(b, 32);
1760 *(uint64_t *)(vd + j) = rol64(f, 32);
1764 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1766 intptr_t i, j, opr_sz = simd_oprsz(desc);
1767 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1768 uint64_t f = *(uint64_t *)(vn + i);
1769 uint64_t b = *(uint64_t *)(vn + j);
1770 *(uint64_t *)(vd + i) = b;
1771 *(uint64_t *)(vd + j) = f;
1775 #define DO_TBL(NAME, TYPE, H) \
1776 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1778 intptr_t i, opr_sz = simd_oprsz(desc); \
1779 uintptr_t elem = opr_sz / sizeof(TYPE); \
1780 TYPE *d = vd, *n = vn, *m = vm; \
1782 if (unlikely(vd == vn)) { \
1783 n = memcpy(&tmp, vn, opr_sz); \
1785 for (i = 0; i < elem; i++) { \
1787 d[H(i)] = j < elem ? n[H(j)] : 0; \
1791 DO_TBL(sve_tbl_b, uint8_t, H1)
1792 DO_TBL(sve_tbl_h, uint16_t, H2)
1793 DO_TBL(sve_tbl_s, uint32_t, H4)
1794 DO_TBL(sve_tbl_d, uint64_t, )
1798 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1799 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1801 intptr_t i, opr_sz = simd_oprsz(desc); \
1805 if (unlikely(vn - vd < opr_sz)) { \
1806 n = memcpy(&tmp, n, opr_sz / 2); \
1808 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1809 d[HD(i)] = n[HS(i)]; \
1813 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1814 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1815 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1817 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1818 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1819 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1823 /* Mask of bits included in the even numbered predicates of width esz.
1824 * We also use this for expand_bits/compress_bits, and so extend the
1825 * same pattern out to 16-bit units.
1827 static const uint64_t even_bit_esz_masks[5] = {
1828 0x5555555555555555ull,
1829 0x3333333333333333ull,
1830 0x0f0f0f0f0f0f0f0full,
1831 0x00ff00ff00ff00ffull,
1832 0x0000ffff0000ffffull,
1835 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1836 * For N==0, this corresponds to the operation that in qemu/bitops.h
1837 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1838 * section 7-2 Shuffling Bits.
1840 static uint64_t expand_bits(uint64_t x, int n)
1845 for (i = 4; i >= n; i--) {
1847 x = ((x << sh) | x) & even_bit_esz_masks[i];
1852 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1853 * For N==0, this corresponds to the operation that in qemu/bitops.h
1854 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1855 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1857 static uint64_t compress_bits(uint64_t x, int n)
1861 for (i = n; i <= 4; i++) {
1863 x &= even_bit_esz_masks[i];
1866 return x & 0xffffffffu;
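/*
 * Worked example (illustrative, not original text): with n == 0,
 * expand_bits(0xb, 0) == 0x45 (each bit zero-extended from a 1-bit unit to
 * a 2-bit unit) and compress_bits(0x45, 0) == 0xb undoes it, so the two
 * are inverses for in-range inputs.
 */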
1869 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1871 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1872 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1873 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
1874 int esize = 1 << esz;
1879 uint64_t nn = *(uint64_t *)vn;
1880 uint64_t mm = *(uint64_t *)vm;
1881 int half = 4 * oprsz;
1883 nn = extract64(nn, high * half, half);
1884 mm = extract64(mm, high * half, half);
1885 nn = expand_bits(nn, esz);
1886 mm = expand_bits(mm, esz);
1887 d[0] = nn | (mm << esize);
1889 ARMPredicateReg tmp;
1891 /* We produce output faster than we consume input.
1892 Therefore we must be mindful of possible overlap. */
1894 vn = memcpy(&tmp, vn, oprsz);
1898 } else if (vd == vm) {
1899 vm = memcpy(&tmp, vm, oprsz);
1905 if ((oprsz & 7) == 0) {
1906 uint32_t *n = vn, *m = vm;
1909 for (i = 0; i < oprsz / 8; i++) {
1910 uint64_t nn = n[H4(high + i)];
1911 uint64_t mm = m[H4(high + i)];
1913 nn = expand_bits(nn, esz);
1914 mm = expand_bits(mm, esz);
1915 d[i] = nn | (mm << esize);
1918 uint8_t *n = vn, *m = vm;
1921 for (i = 0; i < oprsz / 2; i++) {
1922 uint16_t nn = n[H1(high + i)];
1923 uint16_t mm = m[H1(high + i)];
1925 nn = expand_bits(nn, esz);
1926 mm = expand_bits(mm, esz);
1927 d16[H2(i)] = nn | (mm << esize);
1933 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1935 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1936 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1937 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
1938 uint64_t *d = vd, *n = vn, *m = vm;
1943 l = compress_bits(n[0] >> odd, esz);
1944 h = compress_bits(m[0] >> odd, esz);
1945 d[0] = l | (h << (4 * oprsz));
1947 ARMPredicateReg tmp_m;
1948 intptr_t oprsz_16 = oprsz / 16;
1950 if ((vm - vd) < (uintptr_t)oprsz) {
1951 m = memcpy(&tmp_m, vm, oprsz);
1954 for (i = 0; i < oprsz_16; i++) {
1957 l = compress_bits(l >> odd, esz);
1958 h = compress_bits(h >> odd, esz);
1959 d[i] = l | (h << 32);
1963 * For VL which is not a multiple of 512, the results from M do not
1964 * align nicely with the uint64_t for D. Put the aligned results
1965 * from M into TMP_M and then copy it into place afterward.
1968 int final_shift = (oprsz & 15) * 2;
1972 l = compress_bits(l >> odd, esz);
1973 h = compress_bits(h >> odd, esz);
1974 d[i] = l | (h << final_shift);
1976 for (i = 0; i < oprsz_16; i++) {
1979 l = compress_bits(l >> odd, esz);
1980 h = compress_bits(h >> odd, esz);
1981 tmp_m.p[i] = l | (h << 32);
1985 l = compress_bits(l >> odd, esz);
1986 h = compress_bits(h >> odd, esz);
1987 tmp_m.p[i] = l | (h << final_shift);
1989 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1991 for (i = 0; i < oprsz_16; i++) {
1994 l = compress_bits(l >> odd, esz);
1995 h = compress_bits(h >> odd, esz);
1996 d[oprsz_16 + i] = l | (h << 32);
2002 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2005 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2006 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
2007 uint64_t *d = vd, *n = vn, *m = vm;
2014 mask = even_bit_esz_masks[esz];
2021 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2022 uint64_t nn = (n[i] & mask) >> shr;
2023 uint64_t mm = (m[i] & mask) << shl;
2028 /* Reverse units of 2**N bits. */
2029 static uint64_t reverse_bits_64(uint64_t x, int n)
2034 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2035 uint64_t mask = even_bit_esz_masks[i];
2036 x = ((x & mask) << sh) | ((x >> sh) & mask);
2041 static uint8_t reverse_bits_8(uint8_t x, int n)
2043 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2046 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2047 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2052 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2054 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2055 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2056 intptr_t i, oprsz_2 = oprsz / 2;
2059 uint64_t l = *(uint64_t *)vn;
2060 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2061 *(uint64_t *)vd = l;
2062 } else if ((oprsz & 15) == 0) {
2063 for (i = 0; i < oprsz_2; i += 8) {
2064 intptr_t ih = oprsz - 8 - i;
2065 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2066 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2067 *(uint64_t *)(vd + i) = h;
2068 *(uint64_t *)(vd + ih) = l;
2071 for (i = 0; i < oprsz_2; i += 1) {
2072 intptr_t il = H1(i);
2073 intptr_t ih = H1(oprsz - 1 - i);
2074 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2075 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2076 *(uint8_t *)(vd + il) = h;
2077 *(uint8_t *)(vd + ih) = l;
2082 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2084 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2085 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
2090 uint64_t nn = *(uint64_t *)vn;
2091 int half = 4 * oprsz;
2093 nn = extract64(nn, high * half, half);
2094 nn = expand_bits(nn, 0);
2097 ARMPredicateReg tmp_n;
2099 /* We produce output faster than we consume input.
2100 Therefore we must be mindful of possible overlap. */
2101 if ((vn - vd) < (uintptr_t)oprsz) {
2102 vn = memcpy(&tmp_n, vn, oprsz);
2108 if ((oprsz & 7) == 0) {
2112 for (i = 0; i < oprsz / 8; i++) {
2113 uint64_t nn = n[H4(high + i)];
2114 d[i] = expand_bits(nn, 0);
2120 for (i = 0; i < oprsz / 2; i++) {
2121 uint16_t nn = n[H1(high + i)];
2122 d16[H2(i)] = expand_bits(nn, 0);
2128 #define DO_ZIP(NAME, TYPE, H) \
2129 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2131 intptr_t oprsz = simd_oprsz(desc); \
2132 intptr_t i, oprsz_2 = oprsz / 2; \
2133 ARMVectorReg tmp_n, tmp_m; \
2134 /* We produce output faster than we consume input. \
2135 Therefore we must be mindful of possible overlap. */ \
2136 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2137 vn = memcpy(&tmp_n, vn, oprsz_2); \
2139 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2140 vm = memcpy(&tmp_m, vm, oprsz_2); \
2142 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2143 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2144 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2148 DO_ZIP(sve_zip_b, uint8_t, H1)
2149 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2150 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2151 DO_ZIP(sve_zip_d, uint64_t, )
2153 #define DO_UZP(NAME, TYPE, H) \
2154 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2156 intptr_t oprsz = simd_oprsz(desc); \
2157 intptr_t oprsz_2 = oprsz / 2; \
2158 intptr_t odd_ofs = simd_data(desc); \
2160 ARMVectorReg tmp_m; \
2161 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2162 vm = memcpy(&tmp_m, vm, oprsz); \
2164 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2165 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2167 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2168 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2172 DO_UZP(sve_uzp_b, uint8_t, H1)
2173 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2174 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2175 DO_UZP(sve_uzp_d, uint64_t, )
2177 #define DO_TRN(NAME, TYPE, H) \
2178 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2180 intptr_t oprsz = simd_oprsz(desc); \
2181 intptr_t odd_ofs = simd_data(desc); \
2183 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2184 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2185 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2186 *(TYPE *)(vd + H(i + 0)) = ae; \
2187 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2191 DO_TRN(sve_trn_b, uint8_t, H1)
2192 DO_TRN(sve_trn_h, uint16_t, H1_2)
2193 DO_TRN(sve_trn_s, uint32_t, H1_4)
2194 DO_TRN(sve_trn_d, uint64_t, )
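/*
 * Illustrative note (not from the original source): with odd_ofs == 0
 * (TRN1) the loop above interleaves the even-numbered elements,
 * d = { n0, m0, n2, m2, ... }; with odd_ofs == sizeof(TYPE) (TRN2) it
 * interleaves the odd-numbered ones, d = { n1, m1, n3, m3, ... }.
 */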
2200 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2202 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2203 uint32_t *d = vd, *n = vn;
2206 for (i = j = 0; i < opr_sz; i++) {
2207 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2208 d[H4(j)] = n[H4(i)];
2212 for (; j < opr_sz; j++) {
2217 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2219 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2220 uint64_t *d = vd, *n = vn;
2223 for (i = j = 0; i < opr_sz; i++) {
2224 if (pg[H1(i)] & 1) {
2229 for (; j < opr_sz; j++) {
2234 /* Similar to the ARM LastActiveElement pseudocode function, except the
2235 * result is multiplied by the element size. This includes the not found
2236 * indication; e.g. not found for esz=3 is -8.
2238 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2240 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2241 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2243 return last_active_element(vg, words, esz);
2246 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2248 intptr_t opr_sz = simd_oprsz(desc) / 8;
2249 int esz = simd_data(desc);
2250 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2251 intptr_t i, first_i, last_i;
2254 first_i = last_i = 0;
2255 first_g = last_g = 0;
2257 /* Find the extent of the active elements within VG. */
2258 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2259 pg = *(uint64_t *)(vg + i) & mask;
2272 first_i = first_i * 8 + ctz64(first_g);
2273 last_i = last_i * 8 + 63 - clz64(last_g);
2274 len = last_i - first_i + (1 << esz);
2276 vm = memcpy(&tmp, vm, opr_sz * 8);
2278 swap_memmove(vd, vn + first_i, len);
2280 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2283 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2284 void *vg, uint32_t desc)
2286 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2287 uint64_t *d = vd, *n = vn, *m = vm;
2290 for (i = 0; i < opr_sz; i += 1) {
2291 uint64_t nn = n[i], mm = m[i];
2292 uint64_t pp = expand_pred_b(pg[H1(i)]);
2293 d[i] = (nn & pp) | (mm & ~pp);
2297 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2298 void *vg, uint32_t desc)
2300 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2301 uint64_t *d = vd, *n = vn, *m = vm;
2304 for (i = 0; i < opr_sz; i += 1) {
2305 uint64_t nn = n[i], mm = m[i];
2306 uint64_t pp = expand_pred_h(pg[H1(i)]);
2307 d[i] = (nn & pp) | (mm & ~pp);
2311 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2312 void *vg, uint32_t desc)
2314 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2315 uint64_t *d = vd, *n = vn, *m = vm;
2318 for (i = 0; i < opr_sz; i += 1) {
2319 uint64_t nn = n[i], mm = m[i];
2320 uint64_t pp = expand_pred_s(pg[H1(i)]);
2321 d[i] = (nn & pp) | (mm & ~pp);
2325 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2326 void *vg, uint32_t desc)
2328 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2329 uint64_t *d = vd, *n = vn, *m = vm;
2332 for (i = 0; i < opr_sz; i += 1) {
2333 uint64_t nn = n[i], mm = m[i];
2334 d[i] = (pg[H1(i)] & 1 ? nn : mm);
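/*
 * Illustrative note (not from the original source): for the sub-64-bit
 * element sizes above, each predicate byte is widened into a lane mask,
 * e.g. expand_pred_b(0x05) == 0x0000000000ff00ff, so that
 * d = (n & pp) | (m & ~pp) selects N in active lanes and M elsewhere.
 */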
2338 /* Two operand comparison controlled by a predicate.
2339 * ??? It is very tempting to want to be able to expand this inline
2340 * with x86 instructions, e.g.
2342 * vcmpeqw zm, zn, %ymm0
2343 * vpmovmskb %ymm0, %eax
2347 * or even aarch64, e.g.
2349 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2350 * cmeq v0.8h, zn, zm
2351 * and v0.8h, v0.8h, mask
2355 * However, coming up with an abstraction that allows vector inputs and
2356 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2357 * scalar outputs, is tricky.
2359 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2360 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2362 intptr_t opr_sz = simd_oprsz(desc); \
2363 uint32_t flags = PREDTEST_INIT; \
2364 intptr_t i = opr_sz; \
2366 uint64_t out = 0, pg; \
2368 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2369 TYPE nn = *(TYPE *)(vn + H(i)); \
2370 TYPE mm = *(TYPE *)(vm + H(i)); \
2373 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2375 *(uint64_t *)(vd + (i >> 3)) = out; \
2376 flags = iter_predtest_bwd(out, pg, flags); \
2381 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2382 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2383 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2384 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2385 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2386 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2387 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2388 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
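/*
 * Illustrative note (not from the original source): the MASK constants
 * above keep only the predicate bit that governs each element -- every
 * bit for bytes, every 2nd bit for halfwords, every 4th for words, and
 * every 8th for doublewords -- matching the SVE predicate layout of one
 * bit per vector byte.
 */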
2390 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2391 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2392 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2393 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2395 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2396 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2397 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2398 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2400 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2401 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2402 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2403 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2405 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2406 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2407 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2408 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2410 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2411 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2412 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2413 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2415 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2416 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2417 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2418 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2420 #undef DO_CMP_PPZZ_B
2421 #undef DO_CMP_PPZZ_H
2422 #undef DO_CMP_PPZZ_S
2423 #undef DO_CMP_PPZZ_D
2426 /* Similar, but the second source is "wide". */
2427 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2428 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2430 intptr_t opr_sz = simd_oprsz(desc); \
2431 uint32_t flags = PREDTEST_INIT; \
2432 intptr_t i = opr_sz; \
2434 uint64_t out = 0, pg; \
2436 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2438 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2439 TYPE nn = *(TYPE *)(vn + H(i)); \
2443 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2445 *(uint64_t *)(vd + (i >> 3)) = out; \
2446 flags = iter_predtest_bwd(out, pg, flags); \
2451 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2452 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2453 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2454 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2455 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2456 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2458 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2459 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2460 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2462 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2463 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2464 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2466 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2467 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2468 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2470 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2471 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2472 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2474 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2475 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2476 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2478 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2479 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2480 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2482 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2483 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2484 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2486 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2487 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2488 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2490 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2491 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2492 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2494 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2495 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2496 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2498 #undef DO_CMP_PPZW_B
2499 #undef DO_CMP_PPZW_H
2500 #undef DO_CMP_PPZW_S
2503 /* Similar, but the second source is immediate. */
2504 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2505 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2507 intptr_t opr_sz = simd_oprsz(desc); \
2508 uint32_t flags = PREDTEST_INIT; \
2509 TYPE mm = simd_data(desc); \
2510 intptr_t i = opr_sz; \
2512 uint64_t out = 0, pg; \
2514 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2515 TYPE nn = *(TYPE *)(vn + H(i)); \
2518 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2520 *(uint64_t *)(vd + (i >> 3)) = out; \
2521 flags = iter_predtest_bwd(out, pg, flags); \
2526 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2527 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2528 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2529 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2530 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2531 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2532 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2533 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2535 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2536 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2537 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2538 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2540 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2541 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2542 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2543 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2545 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2546 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2547 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2548 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2550 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2551 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2552 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2553 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2555 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2556 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2557 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2558 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2560 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2561 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2562 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2563 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2565 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2566 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2567 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2568 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2570 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2571 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2572 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2573 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2575 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2576 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2577 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2578 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2580 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2581 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2582 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2583 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2585 #undef DO_CMP_PPZI_B
2586 #undef DO_CMP_PPZI_H
2587 #undef DO_CMP_PPZI_S
2588 #undef DO_CMP_PPZI_D
2591 /* Similar to the ARM LastActive pseudocode function. */
2592 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2596 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2597 uint64_t pg = *(uint64_t *)(vg + i);
2599 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2605 /* Compute a mask into RETB that is true for all G, up to and including
2606 * (if after) or excluding (if !after) the first G & N.
2607 * Return true if BRK found.
2609 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2610 bool brk, bool after)
2616 } else if ((g & n) == 0) {
2617 /* For all G, no N are set; break not found. */
2620 /* Break somewhere in N. Locate it. */
2621 b = g & n; /* guard true, pred true */
2622 b = b & -b; /* first such */
2624 b = b | (b - 1); /* break after same */
2626 b = b - 1; /* break before same */
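/*
 * Worked example (illustrative): with g = 0xff and n = 0x10, the first
 * guarded true predicate bit is bit 4, so b = 0x10; "after" yields
 * b | (b - 1) = 0x1f (elements 0-4 remain set), while "before" yields
 * b - 1 = 0x0f (elements 0-3 remain set).
 */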
2635 /* Compute a zeroing BRK. */
2636 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2637 intptr_t oprsz, bool after)
2642 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2643 uint64_t this_b, this_g = g[i];
2645 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2646 d[i] = this_b & this_g;
2650 /* Likewise, but also compute flags. */
2651 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2652 intptr_t oprsz, bool after)
2654 uint32_t flags = PREDTEST_INIT;
2658 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2659 uint64_t this_b, this_d, this_g = g[i];
2661 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2662 d[i] = this_d = this_b & this_g;
2663 flags = iter_predtest_fwd(this_d, this_g, flags);
2668 /* Compute a merging BRK. */
2669 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2670 intptr_t oprsz, bool after)
2675 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2676 uint64_t this_b, this_g = g[i];
2678 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2679 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2683 /* Likewise, but also compute flags. */
2684 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2685 intptr_t oprsz, bool after)
2687 uint32_t flags = PREDTEST_INIT;
2691 for (i = 0; i < oprsz / 8; ++i) {
2692 uint64_t this_b, this_d = d[i], this_g = g[i];
2694 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2695 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2696 flags = iter_predtest_fwd(this_d, this_g, flags);
2701 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2703 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2704 * The compiler should turn this into 4 64-bit integer stores.
2706 memset(d, 0, sizeof(ARMPredicateReg));
2707 return PREDTEST_INIT;
2710 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2713 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2714 if (last_active_pred(vn, vg, oprsz)) {
2715 compute_brk_z(vd, vm, vg, oprsz, true);
2721 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2724 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2725 if (last_active_pred(vn, vg, oprsz)) {
2726 return compute_brks_z(vd, vm, vg, oprsz, true);
2728 return do_zero(vd, oprsz);
2732 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2735 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2736 if (last_active_pred(vn, vg, oprsz)) {
2737 compute_brk_z(vd, vm, vg, oprsz, false);
2743 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2746 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2747 if (last_active_pred(vn, vg, oprsz)) {
2748 return compute_brks_z(vd, vm, vg, oprsz, false);
2750 return do_zero(vd, oprsz);
2754 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2756 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2757 compute_brk_z(vd, vn, vg, oprsz, true);
2760 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2762 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2763 return compute_brks_z(vd, vn, vg, oprsz, true);
2766 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2768 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2769 compute_brk_z(vd, vn, vg, oprsz, false);
2772 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2774 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2775 return compute_brks_z(vd, vn, vg, oprsz, false);
2778 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2780 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2781 compute_brk_m(vd, vn, vg, oprsz, true);
2784 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2786 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2787 return compute_brks_m(vd, vn, vg, oprsz, true);
2790 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2792 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2793 compute_brk_m(vd, vn, vg, oprsz, false);
2796 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2798 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2799 return compute_brks_m(vd, vn, vg, oprsz, false);
2802 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2804 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2805 if (!last_active_pred(vn, vg, oprsz)) {
2810 /* As if PredTest(Ones(PL), D, esz). */
2811 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2814 uint32_t flags = PREDTEST_INIT;
2817 for (i = 0; i < oprsz / 8; i++) {
2818 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2821 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2822 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2827 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2829 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2830 if (last_active_pred(vn, vg, oprsz)) {
2831 return predtest_ones(vd, oprsz, -1);
2833 return do_zero(vd, oprsz);
2837 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2839 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2840 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2841 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2844 for (i = 0; i < words; ++i) {
2845 uint64_t t = n[i] & g[i] & mask;
2851 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2853 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2854 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2855 uint64_t esz_mask = pred_esz_masks[esz];
2856 ARMPredicateReg *d = vd;
2860 /* Begin with a zero predicate register. */
2861 flags = do_zero(d, oprsz);
2866 /* Set all of the requested bits. */
2867 for (i = 0; i < count / 64; ++i) {
2871 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2874 return predtest_ones(d, oprsz, esz_mask);
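/*
 * Worked example (illustrative): for esz = 2 (words) and count = 12
 * predicate bits, MAKE_64BIT_MASK(0, 12) & pred_esz_masks[2] leaves
 * 0x111, i.e. the first three word elements active.
 */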
2877 /* Recursive reduction with a given binary function;
2878  * cf. the ARM ARM function ReducePredicated.
2880 * While it would be possible to write this without the DATA temporary,
2881 * it is much simpler to process the predicate register this way.
2882 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2883 * little to gain with a more complex non-recursive form.
2885 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2886 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2891 uintptr_t half = n / 2; \
2892 TYPE lo = NAME##_reduce(data, status, half); \
2893 TYPE hi = NAME##_reduce(data + half, status, half); \
2894 return TYPE##_##FUNC(lo, hi, status); \
2897 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2899 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
2900 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2901 for (i = 0; i < oprsz; ) { \
2902 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2904 TYPE nn = *(TYPE *)(vn + H(i)); \
2905 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2906 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2909 for (; i < maxsz; i += sizeof(TYPE)) { \
2910 *(TYPE *)((void *)data + i) = IDENT; \
2912 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2915 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2916 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2917 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2919 /* Identity is floatN_default_nan, without the function call. */
2920 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2921 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2922 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2924 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2925 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2926 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2928 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2929 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2930 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2932 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2933 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2934 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
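/*
 * Illustrative expansion (for reference only; the real definitions are
 * generated by the DO_REDUCE instantiations above, and the base case is
 * reconstructed here for exposition): the recursive half produced for
 * sve_faddv_s pairs elements until a single value remains.
 */
#if 0
static float32 sve_faddv_s_reduce(float32 *data, float_status *status,
                                  uintptr_t n)
{
    if (n == 1) {
        return *data;
    } else {
        uintptr_t half = n / 2;
        float32 lo = sve_faddv_s_reduce(data, status, half);
        float32 hi = sve_faddv_s_reduce(data + half, status, half);
        return float32_add(lo, hi, status);
    }
}
#endif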
2938 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2939 void *status, uint32_t desc)
2941 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2942 float16 result = nn;
2945 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2948 float16 mm = *(float16 *)(vm + H1_2(i));
2949 result = float16_add(result, mm, status);
2951 i += sizeof(float16), pg >>= sizeof(float16);
2953 } while (i < opr_sz);
2958 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2959 void *status, uint32_t desc)
2961 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2962 float32 result = nn;
2965 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2968 float32 mm = *(float32 *)(vm + H1_2(i));
2969 result = float32_add(result, mm, status);
2971 i += sizeof(float32), pg >>= sizeof(float32);
2973 } while (i < opr_sz);
2978 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2979 void *status, uint32_t desc)
2981 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2985 for (i = 0; i < opr_sz; i++) {
2986 if (pg[H1(i)] & 1) {
2987 nn = float64_add(nn, m[i], status);
2994 /* Fully general three-operand expander, controlled by a predicate,
2995  * with the extra float_status parameter.
2997 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2998 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2999 void *status, uint32_t desc) \
3001 intptr_t i = simd_oprsz(desc); \
3004 uint64_t pg = g[(i - 1) >> 6]; \
3006 i -= sizeof(TYPE); \
3007 if (likely((pg >> (i & 63)) & 1)) { \
3008 TYPE nn = *(TYPE *)(vn + H(i)); \
3009 TYPE mm = *(TYPE *)(vm + H(i)); \
3010 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3016 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3017 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3018 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3020 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3021 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3022 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3024 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3025 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3026 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3028 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3029 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3030 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3032 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3033 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3034 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3036 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3037 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3038 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3040 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3041 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3042 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3044 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3045 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3046 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3048 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3050 return float16_abs(float16_sub(a, b, s));
3053 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3055 return float32_abs(float32_sub(a, b, s));
3058 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3060 return float64_abs(float64_sub(a, b, s));
3063 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3064 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3065 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
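/*
 * The softfloat scalbn helpers take an int exponent; only the 64-bit
 * element type can exceed that range, so clamp it here.
 */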
3067 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3069 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3070 return float64_scalbn(a, b_int, s);
3073 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3074 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3075 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3077 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3078 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3079 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3083 /* Three-operand expander, with one scalar operand, controlled by
3084 * a predicate, with the extra float_status parameter.
3086 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3087 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3088 void *status, uint32_t desc) \
3090 intptr_t i = simd_oprsz(desc); \
3094 uint64_t pg = g[(i - 1) >> 6]; \
3096 i -= sizeof(TYPE); \
3097 if (likely((pg >> (i & 63)) & 1)) { \
3098 TYPE nn = *(TYPE *)(vn + H(i)); \
3099 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3105 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3106 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3107 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3109 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3110 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3111 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3113 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3114 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3115 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3117 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3119 return float16_sub(b, a, s);
3122 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3124 return float32_sub(b, a, s);
3127 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3129 return float64_sub(b, a, s);
3132 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3133 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3134 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3136 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3137 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3138 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3140 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3141 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3142 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3144 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3145 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3146 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3148 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3149 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3150 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3152 /* Fully general two-operand expander, controlled by a predicate,
3153  * with the extra float_status parameter.
3155 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3156 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3158 intptr_t i = simd_oprsz(desc); \
3161 uint64_t pg = g[(i - 1) >> 6]; \
3163 i -= sizeof(TYPE); \
3164 if (likely((pg >> (i & 63)) & 1)) { \
3165 TYPE nn = *(TYPE *)(vn + H(i)); \
3166 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3172 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3173 * FZ16. When converting from fp16, this affects flushing input denormals;
3174 * when converting to fp16, this affects flushing output denormals.
3176 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3178 bool save = get_flush_inputs_to_zero(fpst);
3181 set_flush_inputs_to_zero(false, fpst);
3182 ret = float16_to_float32(f, true, fpst);
3183 set_flush_inputs_to_zero(save, fpst);
3187 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3189 bool save = get_flush_inputs_to_zero(fpst);
3192 set_flush_inputs_to_zero(false, fpst);
3193 ret = float16_to_float64(f, true, fpst);
3194 set_flush_inputs_to_zero(save, fpst);
3198 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3200 bool save = get_flush_to_zero(fpst);
3203 set_flush_to_zero(false, fpst);
3204 ret = float32_to_float16(f, true, fpst);
3205 set_flush_to_zero(save, fpst);
3209 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3211 bool save = get_flush_to_zero(fpst);
3214 set_flush_to_zero(false, fpst);
3215 ret = float64_to_float16(f, true, fpst);
3216 set_flush_to_zero(save, fpst);
3220 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3222 if (float16_is_any_nan(f)) {
3223 float_raise(float_flag_invalid, s);
3226 return float16_to_int16_round_to_zero(f, s);
3229 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3231 if (float16_is_any_nan(f)) {
3232 float_raise(float_flag_invalid, s);
3235 return float16_to_int64_round_to_zero(f, s);
3238 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3240 if (float32_is_any_nan(f)) {
3241 float_raise(float_flag_invalid, s);
3244 return float32_to_int64_round_to_zero(f, s);
3247 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3249 if (float64_is_any_nan(f)) {
3250 float_raise(float_flag_invalid, s);
3253 return float64_to_int64_round_to_zero(f, s);
3256 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3258 if (float16_is_any_nan(f)) {
3259 float_raise(float_flag_invalid, s);
3262 return float16_to_uint16_round_to_zero(f, s);
3265 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3267 if (float16_is_any_nan(f)) {
3268 float_raise(float_flag_invalid, s);
3271 return float16_to_uint64_round_to_zero(f, s);
3274 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3276 if (float32_is_any_nan(f)) {
3277 float_raise(float_flag_invalid, s);
3280 return float32_to_uint64_round_to_zero(f, s);
3283 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3285 if (float64_is_any_nan(f)) {
3286 float_raise(float_flag_invalid, s);
3289 return float64_to_uint64_round_to_zero(f, s);
3292 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3293 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3294 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3295 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3296 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3297 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3299 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3300 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3301 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3302 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3303 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3304 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3305 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3307 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3308 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3309 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3310 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3311 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3312 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3313 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3315 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3316 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3317 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3319 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3320 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3321 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3323 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3324 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3325 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3327 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3328 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3329 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3331 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3332 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3333 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3334 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3335 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3336 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3337 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3339 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3340 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3341 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3342 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3343 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3344 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3345 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3349 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3350 float_status *status, uint32_t desc,
3351 uint16_t neg1, uint16_t neg3)
3353 intptr_t i = simd_oprsz(desc);
3357 uint64_t pg = g[(i - 1) >> 6];
3360 if (likely((pg >> (i & 63)) & 1)) {
3361 float16 e1, e2, e3, r;
3363 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3364 e2 = *(uint16_t *)(vm + H1_2(i));
3365 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3366 r = float16_muladd(e1, e2, e3, 0, status);
3367 *(uint16_t *)(vd + H1_2(i)) = r;
3373 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3374 void *vg, void *status, uint32_t desc)
3376 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
3379 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3380 void *vg, void *status, uint32_t desc)
3382 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
3385 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3386 void *vg, void *status, uint32_t desc)
3388 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
3391 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3392 void *vg, void *status, uint32_t desc)
3394 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
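/*
 * Illustrative note (not from the original source): NEG1 flips the sign
 * bit of the Zn element and NEG3 that of the Za accumulator before the
 * fused multiply-add, so FMLS negates the product, FNMLA negates both
 * product and accumulator, and FNMLS negates only the accumulator.
 */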
3397 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3398 float_status *status, uint32_t desc,
3399 uint32_t neg1, uint32_t neg3)
3401 intptr_t i = simd_oprsz(desc);
3405 uint64_t pg = g[(i - 1) >> 6];
3408 if (likely((pg >> (i & 63)) & 1)) {
3409 float32 e1, e2, e3, r;
3411 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3412 e2 = *(uint32_t *)(vm + H1_4(i));
3413 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3414 r = float32_muladd(e1, e2, e3, 0, status);
3415 *(uint32_t *)(vd + H1_4(i)) = r;
3421 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3422 void *vg, void *status, uint32_t desc)
3424 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
3427 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3428 void *vg, void *status, uint32_t desc)
3430 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
3433 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3434 void *vg, void *status, uint32_t desc)
3436 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
3439 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3440 void *vg, void *status, uint32_t desc)
3442 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
3445 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
3446 float_status *status, uint32_t desc,
3447 uint64_t neg1, uint64_t neg3)
3449 intptr_t i = simd_oprsz(desc);
3453 uint64_t pg = g[(i - 1) >> 6];
3456 if (likely((pg >> (i & 63)) & 1)) {
3457 float64 e1, e2, e3, r;
3459 e1 = *(uint64_t *)(vn + i) ^ neg1;
3460 e2 = *(uint64_t *)(vm + i);
3461 e3 = *(uint64_t *)(va + i) ^ neg3;
3462 r = float64_muladd(e1, e2, e3, 0, status);
3463 *(uint64_t *)(vd + i) = r;
3469 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3470 void *vg, void *status, uint32_t desc)
3472 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
3475 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3476 void *vg, void *status, uint32_t desc)
3478 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
3481 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3482 void *vg, void *status, uint32_t desc)
3484 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
3487 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3488 void *vg, void *status, uint32_t desc)
3490 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
3493 /* Two operand floating-point comparison controlled by a predicate.
3494 * Unlike the integer version, we are not allowed to optimistically
3495  * compare operands, since the comparison may have side effects wrt the FPSR.
3498 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3499 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3500 void *status, uint32_t desc) \
3502 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3503 uint64_t *d = vd, *g = vg; \
3505 uint64_t out = 0, pg = g[j]; \
3507 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3508 if (likely((pg >> (i & 63)) & 1)) { \
3509 TYPE nn = *(TYPE *)(vn + H(i)); \
3510 TYPE mm = *(TYPE *)(vm + H(i)); \
3511 out |= OP(TYPE, nn, mm, status); \
3518 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3519 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3520 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3521 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3522 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3523 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3525 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3526 DO_FPCMP_PPZZ_H(NAME, OP) \
3527 DO_FPCMP_PPZZ_S(NAME, OP) \
3528 DO_FPCMP_PPZZ_D(NAME, OP)
3530 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3531 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3532 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3533 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3534 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3535 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3536 #define DO_FCMUO(TYPE, X, Y, ST) \
3537 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3538 #define DO_FACGE(TYPE, X, Y, ST) \
3539 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3540 #define DO_FACGT(TYPE, X, Y, ST) \
3541 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3543 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3544 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3545 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3546 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3547 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3548 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3549 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3551 #undef DO_FPCMP_PPZZ_ALL
3552 #undef DO_FPCMP_PPZZ_D
3553 #undef DO_FPCMP_PPZZ_S
3554 #undef DO_FPCMP_PPZZ_H
3555 #undef DO_FPCMP_PPZZ
3557 /* One operand floating-point comparison against zero, controlled by a predicate.
3560 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3561 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3562 void *status, uint32_t desc) \
3564 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3565 uint64_t *d = vd, *g = vg; \
3567 uint64_t out = 0, pg = g[j]; \
3569 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3570 if ((pg >> (i & 63)) & 1) { \
3571 TYPE nn = *(TYPE *)(vn + H(i)); \
3572 out |= OP(TYPE, nn, 0, status); \
3579 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3580 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3581 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3582 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3583 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3584 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3586 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3587 DO_FPCMP_PPZ0_H(NAME, OP) \
3588 DO_FPCMP_PPZ0_S(NAME, OP) \
3589 DO_FPCMP_PPZ0_D(NAME, OP)
3591 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3592 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3593 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3594 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3595 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3596 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3598 /* FP Trig Multiply-Add. */
3600 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3602 static const float16 coeff[16] = {
3603 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3604 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3606 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3607 intptr_t x = simd_data(desc);
3608 float16 *d = vd, *n = vn, *m = vm;
3609 for (i = 0; i < opr_sz; i++) {
3612 if (float16_is_neg(mm)) {
3613 mm = float16_abs(mm);
3616 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3620 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3622 static const float32 coeff[16] = {
3623 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3624 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3625 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3626 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3628 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3629 intptr_t x = simd_data(desc);
3630 float32 *d = vd, *n = vn, *m = vm;
3631 for (i = 0; i < opr_sz; i++) {
3634 if (float32_is_neg(mm)) {
3635 mm = float32_abs(mm);
3638 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3642 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3644 static const float64 coeff[16] = {
3645 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3646 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3647 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3648 0x3de5d8408868552full, 0x0000000000000000ull,
3649 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3650 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3651 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3652 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3654 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3655 intptr_t x = simd_data(desc);
3656 float64 *d = vd, *n = vn, *m = vm;
3657 for (i = 0; i < opr_sz; i++) {
3660 if (float64_is_neg(mm)) {
3661 mm = float64_abs(mm);
3664 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3672 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3673 void *vs, uint32_t desc)
3675 intptr_t j, i = simd_oprsz(desc);
3677 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3678 float16 neg_real = float16_chs(neg_imag);
3681 uint64_t pg = g[(i - 1) >> 6];
3683 float16 e0, e1, e2, e3;
3685 /* I holds the real index; J holds the imag index. */
3686 j = i - sizeof(float16);
3687 i -= 2 * sizeof(float16);
3689 e0 = *(float16 *)(vn + H1_2(i));
3690 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3691 e2 = *(float16 *)(vn + H1_2(j));
3692 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3694 if (likely((pg >> (i & 63)) & 1)) {
3695 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3697 if (likely((pg >> (j & 63)) & 1)) {
3698 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3704 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3705 void *vs, uint32_t desc)
3707 intptr_t j, i = simd_oprsz(desc);
3709 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3710 float32 neg_real = float32_chs(neg_imag);
3713 uint64_t pg = g[(i - 1) >> 6];
3715 float32 e0, e1, e2, e3;
3717 /* I holds the real index; J holds the imag index. */
3718 j = i - sizeof(float32);
3719 i -= 2 * sizeof(float32);
3721 e0 = *(float32 *)(vn + H1_2(i));
3722 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3723 e2 = *(float32 *)(vn + H1_2(j));
3724 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3726 if (likely((pg >> (i & 63)) & 1)) {
3727 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3729 if (likely((pg >> (j & 63)) & 1)) {
3730 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3736 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3737 void *vs, uint32_t desc)
3739 intptr_t j, i = simd_oprsz(desc);
3741 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3742 float64 neg_real = float64_chs(neg_imag);
3745 uint64_t pg = g[(i - 1) >> 6];
3747 float64 e0, e1, e2, e3;
3749 /* I holds the real index; J holds the imag index. */
3750 j = i - sizeof(float64);
3751 i -= 2 * sizeof(float64);
3753 e0 = *(float64 *)(vn + H1_2(i));
3754 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3755 e2 = *(float64 *)(vn + H1_2(j));
3756 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3758 if (likely((pg >> (i & 63)) & 1)) {
3759 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3761 if (likely((pg >> (j & 63)) & 1)) {
3762 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3769 * FP Complex Multiply
3772 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3773 void *vg, void *status, uint32_t desc)
3775 intptr_t j, i = simd_oprsz(desc);
3776 unsigned rot = simd_data(desc);
3777 bool flip = rot & 1;
3778 float16 neg_imag, neg_real;
3781 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3782 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3785 uint64_t pg = g[(i - 1) >> 6];
3787 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3789 /* I holds the real index; J holds the imag index. */
3790 j = i - sizeof(float16);
3791 i -= 2 * sizeof(float16);
3793 nr = *(float16 *)(vn + H1_2(i));
3794 ni = *(float16 *)(vn + H1_2(j));
3795 mr = *(float16 *)(vm + H1_2(i));
3796 mi = *(float16 *)(vm + H1_2(j));
3798 e2 = (flip ? ni : nr);
3799 e1 = (flip ? mi : mr) ^ neg_real;
3801 e3 = (flip ? mr : mi) ^ neg_imag;
3803 if (likely((pg >> (i & 63)) & 1)) {
3804 d = *(float16 *)(va + H1_2(i));
3805 d = float16_muladd(e2, e1, d, 0, status);
3806 *(float16 *)(vd + H1_2(i)) = d;
3808 if (likely((pg >> (j & 63)) & 1)) {
3809 d = *(float16 *)(va + H1_2(j));
3810 d = float16_muladd(e4, e3, d, 0, status);
3811 *(float16 *)(vd + H1_2(j)) = d;
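/*
 * Illustrative note (not from the original source): ROT encodes the
 * FCMLA rotation in multiples of 90 degrees.  FLIP selects whether the
 * real or imaginary half of Zn feeds both partial products, and
 * NEG_REAL/NEG_IMAG apply the rotation's sign changes to the Zm
 * elements; rotations 0 and 90 together accumulate a full complex
 * multiply.
 */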
3817 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3818 void *vg, void *status, uint32_t desc)
3820 intptr_t j, i = simd_oprsz(desc);
3821 unsigned rot = simd_data(desc);
3822 bool flip = rot & 1;
3823 float32 neg_imag, neg_real;
3826 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3827 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3830 uint64_t pg = g[(i - 1) >> 6];
3832 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3834 /* I holds the real index; J holds the imag index. */
3835 j = i - sizeof(float32);
3836 i -= 2 * sizeof(float32);
3838 nr = *(float32 *)(vn + H1_2(i));
3839 ni = *(float32 *)(vn + H1_2(j));
3840 mr = *(float32 *)(vm + H1_2(i));
3841 mi = *(float32 *)(vm + H1_2(j));
3843 e2 = (flip ? ni : nr);
3844 e1 = (flip ? mi : mr) ^ neg_real;
3846 e3 = (flip ? mr : mi) ^ neg_imag;
3848 if (likely((pg >> (i & 63)) & 1)) {
3849 d = *(float32 *)(va + H1_2(i));
3850 d = float32_muladd(e2, e1, d, 0, status);
3851 *(float32 *)(vd + H1_2(i)) = d;
3853 if (likely((pg >> (j & 63)) & 1)) {
3854 d = *(float32 *)(va + H1_2(j));
3855 d = float32_muladd(e4, e3, d, 0, status);
3856 *(float32 *)(vd + H1_2(j)) = d;
3862 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3863 void *vg, void *status, uint32_t desc)
3865 intptr_t j, i = simd_oprsz(desc);
3866 unsigned rot = simd_data(desc);
3867 bool flip = rot & 1;
3868 float64 neg_imag, neg_real;
3871 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3872 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3875 uint64_t pg = g[(i - 1) >> 6];
3877 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3879 /* I holds the real index; J holds the imag index. */
3880 j = i - sizeof(float64);
3881 i -= 2 * sizeof(float64);
3883 nr = *(float64 *)(vn + H1_2(i));
3884 ni = *(float64 *)(vn + H1_2(j));
3885 mr = *(float64 *)(vm + H1_2(i));
3886 mi = *(float64 *)(vm + H1_2(j));
3888 e2 = (flip ? ni : nr);
3889 e1 = (flip ? mi : mr) ^ neg_real;
3891 e3 = (flip ? mr : mi) ^ neg_imag;
3893 if (likely((pg >> (i & 63)) & 1)) {
3894 d = *(float64 *)(va + H1_2(i));
3895 d = float64_muladd(e2, e1, d, 0, status);
3896 *(float64 *)(vd + H1_2(i)) = d;
3898 if (likely((pg >> (j & 63)) & 1)) {
3899 d = *(float64 *)(va + H1_2(j));
3900 d = float64_muladd(e4, e3, d, 0, status);
3901 *(float64 *)(vd + H1_2(j)) = d;
3908 * Load contiguous data, protected by a governing predicate.
3912 * Load one element into @vd + @reg_off from @host.
3913 * The controlling predicate is known to be true.
3915 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3918 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3919 * The controlling predicate is known to be true.
3921 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3922 target_ulong vaddr, uintptr_t retaddr);
3925 * Generate the above primitives.
3928 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3929 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3931 TYPEM val = HOST(host); \
3932 *(TYPEE *)(vd + H(reg_off)) = val; \
3935 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3936 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3937 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
3939 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3940 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3941 target_ulong addr, uintptr_t ra) \
3943 *(TYPEE *)(vd + H(reg_off)) = \
3944 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
3947 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3948 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3949 target_ulong addr, uintptr_t ra) \
3951 TLB(env, useronly_clean_ptr(addr), \
3952 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
3955 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
3956 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
3957 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
3959 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
3960 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
3961 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
3962 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
3963 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
3964 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
3965 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
3967 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
3968 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
3969 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
3971 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
3972 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
3973 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
3974 DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
3976 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
3977 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
3978 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
3979 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
3980 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
3982 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
3983 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
3984 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
3985 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
3986 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
3988 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
3989 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
3990 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
3991 DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
3992 DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
3994 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
3995 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
3996 DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
3998 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
3999 DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
4000 DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
4002 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4003 DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4005 DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4006 DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
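/*
 * Illustrative expansion (for reference only; the real definitions are
 * generated by the macros above): DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t,
 * uint8_t) produces a host-memory accessor and a TLB accessor that
 * zero-extend one byte into a halfword element.
 */
#if 0
static void sve_ld1bhu_host(void *vd, intptr_t reg_off, void *host)
{
    uint8_t val = ldub_p(host);
    *(uint16_t *)(vd + H1_2(reg_off)) = val;
}

static void sve_ld1bhu_tlb(CPUARMState *env, void *vd, intptr_t reg_off,
                           target_ulong addr, uintptr_t ra)
{
    *(uint16_t *)(vd + H1_2(reg_off)) =
        (uint8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
}
#endif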
4017 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4018 * beginning at @reg_off, bounded by @reg_max. Return the offset of the first active
4019 * element >= @reg_off, or @reg_max if there are no active elements at all.
4021 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4022 intptr_t reg_max, int esz)
4024 uint64_t pg_mask = pred_esz_masks[esz];
4025 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4027 /* In normal usage, the first element is active. */
4028 if (likely(pg & 1)) {
4036 if (unlikely(reg_off >= reg_max)) {
4037 /* The entire predicate was false. */
4040 pg = vg[reg_off >> 6] & pg_mask;
4043 reg_off += ctz64(pg);
4045 /* We should never see an out of range predicate bit set. */
4046 tcg_debug_assert(reg_off < reg_max);
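/*
 * Hypothetical usage sketch (not part of this file; the helper name and
 * structure are for exposition only): visiting every active element of
 * a predicate with find_next_active.
 */
#if 0
static intptr_t example_count_active(uint64_t *vg, intptr_t reg_max, int esz)
{
    intptr_t count = 0, reg_off = 0;

    while (reg_off < reg_max) {
        reg_off = find_next_active(vg, reg_off, reg_max, esz);
        if (reg_off >= reg_max) {
            break;
        }
        count++;
        reg_off += 1 << esz;
    }
    return count;
}
#endif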
4051 * Resolve the guest virtual address to info->host and info->flags.
4052 * If @nofault, return false if the page is invalid, otherwise
4053 * exit via page fault exception.
4062 static bool sve_probe_page(SVEHostPage *info, bool nofault,
4063 CPUARMState *env, target_ulong addr,
4064 int mem_off, MMUAccessType access_type,
4065 int mmu_idx, uintptr_t retaddr)
4072 * User-only currently always issues with TBI. See the comment
4073 * above useronly_clean_ptr. Usually we clean this top byte away
4074 * during translation, but we can't do that for e.g. vector + imm addressing modes.
4077 * We currently always enable TBI for user-only, and do not provide
4078 * a way to turn it off. So clean the pointer unconditionally here,
4079 * rather than look it up here, or pass it down from above.
4081 addr = useronly_clean_ptr(addr);
4083 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4084 &info->host, retaddr);
4085 info->flags = flags;
4087 if (flags & TLB_INVALID_MASK) {
4092 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4093 info->host -= mem_off;
4095 #ifdef CONFIG_USER_ONLY
4096 memset(&info->attrs, 0, sizeof(info->attrs));
4099 * Find the iotlbentry for addr and return the transaction attributes.
4100 * This *must* be present in the TLB because we just found the mapping.
4103 uintptr_t index = tlb_index(env, mmu_idx, addr);
4105 # ifdef CONFIG_DEBUG_TCG
4106 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4107 target_ulong comparator = (access_type == MMU_DATA_LOAD
4109 : tlb_addr_write(entry));
4110 g_assert(tlb_hit(comparator, addr));
4113 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4114 info->attrs = iotlbentry->attrs;
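/*
 * Example use (a sketch): probe a single page without faulting, as the
 * first-fault/no-fault paths below do, and let the caller decide how to
 * report an unmapped page:
 *
 *   SVEHostPage p;
 *   if (!sve_probe_page(&p, true, env, addr, 0, MMU_DATA_LOAD,
 *                       cpu_mmu_index(env, false), retaddr)) {
 *       // nofault probe failed: no exception was raised; the SVE
 *       // callers record this via FFR instead (see record_fault).
 *   }
 */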
4123 * Analyse contiguous data, protected by a governing predicate.
4134 * First and last element wholly contained within the two pages.
4135 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4136 * reg_off_last[0] may be < 0 if the first element crosses pages.
4137 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4138 * are set >= 0 only if there are complete elements on a second page.
4140 * The reg_off_* offsets are relative to the internal vector register.
4141 * The mem_off_first offset is relative to the memory address; the
4142 * two offsets are different when a load operation extends, a store
4143 * operation truncates, or for multi-register operations.
4145 int16_t mem_off_first[2];
4146 int16_t reg_off_first[2];
4147 int16_t reg_off_last[2];
4150 * One element that is misaligned and spans both pages,
4151 * or -1 if there is no such active element.
4153 int16_t mem_off_split;
4154 int16_t reg_off_split;
4157 * The byte offset at which the entire operation crosses a page boundary.
4158 * Set >= 0 if and only if the entire operation spans two pages.
4162 /* TLB data for the two pages. */
4163 SVEHostPage page[2];
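/*
 * Worked example (illustrative): a 64-byte vector of 8-byte elements
 * (esz == 3, msize == 8), all elements active, with 20 bytes left on
 * the first page (page_split == 20):
 *
 *   reg_off_first[0] = 0,  reg_off_last[0] = 8   (elements 0 and 1)
 *   reg_off_split    = 16, mem_off_split   = 16  (element 2 is split)
 *   reg_off_first[1] = 24, reg_off_last[1] = 56  (elements 3 .. 7)
 *
 * These values are computed by sve_cont_ldst_elements() below.
 */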
4167 * Find first active element on each page, and a loose bound for the
4168 * final element on each page. Identify any single element that spans
4169 * the page boundary. Return true if there are any active elements.
4171 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4172 uint64_t *vg, intptr_t reg_max,
4175 const int esize = 1 << esz;
4176 const uint64_t pg_mask = pred_esz_masks[esz];
4177 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4178 intptr_t mem_off_last, mem_off_split;
4179 intptr_t page_split, elt_split;
4182 /* Set all of the element indices to -1, and the TLB data to 0. */
4183 memset(info, -1, offsetof(SVEContLdSt, page));
4184 memset(info->page, 0, sizeof(info->page));
4186 /* Gross scan over the entire predicate to find bounds. */
4189 uint64_t pg = vg[i] & pg_mask;
4191 reg_off_last = i * 64 + 63 - clz64(pg);
4192 if (reg_off_first < 0) {
4193 reg_off_first = i * 64 + ctz64(pg);
4196 } while (++i * 64 < reg_max);
4198 if (unlikely(reg_off_first < 0)) {
4199 /* No active elements, no pages touched. */
4202 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4204 info->reg_off_first[0] = reg_off_first;
4205 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4206 mem_off_last = (reg_off_last >> esz) * msize;
4208 page_split = -(addr | TARGET_PAGE_MASK);
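/*
 * For illustration: with 4KiB pages, an addr whose page offset is 0xff4
 * gives page_split == -(addr | ~(target_ulong)0xfff) == 12, i.e. the
 * number of bytes remaining on addr's page.
 */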
4209 if (likely(mem_off_last + msize <= page_split)) {
4210 /* The entire operation fits within a single page. */
4211 info->reg_off_last[0] = reg_off_last;
4215 info->page_split = page_split;
4216 elt_split = page_split / msize;
4217 reg_off_split = elt_split << esz;
4218 mem_off_split = elt_split * msize;
4221 * This is the last full element on the first page, but it is not
4222 * necessarily active. If there is no full element, i.e. the first
4223 * active element is the one that's split, this value remains -1.
4224 * It is useful as an iteration bound.
4226 if (elt_split != 0) {
4227 info->reg_off_last[0] = reg_off_split - esize;
4230 /* Determine if an unaligned element spans the pages. */
4231 if (page_split % msize != 0) {
4232 /* It is helpful to know if the split element is active. */
4233 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4234 info->reg_off_split = reg_off_split;
4235 info->mem_off_split = mem_off_split;
4237 if (reg_off_split == reg_off_last) {
4238 /* The page crossing element is last. */
4242 reg_off_split += esize;
4243 mem_off_split += msize;
4247 * We do want the first active element on the second page, because
4248 * this may affect the address reported in an exception.
4250 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4251 tcg_debug_assert(reg_off_split <= reg_off_last);
4252 info->reg_off_first[1] = reg_off_split;
4253 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4254 info->reg_off_last[1] = reg_off_last;
4259 * Resolve the guest virtual addresses to info->page[].
4260 * Control the generation of page faults with @fault. Return false if
4261 * there is no work to do, which can only happen with @fault == FAULT_NO.
4263 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4264 CPUARMState *env, target_ulong addr,
4265 MMUAccessType access_type, uintptr_t retaddr)
4267 int mmu_idx = cpu_mmu_index(env, false);
4268 int mem_off = info->mem_off_first[0];
4269 bool nofault = fault == FAULT_NO;
4270 bool have_work = true;
4272 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4273 access_type, mmu_idx, retaddr)) {
4274 /* No work to be done. */
4278 if (likely(info->page_split < 0)) {
4279 /* The entire operation was on the one page. */
4284 * If the second page is invalid, then we want the fault address to be
4285 * the first byte on that page which is accessed.
4287 if (info->mem_off_split >= 0) {
4289 * There is an element split across the pages. The fault address
4290 * should be the first byte of the second page.
4292 mem_off = info->page_split;
4294 * If the split element is also the first active element
4295 * of the vector, then: For first-fault we should continue
4296 * to generate faults for the second page. For no-fault,
4297 * we have work only if the second page is valid.
4299 if (info->mem_off_first[0] < info->mem_off_split) {
4300 nofault = FAULT_FIRST;
4305 * There is no element split across the pages. The fault address
4306 * should be the first active element on the second page.
4308 mem_off = info->mem_off_first[1];
4310 * There must have been one active element on the first page,
4311 * so we're out of first-fault territory.
4313 nofault = fault != FAULT_ALL;
4316 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4317 access_type, mmu_idx, retaddr);
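/*
 * Summary of the fault modes used above (for reference): FAULT_ALL traps
 * on any invalid page (normal loads and stores); FAULT_FIRST traps only
 * when the first active element faults; FAULT_NO never traps.  In the
 * latter two cases the untaken elements are reported through FFR by the
 * callers (see record_fault below), not by an exception.
 */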
4321 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4322 uint64_t *vg, target_ulong addr,
4323 int esize, int msize, int wp_access,
4326 #ifndef CONFIG_USER_ONLY
4327 intptr_t mem_off, reg_off, reg_last;
4328 int flags0 = info->page[0].flags;
4329 int flags1 = info->page[1].flags;
4331 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4335 /* Indicate that watchpoints are handled. */
4336 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4337 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4339 if (flags0 & TLB_WATCHPOINT) {
4340 mem_off = info->mem_off_first[0];
4341 reg_off = info->reg_off_first[0];
4342 reg_last = info->reg_off_last[0];
4344 while (reg_off <= reg_last) {
4345 uint64_t pg = vg[reg_off >> 6];
4347 if ((pg >> (reg_off & 63)) & 1) {
4348 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4349 msize, info->page[0].attrs,
4350 wp_access, retaddr);
4354 } while (reg_off <= reg_last && (reg_off & 63));
4358 mem_off = info->mem_off_split;
4360 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4361 info->page[0].attrs, wp_access, retaddr);
4364 mem_off = info->mem_off_first[1];
4365 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4366 reg_off = info->reg_off_first[1];
4367 reg_last = info->reg_off_last[1];
4370 uint64_t pg = vg[reg_off >> 6];
4372 if ((pg >> (reg_off & 63)) & 1) {
4373 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4374 msize, info->page[1].attrs,
4375 wp_access, retaddr);
4379 } while (reg_off & 63);
4380 } while (reg_off <= reg_last);
4385 static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
4386 uint64_t *vg, target_ulong addr, int esize,
4387 int msize, uint32_t mtedesc, uintptr_t ra)
4389 intptr_t mem_off, reg_off, reg_last;
4391 /* Process the page only if MemAttr == Tagged. */
4392 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4393 mem_off = info->mem_off_first[0];
4394 reg_off = info->reg_off_first[0];
4395 reg_last = info->reg_off_split;
4397 reg_last = info->reg_off_last[0];
4401 uint64_t pg = vg[reg_off >> 6];
4403 if ((pg >> (reg_off & 63)) & 1) {
4404 mte_check(env, mtedesc, addr + mem_off, ra);
4408 } while (reg_off <= reg_last && (reg_off & 63));
4409 } while (reg_off <= reg_last);
4412 mem_off = info->mem_off_first[1];
4413 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4414 reg_off = info->reg_off_first[1];
4415 reg_last = info->reg_off_last[1];
4418 uint64_t pg = vg[reg_off >> 6];
4420 if ((pg >> (reg_off & 63)) & 1) {
4421 mte_check(env, mtedesc, addr + mem_off, ra);
4425 } while (reg_off & 63);
4426 } while (reg_off <= reg_last);
4431 * Common helper for all contiguous 1,2,3,4-register predicated loads.
4433 static inline QEMU_ALWAYS_INLINE
4434 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
4435 uint32_t desc, const uintptr_t retaddr,
4436 const int esz, const int msz, const int N, uint32_t mtedesc,
4437 sve_ldst1_host_fn *host_fn,
4438 sve_ldst1_tlb_fn *tlb_fn)
4440 const unsigned rd = simd_data(desc);
4441 const intptr_t reg_max = simd_oprsz(desc);
4442 intptr_t reg_off, reg_last, mem_off;
4447 /* Find the active elements. */
4448 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
4449 /* The entire predicate was false; no load occurs. */
4450 for (i = 0; i < N; ++i) {
4451 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4456 /* Probe the page(s). Exit with exception for any invalid page. */
4457 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
4459 /* Handle watchpoints for all active elements. */
4460 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4461 BP_MEM_READ, retaddr);
4464 * Handle mte checks for all active elements.
4465 * Since TBI must be set for MTE, !mtedesc => !mte_active.
4468 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
4472 flags = info.page[0].flags | info.page[1].flags;
4473 if (unlikely(flags != 0)) {
4474 #ifdef CONFIG_USER_ONLY
4475 g_assert_not_reached();
4478 * At least one page includes MMIO.
4479 * Any bus operation can fail with cpu_transaction_failed,
4480 * which for ARM will raise SyncExternal. Perform the load
4481 * into scratch memory to preserve register state until the end.
4483 ARMVectorReg scratch[4] = { };
4485 mem_off = info.mem_off_first[0];
4486 reg_off = info.reg_off_first[0];
4487 reg_last = info.reg_off_last[1];
4489 reg_last = info.reg_off_split;
4491 reg_last = info.reg_off_last[0];
4496 uint64_t pg = vg[reg_off >> 6];
4498 if ((pg >> (reg_off & 63)) & 1) {
4499 for (i = 0; i < N; ++i) {
4500 tlb_fn(env, &scratch[i], reg_off,
4501 addr + mem_off + (i << msz), retaddr);
4504 reg_off += 1 << esz;
4505 mem_off += N << msz;
4506 } while (reg_off & 63);
4507 } while (reg_off <= reg_last);
4509 for (i = 0; i < N; ++i) {
4510 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
4516 /* The entire operation is in RAM, on valid pages. */
4518 for (i = 0; i < N; ++i) {
4519 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4522 mem_off = info.mem_off_first[0];
4523 reg_off = info.reg_off_first[0];
4524 reg_last = info.reg_off_last[0];
4525 host = info.page[0].host;
4527 while (reg_off <= reg_last) {
4528 uint64_t pg = vg[reg_off >> 6];
4530 if ((pg >> (reg_off & 63)) & 1) {
4531 for (i = 0; i < N; ++i) {
4532 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4533 host + mem_off + (i << msz));
4536 reg_off += 1 << esz;
4537 mem_off += N << msz;
4538 } while (reg_off <= reg_last && (reg_off & 63));
4542 * Use the slow path to manage the cross-page misalignment.
4543 * But we know this is RAM and cannot trap.
4545 mem_off = info.mem_off_split;
4546 if (unlikely(mem_off >= 0)) {
4547 reg_off = info.reg_off_split;
4548 for (i = 0; i < N; ++i) {
4549 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4550 addr + mem_off + (i << msz), retaddr);
4554 mem_off = info.mem_off_first[1];
4555 if (unlikely(mem_off >= 0)) {
4556 reg_off = info.reg_off_first[1];
4557 reg_last = info.reg_off_last[1];
4558 host = info.page[1].host;
4561 uint64_t pg = vg[reg_off >> 6];
4563 if ((pg >> (reg_off & 63)) & 1) {
4564 for (i = 0; i < N; ++i) {
4565 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4566 host + mem_off + (i << msz));
4569 reg_off += 1 << esz;
4570 mem_off += N << msz;
4571 } while (reg_off & 63);
4572 } while (reg_off <= reg_last);
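/*
 * Worked example (illustrative): for an interleaved two-register load of
 * 32-bit elements (N == 2, esz == msz == MO_32), each active element
 * advances reg_off by 4 and mem_off by 8; register i of the pair is
 * filled from addr + mem_off + (i << msz), so memory holds the elements
 * of the two destination registers interleaved pairwise.
 */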
4576 static inline QEMU_ALWAYS_INLINE
4577 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
4578 uint32_t desc, const uintptr_t ra,
4579 const int esz, const int msz, const int N,
4580 sve_ldst1_host_fn *host_fn,
4581 sve_ldst1_tlb_fn *tlb_fn)
4583 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4584 int bit55 = extract64(addr, 55, 1);
4586 /* Remove mtedesc from the normal sve descriptor. */
4587 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4589 /* Perform gross MTE suppression early. */
4590 if (!tbi_check(desc, bit55) ||
4591 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4595 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
4598 #define DO_LD1_1(NAME, ESZ) \
4599 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4600 target_ulong addr, uint32_t desc) \
4602 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4603 sve_##NAME##_host, sve_##NAME##_tlb); \
4605 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
4606 target_ulong addr, uint32_t desc) \
4608 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
4609 sve_##NAME##_host, sve_##NAME##_tlb); \
4612 #define DO_LD1_2(NAME, ESZ, MSZ) \
4613 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4614 target_ulong addr, uint32_t desc) \
4616 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4617 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4619 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4620 target_ulong addr, uint32_t desc) \
4622 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4623 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4625 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4626 target_ulong addr, uint32_t desc) \
4628 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4629 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4631 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4632 target_ulong addr, uint32_t desc) \
4634 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4635 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4638 DO_LD1_1(ld1bb, MO_8)
4639 DO_LD1_1(ld1bhu, MO_16)
4640 DO_LD1_1(ld1bhs, MO_16)
4641 DO_LD1_1(ld1bsu, MO_32)
4642 DO_LD1_1(ld1bss, MO_32)
4643 DO_LD1_1(ld1bdu, MO_64)
4644 DO_LD1_1(ld1bds, MO_64)
4646 DO_LD1_2(ld1hh, MO_16, MO_16)
4647 DO_LD1_2(ld1hsu, MO_32, MO_16)
4648 DO_LD1_2(ld1hss, MO_32, MO_16)
4649 DO_LD1_2(ld1hdu, MO_64, MO_16)
4650 DO_LD1_2(ld1hds, MO_64, MO_16)
4652 DO_LD1_2(ld1ss, MO_32, MO_32)
4653 DO_LD1_2(ld1sdu, MO_64, MO_32)
4654 DO_LD1_2(ld1sds, MO_64, MO_32)
4656 DO_LD1_2(ld1dd, MO_64, MO_64)
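/*
 * In the instantiations above, ESZ is the element size in the vector
 * register and MSZ the size of the memory access: e.g. ld1hdu reads
 * 16-bit values (MSZ == MO_16) and widens them to 64-bit elements
 * (ESZ == MO_64), zero-extending for the "u" forms and sign-extending
 * for the "s" forms.
 */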
4661 #define DO_LDN_1(N) \
4662 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
4663 target_ulong addr, uint32_t desc) \
4665 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4666 sve_ld1bb_host, sve_ld1bb_tlb); \
4668 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
4669 target_ulong addr, uint32_t desc) \
4671 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
4672 sve_ld1bb_host, sve_ld1bb_tlb); \
4675 #define DO_LDN_2(N, SUFF, ESZ) \
4676 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
4677 target_ulong addr, uint32_t desc) \
4679 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4680 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
4682 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
4683 target_ulong addr, uint32_t desc) \
4685 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4686 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
4688 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
4689 target_ulong addr, uint32_t desc) \
4691 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4692 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
4694 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
4695 target_ulong addr, uint32_t desc) \
4697 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4698 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
4705 DO_LDN_2(2, hh, MO_16)
4706 DO_LDN_2(3, hh, MO_16)
4707 DO_LDN_2(4, hh, MO_16)
4709 DO_LDN_2(2, ss, MO_32)
4710 DO_LDN_2(3, ss, MO_32)
4711 DO_LDN_2(4, ss, MO_32)
4713 DO_LDN_2(2, dd, MO_64)
4714 DO_LDN_2(3, dd, MO_64)
4715 DO_LDN_2(4, dd, MO_64)
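/*
 * Illustrative expansion (a sketch, not part of the build):
 * DO_LDN_2(2, ss, MO_32) above yields, among others,
 *
 *   void HELPER(sve_ld2ss_le_r)(CPUARMState *env, void *vg,
 *                               target_ulong addr, uint32_t desc)
 *   {
 *       sve_ldN_r(env, vg, addr, desc, GETPC(), MO_32, MO_32, 2, 0,
 *                 sve_ld1ss_le_host, sve_ld1ss_le_tlb);
 *   }
 *
 * i.e. a little-endian LD2W-style load of two interleaved registers.
 */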
4721 * Load contiguous data, first-fault and no-fault.
4723 * For user-only, one could argue that we should hold the mmap_lock during
4724 * the operation so that there is no race between page_check_range and the
4725 * load operation. However, unmapping pages out from under a running thread
4726 * is extraordinarily unlikely. This theoretical race condition also affects
4727 * linux-user/ in its get_user/put_user macros.
4729 * TODO: Construct some helpers, written in assembly, that interact with
4730 * handle_cpu_signal to produce memory ops which can properly report errors
4734 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4735 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4736 * option, which leaves subsequent data unchanged.
4738 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4740 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4743 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4744 i = ROUND_UP(i, 64);
4746 for (; i < oprsz; i += 64) {
4752 * Common helper for all contiguous no-fault and first-fault loads.
4754 static inline QEMU_ALWAYS_INLINE
4755 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4756 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
4757 const int esz, const int msz, const SVEContFault fault,
4758 sve_ldst1_host_fn *host_fn,
4759 sve_ldst1_tlb_fn *tlb_fn)
4761 const unsigned rd = simd_data(desc);
4762 void *vd = &env->vfp.zregs[rd];
4763 const intptr_t reg_max = simd_oprsz(desc);
4764 intptr_t reg_off, mem_off, reg_last;
4769 /* Find the active elements. */
4770 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4771 /* The entire predicate was false; no load occurs. */
4772 memset(vd, 0, reg_max);
4775 reg_off = info.reg_off_first[0];
4777 /* Probe the page(s). */
4778 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4779 /* Fault on first element. */
4780 tcg_debug_assert(fault == FAULT_NO);
4781 memset(vd, 0, reg_max);
4785 mem_off = info.mem_off_first[0];
4786 flags = info.page[0].flags;
4789 * Disable MTE checking if the Tagged bit is not set. Since TBI must
4790 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
4792 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
4796 if (fault == FAULT_FIRST) {
4797 /* Trapping mte check for the first-fault element. */
4799 mte_check(env, mtedesc, addr + mem_off, retaddr);
4803 * Special handling of the first active element,
4804 * if it crosses a page boundary or is MMIO.
4806 bool is_split = mem_off == info.mem_off_split;
4807 if (unlikely(flags != 0) || unlikely(is_split)) {
4809 * Use the slow path for cross-page handling.
4810 * Might trap for MMIO or watchpoints.
4812 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4814 /* After any fault, zero the other elements. */
4815 swap_memzero(vd, reg_off);
4816 reg_off += 1 << esz;
4817 mem_off += 1 << msz;
4818 swap_memzero(vd + reg_off, reg_max - reg_off);
4824 memset(vd, 0, reg_max);
4827 memset(vd, 0, reg_max);
4828 if (unlikely(mem_off == info.mem_off_split)) {
4829 /* The first active element crosses a page boundary. */
4830 flags |= info.page[1].flags;
4831 if (unlikely(flags & TLB_MMIO)) {
4832 /* Some page is MMIO, see below. */
4835 if (unlikely(flags & TLB_WATCHPOINT) &&
4836 (cpu_watchpoint_address_matches
4837 (env_cpu(env), addr + mem_off, 1 << msz)
4839 /* Watchpoint hit, see below. */
4842 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
4846 * Use the slow path for cross-page handling.
4847 * This is RAM, without a watchpoint, and will not trap.
4849 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4855 * From this point on, all memory operations are MemSingleNF.
4857 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
4858 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
4860 * Unfortunately we do not have access to the memory attributes from the
4861 * PTE to tell Device memory from Normal memory. So we make a mostly
4862 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
4863 * This gives the right answer for the common cases of "Normal memory,
4864 * backed by host RAM" and "Device memory, backed by MMIO".
4865 * The architecture allows us to suppress an NF load and return
4866 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
4867 * case of "Normal memory, backed by MMIO" is permitted. The case we
4868 * get wrong is "Device memory, backed by host RAM", for which we
4869 * should return (UNKNOWN, FAULT) but do not.
4871 * Similarly, CPU_BP breakpoints would raise exceptions, and so
4872 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
4873 * architectural breakpoints the same.
4875 if (unlikely(flags & TLB_MMIO)) {
4879 reg_last = info.reg_off_last[0];
4880 host = info.page[0].host;
4883 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
4885 if ((pg >> (reg_off & 63)) & 1) {
4886 if (unlikely(flags & TLB_WATCHPOINT) &&
4887 (cpu_watchpoint_address_matches
4888 (env_cpu(env), addr + mem_off, 1 << msz)
4892 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
4895 host_fn(vd, reg_off, host + mem_off);
4897 reg_off += 1 << esz;
4898 mem_off += 1 << msz;
4899 } while (reg_off <= reg_last && (reg_off & 63));
4900 } while (reg_off <= reg_last);
4903 * MemSingleNF is allowed to fail for any reason. We have special
4904 * code above to handle the first element crossing a page boundary.
4905 * As an implementation choice, decline to handle a cross-page element
4906 * in any other position.
4908 reg_off = info.reg_off_split;
4914 reg_off = info.reg_off_first[1];
4915 if (likely(reg_off < 0)) {
4916 /* No active elements on the second page. All done. */
4921 * MemSingleNF is allowed to fail for any reason. As an implementation
4922 * choice, decline to handle elements on the second page. This should
4923 * be low frequency as the guest walks through memory -- the next
4924 * iteration of the guest's loop should be aligned on the page boundary,
4925 * and then all following iterations will stay aligned.
4929 record_fault(env, reg_off, reg_max);
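/*
 * Behavioural example (for reference): a first-fault load whose third
 * active element lands on an unmapped page loads the elements before it,
 * clears FFR from the faulting element onward via record_fault, and
 * returns without raising an exception; only a fault on the *first*
 * active element traps.  A no-fault load never traps at all.
 */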
4932 static inline QEMU_ALWAYS_INLINE
4933 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
4934 uint32_t desc, const uintptr_t retaddr,
4935 const int esz, const int msz, const SVEContFault fault,
4936 sve_ldst1_host_fn *host_fn,
4937 sve_ldst1_tlb_fn *tlb_fn)
4939 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4940 int bit55 = extract64(addr, 55, 1);
4942 /* Remove mtedesc from the normal sve descriptor. */
4943 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4945 /* Perform gross MTE suppression early. */
4946 if (!tbi_check(desc, bit55) ||
4947 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4951 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
4952 esz, msz, fault, host_fn, tlb_fn);
4955 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4956 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4957 target_ulong addr, uint32_t desc) \
4959 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
4960 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4962 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4963 target_ulong addr, uint32_t desc) \
4965 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
4966 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4968 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
4969 target_ulong addr, uint32_t desc) \
4971 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
4972 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4974 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
4975 target_ulong addr, uint32_t desc) \
4977 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
4978 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4981 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4982 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4983 target_ulong addr, uint32_t desc) \
4985 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
4986 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
4988 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
4989 target_ulong addr, uint32_t desc) \
4991 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
4992 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
4994 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
4995 target_ulong addr, uint32_t desc) \
4997 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
4998 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5000 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5001 target_ulong addr, uint32_t desc) \
5003 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5004 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5006 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5007 target_ulong addr, uint32_t desc) \
5009 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5010 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5012 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5013 target_ulong addr, uint32_t desc) \
5015 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5016 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5018 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5019 target_ulong addr, uint32_t desc) \
5021 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5022 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5024 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5025 target_ulong addr, uint32_t desc) \
5027 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5028 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5031 DO_LDFF1_LDNF1_1(bb, MO_8)
5032 DO_LDFF1_LDNF1_1(bhu, MO_16)
5033 DO_LDFF1_LDNF1_1(bhs, MO_16)
5034 DO_LDFF1_LDNF1_1(bsu, MO_32)
5035 DO_LDFF1_LDNF1_1(bss, MO_32)
5036 DO_LDFF1_LDNF1_1(bdu, MO_64)
5037 DO_LDFF1_LDNF1_1(bds, MO_64)
5039 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5040 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5041 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5042 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5043 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
5045 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5046 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5047 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
5049 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
5051 #undef DO_LDFF1_LDNF1_1
5052 #undef DO_LDFF1_LDNF1_2
5055 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5058 static inline QEMU_ALWAYS_INLINE
5059 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5060 uint32_t desc, const uintptr_t retaddr,
5061 const int esz, const int msz, const int N, uint32_t mtedesc,
5062 sve_ldst1_host_fn *host_fn,
5063 sve_ldst1_tlb_fn *tlb_fn)
5065 const unsigned rd = simd_data(desc);
5066 const intptr_t reg_max = simd_oprsz(desc);
5067 intptr_t reg_off, reg_last, mem_off;
5072 /* Find the active elements. */
5073 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5074 /* The entire predicate was false; no store occurs. */
5078 /* Probe the page(s). Exit with exception for any invalid page. */
5079 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
5081 /* Handle watchpoints for all active elements. */
5082 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5083 BP_MEM_WRITE, retaddr);
5086 * Handle mte checks for all active elements.
5087 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5090 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5094 flags = info.page[0].flags | info.page[1].flags;
5095 if (unlikely(flags != 0)) {
5096 #ifdef CONFIG_USER_ONLY
5097 g_assert_not_reached();
5100 * At least one page includes MMIO.
5101 * Any bus operation can fail with cpu_transaction_failed,
5102 * which for ARM will raise SyncExternal. We cannot avoid
5103 * this fault and will leave with the store incomplete.
5105 mem_off = info.mem_off_first[0];
5106 reg_off = info.reg_off_first[0];
5107 reg_last = info.reg_off_last[1];
5109 reg_last = info.reg_off_split;
5111 reg_last = info.reg_off_last[0];
5116 uint64_t pg = vg[reg_off >> 6];
5118 if ((pg >> (reg_off & 63)) & 1) {
5119 for (i = 0; i < N; ++i) {
5120 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5121 addr + mem_off + (i << msz), retaddr);
5124 reg_off += 1 << esz;
5125 mem_off += N << msz;
5126 } while (reg_off & 63);
5127 } while (reg_off <= reg_last);
5132 mem_off = info.mem_off_first[0];
5133 reg_off = info.reg_off_first[0];
5134 reg_last = info.reg_off_last[0];
5135 host = info.page[0].host;
5137 while (reg_off <= reg_last) {
5138 uint64_t pg = vg[reg_off >> 6];
5140 if ((pg >> (reg_off & 63)) & 1) {
5141 for (i = 0; i < N; ++i) {
5142 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5143 host + mem_off + (i << msz));
5146 reg_off += 1 << esz;
5147 mem_off += N << msz;
5148 } while (reg_off <= reg_last && (reg_off & 63));
5152 * Use the slow path to manage the cross-page misalignment.
5153 * But we know this is RAM and cannot trap.
5155 mem_off = info.mem_off_split;
5156 if (unlikely(mem_off >= 0)) {
5157 reg_off = info.reg_off_split;
5158 for (i = 0; i < N; ++i) {
5159 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5160 addr + mem_off + (i << msz), retaddr);
5164 mem_off = info.mem_off_first[1];
5165 if (unlikely(mem_off >= 0)) {
5166 reg_off = info.reg_off_first[1];
5167 reg_last = info.reg_off_last[1];
5168 host = info.page[1].host;
5171 uint64_t pg = vg[reg_off >> 6];
5173 if ((pg >> (reg_off & 63)) & 1) {
5174 for (i = 0; i < N; ++i) {
5175 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5176 host + mem_off + (i << msz));
5179 reg_off += 1 << esz;
5180 mem_off += N << msz;
5181 } while (reg_off & 63);
5182 } while (reg_off <= reg_last);
5186 static inline QEMU_ALWAYS_INLINE
5187 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5188 uint32_t desc, const uintptr_t ra,
5189 const int esz, const int msz, const int N,
5190 sve_ldst1_host_fn *host_fn,
5191 sve_ldst1_tlb_fn *tlb_fn)
5193 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5194 int bit55 = extract64(addr, 55, 1);
5196 /* Remove mtedesc from the normal sve descriptor. */
5197 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5199 /* Perform gross MTE suppression early. */
5200 if (!tbi_check(desc, bit55) ||
5201 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5205 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5208 #define DO_STN_1(N, NAME, ESZ) \
5209 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
5210 target_ulong addr, uint32_t desc) \
5212 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
5213 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5215 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
5216 target_ulong addr, uint32_t desc) \
5218 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
5219 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5222 #define DO_STN_2(N, NAME, ESZ, MSZ) \
5223 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
5224 target_ulong addr, uint32_t desc) \
5226 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5227 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5229 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
5230 target_ulong addr, uint32_t desc) \
5232 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5233 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
5235 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5236 target_ulong addr, uint32_t desc) \
5238 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5239 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5241 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5242 target_ulong addr, uint32_t desc) \
5244 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5245 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
5248 DO_STN_1(1, bb, MO_8)
5249 DO_STN_1(1, bh, MO_16)
5250 DO_STN_1(1, bs, MO_32)
5251 DO_STN_1(1, bd, MO_64)
5252 DO_STN_1(2, bb, MO_8)
5253 DO_STN_1(3, bb, MO_8)
5254 DO_STN_1(4, bb, MO_8)
5256 DO_STN_2(1, hh, MO_16, MO_16)
5257 DO_STN_2(1, hs, MO_32, MO_16)
5258 DO_STN_2(1, hd, MO_64, MO_16)
5259 DO_STN_2(2, hh, MO_16, MO_16)
5260 DO_STN_2(3, hh, MO_16, MO_16)
5261 DO_STN_2(4, hh, MO_16, MO_16)
5263 DO_STN_2(1, ss, MO_32, MO_32)
5264 DO_STN_2(1, sd, MO_64, MO_32)
5265 DO_STN_2(2, ss, MO_32, MO_32)
5266 DO_STN_2(3, ss, MO_32, MO_32)
5267 DO_STN_2(4, ss, MO_32, MO_32)
5269 DO_STN_2(1, dd, MO_64, MO_64)
5270 DO_STN_2(2, dd, MO_64, MO_64)
5271 DO_STN_2(3, dd, MO_64, MO_64)
5272 DO_STN_2(4, dd, MO_64, MO_64)
5278 * Loads with a vector index.
5282 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5284 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5286 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5288 return *(uint32_t *)(reg + H1_4(reg_ofs));
5291 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5293 return *(int32_t *)(reg + H1_4(reg_ofs));
5296 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5298 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5301 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5303 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5306 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5308 return *(uint64_t *)(reg + reg_ofs);
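/*
 * Worked example (illustrative): for a .d gather whose 64-bit lane holds
 * 0x0000_0001_8000_0000, off_zsu_d() returns 0x8000_0000 (low 32 bits,
 * zero-extended), off_zss_d() returns 0xffff_ffff_8000_0000 (low 32 bits,
 * sign-extended), and off_zd_d() returns the full 64-bit value.
 */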
5311 static inline QEMU_ALWAYS_INLINE
5312 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5313 target_ulong base, uint32_t desc, uintptr_t retaddr,
5314 uint32_t mtedesc, int esize, int msize,
5315 zreg_off_fn *off_fn,
5316 sve_ldst1_host_fn *host_fn,
5317 sve_ldst1_tlb_fn *tlb_fn)
5319 const int mmu_idx = cpu_mmu_index(env, false);
5320 const intptr_t reg_max = simd_oprsz(desc);
5321 const int scale = simd_data(desc);
5322 ARMVectorReg scratch;
5324 SVEHostPage info, info2;
5326 memset(&scratch, 0, reg_max);
5329 uint64_t pg = vg[reg_off >> 6];
5331 if (likely(pg & 1)) {
5332 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5333 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5335 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5338 if (likely(in_page >= msize)) {
5339 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5340 cpu_check_watchpoint(env_cpu(env), addr, msize,
5341 info.attrs, BP_MEM_READ, retaddr);
5343 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5344 mte_check(env, mtedesc, addr, retaddr);
5346 host_fn(&scratch, reg_off, info.host);
5348 /* Element crosses the page boundary. */
5349 sve_probe_page(&info2, false, env, addr + in_page, 0,
5350 MMU_DATA_LOAD, mmu_idx, retaddr);
5351 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5352 cpu_check_watchpoint(env_cpu(env), addr,
5354 BP_MEM_READ, retaddr);
5356 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5357 mte_check(env, mtedesc, addr, retaddr);
5359 tlb_fn(env, &scratch, reg_off, addr, retaddr);
5364 } while (reg_off & 63);
5365 } while (reg_off < reg_max);
5367 /* Wait until all exceptions have been raised to write back. */
5368 memcpy(vd, &scratch, reg_max);
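/*
 * Worked example (illustrative): an LD1W-style gather with unsigned
 * 32-bit offsets scaled by the element size uses off_zsu_s and, assuming
 * the translator passes a scale of 2 in simd_data(desc), loads element i
 * from
 *     base + (zero_extend(offsets[i]) << 2)
 * with every element probed individually; all faults and watchpoints are
 * resolved before the scratch result is copied into the destination.
 */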
5371 static inline QEMU_ALWAYS_INLINE
5372 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5373 target_ulong base, uint32_t desc, uintptr_t retaddr,
5374 int esize, int msize, zreg_off_fn *off_fn,
5375 sve_ldst1_host_fn *host_fn,
5376 sve_ldst1_tlb_fn *tlb_fn)
5378 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5379 /* Remove mtedesc from the normal sve descriptor. */
5380 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5383 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5384 * offset base entirely over the address space hole to change the
5385 * pointer tag, or change the bit55 selector. So we could here
5386 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5388 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5389 esize, msize, off_fn, host_fn, tlb_fn);
5392 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5393 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5394 void *vm, target_ulong base, uint32_t desc) \
5396 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
5397 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5399 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5400 void *vm, target_ulong base, uint32_t desc) \
5402 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5403 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5406 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5407 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5408 void *vm, target_ulong base, uint32_t desc) \
5410 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
5411 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5413 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5414 void *vm, target_ulong base, uint32_t desc) \
5416 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5417 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5420 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5421 DO_LD1_ZPZ_S(bsu, zss, MO_8)
5422 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5423 DO_LD1_ZPZ_D(bdu, zss, MO_8)
5424 DO_LD1_ZPZ_D(bdu, zd, MO_8)
5426 DO_LD1_ZPZ_S(bss, zsu, MO_8)
5427 DO_LD1_ZPZ_S(bss, zss, MO_8)
5428 DO_LD1_ZPZ_D(bds, zsu, MO_8)
5429 DO_LD1_ZPZ_D(bds, zss, MO_8)
5430 DO_LD1_ZPZ_D(bds, zd, MO_8)
5432 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
5433 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
5434 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
5435 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
5436 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
5438 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
5439 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
5440 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
5441 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
5442 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
5444 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
5445 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
5446 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
5447 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
5448 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
5450 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
5451 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
5452 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
5453 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
5454 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
5456 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
5457 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
5458 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
5459 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
5460 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
5462 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
5463 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
5464 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
5465 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
5466 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
5468 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
5469 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
5470 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
5472 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
5473 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
5474 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
5476 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
5477 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
5478 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
5480 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
5481 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
5482 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
5487 /* First fault loads with a vector index. */
5490 * Common helpers for all gather first-faulting loads.
5493 static inline QEMU_ALWAYS_INLINE
5494 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5495 target_ulong base, uint32_t desc, uintptr_t retaddr,
5496 uint32_t mtedesc, const int esz, const int msz,
5497 zreg_off_fn *off_fn,
5498 sve_ldst1_host_fn *host_fn,
5499 sve_ldst1_tlb_fn *tlb_fn)
5501 const int mmu_idx = cpu_mmu_index(env, false);
5502 const intptr_t reg_max = simd_oprsz(desc);
5503 const int scale = simd_data(desc);
5504 const int esize = 1 << esz;
5505 const int msize = 1 << msz;
5508 target_ulong addr, in_page;
5510 /* Skip to the first true predicate. */
5511 reg_off = find_next_active(vg, 0, reg_max, esz);
5512 if (unlikely(reg_off >= reg_max)) {
5513 /* The entire predicate was false; no load occurs. */
5514 memset(vd, 0, reg_max);
5519 * Probe the first element, allowing faults.
5521 addr = base + (off_fn(vm, reg_off) << scale);
5523 mte_check(env, mtedesc, addr, retaddr);
5525 tlb_fn(env, vd, reg_off, addr, retaddr);
5527 /* After any fault, zero the other elements. */
5528 swap_memzero(vd, reg_off);
5530 swap_memzero(vd + reg_off, reg_max - reg_off);
5533 * Probe the remaining elements, not allowing faults.
5535 while (reg_off < reg_max) {
5536 uint64_t pg = vg[reg_off >> 6];
5538 if (likely((pg >> (reg_off & 63)) & 1)) {
5539 addr = base + (off_fn(vm, reg_off) << scale);
5540 in_page = -(addr | TARGET_PAGE_MASK);
5542 if (unlikely(in_page < msize)) {
5543 /* Stop if the element crosses a page boundary. */
5547 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
5549 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
5552 if (unlikely(info.flags & TLB_WATCHPOINT) &&
5553 (cpu_watchpoint_address_matches
5554 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
5558 arm_tlb_mte_tagged(&info.attrs) &&
5559 !mte_probe(env, mtedesc, addr)) {
5563 host_fn(vd, reg_off, info.host);
5566 } while (reg_off & 63);
5571 record_fault(env, reg_off, reg_max);
5574 static inline QEMU_ALWAYS_INLINE
5575 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5576 target_ulong base, uint32_t desc, uintptr_t retaddr,
5577 const int esz, const int msz,
5578 zreg_off_fn *off_fn,
5579 sve_ldst1_host_fn *host_fn,
5580 sve_ldst1_tlb_fn *tlb_fn)
5582 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5583 /* Remove mtedesc from the normal sve descriptor. */
5584 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5587 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5588 * offset base entirely over the address space hole to change the
5589 * pointer tag, or change the bit55 selector. So we could here
5590 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5592 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5593 esz, msz, off_fn, host_fn, tlb_fn);
5596 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
5597 void HELPER(sve_ldff##MEM##_##OFS) \
5598 (CPUARMState *env, void *vd, void *vg, \
5599 void *vm, target_ulong base, uint32_t desc) \
5601 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
5602 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5604 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
5605 (CPUARMState *env, void *vd, void *vg, \
5606 void *vm, target_ulong base, uint32_t desc) \
5608 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
5609 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5612 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
5613 void HELPER(sve_ldff##MEM##_##OFS) \
5614 (CPUARMState *env, void *vd, void *vg, \
5615 void *vm, target_ulong base, uint32_t desc) \
5617 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
5618 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5620 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
5621 (CPUARMState *env, void *vd, void *vg, \
5622 void *vm, target_ulong base, uint32_t desc) \
5624 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
5625 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5628 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
5629 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
5630 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
5631 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
5632 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
5634 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
5635 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
5636 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
5637 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
5638 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
5640 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
5641 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
5642 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
5643 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
5644 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
5646 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
5647 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
5648 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
5649 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
5650 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
5652 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
5653 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
5654 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
5655 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
5656 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
5658 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
5659 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
5660 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
5661 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
5662 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
5664 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
5665 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
5666 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
5667 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
5668 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
5670 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
5671 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
5672 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
5673 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
5674 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
5676 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
5677 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
5678 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
5680 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
5681 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
5682 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
5684 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
5685 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
5686 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
5688 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
5689 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
5690 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
5692 /* Stores with a vector index. */
5694 static inline QEMU_ALWAYS_INLINE
5695 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5696 target_ulong base, uint32_t desc, uintptr_t retaddr,
5697 uint32_t mtedesc, int esize, int msize,
5698 zreg_off_fn *off_fn,
5699 sve_ldst1_host_fn *host_fn,
5700 sve_ldst1_tlb_fn *tlb_fn)
5702 const int mmu_idx = cpu_mmu_index(env, false);
5703 const intptr_t reg_max = simd_oprsz(desc);
5704 const int scale = simd_data(desc);
5705 void *host[ARM_MAX_VQ * 4];
5706 intptr_t reg_off, i;
5707 SVEHostPage info, info2;
5710 * Probe all of the elements for host addresses and flags.
5714 uint64_t pg = vg[reg_off >> 6];
5716 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5717 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5720 if (likely((pg >> (reg_off & 63)) & 1)) {
5721 if (likely(in_page >= msize)) {
5722 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
5724 host[i] = info.host;
5727 * Element crosses the page boundary.
5728 * Probe both pages, but do not record the host address,
5729 * so that we use the slow path.
5731 sve_probe_page(&info, false, env, addr, 0,
5732 MMU_DATA_STORE, mmu_idx, retaddr);
5733 sve_probe_page(&info2, false, env, addr + in_page, 0,
5734 MMU_DATA_STORE, mmu_idx, retaddr);
5735 info.flags |= info2.flags;
5738 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5739 cpu_check_watchpoint(env_cpu(env), addr, msize,
5740 info.attrs, BP_MEM_WRITE, retaddr);
5743 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5744 mte_check(env, mtedesc, addr, retaddr);
5749 } while (reg_off & 63);
5750 } while (reg_off < reg_max);
5753 * Now that we have recognized all exceptions except SyncExternal
5754 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
5756 * Note for the common case of an element in RAM, not crossing a page
5757 * boundary, we have stored the host address in host[]. This doubles
5758 * as a first-level check against the predicate, since only enabled
5759 * elements have non-null host addresses.
5764 if (likely(h != NULL)) {
5765 host_fn(vd, reg_off, h);
5766 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
5767 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5768 tlb_fn(env, vd, reg_off, addr, retaddr);
5772 } while (reg_off < reg_max);
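/*
 * Design note: unlike the gather-load path above, a scatter store cannot
 * be staged in a scratch register and rolled back, so every element is
 * probed (and any watchpoint or MTE fault raised) before the first byte
 * of guest memory is modified; only the SyncExternal from TLB_MMIO
 * remains unavoidable.
 */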
5775 static inline QEMU_ALWAYS_INLINE
5776 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5777 target_ulong base, uint32_t desc, uintptr_t retaddr,
5778 int esize, int msize, zreg_off_fn *off_fn,
5779 sve_ldst1_host_fn *host_fn,
5780 sve_ldst1_tlb_fn *tlb_fn)
5782 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5783 /* Remove mtedesc from the normal sve descriptor. */
5784 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5787 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5788 * offset base entirely over the address space hole to change the
5789 * pointer tag, or change the bit55 selector. So we could here
5790 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5792 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5793 esize, msize, off_fn, host_fn, tlb_fn);
5796 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
5797 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5798 void *vm, target_ulong base, uint32_t desc) \
5800 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
5801 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5803 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5804 void *vm, target_ulong base, uint32_t desc) \
5806 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5807 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5810 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
5811 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5812 void *vm, target_ulong base, uint32_t desc) \
5814 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
5815 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5817 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5818 void *vm, target_ulong base, uint32_t desc) \
5820 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5821 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5824 DO_ST1_ZPZ_S(bs, zsu, MO_8)
5825 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
5826 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
5827 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
5828 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
5830 DO_ST1_ZPZ_S(bs, zss, MO_8)
5831 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
5832 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
5833 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
5834 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
5836 DO_ST1_ZPZ_D(bd, zsu, MO_8)
5837 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
5838 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
5839 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
5840 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
5841 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
5842 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
5844 DO_ST1_ZPZ_D(bd, zss, MO_8)
5845 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
5846 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
5847 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
5848 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
5849 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
5850 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
5852 DO_ST1_ZPZ_D(bd, zd, MO_8)
5853 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
5854 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
5855 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
5856 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
5857 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
5858 DO_ST1_ZPZ_D(dd_be, zd, MO_64)