 * Copyright (c) 2018 Linaro, Ltd.
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
#include "qemu/osdep.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup. */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set. Compare the definitions of these variables
/* For no G bits set, NZCV = C. */
#define PREDTEST_INIT 1
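/*
 * Worked example (illustrative values): for a single predicate word with
 * d = 0x0001 and g = 0x0101, the first active bit is set in D, so N is set;
 * some active D bit is set, so Z is clear; and the last active bit is clear
 * in D, so C is set.  Bit 2 of the accumulated flags is only an internal
 * "first G bit seen" marker used by the iterators below.
 */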
/* This is an iterative function, called for each Pd and Pg word
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen. */
            flags |= ((d & (g & -g)) != 0) << 31;
        /* Accumulate Z from each D & G. */
        flags |= ((d & g) != 0) << 1;
        /* Compute C from last !(D & G). Replace previous. */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
/* This is an iterative function, called for each Pd and Pg word
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen. */
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        /* Accumulate Z from each D & G. */
        flags |= ((d & g) != 0) << 1;
        /* Compute N from last (i.e. first) D & G. Replace previous. */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
/* The same for a single word predicate. */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
/* The same for a multi-word predicate. */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);
/* Expand active predicate bits to bytes, for byte elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *      printf("0x%016lx,\n", m);
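/*
 * For example, expand_pred_b(0x05) yields 0x0000000000ff00ff: predicate
 * bits 0 and 2 select byte elements 0 and 2 of the 64-bit chunk.
 */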
static inline uint64_t expand_pred_b(uint8_t byte)
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
static inline uint64_t expand_pred_h(uint8_t byte)
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    return word[byte & 0x55];
/* Similarly for single word elements. */
static inline uint64_t expand_pred_s(uint8_t byte)
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    return word[byte & 0x11];
/* Swap 16-bit words within a 32-bit word. */
static inline uint32_t hswap32(uint32_t h)
/* Swap 16-bit words within a 64-bit word. */
static inline uint64_t hswap64(uint64_t h)
    uint64_t m = 0x0000ffff0000ffffull;
    return ((h & m) << 16) | ((h >> 16) & m);
/* Swap 32-bit words within a 64-bit word. */
static inline uint64_t wswap64(uint64_t h)
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
#define DO_AND(N, M, G) (((N) & (M)) & (G))
#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G) (((N) | (M)) & (G))
#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
#define DO_AND(N, M) (N & M)
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M) (N * M)
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
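/*
 * For example, for 32-bit SDIV the architecture requires x / 0 == 0 and
 * INT32_MIN / -1 == INT32_MIN; returning -N for M == -1 produces that
 * wrapped result directly without performing the undefined division.
 */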
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types. */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
    return (n * m) >> 16;
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
    return (n * m) >> 32;
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
    muls64(&lo, &hi, n, m);
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
    mulu64(&lo, &hi, n, m);
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
/* Note that all bits of the shift are significant
   and not modulo the element size. */
#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
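/*
 * E.g. DO_ASR((int8_t)-8, 200) clamps the shift to 7 and yields -1 (all
 * sign bits), while DO_LSR and DO_LSL return 0 once the shift count
 * reaches the element width.
 */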
DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
    int8_t n1 = n, n2 = n >> 8;
static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
    int16_t n1 = n, n2 = n >> 16;
static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
    int32_t n1 = n, n2 = n >> 32;
DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
    uint8_t n1 = n, n2 = n >> 8;
static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
    uint16_t n1 = n, n2 = n >> 16;
static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
    uint32_t n1 = n, n2 = n >> 32;
DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
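/*
 * The 64-bit form halves each operand and adds back the carry of the two
 * low bits to avoid intermediate overflow; e.g. DO_HADD_D(3, 5) is
 * 1 + 2 + 1 == 4, matching (3 + 5) >> 1.
 */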
DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
    return val >= max ? max : val <= min ? min : val;
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
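/*
 * In the 64-bit case below, signed overflow can only happen when both
 * operands have the same sign and the sign of the result differs from it,
 * which is exactly what the (r ^ n) & ~(n ^ m) test detects; e.g. adding
 * INT64_MAX to itself produces a negative r and saturates to INT64_MAX.
 */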
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow. */
        return r < 0 ? INT64_MAX : INT64_MIN;
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
    return r < n ? UINT64_MAX : r;
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow. */
        return r < 0 ? INT64_MAX : INT64_MIN;
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
    return n > m ? n - m : 0;
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
        /* Note that m - abs(n) cannot underflow. */
            /* Result is either very large positive or negative. */
                /* m > abs(n), so r is a very large positive. */
            /* Result is negative. */
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
        return n < -m ? 0 : r;
    return r < n ? UINT64_MAX : r;
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
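/*
 * For example, with all predicate bits active and byte elements
 * n = {1, 2, 3, 4} and m = {5, 6, 7, 8}, the result is d = {3, 11, 7, 15}:
 * even slots take pairs from N, odd slots take pairs from M.
 */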
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
                *(TYPE *)(vd + H(i)) = OP(n0, n1); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
                *(TYPE *)(vd + H(i)) = OP(m0, m1); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE n0 = n[i], n1 = n[i + 1]; \
        TYPE m0 = m[i], m1 = m[i + 1]; \
        if (pg[H1(i)] & 1) { \
        if (pg[H1(i + 1)] & 1) { \
            d[i + 1] = OP(m0, m1); \
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
#undef DO_ZPZZ_PAIR_D
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  void *status, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min)
#undef DO_ZPZZ_PAIR_FP
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide". That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
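/*
 * Every narrow element in a given 64-bit chunk therefore uses the 64-bit
 * M value loaded for that chunk; e.g. in sve_asr_zpzw_b below, all active
 * bytes of a chunk are shifted by the same 64-bit count.
 */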
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
/* Fully general two-operand expander, controlled by a predicate.
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)
DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)
DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
#define DO_CNOT(N) (N == 0)
DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
#define DO_NOT(N) (~N)
DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
#define DO_SXTB(N) ((int8_t)N)
#define DO_SXTH(N) ((int16_t)N)
#define DO_SXTS(N) ((int32_t)N)
#define DO_UXTB(N) ((uint8_t)N)
#define DO_UXTH(N) ((uint16_t)N)
#define DO_UXTS(N) ((uint32_t)N)
DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
#define DO_ABS(N) (N < 0 ? -N : N)
DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
#define DO_NEG(N) (-N)
DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })
DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
/* Three-operand expander, unpredicated, in which the third operand is "wide".
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
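/*
 * The two low bits of the descriptor choose, per input, the even (bottom)
 * or odd (top) narrow element of each wide column: sel1/sel2 are 0 for the
 * bottom element and sizeof(TYPEN) for the top one.
 */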
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)
DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)
DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)
DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)
DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)
DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)
DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
/* Note that the multiply cannot overflow, but the doubling can. */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
    int64_t val = n * m;
    return do_sqadd_d(val, val);
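/*
 * Example of the doubling overflow: do_sqdmull_h(-128, -128) computes
 * val = 16384, and doubling it saturates to INT16_MAX (32767).
 */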
DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEW *)(vn + HW(i)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)
DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)
DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)
DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
        TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
        TYPEW aa = *(TYPEW *)(va + HW(i)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
        *(TYPE *)(vd + i) = nn; \
#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        *(TYPEN *)(vd + i + odd) = OP(nn); \
#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;
    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once. */
void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;
    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
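/*
 * The 64-bit form above builds the 65-bit sum with Int128: the low 64 bits
 * land in the even destination element and the carry-out (0 or 1) in the
 * odd one, while the carry-in is taken from bit 0 of the odd element of M.
 * The 32-bit form likewise reads its carry-in from bit 32 of each 64-bit
 * pair and stores the whole 33-bit result at once.
 */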
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        TYPE mm = *(TYPE *)(vm + i); \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
    for (db = 0; db < n; ++db) {
        if ((mask >> db) & 1) {
            res |= ((data >> db) & 1) << rb;
DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
    for (rb = 0; rb < n; ++rb) {
        if ((mask >> rb) & 1) {
            res |= ((data >> db) & 1) << rb;
DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;
    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
            resu |= val << rbu++;
    return resm | (resu << rbm);
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
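/*
 * Worked example with n == 8: bitextract(0b101100, 0b111000, 8) gathers
 * data bits 3..5 to the bottom and returns 0b101; bitdeposit(0b101,
 * 0b111000, 8) scatters them back and returns 0b101000; and bitgroup
 * packs the bits selected by the mask at the bottom with the remaining
 * bits immediately above them.
 */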
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sub_r = simd_data(desc); \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
            TYPE acc_r = *(TYPE *)(vn + H(i)); \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE el2_r = *(TYPE *)(vm + H(i)); \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            acc_r = ADD_OP(acc_r, el2_i); \
            acc_i = SUB_OP(acc_i, el2_r); \
            *(TYPE *)(vd + H(i)) = acc_r; \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
            TYPE acc_r = *(TYPE *)(vn + H(i)); \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE el2_r = *(TYPE *)(vm + H(i)); \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            acc_r = SUB_OP(acc_r, el2_i); \
            acc_i = ADD_OP(acc_i, el2_r); \
            *(TYPE *)(vd + H(i)) = acc_r; \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)
DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
    int shift = simd_data(desc) >> 1; \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
        *(TYPEW *)(vd + HW(i)) = nn << shift; \
DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)
DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension. E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change. For integer operands, this is perfectly fine.
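/*
 * E.g. sve_smaxv_s reduces as int32_t but returns uint32_t, so a maximum
 * of -1 is returned as 0x00000000ffffffff rather than being sign-extended
 * into the uint64_t ABI return value.
 */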
1554 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1555 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1557 intptr_t i, opr_sz = simd_oprsz(desc); \
1558 TYPERED ret = INIT; \
1559 for (i = 0; i < opr_sz; ) { \
1560 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1563 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1564 ret = OP(ret, nn); \
1566 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1569 return (TYPERET)ret; \
1572 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1573 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1575 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1579 for (i = 0; i < opr_sz; i += 1) { \
1580 if (pg[H1(i)] & 1) { \
1582 ret = OP(ret, nn); \
1588 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1589 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1590 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1591 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1593 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1594 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1595 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1596 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1598 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1599 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1600 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1601 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1603 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1604 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1605 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1607 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1608 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1609 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1610 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1612 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1613 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1614 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1615 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1617 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1618 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1619 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1620 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1622 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1623 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1624 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1625 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1627 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1628 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1629 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1630 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1635 /* Two vector operand, one scalar operand, unpredicated. */
1636 #define DO_ZZI(NAME, TYPE, OP) \
1637 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1639 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1640 TYPE s = s64, *d = vd, *n = vn; \
1641 for (i = 0; i < opr_sz; ++i) { \
1642 d[i] = OP(n[i], s); \
1646 #define DO_SUBR(X, Y) (Y - X)
1648 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1649 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1650 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1651 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1653 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1654 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1655 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1656 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1658 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1659 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1660 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1661 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1663 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1664 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1665 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1666 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1668 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1669 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1670 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1671 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1691 /* Similar to the ARM LastActiveElement pseudocode function, except the
1692 result is multiplied by the element size. This includes the not found
1693 indication; e.g. not found for esz=3 is -8. */
1694 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1696 uint64_t mask = pred_esz_masks[esz];
1700 uint64_t this_g = g[--i] & mask;
1702 return i * 64 + (63 - clz64(this_g));
1705 return (intptr_t)-1 << esz;
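/* Worked example (illustrative): with esz == 1 (halfword elements) the
 * predicate bit for element j sits at bit 2 * j, so if the highest
 * active bit in G is bit 6 the function returns 6, i.e. element index 3
 * multiplied by the 2-byte element size.  With no active bits it
 * returns -1 << 1 == -2.
 */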
1708 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1710 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1711 uint32_t flags = PREDTEST_INIT;
1712 uint64_t *d = vd, *g = vg;
1716 uint64_t this_d = d[i];
1717 uint64_t this_g = g[i];
1721 /* Set in D the first bit of G. */
1722 this_d |= this_g & -this_g;
1725 flags = iter_predtest_fwd(this_d, this_g, flags);
1727 } while (++i < words);
1732 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1734 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1735 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1736 uint32_t flags = PREDTEST_INIT;
1737 uint64_t *d = vd, *g = vg, esz_mask;
1740 next = last_active_element(vd, words, esz) + (1 << esz);
1741 esz_mask = pred_esz_masks[esz];
1743 /* Similar to the pseudocode for pnext, but scaled by ESZ
1744 so that we find the correct bit. */
1745 if (next < words * 64) {
1749 mask = ~((1ull << (next & 63)) - 1);
1753 uint64_t this_g = g[next / 64] & esz_mask & mask;
1755 next = (next & -64) + ctz64(this_g);
1760 } while (next < words * 64);
1765 uint64_t this_d = 0;
1766 if (i == next / 64) {
1767 this_d = 1ull << (next & 63);
1770 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1771 } while (++i < words);
1777 * Copy Zn into Zd, and store zero into inactive elements.
1778 * If inv, store zeros into the active elements.
1780 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1782 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1783 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1784 uint64_t *d = vd, *n = vn;
1787 for (i = 0; i < opr_sz; i += 1) {
1788 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1792 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1794 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1795 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1796 uint64_t *d = vd, *n = vn;
1799 for (i = 0; i < opr_sz; i += 1) {
1800 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1804 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1806 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1807 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1808 uint64_t *d = vd, *n = vn;
1811 for (i = 0; i < opr_sz; i += 1) {
1812 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1816 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1818 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1819 uint64_t *d = vd, *n = vn;
1821 uint8_t inv = simd_data(desc);
1823 for (i = 0; i < opr_sz; i += 1) {
1824 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1828 /* Three-operand expander, immediate operand, controlled by a predicate.
1830 #define DO_ZPZI(NAME, TYPE, H, OP) \
1831 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1833 intptr_t i, opr_sz = simd_oprsz(desc); \
1834 TYPE imm = simd_data(desc); \
1835 for (i = 0; i < opr_sz; ) { \
1836 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1839 TYPE nn = *(TYPE *)(vn + H(i)); \
1840 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1847 /* Similarly, specialized for 64-bit operands. */
1848 #define DO_ZPZI_D(NAME, TYPE, OP) \
1849 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1851 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1852 TYPE *d = vd, *n = vn; \
1853 TYPE imm = simd_data(desc); \
1855 for (i = 0; i < opr_sz; i += 1) { \
1856 if (pg[H1(i)] & 1) { \
1858 d[i] = OP(nn, imm); \
1863 #define DO_SHR(N, M) (N >> M)
1864 #define DO_SHL(N, M) (N << M)
1866 /* Arithmetic shift right for division. This rounds negative numbers
1867 toward zero as per signed division. Therefore before shifting,
1868 when N is negative, add 2**M-1. */
1869 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
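/* Worked example (illustrative): DO_ASRD(-7, 2) computes
 * (-7 + 3) >> 2 == -1, matching truncating division -7 / 4 == -1,
 * whereas a plain arithmetic shift gives -7 >> 2 == -2.
 */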
1871 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
1873 if (likely(sh < 64)) {
1874 return (x >> sh) + ((x >> (sh - 1)) & 1);
1875 } else if (sh == 64) {
1882 static inline int64_t do_srshr(int64_t x, unsigned sh)
1884 if (likely(sh < 64)) {
1885 return (x >> sh) + ((x >> (sh - 1)) & 1);
1887 /* Rounding the sign bit always produces 0. */
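/* Rounding example (illustrative): do_urshr(7, 2) yields
 * (7 >> 2) + ((7 >> 1) & 1) == 1 + 1 == 2, i.e. 7/4 rounded to nearest;
 * do_srshr applies the same formula to signed inputs.
 */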
1892 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1893 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1894 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1895 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1897 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1898 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1899 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1900 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1902 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1903 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1904 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1905 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1907 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1908 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1909 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1910 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
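/* Narrowing shifts (SVE2).  As the expanders below show, the "B"
 * (bottom) forms store the narrowed result in the low half of each wide
 * element and zero the high half, while the "T" (top) forms write only
 * the high half and leave the low half of the destination unchanged.
 */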
1916 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
1917 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1919 intptr_t i, opr_sz = simd_oprsz(desc); \
1920 int shift = simd_data(desc); \
1921 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1922 TYPEW nn = *(TYPEW *)(vn + i); \
1923 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
1927 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
1928 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1930 intptr_t i, opr_sz = simd_oprsz(desc); \
1931 int shift = simd_data(desc); \
1932 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1933 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1934 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
1938 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
1939 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
1940 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
1942 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
1943 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
1944 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, , H1_4, DO_SHR)
1946 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
1947 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
1948 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
1950 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
1951 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
1952 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, , H1_4, do_urshr)
1954 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
1955 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
1956 #define DO_SQSHRUN_D(x, sh) \
1957 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
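/* Note: shifting an int64_t by 64 would be undefined behaviour in C,
 * hence the clamp to 63 above; an arithmetic shift by 63 leaves only
 * the sign (0 or -1), which saturates to the same result as an ideal
 * shift by 64 would.
 */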
1959 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
1960 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
1961 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
1963 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
1964 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
1965 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, , H1_4, DO_SQSHRUN_D)
1967 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
1968 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
1969 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
1971 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
1972 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
1973 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
1975 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
1976 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
1977 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRUN_D)
1979 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
1980 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
1981 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
1983 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
1984 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
1985 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
1987 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
1988 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
1989 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, , H1_4, DO_SQSHRN_D)
1991 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
1992 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
1993 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
1995 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
1996 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
1997 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
1999 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2000 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2001 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRN_D)
2003 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2004 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2005 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2007 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2008 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2009 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2011 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2012 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2013 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQSHRN_D)
2015 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2016 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2017 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2019 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2020 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2021 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2023 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2024 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2025 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQRSHRN_D)
2030 /* Fully general four-operand expander, controlled by a predicate.
2032 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2033 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2034 void *vg, uint32_t desc) \
2036 intptr_t i, opr_sz = simd_oprsz(desc); \
2037 for (i = 0; i < opr_sz; ) { \
2038 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2041 TYPE nn = *(TYPE *)(vn + H(i)); \
2042 TYPE mm = *(TYPE *)(vm + H(i)); \
2043 TYPE aa = *(TYPE *)(va + H(i)); \
2044 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2046 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2051 /* Similarly, specialized for 64-bit operands. */
2052 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2053 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2054 void *vg, uint32_t desc) \
2056 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2057 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2059 for (i = 0; i < opr_sz; i += 1) { \
2060 if (pg[H1(i)] & 1) { \
2061 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2062 d[i] = OP(aa, nn, mm); \
2067 #define DO_MLA(A, N, M) (A + N * M)
2068 #define DO_MLS(A, N, M) (A - N * M)
2070 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2071 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2073 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2074 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2076 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2077 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2079 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2080 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2087 void HELPER(sve_index_b)(void *vd, uint32_t start,
2088 uint32_t incr, uint32_t desc)
2090 intptr_t i, opr_sz = simd_oprsz(desc);
2092 for (i = 0; i < opr_sz; i += 1) {
2093 d[H1(i)] = start + i * incr;
2097 void HELPER(sve_index_h)(void *vd, uint32_t start,
2098 uint32_t incr, uint32_t desc)
2100 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2102 for (i = 0; i < opr_sz; i += 1) {
2103 d[H2(i)] = start + i * incr;
2107 void HELPER(sve_index_s)(void *vd, uint32_t start,
2108 uint32_t incr, uint32_t desc)
2110 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2112 for (i = 0; i < opr_sz; i += 1) {
2113 d[H4(i)] = start + i * incr;
2117 void HELPER(sve_index_d)(void *vd, uint64_t start,
2118 uint64_t incr, uint32_t desc)
2120 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2122 for (i = 0; i < opr_sz; i += 1) {
2123 d[i] = start + i * incr;
2127 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2129 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2130 uint32_t sh = simd_data(desc);
2131 uint32_t *d = vd, *n = vn, *m = vm;
2132 for (i = 0; i < opr_sz; i += 1) {
2133 d[i] = n[i] + (m[i] << sh);
2137 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2139 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2140 uint64_t sh = simd_data(desc);
2141 uint64_t *d = vd, *n = vn, *m = vm;
2142 for (i = 0; i < opr_sz; i += 1) {
2143 d[i] = n[i] + (m[i] << sh);
2147 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2149 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2150 uint64_t sh = simd_data(desc);
2151 uint64_t *d = vd, *n = vn, *m = vm;
2152 for (i = 0; i < opr_sz; i += 1) {
2153 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2157 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2159 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2160 uint64_t sh = simd_data(desc);
2161 uint64_t *d = vd, *n = vn, *m = vm;
2162 for (i = 0; i < opr_sz; i += 1) {
2163 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2167 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2169 /* These constants are copied directly from the ARM pseudocode. */
2170 static const uint16_t coeff[] = {
2171 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2172 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2173 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2174 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2176 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2177 uint16_t *d = vd, *n = vn;
2179 for (i = 0; i < opr_sz; i++) {
2181 intptr_t idx = extract32(nn, 0, 5);
2182 uint16_t exp = extract32(nn, 5, 5);
2183 d[i] = coeff[idx] | (exp << 10);
2187 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2189 /* These constants are copied directly from the ARM pseudocode. */
2190 static const uint32_t coeff[] = {
2191 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2192 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2193 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2194 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2195 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2196 0x1ef532, 0x20b051, 0x227043, 0x243516,
2197 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2198 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2199 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2200 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2201 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2202 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2203 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2204 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2205 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2206 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2208 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2209 uint32_t *d = vd, *n = vn;
2211 for (i = 0; i < opr_sz; i++) {
2213 intptr_t idx = extract32(nn, 0, 6);
2214 uint32_t exp = extract32(nn, 6, 8);
2215 d[i] = coeff[idx] | (exp << 23);
2219 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2221 /* These constants are copied directly from the ARM pseudocode. */
2222 static const uint64_t coeff[] = {
2223 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2224 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2225 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2226 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2227 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2228 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2229 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2230 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2231 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2232 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2233 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2234 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2235 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2236 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2237 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2238 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2239 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2240 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2241 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2242 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2243 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2246 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2247 uint64_t *d = vd, *n = vn;
2249 for (i = 0; i < opr_sz; i++) {
2251 intptr_t idx = extract32(nn, 0, 6);
2252 uint64_t exp = extract32(nn, 6, 11);
2253 d[i] = coeff[idx] | (exp << 52);
2257 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2259 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2260 uint16_t *d = vd, *n = vn, *m = vm;
2261 for (i = 0; i < opr_sz; i += 1) {
2267 d[i] = nn ^ (mm & 2) << 14;
2271 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2273 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2274 uint32_t *d = vd, *n = vn, *m = vm;
2275 for (i = 0; i < opr_sz; i += 1) {
2281 d[i] = nn ^ (mm & 2) << 30;
2285 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2287 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2288 uint64_t *d = vd, *n = vn, *m = vm;
2289 for (i = 0; i < opr_sz; i += 1) {
2295 d[i] = nn ^ (mm & 2) << 62;
2300 * Signed saturating addition with scalar operand.
2303 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2305 intptr_t i, oprsz = simd_oprsz(desc);
2307 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2308 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2312 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2314 intptr_t i, oprsz = simd_oprsz(desc);
2316 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2317 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2321 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2323 intptr_t i, oprsz = simd_oprsz(desc);
2325 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2326 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2330 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2332 intptr_t i, oprsz = simd_oprsz(desc);
2334 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2335 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2340 * Unsigned saturating addition with scalar operand.
2343 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2345 intptr_t i, oprsz = simd_oprsz(desc);
2347 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2348 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2352 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2354 intptr_t i, oprsz = simd_oprsz(desc);
2356 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2357 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2361 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2363 intptr_t i, oprsz = simd_oprsz(desc);
2365 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2366 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2370 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2372 intptr_t i, oprsz = simd_oprsz(desc);
2374 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2375 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2379 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2381 intptr_t i, oprsz = simd_oprsz(desc);
2383 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2384 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2388 /* Two operand predicated copy immediate with merge. All valid immediates
2389 * can fit within 17 signed bits in the simd_data field.
2391 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2392 uint64_t mm, uint32_t desc)
2394 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2395 uint64_t *d = vd, *n = vn;
2398 mm = dup_const(MO_8, mm);
2399 for (i = 0; i < opr_sz; i += 1) {
2401 uint64_t pp = expand_pred_b(pg[H1(i)]);
2402 d[i] = (mm & pp) | (nn & ~pp);
2406 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2407 uint64_t mm, uint32_t desc)
2409 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2410 uint64_t *d = vd, *n = vn;
2413 mm = dup_const(MO_16, mm);
2414 for (i = 0; i < opr_sz; i += 1) {
2416 uint64_t pp = expand_pred_h(pg[H1(i)]);
2417 d[i] = (mm & pp) | (nn & ~pp);
2421 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2422 uint64_t mm, uint32_t desc)
2424 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2425 uint64_t *d = vd, *n = vn;
2428 mm = dup_const(MO_32, mm);
2429 for (i = 0; i < opr_sz; i += 1) {
2431 uint64_t pp = expand_pred_s(pg[H1(i)]);
2432 d[i] = (mm & pp) | (nn & ~pp);
2436 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2437 uint64_t mm, uint32_t desc)
2439 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2440 uint64_t *d = vd, *n = vn;
2443 for (i = 0; i < opr_sz; i += 1) {
2445 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2449 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2451 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2455 val = dup_const(MO_8, val);
2456 for (i = 0; i < opr_sz; i += 1) {
2457 d[i] = val & expand_pred_b(pg[H1(i)]);
2461 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2463 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2467 val = dup_const(MO_16, val);
2468 for (i = 0; i < opr_sz; i += 1) {
2469 d[i] = val & expand_pred_h(pg[H1(i)]);
2473 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2475 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2479 val = dup_const(MO_32, val);
2480 for (i = 0; i < opr_sz; i += 1) {
2481 d[i] = val & expand_pred_s(pg[H1(i)]);
2485 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2487 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2491 for (i = 0; i < opr_sz; i += 1) {
2492 d[i] = (pg[H1(i)] & 1 ? val : 0);
2496 /* Big-endian hosts need to frob the byte indices. If the copy
2497 * happens to be 8-byte aligned, then no frobbing is necessary.
2499 static void swap_memmove(void *vd, void *vs, size_t n)
2501 uintptr_t d = (uintptr_t)vd;
2502 uintptr_t s = (uintptr_t)vs;
2503 uintptr_t o = (d | s | n) & 7;
2506 #ifndef HOST_WORDS_BIGENDIAN
2515 if (d < s || d >= s + n) {
2516 for (i = 0; i < n; i += 4) {
2517 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2520 for (i = n; i > 0; ) {
2522 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2529 if (d < s || d >= s + n) {
2530 for (i = 0; i < n; i += 2) {
2531 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2534 for (i = n; i > 0; ) {
2536 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2542 if (d < s || d >= s + n) {
2543 for (i = 0; i < n; i++) {
2544 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2547 for (i = n; i > 0; ) {
2549 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2556 /* Similarly for memset of 0. */
2557 static void swap_memzero(void *vd, size_t n)
2559 uintptr_t d = (uintptr_t)vd;
2560 uintptr_t o = (d | n) & 7;
2563 /* Usually, the first bit of a predicate is set, so N is 0. */
2564 if (likely(n == 0)) {
2568 #ifndef HOST_WORDS_BIGENDIAN
2577 for (i = 0; i < n; i += 4) {
2578 *(uint32_t *)H1_4(d + i) = 0;
2584 for (i = 0; i < n; i += 2) {
2585 *(uint16_t *)H1_2(d + i) = 0;
2590 for (i = 0; i < n; i++) {
2591 *(uint8_t *)H1(d + i) = 0;
2597 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2599 intptr_t opr_sz = simd_oprsz(desc);
2600 size_t n_ofs = simd_data(desc);
2601 size_t n_siz = opr_sz - n_ofs;
2604 swap_memmove(vd, vn + n_ofs, n_siz);
2605 swap_memmove(vd + n_siz, vm, n_ofs);
2606 } else if (vd != vn) {
2607 swap_memmove(vd + n_siz, vd, n_ofs);
2608 swap_memmove(vd, vn + n_ofs, n_siz);
2610 /* vd == vn == vm. Need temp space. */
2612 swap_memmove(&tmp, vm, n_ofs);
2613 swap_memmove(vd, vd + n_ofs, n_siz);
2614 memcpy(vd + n_siz, &tmp, n_ofs);
2618 #define DO_INSR(NAME, TYPE, H) \
2619 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2621 intptr_t opr_sz = simd_oprsz(desc); \
2622 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2623 *(TYPE *)(vd + H(0)) = val; \
2626 DO_INSR(sve_insr_b, uint8_t, H1)
2627 DO_INSR(sve_insr_h, uint16_t, H1_2)
2628 DO_INSR(sve_insr_s, uint32_t, H1_4)
2629 DO_INSR(sve_insr_d, uint64_t, )
2633 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2635 intptr_t i, j, opr_sz = simd_oprsz(desc);
2636 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2637 uint64_t f = *(uint64_t *)(vn + i);
2638 uint64_t b = *(uint64_t *)(vn + j);
2639 *(uint64_t *)(vd + i) = bswap64(b);
2640 *(uint64_t *)(vd + j) = bswap64(f);
2644 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2646 intptr_t i, j, opr_sz = simd_oprsz(desc);
2647 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2648 uint64_t f = *(uint64_t *)(vn + i);
2649 uint64_t b = *(uint64_t *)(vn + j);
2650 *(uint64_t *)(vd + i) = hswap64(b);
2651 *(uint64_t *)(vd + j) = hswap64(f);
2655 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2657 intptr_t i, j, opr_sz = simd_oprsz(desc);
2658 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2659 uint64_t f = *(uint64_t *)(vn + i);
2660 uint64_t b = *(uint64_t *)(vn + j);
2661 *(uint64_t *)(vd + i) = rol64(b, 32);
2662 *(uint64_t *)(vd + j) = rol64(f, 32);
2666 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2668 intptr_t i, j, opr_sz = simd_oprsz(desc);
2669 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2670 uint64_t f = *(uint64_t *)(vn + i);
2671 uint64_t b = *(uint64_t *)(vn + j);
2672 *(uint64_t *)(vd + i) = b;
2673 *(uint64_t *)(vd + j) = f;
2677 #define DO_TBL(NAME, TYPE, H) \
2678 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2680 intptr_t i, opr_sz = simd_oprsz(desc); \
2681 uintptr_t elem = opr_sz / sizeof(TYPE); \
2682 TYPE *d = vd, *n = vn, *m = vm; \
2684 if (unlikely(vd == vn)) { \
2685 n = memcpy(&tmp, vn, opr_sz); \
2687 for (i = 0; i < elem; i++) { \
2689 d[H(i)] = j < elem ? n[H(j)] : 0; \
2693 DO_TBL(sve_tbl_b, uint8_t, H1)
2694 DO_TBL(sve_tbl_h, uint16_t, H2)
2695 DO_TBL(sve_tbl_s, uint32_t, H4)
2696 DO_TBL(sve_tbl_d, uint64_t, )
2700 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
2701 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2703 intptr_t i, opr_sz = simd_oprsz(desc); \
2707 if (unlikely(vn - vd < opr_sz)) { \
2708 n = memcpy(&tmp, n, opr_sz / 2); \
2710 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
2711 d[HD(i)] = n[HS(i)]; \
2715 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
2716 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
2717 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
2719 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
2720 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
2721 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
2725 /* Mask of bits included in the even numbered predicates of width esz.
2726 * We also use this for expand_bits/compress_bits, and so extend the
2727 * same pattern out to 16-bit units.
2729 static const uint64_t even_bit_esz_masks[5] = {
2730 0x5555555555555555ull,
2731 0x3333333333333333ull,
2732 0x0f0f0f0f0f0f0f0full,
2733 0x00ff00ff00ff00ffull,
2734 0x0000ffff0000ffffull,
2737 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
2738 * For N==0, this corresponds to the operation that in qemu/bitops.h
2739 * we call half_shuffle64; this algorithm is from Hacker's Delight,
2740 * section 7-2 Shuffling Bits.
2742 static uint64_t expand_bits(uint64_t x, int n)
2747 for (i = 4; i >= n; i--) {
2749 x = ((x << sh) | x) & even_bit_esz_masks[i];
2754 /* Compress units of 2**(N+1) bits to units of 2**N bits.
2755 * For N==0, this corresponds to the operation that in qemu/bitops.h
2756 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
2757 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
2759 static uint64_t compress_bits(uint64_t x, int n)
2763 for (i = n; i <= 4; i++) {
2765 x &= even_bit_esz_masks[i];
2768 return x & 0xffffffffu;
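/* Worked examples (illustrative): expand_bits(0xb, 0) spreads bits
 * 0, 1 and 3 out to bits 0, 2 and 6, giving 0x45; compress_bits(0x45, 0)
 * undoes this and returns 0xb.
 */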
2771 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2773 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2774 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2775 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
2776 int esize = 1 << esz;
2781 uint64_t nn = *(uint64_t *)vn;
2782 uint64_t mm = *(uint64_t *)vm;
2783 int half = 4 * oprsz;
2785 nn = extract64(nn, high * half, half);
2786 mm = extract64(mm, high * half, half);
2787 nn = expand_bits(nn, esz);
2788 mm = expand_bits(mm, esz);
2789 d[0] = nn | (mm << esize);
2791 ARMPredicateReg tmp;
2793 /* We produce output faster than we consume input.
2794 Therefore we must be mindful of possible overlap. */
2796 vn = memcpy(&tmp, vn, oprsz);
2800 } else if (vd == vm) {
2801 vm = memcpy(&tmp, vm, oprsz);
2807 if ((oprsz & 7) == 0) {
2808 uint32_t *n = vn, *m = vm;
2811 for (i = 0; i < oprsz / 8; i++) {
2812 uint64_t nn = n[H4(high + i)];
2813 uint64_t mm = m[H4(high + i)];
2815 nn = expand_bits(nn, esz);
2816 mm = expand_bits(mm, esz);
2817 d[i] = nn | (mm << esize);
2820 uint8_t *n = vn, *m = vm;
2823 for (i = 0; i < oprsz / 2; i++) {
2824 uint16_t nn = n[H1(high + i)];
2825 uint16_t mm = m[H1(high + i)];
2827 nn = expand_bits(nn, esz);
2828 mm = expand_bits(mm, esz);
2829 d16[H2(i)] = nn | (mm << esize);
2835 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2837 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2838 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2839 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
2840 uint64_t *d = vd, *n = vn, *m = vm;
2845 l = compress_bits(n[0] >> odd, esz);
2846 h = compress_bits(m[0] >> odd, esz);
2847 d[0] = l | (h << (4 * oprsz));
2849 ARMPredicateReg tmp_m;
2850 intptr_t oprsz_16 = oprsz / 16;
2852 if ((vm - vd) < (uintptr_t)oprsz) {
2853 m = memcpy(&tmp_m, vm, oprsz);
2856 for (i = 0; i < oprsz_16; i++) {
2859 l = compress_bits(l >> odd, esz);
2860 h = compress_bits(h >> odd, esz);
2861 d[i] = l | (h << 32);
2865 * For VL which is not a multiple of 512, the results from M do not
2866 * align nicely with the uint64_t for D. Put the aligned results
2867 * from M into TMP_M and then copy it into place afterward.
2870 int final_shift = (oprsz & 15) * 2;
2874 l = compress_bits(l >> odd, esz);
2875 h = compress_bits(h >> odd, esz);
2876 d[i] = l | (h << final_shift);
2878 for (i = 0; i < oprsz_16; i++) {
2881 l = compress_bits(l >> odd, esz);
2882 h = compress_bits(h >> odd, esz);
2883 tmp_m.p[i] = l | (h << 32);
2887 l = compress_bits(l >> odd, esz);
2888 h = compress_bits(h >> odd, esz);
2889 tmp_m.p[i] = l | (h << final_shift);
2891 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2893 for (i = 0; i < oprsz_16; i++) {
2896 l = compress_bits(l >> odd, esz);
2897 h = compress_bits(h >> odd, esz);
2898 d[oprsz_16 + i] = l | (h << 32);
2904 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2906 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2907 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2908 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
2909 uint64_t *d = vd, *n = vn, *m = vm;
2916 mask = even_bit_esz_masks[esz];
2923 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2924 uint64_t nn = (n[i] & mask) >> shr;
2925 uint64_t mm = (m[i] & mask) << shl;
2930 /* Reverse units of 2**N bits. */
2931 static uint64_t reverse_bits_64(uint64_t x, int n)
2936 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2937 uint64_t mask = even_bit_esz_masks[i];
2938 x = ((x & mask) << sh) | ((x >> sh) & mask);
2943 static uint8_t reverse_bits_8(uint8_t x, int n)
2945 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2948 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2949 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2954 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2956 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2957 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2958 intptr_t i, oprsz_2 = oprsz / 2;
2961 uint64_t l = *(uint64_t *)vn;
2962 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2963 *(uint64_t *)vd = l;
2964 } else if ((oprsz & 15) == 0) {
2965 for (i = 0; i < oprsz_2; i += 8) {
2966 intptr_t ih = oprsz - 8 - i;
2967 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2968 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2969 *(uint64_t *)(vd + i) = h;
2970 *(uint64_t *)(vd + ih) = l;
2973 for (i = 0; i < oprsz_2; i += 1) {
2974 intptr_t il = H1(i);
2975 intptr_t ih = H1(oprsz - 1 - i);
2976 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2977 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2978 *(uint8_t *)(vd + il) = h;
2979 *(uint8_t *)(vd + ih) = l;
2984 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2986 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2987 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
2992 uint64_t nn = *(uint64_t *)vn;
2993 int half = 4 * oprsz;
2995 nn = extract64(nn, high * half, half);
2996 nn = expand_bits(nn, 0);
2999 ARMPredicateReg tmp_n;
3001 /* We produce output faster than we consume input.
3002 Therefore we must be mindful of possible overlap. */
3003 if ((vn - vd) < (uintptr_t)oprsz) {
3004 vn = memcpy(&tmp_n, vn, oprsz);
3010 if ((oprsz & 7) == 0) {
3014 for (i = 0; i < oprsz / 8; i++) {
3015 uint64_t nn = n[H4(high + i)];
3016 d[i] = expand_bits(nn, 0);
3022 for (i = 0; i < oprsz / 2; i++) {
3023 uint16_t nn = n[H1(high + i)];
3024 d16[H2(i)] = expand_bits(nn, 0);
3030 #define DO_ZIP(NAME, TYPE, H) \
3031 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3033 intptr_t oprsz = simd_oprsz(desc); \
3034 intptr_t i, oprsz_2 = oprsz / 2; \
3035 ARMVectorReg tmp_n, tmp_m; \
3036 /* We produce output faster than we consume input. \
3037 Therefore we must be mindful of possible overlap. */ \
3038 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3039 vn = memcpy(&tmp_n, vn, oprsz_2); \
3041 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3042 vm = memcpy(&tmp_m, vm, oprsz_2); \
3044 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3045 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3046 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3050 DO_ZIP(sve_zip_b, uint8_t, H1)
3051 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3052 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3053 DO_ZIP(sve_zip_d, uint64_t, )
3055 #define DO_UZP(NAME, TYPE, H) \
3056 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3058 intptr_t oprsz = simd_oprsz(desc); \
3059 intptr_t oprsz_2 = oprsz / 2; \
3060 intptr_t odd_ofs = simd_data(desc); \
3062 ARMVectorReg tmp_m; \
3063 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3064 vm = memcpy(&tmp_m, vm, oprsz); \
3066 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3067 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
3069 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3070 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
3074 DO_UZP(sve_uzp_b, uint8_t, H1)
3075 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3076 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3077 DO_UZP(sve_uzp_d, uint64_t, )
3079 #define DO_TRN(NAME, TYPE, H) \
3080 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3082 intptr_t oprsz = simd_oprsz(desc); \
3083 intptr_t odd_ofs = simd_data(desc); \
3085 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3086 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3087 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3088 *(TYPE *)(vd + H(i + 0)) = ae; \
3089 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3093 DO_TRN(sve_trn_b, uint8_t, H1)
3094 DO_TRN(sve_trn_h, uint16_t, H1_2)
3095 DO_TRN(sve_trn_s, uint32_t, H1_4)
3096 DO_TRN(sve_trn_d, uint64_t, )
3102 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3104 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3105 uint32_t *d = vd, *n = vn;
3108 for (i = j = 0; i < opr_sz; i++) {
3109 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3110 d[H4(j)] = n[H4(i)];
3114 for (; j < opr_sz; j++) {
3119 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3121 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3122 uint64_t *d = vd, *n = vn;
3125 for (i = j = 0; i < opr_sz; i++) {
3126 if (pg[H1(i)] & 1) {
3131 for (; j < opr_sz; j++) {
3136 /* Similar to the ARM LastActiveElement pseudocode function, except the
3137 * result is multiplied by the element size. This includes the not found
3138 * indication; e.g. not found for esz=3 is -8.
3140 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3142 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3143 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3145 return last_active_element(vg, words, esz);
3148 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3150 intptr_t opr_sz = simd_oprsz(desc) / 8;
3151 int esz = simd_data(desc);
3152 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3153 intptr_t i, first_i, last_i;
3156 first_i = last_i = 0;
3157 first_g = last_g = 0;
3159 /* Find the extent of the active elements within VG. */
3160 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3161 pg = *(uint64_t *)(vg + i) & mask;
3174 first_i = first_i * 8 + ctz64(first_g);
3175 last_i = last_i * 8 + 63 - clz64(last_g);
3176 len = last_i - first_i + (1 << esz);
3178 vm = memcpy(&tmp, vm, opr_sz * 8);
3180 swap_memmove(vd, vn + first_i, len);
3182 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3185 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3186 void *vg, uint32_t desc)
3188 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3189 uint64_t *d = vd, *n = vn, *m = vm;
3192 for (i = 0; i < opr_sz; i += 1) {
3193 uint64_t nn = n[i], mm = m[i];
3194 uint64_t pp = expand_pred_b(pg[H1(i)]);
3195 d[i] = (nn & pp) | (mm & ~pp);
3199 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3200 void *vg, uint32_t desc)
3202 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3203 uint64_t *d = vd, *n = vn, *m = vm;
3206 for (i = 0; i < opr_sz; i += 1) {
3207 uint64_t nn = n[i], mm = m[i];
3208 uint64_t pp = expand_pred_h(pg[H1(i)]);
3209 d[i] = (nn & pp) | (mm & ~pp);
3213 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3214 void *vg, uint32_t desc)
3216 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3217 uint64_t *d = vd, *n = vn, *m = vm;
3220 for (i = 0; i < opr_sz; i += 1) {
3221 uint64_t nn = n[i], mm = m[i];
3222 uint64_t pp = expand_pred_s(pg[H1(i)]);
3223 d[i] = (nn & pp) | (mm & ~pp);
3227 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3228 void *vg, uint32_t desc)
3230 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3231 uint64_t *d = vd, *n = vn, *m = vm;
3234 for (i = 0; i < opr_sz; i += 1) {
3235 uint64_t nn = n[i], mm = m[i];
3236 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3240 /* Two operand comparison controlled by a predicate.
3241 * ??? It is very tempting to expand this inline
3242 * with x86 instructions, e.g.
3244 * vcmpeqw zm, zn, %ymm0
3245 * vpmovmskb %ymm0, %eax
3249 * or even aarch64, e.g.
3251 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3252 * cmeq v0.8h, zn, zm
3253 * and v0.8h, v0.8h, mask
3257 * However, coming up with an abstraction that allows vector inputs and
3258 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3259 * scalar outputs, is tricky.
3261 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3262 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3264 intptr_t opr_sz = simd_oprsz(desc); \
3265 uint32_t flags = PREDTEST_INIT; \
3266 intptr_t i = opr_sz; \
3268 uint64_t out = 0, pg; \
3270 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3271 TYPE nn = *(TYPE *)(vn + H(i)); \
3272 TYPE mm = *(TYPE *)(vm + H(i)); \
3275 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3277 *(uint64_t *)(vd + (i >> 3)) = out; \
3278 flags = iter_predtest_bwd(out, pg, flags); \
3283 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3284 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3285 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3286 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3287 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3288 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3289 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3290 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
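/* The MASK constants above pick out the significant predicate bit of
 * each element: the predicate holds one bit per vector byte, and only
 * the lowest bit of each element's byte group is meaningful, hence
 * every bit for bytes, every 2nd bit for halfwords, every 4th for words
 * and every 8th for doublewords.
 */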
3292 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3293 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3294 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3295 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3297 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3298 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3299 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3300 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3302 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3303 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3304 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3305 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3307 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3308 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3309 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3310 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3312 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3313 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3314 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3315 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3317 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3318 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3319 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3320 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3322 #undef DO_CMP_PPZZ_B
3323 #undef DO_CMP_PPZZ_H
3324 #undef DO_CMP_PPZZ_S
3325 #undef DO_CMP_PPZZ_D
3328 /* Similar, but the second source is "wide". */
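/* Here "wide" means that ZM supplies a single 64-bit element per 64-bit
 * chunk of ZN, and that one value is compared against every TYPE-sized
 * element within the chunk, as the single TYPEW load in the inner loop
 * below shows.
 */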
3329 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3330 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3332 intptr_t opr_sz = simd_oprsz(desc); \
3333 uint32_t flags = PREDTEST_INIT; \
3334 intptr_t i = opr_sz; \
3336 uint64_t out = 0, pg; \
3338 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3340 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3341 TYPE nn = *(TYPE *)(vn + H(i)); \
3345 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3347 *(uint64_t *)(vd + (i >> 3)) = out; \
3348 flags = iter_predtest_bwd(out, pg, flags); \
3353 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3354 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3355 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3356 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3357 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3358 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3360 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3361 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3362 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3364 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3365 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3366 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3368 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3369 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3370 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3372 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3373 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3374 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3376 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3377 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3378 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3380 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3381 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3382 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3384 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3385 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3386 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3388 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3389 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3390 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3392 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3393 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3394 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3396 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3397 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3398 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3400 #undef DO_CMP_PPZW_B
3401 #undef DO_CMP_PPZW_H
3402 #undef DO_CMP_PPZW_S
3405 /* Similar, but the second source is immediate. */
3406 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3407 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3409 intptr_t opr_sz = simd_oprsz(desc); \
3410 uint32_t flags = PREDTEST_INIT; \
3411 TYPE mm = simd_data(desc); \
3412 intptr_t i = opr_sz; \
3414 uint64_t out = 0, pg; \
3416 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3417 TYPE nn = *(TYPE *)(vn + H(i)); \
3420 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3422 *(uint64_t *)(vd + (i >> 3)) = out; \
3423 flags = iter_predtest_bwd(out, pg, flags); \
3428 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3429 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3430 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3431 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3432 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3433 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3434 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3435 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3437 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3438 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3439 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3440 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3442 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3443 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3444 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3445 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3447 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3448 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3449 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3450 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3452 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3453 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3454 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3455 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3457 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3458 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3459 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3460 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3462 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3463 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3464 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3465 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3467 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3468 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3469 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3470 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3472 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3473 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3474 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3475 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3477 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3478 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3479 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3480 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3482 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3483 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3484 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3485 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3487 #undef DO_CMP_PPZI_B
3488 #undef DO_CMP_PPZI_H
3489 #undef DO_CMP_PPZI_S
3490 #undef DO_CMP_PPZI_D
3493 /* Similar to the ARM LastActive pseudocode function. */
3494 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3498 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3499 uint64_t pg = *(uint64_t *)(vg + i);
3501 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3507 /* Compute a mask into RETB that is true for all G, up to and including
3508 * (if after) or excluding (if !after) the first G & N.
3509 * Return true if BRK found.
3511 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3512 bool brk, bool after)
3518 } else if ((g & n) == 0) {
3519 /* For all G, no N are set; break not found. */
3522 /* Break somewhere in N. Locate it. */
3523 b = g & n; /* guard true, pred true */
3524 b = b & -b; /* first such */
3526 b = b | (b - 1); /* break after same */
3528 b = b - 1; /* break before same */
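/* Worked example (illustrative): with g == 0xff and n == 0x10 the first
 * guarded pred-true bit is bit 4, so b == 0x10; "break after" yields
 * the mask 0x1f (bit 4 included) and "break before" yields 0x0f
 * (bit 4 excluded).
 */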
3537 /* Compute a zeroing BRK. */
3538 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3539 intptr_t oprsz, bool after)
3544 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3545 uint64_t this_b, this_g = g[i];
3547 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3548 d[i] = this_b & this_g;
3552 /* Likewise, but also compute flags. */
3553 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3554 intptr_t oprsz, bool after)
3556 uint32_t flags = PREDTEST_INIT;
3560 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3561 uint64_t this_b, this_d, this_g = g[i];
3563 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3564 d[i] = this_d = this_b & this_g;
3565 flags = iter_predtest_fwd(this_d, this_g, flags);
3570 /* Compute a merging BRK. */
3571 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3572 intptr_t oprsz, bool after)
3577 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3578 uint64_t this_b, this_g = g[i];
3580 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3581 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3585 /* Likewise, but also compute flags. */
3586 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3587 intptr_t oprsz, bool after)
3589 uint32_t flags = PREDTEST_INIT;
3593 for (i = 0; i < oprsz / 8; ++i) {
3594 uint64_t this_b, this_d = d[i], this_g = g[i];
3596 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3597 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3598 flags = iter_predtest_fwd(this_d, this_g, flags);
3603 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3605 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3606 * The compiler should turn this into 4 64-bit integer stores.
3608 memset(d, 0, sizeof(ARMPredicateReg));
3609 return PREDTEST_INIT;
3612 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3615 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3616 if (last_active_pred(vn, vg, oprsz)) {
3617 compute_brk_z(vd, vm, vg, oprsz, true);
3623 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3626 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3627 if (last_active_pred(vn, vg, oprsz)) {
3628 return compute_brks_z(vd, vm, vg, oprsz, true);
3630 return do_zero(vd, oprsz);
3634 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3637 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3638 if (last_active_pred(vn, vg, oprsz)) {
3639 compute_brk_z(vd, vm, vg, oprsz, false);
3645 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
3648 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3649 if (last_active_pred(vn, vg, oprsz)) {
3650 return compute_brks_z(vd, vm, vg, oprsz, false);
3652 return do_zero(vd, oprsz);
3656 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3658 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3659 compute_brk_z(vd, vn, vg, oprsz, true);
3662 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3664 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3665 return compute_brks_z(vd, vn, vg, oprsz, true);
3668 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3670 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3671 compute_brk_z(vd, vn, vg, oprsz, false);
3674 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3676 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3677 return compute_brks_z(vd, vn, vg, oprsz, false);
3680 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3682 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3683 compute_brk_m(vd, vn, vg, oprsz, true);
3686 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3688 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3689 return compute_brks_m(vd, vn, vg, oprsz, true);
3692 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3694 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3695 compute_brk_m(vd, vn, vg, oprsz, false);
3698 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3700 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3701 return compute_brks_m(vd, vn, vg, oprsz, false);
3704 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3706 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3707 if (!last_active_pred(vn, vg, oprsz)) {
3712 /* As if PredTest(Ones(PL), D, esz). */
3713 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
3716 uint32_t flags = PREDTEST_INIT;
3719 for (i = 0; i < oprsz / 8; i++) {
3720 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
3723 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
3724 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
3729 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3731 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3732 if (last_active_pred(vn, vg, oprsz)) {
3733 return predtest_ones(vd, oprsz, -1);
3735 return do_zero(vd, oprsz);
3739 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
3741 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3742 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3743 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
3746 for (i = 0; i < words; ++i) {
3747 uint64_t t = n[i] & g[i] & mask;
3753 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
3755 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3756 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3757 uint64_t esz_mask = pred_esz_masks[esz];
3758 ARMPredicateReg *d = vd;
3762 /* Begin with a zero predicate register. */
3763 flags = do_zero(d, oprsz);
3768 /* Set all of the requested bits. */
3769 for (i = 0; i < count / 64; ++i) {
3773 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
3776 return predtest_ones(d, oprsz, esz_mask);
3779 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
3781 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3782 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3783 uint64_t esz_mask = pred_esz_masks[esz];
3784 ARMPredicateReg *d = vd;
3785 intptr_t i, invcount, oprbits;
3789 return do_zero(d, oprsz);
3792 oprbits = oprsz * 8;
3793 tcg_debug_assert(count <= oprbits);
3797 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
3800 invcount = oprbits - count;
3801 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
3806 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
3812 return predtest_ones(d, oprsz, esz_mask);
3815 /* Recursive reduction using a binary function;
3816 * cf. the ARM ARM function ReducePredicated.
3818 * While it would be possible to write this without the DATA temporary,
3819 * it is much simpler to process the predicate register this way.
3820 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
3821 * little to gain with a more complex non-recursive form.
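/* For example, a 2048-bit vector holds 128 fp16 elements and
 * log2(128) == 7, which is where the depth bound quoted above comes
 * from; inactive and trailing elements are filled with IDENT so that
 * they do not affect the result.
 */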
3823 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
3824 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
3829 uintptr_t half = n / 2; \
3830 TYPE lo = NAME##_reduce(data, status, half); \
3831 TYPE hi = NAME##_reduce(data + half, status, half); \
3832 return TYPE##_##FUNC(lo, hi, status); \
3835 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
3837 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
3838 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
3839 for (i = 0; i < oprsz; ) { \
3840 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3842 TYPE nn = *(TYPE *)(vn + H(i)); \
3843 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
3844 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
3847 for (; i < maxsz; i += sizeof(TYPE)) { \
3848 *(TYPE *)((void *)data + i) = IDENT; \
3850 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
3853 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
3854 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
3855 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
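/*
 * As a rough sketch of the expansion above,
 * DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) produces
 *
 *     static float32 sve_faddv_s_reduce(float32 *data, float_status *st,
 *                                       uintptr_t n)
 *     {
 *         if (n == 1) {
 *             return *data;
 *         }
 *         float32 lo = sve_faddv_s_reduce(data, st, n / 2);
 *         float32 hi = sve_faddv_s_reduce(data + n / 2, st, n / 2);
 *         return float32_add(lo, hi, st);
 *     }
 *
 * so e.g. a 256-bit vector sums its eight float32 lanes as a balanced
 * tree, ((d0+d1)+(d2+d3)) + ((d4+d5)+(d6+d7)), which is where the
 * depth-7 bound for 128 fp16 elements comes from.
 */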
3857 /* Identity is floatN_default_nan, without the function call. */
3858 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
3859 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
3860 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
3862 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
3863 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
3864 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
3866 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
3867 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
3868 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
3870 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
3871 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
3872 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
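/*
 * The IDENT argument fills inactive and trailing lanes of the temporary,
 * so it is chosen to leave the reduction unchanged: zero for FADDV,
 * +infinity for FMINV, -infinity for FMAXV, and the default NaN
 * (0x7e00, 0x7fc00000, 0x7ff8000000000000) for FMINNMV/FMAXNMV, since
 * minnum/maxnum ignore a quiet NaN operand when the other is a number.
 */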
3876 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
3877 void *status, uint32_t desc)
3879 intptr_t i = 0, opr_sz = simd_oprsz(desc);
3880 float16 result = nn;
3883 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
3886 float16 mm = *(float16 *)(vm + H1_2(i));
3887 result = float16_add(result, mm, status);
3889 i += sizeof(float16), pg >>= sizeof(float16);
3891 } while (i < opr_sz);
3896 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
3897 void *status, uint32_t desc)
3899 intptr_t i = 0, opr_sz = simd_oprsz(desc);
3900 float32 result = nn;
3903 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
3906 float32 mm = *(float32 *)(vm + H1_2(i));
3907 result = float32_add(result, mm, status);
3909 i += sizeof(float32), pg >>= sizeof(float32);
3911 } while (i < opr_sz);
3916 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3917 void *status, uint32_t desc)
3919 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3923 for (i = 0; i < opr_sz; i++) {
3924 if (pg[H1(i)] & 1) {
3925 nn = float64_add(nn, m[i], status);
/* Fully general three-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 */
3935 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3936 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3937 void *status, uint32_t desc) \
3939 intptr_t i = simd_oprsz(desc); \
3942 uint64_t pg = g[(i - 1) >> 6]; \
3944 i -= sizeof(TYPE); \
3945 if (likely((pg >> (i & 63)) & 1)) { \
3946 TYPE nn = *(TYPE *)(vn + H(i)); \
3947 TYPE mm = *(TYPE *)(vm + H(i)); \
3948 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3954 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3955 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3956 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3958 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3959 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3960 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3962 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3963 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3964 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3966 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3967 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3968 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3970 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3971 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3972 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3974 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3975 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3976 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3978 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3979 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3980 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3982 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3983 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3984 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3986 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3988 return float16_abs(float16_sub(a, b, s));
3991 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3993 return float32_abs(float32_sub(a, b, s));
3996 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3998 return float64_abs(float64_sub(a, b, s));
4001 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4002 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4003 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
4005 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4007 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4008 return float64_scalbn(a, b_int, s);
4011 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4012 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4013 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
4015 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4016 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4017 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
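/*
 * A rough picture of what one of these expands to -- e.g.
 * DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) -- is:
 *
 *     void HELPER(sve_fadd_s)(void *vd, void *vn, void *vm, void *vg,
 *                             void *status, uint32_t desc)
 *     {
 *         intptr_t i = simd_oprsz(desc);
 *         uint64_t *g = vg;
 *         do {
 *             uint64_t pg = g[(i - 1) >> 6];
 *             do {
 *                 i -= sizeof(uint32_t);
 *                 if (likely((pg >> (i & 63)) & 1)) {
 *                     uint32_t nn = *(uint32_t *)(vn + H1_4(i));
 *                     uint32_t mm = *(uint32_t *)(vm + H1_4(i));
 *                     *(uint32_t *)(vd + H1_4(i)) =
 *                         float32_add(nn, mm, status);
 *                 }
 *             } while (i & 63);
 *         } while (i != 0);
 *     }
 *
 * i.e. the vector is walked from the top down, reloading 64 predicate
 * bits per outer iteration and testing one bit per element.
 */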
4021 /* Three-operand expander, with one scalar operand, controlled by
4022 * a predicate, with the extra float_status parameter.
4024 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4025 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4026 void *status, uint32_t desc) \
4028 intptr_t i = simd_oprsz(desc); \
4032 uint64_t pg = g[(i - 1) >> 6]; \
4034 i -= sizeof(TYPE); \
4035 if (likely((pg >> (i & 63)) & 1)) { \
4036 TYPE nn = *(TYPE *)(vn + H(i)); \
4037 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4043 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4044 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4045 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
4047 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4048 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4049 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
4051 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4052 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4053 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
4055 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4057 return float16_sub(b, a, s);
4060 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4062 return float32_sub(b, a, s);
4065 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4067 return float64_sub(b, a, s);
4070 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4071 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4072 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
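/*
 * Note the operand order: in DO_ZPZS_FP the vector element arrives as the
 * first argument and the scalar as the second, so the subr_* helpers above
 * compute scalar - element, matching FSUBR.
 */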
4074 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4075 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4076 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
4078 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4079 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4080 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
4082 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4083 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4084 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
4086 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4087 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4088 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
/* Fully general two-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 */
4093 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4094 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4096 intptr_t i = simd_oprsz(desc); \
4099 uint64_t pg = g[(i - 1) >> 6]; \
4101 i -= sizeof(TYPE); \
4102 if (likely((pg >> (i & 63)) & 1)) { \
4103 TYPE nn = *(TYPE *)(vn + H(i)); \
4104 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4110 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4111 * FZ16. When converting from fp16, this affects flushing input denormals;
4112 * when converting to fp16, this affects flushing output denormals.
4114 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4116 bool save = get_flush_inputs_to_zero(fpst);
4119 set_flush_inputs_to_zero(false, fpst);
4120 ret = float16_to_float32(f, true, fpst);
4121 set_flush_inputs_to_zero(save, fpst);
4125 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4127 bool save = get_flush_inputs_to_zero(fpst);
4130 set_flush_inputs_to_zero(false, fpst);
4131 ret = float16_to_float64(f, true, fpst);
4132 set_flush_inputs_to_zero(save, fpst);
4136 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4138 bool save = get_flush_to_zero(fpst);
4141 set_flush_to_zero(false, fpst);
4142 ret = float32_to_float16(f, true, fpst);
4143 set_flush_to_zero(save, fpst);
4147 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4149 bool save = get_flush_to_zero(fpst);
4152 set_flush_to_zero(false, fpst);
4153 ret = float64_to_float16(f, true, fpst);
4154 set_flush_to_zero(save, fpst);
4158 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4160 if (float16_is_any_nan(f)) {
float_raise(float_flag_invalid, s);
return 0;
}
4164 return float16_to_int16_round_to_zero(f, s);
4167 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4169 if (float16_is_any_nan(f)) {
float_raise(float_flag_invalid, s);
return 0;
}
4173 return float16_to_int64_round_to_zero(f, s);
4176 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4178 if (float32_is_any_nan(f)) {
float_raise(float_flag_invalid, s);
return 0;
}
4182 return float32_to_int64_round_to_zero(f, s);
4185 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4187 if (float64_is_any_nan(f)) {
float_raise(float_flag_invalid, s);
return 0;
}
4191 return float64_to_int64_round_to_zero(f, s);
4194 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4196 if (float16_is_any_nan(f)) {
float_raise(float_flag_invalid, s);
return 0;
}
4200 return float16_to_uint16_round_to_zero(f, s);
4203 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4205 if (float16_is_any_nan(f)) {
float_raise(float_flag_invalid, s);
return 0;
}
4209 return float16_to_uint64_round_to_zero(f, s);
4212 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4214 if (float32_is_any_nan(f)) {
float_raise(float_flag_invalid, s);
return 0;
}
4218 return float32_to_uint64_round_to_zero(f, s);
4221 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4223 if (float64_is_any_nan(f)) {
float_raise(float_flag_invalid, s);
return 0;
}
4227 return float64_to_uint64_round_to_zero(f, s);
4230 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4231 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4232 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
4233 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
4234 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
4235 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
4237 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4238 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4239 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4240 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
4241 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
4242 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
4243 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
4245 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4246 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4247 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4248 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
4249 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
4250 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
4251 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
4253 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4254 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4255 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
4257 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4258 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4259 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
4261 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4262 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4263 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
4265 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4266 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4267 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
4269 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4270 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4271 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4272 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
4273 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
4274 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
4275 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
4277 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4278 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4279 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4280 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
4281 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
4282 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
4283 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
4287 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4288 float_status *status, uint32_t desc,
4289 uint16_t neg1, uint16_t neg3)
4291 intptr_t i = simd_oprsz(desc);
4295 uint64_t pg = g[(i - 1) >> 6];
4298 if (likely((pg >> (i & 63)) & 1)) {
4299 float16 e1, e2, e3, r;
4301 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4302 e2 = *(uint16_t *)(vm + H1_2(i));
4303 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4304 r = float16_muladd(e1, e2, e3, 0, status);
4305 *(uint16_t *)(vd + H1_2(i)) = r;
4311 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4312 void *vg, void *status, uint32_t desc)
4314 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4317 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4318 void *vg, void *status, uint32_t desc)
4320 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4323 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4324 void *vg, void *status, uint32_t desc)
4326 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4329 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4330 void *vg, void *status, uint32_t desc)
4332 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4335 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4336 float_status *status, uint32_t desc,
4337 uint32_t neg1, uint32_t neg3)
4339 intptr_t i = simd_oprsz(desc);
4343 uint64_t pg = g[(i - 1) >> 6];
4346 if (likely((pg >> (i & 63)) & 1)) {
4347 float32 e1, e2, e3, r;
4349 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4350 e2 = *(uint32_t *)(vm + H1_4(i));
4351 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4352 r = float32_muladd(e1, e2, e3, 0, status);
4353 *(uint32_t *)(vd + H1_4(i)) = r;
4359 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4360 void *vg, void *status, uint32_t desc)
4362 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4365 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4366 void *vg, void *status, uint32_t desc)
4368 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4371 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4372 void *vg, void *status, uint32_t desc)
4374 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4377 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4378 void *vg, void *status, uint32_t desc)
4380 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4383 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4384 float_status *status, uint32_t desc,
4385 uint64_t neg1, uint64_t neg3)
4387 intptr_t i = simd_oprsz(desc);
4391 uint64_t pg = g[(i - 1) >> 6];
4394 if (likely((pg >> (i & 63)) & 1)) {
4395 float64 e1, e2, e3, r;
4397 e1 = *(uint64_t *)(vn + i) ^ neg1;
4398 e2 = *(uint64_t *)(vm + i);
4399 e3 = *(uint64_t *)(va + i) ^ neg3;
4400 r = float64_muladd(e1, e2, e3, 0, status);
4401 *(uint64_t *)(vd + i) = r;
4407 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4408 void *vg, void *status, uint32_t desc)
4410 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4413 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4414 void *vg, void *status, uint32_t desc)
4416 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4419 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4420 void *vg, void *status, uint32_t desc)
4422 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4425 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4426 void *vg, void *status, uint32_t desc)
4428 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4431 /* Two operand floating-point comparison controlled by a predicate.
4432 * Unlike the integer version, we are not allowed to optimistically
 * compare operands, since the comparison may have side effects with
 * respect to the FPSR.
 */
4436 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4437 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4438 void *status, uint32_t desc) \
4440 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4441 uint64_t *d = vd, *g = vg; \
4443 uint64_t out = 0, pg = g[j]; \
4445 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4446 if (likely((pg >> (i & 63)) & 1)) { \
4447 TYPE nn = *(TYPE *)(vn + H(i)); \
4448 TYPE mm = *(TYPE *)(vm + H(i)); \
4449 out |= OP(TYPE, nn, mm, status); \
4456 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4457 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4458 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4459 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4460 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4461 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
4463 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4464 DO_FPCMP_PPZZ_H(NAME, OP) \
4465 DO_FPCMP_PPZZ_S(NAME, OP) \
4466 DO_FPCMP_PPZZ_D(NAME, OP)
4468 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4469 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4470 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4471 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4472 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4473 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4474 #define DO_FCMUO(TYPE, X, Y, ST) \
4475 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4476 #define DO_FACGE(TYPE, X, Y, ST) \
4477 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4478 #define DO_FACGT(TYPE, X, Y, ST) \
4479 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4481 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4482 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4483 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4484 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4485 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4486 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4487 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4489 #undef DO_FPCMP_PPZZ_ALL
4490 #undef DO_FPCMP_PPZZ_D
4491 #undef DO_FPCMP_PPZZ_S
4492 #undef DO_FPCMP_PPZZ_H
4493 #undef DO_FPCMP_PPZZ
/* One operand floating-point comparison against zero, controlled
 * by a predicate.
 */
4498 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4499 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4500 void *status, uint32_t desc) \
4502 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4503 uint64_t *d = vd, *g = vg; \
4505 uint64_t out = 0, pg = g[j]; \
4507 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4508 if ((pg >> (i & 63)) & 1) { \
4509 TYPE nn = *(TYPE *)(vn + H(i)); \
4510 out |= OP(TYPE, nn, 0, status); \
4517 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4518 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4519 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4520 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4521 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4522 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
4524 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4525 DO_FPCMP_PPZ0_H(NAME, OP) \
4526 DO_FPCMP_PPZ0_S(NAME, OP) \
4527 DO_FPCMP_PPZ0_D(NAME, OP)
4529 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4530 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4531 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4532 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4533 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4534 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4536 /* FP Trig Multiply-Add. */
4538 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4540 static const float16 coeff[16] = {
4541 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4542 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4544 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4545 intptr_t x = simd_data(desc);
4546 float16 *d = vd, *n = vn, *m = vm;
for (i = 0; i < opr_sz; i++) {
    float16 mm = m[i];
    intptr_t xx = x;
4550 if (float16_is_neg(mm)) {
mm = float16_abs(mm);
xx += 8;
}
4554 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
4558 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4560 static const float32 coeff[16] = {
4561 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
4562 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
4563 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
4564 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
4566 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
4567 intptr_t x = simd_data(desc);
4568 float32 *d = vd, *n = vn, *m = vm;
for (i = 0; i < opr_sz; i++) {
    float32 mm = m[i];
    intptr_t xx = x;
4572 if (float32_is_neg(mm)) {
mm = float32_abs(mm);
xx += 8;
}
4576 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
4580 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4582 static const float64 coeff[16] = {
4583 0x3ff0000000000000ull, 0xbfc5555555555543ull,
4584 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
4585 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
4586 0x3de5d8408868552full, 0x0000000000000000ull,
4587 0x3ff0000000000000ull, 0xbfe0000000000000ull,
4588 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
4589 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
4590 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
4592 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
4593 intptr_t x = simd_data(desc);
4594 float64 *d = vd, *n = vn, *m = vm;
for (i = 0; i < opr_sz; i++) {
    float64 mm = m[i];
    intptr_t xx = x;
4598 if (float64_is_neg(mm)) {
mm = float64_abs(mm);
xx += 8;
}
4602 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
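/*
 * The coeff[] tables above hold the FTMAD series coefficients: entries
 * 0..7 are successive terms of the sine expansion (1, -1/6, 1/120,
 * -1/5040, ...) and entries 8..15 those of the cosine expansion
 * (1, -1/2, 1/24, -1/720, ...), rounded to the element precision.
 * The immediate selects the term; a negative second operand has its
 * sign stripped and switches to the cosine half of the table (xx += 8).
 */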
4610 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
4611 void *vs, uint32_t desc)
4613 intptr_t j, i = simd_oprsz(desc);
4615 float16 neg_imag = float16_set_sign(0, simd_data(desc));
4616 float16 neg_real = float16_chs(neg_imag);
4619 uint64_t pg = g[(i - 1) >> 6];
4621 float16 e0, e1, e2, e3;
4623 /* I holds the real index; J holds the imag index. */
4624 j = i - sizeof(float16);
4625 i -= 2 * sizeof(float16);
4627 e0 = *(float16 *)(vn + H1_2(i));
4628 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
4629 e2 = *(float16 *)(vn + H1_2(j));
4630 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
4632 if (likely((pg >> (i & 63)) & 1)) {
4633 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
4635 if (likely((pg >> (j & 63)) & 1)) {
4636 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
4642 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
4643 void *vs, uint32_t desc)
4645 intptr_t j, i = simd_oprsz(desc);
4647 float32 neg_imag = float32_set_sign(0, simd_data(desc));
4648 float32 neg_real = float32_chs(neg_imag);
4651 uint64_t pg = g[(i - 1) >> 6];
4653 float32 e0, e1, e2, e3;
4655 /* I holds the real index; J holds the imag index. */
4656 j = i - sizeof(float32);
4657 i -= 2 * sizeof(float32);
4659 e0 = *(float32 *)(vn + H1_2(i));
4660 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
4661 e2 = *(float32 *)(vn + H1_2(j));
4662 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
4664 if (likely((pg >> (i & 63)) & 1)) {
4665 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
4667 if (likely((pg >> (j & 63)) & 1)) {
4668 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
4674 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
4675 void *vs, uint32_t desc)
4677 intptr_t j, i = simd_oprsz(desc);
4679 float64 neg_imag = float64_set_sign(0, simd_data(desc));
4680 float64 neg_real = float64_chs(neg_imag);
4683 uint64_t pg = g[(i - 1) >> 6];
4685 float64 e0, e1, e2, e3;
4687 /* I holds the real index; J holds the imag index. */
4688 j = i - sizeof(float64);
4689 i -= 2 * sizeof(float64);
4691 e0 = *(float64 *)(vn + H1_2(i));
4692 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
4693 e2 = *(float64 *)(vn + H1_2(j));
4694 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
4696 if (likely((pg >> (i & 63)) & 1)) {
4697 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
4699 if (likely((pg >> (j & 63)) & 1)) {
4700 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
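/*
 * For all three fcadd widths above, simd_data(desc) selects the FCADD
 * rotation; XOR-ing with a value whose only set bit is the sign bit is a
 * cheap change of sign on already-loaded raw bits.  When simd_data(desc)
 * is 0 the result is (n.real - m.imag, n.imag + m.real), the FCADD #90
 * form; when it is 1 the two signs flip, giving (n.real + m.imag,
 * n.imag - m.real), the #270 form.
 */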
4707 * FP Complex Multiply
4710 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4711 void *vg, void *status, uint32_t desc)
4713 intptr_t j, i = simd_oprsz(desc);
4714 unsigned rot = simd_data(desc);
4715 bool flip = rot & 1;
4716 float16 neg_imag, neg_real;
4719 neg_imag = float16_set_sign(0, (rot & 2) != 0);
4720 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
4723 uint64_t pg = g[(i - 1) >> 6];
4725 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
4727 /* I holds the real index; J holds the imag index. */
4728 j = i - sizeof(float16);
4729 i -= 2 * sizeof(float16);
4731 nr = *(float16 *)(vn + H1_2(i));
4732 ni = *(float16 *)(vn + H1_2(j));
4733 mr = *(float16 *)(vm + H1_2(i));
4734 mi = *(float16 *)(vm + H1_2(j));
4736 e2 = (flip ? ni : nr);
e1 = (flip ? mi : mr) ^ neg_real;
e4 = e2;
4739 e3 = (flip ? mr : mi) ^ neg_imag;
4741 if (likely((pg >> (i & 63)) & 1)) {
4742 d = *(float16 *)(va + H1_2(i));
4743 d = float16_muladd(e2, e1, d, 0, status);
4744 *(float16 *)(vd + H1_2(i)) = d;
4746 if (likely((pg >> (j & 63)) & 1)) {
4747 d = *(float16 *)(va + H1_2(j));
4748 d = float16_muladd(e4, e3, d, 0, status);
4749 *(float16 *)(vd + H1_2(j)) = d;
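/*
 * For sve_fcmla_zpzzz_{h,s,d}, rot = simd_data(desc) encodes the FCMLA
 * rotation.  Working flip/neg_real/neg_imag through the two fused
 * multiply-adds per complex pair gives:
 *   rot 0:  d.real += n.real * m.real;   d.imag += n.real * m.imag
 *   rot 1:  d.real -= n.imag * m.imag;   d.imag += n.imag * m.real
 *   rot 2:  d.real -= n.real * m.real;   d.imag -= n.real * m.imag
 *   rot 3:  d.real += n.imag * m.imag;   d.imag -= n.imag * m.real
 */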
4755 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4756 void *vg, void *status, uint32_t desc)
4758 intptr_t j, i = simd_oprsz(desc);
4759 unsigned rot = simd_data(desc);
4760 bool flip = rot & 1;
4761 float32 neg_imag, neg_real;
4764 neg_imag = float32_set_sign(0, (rot & 2) != 0);
4765 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
4768 uint64_t pg = g[(i - 1) >> 6];
4770 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
4772 /* I holds the real index; J holds the imag index. */
4773 j = i - sizeof(float32);
4774 i -= 2 * sizeof(float32);
4776 nr = *(float32 *)(vn + H1_2(i));
4777 ni = *(float32 *)(vn + H1_2(j));
4778 mr = *(float32 *)(vm + H1_2(i));
4779 mi = *(float32 *)(vm + H1_2(j));
4781 e2 = (flip ? ni : nr);
e1 = (flip ? mi : mr) ^ neg_real;
e4 = e2;
4784 e3 = (flip ? mr : mi) ^ neg_imag;
4786 if (likely((pg >> (i & 63)) & 1)) {
4787 d = *(float32 *)(va + H1_2(i));
4788 d = float32_muladd(e2, e1, d, 0, status);
4789 *(float32 *)(vd + H1_2(i)) = d;
4791 if (likely((pg >> (j & 63)) & 1)) {
4792 d = *(float32 *)(va + H1_2(j));
4793 d = float32_muladd(e4, e3, d, 0, status);
4794 *(float32 *)(vd + H1_2(j)) = d;
4800 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4801 void *vg, void *status, uint32_t desc)
4803 intptr_t j, i = simd_oprsz(desc);
4804 unsigned rot = simd_data(desc);
4805 bool flip = rot & 1;
4806 float64 neg_imag, neg_real;
4809 neg_imag = float64_set_sign(0, (rot & 2) != 0);
4810 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
4813 uint64_t pg = g[(i - 1) >> 6];
4815 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
4817 /* I holds the real index; J holds the imag index. */
4818 j = i - sizeof(float64);
4819 i -= 2 * sizeof(float64);
4821 nr = *(float64 *)(vn + H1_2(i));
4822 ni = *(float64 *)(vn + H1_2(j));
4823 mr = *(float64 *)(vm + H1_2(i));
4824 mi = *(float64 *)(vm + H1_2(j));
4826 e2 = (flip ? ni : nr);
e1 = (flip ? mi : mr) ^ neg_real;
e4 = e2;
4829 e3 = (flip ? mr : mi) ^ neg_imag;
4831 if (likely((pg >> (i & 63)) & 1)) {
4832 d = *(float64 *)(va + H1_2(i));
4833 d = float64_muladd(e2, e1, d, 0, status);
4834 *(float64 *)(vd + H1_2(i)) = d;
4836 if (likely((pg >> (j & 63)) & 1)) {
4837 d = *(float64 *)(va + H1_2(j));
4838 d = float64_muladd(e4, e3, d, 0, status);
4839 *(float64 *)(vd + H1_2(j)) = d;
4846 * Load contiguous data, protected by a governing predicate.
4850 * Load one element into @vd + @reg_off from @host.
4851 * The controlling predicate is known to be true.
4853 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
4856 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
4857 * The controlling predicate is known to be true.
4859 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
4860 target_ulong vaddr, uintptr_t retaddr);
4863 * Generate the above primitives.
4866 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4867 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
4869 TYPEM val = HOST(host); \
4870 *(TYPEE *)(vd + H(reg_off)) = val; \
4873 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4874 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
4875 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
4877 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
4878 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4879 target_ulong addr, uintptr_t ra) \
4881 *(TYPEE *)(vd + H(reg_off)) = \
4882 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
4885 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
4886 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4887 target_ulong addr, uintptr_t ra) \
4889 TLB(env, useronly_clean_ptr(addr), \
4890 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
4893 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
4894 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
4895 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
4897 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4898 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4899 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4900 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4901 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4902 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4903 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
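/*
 * For instance, DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) defines
 * (roughly)
 *
 *     static void sve_ld1bhu_host(void *vd, intptr_t reg_off, void *host)
 *     {
 *         uint8_t val = ldub_p(host);
 *         *(uint16_t *)(vd + H1_2(reg_off)) = val;
 *     }
 *
 *     static void sve_ld1bhu_tlb(CPUARMState *env, void *vd,
 *                                intptr_t reg_off, target_ulong addr,
 *                                uintptr_t ra)
 *     {
 *         *(uint16_t *)(vd + H1_2(reg_off)) =
 *             (uint8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
 *     }
 *
 * i.e. one byte loaded and zero-extended into each halfword element.
 */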
4905 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
4906 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
4907 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
4909 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
4910 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
4911 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
4912 DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
4914 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
4915 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
4916 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
4917 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
4918 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
4920 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
4921 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
4922 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
4923 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
4924 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
4926 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
4927 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
4928 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
4929 DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
4930 DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
4932 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
4933 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
4934 DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
4936 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
4937 DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
4938 DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
4940 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4941 DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4943 DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4944 DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
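/*
 * Naming: ld1/st1 is followed by the memory size and then the element
 * size, with a signedness suffix when they differ.  E.g. ld1bhu loads
 * bytes zero-extended into halfword elements, ld1hds loads halfwords
 * sign-extended into doubleword elements, and st1sd stores the low
 * 32 bits of each doubleword element.
 */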
4955 * Skip through a sequence of inactive elements in the guarding predicate @vg,
 * beginning at @reg_off bounded by @reg_max.  Return the offset of the first
 * active element >= @reg_off, or @reg_max if there were no active elements.
 */
4959 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4960 intptr_t reg_max, int esz)
4962 uint64_t pg_mask = pred_esz_masks[esz];
4963 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4965 /* In normal usage, the first element is active. */
4966 if (likely(pg & 1)) {
4974 if (unlikely(reg_off >= reg_max)) {
4975 /* The entire predicate was false. */
4978 pg = vg[reg_off >> 6] & pg_mask;
4981 reg_off += ctz64(pg);
4983 /* We should never see an out of range predicate bit set. */
tcg_debug_assert(reg_off < reg_max);
return reg_off;
}
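/*
 * For example, with esz == MO_32 only every fourth predicate bit is
 * significant, so starting from reg_off == 4 with the next such bit set
 * at bit 16, the scan above returns 16; with no further bits set it
 * returns reg_max.
 */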
4989 * Resolve the guest virtual address to info->host and info->flags.
4990 * If @nofault, return false if the page is invalid, otherwise
4991 * exit via page fault exception.
5000 static bool sve_probe_page(SVEHostPage *info, bool nofault,
5001 CPUARMState *env, target_ulong addr,
5002 int mem_off, MMUAccessType access_type,
5003 int mmu_idx, uintptr_t retaddr)
5010 * User-only currently always issues with TBI. See the comment
5011 * above useronly_clean_ptr. Usually we clean this top byte away
 * during translation, but we can't do that for e.g. vector + imm
 * addressing modes.
 *
5015 * We currently always enable TBI for user-only, and do not provide
5016 * a way to turn it off. So clean the pointer unconditionally here,
5017 * rather than look it up here, or pass it down from above.
5019 addr = useronly_clean_ptr(addr);
5021 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5022 &info->host, retaddr);
5023 info->flags = flags;
5025 if (flags & TLB_INVALID_MASK) {
5030 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5031 info->host -= mem_off;
5033 #ifdef CONFIG_USER_ONLY
5034 memset(&info->attrs, 0, sizeof(info->attrs));
5037 * Find the iotlbentry for addr and return the transaction attributes.
5038 * This *must* be present in the TLB because we just found the mapping.
5041 uintptr_t index = tlb_index(env, mmu_idx, addr);
5043 # ifdef CONFIG_DEBUG_TCG
5044 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5045 target_ulong comparator = (access_type == MMU_DATA_LOAD
5047 : tlb_addr_write(entry));
5048 g_assert(tlb_hit(comparator, addr));
5051 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5052 info->attrs = iotlbentry->attrs;
5061 * Analyse contiguous data, protected by a governing predicate.
5072 * First and last element wholly contained within the two pages.
5073 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5074 * reg_off_last[0] may be < 0 if the first element crosses pages.
5075 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5076 * are set >= 0 only if there are complete elements on a second page.
5078 * The reg_off_* offsets are relative to the internal vector register.
5079 * The mem_off_first offset is relative to the memory address; the
5080 * two offsets are different when a load operation extends, a store
5081 * operation truncates, or for multi-register operations.
5083 int16_t mem_off_first[2];
5084 int16_t reg_off_first[2];
5085 int16_t reg_off_last[2];
5088 * One element that is misaligned and spans both pages,
5089 * or -1 if there is no such active element.
5091 int16_t mem_off_split;
5092 int16_t reg_off_split;
5095 * The byte offset at which the entire operation crosses a page boundary.
5096 * Set >= 0 if and only if the entire operation spans two pages.
5100 /* TLB data for the two pages. */
5101 SVEHostPage page[2];
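/*
 * A worked example of the fields above, filled in by
 * sve_cont_ldst_elements() below and assuming every element is active:
 * an LD1W whose 32 bytes of data begin 12 bytes before a page boundary
 * has page_split == 12.  The words at register offsets 0, 4 and 8 fit
 * on the first page, so reg_off_last[0] == 8; the boundary falls on an
 * element boundary, so mem_off_split stays -1; the second page then
 * covers reg_off_first[1] == 12 through reg_off_last[1] == 28.
 */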
5105 * Find first active element on each page, and a loose bound for the
5106 * final element on each page. Identify any single element that spans
5107 * the page boundary. Return true if there are any active elements.
5109 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
5110 uint64_t *vg, intptr_t reg_max,
5113 const int esize = 1 << esz;
5114 const uint64_t pg_mask = pred_esz_masks[esz];
5115 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5116 intptr_t mem_off_last, mem_off_split;
5117 intptr_t page_split, elt_split;
5120 /* Set all of the element indices to -1, and the TLB data to 0. */
5121 memset(info, -1, offsetof(SVEContLdSt, page));
5122 memset(info->page, 0, sizeof(info->page));
5124 /* Gross scan over the entire predicate to find bounds. */
5127 uint64_t pg = vg[i] & pg_mask;
5129 reg_off_last = i * 64 + 63 - clz64(pg);
5130 if (reg_off_first < 0) {
5131 reg_off_first = i * 64 + ctz64(pg);
5134 } while (++i * 64 < reg_max);
5136 if (unlikely(reg_off_first < 0)) {
5137 /* No active elements, no pages touched. */
5140 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5142 info->reg_off_first[0] = reg_off_first;
5143 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5144 mem_off_last = (reg_off_last >> esz) * msize;
5146 page_split = -(addr | TARGET_PAGE_MASK);
5147 if (likely(mem_off_last + msize <= page_split)) {
5148 /* The entire operation fits within a single page. */
5149 info->reg_off_last[0] = reg_off_last;
5153 info->page_split = page_split;
5154 elt_split = page_split / msize;
5155 reg_off_split = elt_split << esz;
5156 mem_off_split = elt_split * msize;
5159 * This is the last full element on the first page, but it is not
5160 * necessarily active. If there is no full element, i.e. the first
5161 * active element is the one that's split, this value remains -1.
 * It is still useful as an iteration bound.
 */
5164 if (elt_split != 0) {
5165 info->reg_off_last[0] = reg_off_split - esize;
5168 /* Determine if an unaligned element spans the pages. */
5169 if (page_split % msize != 0) {
5170 /* It is helpful to know if the split element is active. */
5171 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5172 info->reg_off_split = reg_off_split;
5173 info->mem_off_split = mem_off_split;
5175 if (reg_off_split == reg_off_last) {
5176 /* The page crossing element is last. */
5180 reg_off_split += esize;
5181 mem_off_split += msize;
5185 * We do want the first active element on the second page, because
5186 * this may affect the address reported in an exception.
5188 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5189 tcg_debug_assert(reg_off_split <= reg_off_last);
5190 info->reg_off_first[1] = reg_off_split;
5191 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5192 info->reg_off_last[1] = reg_off_last;
5197 * Resolve the guest virtual addresses to info->page[].
5198 * Control the generation of page faults with @fault. Return false if
5199 * there is no work to do, which can only happen with @fault == FAULT_NO.
5201 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5202 CPUARMState *env, target_ulong addr,
5203 MMUAccessType access_type, uintptr_t retaddr)
5205 int mmu_idx = cpu_mmu_index(env, false);
5206 int mem_off = info->mem_off_first[0];
5207 bool nofault = fault == FAULT_NO;
5208 bool have_work = true;
5210 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5211 access_type, mmu_idx, retaddr)) {
5212 /* No work to be done. */
5216 if (likely(info->page_split < 0)) {
5217 /* The entire operation was on the one page. */
5222 * If the second page is invalid, then we want the fault address to be
5223 * the first byte on that page which is accessed.
5225 if (info->mem_off_split >= 0) {
5227 * There is an element split across the pages. The fault address
5228 * should be the first byte of the second page.
5230 mem_off = info->page_split;
5232 * If the split element is also the first active element
5233 * of the vector, then: For first-fault we should continue
5234 * to generate faults for the second page. For no-fault,
5235 * we have work only if the second page is valid.
5237 if (info->mem_off_first[0] < info->mem_off_split) {
5238 nofault = FAULT_FIRST;
5243 * There is no element split across the pages. The fault address
5244 * should be the first active element on the second page.
5246 mem_off = info->mem_off_first[1];
5248 * There must have been one active element on the first page,
5249 * so we're out of first-fault territory.
5251 nofault = fault != FAULT_ALL;
5254 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5255 access_type, mmu_idx, retaddr);
5259 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5260 uint64_t *vg, target_ulong addr,
5261 int esize, int msize, int wp_access,
5264 #ifndef CONFIG_USER_ONLY
5265 intptr_t mem_off, reg_off, reg_last;
5266 int flags0 = info->page[0].flags;
5267 int flags1 = info->page[1].flags;
5269 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5273 /* Indicate that watchpoints are handled. */
5274 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5275 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5277 if (flags0 & TLB_WATCHPOINT) {
5278 mem_off = info->mem_off_first[0];
5279 reg_off = info->reg_off_first[0];
5280 reg_last = info->reg_off_last[0];
5282 while (reg_off <= reg_last) {
5283 uint64_t pg = vg[reg_off >> 6];
5285 if ((pg >> (reg_off & 63)) & 1) {
5286 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5287 msize, info->page[0].attrs,
5288 wp_access, retaddr);
5292 } while (reg_off <= reg_last && (reg_off & 63));
5296 mem_off = info->mem_off_split;
5298 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5299 info->page[0].attrs, wp_access, retaddr);
5302 mem_off = info->mem_off_first[1];
5303 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5304 reg_off = info->reg_off_first[1];
5305 reg_last = info->reg_off_last[1];
5308 uint64_t pg = vg[reg_off >> 6];
5310 if ((pg >> (reg_off & 63)) & 1) {
5311 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5312 msize, info->page[1].attrs,
5313 wp_access, retaddr);
5317 } while (reg_off & 63);
5318 } while (reg_off <= reg_last);
5323 static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5324 uint64_t *vg, target_ulong addr, int esize,
5325 int msize, uint32_t mtedesc, uintptr_t ra)
5327 intptr_t mem_off, reg_off, reg_last;
5329 /* Process the page only if MemAttr == Tagged. */
5330 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5331 mem_off = info->mem_off_first[0];
5332 reg_off = info->reg_off_first[0];
5333 reg_last = info->reg_off_split;
5335 reg_last = info->reg_off_last[0];
5339 uint64_t pg = vg[reg_off >> 6];
5341 if ((pg >> (reg_off & 63)) & 1) {
5342 mte_check(env, mtedesc, addr, ra);
5346 } while (reg_off <= reg_last && (reg_off & 63));
5347 } while (reg_off <= reg_last);
5350 mem_off = info->mem_off_first[1];
5351 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5352 reg_off = info->reg_off_first[1];
5353 reg_last = info->reg_off_last[1];
5356 uint64_t pg = vg[reg_off >> 6];
5358 if ((pg >> (reg_off & 63)) & 1) {
5359 mte_check(env, mtedesc, addr, ra);
5363 } while (reg_off & 63);
5364 } while (reg_off <= reg_last);
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
 */
5371 static inline QEMU_ALWAYS_INLINE
5372 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5373 uint32_t desc, const uintptr_t retaddr,
5374 const int esz, const int msz, const int N, uint32_t mtedesc,
5375 sve_ldst1_host_fn *host_fn,
5376 sve_ldst1_tlb_fn *tlb_fn)
5378 const unsigned rd = simd_data(desc);
5379 const intptr_t reg_max = simd_oprsz(desc);
5380 intptr_t reg_off, reg_last, mem_off;
5385 /* Find the active elements. */
5386 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5387 /* The entire predicate was false; no load occurs. */
5388 for (i = 0; i < N; ++i) {
5389 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5394 /* Probe the page(s). Exit with exception for any invalid page. */
5395 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5397 /* Handle watchpoints for all active elements. */
5398 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5399 BP_MEM_READ, retaddr);
5402 * Handle mte checks for all active elements.
5403 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5406 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5410 flags = info.page[0].flags | info.page[1].flags;
5411 if (unlikely(flags != 0)) {
5412 #ifdef CONFIG_USER_ONLY
5413 g_assert_not_reached();
5416 * At least one page includes MMIO.
5417 * Any bus operation can fail with cpu_transaction_failed,
5418 * which for ARM will raise SyncExternal. Perform the load
5419 * into scratch memory to preserve register state until the end.
5421 ARMVectorReg scratch[4] = { };
5423 mem_off = info.mem_off_first[0];
5424 reg_off = info.reg_off_first[0];
5425 reg_last = info.reg_off_last[1];
5427 reg_last = info.reg_off_split;
5429 reg_last = info.reg_off_last[0];
5434 uint64_t pg = vg[reg_off >> 6];
5436 if ((pg >> (reg_off & 63)) & 1) {
5437 for (i = 0; i < N; ++i) {
5438 tlb_fn(env, &scratch[i], reg_off,
5439 addr + mem_off + (i << msz), retaddr);
5442 reg_off += 1 << esz;
5443 mem_off += N << msz;
5444 } while (reg_off & 63);
5445 } while (reg_off <= reg_last);
5447 for (i = 0; i < N; ++i) {
5448 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5454 /* The entire operation is in RAM, on valid pages. */
5456 for (i = 0; i < N; ++i) {
5457 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5460 mem_off = info.mem_off_first[0];
5461 reg_off = info.reg_off_first[0];
5462 reg_last = info.reg_off_last[0];
5463 host = info.page[0].host;
5465 while (reg_off <= reg_last) {
5466 uint64_t pg = vg[reg_off >> 6];
5468 if ((pg >> (reg_off & 63)) & 1) {
5469 for (i = 0; i < N; ++i) {
5470 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5471 host + mem_off + (i << msz));
5474 reg_off += 1 << esz;
5475 mem_off += N << msz;
5476 } while (reg_off <= reg_last && (reg_off & 63));
5480 * Use the slow path to manage the cross-page misalignment.
5481 * But we know this is RAM and cannot trap.
5483 mem_off = info.mem_off_split;
5484 if (unlikely(mem_off >= 0)) {
5485 reg_off = info.reg_off_split;
5486 for (i = 0; i < N; ++i) {
5487 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5488 addr + mem_off + (i << msz), retaddr);
5492 mem_off = info.mem_off_first[1];
5493 if (unlikely(mem_off >= 0)) {
5494 reg_off = info.reg_off_first[1];
5495 reg_last = info.reg_off_last[1];
5496 host = info.page[1].host;
5499 uint64_t pg = vg[reg_off >> 6];
5501 if ((pg >> (reg_off & 63)) & 1) {
5502 for (i = 0; i < N; ++i) {
5503 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5504 host + mem_off + (i << msz));
5507 reg_off += 1 << esz;
5508 mem_off += N << msz;
5509 } while (reg_off & 63);
5510 } while (reg_off <= reg_last);
5514 static inline QEMU_ALWAYS_INLINE
5515 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5516 uint32_t desc, const uintptr_t ra,
5517 const int esz, const int msz, const int N,
5518 sve_ldst1_host_fn *host_fn,
5519 sve_ldst1_tlb_fn *tlb_fn)
5521 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5522 int bit55 = extract64(addr, 55, 1);
5524 /* Remove mtedesc from the normal sve descriptor. */
5525 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5527 /* Perform gross MTE suppression early. */
5528 if (!tbi_check(desc, bit55) ||
tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
    mtedesc = 0;
}
5533 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
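/*
 * The _mte entry points carry mtedesc in the descriptor bits above
 * SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT.  If TBI is disabled for this
 * address, or the tag sits in a TCMA region, mtedesc is cleared above
 * and the plain helper treats mtedesc == 0 as "no MTE checking".
 */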
5536 #define DO_LD1_1(NAME, ESZ) \
5537 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5538 target_ulong addr, uint32_t desc) \
5540 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5541 sve_##NAME##_host, sve_##NAME##_tlb); \
5543 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5544 target_ulong addr, uint32_t desc) \
5546 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5547 sve_##NAME##_host, sve_##NAME##_tlb); \
5550 #define DO_LD1_2(NAME, ESZ, MSZ) \
5551 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5552 target_ulong addr, uint32_t desc) \
5554 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5555 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5557 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5558 target_ulong addr, uint32_t desc) \
5560 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5561 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5563 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5564 target_ulong addr, uint32_t desc) \
5566 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5567 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5569 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5570 target_ulong addr, uint32_t desc) \
5572 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5573 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5576 DO_LD1_1(ld1bb, MO_8)
5577 DO_LD1_1(ld1bhu, MO_16)
5578 DO_LD1_1(ld1bhs, MO_16)
5579 DO_LD1_1(ld1bsu, MO_32)
5580 DO_LD1_1(ld1bss, MO_32)
5581 DO_LD1_1(ld1bdu, MO_64)
5582 DO_LD1_1(ld1bds, MO_64)
5584 DO_LD1_2(ld1hh, MO_16, MO_16)
5585 DO_LD1_2(ld1hsu, MO_32, MO_16)
5586 DO_LD1_2(ld1hss, MO_32, MO_16)
5587 DO_LD1_2(ld1hdu, MO_64, MO_16)
5588 DO_LD1_2(ld1hds, MO_64, MO_16)
5590 DO_LD1_2(ld1ss, MO_32, MO_32)
5591 DO_LD1_2(ld1sdu, MO_64, MO_32)
5592 DO_LD1_2(ld1sds, MO_64, MO_32)
5594 DO_LD1_2(ld1dd, MO_64, MO_64)
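/*
 * Each DO_LD1_2 invocation thus generates four entry points -- little-
 * and big-endian, with and without MTE -- e.g. DO_LD1_2(ld1hh, MO_16,
 * MO_16) yields sve_ld1hh_le_r, sve_ld1hh_be_r, sve_ld1hh_le_r_mte and
 * sve_ld1hh_be_r_mte, all funnelling into sve_ldN_r / sve_ldN_r_mte
 * with N == 1.
 */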
5599 #define DO_LDN_1(N) \
5600 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5601 target_ulong addr, uint32_t desc) \
5603 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
5604 sve_ld1bb_host, sve_ld1bb_tlb); \
5606 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5607 target_ulong addr, uint32_t desc) \
5609 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5610 sve_ld1bb_host, sve_ld1bb_tlb); \
5613 #define DO_LDN_2(N, SUFF, ESZ) \
5614 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5615 target_ulong addr, uint32_t desc) \
5617 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5618 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5620 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5621 target_ulong addr, uint32_t desc) \
5623 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5624 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5626 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5627 target_ulong addr, uint32_t desc) \
5629 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5630 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5632 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5633 target_ulong addr, uint32_t desc) \
5635 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5636 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5643 DO_LDN_2(2, hh, MO_16)
5644 DO_LDN_2(3, hh, MO_16)
5645 DO_LDN_2(4, hh, MO_16)
5647 DO_LDN_2(2, ss, MO_32)
5648 DO_LDN_2(3, ss, MO_32)
5649 DO_LDN_2(4, ss, MO_32)
5651 DO_LDN_2(2, dd, MO_64)
5652 DO_LDN_2(3, dd, MO_64)
5653 DO_LDN_2(4, dd, MO_64)
5659 * Load contiguous data, first-fault and no-fault.
5661 * For user-only, one could argue that we should hold the mmap_lock during
5662 * the operation so that there is no race between page_check_range and the
5663 * load operation. However, unmapping pages out from under a running thread
5664 * is extraordinarily unlikely. This theoretical race condition also affects
5665 * linux-user/ in its get_user/put_user macros.
5667 * TODO: Construct some helpers, written in assembly, that interact with
 * handle_cpu_signal to produce memory ops which can properly report errors
 * without racing.
 */
5672 /* Fault on byte I. All bits in FFR from I are cleared. The vector
5673 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5674 * option, which leaves subsequent data unchanged.
5676 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5678 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5681 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5682 i = ROUND_UP(i, 64);
5684 for (; i < oprsz; i += 64) {
5690 * Common helper for all contiguous no-fault and first-fault loads.
5692 static inline QEMU_ALWAYS_INLINE
5693 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5694 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5695 const int esz, const int msz, const SVEContFault fault,
5696 sve_ldst1_host_fn *host_fn,
5697 sve_ldst1_tlb_fn *tlb_fn)
5699 const unsigned rd = simd_data(desc);
5700 void *vd = &env->vfp.zregs[rd];
5701 const intptr_t reg_max = simd_oprsz(desc);
5702 intptr_t reg_off, mem_off, reg_last;
5707 /* Find the active elements. */
5708 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5709 /* The entire predicate was false; no load occurs. */
5710 memset(vd, 0, reg_max);
5713 reg_off = info.reg_off_first[0];
5715 /* Probe the page(s). */
5716 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5717 /* Fault on first element. */
5718 tcg_debug_assert(fault == FAULT_NO);
5719 memset(vd, 0, reg_max);
5723 mem_off = info.mem_off_first[0];
5724 flags = info.page[0].flags;
5727 * Disable MTE checking if the Tagged bit is not set. Since TBI must
5728 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
5730 if (arm_tlb_mte_tagged(&info.page[0].attrs)) {
5734 if (fault == FAULT_FIRST) {
5735 /* Trapping mte check for the first-fault element. */
5737 mte_check(env, mtedesc, addr + mem_off, retaddr);
5741 * Special handling of the first active element,
5742 * if it crosses a page boundary or is MMIO.
5744 bool is_split = mem_off == info.mem_off_split;
5745 if (unlikely(flags != 0) || unlikely(is_split)) {
5747 * Use the slow path for cross-page handling.
5748 * Might trap for MMIO or watchpoints.
5750 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5752 /* After any fault, zero the other elements. */
5753 swap_memzero(vd, reg_off);
5754 reg_off += 1 << esz;
5755 mem_off += 1 << msz;
5756 swap_memzero(vd + reg_off, reg_max - reg_off);
5762 memset(vd, 0, reg_max);
5765 memset(vd, 0, reg_max);
5766 if (unlikely(mem_off == info.mem_off_split)) {
5767 /* The first active element crosses a page boundary. */
5768 flags |= info.page[1].flags;
5769 if (unlikely(flags & TLB_MMIO)) {
5770 /* Some page is MMIO, see below. */
5773 if (unlikely(flags & TLB_WATCHPOINT) &&
5774 (cpu_watchpoint_address_matches
5775 (env_cpu(env), addr + mem_off, 1 << msz)
5777 /* Watchpoint hit, see below. */
5780 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
5784 * Use the slow path for cross-page handling.
5785 * This is RAM, without a watchpoint, and will not trap.
5787 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5793 * From this point on, all memory operations are MemSingleNF.
5795 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
5796 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
 * Unfortunately we do not have access to the memory attributes from the
5799 * PTE to tell Device memory from Normal memory. So we make a mostly
5800 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
5801 * This gives the right answer for the common cases of "Normal memory,
5802 * backed by host RAM" and "Device memory, backed by MMIO".
5803 * The architecture allows us to suppress an NF load and return
5804 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
5805 * case of "Normal memory, backed by MMIO" is permitted. The case we
5806 * get wrong is "Device memory, backed by host RAM", for which we
 * should return (UNKNOWN, FAULT) but do not.
5809 * Similarly, CPU_BP breakpoints would raise exceptions, and so
5810 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
5811 * architectural breakpoints the same.
5813 if (unlikely(flags & TLB_MMIO)) {
5817 reg_last = info.reg_off_last[0];
5818 host = info.page[0].host;
5821 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
5823 if ((pg >> (reg_off & 63)) & 1) {
5824 if (unlikely(flags & TLB_WATCHPOINT) &&
5825 (cpu_watchpoint_address_matches
5826 (env_cpu(env), addr + mem_off, 1 << msz)
5830 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
5833 host_fn(vd, reg_off, host + mem_off);
5835 reg_off += 1 << esz;
5836 mem_off += 1 << msz;
5837 } while (reg_off <= reg_last && (reg_off & 63));
5838 } while (reg_off <= reg_last);
5841 * MemSingleNF is allowed to fail for any reason. We have special
5842 * code above to handle the first element crossing a page boundary.
5843 * As an implementation choice, decline to handle a cross-page element
5844 * in any other position.
5846 reg_off = info.reg_off_split;
5852 reg_off = info.reg_off_first[1];
5853 if (likely(reg_off < 0)) {
5854 /* No active elements on the second page. All done. */
5859 * MemSingleNF is allowed to fail for any reason. As an implementation
5860 * choice, decline to handle elements on the second page. This should
5861 * be low frequency as the guest walks through memory -- the next
5862 * iteration of the guest's loop should be aligned on the page boundary,
5863 * and then all following iterations will stay aligned.
5867 record_fault(env, reg_off, reg_max);
5870 static inline QEMU_ALWAYS_INLINE
5871 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
5872 uint32_t desc, const uintptr_t retaddr,
5873 const int esz, const int msz, const SVEContFault fault,
5874 sve_ldst1_host_fn *host_fn,
5875 sve_ldst1_tlb_fn *tlb_fn)
5877 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5878 int bit55 = extract64(addr, 55, 1);
5880 /* Remove mtedesc from the normal sve descriptor. */
5881 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5883 /* Perform gross MTE suppression early. */
5884 if (!tbi_check(desc, bit55) ||
5885 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5889 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
5890 esz, msz, fault, host_fn, tlb_fn);
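/*
 * A sketch (for illustration only) of the descriptor split performed by
 * the *_mte wrappers above and below, as implied by the shift/extract32
 * pair used on entry:
 *
 *        31                         S  S-1         0
 *       +----------------------------+--------------+
 *       |          mtedesc           |  sve desc    |
 *       +----------------------------+--------------+
 *
 * where S = SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT.  The low S bits are the
 * ordinary SVE simd descriptor passed on to the non-MTE helper; the
 * remaining bits form mtedesc.  The non-MTE entry points simply pass 0
 * for mtedesc.
 */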
5893 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
5894 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
5895 target_ulong addr, uint32_t desc) \
5897 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
5898 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5900 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
5901 target_ulong addr, uint32_t desc) \
5903 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
5904 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5906 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
5907 target_ulong addr, uint32_t desc) \
5909 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
5910 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5912 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
5913 target_ulong addr, uint32_t desc) \
5915 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
5916 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5919 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
5920 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
5921 target_ulong addr, uint32_t desc) \
5923 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5924 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5926 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
5927 target_ulong addr, uint32_t desc) \
5929 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5930 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5932 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
5933 target_ulong addr, uint32_t desc) \
5935 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5936 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5938 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5939 target_ulong addr, uint32_t desc) \
5941 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5942 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5944 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5945 target_ulong addr, uint32_t desc) \
5947 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5948 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5950 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5951 target_ulong addr, uint32_t desc) \
5953 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5954 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5956 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5957 target_ulong addr, uint32_t desc) \
5959 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5960 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5962 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5963 target_ulong addr, uint32_t desc) \
5965 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5966 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5969 DO_LDFF1_LDNF1_1(bb, MO_8)
5970 DO_LDFF1_LDNF1_1(bhu, MO_16)
5971 DO_LDFF1_LDNF1_1(bhs, MO_16)
5972 DO_LDFF1_LDNF1_1(bsu, MO_32)
5973 DO_LDFF1_LDNF1_1(bss, MO_32)
5974 DO_LDFF1_LDNF1_1(bdu, MO_64)
5975 DO_LDFF1_LDNF1_1(bds, MO_64)
5977 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5978 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5979 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5980 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5981 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
5983 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5984 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5985 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
5987 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
5989 #undef DO_LDFF1_LDNF1_1
5990 #undef DO_LDFF1_LDNF1_2
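/*
 * For illustration only (a sketch, not additional definitions): the
 * instantiation DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) above expands to
 * helpers of the form
 *
 *   void HELPER(sve_ldff1hh_le_r)(CPUARMState *env, void *vg,
 *                                 target_ulong addr, uint32_t desc)
 *   {
 *       sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, MO_16, MO_16,
 *                     FAULT_FIRST, sve_ld1hh_le_host, sve_ld1hh_le_tlb);
 *   }
 *
 * together with the _be variant, the sve_ldnf1hh_*_r pair using FAULT_NO,
 * and the corresponding _mte wrappers that go through sve_ldnfff1_r_mte.
 */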
5993 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5996 static inline QEMU_ALWAYS_INLINE
5997 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5998 uint32_t desc, const uintptr_t retaddr,
5999 const int esz, const int msz, const int N, uint32_t mtedesc,
6000 sve_ldst1_host_fn *host_fn,
6001 sve_ldst1_tlb_fn *tlb_fn)
6003 const unsigned rd = simd_data(desc);
6004 const intptr_t reg_max = simd_oprsz(desc);
6005 intptr_t reg_off, reg_last, mem_off;
6010 /* Find the active elements. */
6011 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6012 /* The entire predicate was false; no store occurs. */
6016 /* Probe the page(s). Exit with exception for any invalid page. */
6017 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6019 /* Handle watchpoints for all active elements. */
6020 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6021 BP_MEM_WRITE, retaddr);
6024      * Handle MTE checks for all active elements.
6025 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6028 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6032 flags = info.page[0].flags | info.page[1].flags;
6033 if (unlikely(flags != 0)) {
6034 #ifdef CONFIG_USER_ONLY
6035 g_assert_not_reached();
6038 * At least one page includes MMIO.
6039 * Any bus operation can fail with cpu_transaction_failed,
6040 * which for ARM will raise SyncExternal. We cannot avoid
6041 * this fault and will leave with the store incomplete.
6043 mem_off = info.mem_off_first[0];
6044 reg_off = info.reg_off_first[0];
6045 reg_last = info.reg_off_last[1];
6047 reg_last = info.reg_off_split;
6049 reg_last = info.reg_off_last[0];
6054 uint64_t pg = vg[reg_off >> 6];
6056 if ((pg >> (reg_off & 63)) & 1) {
6057 for (i = 0; i < N; ++i) {
6058 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6059 addr + mem_off + (i << msz), retaddr);
6062 reg_off += 1 << esz;
6063 mem_off += N << msz;
6064 } while (reg_off & 63);
6065 } while (reg_off <= reg_last);
6070 mem_off = info.mem_off_first[0];
6071 reg_off = info.reg_off_first[0];
6072 reg_last = info.reg_off_last[0];
6073 host = info.page[0].host;
6075 while (reg_off <= reg_last) {
6076 uint64_t pg = vg[reg_off >> 6];
6078 if ((pg >> (reg_off & 63)) & 1) {
6079 for (i = 0; i < N; ++i) {
6080 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6081 host + mem_off + (i << msz));
6084 reg_off += 1 << esz;
6085 mem_off += N << msz;
6086 } while (reg_off <= reg_last && (reg_off & 63));
6090 * Use the slow path to manage the cross-page misalignment.
6091 * But we know this is RAM and cannot trap.
6093 mem_off = info.mem_off_split;
6094 if (unlikely(mem_off >= 0)) {
6095 reg_off = info.reg_off_split;
6096 for (i = 0; i < N; ++i) {
6097 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6098 addr + mem_off + (i << msz), retaddr);
6102 mem_off = info.mem_off_first[1];
6103 if (unlikely(mem_off >= 0)) {
6104 reg_off = info.reg_off_first[1];
6105 reg_last = info.reg_off_last[1];
6106 host = info.page[1].host;
6109 uint64_t pg = vg[reg_off >> 6];
6111 if ((pg >> (reg_off & 63)) & 1) {
6112 for (i = 0; i < N; ++i) {
6113 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6114 host + mem_off + (i << msz));
6117 reg_off += 1 << esz;
6118 mem_off += N << msz;
6119 } while (reg_off & 63);
6120 } while (reg_off <= reg_last);
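/*
 * A worked example of the interleaved layout produced above (a sketch,
 * assuming every element is active): for an N=2 store of 32-bit elements
 * (esz == msz == MO_32), element e of register rd is written to
 * addr + e*8 and element e of register rd+1 to addr + e*8 + 4:
 *
 *   mem:  rd[0], rd+1[0], rd[1], rd+1[1], rd[2], rd+1[2], ...
 *
 * This follows from "addr + mem_off + (i << msz)" with mem_off advancing
 * by N << msz for each element.
 */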
6124 static inline QEMU_ALWAYS_INLINE
6125 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6126 uint32_t desc, const uintptr_t ra,
6127 const int esz, const int msz, const int N,
6128 sve_ldst1_host_fn *host_fn,
6129 sve_ldst1_tlb_fn *tlb_fn)
6131 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6132 int bit55 = extract64(addr, 55, 1);
6134 /* Remove mtedesc from the normal sve descriptor. */
6135 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6137 /* Perform gross MTE suppression early. */
6138 if (!tbi_check(desc, bit55) ||
6139 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6143 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6146 #define DO_STN_1(N, NAME, ESZ) \
6147 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6148 target_ulong addr, uint32_t desc) \
6150 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6151 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6153 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6154 target_ulong addr, uint32_t desc) \
6156 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6157 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6160 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6161 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6162 target_ulong addr, uint32_t desc) \
6164 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6165 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6167 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6168 target_ulong addr, uint32_t desc) \
6170 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6171 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6173 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6174 target_ulong addr, uint32_t desc) \
6176 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6177 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6179 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6180 target_ulong addr, uint32_t desc) \
6182 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6183 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6186 DO_STN_1(1, bb, MO_8)
6187 DO_STN_1(1, bh, MO_16)
6188 DO_STN_1(1, bs, MO_32)
6189 DO_STN_1(1, bd, MO_64)
6190 DO_STN_1(2, bb, MO_8)
6191 DO_STN_1(3, bb, MO_8)
6192 DO_STN_1(4, bb, MO_8)
6194 DO_STN_2(1, hh, MO_16, MO_16)
6195 DO_STN_2(1, hs, MO_32, MO_16)
6196 DO_STN_2(1, hd, MO_64, MO_16)
6197 DO_STN_2(2, hh, MO_16, MO_16)
6198 DO_STN_2(3, hh, MO_16, MO_16)
6199 DO_STN_2(4, hh, MO_16, MO_16)
6201 DO_STN_2(1, ss, MO_32, MO_32)
6202 DO_STN_2(1, sd, MO_64, MO_32)
6203 DO_STN_2(2, ss, MO_32, MO_32)
6204 DO_STN_2(3, ss, MO_32, MO_32)
6205 DO_STN_2(4, ss, MO_32, MO_32)
6207 DO_STN_2(1, dd, MO_64, MO_64)
6208 DO_STN_2(2, dd, MO_64, MO_64)
6209 DO_STN_2(3, dd, MO_64, MO_64)
6210 DO_STN_2(4, dd, MO_64, MO_64)
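/*
 * Naming sketch for the helpers generated above (descriptive only): in
 * sve_st<N><MM><EE>[_le|_be]_r, N is the number of interleaved registers,
 * the first size letter is the memory access size and the second the
 * register element size (b/h/s/d for 8/16/32/64 bits).  For example,
 * sve_st1sd_le_r stores the low 32 bits of each 64-bit element
 * little-endian, while sve_st4ss_be_r is a 4-register interleaved store
 * of 32-bit elements, big-endian.  Each also has an _mte variant.
 */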
6216 * Loads with a vector index.
6220 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6222 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6224 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6226 return *(uint32_t *)(reg + H1_4(reg_ofs));
6229 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6231 return *(int32_t *)(reg + H1_4(reg_ofs));
6234 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6236 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6239 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6241 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6244 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6246 return *(uint64_t *)(reg + reg_ofs);
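/*
 * Illustration of how these accessors are used below (a sketch): each
 * gather/scatter helper forms a per-element address as
 *
 *   addr = base + (off_fn(vm, reg_off) << scale);
 *
 * so with off_zss_d and a hypothetical scale of 3 (offsets scaled by 8),
 * a vector element holding 0xffffffff is sign-extended to -1 and yields
 * base - 8.
 */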
6249 static inline QEMU_ALWAYS_INLINE
6250 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6251 target_ulong base, uint32_t desc, uintptr_t retaddr,
6252 uint32_t mtedesc, int esize, int msize,
6253 zreg_off_fn *off_fn,
6254 sve_ldst1_host_fn *host_fn,
6255 sve_ldst1_tlb_fn *tlb_fn)
6257 const int mmu_idx = cpu_mmu_index(env, false);
6258 const intptr_t reg_max = simd_oprsz(desc);
6259 const int scale = simd_data(desc);
6260 ARMVectorReg scratch;
6262 SVEHostPage info, info2;
6264 memset(&scratch, 0, reg_max);
6267 uint64_t pg = vg[reg_off >> 6];
6269 if (likely(pg & 1)) {
6270 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6271 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6273 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6276 if (likely(in_page >= msize)) {
6277 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6278 cpu_check_watchpoint(env_cpu(env), addr, msize,
6279 info.attrs, BP_MEM_READ, retaddr);
6281 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6282 mte_check(env, mtedesc, addr, retaddr);
6284 host_fn(&scratch, reg_off, info.host);
6286 /* Element crosses the page boundary. */
6287 sve_probe_page(&info2, false, env, addr + in_page, 0,
6288 MMU_DATA_LOAD, mmu_idx, retaddr);
6289 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6290 cpu_check_watchpoint(env_cpu(env), addr,
6292 BP_MEM_READ, retaddr);
6294 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6295 mte_check(env, mtedesc, addr, retaddr);
6297 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6302 } while (reg_off & 63);
6303 } while (reg_off < reg_max);
6305 /* Wait until all exceptions have been raised to write back. */
6306 memcpy(vd, &scratch, reg_max);
6309 static inline QEMU_ALWAYS_INLINE
6310 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6311 target_ulong base, uint32_t desc, uintptr_t retaddr,
6312 int esize, int msize, zreg_off_fn *off_fn,
6313 sve_ldst1_host_fn *host_fn,
6314 sve_ldst1_tlb_fn *tlb_fn)
6316 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6317 /* Remove mtedesc from the normal sve descriptor. */
6318 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6321 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6322 * offset base entirely over the address space hole to change the
6323  * pointer tag, or change the bit55 selector. So here we could
6324 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6326 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6327 esize, msize, off_fn, host_fn, tlb_fn);
6330 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6331 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6332 void *vm, target_ulong base, uint32_t desc) \
6334 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6335 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6337 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6338 void *vm, target_ulong base, uint32_t desc) \
6340 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6341 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6344 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6345 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6346 void *vm, target_ulong base, uint32_t desc) \
6348 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6349 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6351 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6352 void *vm, target_ulong base, uint32_t desc) \
6354 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6355 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6358 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6359 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6360 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6361 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6362 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6364 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6365 DO_LD1_ZPZ_S(bss, zss, MO_8)
6366 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6367 DO_LD1_ZPZ_D(bds, zss, MO_8)
6368 DO_LD1_ZPZ_D(bds, zd, MO_8)
6370 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6371 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6372 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6373 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6374 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6376 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6377 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6378 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6379 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6380 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6382 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6383 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6384 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6385 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6386 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6388 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6389 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6390 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6391 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6392 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6394 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6395 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6396 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6397 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6398 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6400 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6401 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6402 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6403 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6404 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6406 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6407 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6408 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6410 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6411 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6412 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6414 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6415 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6416 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6418 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6419 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6420 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
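/*
 * Naming sketch for the gather helpers above (descriptive only): the MEM
 * part encodes the memory size, destination element size and extension,
 * e.g. "hdu_le" = 16-bit little-endian loads zero-extended into 64-bit
 * elements, "bss" = 8-bit loads sign-extended into 32-bit elements.  The
 * OFS part selects the offset accessor: zsu/zss are unsigned/signed
 * 32-bit offsets, zd are 64-bit offsets.
 */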
6425 /* First fault loads with a vector index. */
6428 * Common helpers for all gather first-faulting loads.
6431 static inline QEMU_ALWAYS_INLINE
6432 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6433 target_ulong base, uint32_t desc, uintptr_t retaddr,
6434 uint32_t mtedesc, const int esz, const int msz,
6435 zreg_off_fn *off_fn,
6436 sve_ldst1_host_fn *host_fn,
6437 sve_ldst1_tlb_fn *tlb_fn)
6439 const int mmu_idx = cpu_mmu_index(env, false);
6440 const intptr_t reg_max = simd_oprsz(desc);
6441 const int scale = simd_data(desc);
6442 const int esize = 1 << esz;
6443 const int msize = 1 << msz;
6446 target_ulong addr, in_page;
6448 /* Skip to the first true predicate. */
6449 reg_off = find_next_active(vg, 0, reg_max, esz);
6450 if (unlikely(reg_off >= reg_max)) {
6451 /* The entire predicate was false; no load occurs. */
6452 memset(vd, 0, reg_max);
6457 * Probe the first element, allowing faults.
6459 addr = base + (off_fn(vm, reg_off) << scale);
6461 mte_check(env, mtedesc, addr, retaddr);
6463 tlb_fn(env, vd, reg_off, addr, retaddr);
6465 /* After any fault, zero the other elements. */
6466 swap_memzero(vd, reg_off);
6468 swap_memzero(vd + reg_off, reg_max - reg_off);
6471 * Probe the remaining elements, not allowing faults.
6473 while (reg_off < reg_max) {
6474 uint64_t pg = vg[reg_off >> 6];
6476 if (likely((pg >> (reg_off & 63)) & 1)) {
6477 addr = base + (off_fn(vm, reg_off) << scale);
6478 in_page = -(addr | TARGET_PAGE_MASK);
6480 if (unlikely(in_page < msize)) {
6481 /* Stop if the element crosses a page boundary. */
6485 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6487 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6490 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6491 (cpu_watchpoint_address_matches
6492 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6496 arm_tlb_mte_tagged(&info.attrs) &&
6497 !mte_probe(env, mtedesc, addr)) {
6501 host_fn(vd, reg_off, info.host);
6504 } while (reg_off & 63);
6509 record_fault(env, reg_off, reg_max);
6512 static inline QEMU_ALWAYS_INLINE
6513 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6514 target_ulong base, uint32_t desc, uintptr_t retaddr,
6515 const int esz, const int msz,
6516 zreg_off_fn *off_fn,
6517 sve_ldst1_host_fn *host_fn,
6518 sve_ldst1_tlb_fn *tlb_fn)
6520 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6521 /* Remove mtedesc from the normal sve descriptor. */
6522 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6525 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6526 * offset base entirely over the address space hole to change the
6527  * pointer tag, or change the bit55 selector. So here we could
6528 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6530 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6531 esz, msz, off_fn, host_fn, tlb_fn);
6534 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6535 void HELPER(sve_ldff##MEM##_##OFS) \
6536 (CPUARMState *env, void *vd, void *vg, \
6537 void *vm, target_ulong base, uint32_t desc) \
6539 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6540 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6542 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6543 (CPUARMState *env, void *vd, void *vg, \
6544 void *vm, target_ulong base, uint32_t desc) \
6546 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6547 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6550 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6551 void HELPER(sve_ldff##MEM##_##OFS) \
6552 (CPUARMState *env, void *vd, void *vg, \
6553 void *vm, target_ulong base, uint32_t desc) \
6555 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6556 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6558 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6559 (CPUARMState *env, void *vd, void *vg, \
6560 void *vm, target_ulong base, uint32_t desc) \
6562 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6563 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6566 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6567 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6568 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6569 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6570 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6572 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6573 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6574 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6575 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6576 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6578 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6579 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6580 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6581 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6582 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6584 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6585 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6586 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6587 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6588 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6590 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6591 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6592 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6593 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6594 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6596 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6597 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6598 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6599 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6600 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6602 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6603 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6604 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6605 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6606 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6608 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6609 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6610 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6611 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6612 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6614 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6615 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6616 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6618 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6619 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6620 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6622 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6623 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6624 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6626 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6627 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6628 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
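/*
 * Behavioural sketch for the first-fault gather helpers above
 * (illustrative only): with four active elements of which element 2
 * would fault, element 0 is loaded with faults permitted, element 1 is
 * loaded through the no-fault probe, record_fault() then clears FFR from
 * element 2 onward, and elements 2 and 3 keep the zeroes written when
 * the rest of the destination was cleared -- an UNKNOWN value as far as
 * the architecture is concerned.
 */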
6630 /* Stores with a vector index. */
6632 static inline QEMU_ALWAYS_INLINE
6633 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6634 target_ulong base, uint32_t desc, uintptr_t retaddr,
6635 uint32_t mtedesc, int esize, int msize,
6636 zreg_off_fn *off_fn,
6637 sve_ldst1_host_fn *host_fn,
6638 sve_ldst1_tlb_fn *tlb_fn)
6640 const int mmu_idx = cpu_mmu_index(env, false);
6641 const intptr_t reg_max = simd_oprsz(desc);
6642 const int scale = simd_data(desc);
6643 void *host[ARM_MAX_VQ * 4];
6644 intptr_t reg_off, i;
6645 SVEHostPage info, info2;
6648 * Probe all of the elements for host addresses and flags.
6652 uint64_t pg = vg[reg_off >> 6];
6654 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6655 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6658 if (likely((pg >> (reg_off & 63)) & 1)) {
6659 if (likely(in_page >= msize)) {
6660 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6662 host[i] = info.host;
6665 * Element crosses the page boundary.
6666 * Probe both pages, but do not record the host address,
6667 * so that we use the slow path.
6669 sve_probe_page(&info, false, env, addr, 0,
6670 MMU_DATA_STORE, mmu_idx, retaddr);
6671 sve_probe_page(&info2, false, env, addr + in_page, 0,
6672 MMU_DATA_STORE, mmu_idx, retaddr);
6673 info.flags |= info2.flags;
6676 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6677 cpu_check_watchpoint(env_cpu(env), addr, msize,
6678 info.attrs, BP_MEM_WRITE, retaddr);
6681 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6682 mte_check(env, mtedesc, addr, retaddr);
6687 } while (reg_off & 63);
6688 } while (reg_off < reg_max);
6691 * Now that we have recognized all exceptions except SyncExternal
6692 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6694 * Note for the common case of an element in RAM, not crossing a page
6695 * boundary, we have stored the host address in host[]. This doubles
6696 * as a first-level check against the predicate, since only enabled
6697 * elements have non-null host addresses.
6702 if (likely(h != NULL)) {
6703 host_fn(vd, reg_off, h);
6704 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6705 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6706 tlb_fn(env, vd, reg_off, addr, retaddr);
6710 } while (reg_off < reg_max);
6713 static inline QEMU_ALWAYS_INLINE
6714 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6715 target_ulong base, uint32_t desc, uintptr_t retaddr,
6716 int esize, int msize, zreg_off_fn *off_fn,
6717 sve_ldst1_host_fn *host_fn,
6718 sve_ldst1_tlb_fn *tlb_fn)
6720 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6721 /* Remove mtedesc from the normal sve descriptor. */
6722 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6725 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6726 * offset base entirely over the address space hole to change the
6727  * pointer tag, or change the bit55 selector. So here we could
6728 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6730 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6731 esize, msize, off_fn, host_fn, tlb_fn);
6734 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
6735 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6736 void *vm, target_ulong base, uint32_t desc) \
6738 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6739 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6741 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6742 void *vm, target_ulong base, uint32_t desc) \
6744 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6745 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6748 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
6749 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6750 void *vm, target_ulong base, uint32_t desc) \
6752 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6753 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6755 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6756 void *vm, target_ulong base, uint32_t desc) \
6758 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6759 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6762 DO_ST1_ZPZ_S(bs, zsu, MO_8)
6763 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
6764 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
6765 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
6766 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
6768 DO_ST1_ZPZ_S(bs, zss, MO_8)
6769 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
6770 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
6771 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
6772 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
6774 DO_ST1_ZPZ_D(bd, zsu, MO_8)
6775 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
6776 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
6777 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
6778 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
6779 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
6780 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
6782 DO_ST1_ZPZ_D(bd, zss, MO_8)
6783 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
6784 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
6785 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
6786 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
6787 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
6788 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
6790 DO_ST1_ZPZ_D(bd, zd, MO_8)
6791 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
6792 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
6793 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
6794 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
6795 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
6796 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
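/*
 * For illustration only (a sketch, not an additional definition): the
 * instantiation DO_ST1_ZPZ_D(dd_le, zd, MO_64) above expands to
 *
 *   void HELPER(sve_stdd_le_zd)(CPUARMState *env, void *vd, void *vg,
 *                               void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MO_64,
 *                 off_zd_d, sve_st1dd_le_host, sve_st1dd_le_tlb);
 *   }
 *
 * i.e. a scatter store of 64-bit elements, little-endian, with 64-bit
 * vector offsets, plus the matching _mte wrapper via sve_st1_z_mte.
 */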