4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
29 #include "vec_internal.h"
32 /* Note that vector data is stored in host-endian 64-bit chunks,
33 so addressing units smaller than that need a host-endian fixup. */
34 #ifdef HOST_WORDS_BIGENDIAN
35 #define H1(x) ((x) ^ 7)
36 #define H1_2(x) ((x) ^ 6)
37 #define H1_4(x) ((x) ^ 4)
38 #define H2(x) ((x) ^ 3)
39 #define H4(x) ((x) ^ 1)
48 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
50 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
51 * and bit 0 set if C is set. Compare the definitions of these variables
55 /* For no G bits set, NZCV = C. */
56 #define PREDTEST_INIT 1
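/*
 * For example, a single predicate word with one active, true element
 * (d = 1, g = 1) gives iter_predtest_fwd(1, 1, PREDTEST_INIT) == 0x80000006:
 * N set (first active element true), Z clear (some element true) and
 * C clear (last active element true); bit 2 is only internal bookkeeping.
 */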
58 /* This is an iterative function, called for each Pd and Pg word
61 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
64 /* Compute N from first D & G.
65 Use bit 2 to signal first G bit seen. */
67 flags |= ((d & (g & -g)) != 0) << 31;
71 /* Accumulate Z from each D & G. */
72 flags |= ((d & g) != 0) << 1;
74 /* Compute C from last !(D & G). Replace previous. */
75 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
80 /* This is an iterative function, called for each Pd and Pg word
83 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
86 /* Compute C from first (i.e last) !(D & G).
87 Use bit 2 to signal first G bit seen. */
89 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
90 flags |= (d & pow2floor(g)) == 0;
93 /* Accumulate Z from each D & G. */
94 flags |= ((d & g) != 0) << 1;
96 /* Compute N from last (i.e first) D & G. Replace previous. */
97 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
102 /* The same for a single word predicate. */
103 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
105 return iter_predtest_fwd(d, g, PREDTEST_INIT);
108 /* The same for a multi-word predicate. */
109 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
111 uint32_t flags = PREDTEST_INIT;
112 uint64_t *d = vd, *g = vg;
116 flags = iter_predtest_fwd(d[i], g[i], flags);
117 } while (++i < words);
122 /* Expand active predicate bits to bytes, for byte elements.
123 * for (i = 0; i < 256; ++i) {
124 * unsigned long m = 0;
125 * for (j = 0; j < 8; j++) {
126 * if ((i >> j) & 1) {
127 * m |= 0xfful << (j << 3);
130 * printf("0x%016lx,\n", m);
133 static inline uint64_t expand_pred_b(uint8_t byte)
135 static const uint64_t word[256] = {
136 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
137 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
138 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
139 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
140 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
141 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
142 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
143 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
144 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
145 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
146 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
147 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
148 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
149 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
150 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
151 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
152 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
153 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
154 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
155 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
156 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
157 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
158 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
159 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
160 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
161 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
162 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
163 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
164 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
165 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
166 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
167 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
168 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
169 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
170 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
171 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
172 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
173 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
174 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
175 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
176 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
177 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
178 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
179 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
180 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
181 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
182 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
183 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
184 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
185 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
186 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
187 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
188 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
189 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
190 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
191 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
192 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
193 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
194 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
195 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
196 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
197 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
198 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
199 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
200 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
201 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
202 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
203 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
204 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
205 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
206 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
207 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
208 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
209 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
210 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
211 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
212 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
213 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
214 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
215 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
216 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
217 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
218 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
219 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
220 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
226 /* Similarly for half-word elements.
227 * for (i = 0; i < 256; ++i) {
228 * unsigned long m = 0;
232 * for (j = 0; j < 8; j += 2) {
233 * if ((i >> j) & 1) {
234 * m |= 0xfffful << (j << 3);
237 * printf("[0x%x] = 0x%016lx,\n", i, m);
240 static inline uint64_t expand_pred_h(uint8_t byte)
242 static const uint64_t word[] = {
243 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
244 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
245 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
246 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
247 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
248 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
249 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
250 [0x55] = 0xffffffffffffffff,
252 return word[byte & 0x55];
255 /* Similarly for single word elements. */
256 static inline uint64_t expand_pred_s(uint8_t byte)
258 static const uint64_t word[] = {
259 [0x01] = 0x00000000ffffffffull,
260 [0x10] = 0xffffffff00000000ull,
261 [0x11] = 0xffffffffffffffffull,
263 return word[byte & 0x11];
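/*
 * For example: expand_pred_b(0x81) == 0xff000000000000ff,
 * expand_pred_h(0x05) == 0x00000000ffffffff and
 * expand_pred_s(0x10) == 0xffffffff00000000.
 */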
266 /* Swap 16-bit words within a 32-bit word. */
267 static inline uint32_t hswap32(uint32_t h)
272 /* Swap 16-bit words within a 64-bit word. */
273 static inline uint64_t hswap64(uint64_t h)
275 uint64_t m = 0x0000ffff0000ffffull;
277 return ((h & m) << 16) | ((h >> 16) & m);
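/* e.g. hswap64(0x0011223344556677) == 0x2233001166774455. */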
280 /* Swap 32-bit words within a 64-bit word. */
281 static inline uint64_t wswap64(uint64_t h)
286 #define LOGICAL_PPPP(NAME, FUNC) \
287 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
289 uintptr_t opr_sz = simd_oprsz(desc); \
290 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
292 for (i = 0; i < opr_sz / 8; ++i) { \
293 d[i] = FUNC(n[i], m[i], g[i]); \
297 #define DO_AND(N, M, G) (((N) & (M)) & (G))
298 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
299 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
300 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
301 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
302 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
303 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
304 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
306 LOGICAL_PPPP(sve_and_pppp, DO_AND)
307 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
308 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
309 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
310 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
311 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
312 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
313 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
325 /* Fully general three-operand expander, controlled by a predicate.
326 * This is complicated by the host-endian storage of the register file.
328 /* ??? I don't expect the compiler could ever vectorize this itself.
329 * With some tables we can convert bit masks to byte masks, and with
330 * extra care wrt byte/word ordering we could use gcc generic vectors
331 * and do 16 bytes at a time.
333 #define DO_ZPZZ(NAME, TYPE, H, OP) \
334 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
336 intptr_t i, opr_sz = simd_oprsz(desc); \
337 for (i = 0; i < opr_sz; ) { \
338 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
341 TYPE nn = *(TYPE *)(vn + H(i)); \
342 TYPE mm = *(TYPE *)(vm + H(i)); \
343 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
350 /* Similarly, specialized for 64-bit operands. */
351 #define DO_ZPZZ_D(NAME, TYPE, OP) \
352 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
355 TYPE *d = vd, *n = vn, *m = vm; \
357 for (i = 0; i < opr_sz; i += 1) { \
358 if (pg[H1(i)] & 1) { \
359 TYPE nn = n[i], mm = m[i]; \
365 #define DO_AND(N, M) (N & M)
366 #define DO_EOR(N, M) (N ^ M)
367 #define DO_ORR(N, M) (N | M)
368 #define DO_BIC(N, M) (N & ~M)
369 #define DO_ADD(N, M) (N + M)
370 #define DO_SUB(N, M) (N - M)
371 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
372 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
373 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
374 #define DO_MUL(N, M) (N * M)
378 * We must avoid the C undefined behaviour cases: division by
379 * zero and signed division of INT_MIN by -1. Both of these
380 * have architecturally defined required results for Arm.
381 * We special case all signed divisions by -1 to avoid having
382 * to deduce the minimum integer for the type involved.
384 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
385 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
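/*
 * For example DO_SDIV(INT32_MIN, -1) returns -N, which wraps (two's
 * complement) to INT32_MIN, and DO_SDIV(x, 0) / DO_UDIV(x, 0) return 0 --
 * the results the architecture requires for SDIV and UDIV.
 */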
387 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
389 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
390 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
392 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
394 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
395 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
397 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
399 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
400 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
402 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
404 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
405 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
407 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
409 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
410 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
412 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
414 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
415 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
417 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
419 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
420 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
424 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
425 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
427 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
429 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
430 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
434 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
435 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
437 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
439 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
440 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
444 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
445 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
447 /* Because the computation type is at least twice as large as required,
448 these work for both signed and unsigned source types. */
449 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
454 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
456 return (n * m) >> 16;
459 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
461 return (n * m) >> 32;
464 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
467 muls64(&lo, &hi, n, m);
471 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
474 mulu64(&lo, &hi, n, m);
478 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
480 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
481 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
483 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
484 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
485 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
486 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
488 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
489 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
490 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
491 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
493 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
494 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
496 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
497 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
499 /* Note that all bits of the shift amount are significant
500 and are not taken modulo the element size. */
501 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
502 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
503 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
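/*
 * e.g. DO_ASR of an int8_t by 200 clamps the shift to 7, giving all sign
 * bits, while DO_LSR/DO_LSL of a uint8_t by 8 or more yield 0.
 */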
505 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
506 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
507 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
509 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
510 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
511 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
513 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
514 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
515 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
517 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
518 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
519 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
521 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
523 int8_t n1 = n, n2 = n >> 8;
527 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
529 int16_t n1 = n, n2 = n >> 16;
533 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
535 int32_t n1 = n, n2 = n >> 32;
539 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
540 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
541 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
543 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
545 uint8_t n1 = n, n2 = n >> 8;
549 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
551 uint16_t n1 = n, n2 = n >> 16;
555 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
557 uint32_t n1 = n, n2 = n >> 32;
561 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
562 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
563 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
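/*
 * For [SU]ADALP above, the second (wide) operand is the accumulator: each
 * 2N-bit lane is incremented by the sum of the two adjacent N-bit lanes of
 * the first operand, extracted above as n1 and n2.
 */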
565 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
566 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
567 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
568 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
570 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
571 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
572 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
573 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
575 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
576 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
577 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
578 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
580 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
581 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
582 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
583 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
586 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
587 * We pass in a pointer to a dummy saturation field to trigger
588 * the saturating arithmetic but discard the information about
589 * whether it has occurred.
591 #define do_sqshl_b(n, m) \
592 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
593 #define do_sqshl_h(n, m) \
594 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
595 #define do_sqshl_s(n, m) \
596 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
597 #define do_sqshl_d(n, m) \
598 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
600 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
601 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
602 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
603 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
605 #define do_uqshl_b(n, m) \
606 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
607 #define do_uqshl_h(n, m) \
608 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
609 #define do_uqshl_s(n, m) \
610 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
611 #define do_uqshl_d(n, m) \
612 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
614 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
615 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
616 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
617 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
619 #define do_sqrshl_b(n, m) \
620 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
621 #define do_sqrshl_h(n, m) \
622 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
623 #define do_sqrshl_s(n, m) \
624 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
625 #define do_sqrshl_d(n, m) \
626 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
628 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
629 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
630 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
631 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
635 #define do_uqrshl_b(n, m) \
636 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
637 #define do_uqrshl_h(n, m) \
638 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
639 #define do_uqrshl_s(n, m) \
640 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
641 #define do_uqrshl_d(n, m) \
642 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
644 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
645 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
646 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
647 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
651 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
652 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
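/*
 * e.g. DO_HADD_BHS(7, 4) == 5 and DO_HADD_BHS(-7, -4) == -6; the sum is
 * formed in 64 bits so it cannot overflow. DO_HADD_D reaches the same
 * result for 64-bit inputs by halving each operand and adding back the
 * carry from the two discarded low bits.
 */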
654 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
655 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
656 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
657 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
659 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
660 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
661 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
662 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
664 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
665 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
667 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
668 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
669 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
670 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
672 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
673 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
674 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
675 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
677 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
678 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
680 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
681 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
682 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
683 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
685 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
686 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
687 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
688 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
690 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
692 return val >= max ? max : val <= min ? min : val;
695 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
696 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
697 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
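/*
 * do_sqadd_d uses the standard two's-complement test: the addition
 * overflows iff n and m have the same sign and the sign of r differs,
 * which is exactly ((r ^ n) & ~(n ^ m)) < 0.
 */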
699 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
702 if (((r ^ n) & ~(n ^ m)) < 0) {
703 /* Signed overflow. */
704 return r < 0 ? INT64_MAX : INT64_MIN;
709 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
710 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
711 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
712 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
714 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
715 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
716 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
718 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
721 return r < n ? UINT64_MAX : r;
724 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
725 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
726 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
727 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
729 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
730 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
731 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
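/*
 * Likewise for do_sqsub_d: the subtraction overflows iff n and m have
 * opposite signs and the sign of r differs from n, i.e.
 * ((r ^ n) & (n ^ m)) < 0.
 */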
733 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
736 if (((r ^ n) & (n ^ m)) < 0) {
737 /* Signed overflow. */
738 return r < 0 ? INT64_MAX : INT64_MIN;
743 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
744 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
745 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
746 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
748 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
749 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
750 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
752 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
754 return n > m ? n - m : 0;
757 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
758 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
759 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
760 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
762 #define DO_SUQADD_B(n, m) \
763 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
764 #define DO_SUQADD_H(n, m) \
765 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
766 #define DO_SUQADD_S(n, m) \
767 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
769 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
774 /* Note that m - abs(n) cannot underflow. */
776 /* Result is either very large positive or negative. */
778 /* m > abs(n), so r is a very large positive. */
781 /* Result is negative. */
784 /* Both inputs are positive: check for overflow. */
785 if (r < m || r > INT64_MAX) {
792 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
793 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
794 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
795 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
797 #define DO_USQADD_B(n, m) \
798 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
799 #define DO_USQADD_H(n, m) \
800 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
801 #define DO_USQADD_S(n, m) \
802 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
804 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
809 return n < -m ? 0 : r;
811 return r < n ? UINT64_MAX : r;
814 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
815 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
816 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
817 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
823 * Three-operand expander, operating on element pairs.
824 * If the slot I is even, the elements come from VN {I, I+1}.
825 * If the slot I is odd, the elements come from VM {I-1, I}.
826 * Load all of the input elements in each pair before overwriting output.
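* For example, with all lanes active, ADDP on byte elements produces
*   d = { n0+n1, m0+m1, n2+n3, m2+m3, ... }
* and the other pairwise helpers substitute their own OP.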
828 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
829 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
831 intptr_t i, opr_sz = simd_oprsz(desc); \
832 for (i = 0; i < opr_sz; ) { \
833 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
835 TYPE n0 = *(TYPE *)(vn + H(i)); \
836 TYPE m0 = *(TYPE *)(vm + H(i)); \
837 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
838 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
840 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
844 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
846 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
851 /* Similarly, specialized for 64-bit operands. */
852 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
853 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
855 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
856 TYPE *d = vd, *n = vn, *m = vm; \
858 for (i = 0; i < opr_sz; i += 2) { \
859 TYPE n0 = n[i], n1 = n[i + 1]; \
860 TYPE m0 = m[i], m1 = m[i + 1]; \
861 if (pg[H1(i)] & 1) { \
864 if (pg[H1(i + 1)] & 1) { \
865 d[i + 1] = OP(m0, m1); \
870 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
871 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
872 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
873 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
875 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
876 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
877 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
878 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
880 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
881 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
882 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
883 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
885 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
886 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
887 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
888 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
890 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
891 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
892 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
893 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
896 #undef DO_ZPZZ_PAIR_D
898 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
899 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
900 void *status, uint32_t desc) \
902 intptr_t i, opr_sz = simd_oprsz(desc); \
903 for (i = 0; i < opr_sz; ) { \
904 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
906 TYPE n0 = *(TYPE *)(vn + H(i)); \
907 TYPE m0 = *(TYPE *)(vm + H(i)); \
908 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
909 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
911 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
913 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
915 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
917 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
922 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
923 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
924 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add)
926 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
927 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
928 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum)
930 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
931 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
932 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum)
934 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
935 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
936 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max)
938 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
939 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
940 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min)
942 #undef DO_ZPZZ_PAIR_FP
944 /* Three-operand expander, controlled by a predicate, in which the
945 * third operand is "wide". That is, for D = N op M, the same 64-bit
946 * value of M is used with all of the narrower values of N.
948 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
949 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
951 intptr_t i, opr_sz = simd_oprsz(desc); \
952 for (i = 0; i < opr_sz; ) { \
953 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
954 TYPEW mm = *(TYPEW *)(vm + i); \
957 TYPE nn = *(TYPE *)(vn + H(i)); \
958 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
960 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
965 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
966 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
967 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
969 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
970 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
971 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
973 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
974 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
975 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
979 /* Fully general two-operand expander, controlled by a predicate.
981 #define DO_ZPZ(NAME, TYPE, H, OP) \
982 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
984 intptr_t i, opr_sz = simd_oprsz(desc); \
985 for (i = 0; i < opr_sz; ) { \
986 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
989 TYPE nn = *(TYPE *)(vn + H(i)); \
990 *(TYPE *)(vd + H(i)) = OP(nn); \
992 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
997 /* Similarly, specialized for 64-bit operands. */
998 #define DO_ZPZ_D(NAME, TYPE, OP) \
999 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1001 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1002 TYPE *d = vd, *n = vn; \
1004 for (i = 0; i < opr_sz; i += 1) { \
1005 if (pg[H1(i)] & 1) { \
1012 #define DO_CLS_B(N) (clrsb32(N) - 24)
1013 #define DO_CLS_H(N) (clrsb32(N) - 16)
1015 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
1016 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
1017 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
1018 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
1020 #define DO_CLZ_B(N) (clz32(N) - 24)
1021 #define DO_CLZ_H(N) (clz32(N) - 16)
1023 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
1024 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
1025 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
1026 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
1028 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
1029 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
1030 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
1031 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
1033 #define DO_CNOT(N) (N == 0)
1035 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
1036 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
1037 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
1038 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
1040 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
1042 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
1043 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
1044 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
1046 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
1048 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
1049 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
1050 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
1052 #define DO_NOT(N) (~N)
1054 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
1055 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
1056 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
1057 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
1059 #define DO_SXTB(N) ((int8_t)N)
1060 #define DO_SXTH(N) ((int16_t)N)
1061 #define DO_SXTS(N) ((int32_t)N)
1062 #define DO_UXTB(N) ((uint8_t)N)
1063 #define DO_UXTH(N) ((uint16_t)N)
1064 #define DO_UXTS(N) ((uint32_t)N)
1066 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
1067 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
1068 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
1069 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
1070 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
1071 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
1073 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
1074 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
1075 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
1076 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
1077 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
1078 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
1080 #define DO_ABS(N) (N < 0 ? -N : N)
1082 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
1083 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
1084 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
1085 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
1087 #define DO_NEG(N) (-N)
1089 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
1090 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
1091 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
1092 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
1094 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
1095 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
1096 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
1098 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
1099 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
1101 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
1103 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
1104 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
1105 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
1106 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
1108 #define DO_SQABS(X) \
1109 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1110 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
1112 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
1113 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
1114 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
1115 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
1117 #define DO_SQNEG(X) \
1118 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1119 x_ == min_ ? -min_ - 1 : -x_; })
1121 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
1122 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
1123 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
1124 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
1126 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
1127 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1129 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1131 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1132 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1134 intptr_t i, opr_sz = simd_oprsz(desc); \
1135 for (i = 0; i < opr_sz; ) { \
1136 TYPEW mm = *(TYPEW *)(vm + i); \
1138 TYPE nn = *(TYPE *)(vn + H(i)); \
1139 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1140 i += sizeof(TYPE); \
1145 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1146 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1147 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1149 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1150 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1151 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1153 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1154 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1155 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1172 * Three-operand expander, unpredicated, in which the two inputs are
1173 * selected from the top or bottom half of the wide column.
1175 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1176 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1178 intptr_t i, opr_sz = simd_oprsz(desc); \
1179 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1180 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1181 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1182 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1183 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1184 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1188 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1189 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1190 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)
1192 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1193 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1194 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)
1196 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1197 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1198 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)
1200 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1201 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1202 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1204 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1205 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1206 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1208 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1209 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1210 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1212 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1213 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1214 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)
1216 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1217 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1218 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
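/*
 * The bottom/top instruction forms (e.g. SADDLB, SADDLT, SADDLBT) all share
 * the helpers above; they differ only in the two desc bits read into
 * sel1/sel2, which pick the even or odd narrow element of each wide column.
 */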
1220 /* Note that the multiply cannot overflow, but the doubling can. */
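/*
 * For example do_sqdmull_h(-128, -128): the product 16384 fits in int16_t,
 * but doubling it would give 32768, so DO_SQADD_H saturates to INT16_MAX.
 */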
1221 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1223 int16_t val = n * m;
1224 return DO_SQADD_H(val, val);
1227 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1229 int32_t val = n * m;
1230 return DO_SQADD_S(val, val);
1233 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1235 int64_t val = n * m;
1236 return do_sqadd_d(val, val);
1239 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1240 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1241 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
1245 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1246 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1248 intptr_t i, opr_sz = simd_oprsz(desc); \
1249 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1250 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1251 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1252 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1253 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1257 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1258 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1259 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)
1261 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1262 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1263 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)
1265 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1266 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1267 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1269 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1270 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1271 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1275 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1276 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1278 intptr_t i, opr_sz = simd_oprsz(desc); \
1279 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1280 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1281 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1282 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1283 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1284 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1288 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1289 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1290 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1291 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
1295 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1296 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1298 intptr_t i, opr_sz = simd_oprsz(desc); \
1299 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1300 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1301 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1302 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1303 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1304 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1308 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1309 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1310 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)
1312 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1313 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1314 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1316 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1317 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1318 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, , H1_4, DO_MUL)
1320 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1321 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1322 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1324 #define DO_NMUL(N, M) -(N * M)
1326 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1327 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1328 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, , H1_4, DO_NMUL)
1330 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1331 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1332 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, , H1_4, DO_NMUL)
1336 #define DO_XTNB(NAME, TYPE, OP) \
1337 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1339 intptr_t i, opr_sz = simd_oprsz(desc); \
1340 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1341 TYPE nn = *(TYPE *)(vn + i); \
1342 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1343 *(TYPE *)(vd + i) = nn; \
1347 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1348 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1350 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1351 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1352 TYPE nn = *(TYPE *)(vn + i); \
1353 *(TYPEN *)(vd + i + odd) = OP(nn); \
1357 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1358 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1359 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1361 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1362 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1363 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1365 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1366 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1367 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1369 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1370 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1371 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1373 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1374 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1375 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1377 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1378 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1379 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1381 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1382 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1383 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1385 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1386 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1387 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1392 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1394 intptr_t i, opr_sz = simd_oprsz(desc);
1395 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1396 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1397 uint32_t *a = va, *n = vn;
1398 uint64_t *d = vd, *m = vm;
1400 for (i = 0; i < opr_sz / 8; ++i) {
1401 uint32_t e1 = a[2 * i + H4(0)];
1402 uint32_t e2 = n[2 * i + sel] ^ inv;
1403 uint64_t c = extract64(m[i], 32, 1);
1404 /* Compute and store the entire 33-bit result at once. */
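/* The carry-out lands in bit 32 of d[i], which is exactly where a
   chained ADCLB/ADCLT reads it back via extract64(m[i], 32, 1) above. */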
1409 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1411 intptr_t i, opr_sz = simd_oprsz(desc);
1412 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1413 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1414 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1416 for (i = 0; i < opr_sz / 8; i += 2) {
1417 Int128 e1 = int128_make64(a[i]);
1418 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1419 Int128 c = int128_make64(m[i + 1] & 1);
1420 Int128 r = int128_add(int128_add(e1, e2), c);
1421 d[i + 0] = int128_getlo(r);
1422 d[i + 1] = int128_gethi(r);
1426 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1427 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1429 intptr_t i, opr_sz = simd_oprsz(desc); \
1430 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1431 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1432 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1433 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1434 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1435 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1436 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1440 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1441 do_sqdmull_h, DO_SQADD_H)
1442 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1443 do_sqdmull_s, DO_SQADD_S)
1444 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, , H1_4,
1445 do_sqdmull_d, do_sqadd_d)
1447 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1448 do_sqdmull_h, DO_SQSUB_H)
1449 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1450 do_sqdmull_s, DO_SQSUB_S)
1451 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, , H1_4,
1452 do_sqdmull_d, do_sqsub_d)
1456 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1457 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1459 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1460 int rot = simd_data(desc); \
1461 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1462 bool sub_r = rot == 1 || rot == 2; \
1463 bool sub_i = rot >= 2; \
1464 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1465 for (i = 0; i < opr_sz; i += 2) { \
1466 TYPE elt1_a = n[H(i + sel_a)]; \
1467 TYPE elt2_a = m[H(i + sel_a)]; \
1468 TYPE elt2_b = m[H(i + sel_b)]; \
1469 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1470 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1474 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
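/*
 * For the complex multiply-add expansions below, desc holds the rotation
 * as a multiple of 90 degrees (0..3); sel_a/sel_b choose the real or
 * imaginary input element and sub_r/sub_i negate the product for the
 * rotated cases.
 */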
1476 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1477 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1478 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1479 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, , DO_CMLA)
1481 #define DO_SQRDMLAH_B(N, M, A, S) \
1482 do_sqrdmlah_b(N, M, A, S, true)
1483 #define DO_SQRDMLAH_H(N, M, A, S) \
1484 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1485 #define DO_SQRDMLAH_S(N, M, A, S) \
1486 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1487 #define DO_SQRDMLAH_D(N, M, A, S) \
1488 do_sqrdmlah_d(N, M, A, S, true)
1490 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1491 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1492 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1493 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, , DO_SQRDMLAH_D)
1497 #undef DO_SQRDMLAH_B
1498 #undef DO_SQRDMLAH_H
1499 #undef DO_SQRDMLAH_S
1500 #undef DO_SQRDMLAH_D
1502 #define DO_BITPERM(NAME, TYPE, OP) \
1503 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1505 intptr_t i, opr_sz = simd_oprsz(desc); \
1506 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1507 TYPE nn = *(TYPE *)(vn + i); \
1508 TYPE mm = *(TYPE *)(vm + i); \
1509 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1513 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1518 for (db = 0; db < n; ++db) {
1519 if ((mask >> db) & 1) {
1520 res |= ((data >> db) & 1) << rb;
1527 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1528 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1529 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1530 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1532 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1537 for (rb = 0; rb < n; ++rb) {
1538 if ((mask >> rb) & 1) {
1539 res |= ((data >> db) & 1) << rb;
1546 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1547 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1548 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1549 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
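/*
 * e.g. bitextract(0xb2, 0x55, 8) == 0x04: the data bits under the mask are
 * gathered down towards bit 0. bitdeposit(0x04, 0x55, 8) == 0x10 performs
 * the inverse, scattering the low data bits out to the mask positions.
 */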
1551 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1553 uint64_t resm = 0, resu = 0;
1554 int db, rbm = 0, rbu = 0;
1556 for (db = 0; db < n; ++db) {
1557 uint64_t val = (data >> db) & 1;
1558 if ((mask >> db) & 1) {
1559 resm |= val << rbm++;
1561 resu |= val << rbu++;
1565 return resm | (resu << rbm);
1568 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1569 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1570 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1571 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
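/*
 * e.g. bitgroup(0xb2, 0x55, 8) == 0xd4: the bits under the mask are packed
 * at the bottom of the result and the remaining bits above them.
 */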
1575 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1576 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1578 intptr_t i, opr_sz = simd_oprsz(desc); \
1579 int sub_r = simd_data(desc); \
1581 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1582 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1583 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1584 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1585 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1586 acc_r = ADD_OP(acc_r, el2_i); \
1587 acc_i = SUB_OP(acc_i, el2_r); \
1588 *(TYPE *)(vd + H(i)) = acc_r; \
1589 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1592 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1593 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1594 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1595 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1596 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1597 acc_r = SUB_OP(acc_r, el2_i); \
1598 acc_i = ADD_OP(acc_i, el2_r); \
1599 *(TYPE *)(vd + H(i)) = acc_r; \
1600 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1605 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1606 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1607 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1608 DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)
1610 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1611 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1612 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1613 DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
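/*
 * In DO_CADD above, sub_r (simd_data(desc)) selects between the two
 * 90-degree rotations: in one the imaginary input is added to the real
 * half and the real input subtracted from the imaginary half; the other
 * rotation does the opposite.
 */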
1617 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1618 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1620 intptr_t i, opr_sz = simd_oprsz(desc); \
1621 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1622 int shift = simd_data(desc) >> 1; \
1623 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1624 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1625 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1629 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1630 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1631 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)
1633 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1634 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1635 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
1639 /* Two-operand reduction expander, controlled by a predicate.
1640 * The difference between TYPERED and TYPERET has to do with
1641 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1642 * but TYPERET must be unsigned so that e.g. a 32-bit value
1643 * is not sign-extended to the ABI uint64_t return type.
1645 /* ??? If we were to vectorize this by hand the reduction ordering
1646 * would change. For integer operands, this is perfectly fine.
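* For example, sve_smaxv_b reduces with TYPERED = int8_t so that the
* comparisons are signed, but returns TYPERET = uint8_t so the 8-bit
* result is zero-extended rather than sign-extended into the uint64_t
* ABI return value.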
1648 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1649 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1651 intptr_t i, opr_sz = simd_oprsz(desc); \
1652 TYPERED ret = INIT; \
1653 for (i = 0; i < opr_sz; ) { \
1654 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1657 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1658 ret = OP(ret, nn); \
1660 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1663 return (TYPERET)ret; \
1666 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1667 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1669 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1673 for (i = 0; i < opr_sz; i += 1) { \
1674 if (pg[H1(i)] & 1) { \
1676 ret = OP(ret, nn); \
1682 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1683 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1684 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1685 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1687 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1688 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1689 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1690 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1692 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1693 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1694 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1695 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1697 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1698 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1699 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1701 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1702 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1703 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1704 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1706 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1707 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1708 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1709 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1711 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1712 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1713 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1714 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1716 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1717 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1718 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1719 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1721 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1722 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1723 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1724 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
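/*
 * E.g. sve_smaxv_s reduces in int32_t (so DO_MAX compares signed) but
 * returns through uint32_t, so the result is zero-extended rather than
 * sign-extended into the uint64_t ABI return value.
 */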
1729 /* Two vector operand, one scalar operand, unpredicated. */
1730 #define DO_ZZI(NAME, TYPE, OP) \
1731 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1733 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1734 TYPE s = s64, *d = vd, *n = vn; \
1735 for (i = 0; i < opr_sz; ++i) { \
1736 d[i] = OP(n[i], s); \
1740 #define DO_SUBR(X, Y) (Y - X)
1742 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1743 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1744 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1745 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
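/*
 * Note that DO_SUBR reverses its operands, so the subri helpers above
 * compute imm - Zn rather than Zn - imm; the min/max helpers below are
 * symmetric, so the operand order does not matter for them.
 */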
1747 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1748 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1749 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1750 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1752 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1753 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1754 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1755 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1757 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1758 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1759 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1760 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1762 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1763 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1764 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1765 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1785 /* Similar to the ARM LastActiveElement pseudocode function, except the
1786    result is multiplied by the element size. This includes the not-found
1787    indication; e.g. the not-found result for esz=3 is -8. */
1788 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1790 uint64_t mask = pred_esz_masks[esz];
1794 uint64_t this_g = g[--i] & mask;
1796 return i * 64 + (63 - clz64(this_g));
1799 return (intptr_t)-1 << esz;
1802 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1804 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1805 uint32_t flags = PREDTEST_INIT;
1806 uint64_t *d = vd, *g = vg;
1810 uint64_t this_d = d[i];
1811 uint64_t this_g = g[i];
1815 /* Set in D the first bit of G. */
1816 this_d |= this_g & -this_g;
1819 flags = iter_predtest_fwd(this_d, this_g, flags);
1821 } while (++i < words);
1826 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1828 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1829 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1830 uint32_t flags = PREDTEST_INIT;
1831 uint64_t *d = vd, *g = vg, esz_mask;
1834 next = last_active_element(vd, words, esz) + (1 << esz);
1835 esz_mask = pred_esz_masks[esz];
1837 /* Similar to the pseudocode for pnext, but scaled by ESZ
1838 so that we find the correct bit. */
1839 if (next < words * 64) {
1843 mask = ~((1ull << (next & 63)) - 1);
1847 uint64_t this_g = g[next / 64] & esz_mask & mask;
1849 next = (next & -64) + ctz64(this_g);
1854 } while (next < words * 64);
1859 uint64_t this_d = 0;
1860 if (i == next / 64) {
1861 this_d = 1ull << (next & 63);
1864 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1865 } while (++i < words);
1871  * Copy Zn into Zd, storing zeros into the inactive elements.
1872  * If INV is set, store zeros into the active elements instead.
1874 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1876 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1877 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1878 uint64_t *d = vd, *n = vn;
1881 for (i = 0; i < opr_sz; i += 1) {
1882 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1886 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1888 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1889 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1890 uint64_t *d = vd, *n = vn;
1893 for (i = 0; i < opr_sz; i += 1) {
1894 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1898 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1900 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1901 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1902 uint64_t *d = vd, *n = vn;
1905 for (i = 0; i < opr_sz; i += 1) {
1906 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1910 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1912 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1913 uint64_t *d = vd, *n = vn;
1915 uint8_t inv = simd_data(desc);
1917 for (i = 0; i < opr_sz; i += 1) {
1918 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
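    /*
     * The movz helpers rely on expand_pred_b/h/s turning each predicate
     * bit into an all-ones element mask, so a plain 64-bit AND keeps the
     * selected elements; XORing the mask with INV (all-zeros or all-ones)
     * flips which set of elements is kept.
     */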
1922 /* Three-operand expander, immediate operand, controlled by a predicate.
1924 #define DO_ZPZI(NAME, TYPE, H, OP) \
1925 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1927 intptr_t i, opr_sz = simd_oprsz(desc); \
1928 TYPE imm = simd_data(desc); \
1929 for (i = 0; i < opr_sz; ) { \
1930 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1933 TYPE nn = *(TYPE *)(vn + H(i)); \
1934 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1936 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1941 /* Similarly, specialized for 64-bit operands. */
1942 #define DO_ZPZI_D(NAME, TYPE, OP) \
1943 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1945 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1946 TYPE *d = vd, *n = vn; \
1947 TYPE imm = simd_data(desc); \
1949 for (i = 0; i < opr_sz; i += 1) { \
1950 if (pg[H1(i)] & 1) { \
1952 d[i] = OP(nn, imm); \
1957 #define DO_SHR(N, M) (N >> M)
1958 #define DO_SHL(N, M) (N << M)
1960 /* Arithmetic shift right for division. This rounds negative numbers
1961    toward zero as per signed division. Therefore, when N is
1962    negative, add 2**M - 1 before shifting. */
1963 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
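/*
 * E.g. DO_ASRD(-5, 1) computes (-5 + 1) >> 1 = -2, matching -5 / 2,
 * whereas a plain arithmetic shift right would give -3.
 */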
1965 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
1967 if (likely(sh < 64)) {
1968 return (x >> sh) + ((x >> (sh - 1)) & 1);
1969 } else if (sh == 64) {
1976 static inline int64_t do_srshr(int64_t x, unsigned sh)
1978 if (likely(sh < 64)) {
1979 return (x >> sh) + ((x >> (sh - 1)) & 1);
1981 /* Rounding the sign bit always produces 0. */
1986 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1987 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1988 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1989 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1991 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1992 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1993 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1994 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1996 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1997 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1998 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1999 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2001 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2002 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2003 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2004 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2010 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2011 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2013 intptr_t i, opr_sz = simd_oprsz(desc); \
2014 int shift = simd_data(desc); \
2015 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2016 TYPEW nn = *(TYPEW *)(vn + i); \
2017 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2021 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2022 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2024 intptr_t i, opr_sz = simd_oprsz(desc); \
2025 int shift = simd_data(desc); \
2026 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2027 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2028 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2032 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2033 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2034 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2036 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2037 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2038 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, , H1_4, DO_SHR)
2040 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2041 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2042 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2044 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2045 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2046 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, , H1_4, do_urshr)
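/*
 * The ...NB forms write each narrowed result to the bottom half of the
 * wide element (zeroing the top), while the ...NT forms write only the
 * top half, leaving the bottom untouched.  The rounding forms add the
 * most significant discarded bit before truncating, e.g.
 * do_urshr(0x17, 3) = (0x17 >> 3) + ((0x17 >> 2) & 1) = 3,
 * i.e. 23 / 8 = 2.875 rounded to the nearest integer.
 */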
2048 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2049 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2050 #define DO_SQSHRUN_D(x, sh) \
2051 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2053 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2054 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2055 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2057 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2058 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2059 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, , H1_4, DO_SQSHRUN_D)
2061 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2062 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2063 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2065 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2066 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2067 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2069 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2070 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2071 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRUN_D)
2073 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2074 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2075 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2077 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2078 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2079 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2081 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2082 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2083 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, , H1_4, DO_SQSHRN_D)
2085 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2086 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2087 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2089 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2090 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2091 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2093 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2094 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2095 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRN_D)
2097 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2098 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2099 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2101 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2102 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2103 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2105 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2106 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2107 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQSHRN_D)
2109 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2110 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2111 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2113 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2114 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2115 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2117 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2118 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2119 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQRSHRN_D)
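/*
 * The saturating forms clamp the shifted result to the bounds given in
 * each macro via do_sat_bhs().  E.g. with a shift of 4, the int16_t
 * value 0x2345 narrows to 0x7f for SQSHRN (564 saturates to INT8_MAX),
 * while 0x0345 narrows to 0x34 without saturating.
 */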
2124 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2125 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2127 intptr_t i, opr_sz = simd_oprsz(desc); \
2128 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2129 TYPEW nn = *(TYPEW *)(vn + i); \
2130 TYPEW mm = *(TYPEW *)(vm + i); \
2131 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2135 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2136 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2138 intptr_t i, opr_sz = simd_oprsz(desc); \
2139 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2140 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2141 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2142 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2146 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2147 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2148 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2149 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
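/*
 * These keep only the high half of the (optionally rounded) sum or
 * difference.  E.g. for 16-bit inputs, DO_ADDHN(0x1280, 0, 8) = 0x12,
 * while DO_RADDHN(0x1280, 0, 8) = (0x1280 + 0x80) >> 8 = 0x13.
 */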
2151 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2152 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2153 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2155 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2156 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2157 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_ADDHN)
2159 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2160 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2161 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2163 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2164 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2165 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RADDHN)
2167 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2168 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2169 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2171 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2172 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2173 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_SUBHN)
2175 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2176 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2177 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2179 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2180 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2181 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RSUBHN)
2190 /* Fully general four-operand expander, controlled by a predicate.
2192 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2193 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2194 void *vg, uint32_t desc) \
2196 intptr_t i, opr_sz = simd_oprsz(desc); \
2197 for (i = 0; i < opr_sz; ) { \
2198 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2201 TYPE nn = *(TYPE *)(vn + H(i)); \
2202 TYPE mm = *(TYPE *)(vm + H(i)); \
2203 TYPE aa = *(TYPE *)(va + H(i)); \
2204 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2206 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2211 /* Similarly, specialized for 64-bit operands. */
2212 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2213 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2214 void *vg, uint32_t desc) \
2216 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2217 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2219 for (i = 0; i < opr_sz; i += 1) { \
2220 if (pg[H1(i)] & 1) { \
2221 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2222 d[i] = OP(aa, nn, mm); \
2227 #define DO_MLA(A, N, M) (A + N * M)
2228 #define DO_MLS(A, N, M) (A - N * M)
2230 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2231 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2233 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2234 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2236 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2237 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2239 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2240 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2247 void HELPER(sve_index_b)(void *vd, uint32_t start,
2248 uint32_t incr, uint32_t desc)
2250 intptr_t i, opr_sz = simd_oprsz(desc);
2252 for (i = 0; i < opr_sz; i += 1) {
2253 d[H1(i)] = start + i * incr;
2257 void HELPER(sve_index_h)(void *vd, uint32_t start,
2258 uint32_t incr, uint32_t desc)
2260 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2262 for (i = 0; i < opr_sz; i += 1) {
2263 d[H2(i)] = start + i * incr;
2267 void HELPER(sve_index_s)(void *vd, uint32_t start,
2268 uint32_t incr, uint32_t desc)
2270 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2272 for (i = 0; i < opr_sz; i += 1) {
2273 d[H4(i)] = start + i * incr;
2277 void HELPER(sve_index_d)(void *vd, uint64_t start,
2278 uint64_t incr, uint32_t desc)
2280 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2282 for (i = 0; i < opr_sz; i += 1) {
2283 d[i] = start + i * incr;
2287 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2289 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2290 uint32_t sh = simd_data(desc);
2291 uint32_t *d = vd, *n = vn, *m = vm;
2292 for (i = 0; i < opr_sz; i += 1) {
2293 d[i] = n[i] + (m[i] << sh);
2297 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2299 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2300 uint64_t sh = simd_data(desc);
2301 uint64_t *d = vd, *n = vn, *m = vm;
2302 for (i = 0; i < opr_sz; i += 1) {
2303 d[i] = n[i] + (m[i] << sh);
2307 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2309 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310 uint64_t sh = simd_data(desc);
2311 uint64_t *d = vd, *n = vn, *m = vm;
2312 for (i = 0; i < opr_sz; i += 1) {
2313 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2317 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2319 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2320 uint64_t sh = simd_data(desc);
2321 uint64_t *d = vd, *n = vn, *m = vm;
2322 for (i = 0; i < opr_sz; i += 1) {
2323 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2327 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2329     /* These constants are cut and pasted directly from the ARM pseudocode. */
2330 static const uint16_t coeff[] = {
2331 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2332 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2333 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2334 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2336 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2337 uint16_t *d = vd, *n = vn;
2339 for (i = 0; i < opr_sz; i++) {
2341 intptr_t idx = extract32(nn, 0, 5);
2342 uint16_t exp = extract32(nn, 5, 5);
2343 d[i] = coeff[idx] | (exp << 10);
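    /*
     * The low 5 bits of the input index the 2^(idx/32) fraction table
     * and the next 5 bits are placed directly in the float16 exponent
     * field.  E.g. an input of 0x01f0 (exp = 15, idx = 16) produces
     * 0x3c00 | 0x01a8 = 0x3da8, i.e. approximately sqrt(2).
     */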
2347 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2349     /* These constants are cut and pasted directly from the ARM pseudocode. */
2350 static const uint32_t coeff[] = {
2351 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2352 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2353 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2354 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2355 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2356 0x1ef532, 0x20b051, 0x227043, 0x243516,
2357 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2358 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2359 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2360 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2361 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2362 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2363 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2364 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2365 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2366 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2368 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2369 uint32_t *d = vd, *n = vn;
2371 for (i = 0; i < opr_sz; i++) {
2373 intptr_t idx = extract32(nn, 0, 6);
2374 uint32_t exp = extract32(nn, 6, 8);
2375 d[i] = coeff[idx] | (exp << 23);
2379 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2381     /* These constants are cut and pasted directly from the ARM pseudocode. */
2382 static const uint64_t coeff[] = {
2383 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2384 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2385 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2386 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2387 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2388 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2389 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2390 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2391 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2392 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2393 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2394 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2395 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2396 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2397 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2398 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2399 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2400 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2401 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2402 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2403 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2406 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2407 uint64_t *d = vd, *n = vn;
2409 for (i = 0; i < opr_sz; i++) {
2411 intptr_t idx = extract32(nn, 0, 6);
2412 uint64_t exp = extract32(nn, 6, 11);
2413 d[i] = coeff[idx] | (exp << 52);
2417 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2419 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2420 uint16_t *d = vd, *n = vn, *m = vm;
2421 for (i = 0; i < opr_sz; i += 1) {
2427 d[i] = nn ^ (mm & 2) << 14;
2431 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2433 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2434 uint32_t *d = vd, *n = vn, *m = vm;
2435 for (i = 0; i < opr_sz; i += 1) {
2441 d[i] = nn ^ (mm & 2) << 30;
2445 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2447 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2448 uint64_t *d = vd, *n = vn, *m = vm;
2449 for (i = 0; i < opr_sz; i += 1) {
2455 d[i] = nn ^ (mm & 2) << 62;
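    /*
     * In each of the helpers above, bit 1 of the second operand is
     * shifted up to the sign-bit position of the element and XORed in,
     * optionally negating the result as part of FTSSEL's quadrant
     * selection.
     */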
2460 * Signed saturating addition with scalar operand.
2463 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2465 intptr_t i, oprsz = simd_oprsz(desc);
2467 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2468 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2472 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2474 intptr_t i, oprsz = simd_oprsz(desc);
2476 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2477 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2481 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2483 intptr_t i, oprsz = simd_oprsz(desc);
2485 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2486 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2490 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2492 intptr_t i, oprsz = simd_oprsz(desc);
2494 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2495 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2500 * Unsigned saturating addition with scalar operand.
2503 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2505 intptr_t i, oprsz = simd_oprsz(desc);
2507 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2508 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2512 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2514 intptr_t i, oprsz = simd_oprsz(desc);
2516 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2517 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2521 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2523 intptr_t i, oprsz = simd_oprsz(desc);
2525 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2526 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2530 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2532 intptr_t i, oprsz = simd_oprsz(desc);
2534 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2535 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2539 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2541 intptr_t i, oprsz = simd_oprsz(desc);
2543 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2544 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2548 /* Two-operand predicated copy of an immediate, with merging. All valid
2549  * immediates fit within 17 signed bits in the simd_data field.
2551 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2552 uint64_t mm, uint32_t desc)
2554 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2555 uint64_t *d = vd, *n = vn;
2558 mm = dup_const(MO_8, mm);
2559 for (i = 0; i < opr_sz; i += 1) {
2561 uint64_t pp = expand_pred_b(pg[H1(i)]);
2562 d[i] = (mm & pp) | (nn & ~pp);
2566 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2567 uint64_t mm, uint32_t desc)
2569 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2570 uint64_t *d = vd, *n = vn;
2573 mm = dup_const(MO_16, mm);
2574 for (i = 0; i < opr_sz; i += 1) {
2576 uint64_t pp = expand_pred_h(pg[H1(i)]);
2577 d[i] = (mm & pp) | (nn & ~pp);
2581 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2582 uint64_t mm, uint32_t desc)
2584 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2585 uint64_t *d = vd, *n = vn;
2588 mm = dup_const(MO_32, mm);
2589 for (i = 0; i < opr_sz; i += 1) {
2591 uint64_t pp = expand_pred_s(pg[H1(i)]);
2592 d[i] = (mm & pp) | (nn & ~pp);
2596 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2597 uint64_t mm, uint32_t desc)
2599 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2600 uint64_t *d = vd, *n = vn;
2603 for (i = 0; i < opr_sz; i += 1) {
2605 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2609 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2611 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2615 val = dup_const(MO_8, val);
2616 for (i = 0; i < opr_sz; i += 1) {
2617 d[i] = val & expand_pred_b(pg[H1(i)]);
2621 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2623 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2627 val = dup_const(MO_16, val);
2628 for (i = 0; i < opr_sz; i += 1) {
2629 d[i] = val & expand_pred_h(pg[H1(i)]);
2633 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2635 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2639 val = dup_const(MO_32, val);
2640 for (i = 0; i < opr_sz; i += 1) {
2641 d[i] = val & expand_pred_s(pg[H1(i)]);
2645 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2647 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2651 for (i = 0; i < opr_sz; i += 1) {
2652 d[i] = (pg[H1(i)] & 1 ? val : 0);
2656 /* Big-endian hosts need to frob the byte indices. If the copy
2657  * happens to be 8-byte aligned, then no frobbing is necessary.
2659 static void swap_memmove(void *vd, void *vs, size_t n)
2661 uintptr_t d = (uintptr_t)vd;
2662 uintptr_t s = (uintptr_t)vs;
2663 uintptr_t o = (d | s | n) & 7;
2666 #ifndef HOST_WORDS_BIGENDIAN
2675 if (d < s || d >= s + n) {
2676 for (i = 0; i < n; i += 4) {
2677 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2680 for (i = n; i > 0; ) {
2682 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2689 if (d < s || d >= s + n) {
2690 for (i = 0; i < n; i += 2) {
2691 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2694 for (i = n; i > 0; ) {
2696 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2702 if (d < s || d >= s + n) {
2703 for (i = 0; i < n; i++) {
2704 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2707 for (i = n; i > 0; ) {
2709 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2716 /* Similarly for memset of 0. */
2717 static void swap_memzero(void *vd, size_t n)
2719 uintptr_t d = (uintptr_t)vd;
2720 uintptr_t o = (d | n) & 7;
2723 /* Usually, the first bit of a predicate is set, so N is 0. */
2724 if (likely(n == 0)) {
2728 #ifndef HOST_WORDS_BIGENDIAN
2737 for (i = 0; i < n; i += 4) {
2738 *(uint32_t *)H1_4(d + i) = 0;
2744 for (i = 0; i < n; i += 2) {
2745 *(uint16_t *)H1_2(d + i) = 0;
2750 for (i = 0; i < n; i++) {
2751 *(uint8_t *)H1(d + i) = 0;
2757 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2759 intptr_t opr_sz = simd_oprsz(desc);
2760 size_t n_ofs = simd_data(desc);
2761 size_t n_siz = opr_sz - n_ofs;
2764 swap_memmove(vd, vn + n_ofs, n_siz);
2765 swap_memmove(vd + n_siz, vm, n_ofs);
2766 } else if (vd != vn) {
2767 swap_memmove(vd + n_siz, vd, n_ofs);
2768 swap_memmove(vd, vn + n_ofs, n_siz);
2770 /* vd == vn == vm. Need temp space. */
2772 swap_memmove(&tmp, vm, n_ofs);
2773 swap_memmove(vd, vd + n_ofs, n_siz);
2774 memcpy(vd + n_siz, &tmp, n_ofs);
2778 #define DO_INSR(NAME, TYPE, H) \
2779 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2781 intptr_t opr_sz = simd_oprsz(desc); \
2782 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2783 *(TYPE *)(vd + H(0)) = val; \
2786 DO_INSR(sve_insr_b, uint8_t, H1)
2787 DO_INSR(sve_insr_h, uint16_t, H1_2)
2788 DO_INSR(sve_insr_s, uint32_t, H1_4)
2789 DO_INSR(sve_insr_d, uint64_t, )
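/*
 * INSR shifts the whole vector up by one element (swap_memmove copies
 * VN into VD at an offset of one element) and then writes the scalar
 * into element 0.
 */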
2793 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2795 intptr_t i, j, opr_sz = simd_oprsz(desc);
2796 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2797 uint64_t f = *(uint64_t *)(vn + i);
2798 uint64_t b = *(uint64_t *)(vn + j);
2799 *(uint64_t *)(vd + i) = bswap64(b);
2800 *(uint64_t *)(vd + j) = bswap64(f);
2804 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2806 intptr_t i, j, opr_sz = simd_oprsz(desc);
2807 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2808 uint64_t f = *(uint64_t *)(vn + i);
2809 uint64_t b = *(uint64_t *)(vn + j);
2810 *(uint64_t *)(vd + i) = hswap64(b);
2811 *(uint64_t *)(vd + j) = hswap64(f);
2815 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2817 intptr_t i, j, opr_sz = simd_oprsz(desc);
2818 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2819 uint64_t f = *(uint64_t *)(vn + i);
2820 uint64_t b = *(uint64_t *)(vn + j);
2821 *(uint64_t *)(vd + i) = rol64(b, 32);
2822 *(uint64_t *)(vd + j) = rol64(f, 32);
2826 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2828 intptr_t i, j, opr_sz = simd_oprsz(desc);
2829 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2830 uint64_t f = *(uint64_t *)(vn + i);
2831 uint64_t b = *(uint64_t *)(vn + j);
2832 *(uint64_t *)(vd + i) = b;
2833 *(uint64_t *)(vd + j) = f;
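    /*
     * Each of the REV helpers exchanges 64-bit chunks end for end and
     * then reverses the elements within each chunk (byte, half-word or
     * word swap as appropriate; no inner swap is needed for 64-bit
     * elements), which together reverse the element order of the vector.
     */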
2837 #define DO_TBL(NAME, TYPE, H) \
2838 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2840 intptr_t i, opr_sz = simd_oprsz(desc); \
2841 uintptr_t elem = opr_sz / sizeof(TYPE); \
2842 TYPE *d = vd, *n = vn, *m = vm; \
2844 if (unlikely(vd == vn)) { \
2845 n = memcpy(&tmp, vn, opr_sz); \
2847 for (i = 0; i < elem; i++) { \
2849 d[H(i)] = j < elem ? n[H(j)] : 0; \
2853 DO_TBL(sve_tbl_b, uint8_t, H1)
2854 DO_TBL(sve_tbl_h, uint16_t, H2)
2855 DO_TBL(sve_tbl_s, uint32_t, H4)
2856 DO_TBL(sve_tbl_d, uint64_t, )
2860 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
2861 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2863 intptr_t i, opr_sz = simd_oprsz(desc); \
2867 if (unlikely(vn - vd < opr_sz)) { \
2868 n = memcpy(&tmp, n, opr_sz / 2); \
2870 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
2871 d[HD(i)] = n[HS(i)]; \
2875 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
2876 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
2877 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
2879 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
2880 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
2881 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
2885 /* Mask of bits included in the even-numbered predicates of width esz.
2886 * We also use this for expand_bits/compress_bits, and so extend the
2887 * same pattern out to 16-bit units.
2889 static const uint64_t even_bit_esz_masks[5] = {
2890 0x5555555555555555ull,
2891 0x3333333333333333ull,
2892 0x0f0f0f0f0f0f0f0full,
2893 0x00ff00ff00ff00ffull,
2894 0x0000ffff0000ffffull,
2897 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
2898 * For N==0, this corresponds to the operation that in qemu/bitops.h
2899 * we call half_shuffle64; this algorithm is from Hacker's Delight,
2900 * section 7-2 Shuffling Bits.
2902 static uint64_t expand_bits(uint64_t x, int n)
2907 for (i = 4; i >= n; i--) {
2909 x = ((x << sh) | x) & even_bit_esz_masks[i];
2914 /* Compress units of 2**(N+1) bits to units of 2**N bits.
2915 * For N==0, this corresponds to the operation that in qemu/bitops.h
2916 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
2917 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
2919 static uint64_t compress_bits(uint64_t x, int n)
2923 for (i = n; i <= 4; i++) {
2925 x &= even_bit_esz_masks[i];
2928 return x & 0xffffffffu;
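    /*
     * As a worked example for N == 0: expand_bits(0b1011, 0) spreads each
     * bit to an even position, giving 0b01000101, and compress_bits
     * undoes this, gathering the even bits back into 0b1011.
     */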
2931 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2933 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2934 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2935 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
2936 int esize = 1 << esz;
2941 uint64_t nn = *(uint64_t *)vn;
2942 uint64_t mm = *(uint64_t *)vm;
2943 int half = 4 * oprsz;
2945 nn = extract64(nn, high * half, half);
2946 mm = extract64(mm, high * half, half);
2947 nn = expand_bits(nn, esz);
2948 mm = expand_bits(mm, esz);
2949 d[0] = nn | (mm << esize);
2951 ARMPredicateReg tmp;
2953 /* We produce output faster than we consume input.
2954 Therefore we must be mindful of possible overlap. */
2956 vn = memcpy(&tmp, vn, oprsz);
2960 } else if (vd == vm) {
2961 vm = memcpy(&tmp, vm, oprsz);
2967 if ((oprsz & 7) == 0) {
2968 uint32_t *n = vn, *m = vm;
2971 for (i = 0; i < oprsz / 8; i++) {
2972 uint64_t nn = n[H4(high + i)];
2973 uint64_t mm = m[H4(high + i)];
2975 nn = expand_bits(nn, esz);
2976 mm = expand_bits(mm, esz);
2977 d[i] = nn | (mm << esize);
2980 uint8_t *n = vn, *m = vm;
2983 for (i = 0; i < oprsz / 2; i++) {
2984 uint16_t nn = n[H1(high + i)];
2985 uint16_t mm = m[H1(high + i)];
2987 nn = expand_bits(nn, esz);
2988 mm = expand_bits(mm, esz);
2989 d16[H2(i)] = nn | (mm << esize);
2995 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2997 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2998 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2999 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3000 uint64_t *d = vd, *n = vn, *m = vm;
3005 l = compress_bits(n[0] >> odd, esz);
3006 h = compress_bits(m[0] >> odd, esz);
3007 d[0] = l | (h << (4 * oprsz));
3009 ARMPredicateReg tmp_m;
3010 intptr_t oprsz_16 = oprsz / 16;
3012 if ((vm - vd) < (uintptr_t)oprsz) {
3013 m = memcpy(&tmp_m, vm, oprsz);
3016 for (i = 0; i < oprsz_16; i++) {
3019 l = compress_bits(l >> odd, esz);
3020 h = compress_bits(h >> odd, esz);
3021 d[i] = l | (h << 32);
3025          * For a VL that is not a multiple of 512, the results from M do not
3026 * align nicely with the uint64_t for D. Put the aligned results
3027 * from M into TMP_M and then copy it into place afterward.
3030 int final_shift = (oprsz & 15) * 2;
3034 l = compress_bits(l >> odd, esz);
3035 h = compress_bits(h >> odd, esz);
3036 d[i] = l | (h << final_shift);
3038 for (i = 0; i < oprsz_16; i++) {
3041 l = compress_bits(l >> odd, esz);
3042 h = compress_bits(h >> odd, esz);
3043 tmp_m.p[i] = l | (h << 32);
3047 l = compress_bits(l >> odd, esz);
3048 h = compress_bits(h >> odd, esz);
3049 tmp_m.p[i] = l | (h << final_shift);
3051 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3053 for (i = 0; i < oprsz_16; i++) {
3056 l = compress_bits(l >> odd, esz);
3057 h = compress_bits(h >> odd, esz);
3058 d[oprsz_16 + i] = l | (h << 32);
3064 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3066 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3067 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3068 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3069 uint64_t *d = vd, *n = vn, *m = vm;
3076 mask = even_bit_esz_masks[esz];
3083 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3084 uint64_t nn = (n[i] & mask) >> shr;
3085 uint64_t mm = (m[i] & mask) << shl;
3090 /* Reverse units of 2**N bits. */
3091 static uint64_t reverse_bits_64(uint64_t x, int n)
3096 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3097 uint64_t mask = even_bit_esz_masks[i];
3098 x = ((x & mask) << sh) | ((x >> sh) & mask);
3103 static uint8_t reverse_bits_8(uint8_t x, int n)
3105 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3108 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3109 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3114 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3116 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3117 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3118 intptr_t i, oprsz_2 = oprsz / 2;
3121 uint64_t l = *(uint64_t *)vn;
3122 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3123 *(uint64_t *)vd = l;
3124 } else if ((oprsz & 15) == 0) {
3125 for (i = 0; i < oprsz_2; i += 8) {
3126 intptr_t ih = oprsz - 8 - i;
3127 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3128 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3129 *(uint64_t *)(vd + i) = h;
3130 *(uint64_t *)(vd + ih) = l;
3133 for (i = 0; i < oprsz_2; i += 1) {
3134 intptr_t il = H1(i);
3135 intptr_t ih = H1(oprsz - 1 - i);
3136 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3137 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3138 *(uint8_t *)(vd + il) = h;
3139 *(uint8_t *)(vd + ih) = l;
3144 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3146 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3147 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3152 uint64_t nn = *(uint64_t *)vn;
3153 int half = 4 * oprsz;
3155 nn = extract64(nn, high * half, half);
3156 nn = expand_bits(nn, 0);
3159 ARMPredicateReg tmp_n;
3161 /* We produce output faster than we consume input.
3162 Therefore we must be mindful of possible overlap. */
3163 if ((vn - vd) < (uintptr_t)oprsz) {
3164 vn = memcpy(&tmp_n, vn, oprsz);
3170 if ((oprsz & 7) == 0) {
3174 for (i = 0; i < oprsz / 8; i++) {
3175 uint64_t nn = n[H4(high + i)];
3176 d[i] = expand_bits(nn, 0);
3182 for (i = 0; i < oprsz / 2; i++) {
3183 uint16_t nn = n[H1(high + i)];
3184 d16[H2(i)] = expand_bits(nn, 0);
3190 #define DO_ZIP(NAME, TYPE, H) \
3191 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3193 intptr_t oprsz = simd_oprsz(desc); \
3194 intptr_t i, oprsz_2 = oprsz / 2; \
3195 ARMVectorReg tmp_n, tmp_m; \
3196 /* We produce output faster than we consume input. \
3197 Therefore we must be mindful of possible overlap. */ \
3198 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3199 vn = memcpy(&tmp_n, vn, oprsz_2); \
3201 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3202 vm = memcpy(&tmp_m, vm, oprsz_2); \
3204 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3205 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3206 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3210 DO_ZIP(sve_zip_b, uint8_t, H1)
3211 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3212 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3213 DO_ZIP(sve_zip_d, uint64_t, )
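/*
 * ZIP interleaves the low halves of the two inputs: the low half of VN
 * supplies the even-numbered result elements and the low half of VM the
 * odd-numbered ones, i.e. { n0, m0, n1, m1, ... }.
 */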
3215 #define DO_UZP(NAME, TYPE, H) \
3216 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3218 intptr_t oprsz = simd_oprsz(desc); \
3219 intptr_t oprsz_2 = oprsz / 2; \
3220 intptr_t odd_ofs = simd_data(desc); \
3222 ARMVectorReg tmp_m; \
3223 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3224 vm = memcpy(&tmp_m, vm, oprsz); \
3226 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3227 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
3229 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3230 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
3234 DO_UZP(sve_uzp_b, uint8_t, H1)
3235 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3236 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3237 DO_UZP(sve_uzp_d, uint64_t, )
3239 #define DO_TRN(NAME, TYPE, H) \
3240 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3242 intptr_t oprsz = simd_oprsz(desc); \
3243 intptr_t odd_ofs = simd_data(desc); \
3245 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3246 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3247 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3248 *(TYPE *)(vd + H(i + 0)) = ae; \
3249 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3253 DO_TRN(sve_trn_b, uint8_t, H1)
3254 DO_TRN(sve_trn_h, uint16_t, H1_2)
3255 DO_TRN(sve_trn_s, uint32_t, H1_4)
3256 DO_TRN(sve_trn_d, uint64_t, )
3262 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3264 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3265 uint32_t *d = vd, *n = vn;
3268 for (i = j = 0; i < opr_sz; i++) {
3269 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3270 d[H4(j)] = n[H4(i)];
3274 for (; j < opr_sz; j++) {
3279 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3281 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3282 uint64_t *d = vd, *n = vn;
3285 for (i = j = 0; i < opr_sz; i++) {
3286 if (pg[H1(i)] & 1) {
3291 for (; j < opr_sz; j++) {
3296 /* Similar to the ARM LastActiveElement pseudocode function, except the
3297 * result is multiplied by the element size. This includes the not found
3298 * indication; e.g. not found for esz=3 is -8.
3300 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3302 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3303 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3305 return last_active_element(vg, words, esz);
3308 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3310 intptr_t opr_sz = simd_oprsz(desc) / 8;
3311 int esz = simd_data(desc);
3312 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3313 intptr_t i, first_i, last_i;
3316 first_i = last_i = 0;
3317 first_g = last_g = 0;
3319 /* Find the extent of the active elements within VG. */
3320 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3321 pg = *(uint64_t *)(vg + i) & mask;
3334 first_i = first_i * 8 + ctz64(first_g);
3335 last_i = last_i * 8 + 63 - clz64(last_g);
3336 len = last_i - first_i + (1 << esz);
3338 vm = memcpy(&tmp, vm, opr_sz * 8);
3340 swap_memmove(vd, vn + first_i, len);
3342 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3345 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3346 void *vg, uint32_t desc)
3348 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3349 uint64_t *d = vd, *n = vn, *m = vm;
3352 for (i = 0; i < opr_sz; i += 1) {
3353 uint64_t nn = n[i], mm = m[i];
3354 uint64_t pp = expand_pred_b(pg[H1(i)]);
3355 d[i] = (nn & pp) | (mm & ~pp);
3359 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3360 void *vg, uint32_t desc)
3362 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3363 uint64_t *d = vd, *n = vn, *m = vm;
3366 for (i = 0; i < opr_sz; i += 1) {
3367 uint64_t nn = n[i], mm = m[i];
3368 uint64_t pp = expand_pred_h(pg[H1(i)]);
3369 d[i] = (nn & pp) | (mm & ~pp);
3373 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3374 void *vg, uint32_t desc)
3376 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3377 uint64_t *d = vd, *n = vn, *m = vm;
3380 for (i = 0; i < opr_sz; i += 1) {
3381 uint64_t nn = n[i], mm = m[i];
3382 uint64_t pp = expand_pred_s(pg[H1(i)]);
3383 d[i] = (nn & pp) | (mm & ~pp);
3387 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3388 void *vg, uint32_t desc)
3390 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3391 uint64_t *d = vd, *n = vn, *m = vm;
3394 for (i = 0; i < opr_sz; i += 1) {
3395 uint64_t nn = n[i], mm = m[i];
3396 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3400 /* Two operand comparison controlled by a predicate.
3401  * ??? It is very tempting to expand this inline
3402 * with x86 instructions, e.g.
3404 * vcmpeqw zm, zn, %ymm0
3405 * vpmovmskb %ymm0, %eax
3409 * or even aarch64, e.g.
3411 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3412 * cmeq v0.8h, zn, zm
3413 * and v0.8h, v0.8h, mask
3417 * However, coming up with an abstraction that allows vector inputs and
3418 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3419 * scalar outputs, is tricky.
3421 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3422 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3424 intptr_t opr_sz = simd_oprsz(desc); \
3425 uint32_t flags = PREDTEST_INIT; \
3426 intptr_t i = opr_sz; \
3428 uint64_t out = 0, pg; \
3430 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3431 TYPE nn = *(TYPE *)(vn + H(i)); \
3432 TYPE mm = *(TYPE *)(vm + H(i)); \
3435 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3437 *(uint64_t *)(vd + (i >> 3)) = out; \
3438 flags = iter_predtest_bwd(out, pg, flags); \
3443 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3444 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3445 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3446 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3447 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3448 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3449 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3450 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
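/*
 * The MASK constants above reflect the SVE predicate layout of one bit
 * per byte of vector data: every bit is significant for byte elements,
 * every second bit for half-words (0x5555...), every fourth for words
 * (0x1111...) and every eighth for double-words (0x0101...).
 */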
3452 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3453 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3454 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3455 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3457 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3458 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3459 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3460 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3462 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3463 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3464 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3465 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3467 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3468 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3469 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3470 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3472 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3473 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3474 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3475 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3477 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3478 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3479 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3480 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3482 #undef DO_CMP_PPZZ_B
3483 #undef DO_CMP_PPZZ_H
3484 #undef DO_CMP_PPZZ_S
3485 #undef DO_CMP_PPZZ_D
3488 /* Similar, but the second source is "wide". */
3489 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3490 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3492 intptr_t opr_sz = simd_oprsz(desc); \
3493 uint32_t flags = PREDTEST_INIT; \
3494 intptr_t i = opr_sz; \
3496 uint64_t out = 0, pg; \
3498 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3500 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3501 TYPE nn = *(TYPE *)(vn + H(i)); \
3505 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3507 *(uint64_t *)(vd + (i >> 3)) = out; \
3508 flags = iter_predtest_bwd(out, pg, flags); \
3513 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3514 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3515 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3516 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3517 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3518 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3520 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3521 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3522 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3524 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3525 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3526 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3528 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3529 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3530 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3532 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3533 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3534 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3536 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3537 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3538 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3540 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3541 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3542 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3544 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3545 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3546 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3548 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3549 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3550 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3552 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3553 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3554 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3556 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3557 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3558 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3560 #undef DO_CMP_PPZW_B
3561 #undef DO_CMP_PPZW_H
3562 #undef DO_CMP_PPZW_S
3565 /* Similar, but the second source is immediate. */
3566 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3567 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3569 intptr_t opr_sz = simd_oprsz(desc); \
3570 uint32_t flags = PREDTEST_INIT; \
3571 TYPE mm = simd_data(desc); \
3572 intptr_t i = opr_sz; \
3574 uint64_t out = 0, pg; \
3576 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3577 TYPE nn = *(TYPE *)(vn + H(i)); \
3580 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3582 *(uint64_t *)(vd + (i >> 3)) = out; \
3583 flags = iter_predtest_bwd(out, pg, flags); \
3588 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3589 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3590 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3591 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3592 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3593 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3594 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3595 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3597 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3598 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3599 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3600 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3602 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3603 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3604 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3605 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3607 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3608 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3609 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3610 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3612 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3613 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3614 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3615 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3617 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3618 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3619 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3620 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3622 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3623 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3624 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3625 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3627 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3628 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3629 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3630 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3632 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3633 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3634 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3635 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3637 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3638 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3639 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3640 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3642 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3643 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3644 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3645 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3647 #undef DO_CMP_PPZI_B
3648 #undef DO_CMP_PPZI_H
3649 #undef DO_CMP_PPZI_S
3650 #undef DO_CMP_PPZI_D
3653 /* Similar to the ARM LastActive pseudocode function. */
3654 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3658 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3659 uint64_t pg = *(uint64_t *)(vg + i);
3661 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3667 /* Compute a mask into RETB that is true for all G, up to and including
3668 * (if after) or excluding (if !after) the first G & N.
3669 * Return true if BRK found.
3671 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3672 bool brk, bool after)
3678 } else if ((g & n) == 0) {
3679 /* For all G, no N are set; break not found. */
3682 /* Break somewhere in N. Locate it. */
3683 b = g & n; /* guard true, pred true */
3684 b = b & -b; /* first such */
3686 b = b | (b - 1); /* break after same */
3688 b = b - 1; /* break before same */
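        /*
         * E.g. with g = 0b1111 and n = 0b0100, b ends up as 0b0111 when
         * AFTER is true (up to and including the first active N) and
         * 0b0011 when AFTER is false (up to but excluding it).
         */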
3697 /* Compute a zeroing BRK. */
3698 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3699 intptr_t oprsz, bool after)
3704 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3705 uint64_t this_b, this_g = g[i];
3707 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3708 d[i] = this_b & this_g;
3712 /* Likewise, but also compute flags. */
3713 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3714 intptr_t oprsz, bool after)
3716 uint32_t flags = PREDTEST_INIT;
3720 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3721 uint64_t this_b, this_d, this_g = g[i];
3723 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3724 d[i] = this_d = this_b & this_g;
3725 flags = iter_predtest_fwd(this_d, this_g, flags);
3730 /* Compute a merging BRK. */
3731 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3732 intptr_t oprsz, bool after)
3737 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3738 uint64_t this_b, this_g = g[i];
3740 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3741 d[i] = (this_b & this_g) | (d[i] & ~this_g);
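        /*
         * Unlike the zeroing form above, elements whose G bit is clear
         * keep their previous value in D here.
         */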
3745 /* Likewise, but also compute flags. */
3746 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3747 intptr_t oprsz, bool after)
3749 uint32_t flags = PREDTEST_INIT;
3753 for (i = 0; i < oprsz / 8; ++i) {
3754 uint64_t this_b, this_d = d[i], this_g = g[i];
3756 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3757 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3758 flags = iter_predtest_fwd(this_d, this_g, flags);
3763 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3765 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3766 * The compiler should turn this into 4 64-bit integer stores.
3768 memset(d, 0, sizeof(ARMPredicateReg));
3769 return PREDTEST_INIT;
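/*
 * Naming of the BRK helpers below (a rough guide):
 *   brka / brkb   - break after / break before the first active true element
 *   brkpa / brkpb - propagating forms, which first test (via
 *                   last_active_pred) whether the last active element
 *                   of Pn was true
 *   ..._z / ..._m - zeroing / merging predication
 *   ...s...       - variants that also return the PredTest NZCV flags
 */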
3772 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3775 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3776 if (last_active_pred(vn, vg, oprsz)) {
3777 compute_brk_z(vd, vm, vg, oprsz, true);
3783 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3786 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3787 if (last_active_pred(vn, vg, oprsz)) {
3788 return compute_brks_z(vd, vm, vg, oprsz, true);
3790 return do_zero(vd, oprsz);
3794 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3797 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3798 if (last_active_pred(vn, vg, oprsz)) {
3799 compute_brk_z(vd, vm, vg, oprsz, false);
3805 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
3808 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3809 if (last_active_pred(vn, vg, oprsz)) {
3810 return compute_brks_z(vd, vm, vg, oprsz, false);
3812 return do_zero(vd, oprsz);
3816 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3818 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3819 compute_brk_z(vd, vn, vg, oprsz, true);
3822 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3824 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3825 return compute_brks_z(vd, vn, vg, oprsz, true);
3828 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3830 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3831 compute_brk_z(vd, vn, vg, oprsz, false);
3834 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3836 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3837 return compute_brks_z(vd, vn, vg, oprsz, false);
3840 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3842 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3843 compute_brk_m(vd, vn, vg, oprsz, true);
3846 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3848 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3849 return compute_brks_m(vd, vn, vg, oprsz, true);
3852 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3854 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3855 compute_brk_m(vd, vn, vg, oprsz, false);
3858 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3860 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3861 return compute_brks_m(vd, vn, vg, oprsz, false);
3864 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3866 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3867 if (!last_active_pred(vn, vg, oprsz)) {
3872 /* As if PredTest(Ones(PL), D, esz). */
3873 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
3876 uint32_t flags = PREDTEST_INIT;
3879 for (i = 0; i < oprsz / 8; i++) {
3880 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
3883 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
3884 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
3889 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3891 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3892 if (last_active_pred(vn, vg, oprsz)) {
3893 return predtest_ones(vd, oprsz, -1);
3895 return do_zero(vd, oprsz);
3899 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
3901 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3902 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3903 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
3906 for (i = 0; i < words; ++i) {
        uint64_t t = n[i] & g[i] & mask;
        sum += ctpop64(t);
3913 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
3915 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3916 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3917 uint64_t esz_mask = pred_esz_masks[esz];
3918 ARMPredicateReg *d = vd;
3922 /* Begin with a zero predicate register. */
3923 flags = do_zero(d, oprsz);
3928 /* Set all of the requested bits. */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }
3936 return predtest_ones(d, oprsz, esz_mask);
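/*
 * For illustration: with esz == MO_32 (esz_mask 0x1111111111111111)
 * and count == 8, i.e. two active 4-byte elements, the word stored
 * above is MAKE_64BIT_MASK(0, 8) & esz_mask == 0x11 -- one predicate
 * bit set per active element.
 */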
3939 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
3941 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3942 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3943 uint64_t esz_mask = pred_esz_masks[esz];
3944 ARMPredicateReg *d = vd;
3945 intptr_t i, invcount, oprbits;
3949 return do_zero(d, oprsz);
3952 oprbits = oprsz * 8;
3953 tcg_debug_assert(count <= oprbits);
3957 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
3960 invcount = oprbits - count;
3961 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
3966 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
3972 return predtest_ones(d, oprsz, esz_mask);
3975 /* Recursive reduction on a function;
 * Cf. the ARM ARM function ReducePredicated.
3978 * While it would be possible to write this without the DATA temporary,
3979 * it is much simpler to process the predicate register this way.
3980 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
3981 * little to gain with a more complex non-recursive form.
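 *
 * The shape of the recursion, e.g. for n == 4, is roughly:
 *   reduce(data, 4) = FUNC(reduce(data, 2), reduce(data + 2, 2))
 *                   = FUNC(FUNC(data[0], data[1]), FUNC(data[2], data[3]))
 * Inactive elements were replaced by IDENT when DATA was filled in,
 * so they cannot affect the result.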
3983 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
3984 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
3990 TYPE lo = NAME##_reduce(data, status, half); \
3991 TYPE hi = NAME##_reduce(data + half, status, half); \
3992 return TYPE##_##FUNC(lo, hi, status); \
3995 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
3997 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
3998 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
3999 for (i = 0; i < oprsz; ) { \
4000 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4002 TYPE nn = *(TYPE *)(vn + H(i)); \
4003 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4004 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4007 for (; i < maxsz; i += sizeof(TYPE)) { \
4008 *(TYPE *)((void *)data + i) = IDENT; \
4010 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
4013 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4014 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4015 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
4017 /* Identity is floatN_default_nan, without the function call. */
4018 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4019 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4020 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
4022 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4023 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4024 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
4026 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4027 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4028 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
4030 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4031 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4032 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
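/*
 * The identities above are, roughly, values that leave the reduction
 * unchanged: 0 for add, +inf for min, -inf for max, and the default
 * NaN encoding for minnum/maxnum, where a quiet NaN operand is ignored
 * in favour of the other operand.
 */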
4036 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4037 void *status, uint32_t desc)
4039 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4040 float16 result = nn;
4043 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4046 float16 mm = *(float16 *)(vm + H1_2(i));
4047 result = float16_add(result, mm, status);
4049 i += sizeof(float16), pg >>= sizeof(float16);
4051 } while (i < opr_sz);
4056 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4057 void *status, uint32_t desc)
4059 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4060 float32 result = nn;
4063 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4066 float32 mm = *(float32 *)(vm + H1_2(i));
4067 result = float32_add(result, mm, status);
4069 i += sizeof(float32), pg >>= sizeof(float32);
4071 } while (i < opr_sz);
4076 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4077 void *status, uint32_t desc)
4079 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4083 for (i = 0; i < opr_sz; i++) {
4084 if (pg[H1(i)] & 1) {
4085 nn = float64_add(nn, m[i], status);
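/*
 * Note that FADDA above is an ordered (strictly sequential) accumulation
 * across the active elements, which is why it cannot reuse the pairwise
 * DO_REDUCE scheme used for FADDV.
 */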
4092 /* Fully general three-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
4095 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4096 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4097 void *status, uint32_t desc) \
4099 intptr_t i = simd_oprsz(desc); \
4102 uint64_t pg = g[(i - 1) >> 6]; \
4104 i -= sizeof(TYPE); \
4105 if (likely((pg >> (i & 63)) & 1)) { \
4106 TYPE nn = *(TYPE *)(vn + H(i)); \
4107 TYPE mm = *(TYPE *)(vm + H(i)); \
4108 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4114 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4115 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4116 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
4118 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4119 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4120 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
4122 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4123 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4124 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
4126 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4127 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4128 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
4130 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4131 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4132 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
4134 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4135 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4136 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
4138 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4139 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4140 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
4142 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4143 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4144 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
4146 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4148 return float16_abs(float16_sub(a, b, s));
4151 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4153 return float32_abs(float32_sub(a, b, s));
4156 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4158 return float64_abs(float64_sub(a, b, s));
4161 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4162 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4163 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
4165 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4167 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4168 return float64_scalbn(a, b_int, s);
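/*
 * The clamp above narrows the 64-bit element to the int argument of
 * float64_scalbn; saturating at INT_MIN/INT_MAX is sufficient, since
 * any exponent adjustment of that magnitude already overflows or
 * underflows to the same result.
 */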
4171 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4172 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4173 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
4175 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4176 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4177 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
4181 /* Three-operand expander, with one scalar operand, controlled by
4182 * a predicate, with the extra float_status parameter.
4184 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4185 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4186 void *status, uint32_t desc) \
    intptr_t i = simd_oprsz(desc);                                \
    TYPE mm = scalar;                                             \
4192 uint64_t pg = g[(i - 1) >> 6]; \
4194 i -= sizeof(TYPE); \
4195 if (likely((pg >> (i & 63)) & 1)) { \
4196 TYPE nn = *(TYPE *)(vn + H(i)); \
4197 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4203 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4204 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4205 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
4207 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4208 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4209 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
4211 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4212 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4213 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
4215 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4217 return float16_sub(b, a, s);
4220 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4222 return float32_sub(b, a, s);
4225 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4227 return float64_sub(b, a, s);
4230 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4231 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4232 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
4234 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4235 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4236 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
4238 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4239 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4240 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
4242 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4243 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4244 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
4246 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4247 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4248 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
4250 /* Fully general two-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
4253 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4254 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4256 intptr_t i = simd_oprsz(desc); \
4259 uint64_t pg = g[(i - 1) >> 6]; \
4261 i -= sizeof(TYPE); \
4262 if (likely((pg >> (i & 63)) & 1)) { \
4263 TYPE nn = *(TYPE *)(vn + H(i)); \
4264 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4270 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4271 * FZ16. When converting from fp16, this affects flushing input denormals;
4272 * when converting to fp16, this affects flushing output denormals.
4274 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4276 bool save = get_flush_inputs_to_zero(fpst);
4279 set_flush_inputs_to_zero(false, fpst);
4280 ret = float16_to_float32(f, true, fpst);
4281 set_flush_inputs_to_zero(save, fpst);
4285 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4287 bool save = get_flush_inputs_to_zero(fpst);
4290 set_flush_inputs_to_zero(false, fpst);
4291 ret = float16_to_float64(f, true, fpst);
4292 set_flush_inputs_to_zero(save, fpst);
4296 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4298 bool save = get_flush_to_zero(fpst);
4301 set_flush_to_zero(false, fpst);
4302 ret = float32_to_float16(f, true, fpst);
4303 set_flush_to_zero(save, fpst);
4307 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4309 bool save = get_flush_to_zero(fpst);
4312 set_flush_to_zero(false, fpst);
4313 ret = float64_to_float16(f, true, fpst);
4314 set_flush_to_zero(save, fpst);
4318 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4320 if (float16_is_any_nan(f)) {
4321 float_raise(float_flag_invalid, s);
4324 return float16_to_int16_round_to_zero(f, s);
4327 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4329 if (float16_is_any_nan(f)) {
4330 float_raise(float_flag_invalid, s);
4333 return float16_to_int64_round_to_zero(f, s);
4336 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4338 if (float32_is_any_nan(f)) {
4339 float_raise(float_flag_invalid, s);
4342 return float32_to_int64_round_to_zero(f, s);
4345 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4347 if (float64_is_any_nan(f)) {
4348 float_raise(float_flag_invalid, s);
4351 return float64_to_int64_round_to_zero(f, s);
4354 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4356 if (float16_is_any_nan(f)) {
4357 float_raise(float_flag_invalid, s);
4360 return float16_to_uint16_round_to_zero(f, s);
4363 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4365 if (float16_is_any_nan(f)) {
4366 float_raise(float_flag_invalid, s);
4369 return float16_to_uint64_round_to_zero(f, s);
4372 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4374 if (float32_is_any_nan(f)) {
4375 float_raise(float_flag_invalid, s);
4378 return float32_to_uint64_round_to_zero(f, s);
4381 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4383 if (float64_is_any_nan(f)) {
4384 float_raise(float_flag_invalid, s);
4387 return float64_to_uint64_round_to_zero(f, s);
4390 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4391 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4392 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
4393 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
4394 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
4395 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
4397 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4398 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4399 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4400 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
4401 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
4402 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
4403 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
4405 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4406 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4407 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4408 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
4409 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
4410 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
4411 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
4413 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4414 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4415 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
4417 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4418 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4419 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
4421 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4422 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4423 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
4425 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4426 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4427 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
4429 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4430 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4431 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4432 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
4433 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
4434 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
4435 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
4437 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4438 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4439 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4440 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
4441 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
4442 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
4443 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
4447 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4448 float_status *status, uint32_t desc,
4449 uint16_t neg1, uint16_t neg3)
4451 intptr_t i = simd_oprsz(desc);
4455 uint64_t pg = g[(i - 1) >> 6];
4458 if (likely((pg >> (i & 63)) & 1)) {
4459 float16 e1, e2, e3, r;
4461 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4462 e2 = *(uint16_t *)(vm + H1_2(i));
4463 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4464 r = float16_muladd(e1, e2, e3, 0, status);
4465 *(uint16_t *)(vd + H1_2(i)) = r;
4471 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4472 void *vg, void *status, uint32_t desc)
4474 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4477 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4478 void *vg, void *status, uint32_t desc)
4480 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4483 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4484 void *vg, void *status, uint32_t desc)
4486 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4489 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4490 void *vg, void *status, uint32_t desc)
4492 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4495 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4496 float_status *status, uint32_t desc,
4497 uint32_t neg1, uint32_t neg3)
4499 intptr_t i = simd_oprsz(desc);
4503 uint64_t pg = g[(i - 1) >> 6];
4506 if (likely((pg >> (i & 63)) & 1)) {
4507 float32 e1, e2, e3, r;
4509 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4510 e2 = *(uint32_t *)(vm + H1_4(i));
4511 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4512 r = float32_muladd(e1, e2, e3, 0, status);
4513 *(uint32_t *)(vd + H1_4(i)) = r;
4519 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4520 void *vg, void *status, uint32_t desc)
4522 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4525 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4526 void *vg, void *status, uint32_t desc)
4528 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4531 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4532 void *vg, void *status, uint32_t desc)
4534 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4537 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4538 void *vg, void *status, uint32_t desc)
4540 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4543 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4544 float_status *status, uint32_t desc,
4545 uint64_t neg1, uint64_t neg3)
4547 intptr_t i = simd_oprsz(desc);
4551 uint64_t pg = g[(i - 1) >> 6];
4554 if (likely((pg >> (i & 63)) & 1)) {
4555 float64 e1, e2, e3, r;
4557 e1 = *(uint64_t *)(vn + i) ^ neg1;
4558 e2 = *(uint64_t *)(vm + i);
4559 e3 = *(uint64_t *)(va + i) ^ neg3;
4560 r = float64_muladd(e1, e2, e3, 0, status);
4561 *(uint64_t *)(vd + i) = r;
4567 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4568 void *vg, void *status, uint32_t desc)
4570 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4573 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4574 void *vg, void *status, uint32_t desc)
4576 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4579 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4580 void *vg, void *status, uint32_t desc)
4582 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4585 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4586 void *vg, void *status, uint32_t desc)
4588 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4591 /* Two operand floating-point comparison controlled by a predicate.
4592 * Unlike the integer version, we are not allowed to optimistically
 * compare operands, since the comparison may have side effects wrt
 * the FPSR.
4596 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4597 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4598 void *status, uint32_t desc) \
4600 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4601 uint64_t *d = vd, *g = vg; \
4603 uint64_t out = 0, pg = g[j]; \
4605 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4606 if (likely((pg >> (i & 63)) & 1)) { \
4607 TYPE nn = *(TYPE *)(vn + H(i)); \
4608 TYPE mm = *(TYPE *)(vm + H(i)); \
4609 out |= OP(TYPE, nn, mm, status); \
4616 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4617 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4618 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4619 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4620 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4621 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
4623 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4624 DO_FPCMP_PPZZ_H(NAME, OP) \
4625 DO_FPCMP_PPZZ_S(NAME, OP) \
4626 DO_FPCMP_PPZZ_D(NAME, OP)
4628 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4629 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4630 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4631 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
4632 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4633 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4634 #define DO_FCMUO(TYPE, X, Y, ST) \
4635 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4636 #define DO_FACGE(TYPE, X, Y, ST) \
4637 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4638 #define DO_FACGT(TYPE, X, Y, ST) \
4639 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
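/*
 * GE/GT/ACGE/ACGT use the signalling TYPE##_compare, while EQ/NE/UO use
 * TYPE##_compare_quiet, matching the NaN behaviour of the corresponding
 * FP comparisons: quiet NaNs raise Invalid Operation only for the
 * signalling comparisons.
 */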
4641 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4642 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4643 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4644 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4645 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4646 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4647 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4649 #undef DO_FPCMP_PPZZ_ALL
4650 #undef DO_FPCMP_PPZZ_D
4651 #undef DO_FPCMP_PPZZ_S
4652 #undef DO_FPCMP_PPZZ_H
4653 #undef DO_FPCMP_PPZZ
/* One operand floating-point comparison against zero, controlled
 * by a predicate.
4658 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4659 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4660 void *status, uint32_t desc) \
4662 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4663 uint64_t *d = vd, *g = vg; \
4665 uint64_t out = 0, pg = g[j]; \
4667 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4668 if ((pg >> (i & 63)) & 1) { \
4669 TYPE nn = *(TYPE *)(vn + H(i)); \
4670 out |= OP(TYPE, nn, 0, status); \
4677 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4678 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4679 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4680 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4681 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4682 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
4684 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4685 DO_FPCMP_PPZ0_H(NAME, OP) \
4686 DO_FPCMP_PPZ0_S(NAME, OP) \
4687 DO_FPCMP_PPZ0_D(NAME, OP)
4689 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4690 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4691 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4692 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4693 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4694 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4696 /* FP Trig Multiply-Add. */
4698 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4700 static const float16 coeff[16] = {
4701 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4702 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4704 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4705 intptr_t x = simd_data(desc);
4706 float16 *d = vd, *n = vn, *m = vm;
4707 for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
4718 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4720 static const float32 coeff[16] = {
4721 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
4722 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
4723 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
4724 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
4726 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
4727 intptr_t x = simd_data(desc);
4728 float32 *d = vd, *n = vn, *m = vm;
4729 for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
4740 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4742 static const float64 coeff[16] = {
4743 0x3ff0000000000000ull, 0xbfc5555555555543ull,
4744 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
4745 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
4746 0x3de5d8408868552full, 0x0000000000000000ull,
4747 0x3ff0000000000000ull, 0xbfe0000000000000ull,
4748 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
4749 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
4750 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
4752 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
4753 intptr_t x = simd_data(desc);
4754 float64 *d = vd, *n = vn, *m = vm;
4755 for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
4770 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
4771 void *vs, uint32_t desc)
4773 intptr_t j, i = simd_oprsz(desc);
4775 float16 neg_imag = float16_set_sign(0, simd_data(desc));
4776 float16 neg_real = float16_chs(neg_imag);
4779 uint64_t pg = g[(i - 1) >> 6];
4781 float16 e0, e1, e2, e3;
4783 /* I holds the real index; J holds the imag index. */
4784 j = i - sizeof(float16);
4785 i -= 2 * sizeof(float16);
4787 e0 = *(float16 *)(vn + H1_2(i));
4788 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
4789 e2 = *(float16 *)(vn + H1_2(j));
4790 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
4792 if (likely((pg >> (i & 63)) & 1)) {
4793 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
4795 if (likely((pg >> (j & 63)) & 1)) {
4796 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
4802 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
4803 void *vs, uint32_t desc)
4805 intptr_t j, i = simd_oprsz(desc);
4807 float32 neg_imag = float32_set_sign(0, simd_data(desc));
4808 float32 neg_real = float32_chs(neg_imag);
4811 uint64_t pg = g[(i - 1) >> 6];
4813 float32 e0, e1, e2, e3;
4815 /* I holds the real index; J holds the imag index. */
4816 j = i - sizeof(float32);
4817 i -= 2 * sizeof(float32);
4819 e0 = *(float32 *)(vn + H1_2(i));
4820 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
4821 e2 = *(float32 *)(vn + H1_2(j));
4822 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
4824 if (likely((pg >> (i & 63)) & 1)) {
4825 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
4827 if (likely((pg >> (j & 63)) & 1)) {
4828 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
4834 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
4835 void *vs, uint32_t desc)
4837 intptr_t j, i = simd_oprsz(desc);
4839 float64 neg_imag = float64_set_sign(0, simd_data(desc));
4840 float64 neg_real = float64_chs(neg_imag);
4843 uint64_t pg = g[(i - 1) >> 6];
4845 float64 e0, e1, e2, e3;
4847 /* I holds the real index; J holds the imag index. */
4848 j = i - sizeof(float64);
4849 i -= 2 * sizeof(float64);
4851 e0 = *(float64 *)(vn + H1_2(i));
4852 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
4853 e2 = *(float64 *)(vn + H1_2(j));
4854 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
4856 if (likely((pg >> (i & 63)) & 1)) {
4857 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
4859 if (likely((pg >> (j & 63)) & 1)) {
4860 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
4867 * FP Complex Multiply
4870 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4871 void *vg, void *status, uint32_t desc)
4873 intptr_t j, i = simd_oprsz(desc);
4874 unsigned rot = simd_data(desc);
4875 bool flip = rot & 1;
4876 float16 neg_imag, neg_real;
4879 neg_imag = float16_set_sign(0, (rot & 2) != 0);
4880 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
4883 uint64_t pg = g[(i - 1) >> 6];
4885 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
4887 /* I holds the real index; J holds the imag index. */
4888 j = i - sizeof(float16);
4889 i -= 2 * sizeof(float16);
4891 nr = *(float16 *)(vn + H1_2(i));
4892 ni = *(float16 *)(vn + H1_2(j));
4893 mr = *(float16 *)(vm + H1_2(i));
4894 mi = *(float16 *)(vm + H1_2(j));
4896 e2 = (flip ? ni : nr);
4897 e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;
4901 if (likely((pg >> (i & 63)) & 1)) {
4902 d = *(float16 *)(va + H1_2(i));
4903 d = float16_muladd(e2, e1, d, 0, status);
4904 *(float16 *)(vd + H1_2(i)) = d;
4906 if (likely((pg >> (j & 63)) & 1)) {
4907 d = *(float16 *)(va + H1_2(j));
4908 d = float16_muladd(e4, e3, d, 0, status);
4909 *(float16 *)(vd + H1_2(j)) = d;
4915 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4916 void *vg, void *status, uint32_t desc)
4918 intptr_t j, i = simd_oprsz(desc);
4919 unsigned rot = simd_data(desc);
4920 bool flip = rot & 1;
4921 float32 neg_imag, neg_real;
4924 neg_imag = float32_set_sign(0, (rot & 2) != 0);
4925 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
4928 uint64_t pg = g[(i - 1) >> 6];
4930 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
4932 /* I holds the real index; J holds the imag index. */
4933 j = i - sizeof(float32);
4934 i -= 2 * sizeof(float32);
4936 nr = *(float32 *)(vn + H1_2(i));
4937 ni = *(float32 *)(vn + H1_2(j));
4938 mr = *(float32 *)(vm + H1_2(i));
4939 mi = *(float32 *)(vm + H1_2(j));
4941 e2 = (flip ? ni : nr);
4942 e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;
4946 if (likely((pg >> (i & 63)) & 1)) {
4947 d = *(float32 *)(va + H1_2(i));
4948 d = float32_muladd(e2, e1, d, 0, status);
4949 *(float32 *)(vd + H1_2(i)) = d;
4951 if (likely((pg >> (j & 63)) & 1)) {
4952 d = *(float32 *)(va + H1_2(j));
4953 d = float32_muladd(e4, e3, d, 0, status);
4954 *(float32 *)(vd + H1_2(j)) = d;
4960 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4961 void *vg, void *status, uint32_t desc)
4963 intptr_t j, i = simd_oprsz(desc);
4964 unsigned rot = simd_data(desc);
4965 bool flip = rot & 1;
4966 float64 neg_imag, neg_real;
4969 neg_imag = float64_set_sign(0, (rot & 2) != 0);
4970 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
4973 uint64_t pg = g[(i - 1) >> 6];
4975 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
4977 /* I holds the real index; J holds the imag index. */
4978 j = i - sizeof(float64);
4979 i -= 2 * sizeof(float64);
4981 nr = *(float64 *)(vn + H1_2(i));
4982 ni = *(float64 *)(vn + H1_2(j));
4983 mr = *(float64 *)(vm + H1_2(i));
4984 mi = *(float64 *)(vm + H1_2(j));
4986 e2 = (flip ? ni : nr);
4987 e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;
4991 if (likely((pg >> (i & 63)) & 1)) {
4992 d = *(float64 *)(va + H1_2(i));
4993 d = float64_muladd(e2, e1, d, 0, status);
4994 *(float64 *)(vd + H1_2(i)) = d;
4996 if (likely((pg >> (j & 63)) & 1)) {
4997 d = *(float64 *)(va + H1_2(j));
4998 d = float64_muladd(e4, e3, d, 0, status);
4999 *(float64 *)(vd + H1_2(j)) = d;
5006 * Load contiguous data, protected by a governing predicate.
5010 * Load one element into @vd + @reg_off from @host.
5011 * The controlling predicate is known to be true.
5013 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
5016 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
5017 * The controlling predicate is known to be true.
5019 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5020 target_ulong vaddr, uintptr_t retaddr);
5023 * Generate the above primitives.
5026 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5027 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5029 TYPEM val = HOST(host); \
5030 *(TYPEE *)(vd + H(reg_off)) = val; \
5033 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5034 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5035 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
5037 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5038 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5039 target_ulong addr, uintptr_t ra) \
5041 *(TYPEE *)(vd + H(reg_off)) = \
5042 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
5045 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5046 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5047 target_ulong addr, uintptr_t ra) \
5049 TLB(env, useronly_clean_ptr(addr), \
5050 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
5053 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
5054 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
5055 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
5057 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
5058 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
5059 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
5060 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
5061 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
5062 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
5063 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
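/*
 * Naming of the primitives (a rough guide): "ld1" plus the memory access
 * size (b/h/s/d), then the vector element size, with a trailing u/s for
 * the zero/sign extension used when the two differ; e.g. ld1bhs loads a
 * byte and sign-extends it into a 16-bit element.
 */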
5065 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
5066 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
5067 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
5069 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
5070 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
5071 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
5072 DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
5074 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
5075 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
5076 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
5077 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
5078 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
5080 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
5081 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
5082 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
5083 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
5084 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
5086 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
5087 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
5088 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
5089 DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
5090 DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
5092 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
5093 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
5094 DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
5096 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
5097 DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
5098 DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
5100 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
5101 DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
5103 DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
5104 DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
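/*
 * For illustration, DO_LD_HOST(ld1bb, H1, uint8_t, uint8_t, ldub_p)
 * above expands to approximately:
 *
 *     static void sve_ld1bb_host(void *vd, intptr_t reg_off, void *host)
 *     {
 *         uint8_t val = ldub_p(host);
 *         *(uint8_t *)(vd + H1(reg_off)) = val;
 *     }
 */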
5115 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5116 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5117 * element >= @reg_off, or @reg_max if there were no active elements at all.
5119 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5120 intptr_t reg_max, int esz)
5122 uint64_t pg_mask = pred_esz_masks[esz];
5123 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5125 /* In normal usage, the first element is active. */
5126 if (likely(pg & 1)) {
5134 if (unlikely(reg_off >= reg_max)) {
5135 /* The entire predicate was false. */
5138 pg = vg[reg_off >> 6] & pg_mask;
5141 reg_off += ctz64(pg);
5143 /* We should never see an out of range predicate bit set. */
5144 tcg_debug_assert(reg_off < reg_max);
5149 * Resolve the guest virtual address to info->host and info->flags.
5150 * If @nofault, return false if the page is invalid, otherwise
5151 * exit via page fault exception.
5160 static bool sve_probe_page(SVEHostPage *info, bool nofault,
5161 CPUARMState *env, target_ulong addr,
5162 int mem_off, MMUAccessType access_type,
5163 int mmu_idx, uintptr_t retaddr)
5170 * User-only currently always issues with TBI. See the comment
5171 * above useronly_clean_ptr. Usually we clean this top byte away
 * during translation, but we can't do that for e.g. vector + imm
 * addressing modes.
5175 * We currently always enable TBI for user-only, and do not provide
5176 * a way to turn it off. So clean the pointer unconditionally here,
5177 * rather than look it up here, or pass it down from above.
5179 addr = useronly_clean_ptr(addr);
5181 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5182 &info->host, retaddr);
5183 info->flags = flags;
5185 if (flags & TLB_INVALID_MASK) {
5190 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5191 info->host -= mem_off;
5193 #ifdef CONFIG_USER_ONLY
5194 memset(&info->attrs, 0, sizeof(info->attrs));
5197 * Find the iotlbentry for addr and return the transaction attributes.
5198 * This *must* be present in the TLB because we just found the mapping.
5201 uintptr_t index = tlb_index(env, mmu_idx, addr);
5203 # ifdef CONFIG_DEBUG_TCG
5204 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5205 target_ulong comparator = (access_type == MMU_DATA_LOAD
                               ? entry->addr_read
                               : tlb_addr_write(entry));
5208 g_assert(tlb_hit(comparator, addr));
5211 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5212 info->attrs = iotlbentry->attrs;
5221 * Analyse contiguous data, protected by a governing predicate.
5232 * First and last element wholly contained within the two pages.
5233 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5234 * reg_off_last[0] may be < 0 if the first element crosses pages.
5235 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5236 * are set >= 0 only if there are complete elements on a second page.
5238 * The reg_off_* offsets are relative to the internal vector register.
5239 * The mem_off_first offset is relative to the memory address; the
5240 * two offsets are different when a load operation extends, a store
5241 * operation truncates, or for multi-register operations.
5243 int16_t mem_off_first[2];
5244 int16_t reg_off_first[2];
5245 int16_t reg_off_last[2];
5248 * One element that is misaligned and spans both pages,
5249 * or -1 if there is no such active element.
5251 int16_t mem_off_split;
5252 int16_t reg_off_split;
5255 * The byte offset at which the entire operation crosses a page boundary.
5256 * Set >= 0 if and only if the entire operation spans two pages.
5260 /* TLB data for the two pages. */
5261 SVEHostPage page[2];
5265 * Find first active element on each page, and a loose bound for the
5266 * final element on each page. Identify any single element that spans
5267 * the page boundary. Return true if there are any active elements.
5269 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
5270 uint64_t *vg, intptr_t reg_max,
5273 const int esize = 1 << esz;
5274 const uint64_t pg_mask = pred_esz_masks[esz];
5275 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5276 intptr_t mem_off_last, mem_off_split;
5277 intptr_t page_split, elt_split;
5280 /* Set all of the element indices to -1, and the TLB data to 0. */
5281 memset(info, -1, offsetof(SVEContLdSt, page));
5282 memset(info->page, 0, sizeof(info->page));
5284 /* Gross scan over the entire predicate to find bounds. */
5287 uint64_t pg = vg[i] & pg_mask;
5289 reg_off_last = i * 64 + 63 - clz64(pg);
5290 if (reg_off_first < 0) {
5291 reg_off_first = i * 64 + ctz64(pg);
5294 } while (++i * 64 < reg_max);
5296 if (unlikely(reg_off_first < 0)) {
5297 /* No active elements, no pages touched. */
5300 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5302 info->reg_off_first[0] = reg_off_first;
5303 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5304 mem_off_last = (reg_off_last >> esz) * msize;
5306 page_split = -(addr | TARGET_PAGE_MASK);
5307 if (likely(mem_off_last + msize <= page_split)) {
5308 /* The entire operation fits within a single page. */
5309 info->reg_off_last[0] = reg_off_last;
5313 info->page_split = page_split;
5314 elt_split = page_split / msize;
5315 reg_off_split = elt_split << esz;
5316 mem_off_split = elt_split * msize;
5319 * This is the last full element on the first page, but it is not
5320 * necessarily active. If there is no full element, i.e. the first
5321 * active element is the one that's split, this value remains -1.
5322 * It is useful as iteration bounds.
5324 if (elt_split != 0) {
5325 info->reg_off_last[0] = reg_off_split - esize;
5328 /* Determine if an unaligned element spans the pages. */
5329 if (page_split % msize != 0) {
5330 /* It is helpful to know if the split element is active. */
5331 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5332 info->reg_off_split = reg_off_split;
5333 info->mem_off_split = mem_off_split;
5335 if (reg_off_split == reg_off_last) {
5336 /* The page crossing element is last. */
5340 reg_off_split += esize;
5341 mem_off_split += msize;
5345 * We do want the first active element on the second page, because
5346 * this may affect the address reported in an exception.
5348 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5349 tcg_debug_assert(reg_off_split <= reg_off_last);
5350 info->reg_off_first[1] = reg_off_split;
5351 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5352 info->reg_off_last[1] = reg_off_last;
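/*
 * Worked example (illustrative): a 16-byte operation with 4-byte
 * elements, all active, starting 6 bytes before a page boundary:
 *   page_split = 6; elt_split = 1, so reg_off_split = mem_off_split = 4;
 *   reg_off_last[0] = 0 (only element 0 lies wholly on the first page);
 *   the element at offset 4 spans the boundary (6 % 4 != 0);
 *   reg_off_first[1] = 8, the first element wholly on the second page.
 */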
5357 * Resolve the guest virtual addresses to info->page[].
5358 * Control the generation of page faults with @fault. Return false if
5359 * there is no work to do, which can only happen with @fault == FAULT_NO.
5361 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5362 CPUARMState *env, target_ulong addr,
5363 MMUAccessType access_type, uintptr_t retaddr)
5365 int mmu_idx = cpu_mmu_index(env, false);
5366 int mem_off = info->mem_off_first[0];
5367 bool nofault = fault == FAULT_NO;
5368 bool have_work = true;
5370 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5371 access_type, mmu_idx, retaddr)) {
5372 /* No work to be done. */
5376 if (likely(info->page_split < 0)) {
5377 /* The entire operation was on the one page. */
5382 * If the second page is invalid, then we want the fault address to be
5383 * the first byte on that page which is accessed.
5385 if (info->mem_off_split >= 0) {
5387 * There is an element split across the pages. The fault address
5388 * should be the first byte of the second page.
5390 mem_off = info->page_split;
5392 * If the split element is also the first active element
5393 * of the vector, then: For first-fault we should continue
5394 * to generate faults for the second page. For no-fault,
5395 * we have work only if the second page is valid.
5397 if (info->mem_off_first[0] < info->mem_off_split) {
5398 nofault = FAULT_FIRST;
5403 * There is no element split across the pages. The fault address
5404 * should be the first active element on the second page.
5406 mem_off = info->mem_off_first[1];
5408 * There must have been one active element on the first page,
5409 * so we're out of first-fault territory.
5411 nofault = fault != FAULT_ALL;
5414 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5415 access_type, mmu_idx, retaddr);
5419 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5420 uint64_t *vg, target_ulong addr,
5421 int esize, int msize, int wp_access,
5424 #ifndef CONFIG_USER_ONLY
5425 intptr_t mem_off, reg_off, reg_last;
5426 int flags0 = info->page[0].flags;
5427 int flags1 = info->page[1].flags;
5429 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5433 /* Indicate that watchpoints are handled. */
5434 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5435 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5437 if (flags0 & TLB_WATCHPOINT) {
5438 mem_off = info->mem_off_first[0];
5439 reg_off = info->reg_off_first[0];
5440 reg_last = info->reg_off_last[0];
5442 while (reg_off <= reg_last) {
5443 uint64_t pg = vg[reg_off >> 6];
5445 if ((pg >> (reg_off & 63)) & 1) {
5446 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5447 msize, info->page[0].attrs,
5448 wp_access, retaddr);
5452 } while (reg_off <= reg_last && (reg_off & 63));
5456 mem_off = info->mem_off_split;
5458 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5459 info->page[0].attrs, wp_access, retaddr);
5462 mem_off = info->mem_off_first[1];
5463 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5464 reg_off = info->reg_off_first[1];
5465 reg_last = info->reg_off_last[1];
5468 uint64_t pg = vg[reg_off >> 6];
5470 if ((pg >> (reg_off & 63)) & 1) {
5471 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5472 msize, info->page[1].attrs,
5473 wp_access, retaddr);
5477 } while (reg_off & 63);
5478 } while (reg_off <= reg_last);
5483 static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5484 uint64_t *vg, target_ulong addr, int esize,
5485 int msize, uint32_t mtedesc, uintptr_t ra)
5487 intptr_t mem_off, reg_off, reg_last;
5489 /* Process the page only if MemAttr == Tagged. */
5490 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5491 mem_off = info->mem_off_first[0];
5492 reg_off = info->reg_off_first[0];
5493 reg_last = info->reg_off_split;
5495 reg_last = info->reg_off_last[0];
5499 uint64_t pg = vg[reg_off >> 6];
5501 if ((pg >> (reg_off & 63)) & 1) {
5502 mte_check(env, mtedesc, addr, ra);
5506 } while (reg_off <= reg_last && (reg_off & 63));
5507 } while (reg_off <= reg_last);
5510 mem_off = info->mem_off_first[1];
5511 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5512 reg_off = info->reg_off_first[1];
5513 reg_last = info->reg_off_last[1];
5516 uint64_t pg = vg[reg_off >> 6];
5518 if ((pg >> (reg_off & 63)) & 1) {
5519 mte_check(env, mtedesc, addr, ra);
5523 } while (reg_off & 63);
5524 } while (reg_off <= reg_last);
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5531 static inline QEMU_ALWAYS_INLINE
5532 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5533 uint32_t desc, const uintptr_t retaddr,
5534 const int esz, const int msz, const int N, uint32_t mtedesc,
5535 sve_ldst1_host_fn *host_fn,
5536 sve_ldst1_tlb_fn *tlb_fn)
5538 const unsigned rd = simd_data(desc);
5539 const intptr_t reg_max = simd_oprsz(desc);
5540 intptr_t reg_off, reg_last, mem_off;
5545 /* Find the active elements. */
5546 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5547 /* The entire predicate was false; no load occurs. */
5548 for (i = 0; i < N; ++i) {
5549 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5554 /* Probe the page(s). Exit with exception for any invalid page. */
5555 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5557 /* Handle watchpoints for all active elements. */
5558 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5559 BP_MEM_READ, retaddr);
5562 * Handle mte checks for all active elements.
5563 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5566 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5570 flags = info.page[0].flags | info.page[1].flags;
5571 if (unlikely(flags != 0)) {
5572 #ifdef CONFIG_USER_ONLY
5573 g_assert_not_reached();
5576 * At least one page includes MMIO.
5577 * Any bus operation can fail with cpu_transaction_failed,
5578 * which for ARM will raise SyncExternal. Perform the load
5579 * into scratch memory to preserve register state until the end.
5581 ARMVectorReg scratch[4] = { };
5583 mem_off = info.mem_off_first[0];
5584 reg_off = info.reg_off_first[0];
5585 reg_last = info.reg_off_last[1];
5587 reg_last = info.reg_off_split;
5589 reg_last = info.reg_off_last[0];
5594 uint64_t pg = vg[reg_off >> 6];
5596 if ((pg >> (reg_off & 63)) & 1) {
5597 for (i = 0; i < N; ++i) {
5598 tlb_fn(env, &scratch[i], reg_off,
5599 addr + mem_off + (i << msz), retaddr);
5602 reg_off += 1 << esz;
5603 mem_off += N << msz;
5604 } while (reg_off & 63);
5605 } while (reg_off <= reg_last);
5607 for (i = 0; i < N; ++i) {
5608 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5614 /* The entire operation is in RAM, on valid pages. */
5616 for (i = 0; i < N; ++i) {
5617 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5620 mem_off = info.mem_off_first[0];
5621 reg_off = info.reg_off_first[0];
5622 reg_last = info.reg_off_last[0];
5623 host = info.page[0].host;
5625 while (reg_off <= reg_last) {
5626 uint64_t pg = vg[reg_off >> 6];
5628 if ((pg >> (reg_off & 63)) & 1) {
5629 for (i = 0; i < N; ++i) {
5630 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5631 host + mem_off + (i << msz));
5634 reg_off += 1 << esz;
5635 mem_off += N << msz;
5636 } while (reg_off <= reg_last && (reg_off & 63));
5640 * Use the slow path to manage the cross-page misalignment.
5641 * But we know this is RAM and cannot trap.
5643 mem_off = info.mem_off_split;
5644 if (unlikely(mem_off >= 0)) {
5645 reg_off = info.reg_off_split;
5646 for (i = 0; i < N; ++i) {
5647 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5648 addr + mem_off + (i << msz), retaddr);
5652 mem_off = info.mem_off_first[1];
5653 if (unlikely(mem_off >= 0)) {
5654 reg_off = info.reg_off_first[1];
5655 reg_last = info.reg_off_last[1];
5656 host = info.page[1].host;
5659 uint64_t pg = vg[reg_off >> 6];
5661 if ((pg >> (reg_off & 63)) & 1) {
5662 for (i = 0; i < N; ++i) {
5663 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5664 host + mem_off + (i << msz));
5667 reg_off += 1 << esz;
5668 mem_off += N << msz;
5669 } while (reg_off & 63);
5670 } while (reg_off <= reg_last);
5674 static inline QEMU_ALWAYS_INLINE
5675 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5676 uint32_t desc, const uintptr_t ra,
5677 const int esz, const int msz, const int N,
5678 sve_ldst1_host_fn *host_fn,
5679 sve_ldst1_tlb_fn *tlb_fn)
5681 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5682 int bit55 = extract64(addr, 55, 1);
5684 /* Remove mtedesc from the normal sve descriptor. */
5685 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5687 /* Perform gross MTE suppression early. */
5688 if (!tbi_check(desc, bit55) ||
5689 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5693 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5696 #define DO_LD1_1(NAME, ESZ) \
5697 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5698 target_ulong addr, uint32_t desc) \
5700 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5701 sve_##NAME##_host, sve_##NAME##_tlb); \
5703 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5704 target_ulong addr, uint32_t desc) \
5706 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5707 sve_##NAME##_host, sve_##NAME##_tlb); \
5710 #define DO_LD1_2(NAME, ESZ, MSZ) \
5711 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5712 target_ulong addr, uint32_t desc) \
5714 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5715 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5717 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5718 target_ulong addr, uint32_t desc) \
5720 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
5721 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5723 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5724 target_ulong addr, uint32_t desc) \
5726 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5727 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5729 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5730 target_ulong addr, uint32_t desc) \
5732 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5733 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5736 DO_LD1_1(ld1bb, MO_8)
5737 DO_LD1_1(ld1bhu, MO_16)
5738 DO_LD1_1(ld1bhs, MO_16)
5739 DO_LD1_1(ld1bsu, MO_32)
5740 DO_LD1_1(ld1bss, MO_32)
5741 DO_LD1_1(ld1bdu, MO_64)
5742 DO_LD1_1(ld1bds, MO_64)
5744 DO_LD1_2(ld1hh, MO_16, MO_16)
5745 DO_LD1_2(ld1hsu, MO_32, MO_16)
5746 DO_LD1_2(ld1hss, MO_32, MO_16)
5747 DO_LD1_2(ld1hdu, MO_64, MO_16)
5748 DO_LD1_2(ld1hds, MO_64, MO_16)
5750 DO_LD1_2(ld1ss, MO_32, MO_32)
5751 DO_LD1_2(ld1sdu, MO_64, MO_32)
5752 DO_LD1_2(ld1sds, MO_64, MO_32)
5754 DO_LD1_2(ld1dd, MO_64, MO_64)
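/*
 * In the helper names above, the first size letter is the memory access
 * size and the second the vector element size (e.g. ld1hdu: 16-bit loads
 * zero-extended into 64-bit elements); _le/_be select the memory
 * endianness, and _mte the variants that carry an MTE descriptor.
 */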
5759 #define DO_LDN_1(N) \
5760 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5761 target_ulong addr, uint32_t desc) \
5763 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
5764 sve_ld1bb_host, sve_ld1bb_tlb); \
5766 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5767 target_ulong addr, uint32_t desc) \
5769 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5770 sve_ld1bb_host, sve_ld1bb_tlb); \
5773 #define DO_LDN_2(N, SUFF, ESZ) \
5774 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5775 target_ulong addr, uint32_t desc) \
5777 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5778 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5780 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5781 target_ulong addr, uint32_t desc) \
5783 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
5784 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5786 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5787 target_ulong addr, uint32_t desc) \
5789 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5790 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5792 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5793 target_ulong addr, uint32_t desc) \
5795 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5796 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5803 DO_LDN_2(2, hh, MO_16)
5804 DO_LDN_2(3, hh, MO_16)
5805 DO_LDN_2(4, hh, MO_16)
5807 DO_LDN_2(2, ss, MO_32)
5808 DO_LDN_2(3, ss, MO_32)
5809 DO_LDN_2(4, ss, MO_32)
5811 DO_LDN_2(2, dd, MO_64)
5812 DO_LDN_2(3, dd, MO_64)
5813 DO_LDN_2(4, dd, MO_64)
5819 * Load contiguous data, first-fault and no-fault.
5821 * For user-only, one could argue that we should hold the mmap_lock during
5822 * the operation so that there is no race between page_check_range and the
5823 * load operation. However, unmapping pages out from under a running thread
5824 * is extraordinarily unlikely. This theoretical race condition also affects
5825 * linux-user/ in its get_user/put_user macros.
5827 * TODO: Construct some helpers, written in assembly, that interact with
5828 * handle_cpu_signal to produce memory ops which can properly report errors without racing.
5832 /* Fault on byte I. All bits in FFR from I are cleared. The vector
5833 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5834 * option, which leaves subsequent data unchanged.
5836 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5838 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5841 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5842 i = ROUND_UP(i, 64);
5844 for (; i < oprsz; i += 64) {
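/*
 * Illustrative sketch only; example_clear_ffr_from() is hypothetical and
 * not used above.  It applies the same rule as record_fault() to a bare
 * array of predicate words, making the MAKE_64BIT_MASK/ROUND_UP arithmetic
 * concrete: a fault at byte I keeps bits [0, I % 64) of word I / 64 and
 * zeroes every later word, up to the operation size in bytes.
 */
static inline void example_clear_ffr_from(uint64_t *ffr, uintptr_t i,
                                          uintptr_t oprsz)
{
    ffr[i / 64] &= ((i & 63) ? ((uint64_t)1 << (i & 63)) - 1 : 0);
    for (i = (i + 63) & ~(uintptr_t)63; i < oprsz; i += 64) {
        ffr[i / 64] = 0;
    }
}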
5850 * Common helper for all contiguous no-fault and first-fault loads.
5852 static inline QEMU_ALWAYS_INLINE
5853 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5854 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5855 const int esz, const int msz, const SVEContFault fault,
5856 sve_ldst1_host_fn *host_fn,
5857 sve_ldst1_tlb_fn *tlb_fn)
5859 const unsigned rd = simd_data(desc);
5860 void *vd = &env->vfp.zregs[rd];
5861 const intptr_t reg_max = simd_oprsz(desc);
5862 intptr_t reg_off, mem_off, reg_last;
5867 /* Find the active elements. */
5868 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5869 /* The entire predicate was false; no load occurs. */
5870 memset(vd, 0, reg_max);
5873 reg_off = info.reg_off_first[0];
5875 /* Probe the page(s). */
5876 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5877 /* Fault on first element. */
5878 tcg_debug_assert(fault == FAULT_NO);
5879 memset(vd, 0, reg_max);
5883 mem_off = info.mem_off_first[0];
5884 flags = info.page[0].flags;
5887 * Disable MTE checking if the Tagged bit is not set. Since TBI must
5888 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
5890 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
5894 if (fault == FAULT_FIRST) {
5895 /* Trapping mte check for the first-fault element. */
5897 mte_check(env, mtedesc, addr + mem_off, retaddr);
5901 * Special handling of the first active element,
5902 * if it crosses a page boundary or is MMIO.
5904 bool is_split = mem_off == info.mem_off_split;
5905 if (unlikely(flags != 0) || unlikely(is_split)) {
5907 * Use the slow path for cross-page handling.
5908 * Might trap for MMIO or watchpoints.
5910 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5912 /* After any fault, zero the other elements. */
5913 swap_memzero(vd, reg_off);
5914 reg_off += 1 << esz;
5915 mem_off += 1 << msz;
5916 swap_memzero(vd + reg_off, reg_max - reg_off);
5922 memset(vd, 0, reg_max);
5925 memset(vd, 0, reg_max);
5926 if (unlikely(mem_off == info.mem_off_split)) {
5927 /* The first active element crosses a page boundary. */
5928 flags |= info.page[1].flags;
5929 if (unlikely(flags & TLB_MMIO)) {
5930 /* Some page is MMIO, see below. */
5933 if (unlikely(flags & TLB_WATCHPOINT) &&
5934 (cpu_watchpoint_address_matches
5935 (env_cpu(env), addr + mem_off, 1 << msz)
5937 /* Watchpoint hit, see below. */
5940 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
5944 * Use the slow path for cross-page handling.
5945 * This is RAM, without a watchpoint, and will not trap.
5947 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5953 * From this point on, all memory operations are MemSingleNF.
5955 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
5956 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
5958 * Unfortunately we do not have access to the memory attributes from the
5959 * PTE to tell Device memory from Normal memory. So we make a mostly
5960 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
5961 * This gives the right answer for the common cases of "Normal memory,
5962 * backed by host RAM" and "Device memory, backed by MMIO".
5963 * The architecture allows us to suppress an NF load and return
5964 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
5965 * case of "Normal memory, backed by MMIO" is permitted. The case we
5966 * get wrong is "Device memory, backed by host RAM", for which we
5967 * should return (UNKNOWN, FAULT) but do not.
5969 * Similarly, CPU_BP breakpoints would raise exceptions, and so
5970 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
5971 * architectural breakpoints the same.
5973 if (unlikely(flags & TLB_MMIO)) {
5977 reg_last = info.reg_off_last[0];
5978 host = info.page[0].host;
5981 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
5983 if ((pg >> (reg_off & 63)) & 1) {
5984 if (unlikely(flags & TLB_WATCHPOINT) &&
5985 (cpu_watchpoint_address_matches
5986 (env_cpu(env), addr + mem_off, 1 << msz)
5990 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
5993 host_fn(vd, reg_off, host + mem_off);
5995 reg_off += 1 << esz;
5996 mem_off += 1 << msz;
5997 } while (reg_off <= reg_last && (reg_off & 63));
5998 } while (reg_off <= reg_last);
6001 * MemSingleNF is allowed to fail for any reason. We have special
6002 * code above to handle the first element crossing a page boundary.
6003 * As an implementation choice, decline to handle a cross-page element
6004 * in any other position.
6006 reg_off = info.reg_off_split;
6012 reg_off = info.reg_off_first[1];
6013 if (likely(reg_off < 0)) {
6014 /* No active elements on the second page. All done. */
6019 * MemSingleNF is allowed to fail for any reason. As an implementation
6020 * choice, decline to handle elements on the second page. This should
6021 * be low frequency as the guest walks through memory -- the next
6022 * iteration of the guest's loop should be aligned on the page boundary,
6023 * and then all following iterations will stay aligned.
6027 record_fault(env, reg_off, reg_max);
6030 static inline QEMU_ALWAYS_INLINE
6031 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6032 uint32_t desc, const uintptr_t retaddr,
6033 const int esz, const int msz, const SVEContFault fault,
6034 sve_ldst1_host_fn *host_fn,
6035 sve_ldst1_tlb_fn *tlb_fn)
6037 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6038 int bit55 = extract64(addr, 55, 1);
6040 /* Remove mtedesc from the normal sve descriptor. */
6041 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6043 /* Perform gross MTE suppression early. */
6044 if (!tbi_check(desc, bit55) ||
6045 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6049 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6050 esz, msz, fault, host_fn, tlb_fn);
6053 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6054 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6055 target_ulong addr, uint32_t desc) \
6057 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6058 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6060 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6061 target_ulong addr, uint32_t desc) \
6063 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6064 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6066 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6067 target_ulong addr, uint32_t desc) \
6069 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6070 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6072 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6073 target_ulong addr, uint32_t desc) \
6075 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6076 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6079 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6080 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6081 target_ulong addr, uint32_t desc) \
6083 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6084 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6086 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6087 target_ulong addr, uint32_t desc) \
6089 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6090 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6092 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6093 target_ulong addr, uint32_t desc) \
6095 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6096 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6098 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6099 target_ulong addr, uint32_t desc) \
6101 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6102 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6104 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6105 target_ulong addr, uint32_t desc) \
6107 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6108 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6110 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6111 target_ulong addr, uint32_t desc) \
6113 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6114 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6116 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6117 target_ulong addr, uint32_t desc) \
6119 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6120 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6122 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6123 target_ulong addr, uint32_t desc) \
6125 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6126 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6129 DO_LDFF1_LDNF1_1(bb, MO_8)
6130 DO_LDFF1_LDNF1_1(bhu, MO_16)
6131 DO_LDFF1_LDNF1_1(bhs, MO_16)
6132 DO_LDFF1_LDNF1_1(bsu, MO_32)
6133 DO_LDFF1_LDNF1_1(bss, MO_32)
6134 DO_LDFF1_LDNF1_1(bdu, MO_64)
6135 DO_LDFF1_LDNF1_1(bds, MO_64)
6137 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6138 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6139 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6140 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6141 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6143 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6144 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6145 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6147 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6149 #undef DO_LDFF1_LDNF1_1
6150 #undef DO_LDFF1_LDNF1_2
6153 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6156 static inline QEMU_ALWAYS_INLINE
6157 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6158 uint32_t desc, const uintptr_t retaddr,
6159 const int esz, const int msz, const int N, uint32_t mtedesc,
6160 sve_ldst1_host_fn *host_fn,
6161 sve_ldst1_tlb_fn *tlb_fn)
6163 const unsigned rd = simd_data(desc);
6164 const intptr_t reg_max = simd_oprsz(desc);
6165 intptr_t reg_off, reg_last, mem_off;
6170 /* Find the active elements. */
6171 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6172 /* The entire predicate was false; no store occurs. */
6176 /* Probe the page(s). Exit with exception for any invalid page. */
6177 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6179 /* Handle watchpoints for all active elements. */
6180 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6181 BP_MEM_WRITE, retaddr);
6184 * Handle mte checks for all active elements.
6185 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6188 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6192 flags = info.page[0].flags | info.page[1].flags;
6193 if (unlikely(flags != 0)) {
6194 #ifdef CONFIG_USER_ONLY
6195 g_assert_not_reached();
6198 * At least one page includes MMIO.
6199 * Any bus operation can fail with cpu_transaction_failed,
6200 * which for ARM will raise SyncExternal. We cannot avoid
6201 * this fault and will leave with the store incomplete.
6203 mem_off = info.mem_off_first[0];
6204 reg_off = info.reg_off_first[0];
6205 reg_last = info.reg_off_last[1];
6207 reg_last = info.reg_off_split;
6209 reg_last = info.reg_off_last[0];
6214 uint64_t pg = vg[reg_off >> 6];
6216 if ((pg >> (reg_off & 63)) & 1) {
6217 for (i = 0; i < N; ++i) {
6218 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6219 addr + mem_off + (i << msz), retaddr);
6222 reg_off += 1 << esz;
6223 mem_off += N << msz;
6224 } while (reg_off & 63);
6225 } while (reg_off <= reg_last);
6230 mem_off = info.mem_off_first[0];
6231 reg_off = info.reg_off_first[0];
6232 reg_last = info.reg_off_last[0];
6233 host = info.page[0].host;
6235 while (reg_off <= reg_last) {
6236 uint64_t pg = vg[reg_off >> 6];
6238 if ((pg >> (reg_off & 63)) & 1) {
6239 for (i = 0; i < N; ++i) {
6240 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6241 host + mem_off + (i << msz));
6244 reg_off += 1 << esz;
6245 mem_off += N << msz;
6246 } while (reg_off <= reg_last && (reg_off & 63));
6250 * Use the slow path to manage the cross-page misalignment.
6251 * But we know this is RAM and cannot trap.
6253 mem_off = info.mem_off_split;
6254 if (unlikely(mem_off >= 0)) {
6255 reg_off = info.reg_off_split;
6256 for (i = 0; i < N; ++i) {
6257 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6258 addr + mem_off + (i << msz), retaddr);
6262 mem_off = info.mem_off_first[1];
6263 if (unlikely(mem_off >= 0)) {
6264 reg_off = info.reg_off_first[1];
6265 reg_last = info.reg_off_last[1];
6266 host = info.page[1].host;
6269 uint64_t pg = vg[reg_off >> 6];
6271 if ((pg >> (reg_off & 63)) & 1) {
6272 for (i = 0; i < N; ++i) {
6273 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6274 host + mem_off + (i << msz));
6277 reg_off += 1 << esz;
6278 mem_off += N << msz;
6279 } while (reg_off & 63);
6280 } while (reg_off <= reg_last);
6284 static inline QEMU_ALWAYS_INLINE
6285 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6286 uint32_t desc, const uintptr_t ra,
6287 const int esz, const int msz, const int N,
6288 sve_ldst1_host_fn *host_fn,
6289 sve_ldst1_tlb_fn *tlb_fn)
6291 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6292 int bit55 = extract64(addr, 55, 1);
6294 /* Remove mtedesc from the normal sve descriptor. */
6295 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6297 /* Perform gross MTE suppression early. */
6298 if (!tbi_check(desc, bit55) ||
6299 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6303 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6306 #define DO_STN_1(N, NAME, ESZ) \
6307 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6308 target_ulong addr, uint32_t desc) \
6310 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6311 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6313 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6314 target_ulong addr, uint32_t desc) \
6316 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6317 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6320 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6321 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6322 target_ulong addr, uint32_t desc) \
6324 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6325 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6327 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6328 target_ulong addr, uint32_t desc) \
6330 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6331 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6333 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6334 target_ulong addr, uint32_t desc) \
6336 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6337 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6339 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6340 target_ulong addr, uint32_t desc) \
6342 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6343 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6346 DO_STN_1(1, bb, MO_8)
6347 DO_STN_1(1, bh, MO_16)
6348 DO_STN_1(1, bs, MO_32)
6349 DO_STN_1(1, bd, MO_64)
6350 DO_STN_1(2, bb, MO_8)
6351 DO_STN_1(3, bb, MO_8)
6352 DO_STN_1(4, bb, MO_8)
6354 DO_STN_2(1, hh, MO_16, MO_16)
6355 DO_STN_2(1, hs, MO_32, MO_16)
6356 DO_STN_2(1, hd, MO_64, MO_16)
6357 DO_STN_2(2, hh, MO_16, MO_16)
6358 DO_STN_2(3, hh, MO_16, MO_16)
6359 DO_STN_2(4, hh, MO_16, MO_16)
6361 DO_STN_2(1, ss, MO_32, MO_32)
6362 DO_STN_2(1, sd, MO_64, MO_32)
6363 DO_STN_2(2, ss, MO_32, MO_32)
6364 DO_STN_2(3, ss, MO_32, MO_32)
6365 DO_STN_2(4, ss, MO_32, MO_32)
6367 DO_STN_2(1, dd, MO_64, MO_64)
6368 DO_STN_2(2, dd, MO_64, MO_64)
6369 DO_STN_2(3, dd, MO_64, MO_64)
6370 DO_STN_2(4, dd, MO_64, MO_64)
6376 * Loads with a vector index.
6380 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6382 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6384 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6386 return *(uint32_t *)(reg + H1_4(reg_ofs));
6389 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6391 return *(int32_t *)(reg + H1_4(reg_ofs));
6394 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6396 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6399 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6401 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6404 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6406 return *(uint64_t *)(reg + reg_ofs);
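/*
 * Illustrative sketch only; example_gather_address() is hypothetical and
 * not used above.  It shows how the gather helpers below combine one of
 * the offset extractors with the scalar base: each active element accesses
 * base + (extracted_offset << scale), where scale comes from
 * simd_data(desc).
 */
static inline target_ulong example_gather_address(target_ulong base,
                                                  zreg_off_fn *off_fn,
                                                  void *vm, intptr_t reg_off,
                                                  int scale)
{
    return base + (off_fn(vm, reg_off) << scale);
}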
6409 static inline QEMU_ALWAYS_INLINE
6410 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6411 target_ulong base, uint32_t desc, uintptr_t retaddr,
6412 uint32_t mtedesc, int esize, int msize,
6413 zreg_off_fn *off_fn,
6414 sve_ldst1_host_fn *host_fn,
6415 sve_ldst1_tlb_fn *tlb_fn)
6417 const int mmu_idx = cpu_mmu_index(env, false);
6418 const intptr_t reg_max = simd_oprsz(desc);
6419 const int scale = simd_data(desc);
6420 ARMVectorReg scratch;
6422 SVEHostPage info, info2;
6424 memset(&scratch, 0, reg_max);
6427 uint64_t pg = vg[reg_off >> 6];
6429 if (likely(pg & 1)) {
6430 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6431 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6433 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6436 if (likely(in_page >= msize)) {
6437 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6438 cpu_check_watchpoint(env_cpu(env), addr, msize,
6439 info.attrs, BP_MEM_READ, retaddr);
6441 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6442 mte_check(env, mtedesc, addr, retaddr);
6444 host_fn(&scratch, reg_off, info.host);
6446 /* Element crosses the page boundary. */
6447 sve_probe_page(&info2, false, env, addr + in_page, 0,
6448 MMU_DATA_LOAD, mmu_idx, retaddr);
6449 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6450 cpu_check_watchpoint(env_cpu(env), addr,
6452 BP_MEM_READ, retaddr);
6454 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6455 mte_check(env, mtedesc, addr, retaddr);
6457 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6462 } while (reg_off & 63);
6463 } while (reg_off < reg_max);
6465 /* Wait until all exceptions have been raised to write back. */
6466 memcpy(vd, &scratch, reg_max);
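/*
 * Illustrative sketch only; example_gather_all_or_nothing() is hypothetical
 * and not used above.  sve_ld1_z() gets the same all-or-nothing behaviour
 * because faulting element accesses longjmp out of the loop before the
 * final memcpy: results accumulate in a local scratch buffer and are only
 * committed to the destination once every access has succeeded.  The
 * sketch shows the same pattern with an explicit error return instead of
 * a longjmp.
 */
static inline bool example_gather_all_or_nothing(uint64_t *dst,
                                                 const uint64_t *mem,
                                                 const unsigned *idx,
                                                 size_t n)
{
    uint64_t scratch[8];
    size_t i;
    if (n > ARRAY_SIZE(scratch)) {
        return false;
    }
    for (i = 0; i < n; i++) {
        if (idx[i] == (unsigned)-1) {
            return false;        /* "fault": dst is left untouched */
        }
        scratch[i] = mem[idx[i]];
    }
    memcpy(dst, scratch, n * sizeof(uint64_t));  /* commit */
    return true;
}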
6469 static inline QEMU_ALWAYS_INLINE
6470 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6471 target_ulong base, uint32_t desc, uintptr_t retaddr,
6472 int esize, int msize, zreg_off_fn *off_fn,
6473 sve_ldst1_host_fn *host_fn,
6474 sve_ldst1_tlb_fn *tlb_fn)
6476 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6477 /* Remove mtedesc from the normal sve descriptor. */
6478 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6481 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6482 * offset base entirely over the address space hole to change the
6483 * pointer tag, or change the bit55 selector. So we could here
6484 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6486 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6487 esize, msize, off_fn, host_fn, tlb_fn);
6490 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6491 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6492 void *vm, target_ulong base, uint32_t desc) \
6494 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6495 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6497 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6498 void *vm, target_ulong base, uint32_t desc) \
6500 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6501 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6504 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6505 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6506 void *vm, target_ulong base, uint32_t desc) \
6508 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6509 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6511 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6512 void *vm, target_ulong base, uint32_t desc) \
6514 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6515 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6518 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6519 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6520 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6521 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6522 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6524 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6525 DO_LD1_ZPZ_S(bss, zss, MO_8)
6526 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6527 DO_LD1_ZPZ_D(bds, zss, MO_8)
6528 DO_LD1_ZPZ_D(bds, zd, MO_8)
6530 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6531 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6532 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6533 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6534 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6536 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6537 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6538 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6539 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6540 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6542 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6543 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6544 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6545 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6546 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6548 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6549 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6550 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6551 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6552 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6554 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6555 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6556 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6557 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6558 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6560 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6561 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6562 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6563 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6564 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6566 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6567 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6568 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6570 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6571 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6572 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6574 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6575 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6576 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6578 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6579 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6580 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6585 /* First fault loads with a vector index. */
6588 * Common helpers for all gather first-faulting loads.
6591 static inline QEMU_ALWAYS_INLINE
6592 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6593 target_ulong base, uint32_t desc, uintptr_t retaddr,
6594 uint32_t mtedesc, const int esz, const int msz,
6595 zreg_off_fn *off_fn,
6596 sve_ldst1_host_fn *host_fn,
6597 sve_ldst1_tlb_fn *tlb_fn)
6599 const int mmu_idx = cpu_mmu_index(env, false);
6600 const intptr_t reg_max = simd_oprsz(desc);
6601 const int scale = simd_data(desc);
6602 const int esize = 1 << esz;
6603 const int msize = 1 << msz;
6606 target_ulong addr, in_page;
6608 /* Skip to the first true predicate. */
6609 reg_off = find_next_active(vg, 0, reg_max, esz);
6610 if (unlikely(reg_off >= reg_max)) {
6611 /* The entire predicate was false; no load occurs. */
6612 memset(vd, 0, reg_max);
6617 * Probe the first element, allowing faults.
6619 addr = base + (off_fn(vm, reg_off) << scale);
6621 mte_check(env, mtedesc, addr, retaddr);
6623 tlb_fn(env, vd, reg_off, addr, retaddr);
6625 /* After any fault, zero the other elements. */
6626 swap_memzero(vd, reg_off);
6628 swap_memzero(vd + reg_off, reg_max - reg_off);
6631 * Probe the remaining elements, not allowing faults.
6633 while (reg_off < reg_max) {
6634 uint64_t pg = vg[reg_off >> 6];
6636 if (likely((pg >> (reg_off & 63)) & 1)) {
6637 addr = base + (off_fn(vm, reg_off) << scale);
6638 in_page = -(addr | TARGET_PAGE_MASK);
6640 if (unlikely(in_page < msize)) {
6641 /* Stop if the element crosses a page boundary. */
6645 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6647 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6650 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6651 (cpu_watchpoint_address_matches
6652 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6656 arm_tlb_mte_tagged(&info.attrs) &&
6657 !mte_probe(env, mtedesc, addr)) {
6661 host_fn(vd, reg_off, info.host);
6664 } while (reg_off & 63);
6669 record_fault(env, reg_off, reg_max);
6672 static inline QEMU_ALWAYS_INLINE
6673 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6674 target_ulong base, uint32_t desc, uintptr_t retaddr,
6675 const int esz, const int msz,
6676 zreg_off_fn *off_fn,
6677 sve_ldst1_host_fn *host_fn,
6678 sve_ldst1_tlb_fn *tlb_fn)
6680 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6681 /* Remove mtedesc from the normal sve descriptor. */
6682 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6685 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6686 * offset base entirely over the address space hole to change the
6687 * pointer tag, or change the bit55 selector. So we could here
6688 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6690 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6691 esz, msz, off_fn, host_fn, tlb_fn);
6694 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6695 void HELPER(sve_ldff##MEM##_##OFS) \
6696 (CPUARMState *env, void *vd, void *vg, \
6697 void *vm, target_ulong base, uint32_t desc) \
6699 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6700 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6702 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6703 (CPUARMState *env, void *vd, void *vg, \
6704 void *vm, target_ulong base, uint32_t desc) \
6706 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6707 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6710 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6711 void HELPER(sve_ldff##MEM##_##OFS) \
6712 (CPUARMState *env, void *vd, void *vg, \
6713 void *vm, target_ulong base, uint32_t desc) \
6715 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6716 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6718 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6719 (CPUARMState *env, void *vd, void *vg, \
6720 void *vm, target_ulong base, uint32_t desc) \
6722 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6723 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6726 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6727 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6728 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6729 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6730 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6732 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6733 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6734 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6735 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6736 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6738 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6739 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6740 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6741 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6742 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6744 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6745 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6746 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6747 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6748 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6750 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6751 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6752 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6753 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6754 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6756 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6757 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6758 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6759 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6760 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6762 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6763 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6764 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6765 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6766 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6768 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6769 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6770 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6771 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6772 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6774 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6775 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6776 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6778 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6779 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6780 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6782 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6783 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6784 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6786 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6787 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6788 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6790 /* Stores with a vector index. */
6792 static inline QEMU_ALWAYS_INLINE
6793 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6794 target_ulong base, uint32_t desc, uintptr_t retaddr,
6795 uint32_t mtedesc, int esize, int msize,
6796 zreg_off_fn *off_fn,
6797 sve_ldst1_host_fn *host_fn,
6798 sve_ldst1_tlb_fn *tlb_fn)
6800 const int mmu_idx = cpu_mmu_index(env, false);
6801 const intptr_t reg_max = simd_oprsz(desc);
6802 const int scale = simd_data(desc);
6803 void *host[ARM_MAX_VQ * 4];
6804 intptr_t reg_off, i;
6805 SVEHostPage info, info2;
6808 * Probe all of the elements for host addresses and flags.
6812 uint64_t pg = vg[reg_off >> 6];
6814 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6815 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6818 if (likely((pg >> (reg_off & 63)) & 1)) {
6819 if (likely(in_page >= msize)) {
6820 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6822 host[i] = info.host;
6825 * Element crosses the page boundary.
6826 * Probe both pages, but do not record the host address,
6827 * so that we use the slow path.
6829 sve_probe_page(&info, false, env, addr, 0,
6830 MMU_DATA_STORE, mmu_idx, retaddr);
6831 sve_probe_page(&info2, false, env, addr + in_page, 0,
6832 MMU_DATA_STORE, mmu_idx, retaddr);
6833 info.flags |= info2.flags;
6836 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6837 cpu_check_watchpoint(env_cpu(env), addr, msize,
6838 info.attrs, BP_MEM_WRITE, retaddr);
6841 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
6842 mte_check(env, mtedesc, addr, retaddr);
6847 } while (reg_off & 63);
6848 } while (reg_off < reg_max);
6851 * Now that we have recognized all exceptions except SyncExternal
6852 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6854 * Note for the common case of an element in RAM, not crossing a page
6855 * boundary, we have stored the host address in host[]. This doubles
6856 * as a first-level check against the predicate, since only enabled
6857 * elements have non-null host addresses.
6862 if (likely(h != NULL)) {
6863 host_fn(vd, reg_off, h);
6864 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6865 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6866 tlb_fn(env, vd, reg_off, addr, retaddr);
6870 } while (reg_off < reg_max);
6873 static inline QEMU_ALWAYS_INLINE
6874 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6875 target_ulong base, uint32_t desc, uintptr_t retaddr,
6876 int esize, int msize, zreg_off_fn *off_fn,
6877 sve_ldst1_host_fn *host_fn,
6878 sve_ldst1_tlb_fn *tlb_fn)
6880 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6881 /* Remove mtedesc from the normal sve descriptor. */
6882 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6885 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6886 * offset base entirely over the address space hole to change the
6887 * pointer tag, or change the bit55 selector. So we could here
6888 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6890 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6891 esize, msize, off_fn, host_fn, tlb_fn);
6894 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
6895 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6896 void *vm, target_ulong base, uint32_t desc) \
6898 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6899 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6901 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6902 void *vm, target_ulong base, uint32_t desc) \
6904 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6905 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6908 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
6909 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6910 void *vm, target_ulong base, uint32_t desc) \
6912 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6913 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6915 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6916 void *vm, target_ulong base, uint32_t desc) \
6918 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6919 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6922 DO_ST1_ZPZ_S(bs, zsu, MO_8)
6923 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
6924 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
6925 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
6926 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
6928 DO_ST1_ZPZ_S(bs, zss, MO_8)
6929 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
6930 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
6931 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
6932 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
6934 DO_ST1_ZPZ_D(bd, zsu, MO_8)
6935 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
6936 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
6937 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
6938 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
6939 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
6940 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
6942 DO_ST1_ZPZ_D(bd, zss, MO_8)
6943 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
6944 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
6945 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
6946 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
6947 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
6948 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
6950 DO_ST1_ZPZ_D(bd, zd, MO_8)
6951 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
6952 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
6953 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
6954 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
6955 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
6956 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
6961 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6963 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6964 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6966 for (i = 0; i < opr_sz; ++i) {
6967 d[i] = n[i] ^ m[i] ^ k[i];
6971 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6973 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6974 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6976 for (i = 0; i < opr_sz; ++i) {
6977 d[i] = n[i] ^ (m[i] & ~k[i]);
6981 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6983 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6984 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6986 for (i = 0; i < opr_sz; ++i) {
6987 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
6991 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6993 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6994 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6996 for (i = 0; i < opr_sz; ++i) {
6997 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7001 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7003 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7004 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7006 for (i = 0; i < opr_sz; ++i) {
7007 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7012 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7013 * See hasless(v,1) from
7014 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7016 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7018 int bits = 8 << esz;
7019 uint64_t ones = dup_const(esz, 1);
7020 uint64_t signs = ones << (bits - 1);
7021 uint64_t cmp0, cmp1;
7023 cmp1 = dup_const(esz, n);
7026 cmp0 = (cmp0 - ones) & ~cmp0;
7027 cmp1 = (cmp1 - ones) & ~cmp1;
7028 return (cmp0 | cmp1) & signs;
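/*
 * Illustrative sketch only; example_has_zero_byte() is hypothetical and not
 * used above.  It is the byte-sized case of the trick do_match2() relies
 * on: (x - 0x01..01) & ~x & 0x80..80 is nonzero exactly when some byte of
 * x is zero.  Borrows can also set marker bits in bytes above the lowest
 * zero byte, so only the "any zero byte" answer is meaningful, which is
 * all do_match2() needs.  do_match2() applies this to n ^ m0 and n ^ m1,
 * since an element of m equals n exactly when their XOR is zero.
 */
static inline bool example_has_zero_byte(uint64_t x)
{
    return ((x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull) != 0;
}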
7031 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7032 uint32_t desc, int esz, bool nmatch)
7034 uint16_t esz_mask = pred_esz_masks[esz];
7035 intptr_t opr_sz = simd_oprsz(desc);
7036 uint32_t flags = PREDTEST_INIT;
7039 for (i = 0; i < opr_sz; i += 16) {
7040 uint64_t m0 = *(uint64_t *)(vm + i);
7041 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7042 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7045 for (j = 0; j < 16; j += 8) {
7046 uint64_t n = *(uint64_t *)(vn + i + j);
7048 for (k = 0; k < 8; k += 1 << esz) {
7049 if (pg & (1 << (j + k))) {
7050 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7051 out |= (o ^ nmatch) << (j + k);
7055 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7056 flags = iter_predtest_fwd(out, pg, flags);
7061 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7062 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7064 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7067 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7068 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7070 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7071 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7073 #undef DO_PPZZ_MATCH
7075 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7078 ARMVectorReg scratch;
7080 intptr_t opr_sz = simd_oprsz(desc);
7081 uint32_t *d = vd, *n = vn, *m = vm;
7085 n = memcpy(&scratch, n, opr_sz);
7089 } else if (d == m) {
7090 m = memcpy(&scratch, m, opr_sz);
7093 for (i = 0; i < opr_sz; i += 4) {
7097 pred = pg[H1(i >> 3)] >> (i & 7);
7099 uint32_t nn = n[H4(i >> 2)];
7101 for (j = 0; j <= i; j += 4) {
7102 pred = pg[H1(j >> 3)] >> (j & 7);
7103 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7108 d[H4(i >> 2)] = count;
7112 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7115 ARMVectorReg scratch;
7117 intptr_t opr_sz = simd_oprsz(desc);
7118 uint64_t *d = vd, *n = vn, *m = vm;
7122 n = memcpy(&scratch, n, opr_sz);
7126 } else if (d == m) {
7127 m = memcpy(&scratch, m, opr_sz);
7130 for (i = 0; i < opr_sz / 8; ++i) {
7132 if (pg[H1(i)] & 1) {
7134 for (j = 0; j <= i; ++j) {
7135 if ((pg[H1(j)] & 1) && nn == m[j]) {
7145 * Returns the number of bytes in m0 and m1 that match n.
7146 * Unlike do_match2 we don't just need true/false, we need an exact count.
7147 * This requires two extra logical operations.
7149 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7151 const uint64_t mask = dup_const(MO_8, 0x7f);
7152 uint64_t cmp0, cmp1;
7154 cmp1 = dup_const(MO_8, n);
7159 * 1: clear msb of each byte to avoid carry to next byte (& mask)
7160 * 2: carry in to msb if byte != 0 (+ mask)
7161 * 3: set msb if cmp has msb set (| cmp)
7162 * 4: set ~msb to ignore them (| mask)
7163 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7164 * 5: invert, resulting in 0x80 if and only if byte == 0.
7166 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7167 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7170 * Combine the two compares in a way that the bits do
7171 * not overlap, and so preserves the count of set bits.
7172 * If the host has an efficient instruction for ctpop,
7173 * then ctpop(x) + ctpop(y) has the same number of
7174 * operations as ctpop(x | (y >> 1)). If the host does
7175 * not have an efficient ctpop, then we only want to use it once.
7178 return ctpop64(cmp0 | (cmp1 >> 1));
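/*
 * Illustrative sketch only; example_count_zero_bytes() is hypothetical and
 * not used above.  It is the single-word form of the counting scheme: after
 * the five steps each zero byte contributes exactly one 0x80 marker and no
 * carries cross byte boundaries, so one ctpop yields an exact count.
 * do_histseg_cnt() merely shifts the second marker word right by one so the
 * two sets of markers occupy distinct bit positions before the shared ctpop.
 */
static inline unsigned example_count_zero_bytes(uint64_t x)
{
    const uint64_t mask = 0x7f7f7f7f7f7f7f7full;
    return ctpop64(~(((x & mask) + mask) | x | mask));
}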
7181 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7184 intptr_t opr_sz = simd_oprsz(desc);
7186 for (i = 0; i < opr_sz; i += 16) {
7187 uint64_t n0 = *(uint64_t *)(vn + i);
7188 uint64_t m0 = *(uint64_t *)(vm + i);
7189 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7190 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7194 for (j = 0; j < 64; j += 8) {
7195 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7196 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7201 *(uint64_t *)(vd + i) = out0;
7202 *(uint64_t *)(vd + i + 8) = out1;