1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "exec/exec-all.h"
23#include "exec/cpu_ldst.h"
24#include "exec/helper-proto.h"
25#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
27
28
/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup. */
31#ifdef HOST_WORDS_BIGENDIAN
32#define H1(x) ((x) ^ 7)
33#define H1_2(x) ((x) ^ 6)
34#define H1_4(x) ((x) ^ 4)
35#define H2(x) ((x) ^ 3)
36#define H4(x) ((x) ^ 1)
37#else
38#define H1(x) (x)
39#define H1_2(x) (x)
40#define H1_4(x) (x)
41#define H2(x) (x)
42#define H4(x) (x)
43#endif
44
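/* Worked example (illustrative only): on a big-endian host, byte element 0
 * of a vector lives at host byte offset H1(0) == 7 within its 64-bit chunk,
 * and halfword element 0 is a 2-byte access at offset H1_2(0) == 6.  On
 * little-endian hosts all of these macros are the identity.
 */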
45/* Return a value for NZCV as per the ARM PredTest pseudofunction.
46 *
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
50 */
51
52/* For no G bits set, NZCV = C. */
53#define PREDTEST_INIT 1
54
55/* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
57 */
58static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
59{
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
66 }
67
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
70
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
73 }
74 return flags;
75}
76
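/* Worked example (illustrative): for a single word with g = 0x11 (two
 * active elements) and d = 0x10: the first active bit of d is clear, so
 * bit 31 (N) stays 0; d & g is nonzero, so bit 1 is set (Z clear); the
 * last active bit of d is set, so bit 0 (C) is cleared.  Bit 2 remains
 * set only as the internal "first G bit seen" marker.
 */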
77/* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
79 */
80static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
81{
82 if (likely(g)) {
83 /* Compute C from first (i.e last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
88 }
89
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
92
93 /* Compute N from last (i.e first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
95 }
96 return flags;
97}
98
99/* The same for a single word predicate. */
100uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
101{
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
103}
104
105/* The same for a multi-word predicate. */
106uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
107{
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
111
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
115
116 return flags;
117}
119/* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
125 * }
126 * }
127 * printf("0x%016lx,\n", m);
128 * }
129 */
130static inline uint64_t expand_pred_b(uint8_t byte)
131{
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
219 };
220 return word[byte];
221}
222
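/* Example (illustrative): expand_pred_b(0x05) == 0x0000000000ff00ff,
 * i.e. predicate bits 0 and 2 select byte elements 0 and 2 of the
 * 64-bit chunk.
 */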
223/* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
228 * }
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
232 * }
233 * }
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
235 * }
236 */
237static inline uint64_t expand_pred_h(uint8_t byte)
238{
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
248 };
249 return word[byte & 0x55];
250}
251
252/* Similarly for single word elements. */
253static inline uint64_t expand_pred_s(uint8_t byte)
254{
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
259 };
260 return word[byte & 0x11];
261}
262
263/* Swap 16-bit words within a 32-bit word. */
264static inline uint32_t hswap32(uint32_t h)
265{
266 return rol32(h, 16);
267}
268
269/* Swap 16-bit words within a 64-bit word. */
270static inline uint64_t hswap64(uint64_t h)
271{
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
275}
276
277/* Swap 32-bit words within a 64-bit word. */
278static inline uint64_t wswap64(uint64_t h)
279{
280 return rol64(h, 32);
281}
282
283#define LOGICAL_PPPP(NAME, FUNC) \
284void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
285{ \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
291 } \
292}
293
294#define DO_AND(N, M, G) (((N) & (M)) & (G))
295#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297#define DO_ORR(N, M, G) (((N) | (M)) & (G))
298#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
302
303LOGICAL_PPPP(sve_and_pppp, DO_AND)
304LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
311
312#undef DO_AND
313#undef DO_BIC
314#undef DO_EOR
315#undef DO_ORR
316#undef DO_ORN
317#undef DO_NOR
318#undef DO_NAND
319#undef DO_SEL
320#undef LOGICAL_PPPP
322/* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
324 */
325/* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
329 */
330#define DO_ZPZZ(NAME, TYPE, H, OP) \
331void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
332{ \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
341 } \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
344 } \
345}
346
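/* Illustrative note: within each 16-byte chunk the predicate is consumed
 * one uint16_t at a time and shifted right by sizeof(TYPE) per element,
 * so for 4-byte elements only predicate bits 0, 4, 8 and 12 are tested,
 * matching SVE's one-predicate-bit-per-byte encoding.
 */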
347/* Similarly, specialized for 64-bit operands. */
348#define DO_ZPZZ_D(NAME, TYPE, OP) \
349void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
350{ \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
358 } \
359 } \
360}
361
362#define DO_AND(N, M) (N & M)
363#define DO_EOR(N, M) (N ^ M)
364#define DO_ORR(N, M) (N | M)
365#define DO_BIC(N, M) (N & ~M)
366#define DO_ADD(N, M) (N + M)
367#define DO_SUB(N, M) (N - M)
368#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371#define DO_MUL(N, M) (N * M)
372#define DO_DIV(N, M) (M ? N / M : 0)
373
374DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
375DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
376DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
377DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
378
379DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
380DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
381DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
382DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
383
384DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
385DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
386DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
387DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
388
389DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
390DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
391DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
392DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
393
394DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
395DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
396DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
397DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
398
399DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
400DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
401DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
402DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
403
404DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
405DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
406DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
407DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
408
409DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
410DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
411DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
412DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
413
414DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
415DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
416DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
417DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
418
419DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
420DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
421DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
422DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
423
424DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
425DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
426DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
427DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
428
429DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
430DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
431DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
432DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
433
434/* Because the computation type is at least twice as large as required,
435 these work for both signed and unsigned source types. */
436static inline uint8_t do_mulh_b(int32_t n, int32_t m)
437{
438 return (n * m) >> 8;
439}
440
441static inline uint16_t do_mulh_h(int32_t n, int32_t m)
442{
443 return (n * m) >> 16;
444}
445
446static inline uint32_t do_mulh_s(int64_t n, int64_t m)
447{
448 return (n * m) >> 32;
449}
450
451static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
452{
453 uint64_t lo, hi;
454 muls64(&lo, &hi, n, m);
455 return hi;
456}
457
458static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
459{
460 uint64_t lo, hi;
461 mulu64(&lo, &hi, n, m);
462 return hi;
463}
464
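/* Worked example (illustrative): for UMULH, do_mulh_b(200, 200) sees both
 * uint8_t inputs promoted to int32_t, so 200 * 200 = 40000 and the helper
 * returns 40000 >> 8 = 156.  For SMULH the same helper sees int8_t inputs,
 * e.g. (-56) * (-56) = 3136, returning 3136 >> 8 = 12.
 */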
465DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
466DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
467DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
468DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
469
470DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
471DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
472DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
473DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
474
475DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
476DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
477DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
478DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
479
480DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
481DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
482
483DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
484DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
485
486/* Note that all bits of the shift are significant
487 and not modulo the element size. */
488#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
489#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
490#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
491
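/* Illustrative examples: for 8-bit elements a shift count of 10 clamps
 * DO_ASR to (N >> 7), so -1 stays -1, while DO_LSR and DO_LSL return 0
 * for any count >= the element width.
 */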
DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
503
504DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
505DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
506DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
507
508#undef DO_ZPZZ
509#undef DO_ZPZZ_D
511/* Three-operand expander, controlled by a predicate, in which the
512 * third operand is "wide". That is, for D = N op M, the same 64-bit
513 * value of M is used with all of the narrower values of N.
514 */
515#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
516void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
517{ \
518 intptr_t i, opr_sz = simd_oprsz(desc); \
519 for (i = 0; i < opr_sz; ) { \
520 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
521 TYPEW mm = *(TYPEW *)(vm + i); \
522 do { \
523 if (pg & 1) { \
524 TYPE nn = *(TYPE *)(vn + H(i)); \
525 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
526 } \
527 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
528 } while (i & 7); \
529 } \
530}
531
532DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
533DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
534DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
535
536DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
537DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
538DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
539
540DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
541DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
542DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
543
544#undef DO_ZPZW
545
546/* Fully general two-operand expander, controlled by a predicate.
547 */
548#define DO_ZPZ(NAME, TYPE, H, OP) \
549void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
550{ \
551 intptr_t i, opr_sz = simd_oprsz(desc); \
552 for (i = 0; i < opr_sz; ) { \
553 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
554 do { \
555 if (pg & 1) { \
556 TYPE nn = *(TYPE *)(vn + H(i)); \
557 *(TYPE *)(vd + H(i)) = OP(nn); \
558 } \
559 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
560 } while (i & 15); \
561 } \
562}
563
564/* Similarly, specialized for 64-bit operands. */
565#define DO_ZPZ_D(NAME, TYPE, OP) \
566void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
567{ \
568 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
569 TYPE *d = vd, *n = vn; \
570 uint8_t *pg = vg; \
571 for (i = 0; i < opr_sz; i += 1) { \
572 if (pg[H1(i)] & 1) { \
573 TYPE nn = n[i]; \
574 d[i] = OP(nn); \
575 } \
576 } \
577}
578
579#define DO_CLS_B(N) (clrsb32(N) - 24)
580#define DO_CLS_H(N) (clrsb32(N) - 16)
581
582DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
583DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
584DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
585DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
586
587#define DO_CLZ_B(N) (clz32(N) - 24)
588#define DO_CLZ_H(N) (clz32(N) - 16)
589
590DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
591DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
592DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
593DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
594
595DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
596DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
597DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
598DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
599
600#define DO_CNOT(N) (N == 0)
601
602DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
603DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
604DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
605DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
606
607#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
608
609DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
610DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
611DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
612
613#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
614
615DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
616DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
617DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
618
619#define DO_NOT(N) (~N)
620
621DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
622DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
623DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
624DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
625
626#define DO_SXTB(N) ((int8_t)N)
627#define DO_SXTH(N) ((int16_t)N)
628#define DO_SXTS(N) ((int32_t)N)
629#define DO_UXTB(N) ((uint8_t)N)
630#define DO_UXTH(N) ((uint16_t)N)
631#define DO_UXTS(N) ((uint32_t)N)
632
633DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
634DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
635DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
636DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
637DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
638DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
639
640DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
641DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
642DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
643DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
644DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
645DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
646
647#define DO_ABS(N) (N < 0 ? -N : N)
648
649DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
650DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
651DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
652DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
653
654#define DO_NEG(N) (-N)
655
656DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
657DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
658DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
659DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
660
661DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
662DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
663DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
664
665DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
666DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
667
668DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
669
670DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
671DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
672DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
673DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
674
675/* Three-operand expander, unpredicated, in which the third operand is "wide".
676 */
677#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
678void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
679{ \
680 intptr_t i, opr_sz = simd_oprsz(desc); \
681 for (i = 0; i < opr_sz; ) { \
682 TYPEW mm = *(TYPEW *)(vm + i); \
683 do { \
684 TYPE nn = *(TYPE *)(vn + H(i)); \
685 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
686 i += sizeof(TYPE); \
687 } while (i & 7); \
688 } \
689}
690
691DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
692DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
693DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
694
695DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
696DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
697DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
698
699DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
700DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
701DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
702
703#undef DO_ZZW
704
705#undef DO_CLS_B
706#undef DO_CLS_H
707#undef DO_CLZ_B
708#undef DO_CLZ_H
709#undef DO_CNOT
710#undef DO_FABS
711#undef DO_FNEG
712#undef DO_ABS
713#undef DO_NEG
714#undef DO_ZPZ
715#undef DO_ZPZ_D
716
717/* Two-operand reduction expander, controlled by a predicate.
718 * The difference between TYPERED and TYPERET has to do with
719 * sign-extension. E.g. for SMAX, TYPERED must be signed,
720 * but TYPERET must be unsigned so that e.g. a 32-bit value
721 * is not sign-extended to the ABI uint64_t return type.
722 */
723/* ??? If we were to vectorize this by hand the reduction ordering
724 * would change. For integer operands, this is perfectly fine.
725 */
726#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
727uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
728{ \
729 intptr_t i, opr_sz = simd_oprsz(desc); \
730 TYPERED ret = INIT; \
731 for (i = 0; i < opr_sz; ) { \
732 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
733 do { \
734 if (pg & 1) { \
735 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
736 ret = OP(ret, nn); \
737 } \
738 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
739 } while (i & 15); \
740 } \
741 return (TYPERET)ret; \
742}
743
744#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
745uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
746{ \
747 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
748 TYPEE *n = vn; \
749 uint8_t *pg = vg; \
750 TYPER ret = INIT; \
751 for (i = 0; i < opr_sz; i += 1) { \
752 if (pg[H1(i)] & 1) { \
753 TYPEE nn = n[i]; \
754 ret = OP(ret, nn); \
755 } \
756 } \
757 return ret; \
758}
759
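/* Illustrative example of the TYPERED/TYPERET distinction: sve_smaxv_s
 * with an all-false predicate reduces to INT32_MIN; returning it as
 * uint32_t yields 0x80000000 in the uint64_t ABI return slot rather than
 * the sign-extended 0xffffffff80000000.
 */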
760DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
761DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
762DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
763DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
764
765DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
766DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
767DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
768DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
769
770DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
771DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
772DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
773DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
774
775DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
776DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
777DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
778
779DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
780DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
781DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
782DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
783
784DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
785DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
786DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
787DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
788
789DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
790DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
791DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
792DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
793
794DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
795DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
796DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
797DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
798
799DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
800DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
801DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
802DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
803
804#undef DO_VPZ
805#undef DO_VPZ_D
806
/* Two vector operands, one scalar operand, unpredicated. */
808#define DO_ZZI(NAME, TYPE, OP) \
809void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
810{ \
811 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
812 TYPE s = s64, *d = vd, *n = vn; \
813 for (i = 0; i < opr_sz; ++i) { \
814 d[i] = OP(n[i], s); \
815 } \
816}
817
818#define DO_SUBR(X, Y) (Y - X)
819
820DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
821DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
822DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
823DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
824
825DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
826DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
827DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
828DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
829
830DO_ZZI(sve_smini_b, int8_t, DO_MIN)
831DO_ZZI(sve_smini_h, int16_t, DO_MIN)
832DO_ZZI(sve_smini_s, int32_t, DO_MIN)
833DO_ZZI(sve_smini_d, int64_t, DO_MIN)
834
835DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
836DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
837DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
838DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
839
840DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
841DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
842DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
843DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
844
845#undef DO_ZZI
846
847#undef DO_AND
848#undef DO_ORR
849#undef DO_EOR
850#undef DO_BIC
851#undef DO_ADD
852#undef DO_SUB
853#undef DO_MAX
854#undef DO_MIN
855#undef DO_ABD
856#undef DO_MUL
857#undef DO_DIV
858#undef DO_ASR
859#undef DO_LSR
860#undef DO_LSL
#undef DO_SUBR
863/* Similar to the ARM LastActiveElement pseudocode function, except the
864 result is multiplied by the element size. This includes the not found
865 indication; e.g. not found for esz=3 is -8. */
866static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
867{
868 uint64_t mask = pred_esz_masks[esz];
869 intptr_t i = words;
870
871 do {
872 uint64_t this_g = g[--i] & mask;
873 if (this_g) {
874 return i * 64 + (63 - clz64(this_g));
875 }
876 } while (i > 0);
877 return (intptr_t)-1 << esz;
878}
879
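/* Worked example (illustrative): with esz = 1 (halfwords) and a single
 * predicate word g[0] = 0x14, the mask keeps only the even bits, so the
 * highest remaining set bit is bit 4 and the return value is 4, i.e.
 * element 2 scaled by the 2-byte element size.
 */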
880uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
881{
882 uint32_t flags = PREDTEST_INIT;
883 uint64_t *d = vd, *g = vg;
884 intptr_t i = 0;
885
886 do {
887 uint64_t this_d = d[i];
888 uint64_t this_g = g[i];
889
890 if (this_g) {
891 if (!(flags & 4)) {
892 /* Set in D the first bit of G. */
893 this_d |= this_g & -this_g;
894 d[i] = this_d;
895 }
896 flags = iter_predtest_fwd(this_d, this_g, flags);
897 }
898 } while (++i < words);
899
900 return flags;
901}
902
903uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
904{
905 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
906 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
907 uint32_t flags = PREDTEST_INIT;
908 uint64_t *d = vd, *g = vg, esz_mask;
909 intptr_t i, next;
910
911 next = last_active_element(vd, words, esz) + (1 << esz);
912 esz_mask = pred_esz_masks[esz];
913
914 /* Similar to the pseudocode for pnext, but scaled by ESZ
915 so that we find the correct bit. */
916 if (next < words * 64) {
917 uint64_t mask = -1;
918
919 if (next & 63) {
920 mask = ~((1ull << (next & 63)) - 1);
921 next &= -64;
922 }
923 do {
924 uint64_t this_g = g[next / 64] & esz_mask & mask;
925 if (this_g != 0) {
926 next = (next & -64) + ctz64(this_g);
927 break;
928 }
929 next += 64;
930 mask = -1;
931 } while (next < words * 64);
932 }
933
934 i = 0;
935 do {
936 uint64_t this_d = 0;
937 if (i == next / 64) {
938 this_d = 1ull << (next & 63);
939 }
940 d[i] = this_d;
941 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
942 } while (++i < words);
943
944 return flags;
945}
946
947/* Store zero into every active element of Zd. We will use this for two
948 * and three-operand predicated instructions for which logic dictates a
949 * zero result. In particular, logical shift by element size, which is
950 * otherwise undefined on the host.
951 *
952 * For element sizes smaller than uint64_t, we use tables to expand
953 * the N bits of the controlling predicate to a byte mask, and clear
954 * those bytes.
955 */
956void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
957{
958 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
959 uint64_t *d = vd;
960 uint8_t *pg = vg;
961 for (i = 0; i < opr_sz; i += 1) {
962 d[i] &= ~expand_pred_b(pg[H1(i)]);
963 }
964}
965
966void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
967{
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
969 uint64_t *d = vd;
970 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_h(pg[H1(i)]);
973 }
974}
975
976void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
977{
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t *d = vd;
980 uint8_t *pg = vg;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_s(pg[H1(i)]);
983 }
984}
985
986void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
987{
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
989 uint64_t *d = vd;
990 uint8_t *pg = vg;
991 for (i = 0; i < opr_sz; i += 1) {
992 if (pg[H1(i)] & 1) {
993 d[i] = 0;
994 }
995 }
996}
997
998/* Copy Zn into Zd, and store zero into inactive elements. */
999void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1000{
1001 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1002 uint64_t *d = vd, *n = vn;
1003 uint8_t *pg = vg;
1004 for (i = 0; i < opr_sz; i += 1) {
1005 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1006 }
1007}
1008
1009void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1010{
1011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1012 uint64_t *d = vd, *n = vn;
1013 uint8_t *pg = vg;
1014 for (i = 0; i < opr_sz; i += 1) {
1015 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1016 }
1017}
1018
1019void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1020{
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1022 uint64_t *d = vd, *n = vn;
1023 uint8_t *pg = vg;
1024 for (i = 0; i < opr_sz; i += 1) {
1025 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1026 }
1027}
1028
1029void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1030{
1031 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1032 uint64_t *d = vd, *n = vn;
1033 uint8_t *pg = vg;
1034 for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1036 }
1037}
1038
1039/* Three-operand expander, immediate operand, controlled by a predicate.
1040 */
1041#define DO_ZPZI(NAME, TYPE, H, OP) \
1042void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1043{ \
1044 intptr_t i, opr_sz = simd_oprsz(desc); \
1045 TYPE imm = simd_data(desc); \
1046 for (i = 0; i < opr_sz; ) { \
1047 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1048 do { \
1049 if (pg & 1) { \
1050 TYPE nn = *(TYPE *)(vn + H(i)); \
1051 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1052 } \
1053 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1054 } while (i & 15); \
1055 } \
1056}
1057
1058/* Similarly, specialized for 64-bit operands. */
1059#define DO_ZPZI_D(NAME, TYPE, OP) \
1060void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1061{ \
1062 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1063 TYPE *d = vd, *n = vn; \
1064 TYPE imm = simd_data(desc); \
1065 uint8_t *pg = vg; \
1066 for (i = 0; i < opr_sz; i += 1) { \
1067 if (pg[H1(i)] & 1) { \
1068 TYPE nn = n[i]; \
1069 d[i] = OP(nn, imm); \
1070 } \
1071 } \
1072}
1073
1074#define DO_SHR(N, M) (N >> M)
1075#define DO_SHL(N, M) (N << M)
1076
1077/* Arithmetic shift right for division. This rounds negative numbers
1078 toward zero as per signed division. Therefore before shifting,
1079 when N is negative, add 2**M-1. */
1080#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
1081
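/* Worked example (illustrative): DO_ASRD(-7, 2) computes (-7 + 3) >> 2 = -1,
 * matching C's truncating -7 / 4, whereas a plain arithmetic shift would
 * round toward minus infinity and give -2.
 */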
1082DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1083DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1084DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1085DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1086
1087DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1088DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1089DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1090DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1091
1092DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1093DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1094DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1095DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1096
1097DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1098DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1099DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1100DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1101
1102#undef DO_SHR
1103#undef DO_SHL
1104#undef DO_ASRD
1105#undef DO_ZPZI
1106#undef DO_ZPZI_D
1107
1108/* Fully general four-operand expander, controlled by a predicate.
1109 */
1110#define DO_ZPZZZ(NAME, TYPE, H, OP) \
1111void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1112 void *vg, uint32_t desc) \
1113{ \
1114 intptr_t i, opr_sz = simd_oprsz(desc); \
1115 for (i = 0; i < opr_sz; ) { \
1116 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1117 do { \
1118 if (pg & 1) { \
1119 TYPE nn = *(TYPE *)(vn + H(i)); \
1120 TYPE mm = *(TYPE *)(vm + H(i)); \
1121 TYPE aa = *(TYPE *)(va + H(i)); \
1122 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1123 } \
1124 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1125 } while (i & 15); \
1126 } \
1127}
1128
1129/* Similarly, specialized for 64-bit operands. */
1130#define DO_ZPZZZ_D(NAME, TYPE, OP) \
1131void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1132 void *vg, uint32_t desc) \
1133{ \
1134 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1135 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1136 uint8_t *pg = vg; \
1137 for (i = 0; i < opr_sz; i += 1) { \
1138 if (pg[H1(i)] & 1) { \
1139 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1140 d[i] = OP(aa, nn, mm); \
1141 } \
1142 } \
1143}
1144
1145#define DO_MLA(A, N, M) (A + N * M)
1146#define DO_MLS(A, N, M) (A - N * M)
1147
1148DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1149DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1150
1151DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1152DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1153
1154DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1155DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1156
1157DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1158DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1159
1160#undef DO_MLA
1161#undef DO_MLS
1162#undef DO_ZPZZZ
1163#undef DO_ZPZZZ_D
1164
1165void HELPER(sve_index_b)(void *vd, uint32_t start,
1166 uint32_t incr, uint32_t desc)
1167{
1168 intptr_t i, opr_sz = simd_oprsz(desc);
1169 uint8_t *d = vd;
1170 for (i = 0; i < opr_sz; i += 1) {
1171 d[H1(i)] = start + i * incr;
1172 }
1173}
1174
1175void HELPER(sve_index_h)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1177{
1178 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1179 uint16_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H2(i)] = start + i * incr;
1182 }
1183}
1184
1185void HELPER(sve_index_s)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1187{
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1189 uint32_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H4(i)] = start + i * incr;
1192 }
1193}
1194
1195void HELPER(sve_index_d)(void *vd, uint64_t start,
1196 uint64_t incr, uint32_t desc)
1197{
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1199 uint64_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[i] = start + i * incr;
1202 }
1203}
1204
1205void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1206{
1207 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1208 uint32_t sh = simd_data(desc);
1209 uint32_t *d = vd, *n = vn, *m = vm;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = n[i] + (m[i] << sh);
1212 }
1213}
1214
1215void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1216{
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1218 uint64_t sh = simd_data(desc);
1219 uint64_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1222 }
1223}
1224
1225void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1226{
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1232 }
1233}
1234
1235void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1236{
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1242 }
1243}
1244
1245void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1246{
1247 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1248 static const uint16_t coeff[] = {
1249 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1250 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1251 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1252 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1253 };
1254 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1255 uint16_t *d = vd, *n = vn;
1256
1257 for (i = 0; i < opr_sz; i++) {
1258 uint16_t nn = n[i];
1259 intptr_t idx = extract32(nn, 0, 5);
1260 uint16_t exp = extract32(nn, 5, 5);
1261 d[i] = coeff[idx] | (exp << 10);
1262 }
1263}
1264
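/* Illustrative note: each entry in the table above is the fraction field of
 * 2^(i/32) in IEEE half precision, e.g. coeff[16] == 0x01a8 because
 * (sqrt(2) - 1) * 1024 is approximately 424 == 0x1a8; FEXPA then splices
 * in the exponent bits taken from the source element.
 */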
1265void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1266{
1267 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1268 static const uint32_t coeff[] = {
1269 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1270 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1271 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1272 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1273 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1274 0x1ef532, 0x20b051, 0x227043, 0x243516,
1275 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1276 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1277 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1278 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1279 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1280 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1281 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1282 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1283 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1284 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1285 };
1286 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1287 uint32_t *d = vd, *n = vn;
1288
1289 for (i = 0; i < opr_sz; i++) {
1290 uint32_t nn = n[i];
1291 intptr_t idx = extract32(nn, 0, 6);
1292 uint32_t exp = extract32(nn, 6, 8);
1293 d[i] = coeff[idx] | (exp << 23);
1294 }
1295}
1296
1297void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1298{
1299 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1300 static const uint64_t coeff[] = {
1301 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1302 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1303 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1304 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1305 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1306 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1307 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1308 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1309 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1310 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1311 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1312 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1313 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1314 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1315 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1316 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1317 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1318 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1319 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1320 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1321 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1322 0xFA7C1819E90D8ull,
1323 };
1324 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1325 uint64_t *d = vd, *n = vn;
1326
1327 for (i = 0; i < opr_sz; i++) {
1328 uint64_t nn = n[i];
1329 intptr_t idx = extract32(nn, 0, 6);
1330 uint64_t exp = extract32(nn, 6, 11);
1331 d[i] = coeff[idx] | (exp << 52);
1332 }
1333}
1334
1335void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1336{
1337 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1338 uint16_t *d = vd, *n = vn, *m = vm;
1339 for (i = 0; i < opr_sz; i += 1) {
1340 uint16_t nn = n[i];
1341 uint16_t mm = m[i];
1342 if (mm & 1) {
1343 nn = float16_one;
1344 }
1345 d[i] = nn ^ (mm & 2) << 14;
1346 }
1347}
1348
1349void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1350{
1351 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1352 uint32_t *d = vd, *n = vn, *m = vm;
1353 for (i = 0; i < opr_sz; i += 1) {
1354 uint32_t nn = n[i];
1355 uint32_t mm = m[i];
1356 if (mm & 1) {
1357 nn = float32_one;
1358 }
1359 d[i] = nn ^ (mm & 2) << 30;
1360 }
1361}
1362
1363void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1364{
1365 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1366 uint64_t *d = vd, *n = vn, *m = vm;
1367 for (i = 0; i < opr_sz; i += 1) {
1368 uint64_t nn = n[i];
1369 uint64_t mm = m[i];
1370 if (mm & 1) {
1371 nn = float64_one;
1372 }
1373 d[i] = nn ^ (mm & 2) << 62;
1374 }
1375}
1376
1377/*
1378 * Signed saturating addition with scalar operand.
1379 */
1380
1381void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1382{
1383 intptr_t i, oprsz = simd_oprsz(desc);
1384
1385 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1386 int r = *(int8_t *)(a + i) + b;
1387 if (r > INT8_MAX) {
1388 r = INT8_MAX;
1389 } else if (r < INT8_MIN) {
1390 r = INT8_MIN;
1391 }
1392 *(int8_t *)(d + i) = r;
1393 }
1394}
1395
1396void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1397{
1398 intptr_t i, oprsz = simd_oprsz(desc);
1399
1400 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1401 int r = *(int16_t *)(a + i) + b;
1402 if (r > INT16_MAX) {
1403 r = INT16_MAX;
1404 } else if (r < INT16_MIN) {
1405 r = INT16_MIN;
1406 }
1407 *(int16_t *)(d + i) = r;
1408 }
1409}
1410
1411void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1412{
1413 intptr_t i, oprsz = simd_oprsz(desc);
1414
1415 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1416 int64_t r = *(int32_t *)(a + i) + b;
1417 if (r > INT32_MAX) {
1418 r = INT32_MAX;
1419 } else if (r < INT32_MIN) {
1420 r = INT32_MIN;
1421 }
1422 *(int32_t *)(d + i) = r;
1423 }
1424}
1425
1426void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1427{
1428 intptr_t i, oprsz = simd_oprsz(desc);
1429
1430 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1431 int64_t ai = *(int64_t *)(a + i);
1432 int64_t r = ai + b;
1433 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1434 /* Signed overflow. */
1435 r = (r < 0 ? INT64_MAX : INT64_MIN);
1436 }
1437 *(int64_t *)(d + i) = r;
1438 }
1439}
1440
1441/*
1442 * Unsigned saturating addition with scalar operand.
1443 */
1444
1445void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1446{
1447 intptr_t i, oprsz = simd_oprsz(desc);
1448
1449 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1450 int r = *(uint8_t *)(a + i) + b;
1451 if (r > UINT8_MAX) {
1452 r = UINT8_MAX;
1453 } else if (r < 0) {
1454 r = 0;
1455 }
1456 *(uint8_t *)(d + i) = r;
1457 }
1458}
1459
1460void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1461{
1462 intptr_t i, oprsz = simd_oprsz(desc);
1463
1464 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1465 int r = *(uint16_t *)(a + i) + b;
1466 if (r > UINT16_MAX) {
1467 r = UINT16_MAX;
1468 } else if (r < 0) {
1469 r = 0;
1470 }
1471 *(uint16_t *)(d + i) = r;
1472 }
1473}
1474
1475void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1476{
1477 intptr_t i, oprsz = simd_oprsz(desc);
1478
1479 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1480 int64_t r = *(uint32_t *)(a + i) + b;
1481 if (r > UINT32_MAX) {
1482 r = UINT32_MAX;
1483 } else if (r < 0) {
1484 r = 0;
1485 }
1486 *(uint32_t *)(d + i) = r;
1487 }
1488}
1489
1490void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1491{
1492 intptr_t i, oprsz = simd_oprsz(desc);
1493
1494 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1495 uint64_t r = *(uint64_t *)(a + i) + b;
1496 if (r < b) {
1497 r = UINT64_MAX;
1498 }
1499 *(uint64_t *)(d + i) = r;
1500 }
1501}
1502
1503void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1504{
1505 intptr_t i, oprsz = simd_oprsz(desc);
1506
1507 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1508 uint64_t ai = *(uint64_t *)(a + i);
1509 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1510 }
1511}
1512
1513/* Two operand predicated copy immediate with merge. All valid immediates
1514 * can fit within 17 signed bits in the simd_data field.
1515 */
1516void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1517 uint64_t mm, uint32_t desc)
1518{
1519 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1520 uint64_t *d = vd, *n = vn;
1521 uint8_t *pg = vg;
1522
1523 mm = dup_const(MO_8, mm);
1524 for (i = 0; i < opr_sz; i += 1) {
1525 uint64_t nn = n[i];
1526 uint64_t pp = expand_pred_b(pg[H1(i)]);
1527 d[i] = (mm & pp) | (nn & ~pp);
1528 }
1529}
1530
1531void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1532 uint64_t mm, uint32_t desc)
1533{
1534 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1535 uint64_t *d = vd, *n = vn;
1536 uint8_t *pg = vg;
1537
1538 mm = dup_const(MO_16, mm);
1539 for (i = 0; i < opr_sz; i += 1) {
1540 uint64_t nn = n[i];
1541 uint64_t pp = expand_pred_h(pg[H1(i)]);
1542 d[i] = (mm & pp) | (nn & ~pp);
1543 }
1544}
1545
1546void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1547 uint64_t mm, uint32_t desc)
1548{
1549 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1550 uint64_t *d = vd, *n = vn;
1551 uint8_t *pg = vg;
1552
1553 mm = dup_const(MO_32, mm);
1554 for (i = 0; i < opr_sz; i += 1) {
1555 uint64_t nn = n[i];
1556 uint64_t pp = expand_pred_s(pg[H1(i)]);
1557 d[i] = (mm & pp) | (nn & ~pp);
1558 }
1559}
1560
1561void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1562 uint64_t mm, uint32_t desc)
1563{
1564 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1565 uint64_t *d = vd, *n = vn;
1566 uint8_t *pg = vg;
1567
1568 for (i = 0; i < opr_sz; i += 1) {
1569 uint64_t nn = n[i];
1570 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1571 }
1572}
1573
1574void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1575{
1576 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1577 uint64_t *d = vd;
1578 uint8_t *pg = vg;
1579
1580 val = dup_const(MO_8, val);
1581 for (i = 0; i < opr_sz; i += 1) {
1582 d[i] = val & expand_pred_b(pg[H1(i)]);
1583 }
1584}
1585
1586void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1587{
1588 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1589 uint64_t *d = vd;
1590 uint8_t *pg = vg;
1591
1592 val = dup_const(MO_16, val);
1593 for (i = 0; i < opr_sz; i += 1) {
1594 d[i] = val & expand_pred_h(pg[H1(i)]);
1595 }
1596}
1597
1598void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1599{
1600 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1601 uint64_t *d = vd;
1602 uint8_t *pg = vg;
1603
1604 val = dup_const(MO_32, val);
1605 for (i = 0; i < opr_sz; i += 1) {
1606 d[i] = val & expand_pred_s(pg[H1(i)]);
1607 }
1608}
1609
1610void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1611{
1612 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1613 uint64_t *d = vd;
1614 uint8_t *pg = vg;
1615
1616 for (i = 0; i < opr_sz; i += 1) {
1617 d[i] = (pg[H1(i)] & 1 ? val : 0);
1618 }
1619}
1620
/* Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing is necessary.
 */
1624static void swap_memmove(void *vd, void *vs, size_t n)
1625{
1626 uintptr_t d = (uintptr_t)vd;
1627 uintptr_t s = (uintptr_t)vs;
1628 uintptr_t o = (d | s | n) & 7;
1629 size_t i;
1630
1631#ifndef HOST_WORDS_BIGENDIAN
1632 o = 0;
1633#endif
1634 switch (o) {
1635 case 0:
1636 memmove(vd, vs, n);
1637 break;
1638
1639 case 4:
1640 if (d < s || d >= s + n) {
1641 for (i = 0; i < n; i += 4) {
1642 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1643 }
1644 } else {
1645 for (i = n; i > 0; ) {
1646 i -= 4;
1647 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1648 }
1649 }
1650 break;
1651
1652 case 2:
1653 case 6:
1654 if (d < s || d >= s + n) {
1655 for (i = 0; i < n; i += 2) {
1656 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1657 }
1658 } else {
1659 for (i = n; i > 0; ) {
1660 i -= 2;
1661 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1662 }
1663 }
1664 break;
1665
1666 default:
1667 if (d < s || d >= s + n) {
1668 for (i = 0; i < n; i++) {
1669 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1670 }
1671 } else {
1672 for (i = n; i > 0; ) {
1673 i -= 1;
1674 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1675 }
1676 }
1677 break;
1678 }
1679}
1680
1681void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1682{
1683 intptr_t opr_sz = simd_oprsz(desc);
1684 size_t n_ofs = simd_data(desc);
1685 size_t n_siz = opr_sz - n_ofs;
1686
1687 if (vd != vm) {
1688 swap_memmove(vd, vn + n_ofs, n_siz);
1689 swap_memmove(vd + n_siz, vm, n_ofs);
1690 } else if (vd != vn) {
1691 swap_memmove(vd + n_siz, vd, n_ofs);
1692 swap_memmove(vd, vn + n_ofs, n_siz);
1693 } else {
1694 /* vd == vn == vm. Need temp space. */
1695 ARMVectorReg tmp;
1696 swap_memmove(&tmp, vm, n_ofs);
1697 swap_memmove(vd, vd + n_ofs, n_siz);
1698 memcpy(vd + n_siz, &tmp, n_ofs);
1699 }
1700}
1701
1702#define DO_INSR(NAME, TYPE, H) \
1703void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1704{ \
1705 intptr_t opr_sz = simd_oprsz(desc); \
1706 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1707 *(TYPE *)(vd + H(0)) = val; \
1708}
1709
1710DO_INSR(sve_insr_b, uint8_t, H1)
1711DO_INSR(sve_insr_h, uint16_t, H1_2)
1712DO_INSR(sve_insr_s, uint32_t, H1_4)
1713DO_INSR(sve_insr_d, uint64_t, )
1714
1715#undef DO_INSR
1716
1717void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1718{
1719 intptr_t i, j, opr_sz = simd_oprsz(desc);
1720 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1721 uint64_t f = *(uint64_t *)(vn + i);
1722 uint64_t b = *(uint64_t *)(vn + j);
1723 *(uint64_t *)(vd + i) = bswap64(b);
1724 *(uint64_t *)(vd + j) = bswap64(f);
1725 }
1726}
1727
1728void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1729{
1730 intptr_t i, j, opr_sz = simd_oprsz(desc);
1731 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1732 uint64_t f = *(uint64_t *)(vn + i);
1733 uint64_t b = *(uint64_t *)(vn + j);
1734 *(uint64_t *)(vd + i) = hswap64(b);
1735 *(uint64_t *)(vd + j) = hswap64(f);
1736 }
1737}
1738
1739void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1740{
1741 intptr_t i, j, opr_sz = simd_oprsz(desc);
1742 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1743 uint64_t f = *(uint64_t *)(vn + i);
1744 uint64_t b = *(uint64_t *)(vn + j);
1745 *(uint64_t *)(vd + i) = rol64(b, 32);
1746 *(uint64_t *)(vd + j) = rol64(f, 32);
1747 }
1748}
1749
1750void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1751{
1752 intptr_t i, j, opr_sz = simd_oprsz(desc);
1753 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1754 uint64_t f = *(uint64_t *)(vn + i);
1755 uint64_t b = *(uint64_t *)(vn + j);
1756 *(uint64_t *)(vd + i) = b;
1757 *(uint64_t *)(vd + j) = f;
1758 }
1759}
1760
1761#define DO_TBL(NAME, TYPE, H) \
1762void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1763{ \
1764 intptr_t i, opr_sz = simd_oprsz(desc); \
1765 uintptr_t elem = opr_sz / sizeof(TYPE); \
1766 TYPE *d = vd, *n = vn, *m = vm; \
1767 ARMVectorReg tmp; \
1768 if (unlikely(vd == vn)) { \
1769 n = memcpy(&tmp, vn, opr_sz); \
1770 } \
1771 for (i = 0; i < elem; i++) { \
1772 TYPE j = m[H(i)]; \
1773 d[H(i)] = j < elem ? n[H(j)] : 0; \
1774 } \
1775}
1776
1777DO_TBL(sve_tbl_b, uint8_t, H1)
1778DO_TBL(sve_tbl_h, uint16_t, H2)
1779DO_TBL(sve_tbl_s, uint32_t, H4)
1780DO_TBL(sve_tbl_d, uint64_t, )
1781
1782#undef DO_TBL
1783
1784#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1785void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1786{ \
1787 intptr_t i, opr_sz = simd_oprsz(desc); \
1788 TYPED *d = vd; \
1789 TYPES *n = vn; \
1790 ARMVectorReg tmp; \
1791 if (unlikely(vn - vd < opr_sz)) { \
1792 n = memcpy(&tmp, n, opr_sz / 2); \
1793 } \
1794 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1795 d[HD(i)] = n[HS(i)]; \
1796 } \
1797}
1798
1799DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1800DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1801DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1802
1803DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1804DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1805DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1806
1807#undef DO_UNPK
d731d8cb
RH
1808
1809/* Mask of bits included in the even numbered predicates of width esz.
1810 * We also use this for expand_bits/compress_bits, and so extend the
1811 * same pattern out to 16-bit units.
1812 */
1813static const uint64_t even_bit_esz_masks[5] = {
1814 0x5555555555555555ull,
1815 0x3333333333333333ull,
1816 0x0f0f0f0f0f0f0f0full,
1817 0x00ff00ff00ff00ffull,
1818 0x0000ffff0000ffffull,
1819};
1820
1821/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1822 * For N==0, this corresponds to the operation that in qemu/bitops.h
1823 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1824 * section 7-2 Shuffling Bits.
1825 */
1826static uint64_t expand_bits(uint64_t x, int n)
1827{
1828 int i;
1829
1830 x &= 0xffffffffu;
1831 for (i = 4; i >= n; i--) {
1832 int sh = 1 << i;
1833 x = ((x << sh) | x) & even_bit_esz_masks[i];
1834 }
1835 return x;
1836}
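/* Worked example for N == 0: expand_bits(0b1011, 0) == 0b1000101 (0x45);
 * source bits 0, 1 and 3 move to result bits 0, 2 and 6, with the odd
 * result bits left as zero.
 */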
1837
1838/* Compress units of 2**(N+1) bits to units of 2**N bits.
1839 * For N==0, this corresponds to the operation that in qemu/bitops.h
1840 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1841 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1842 */
1843static uint64_t compress_bits(uint64_t x, int n)
1844{
1845 int i;
1846
1847 for (i = n; i <= 4; i++) {
1848 int sh = 1 << i;
1849 x &= even_bit_esz_masks[i];
1850 x = (x >> sh) | x;
1851 }
1852 return x & 0xffffffffu;
1853}
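/* Worked example for N == 0: compress_bits(0x45, 0) == 0b1011, undoing
 * the expansion above; any bits in the odd positions are discarded by
 * the first mask application.
 */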
1854
1855void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1856{
1857 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1858 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1859 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1860 uint64_t *d = vd;
1861 intptr_t i;
1862
1863 if (oprsz <= 8) {
1864 uint64_t nn = *(uint64_t *)vn;
1865 uint64_t mm = *(uint64_t *)vm;
1866 int half = 4 * oprsz;
1867
1868 nn = extract64(nn, high * half, half);
1869 mm = extract64(mm, high * half, half);
1870 nn = expand_bits(nn, esz);
1871 mm = expand_bits(mm, esz);
1872 d[0] = nn + (mm << (1 << esz));
1873 } else {
1874 ARMPredicateReg tmp_n, tmp_m;
1875
1876 /* We produce output faster than we consume input.
1877 Therefore we must be mindful of possible overlap. */
1878 if ((vn - vd) < (uintptr_t)oprsz) {
1879 vn = memcpy(&tmp_n, vn, oprsz);
1880 }
1881 if ((vm - vd) < (uintptr_t)oprsz) {
1882 vm = memcpy(&tmp_m, vm, oprsz);
1883 }
1884 if (high) {
1885 high = oprsz >> 1;
1886 }
1887
1888 if ((high & 3) == 0) {
1889 uint32_t *n = vn, *m = vm;
1890 high >>= 2;
1891
1892 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1893 uint64_t nn = n[H4(high + i)];
1894 uint64_t mm = m[H4(high + i)];
1895
1896 nn = expand_bits(nn, esz);
1897 mm = expand_bits(mm, esz);
1898 d[i] = nn + (mm << (1 << esz));
1899 }
1900 } else {
1901 uint8_t *n = vn, *m = vm;
1902 uint16_t *d16 = vd;
1903
1904 for (i = 0; i < oprsz / 2; i++) {
1905 uint16_t nn = n[H1(high + i)];
1906 uint16_t mm = m[H1(high + i)];
1907
1908 nn = expand_bits(nn, esz);
1909 mm = expand_bits(mm, esz);
1910 d16[H2(i)] = nn + (mm << (1 << esz));
1911 }
1912 }
1913 }
1914}
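/* For example, with oprsz == 2, esz == 0 and high == 0, the low 8
 * predicate bits of N and M are interleaved bit by bit: nn == 0x0f and
 * mm == 0x00 expand to 0x55 and 0x00, so d[0] == 0x55.
 */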
1915
1916void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1917{
1918 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1919 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1920 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1921 uint64_t *d = vd, *n = vn, *m = vm;
1922 uint64_t l, h;
1923 intptr_t i;
1924
1925 if (oprsz <= 8) {
1926 l = compress_bits(n[0] >> odd, esz);
1927 h = compress_bits(m[0] >> odd, esz);
1928 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1929 } else {
1930 ARMPredicateReg tmp_m;
1931 intptr_t oprsz_16 = oprsz / 16;
1932
1933 if ((vm - vd) < (uintptr_t)oprsz) {
1934 m = memcpy(&tmp_m, vm, oprsz);
1935 }
1936
1937 for (i = 0; i < oprsz_16; i++) {
1938 l = n[2 * i + 0];
1939 h = n[2 * i + 1];
1940 l = compress_bits(l >> odd, esz);
1941 h = compress_bits(h >> odd, esz);
1942 d[i] = l + (h << 32);
1943 }
1944
1945 /* For a VL that is not a power of 2, the results from M do not
1946 align nicely with the uint64_t for D. Put the aligned results
1947 from M into TMP_M and then copy it into place afterward. */
1948 if (oprsz & 15) {
1949 d[i] = compress_bits(n[2 * i] >> odd, esz);
1950
1951 for (i = 0; i < oprsz_16; i++) {
1952 l = m[2 * i + 0];
1953 h = m[2 * i + 1];
1954 l = compress_bits(l >> odd, esz);
1955 h = compress_bits(h >> odd, esz);
1956 tmp_m.p[i] = l + (h << 32);
1957 }
1958 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1959
1960 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1961 } else {
1962 for (i = 0; i < oprsz_16; i++) {
1963 l = m[2 * i + 0];
1964 h = m[2 * i + 1];
1965 l = compress_bits(l >> odd, esz);
1966 h = compress_bits(h >> odd, esz);
1967 d[oprsz_16 + i] = l + (h << 32);
1968 }
1969 }
1970 }
1971}
1972
1973void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1974{
1975 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1976 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1977 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1978 uint64_t *d = vd, *n = vn, *m = vm;
1979 uint64_t mask;
1980 int shr, shl;
1981 intptr_t i;
1982
1983 shl = 1 << esz;
1984 shr = 0;
1985 mask = even_bit_esz_masks[esz];
1986 if (odd) {
1987 mask <<= shl;
1988 shr = shl;
1989 shl = 0;
1990 }
1991
1992 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1993 uint64_t nn = (n[i] & mask) >> shr;
1994 uint64_t mm = (m[i] & mask) << shl;
1995 d[i] = nn + mm;
1996 }
1997}
1998
1999/* Reverse units of 2**N bits. */
2000static uint64_t reverse_bits_64(uint64_t x, int n)
2001{
2002 int i, sh;
2003
2004 x = bswap64(x);
2005 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2006 uint64_t mask = even_bit_esz_masks[i];
2007 x = ((x & mask) << sh) | ((x >> sh) & mask);
2008 }
2009 return x;
2010}
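/* For example, with n == 3 only the bswap64 runs, reversing whole bytes;
 * with n == 0 the loop also swaps nibbles, bit-pairs and single bits,
 * so reverse_bits_64(1, 0) == 1ull << 63.
 */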
2011
2012static uint8_t reverse_bits_8(uint8_t x, int n)
2013{
2014 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2015 int i, sh;
2016
2017 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2018 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2019 }
2020 return x;
2021}
2022
2023void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2024{
2025 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2026 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2027 intptr_t i, oprsz_2 = oprsz / 2;
2028
2029 if (oprsz <= 8) {
2030 uint64_t l = *(uint64_t *)vn;
2031 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2032 *(uint64_t *)vd = l;
2033 } else if ((oprsz & 15) == 0) {
2034 for (i = 0; i < oprsz_2; i += 8) {
2035 intptr_t ih = oprsz - 8 - i;
2036 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2037 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2038 *(uint64_t *)(vd + i) = h;
2039 *(uint64_t *)(vd + ih) = l;
2040 }
2041 } else {
2042 for (i = 0; i < oprsz_2; i += 1) {
2043 intptr_t il = H1(i);
2044 intptr_t ih = H1(oprsz - 1 - i);
2045 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2046 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2047 *(uint8_t *)(vd + il) = h;
2048 *(uint8_t *)(vd + ih) = l;
2049 }
2050 }
2051}
2052
2053void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2054{
2055 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2056 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2057 uint64_t *d = vd;
2058 intptr_t i;
2059
2060 if (oprsz <= 8) {
2061 uint64_t nn = *(uint64_t *)vn;
2062 int half = 4 * oprsz;
2063
2064 nn = extract64(nn, high * half, half);
2065 nn = expand_bits(nn, 0);
2066 d[0] = nn;
2067 } else {
2068 ARMPredicateReg tmp_n;
2069
2070 /* We produce output faster than we consume input.
2071 Therefore we must be mindful of possible overlap. */
2072 if ((vn - vd) < (uintptr_t)oprsz) {
2073 vn = memcpy(&tmp_n, vn, oprsz);
2074 }
2075 if (high) {
2076 high = oprsz >> 1;
2077 }
2078
2079 if ((high & 3) == 0) {
2080 uint32_t *n = vn;
2081 high >>= 2;
2082
2083 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2084 uint64_t nn = n[H4(high + i)];
2085 d[i] = expand_bits(nn, 0);
2086 }
2087 } else {
2088 uint16_t *d16 = vd;
2089 uint8_t *n = vn;
2090
2091 for (i = 0; i < oprsz / 2; i++) {
2092 uint16_t nn = n[H1(high + i)];
2093 d16[H2(i)] = expand_bits(nn, 0);
2094 }
2095 }
2096 }
2097}
234b48e9
RH
2098
2099#define DO_ZIP(NAME, TYPE, H) \
2100void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2101{ \
2102 intptr_t oprsz = simd_oprsz(desc); \
2103 intptr_t i, oprsz_2 = oprsz / 2; \
2104 ARMVectorReg tmp_n, tmp_m; \
2105 /* We produce output faster than we consume input. \
2106 Therefore we must be mindful of possible overlap. */ \
2107 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2108 vn = memcpy(&tmp_n, vn, oprsz_2); \
2109 } \
2110 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2111 vm = memcpy(&tmp_m, vm, oprsz_2); \
2112 } \
2113 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2114 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2115 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2116 } \
2117}
2118
2119DO_ZIP(sve_zip_b, uint8_t, H1)
2120DO_ZIP(sve_zip_h, uint16_t, H1_2)
2121DO_ZIP(sve_zip_s, uint32_t, H1_4)
2122DO_ZIP(sve_zip_d, uint64_t, )
2123
2124#define DO_UZP(NAME, TYPE, H) \
2125void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2126{ \
2127 intptr_t oprsz = simd_oprsz(desc); \
2128 intptr_t oprsz_2 = oprsz / 2; \
2129 intptr_t odd_ofs = simd_data(desc); \
2130 intptr_t i; \
2131 ARMVectorReg tmp_m; \
2132 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2133 vm = memcpy(&tmp_m, vm, oprsz); \
2134 } \
2135 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2136 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2137 } \
2138 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2139 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2140 } \
2141}
2142
2143DO_UZP(sve_uzp_b, uint8_t, H1)
2144DO_UZP(sve_uzp_h, uint16_t, H1_2)
2145DO_UZP(sve_uzp_s, uint32_t, H1_4)
2146DO_UZP(sve_uzp_d, uint64_t, )
2147
2148#define DO_TRN(NAME, TYPE, H) \
2149void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2150{ \
2151 intptr_t oprsz = simd_oprsz(desc); \
2152 intptr_t odd_ofs = simd_data(desc); \
2153 intptr_t i; \
2154 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2155 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2156 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2157 *(TYPE *)(vd + H(i + 0)) = ae; \
2158 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2159 } \
2160}
2161
2162DO_TRN(sve_trn_b, uint8_t, H1)
2163DO_TRN(sve_trn_h, uint16_t, H1_2)
2164DO_TRN(sve_trn_s, uint32_t, H1_4)
2165DO_TRN(sve_trn_d, uint64_t, )
2166
2167#undef DO_ZIP
2168#undef DO_UZP
2169#undef DO_TRN
3ca879ae
RH
2170
2171void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2172{
2173 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2174 uint32_t *d = vd, *n = vn;
2175 uint8_t *pg = vg;
2176
2177 for (i = j = 0; i < opr_sz; i++) {
2178 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2179 d[H4(j)] = n[H4(i)];
2180 j++;
2181 }
2182 }
2183 for (; j < opr_sz; j++) {
2184 d[H4(j)] = 0;
2185 }
2186}
2187
2188void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2189{
2190 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2191 uint64_t *d = vd, *n = vn;
2192 uint8_t *pg = vg;
2193
2194 for (i = j = 0; i < opr_sz; i++) {
2195 if (pg[H1(i)] & 1) {
2196 d[j] = n[i];
2197 j++;
2198 }
2199 }
2200 for (; j < opr_sz; j++) {
2201 d[j] = 0;
2202 }
2203}
ef23cb72
RH
2204
2205/* Similar to the ARM LastActiveElement pseudocode function, except the
2206 * result is multiplied by the element size. This includes the not found
2207 * indication; e.g. not found for esz=3 is -8.
2208 */
2209int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2210{
2211 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2212 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2213
2214 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2215}
b48ff240
RH
2216
2217void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2218{
2219 intptr_t opr_sz = simd_oprsz(desc) / 8;
2220 int esz = simd_data(desc);
2221 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2222 intptr_t i, first_i, last_i;
2223 ARMVectorReg tmp;
2224
2225 first_i = last_i = 0;
2226 first_g = last_g = 0;
2227
2228 /* Find the extent of the active elements within VG. */
2229 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2230 pg = *(uint64_t *)(vg + i) & mask;
2231 if (pg) {
2232 if (last_g == 0) {
2233 last_g = pg;
2234 last_i = i;
2235 }
2236 first_g = pg;
2237 first_i = i;
2238 }
2239 }
2240
2241 len = 0;
2242 if (first_g != 0) {
2243 first_i = first_i * 8 + ctz64(first_g);
2244 last_i = last_i * 8 + 63 - clz64(last_g);
2245 len = last_i - first_i + (1 << esz);
2246 if (vd == vm) {
2247 vm = memcpy(&tmp, vm, opr_sz * 8);
2248 }
2249 swap_memmove(vd, vn + first_i, len);
2250 }
2251 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2252}
d3fe4a29
RH
2253
2254void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2255 void *vg, uint32_t desc)
2256{
2257 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2258 uint64_t *d = vd, *n = vn, *m = vm;
2259 uint8_t *pg = vg;
2260
2261 for (i = 0; i < opr_sz; i += 1) {
2262 uint64_t nn = n[i], mm = m[i];
2263 uint64_t pp = expand_pred_b(pg[H1(i)]);
2264 d[i] = (nn & pp) | (mm & ~pp);
2265 }
2266}
2267
2268void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2269 void *vg, uint32_t desc)
2270{
2271 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2272 uint64_t *d = vd, *n = vn, *m = vm;
2273 uint8_t *pg = vg;
2274
2275 for (i = 0; i < opr_sz; i += 1) {
2276 uint64_t nn = n[i], mm = m[i];
2277 uint64_t pp = expand_pred_h(pg[H1(i)]);
2278 d[i] = (nn & pp) | (mm & ~pp);
2279 }
2280}
2281
2282void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2283 void *vg, uint32_t desc)
2284{
2285 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2286 uint64_t *d = vd, *n = vn, *m = vm;
2287 uint8_t *pg = vg;
2288
2289 for (i = 0; i < opr_sz; i += 1) {
2290 uint64_t nn = n[i], mm = m[i];
2291 uint64_t pp = expand_pred_s(pg[H1(i)]);
2292 d[i] = (nn & pp) | (mm & ~pp);
2293 }
2294}
2295
2296void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2297 void *vg, uint32_t desc)
2298{
2299 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2300 uint64_t *d = vd, *n = vn, *m = vm;
2301 uint8_t *pg = vg;
2302
2303 for (i = 0; i < opr_sz; i += 1) {
2304 uint64_t nn = n[i], mm = m[i];
2305 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2306 }
2307}
757f9cff
RH
2308
2309/* Two-operand comparison controlled by a predicate.
2310 * ??? It is very tempting to expand this inline
2311 * with x86 instructions, e.g.
2312 *
2313 * vcmpeqw zm, zn, %ymm0
2314 * vpmovmskb %ymm0, %eax
2315 * and $0x5555, %eax
2316 * and pg, %eax
2317 *
2318 * or even aarch64, e.g.
2319 *
2320 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2321 * cmeq v0.8h, zn, zm
2322 * and v0.8h, v0.8h, mask
2323 * addv h0, v0.8h
2324 * and v0.8b, pg
2325 *
2326 * However, coming up with an abstraction that allows vector inputs and
2327 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2328 * scalar outputs, is tricky.
2329 */
2330#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2331uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2332{ \
2333 intptr_t opr_sz = simd_oprsz(desc); \
2334 uint32_t flags = PREDTEST_INIT; \
2335 intptr_t i = opr_sz; \
2336 do { \
2337 uint64_t out = 0, pg; \
2338 do { \
2339 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2340 TYPE nn = *(TYPE *)(vn + H(i)); \
2341 TYPE mm = *(TYPE *)(vm + H(i)); \
2342 out |= nn OP mm; \
2343 } while (i & 63); \
2344 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2345 out &= pg; \
2346 *(uint64_t *)(vd + (i >> 3)) = out; \
2347 flags = iter_predtest_bwd(out, pg, flags); \
2348 } while (i > 0); \
2349 return flags; \
2350}
2351
2352#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2353 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2354#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2355 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2356#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2357 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2358#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2359 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2360
2361DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2362DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2363DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2364DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2365
2366DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2367DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2368DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2369DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2370
2371DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2372DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2373DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2374DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2375
2376DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2377DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2378DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2379DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2380
2381DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2382DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2383DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2384DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2385
2386DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2387DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2388DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2389DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2390
2391#undef DO_CMP_PPZZ_B
2392#undef DO_CMP_PPZZ_H
2393#undef DO_CMP_PPZZ_S
2394#undef DO_CMP_PPZZ_D
2395#undef DO_CMP_PPZZ
2396
2397/* Similar, but the second source is "wide". */
2398#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2399uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2400{ \
2401 intptr_t opr_sz = simd_oprsz(desc); \
2402 uint32_t flags = PREDTEST_INIT; \
2403 intptr_t i = opr_sz; \
2404 do { \
2405 uint64_t out = 0, pg; \
2406 do { \
2407 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2408 do { \
2409 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2410 TYPE nn = *(TYPE *)(vn + H(i)); \
2411 out |= nn OP mm; \
2412 } while (i & 7); \
2413 } while (i & 63); \
2414 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2415 out &= pg; \
2416 *(uint64_t *)(vd + (i >> 3)) = out; \
2417 flags = iter_predtest_bwd(out, pg, flags); \
2418 } while (i > 0); \
2419 return flags; \
2420}
2421
2422#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2423 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2424#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2425 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2426#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2427 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2428
2429DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
2430DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
2431DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
2432
2433DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
2434DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
2435DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
2436
2437DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2438DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2439DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2440
2441DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2442DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2443DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2444
2445DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2446DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2447DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2448
2449DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2450DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2451DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2452
2453DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2454DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2455DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2456
2457DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2458DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2459DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2460
2461DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2462DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2463DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2464
2465DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2466DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2467DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2468
2469#undef DO_CMP_PPZW_B
2470#undef DO_CMP_PPZW_H
2471#undef DO_CMP_PPZW_S
2472#undef DO_CMP_PPZW
38cadeba
RH
2473
2474/* Similar, but the second source is immediate. */
2475#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2476uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2477{ \
2478 intptr_t opr_sz = simd_oprsz(desc); \
2479 uint32_t flags = PREDTEST_INIT; \
2480 TYPE mm = simd_data(desc); \
2481 intptr_t i = opr_sz; \
2482 do { \
2483 uint64_t out = 0, pg; \
2484 do { \
2485 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2486 TYPE nn = *(TYPE *)(vn + H(i)); \
2487 out |= nn OP mm; \
2488 } while (i & 63); \
2489 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2490 out &= pg; \
2491 *(uint64_t *)(vd + (i >> 3)) = out; \
2492 flags = iter_predtest_bwd(out, pg, flags); \
2493 } while (i > 0); \
2494 return flags; \
2495}
2496
2497#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2498 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2499#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2500 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2501#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2502 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2503#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2504 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2505
2506DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2507DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2508DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2509DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2510
2511DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2512DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2513DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2514DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2515
2516DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2517DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2518DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2519DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2520
2521DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2522DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2523DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2524DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2525
2526DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2527DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2528DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2529DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2530
2531DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2532DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2533DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2534DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2535
2536DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2537DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2538DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2539DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2540
2541DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2542DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2543DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2544DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2545
2546DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2547DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2548DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2549DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2550
2551DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2552DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2553DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2554DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2555
2556#undef DO_CMP_PPZI_B
2557#undef DO_CMP_PPZI_H
2558#undef DO_CMP_PPZI_S
2559#undef DO_CMP_PPZI_D
2560#undef DO_CMP_PPZI
35da316f
RH
2561
2562/* Similar to the ARM LastActive pseudocode function. */
2563static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2564{
2565 intptr_t i;
2566
2567 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2568 uint64_t pg = *(uint64_t *)(vg + i);
2569 if (pg) {
2570 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2571 }
2572 }
2573 return 0;
2574}
2575
2576/* Compute a mask into RETB that is true for all G, up to and including
2577 * (if after) or excluding (if !after) the first G & N.
2578 * Return true if BRK found.
2579 */
2580static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2581 bool brk, bool after)
2582{
2583 uint64_t b;
2584
2585 if (brk) {
2586 b = 0;
2587 } else if ((g & n) == 0) {
2588 /* For all G, no N are set; break not found. */
2589 b = g;
2590 } else {
2591 /* Break somewhere in N. Locate it. */
2592 b = g & n; /* guard true, pred true */
2593 b = b & -b; /* first such */
2594 if (after) {
2595 b = b | (b - 1); /* break after same */
2596 } else {
2597 b = b - 1; /* break before same */
2598 }
2599 brk = true;
2600 }
2601
2602 *retb = b;
2603 return brk;
2604}
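/* For example, with g == 0xff and n == 0x10 (first break at element 4):
 * after == true gives b == 0x1f (break element included), after == false
 * gives b == 0x0f (break element excluded); both return true so that the
 * break propagates into subsequent words.
 */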
2605
2606/* Compute a zeroing BRK. */
2607static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2608 intptr_t oprsz, bool after)
2609{
2610 bool brk = false;
2611 intptr_t i;
2612
2613 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2614 uint64_t this_b, this_g = g[i];
2615
2616 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2617 d[i] = this_b & this_g;
2618 }
2619}
2620
2621/* Likewise, but also compute flags. */
2622static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2623 intptr_t oprsz, bool after)
2624{
2625 uint32_t flags = PREDTEST_INIT;
2626 bool brk = false;
2627 intptr_t i;
2628
2629 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2630 uint64_t this_b, this_d, this_g = g[i];
2631
2632 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2633 d[i] = this_d = this_b & this_g;
2634 flags = iter_predtest_fwd(this_d, this_g, flags);
2635 }
2636 return flags;
2637}
2638
2639/* Compute a merging BRK. */
2640static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2641 intptr_t oprsz, bool after)
2642{
2643 bool brk = false;
2644 intptr_t i;
2645
2646 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2647 uint64_t this_b, this_g = g[i];
2648
2649 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2650 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2651 }
2652}
2653
2654/* Likewise, but also compute flags. */
2655static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2656 intptr_t oprsz, bool after)
2657{
2658 uint32_t flags = PREDTEST_INIT;
2659 bool brk = false;
2660 intptr_t i;
2661
2662 for (i = 0; i < oprsz / 8; ++i) {
2663 uint64_t this_b, this_d = d[i], this_g = g[i];
2664
2665 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2666 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2667 flags = iter_predtest_fwd(this_d, this_g, flags);
2668 }
2669 return flags;
2670}
2671
2672static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2673{
2674 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2675 * The compiler should turn this into 4 64-bit integer stores.
2676 */
2677 memset(d, 0, sizeof(ARMPredicateReg));
2678 return PREDTEST_INIT;
2679}
2680
2681void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2682 uint32_t pred_desc)
2683{
2684 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2685 if (last_active_pred(vn, vg, oprsz)) {
2686 compute_brk_z(vd, vm, vg, oprsz, true);
2687 } else {
2688 do_zero(vd, oprsz);
2689 }
2690}
2691
2692uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2693 uint32_t pred_desc)
2694{
2695 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2696 if (last_active_pred(vn, vg, oprsz)) {
2697 return compute_brks_z(vd, vm, vg, oprsz, true);
2698 } else {
2699 return do_zero(vd, oprsz);
2700 }
2701}
2702
2703void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2704 uint32_t pred_desc)
2705{
2706 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2707 if (last_active_pred(vn, vg, oprsz)) {
2708 compute_brk_z(vd, vm, vg, oprsz, false);
2709 } else {
2710 do_zero(vd, oprsz);
2711 }
2712}
2713
2714uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2715 uint32_t pred_desc)
2716{
2717 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2718 if (last_active_pred(vn, vg, oprsz)) {
2719 return compute_brks_z(vd, vm, vg, oprsz, false);
2720 } else {
2721 return do_zero(vd, oprsz);
2722 }
2723}
2724
2725void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2726{
2727 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2728 compute_brk_z(vd, vn, vg, oprsz, true);
2729}
2730
2731uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2732{
2733 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2734 return compute_brks_z(vd, vn, vg, oprsz, true);
2735}
2736
2737void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2738{
2739 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2740 compute_brk_z(vd, vn, vg, oprsz, false);
2741}
2742
2743uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2744{
2745 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2746 return compute_brks_z(vd, vn, vg, oprsz, false);
2747}
2748
2749void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2750{
2751 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2752 compute_brk_m(vd, vn, vg, oprsz, true);
2753}
2754
2755uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2756{
2757 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2758 return compute_brks_m(vd, vn, vg, oprsz, true);
2759}
2760
2761void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2762{
2763 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2764 compute_brk_m(vd, vn, vg, oprsz, false);
2765}
2766
2767uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2768{
2769 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2770 return compute_brks_m(vd, vn, vg, oprsz, false);
2771}
2772
2773void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2774{
2775 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2776
2777 if (!last_active_pred(vn, vg, oprsz)) {
2778 do_zero(vd, oprsz);
2779 }
2780}
2781
2782/* As if PredTest(Ones(PL), D, esz). */
2783static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2784 uint64_t esz_mask)
2785{
2786 uint32_t flags = PREDTEST_INIT;
2787 intptr_t i;
2788
2789 for (i = 0; i < oprsz / 8; i++) {
2790 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2791 }
2792 if (oprsz & 7) {
2793 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2794 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2795 }
2796 return flags;
2797}
2798
2799uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2800{
2801 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2802
2803 if (last_active_pred(vn, vg, oprsz)) {
2804 return predtest_ones(vd, oprsz, -1);
2805 } else {
2806 return do_zero(vd, oprsz);
2807 }
2808}
9ee3a611
RH
2809
2810uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2811{
2812 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2813 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2814 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2815 intptr_t i;
2816
2817 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2818 uint64_t t = n[i] & g[i] & mask;
2819 sum += ctpop64(t);
2820 }
2821 return sum;
2822}
caf1cefc
RH
2823
2824uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2825{
2826 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2827 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2828 uint64_t esz_mask = pred_esz_masks[esz];
2829 ARMPredicateReg *d = vd;
2830 uint32_t flags;
2831 intptr_t i;
2832
2833 /* Begin with a zero predicate register. */
2834 flags = do_zero(d, oprsz);
2835 if (count == 0) {
2836 return flags;
2837 }
2838
2839 /* Scale from predicate element count to bits. */
2840 count <<= esz;
2841 /* Bound to the bits in the predicate. */
2842 count = MIN(count, oprsz * 8);
2843
2844 /* Set all of the requested bits. */
2845 for (i = 0; i < count / 64; ++i) {
2846 d->p[i] = esz_mask;
2847 }
2848 if (count & 63) {
2849 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2850 }
2851
2852 return predtest_ones(d, oprsz, esz_mask);
2853}
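/* For example, count == 3 with esz == 1 (.H elements) scales to 6 bits,
 * so d->p[0] == MAKE_64BIT_MASK(0, 6) & pred_esz_masks[1] == 0x15,
 * i.e. predicate bits 0, 2 and 4 set.
 */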
c4e7c493 2854
7f9ddf64
RH
2855uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2856 void *status, uint32_t desc)
2857{
2858 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2859 float16 result = nn;
2860
2861 do {
2862 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2863 do {
2864 if (pg & 1) {
2865 float16 mm = *(float16 *)(vm + H1_2(i));
2866 result = float16_add(result, mm, status);
2867 }
2868 i += sizeof(float16), pg >>= sizeof(float16);
2869 } while (i & 15);
2870 } while (i < opr_sz);
2871
2872 return result;
2873}
2874
2875uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2876 void *status, uint32_t desc)
2877{
2878 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2879 float32 result = nn;
2880
2881 do {
2882 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2883 do {
2884 if (pg & 1) {
2885 float32 mm = *(float32 *)(vm + H1_2(i));
2886 result = float32_add(result, mm, status);
2887 }
2888 i += sizeof(float32), pg >>= sizeof(float32);
2889 } while (i & 15);
2890 } while (i < opr_sz);
2891
2892 return result;
2893}
2894
2895uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2896 void *status, uint32_t desc)
2897{
2898 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2899 uint64_t *m = vm;
2900 uint8_t *pg = vg;
2901
2902 for (i = 0; i < opr_sz; i++) {
2903 if (pg[H1(i)] & 1) {
2904 nn = float64_add(nn, m[i], status);
2905 }
2906 }
2907
2908 return nn;
2909}
2910
ec3b87c2
RH
2911/* Fully general three-operand expander, controlled by a predicate,
2912 * with the extra float_status parameter.
2913 */
2914#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2915void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2916 void *status, uint32_t desc) \
2917{ \
2918 intptr_t i = simd_oprsz(desc); \
2919 uint64_t *g = vg; \
2920 do { \
2921 uint64_t pg = g[(i - 1) >> 6]; \
2922 do { \
2923 i -= sizeof(TYPE); \
2924 if (likely((pg >> (i & 63)) & 1)) { \
2925 TYPE nn = *(TYPE *)(vn + H(i)); \
2926 TYPE mm = *(TYPE *)(vm + H(i)); \
2927 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
2928 } \
2929 } while (i & 63); \
2930 } while (i != 0); \
2931}
2932
2933DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
2934DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
2935DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
2936
2937DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
2938DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
2939DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
2940
2941DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
2942DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
2943DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
2944
2945DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
2946DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
2947DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
2948
2949DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
2950DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
2951DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
2952
2953DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
2954DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
2955DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
2956
2957DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
2958DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
2959DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
2960
2961DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
2962DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
2963DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
2964
2965static inline float16 abd_h(float16 a, float16 b, float_status *s)
2966{
2967 return float16_abs(float16_sub(a, b, s));
2968}
2969
2970static inline float32 abd_s(float32 a, float32 b, float_status *s)
2971{
2972 return float32_abs(float32_sub(a, b, s));
2973}
2974
2975static inline float64 abd_d(float64 a, float64 b, float_status *s)
2976{
2977 return float64_abs(float64_sub(a, b, s));
2978}
2979
2980DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
2981DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
2982DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
2983
2984static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
2985{
2986 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
2987 return float64_scalbn(a, b_int, s);
2988}
2989
2990DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
2991DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
2992DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
2993
2994DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
2995DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
2996DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
2997
2998#undef DO_ZPZZ_FP
2999
8092c6a3
RH
3000/* Fully general two-operand expander, controlled by a predicate,
3001 * with the extra float_status parameter.
3002 */
3003#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3004void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3005{ \
3006 intptr_t i = simd_oprsz(desc); \
3007 uint64_t *g = vg; \
3008 do { \
3009 uint64_t pg = g[(i - 1) >> 6]; \
3010 do { \
3011 i -= sizeof(TYPE); \
3012 if (likely((pg >> (i & 63)) & 1)) { \
3013 TYPE nn = *(TYPE *)(vn + H(i)); \
3014 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3015 } \
3016 } while (i & 63); \
3017 } while (i != 0); \
3018}
3019
3020DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3021DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3022DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3023DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3024DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3025DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3026DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3027
3028DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3029DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3030DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3031DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3032DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3033DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3034DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3035
3036#undef DO_ZPZ_FP
3037
6ceabaad
RH
3038/* 4-operand predicated multiply-add. This requires 7 operands to pass
3039 * "properly", so we need to encode some of the registers into DESC.
3040 */
3041QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
3042
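/* The register numbers are packed into DESC as four 5-bit fields:
 * bits [4:0] of the data field hold RD, [9:5] RN, [14:10] RM and
 * [19:15] RA, as extracted below; the assertion above checks that
 * these 20 bits fit within the 32-bit descriptor.
 */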
3043static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3044 uint16_t neg1, uint16_t neg3)
3045{
3046 intptr_t i = simd_oprsz(desc);
3047 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3048 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3049 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3050 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3051 void *vd = &env->vfp.zregs[rd];
3052 void *vn = &env->vfp.zregs[rn];
3053 void *vm = &env->vfp.zregs[rm];
3054 void *va = &env->vfp.zregs[ra];
3055 uint64_t *g = vg;
3056
3057 do {
3058 uint64_t pg = g[(i - 1) >> 6];
3059 do {
3060 i -= 2;
3061 if (likely((pg >> (i & 63)) & 1)) {
3062 float16 e1, e2, e3, r;
3063
3064 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3065 e2 = *(uint16_t *)(vm + H1_2(i));
3066 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3067 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3068 *(uint16_t *)(vd + H1_2(i)) = r;
3069 }
3070 } while (i & 63);
3071 } while (i != 0);
3072}
3073
3074void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3075{
3076 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3077}
3078
3079void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3080{
3081 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3082}
3083
3084void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3085{
3086 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3087}
3088
3089void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3090{
3091 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3092}
3093
3094static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3095 uint32_t neg1, uint32_t neg3)
3096{
3097 intptr_t i = simd_oprsz(desc);
3098 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3099 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3100 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3101 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3102 void *vd = &env->vfp.zregs[rd];
3103 void *vn = &env->vfp.zregs[rn];
3104 void *vm = &env->vfp.zregs[rm];
3105 void *va = &env->vfp.zregs[ra];
3106 uint64_t *g = vg;
3107
3108 do {
3109 uint64_t pg = g[(i - 1) >> 6];
3110 do {
3111 i -= 4;
3112 if (likely((pg >> (i & 63)) & 1)) {
3113 float32 e1, e2, e3, r;
3114
3115 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3116 e2 = *(uint32_t *)(vm + H1_4(i));
3117 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3118 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3119 *(uint32_t *)(vd + H1_4(i)) = r;
3120 }
3121 } while (i & 63);
3122 } while (i != 0);
3123}
3124
3125void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3126{
3127 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3128}
3129
3130void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3131{
3132 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3133}
3134
3135void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3136{
3137 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3138}
3139
3140void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3141{
3142 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3143}
3144
3145static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3146 uint64_t neg1, uint64_t neg3)
3147{
3148 intptr_t i = simd_oprsz(desc);
3149 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3150 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3151 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3152 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3153 void *vd = &env->vfp.zregs[rd];
3154 void *vn = &env->vfp.zregs[rn];
3155 void *vm = &env->vfp.zregs[rm];
3156 void *va = &env->vfp.zregs[ra];
3157 uint64_t *g = vg;
3158
3159 do {
3160 uint64_t pg = g[(i - 1) >> 6];
3161 do {
3162 i -= 8;
3163 if (likely((pg >> (i & 63)) & 1)) {
3164 float64 e1, e2, e3, r;
3165
3166 e1 = *(uint64_t *)(vn + i) ^ neg1;
3167 e2 = *(uint64_t *)(vm + i);
3168 e3 = *(uint64_t *)(va + i) ^ neg3;
3169 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3170 *(uint64_t *)(vd + i) = r;
3171 }
3172 } while (i & 63);
3173 } while (i != 0);
3174}
3175
3176void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3177{
3178 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3179}
3180
3181void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3182{
3183 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3184}
3185
3186void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3187{
3188 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3189}
3190
3191void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3192{
3193 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3194}
3195
c4e7c493
RH
3196/*
3197 * Load contiguous data, protected by a governing predicate.
3198 */
3199#define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
3200static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
3201 target_ulong addr, intptr_t oprsz, \
3202 uintptr_t ra) \
3203{ \
3204 intptr_t i = 0; \
3205 do { \
3206 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3207 do { \
3208 TYPEM m = 0; \
3209 if (pg & 1) { \
3210 m = FN(env, addr, ra); \
3211 } \
3212 *(TYPEE *)(vd + H(i)) = m; \
3213 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3214 addr += sizeof(TYPEM); \
3215 } while (i & 15); \
3216 } while (i < oprsz); \
3217} \
3218void HELPER(NAME)(CPUARMState *env, void *vg, \
3219 target_ulong addr, uint32_t desc) \
3220{ \
3221 do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
3222 addr, simd_oprsz(desc), GETPC()); \
3223}
3224
3225#define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
3226void HELPER(NAME)(CPUARMState *env, void *vg, \
3227 target_ulong addr, uint32_t desc) \
3228{ \
3229 intptr_t i, oprsz = simd_oprsz(desc); \
3230 intptr_t ra = GETPC(); \
3231 unsigned rd = simd_data(desc); \
3232 void *d1 = &env->vfp.zregs[rd]; \
3233 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3234 for (i = 0; i < oprsz; ) { \
3235 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3236 do { \
3237 TYPEM m1 = 0, m2 = 0; \
3238 if (pg & 1) { \
3239 m1 = FN(env, addr, ra); \
3240 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3241 } \
3242 *(TYPEE *)(d1 + H(i)) = m1; \
3243 *(TYPEE *)(d2 + H(i)) = m2; \
3244 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3245 addr += 2 * sizeof(TYPEM); \
3246 } while (i & 15); \
3247 } \
3248}
3249
3250#define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
3251void HELPER(NAME)(CPUARMState *env, void *vg, \
3252 target_ulong addr, uint32_t desc) \
3253{ \
3254 intptr_t i, oprsz = simd_oprsz(desc); \
3255 intptr_t ra = GETPC(); \
3256 unsigned rd = simd_data(desc); \
3257 void *d1 = &env->vfp.zregs[rd]; \
3258 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3259 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3260 for (i = 0; i < oprsz; ) { \
3261 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3262 do { \
3263 TYPEM m1 = 0, m2 = 0, m3 = 0; \
3264 if (pg & 1) { \
3265 m1 = FN(env, addr, ra); \
3266 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3267 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
3268 } \
3269 *(TYPEE *)(d1 + H(i)) = m1; \
3270 *(TYPEE *)(d2 + H(i)) = m2; \
3271 *(TYPEE *)(d3 + H(i)) = m3; \
3272 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3273 addr += 3 * sizeof(TYPEM); \
3274 } while (i & 15); \
3275 } \
3276}
3277
3278#define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
3279void HELPER(NAME)(CPUARMState *env, void *vg, \
3280 target_ulong addr, uint32_t desc) \
3281{ \
3282 intptr_t i, oprsz = simd_oprsz(desc); \
3283 intptr_t ra = GETPC(); \
3284 unsigned rd = simd_data(desc); \
3285 void *d1 = &env->vfp.zregs[rd]; \
3286 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3287 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3288 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
3289 for (i = 0; i < oprsz; ) { \
3290 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3291 do { \
3292 TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
3293 if (pg & 1) { \
3294 m1 = FN(env, addr, ra); \
3295 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3296 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
3297 m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
3298 } \
3299 *(TYPEE *)(d1 + H(i)) = m1; \
3300 *(TYPEE *)(d2 + H(i)) = m2; \
3301 *(TYPEE *)(d3 + H(i)) = m3; \
3302 *(TYPEE *)(d4 + H(i)) = m4; \
3303 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3304 addr += 4 * sizeof(TYPEM); \
3305 } while (i & 15); \
3306 } \
3307}
3308
3309DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
3310DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
3311DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
3312DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
3313DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
3314DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
3315
3316DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
3317DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
3318DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
3319DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
3320
3321DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
3322DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
3323
3324DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3325DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3326DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3327DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3328
3329DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3330DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3331DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3332DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3333
3334DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3335DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3336DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3337DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3338
3339DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3340DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3341DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3342DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3343
3344#undef DO_LD1
3345#undef DO_LD2
3346#undef DO_LD3
3347#undef DO_LD4
e2654d75
RH
3348
3349/*
3350 * Load contiguous data, first-fault and no-fault.
3351 */
3352
3353#ifdef CONFIG_USER_ONLY
3354
3355/* Fault on byte I. All bits in FFR from I are cleared. The vector
3356 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
3357 * option, which leaves subsequent data unchanged.
3358 */
3359static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
3360{
3361 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
3362
3363 if (i & 63) {
3364 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
3365 i = ROUND_UP(i, 64);
3366 }
3367 for (; i < oprsz; i += 64) {
3368 ffr[i / 64] = 0;
3369 }
3370}
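/* For example, a fault at byte i == 70 with oprsz == 256 clears FFR bits
 * 70..127 via ffr[1] &= MAKE_64BIT_MASK(0, 6) and then zeroes ffr[2] and
 * ffr[3] in the loop; bits below 70 are left untouched.
 */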
3371
3372/* Hold the mmap lock during the operation so that there is no race
3373 * between page_check_range and the load operation. We expect the
3374 * usual case to have no faults at all, so we check the whole range
3375 * first and if successful defer to the normal load operation.
3376 *
3377 * TODO: Change mmap_lock to a rwlock so that multiple readers
3378 * can run simultaneously. This will probably help other uses
3379 * within QEMU as well.
3380 */
3381#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
3382static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
3383 target_ulong addr, intptr_t oprsz, \
3384 bool first, uintptr_t ra) \
3385{ \
3386 intptr_t i = 0; \
3387 do { \
3388 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3389 do { \
3390 TYPEM m = 0; \
3391 if (pg & 1) { \
3392 if (!first && \
3393 unlikely(page_check_range(addr, sizeof(TYPEM), \
3394 PAGE_READ))) { \
3395 record_fault(env, i, oprsz); \
3396 return; \
3397 } \
3398 m = FN(env, addr, ra); \
3399 first = false; \
3400 } \
3401 *(TYPEE *)(vd + H(i)) = m; \
3402 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3403 addr += sizeof(TYPEM); \
3404 } while (i & 15); \
3405 } while (i < oprsz); \
3406} \
3407void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
3408 target_ulong addr, uint32_t desc) \
3409{ \
3410 intptr_t oprsz = simd_oprsz(desc); \
3411 unsigned rd = simd_data(desc); \
3412 void *vd = &env->vfp.zregs[rd]; \
3413 mmap_lock(); \
3414 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
3415 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
3416 } else { \
3417 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
3418 } \
3419 mmap_unlock(); \
3420}
3421
3422/* No-fault loads are like first-fault loads without the
3423 * first faulting special case.
3424 */
3425#define DO_LDNF1(PART) \
3426void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
3427 target_ulong addr, uint32_t desc) \
3428{ \
3429 intptr_t oprsz = simd_oprsz(desc); \
3430 unsigned rd = simd_data(desc); \
3431 void *vd = &env->vfp.zregs[rd]; \
3432 mmap_lock(); \
3433 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
3434 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
3435 } else { \
3436 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
3437 } \
3438 mmap_unlock(); \
3439}
3440
3441#else
3442
3443/* TODO: System mode is not yet supported.
3444 * This would probably use tlb_vaddr_to_host.
3445 */
3446#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
3447void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
3448 target_ulong addr, uint32_t desc) \
3449{ \
3450 g_assert_not_reached(); \
3451}
3452
3453#define DO_LDNF1(PART) \
3454void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
3455 target_ulong addr, uint32_t desc) \
3456{ \
3457 g_assert_not_reached(); \
3458}
3459
3460#endif
3461
3462DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
3463DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
3464DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
3465DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
3466DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
3467DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
3468DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
3469
3470DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
3471DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
3472DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
3473DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
3474DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
3475
3476DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
3477DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
3478DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
3479
3480DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
3481
3482#undef DO_LDFF1
3483
3484DO_LDNF1(bb_r)
3485DO_LDNF1(bhu_r)
3486DO_LDNF1(bhs_r)
3487DO_LDNF1(bsu_r)
3488DO_LDNF1(bss_r)
3489DO_LDNF1(bdu_r)
3490DO_LDNF1(bds_r)
3491
3492DO_LDNF1(hh_r)
3493DO_LDNF1(hsu_r)
3494DO_LDNF1(hss_r)
3495DO_LDNF1(hdu_r)
3496DO_LDNF1(hds_r)
3497
3498DO_LDNF1(ss_r)
3499DO_LDNF1(sdu_r)
3500DO_LDNF1(sds_r)
3501
3502DO_LDNF1(dd_r)
3503
3504#undef DO_LDNF1
1a039c7e
RH
3505
3506/*
3507 * Store contiguous data, protected by a governing predicate.
3508 */
3509#define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
3510void HELPER(NAME)(CPUARMState *env, void *vg, \
3511 target_ulong addr, uint32_t desc) \
3512{ \
3513 intptr_t i, oprsz = simd_oprsz(desc); \
3514 intptr_t ra = GETPC(); \
3515 unsigned rd = simd_data(desc); \
3516 void *vd = &env->vfp.zregs[rd]; \
3517 for (i = 0; i < oprsz; ) { \
3518 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3519 do { \
3520 if (pg & 1) { \
3521 TYPEM m = *(TYPEE *)(vd + H(i)); \
3522 FN(env, addr, m, ra); \
3523 } \
3524 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3525 addr += sizeof(TYPEM); \
3526 } while (i & 15); \
3527 } \
3528}
3529
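/* With 64-bit elements, each element's governing predicate flag lives in
 * its own predicate byte, so the doubleword forms simply test one
 * predicate byte per element.
 */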
3530#define DO_ST1_D(NAME, FN, TYPEM) \
3531void HELPER(NAME)(CPUARMState *env, void *vg, \
3532 target_ulong addr, uint32_t desc) \
3533{ \
3534 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
3535 intptr_t ra = GETPC(); \
3536 unsigned rd = simd_data(desc); \
3537 uint64_t *d = &env->vfp.zregs[rd].d[0]; \
3538 uint8_t *pg = vg; \
3539 for (i = 0; i < oprsz; i += 1) { \
3540 if (pg[H1(i)] & 1) { \
3541 FN(env, addr, d[i], ra); \
3542 } \
3543 addr += sizeof(TYPEM); \
3544 } \
3545}
3546
3547#define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
3548void HELPER(NAME)(CPUARMState *env, void *vg, \
3549 target_ulong addr, uint32_t desc) \
3550{ \
3551 intptr_t i, oprsz = simd_oprsz(desc); \
3552 intptr_t ra = GETPC(); \
3553 unsigned rd = simd_data(desc); \
3554 void *d1 = &env->vfp.zregs[rd]; \
3555 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3556 for (i = 0; i < oprsz; ) { \
3557 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3558 do { \
3559 if (pg & 1) { \
3560 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
3561 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
3562 FN(env, addr, m1, ra); \
3563 FN(env, addr + sizeof(TYPEM), m2, ra); \
3564 } \
3565 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3566 addr += 2 * sizeof(TYPEM); \
3567 } while (i & 15); \
3568 } \
3569}
3570
3571#define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \
3572void HELPER(NAME)(CPUARMState *env, void *vg, \
3573 target_ulong addr, uint32_t desc) \
3574{ \
3575 intptr_t i, oprsz = simd_oprsz(desc); \
3576 intptr_t ra = GETPC(); \
3577 unsigned rd = simd_data(desc); \
3578 void *d1 = &env->vfp.zregs[rd]; \
3579 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3580 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3581 for (i = 0; i < oprsz; ) { \
3582 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3583 do { \
3584 if (pg & 1) { \
3585 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
3586 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
3587 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
3588 FN(env, addr, m1, ra); \
3589 FN(env, addr + sizeof(TYPEM), m2, ra); \
3590 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
3591 } \
3592 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3593 addr += 3 * sizeof(TYPEM); \
3594 } while (i & 15); \
3595 } \
3596}
3597
3598#define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \
3599void HELPER(NAME)(CPUARMState *env, void *vg, \
3600 target_ulong addr, uint32_t desc) \
3601{ \
3602 intptr_t i, oprsz = simd_oprsz(desc); \
3603 intptr_t ra = GETPC(); \
3604 unsigned rd = simd_data(desc); \
3605 void *d1 = &env->vfp.zregs[rd]; \
3606 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3607 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3608 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
3609 for (i = 0; i < oprsz; ) { \
3610 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3611 do { \
3612 if (pg & 1) { \
3613 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
3614 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
3615 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
3616 TYPEM m4 = *(TYPEE *)(d4 + H(i)); \
3617 FN(env, addr, m1, ra); \
3618 FN(env, addr + sizeof(TYPEM), m2, ra); \
3619 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
3620 FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
3621 } \
3622 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3623 addr += 4 * sizeof(TYPEM); \
3624 } while (i & 15); \
3625 } \
3626}
3627
3628DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
3629DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
3630DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
3631
3632DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
3633DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
3634
3635DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
3636
3637DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
3638DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
3639DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
3640DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
3641
3642DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
3643DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
3644DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
3645DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
3646
3647DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
3648DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
3649DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
3650DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
3651
3652DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
3653
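/* The 2/3/4-register doubleword stores are open-coded, iterating one
 * element (one predicate byte) per step in the same style as DO_ST1_D
 * above.
 */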
3654void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
3655 target_ulong addr, uint32_t desc)
3656{
3657 intptr_t i, oprsz = simd_oprsz(desc) / 8;
3658 intptr_t ra = GETPC();
3659 unsigned rd = simd_data(desc);
3660 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
3661 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
3662 uint8_t *pg = vg;
3663
3664 for (i = 0; i < oprsz; i += 1) {
3665 if (pg[H1(i)] & 1) {
3666 cpu_stq_data_ra(env, addr, d1[i], ra);
3667 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
3668 }
3669 addr += 2 * 8;
3670 }
3671}
3672
3673void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
3674 target_ulong addr, uint32_t desc)
3675{
3676 intptr_t i, oprsz = simd_oprsz(desc) / 8;
3677 intptr_t ra = GETPC();
3678 unsigned rd = simd_data(desc);
3679 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
3680 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
3681 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
3682 uint8_t *pg = vg;
3683
3684 for (i = 0; i < oprsz; i += 1) {
3685 if (pg[H1(i)] & 1) {
3686 cpu_stq_data_ra(env, addr, d1[i], ra);
3687 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
3688 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
3689 }
3690 addr += 3 * 8;
3691 }
3692}
3693
3694void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
3695 target_ulong addr, uint32_t desc)
3696{
3697 intptr_t i, oprsz = simd_oprsz(desc) / 8;
3698 intptr_t ra = GETPC();
3699 unsigned rd = simd_data(desc);
3700 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
3701 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
3702 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
3703 uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
3704 uint8_t *pg = vg;
3705
3706 for (i = 0; i < oprsz; i += 1) {
3707 if (pg[H1(i)] & 1) {
3708 cpu_stq_data_ra(env, addr, d1[i], ra);
3709 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
3710 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
3711 cpu_stq_data_ra(env, addr + 24, d4[i], ra);
3712 }
3713 addr += 4 * 8;
3714 }
3715}
3716
3717/* Loads with a vector index. */
3718
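/* Each active element is loaded from base + (offset << scale), where the
 * offset comes from the matching element of the index vector vm and is
 * zero- or sign-extended according to TYPEI (the _zsu/_zss/_zd suffixes
 * below denote 32-bit unsigned, 32-bit signed and 64-bit indices).
 * Inactive elements are written as zero to the destination.
 */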
3719#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
3720void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
3721 target_ulong base, uint32_t desc) \
3722{ \
3723 intptr_t i, oprsz = simd_oprsz(desc); \
3724 unsigned scale = simd_data(desc); \
3725 uintptr_t ra = GETPC(); \
3726    for (i = 0; i < oprsz; ) {                                          \
3727 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3728 do { \
3729 TYPEM m = 0; \
3730 if (pg & 1) { \
3731 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
3732 m = FN(env, base + (off << scale), ra); \
3733 } \
3734 *(uint32_t *)(vd + H1_4(i)) = m; \
3735 i += 4, pg >>= 4; \
3736 } while (i & 15); \
3737 } \
3738}
3739
3740#define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
3741void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
3742 target_ulong base, uint32_t desc) \
3743{ \
3744 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
3745 unsigned scale = simd_data(desc); \
3746 uintptr_t ra = GETPC(); \
3747 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
3748 for (i = 0; i < oprsz; i++) { \
3749 TYPEM mm = 0; \
3750 if (pg[H1(i)] & 1) { \
3751 target_ulong off = (TYPEI)m[i]; \
3752 mm = FN(env, base + (off << scale), ra); \
3753 } \
3754 d[i] = mm; \
3755 } \
3756}
3757
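/* For illustration, DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t,
 * cpu_ldub_data_ra) below expands to roughly:
 *
 *   void HELPER(sve_ldbdu_zd)(CPUARMState *env, void *vd, void *vg,
 *                             void *vm, target_ulong base, uint32_t desc)
 *   {
 *       intptr_t i, oprsz = simd_oprsz(desc) / 8;
 *       unsigned scale = simd_data(desc);
 *       uintptr_t ra = GETPC();
 *       uint64_t *d = vd, *m = vm; uint8_t *pg = vg;
 *       for (i = 0; i < oprsz; i++) {
 *           uint8_t mm = 0;
 *           if (pg[H1(i)] & 1) {
 *               target_ulong off = (uint64_t)m[i];
 *               mm = cpu_ldub_data_ra(env, base + (off << scale), ra);
 *           }
 *           d[i] = mm;
 *       }
 *   }
 *
 * i.e. gather bytes from base + (zm.d[i] << scale), zero-extending each
 * into the corresponding 64-bit element of zd.
 */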
3758DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
3759DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
3760DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
3761DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
3762DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
3763
3764DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
3765DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
3766DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
3767DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
3768DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
3769
3770DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
3771DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
3772DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
3773DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
3774DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
3775DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
3776DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
3777
3778DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
3779DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
3780DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
3781DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
3782DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
3783DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
3784DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
3785
3786DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
3787DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
3788DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
3789DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
3790DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
3791DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
3792DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
3793
3794/* Stores with a vector index. */
3795
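/* The mirror image of the gather loads: each active element of vd is
 * stored to base + (offset << scale); inactive elements perform no
 * memory access at all.
 */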
3796#define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
3797void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
3798 target_ulong base, uint32_t desc) \
3799{ \
3800 intptr_t i, oprsz = simd_oprsz(desc); \
3801 unsigned scale = simd_data(desc); \
3802 uintptr_t ra = GETPC(); \
3803 for (i = 0; i < oprsz; ) { \
3804 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3805 do { \
3806 if (likely(pg & 1)) { \
3807 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
3808 uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
3809 FN(env, base + (off << scale), d, ra); \
3810 } \
3811 i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
3812 } while (i & 15); \
3813 } \
3814}
3815
3816#define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
3817void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
3818 target_ulong base, uint32_t desc) \
3819{ \
3820 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
3821 unsigned scale = simd_data(desc); \
3822 uintptr_t ra = GETPC(); \
3823 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
3824 for (i = 0; i < oprsz; i++) { \
3825 if (likely(pg[H1(i)] & 1)) { \
3826 target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
3827 FN(env, base + off, d[i], ra); \
3828 } \
3829 } \
3830}
3831
3832DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
3833DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
3834DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
3835
3836DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
3837DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
3838DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
3839
3840DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
3841DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
3842DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
3843DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
3844
3845DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
3846DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
3847DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
3848DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
3849
3850DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
3851DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
3852DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
3853DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)