1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
 22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/cpu_ldst.h"
25#include "exec/helper-proto.h"
26#include "tcg/tcg-gvec-desc.h"
 27#include "fpu/softfloat.h"
28
29
30/* Note that vector data is stored in host-endian 64-bit chunks,
 31 so addressing units smaller than that need a host-endian fixup. */
32#ifdef HOST_WORDS_BIGENDIAN
33#define H1(x) ((x) ^ 7)
34#define H1_2(x) ((x) ^ 6)
35#define H1_4(x) ((x) ^ 4)
36#define H2(x) ((x) ^ 3)
37#define H4(x) ((x) ^ 1)
38#else
39#define H1(x) (x)
40#define H1_2(x) (x)
41#define H1_4(x) (x)
42#define H2(x) (x)
43#define H4(x) (x)
44#endif
45
46/* Return a value for NZCV as per the ARM PredTest pseudofunction.
47 *
48 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
49 * and bit 0 set if C is set. Compare the definitions of these variables
50 * within CPUARMState.
51 */
52
53/* For no G bits set, NZCV = C. */
54#define PREDTEST_INIT 1
55
56/* This is an iterative function, called for each Pd and Pg word
57 * moving forward.
58 */
59static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
60{
61 if (likely(g)) {
62 /* Compute N from first D & G.
63 Use bit 2 to signal first G bit seen. */
64 if (!(flags & 4)) {
65 flags |= ((d & (g & -g)) != 0) << 31;
66 flags |= 4;
67 }
68
69 /* Accumulate Z from each D & G. */
70 flags |= ((d & g) != 0) << 1;
71
72 /* Compute C from last !(D & G). Replace previous. */
73 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
74 }
75 return flags;
76}
77
78/* This is an iterative function, called for each Pd and Pg word
79 * moving backward.
80 */
81static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
82{
83 if (likely(g)) {
84 /* Compute C from first (i.e last) !(D & G).
85 Use bit 2 to signal first G bit seen. */
86 if (!(flags & 4)) {
87 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
88 flags |= (d & pow2floor(g)) == 0;
89 }
90
91 /* Accumulate Z from each D & G. */
92 flags |= ((d & g) != 0) << 1;
93
94 /* Compute N from last (i.e first) D & G. Replace previous. */
95 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
96 }
97 return flags;
98}
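/* A small worked example of the flags encoding above (illustration only,
 * not part of the original source): for a single predicate word with
 * G = 0x11 (elements 0 and 4 active) and D = 0x10 (only element 4 set),
 * iter_predtest_fwd(D, G, PREDTEST_INIT) returns 0x6.  Bit 2 is the
 * internal "first G bit seen" marker, bit 1 is set because some active
 * element of D is set (so Z is clear), bit 31 is clear because the first
 * active element of D is clear (N = 0), and bit 0 is clear because the
 * last active element of D is set (C = 0).
 */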
99
100/* The same for a single word predicate. */
101uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
102{
103 return iter_predtest_fwd(d, g, PREDTEST_INIT);
104}
105
106/* The same for a multi-word predicate. */
107uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
108{
109 uint32_t flags = PREDTEST_INIT;
110 uint64_t *d = vd, *g = vg;
111 uintptr_t i = 0;
112
113 do {
114 flags = iter_predtest_fwd(d[i], g[i], flags);
115 } while (++i < words);
116
117 return flags;
118}
 119
120/* Expand active predicate bits to bytes, for byte elements.
121 * for (i = 0; i < 256; ++i) {
122 * unsigned long m = 0;
123 * for (j = 0; j < 8; j++) {
124 * if ((i >> j) & 1) {
125 * m |= 0xfful << (j << 3);
126 * }
127 * }
128 * printf("0x%016lx,\n", m);
129 * }
130 */
131static inline uint64_t expand_pred_b(uint8_t byte)
132{
133 static const uint64_t word[256] = {
134 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
135 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
136 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
137 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
138 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
139 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
140 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
141 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
142 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
143 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
144 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
145 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
146 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
147 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
148 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
149 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
150 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
151 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
152 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
153 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
154 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
155 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
156 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
157 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
158 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
159 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
160 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
161 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
162 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
163 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
164 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
165 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
166 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
167 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
168 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
169 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
170 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
171 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
172 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
173 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
174 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
175 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
176 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
177 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
178 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
179 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
180 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
181 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
182 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
183 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
184 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
185 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
186 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
187 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
188 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
189 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
190 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
191 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
192 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
193 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
194 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
195 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
196 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
197 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
198 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
199 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
200 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
201 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
202 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
203 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
204 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
205 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
206 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
207 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
208 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
209 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
210 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
211 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
212 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
213 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
214 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
215 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
216 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
217 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
218 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
219 0xffffffffffffffff,
220 };
221 return word[byte];
222}
223
224/* Similarly for half-word elements.
225 * for (i = 0; i < 256; ++i) {
226 * unsigned long m = 0;
227 * if (i & 0xaa) {
228 * continue;
229 * }
230 * for (j = 0; j < 8; j += 2) {
231 * if ((i >> j) & 1) {
232 * m |= 0xfffful << (j << 3);
233 * }
234 * }
235 * printf("[0x%x] = 0x%016lx,\n", i, m);
236 * }
237 */
238static inline uint64_t expand_pred_h(uint8_t byte)
239{
240 static const uint64_t word[] = {
241 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
242 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
243 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
244 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
245 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
246 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
247 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
248 [0x55] = 0xffffffffffffffff,
249 };
250 return word[byte & 0x55];
251}
252
253/* Similarly for single word elements. */
254static inline uint64_t expand_pred_s(uint8_t byte)
255{
256 static const uint64_t word[] = {
257 [0x01] = 0x00000000ffffffffull,
258 [0x10] = 0xffffffff00000000ull,
259 [0x11] = 0xffffffffffffffffull,
260 };
261 return word[byte & 0x11];
262}
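/* For illustration (values taken from the tables above):
 * expand_pred_b(0x21), bits 0 and 5 set, yields 0x0000ff00000000ff;
 * expand_pred_h(0x05) yields 0x00000000ffffffff; and
 * expand_pred_s(0x10) yields 0xffffffff00000000.
 */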
263
264/* Swap 16-bit words within a 32-bit word. */
265static inline uint32_t hswap32(uint32_t h)
266{
267 return rol32(h, 16);
268}
269
270/* Swap 16-bit words within a 64-bit word. */
271static inline uint64_t hswap64(uint64_t h)
272{
273 uint64_t m = 0x0000ffff0000ffffull;
274 h = rol64(h, 32);
275 return ((h & m) << 16) | ((h >> 16) & m);
276}
277
278/* Swap 32-bit words within a 64-bit word. */
279static inline uint64_t wswap64(uint64_t h)
280{
281 return rol64(h, 32);
282}
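/* For example (illustration only): hswap32(0x11223344) == 0x33441122,
 * hswap64(0x0011223344556677) == 0x6677445522330011, and
 * wswap64(0x0011223344556677) == 0x4455667700112233.
 */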
283
284#define LOGICAL_PPPP(NAME, FUNC) \
285void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
286{ \
287 uintptr_t opr_sz = simd_oprsz(desc); \
288 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
289 uintptr_t i; \
290 for (i = 0; i < opr_sz / 8; ++i) { \
291 d[i] = FUNC(n[i], m[i], g[i]); \
292 } \
293}
294
295#define DO_AND(N, M, G) (((N) & (M)) & (G))
296#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
297#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
298#define DO_ORR(N, M, G) (((N) | (M)) & (G))
299#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
300#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
301#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
302#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
303
304LOGICAL_PPPP(sve_and_pppp, DO_AND)
305LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
306LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
307LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
308LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
309LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
310LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
311LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
312
313#undef DO_AND
314#undef DO_BIC
315#undef DO_EOR
316#undef DO_ORR
317#undef DO_ORN
318#undef DO_NOR
319#undef DO_NAND
320#undef DO_SEL
321#undef LOGICAL_PPPP
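/* As a rough sketch of what the macro above generates (assuming HELPER()
 * expands to a helper_-prefixed function name, as elsewhere in QEMU),
 * LOGICAL_PPPP(sve_and_pppp, DO_AND) is equivalent to:
 *
 *     void helper_sve_and_pppp(void *vd, void *vn, void *vm,
 *                              void *vg, uint32_t desc)
 *     {
 *         uintptr_t opr_sz = simd_oprsz(desc);
 *         uint64_t *d = vd, *n = vn, *m = vm, *g = vg;
 *         uintptr_t i;
 *         for (i = 0; i < opr_sz / 8; ++i) {
 *             d[i] = (n[i] & m[i]) & g[i];
 *         }
 *     }
 */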
 322
323/* Fully general three-operand expander, controlled by a predicate.
324 * This is complicated by the host-endian storage of the register file.
325 */
326/* ??? I don't expect the compiler could ever vectorize this itself.
327 * With some tables we can convert bit masks to byte masks, and with
328 * extra care wrt byte/word ordering we could use gcc generic vectors
329 * and do 16 bytes at a time.
330 */
331#define DO_ZPZZ(NAME, TYPE, H, OP) \
332void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
333{ \
334 intptr_t i, opr_sz = simd_oprsz(desc); \
335 for (i = 0; i < opr_sz; ) { \
336 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
337 do { \
338 if (pg & 1) { \
339 TYPE nn = *(TYPE *)(vn + H(i)); \
340 TYPE mm = *(TYPE *)(vm + H(i)); \
341 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
342 } \
343 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
344 } while (i & 15); \
345 } \
346}
347
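/* A note on the loop structure above (illustration only): the governing
 * predicate has one bit per byte of vector data, so after each element
 * the predicate word is shifted down by sizeof(TYPE) bits.  E.g. for
 * 32-bit elements, element j is controlled by predicate bit 4*j, and the
 * inner loop consumes one 16-byte granule of the vector per 16-bit
 * predicate chunk.
 */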
348/* Similarly, specialized for 64-bit operands. */
349#define DO_ZPZZ_D(NAME, TYPE, OP) \
350void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
351{ \
352 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
353 TYPE *d = vd, *n = vn, *m = vm; \
354 uint8_t *pg = vg; \
355 for (i = 0; i < opr_sz; i += 1) { \
356 if (pg[H1(i)] & 1) { \
357 TYPE nn = n[i], mm = m[i]; \
358 d[i] = OP(nn, mm); \
359 } \
360 } \
361}
362
363#define DO_AND(N, M) (N & M)
364#define DO_EOR(N, M) (N ^ M)
365#define DO_ORR(N, M) (N | M)
366#define DO_BIC(N, M) (N & ~M)
367#define DO_ADD(N, M) (N + M)
368#define DO_SUB(N, M) (N - M)
369#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
370#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
371#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
372#define DO_MUL(N, M) (N * M)
373
374
375/*
376 * We must avoid the C undefined behaviour cases: division by
377 * zero and signed division of INT_MIN by -1. Both of these
378 * have architecturally defined required results for Arm.
379 * We special case all signed divisions by -1 to avoid having
380 * to deduce the minimum integer for the type involved.
381 */
382#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
383#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
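/* Worked examples (illustration only): DO_SDIV(x, 0) and DO_UDIV(x, 0)
 * are 0, and DO_SDIV(x, -1) is -x, so the 32-bit helper maps
 * INT32_MIN / -1 to INT32_MIN via wrap-around of the negation, which is
 * the result the Arm pseudocode requires for these cases.
 */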
384
385DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
386DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
387DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
388DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
389
390DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
391DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
392DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
393DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
394
395DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
396DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
397DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
398DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
399
400DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
401DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
402DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
403DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
404
405DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
406DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
407DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
408DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
409
410DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
411DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
412DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
413DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
414
415DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
416DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
417DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
418DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
419
420DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
421DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
422DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
423DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
424
425DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
426DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
427DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
428DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
429
430DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
431DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
432DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
433DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
434
435DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
436DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
437DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
438DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
439
440DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
441DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
442DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
443DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
444
445/* Because the computation type is at least twice as large as required,
446 these work for both signed and unsigned source types. */
447static inline uint8_t do_mulh_b(int32_t n, int32_t m)
448{
449 return (n * m) >> 8;
450}
451
452static inline uint16_t do_mulh_h(int32_t n, int32_t m)
453{
454 return (n * m) >> 16;
455}
456
457static inline uint32_t do_mulh_s(int64_t n, int64_t m)
458{
459 return (n * m) >> 32;
460}
461
462static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
463{
464 uint64_t lo, hi;
465 muls64(&lo, &hi, n, m);
466 return hi;
467}
468
469static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
470{
471 uint64_t lo, hi;
472 mulu64(&lo, &hi, n, m);
473 return hi;
474}
475
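/* For example (illustration only): do_mulh_b serves both SMULH and UMULH
 * because the int32_t arguments can hold either a sign- or a zero-extended
 * byte: do_mulh_b(-2, 3) = (-6) >> 8 = -1 (0xff), and
 * do_mulh_b(0xfe, 0x03) = 0x2fa >> 8 = 0x02, each truncated to the
 * uint8_t return type.
 */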
476DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
477DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
478DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
479DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
480
481DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
482DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
483DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
484DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
485
486DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
487DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
488DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
489DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
490
491DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
492DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
 493
494DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
495DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
 496
497/* Note that all bits of the shift are significant
498 and not modulo the element size. */
499#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
500#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
501#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
502
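/* Worked examples (illustration only): with uint8_t operands,
 * DO_LSR(0x80, 8) and DO_LSL(0x01, 8) are 0, since an 8-bit element
 * shifted by its full width must produce zero architecturally; and
 * DO_ASR(-1, 100) clamps the shift count to 7, still yielding -1.
 */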
 503DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
 504DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
 505DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
 506
 507DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
 508DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
 509DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
 510
 511DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
 512DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
 513DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
514
515DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
516DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
517DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
518
519#undef DO_ZPZZ
520#undef DO_ZPZZ_D
 521
522/* Three-operand expander, controlled by a predicate, in which the
523 * third operand is "wide". That is, for D = N op M, the same 64-bit
524 * value of M is used with all of the narrower values of N.
525 */
526#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
527void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
528{ \
529 intptr_t i, opr_sz = simd_oprsz(desc); \
530 for (i = 0; i < opr_sz; ) { \
531 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
532 TYPEW mm = *(TYPEW *)(vm + i); \
533 do { \
534 if (pg & 1) { \
535 TYPE nn = *(TYPE *)(vn + H(i)); \
536 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
537 } \
538 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
539 } while (i & 7); \
540 } \
541}
542
543DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
544DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
545DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
546
547DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
548DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
549DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
550
551DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
552DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
553DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
554
555#undef DO_ZPZW
556
557/* Fully general two-operand expander, controlled by a predicate.
558 */
559#define DO_ZPZ(NAME, TYPE, H, OP) \
560void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
561{ \
562 intptr_t i, opr_sz = simd_oprsz(desc); \
563 for (i = 0; i < opr_sz; ) { \
564 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
565 do { \
566 if (pg & 1) { \
567 TYPE nn = *(TYPE *)(vn + H(i)); \
568 *(TYPE *)(vd + H(i)) = OP(nn); \
569 } \
570 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
571 } while (i & 15); \
572 } \
573}
574
575/* Similarly, specialized for 64-bit operands. */
576#define DO_ZPZ_D(NAME, TYPE, OP) \
577void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
578{ \
579 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
580 TYPE *d = vd, *n = vn; \
581 uint8_t *pg = vg; \
582 for (i = 0; i < opr_sz; i += 1) { \
583 if (pg[H1(i)] & 1) { \
584 TYPE nn = n[i]; \
585 d[i] = OP(nn); \
586 } \
587 } \
588}
589
590#define DO_CLS_B(N) (clrsb32(N) - 24)
591#define DO_CLS_H(N) (clrsb32(N) - 16)
592
593DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
594DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
595DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
596DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
597
598#define DO_CLZ_B(N) (clz32(N) - 24)
599#define DO_CLZ_H(N) (clz32(N) - 16)
600
601DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
602DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
603DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
604DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
605
606DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
607DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
608DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
609DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
610
611#define DO_CNOT(N) (N == 0)
612
613DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
614DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
615DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
616DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
617
618#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
619
620DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
621DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
622DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
623
624#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
625
626DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
627DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
628DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
629
630#define DO_NOT(N) (~N)
631
632DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
633DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
634DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
635DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
636
637#define DO_SXTB(N) ((int8_t)N)
638#define DO_SXTH(N) ((int16_t)N)
639#define DO_SXTS(N) ((int32_t)N)
640#define DO_UXTB(N) ((uint8_t)N)
641#define DO_UXTH(N) ((uint16_t)N)
642#define DO_UXTS(N) ((uint32_t)N)
643
644DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
645DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
646DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
647DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
648DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
649DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
650
651DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
652DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
653DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
654DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
655DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
656DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
657
658#define DO_ABS(N) (N < 0 ? -N : N)
659
660DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
661DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
662DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
663DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
664
665#define DO_NEG(N) (-N)
666
667DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
668DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
669DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
670DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
671
672DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
673DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
674DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
675
676DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
677DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
678
679DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
680
681DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
682DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
683DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
684DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
685
686/* Three-operand expander, unpredicated, in which the third operand is "wide".
687 */
688#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
689void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
690{ \
691 intptr_t i, opr_sz = simd_oprsz(desc); \
692 for (i = 0; i < opr_sz; ) { \
693 TYPEW mm = *(TYPEW *)(vm + i); \
694 do { \
695 TYPE nn = *(TYPE *)(vn + H(i)); \
696 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
697 i += sizeof(TYPE); \
698 } while (i & 7); \
699 } \
700}
701
702DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
703DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
704DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
705
706DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
707DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
708DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
709
710DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
711DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
712DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
713
714#undef DO_ZZW
715
716#undef DO_CLS_B
717#undef DO_CLS_H
718#undef DO_CLZ_B
719#undef DO_CLZ_H
720#undef DO_CNOT
721#undef DO_FABS
722#undef DO_FNEG
723#undef DO_ABS
724#undef DO_NEG
725#undef DO_ZPZ
726#undef DO_ZPZ_D
727
728/* Two-operand reduction expander, controlled by a predicate.
729 * The difference between TYPERED and TYPERET has to do with
730 * sign-extension. E.g. for SMAX, TYPERED must be signed,
731 * but TYPERET must be unsigned so that e.g. a 32-bit value
732 * is not sign-extended to the ABI uint64_t return type.
733 */
734/* ??? If we were to vectorize this by hand the reduction ordering
735 * would change. For integer operands, this is perfectly fine.
736 */
737#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
738uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
739{ \
740 intptr_t i, opr_sz = simd_oprsz(desc); \
741 TYPERED ret = INIT; \
742 for (i = 0; i < opr_sz; ) { \
743 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
744 do { \
745 if (pg & 1) { \
746 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
747 ret = OP(ret, nn); \
748 } \
749 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
750 } while (i & 15); \
751 } \
752 return (TYPERET)ret; \
753}
754
755#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
756uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
757{ \
758 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
759 TYPEE *n = vn; \
760 uint8_t *pg = vg; \
761 TYPER ret = INIT; \
762 for (i = 0; i < opr_sz; i += 1) { \
763 if (pg[H1(i)] & 1) { \
764 TYPEE nn = n[i]; \
765 ret = OP(ret, nn); \
766 } \
767 } \
768 return ret; \
769}
770
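/* Concrete illustration of the TYPERED/TYPERET distinction above:
 * sve_smaxv_s reduces in int32_t, so the comparisons are signed, but
 * returns uint32_t, so a result of -3 reaches the caller as
 * 0x00000000fffffffd rather than being sign-extended to
 * 0xfffffffffffffffd in the uint64_t ABI return value.
 */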
771DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
772DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
773DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
774DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
775
776DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
777DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
778DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
779DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
780
781DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
782DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
783DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
784DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
785
786DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
787DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
788DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
789
790DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
791DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
792DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
793DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
794
795DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
796DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
797DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
798DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
799
800DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
801DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
802DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
803DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
804
805DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
806DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
807DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
808DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
809
810DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
811DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
812DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
813DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
814
815#undef DO_VPZ
816#undef DO_VPZ_D
817
818/* Two vector operand, one scalar operand, unpredicated. */
819#define DO_ZZI(NAME, TYPE, OP) \
820void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
821{ \
822 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
823 TYPE s = s64, *d = vd, *n = vn; \
824 for (i = 0; i < opr_sz; ++i) { \
825 d[i] = OP(n[i], s); \
826 } \
827}
828
829#define DO_SUBR(X, Y) (Y - X)
830
831DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
832DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
833DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
834DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
835
836DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
837DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
838DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
839DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
840
841DO_ZZI(sve_smini_b, int8_t, DO_MIN)
842DO_ZZI(sve_smini_h, int16_t, DO_MIN)
843DO_ZZI(sve_smini_s, int32_t, DO_MIN)
844DO_ZZI(sve_smini_d, int64_t, DO_MIN)
845
846DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
847DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
848DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
849DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
850
851DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
852DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
853DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
854DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
855
856#undef DO_ZZI
857
858#undef DO_AND
859#undef DO_ORR
860#undef DO_EOR
861#undef DO_BIC
862#undef DO_ADD
863#undef DO_SUB
864#undef DO_MAX
865#undef DO_MIN
866#undef DO_ABD
867#undef DO_MUL
868#undef DO_DIV
869#undef DO_ASR
870#undef DO_LSR
871#undef DO_LSL
 872#undef DO_SUBR
 873
874/* Similar to the ARM LastActiveElement pseudocode function, except the
875 result is multiplied by the element size. This includes the not found
876 indication; e.g. not found for esz=3 is -8. */
877static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
878{
879 uint64_t mask = pred_esz_masks[esz];
880 intptr_t i = words;
881
882 do {
883 uint64_t this_g = g[--i] & mask;
884 if (this_g) {
885 return i * 64 + (63 - clz64(this_g));
886 }
887 } while (i > 0);
888 return (intptr_t)-1 << esz;
889}
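/* For example (illustration only): with esz = 2 (word elements), a
 * predicate whose highest active set bit is bit 8 yields 8, i.e. element
 * index 2 multiplied by the 4-byte element size, while an all-false
 * predicate yields -1 << 2 = -4.
 */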
890
891uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
892{
893 uint32_t flags = PREDTEST_INIT;
894 uint64_t *d = vd, *g = vg;
895 intptr_t i = 0;
896
897 do {
898 uint64_t this_d = d[i];
899 uint64_t this_g = g[i];
900
901 if (this_g) {
902 if (!(flags & 4)) {
903 /* Set in D the first bit of G. */
904 this_d |= this_g & -this_g;
905 d[i] = this_d;
906 }
907 flags = iter_predtest_fwd(this_d, this_g, flags);
908 }
909 } while (++i < words);
910
911 return flags;
912}
913
914uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
915{
916 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
917 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
918 uint32_t flags = PREDTEST_INIT;
919 uint64_t *d = vd, *g = vg, esz_mask;
920 intptr_t i, next;
921
922 next = last_active_element(vd, words, esz) + (1 << esz);
923 esz_mask = pred_esz_masks[esz];
924
925 /* Similar to the pseudocode for pnext, but scaled by ESZ
926 so that we find the correct bit. */
927 if (next < words * 64) {
928 uint64_t mask = -1;
929
930 if (next & 63) {
931 mask = ~((1ull << (next & 63)) - 1);
932 next &= -64;
933 }
934 do {
935 uint64_t this_g = g[next / 64] & esz_mask & mask;
936 if (this_g != 0) {
937 next = (next & -64) + ctz64(this_g);
938 break;
939 }
940 next += 64;
941 mask = -1;
942 } while (next < words * 64);
943 }
944
945 i = 0;
946 do {
947 uint64_t this_d = 0;
948 if (i == next / 64) {
949 this_d = 1ull << (next & 63);
950 }
951 d[i] = this_d;
952 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
953 } while (++i < words);
954
955 return flags;
956}
957
958/* Store zero into every active element of Zd. We will use this for two
959 * and three-operand predicated instructions for which logic dictates a
960 * zero result. In particular, logical shift by element size, which is
961 * otherwise undefined on the host.
962 *
963 * For element sizes smaller than uint64_t, we use tables to expand
964 * the N bits of the controlling predicate to a byte mask, and clear
965 * those bytes.
966 */
967void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
968{
969 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
970 uint64_t *d = vd;
971 uint8_t *pg = vg;
972 for (i = 0; i < opr_sz; i += 1) {
973 d[i] &= ~expand_pred_b(pg[H1(i)]);
974 }
975}
976
977void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
978{
979 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
980 uint64_t *d = vd;
981 uint8_t *pg = vg;
982 for (i = 0; i < opr_sz; i += 1) {
983 d[i] &= ~expand_pred_h(pg[H1(i)]);
984 }
985}
986
987void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
988{
989 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
990 uint64_t *d = vd;
991 uint8_t *pg = vg;
992 for (i = 0; i < opr_sz; i += 1) {
993 d[i] &= ~expand_pred_s(pg[H1(i)]);
994 }
995}
996
997void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
998{
999 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1000 uint64_t *d = vd;
1001 uint8_t *pg = vg;
1002 for (i = 0; i < opr_sz; i += 1) {
1003 if (pg[H1(i)] & 1) {
1004 d[i] = 0;
1005 }
1006 }
1007}
1008
1009/* Copy Zn into Zd, and store zero into inactive elements. */
1010void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1011{
1012 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1013 uint64_t *d = vd, *n = vn;
1014 uint8_t *pg = vg;
1015 for (i = 0; i < opr_sz; i += 1) {
1016 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1017 }
1018}
1019
1020void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1021{
1022 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1023 uint64_t *d = vd, *n = vn;
1024 uint8_t *pg = vg;
1025 for (i = 0; i < opr_sz; i += 1) {
1026 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1027 }
1028}
1029
1030void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1031{
1032 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1033 uint64_t *d = vd, *n = vn;
1034 uint8_t *pg = vg;
1035 for (i = 0; i < opr_sz; i += 1) {
1036 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1037 }
1038}
1039
1040void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1041{
1042 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1043 uint64_t *d = vd, *n = vn;
1044 uint8_t *pg = vg;
1045 for (i = 0; i < opr_sz; i += 1) {
 1046        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1047 }
1048}
1049
1050/* Three-operand expander, immediate operand, controlled by a predicate.
1051 */
1052#define DO_ZPZI(NAME, TYPE, H, OP) \
1053void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1054{ \
1055 intptr_t i, opr_sz = simd_oprsz(desc); \
1056 TYPE imm = simd_data(desc); \
1057 for (i = 0; i < opr_sz; ) { \
1058 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1059 do { \
1060 if (pg & 1) { \
1061 TYPE nn = *(TYPE *)(vn + H(i)); \
1062 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1063 } \
1064 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1065 } while (i & 15); \
1066 } \
1067}
1068
1069/* Similarly, specialized for 64-bit operands. */
1070#define DO_ZPZI_D(NAME, TYPE, OP) \
1071void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1072{ \
1073 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1074 TYPE *d = vd, *n = vn; \
1075 TYPE imm = simd_data(desc); \
1076 uint8_t *pg = vg; \
1077 for (i = 0; i < opr_sz; i += 1) { \
1078 if (pg[H1(i)] & 1) { \
1079 TYPE nn = n[i]; \
1080 d[i] = OP(nn, imm); \
1081 } \
1082 } \
1083}
1084
1085#define DO_SHR(N, M) (N >> M)
1086#define DO_SHL(N, M) (N << M)
1087
1088/* Arithmetic shift right for division. This rounds negative numbers
1089 toward zero as per signed division. Therefore before shifting,
1090 when N is negative, add 2**M-1. */
1091#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
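/* Worked example (illustration only): DO_ASRD(-5, 1) first adds
 * 2**1 - 1 = 1, giving -4, then shifts right to get -2, i.e. -5/2
 * rounded toward zero; a plain arithmetic shift would give -3.
 */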
1092
1093DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1094DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1095DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1096DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1097
1098DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1099DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1100DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1101DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1102
1103DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1104DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1105DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1106DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1107
1108DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1109DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1110DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1111DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1112
1113#undef DO_SHR
1114#undef DO_SHL
1115#undef DO_ASRD
1116#undef DO_ZPZI
1117#undef DO_ZPZI_D
1118
1119/* Fully general four-operand expander, controlled by a predicate.
1120 */
1121#define DO_ZPZZZ(NAME, TYPE, H, OP) \
1122void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1123 void *vg, uint32_t desc) \
1124{ \
1125 intptr_t i, opr_sz = simd_oprsz(desc); \
1126 for (i = 0; i < opr_sz; ) { \
1127 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1128 do { \
1129 if (pg & 1) { \
1130 TYPE nn = *(TYPE *)(vn + H(i)); \
1131 TYPE mm = *(TYPE *)(vm + H(i)); \
1132 TYPE aa = *(TYPE *)(va + H(i)); \
1133 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1134 } \
1135 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1136 } while (i & 15); \
1137 } \
1138}
1139
1140/* Similarly, specialized for 64-bit operands. */
1141#define DO_ZPZZZ_D(NAME, TYPE, OP) \
1142void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1143 void *vg, uint32_t desc) \
1144{ \
1145 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1146 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1147 uint8_t *pg = vg; \
1148 for (i = 0; i < opr_sz; i += 1) { \
1149 if (pg[H1(i)] & 1) { \
1150 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1151 d[i] = OP(aa, nn, mm); \
1152 } \
1153 } \
1154}
1155
1156#define DO_MLA(A, N, M) (A + N * M)
1157#define DO_MLS(A, N, M) (A - N * M)
1158
1159DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1160DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1161
1162DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1163DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1164
1165DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1166DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1167
1168DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1169DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1170
1171#undef DO_MLA
1172#undef DO_MLS
1173#undef DO_ZPZZZ
1174#undef DO_ZPZZZ_D
1175
1176void HELPER(sve_index_b)(void *vd, uint32_t start,
1177 uint32_t incr, uint32_t desc)
1178{
1179 intptr_t i, opr_sz = simd_oprsz(desc);
1180 uint8_t *d = vd;
1181 for (i = 0; i < opr_sz; i += 1) {
1182 d[H1(i)] = start + i * incr;
1183 }
1184}
1185
1186void HELPER(sve_index_h)(void *vd, uint32_t start,
1187 uint32_t incr, uint32_t desc)
1188{
1189 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1190 uint16_t *d = vd;
1191 for (i = 0; i < opr_sz; i += 1) {
1192 d[H2(i)] = start + i * incr;
1193 }
1194}
1195
1196void HELPER(sve_index_s)(void *vd, uint32_t start,
1197 uint32_t incr, uint32_t desc)
1198{
1199 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1200 uint32_t *d = vd;
1201 for (i = 0; i < opr_sz; i += 1) {
1202 d[H4(i)] = start + i * incr;
1203 }
1204}
1205
1206void HELPER(sve_index_d)(void *vd, uint64_t start,
1207 uint64_t incr, uint32_t desc)
1208{
1209 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1210 uint64_t *d = vd;
1211 for (i = 0; i < opr_sz; i += 1) {
1212 d[i] = start + i * incr;
1213 }
1214}
1215
1216void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1217{
1218 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1219 uint32_t sh = simd_data(desc);
1220 uint32_t *d = vd, *n = vn, *m = vm;
1221 for (i = 0; i < opr_sz; i += 1) {
1222 d[i] = n[i] + (m[i] << sh);
1223 }
1224}
1225
1226void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1227{
1228 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1229 uint64_t sh = simd_data(desc);
1230 uint64_t *d = vd, *n = vn, *m = vm;
1231 for (i = 0; i < opr_sz; i += 1) {
1232 d[i] = n[i] + (m[i] << sh);
1233 }
1234}
1235
1236void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1237{
1238 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1239 uint64_t sh = simd_data(desc);
1240 uint64_t *d = vd, *n = vn, *m = vm;
1241 for (i = 0; i < opr_sz; i += 1) {
1242 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1243 }
1244}
1245
1246void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1247{
1248 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1249 uint64_t sh = simd_data(desc);
1250 uint64_t *d = vd, *n = vn, *m = vm;
1251 for (i = 0; i < opr_sz; i += 1) {
1252 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1253 }
1254}
1255
1256void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1257{
1258 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1259 static const uint16_t coeff[] = {
1260 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1261 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1262 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1263 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1264 };
1265 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1266 uint16_t *d = vd, *n = vn;
1267
1268 for (i = 0; i < opr_sz; i++) {
1269 uint16_t nn = n[i];
1270 intptr_t idx = extract32(nn, 0, 5);
1271 uint16_t exp = extract32(nn, 5, 5);
1272 d[i] = coeff[idx] | (exp << 10);
1273 }
1274}
1275
1276void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1277{
1278 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1279 static const uint32_t coeff[] = {
1280 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1281 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1282 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1283 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1284 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1285 0x1ef532, 0x20b051, 0x227043, 0x243516,
1286 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1287 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1288 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1289 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1290 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1291 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1292 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1293 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1294 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1295 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1296 };
1297 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1298 uint32_t *d = vd, *n = vn;
1299
1300 for (i = 0; i < opr_sz; i++) {
1301 uint32_t nn = n[i];
1302 intptr_t idx = extract32(nn, 0, 6);
1303 uint32_t exp = extract32(nn, 6, 8);
1304 d[i] = coeff[idx] | (exp << 23);
1305 }
1306}
1307
1308void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1309{
1310 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1311 static const uint64_t coeff[] = {
1312 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1313 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1314 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1315 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1316 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1317 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1318 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1319 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1320 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1321 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1322 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1323 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1324 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1325 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1326 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1327 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1328 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1329 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1330 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1331 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1332 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1333 0xFA7C1819E90D8ull,
1334 };
1335 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1336 uint64_t *d = vd, *n = vn;
1337
1338 for (i = 0; i < opr_sz; i++) {
1339 uint64_t nn = n[i];
1340 intptr_t idx = extract32(nn, 0, 6);
1341 uint64_t exp = extract32(nn, 6, 11);
1342 d[i] = coeff[idx] | (exp << 52);
1343 }
1344}
1345
1346void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1347{
1348 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1349 uint16_t *d = vd, *n = vn, *m = vm;
1350 for (i = 0; i < opr_sz; i += 1) {
1351 uint16_t nn = n[i];
1352 uint16_t mm = m[i];
1353 if (mm & 1) {
1354 nn = float16_one;
1355 }
1356 d[i] = nn ^ (mm & 2) << 14;
1357 }
1358}
1359
1360void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1361{
1362 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1363 uint32_t *d = vd, *n = vn, *m = vm;
1364 for (i = 0; i < opr_sz; i += 1) {
1365 uint32_t nn = n[i];
1366 uint32_t mm = m[i];
1367 if (mm & 1) {
1368 nn = float32_one;
1369 }
1370 d[i] = nn ^ (mm & 2) << 30;
1371 }
1372}
1373
1374void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1375{
1376 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1377 uint64_t *d = vd, *n = vn, *m = vm;
1378 for (i = 0; i < opr_sz; i += 1) {
1379 uint64_t nn = n[i];
1380 uint64_t mm = m[i];
1381 if (mm & 1) {
1382 nn = float64_one;
1383 }
1384 d[i] = nn ^ (mm & 2) << 62;
1385 }
1386}
1387
1388/*
1389 * Signed saturating addition with scalar operand.
1390 */
1391
1392void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1393{
1394 intptr_t i, oprsz = simd_oprsz(desc);
1395
1396 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1397 int r = *(int8_t *)(a + i) + b;
1398 if (r > INT8_MAX) {
1399 r = INT8_MAX;
1400 } else if (r < INT8_MIN) {
1401 r = INT8_MIN;
1402 }
1403 *(int8_t *)(d + i) = r;
1404 }
1405}
1406
1407void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1408{
1409 intptr_t i, oprsz = simd_oprsz(desc);
1410
1411 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1412 int r = *(int16_t *)(a + i) + b;
1413 if (r > INT16_MAX) {
1414 r = INT16_MAX;
1415 } else if (r < INT16_MIN) {
1416 r = INT16_MIN;
1417 }
1418 *(int16_t *)(d + i) = r;
1419 }
1420}
1421
1422void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1423{
1424 intptr_t i, oprsz = simd_oprsz(desc);
1425
1426 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1427 int64_t r = *(int32_t *)(a + i) + b;
1428 if (r > INT32_MAX) {
1429 r = INT32_MAX;
1430 } else if (r < INT32_MIN) {
1431 r = INT32_MIN;
1432 }
1433 *(int32_t *)(d + i) = r;
1434 }
1435}
1436
1437void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1438{
1439 intptr_t i, oprsz = simd_oprsz(desc);
1440
1441 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1442 int64_t ai = *(int64_t *)(a + i);
1443 int64_t r = ai + b;
1444 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1445 /* Signed overflow. */
1446 r = (r < 0 ? INT64_MAX : INT64_MIN);
1447 }
1448 *(int64_t *)(d + i) = r;
1449 }
1450}
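/* The overflow test above is the usual sign trick (noted for
 * illustration): signed overflow can only occur when ai and b have the
 * same sign and r differs from them, i.e. (r ^ ai) has the sign bit set
 * while (ai ^ b) does not.  E.g. ai = INT64_MAX, b = 1 wraps r negative,
 * the test fires, and r < 0 selects the INT64_MAX saturation value.
 */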
1451
1452/*
1453 * Unsigned saturating addition with scalar operand.
1454 */
1455
1456void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1457{
1458 intptr_t i, oprsz = simd_oprsz(desc);
1459
1460 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1461 int r = *(uint8_t *)(a + i) + b;
1462 if (r > UINT8_MAX) {
1463 r = UINT8_MAX;
1464 } else if (r < 0) {
1465 r = 0;
1466 }
1467 *(uint8_t *)(d + i) = r;
1468 }
1469}
1470
1471void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1472{
1473 intptr_t i, oprsz = simd_oprsz(desc);
1474
1475 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1476 int r = *(uint16_t *)(a + i) + b;
1477 if (r > UINT16_MAX) {
1478 r = UINT16_MAX;
1479 } else if (r < 0) {
1480 r = 0;
1481 }
1482 *(uint16_t *)(d + i) = r;
1483 }
1484}
1485
1486void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1487{
1488 intptr_t i, oprsz = simd_oprsz(desc);
1489
1490 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1491 int64_t r = *(uint32_t *)(a + i) + b;
1492 if (r > UINT32_MAX) {
1493 r = UINT32_MAX;
1494 } else if (r < 0) {
1495 r = 0;
1496 }
1497 *(uint32_t *)(d + i) = r;
1498 }
1499}
1500
1501void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1502{
1503 intptr_t i, oprsz = simd_oprsz(desc);
1504
1505 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1506 uint64_t r = *(uint64_t *)(a + i) + b;
1507 if (r < b) {
1508 r = UINT64_MAX;
1509 }
1510 *(uint64_t *)(d + i) = r;
1511 }
1512}
1513
1514void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1515{
1516 intptr_t i, oprsz = simd_oprsz(desc);
1517
1518 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1519 uint64_t ai = *(uint64_t *)(a + i);
1520 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1521 }
1522}
1523
1524/* Two operand predicated copy immediate with merge. All valid immediates
1525 * can fit within 17 signed bits in the simd_data field.
1526 */
1527void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1528 uint64_t mm, uint32_t desc)
1529{
1530 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1531 uint64_t *d = vd, *n = vn;
1532 uint8_t *pg = vg;
1533
1534 mm = dup_const(MO_8, mm);
1535 for (i = 0; i < opr_sz; i += 1) {
1536 uint64_t nn = n[i];
1537 uint64_t pp = expand_pred_b(pg[H1(i)]);
1538 d[i] = (mm & pp) | (nn & ~pp);
1539 }
1540}
1541
1542void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1543 uint64_t mm, uint32_t desc)
1544{
1545 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1546 uint64_t *d = vd, *n = vn;
1547 uint8_t *pg = vg;
1548
1549 mm = dup_const(MO_16, mm);
1550 for (i = 0; i < opr_sz; i += 1) {
1551 uint64_t nn = n[i];
1552 uint64_t pp = expand_pred_h(pg[H1(i)]);
1553 d[i] = (mm & pp) | (nn & ~pp);
1554 }
1555}
1556
1557void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1558 uint64_t mm, uint32_t desc)
1559{
1560 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1561 uint64_t *d = vd, *n = vn;
1562 uint8_t *pg = vg;
1563
1564 mm = dup_const(MO_32, mm);
1565 for (i = 0; i < opr_sz; i += 1) {
1566 uint64_t nn = n[i];
1567 uint64_t pp = expand_pred_s(pg[H1(i)]);
1568 d[i] = (mm & pp) | (nn & ~pp);
1569 }
1570}
1571
1572void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1573 uint64_t mm, uint32_t desc)
1574{
1575 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1576 uint64_t *d = vd, *n = vn;
1577 uint8_t *pg = vg;
1578
1579 for (i = 0; i < opr_sz; i += 1) {
1580 uint64_t nn = n[i];
1581 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1582 }
1583}
1584
1585void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1586{
1587 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1588 uint64_t *d = vd;
1589 uint8_t *pg = vg;
1590
1591 val = dup_const(MO_8, val);
1592 for (i = 0; i < opr_sz; i += 1) {
1593 d[i] = val & expand_pred_b(pg[H1(i)]);
1594 }
1595}
1596
1597void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1598{
1599 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1600 uint64_t *d = vd;
1601 uint8_t *pg = vg;
1602
1603 val = dup_const(MO_16, val);
1604 for (i = 0; i < opr_sz; i += 1) {
1605 d[i] = val & expand_pred_h(pg[H1(i)]);
1606 }
1607}
1608
1609void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1610{
1611 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1612 uint64_t *d = vd;
1613 uint8_t *pg = vg;
1614
1615 val = dup_const(MO_32, val);
1616 for (i = 0; i < opr_sz; i += 1) {
1617 d[i] = val & expand_pred_s(pg[H1(i)]);
1618 }
1619}
1620
1621void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1622{
1623 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1624 uint64_t *d = vd;
1625 uint8_t *pg = vg;
1626
1627 for (i = 0; i < opr_sz; i += 1) {
1628 d[i] = (pg[H1(i)] & 1 ? val : 0);
1629 }
1630}
1631
 1632/* Big-endian hosts need to frob the byte indices.  If the copy
1633 * happens to be 8-byte aligned, then no frobbing necessary.
1634 */
1635static void swap_memmove(void *vd, void *vs, size_t n)
1636{
1637 uintptr_t d = (uintptr_t)vd;
1638 uintptr_t s = (uintptr_t)vs;
1639 uintptr_t o = (d | s | n) & 7;
1640 size_t i;
1641
1642#ifndef HOST_WORDS_BIGENDIAN
1643 o = 0;
1644#endif
1645 switch (o) {
1646 case 0:
1647 memmove(vd, vs, n);
1648 break;
1649
1650 case 4:
1651 if (d < s || d >= s + n) {
1652 for (i = 0; i < n; i += 4) {
1653 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1654 }
1655 } else {
1656 for (i = n; i > 0; ) {
1657 i -= 4;
1658 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1659 }
1660 }
1661 break;
1662
1663 case 2:
1664 case 6:
1665 if (d < s || d >= s + n) {
1666 for (i = 0; i < n; i += 2) {
1667 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1668 }
1669 } else {
1670 for (i = n; i > 0; ) {
1671 i -= 2;
1672 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1673 }
1674 }
1675 break;
1676
1677 default:
1678 if (d < s || d >= s + n) {
1679 for (i = 0; i < n; i++) {
1680 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1681 }
1682 } else {
1683 for (i = n; i > 0; ) {
1684 i -= 1;
1685 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1686 }
1687 }
1688 break;
1689 }
1690}
1691
1692/* Similarly for memset of 0. */
1693static void swap_memzero(void *vd, size_t n)
1694{
1695 uintptr_t d = (uintptr_t)vd;
1696 uintptr_t o = (d | n) & 7;
1697 size_t i;
1698
1699 /* Usually, the first bit of a predicate is set, so N is 0. */
1700 if (likely(n == 0)) {
1701 return;
1702 }
1703
1704#ifndef HOST_WORDS_BIGENDIAN
1705 o = 0;
1706#endif
1707 switch (o) {
1708 case 0:
1709 memset(vd, 0, n);
1710 break;
1711
1712 case 4:
1713 for (i = 0; i < n; i += 4) {
1714 *(uint32_t *)H1_4(d + i) = 0;
1715 }
1716 break;
1717
1718 case 2:
1719 case 6:
1720 for (i = 0; i < n; i += 2) {
1721 *(uint16_t *)H1_2(d + i) = 0;
1722 }
1723 break;
1724
1725 default:
1726 for (i = 0; i < n; i++) {
1727 *(uint8_t *)H1(d + i) = 0;
1728 }
1729 break;
1730 }
1731}
1732
1733void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1734{
1735 intptr_t opr_sz = simd_oprsz(desc);
1736 size_t n_ofs = simd_data(desc);
1737 size_t n_siz = opr_sz - n_ofs;
1738
1739 if (vd != vm) {
1740 swap_memmove(vd, vn + n_ofs, n_siz);
1741 swap_memmove(vd + n_siz, vm, n_ofs);
1742 } else if (vd != vn) {
1743 swap_memmove(vd + n_siz, vd, n_ofs);
1744 swap_memmove(vd, vn + n_ofs, n_siz);
1745 } else {
1746 /* vd == vn == vm. Need temp space. */
1747 ARMVectorReg tmp;
1748 swap_memmove(&tmp, vm, n_ofs);
1749 swap_memmove(vd, vd + n_ofs, n_siz);
1750 memcpy(vd + n_siz, &tmp, n_ofs);
1751 }
1752}
1753
1754#define DO_INSR(NAME, TYPE, H) \
1755void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1756{ \
1757 intptr_t opr_sz = simd_oprsz(desc); \
1758 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1759 *(TYPE *)(vd + H(0)) = val; \
1760}
1761
1762DO_INSR(sve_insr_b, uint8_t, H1)
1763DO_INSR(sve_insr_h, uint16_t, H1_2)
1764DO_INSR(sve_insr_s, uint32_t, H1_4)
1765DO_INSR(sve_insr_d, uint64_t, )
1766
1767#undef DO_INSR
1768
1769void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1770{
1771 intptr_t i, j, opr_sz = simd_oprsz(desc);
1772 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1773 uint64_t f = *(uint64_t *)(vn + i);
1774 uint64_t b = *(uint64_t *)(vn + j);
1775 *(uint64_t *)(vd + i) = bswap64(b);
1776 *(uint64_t *)(vd + j) = bswap64(f);
1777 }
1778}
1779
1780void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1781{
1782 intptr_t i, j, opr_sz = simd_oprsz(desc);
1783 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1784 uint64_t f = *(uint64_t *)(vn + i);
1785 uint64_t b = *(uint64_t *)(vn + j);
1786 *(uint64_t *)(vd + i) = hswap64(b);
1787 *(uint64_t *)(vd + j) = hswap64(f);
1788 }
1789}
1790
1791void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1792{
1793 intptr_t i, j, opr_sz = simd_oprsz(desc);
1794 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1795 uint64_t f = *(uint64_t *)(vn + i);
1796 uint64_t b = *(uint64_t *)(vn + j);
1797 *(uint64_t *)(vd + i) = rol64(b, 32);
1798 *(uint64_t *)(vd + j) = rol64(f, 32);
1799 }
1800}
1801
1802void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1803{
1804 intptr_t i, j, opr_sz = simd_oprsz(desc);
1805 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1806 uint64_t f = *(uint64_t *)(vn + i);
1807 uint64_t b = *(uint64_t *)(vn + j);
1808 *(uint64_t *)(vd + i) = b;
1809 *(uint64_t *)(vd + j) = f;
1810 }
1811}
1812
1813#define DO_TBL(NAME, TYPE, H) \
1814void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1815{ \
1816 intptr_t i, opr_sz = simd_oprsz(desc); \
1817 uintptr_t elem = opr_sz / sizeof(TYPE); \
1818 TYPE *d = vd, *n = vn, *m = vm; \
1819 ARMVectorReg tmp; \
1820 if (unlikely(vd == vn)) { \
1821 n = memcpy(&tmp, vn, opr_sz); \
1822 } \
1823 for (i = 0; i < elem; i++) { \
1824 TYPE j = m[H(i)]; \
1825 d[H(i)] = j < elem ? n[H(j)] : 0; \
1826 } \
1827}
1828
1829DO_TBL(sve_tbl_b, uint8_t, H1)
1830DO_TBL(sve_tbl_h, uint16_t, H2)
1831DO_TBL(sve_tbl_s, uint32_t, H4)
1832DO_TBL(sve_tbl_d, uint64_t, )
1833
1834#undef DO_TBL
1835
1836#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1837void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1838{ \
1839 intptr_t i, opr_sz = simd_oprsz(desc); \
1840 TYPED *d = vd; \
1841 TYPES *n = vn; \
1842 ARMVectorReg tmp; \
1843 if (unlikely(vn - vd < opr_sz)) { \
1844 n = memcpy(&tmp, n, opr_sz / 2); \
1845 } \
1846 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1847 d[HD(i)] = n[HS(i)]; \
1848 } \
1849}
1850
1851DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1852DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1853DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1854
1855DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1856DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1857DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1858
1859#undef DO_UNPK
1860
1861/* Mask of bits included in the even-numbered predicates of width esz.
1862 * We also use this for expand_bits/compress_bits, and so extend the
1863 * same pattern out to 16-bit units.
1864 */
1865static const uint64_t even_bit_esz_masks[5] = {
1866 0x5555555555555555ull,
1867 0x3333333333333333ull,
1868 0x0f0f0f0f0f0f0f0full,
1869 0x00ff00ff00ff00ffull,
1870 0x0000ffff0000ffffull,
1871};
1872
1873/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1874 * For N==0, this corresponds to the operation that in qemu/bitops.h
1875 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1876 * section 7-2 Shuffling Bits.
1877 */
1878static uint64_t expand_bits(uint64_t x, int n)
1879{
1880 int i;
1881
1882 x &= 0xffffffffu;
1883 for (i = 4; i >= n; i--) {
1884 int sh = 1 << i;
1885 x = ((x << sh) | x) & even_bit_esz_masks[i];
1886 }
1887 return x;
1888}
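/* Worked example (illustrative): for N == 0, each input bit i moves to
 * output bit 2*i, with zeros interleaved in the odd positions:
 *
 *     expand_bits(0xb, 0) == 0x45     // 0b1011 -> 0b01000101
 *
 * For larger N the same spreading happens on 2**N-bit units.
 */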
1889
1890/* Compress units of 2**(N+1) bits to units of 2**N bits.
1891 * For N==0, this corresponds to the operation that in qemu/bitops.h
1892 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1893 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1894 */
1895static uint64_t compress_bits(uint64_t x, int n)
1896{
1897 int i;
1898
1899 for (i = n; i <= 4; i++) {
1900 int sh = 1 << i;
1901 x &= even_bit_esz_masks[i];
1902 x = (x >> sh) | x;
1903 }
1904 return x & 0xffffffffu;
1905}
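/* Worked example (illustrative): compress_bits undoes the spreading done
 * by expand_bits, discarding the interleaved bits:
 *
 *     compress_bits(0x45, 0) == 0xb   // 0b01000101 -> 0b1011
 *
 * Bits in the odd positions (for N == 0) are masked away first, so the
 * pair round-trips exactly only when those positions are zero.
 */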
1906
1907void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1908{
1909 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1910 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1911 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1912 uint64_t *d = vd;
1913 intptr_t i;
1914
1915 if (oprsz <= 8) {
1916 uint64_t nn = *(uint64_t *)vn;
1917 uint64_t mm = *(uint64_t *)vm;
1918 int half = 4 * oprsz;
1919
1920 nn = extract64(nn, high * half, half);
1921 mm = extract64(mm, high * half, half);
1922 nn = expand_bits(nn, esz);
1923 mm = expand_bits(mm, esz);
1924 d[0] = nn + (mm << (1 << esz));
1925 } else {
1926 ARMPredicateReg tmp_n, tmp_m;
1927
1928 /* We produce output faster than we consume input.
1929 Therefore we must be mindful of possible overlap. */
1930 if ((vn - vd) < (uintptr_t)oprsz) {
1931 vn = memcpy(&tmp_n, vn, oprsz);
1932 }
1933 if ((vm - vd) < (uintptr_t)oprsz) {
1934 vm = memcpy(&tmp_m, vm, oprsz);
1935 }
1936 if (high) {
1937 high = oprsz >> 1;
1938 }
1939
1940 if ((high & 3) == 0) {
1941 uint32_t *n = vn, *m = vm;
1942 high >>= 2;
1943
1944 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1945 uint64_t nn = n[H4(high + i)];
1946 uint64_t mm = m[H4(high + i)];
1947
1948 nn = expand_bits(nn, esz);
1949 mm = expand_bits(mm, esz);
1950 d[i] = nn + (mm << (1 << esz));
1951 }
1952 } else {
1953 uint8_t *n = vn, *m = vm;
1954 uint16_t *d16 = vd;
1955
1956 for (i = 0; i < oprsz / 2; i++) {
1957 uint16_t nn = n[H1(high + i)];
1958 uint16_t mm = m[H1(high + i)];
1959
1960 nn = expand_bits(nn, esz);
1961 mm = expand_bits(mm, esz);
1962 d16[H2(i)] = nn + (mm << (1 << esz));
1963 }
1964 }
1965 }
1966}
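/* Worked example for the oprsz <= 8 path (illustrative): with oprsz == 1,
 * esz == 0 and high == 0, zipping nn = 0b0011 with mm = 0b0101 gives
 *
 *     expand_bits(nn, 0) == 0b00000101
 *     expand_bits(mm, 0) == 0b00010001
 *     d[0] == 0b00000101 + (0b00010001 << 1) == 0b00100111
 *
 * i.e. the result bits, low to high, are n0 m0 n1 m1 n2 m2 n3 m3.
 */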
1967
1968void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1969{
1970 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1971 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1972 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1973 uint64_t *d = vd, *n = vn, *m = vm;
1974 uint64_t l, h;
1975 intptr_t i;
1976
1977 if (oprsz <= 8) {
1978 l = compress_bits(n[0] >> odd, esz);
1979 h = compress_bits(m[0] >> odd, esz);
1980 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1981 } else {
1982 ARMPredicateReg tmp_m;
1983 intptr_t oprsz_16 = oprsz / 16;
1984
1985 if ((vm - vd) < (uintptr_t)oprsz) {
1986 m = memcpy(&tmp_m, vm, oprsz);
1987 }
1988
1989 for (i = 0; i < oprsz_16; i++) {
1990 l = n[2 * i + 0];
1991 h = n[2 * i + 1];
1992 l = compress_bits(l >> odd, esz);
1993 h = compress_bits(h >> odd, esz);
1994 d[i] = l + (h << 32);
1995 }
1996
1997 /* For VL which is not a power of 2, the results from M do not
1998 align nicely with the uint64_t for D. Put the aligned results
1999 from M into TMP_M and then copy it into place afterward. */
2000 if (oprsz & 15) {
2001 d[i] = compress_bits(n[2 * i] >> odd, esz);
2002
2003 for (i = 0; i < oprsz_16; i++) {
2004 l = m[2 * i + 0];
2005 h = m[2 * i + 1];
2006 l = compress_bits(l >> odd, esz);
2007 h = compress_bits(h >> odd, esz);
2008 tmp_m.p[i] = l + (h << 32);
2009 }
2010 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2011
2012 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2013 } else {
2014 for (i = 0; i < oprsz_16; i++) {
2015 l = m[2 * i + 0];
2016 h = m[2 * i + 1];
2017 l = compress_bits(l >> odd, esz);
2018 h = compress_bits(h >> odd, esz);
2019 d[oprsz_16 + i] = l + (h << 32);
2020 }
2021 }
2022 }
2023}
2024
2025void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2026{
2027 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2028 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2029 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2030 uint64_t *d = vd, *n = vn, *m = vm;
2031 uint64_t mask;
2032 int shr, shl;
2033 intptr_t i;
2034
2035 shl = 1 << esz;
2036 shr = 0;
2037 mask = even_bit_esz_masks[esz];
2038 if (odd) {
2039 mask <<= shl;
2040 shr = shl;
2041 shl = 0;
2042 }
2043
2044 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2045 uint64_t nn = (n[i] & mask) >> shr;
2046 uint64_t mm = (m[i] & mask) << shl;
2047 d[i] = nn + mm;
2048 }
2049}
2050
2051/* Reverse units of 2**N bits. */
2052static uint64_t reverse_bits_64(uint64_t x, int n)
2053{
2054 int i, sh;
2055
2056 x = bswap64(x);
2057 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2058 uint64_t mask = even_bit_esz_masks[i];
2059 x = ((x & mask) << sh) | ((x >> sh) & mask);
2060 }
2061 return x;
2062}
2063
2064static uint8_t reverse_bits_8(uint8_t x, int n)
2065{
2066 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2067 int i, sh;
2068
2069 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2070 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2071 }
2072 return x;
2073}
2074
2075void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2076{
2077 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2078 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2079 intptr_t i, oprsz_2 = oprsz / 2;
2080
2081 if (oprsz <= 8) {
2082 uint64_t l = *(uint64_t *)vn;
2083 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2084 *(uint64_t *)vd = l;
2085 } else if ((oprsz & 15) == 0) {
2086 for (i = 0; i < oprsz_2; i += 8) {
2087 intptr_t ih = oprsz - 8 - i;
2088 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2089 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2090 *(uint64_t *)(vd + i) = h;
2091 *(uint64_t *)(vd + ih) = l;
2092 }
2093 } else {
2094 for (i = 0; i < oprsz_2; i += 1) {
2095 intptr_t il = H1(i);
2096 intptr_t ih = H1(oprsz - 1 - i);
2097 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2098 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2099 *(uint8_t *)(vd + il) = h;
2100 *(uint8_t *)(vd + ih) = l;
2101 }
2102 }
2103}
2104
2105void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2106{
2107 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2108 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2109 uint64_t *d = vd;
2110 intptr_t i;
2111
2112 if (oprsz <= 8) {
2113 uint64_t nn = *(uint64_t *)vn;
2114 int half = 4 * oprsz;
2115
2116 nn = extract64(nn, high * half, half);
2117 nn = expand_bits(nn, 0);
2118 d[0] = nn;
2119 } else {
2120 ARMPredicateReg tmp_n;
2121
2122 /* We produce output faster than we consume input.
2123 Therefore we must be mindful of possible overlap. */
2124 if ((vn - vd) < (uintptr_t)oprsz) {
2125 vn = memcpy(&tmp_n, vn, oprsz);
2126 }
2127 if (high) {
2128 high = oprsz >> 1;
2129 }
2130
2131 if ((high & 3) == 0) {
2132 uint32_t *n = vn;
2133 high >>= 2;
2134
2135 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2136 uint64_t nn = n[H4(high + i)];
2137 d[i] = expand_bits(nn, 0);
2138 }
2139 } else {
2140 uint16_t *d16 = vd;
2141 uint8_t *n = vn;
2142
2143 for (i = 0; i < oprsz / 2; i++) {
2144 uint16_t nn = n[H1(high + i)];
2145 d16[H2(i)] = expand_bits(nn, 0);
2146 }
2147 }
2148 }
2149}
2150
2151#define DO_ZIP(NAME, TYPE, H) \
2152void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2153{ \
2154 intptr_t oprsz = simd_oprsz(desc); \
2155 intptr_t i, oprsz_2 = oprsz / 2; \
2156 ARMVectorReg tmp_n, tmp_m; \
2157 /* We produce output faster than we consume input. \
2158 Therefore we must be mindful of possible overlap. */ \
2159 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2160 vn = memcpy(&tmp_n, vn, oprsz_2); \
2161 } \
2162 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2163 vm = memcpy(&tmp_m, vm, oprsz_2); \
2164 } \
2165 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2166 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2167 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2168 } \
2169}
2170
2171DO_ZIP(sve_zip_b, uint8_t, H1)
2172DO_ZIP(sve_zip_h, uint16_t, H1_2)
2173DO_ZIP(sve_zip_s, uint32_t, H1_4)
2174DO_ZIP(sve_zip_d, uint64_t, )
2175
2176#define DO_UZP(NAME, TYPE, H) \
2177void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2178{ \
2179 intptr_t oprsz = simd_oprsz(desc); \
2180 intptr_t oprsz_2 = oprsz / 2; \
2181 intptr_t odd_ofs = simd_data(desc); \
2182 intptr_t i; \
2183 ARMVectorReg tmp_m; \
2184 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2185 vm = memcpy(&tmp_m, vm, oprsz); \
2186 } \
2187 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2188 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2189 } \
2190 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2191 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2192 } \
2193}
2194
2195DO_UZP(sve_uzp_b, uint8_t, H1)
2196DO_UZP(sve_uzp_h, uint16_t, H1_2)
2197DO_UZP(sve_uzp_s, uint32_t, H1_4)
2198DO_UZP(sve_uzp_d, uint64_t, )
2199
2200#define DO_TRN(NAME, TYPE, H) \
2201void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2202{ \
2203 intptr_t oprsz = simd_oprsz(desc); \
2204 intptr_t odd_ofs = simd_data(desc); \
2205 intptr_t i; \
2206 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2207 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2208 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2209 *(TYPE *)(vd + H(i + 0)) = ae; \
2210 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2211 } \
2212}
2213
2214DO_TRN(sve_trn_b, uint8_t, H1)
2215DO_TRN(sve_trn_h, uint16_t, H1_2)
2216DO_TRN(sve_trn_s, uint32_t, H1_4)
2217DO_TRN(sve_trn_d, uint64_t, )
2218
2219#undef DO_ZIP
2220#undef DO_UZP
2221#undef DO_TRN
2222
2223void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2224{
2225 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2226 uint32_t *d = vd, *n = vn;
2227 uint8_t *pg = vg;
2228
2229 for (i = j = 0; i < opr_sz; i++) {
2230 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2231 d[H4(j)] = n[H4(i)];
2232 j++;
2233 }
2234 }
2235 for (; j < opr_sz; j++) {
2236 d[H4(j)] = 0;
2237 }
2238}
2239
2240void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2241{
2242 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2243 uint64_t *d = vd, *n = vn;
2244 uint8_t *pg = vg;
2245
2246 for (i = j = 0; i < opr_sz; i++) {
2247 if (pg[H1(i)] & 1) {
2248 d[j] = n[i];
2249 j++;
2250 }
2251 }
2252 for (; j < opr_sz; j++) {
2253 d[j] = 0;
2254 }
2255}
2256
2257/* Similar to the ARM LastActiveElement pseudocode function, except the
2258 * result is multiplied by the element size. This includes the not-found
2259 * indication; e.g. not found for esz=3 is -8.
2260 */
2261int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2262{
2263 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2264 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2265
2266 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2267}
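/* Illustrative example: with esz == 2 (32-bit elements), a predicate whose
 * last active element is number 3 yields 3 * 4 == 12, while an all-false
 * predicate yields -4 (i.e. -1 scaled by the element size); the scaling
 * lets callers use the result directly as a byte offset into the vector.
 */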
2268
2269void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2270{
2271 intptr_t opr_sz = simd_oprsz(desc) / 8;
2272 int esz = simd_data(desc);
2273 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2274 intptr_t i, first_i, last_i;
2275 ARMVectorReg tmp;
2276
2277 first_i = last_i = 0;
2278 first_g = last_g = 0;
2279
2280 /* Find the extent of the active elements within VG. */
2281 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2282 pg = *(uint64_t *)(vg + i) & mask;
2283 if (pg) {
2284 if (last_g == 0) {
2285 last_g = pg;
2286 last_i = i;
2287 }
2288 first_g = pg;
2289 first_i = i;
2290 }
2291 }
2292
2293 len = 0;
2294 if (first_g != 0) {
2295 first_i = first_i * 8 + ctz64(first_g);
2296 last_i = last_i * 8 + 63 - clz64(last_g);
2297 len = last_i - first_i + (1 << esz);
2298 if (vd == vm) {
2299 vm = memcpy(&tmp, vm, opr_sz * 8);
2300 }
2301 swap_memmove(vd, vn + first_i, len);
2302 }
2303 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2304}
2305
2306void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2307 void *vg, uint32_t desc)
2308{
2309 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310 uint64_t *d = vd, *n = vn, *m = vm;
2311 uint8_t *pg = vg;
2312
2313 for (i = 0; i < opr_sz; i += 1) {
2314 uint64_t nn = n[i], mm = m[i];
2315 uint64_t pp = expand_pred_b(pg[H1(i)]);
2316 d[i] = (nn & pp) | (mm & ~pp);
2317 }
2318}
2319
2320void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2321 void *vg, uint32_t desc)
2322{
2323 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2324 uint64_t *d = vd, *n = vn, *m = vm;
2325 uint8_t *pg = vg;
2326
2327 for (i = 0; i < opr_sz; i += 1) {
2328 uint64_t nn = n[i], mm = m[i];
2329 uint64_t pp = expand_pred_h(pg[H1(i)]);
2330 d[i] = (nn & pp) | (mm & ~pp);
2331 }
2332}
2333
2334void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2335 void *vg, uint32_t desc)
2336{
2337 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2338 uint64_t *d = vd, *n = vn, *m = vm;
2339 uint8_t *pg = vg;
2340
2341 for (i = 0; i < opr_sz; i += 1) {
2342 uint64_t nn = n[i], mm = m[i];
2343 uint64_t pp = expand_pred_s(pg[H1(i)]);
2344 d[i] = (nn & pp) | (mm & ~pp);
2345 }
2346}
2347
2348void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2349 void *vg, uint32_t desc)
2350{
2351 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2352 uint64_t *d = vd, *n = vn, *m = vm;
2353 uint8_t *pg = vg;
2354
2355 for (i = 0; i < opr_sz; i += 1) {
2356 uint64_t nn = n[i], mm = m[i];
2357 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2358 }
2359}
2360
2361/* Two operand comparison controlled by a predicate.
2362 * ??? It is very tempting to expand this inline
2363 * with x86 instructions, e.g.
2364 *
2365 * vcmpeqw zm, zn, %ymm0
2366 * vpmovmskb %ymm0, %eax
2367 * and $0x5555, %eax
2368 * and pg, %eax
2369 *
2370 * or even aarch64, e.g.
2371 *
2372 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2373 * cmeq v0.8h, zn, zm
2374 * and v0.8h, v0.8h, mask
2375 * addv h0, v0.8h
2376 * and v0.8b, pg
2377 *
2378 * However, coming up with an abstraction that allows vector inputs and
2379 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2380 * scalar outputs, is tricky.
2381 */
2382#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2383uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2384{ \
2385 intptr_t opr_sz = simd_oprsz(desc); \
2386 uint32_t flags = PREDTEST_INIT; \
2387 intptr_t i = opr_sz; \
2388 do { \
2389 uint64_t out = 0, pg; \
2390 do { \
2391 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2392 TYPE nn = *(TYPE *)(vn + H(i)); \
2393 TYPE mm = *(TYPE *)(vm + H(i)); \
2394 out |= nn OP mm; \
2395 } while (i & 63); \
2396 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2397 out &= pg; \
2398 *(uint64_t *)(vd + (i >> 3)) = out; \
2399 flags = iter_predtest_bwd(out, pg, flags); \
2400 } while (i > 0); \
2401 return flags; \
2402}
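/* Illustrative note on the layout produced above: OUT gathers one result
 * bit per element, shifted left by sizeof(TYPE) each step, so for e.g.
 * TYPE == uint32_t the results land in bits 0, 4, 8, ... of each 64-bit
 * predicate word -- the usual one-bit-per-byte encoding, with an element's
 * flag in the bit for its first byte, as the masks below then select.
 */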
2403
2404#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2405 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2406#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2407 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2408#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2409 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2410#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2411 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2412
2413DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2414DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2415DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2416DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2417
2418DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2419DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2420DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2421DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2422
2423DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2424DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2425DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2426DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2427
2428DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2429DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2430DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2431DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2432
2433DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2434DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2435DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2436DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2437
2438DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2439DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2440DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2441DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2442
2443#undef DO_CMP_PPZZ_B
2444#undef DO_CMP_PPZZ_H
2445#undef DO_CMP_PPZZ_S
2446#undef DO_CMP_PPZZ_D
2447#undef DO_CMP_PPZZ
2448
2449/* Similar, but the second source is "wide". */
2450#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2451uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2452{ \
2453 intptr_t opr_sz = simd_oprsz(desc); \
2454 uint32_t flags = PREDTEST_INIT; \
2455 intptr_t i = opr_sz; \
2456 do { \
2457 uint64_t out = 0, pg; \
2458 do { \
2459 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2460 do { \
2461 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2462 TYPE nn = *(TYPE *)(vn + H(i)); \
2463 out |= nn OP mm; \
2464 } while (i & 7); \
2465 } while (i & 63); \
2466 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2467 out &= pg; \
2468 *(uint64_t *)(vd + (i >> 3)) = out; \
2469 flags = iter_predtest_bwd(out, pg, flags); \
2470 } while (i > 0); \
2471 return flags; \
2472}
2473
2474#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2475 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2476#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2477 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2478#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2479 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2480
2481DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2482DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2483DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2484
2485DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2486DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2487DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2488
2489DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2490DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2491DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2492
2493DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2494DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2495DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2496
2497DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2498DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2499DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2500
2501DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2502DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2503DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2504
2505DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2506DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2507DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2508
2509DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2510DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2511DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2512
2513DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2514DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2515DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2516
2517DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2518DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2519DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2520
2521#undef DO_CMP_PPZW_B
2522#undef DO_CMP_PPZW_H
2523#undef DO_CMP_PPZW_S
2524#undef DO_CMP_PPZW
2525
2526/* Similar, but the second source is immediate. */
2527#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2528uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2529{ \
2530 intptr_t opr_sz = simd_oprsz(desc); \
2531 uint32_t flags = PREDTEST_INIT; \
2532 TYPE mm = simd_data(desc); \
2533 intptr_t i = opr_sz; \
2534 do { \
2535 uint64_t out = 0, pg; \
2536 do { \
2537 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2538 TYPE nn = *(TYPE *)(vn + H(i)); \
2539 out |= nn OP mm; \
2540 } while (i & 63); \
2541 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2542 out &= pg; \
2543 *(uint64_t *)(vd + (i >> 3)) = out; \
2544 flags = iter_predtest_bwd(out, pg, flags); \
2545 } while (i > 0); \
2546 return flags; \
2547}
2548
2549#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2550 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2551#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2552 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2553#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2554 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2555#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2556 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2557
2558DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2559DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2560DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2561DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2562
2563DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2564DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2565DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2566DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2567
2568DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2569DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2570DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2571DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2572
2573DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2574DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2575DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2576DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2577
2578DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2579DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2580DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2581DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2582
2583DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2584DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2585DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2586DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2587
2588DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2589DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2590DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2591DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2592
2593DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2594DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2595DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2596DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2597
2598DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2599DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2600DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2601DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2602
2603DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2604DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2605DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2606DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2607
2608#undef DO_CMP_PPZI_B
2609#undef DO_CMP_PPZI_H
2610#undef DO_CMP_PPZI_S
2611#undef DO_CMP_PPZI_D
2612#undef DO_CMP_PPZI
2613
2614/* Similar to the ARM LastActive pseudocode function. */
2615static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2616{
2617 intptr_t i;
2618
2619 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2620 uint64_t pg = *(uint64_t *)(vg + i);
2621 if (pg) {
2622 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2623 }
2624 }
2625 return 0;
2626}
2627
2628/* Compute a mask into RETB that is true for all G, up to and including
2629 * (if after) or excluding (if !after) the first G & N.
2630 * Return true if BRK found.
2631 */
2632static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2633 bool brk, bool after)
2634{
2635 uint64_t b;
2636
2637 if (brk) {
2638 b = 0;
2639 } else if ((g & n) == 0) {
2640 /* For all G, no N are set; break not found. */
2641 b = g;
2642 } else {
2643 /* Break somewhere in N. Locate it. */
2644 b = g & n; /* guard true, pred true */
2645 b = b & -b; /* first such */
2646 if (after) {
2647 b = b | (b - 1); /* break after same */
2648 } else {
2649 b = b - 1; /* break before same */
2650 }
2651 brk = true;
2652 }
2653
2654 *retb = b;
2655 return brk;
2656}
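/* Worked example (illustrative): g == 0xff, n == 0x10, brk == false.
 * The first active-and-true bit is bit 4, so b == g & n == 0x10, then
 *
 *     after == true   ->  *retb == 0x1f   (break after; bit 4 included)
 *     after == false  ->  *retb == 0x0f   (break before; bit 4 excluded)
 *
 * and true is returned so that subsequent words are fully broken.
 */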
2657
2658/* Compute a zeroing BRK. */
2659static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2660 intptr_t oprsz, bool after)
2661{
2662 bool brk = false;
2663 intptr_t i;
2664
2665 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2666 uint64_t this_b, this_g = g[i];
2667
2668 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2669 d[i] = this_b & this_g;
2670 }
2671}
2672
2673/* Likewise, but also compute flags. */
2674static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2675 intptr_t oprsz, bool after)
2676{
2677 uint32_t flags = PREDTEST_INIT;
2678 bool brk = false;
2679 intptr_t i;
2680
2681 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2682 uint64_t this_b, this_d, this_g = g[i];
2683
2684 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2685 d[i] = this_d = this_b & this_g;
2686 flags = iter_predtest_fwd(this_d, this_g, flags);
2687 }
2688 return flags;
2689}
2690
2691/* Compute a merging BRK. */
2692static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2693 intptr_t oprsz, bool after)
2694{
2695 bool brk = false;
2696 intptr_t i;
2697
2698 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2699 uint64_t this_b, this_g = g[i];
2700
2701 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2702 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2703 }
2704}
2705
2706/* Likewise, but also compute flags. */
2707static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2708 intptr_t oprsz, bool after)
2709{
2710 uint32_t flags = PREDTEST_INIT;
2711 bool brk = false;
2712 intptr_t i;
2713
2714 for (i = 0; i < oprsz / 8; ++i) {
2715 uint64_t this_b, this_d = d[i], this_g = g[i];
2716
2717 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2718 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2719 flags = iter_predtest_fwd(this_d, this_g, flags);
2720 }
2721 return flags;
2722}
2723
2724static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2725{
2726 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2727 * The compiler should turn this into 4 64-bit integer stores.
2728 */
2729 memset(d, 0, sizeof(ARMPredicateReg));
2730 return PREDTEST_INIT;
2731}
2732
2733void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2734 uint32_t pred_desc)
2735{
2736 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2737 if (last_active_pred(vn, vg, oprsz)) {
2738 compute_brk_z(vd, vm, vg, oprsz, true);
2739 } else {
2740 do_zero(vd, oprsz);
2741 }
2742}
2743
2744uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2745 uint32_t pred_desc)
2746{
2747 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2748 if (last_active_pred(vn, vg, oprsz)) {
2749 return compute_brks_z(vd, vm, vg, oprsz, true);
2750 } else {
2751 return do_zero(vd, oprsz);
2752 }
2753}
2754
2755void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2756 uint32_t pred_desc)
2757{
2758 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2759 if (last_active_pred(vn, vg, oprsz)) {
2760 compute_brk_z(vd, vm, vg, oprsz, false);
2761 } else {
2762 do_zero(vd, oprsz);
2763 }
2764}
2765
2766uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2767 uint32_t pred_desc)
2768{
2769 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2770 if (last_active_pred(vn, vg, oprsz)) {
2771 return compute_brks_z(vd, vm, vg, oprsz, false);
2772 } else {
2773 return do_zero(vd, oprsz);
2774 }
2775}
2776
2777void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2778{
2779 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2780 compute_brk_z(vd, vn, vg, oprsz, true);
2781}
2782
2783uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2784{
2785 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2786 return compute_brks_z(vd, vn, vg, oprsz, true);
2787}
2788
2789void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2790{
2791 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2792 compute_brk_z(vd, vn, vg, oprsz, false);
2793}
2794
2795uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2796{
2797 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2798 return compute_brks_z(vd, vn, vg, oprsz, false);
2799}
2800
2801void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2802{
2803 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2804 compute_brk_m(vd, vn, vg, oprsz, true);
2805}
2806
2807uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2808{
2809 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2810 return compute_brks_m(vd, vn, vg, oprsz, true);
2811}
2812
2813void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2814{
2815 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2816 compute_brk_m(vd, vn, vg, oprsz, false);
2817}
2818
2819uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2820{
2821 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2822 return compute_brks_m(vd, vn, vg, oprsz, false);
2823}
2824
2825void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2826{
2827 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2828
2829 if (!last_active_pred(vn, vg, oprsz)) {
2830 do_zero(vd, oprsz);
2831 }
2832}
2833
2834/* As if PredTest(Ones(PL), D, esz). */
2835static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2836 uint64_t esz_mask)
2837{
2838 uint32_t flags = PREDTEST_INIT;
2839 intptr_t i;
2840
2841 for (i = 0; i < oprsz / 8; i++) {
2842 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2843 }
2844 if (oprsz & 7) {
2845 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2846 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2847 }
2848 return flags;
2849}
2850
2851uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2852{
2853 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2854
2855 if (last_active_pred(vn, vg, oprsz)) {
2856 return predtest_ones(vd, oprsz, -1);
2857 } else {
2858 return do_zero(vd, oprsz);
2859 }
2860}
2861
2862uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2863{
2864 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2865 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2866 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2867 intptr_t i;
2868
2869 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2870 uint64_t t = n[i] & g[i] & mask;
2871 sum += ctpop64(t);
2872 }
2873 return sum;
2874}
2875
2876uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2877{
2878 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2879 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2880 uint64_t esz_mask = pred_esz_masks[esz];
2881 ARMPredicateReg *d = vd;
2882 uint32_t flags;
2883 intptr_t i;
2884
2885 /* Begin with a zero predicate register. */
2886 flags = do_zero(d, oprsz);
2887 if (count == 0) {
2888 return flags;
2889 }
2890
2891 /* Set all of the requested bits. */
2892 for (i = 0; i < count / 64; ++i) {
2893 d->p[i] = esz_mask;
2894 }
2895 if (count & 63) {
2896 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2897 }
2898
2899 return predtest_ones(d, oprsz, esz_mask);
2900}
2901
2902/* Recursive reduction using a binary function;
2903 * cf. the ARM ARM function ReducePredicated.
2904 *
2905 * While it would be possible to write this without the DATA temporary,
2906 * it is much simpler to process the predicate register this way.
2907 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2908 * little to gain with a more complex non-recursive form.
2909 */
2910#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2911static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2912{ \
2913 if (n == 1) { \
2914 return *data; \
2915 } else { \
2916 uintptr_t half = n / 2; \
2917 TYPE lo = NAME##_reduce(data, status, half); \
2918 TYPE hi = NAME##_reduce(data + half, status, half); \
2919 return TYPE##_##FUNC(lo, hi, status); \
2920 } \
2921} \
2922uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2923{ \
2924 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2925 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2926 for (i = 0; i < oprsz; ) { \
2927 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2928 do { \
2929 TYPE nn = *(TYPE *)(vn + H(i)); \
2930 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2931 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2932 } while (i & 15); \
2933 } \
2934 for (; i < maxsz; i += sizeof(TYPE)) { \
2935 *(TYPE *)((void *)data + i) = IDENT; \
2936 } \
2937 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2938}
2939
2940DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2941DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2942DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2943
2944/* Identity is floatN_default_nan, without the function call. */
2945DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2946DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2947DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2948
2949DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2950DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2951DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2952
2953DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2954DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2955DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2956
2957DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2958DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2959DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2960
2961#undef DO_REDUCE
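/* Illustrative sketch of the reduction order: for four elements d0..d3
 * the recursion above evaluates
 *
 *     FUNC(FUNC(d0, d1), FUNC(d2, d3))
 *
 * after inactive lanes, and the tail up to maxsz, have been replaced by
 * IDENT; the depth bound of 7 quoted above assumes the caller pads maxsz
 * up to a power of two.
 */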
2962
2963uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2964 void *status, uint32_t desc)
2965{
2966 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2967 float16 result = nn;
2968
2969 do {
2970 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2971 do {
2972 if (pg & 1) {
2973 float16 mm = *(float16 *)(vm + H1_2(i));
2974 result = float16_add(result, mm, status);
2975 }
2976 i += sizeof(float16), pg >>= sizeof(float16);
2977 } while (i & 15);
2978 } while (i < opr_sz);
2979
2980 return result;
2981}
2982
2983uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2984 void *status, uint32_t desc)
2985{
2986 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2987 float32 result = nn;
2988
2989 do {
2990 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2991 do {
2992 if (pg & 1) {
2993 float32 mm = *(float32 *)(vm + H1_2(i));
2994 result = float32_add(result, mm, status);
2995 }
2996 i += sizeof(float32), pg >>= sizeof(float32);
2997 } while (i & 15);
2998 } while (i < opr_sz);
2999
3000 return result;
3001}
3002
3003uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3004 void *status, uint32_t desc)
3005{
3006 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3007 uint64_t *m = vm;
3008 uint8_t *pg = vg;
3009
3010 for (i = 0; i < opr_sz; i++) {
3011 if (pg[H1(i)] & 1) {
3012 nn = float64_add(nn, m[i], status);
3013 }
3014 }
3015
3016 return nn;
3017}
3018
3019/* Fully general three-operand expander, controlled by a predicate,
3020 * with the extra float_status parameter.
3021 */
3022#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3023void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3024 void *status, uint32_t desc) \
3025{ \
3026 intptr_t i = simd_oprsz(desc); \
3027 uint64_t *g = vg; \
3028 do { \
3029 uint64_t pg = g[(i - 1) >> 6]; \
3030 do { \
3031 i -= sizeof(TYPE); \
3032 if (likely((pg >> (i & 63)) & 1)) { \
3033 TYPE nn = *(TYPE *)(vn + H(i)); \
3034 TYPE mm = *(TYPE *)(vm + H(i)); \
3035 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3036 } \
3037 } while (i & 63); \
3038 } while (i != 0); \
3039}
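/* Illustrative note on the predicate test above: predicates hold one bit
 * per vector byte and an element is active when the bit for its first
 * byte is set.  Since I counts bytes, (pg >> (i & 63)) & 1 tests exactly
 * that bit; for a 4-byte TYPE the bits consulted are 0, 4, 8, ... within
 * each 64-bit predicate word.
 */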
3040
3041DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3042DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3043DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3044
3045DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3046DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3047DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3048
3049DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3050DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3051DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3052
3053DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3054DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3055DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3056
3057DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3058DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3059DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3060
3061DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3062DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3063DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3064
3065DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3066DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3067DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3068
3069DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3070DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3071DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3072
3073static inline float16 abd_h(float16 a, float16 b, float_status *s)
3074{
3075 return float16_abs(float16_sub(a, b, s));
3076}
3077
3078static inline float32 abd_s(float32 a, float32 b, float_status *s)
3079{
3080 return float32_abs(float32_sub(a, b, s));
3081}
3082
3083static inline float64 abd_d(float64 a, float64 b, float_status *s)
3084{
3085 return float64_abs(float64_sub(a, b, s));
3086}
3087
3088DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3089DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3090DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3091
3092static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3093{
3094 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3095 return float64_scalbn(a, b_int, s);
3096}
3097
3098DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3099DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3100DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3101
3102DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3103DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3104DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3105
3106#undef DO_ZPZZ_FP
3107
3108/* Three-operand expander, with one scalar operand, controlled by
3109 * a predicate, with the extra float_status parameter.
3110 */
3111#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3112void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3113 void *status, uint32_t desc) \
3114{ \
3115 intptr_t i = simd_oprsz(desc); \
3116 uint64_t *g = vg; \
3117 TYPE mm = scalar; \
3118 do { \
3119 uint64_t pg = g[(i - 1) >> 6]; \
3120 do { \
3121 i -= sizeof(TYPE); \
3122 if (likely((pg >> (i & 63)) & 1)) { \
3123 TYPE nn = *(TYPE *)(vn + H(i)); \
3124 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3125 } \
3126 } while (i & 63); \
3127 } while (i != 0); \
3128}
3129
3130DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3131DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3132DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3133
3134DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3135DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3136DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3137
3138DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3139DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3140DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3141
3142static inline float16 subr_h(float16 a, float16 b, float_status *s)
3143{
3144 return float16_sub(b, a, s);
3145}
3146
3147static inline float32 subr_s(float32 a, float32 b, float_status *s)
3148{
3149 return float32_sub(b, a, s);
3150}
3151
3152static inline float64 subr_d(float64 a, float64 b, float_status *s)
3153{
3154 return float64_sub(b, a, s);
3155}
3156
3157DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3158DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3159DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3160
3161DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3162DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3163DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3164
3165DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3166DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3167DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3168
3169DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3170DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3171DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3172
3173DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3174DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3175DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3176
3177/* Fully general two-operand expander, controlled by a predicate,
3178 * with the extra float_status parameter.
3179 */
3180#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3181void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3182{ \
3183 intptr_t i = simd_oprsz(desc); \
3184 uint64_t *g = vg; \
3185 do { \
3186 uint64_t pg = g[(i - 1) >> 6]; \
3187 do { \
3188 i -= sizeof(TYPE); \
3189 if (likely((pg >> (i & 63)) & 1)) { \
3190 TYPE nn = *(TYPE *)(vn + H(i)); \
3191 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3192 } \
3193 } while (i & 63); \
3194 } while (i != 0); \
3195}
3196
3197/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3198 * FZ16. When converting from fp16, this affects flushing input denormals;
3199 * when converting to fp16, this affects flushing output denormals.
3200 */
3201static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3202{
3203 flag save = get_flush_inputs_to_zero(fpst);
3204 float32 ret;
3205
3206 set_flush_inputs_to_zero(false, fpst);
3207 ret = float16_to_float32(f, true, fpst);
3208 set_flush_inputs_to_zero(save, fpst);
3209 return ret;
3210}
3211
3212static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3213{
3214 flag save = get_flush_inputs_to_zero(fpst);
3215 float64 ret;
3216
3217 set_flush_inputs_to_zero(false, fpst);
3218 ret = float16_to_float64(f, true, fpst);
3219 set_flush_inputs_to_zero(save, fpst);
3220 return ret;
3221}
3222
3223static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3224{
3225 flag save = get_flush_to_zero(fpst);
3226 float16 ret;
3227
3228 set_flush_to_zero(false, fpst);
3229 ret = float32_to_float16(f, true, fpst);
3230 set_flush_to_zero(save, fpst);
3231 return ret;
3232}
3233
3234static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3235{
3236 flag save = get_flush_to_zero(fpst);
3237 float16 ret;
3238
3239 set_flush_to_zero(false, fpst);
3240 ret = float64_to_float16(f, true, fpst);
3241 set_flush_to_zero(save, fpst);
3242 return ret;
3243}
3244
3245static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3246{
3247 if (float16_is_any_nan(f)) {
3248 float_raise(float_flag_invalid, s);
3249 return 0;
3250 }
3251 return float16_to_int16_round_to_zero(f, s);
3252}
3253
3254static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3255{
3256 if (float16_is_any_nan(f)) {
3257 float_raise(float_flag_invalid, s);
3258 return 0;
3259 }
3260 return float16_to_int64_round_to_zero(f, s);
3261}
3262
3263static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3264{
3265 if (float32_is_any_nan(f)) {
3266 float_raise(float_flag_invalid, s);
3267 return 0;
3268 }
3269 return float32_to_int64_round_to_zero(f, s);
3270}
3271
3272static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3273{
3274 if (float64_is_any_nan(f)) {
3275 float_raise(float_flag_invalid, s);
3276 return 0;
3277 }
3278 return float64_to_int64_round_to_zero(f, s);
3279}
3280
3281static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3282{
3283 if (float16_is_any_nan(f)) {
3284 float_raise(float_flag_invalid, s);
3285 return 0;
3286 }
3287 return float16_to_uint16_round_to_zero(f, s);
3288}
3289
3290static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3291{
3292 if (float16_is_any_nan(f)) {
3293 float_raise(float_flag_invalid, s);
3294 return 0;
3295 }
3296 return float16_to_uint64_round_to_zero(f, s);
3297}
3298
3299static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3300{
3301 if (float32_is_any_nan(f)) {
3302 float_raise(float_flag_invalid, s);
3303 return 0;
3304 }
3305 return float32_to_uint64_round_to_zero(f, s);
3306}
3307
3308static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3309{
3310 if (float64_is_any_nan(f)) {
3311 float_raise(float_flag_invalid, s);
3312 return 0;
3313 }
3314 return float64_to_uint64_round_to_zero(f, s);
3315}
3316
3317DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3318DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3319DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3320DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3321DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3322DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3323
3324DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3325DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3326DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3327DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3328DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3329DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3330DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3331
3332DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3333DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3334DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3335DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3336DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3337DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3338DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3339
3340DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3341DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3342DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3343
3344DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3345DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3346DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3347
3348DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3349DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3350DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3351
3352DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3353DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3354DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3355
3356DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3357DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3358DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3359DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3360DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3361DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3362DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3363
3364DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3365DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3366DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3367DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3368DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3369DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3370DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3371
3372#undef DO_ZPZ_FP
3373
3374/* 4-operand predicated multiply-add. This requires 7 operands to pass
3375 * "properly", so we need to encode some of the registers into DESC.
3376 */
3377QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
3378
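/* Sketch of the DESC packing consumed below, matching the extract32 calls:
 * starting at bit SIMD_DATA_SHIFT the data field holds four 5-bit register
 * numbers,
 *
 *     [4:0] rd   [9:5] rn   [14:10] rm   [19:15] ra
 *
 * which is why the build-time assertion above requires
 * SIMD_DATA_SHIFT + 20 <= 32.
 */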
3379static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3380 uint16_t neg1, uint16_t neg3)
3381{
3382 intptr_t i = simd_oprsz(desc);
3383 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3384 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3385 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3386 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3387 void *vd = &env->vfp.zregs[rd];
3388 void *vn = &env->vfp.zregs[rn];
3389 void *vm = &env->vfp.zregs[rm];
3390 void *va = &env->vfp.zregs[ra];
3391 uint64_t *g = vg;
3392
3393 do {
3394 uint64_t pg = g[(i - 1) >> 6];
3395 do {
3396 i -= 2;
3397 if (likely((pg >> (i & 63)) & 1)) {
3398 float16 e1, e2, e3, r;
3399
3400 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3401 e2 = *(uint16_t *)(vm + H1_2(i));
3402 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3403                r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3404 *(uint16_t *)(vd + H1_2(i)) = r;
3405 }
3406 } while (i & 63);
3407 } while (i != 0);
3408}
3409
3410void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3411{
3412 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3413}
3414
3415void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3416{
3417 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3418}
3419
3420void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3421{
3422 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3423}
3424
3425void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3426{
3427 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3428}
3429
3430static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3431 uint32_t neg1, uint32_t neg3)
3432{
3433 intptr_t i = simd_oprsz(desc);
3434 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3435 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3436 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3437 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3438 void *vd = &env->vfp.zregs[rd];
3439 void *vn = &env->vfp.zregs[rn];
3440 void *vm = &env->vfp.zregs[rm];
3441 void *va = &env->vfp.zregs[ra];
3442 uint64_t *g = vg;
3443
3444 do {
3445 uint64_t pg = g[(i - 1) >> 6];
3446 do {
3447 i -= 4;
3448 if (likely((pg >> (i & 63)) & 1)) {
3449 float32 e1, e2, e3, r;
3450
3451 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3452 e2 = *(uint32_t *)(vm + H1_4(i));
3453 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3454 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3455 *(uint32_t *)(vd + H1_4(i)) = r;
3456 }
3457 } while (i & 63);
3458 } while (i != 0);
3459}
3460
3461void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3462{
3463 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3464}
3465
3466void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3467{
3468 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3469}
3470
3471void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3472{
3473 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3474}
3475
3476void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3477{
3478 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3479}
3480
3481static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3482 uint64_t neg1, uint64_t neg3)
3483{
3484 intptr_t i = simd_oprsz(desc);
3485 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3486 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3487 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3488 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3489 void *vd = &env->vfp.zregs[rd];
3490 void *vn = &env->vfp.zregs[rn];
3491 void *vm = &env->vfp.zregs[rm];
3492 void *va = &env->vfp.zregs[ra];
3493 uint64_t *g = vg;
3494
3495 do {
3496 uint64_t pg = g[(i - 1) >> 6];
3497 do {
3498 i -= 8;
3499 if (likely((pg >> (i & 63)) & 1)) {
3500 float64 e1, e2, e3, r;
3501
3502 e1 = *(uint64_t *)(vn + i) ^ neg1;
3503 e2 = *(uint64_t *)(vm + i);
3504 e3 = *(uint64_t *)(va + i) ^ neg3;
3505 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3506 *(uint64_t *)(vd + i) = r;
3507 }
3508 } while (i & 63);
3509 } while (i != 0);
3510}
3511
3512void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3513{
3514 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3515}
3516
3517void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3518{
3519 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3520}
3521
3522void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3523{
3524 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3525}
3526
3527void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3528{
3529 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3530}
3531
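/*
 * Summary of the sign trick above: neg1 and neg3 are XOR masks applied to
 * the sign bit of the first multiplicand and of the addend, so for each
 * active element the four flavours compute
 *   FMLA:   (n * m) + a      neg1 = 0,         neg3 = 0
 *   FMLS:  (-n * m) + a      neg1 = sign bit,  neg3 = 0
 *   FNMLA: (-n * m) - a      neg1 = sign bit,  neg3 = sign bit
 *   FNMLS:  (n * m) - a      neg1 = 0,         neg3 = sign bit
 */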
abfdefd5
RH
3532/* Two operand floating-point comparison controlled by a predicate.
3533 * Unlike the integer version, we are not allowed to optimistically
3534 * compare operands, since the comparison may have side effects wrt
3535 * the FPSR.
3536 */
3537#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3538void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3539 void *status, uint32_t desc) \
3540{ \
3541 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3542 uint64_t *d = vd, *g = vg; \
3543 do { \
3544 uint64_t out = 0, pg = g[j]; \
3545 do { \
3546 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3547 if (likely((pg >> (i & 63)) & 1)) { \
3548 TYPE nn = *(TYPE *)(vn + H(i)); \
3549 TYPE mm = *(TYPE *)(vm + H(i)); \
3550 out |= OP(TYPE, nn, mm, status); \
3551 } \
3552 } while (i & 63); \
3553 d[j--] = out; \
3554 } while (i > 0); \
3555}
3556
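/*
 * For example, DO_FPCMP_PPZZ_S(sve_fcmge, DO_FCMGE) below generates
 * HELPER(sve_fcmge_s), which walks the vector from the top down, 64
 * predicate bits at a time.  Each active 32-bit element contributes a
 * single result bit at the lowest predicate-bit position of its group;
 * inactive elements yield a false result bit.
 */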
3557#define DO_FPCMP_PPZZ_H(NAME, OP) \
3558 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3559#define DO_FPCMP_PPZZ_S(NAME, OP) \
3560 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3561#define DO_FPCMP_PPZZ_D(NAME, OP) \
3562 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3563
3564#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3565 DO_FPCMP_PPZZ_H(NAME, OP) \
3566 DO_FPCMP_PPZZ_S(NAME, OP) \
3567 DO_FPCMP_PPZZ_D(NAME, OP)
3568
3569#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3570#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4d2e2a03
RH
3571#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3572#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
abfdefd5
RH
3573#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3574#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3575#define DO_FCMUO(TYPE, X, Y, ST) \
3576 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3577#define DO_FACGE(TYPE, X, Y, ST) \
3578 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3579#define DO_FACGT(TYPE, X, Y, ST) \
3580 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3581
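/*
 * Note the operand swap in the GE/GT/ACGE/ACGT forms: X >= Y is computed
 * as compare(Y, X) <= 0.  The ordered comparisons use the signalling
 * compare while EQ/NE/UO use the quiet compare, so that quiet NaN inputs
 * raise Invalid Operation only where the architecture requires it.
 */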
3582DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3583DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3584DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3585DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3586DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3587DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3588DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3589
3590#undef DO_FPCMP_PPZZ_ALL
3591#undef DO_FPCMP_PPZZ_D
3592#undef DO_FPCMP_PPZZ_S
3593#undef DO_FPCMP_PPZZ_H
3594#undef DO_FPCMP_PPZZ
3595
4d2e2a03
RH
3596/* One operand floating-point comparison against zero, controlled
3597 * by a predicate.
3598 */
3599#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3600void HELPER(NAME)(void *vd, void *vn, void *vg, \
3601 void *status, uint32_t desc) \
3602{ \
3603 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3604 uint64_t *d = vd, *g = vg; \
3605 do { \
3606 uint64_t out = 0, pg = g[j]; \
3607 do { \
3608 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3609 if ((pg >> (i & 63)) & 1) { \
3610 TYPE nn = *(TYPE *)(vn + H(i)); \
3611 out |= OP(TYPE, nn, 0, status); \
3612 } \
3613 } while (i & 63); \
3614 d[j--] = out; \
3615 } while (i > 0); \
3616}
3617
3618#define DO_FPCMP_PPZ0_H(NAME, OP) \
3619 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3620#define DO_FPCMP_PPZ0_S(NAME, OP) \
3621 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3622#define DO_FPCMP_PPZ0_D(NAME, OP) \
3623 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3624
3625#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3626 DO_FPCMP_PPZ0_H(NAME, OP) \
3627 DO_FPCMP_PPZ0_S(NAME, OP) \
3628 DO_FPCMP_PPZ0_D(NAME, OP)
3629
3630DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3631DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3632DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3633DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3634DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3635DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3636
67fcd9ad
RH
3637/* FP Trig Multiply-Add. */
3638
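/*
 * The coefficient tables below each pack two polynomials: entries 0-7 are
 * the series coefficients used for sine (1, -1/6, 1/120, ...), entries
 * 8-15 those used for cosine (1, -1/2, 1/24, ...).  A negative
 * multiplicand selects the second half by adding 8 to the index, with the
 * sign itself then discarded via the abs().
 */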
3639void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3640{
3641 static const float16 coeff[16] = {
3642 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3643 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3644 };
3645 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3646 intptr_t x = simd_data(desc);
3647 float16 *d = vd, *n = vn, *m = vm;
3648 for (i = 0; i < opr_sz; i++) {
3649 float16 mm = m[i];
3650 intptr_t xx = x;
3651 if (float16_is_neg(mm)) {
3652 mm = float16_abs(mm);
3653 xx += 8;
3654 }
3655 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3656 }
3657}
3658
3659void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3660{
3661 static const float32 coeff[16] = {
3662 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3663 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3664 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3665 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3666 };
3667 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3668 intptr_t x = simd_data(desc);
3669 float32 *d = vd, *n = vn, *m = vm;
3670 for (i = 0; i < opr_sz; i++) {
3671 float32 mm = m[i];
3672 intptr_t xx = x;
3673 if (float32_is_neg(mm)) {
3674 mm = float32_abs(mm);
3675 xx += 8;
3676 }
3677 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3678 }
3679}
3680
3681void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3682{
3683 static const float64 coeff[16] = {
3684 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3685 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3686 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3687 0x3de5d8408868552full, 0x0000000000000000ull,
3688 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3689 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3690 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3691 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3692 };
3693 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3694 intptr_t x = simd_data(desc);
3695 float64 *d = vd, *n = vn, *m = vm;
3696 for (i = 0; i < opr_sz; i++) {
3697 float64 mm = m[i];
3698 intptr_t xx = x;
3699 if (float64_is_neg(mm)) {
3700 mm = float64_abs(mm);
3701 xx += 8;
3702 }
3703 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3704 }
3705}
3706
76a9d9cd
RH
3707/*
3708 * FP Complex Add
3709 */
3710
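/*
 * simd_data(desc) selects the rotation by choosing which cross term has
 * its sign flipped: with simd_data == 0 the helpers compute
 * (n_r - m_i, n_i + m_r), i.e. the second operand rotated by 90 degrees;
 * with simd_data != 0 they compute (n_r + m_i, n_i - m_r), the 270-degree
 * rotation.  neg_real/neg_imag are the corresponding sign-bit XOR masks.
 */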
3711void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3712 void *vs, uint32_t desc)
3713{
3714 intptr_t j, i = simd_oprsz(desc);
3715 uint64_t *g = vg;
3716 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3717 float16 neg_real = float16_chs(neg_imag);
3718
3719 do {
3720 uint64_t pg = g[(i - 1) >> 6];
3721 do {
3722 float16 e0, e1, e2, e3;
3723
3724 /* I holds the real index; J holds the imag index. */
3725 j = i - sizeof(float16);
3726 i -= 2 * sizeof(float16);
3727
3728 e0 = *(float16 *)(vn + H1_2(i));
3729 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3730 e2 = *(float16 *)(vn + H1_2(j));
3731 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3732
3733 if (likely((pg >> (i & 63)) & 1)) {
3734 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3735 }
3736 if (likely((pg >> (j & 63)) & 1)) {
3737 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3738 }
3739 } while (i & 63);
3740 } while (i != 0);
3741}
3742
3743void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3744 void *vs, uint32_t desc)
3745{
3746 intptr_t j, i = simd_oprsz(desc);
3747 uint64_t *g = vg;
3748 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3749 float32 neg_real = float32_chs(neg_imag);
3750
3751 do {
3752 uint64_t pg = g[(i - 1) >> 6];
3753 do {
3754 float32 e0, e1, e2, e3;
3755
3756 /* I holds the real index; J holds the imag index. */
3757 j = i - sizeof(float32);
3758 i -= 2 * sizeof(float32);
3759
3760 e0 = *(float32 *)(vn + H1_2(i));
3761 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3762 e2 = *(float32 *)(vn + H1_2(j));
3763 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3764
3765 if (likely((pg >> (i & 63)) & 1)) {
3766 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3767 }
3768 if (likely((pg >> (j & 63)) & 1)) {
3769 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3770 }
3771 } while (i & 63);
3772 } while (i != 0);
3773}
3774
3775void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3776 void *vs, uint32_t desc)
3777{
3778 intptr_t j, i = simd_oprsz(desc);
3779 uint64_t *g = vg;
3780 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3781 float64 neg_real = float64_chs(neg_imag);
3782
3783 do {
3784 uint64_t pg = g[(i - 1) >> 6];
3785 do {
3786 float64 e0, e1, e2, e3;
3787
3788 /* I holds the real index; J holds the imag index. */
3789 j = i - sizeof(float64);
3790 i -= 2 * sizeof(float64);
3791
3792 e0 = *(float64 *)(vn + H1_2(i));
3793 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3794 e2 = *(float64 *)(vn + H1_2(j));
3795 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3796
3797 if (likely((pg >> (i & 63)) & 1)) {
3798 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3799 }
3800 if (likely((pg >> (j & 63)) & 1)) {
3801 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3802 }
3803 } while (i & 63);
3804 } while (i != 0);
3805}
3806
05f48bab
RH
3807/*
3808 * FP Complex Multiply
3809 */
3810
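/*
 * Each FCMLA performs half of a complex multiply-accumulate; the two-bit
 * rotation in the descriptor selects which half and with which signs.
 * In the helpers below, bit 0 ("flip") chooses whether the real or the
 * imaginary part of Vn feeds both products, while neg_real/neg_imag flip
 * the sign of the Vm factor used for the real and imaginary accumulations.
 */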
3811QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3812
3813void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3814{
3815 intptr_t j, i = simd_oprsz(desc);
3816 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3817 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3818 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3819 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3820 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3821 bool flip = rot & 1;
3822 float16 neg_imag, neg_real;
3823 void *vd = &env->vfp.zregs[rd];
3824 void *vn = &env->vfp.zregs[rn];
3825 void *vm = &env->vfp.zregs[rm];
3826 void *va = &env->vfp.zregs[ra];
3827 uint64_t *g = vg;
3828
3829 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3830 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3831
3832 do {
3833 uint64_t pg = g[(i - 1) >> 6];
3834 do {
3835 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3836
3837 /* I holds the real index; J holds the imag index. */
3838 j = i - sizeof(float16);
3839 i -= 2 * sizeof(float16);
3840
3841 nr = *(float16 *)(vn + H1_2(i));
3842 ni = *(float16 *)(vn + H1_2(j));
3843 mr = *(float16 *)(vm + H1_2(i));
3844 mi = *(float16 *)(vm + H1_2(j));
3845
3846 e2 = (flip ? ni : nr);
3847 e1 = (flip ? mi : mr) ^ neg_real;
3848 e4 = e2;
3849 e3 = (flip ? mr : mi) ^ neg_imag;
3850
3851 if (likely((pg >> (i & 63)) & 1)) {
3852 d = *(float16 *)(va + H1_2(i));
3853 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3854 *(float16 *)(vd + H1_2(i)) = d;
3855 }
3856 if (likely((pg >> (j & 63)) & 1)) {
3857 d = *(float16 *)(va + H1_2(j));
3858 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3859 *(float16 *)(vd + H1_2(j)) = d;
3860 }
3861 } while (i & 63);
3862 } while (i != 0);
3863}
3864
3865void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3866{
3867 intptr_t j, i = simd_oprsz(desc);
3868 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3869 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3870 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3871 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3872 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3873 bool flip = rot & 1;
3874 float32 neg_imag, neg_real;
3875 void *vd = &env->vfp.zregs[rd];
3876 void *vn = &env->vfp.zregs[rn];
3877 void *vm = &env->vfp.zregs[rm];
3878 void *va = &env->vfp.zregs[ra];
3879 uint64_t *g = vg;
3880
3881 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3882 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3883
3884 do {
3885 uint64_t pg = g[(i - 1) >> 6];
3886 do {
3887 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3888
3889 /* I holds the real index; J holds the imag index. */
3890 j = i - sizeof(float32);
3891 i -= 2 * sizeof(float32);
3892
3893 nr = *(float32 *)(vn + H1_2(i));
3894 ni = *(float32 *)(vn + H1_2(j));
3895 mr = *(float32 *)(vm + H1_2(i));
3896 mi = *(float32 *)(vm + H1_2(j));
3897
3898 e2 = (flip ? ni : nr);
3899 e1 = (flip ? mi : mr) ^ neg_real;
3900 e4 = e2;
3901 e3 = (flip ? mr : mi) ^ neg_imag;
3902
3903 if (likely((pg >> (i & 63)) & 1)) {
3904 d = *(float32 *)(va + H1_2(i));
3905 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3906 *(float32 *)(vd + H1_2(i)) = d;
3907 }
3908 if (likely((pg >> (j & 63)) & 1)) {
3909 d = *(float32 *)(va + H1_2(j));
3910 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3911 *(float32 *)(vd + H1_2(j)) = d;
3912 }
3913 } while (i & 63);
3914 } while (i != 0);
3915}
3916
3917void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3918{
3919 intptr_t j, i = simd_oprsz(desc);
3920 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3921 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3922 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3923 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3924 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3925 bool flip = rot & 1;
3926 float64 neg_imag, neg_real;
3927 void *vd = &env->vfp.zregs[rd];
3928 void *vn = &env->vfp.zregs[rn];
3929 void *vm = &env->vfp.zregs[rm];
3930 void *va = &env->vfp.zregs[ra];
3931 uint64_t *g = vg;
3932
3933 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3934 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3935
3936 do {
3937 uint64_t pg = g[(i - 1) >> 6];
3938 do {
3939 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3940
3941 /* I holds the real index; J holds the imag index. */
3942 j = i - sizeof(float64);
3943 i -= 2 * sizeof(float64);
3944
3945 nr = *(float64 *)(vn + H1_2(i));
3946 ni = *(float64 *)(vn + H1_2(j));
3947 mr = *(float64 *)(vm + H1_2(i));
3948 mi = *(float64 *)(vm + H1_2(j));
3949
3950 e2 = (flip ? ni : nr);
3951 e1 = (flip ? mi : mr) ^ neg_real;
3952 e4 = e2;
3953 e3 = (flip ? mr : mi) ^ neg_imag;
3954
3955 if (likely((pg >> (i & 63)) & 1)) {
3956 d = *(float64 *)(va + H1_2(i));
3957 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3958 *(float64 *)(vd + H1_2(i)) = d;
3959 }
3960 if (likely((pg >> (j & 63)) & 1)) {
3961 d = *(float64 *)(va + H1_2(j));
3962 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3963 *(float64 *)(vd + H1_2(j)) = d;
3964 }
3965 } while (i & 63);
3966 } while (i != 0);
3967}
3968
c4e7c493
RH
3969/*
3970 * Load contiguous data, protected by a governing predicate.
3971 */
9123aeb6
RH
3972
3973/*
3974 * Load elements into @vd, controlled by @vg, from @host + @mem_ofs.
3975 * Memory is valid through @host + @mem_max. The register element
3976 * indices are inferred from @mem_ofs, as modified by the types for
3977 * which the helper is built. Return the @mem_ofs of the first element
3978 * not loaded (which is @mem_max if they are all loaded).
3979 *
3980 * For softmmu, we have fully validated the guest page. For user-only,
3981 * we cannot fully validate without taking the mmap lock, but since we
3982 * know the access is within one host page, if any access is valid they
3983 * all must be valid. However, when @vg is all false, it may be that
3984 * no access is valid.
3985 */
3986typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
3987 intptr_t mem_ofs, intptr_t mem_max);
3988
3989/*
3990 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3991 * The controlling predicate is known to be true.
3992 */
3993typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
500d0484 3994 target_ulong vaddr, TCGMemOpIdx oi, uintptr_t ra);
9fd46c83 3995typedef sve_ld1_tlb_fn sve_st1_tlb_fn;
9123aeb6
RH
3996
3997/*
3998 * Generate the above primitives.
3999 */
4000
4001#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4002static intptr_t sve_##NAME##_host(void *vd, void *vg, void *host, \
4003 intptr_t mem_off, const intptr_t mem_max) \
4004{ \
4005 intptr_t reg_off = mem_off * (sizeof(TYPEE) / sizeof(TYPEM)); \
4006 uint64_t *pg = vg; \
4007 while (mem_off + sizeof(TYPEM) <= mem_max) { \
4008 TYPEM val = 0; \
4009 if (likely((pg[reg_off >> 6] >> (reg_off & 63)) & 1)) { \
4010 val = HOST(host + mem_off); \
4011 } \
4012 *(TYPEE *)(vd + H(reg_off)) = val; \
4013 mem_off += sizeof(TYPEM), reg_off += sizeof(TYPEE); \
4014 } \
4015 return mem_off; \
4016}
4017
4018#ifdef CONFIG_SOFTMMU
4019#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4020static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
500d0484 4021 target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
9123aeb6 4022{ \
9123aeb6
RH
4023 TYPEM val = TLB(env, addr, oi, ra); \
4024 *(TYPEE *)(vd + H(reg_off)) = val; \
4025}
4026#else
4027#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4028static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
500d0484 4029 target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
9123aeb6
RH
4030{ \
4031 TYPEM val = HOST(g2h(addr)); \
4032 *(TYPEE *)(vd + H(reg_off)) = val; \
4033}
4034#endif
4035
4036#define DO_LD_PRIM_1(NAME, H, TE, TM) \
4037 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
4038 DO_LD_TLB(NAME, H, TE, TM, ldub_p, 0, helper_ret_ldub_mmu)
4039
4040DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4041DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4042DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4043DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4044DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4045DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4046DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
4047
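/*
 * As an example of the expansion above: DO_LD_PRIM_1(ld1bhu, H1_2,
 * uint16_t, uint8_t) generates sve_ld1bhu_host, which copies a run of
 * bytes and zero-extends each into a 16-bit vector element, and
 * sve_ld1bhu_tlb, which loads the single byte for one element.  The
 * int8_t "bhs" variant gives the corresponding sign-extending forms.
 */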
4048#define DO_LD_PRIM_2(NAME, end, MOEND, H, TE, TM, PH, PT) \
4049 DO_LD_HOST(NAME##_##end, H, TE, TM, PH##_##end##_p) \
4050 DO_LD_TLB(NAME##_##end, H, TE, TM, PH##_##end##_p, \
4051 MOEND, helper_##end##_##PT##_mmu)
4052
4053DO_LD_PRIM_2(ld1hh, le, MO_LE, H1_2, uint16_t, uint16_t, lduw, lduw)
4054DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
4055DO_LD_PRIM_2(ld1hss, le, MO_LE, H1_4, uint32_t, int16_t, lduw, lduw)
4056DO_LD_PRIM_2(ld1hdu, le, MO_LE, , uint64_t, uint16_t, lduw, lduw)
4057DO_LD_PRIM_2(ld1hds, le, MO_LE, , uint64_t, int16_t, lduw, lduw)
4058
4059DO_LD_PRIM_2(ld1ss, le, MO_LE, H1_4, uint32_t, uint32_t, ldl, ldul)
4060DO_LD_PRIM_2(ld1sdu, le, MO_LE, , uint64_t, uint32_t, ldl, ldul)
4061DO_LD_PRIM_2(ld1sds, le, MO_LE, , uint64_t, int32_t, ldl, ldul)
4062
4063DO_LD_PRIM_2(ld1dd, le, MO_LE, , uint64_t, uint64_t, ldq, ldq)
4064
4065DO_LD_PRIM_2(ld1hh, be, MO_BE, H1_2, uint16_t, uint16_t, lduw, lduw)
4066DO_LD_PRIM_2(ld1hsu, be, MO_BE, H1_4, uint32_t, uint16_t, lduw, lduw)
4067DO_LD_PRIM_2(ld1hss, be, MO_BE, H1_4, uint32_t, int16_t, lduw, lduw)
4068DO_LD_PRIM_2(ld1hdu, be, MO_BE, , uint64_t, uint16_t, lduw, lduw)
4069DO_LD_PRIM_2(ld1hds, be, MO_BE, , uint64_t, int16_t, lduw, lduw)
4070
4071DO_LD_PRIM_2(ld1ss, be, MO_BE, H1_4, uint32_t, uint32_t, ldl, ldul)
4072DO_LD_PRIM_2(ld1sdu, be, MO_BE, , uint64_t, uint32_t, ldl, ldul)
4073DO_LD_PRIM_2(ld1sds, be, MO_BE, , uint64_t, int32_t, ldl, ldul)
4074
4075DO_LD_PRIM_2(ld1dd, be, MO_BE, , uint64_t, uint64_t, ldq, ldq)
4076
4077#undef DO_LD_TLB
4078#undef DO_LD_HOST
4079#undef DO_LD_PRIM_1
4080#undef DO_LD_PRIM_2
4081
4082/*
4083 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4084 * beginning at @reg_off, bounded by @reg_max. Return the offset of the first
4085 * active element >= @reg_off, or @reg_max if there were no active elements.
4086 */
4087static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4088 intptr_t reg_max, int esz)
4089{
4090 uint64_t pg_mask = pred_esz_masks[esz];
4091 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4092
4093 /* In normal usage, the first element is active. */
4094 if (likely(pg & 1)) {
4095 return reg_off;
4096 }
4097
4098 if (pg == 0) {
4099 reg_off &= -64;
4100 do {
4101 reg_off += 64;
4102 if (unlikely(reg_off >= reg_max)) {
4103 /* The entire predicate was false. */
4104 return reg_max;
4105 }
4106 pg = vg[reg_off >> 6] & pg_mask;
4107 } while (pg == 0);
4108 }
4109 reg_off += ctz64(pg);
4110
4111 /* We should never see an out of range predicate bit set. */
4112 tcg_debug_assert(reg_off < reg_max);
4113 return reg_off;
4114}
4115
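/*
 * Worked example for the masking above: with esz == 2 (word elements)
 * only every fourth predicate bit is significant, so pred_esz_masks[2]
 * is 0x1111111111111111ull and the ctz64() result is always a multiple
 * of 4, keeping reg_off aligned to the 4-byte element size.
 */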
4116/*
4117 * Return the maximum offset <= @mem_max which is still within the page
4118 * referenced by @base + @mem_off.
4119 */
4120static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
4121 intptr_t mem_max)
4122{
4123 target_ulong addr = base + mem_off;
4124 intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
4125 return MIN(split, mem_max - mem_off) + mem_off;
4126}
4127
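/*
 * Worked example for max_for_page() above, assuming 4K target pages:
 * if base + mem_off ends in 0xff8 there are 8 bytes left on the page,
 * so the result is mem_off + 8, unless mem_max - mem_off is smaller,
 * in which case mem_max is returned unchanged.
 */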
4128static inline void set_helper_retaddr(uintptr_t ra)
4129{
4130#ifdef CONFIG_USER_ONLY
4131 helper_retaddr = ra;
4132#endif
4133}
4134
4135/*
4136 * The result of tlb_vaddr_to_host for user-only is just g2h(x),
4137 * which is always non-null. Elide the useless test.
4138 */
4139static inline bool test_host_page(void *host)
4140{
4141#ifdef CONFIG_USER_ONLY
4142 return true;
4143#else
4144 return likely(host != NULL);
4145#endif
4146}
4147
4148/*
4149 * Common helper for all contiguous one-register predicated loads.
4150 */
4151static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
4152 uint32_t desc, const uintptr_t retaddr,
4153 const int esz, const int msz,
4154 sve_ld1_host_fn *host_fn,
4155 sve_ld1_tlb_fn *tlb_fn)
4156{
500d0484
RH
4157 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4158 const int mmu_idx = get_mmuidx(oi);
4159 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4160 void *vd = &env->vfp.zregs[rd];
9123aeb6
RH
4161 const int diffsz = esz - msz;
4162 const intptr_t reg_max = simd_oprsz(desc);
4163 const intptr_t mem_max = reg_max >> diffsz;
9123aeb6
RH
4164 ARMVectorReg scratch;
4165 void *host;
4166 intptr_t split, reg_off, mem_off;
4167
4168 /* Find the first active element. */
4169 reg_off = find_next_active(vg, 0, reg_max, esz);
4170 if (unlikely(reg_off == reg_max)) {
4171 /* The entire predicate was false; no load occurs. */
4172 memset(vd, 0, reg_max);
4173 return;
4174 }
4175 mem_off = reg_off >> diffsz;
4176 set_helper_retaddr(retaddr);
4177
4178 /*
4179 * If the (remaining) load is entirely within a single page, then:
4180 * For softmmu, if the TLB hits, then no faults will occur;
4181 * For user-only, either the first load will fault or none will.
4182 * We can thus perform the load directly to the destination and
4183 * Vd will be unmodified on any exception path.
4184 */
4185 split = max_for_page(addr, mem_off, mem_max);
4186 if (likely(split == mem_max)) {
4187 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4188 if (test_host_page(host)) {
4189 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4190 tcg_debug_assert(mem_off == mem_max);
4191 set_helper_retaddr(0);
4192 /* After having taken any fault, zero leading inactive elements. */
4193 swap_memzero(vd, reg_off);
4194 return;
4195 }
4196 }
4197
4198 /*
4199 * Perform the predicated read into a temporary, thus ensuring
4200 * if the load of the last element faults, Vd is not modified.
4201 */
4202#ifdef CONFIG_USER_ONLY
4203 swap_memzero(&scratch, reg_off);
4204 host_fn(&scratch, vg, g2h(addr), mem_off, mem_max);
4205#else
4206 memset(&scratch, 0, reg_max);
4207 goto start;
4208 while (1) {
4209 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4210 if (reg_off >= reg_max) {
4211 break;
4212 }
4213 mem_off = reg_off >> diffsz;
4214 split = max_for_page(addr, mem_off, mem_max);
4215
4216 start:
4217 if (split - mem_off >= (1 << msz)) {
4218 /* At least one whole element on this page. */
4219 host = tlb_vaddr_to_host(env, addr + mem_off,
4220 MMU_DATA_LOAD, mmu_idx);
4221 if (host) {
4222 mem_off = host_fn(&scratch, vg, host - mem_off,
4223 mem_off, split);
4224 reg_off = mem_off << diffsz;
4225 continue;
4226 }
4227 }
4228
4229 /*
4230 * Perform one normal read. This may fault, longjmping out to the
4231 * main loop in order to raise an exception. It may succeed, and
4232 * as a side-effect load the TLB entry for the next round. Finally,
4233 * in the extremely unlikely case we're performing this operation
4234 * on I/O memory, it may succeed but not bring in the TLB entry.
4235 * But even then we have still made forward progress.
4236 */
500d0484 4237 tlb_fn(env, &scratch, reg_off, addr + mem_off, oi, retaddr);
9123aeb6
RH
4238 reg_off += 1 << esz;
4239 }
4240#endif
4241
4242 set_helper_retaddr(0);
4243 memcpy(vd, &scratch, reg_max);
c4e7c493
RH
4244}
4245
9123aeb6
RH
4246#define DO_LD1_1(NAME, ESZ) \
4247void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4248 target_ulong addr, uint32_t desc) \
4249{ \
4250 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4251 sve_##NAME##_host, sve_##NAME##_tlb); \
4252}
4253
9123aeb6 4254#define DO_LD1_2(NAME, ESZ, MSZ) \
7d0a57a2
RH
4255void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4256 target_ulong addr, uint32_t desc) \
4257{ \
4258 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4259 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4260} \
4261void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4262 target_ulong addr, uint32_t desc) \
4263{ \
4264 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4265 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
9123aeb6
RH
4266}
4267
4268DO_LD1_1(ld1bb, 0)
4269DO_LD1_1(ld1bhu, 1)
4270DO_LD1_1(ld1bhs, 1)
4271DO_LD1_1(ld1bsu, 2)
4272DO_LD1_1(ld1bss, 2)
4273DO_LD1_1(ld1bdu, 3)
4274DO_LD1_1(ld1bds, 3)
4275
4276DO_LD1_2(ld1hh, 1, 1)
4277DO_LD1_2(ld1hsu, 2, 1)
4278DO_LD1_2(ld1hss, 2, 1)
4279DO_LD1_2(ld1hdu, 3, 1)
4280DO_LD1_2(ld1hds, 3, 1)
4281
4282DO_LD1_2(ld1ss, 2, 2)
4283DO_LD1_2(ld1sdu, 3, 2)
4284DO_LD1_2(ld1sds, 3, 2)
4285
4286DO_LD1_2(ld1dd, 3, 3)
4287
4288#undef DO_LD1_1
4289#undef DO_LD1_2
4290
f27d4dc2
RH
4291/*
4292 * Common helpers for all contiguous 2,3,4-register predicated loads.
4293 */
4294static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
4295 uint32_t desc, int size, uintptr_t ra,
4296 sve_ld1_tlb_fn *tlb_fn)
4297{
500d0484
RH
4298 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4299 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
f27d4dc2 4300 intptr_t i, oprsz = simd_oprsz(desc);
f27d4dc2
RH
4301 ARMVectorReg scratch[2] = { };
4302
4303 set_helper_retaddr(ra);
4304 for (i = 0; i < oprsz; ) {
4305 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4306 do {
4307 if (pg & 1) {
500d0484
RH
4308 tlb_fn(env, &scratch[0], i, addr, oi, ra);
4309 tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
f27d4dc2
RH
4310 }
4311 i += size, pg >>= size;
4312 addr += 2 * size;
4313 } while (i & 15);
4314 }
4315 set_helper_retaddr(0);
4316
4317 /* Wait until all exceptions have been raised to write back. */
4318 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4319 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
c4e7c493
RH
4320}
4321
f27d4dc2
RH
4322static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
4323 uint32_t desc, int size, uintptr_t ra,
4324 sve_ld1_tlb_fn *tlb_fn)
4325{
500d0484
RH
4326 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4327 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
f27d4dc2 4328 intptr_t i, oprsz = simd_oprsz(desc);
f27d4dc2
RH
4329 ARMVectorReg scratch[3] = { };
4330
4331 set_helper_retaddr(ra);
4332 for (i = 0; i < oprsz; ) {
4333 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4334 do {
4335 if (pg & 1) {
500d0484
RH
4336 tlb_fn(env, &scratch[0], i, addr, oi, ra);
4337 tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
4338 tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
f27d4dc2
RH
4339 }
4340 i += size, pg >>= size;
4341 addr += 3 * size;
4342 } while (i & 15);
4343 }
4344 set_helper_retaddr(0);
4345
4346 /* Wait until all exceptions have been raised to write back. */
4347 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4348 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4349 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
c4e7c493
RH
4350}
4351
f27d4dc2
RH
4352static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
4353 uint32_t desc, int size, uintptr_t ra,
4354 sve_ld1_tlb_fn *tlb_fn)
4355{
500d0484
RH
4356 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4357 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
f27d4dc2 4358 intptr_t i, oprsz = simd_oprsz(desc);
f27d4dc2
RH
4359 ARMVectorReg scratch[4] = { };
4360
4361 set_helper_retaddr(ra);
4362 for (i = 0; i < oprsz; ) {
4363 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4364 do {
4365 if (pg & 1) {
500d0484
RH
4366 tlb_fn(env, &scratch[0], i, addr, oi, ra);
4367 tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
4368 tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
4369 tlb_fn(env, &scratch[3], i, addr + 3 * size, oi, ra);
f27d4dc2
RH
4370 }
4371 i += size, pg >>= size;
4372 addr += 4 * size;
4373 } while (i & 15);
4374 }
4375 set_helper_retaddr(0);
4376
4377 /* Wait until all exceptions have been raised to write back. */
4378 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4379 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4380 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4381 memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
4382}
4383
4384#define DO_LDN_1(N) \
4385void __attribute__((flatten)) HELPER(sve_ld##N##bb_r) \
4386 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4387{ \
4388 sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \
4389}
4390
4391#define DO_LDN_2(N, SUFF, SIZE) \
7d0a57a2
RH
4392void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_le_r) \
4393 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4394{ \
4395 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4396 sve_ld1##SUFF##_le_tlb); \
4397} \
4398void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_be_r) \
f27d4dc2
RH
4399 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4400{ \
4401 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
7d0a57a2 4402 sve_ld1##SUFF##_be_tlb); \
c4e7c493
RH
4403}
4404
f27d4dc2
RH
4405DO_LDN_1(2)
4406DO_LDN_1(3)
4407DO_LDN_1(4)
c4e7c493 4408
f27d4dc2
RH
4409DO_LDN_2(2, hh, 2)
4410DO_LDN_2(3, hh, 2)
4411DO_LDN_2(4, hh, 2)
c4e7c493 4412
f27d4dc2
RH
4413DO_LDN_2(2, ss, 4)
4414DO_LDN_2(3, ss, 4)
4415DO_LDN_2(4, ss, 4)
c4e7c493 4416
f27d4dc2
RH
4417DO_LDN_2(2, dd, 8)
4418DO_LDN_2(3, dd, 8)
4419DO_LDN_2(4, dd, 8)
c4e7c493 4420
f27d4dc2
RH
4421#undef DO_LDN_1
4422#undef DO_LDN_2
e2654d75
RH
4423
4424/*
4425 * Load contiguous data, first-fault and no-fault.
9123aeb6
RH
4426 *
4427 * For user-only, one could argue that we should hold the mmap_lock during
4428 * the operation so that there is no race between page_check_range and the
4429 * load operation. However, unmapping pages out from under a running thread
4430 * is extraordinarily unlikely. This theoretical race condition also affects
4431 * linux-user/ in its get_user/put_user macros.
4432 *
4433 * TODO: Construct some helpers, written in assembly, that interact with
4434 * handle_cpu_signal to produce memory ops which can properly report errors
4435 * without racing.
e2654d75
RH
4436 */
4437
e2654d75
RH
4438/* Fault on byte I. All bits in FFR from I are cleared. The vector
4439 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4440 * option, which leaves subsequent data unchanged.
4441 */
4442static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4443{
4444 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4445
4446 if (i & 63) {
4447 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4448 i = ROUND_UP(i, 64);
4449 }
4450 for (; i < oprsz; i += 64) {
4451 ffr[i / 64] = 0;
4452 }
4453}
4454
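/*
 * For example, record_fault(env, 5, 32) keeps FFR bits 0-4, covering the
 * elements already loaded, and clears every bit from 5 upward, marking
 * the faulting element and all later ones as not having completed.
 */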
9123aeb6
RH
4455/*
4456 * Common helper for all contiguous first-fault loads.
4457 */
4458static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4459 uint32_t desc, const uintptr_t retaddr,
4460 const int esz, const int msz,
4461 sve_ld1_host_fn *host_fn,
4462 sve_ld1_tlb_fn *tlb_fn)
4463{
500d0484
RH
4464 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4465 const int mmu_idx = get_mmuidx(oi);
4466 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4467 void *vd = &env->vfp.zregs[rd];
9123aeb6
RH
4468 const int diffsz = esz - msz;
4469 const intptr_t reg_max = simd_oprsz(desc);
4470 const intptr_t mem_max = reg_max >> diffsz;
9123aeb6
RH
4471 intptr_t split, reg_off, mem_off;
4472 void *host;
4473
4474 /* Skip to the first active element. */
4475 reg_off = find_next_active(vg, 0, reg_max, esz);
4476 if (unlikely(reg_off == reg_max)) {
4477 /* The entire predicate was false; no load occurs. */
4478 memset(vd, 0, reg_max);
4479 return;
4480 }
4481 mem_off = reg_off >> diffsz;
4482 set_helper_retaddr(retaddr);
4483
4484 /*
4485 * If the (remaining) load is entirely within a single page, then:
4486 * For softmmu, if the TLB hits, then no faults will occur;
4487 * For user-only, either the first load will fault or none will.
4488 * We can thus perform the load directly to the destination and
4489 * Vd will be unmodified on any exception path.
4490 */
4491 split = max_for_page(addr, mem_off, mem_max);
4492 if (likely(split == mem_max)) {
4493 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4494 if (test_host_page(host)) {
4495 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4496 tcg_debug_assert(mem_off == mem_max);
4497 set_helper_retaddr(0);
4498 /* After any fault, zero any leading inactive elements. */
4499 swap_memzero(vd, reg_off);
4500 return;
4501 }
4502 }
4503
4504#ifdef CONFIG_USER_ONLY
4505 /*
4506 * The page(s) containing this first element at ADDR+MEM_OFF must
4507 * be valid. Considering that this first element may be misaligned
4508 * and cross a page boundary itself, take the rest of the page from
4509 * the last byte of the element.
4510 */
4511 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4512 mem_off = host_fn(vd, vg, g2h(addr), mem_off, split);
4513
4514 /* After any fault, zero any leading inactive elements. */
4515 swap_memzero(vd, reg_off);
4516 reg_off = mem_off << diffsz;
4517#else
4518 /*
4519 * Perform one normal read, which will fault or not.
4520 * But it is likely to bring the page into the tlb.
4521 */
500d0484 4522 tlb_fn(env, vd, reg_off, addr + mem_off, oi, retaddr);
9123aeb6
RH
4523
4524 /* After any fault, zero any leading predicated false elts. */
4525 swap_memzero(vd, reg_off);
4526 mem_off += 1 << msz;
4527 reg_off += 1 << esz;
4528
4529 /* Try again to read the balance of the page. */
4530 split = max_for_page(addr, mem_off - 1, mem_max);
4531 if (split >= (1 << msz)) {
4532 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4533 if (host) {
4534 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4535 reg_off = mem_off << diffsz;
4536 }
4537 }
4538#endif
4539
4540 set_helper_retaddr(0);
4541 record_fault(env, reg_off, reg_max);
4542}
4543
4544/*
4545 * Common helper for all contiguous no-fault loads.
e2654d75 4546 */
9123aeb6
RH
4547static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
4548 uint32_t desc, const int esz, const int msz,
4549 sve_ld1_host_fn *host_fn)
4550{
500d0484
RH
4551 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4552 void *vd = &env->vfp.zregs[rd];
9123aeb6
RH
4553 const int diffsz = esz - msz;
4554 const intptr_t reg_max = simd_oprsz(desc);
4555 const intptr_t mem_max = reg_max >> diffsz;
4556 const int mmu_idx = cpu_mmu_index(env, false);
4557 intptr_t split, reg_off, mem_off;
4558 void *host;
4559
4560#ifdef CONFIG_USER_ONLY
4561 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
4562 if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
4563 /* The entire operation is valid and will not fault. */
4564 host_fn(vd, vg, host, 0, mem_max);
4565 return;
4566 }
4567#endif
4568
4569 /* There will be no fault, so we may modify in advance. */
4570 memset(vd, 0, reg_max);
4571
4572 /* Skip to the first active element. */
4573 reg_off = find_next_active(vg, 0, reg_max, esz);
4574 if (unlikely(reg_off == reg_max)) {
4575 /* The entire predicate was false; no load occurs. */
4576 return;
4577 }
4578 mem_off = reg_off >> diffsz;
4579
4580#ifdef CONFIG_USER_ONLY
4581 if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
4582 /* At least one load is valid; take the rest of the page. */
4583 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4584 mem_off = host_fn(vd, vg, host, mem_off, split);
4585 reg_off = mem_off << diffsz;
4586 }
4587#else
4588 /*
4589 * If the address is not in the TLB, we have no way to bring the
4590 * entry into the TLB without also risking a fault. Note that
4591 * the corollary is that we never load from an address not in RAM.
4592 *
4593 * This last is out of spec, in a weird corner case.
4594 * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
4595 * must not actually hit the bus -- it returns UNKNOWN data instead.
4596 * But if you map non-RAM with Normal memory attributes and do an NF
4597 * load, then it should access the bus. (Nobody ought actually to do this
4598 * in the real world, obviously.)
4599 *
4600 * Then there are the annoying special cases with watchpoints...
4601 *
4602 * TODO: Add a form of tlb_fill that does not raise an exception,
4603 * with a form of tlb_vaddr_to_host and a set of loads to match.
4604 * The non_fault_vaddr_to_host would handle everything, usually,
4605 * and the loads would handle the iomem path for watchpoints.
4606 */
4607 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4608 split = max_for_page(addr, mem_off, mem_max);
4609 if (host && split >= (1 << msz)) {
4610 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4611 reg_off = mem_off << diffsz;
4612 }
4613#endif
4614
4615 record_fault(env, reg_off, reg_max);
4616}
4617
4618#define DO_LDFF1_LDNF1_1(PART, ESZ) \
4619void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4620 target_ulong addr, uint32_t desc) \
e2654d75 4621{ \
9123aeb6
RH
4622 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4623 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75 4624} \
9123aeb6
RH
4625void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4626 target_ulong addr, uint32_t desc) \
e2654d75 4627{ \
9123aeb6 4628 sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host); \
e2654d75
RH
4629}
4630
9123aeb6 4631#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
7d0a57a2
RH
4632void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4633 target_ulong addr, uint32_t desc) \
e2654d75 4634{ \
7d0a57a2
RH
4635 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4636 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
9123aeb6 4637} \
7d0a57a2
RH
4638void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
4639 target_ulong addr, uint32_t desc) \
9123aeb6 4640{ \
7d0a57a2
RH
4641 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
4642} \
4643void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
4644 target_ulong addr, uint32_t desc) \
4645{ \
4646 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4647 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
4648} \
4649void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
4650 target_ulong addr, uint32_t desc) \
4651{ \
4652 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
e2654d75
RH
4653}
4654
9123aeb6
RH
4655DO_LDFF1_LDNF1_1(bb, 0)
4656DO_LDFF1_LDNF1_1(bhu, 1)
4657DO_LDFF1_LDNF1_1(bhs, 1)
4658DO_LDFF1_LDNF1_1(bsu, 2)
4659DO_LDFF1_LDNF1_1(bss, 2)
4660DO_LDFF1_LDNF1_1(bdu, 3)
4661DO_LDFF1_LDNF1_1(bds, 3)
e2654d75 4662
9123aeb6
RH
4663DO_LDFF1_LDNF1_2(hh, 1, 1)
4664DO_LDFF1_LDNF1_2(hsu, 2, 1)
4665DO_LDFF1_LDNF1_2(hss, 2, 1)
4666DO_LDFF1_LDNF1_2(hdu, 3, 1)
4667DO_LDFF1_LDNF1_2(hds, 3, 1)
e2654d75 4668
9123aeb6
RH
4669DO_LDFF1_LDNF1_2(ss, 2, 2)
4670DO_LDFF1_LDNF1_2(sdu, 3, 2)
4671DO_LDFF1_LDNF1_2(sds, 3, 2)
e2654d75 4672
9123aeb6 4673DO_LDFF1_LDNF1_2(dd, 3, 3)
e2654d75 4674
9123aeb6
RH
4675#undef DO_LDFF1_LDNF1_1
4676#undef DO_LDFF1_LDNF1_2
1a039c7e
RH
4677
4678/*
4679 * Store contiguous data, protected by a governing predicate.
4680 */
1a039c7e 4681
9fd46c83
RH
4682#ifdef CONFIG_SOFTMMU
4683#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4684static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
500d0484 4685 target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
9fd46c83 4686{ \
9fd46c83 4687 TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra); \
1a039c7e 4688}
9fd46c83
RH
4689#else
4690#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4691static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
500d0484 4692 target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
9fd46c83
RH
4693{ \
4694 HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off))); \
1a039c7e 4695}
9fd46c83 4696#endif
1a039c7e 4697
9fd46c83
RH
4698DO_ST_TLB(st1bb, H1, uint8_t, stb_p, 0, helper_ret_stb_mmu)
4699DO_ST_TLB(st1bh, H1_2, uint16_t, stb_p, 0, helper_ret_stb_mmu)
4700DO_ST_TLB(st1bs, H1_4, uint32_t, stb_p, 0, helper_ret_stb_mmu)
4701DO_ST_TLB(st1bd, , uint64_t, stb_p, 0, helper_ret_stb_mmu)
1a039c7e 4702
9fd46c83
RH
4703DO_ST_TLB(st1hh_le, H1_2, uint16_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4704DO_ST_TLB(st1hs_le, H1_4, uint32_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4705DO_ST_TLB(st1hd_le, , uint64_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4706
4707DO_ST_TLB(st1ss_le, H1_4, uint32_t, stl_le_p, MO_LE, helper_le_stl_mmu)
4708DO_ST_TLB(st1sd_le, , uint64_t, stl_le_p, MO_LE, helper_le_stl_mmu)
1a039c7e 4709
9fd46c83 4710DO_ST_TLB(st1dd_le, , uint64_t, stq_le_p, MO_LE, helper_le_stq_mmu)
1a039c7e 4711
9fd46c83
RH
4712DO_ST_TLB(st1hh_be, H1_2, uint16_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4713DO_ST_TLB(st1hs_be, H1_4, uint32_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4714DO_ST_TLB(st1hd_be, , uint64_t, stw_be_p, MO_BE, helper_be_stw_mmu)
1a039c7e 4715
9fd46c83
RH
4716DO_ST_TLB(st1ss_be, H1_4, uint32_t, stl_be_p, MO_BE, helper_be_stl_mmu)
4717DO_ST_TLB(st1sd_be, , uint64_t, stl_be_p, MO_BE, helper_be_stl_mmu)
1a039c7e 4718
9fd46c83 4719DO_ST_TLB(st1dd_be, , uint64_t, stq_be_p, MO_BE, helper_be_stq_mmu)
1a039c7e 4720
9fd46c83 4721#undef DO_ST_TLB
1a039c7e 4722
9fd46c83
RH
4723/*
4724 * Common helpers for all contiguous 1,2,3,4-register predicated stores.
4725 */
4726static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
4727 uint32_t desc, const uintptr_t ra,
4728 const int esize, const int msize,
4729 sve_st1_tlb_fn *tlb_fn)
4730{
500d0484
RH
4731 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4732 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
9fd46c83 4733 intptr_t i, oprsz = simd_oprsz(desc);
9fd46c83 4734 void *vd = &env->vfp.zregs[rd];
1a039c7e 4735
9fd46c83
RH
4736 set_helper_retaddr(ra);
4737 for (i = 0; i < oprsz; ) {
4738 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4739 do {
4740 if (pg & 1) {
500d0484 4741 tlb_fn(env, vd, i, addr, oi, ra);
9fd46c83
RH
4742 }
4743 i += esize, pg >>= esize;
4744 addr += msize;
4745 } while (i & 15);
4746 }
4747 set_helper_retaddr(0);
4748}
1a039c7e 4749
9fd46c83
RH
4750static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
4751 uint32_t desc, const uintptr_t ra,
4752 const int esize, const int msize,
4753 sve_st1_tlb_fn *tlb_fn)
1a039c7e 4754{
500d0484
RH
4755 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4756 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
9fd46c83 4757 intptr_t i, oprsz = simd_oprsz(desc);
9fd46c83
RH
4758 void *d1 = &env->vfp.zregs[rd];
4759 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
1a039c7e 4760
9fd46c83
RH
4761 set_helper_retaddr(ra);
4762 for (i = 0; i < oprsz; ) {
4763 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4764 do {
4765 if (pg & 1) {
500d0484
RH
4766 tlb_fn(env, d1, i, addr, oi, ra);
4767 tlb_fn(env, d2, i, addr + msize, oi, ra);
9fd46c83
RH
4768 }
4769 i += esize, pg >>= esize;
4770 addr += 2 * msize;
4771 } while (i & 15);
1a039c7e 4772 }
9fd46c83 4773 set_helper_retaddr(0);
1a039c7e
RH
4774}
4775
9fd46c83
RH
4776static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
4777 uint32_t desc, const uintptr_t ra,
4778 const int esize, const int msize,
4779 sve_st1_tlb_fn *tlb_fn)
1a039c7e 4780{
500d0484
RH
4781 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4782 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
9fd46c83 4783 intptr_t i, oprsz = simd_oprsz(desc);
9fd46c83
RH
4784 void *d1 = &env->vfp.zregs[rd];
4785 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4786 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
1a039c7e 4787
9fd46c83
RH
4788 set_helper_retaddr(ra);
4789 for (i = 0; i < oprsz; ) {
4790 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4791 do {
4792 if (pg & 1) {
500d0484
RH
4793 tlb_fn(env, d1, i, addr, oi, ra);
4794 tlb_fn(env, d2, i, addr + msize, oi, ra);
4795 tlb_fn(env, d3, i, addr + 2 * msize, oi, ra);
9fd46c83
RH
4796 }
4797 i += esize, pg >>= esize;
4798 addr += 3 * msize;
4799 } while (i & 15);
1a039c7e 4800 }
9fd46c83 4801 set_helper_retaddr(0);
1a039c7e
RH
4802}
4803
9fd46c83
RH
4804static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
4805 uint32_t desc, const uintptr_t ra,
4806 const int esize, const int msize,
4807 sve_st1_tlb_fn *tlb_fn)
1a039c7e 4808{
500d0484
RH
4809 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4810 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
9fd46c83 4811 intptr_t i, oprsz = simd_oprsz(desc);
9fd46c83
RH
4812 void *d1 = &env->vfp.zregs[rd];
4813 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4814 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
4815 void *d4 = &env->vfp.zregs[(rd + 3) & 31];
1a039c7e 4816
9fd46c83
RH
4817 set_helper_retaddr(ra);
4818 for (i = 0; i < oprsz; ) {
4819 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4820 do {
4821 if (pg & 1) {
500d0484
RH
4822 tlb_fn(env, d1, i, addr, oi, ra);
4823 tlb_fn(env, d2, i, addr + msize, oi, ra);
4824 tlb_fn(env, d3, i, addr + 2 * msize, oi, ra);
4825 tlb_fn(env, d4, i, addr + 3 * msize, oi, ra);
9fd46c83
RH
4826 }
4827 i += esize, pg >>= esize;
4828 addr += 4 * msize;
4829 } while (i & 15);
1a039c7e 4830 }
9fd46c83
RH
4831 set_helper_retaddr(0);
4832}
4833
4834#define DO_STN_1(N, NAME, ESIZE) \
4835void __attribute__((flatten)) HELPER(sve_st##N##NAME##_r) \
4836 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4837{ \
4838 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1, \
4839 sve_st1##NAME##_tlb); \
1a039c7e 4840}
f6dbf62a 4841
9fd46c83 4842#define DO_STN_2(N, NAME, ESIZE, MSIZE) \
28d57f2d 4843void __attribute__((flatten)) HELPER(sve_st##N##NAME##_le_r) \
9fd46c83
RH
4844 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4845{ \
4846 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
28d57f2d
RH
4847 sve_st1##NAME##_le_tlb); \
4848} \
4849void __attribute__((flatten)) HELPER(sve_st##N##NAME##_be_r) \
4850 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4851{ \
4852 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
4853 sve_st1##NAME##_be_tlb); \
9fd46c83
RH
4854}
4855
4856DO_STN_1(1, bb, 1)
4857DO_STN_1(1, bh, 2)
4858DO_STN_1(1, bs, 4)
4859DO_STN_1(1, bd, 8)
4860DO_STN_1(2, bb, 1)
4861DO_STN_1(3, bb, 1)
4862DO_STN_1(4, bb, 1)
4863
4864DO_STN_2(1, hh, 2, 2)
4865DO_STN_2(1, hs, 4, 2)
4866DO_STN_2(1, hd, 8, 2)
4867DO_STN_2(2, hh, 2, 2)
4868DO_STN_2(3, hh, 2, 2)
4869DO_STN_2(4, hh, 2, 2)
4870
4871DO_STN_2(1, ss, 4, 4)
4872DO_STN_2(1, sd, 8, 4)
4873DO_STN_2(2, ss, 4, 4)
4874DO_STN_2(3, ss, 4, 4)
4875DO_STN_2(4, ss, 4, 4)
4876
4877DO_STN_2(1, dd, 8, 8)
4878DO_STN_2(2, dd, 8, 8)
4879DO_STN_2(3, dd, 8, 8)
4880DO_STN_2(4, dd, 8, 8)
4881
4882#undef DO_STN_1
4883#undef DO_STN_2
4884
d4f75f25
RH
4885/*
4886 * Loads with a vector index.
4887 */
673e9fa6 4888
d4f75f25
RH
4889/*
4890 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
4891 */
4892typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
4893
4894static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
4895{
4896 return *(uint32_t *)(reg + H1_4(reg_ofs));
673e9fa6
RH
4897}
4898
d4f75f25
RH
4899static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
4900{
4901 return *(int32_t *)(reg + H1_4(reg_ofs));
4902}
4903
4904static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
4905{
4906 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
4907}
4908
4909static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
4910{
4911 return (int32_t)*(uint64_t *)(reg + reg_ofs);
4912}
4913
4914static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
4915{
4916 return *(uint64_t *)(reg + reg_ofs);
673e9fa6
RH
4917}
4918
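/*
 * These cover the gather/scatter offset forms: off_zsu_* return an
 * unsigned 32-bit offset, off_zss_* a sign-extended 32-bit offset (taken
 * from a 32-bit element for the _s forms, or from the low half of a
 * 64-bit element for the _d forms), and off_zd_d a full 64-bit offset.
 */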
d4f75f25
RH
4919static void sve_ld1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
4920 target_ulong base, uint32_t desc, uintptr_t ra,
4921 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
4922{
500d0484
RH
4923 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4924 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
d4f75f25 4925 intptr_t i, oprsz = simd_oprsz(desc);
d4f75f25
RH
4926 ARMVectorReg scratch = { };
4927
4928 set_helper_retaddr(ra);
4929 for (i = 0; i < oprsz; ) {
4930 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4931 do {
4932 if (likely(pg & 1)) {
4933 target_ulong off = off_fn(vm, i);
500d0484 4934 tlb_fn(env, &scratch, i, base + (off << scale), oi, ra);
d4f75f25
RH
4935 }
4936 i += 4, pg >>= 4;
4937 } while (i & 15);
4938 }
4939 set_helper_retaddr(0);
4940
4941 /* Wait until all exceptions have been raised to write back. */
4942 memcpy(vd, &scratch, oprsz);
4943}
4944
4945static void sve_ld1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
4946 target_ulong base, uint32_t desc, uintptr_t ra,
4947 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
4948{
500d0484
RH
4949 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4950 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
d4f75f25 4951 intptr_t i, oprsz = simd_oprsz(desc) / 8;
d4f75f25
RH
4952 ARMVectorReg scratch = { };
4953
4954 set_helper_retaddr(ra);
4955 for (i = 0; i < oprsz; i++) {
4956 uint8_t pg = *(uint8_t *)(vg + H1(i));
4957 if (likely(pg & 1)) {
4958 target_ulong off = off_fn(vm, i * 8);
500d0484 4959 tlb_fn(env, &scratch, i * 8, base + (off << scale), oi, ra);
d4f75f25
RH
4960 }
4961 }
4962 set_helper_retaddr(0);
4963
4964 /* Wait until all exceptions have been raised to write back. */
4965 memcpy(vd, &scratch, oprsz * 8);
4966}
4967
4968#define DO_LD1_ZPZ_S(MEM, OFS) \
4969void __attribute__((flatten)) HELPER(sve_ld##MEM##_##OFS) \
4970 (CPUARMState *env, void *vd, void *vg, void *vm, \
4971 target_ulong base, uint32_t desc) \
4972{ \
4973 sve_ld1_zs(env, vd, vg, vm, base, desc, GETPC(), \
4974 off_##OFS##_s, sve_ld1##MEM##_tlb); \
4975}
4976
4977#define DO_LD1_ZPZ_D(MEM, OFS) \
4978void __attribute__((flatten)) HELPER(sve_ld##MEM##_##OFS) \
4979 (CPUARMState *env, void *vd, void *vg, void *vm, \
4980 target_ulong base, uint32_t desc) \
4981{ \
4982 sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(), \
4983 off_##OFS##_d, sve_ld1##MEM##_tlb); \
4984}
4985
4986DO_LD1_ZPZ_S(bsu, zsu)
4987DO_LD1_ZPZ_S(bsu, zss)
4988DO_LD1_ZPZ_D(bdu, zsu)
4989DO_LD1_ZPZ_D(bdu, zss)
4990DO_LD1_ZPZ_D(bdu, zd)
4991
4992DO_LD1_ZPZ_S(bss, zsu)
4993DO_LD1_ZPZ_S(bss, zss)
4994DO_LD1_ZPZ_D(bds, zsu)
4995DO_LD1_ZPZ_D(bds, zss)
4996DO_LD1_ZPZ_D(bds, zd)
4997
4998DO_LD1_ZPZ_S(hsu_le, zsu)
4999DO_LD1_ZPZ_S(hsu_le, zss)
5000DO_LD1_ZPZ_D(hdu_le, zsu)
5001DO_LD1_ZPZ_D(hdu_le, zss)
5002DO_LD1_ZPZ_D(hdu_le, zd)
5003
5004DO_LD1_ZPZ_S(hsu_be, zsu)
5005DO_LD1_ZPZ_S(hsu_be, zss)
5006DO_LD1_ZPZ_D(hdu_be, zsu)
5007DO_LD1_ZPZ_D(hdu_be, zss)
5008DO_LD1_ZPZ_D(hdu_be, zd)
5009
5010DO_LD1_ZPZ_S(hss_le, zsu)
5011DO_LD1_ZPZ_S(hss_le, zss)
5012DO_LD1_ZPZ_D(hds_le, zsu)
5013DO_LD1_ZPZ_D(hds_le, zss)
5014DO_LD1_ZPZ_D(hds_le, zd)
5015
5016DO_LD1_ZPZ_S(hss_be, zsu)
5017DO_LD1_ZPZ_S(hss_be, zss)
5018DO_LD1_ZPZ_D(hds_be, zsu)
5019DO_LD1_ZPZ_D(hds_be, zss)
5020DO_LD1_ZPZ_D(hds_be, zd)
5021
5022DO_LD1_ZPZ_S(ss_le, zsu)
5023DO_LD1_ZPZ_S(ss_le, zss)
5024DO_LD1_ZPZ_D(sdu_le, zsu)
5025DO_LD1_ZPZ_D(sdu_le, zss)
5026DO_LD1_ZPZ_D(sdu_le, zd)
5027
5028DO_LD1_ZPZ_S(ss_be, zsu)
5029DO_LD1_ZPZ_S(ss_be, zss)
5030DO_LD1_ZPZ_D(sdu_be, zsu)
5031DO_LD1_ZPZ_D(sdu_be, zss)
5032DO_LD1_ZPZ_D(sdu_be, zd)
5033
5034DO_LD1_ZPZ_D(sds_le, zsu)
5035DO_LD1_ZPZ_D(sds_le, zss)
5036DO_LD1_ZPZ_D(sds_le, zd)
5037
5038DO_LD1_ZPZ_D(sds_be, zsu)
5039DO_LD1_ZPZ_D(sds_be, zss)
5040DO_LD1_ZPZ_D(sds_be, zd)
5041
5042DO_LD1_ZPZ_D(dd_le, zsu)
5043DO_LD1_ZPZ_D(dd_le, zss)
5044DO_LD1_ZPZ_D(dd_le, zd)
5045
5046DO_LD1_ZPZ_D(dd_be, zsu)
5047DO_LD1_ZPZ_D(dd_be, zss)
5048DO_LD1_ZPZ_D(dd_be, zd)
5049
5050#undef DO_LD1_ZPZ_S
5051#undef DO_LD1_ZPZ_D
673e9fa6 5052
ed67eb7f
RH
5053/* First fault loads with a vector index. */
5054
116347ce
RH
5055/* Load one element into VD+REG_OFF from (ENV,VADDR) without faulting.
5056 * The controlling predicate is known to be true. Return true if the
5057 * load was successful.
5058 */
5059typedef bool sve_ld1_nf_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5060 target_ulong vaddr, int mmu_idx);
ed67eb7f 5061
116347ce
RH
5062#ifdef CONFIG_SOFTMMU
5063#define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
5064static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off, \
500d0484 5065 target_ulong addr, int mmu_idx) \
116347ce
RH
5066{ \
5067 target_ulong next_page = -(addr | TARGET_PAGE_MASK); \
5068 if (likely(next_page - addr >= sizeof(TYPEM))) { \
5069 void *host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx); \
5070 if (likely(host)) { \
5071 TYPEM val = HOST(host); \
5072 *(TYPEE *)(vd + H(reg_off)) = val; \
5073 return true; \
5074 } \
5075 } \
5076 return false; \
ed67eb7f 5077}
ed67eb7f 5078#else
116347ce
RH
5079#define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
5080static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off, \
5081 target_ulong addr, int mmu_idx) \
5082{ \
5083 if (likely(page_check_range(addr, sizeof(TYPEM), PAGE_READ))) { \
5084 TYPEM val = HOST(g2h(addr)); \
5085 *(TYPEE *)(vd + H(reg_off)) = val; \
5086 return true; \
5087 } \
5088 return false; \
5089}
5090#endif
ed67eb7f 5091
116347ce
RH
5092DO_LD_NF(bsu, H1_4, uint32_t, uint8_t, ldub_p)
5093DO_LD_NF(bss, H1_4, uint32_t, int8_t, ldsb_p)
5094DO_LD_NF(bdu, , uint64_t, uint8_t, ldub_p)
5095DO_LD_NF(bds, , uint64_t, int8_t, ldsb_p)
5096
5097DO_LD_NF(hsu_le, H1_4, uint32_t, uint16_t, lduw_le_p)
5098DO_LD_NF(hss_le, H1_4, uint32_t, int16_t, ldsw_le_p)
5099DO_LD_NF(hsu_be, H1_4, uint32_t, uint16_t, lduw_be_p)
5100DO_LD_NF(hss_be, H1_4, uint32_t, int16_t, ldsw_be_p)
5101DO_LD_NF(hdu_le, , uint64_t, uint16_t, lduw_le_p)
5102DO_LD_NF(hds_le, , uint64_t, int16_t, ldsw_le_p)
5103DO_LD_NF(hdu_be, , uint64_t, uint16_t, lduw_be_p)
5104DO_LD_NF(hds_be, , uint64_t, int16_t, ldsw_be_p)
5105
5106DO_LD_NF(ss_le, H1_4, uint32_t, uint32_t, ldl_le_p)
5107DO_LD_NF(ss_be, H1_4, uint32_t, uint32_t, ldl_be_p)
5108DO_LD_NF(sdu_le, , uint64_t, uint32_t, ldl_le_p)
5109DO_LD_NF(sds_le, , uint64_t, int32_t, ldl_le_p)
5110DO_LD_NF(sdu_be, , uint64_t, uint32_t, ldl_be_p)
5111DO_LD_NF(sds_be, , uint64_t, int32_t, ldl_be_p)
5112
5113DO_LD_NF(dd_le, , uint64_t, uint64_t, ldq_le_p)
5114DO_LD_NF(dd_be, , uint64_t, uint64_t, ldq_be_p)
5115
/*
 * Common helper for all gather first-faulting loads.
 */
static inline void sve_ldff1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
                                target_ulong base, uint32_t desc, uintptr_t ra,
                                zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn,
                                sve_ld1_nf_fn *nonfault_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int mmu_idx = get_mmuidx(oi);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t reg_off, reg_max = simd_oprsz(desc);
    target_ulong addr;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, MO_32);
    if (likely(reg_off < reg_max)) {
        /* Perform one normal read, which will fault or not.  */
        set_helper_retaddr(ra);
        addr = off_fn(vm, reg_off);
        addr = base + (addr << scale);
        tlb_fn(env, vd, reg_off, addr, oi, ra);

        /* The rest of the reads will be non-faulting.  */
        set_helper_retaddr(0);
    }

    /* After any fault, zero the leading predicated false elements.  */
    swap_memzero(vd, reg_off);

    while (likely((reg_off += 4) < reg_max)) {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 6) * 8);
        if (likely((pg >> (reg_off & 63)) & 1)) {
            addr = off_fn(vm, reg_off);
            addr = base + (addr << scale);
            if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
                record_fault(env, reg_off, reg_max);
                break;
            }
        } else {
            *(uint32_t *)(vd + H1_4(reg_off)) = 0;
        }
    }
}

static inline void sve_ldff1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
                                target_ulong base, uint32_t desc, uintptr_t ra,
                                zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn,
                                sve_ld1_nf_fn *nonfault_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int mmu_idx = get_mmuidx(oi);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t reg_off, reg_max = simd_oprsz(desc);
    target_ulong addr;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, MO_64);
    if (likely(reg_off < reg_max)) {
        /* Perform one normal read, which will fault or not.  */
        set_helper_retaddr(ra);
        addr = off_fn(vm, reg_off);
        addr = base + (addr << scale);
        tlb_fn(env, vd, reg_off, addr, oi, ra);

        /* The rest of the reads will be non-faulting.  */
        set_helper_retaddr(0);
    }

    /* After any fault, zero the leading predicated false elements.  */
    swap_memzero(vd, reg_off);

    while (likely((reg_off += 8) < reg_max)) {
        uint8_t pg = *(uint8_t *)(vg + H1(reg_off >> 3));
        if (likely(pg & 1)) {
            addr = off_fn(vm, reg_off);
            addr = base + (addr << scale);
            if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
                record_fault(env, reg_off, reg_max);
                break;
            }
        } else {
            *(uint64_t *)(vd + reg_off) = 0;
        }
    }
}
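
/*
 * Commentary (not part of the original file) on the first-fault contract
 * implemented above: only the first active element is loaded with a normal,
 * potentially faulting access; every later active element goes through the
 * non-faulting probe, and when that probe fails record_fault() -- defined
 * elsewhere in this file -- is expected to clear the first-fault register
 * bits for the failing and all subsequent elements, after which the loop
 * stops without raising an exception.
 */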

#define DO_LDFF1_ZPZ_S(MEM, OFS)                                        \
void HELPER(sve_ldff##MEM##_##OFS)                                     \
    (CPUARMState *env, void *vd, void *vg, void *vm,                   \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_ldff1_zs(env, vd, vg, vm, base, desc, GETPC(),                  \
                 off_##OFS##_s, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf);  \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS)                                        \
void HELPER(sve_ldff##MEM##_##OFS)                                     \
    (CPUARMState *env, void *vd, void *vg, void *vm,                   \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_ldff1_zd(env, vd, vg, vm, base, desc, GETPC(),                  \
                 off_##OFS##_d, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf);  \
}
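
/*
 * For illustration (mechanical macro expansion, not part of the original
 * source): DO_LDFF1_ZPZ_S(bsu, zsu) defines
 *
 *   void HELPER(sve_ldffbsu_zsu)(CPUARMState *env, void *vd, void *vg,
 *                                void *vm, target_ulong base, uint32_t desc)
 *
 * which forwards to sve_ldff1_zs() with the 32-bit unsigned offset extractor
 * off_zsu_s, the faulting load sve_ld1bsu_tlb and the non-faulting probe
 * sve_ldbsu_nf.
 */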

DO_LDFF1_ZPZ_S(bsu, zsu)
DO_LDFF1_ZPZ_S(bsu, zss)
DO_LDFF1_ZPZ_D(bdu, zsu)
DO_LDFF1_ZPZ_D(bdu, zss)
DO_LDFF1_ZPZ_D(bdu, zd)

DO_LDFF1_ZPZ_S(bss, zsu)
DO_LDFF1_ZPZ_S(bss, zss)
DO_LDFF1_ZPZ_D(bds, zsu)
DO_LDFF1_ZPZ_D(bds, zss)
DO_LDFF1_ZPZ_D(bds, zd)

DO_LDFF1_ZPZ_S(hsu_le, zsu)
DO_LDFF1_ZPZ_S(hsu_le, zss)
DO_LDFF1_ZPZ_D(hdu_le, zsu)
DO_LDFF1_ZPZ_D(hdu_le, zss)
DO_LDFF1_ZPZ_D(hdu_le, zd)

DO_LDFF1_ZPZ_S(hsu_be, zsu)
DO_LDFF1_ZPZ_S(hsu_be, zss)
DO_LDFF1_ZPZ_D(hdu_be, zsu)
DO_LDFF1_ZPZ_D(hdu_be, zss)
DO_LDFF1_ZPZ_D(hdu_be, zd)

DO_LDFF1_ZPZ_S(hss_le, zsu)
DO_LDFF1_ZPZ_S(hss_le, zss)
DO_LDFF1_ZPZ_D(hds_le, zsu)
DO_LDFF1_ZPZ_D(hds_le, zss)
DO_LDFF1_ZPZ_D(hds_le, zd)

DO_LDFF1_ZPZ_S(hss_be, zsu)
DO_LDFF1_ZPZ_S(hss_be, zss)
DO_LDFF1_ZPZ_D(hds_be, zsu)
DO_LDFF1_ZPZ_D(hds_be, zss)
DO_LDFF1_ZPZ_D(hds_be, zd)

DO_LDFF1_ZPZ_S(ss_le, zsu)
DO_LDFF1_ZPZ_S(ss_le, zss)
DO_LDFF1_ZPZ_D(sdu_le, zsu)
DO_LDFF1_ZPZ_D(sdu_le, zss)
DO_LDFF1_ZPZ_D(sdu_le, zd)

DO_LDFF1_ZPZ_S(ss_be, zsu)
DO_LDFF1_ZPZ_S(ss_be, zss)
DO_LDFF1_ZPZ_D(sdu_be, zsu)
DO_LDFF1_ZPZ_D(sdu_be, zss)
DO_LDFF1_ZPZ_D(sdu_be, zd)

DO_LDFF1_ZPZ_D(sds_le, zsu)
DO_LDFF1_ZPZ_D(sds_le, zss)
DO_LDFF1_ZPZ_D(sds_le, zd)

DO_LDFF1_ZPZ_D(sds_be, zsu)
DO_LDFF1_ZPZ_D(sds_be, zss)
DO_LDFF1_ZPZ_D(sds_be, zd)

DO_LDFF1_ZPZ_D(dd_le, zsu)
DO_LDFF1_ZPZ_D(dd_le, zss)
DO_LDFF1_ZPZ_D(dd_le, zd)

DO_LDFF1_ZPZ_D(dd_be, zsu)
DO_LDFF1_ZPZ_D(dd_be, zss)
DO_LDFF1_ZPZ_D(dd_be, zd)

/* Stores with a vector index. */

static void sve_st1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
                       target_ulong base, uint32_t desc, uintptr_t ra,
                       zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t i, oprsz = simd_oprsz(desc);

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (likely(pg & 1)) {
                target_ulong off = off_fn(vm, i);
                tlb_fn(env, vd, i, base + (off << scale), oi, ra);
            }
            i += 4, pg >>= 4;
        } while (i & 15);
    }
    set_helper_retaddr(0);
}

static void sve_st1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
                       target_ulong base, uint32_t desc, uintptr_t ra,
                       zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
{
    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
    intptr_t i, oprsz = simd_oprsz(desc) / 8;

    set_helper_retaddr(ra);
    for (i = 0; i < oprsz; i++) {
        uint8_t pg = *(uint8_t *)(vg + H1(i));
        if (likely(pg & 1)) {
            target_ulong off = off_fn(vm, i * 8);
            tlb_fn(env, vd, i * 8, base + (off << scale), oi, ra);
        }
    }
    set_helper_retaddr(0);
}
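
/*
 * Commentary (not part of the original file): the _zs variant above advances
 * four bytes per element, testing one predicate bit per 32-bit element within
 * each 16-bit predicate chunk, while the _zd variant advances one 64-bit
 * element at a time and tests the low bit of one predicate byte per element.
 */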

#define DO_ST1_ZPZ_S(MEM, OFS)                                          \
void __attribute__((flatten)) HELPER(sve_st##MEM##_##OFS)               \
    (CPUARMState *env, void *vd, void *vg, void *vm,                   \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_st1_zs(env, vd, vg, vm, base, desc, GETPC(),                    \
               off_##OFS##_s, sve_st1##MEM##_tlb);                      \
}

#define DO_ST1_ZPZ_D(MEM, OFS)                                          \
void __attribute__((flatten)) HELPER(sve_st##MEM##_##OFS)               \
    (CPUARMState *env, void *vd, void *vg, void *vm,                   \
     target_ulong base, uint32_t desc)                                  \
{                                                                       \
    sve_st1_zd(env, vd, vg, vm, base, desc, GETPC(),                    \
               off_##OFS##_d, sve_st1##MEM##_tlb);                      \
}
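
/*
 * For illustration (mechanical macro expansion, not part of the original
 * source): DO_ST1_ZPZ_D(dd_le, zd) defines HELPER(sve_stdd_le_zd), which
 * forwards to sve_st1_zd() with the 64-bit offset extractor off_zd_d and the
 * store primitive sve_st1dd_le_tlb.
 */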

DO_ST1_ZPZ_S(bs, zsu)
DO_ST1_ZPZ_S(hs_le, zsu)
DO_ST1_ZPZ_S(hs_be, zsu)
DO_ST1_ZPZ_S(ss_le, zsu)
DO_ST1_ZPZ_S(ss_be, zsu)

DO_ST1_ZPZ_S(bs, zss)
DO_ST1_ZPZ_S(hs_le, zss)
DO_ST1_ZPZ_S(hs_be, zss)
DO_ST1_ZPZ_S(ss_le, zss)
DO_ST1_ZPZ_S(ss_be, zss)

DO_ST1_ZPZ_D(bd, zsu)
DO_ST1_ZPZ_D(hd_le, zsu)
DO_ST1_ZPZ_D(hd_be, zsu)
DO_ST1_ZPZ_D(sd_le, zsu)
DO_ST1_ZPZ_D(sd_be, zsu)
DO_ST1_ZPZ_D(dd_le, zsu)
DO_ST1_ZPZ_D(dd_be, zsu)

DO_ST1_ZPZ_D(bd, zss)
DO_ST1_ZPZ_D(hd_le, zss)
DO_ST1_ZPZ_D(hd_be, zss)
DO_ST1_ZPZ_D(sd_le, zss)
DO_ST1_ZPZ_D(sd_be, zss)
DO_ST1_ZPZ_D(dd_le, zss)
DO_ST1_ZPZ_D(dd_be, zss)

DO_ST1_ZPZ_D(bd, zd)
DO_ST1_ZPZ_D(hd_le, zd)
DO_ST1_ZPZ_D(hd_be, zd)
DO_ST1_ZPZ_D(sd_le, zd)
DO_ST1_ZPZ_D(sd_be, zd)
DO_ST1_ZPZ_D(dd_le, zd)
DO_ST1_ZPZ_D(dd_be, zd)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D