/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup. */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif

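/* Example (illustrative, not from the original source): on a big-endian
 * host, byte element 0 of a vector lives at offset H1(0) = 7 within the
 * first 64-bit chunk, so *(uint8_t *)(vd + H1(0)) names the same element
 * that a little-endian host keeps at offset 0. Whole 64-bit accesses are
 * already host-order, so they need no fixup macro.
 */
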
/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set. Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C. */
#define PREDTEST_INIT 1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen. */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G. */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G). Replace previous. */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
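
/* Worked example (illustrative, not from the original source): with
 * g = 0x30 (elements 4 and 5 active) and d = 0x10, the first active bit
 * of d is set, so N = 1; d & g != 0, so the Z-clear bit is set;
 * pow2floor(g) = 0x20 and d has that bit clear, so C = 1. The
 * accumulated flags word is therefore 0x80000003.
 */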

/* The same for a single word predicate. */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate. */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}

/* Expand active predicate bits to bytes, for byte elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}

/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    return word[byte & 0x55];
}

/* Similarly for single word elements. */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
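
/* Note on predicate consumption (illustrative, not from the original
 * source): SVE keeps one predicate bit per byte, so an element of size
 * sizeof(TYPE) owns sizeof(TYPE) predicate bits, of which only the
 * lowest may be set. Hence the loop advances with
 * i += sizeof(TYPE), pg >>= sizeof(TYPE), and tests only pg & 1.
 */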

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)
#define DO_DIV(N, M)  (M ? N / M : 0)

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types. */
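/* Worked example (illustrative, not from the original source): for the
 * byte case, int8_t inputs -128 * -128 give 16384 in int32_t and
 * (16384 >> 8) = 0x40, while uint8_t inputs 255 * 255 give 65025 and
 * (65025 >> 8) = 0xfe; both products are exact in the wider type, so
 * one helper serves both SMULH and UMULH.
 */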
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

422
423DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
424DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
425DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
426DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
427
428DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
429DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
430DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
431DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
432
433DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
434DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
435DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
436DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
437
438DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
439DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
440
441DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
442DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)

/* Note that all bits of the shift are significant
   and not modulo the element size. */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide". That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
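/* For example (illustrative, not from the original source): in
 * sve_lsr_zpzw_b below, all eight byte elements within one 64-bit
 * chunk of Zn are shifted by the single 64-bit count held in the
 * corresponding doubleword of Zm.
 */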
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
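
/* Note (illustrative, not from the original source): ((__typeof(N))-1 >> 1)
 * is the all-ones value with the top bit clear, so DO_FABS masks the IEEE
 * sign bit off (e.g. N & 0x7fffffff for 32-bit) and DO_FNEG flips it
 * (N ^ 0x80000000), without interpreting the rest of the format.
 */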

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension. E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change. For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPERED ret = INIT; \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
                ret = OP(ret, nn); \
            } \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
        } while (i & 15); \
    } \
    return (TYPERET)ret; \
}

#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPEE *n = vn; \
    uint8_t *pg = vg; \
    TYPER ret = INIT; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPEE nn = n[i]; \
            ret = OP(ret, nn); \
        } \
    } \
    return ret; \
}

DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D

#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL

/* Similar to the ARM LastActiveElement pseudocode function, except the
   result is multiplied by the element size. This includes the not found
   indication; e.g. not found for esz=3 is -8. */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
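
/* Worked example (illustrative, not from the original source): for
 * esz = 2 (word elements, one predicate bit per byte), the mask keeps
 * bit 0 of every 4-bit group. With words = 1 and g[0] = 0x11, the
 * highest surviving bit is bit 4, so the result is 4: element 1 scaled
 * by the 4-byte element size. With g[0] = 0, the result is -4.
 */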

uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G. */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}

uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit. */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}

/* Store zero into every active element of Zd. We will use this for two-
 * and three-operand predicated instructions for which logic dictates a
 * zero result. In particular, logical shift by element size, which is
 * otherwise undefined on the host.
 *
 * For element sizes smaller than uint64_t, we use tables to expand
 * the N bits of the controlling predicate to a byte mask, and clear
 * those bytes.
 */
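/* For example (illustrative, not from the original source): a predicate
 * byte of 0x05 with byte elements expands via expand_pred_b() to the
 * mask 0x0000000000ff00ff, so d[i] &= ~mask clears exactly bytes 0
 * and 2 of that 64-bit chunk.
 */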
void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        if (pg[H1(i)] & 1) {
            d[i] = 0;
        }
    }
}

/* Three-operand expander, immediate operand, controlled by a predicate.
 */
#define DO_ZPZI(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE imm = simd_data(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, imm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZI_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    TYPE imm = simd_data(desc); \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn, imm); \
        } \
    } \
}
936#define DO_SHR(N, M) (N >> M)
937#define DO_SHL(N, M) (N << M)
938
939/* Arithmetic shift right for division. This rounds negative numbers
940 toward zero as per signed division. Therefore before shifting,
941 when N is negative, add 2**M-1. */
942#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
943
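
/* Worked example (illustrative, not from the original source):
 * DO_ASRD(-7, 2) computes (-7 + 3) >> 2 = -1, matching -7 / 4 truncated
 * toward zero, where a plain arithmetic shift gives -7 >> 2 = -2.
 */
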
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

#undef DO_SHR
#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D

/* Fully general four-operand expander, controlled by a predicate.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                TYPE aa = *(TYPE *)(va + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *a = va, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE aa = a[i], nn = n[i], mm = m[i]; \
            d[i] = OP(aa, nn, mm); \
        } \
    } \
}

#define DO_MLA(A, N, M)  (A + N * M)
#define DO_MLS(A, N, M)  (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D

void HELPER(sve_index_b)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H1(i)] = start + i * incr;
    }
}

void HELPER(sve_index_h)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H2(i)] = start + i * incr;
    }
}

void HELPER(sve_index_s)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H4(i)] = start + i * incr;
    }
}

void HELPER(sve_index_d)(void *vd, uint64_t start,
                         uint64_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = start + i * incr;
    }
}
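
/* Usage sketch (illustrative, not from the original source): INDEX
 * fills a vector with an arithmetic series, so sve_index_b() with
 * start = 0 and incr = 1 writes 0, 1, 2, ... into successive byte
 * elements.
 */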

void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t sh = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
    }
}

void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
    }
}

void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        d[i] = coeff[idx] | (exp << 10);
    }
}

void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        d[i] = coeff[idx] | (exp << 23);
    }
}

void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        d[i] = coeff[idx] | (exp << 52);
    }
}

void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint16_t nn = n[i];
        uint16_t mm = m[i];
        if (mm & 1) {
            nn = float16_one;
        }
        d[i] = nn ^ (mm & 2) << 14;
    }
}

void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint32_t nn = n[i];
        uint32_t mm = m[i];
        if (mm & 1) {
            nn = float32_one;
        }
        d[i] = nn ^ (mm & 2) << 30;
    }
}

void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        if (mm & 1) {
            nn = float64_one;
        }
        d[i] = nn ^ (mm & 2) << 62;
    }
}
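
/* Note (illustrative, not from the original source): as in FTSSEL,
 * bit 0 of the control element selects 1.0 in place of the input,
 * and bit 1 negates the result; the (mm & 2) << 14/30/62 shifts move
 * control bit 1 onto the IEEE sign bit of each format.
 */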

/*
 * Signed saturating addition with scalar operand.
 */

void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int r = *(int8_t *)(a + i) + b;
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(int8_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) + b;
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int64_t r = *(int32_t *)(a + i) + b;
        if (r > INT32_MAX) {
            r = INT32_MAX;
        } else if (r < INT32_MIN) {
            r = INT32_MIN;
        }
        *(int32_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t r = ai + b;
        if (((r ^ ai) & ~(ai ^ b)) < 0) {
            /* Signed overflow: ai and b have the same sign, and the
               sign of the result differs from that of the addends. */
            r = (r < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = r;
    }
}

/*
 * Unsigned saturating addition with scalar operand.
 */

void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(uint8_t *)(a + i) + b;
        if (r > UINT8_MAX) {
            r = UINT8_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint8_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        int r = *(uint16_t *)(a + i) + b;
        if (r > UINT16_MAX) {
            r = UINT16_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint16_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        int64_t r = *(uint32_t *)(a + i) + b;
        if (r > UINT32_MAX) {
            r = UINT32_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint32_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t r = *(uint64_t *)(a + i) + b;
        if (r < b) {
            r = UINT64_MAX;
        }
        *(uint64_t *)(d + i) = r;
    }
}

void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
    }
}

/* Two operand predicated copy immediate with merge. All valid immediates
 * can fit within 17 signed bits in the simd_data field.
 */
void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_8, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}
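
/* Note (illustrative, not from the original source): the expanded
 * predicate pp acts as a byte-granular select mask, so
 * (mm & pp) | (nn & ~pp) merges the duplicated immediate into the
 * active elements without any per-element branching.
 */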

void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_16, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_32, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        d[i] = (pg[H1(i)] & 1 ? mm : nn);
    }
}

void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_8, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_16, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_32, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = (pg[H1(i)] & 1 ? val : 0);
    }
}

/* Big-endian hosts need to frob the byte indices. If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
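
/* Example (illustrative, not from the original source): on a big-endian
 * host, a move of n = 6 bytes whose addresses share 2-byte alignment
 * takes the uint16_t loop, and the H1_2() fixups keep each half-word at
 * the offset that names the same element on a little-endian host.
 */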

void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm. Need temp space. */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
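
/* Note (illustrative, not from the original source): EXT concatenates
 * bytes n_ofs .. opr_sz-1 of Zn with bytes 0 .. n_ofs-1 of Zm; the
 * three cases above order the two swap_memmove() calls so that no
 * source byte is overwritten before it is read, falling back to a
 * temporary only when all three registers alias.
 */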

#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
    *(TYPE *)(vd + H(0)) = val; \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, )

#undef DO_INSR

void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}

static inline uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    h = rol64(h, 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}
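
/* Worked example (illustrative, not from the original source): hswap64
 * reverses the four 16-bit lanes of a 64-bit word, e.g.
 * 0xaaaabbbbccccdddd -> 0xddddccccbbbbaaaa: the rol64 swaps the two
 * 32-bit halves, and the masked shifts then swap the 16-bit lanes
 * within each half.
 */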

void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}

void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = rol64(b, 32);
        *(uint64_t *)(vd + j) = rol64(f, 32);
    }
}

void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = b;
        *(uint64_t *)(vd + j) = f;
    }
}

#define DO_TBL(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    uintptr_t elem = opr_sz / sizeof(TYPE); \
    TYPE *d = vd, *n = vn, *m = vm; \
    ARMVectorReg tmp; \
    if (unlikely(vd == vn)) { \
        n = memcpy(&tmp, vn, opr_sz); \
    } \
    for (i = 0; i < elem; i++) { \
        TYPE j = m[H(i)]; \
        d[H(i)] = j < elem ? n[H(j)] : 0; \
    } \
}

DO_TBL(sve_tbl_b, uint8_t, H1)
DO_TBL(sve_tbl_h, uint16_t, H2)
DO_TBL(sve_tbl_s, uint32_t, H4)
DO_TBL(sve_tbl_d, uint64_t, )

#undef DO_TBL

#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd; \
    TYPES *n = vn; \
    ARMVectorReg tmp; \
    if (unlikely(vn - vd < opr_sz)) { \
        n = memcpy(&tmp, n, opr_sz / 2); \
    } \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
        d[HD(i)] = n[HS(i)]; \
    } \
}
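
/* Note (illustrative, not from the original source): the source data
 * occupies only the low half of the operand, so when vd and vn overlap
 * closely enough that widening in place would clobber source elements
 * not yet read, the macro first copies the opr_sz / 2 source bytes to
 * a temporary.
 */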

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)

#undef DO_UNPK