/*
 * ARM SME Operations
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "tcg/tcg-gvec-desc.h"
#include "exec/helper-proto.h"
#include "exec/cpu_ldst.h"
#include "exec/exec-all.h"
#include "qemu/int128.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"

/* ResetSVEState */
void arm_reset_sve_state(CPUARMState *env)
{
    memset(env->vfp.zregs, 0, sizeof(env->vfp.zregs));
    /* Recall that FFR is stored as pregs[16]. */
    memset(env->vfp.pregs, 0, sizeof(env->vfp.pregs));
    /* The constant is the FPSR value with all cumulative exception bits and QC set. */
    vfp_set_fpsr(env, 0x0800009f);
}

void helper_set_pstate_sm(CPUARMState *env, uint32_t i)
{
    if (i == FIELD_EX64(env->svcr, SVCR, SM)) {
        return;
    }
    env->svcr ^= R_SVCR_SM_MASK;
    arm_reset_sve_state(env);
}

void helper_set_pstate_za(CPUARMState *env, uint32_t i)
{
    if (i == FIELD_EX64(env->svcr, SVCR, ZA)) {
        return;
    }
    env->svcr ^= R_SVCR_ZA_MASK;

    /*
     * ResetSMEState.
     *
     * SetPSTATE_ZA zeros on enable and disable. We can zero this only
     * on enable: while disabled, the storage is inaccessible and the
     * value does not matter. We're not saving the storage in vmstate
     * when disabled either.
     */
    if (i) {
        memset(env->zarray, 0, sizeof(env->zarray));
    }
}

void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
{
    uint32_t i;

    /*
     * Special case clearing the entire ZA space.
     * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
     * parts of the ZA storage outside of SVL.
     */
    if (imm == 0xff) {
        memset(env->zarray, 0, sizeof(env->zarray));
        return;
    }

    /*
     * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
     * so each row is discontiguous within ZA[].
     */
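    /*
     * For example, with 64-bit tile elements there are eight tiles
     * (ZA0.D..ZA7.D): storage row i belongs to tile ZA(i % 8).D, so
     * bit (i % 8) of IMM selects whether row i is cleared below.
     */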
    for (i = 0; i < svl; i++) {
        if (imm & (1 << (i % 8))) {
            memset(&env->zarray[i], 0, svl);
        }
    }
}


/*
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T. This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0. The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
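
/*
 * For example (illustrative numbers only): with 4-byte elements there
 * are four interleaved tiles, so element 1 of a vertical slice sits
 * four storage rows below element 0.  Four rows of
 * sizeof(ARMVectorReg) / 4 elements each give an array stride of
 * sizeof(ARMVectorReg), the same value the macro yields for every
 * element size.
 */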

/*
 * When doing byte arithmetic on the ZA storage, the element
 * byteoff bytes away in a tile vertical slice is always this
 * many bytes away in the ZA storage, regardless of the
 * size of the tile element, assuming that byteoff is a multiple
 * of the element size. Again this is because of the interleaving
 * of the tiles. For instance if we have 1 byte per element then
 * each row of the ZA storage has one byte of the vslice data,
 * and (counting from 0) byte 8 goes in row 8 of the storage
 * at offset (8 * row-size-in-bytes).
 * If we have 8 bytes per element then each row of the ZA storage
 * has 8 bytes of the data, but there are 8 interleaved tiles and
 * so byte 8 of the data goes into row 1 of the tile,
 * which is again row 8 of the storage, so the offset is still
 * (8 * row-size-in-bytes). Similarly for other element sizes.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))


/*
 * Move Zreg vector to ZArray column.
 */
#define DO_MOVA_C(NAME, TYPE, H) \
void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \
{ \
    int i, oprsz = simd_oprsz(desc); \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
            } \
            i += sizeof(TYPE); \
            pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)

void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pg = vg;
    uint64_t *n = vn;
    uint64_t *a = za;

    for (i = 0; i < oprsz; i++) {
        if (pg[H1(i)] & 1) {
            a[tile_vslice_index(i)] = n[i];
        }
    }
}

void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 16;
    uint16_t *pg = vg;
    Int128 *n = vn;
    Int128 *a = za;

    /*
     * Int128 is used here simply to copy 16 bytes, and to simplify
     * the address arithmetic.
     */
    for (i = 0; i < oprsz; i++) {
        if (pg[H2(i)] & 1) {
            a[tile_vslice_index(i)] = n[i];
        }
    }
}

#undef DO_MOVA_C

/*
 * Move ZArray column to Zreg vector.
 */
#define DO_MOVA_Z(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \
{ \
    int i, oprsz = simd_oprsz(desc); \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
            } \
            i += sizeof(TYPE); \
            pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)

void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pg = vg;
    uint64_t *d = vd;
    uint64_t *a = za;

    for (i = 0; i < oprsz; i++) {
        if (pg[H1(i)] & 1) {
            d[i] = a[tile_vslice_index(i)];
        }
    }
}

void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
{
    int i, oprsz = simd_oprsz(desc) / 16;
    uint16_t *pg = vg;
    Int128 *d = vd;
    Int128 *a = za;

    /*
     * Int128 is used here simply to copy 16 bytes, and to simplify
     * the address arithmetic.
     */
    for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
        if (pg[H2(i)] & 1) {
            d[i] = a[tile_vslice_index(i)];
        }
    }
}

#undef DO_MOVA_Z

/*
 * Clear elements in a tile slice comprising len bytes.
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);

static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    memset(ptr + off, 0, len);
}

static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 2) {
        *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 4) {
        *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 8) {
        *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}

static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memset(vptr + tile_vslice_offset(i + off), 0, 16);
    }
}

/*
 * Copy elements from an array into a tile slice comprising len bytes.
 */

typedef void CopyFn(void *dst, const void *src, size_t len);

static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}

static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *src = vsrc;
    uint8_t *dst = vdst;
    size_t i;

    for (i = 0; i < len; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *src = vsrc;
    uint16_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 2; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *src = vsrc;
    uint32_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 4; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *src = vsrc;
    uint64_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 8; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}

static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
    }
}

/*
 * Host and TLB primitives for vertical tile slice addressing.
 */

#define DO_LD(NAME, TYPE, HOST, TLB) \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \
{ \
    TYPE val = HOST(host); \
    *(TYPE *)(za + tile_vslice_offset(off)) = val; \
} \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \
                        intptr_t off, target_ulong addr, uintptr_t ra) \
{ \
    TYPE val = TLB(env, useronly_clean_ptr(addr), ra); \
    *(TYPE *)(za + tile_vslice_offset(off)) = val; \
}

#define DO_ST(NAME, TYPE, HOST, TLB) \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \
{ \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \
    HOST(host, val); \
} \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \
                        intptr_t off, target_ulong addr, uintptr_t ra) \
{ \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \
    TLB(env, useronly_clean_ptr(addr), val, ra); \
}

/*
 * The ARMVectorReg elements are stored in host-endian 64-bit units.
 * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 * corresponds to storing the two 64-bit pieces in little-endian order.
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \
static inline void HNAME##_host(void *za, intptr_t off, void *host) \
{ \
    uint64_t val0 = HOST(host), val1 = HOST(host + 8); \
    uint64_t *ptr = za + off; \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \
} \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \
{ \
    HNAME##_host(za, tile_vslice_offset(off), host); \
} \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \
                               target_ulong addr, uintptr_t ra) \
{ \
    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \
    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \
    uint64_t *ptr = za + off; \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \
} \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \
                                 target_ulong addr, uintptr_t ra) \
{ \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \
}

#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \
static inline void HNAME##_host(void *za, intptr_t off, void *host) \
{ \
    uint64_t *ptr = za + off; \
    HOST(host, ptr[BE]); \
    HOST(host + 8, ptr[!BE]); \
} \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \
{ \
    HNAME##_host(za, tile_vslice_offset(off), host); \
} \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \
                               target_ulong addr, uintptr_t ra) \
{ \
    uint64_t *ptr = za + off; \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \
} \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \
                                 target_ulong addr, uintptr_t ra) \
{ \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \
}

DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)

#undef DO_LD
#undef DO_ST
#undef DO_LDQ
#undef DO_STQ

/*
 * Common helper for all contiguous predicated loads.
 */

static inline QEMU_ALWAYS_INLINE
void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn,
             ClearFn *clr_fn,
             CopyFn *cpy_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no load occurs. */
        clr_fn(za, 0, reg_max);
        return;
    }

    /* Probe the page(s). Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_READ, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal. Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch = { };

        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        cpy_fn(za, &scratch, reg_max);
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    if (!vertical) {
        memset(za, 0, reg_max);
    } else if (reg_off) {
        clr_fn(za, 0, reg_off);
    }

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            } else if (vertical) {
                clr_fn(za, reg_off, esize);
            }
            reg_off += esize;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                } else if (vertical) {
                    clr_fn(za, reg_off, esize);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}

static inline QEMU_ALWAYS_INLINE
void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
                 target_ulong addr, uint32_t desc, uintptr_t ra,
                 const int esz, bool vertical,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn,
                 ClearFn *clr_fn,
                 CopyFn *cpy_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
            host_fn, tlb_fn, clr_fn, cpy_fn);
}

#define DO_LD(L, END, ESZ) \
void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
            sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
            clear_horizontal, copy_horizontal); \
} \
void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
            sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
            clear_vertical_##L, copy_vertical_##L); \
} \
void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
                sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
                clear_horizontal, copy_horizontal); \
} \
void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
                sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
                clear_vertical_##L, copy_vertical_##L); \
}

DO_LD(b, , MO_8)
DO_LD(h, _be, MO_16)
DO_LD(h, _le, MO_16)
DO_LD(s, _be, MO_32)
DO_LD(s, _le, MO_32)
DO_LD(d, _be, MO_64)
DO_LD(d, _le, MO_64)
DO_LD(q, _be, MO_128)
DO_LD(q, _le, MO_128)

#undef DO_LD

/*
 * Common helper for all contiguous predicated stores.
 */

static inline QEMU_ALWAYS_INLINE
void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no store occurs. */
        return;
    }

    /* Probe the page(s). Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_WRITE, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal. We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, za, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            }
            reg_off += 1 << esz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                }
                reg_off += 1 << esz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}

static inline QEMU_ALWAYS_INLINE
void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
                 uint32_t desc, uintptr_t ra, int esz, bool vertical,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
            vertical, host_fn, tlb_fn);
}

#define DO_ST(L, END, ESZ) \
void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
            sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
} \
void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
            sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
} \
void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
                sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
} \
void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
                sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
}

DO_ST(b, , MO_8)
DO_ST(h, _be, MO_16)
DO_ST(h, _le, MO_16)
DO_ST(s, _be, MO_32)
DO_ST(s, _le, MO_32)
DO_ST(d, _be, MO_64)
DO_ST(d, _le, MO_64)
DO_ST(q, _be, MO_128)
DO_ST(q, _le, MO_128)

#undef DO_ST

void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                for (col = 0; col < oprsz; ) {
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}

void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pn = vpn, *pm = vpm;
    uint64_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    zda[tile_vslice_index(row) + col] += zn[col];
                }
            }
        }
    }
}

void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                uint32_t zn_row = zn[H4(row)];
                for (col = 0; col < oprsz; ) {
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn_row;
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}

void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint8_t *pn = vpn, *pm = vpm;
    uint64_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            uint64_t zn_row = zn[row];
            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    zda[tile_vslice_index(row) + col] += zn_row;
                }
            }
        }
    }
}

void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, void *vst, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) << 31;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst;

    /*
     * Make a copy of float_status because this operation does not
     * update the cumulative fp exception status. It also produces
     * default nans.
     */
    fpst = *(float_status *)vst;
    set_default_nan_mode(true, &fpst);

    for (row = 0; row < oprsz; ) {
        uint16_t pa = pn[H2(row >> 4)];
        do {
            if (pa & 1) {
                void *vza_row = vza + tile_vslice_offset(row);
                uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;

                for (col = 0; col < oprsz; ) {
                    uint16_t pb = pm[H2(col >> 4)];
                    do {
                        if (pb & 1) {
                            uint32_t *a = vza_row + H1_4(col);
                            uint32_t *m = vzm + H1_4(col);
                            *a = float32_muladd(n, *m, *a, 0, &fpst);
                        }
                        col += 4;
                        pb >>= 4;
                    } while (col & 15);
                }
            }
            row += 4;
            pa >>= 4;
        } while (row & 15);
    }
}

void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, void *vst, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    uint64_t neg = (uint64_t)simd_data(desc) << 63;
    uint64_t *za = vza, *zn = vzn, *zm = vzm;
    uint8_t *pn = vpn, *pm = vpm;
    float_status fpst = *(float_status *)vst;

    set_default_nan_mode(true, &fpst);

    for (row = 0; row < oprsz; ++row) {
        if (pn[H1(row)] & 1) {
            uint64_t *za_row = &za[tile_vslice_index(row)];
            uint64_t n = zn[row] ^ neg;

            for (col = 0; col < oprsz; ++col) {
                if (pm[H1(col)] & 1) {
                    uint64_t *a = &za_row[col];
                    *a = float64_muladd(n, zm[col], *a, 0, &fpst);
                }
            }
        }
    }
}

/*
 * Alter PAIR as needed for controlling predicates being false,
 * and for NEG on an enabled row element.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    /*
     * The pseudocode uses a conditional negate after the conditional zero.
     * It is simpler here to unconditionally negate before conditional zero.
     */
    pair ^= neg;
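
    /*
     * The governing predicate has one bit per byte of the vector, with
     * only the low bit of each element's group significant.  For the
     * two fp16 elements packed in PAIR, those are bits 0 and 2 of PG.
     */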
    if (!(pg & 1)) {
        pair &= 0xffff0000u;
    }
    if (!(pg & 4)) {
        pair &= 0x0000ffffu;
    }
    return pair;
}

static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
                          float_status *s_std, float_status *s_odd)
{
    float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
    float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
    float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
    float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation. Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
     */
    t64 = float64_mul(e1r, e2r, s_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, s_std);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, s_std);
}

void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
                         void *vpm, void *vst, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) * 0x80008000u;
    uint16_t *pn = vpn, *pm = vpm;
    float_status fpst_odd, fpst_std;

    /*
     * Make a copy of float_status because this operation does not
     * update the cumulative fp exception status. It also produces
     * default nans. Make a second copy with round-to-odd -- see above.
     */
    fpst_std = *(float_status *)vst;
    set_default_nan_mode(true, &fpst_std);
    fpst_odd = fpst_std;
    set_float_rounding_mode(float_round_to_odd, &fpst_odd);

    for (row = 0; row < oprsz; ) {
        uint16_t prow = pn[H2(row >> 4)];
        do {
            void *vza_row = vza + tile_vslice_offset(row);
            uint32_t n = *(uint32_t *)(vzn + H1_4(row));

            n = f16mop_adj_pair(n, prow, neg);

            for (col = 0; col < oprsz; ) {
                uint16_t pcol = pm[H2(col >> 4)];
                do {
                    if (prow & pcol & 0b0101) {
                        uint32_t *a = vza_row + H1_4(col);
                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                        m = f16mop_adj_pair(m, pcol, 0);
                        *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
                    }
                    col += 4;
                    pcol >>= 4;
                } while (col & 15);
            }
            row += 4;
            prow >>= 4;
        } while (row & 15);
    }
}

void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
                        void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_maxsz(desc);
    uint32_t neg = simd_data(desc) * 0x80008000u;
    uint16_t *pn = vpn, *pm = vpm;

    for (row = 0; row < oprsz; ) {
        uint16_t prow = pn[H2(row >> 4)];
        do {
            void *vza_row = vza + tile_vslice_offset(row);
            uint32_t n = *(uint32_t *)(vzn + H1_4(row));

            n = f16mop_adj_pair(n, prow, neg);

            for (col = 0; col < oprsz; ) {
                uint16_t pcol = pm[H2(col >> 4)];
                do {
                    if (prow & pcol & 0b0101) {
                        uint32_t *a = vza_row + H1_4(col);
                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));

                        m = f16mop_adj_pair(m, pcol, 0);
                        *a = bfdotadd(*a, n, m);
                    }
                    col += 4;
                    pcol >>= 4;
                } while (col & 15);
            }
            row += 4;
            prow >>= 4;
        } while (row & 15);
    }
}

typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    bool neg = simd_data(desc);

    for (row = 0; row < oprsz; ++row) {
        uint8_t pa = pn[H1(row)];
        uint64_t *za_row = &za[tile_vslice_index(row)];
        uint64_t n = zn[row];

        for (col = 0; col < oprsz; ++col) {
            uint8_t pb = pm[H1(col)];
            uint64_t *a = &za_row[col];

            *a = fn(n, zm[col], *a, pa & pb, neg);
        }
    }
}

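/*
 * In each expansion below, each 32-bit (or 64-bit) lane of the
 * accumulator A gains the dot product of the corresponding four bytes
 * (or four halfwords) of N and M; P, the AND of the row and column
 * predicates, masks N so that inactive sub-elements contribute zero.
 */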
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{ \
    uint32_t sum0 = 0, sum1 = 0; \
    /* Apply P to N as a mask, making the inactive elements 0. */ \
    n &= expand_pred_b(p); \
    sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
    sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \
    sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
    sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \
    sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \
    sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40); \
    sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \
    sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56); \
    if (neg) { \
        sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1; \
    } else { \
        sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1; \
    } \
    return ((uint64_t)sum1 << 32) | sum0; \
}

#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{ \
    uint64_t sum = 0; \
    /* Apply P to N as a mask, making the inactive elements 0. */ \
    n &= expand_pred_h(p); \
    sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
    sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
    sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \
    sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \
    return neg ? a - sum : a + sum; \
}

DEF_IMOP_32(smopa_s, int8_t, int8_t)
DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
DEF_IMOP_32(usmopa_s, uint8_t, int8_t)

DEF_IMOP_64(smopa_d, int16_t, int16_t)
DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
DEF_IMOP_64(usmopa_d, uint16_t, int16_t)

#define DEF_IMOPH(NAME) \
    void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \
                            void *vpm, uint32_t desc) \
    { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }

DEF_IMOPH(smopa_s)
DEF_IMOPH(umopa_s)
DEF_IMOPH(sumopa_s)
DEF_IMOPH(usmopa_s)
DEF_IMOPH(smopa_d)
DEF_IMOPH(umopa_d)
DEF_IMOPH(sumopa_d)
DEF_IMOPH(usmopa_d)