/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"


/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)  { X, X, X, X, X, X, X, X }
#define DUP4(X)  { X, X, X, X }
#define DUP2(X)  { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X) X
#define DUP8(X)  X
#define DUP4(X)  X
#define DUP2(X)  X
#endif /* CONFIG_VECTOR16 */
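
/* A minimal illustration of the macros above: under CONFIG_VECTOR16,
 * an initializer such as
 *     vec64 t = DUP2(c);
 * replicates the scalar C into both 64-bit lanes of the 16-byte
 * vector, while in the fallback build the same DUP2(c) reduces to
 * the plain scalar C.
 */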

static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}
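
/* All of the helpers below operate on simd_oprsz(desc) bytes of live
 * data and then rely on clear_high to zero any tail bytes between
 * oprsz and simd_maxsz(desc).
 */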

void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

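/* For the dup helpers, a zero constant lets us skip the store loop:
 * forcing oprsz to 0 makes clear_high zero the entire maxsz bytes in
 * a single pass.
 */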
void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

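/* Multiplying by 0x00010001 (resp. 0x01010101) replicates the low 16
 * (resp. 8) bits across a 32-bit value, so the narrower dups can be
 * implemented in terms of gvec_dup32.
 */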
void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}

void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

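/* Note that andc and orc complement the second operand:
 * d = a & ~b and d = a | ~b respectively.
 */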
void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}
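
/* Shifts by immediate: the per-lane shift count is encoded in the
 * data field of the descriptor (simd_data) and is the same for
 * every lane.
 */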
void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

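/* Arithmetic right shifts use the signed vector types so that the
 * compiler emits sign-propagating shifts for each lane.
 */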
void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

/* If vectors are enabled, the compiler fills in -1 for true.
   Otherwise, we must take care of this by hand.  */
#ifdef CONFIG_VECTOR16
# define DO_CMP0(X) X
#else
# define DO_CMP0(X) -(X)
#endif
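
/* A scalar comparison yields 0 or 1, whereas a vector comparison
 * already yields 0 or -1 per lane; DO_CMP0 normalizes both cases
 * to the 0 / -1 encoding.
 */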
#define DO_CMP1(NAME, TYPE, OP)                                            \
void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
{                                                                          \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t i;                                                            \
    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
    }                                                                      \
    clear_high(d, oprsz, desc);                                            \
}

#define DO_CMP2(SZ) \
    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)
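
/* Instantiate the six comparison helpers for each lane width.  Only
 * the signed orders (lt/le) use the signed lane types; equality and
 * the unsigned orders compare the unsigned lanes.
 */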
DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)

#undef DO_CMP0
#undef DO_CMP1
#undef DO_CMP2