2 * Loongson Multimedia Instruction emulation helpers for QEMU.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "exec/helper-proto.h"
25 * If the byte ordering doesn't matter, i.e. all columns are treated
26 * identically, then this union can be used directly. If byte ordering
27 * does matter, we generally ignore dumping to memory.
39 /* Some byte ordering issues can be mitigated by XORing in the following. */
/*
 * BYTE_ORDER_XOR(N) expands to N when HOST_WORDS_BIGENDIAN is defined; a
 * second definition expanding to 0 follows.  Helpers in this file XOR the
 * result into lane indices (e.g. `vd.uh[i ^ host]`).
 * NOTE(review): the `#else` / `#endif` lines that separate the two
 * definitions are not visible in this view.
 */
40 #ifdef HOST_WORDS_BIGENDIAN
41 # define BYTE_ORDER_XOR(N) N
43 # define BYTE_ORDER_XOR(N) 0
/*
 * Saturation macros.
 *
 * SATSB / SATSH clamp to [-0x80, 0x7f] and [-0x8000, 0x7fff] respectively.
 * SATUB / SATUH / SATUW only clamp the upper bound (0xff, 0xffff,
 * 0xffffffff); a negative argument is passed through unchanged.
 *
 * Each macro evaluates `x` more than once and does not parenthesize it, so
 * only simple, side-effect-free operands should be passed.
 */
46 #define SATSB(x) (x < -0x80 ? -0x80 : x > 0x7f ? 0x7f : x)
47 #define SATUB(x) (x > 0xff ? 0xff : x)
49 #define SATSH(x) (x < -0x8000 ? -0x8000 : x > 0x7fff ? 0x7fff : x)
50 #define SATUH(x) (x > 0xffff ? 0xffff : x)
/*
 * NOTE(review): the next line is a macro body that clamps to
 * [-0x80000000, 0x7fffffff]; its `#define` line is not visible in this
 * view (presumably the signed 32-bit counterpart of SATUW -- confirm).
 */
53 (x < -0x80000000ll ? -0x80000000ll : x > 0x7fffffff ? 0x7fffffff : x)
54 #define SATUW(x) (x > 0xffffffffull ? 0xffffffffull : x)
/*
 * Helper for paddsb: for each of eight lanes, adds the `sb` elements of
 * `vs` and `vt` into an `int` temporary.
 * NOTE(review): the declarations, the statement that stores `r` back
 * (presumably saturated with SATSB) and the return are not visible in this
 * view; `vs`/`vt` are presumably lane views of `fs`/`ft` -- confirm.
 */
56 uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
63 for (i = 0; i < 8; ++i) {
64 int r = vs.sb[i] + vt.sb[i];
/*
 * Helper for paddusb: for each of eight lanes, adds the `ub` elements of
 * `vs` and `vt` into an `int` temporary.
 * NOTE(review): the declarations, the statement that stores `r` back
 * (presumably saturated with SATUB) and the return are not visible in this
 * view; `vs`/`vt` are presumably lane views of `fs`/`ft` -- confirm.
 */
70 uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
77 for (i = 0; i < 8; ++i) {
78 int r = vs.ub[i] + vt.ub[i];
/*
 * Helper for paddsh: for each of four lanes, adds the `sh` elements of
 * `vs` and `vt` into an `int` temporary.
 * NOTE(review): the declarations, the statement that stores `r` back
 * (presumably saturated with SATSH) and the return are not visible in this
 * view; `vs`/`vt` are presumably lane views of `fs`/`ft` -- confirm.
 */
84 uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
91 for (i = 0; i < 4; ++i) {
92 int r = vs.sh[i] + vt.sh[i];
/*
 * Helper for paddush: for each of four lanes, adds the `uh` elements of
 * `vs` and `vt` into an `int` temporary.
 * NOTE(review): the declarations, the statement that stores `r` back
 * (presumably saturated with SATUH) and the return are not visible in this
 * view; `vs`/`vt` are presumably lane views of `fs`/`ft` -- confirm.
 */
98 uint64_t helper_paddush(uint64_t fs, uint64_t ft)
105 for (i = 0; i < 4; ++i) {
106 int r = vs.uh[i] + vt.uh[i];
/*
 * Helper for paddb: adds each of the eight `ub` lanes of `vt` into the
 * matching lane of `vs` in place; the compound assignment narrows the sum
 * back to the lane type, so no saturation is applied here.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
112 uint64_t helper_paddb(uint64_t fs, uint64_t ft)
119 for (i = 0; i < 8; ++i) {
120 vs.ub[i] += vt.ub[i];
/*
 * Helper for paddh: adds each of the four `uh` lanes of `vt` into the
 * matching lane of `vs` in place; the compound assignment narrows the sum
 * back to the lane type, so no saturation is applied here.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
125 uint64_t helper_paddh(uint64_t fs, uint64_t ft)
132 for (i = 0; i < 4; ++i) {
133 vs.uh[i] += vt.uh[i];
/*
 * Helper for paddw: adds each of the two `uw` lanes of `vt` into the
 * matching lane of `vs` in place, with no saturation step.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
138 uint64_t helper_paddw(uint64_t fs, uint64_t ft)
145 for (i = 0; i < 2; ++i) {
146 vs.uw[i] += vt.uw[i];
/*
 * Helper for psubsb: for each of eight lanes, subtracts the `sb` element of
 * `vt` from that of `vs` into an `int` temporary.
 * NOTE(review): the declarations, the statement that stores `r` back
 * (presumably saturated with SATSB) and the return are not visible in this
 * view.
 */
151 uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
158 for (i = 0; i < 8; ++i) {
159 int r = vs.sb[i] - vt.sb[i];
/*
 * Helper for psubusb: for each of eight lanes, subtracts the `ub` element
 * of `vt` from that of `vs` into an `int` temporary, which can be negative.
 * NOTE(review): the statement that stores `r` back is not visible in this
 * view.  If it relies on SATUB, note that SATUB only clamps the upper
 * bound, so a negative difference would not be clamped to zero -- confirm
 * the intended unsigned-saturation behaviour.
 */
165 uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
172 for (i = 0; i < 8; ++i) {
173 int r = vs.ub[i] - vt.ub[i];
/*
 * Helper for psubsh: for each of four lanes, subtracts the `sh` element of
 * `vt` from that of `vs` into an `int` temporary.
 * NOTE(review): the declarations, the statement that stores `r` back
 * (presumably saturated with SATSH) and the return are not visible in this
 * view.
 */
179 uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
186 for (i = 0; i < 4; ++i) {
187 int r = vs.sh[i] - vt.sh[i];
/*
 * Helper for psubush: for each of four lanes, subtracts the `uh` element
 * of `vt` from that of `vs` into an `int` temporary, which can be negative.
 * NOTE(review): the statement that stores `r` back is not visible in this
 * view.  If it relies on SATUH, note that SATUH only clamps the upper
 * bound, so a negative difference would not be clamped to zero -- confirm
 * the intended unsigned-saturation behaviour.
 */
193 uint64_t helper_psubush(uint64_t fs, uint64_t ft)
200 for (i = 0; i < 4; ++i) {
201 int r = vs.uh[i] - vt.uh[i];
/*
 * Helper for psubb: subtracts each of the eight `ub` lanes of `vt` from the
 * matching lane of `vs` in place, with no saturation step.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
207 uint64_t helper_psubb(uint64_t fs, uint64_t ft)
214 for (i = 0; i < 8; ++i) {
215 vs.ub[i] -= vt.ub[i];
/*
 * Helper for psubh: subtracts each of the four `uh` lanes of `vt` from the
 * matching lane of `vs` in place, with no saturation step.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
220 uint64_t helper_psubh(uint64_t fs, uint64_t ft)
227 for (i = 0; i < 4; ++i) {
228 vs.uh[i] -= vt.uh[i];
/*
 * Helper for psubw: subtracts each of the two `uw` lanes of `vt` from the
 * matching lane of `vs` in place, with no saturation step.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
233 uint64_t helper_psubw(uint64_t fs, uint64_t ft)
240 for (i = 0; i < 2; ++i) {
241 vs.uw[i] -= vt.uw[i];
/*
 * Helper for pshufh: builds four `uh` lanes in `vd`, each copied from a
 * lane of `vs` chosen by `ft`.  `ft` is shifted right by two bits per
 * iteration, so destination lane i is selected by bits [2i+1:2i] of `ft`.
 * `host` (BYTE_ORDER_XOR(3)) is XORed into both indices to account for
 * host byte order.
 * NOTE(review): declarations, setup of `vs`/`vd` and the return are not
 * visible in this view.
 */
246 uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
248 unsigned host = BYTE_ORDER_XOR(3);
254 for (i = 0; i < 4; i++, ft >>= 2) {
255 vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host];
/*
 * Helper for packsswh: takes, in order, the low and high 32-bit halves of
 * `fs` and then of `ft` as int32_t values, and ORs the low 16 bits of each
 * into `fd` at bit offsets 0, 16, 32 and 48.
 * NOTE(review): the declarations of `fd`/`tmp`, the return, and a statement
 * between each load and its merge (presumably a SATSH saturation) are not
 * visible in this view.  The `<< 32` and `<< 48` shifts require `tmp` to be
 * a 64-bit type -- confirm.
 */
260 uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
265 tmp = (int32_t)(fs >> 0);
267 fd |= (tmp & 0xffff) << 0;
269 tmp = (int32_t)(fs >> 32);
271 fd |= (tmp & 0xffff) << 16;
273 tmp = (int32_t)(ft >> 0);
275 fd |= (tmp & 0xffff) << 32;
277 tmp = (int32_t)(ft >> 32);
279 fd |= (tmp & 0xffff) << 48;
/*
 * Helper for packsshb: narrows the four 16-bit fields of `fs` into bytes
 * 0-3 of `fd`, then the four 16-bit fields of `ft` into bytes 4-7.  Each
 * field is read as an int16_t and its low eight bits are merged in.
 * NOTE(review): the declaration of `fd`, the return, and a statement
 * between each load and its merge (presumably a SATSB saturation) are not
 * visible in this view.
 */
284 uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
289 for (i = 0; i < 4; ++i) {
290 int16_t tmp = fs >> (i * 16);
292 fd |= (uint64_t)(tmp & 0xff) << (i * 8);
294 for (i = 0; i < 4; ++i) {
295 int16_t tmp = ft >> (i * 16);
297 fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
/*
 * Helper for packushb: narrows the four 16-bit fields of `fs` into bytes
 * 0-3 of `fd`, then the four 16-bit fields of `ft` into bytes 4-7.  Each
 * field is read as a (signed) int16_t and its low eight bits are merged in.
 * NOTE(review): the declaration of `fd`, the return, and a statement
 * between each load and its merge (presumably a SATUB saturation) are not
 * visible in this view.  If SATUB is used, negative `tmp` values are not
 * clamped to zero because SATUB only bounds the upper end -- confirm the
 * intended behaviour for negative inputs.
 */
303 uint64_t helper_packushb(uint64_t fs, uint64_t ft)
308 for (i = 0; i < 4; ++i) {
309 int16_t tmp = fs >> (i * 16);
311 fd |= (uint64_t)(tmp & 0xff) << (i * 8);
313 for (i = 0; i < 4; ++i) {
314 int16_t tmp = ft >> (i * 16);
316 fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
/*
 * Helper for punpcklwd: combine the low 32-bit words of both operands.
 *
 * fs: supplies the low word of the result (its own low word).
 * ft: supplies the high word of the result (its own low word).
 * Returns the 64-bit value { low32(ft) : low32(fs) }.
 */
uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
{
    const uint64_t result_low = (uint32_t)fs;  /* keep bits 31..0 of fs */
    const uint64_t result_high = ft << 32;     /* move low word of ft up */

    return result_high | result_low;
}
/*
 * Helper for punpckhwd: combine the high 32-bit words of both operands.
 *
 * fs: supplies the low word of the result (its own high word).
 * ft: supplies the high word of the result (its own high word, in place).
 * Returns the 64-bit value { high32(ft) : high32(fs) }.
 */
uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
{
    const uint64_t result_low = fs >> 32;                    /* high word of fs, moved down */
    const uint64_t result_high = ft & 0xffffffff00000000ull; /* high word of ft, unmoved */

    return result_low | result_high;
}
/*
 * Helper for punpcklhw: interleaves `uh` lanes 0 and 1 of `vs` and `vt`.
 * Destination lanes 0..3 receive vs[0], vt[0], vs[1], vt[1]; `host`
 * (BYTE_ORDER_XOR(3)) is XORed into every index to account for host byte
 * order.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
332 uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
334 unsigned host = BYTE_ORDER_XOR(3);
339 vd.uh[0 ^ host] = vs.uh[0 ^ host];
340 vd.uh[1 ^ host] = vt.uh[0 ^ host];
341 vd.uh[2 ^ host] = vs.uh[1 ^ host];
342 vd.uh[3 ^ host] = vt.uh[1 ^ host];
/*
 * Helper for punpckhhw: interleaves `uh` lanes 2 and 3 of `vs` and `vt`.
 * Destination lanes 0..3 receive vs[2], vt[2], vs[3], vt[3]; `host`
 * (BYTE_ORDER_XOR(3)) is XORed into every index to account for host byte
 * order.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
347 uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
349 unsigned host = BYTE_ORDER_XOR(3);
354 vd.uh[0 ^ host] = vs.uh[2 ^ host];
355 vd.uh[1 ^ host] = vt.uh[2 ^ host];
356 vd.uh[2 ^ host] = vs.uh[3 ^ host];
357 vd.uh[3 ^ host] = vt.uh[3 ^ host];
/*
 * Helper for punpcklbh: interleaves `ub` lanes 0..3 of `vs` and `vt`.
 * Even destination lanes take successive `vs` bytes and odd destination
 * lanes take the matching `vt` bytes; `host` (BYTE_ORDER_XOR(7)) is XORed
 * into every index to account for host byte order.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
362 uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
364 unsigned host = BYTE_ORDER_XOR(7);
369 vd.ub[0 ^ host] = vs.ub[0 ^ host];
370 vd.ub[1 ^ host] = vt.ub[0 ^ host];
371 vd.ub[2 ^ host] = vs.ub[1 ^ host];
372 vd.ub[3 ^ host] = vt.ub[1 ^ host];
373 vd.ub[4 ^ host] = vs.ub[2 ^ host];
374 vd.ub[5 ^ host] = vt.ub[2 ^ host];
375 vd.ub[6 ^ host] = vs.ub[3 ^ host];
376 vd.ub[7 ^ host] = vt.ub[3 ^ host];
/*
 * Helper for punpckhbh: interleaves `ub` lanes 4..7 of `vs` and `vt`.
 * Even destination lanes take successive `vs` bytes and odd destination
 * lanes take the matching `vt` bytes; `host` (BYTE_ORDER_XOR(7)) is XORed
 * into every index to account for host byte order.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
381 uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
383 unsigned host = BYTE_ORDER_XOR(7);
388 vd.ub[0 ^ host] = vs.ub[4 ^ host];
389 vd.ub[1 ^ host] = vt.ub[4 ^ host];
390 vd.ub[2 ^ host] = vs.ub[5 ^ host];
391 vd.ub[3 ^ host] = vt.ub[5 ^ host];
392 vd.ub[4 ^ host] = vs.ub[6 ^ host];
393 vd.ub[5 ^ host] = vt.ub[6 ^ host];
394 vd.ub[6 ^ host] = vs.ub[7 ^ host];
395 vd.ub[7 ^ host] = vt.ub[7 ^ host];
/*
 * Helper for pavgh: replaces each of the four `uh` lanes of `vs` with the
 * average of that lane and the matching `vt` lane; the `+ 1` before the
 * shift rounds halves upward.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
400 uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
407 for (i = 0; i < 4; i++) {
408 vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1;
/*
 * Helper for pavgb: replaces each of the eight `ub` lanes of `vs` with the
 * average of that lane and the matching `vt` lane; the `+ 1` before the
 * shift rounds halves upward.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
413 uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
420 for (i = 0; i < 8; i++) {
421 vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1;
/*
 * Helper for pmaxsh: replaces each of the four `sh` lanes of `vs` with the
 * larger of that lane and the matching `vt` lane.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
426 uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
433 for (i = 0; i < 4; i++) {
434 vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
/*
 * Helper for pminsh: replaces each of the four `sh` lanes of `vs` with the
 * smaller of that lane and the matching `vt` lane.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
439 uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
446 for (i = 0; i < 4; i++) {
447 vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
/*
 * Helper for pmaxub: per-byte unsigned maximum of the two operands.
 *
 * fs, ft: 64-bit operands, each treated as eight independent unsigned bytes.
 * Returns a 64-bit value whose byte k is the larger of byte k of fs and
 * byte k of ft.
 *
 * A 64-bit operand holds eight byte lanes, so the loop must visit all
 * eight (a four-iteration loop leaves half of the result unprocessed).
 * Every lane is treated identically, so the bytes are extracted with
 * shifts and the result does not depend on host byte order.
 */
uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned i;

    for (i = 0; i < 8; i++) {
        unsigned shift = i * 8;
        uint8_t a = fs >> shift;   /* truncation keeps just this lane */
        uint8_t b = ft >> shift;

        fd |= (uint64_t)(a >= b ? a : b) << shift;
    }
    return fd;
}
/*
 * Helper for pminub: per-byte unsigned minimum of the two operands.
 *
 * fs, ft: 64-bit operands, each treated as eight independent unsigned bytes.
 * Returns a 64-bit value whose byte k is the smaller of byte k of fs and
 * byte k of ft.
 *
 * A 64-bit operand holds eight byte lanes, so the loop must visit all
 * eight (a four-iteration loop leaves half of the result unprocessed).
 * Every lane is treated identically, so the bytes are extracted with
 * shifts and the result does not depend on host byte order.
 */
uint64_t helper_pminub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned i;

    for (i = 0; i < 8; i++) {
        unsigned shift = i * 8;
        uint8_t a = fs >> shift;   /* truncation keeps just this lane */
        uint8_t b = ft >> shift;

        fd |= (uint64_t)(a <= b ? a : b) << shift;
    }
    return fd;
}
/*
 * Helper for pcmpeqw: for each of the two `uw` lanes, the comparison
 * yields 0 or 1 and the negation turns that into 0 or -1, so the lane
 * becomes all-ones when the operands' lanes are equal and zero otherwise.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
478 uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
485 for (i = 0; i < 2; i++) {
486 vs.uw[i] = -(vs.uw[i] == vt.uw[i]);
/*
 * Helper for pcmpgtw: for each of the two `uw` lanes, the lane becomes
 * all-ones when the `vs` lane is greater than the `vt` lane and zero
 * otherwise (0/1 comparison result negated to 0/-1).
 * NOTE(review): the comparison is made on the `uw` view; if that view is
 * unsigned this is an unsigned greater-than -- confirm that matches the
 * instruction's definition.  Declarations, operand setup and the return
 * are not visible in this view.
 */
491 uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
498 for (i = 0; i < 2; i++) {
499 vs.uw[i] = -(vs.uw[i] > vt.uw[i]);
/*
 * Helper for pcmpeqh: for each of the four `uh` lanes, the comparison
 * yields 0 or 1 and the negation turns that into 0 or -1, so the lane
 * becomes all-ones when the operands' lanes are equal and zero otherwise.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
504 uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
511 for (i = 0; i < 4; i++) {
512 vs.uh[i] = -(vs.uh[i] == vt.uh[i]);
/*
 * Helper for pcmpgth: for each of the four `uh` lanes, the lane becomes
 * all-ones when the `vs` lane is greater than the `vt` lane and zero
 * otherwise (0/1 comparison result negated to 0/-1).
 * NOTE(review): the comparison is made on the `uh` view; if that view is
 * unsigned this is an unsigned greater-than -- confirm that matches the
 * instruction's definition.  Declarations, operand setup and the return
 * are not visible in this view.
 */
517 uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
524 for (i = 0; i < 4; i++) {
525 vs.uh[i] = -(vs.uh[i] > vt.uh[i]);
/*
 * Helper for pcmpeqb: for each of the eight `ub` lanes, the comparison
 * yields 0 or 1 and the negation turns that into 0 or -1, so the lane
 * becomes all-ones when the operands' lanes are equal and zero otherwise.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
530 uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft)
537 for (i = 0; i < 8; i++) {
538 vs.ub[i] = -(vs.ub[i] == vt.ub[i]);
/*
 * Helper for pcmpgtb: for each of the eight `ub` lanes, the lane becomes
 * all-ones when the `vs` lane is greater than the `vt` lane and zero
 * otherwise (0/1 comparison result negated to 0/-1).
 * NOTE(review): the comparison is made on the `ub` view; if that view is
 * unsigned this is an unsigned greater-than -- confirm that matches the
 * instruction's definition.  Declarations, operand setup and the return
 * are not visible in this view.
 */
543 uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
550 for (i = 0; i < 8; i++) {
551 vs.ub[i] = -(vs.ub[i] > vt.ub[i]);
/*
 * Helper for psllw.  Only the signature and a two-iteration loop header
 * are visible in this view.
 * NOTE(review): the handling of the shift count in `ft`, the loop body and
 * the return lie outside this view; presumably each of two 32-bit lanes of
 * `fs` is shifted left -- confirm against the full source.
 */
556 uint64_t helper_psllw(uint64_t fs, uint64_t ft)
566 for (i = 0; i < 2; ++i) {
/*
 * Helper for psrlw.  Only the signature and a two-iteration loop header
 * are visible in this view.
 * NOTE(review): the handling of the shift count in `ft`, the loop body and
 * the return lie outside this view; presumably each of two 32-bit lanes of
 * `fs` is shifted right logically -- confirm against the full source.
 */
572 uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
582 for (i = 0; i < 2; ++i) {
/*
 * Helper for psraw.  Only the signature and a two-iteration loop header
 * are visible in this view.
 * NOTE(review): the handling of the shift count in `ft`, the loop body and
 * the return lie outside this view; presumably each of two 32-bit lanes of
 * `fs` is shifted right arithmetically -- confirm against the full source.
 */
588 uint64_t helper_psraw(uint64_t fs, uint64_t ft)
598 for (i = 0; i < 2; ++i) {
/*
 * Helper for psllh.  Only the signature and a four-iteration loop header
 * are visible in this view.
 * NOTE(review): the handling of the shift count in `ft`, the loop body and
 * the return lie outside this view; presumably each of four 16-bit lanes of
 * `fs` is shifted left -- confirm against the full source.
 */
604 uint64_t helper_psllh(uint64_t fs, uint64_t ft)
614 for (i = 0; i < 4; ++i) {
/*
 * Helper for psrlh.  Only the signature and a four-iteration loop header
 * are visible in this view.
 * NOTE(review): the handling of the shift count in `ft`, the loop body and
 * the return lie outside this view; presumably each of four 16-bit lanes of
 * `fs` is shifted right logically -- confirm against the full source.
 */
620 uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
630 for (i = 0; i < 4; ++i) {
/*
 * Helper for psrah.  Only the signature and a four-iteration loop header
 * are visible in this view.
 * NOTE(review): the handling of the shift count in `ft`, the loop body and
 * the return lie outside this view; presumably each of four 16-bit lanes of
 * `fs` is shifted right arithmetically -- confirm against the full source.
 */
636 uint64_t helper_psrah(uint64_t fs, uint64_t ft)
646 for (i = 0; i < 4; ++i) {
/*
 * Helper for pmullh: multiplies each of the four `sh` lanes of `vs` by the
 * matching `vt` lane in place; the compound assignment narrows the product
 * back to the lane type, keeping only its low-order bits.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
652 uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
659 for (i = 0; i < 4; ++i) {
660 vs.sh[i] *= vt.sh[i];
/*
 * Helper for pmulhh: for each of four lanes, multiplies the `sh` elements
 * of `vs` and `vt` into a 32-bit signed temporary.
 * NOTE(review): the declarations, the statement that stores part of `r`
 * back (presumably its upper 16 bits) and the return are not visible in
 * this view.
 */
665 uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
672 for (i = 0; i < 4; ++i) {
673 int32_t r = vs.sh[i] * vt.sh[i];
/*
 * Helper for pmulhuh: for each of four lanes, multiplies the `uh` elements
 * of `vs` and `vt` into a 32-bit unsigned temporary.
 * NOTE(review): the declarations, the statement that stores part of `r`
 * back (presumably its upper 16 bits) and the return are not visible in
 * this view.  If the `uh` lanes are uint16_t, both operands are promoted
 * to `int` before the multiply, so a product above INT_MAX (for example
 * 0xffff * 0xffff) overflows a signed type before it is converted to
 * uint32_t -- consider casting one operand to uint32_t.
 */
679 uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
686 for (i = 0; i < 4; ++i) {
687 uint32_t r = vs.uh[i] * vt.uh[i];
/*
 * Helper for pmaddhw: multiplies matching `sh` lanes of `vs` and `vt` and
 * adds the products pairwise.  `p0` is the sum of the lane-0 and lane-1
 * products, `p1` the sum of the lane-2 and lane-3 products; the result
 * places `p1` in bits 63..32 and ORs `p0` into the low part.  `host`
 * (BYTE_ORDER_XOR(3)) is XORed into every lane index to account for host
 * byte order.
 * NOTE(review): the declarations of `p0`/`p1` and the operand setup are
 * not visible in this view.  If `p0` is a signed 32-bit type, a negative
 * `p0` is sign-extended when ORed with the 64-bit value and would set all
 * of the upper 32 bits -- confirm that `p0` is unsigned.
 */
693 uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
695 unsigned host = BYTE_ORDER_XOR(3);
701 p0 = vs.sh[0 ^ host] * vt.sh[0 ^ host];
702 p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host];
703 p1 = vs.sh[2 ^ host] * vt.sh[2 ^ host];
704 p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host];
706 return ((uint64_t)p1 << 32) | p0;
/*
 * Helper for pasubub: replaces each of the eight `ub` lanes of `vs` with
 * the absolute difference between that lane and the matching `vt` lane.
 * The difference is formed in an `int` so that it can be negative before
 * the sign is removed.
 * NOTE(review): declarations, operand setup and the return are not visible
 * in this view.
 */
709 uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
716 for (i = 0; i < 8; ++i) {
717 int r = vs.ub[i] - vt.ub[i];
718 vs.ub[i] = (r < 0 ? -r : r);
/*
 * Helper for biadd: accumulates the eight bytes of `fs` into `fd`.  The
 * loop initializer `i = fd = 0` zeroes both the index and the accumulator;
 * each iteration extracts byte i with a shift and an 0xff mask.
 * NOTE(review): the declarations and the return statement are not visible
 * in this view.
 */
723 uint64_t helper_biadd(uint64_t fs)
727 for (i = fd = 0; i < 8; ++i) {
728 fd += (fs >> (i * 8)) & 0xff;
/*
 * Helper for pmovmskb: gathers the top bit of each byte of `fs` (bits 7,
 * 15, ..., 63) into bits 0..7 of `fd`, so bit k of `fd` mirrors the most
 * significant bit of byte k.
 * NOTE(review): the declaration/initialization of `fd` and the return
 * statement are not visible in this view.
 */
733 uint64_t helper_pmovmskb(uint64_t fs)
737 fd |= ((fs >> 7) & 1) << 0;
738 fd |= ((fs >> 15) & 1) << 1;
739 fd |= ((fs >> 23) & 1) << 2;
740 fd |= ((fs >> 31) & 1) << 3;
741 fd |= ((fs >> 39) & 1) << 4;
742 fd |= ((fs >> 47) & 1) << 5;
743 fd |= ((fs >> 55) & 1) << 6;
744 fd |= ((fs >> 63) & 1) << 7;