]>
Commit | Line | Data |
---|---|---|
bd277fa1 RH |
1 | /* |
2 | * Loongson Multimedia Instruction emulation helpers for QEMU. | |
3 | * | |
4 | * Copyright (c) 2011 Richard Henderson <[email protected]> | |
5 | * | |
6 | * This library is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2 of the License, or (at your option) any later version. | |
10 | * | |
11 | * This library is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. | |
18 | */ | |
19 | ||
20 | #include "cpu.h" | |
2ef6175a | 21 | #include "exec/helper-proto.h" |
bd277fa1 RH |
22 | |
/* If the byte ordering doesn't matter, i.e. all columns are treated
   identically, then this union can be used directly.  If byte ordering
   does matter, we generally ignore dumping to memory.

   Overlays one 64-bit multimedia register so it can be viewed as eight
   bytes, four halfwords, two words, or a single doubleword.  Lane
   indices follow host byte order; big-endian hosts remap them with
   BYTE_ORDER_XOR below. */
typedef union {
    uint8_t ub[8];    /* unsigned byte lanes */
    int8_t sb[8];     /* signed byte lanes */
    uint16_t uh[4];   /* unsigned halfword lanes */
    int16_t sh[4];    /* signed halfword lanes */
    uint32_t uw[2];   /* unsigned word lanes */
    int32_t sw[2];    /* signed word lanes */
    uint64_t d;       /* the whole 64-bit register */
} LMIValue;
35 | ||
/* Some byte ordering issues can be mitigated by XORing in the following.
   On big-endian hosts the union lanes above are stored in reverse order
   relative to the target's view, so XORing a lane index with N
   (7 for byte lanes, 3 for halfword lanes) flips it; on little-endian
   hosts the XOR is a no-op. */
#ifdef HOST_WORDS_BIGENDIAN
# define BYTE_ORDER_XOR(N) N
#else
# define BYTE_ORDER_XOR(N) 0
#endif
42 | ||
/*
 * Saturation helpers: clamp a value to the range of a signed/unsigned
 * byte (SB/UB), halfword (SH/UH) or word (SW/UW).
 *
 * The argument is fully parenthesized so any expression may be passed
 * safely; note it is still evaluated more than once, so arguments must
 * be free of side effects.
 */
#define SATSB(x) ((x) < -0x80 ? -0x80 : (x) > 0x7f ? 0x7f : (x))
#define SATUB(x) ((x) > 0xff ? 0xff : (x))

#define SATSH(x) ((x) < -0x8000 ? -0x8000 : (x) > 0x7fff ? 0x7fff : (x))
#define SATUH(x) ((x) > 0xffff ? 0xffff : (x))

#define SATSW(x) \
    ((x) < -0x80000000ll ? -0x80000000ll : (x) > 0x7fffffff ? 0x7fffffff : (x))
#define SATUW(x) ((x) > 0xffffffffull ? 0xffffffffull : (x))
52 | ||
53 | uint64_t helper_paddsb(uint64_t fs, uint64_t ft) | |
54 | { | |
55 | LMIValue vs, vt; | |
56 | unsigned int i; | |
57 | ||
58 | vs.d = fs; | |
59 | vt.d = ft; | |
60 | for (i = 0; i < 8; ++i) { | |
61 | int r = vs.sb[i] + vt.sb[i]; | |
62 | vs.sb[i] = SATSB(r); | |
63 | } | |
64 | return vs.d; | |
65 | } | |
66 | ||
67 | uint64_t helper_paddusb(uint64_t fs, uint64_t ft) | |
68 | { | |
69 | LMIValue vs, vt; | |
70 | unsigned int i; | |
71 | ||
72 | vs.d = fs; | |
73 | vt.d = ft; | |
74 | for (i = 0; i < 8; ++i) { | |
75 | int r = vs.ub[i] + vt.ub[i]; | |
76 | vs.ub[i] = SATUB(r); | |
77 | } | |
78 | return vs.d; | |
79 | } | |
80 | ||
81 | uint64_t helper_paddsh(uint64_t fs, uint64_t ft) | |
82 | { | |
83 | LMIValue vs, vt; | |
84 | unsigned int i; | |
85 | ||
86 | vs.d = fs; | |
87 | vt.d = ft; | |
88 | for (i = 0; i < 4; ++i) { | |
89 | int r = vs.sh[i] + vt.sh[i]; | |
90 | vs.sh[i] = SATSH(r); | |
91 | } | |
92 | return vs.d; | |
93 | } | |
94 | ||
95 | uint64_t helper_paddush(uint64_t fs, uint64_t ft) | |
96 | { | |
97 | LMIValue vs, vt; | |
98 | unsigned int i; | |
99 | ||
100 | vs.d = fs; | |
101 | vt.d = ft; | |
102 | for (i = 0; i < 4; ++i) { | |
103 | int r = vs.uh[i] + vt.uh[i]; | |
104 | vs.uh[i] = SATUH(r); | |
105 | } | |
106 | return vs.d; | |
107 | } | |
108 | ||
109 | uint64_t helper_paddb(uint64_t fs, uint64_t ft) | |
110 | { | |
111 | LMIValue vs, vt; | |
112 | unsigned int i; | |
113 | ||
114 | vs.d = fs; | |
115 | vt.d = ft; | |
116 | for (i = 0; i < 8; ++i) { | |
117 | vs.ub[i] += vt.ub[i]; | |
118 | } | |
119 | return vs.d; | |
120 | } | |
121 | ||
122 | uint64_t helper_paddh(uint64_t fs, uint64_t ft) | |
123 | { | |
124 | LMIValue vs, vt; | |
125 | unsigned int i; | |
126 | ||
127 | vs.d = fs; | |
128 | vt.d = ft; | |
129 | for (i = 0; i < 4; ++i) { | |
130 | vs.uh[i] += vt.uh[i]; | |
131 | } | |
132 | return vs.d; | |
133 | } | |
134 | ||
135 | uint64_t helper_paddw(uint64_t fs, uint64_t ft) | |
136 | { | |
137 | LMIValue vs, vt; | |
138 | unsigned int i; | |
139 | ||
140 | vs.d = fs; | |
141 | vt.d = ft; | |
142 | for (i = 0; i < 2; ++i) { | |
143 | vs.uw[i] += vt.uw[i]; | |
144 | } | |
145 | return vs.d; | |
146 | } | |
147 | ||
148 | uint64_t helper_psubsb(uint64_t fs, uint64_t ft) | |
149 | { | |
150 | LMIValue vs, vt; | |
151 | unsigned int i; | |
152 | ||
153 | vs.d = fs; | |
154 | vt.d = ft; | |
155 | for (i = 0; i < 8; ++i) { | |
156 | int r = vs.sb[i] - vt.sb[i]; | |
157 | vs.sb[i] = SATSB(r); | |
158 | } | |
159 | return vs.d; | |
160 | } | |
161 | ||
162 | uint64_t helper_psubusb(uint64_t fs, uint64_t ft) | |
163 | { | |
164 | LMIValue vs, vt; | |
165 | unsigned int i; | |
166 | ||
167 | vs.d = fs; | |
168 | vt.d = ft; | |
169 | for (i = 0; i < 8; ++i) { | |
170 | int r = vs.ub[i] - vt.ub[i]; | |
171 | vs.ub[i] = SATUB(r); | |
172 | } | |
173 | return vs.d; | |
174 | } | |
175 | ||
176 | uint64_t helper_psubsh(uint64_t fs, uint64_t ft) | |
177 | { | |
178 | LMIValue vs, vt; | |
179 | unsigned int i; | |
180 | ||
181 | vs.d = fs; | |
182 | vt.d = ft; | |
183 | for (i = 0; i < 4; ++i) { | |
184 | int r = vs.sh[i] - vt.sh[i]; | |
185 | vs.sh[i] = SATSH(r); | |
186 | } | |
187 | return vs.d; | |
188 | } | |
189 | ||
190 | uint64_t helper_psubush(uint64_t fs, uint64_t ft) | |
191 | { | |
192 | LMIValue vs, vt; | |
193 | unsigned int i; | |
194 | ||
195 | vs.d = fs; | |
196 | vt.d = ft; | |
197 | for (i = 0; i < 4; ++i) { | |
198 | int r = vs.uh[i] - vt.uh[i]; | |
199 | vs.uh[i] = SATUH(r); | |
200 | } | |
201 | return vs.d; | |
202 | } | |
203 | ||
204 | uint64_t helper_psubb(uint64_t fs, uint64_t ft) | |
205 | { | |
206 | LMIValue vs, vt; | |
207 | unsigned int i; | |
208 | ||
209 | vs.d = fs; | |
210 | vt.d = ft; | |
211 | for (i = 0; i < 8; ++i) { | |
212 | vs.ub[i] -= vt.ub[i]; | |
213 | } | |
214 | return vs.d; | |
215 | } | |
216 | ||
217 | uint64_t helper_psubh(uint64_t fs, uint64_t ft) | |
218 | { | |
219 | LMIValue vs, vt; | |
220 | unsigned int i; | |
221 | ||
222 | vs.d = fs; | |
223 | vt.d = ft; | |
224 | for (i = 0; i < 4; ++i) { | |
225 | vs.uh[i] -= vt.uh[i]; | |
226 | } | |
227 | return vs.d; | |
228 | } | |
229 | ||
230 | uint64_t helper_psubw(uint64_t fs, uint64_t ft) | |
231 | { | |
232 | LMIValue vs, vt; | |
233 | unsigned int i; | |
234 | ||
235 | vs.d = fs; | |
236 | vt.d = ft; | |
237 | for (i = 0; i < 2; ++i) { | |
238 | vs.uw[i] -= vt.uw[i]; | |
239 | } | |
240 | return vs.d; | |
241 | } | |
242 | ||
243 | uint64_t helper_pshufh(uint64_t fs, uint64_t ft) | |
244 | { | |
245 | unsigned host = BYTE_ORDER_XOR(3); | |
246 | LMIValue vd, vs; | |
247 | unsigned i; | |
248 | ||
249 | vs.d = fs; | |
250 | vd.d = 0; | |
251 | for (i = 0; i < 4; i++, ft >>= 2) { | |
252 | vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host]; | |
253 | } | |
254 | return vd.d; | |
255 | } | |
256 | ||
/* PACKSSWH: pack the four 32-bit words of {fs, ft} into four halfwords
   with signed saturation; fs supplies the low half of the result. */
uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned k;

    for (k = 0; k < 4; k++) {
        uint64_t src = (k & 2) ? ft : fs;
        int64_t w = (int32_t)(src >> ((k & 1) * 32));
        w = SATSH(w);
        fd |= (uint64_t)(w & 0xffff) << (k * 16);
    }
    return fd;
}
280 | ||
/* PACKSSHB: pack the eight 16-bit halfwords of {fs, ft} into eight
   bytes with signed saturation; fs supplies the low four bytes. */
uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned k;

    for (k = 0; k < 8; k++) {
        int16_t v = (k < 4 ? fs : ft) >> ((k & 3) * 16);
        v = SATSB(v);
        fd |= (uint64_t)(v & 0xff) << (k * 8);
    }
    return fd;
}
299 | ||
/* PACKUSHB: pack the eight halfwords of {fs, ft} into eight bytes with
   unsigned saturation (as interpreted from a signed halfword, matching
   the original's SATUB-then-mask sequence). */
uint64_t helper_packushb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned k;

    for (k = 0; k < 8; k++) {
        int16_t v = (k < 4 ? fs : ft) >> ((k & 3) * 16);
        v = SATUB(v);
        fd |= (uint64_t)(v & 0xff) << (k * 8);
    }
    return fd;
}
318 | ||
/* PUNPCKLWD: low word of fs in the low half, low word of ft above it. */
uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
{
    uint64_t lo = (uint32_t)fs;
    uint64_t hi = ft << 32;
    return hi | lo;
}
323 | ||
/* PUNPCKHWD: high word of fs in the low half, high word of ft kept
   in the high half. */
uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
{
    uint64_t lo = fs >> 32;
    uint64_t hi = ft & 0xffffffff00000000ull;
    return hi | lo;
}
328 | ||
329 | uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft) | |
330 | { | |
331 | unsigned host = BYTE_ORDER_XOR(3); | |
332 | LMIValue vd, vs, vt; | |
333 | ||
334 | vs.d = fs; | |
335 | vt.d = ft; | |
336 | vd.uh[0 ^ host] = vs.uh[0 ^ host]; | |
337 | vd.uh[1 ^ host] = vt.uh[0 ^ host]; | |
338 | vd.uh[2 ^ host] = vs.uh[1 ^ host]; | |
339 | vd.uh[3 ^ host] = vt.uh[1 ^ host]; | |
340 | ||
341 | return vd.d; | |
342 | } | |
343 | ||
344 | uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft) | |
345 | { | |
346 | unsigned host = BYTE_ORDER_XOR(3); | |
347 | LMIValue vd, vs, vt; | |
348 | ||
349 | vs.d = fs; | |
350 | vt.d = ft; | |
351 | vd.uh[0 ^ host] = vs.uh[2 ^ host]; | |
352 | vd.uh[1 ^ host] = vt.uh[2 ^ host]; | |
353 | vd.uh[2 ^ host] = vs.uh[3 ^ host]; | |
354 | vd.uh[3 ^ host] = vt.uh[3 ^ host]; | |
355 | ||
356 | return vd.d; | |
357 | } | |
358 | ||
359 | uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft) | |
360 | { | |
361 | unsigned host = BYTE_ORDER_XOR(7); | |
362 | LMIValue vd, vs, vt; | |
363 | ||
364 | vs.d = fs; | |
365 | vt.d = ft; | |
366 | vd.ub[0 ^ host] = vs.ub[0 ^ host]; | |
367 | vd.ub[1 ^ host] = vt.ub[0 ^ host]; | |
368 | vd.ub[2 ^ host] = vs.ub[1 ^ host]; | |
369 | vd.ub[3 ^ host] = vt.ub[1 ^ host]; | |
370 | vd.ub[4 ^ host] = vs.ub[2 ^ host]; | |
371 | vd.ub[5 ^ host] = vt.ub[2 ^ host]; | |
372 | vd.ub[6 ^ host] = vs.ub[3 ^ host]; | |
373 | vd.ub[7 ^ host] = vt.ub[3 ^ host]; | |
374 | ||
375 | return vd.d; | |
376 | } | |
377 | ||
378 | uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft) | |
379 | { | |
380 | unsigned host = BYTE_ORDER_XOR(7); | |
381 | LMIValue vd, vs, vt; | |
382 | ||
383 | vs.d = fs; | |
384 | vt.d = ft; | |
385 | vd.ub[0 ^ host] = vs.ub[4 ^ host]; | |
386 | vd.ub[1 ^ host] = vt.ub[4 ^ host]; | |
387 | vd.ub[2 ^ host] = vs.ub[5 ^ host]; | |
388 | vd.ub[3 ^ host] = vt.ub[5 ^ host]; | |
389 | vd.ub[4 ^ host] = vs.ub[6 ^ host]; | |
390 | vd.ub[5 ^ host] = vt.ub[6 ^ host]; | |
391 | vd.ub[6 ^ host] = vs.ub[7 ^ host]; | |
392 | vd.ub[7 ^ host] = vt.ub[7 ^ host]; | |
393 | ||
394 | return vd.d; | |
395 | } | |
396 | ||
397 | uint64_t helper_pavgh(uint64_t fs, uint64_t ft) | |
398 | { | |
399 | LMIValue vs, vt; | |
400 | unsigned i; | |
401 | ||
402 | vs.d = fs; | |
403 | vt.d = ft; | |
404 | for (i = 0; i < 4; i++) { | |
405 | vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1; | |
406 | } | |
407 | return vs.d; | |
408 | } | |
409 | ||
410 | uint64_t helper_pavgb(uint64_t fs, uint64_t ft) | |
411 | { | |
412 | LMIValue vs, vt; | |
413 | unsigned i; | |
414 | ||
415 | vs.d = fs; | |
416 | vt.d = ft; | |
417 | for (i = 0; i < 8; i++) { | |
418 | vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1; | |
419 | } | |
420 | return vs.d; | |
421 | } | |
422 | ||
423 | uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft) | |
424 | { | |
425 | LMIValue vs, vt; | |
426 | unsigned i; | |
427 | ||
428 | vs.d = fs; | |
429 | vt.d = ft; | |
430 | for (i = 0; i < 4; i++) { | |
431 | vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]); | |
432 | } | |
433 | return vs.d; | |
434 | } | |
435 | ||
436 | uint64_t helper_pminsh(uint64_t fs, uint64_t ft) | |
437 | { | |
438 | LMIValue vs, vt; | |
439 | unsigned i; | |
440 | ||
441 | vs.d = fs; | |
442 | vt.d = ft; | |
443 | for (i = 0; i < 4; i++) { | |
444 | vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]); | |
445 | } | |
446 | return vs.d; | |
447 | } | |
448 | ||
449 | uint64_t helper_pmaxub(uint64_t fs, uint64_t ft) | |
450 | { | |
451 | LMIValue vs, vt; | |
452 | unsigned i; | |
453 | ||
454 | vs.d = fs; | |
455 | vt.d = ft; | |
456 | for (i = 0; i < 4; i++) { | |
457 | vs.ub[i] = (vs.ub[i] >= vt.ub[i] ? vs.ub[i] : vt.ub[i]); | |
458 | } | |
459 | return vs.d; | |
460 | } | |
461 | ||
462 | uint64_t helper_pminub(uint64_t fs, uint64_t ft) | |
463 | { | |
464 | LMIValue vs, vt; | |
465 | unsigned i; | |
466 | ||
467 | vs.d = fs; | |
468 | vt.d = ft; | |
469 | for (i = 0; i < 4; i++) { | |
470 | vs.ub[i] = (vs.ub[i] <= vt.ub[i] ? vs.ub[i] : vt.ub[i]); | |
471 | } | |
472 | return vs.d; | |
473 | } | |
474 | ||
475 | uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft) | |
476 | { | |
477 | LMIValue vs, vt; | |
478 | unsigned i; | |
479 | ||
480 | vs.d = fs; | |
481 | vt.d = ft; | |
482 | for (i = 0; i < 2; i++) { | |
483 | vs.uw[i] = -(vs.uw[i] == vt.uw[i]); | |
484 | } | |
485 | return vs.d; | |
486 | } | |
487 | ||
488 | uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft) | |
489 | { | |
490 | LMIValue vs, vt; | |
491 | unsigned i; | |
492 | ||
493 | vs.d = fs; | |
494 | vt.d = ft; | |
495 | for (i = 0; i < 2; i++) { | |
496 | vs.uw[i] = -(vs.uw[i] > vt.uw[i]); | |
497 | } | |
498 | return vs.d; | |
499 | } | |
500 | ||
501 | uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft) | |
502 | { | |
503 | LMIValue vs, vt; | |
504 | unsigned i; | |
505 | ||
506 | vs.d = fs; | |
507 | vt.d = ft; | |
508 | for (i = 0; i < 4; i++) { | |
509 | vs.uh[i] = -(vs.uh[i] == vt.uh[i]); | |
510 | } | |
511 | return vs.d; | |
512 | } | |
513 | ||
514 | uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft) | |
515 | { | |
516 | LMIValue vs, vt; | |
517 | unsigned i; | |
518 | ||
519 | vs.d = fs; | |
520 | vt.d = ft; | |
521 | for (i = 0; i < 4; i++) { | |
522 | vs.uh[i] = -(vs.uh[i] > vt.uh[i]); | |
523 | } | |
524 | return vs.d; | |
525 | } | |
526 | ||
527 | uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft) | |
528 | { | |
529 | LMIValue vs, vt; | |
530 | unsigned i; | |
531 | ||
532 | vs.d = fs; | |
533 | vt.d = ft; | |
534 | for (i = 0; i < 8; i++) { | |
535 | vs.ub[i] = -(vs.ub[i] == vt.ub[i]); | |
536 | } | |
537 | return vs.d; | |
538 | } | |
539 | ||
540 | uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft) | |
541 | { | |
542 | LMIValue vs, vt; | |
543 | unsigned i; | |
544 | ||
545 | vs.d = fs; | |
546 | vt.d = ft; | |
547 | for (i = 0; i < 8; i++) { | |
548 | vs.ub[i] = -(vs.ub[i] > vt.ub[i]); | |
549 | } | |
550 | return vs.d; | |
551 | } | |
552 | ||
553 | uint64_t helper_psllw(uint64_t fs, uint64_t ft) | |
554 | { | |
555 | LMIValue vs; | |
556 | unsigned i; | |
557 | ||
558 | ft &= 0x7f; | |
559 | if (ft > 31) { | |
560 | return 0; | |
561 | } | |
562 | vs.d = fs; | |
563 | for (i = 0; i < 2; ++i) { | |
564 | vs.uw[i] <<= ft; | |
565 | } | |
566 | return vs.d; | |
567 | } | |
568 | ||
569 | uint64_t helper_psrlw(uint64_t fs, uint64_t ft) | |
570 | { | |
571 | LMIValue vs; | |
572 | unsigned i; | |
573 | ||
574 | ft &= 0x7f; | |
575 | if (ft > 31) { | |
576 | return 0; | |
577 | } | |
578 | vs.d = fs; | |
579 | for (i = 0; i < 2; ++i) { | |
580 | vs.uw[i] >>= ft; | |
581 | } | |
582 | return vs.d; | |
583 | } | |
584 | ||
585 | uint64_t helper_psraw(uint64_t fs, uint64_t ft) | |
586 | { | |
587 | LMIValue vs; | |
588 | unsigned i; | |
589 | ||
590 | ft &= 0x7f; | |
591 | if (ft > 31) { | |
592 | ft = 31; | |
593 | } | |
594 | vs.d = fs; | |
595 | for (i = 0; i < 2; ++i) { | |
596 | vs.sw[i] >>= ft; | |
597 | } | |
598 | return vs.d; | |
599 | } | |
600 | ||
601 | uint64_t helper_psllh(uint64_t fs, uint64_t ft) | |
602 | { | |
603 | LMIValue vs; | |
604 | unsigned i; | |
605 | ||
606 | ft &= 0x7f; | |
607 | if (ft > 15) { | |
608 | return 0; | |
609 | } | |
610 | vs.d = fs; | |
611 | for (i = 0; i < 4; ++i) { | |
612 | vs.uh[i] <<= ft; | |
613 | } | |
614 | return vs.d; | |
615 | } | |
616 | ||
617 | uint64_t helper_psrlh(uint64_t fs, uint64_t ft) | |
618 | { | |
619 | LMIValue vs; | |
620 | unsigned i; | |
621 | ||
622 | ft &= 0x7f; | |
623 | if (ft > 15) { | |
624 | return 0; | |
625 | } | |
626 | vs.d = fs; | |
627 | for (i = 0; i < 4; ++i) { | |
628 | vs.uh[i] >>= ft; | |
629 | } | |
630 | return vs.d; | |
631 | } | |
632 | ||
633 | uint64_t helper_psrah(uint64_t fs, uint64_t ft) | |
634 | { | |
635 | LMIValue vs; | |
636 | unsigned i; | |
637 | ||
638 | ft &= 0x7f; | |
639 | if (ft > 15) { | |
640 | ft = 15; | |
641 | } | |
642 | vs.d = fs; | |
643 | for (i = 0; i < 4; ++i) { | |
644 | vs.sh[i] >>= ft; | |
645 | } | |
646 | return vs.d; | |
647 | } | |
648 | ||
649 | uint64_t helper_pmullh(uint64_t fs, uint64_t ft) | |
650 | { | |
651 | LMIValue vs, vt; | |
652 | unsigned i; | |
653 | ||
654 | vs.d = fs; | |
655 | vt.d = ft; | |
656 | for (i = 0; i < 4; ++i) { | |
657 | vs.sh[i] *= vt.sh[i]; | |
658 | } | |
659 | return vs.d; | |
660 | } | |
661 | ||
662 | uint64_t helper_pmulhh(uint64_t fs, uint64_t ft) | |
663 | { | |
664 | LMIValue vs, vt; | |
665 | unsigned i; | |
666 | ||
667 | vs.d = fs; | |
668 | vt.d = ft; | |
669 | for (i = 0; i < 4; ++i) { | |
670 | int32_t r = vs.sh[i] * vt.sh[i]; | |
671 | vs.sh[i] = r >> 16; | |
672 | } | |
673 | return vs.d; | |
674 | } | |
675 | ||
676 | uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft) | |
677 | { | |
678 | LMIValue vs, vt; | |
679 | unsigned i; | |
680 | ||
681 | vs.d = fs; | |
682 | vt.d = ft; | |
683 | for (i = 0; i < 4; ++i) { | |
684 | uint32_t r = vs.uh[i] * vt.uh[i]; | |
685 | vs.uh[i] = r >> 16; | |
686 | } | |
687 | return vs.d; | |
688 | } | |
689 | ||
690 | uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft) | |
691 | { | |
692 | unsigned host = BYTE_ORDER_XOR(3); | |
693 | LMIValue vs, vt; | |
694 | uint32_t p0, p1; | |
695 | ||
696 | vs.d = fs; | |
697 | vt.d = ft; | |
698 | p0 = vs.sh[0 ^ host] * vt.sh[0 ^ host]; | |
699 | p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host]; | |
700 | p1 = vs.sh[2 ^ host] * vt.sh[2 ^ host]; | |
701 | p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host]; | |
702 | ||
703 | return ((uint64_t)p1 << 32) | p0; | |
704 | } | |
705 | ||
706 | uint64_t helper_pasubub(uint64_t fs, uint64_t ft) | |
707 | { | |
708 | LMIValue vs, vt; | |
709 | unsigned i; | |
710 | ||
711 | vs.d = fs; | |
712 | vt.d = ft; | |
713 | for (i = 0; i < 8; ++i) { | |
714 | int r = vs.ub[i] - vt.ub[i]; | |
715 | vs.ub[i] = (r < 0 ? -r : r); | |
716 | } | |
717 | return vs.d; | |
718 | } | |
719 | ||
/* BIADD: horizontal sum of the eight bytes of fs (result fits 16 bits,
   max 8 * 255 = 2040). */
uint64_t helper_biadd(uint64_t fs)
{
    unsigned sum = 0;
    unsigned k;

    for (k = 0; k < 8; k++, fs >>= 8) {
        sum += fs & 0xff;
    }
    return sum & 0xffff;
}
729 | ||
/* PMOVMSKB: gather the sign (top) bit of each of the eight bytes of fs
   into the low eight bits of the result, byte 0 -> bit 0. */
uint64_t helper_pmovmskb(uint64_t fs)
{
    unsigned mask = 0;
    unsigned k;

    for (k = 0; k < 8; k++) {
        mask |= ((fs >> (8 * k + 7)) & 1) << k;
    }

    return mask & 0xff;
}