+int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+{
+ switch (opc) {
+ case INDEX_op_add_vec:
+ case INDEX_op_sub_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_andc_vec:
+ return 1;
+ case INDEX_op_cmp_vec:
+ return -1;
+
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ /* We must expand the operation for MO_8. */
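+ /* There is no byte-granularity vector shift instruction in SSE/AVX2. */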
+ return vece == MO_8 ? -1 : 1;
+
+ case INDEX_op_sari_vec:
+ /* We must expand the operation for MO_8. */
+ if (vece == MO_8) {
+ return -1;
+ }
+ /* We can emulate this for MO_64, but it does not pay off
+ unless we're producing at least 4 values. */
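+ /* SSE/AVX2 have no 64-bit arithmetic right shift; VPSRAQ is AVX-512 only. */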
+ if (vece == MO_64) {
+ return type >= TCG_TYPE_V256 ? -1 : 0;
+ }
+ return 1;
+
+ case INDEX_op_mul_vec:
+ if (vece == MO_8) {
+ /* We can expand the operation for MO_8. */
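+ /* x86 has no vector multiply at byte granularity. */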
+ return -1;
+ }
+ if (vece == MO_64) {
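+ /* There is no 64-bit element multiply before AVX-512 (VPMULLQ). */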
+ return 0;
+ }
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+ TCGArg a0, ...)
+{
+ va_list va;
+ TCGArg a1, a2;
+ TCGv_vec v0, t1, t2, t3, t4;
+
+ va_start(va, a0);
+ v0 = temp_tcgv_vec(arg_temp(a0));
+
+ switch (opc) {
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ tcg_debug_assert(vece == MO_8);
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ /* Unpack to W, shift, and repack. Tricky bits:
+ (1) Use punpck*bw x,x to produce DDCCBBAA,
+ i.e. duplicate each byte into the other half
+ of its 16-bit lane.
+ (2) For right-shift, add 8 so that the high half of
+ the lane becomes zero. For left-shift, we must
+ shift up and down again.
+ (3) Step 2 leaves the high half zero, so that PACKUSWB
+ (pack with unsigned saturation) does not modify
+ the quantity. */
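+ /* For illustration: with a right shift of 3, the byte 0xAB is
+ duplicated to the word 0xABAB; shifting that word right by
+ 3 + 8 = 11 gives 0x0015, and PACKUSWB keeps 0x15 == 0xAB >> 3. */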
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), a1, a1);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t2), a1, a1);
+ if (opc == INDEX_op_shri_vec) {
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ } else {
+ vec_gen_3(INDEX_op_shli_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_shli_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
+ }
+ vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+
+ case INDEX_op_sari_vec:
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ if (vece == MO_8) {
+ /* Unpack to W, shift, and repack, as above. */
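+ /* Each word then holds sign_extend(byte) >> shift, which always
+ fits in a signed byte, so the signed-saturating PACKSSWB
+ repacks it without modification. */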
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), a1, a1);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t2), a1, a1);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+ }
+ tcg_debug_assert(vece == MO_64);
+ /* MO_64: If the shift is <= 32, we can emulate the sign extend by
+ performing an arithmetic 32-bit shift and overwriting the high
+ half of the result (note that the ISA allows a 32-bit shift count
+ of 32, which fills each element with its sign bit). */
+ if (a2 <= 32) {
+ t1 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
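+ /* The blend mask 0xaa (binary 10101010) selects the odd 32-bit
+ elements, i.e. the high half of each 64-bit lane, from t1; the
+ even elements keep the low halves produced by the logical
+ 64-bit shift already in a0. */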
+ vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
+ a0, a0, tcgv_vec_arg(t1), 0xaa);
+ tcg_temp_free_vec(t1);
+ break;
+ }
+ /* Otherwise we will need to use a compare vs 0 to produce the
+ sign-extend, shift and merge. */
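+ /* t1 = (0 > a1) is all ones for negative lanes and zero otherwise;
+ shifting that mask left by 64 - a2 recreates exactly the sign
+ bits discarded by the logical right shift, and the final OR
+ merges them into the result. */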
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_const_zeros_vec(type);
+ vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
+ tcg_temp_free_vec(t2);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
+ vec_gen_3(INDEX_op_shli_vec, type, MO_64,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
+ vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+ break;
+
+ case INDEX_op_mul_vec:
+ tcg_debug_assert(vece == MO_8);
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
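+ /* Widen each byte to 16 bits: interleave the a1 bytes with zero
+ into the low half of each word (0 | x) and the a2 bytes into
+ the high half (y | 0). The 16-bit product is then (x * y) << 8,
+ so a single logical right shift by 8 both recovers the low
+ product byte and clears the high byte, letting PACKUSWB narrow
+ the result without saturating. */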
+ switch (type) {
+ case TCG_TYPE_V64:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V128);
+ tcg_gen_dup16i_vec(t2, 0);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+
+ case TCG_TYPE_V128:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t3 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t4 = tcg_temp_new_vec(TCG_TYPE_V128);
+ tcg_gen_dup16i_vec(t4, 0);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_mul_vec(MO_16, t3, t3, t4);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ tcg_gen_shri_vec(MO_16, t3, t3, 8);
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ tcg_temp_free_vec(t3);
+ tcg_temp_free_vec(t4);
+ break;
+
+ case TCG_TYPE_V256:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t3 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t4 = tcg_temp_new_vec(TCG_TYPE_V256);
+ tcg_gen_dup16i_vec(t4, 0);
+ /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
+ t1: extends of B[0-7], D[0-7]
+ t2: extends of X[0-7], Z[0-7]
+ t3: extends of A[0-7], C[0-7]
+ t4: extends of W[0-7], Y[0-7]. */
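+ /* AVX2 unpack and pack instructions operate within 128-bit lanes,
+ which is why the halves pair up as B,D / A,C above; the per-lane
+ PACKUSWB at the end restores the original element order. */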
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
+ /* t1: BX DZ; t3: AW CY. */
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_mul_vec(MO_16, t3, t3, t4);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ tcg_gen_shri_vec(MO_16, t3, t3, 8);
+ /* a0: AW BX CY DZ. */
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ tcg_temp_free_vec(t3);
+ tcg_temp_free_vec(t4);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+ break;
+
+ case INDEX_op_cmp_vec:
+ {
+ enum {
+ NEED_SWAP = 1,
+ NEED_INV = 2,
+ NEED_BIAS = 4
+ };
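+ /* x86 provides only EQ (pcmpeq) and signed GT (pcmpgt) vector
+ compares. All other conditions are derived by swapping the
+ operands, inverting the result, and/or biasing both operands
+ by the sign bit so that an unsigned compare becomes the
+ equivalent signed one. Entries left at -1 (0xff in a uint8_t)
+ are conditions that must never reach this expander. */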
+ static const uint8_t fixups[16] = {
+ [0 ... 15] = -1,
+ [TCG_COND_EQ] = 0,
+ [TCG_COND_NE] = NEED_INV,
+ [TCG_COND_GT] = 0,
+ [TCG_COND_LT] = NEED_SWAP,
+ [TCG_COND_LE] = NEED_INV,
+ [TCG_COND_GE] = NEED_SWAP | NEED_INV,
+ [TCG_COND_GTU] = NEED_BIAS,
+ [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
+ [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
+ [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
+ };
+
+ TCGCond cond;
+ uint8_t fixup;
+
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ cond = va_arg(va, TCGArg);
+ fixup = fixups[cond & 15];
+ tcg_debug_assert(fixup != 0xff);
+
+ if (fixup & NEED_INV) {
+ cond = tcg_invert_cond(cond);
+ }
+ if (fixup & NEED_SWAP) {
+ TCGArg t;
+ t = a1, a1 = a2, a2 = t;
+ cond = tcg_swap_cond(cond);
+ }
+
+ t1 = t2 = NULL;
+ if (fixup & NEED_BIAS) {
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
+ tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
+ tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
+ a1 = tcgv_vec_arg(t1);
+ a2 = tcgv_vec_arg(t2);
+ cond = tcg_signed_cond(cond);
+ }
+
+ tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
+ vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
+
+ if (fixup & NEED_BIAS) {
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ }
+ if (fixup & NEED_INV) {
+ tcg_gen_not_vec(vece, v0, v0);
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ va_end(va);
+}
+
+static const int tcg_target_callee_save_regs[] = {