static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
"%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
- "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
#else
"%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
+#endif
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+#if TCG_TARGET_REG_BITS == 64
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
TCG_REG_ECX,
TCG_REG_EDX,
TCG_REG_EAX,
+#endif
+ TCG_REG_XMM0,
+ TCG_REG_XMM1,
+ TCG_REG_XMM2,
+ TCG_REG_XMM3,
+ TCG_REG_XMM4,
+ TCG_REG_XMM5,
+#ifndef _WIN64
+ /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
+ any of them. Therefore only allow xmm0-xmm5 to be allocated. */
+ TCG_REG_XMM6,
+ TCG_REG_XMM7,
+#if TCG_TARGET_REG_BITS == 64
+ TCG_REG_XMM8,
+ TCG_REG_XMM9,
+ TCG_REG_XMM10,
+ TCG_REG_XMM11,
+ TCG_REG_XMM12,
+ TCG_REG_XMM13,
+ TCG_REG_XMM14,
+ TCG_REG_XMM15,
+#endif
#endif
};
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
-/* Registers used with L constraint, which are the first argument
+/* Registers used with L constraint, which are the first argument
registers on x86_64, and two random call clobbered registers on
i386. */
#if TCG_TARGET_REG_BITS == 64
it there. Therefore we always define the variable. */
bool have_bmi1;
bool have_popcnt;
+bool have_avx1;
+bool have_avx2;
#ifdef CONFIG_CPUID_H
static bool have_movbe;
static tcg_insn_unit *tb_ret_addr;
-static void patch_reloc(tcg_insn_unit *code_ptr, int type,
+static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
intptr_t value, intptr_t addend)
{
value += addend;
case R_386_PC32:
value -= (uintptr_t)code_ptr;
if (value != (int32_t)value) {
- tcg_abort();
+ return false;
}
+ /* FALLTHRU */
+ case R_386_32:
tcg_patch32(code_ptr, value);
break;
case R_386_PC8:
value -= (uintptr_t)code_ptr;
if (value != (int8_t)value) {
- tcg_abort();
+ return false;
}
tcg_patch8(code_ptr, value);
break;
default:
tcg_abort();
}
+ return true;
}
+#if TCG_TARGET_REG_BITS == 64
+#define ALL_GENERAL_REGS 0x0000ffffu
+#define ALL_VECTOR_REGS 0xffff0000u
+#else
+#define ALL_GENERAL_REGS 0x000000ffu
+#define ALL_VECTOR_REGS 0x00ff0000u
+#endif
+
/* parse target specific constraints */
static const char *target_parse_constraint(TCGArgConstraint *ct,
const char *ct_str, TCGType type)
tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
break;
case 'q':
+ /* A register that can be used as a byte operand. */
ct->ct |= TCG_CT_REG;
- if (TCG_TARGET_REG_BITS == 64) {
- tcg_regset_set32(ct->u.regs, 0, 0xffff);
- } else {
- tcg_regset_set32(ct->u.regs, 0, 0xf);
- }
+ ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
break;
case 'Q':
+ /* A register with an addressable second byte (e.g. %ah). */
ct->ct |= TCG_CT_REG;
- tcg_regset_set32(ct->u.regs, 0, 0xf);
+ ct->u.regs = 0xf;
break;
case 'r':
+ /* A general register. */
ct->ct |= TCG_CT_REG;
- if (TCG_TARGET_REG_BITS == 64) {
- tcg_regset_set32(ct->u.regs, 0, 0xffff);
- } else {
- tcg_regset_set32(ct->u.regs, 0, 0xff);
- }
+ ct->u.regs |= ALL_GENERAL_REGS;
break;
case 'W':
/* With TZCNT/LZCNT, we can have operand-size as an input. */
ct->ct |= TCG_CT_CONST_WSZ;
break;
+ case 'x':
+ /* A vector register. */
+ ct->ct |= TCG_CT_REG;
+ ct->u.regs |= ALL_VECTOR_REGS;
+ break;
/* qemu_ld/st address constraint */
case 'L':
ct->ct |= TCG_CT_REG;
- if (TCG_TARGET_REG_BITS == 64) {
- tcg_regset_set32(ct->u.regs, 0, 0xffff);
- } else {
- tcg_regset_set32(ct->u.regs, 0, 0xff);
- }
+ ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
break;
return 0;
}
-#if TCG_TARGET_REG_BITS == 64
# define LOWREGMASK(x) ((x) & 7)
-#else
-# define LOWREGMASK(x) (x)
-#endif
#define P_EXT 0x100 /* 0x0f opcode prefix */
#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
#define P_DATA16 0x400 /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
-# define P_ADDR32 0x800 /* 0x67 opcode prefix */
# define P_REXW 0x1000 /* Set REX.W = 1 */
# define P_REXB_R 0x2000 /* REG field as byte register */
# define P_REXB_RM 0x4000 /* R/M field as byte register */
# define P_GS 0x8000 /* gs segment override */
#else
-# define P_ADDR32 0
# define P_REXW 0
# define P_REXB_R 0
# define P_REXB_RM 0
# define P_GS 0
#endif
-#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
-#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
+#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
+#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
+#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
+#define P_VEXL 0x80000 /* Set VEX.L = 1 */
#define OPC_ARITH_EvIz (0x81)
#define OPC_ARITH_EvIb (0x83)
#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN (0xf2 | P_EXT38)
#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF (0xbc | P_EXT)
#define OPC_BSR (0xbd | P_EXT)
#define OPC_BSWAP (0xc8 | P_EXT)
#define OPC_MOVL_Iv (0xb8)
#define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
+#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
+#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
+#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
+#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
+#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
+#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
+#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
+#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
+#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL (0xbe | P_EXT)
#define OPC_MOVSWL (0xbf | P_EXT)
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
+#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
+#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
+#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
+#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
+#define OPC_PADDB (0xfc | P_EXT | P_DATA16)
+#define OPC_PADDW (0xfd | P_EXT | P_DATA16)
+#define OPC_PADDD (0xfe | P_EXT | P_DATA16)
+#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
+#define OPC_PAND (0xdb | P_EXT | P_DATA16)
+#define OPC_PANDN (0xdf | P_EXT | P_DATA16)
+#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
+#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
+#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
+#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
+#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
+#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
+#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
+#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
+#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
+#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
+#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
+#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
+#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
+#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
+#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
+#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
+#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
+#define OPC_POR (0xeb | P_EXT | P_DATA16)
+#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
+#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
+#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
+#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
+#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
+#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
+#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
+#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
+#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
+#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
+#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
+#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
+#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
+#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
+#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
+#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
+#define OPC_PXOR (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32 (0x58)
#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32 (0x50)
#define OPC_SHIFT_Ib (0xc1)
#define OPC_SHIFT_cl (0xd3)
#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
+#define OPC_SHUFPS (0xc6 | P_EXT)
#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_TESTL (0x85)
#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
+#define OPC_UD2 (0x0b | P_EXT)
+#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
+#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
+#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
+#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
+#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
+#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
+#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW)
+#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
+#define OPC_VZEROUPPER (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_GRP3_Ev (0xf7)
#define OPC_GRP5 (0xff)
+#define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
/* Group 1 opcode extensions for 0x80-0x83.
These are also used as modifiers for OPC_ARITH. */
tcg_debug_assert((opc & P_REXW) == 0);
tcg_out8(s, 0x66);
}
- if (opc & P_ADDR32) {
- tcg_out8(s, 0x67);
- }
if (opc & P_SIMDF3) {
tcg_out8(s, 0xf3);
} else if (opc & P_SIMDF2) {
tcg_out8(s, (uint8_t)(rex | 0x40));
}
- if (opc & (P_EXT | P_EXT38)) {
+ if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
tcg_out8(s, 0x0f);
if (opc & P_EXT38) {
tcg_out8(s, 0x38);
+ } else if (opc & P_EXT3A) {
+ tcg_out8(s, 0x3a);
}
}
} else if (opc & P_SIMDF2) {
tcg_out8(s, 0xf2);
}
- if (opc & (P_EXT | P_EXT38)) {
+ if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
tcg_out8(s, 0x0f);
if (opc & P_EXT38) {
tcg_out8(s, 0x38);
+ } else if (opc & P_EXT3A) {
+ tcg_out8(s, 0x3a);
}
}
tcg_out8(s, opc);
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
-static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
+ int rm, int index)
{
int tmp;
- if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
+ /* Use the two byte form if possible, which cannot encode
+ VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
+ if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
+ && ((rm | index) & 8) == 0) {
+ /* Two byte VEX prefix. */
+ tcg_out8(s, 0xc5);
+
+ tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
+ } else {
/* Three byte VEX prefix. */
tcg_out8(s, 0xc4);
/* VEX.m-mmmm */
- if (opc & P_EXT38) {
+ if (opc & P_EXT3A) {
+ tmp = 3;
+ } else if (opc & P_EXT38) {
tmp = 2;
} else if (opc & P_EXT) {
tmp = 1;
} else {
- tcg_abort();
+ g_assert_not_reached();
}
- tmp |= 0x40; /* VEX.X */
- tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
- tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
+ tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
+ tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
+ tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
tcg_out8(s, tmp);
- tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
- } else {
- /* Two byte VEX prefix. */
- tcg_out8(s, 0xc5);
-
- tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
+ tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
}
+
+ tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
/* VEX.pp */
if (opc & P_DATA16) {
tmp |= 1; /* 0x66 */
tmp |= (~v & 15) << 3; /* VEX.vvvv */
tcg_out8(s, tmp);
tcg_out8(s, opc);
+}
+
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+ tcg_out_vex_opc(s, opc, r, v, rm, 0);
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
mode for absolute addresses, ~RM is the size of the immediate operand
that will follow the instruction. */
-static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
- int index, int shift, intptr_t offset)
+static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
+ int shift, intptr_t offset)
{
int mod, len;
intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
intptr_t disp = offset - pc;
if (disp == (int32_t)disp) {
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
tcg_out32(s, disp);
return;
use of the MODRM+SIB encoding and is therefore larger than
rip-relative addressing. */
if (offset == (int32_t)offset) {
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
tcg_out8(s, (4 << 3) | 5);
tcg_out32(s, offset);
}
/* ??? The memory isn't directly addressable. */
- tcg_abort();
+ g_assert_not_reached();
} else {
/* Absolute address. */
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (r << 3) | 5);
tcg_out32(s, offset);
return;
that would be used for %esp is the escape to the two byte form. */
if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
/* Single byte MODRM format. */
- tcg_out_opc(s, opc, r, rm, 0);
tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
} else {
/* Two byte MODRM+SIB format. */
tcg_debug_assert(index != TCG_REG_ESP);
}
- tcg_out_opc(s, opc, r, rm, index);
tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
}
}
}
+static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
+ int index, int shift, intptr_t offset)
+{
+ tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+ tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
+static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
+ int rm, int index, int shift,
+ intptr_t offset)
+{
+ tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+ tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
/* A simplification of the above with no index or shift. */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
int rm, intptr_t offset)
tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}
+static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
+ int v, int rm, intptr_t offset)
+{
+ tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
+}
+
+/* Output an opcode with an expected reference to the constant pool. */
+static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
+{
+ tcg_out_opc(s, opc, r, 0, 0);
+ /* Absolute for 32-bit, pc-relative for 64-bit. */
+ tcg_out8(s, LOWREGMASK(r) << 3 | 5);
+ tcg_out32(s, 0);
+}
+
+/* Output an opcode with an expected reference to the constant pool. */
+static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
+{
+ tcg_out_vex_opc(s, opc, r, 0, 0, 0);
+ /* Absolute for 32-bit, pc-relative for 64-bit. */
+ tcg_out8(s, LOWREGMASK(r) << 3 | 5);
+ tcg_out32(s, 0);
+}
+
/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}
-static inline void tcg_out_mov(TCGContext *s, TCGType type,
- TCGReg ret, TCGReg arg)
+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+{
+ int rexw = 0;
+
+ if (arg == ret) {
+ return;
+ }
+ switch (type) {
+ case TCG_TYPE_I64:
+ rexw = P_REXW;
+ /* fallthru */
+ case TCG_TYPE_I32:
+ if (ret < 16) {
+ if (arg < 16) {
+ tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
+ } else {
+ tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
+ }
+ } else {
+ if (arg < 16) {
+ tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
+ } else {
+ tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
+ }
+ }
+ break;
+
+ case TCG_TYPE_V64:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg r, TCGReg a)
+{
+ if (have_avx2) {
+ static const int dup_insn[4] = {
+ OPC_VPBROADCASTB, OPC_VPBROADCASTW,
+ OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
+ };
+ int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
+ tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
+ } else {
+ switch (vece) {
+ case MO_8:
+ /* ??? With zero in a register, use PSHUFB. */
+ tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
+ a = r;
+ /* FALLTHRU */
+ case MO_16:
+ tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
+ a = r;
+ /* FALLTHRU */
+ case MO_32:
+ tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
+ /* imm8 operand: all output lanes selected from input lane 0. */
+ tcg_out8(s, 0);
+ break;
+ case MO_64:
+ tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+}
+
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
+ TCGReg ret, tcg_target_long arg)
{
- if (arg != ret) {
- int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm(s, opc, ret, arg);
+ int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
+
+ if (arg == 0) {
+ tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
+ return;
+ }
+ if (arg == -1) {
+ tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
+ return;
+ }
+
+ if (TCG_TARGET_REG_BITS == 64) {
+ if (type == TCG_TYPE_V64) {
+ tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
+ } else if (have_avx2) {
+ tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
+ } else {
+ tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
+ }
+ new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
+ } else if (have_avx2) {
+ tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
+ new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
+ } else {
+ tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
+ new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
+ tcg_out_dup_vec(s, type, MO_32, ret, ret);
}
}
{
tcg_target_long diff;
+ switch (type) {
+ case TCG_TYPE_I32:
+#if TCG_TARGET_REG_BITS == 64
+ case TCG_TYPE_I64:
+#endif
+ if (ret < 16) {
+ break;
+ }
+ /* fallthru */
+ case TCG_TYPE_V64:
+ case TCG_TYPE_V128:
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_dupi_vec(s, type, ret, arg);
+ return;
+ default:
+ g_assert_not_reached();
+ }
+
if (arg == 0) {
tgen_arithr(s, ARITH_XOR, ret, ret);
return;
tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
- TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
+ TCGReg arg1, intptr_t arg2)
{
- int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
+ switch (type) {
+ case TCG_TYPE_I32:
+ if (ret < 16) {
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
+ } else {
+ tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
+ }
+ break;
+ case TCG_TYPE_I64:
+ if (ret < 16) {
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
+ break;
+ }
+ /* FALLTHRU */
+ case TCG_TYPE_V64:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
+ ret, 0, arg1, arg2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
}
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
- TCGReg arg1, intptr_t arg2)
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+ TCGReg arg1, intptr_t arg2)
{
- int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
+ switch (type) {
+ case TCG_TYPE_I32:
+ if (arg < 16) {
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
+ } else {
+ tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
+ }
+ break;
+ case TCG_TYPE_I64:
+ if (arg < 16) {
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
+ break;
+ }
+ /* FALLTHRU */
+ case TCG_TYPE_V64:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
+ arg, 0, arg1, arg2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
}
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
return false;
}
rexw = P_REXW;
+ } else if (type != TCG_TYPE_I32) {
+ return false;
}
tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
tcg_out32(s, val);
tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
/* Prepare for both the fast path add of the tlb addend, and the slow
- path function argument setup. There are two cases worth note:
- For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
- before the fastpath ADDQ below. For 64-bit guest and x32 host, MOVQ
- copies the entire guest address for the slow path, while truncation
- for the 32-bit host happens with the fastpath ADDL below. */
+ path function argument setup. */
tcg_out_mov(s, ttype, r1, addrlo);
/* jne slow_path */
* Record the context of a call to the out of line helper code for the slow path
* for a load or store, so that we can later generate the correct helper code
*/
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
+ TCGMemOpIdx oi,
TCGReg datalo, TCGReg datahi,
TCGReg addrlo, TCGReg addrhi,
tcg_insn_unit *raddr,
label->is_ld = is_ld;
label->oi = oi;
+ label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
label->datalo_reg = datalo;
label->datahi_reg = datahi;
label->addrlo_reg = addrlo;
TCGMemOp opc = get_memop(oi);
TCGReg data_reg;
tcg_insn_unit **label_ptr = &l->label_ptr[0];
+ int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
/* resolve label address */
tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
data_reg = l->datalo_reg;
switch (opc & MO_SSIZE) {
case MO_SB:
- tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+ tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
break;
case MO_SW:
- tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+ tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
break;
#if TCG_TARGET_REG_BITS == 64
case MO_SL:
tcg_out_push(s, retaddr);
tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
}
-#elif defined(__x86_64__) && defined(__linux__)
-# include <asm/prctl.h>
-# include <sys/prctl.h>
-
+#elif TCG_TARGET_REG_BITS == 32
+# define x86_guest_base_seg 0
+# define x86_guest_base_index -1
+# define x86_guest_base_offset guest_base
+#else
+static int x86_guest_base_seg;
+static int x86_guest_base_index = -1;
+static int32_t x86_guest_base_offset;
+# if defined(__x86_64__) && defined(__linux__)
+# include <asm/prctl.h>
+# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
-
-static int guest_base_flags;
-static inline void setup_guest_base_seg(void)
+static inline int setup_guest_base_seg(void)
{
if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
- guest_base_flags = P_GS;
+ return P_GS;
}
+ return 0;
}
-#else
-# define guest_base_flags 0
-static inline void setup_guest_base_seg(void) { }
+# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
+# include <machine/sysarch.h>
+static inline int setup_guest_base_seg(void)
+{
+ if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
+ return P_GS;
+ }
+ return 0;
+}
+# else
+static inline int setup_guest_base_seg(void)
+{
+ return 0;
+}
+# endif
#endif /* SOFTMMU */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
TCGReg base, int index, intptr_t ofs,
- int seg, TCGMemOp memop)
+ int seg, bool is64, TCGMemOp memop)
{
const TCGMemOp real_bswap = memop & MO_BSWAP;
TCGMemOp bswap = real_bswap;
+ int rexw = is64 * P_REXW;
int movop = OPC_MOVL_GvEv;
if (have_movbe && real_bswap) {
base, index, 0, ofs);
break;
case MO_SB:
- tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
+ tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
base, index, 0, ofs);
break;
case MO_UW:
base, index, 0, ofs);
tcg_out_rolw_8(s, datalo);
}
- tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
+ tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
} else {
- tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
+ tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
datalo, base, index, 0, ofs);
}
break;
label_ptr, offsetof(CPUTLBEntry, addr_read));
/* TLB Hit. */
- tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
+ tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
/* Record the current context of a load into ldst label */
- add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+ add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
s->code_ptr, label_ptr);
#else
- {
- int32_t offset = guest_base;
- TCGReg base = addrlo;
- int index = -1;
- int seg = 0;
-
- /* For a 32-bit guest, the high 32 bits may contain garbage.
- We can do this with the ADDR32 prefix if we're not using
- a guest base, or when using segmentation. Otherwise we
- need to zero-extend manually. */
- if (guest_base == 0 || guest_base_flags) {
- seg = guest_base_flags;
- offset = 0;
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
- seg |= P_ADDR32;
- }
- } else if (TCG_TARGET_REG_BITS == 64) {
- if (TARGET_LONG_BITS == 32) {
- tcg_out_ext32u(s, TCG_REG_L0, base);
- base = TCG_REG_L0;
- }
- if (offset != guest_base) {
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
- index = TCG_REG_L1;
- offset = 0;
- }
- }
-
- tcg_out_qemu_ld_direct(s, datalo, datahi,
- base, index, offset, seg, opc);
- }
+ tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
+ x86_guest_base_offset, x86_guest_base_seg,
+ is64, opc);
#endif
}
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
- TCGReg base, intptr_t ofs, int seg,
- TCGMemOp memop)
+ TCGReg base, int index, intptr_t ofs,
+ int seg, TCGMemOp memop)
{
/* ??? Ideally we wouldn't need a scratch register. For user-only,
we could perform the bswap twice to restore the original value
tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
datalo = scratch;
}
- tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
- datalo, base, ofs);
+ tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
+ datalo, base, index, 0, ofs);
break;
case MO_16:
if (bswap) {
tcg_out_rolw_8(s, scratch);
datalo = scratch;
}
- tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
+ tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
+ base, index, 0, ofs);
break;
case MO_32:
if (bswap) {
tcg_out_bswap32(s, scratch);
datalo = scratch;
}
- tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+ tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
break;
case MO_64:
if (TCG_TARGET_REG_BITS == 64) {
tcg_out_bswap64(s, scratch);
datalo = scratch;
}
- tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
+ tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
+ base, index, 0, ofs);
} else if (bswap) {
tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
tcg_out_bswap32(s, scratch);
- tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
+ tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
+ base, index, 0, ofs);
tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
tcg_out_bswap32(s, scratch);
- tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
+ tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
+ base, index, 0, ofs + 4);
} else {
if (real_bswap) {
int t = datalo;
datalo = datahi;
datahi = t;
}
- tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
- tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
+ tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+ base, index, 0, ofs);
+ tcg_out_modrm_sib_offset(s, movop + seg, datahi,
+ base, index, 0, ofs + 4);
}
break;
default:
label_ptr, offsetof(CPUTLBEntry, addr_write));
/* TLB Hit. */
- tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
+ tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
/* Record the current context of a store into ldst label */
- add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
+ add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
s->code_ptr, label_ptr);
#else
- {
- int32_t offset = guest_base;
- TCGReg base = addrlo;
- int seg = 0;
-
- /* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
- if (guest_base == 0 || guest_base_flags) {
- seg = guest_base_flags;
- offset = 0;
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
- seg |= P_ADDR32;
- }
- } else if (TCG_TARGET_REG_BITS == 64) {
- /* ??? Note that we can't use the same SIB addressing scheme
- as for loads, since we require L0 free for bswap. */
- if (offset != guest_base) {
- if (TARGET_LONG_BITS == 32) {
- tcg_out_ext32u(s, TCG_REG_L0, base);
- base = TCG_REG_L0;
- }
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
- tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
- base = TCG_REG_L1;
- offset = 0;
- } else if (TARGET_LONG_BITS == 32) {
- tcg_out_ext32u(s, TCG_REG_L1, base);
- base = TCG_REG_L1;
- }
- }
-
- tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
- }
+ tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
+ x86_guest_base_offset, x86_guest_base_seg, opc);
#endif
}
tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
(intptr_t)(s->tb_jmp_target_addr + a0));
}
- s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
+ set_jmp_reset_offset(s, a0);
break;
case INDEX_op_goto_ptr:
/* jmp to the given host address (could be epilogue) */
break;
case INDEX_op_extu_i32_i64:
case INDEX_op_ext32u_i64:
+ case INDEX_op_extrl_i64_i32:
tcg_out_ext32u(s, a0, a1);
break;
case INDEX_op_ext_i32_i64:
case INDEX_op_ext32s_i64:
tcg_out_ext32s(s, a0, a1);
break;
+ case INDEX_op_extrh_i64_i32:
+ tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
+ break;
#endif
OP_32_64(deposit):
break;
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
case INDEX_op_mov_i64:
+ case INDEX_op_mov_vec:
case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
case INDEX_op_movi_i64:
+ case INDEX_op_dupi_vec:
case INDEX_op_call: /* Always emitted via tcg_out_call. */
default:
tcg_abort();
#undef OP_32_64
}
+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+ unsigned vecl, unsigned vece,
+ const TCGArg *args, const int *const_args)
+{
+ static int const add_insn[4] = {
+ OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
+ };
+ static int const sub_insn[4] = {
+ OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
+ };
+ static int const mul_insn[4] = {
+ OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
+ };
+ static int const shift_imm_insn[4] = {
+ OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
+ };
+ static int const cmpeq_insn[4] = {
+ OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
+ };
+ static int const cmpgt_insn[4] = {
+ OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
+ };
+ static int const punpckl_insn[4] = {
+ OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
+ };
+ static int const punpckh_insn[4] = {
+ OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
+ };
+ static int const packss_insn[4] = {
+ OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
+ };
+ static int const packus_insn[4] = {
+ OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
+ };
+
+ TCGType type = vecl + TCG_TYPE_V64;
+ int insn, sub;
+ TCGArg a0, a1, a2;
+
+ a0 = args[0];
+ a1 = args[1];
+ a2 = args[2];
+
+ switch (opc) {
+ case INDEX_op_add_vec:
+ insn = add_insn[vece];
+ goto gen_simd;
+ case INDEX_op_sub_vec:
+ insn = sub_insn[vece];
+ goto gen_simd;
+ case INDEX_op_mul_vec:
+ insn = mul_insn[vece];
+ goto gen_simd;
+ case INDEX_op_and_vec:
+ insn = OPC_PAND;
+ goto gen_simd;
+ case INDEX_op_or_vec:
+ insn = OPC_POR;
+ goto gen_simd;
+ case INDEX_op_xor_vec:
+ insn = OPC_PXOR;
+ goto gen_simd;
+ case INDEX_op_x86_punpckl_vec:
+ insn = punpckl_insn[vece];
+ goto gen_simd;
+ case INDEX_op_x86_punpckh_vec:
+ insn = punpckh_insn[vece];
+ goto gen_simd;
+ case INDEX_op_x86_packss_vec:
+ insn = packss_insn[vece];
+ goto gen_simd;
+ case INDEX_op_x86_packus_vec:
+ insn = packus_insn[vece];
+ goto gen_simd;
+#if TCG_TARGET_REG_BITS == 32
+ case INDEX_op_dup2_vec:
+ /* Constraints have already placed both 32-bit inputs in xmm regs. */
+ insn = OPC_PUNPCKLDQ;
+ goto gen_simd;
+#endif
+ gen_simd:
+ tcg_debug_assert(insn != OPC_UD2);
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, a0, a1, a2);
+ break;
+
+ case INDEX_op_cmp_vec:
+ sub = args[3];
+ if (sub == TCG_COND_EQ) {
+ insn = cmpeq_insn[vece];
+ } else if (sub == TCG_COND_GT) {
+ insn = cmpgt_insn[vece];
+ } else {
+ g_assert_not_reached();
+ }
+ goto gen_simd;
+
+ case INDEX_op_andc_vec:
+ insn = OPC_PANDN;
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, a0, a2, a1);
+ break;
+
+ case INDEX_op_shli_vec:
+ sub = 6;
+ goto gen_shift;
+ case INDEX_op_shri_vec:
+ sub = 2;
+ goto gen_shift;
+ case INDEX_op_sari_vec:
+ tcg_debug_assert(vece != MO_64);
+ sub = 4;
+ gen_shift:
+ tcg_debug_assert(vece != MO_8);
+ insn = shift_imm_insn[vece];
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, sub, a0, a1);
+ tcg_out8(s, a2);
+ break;
+
+ case INDEX_op_ld_vec:
+ tcg_out_ld(s, type, a0, a1, a2);
+ break;
+ case INDEX_op_st_vec:
+ tcg_out_st(s, type, a0, a1, a2);
+ break;
+ case INDEX_op_dup_vec:
+ tcg_out_dup_vec(s, type, vece, a0, a1);
+ break;
+
+ case INDEX_op_x86_shufps_vec:
+ insn = OPC_SHUFPS;
+ sub = args[3];
+ goto gen_simd_imm8;
+ case INDEX_op_x86_blend_vec:
+ if (vece == MO_16) {
+ insn = OPC_PBLENDW;
+ } else if (vece == MO_32) {
+ insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
+ } else {
+ g_assert_not_reached();
+ }
+ sub = args[3];
+ goto gen_simd_imm8;
+ case INDEX_op_x86_vperm2i128_vec:
+ insn = OPC_VPERM2I128;
+ sub = args[3];
+ goto gen_simd_imm8;
+ gen_simd_imm8:
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, a0, a1, a2);
+ tcg_out8(s, sub);
+ break;
+
+ case INDEX_op_x86_vpblendvb_vec:
+ insn = OPC_VPBLENDVB;
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, a0, a1, a2);
+ tcg_out8(s, args[3] << 4);
+ break;
+
+ case INDEX_op_x86_psrldq_vec:
+ tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
+ tcg_out8(s, a2);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
= { .args_ct_str = { "r", "r", "L", "L" } };
static const TCGTargetOpDef L_L_L_L
= { .args_ct_str = { "L", "L", "L", "L" } };
+ static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
+ static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
+ static const TCGTargetOpDef x_x_x_x
+ = { .args_ct_str = { "x", "x", "x", "x" } };
+ static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
switch (op) {
case INDEX_op_goto_ptr:
case INDEX_op_neg_i64:
case INDEX_op_not_i32:
case INDEX_op_not_i64:
+ case INDEX_op_extrh_i64_i32:
return &r_0;
case INDEX_op_ext8s_i32:
case INDEX_op_ext32u_i64:
case INDEX_op_ext_i32_i64:
case INDEX_op_extu_i32_i64:
+ case INDEX_op_extrl_i64_i32:
case INDEX_op_extract_i32:
case INDEX_op_extract_i64:
case INDEX_op_sextract_i32:
return &s2;
}
+ case INDEX_op_ld_vec:
+ case INDEX_op_st_vec:
+ return &x_r;
+
+ case INDEX_op_add_vec:
+ case INDEX_op_sub_vec:
+ case INDEX_op_mul_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_andc_vec:
+ case INDEX_op_cmp_vec:
+ case INDEX_op_x86_shufps_vec:
+ case INDEX_op_x86_blend_vec:
+ case INDEX_op_x86_packss_vec:
+ case INDEX_op_x86_packus_vec:
+ case INDEX_op_x86_vperm2i128_vec:
+ case INDEX_op_x86_punpckl_vec:
+ case INDEX_op_x86_punpckh_vec:
+#if TCG_TARGET_REG_BITS == 32
+ case INDEX_op_dup2_vec:
+#endif
+ return &x_x_x;
+ case INDEX_op_dup_vec:
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ case INDEX_op_sari_vec:
+ case INDEX_op_x86_psrldq_vec:
+ return &x_x;
+ case INDEX_op_x86_vpblendvb_vec:
+ return &x_x_x_x;
+
default:
break;
}
return NULL;
}
-static int tcg_target_callee_save_regs[] = {
+int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+{
+ switch (opc) {
+ case INDEX_op_add_vec:
+ case INDEX_op_sub_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_andc_vec:
+ return 1;
+ case INDEX_op_cmp_vec:
+ return -1;
+
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ /* We must expand the operation for MO_8. */
+ return vece == MO_8 ? -1 : 1;
+
+ case INDEX_op_sari_vec:
+ /* We must expand the operation for MO_8. */
+ if (vece == MO_8) {
+ return -1;
+ }
+ /* We can emulate this for MO_64, but it does not pay off
+ unless we're producing at least 4 values. */
+ if (vece == MO_64) {
+ return type >= TCG_TYPE_V256 ? -1 : 0;
+ }
+ return 1;
+
+ case INDEX_op_mul_vec:
+ if (vece == MO_8) {
+ /* We can expand the operation for MO_8. */
+ return -1;
+ }
+ if (vece == MO_64) {
+ return 0;
+ }
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+ TCGArg a0, ...)
+{
+ va_list va;
+ TCGArg a1, a2;
+ TCGv_vec v0, t1, t2, t3, t4;
+
+ va_start(va, a0);
+ v0 = temp_tcgv_vec(arg_temp(a0));
+
+ switch (opc) {
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ tcg_debug_assert(vece == MO_8);
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ /* Unpack to W, shift, and repack. Tricky bits:
+ (1) Use punpck*bw x,x to produce DDCCBBAA,
+ i.e. duplicate in other half of the 16-bit lane.
+ (2) For right-shift, add 8 so that the high half of
+ the lane becomes zero. For left-shift, we must
+ shift up and down again.
+ (3) Step 2 leaves high half zero such that PACKUSWB
+ (pack with unsigned saturation) does not modify
+ the quantity. */
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), a1, a1);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t2), a1, a1);
+ if (opc == INDEX_op_shri_vec) {
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ } else {
+ vec_gen_3(INDEX_op_shli_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_shli_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
+ }
+ vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+
+ case INDEX_op_sari_vec:
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ if (vece == MO_8) {
+ /* Unpack to W, shift, and repack, as above. */
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), a1, a1);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t2), a1, a1);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+ }
+ tcg_debug_assert(vece == MO_64);
+ /* MO_64: If the shift is <= 32, we can emulate the sign extend by
+ performing an arithmetic 32-bit shift and overwriting the high
+ half of the result (note that the ISA says shift of 32 is valid). */
+ if (a2 <= 32) {
+ t1 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
+ vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
+ a0, a0, tcgv_vec_arg(t1), 0xaa);
+ tcg_temp_free_vec(t1);
+ break;
+ }
+ /* Otherwise we will need to use a compare vs 0 to produce the
+ sign-extend, shift and merge. */
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_const_zeros_vec(type);
+ vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
+ tcg_temp_free_vec(t2);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
+ vec_gen_3(INDEX_op_shli_vec, type, MO_64,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
+ vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+ break;
+
+ case INDEX_op_mul_vec:
+ tcg_debug_assert(vece == MO_8);
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ switch (type) {
+ case TCG_TYPE_V64:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V128);
+ tcg_gen_dup16i_vec(t2, 0);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+
+ case TCG_TYPE_V128:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t3 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t4 = tcg_temp_new_vec(TCG_TYPE_V128);
+ tcg_gen_dup16i_vec(t4, 0);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_mul_vec(MO_16, t3, t3, t4);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ tcg_gen_shri_vec(MO_16, t3, t3, 8);
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ tcg_temp_free_vec(t3);
+ tcg_temp_free_vec(t4);
+ break;
+
+ case TCG_TYPE_V256:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t3 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t4 = tcg_temp_new_vec(TCG_TYPE_V256);
+ tcg_gen_dup16i_vec(t4, 0);
+ /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
+ t1: extends of B[0-7], D[0-7]
+ t2: extends of X[0-7], Z[0-7]
+ t3: extends of A[0-7], C[0-7]
+ t4: extends of W[0-7], Y[0-7]. */
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
+ /* t1: BX DZ; t2: AW CY. */
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_mul_vec(MO_16, t3, t3, t4);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ tcg_gen_shri_vec(MO_16, t3, t3, 8);
+ /* a0: AW BX CY DZ. */
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ tcg_temp_free_vec(t3);
+ tcg_temp_free_vec(t4);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+ break;
+
+ case INDEX_op_cmp_vec:
+ {
+ enum {
+ NEED_SWAP = 1,
+ NEED_INV = 2,
+ NEED_BIAS = 4
+ };
+ static const uint8_t fixups[16] = {
+ [0 ... 15] = -1,
+ [TCG_COND_EQ] = 0,
+ [TCG_COND_NE] = NEED_INV,
+ [TCG_COND_GT] = 0,
+ [TCG_COND_LT] = NEED_SWAP,
+ [TCG_COND_LE] = NEED_INV,
+ [TCG_COND_GE] = NEED_SWAP | NEED_INV,
+ [TCG_COND_GTU] = NEED_BIAS,
+ [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
+ [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
+ [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
+ };
+
+ TCGCond cond;
+ uint8_t fixup;
+
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ cond = va_arg(va, TCGArg);
+ fixup = fixups[cond & 15];
+ tcg_debug_assert(fixup != 0xff);
+
+ if (fixup & NEED_INV) {
+ cond = tcg_invert_cond(cond);
+ }
+ if (fixup & NEED_SWAP) {
+ TCGArg t;
+ t = a1, a1 = a2, a2 = t;
+ cond = tcg_swap_cond(cond);
+ }
+
+ t1 = t2 = NULL;
+ if (fixup & NEED_BIAS) {
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
+ tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
+ tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
+ a1 = tcgv_vec_arg(t1);
+ a2 = tcgv_vec_arg(t2);
+ cond = tcg_signed_cond(cond);
+ }
+
+ tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
+ vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
+
+ if (fixup & NEED_BIAS) {
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ }
+ if (fixup & NEED_INV) {
+ tcg_gen_not_vec(vece, v0, v0);
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ va_end(va);
+}
+
+static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
TCG_REG_RBP,
TCG_REG_RBX,
tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
/* jmp *tb. */
tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
- (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
- + stack_addend);
+ (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
+ + stack_addend);
#else
+# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
+ if (guest_base) {
+ int seg = setup_guest_base_seg();
+ if (seg != 0) {
+ x86_guest_base_seg = seg;
+ } else if (guest_base == (int32_t)guest_base) {
+ x86_guest_base_offset = guest_base;
+ } else {
+ /* Choose R12 because, as a base, it requires a SIB byte. */
+ x86_guest_base_index = TCG_REG_R12;
+ tcg_out_mov(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
+ tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
+ }
+ }
+# endif
tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
/* jmp *tb. */
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
+ if (have_avx2) {
+ tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
+ }
for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
tcg_out_pop(s, tcg_target_callee_save_regs[i]);
}
tcg_out_opc(s, OPC_RET, 0, 0, 0);
-
-#if !defined(CONFIG_SOFTMMU)
- /* Try to set up a segment register to point to guest_base. */
- if (guest_base) {
- setup_guest_base_seg();
- }
-#endif
}
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
- unsigned a, b, c, d;
+ unsigned a, b, c, d, b7 = 0;
int max = __get_cpuid_max(0, 0);
+ if (max >= 7) {
+ /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
+ __cpuid_count(7, 0, a, b7, c, d);
+ have_bmi1 = (b7 & bit_BMI) != 0;
+ have_bmi2 = (b7 & bit_BMI2) != 0;
+ }
+
if (max >= 1) {
__cpuid(1, a, b, c, d);
#ifndef have_cmov
available, we'll use a small forward branch. */
have_cmov = (d & bit_CMOV) != 0;
#endif
+
/* MOVBE is only available on Intel Atom and Haswell CPUs, so we
need to probe for it. */
have_movbe = (c & bit_MOVBE) != 0;
have_popcnt = (c & bit_POPCNT) != 0;
- }
- if (max >= 7) {
- /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
- __cpuid_count(7, 0, a, b, c, d);
- have_bmi1 = (b & bit_BMI) != 0;
- have_bmi2 = (b & bit_BMI2) != 0;
+ /* There are a number of things we must check before we can be
+ sure of not hitting invalid opcode. */
+ if (c & bit_OSXSAVE) {
+ unsigned xcrl, xcrh;
+ /* The xgetbv instruction is not available to older versions of
+ * the assembler, so we encode the instruction manually.
+ */
+ asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
+ if ((xcrl & 6) == 6) {
+ have_avx1 = (c & bit_AVX) != 0;
+ have_avx2 = (b7 & bit_AVX2) != 0;
+ }
+ }
}
max = __get_cpuid_max(0x8000000, 0);
}
#endif /* CONFIG_CPUID_H */
+ tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
if (TCG_TARGET_REG_BITS == 64) {
- tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
- tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffff);
- } else {
- tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);
+ tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
+ }
+ if (have_avx1) {
+ tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
+ tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
+ }
+ if (have_avx2) {
+ tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
}
- tcg_regset_clear(tcg_target_call_clobber_regs);
+ tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
}
- tcg_regset_clear(s->reserved_regs);
+ s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
}