#include <inttypes.h>
#include <signal.h>
+#include "qemu/host-utils.h"
#include "cpu.h"
#include "disas/disas.h"
#include "tcg-op.h"
#define PREFIX_LOCK 0x04
#define PREFIX_DATA 0x08
#define PREFIX_ADR 0x10
+#define PREFIX_VEX 0x20
#ifdef TARGET_X86_64
#define CODE64(s) ((s)->code64)
#define REX_B(s) 0
#endif
+#ifdef TARGET_X86_64
+# define ctztl ctz64
+# define clztl clz64
+#else
+# define ctztl ctz32
+# define clztl clz32
+#endif
+
//#define MACRO_TEST 1
/* global register indexes */
static TCGv_ptr cpu_env;
-static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst;
+static TCGv cpu_A0;
+static TCGv cpu_cc_dst, cpu_cc_src, cpu_cc_src2, cpu_cc_srcT;
static TCGv_i32 cpu_cc_op;
static TCGv cpu_regs[CPU_NB_REGS];
/* local temps */
-static TCGv cpu_T[2], cpu_T3;
+static TCGv cpu_T[2];
/* local register indexes (only used inside old micro ops) */
static TCGv cpu_tmp0, cpu_tmp4;
static TCGv_ptr cpu_ptr0, cpu_ptr1;
static TCGv_i32 cpu_tmp2_i32, cpu_tmp3_i32;
static TCGv_i64 cpu_tmp1_i64;
-static TCGv cpu_tmp5;
static uint8_t gen_opc_cc_op[OPC_BUF_SIZE];
int code64; /* 64 bit code segment */
int rex_x, rex_b;
#endif
+ int vex_l; /* vex vector length */
+ int vex_v; /* vex vvvv register, without 1's complement. */
int ss32; /* 32 bit stack segment */
CCOp cc_op; /* current CC operation */
bool cc_op_dirty;
static void gen_eob(DisasContext *s);
static void gen_jmp(DisasContext *s, target_ulong eip);
static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num);
+static void gen_op(DisasContext *s1, int op, int ot, int d);
/* i386 arith/logic operations */
enum {
};
enum {
- USES_CC_DST = 1,
- USES_CC_SRC = 2,
+ USES_CC_DST = 1,
+ USES_CC_SRC = 2,
+ USES_CC_SRC2 = 4,
+ USES_CC_SRCT = 8,
};
/* Bit set if the global variable is live after setting CC_OP to X. */
static const uint8_t cc_op_live[CC_OP_NB] = {
- [CC_OP_DYNAMIC] = USES_CC_DST | USES_CC_SRC,
+ [CC_OP_DYNAMIC] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
[CC_OP_EFLAGS] = USES_CC_SRC,
[CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC,
[CC_OP_ADDB ... CC_OP_ADDQ] = USES_CC_DST | USES_CC_SRC,
- [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC,
- [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC,
- [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC,
+ [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+ [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRCT,
+ [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
[CC_OP_LOGICB ... CC_OP_LOGICQ] = USES_CC_DST,
[CC_OP_INCB ... CC_OP_INCQ] = USES_CC_DST | USES_CC_SRC,
[CC_OP_DECB ... CC_OP_DECQ] = USES_CC_DST | USES_CC_SRC,
[CC_OP_SHLB ... CC_OP_SHLQ] = USES_CC_DST | USES_CC_SRC,
[CC_OP_SARB ... CC_OP_SARQ] = USES_CC_DST | USES_CC_SRC,
+ [CC_OP_BMILGB ... CC_OP_BMILGQ] = USES_CC_DST | USES_CC_SRC,
+ [CC_OP_ADCX] = USES_CC_DST | USES_CC_SRC,
+ [CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2,
+ [CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+ [CC_OP_CLR] = 0,
};
static void set_cc_op(DisasContext *s, CCOp op)
if (dead & USES_CC_SRC) {
tcg_gen_discard_tl(cpu_cc_src);
}
+ if (dead & USES_CC_SRC2) {
+ tcg_gen_discard_tl(cpu_cc_src2);
+ }
+ if (dead & USES_CC_SRCT) {
+ tcg_gen_discard_tl(cpu_cc_srcT);
+ }
+ if (op == CC_OP_DYNAMIC) {
+ /* The DYNAMIC setting is translator only, and should never be
+ stored. Thus we always consider it clean. */
+ s->cc_op_dirty = false;
+ } else {
+ /* Discard any computed CC_OP value (see shifts). */
+ if (s->cc_op == CC_OP_DYNAMIC) {
+ tcg_gen_discard_i32(cpu_cc_op);
+ }
+ s->cc_op_dirty = true;
+ }
s->cc_op = op;
- /* The DYNAMIC setting is translator only, and should never be
- stored. Thus we always consider it clean. */
- s->cc_op_dirty = (op != CC_OP_DYNAMIC);
}
static void gen_update_cc_op(DisasContext *s)
tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
}
-static inline void gen_op_cmpl_T0_T1_cc(void)
+static void gen_op_update3_cc(TCGv reg)
{
+ tcg_gen_mov_tl(cpu_cc_src2, reg);
tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
- tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
}
static inline void gen_op_testl_T0_T1_cc(void)
static void gen_op_update_neg_cc(void)
{
- tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-}
-
-/* compute eflags.C to reg */
-static void gen_compute_eflags_c(DisasContext *s, TCGv reg)
-{
- gen_update_cc_op(s);
- gen_helper_cc_compute_c(cpu_tmp2_i32, cpu_env, cpu_cc_op);
- tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32);
+ tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
+ tcg_gen_movi_tl(cpu_cc_srcT, 0);
}
/* compute all eflags to cc_src */
static void gen_compute_eflags(DisasContext *s)
{
+ TCGv zero, dst, src1, src2;
+ int live, dead;
+
if (s->cc_op == CC_OP_EFLAGS) {
return;
}
+ if (s->cc_op == CC_OP_CLR) {
+ tcg_gen_movi_tl(cpu_cc_src, CC_Z);
+ set_cc_op(s, CC_OP_EFLAGS);
+ return;
+ }
+
+ TCGV_UNUSED(zero);
+ dst = cpu_cc_dst;
+ src1 = cpu_cc_src;
+ src2 = cpu_cc_src2;
+
+ /* Take care to not read values that are not live. */
+ live = cc_op_live[s->cc_op] & ~USES_CC_SRCT;
+ dead = live ^ (USES_CC_DST | USES_CC_SRC | USES_CC_SRC2);
+ if (dead) {
+ zero = tcg_const_tl(0);
+ if (dead & USES_CC_DST) {
+ dst = zero;
+ }
+ if (dead & USES_CC_SRC) {
+ src1 = zero;
+ }
+ if (dead & USES_CC_SRC2) {
+ src2 = zero;
+ }
+ }
+
gen_update_cc_op(s);
- gen_helper_cc_compute_all(cpu_tmp2_i32, cpu_env, cpu_cc_op);
+ gen_helper_cc_compute_all(cpu_cc_src, dst, src1, src2, cpu_cc_op);
set_cc_op(s, CC_OP_EFLAGS);
- tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
-}
-/* compute eflags.P to reg */
-static void gen_compute_eflags_p(DisasContext *s, TCGv reg)
-{
- gen_compute_eflags(s);
- tcg_gen_shri_tl(reg, cpu_cc_src, 2);
- tcg_gen_andi_tl(reg, reg, 1);
+ if (dead) {
+ tcg_temp_free(zero);
+ }
}
-/* compute eflags.S to reg */
-static void gen_compute_eflags_s(DisasContext *s, TCGv reg)
+typedef struct CCPrepare {
+ TCGCond cond;
+ TCGv reg;
+ TCGv reg2;
+ target_ulong imm;
+ target_ulong mask;
+ bool use_reg2;
+ bool no_setcond;
+} CCPrepare;
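+
+/* As consumed by gen_setcc1() and gen_jcc1_noeob() below, a CCPrepare
+ describes the test (reg & mask) <cond> (use_reg2 ? reg2 : imm).
+ When no_setcond is set, reg already holds the boolean result
+ (inverted if cond is TCG_COND_EQ) and only needs to be copied. */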
+
+/* compute eflags.C to reg */
+static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
{
- gen_compute_eflags(s);
- tcg_gen_shri_tl(reg, cpu_cc_src, 7);
- tcg_gen_andi_tl(reg, reg, 1);
+ TCGv t0, t1;
+ int size, shift;
+
+ switch (s->cc_op) {
+ case CC_OP_SUBB ... CC_OP_SUBQ:
+ /* (DATA_TYPE)CC_SRCT < (DATA_TYPE)CC_SRC */
+ size = s->cc_op - CC_OP_SUBB;
+ t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+ /* If no temporary was used, be careful not to alias t1 and t0. */
+ t0 = TCGV_EQUAL(t1, cpu_cc_src) ? cpu_tmp0 : reg;
+ tcg_gen_mov_tl(t0, cpu_cc_srcT);
+ gen_extu(size, t0);
+ goto add_sub;
+
+ case CC_OP_ADDB ... CC_OP_ADDQ:
+ /* (DATA_TYPE)CC_DST < (DATA_TYPE)CC_SRC */
+ size = s->cc_op - CC_OP_ADDB;
+ t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+ t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
+ add_sub:
+ return (CCPrepare) { .cond = TCG_COND_LTU, .reg = t0,
+ .reg2 = t1, .mask = -1, .use_reg2 = true };
+
+ case CC_OP_LOGICB ... CC_OP_LOGICQ:
+ case CC_OP_CLR:
+ return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+
+ case CC_OP_INCB ... CC_OP_INCQ:
+ case CC_OP_DECB ... CC_OP_DECQ:
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+ .mask = -1, .no_setcond = true };
+
+ case CC_OP_SHLB ... CC_OP_SHLQ:
+ /* (CC_SRC >> (DATA_BITS - 1)) & 1 */
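+ /* gen_shift_flags below leaves the operand shifted by count-1
+ in CC_SRC, so its top data bit is the last bit shifted out,
+ i.e. the carry. */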
+ size = s->cc_op - CC_OP_SHLB;
+ shift = (8 << size) - 1;
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+ .mask = (target_ulong)1 << shift };
+
+ case CC_OP_MULB ... CC_OP_MULQ:
+ return (CCPrepare) { .cond = TCG_COND_NE,
+ .reg = cpu_cc_src, .mask = -1 };
+
+ case CC_OP_BMILGB ... CC_OP_BMILGQ:
+ size = s->cc_op - CC_OP_BMILGB;
+ t0 = gen_ext_tl(reg, cpu_cc_src, size, false);
+ return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+
+ case CC_OP_ADCX:
+ case CC_OP_ADCOX:
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst,
+ .mask = -1, .no_setcond = true };
+
+ case CC_OP_EFLAGS:
+ case CC_OP_SARB ... CC_OP_SARQ:
+ /* CC_SRC & 1 */
+ return (CCPrepare) { .cond = TCG_COND_NE,
+ .reg = cpu_cc_src, .mask = CC_C };
+
+ default:
+ /* The need to compute only C from CC_OP_DYNAMIC is important
+ in efficiently implementing e.g. INC at the start of a TB. */
+ gen_update_cc_op(s);
+ gen_helper_cc_compute_c(reg, cpu_cc_dst, cpu_cc_src,
+ cpu_cc_src2, cpu_cc_op);
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+ .mask = -1, .no_setcond = true };
+ }
}
-/* compute eflags.O to reg */
-static void gen_compute_eflags_o(DisasContext *s, TCGv reg)
+/* compute eflags.P to reg */
+static CCPrepare gen_prepare_eflags_p(DisasContext *s, TCGv reg)
{
gen_compute_eflags(s);
- tcg_gen_shri_tl(reg, cpu_cc_src, 11);
- tcg_gen_andi_tl(reg, reg, 1);
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+ .mask = CC_P };
}
-/* compute eflags.Z to reg */
-static void gen_compute_eflags_z(DisasContext *s, TCGv reg)
+/* compute eflags.S to reg */
+static CCPrepare gen_prepare_eflags_s(DisasContext *s, TCGv reg)
{
- gen_compute_eflags(s);
- tcg_gen_shri_tl(reg, cpu_cc_src, 6);
- tcg_gen_andi_tl(reg, reg, 1);
+ switch (s->cc_op) {
+ case CC_OP_DYNAMIC:
+ gen_compute_eflags(s);
+ /* FALLTHRU */
+ case CC_OP_EFLAGS:
+ case CC_OP_ADCX:
+ case CC_OP_ADOX:
+ case CC_OP_ADCOX:
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+ .mask = CC_S };
+ case CC_OP_CLR:
+ return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+ default:
+ {
+ int size = (s->cc_op - CC_OP_ADDB) & 3;
+ TCGv t0 = gen_ext_tl(reg, cpu_cc_dst, size, true);
+ return (CCPrepare) { .cond = TCG_COND_LT, .reg = t0, .mask = -1 };
+ }
+ }
}
-static inline void gen_setcc_slow_T0(DisasContext *s, int jcc_op)
+/* compute eflags.O to reg */
+static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
{
- switch(jcc_op) {
- case JCC_O:
- gen_compute_eflags_o(s, cpu_T[0]);
- break;
- case JCC_B:
- gen_compute_eflags_c(s, cpu_T[0]);
- break;
- case JCC_Z:
- gen_compute_eflags_z(s, cpu_T[0]);
- break;
- case JCC_BE:
- gen_compute_eflags(s);
- tcg_gen_shri_tl(cpu_T[0], cpu_cc_src, 6);
- tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
- tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
- break;
- case JCC_S:
- gen_compute_eflags_s(s, cpu_T[0]);
- break;
- case JCC_P:
- gen_compute_eflags_p(s, cpu_T[0]);
- break;
- case JCC_L:
- gen_compute_eflags(s);
- tcg_gen_shri_tl(cpu_T[0], cpu_cc_src, 11); /* CC_O */
- tcg_gen_shri_tl(cpu_tmp0, cpu_cc_src, 7); /* CC_S */
- tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
- tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
- break;
+ switch (s->cc_op) {
+ case CC_OP_ADOX:
+ case CC_OP_ADCOX:
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2,
+ .mask = -1, .no_setcond = true };
+ case CC_OP_CLR:
+ return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
default:
- case JCC_LE:
gen_compute_eflags(s);
- tcg_gen_shri_tl(cpu_T[0], cpu_cc_src, 11); /* CC_O */
- tcg_gen_shri_tl(cpu_tmp4, cpu_cc_src, 7); /* CC_S */
- tcg_gen_shri_tl(cpu_tmp0, cpu_cc_src, 6); /* CC_Z */
- tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
- tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
- tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
- break;
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+ .mask = CC_O };
}
}
-/* return true if setcc_slow is not needed (WARNING: must be kept in
- sync with gen_jcc1) */
-static int is_fast_jcc_case(DisasContext *s, int b)
+/* compute eflags.Z to reg */
+static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
{
- int jcc_op;
- jcc_op = (b >> 1) & 7;
- switch(s->cc_op) {
- /* we optimize the cmp/jcc case */
- case CC_OP_SUBB:
- case CC_OP_SUBW:
- case CC_OP_SUBL:
- case CC_OP_SUBQ:
- if (jcc_op == JCC_O || jcc_op == JCC_P)
- goto slow_jcc;
- break;
-
- /* some jumps are easy to compute */
- case CC_OP_ADDB:
- case CC_OP_ADDW:
- case CC_OP_ADDL:
- case CC_OP_ADDQ:
-
- case CC_OP_LOGICB:
- case CC_OP_LOGICW:
- case CC_OP_LOGICL:
- case CC_OP_LOGICQ:
-
- case CC_OP_INCB:
- case CC_OP_INCW:
- case CC_OP_INCL:
- case CC_OP_INCQ:
-
- case CC_OP_DECB:
- case CC_OP_DECW:
- case CC_OP_DECL:
- case CC_OP_DECQ:
-
- case CC_OP_SHLB:
- case CC_OP_SHLW:
- case CC_OP_SHLL:
- case CC_OP_SHLQ:
- if (jcc_op != JCC_Z && jcc_op != JCC_S)
- goto slow_jcc;
- break;
+ switch (s->cc_op) {
+ case CC_OP_DYNAMIC:
+ gen_compute_eflags(s);
+ /* FALLTHRU */
+ case CC_OP_EFLAGS:
+ case CC_OP_ADCX:
+ case CC_OP_ADOX:
+ case CC_OP_ADCOX:
+ return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+ .mask = CC_Z };
+ case CC_OP_CLR:
+ return (CCPrepare) { .cond = TCG_COND_ALWAYS, .mask = -1 };
default:
- slow_jcc:
- return 0;
+ {
+ int size = (s->cc_op - CC_OP_ADDB) & 3;
+ TCGv t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
+ return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+ }
}
- return 1;
}
-/* generate a conditional jump to label 'l1' according to jump opcode
+/* perform a conditional store into register 'reg' according to jump opcode
value 'b'. In the fast case, T0 is guaranted not to be used. */
-static inline void gen_jcc1(DisasContext *s, int b, int l1)
+static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
{
int inv, jcc_op, size, cond;
+ CCPrepare cc;
TCGv t0;
inv = b & 1;
jcc_op = (b >> 1) & 7;
switch (s->cc_op) {
- /* we optimize the cmp/jcc case */
- case CC_OP_SUBB:
- case CC_OP_SUBW:
- case CC_OP_SUBL:
- case CC_OP_SUBQ:
-
+ case CC_OP_SUBB ... CC_OP_SUBQ:
+ /* We optimize relational operators for the cmp/jcc case. */
size = s->cc_op - CC_OP_SUBB;
- switch(jcc_op) {
- case JCC_Z:
- fast_jcc_z:
- t0 = gen_ext_tl(cpu_tmp0, cpu_cc_dst, size, false);
- tcg_gen_brcondi_tl(inv ? TCG_COND_NE : TCG_COND_EQ, t0, 0, l1);
- break;
- case JCC_S:
- fast_jcc_s:
- t0 = gen_ext_tl(cpu_tmp0, cpu_cc_dst, size, true);
- tcg_gen_brcondi_tl(inv ? TCG_COND_GE : TCG_COND_LT, t0, 0, l1);
- break;
-
- case JCC_B:
- cond = inv ? TCG_COND_GEU : TCG_COND_LTU;
- goto fast_jcc_b;
+ switch (jcc_op) {
case JCC_BE:
- cond = inv ? TCG_COND_GTU : TCG_COND_LEU;
- fast_jcc_b:
- tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+ tcg_gen_mov_tl(cpu_tmp4, cpu_cc_srcT);
gen_extu(size, cpu_tmp4);
t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
- tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1);
+ cc = (CCPrepare) { .cond = TCG_COND_LEU, .reg = cpu_tmp4,
+ .reg2 = t0, .mask = -1, .use_reg2 = true };
break;
-
+
case JCC_L:
- cond = inv ? TCG_COND_GE : TCG_COND_LT;
+ cond = TCG_COND_LT;
goto fast_jcc_l;
case JCC_LE:
- cond = inv ? TCG_COND_GT : TCG_COND_LE;
+ cond = TCG_COND_LE;
fast_jcc_l:
- tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+ tcg_gen_mov_tl(cpu_tmp4, cpu_cc_srcT);
gen_exts(size, cpu_tmp4);
t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, true);
- tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1);
+ cc = (CCPrepare) { .cond = cond, .reg = cpu_tmp4,
+ .reg2 = t0, .mask = -1, .use_reg2 = true };
break;
-
+
default:
goto slow_jcc;
}
break;
-
- /* some jumps are easy to compute */
- case CC_OP_ADDB:
- case CC_OP_ADDW:
- case CC_OP_ADDL:
- case CC_OP_ADDQ:
-
- case CC_OP_ADCB:
- case CC_OP_ADCW:
- case CC_OP_ADCL:
- case CC_OP_ADCQ:
-
- case CC_OP_SBBB:
- case CC_OP_SBBW:
- case CC_OP_SBBL:
- case CC_OP_SBBQ:
-
- case CC_OP_LOGICB:
- case CC_OP_LOGICW:
- case CC_OP_LOGICL:
- case CC_OP_LOGICQ:
-
- case CC_OP_INCB:
- case CC_OP_INCW:
- case CC_OP_INCL:
- case CC_OP_INCQ:
-
- case CC_OP_DECB:
- case CC_OP_DECW:
- case CC_OP_DECL:
- case CC_OP_DECQ:
-
- case CC_OP_SHLB:
- case CC_OP_SHLW:
- case CC_OP_SHLL:
- case CC_OP_SHLQ:
-
- case CC_OP_SARB:
- case CC_OP_SARW:
- case CC_OP_SARL:
- case CC_OP_SARQ:
- switch(jcc_op) {
+
+ default:
+ slow_jcc:
+ /* This actually generates good code for JC, JZ and JS. */
+ switch (jcc_op) {
+ case JCC_O:
+ cc = gen_prepare_eflags_o(s, reg);
+ break;
+ case JCC_B:
+ cc = gen_prepare_eflags_c(s, reg);
+ break;
case JCC_Z:
- size = (s->cc_op - CC_OP_ADDB) & 3;
- goto fast_jcc_z;
+ cc = gen_prepare_eflags_z(s, reg);
+ break;
+ case JCC_BE:
+ gen_compute_eflags(s);
+ cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+ .mask = CC_Z | CC_C };
+ break;
case JCC_S:
- size = (s->cc_op - CC_OP_ADDB) & 3;
- goto fast_jcc_s;
+ cc = gen_prepare_eflags_s(s, reg);
+ break;
+ case JCC_P:
+ cc = gen_prepare_eflags_p(s, reg);
+ break;
+ case JCC_L:
+ gen_compute_eflags(s);
+ if (TCGV_EQUAL(reg, cpu_cc_src)) {
+ reg = cpu_tmp0;
+ }
+ tcg_gen_shri_tl(reg, cpu_cc_src, 4); /* CC_O -> CC_S */
+ tcg_gen_xor_tl(reg, reg, cpu_cc_src);
+ cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+ .mask = CC_S };
+ break;
default:
- goto slow_jcc;
+ case JCC_LE:
+ gen_compute_eflags(s);
+ if (TCGV_EQUAL(reg, cpu_cc_src)) {
+ reg = cpu_tmp0;
+ }
+ tcg_gen_shri_tl(reg, cpu_cc_src, 4); /* CC_O -> CC_S */
+ tcg_gen_xor_tl(reg, reg, cpu_cc_src);
+ cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+ .mask = CC_S | CC_Z };
+ break;
}
break;
- default:
- slow_jcc:
- gen_setcc_slow_T0(s, jcc_op);
- tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE,
- cpu_T[0], 0, l1);
- break;
+ }
+
+ if (inv) {
+ cc.cond = tcg_invert_cond(cc.cond);
+ }
+ return cc;
+}
+
+static void gen_setcc1(DisasContext *s, int b, TCGv reg)
+{
+ CCPrepare cc = gen_prepare_cc(s, b, reg);
+
+ if (cc.no_setcond) {
+ if (cc.cond == TCG_COND_EQ) {
+ tcg_gen_xori_tl(reg, cc.reg, 1);
+ } else {
+ tcg_gen_mov_tl(reg, cc.reg);
+ }
+ return;
+ }
+
+ if (cc.cond == TCG_COND_NE && !cc.use_reg2 && cc.imm == 0 &&
+ cc.mask != 0 && (cc.mask & (cc.mask - 1)) == 0) {
+ tcg_gen_shri_tl(reg, cc.reg, ctztl(cc.mask));
+ tcg_gen_andi_tl(reg, reg, 1);
+ return;
+ }
+ if (cc.mask != -1) {
+ tcg_gen_andi_tl(reg, cc.reg, cc.mask);
+ cc.reg = reg;
+ }
+ if (cc.use_reg2) {
+ tcg_gen_setcond_tl(cc.cond, reg, cc.reg, cc.reg2);
+ } else {
+ tcg_gen_setcondi_tl(cc.cond, reg, cc.reg, cc.imm);
+ }
+}
+
+static inline void gen_compute_eflags_c(DisasContext *s, TCGv reg)
+{
+ gen_setcc1(s, JCC_B << 1, reg);
+}
+
+/* generate a conditional jump to label 'l1' according to jump opcode
+ value 'b'. In the fast case, T0 is guaranteed not to be used. */
+static inline void gen_jcc1_noeob(DisasContext *s, int b, int l1)
+{
+ CCPrepare cc = gen_prepare_cc(s, b, cpu_T[0]);
+
+ if (cc.mask != -1) {
+ tcg_gen_andi_tl(cpu_T[0], cc.reg, cc.mask);
+ cc.reg = cpu_T[0];
+ }
+ if (cc.use_reg2) {
+ tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
+ } else {
+ tcg_gen_brcondi_tl(cc.cond, cc.reg, cc.imm, l1);
+ }
+}
+
+/* Generate a conditional jump to label 'l1' according to jump opcode
+ value 'b'. In the fast case, T0 is guaranteed not to be used.
+ A translation block must end soon. */
+static inline void gen_jcc1(DisasContext *s, int b, int l1)
+{
+ CCPrepare cc = gen_prepare_cc(s, b, cpu_T[0]);
+
+ gen_update_cc_op(s);
+ if (cc.mask != -1) {
+ tcg_gen_andi_tl(cpu_T[0], cc.reg, cc.mask);
+ cc.reg = cpu_T[0];
+ }
+ set_cc_op(s, CC_OP_DYNAMIC);
+ if (cc.use_reg2) {
+ tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
+ } else {
+ tcg_gen_brcondi_tl(cc.cond, cc.reg, cc.imm, l1);
}
}
static inline void gen_scas(DisasContext *s, int ot)
{
- gen_op_mov_TN_reg(OT_LONG, 0, R_EAX);
gen_string_movl_A0_EDI(s);
gen_op_ld_T1_A0(ot + s->mem_index);
- gen_op_cmpl_T0_T1_cc();
+ gen_op(s, OP_CMPL, ot, R_EAX);
gen_op_movl_T0_Dshift(ot);
gen_op_add_reg_T0(s->aflag, R_EDI);
- set_cc_op(s, CC_OP_SUBB + ot);
}
static inline void gen_cmps(DisasContext *s, int ot)
{
- gen_string_movl_A0_ESI(s);
- gen_op_ld_T0_A0(ot + s->mem_index);
gen_string_movl_A0_EDI(s);
gen_op_ld_T1_A0(ot + s->mem_index);
- gen_op_cmpl_T0_T1_cc();
+ gen_string_movl_A0_ESI(s);
+ gen_op(s, OP_CMPL, ot, OR_TMP0);
gen_op_movl_T0_Dshift(ot);
gen_op_add_reg_T0(s->aflag, R_ESI);
gen_op_add_reg_T0(s->aflag, R_EDI);
- set_cc_op(s, CC_OP_SUBB + ot);
}
static inline void gen_ins(DisasContext *s, int ot)
if (!s->jmp_opt) \
gen_op_jz_ecx(s->aflag, l2); \
gen_jmp(s, cur_eip); \
- set_cc_op(s, CC_OP_DYNAMIC); \
}
GEN_REPZ(movs)
gen_op_mov_reg_T0(ot, d);
else
gen_op_st_T0_A0(ot + s1->mem_index);
- tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
- tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
- tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_ADDB + ot);
- set_cc_op(s1, CC_OP_DYNAMIC);
+ gen_op_update3_cc(cpu_tmp4);
+ set_cc_op(s1, CC_OP_ADCB + ot);
break;
case OP_SBBL:
gen_compute_eflags_c(s1, cpu_tmp4);
gen_op_mov_reg_T0(ot, d);
else
gen_op_st_T0_A0(ot + s1->mem_index);
- tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
- tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
- tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_SUBB + ot);
- set_cc_op(s1, CC_OP_DYNAMIC);
+ gen_op_update3_cc(cpu_tmp4);
+ set_cc_op(s1, CC_OP_SBBB + ot);
break;
case OP_ADDL:
gen_op_addl_T0_T1();
set_cc_op(s1, CC_OP_ADDB + ot);
break;
case OP_SUBL:
+ tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);
tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
if (d != OR_TMP0)
gen_op_mov_reg_T0(ot, d);
set_cc_op(s1, CC_OP_LOGICB + ot);
break;
case OP_CMPL:
- gen_op_cmpl_T0_T1_cc();
+ tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
+ tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);
+ tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
set_cc_op(s1, CC_OP_SUBB + ot);
break;
}
tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
}
-static void gen_shift_rm_T1(DisasContext *s, int ot, int op1,
- int is_right, int is_arith)
+static void gen_shift_flags(DisasContext *s, int ot, TCGv result, TCGv shm1,
+ TCGv count, bool is_right)
{
- target_ulong mask;
- int shift_label;
- TCGv t0, t1, t2;
+ TCGv_i32 z32, s32, oldop;
+ TCGv z_tl;
+
+ /* Store the results into the CC variables. If we know that the
+ variable must be dead, store unconditionally. Otherwise we
+ must not disrupt the current contents. */
+ z_tl = tcg_const_tl(0);
+ if (cc_op_live[s->cc_op] & USES_CC_DST) {
+ tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_dst, count, z_tl,
+ result, cpu_cc_dst);
+ } else {
+ tcg_gen_mov_tl(cpu_cc_dst, result);
+ }
+ if (cc_op_live[s->cc_op] & USES_CC_SRC) {
+ tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_src, count, z_tl,
+ shm1, cpu_cc_src);
+ } else {
+ tcg_gen_mov_tl(cpu_cc_src, shm1);
+ }
+ tcg_temp_free(z_tl);
- if (ot == OT_QUAD) {
- mask = 0x3f;
+ /* Get the two potential CC_OP values into temporaries. */
+ tcg_gen_movi_i32(cpu_tmp2_i32, (is_right ? CC_OP_SARB : CC_OP_SHLB) + ot);
+ if (s->cc_op == CC_OP_DYNAMIC) {
+ oldop = cpu_cc_op;
} else {
- mask = 0x1f;
+ tcg_gen_movi_i32(cpu_tmp3_i32, s->cc_op);
+ oldop = cpu_tmp3_i32;
}
+ /* Conditionally store the CC_OP value. */
+ z32 = tcg_const_i32(0);
+ s32 = tcg_temp_new_i32();
+ tcg_gen_trunc_tl_i32(s32, count);
+ tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, s32, z32, cpu_tmp2_i32, oldop);
+ tcg_temp_free_i32(z32);
+ tcg_temp_free_i32(s32);
+
+ /* The CC_OP value is no longer predictable. */
+ set_cc_op(s, CC_OP_DYNAMIC);
+}
+
+static void gen_shift_rm_T1(DisasContext *s, int ot, int op1,
+ int is_right, int is_arith)
+{
+ target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+
/* load */
if (op1 == OR_TMP0) {
gen_op_ld_T0_A0(ot + s->mem_index);
gen_op_mov_TN_reg(ot, 0, op1);
}
- t0 = tcg_temp_local_new();
- t1 = tcg_temp_local_new();
- t2 = tcg_temp_local_new();
-
- tcg_gen_andi_tl(t2, cpu_T[1], mask);
+ tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
+ tcg_gen_subi_tl(cpu_tmp0, cpu_T[1], 1);
if (is_right) {
if (is_arith) {
gen_exts(ot, cpu_T[0]);
- tcg_gen_mov_tl(t0, cpu_T[0]);
- tcg_gen_sar_tl(cpu_T[0], cpu_T[0], t2);
+ tcg_gen_sar_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+ tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
} else {
gen_extu(ot, cpu_T[0]);
- tcg_gen_mov_tl(t0, cpu_T[0]);
- tcg_gen_shr_tl(cpu_T[0], cpu_T[0], t2);
+ tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+ tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
}
} else {
- tcg_gen_mov_tl(t0, cpu_T[0]);
- tcg_gen_shl_tl(cpu_T[0], cpu_T[0], t2);
+ tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+ tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
}
/* store */
gen_op_mov_reg_T0(ot, op1);
}
- /* update eflags */
- gen_update_cc_op(s);
-
- tcg_gen_mov_tl(t1, cpu_T[0]);
-
- shift_label = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, shift_label);
-
- tcg_gen_addi_tl(t2, t2, -1);
- tcg_gen_mov_tl(cpu_cc_dst, t1);
-
- if (is_right) {
- if (is_arith) {
- tcg_gen_sar_tl(cpu_cc_src, t0, t2);
- } else {
- tcg_gen_shr_tl(cpu_cc_src, t0, t2);
- }
- } else {
- tcg_gen_shl_tl(cpu_cc_src, t0, t2);
- }
-
- if (is_right) {
- tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
- } else {
- tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
- }
-
- gen_set_label(shift_label);
- set_cc_op(s, CC_OP_DYNAMIC); /* cannot predict flags after */
-
- tcg_temp_free(t0);
- tcg_temp_free(t1);
- tcg_temp_free(t2);
+ gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, cpu_T[1], is_right);
}
static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2,
int is_right, int is_arith)
{
- int mask;
-
- if (ot == OT_QUAD)
- mask = 0x3f;
- else
- mask = 0x1f;
+ int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
/* load */
if (op1 == OR_TMP0)
tcg_gen_shri_tl(ret, arg1, -arg2);
}
-static void gen_rot_rm_T1(DisasContext *s, int ot, int op1,
- int is_right)
+static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, int is_right)
{
- target_ulong mask;
- int label1, label2, data_bits;
- TCGv t0, t1, t2, a0;
-
- /* XXX: inefficient, but we must use local temps */
- t0 = tcg_temp_local_new();
- t1 = tcg_temp_local_new();
- t2 = tcg_temp_local_new();
- a0 = tcg_temp_local_new();
-
- if (ot == OT_QUAD)
- mask = 0x3f;
- else
- mask = 0x1f;
+ target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+ TCGv_i32 t0, t1;
/* load */
if (op1 == OR_TMP0) {
- tcg_gen_mov_tl(a0, cpu_A0);
- gen_op_ld_v(ot + s->mem_index, t0, a0);
+ gen_op_ld_T0_A0(ot + s->mem_index);
} else {
- gen_op_mov_v_reg(ot, t0, op1);
+ gen_op_mov_TN_reg(ot, 0, op1);
}
- tcg_gen_mov_tl(t1, cpu_T[1]);
+ tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
- tcg_gen_andi_tl(t1, t1, mask);
-
- /* Must test zero case to avoid using undefined behaviour in TCG
- shifts. */
- label1 = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label1);
-
- if (ot <= OT_WORD)
- tcg_gen_andi_tl(cpu_tmp0, t1, (1 << (3 + ot)) - 1);
- else
- tcg_gen_mov_tl(cpu_tmp0, t1);
-
- gen_extu(ot, t0);
- tcg_gen_mov_tl(t2, t0);
-
- data_bits = 8 << ot;
- /* XXX: rely on behaviour of shifts when operand 2 overflows (XXX:
- fix TCG definition) */
- if (is_right) {
- tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp0);
- tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
- tcg_gen_shl_tl(t0, t0, cpu_tmp0);
- } else {
- tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp0);
- tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
- tcg_gen_shr_tl(t0, t0, cpu_tmp0);
+ switch (ot) {
+ case OT_BYTE:
+ /* Replicate the 8-bit input so that a 32-bit rotate works. */
+ tcg_gen_ext8u_tl(cpu_T[0], cpu_T[0]);
+ tcg_gen_muli_tl(cpu_T[0], cpu_T[0], 0x01010101);
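+ /* e.g. 0xAB becomes 0xABABABAB; rotating the replicated pattern
+ by any count leaves the proper 8-bit rotate in the low byte. */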
+ goto do_long;
+ case OT_WORD:
+ /* Replicate the 16-bit input so that a 32-bit rotate works. */
+ tcg_gen_deposit_tl(cpu_T[0], cpu_T[0], cpu_T[0], 16, 16);
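+ /* e.g. 0x1234 becomes 0x12341234 in the low 32 bits. */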
+ goto do_long;
+ do_long:
+#ifdef TARGET_X86_64
+ case OT_LONG:
+ tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+ tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+ if (is_right) {
+ tcg_gen_rotr_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+ } else {
+ tcg_gen_rotl_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+ }
+ tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+ break;
+#endif
+ default:
+ if (is_right) {
+ tcg_gen_rotr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+ } else {
+ tcg_gen_rotl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+ }
+ break;
}
- tcg_gen_or_tl(t0, t0, cpu_tmp4);
- gen_set_label(label1);
/* store */
if (op1 == OR_TMP0) {
- gen_op_st_v(ot + s->mem_index, t0, a0);
+ gen_op_st_T0_A0(ot + s->mem_index);
} else {
- gen_op_mov_reg_v(ot, op1, t0);
+ gen_op_mov_reg_T0(ot, op1);
}
-
- /* update eflags. It is needed anyway most of the time, do it always. */
- gen_compute_eflags(s);
- assert(s->cc_op == CC_OP_EFLAGS);
- label2 = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label2);
+ /* We'll need the flags computed into CC_SRC. */
+ gen_compute_eflags(s);
- tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
- tcg_gen_xor_tl(cpu_tmp0, t2, t0);
- tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
- tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
- tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+ /* The value that was "rotated out" is now present at the other end
+ of the word. Compute C into CC_DST and O into CC_SRC2. Note that
+ since we've computed the flags into CC_SRC, these variables are
+ currently dead. */
if (is_right) {
- tcg_gen_shri_tl(t0, t0, data_bits - 1);
+ tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+ tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+ tcg_gen_andi_tl(cpu_cc_dst, cpu_cc_dst, 1);
+ } else {
+ tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+ tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
}
- tcg_gen_andi_tl(t0, t0, CC_C);
- tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
-
- gen_set_label(label2);
-
- tcg_temp_free(t0);
- tcg_temp_free(t1);
- tcg_temp_free(t2);
- tcg_temp_free(a0);
+ tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+ tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
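+ /* CF is thus the bit that was rotated out (it reappears at the
+ other end of the result); OF matches the architected value for
+ a rotate by 1 and is undefined for other counts. */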
+
+ /* Now conditionally store the new CC_OP value. If the shift count
+ is 0 we keep the CC_OP_EFLAGS setting so that only CC_SRC is live.
+ Otherwise reuse CC_OP_ADCOX which has the C and O flags split out
+ exactly as we computed above. */
+ t0 = tcg_const_i32(0);
+ t1 = tcg_temp_new_i32();
+ tcg_gen_trunc_tl_i32(t1, cpu_T[1]);
+ tcg_gen_movi_i32(cpu_tmp2_i32, CC_OP_ADCOX);
+ tcg_gen_movi_i32(cpu_tmp3_i32, CC_OP_EFLAGS);
+ tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, t1, t0,
+ cpu_tmp2_i32, cpu_tmp3_i32);
+ tcg_temp_free_i32(t0);
+ tcg_temp_free_i32(t1);
+
+ /* The CC_OP value is no longer predictable. */
+ set_cc_op(s, CC_OP_DYNAMIC);
}
static void gen_rot_rm_im(DisasContext *s, int ot, int op1, int op2,
int is_right)
{
- int mask;
- int data_bits;
- TCGv t0, t1, a0;
-
- /* XXX: inefficient, but we must use local temps */
- t0 = tcg_temp_local_new();
- t1 = tcg_temp_local_new();
- a0 = tcg_temp_local_new();
-
- if (ot == OT_QUAD)
- mask = 0x3f;
- else
- mask = 0x1f;
+ int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+ int shift;
/* load */
if (op1 == OR_TMP0) {
- tcg_gen_mov_tl(a0, cpu_A0);
- gen_op_ld_v(ot + s->mem_index, t0, a0);
+ gen_op_ld_T0_A0(ot + s->mem_index);
} else {
- gen_op_mov_v_reg(ot, t0, op1);
+ gen_op_mov_TN_reg(ot, 0, op1);
}
- gen_extu(ot, t0);
- tcg_gen_mov_tl(t1, t0);
-
op2 &= mask;
- data_bits = 8 << ot;
if (op2 != 0) {
- int shift = op2 & ((1 << (3 + ot)) - 1);
- if (is_right) {
- tcg_gen_shri_tl(cpu_tmp4, t0, shift);
- tcg_gen_shli_tl(t0, t0, data_bits - shift);
- }
- else {
- tcg_gen_shli_tl(cpu_tmp4, t0, shift);
- tcg_gen_shri_tl(t0, t0, data_bits - shift);
+ switch (ot) {
+#ifdef TARGET_X86_64
+ case OT_LONG:
+ tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+ if (is_right) {
+ tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+ } else {
+ tcg_gen_rotli_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+ }
+ tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+ break;
+#endif
+ default:
+ if (is_right) {
+ tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], op2);
+ } else {
+ tcg_gen_rotli_tl(cpu_T[0], cpu_T[0], op2);
+ }
+ break;
+ case OT_BYTE:
+ mask = 7;
+ goto do_shifts;
+ case OT_WORD:
+ mask = 15;
+ do_shifts:
+ shift = op2 & mask;
+ if (is_right) {
+ shift = mask + 1 - shift;
+ }
+ gen_extu(ot, cpu_T[0]);
+ tcg_gen_shli_tl(cpu_tmp0, cpu_T[0], shift);
+ tcg_gen_shri_tl(cpu_T[0], cpu_T[0], mask + 1 - shift);
+ tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+ break;
}
- tcg_gen_or_tl(t0, t0, cpu_tmp4);
}
/* store */
if (op1 == OR_TMP0) {
- gen_op_st_v(ot + s->mem_index, t0, a0);
+ gen_op_st_T0_A0(ot + s->mem_index);
} else {
- gen_op_mov_reg_v(ot, op1, t0);
+ gen_op_mov_reg_T0(ot, op1);
}
if (op2 != 0) {
- /* update eflags */
+ /* Compute the flags into CC_SRC. */
gen_compute_eflags(s);
- assert(s->cc_op == CC_OP_EFLAGS);
- tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
- tcg_gen_xor_tl(cpu_tmp0, t1, t0);
- tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
- tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
- tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+ /* The value that was "rotated out" is now present at the other end
+ of the word. Compute C into CC_DST and O into CC_SRC2. Note that
+ since we've computed the flags into CC_SRC, these variables are
+ currently dead. */
if (is_right) {
- tcg_gen_shri_tl(t0, t0, data_bits - 1);
+ tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+ tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+ tcg_gen_andi_tl(cpu_cc_dst, cpu_cc_dst, 1);
+ } else {
+ tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+ tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
}
- tcg_gen_andi_tl(t0, t0, CC_C);
- tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
+ tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+ tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
+ set_cc_op(s, CC_OP_ADCOX);
}
-
- tcg_temp_free(t0);
- tcg_temp_free(t1);
- tcg_temp_free(a0);
}
/* XXX: add faster immediate = 1 case */
}
/* XXX: add faster immediate case */
-static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1,
- int is_right)
+static void gen_shiftd_rm_T1(DisasContext *s, int ot, int op1,
+ bool is_right, TCGv count_in)
{
- int label1, label2, data_bits;
- target_ulong mask;
- TCGv t0, t1, t2, a0;
-
- t0 = tcg_temp_local_new();
- t1 = tcg_temp_local_new();
- t2 = tcg_temp_local_new();
- a0 = tcg_temp_local_new();
-
- if (ot == OT_QUAD)
- mask = 0x3f;
- else
- mask = 0x1f;
+ target_ulong mask = (ot == OT_QUAD ? 63 : 31);
+ TCGv count;
/* load */
if (op1 == OR_TMP0) {
- tcg_gen_mov_tl(a0, cpu_A0);
- gen_op_ld_v(ot + s->mem_index, t0, a0);
+ gen_op_ld_T0_A0(ot + s->mem_index);
} else {
- gen_op_mov_v_reg(ot, t0, op1);
+ gen_op_mov_TN_reg(ot, 0, op1);
}
- tcg_gen_andi_tl(cpu_T3, cpu_T3, mask);
+ count = tcg_temp_new();
+ tcg_gen_andi_tl(count, count_in, mask);
- tcg_gen_mov_tl(t1, cpu_T[1]);
- tcg_gen_mov_tl(t2, cpu_T3);
-
- /* Must test zero case to avoid using undefined behaviour in TCG
- shifts. */
- label1 = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
-
- tcg_gen_addi_tl(cpu_tmp5, t2, -1);
- if (ot == OT_WORD) {
- /* Note: we implement the Intel behaviour for shift count > 16 */
+ switch (ot) {
+ case OT_WORD:
+ /* Note: we implement the Intel behaviour for shift count > 16.
+ This means "shrdw C, B, A" shifts A:B:A >> C. Build the B:A
+ portion by constructing it as a 32-bit value. */
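+ /* For example, with a count of 20 the low word of the result is
+ bits [35:20] of A:B:A, i.e. (B >> 4) | ((A & 0xf) << 12). */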
if (is_right) {
- tcg_gen_andi_tl(t0, t0, 0xffff);
- tcg_gen_shli_tl(cpu_tmp0, t1, 16);
- tcg_gen_or_tl(t0, t0, cpu_tmp0);
- tcg_gen_ext32u_tl(t0, t0);
-
- tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
-
- /* only needed if count > 16, but a test would complicate */
- tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
- tcg_gen_shl_tl(cpu_tmp0, t0, cpu_tmp5);
-
- tcg_gen_shr_tl(t0, t0, t2);
-
- tcg_gen_or_tl(t0, t0, cpu_tmp0);
+ tcg_gen_deposit_tl(cpu_tmp0, cpu_T[0], cpu_T[1], 16, 16);
+ tcg_gen_mov_tl(cpu_T[1], cpu_T[0]);
+ tcg_gen_mov_tl(cpu_T[0], cpu_tmp0);
} else {
- /* XXX: not optimal */
- tcg_gen_andi_tl(t0, t0, 0xffff);
- tcg_gen_shli_tl(t1, t1, 16);
- tcg_gen_or_tl(t1, t1, t0);
- tcg_gen_ext32u_tl(t1, t1);
-
- tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
- tcg_gen_subfi_tl(cpu_tmp0, 32, cpu_tmp5);
- tcg_gen_shr_tl(cpu_tmp5, t1, cpu_tmp0);
- tcg_gen_or_tl(cpu_tmp4, cpu_tmp4, cpu_tmp5);
-
- tcg_gen_shl_tl(t0, t0, t2);
- tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
- tcg_gen_shr_tl(t1, t1, cpu_tmp5);
- tcg_gen_or_tl(t0, t0, t1);
+ tcg_gen_deposit_tl(cpu_T[1], cpu_T[0], cpu_T[1], 16, 16);
}
- } else {
- data_bits = 8 << ot;
+ /* FALLTHRU */
+#ifdef TARGET_X86_64
+ case OT_LONG:
+ /* Concatenate the two 32-bit values and use a 64-bit shift. */
+ tcg_gen_subi_tl(cpu_tmp0, count, 1);
if (is_right) {
- if (ot == OT_LONG)
- tcg_gen_ext32u_tl(t0, t0);
-
- tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
+ tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[0], cpu_T[1]);
+ tcg_gen_shr_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+ tcg_gen_shr_i64(cpu_T[0], cpu_T[0], count);
+ } else {
+ tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[1], cpu_T[0]);
+ tcg_gen_shl_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+ tcg_gen_shl_i64(cpu_T[0], cpu_T[0], count);
+ tcg_gen_shri_i64(cpu_tmp0, cpu_tmp0, 32);
+ tcg_gen_shri_i64(cpu_T[0], cpu_T[0], 32);
+ }
+ break;
+#endif
+ default:
+ tcg_gen_subi_tl(cpu_tmp0, count, 1);
+ if (is_right) {
+ tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
- tcg_gen_shr_tl(t0, t0, t2);
- tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
- tcg_gen_shl_tl(t1, t1, cpu_tmp5);
- tcg_gen_or_tl(t0, t0, t1);
-
+ tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+ tcg_gen_shr_tl(cpu_T[0], cpu_T[0], count);
+ tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
} else {
- if (ot == OT_LONG)
- tcg_gen_ext32u_tl(t1, t1);
-
- tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-
- tcg_gen_shl_tl(t0, t0, t2);
- tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
- tcg_gen_shr_tl(t1, t1, cpu_tmp5);
- tcg_gen_or_tl(t0, t0, t1);
+ tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+ if (ot == OT_WORD) {
+ /* Only needed if count > 16, for Intel behaviour. */
+ tcg_gen_subfi_tl(cpu_tmp4, 33, count);
+ tcg_gen_shr_tl(cpu_tmp4, cpu_T[1], cpu_tmp4);
+ tcg_gen_or_tl(cpu_tmp0, cpu_tmp0, cpu_tmp4);
+ }
+
+ tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+ tcg_gen_shl_tl(cpu_T[0], cpu_T[0], count);
+ tcg_gen_shr_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
}
+ tcg_gen_movi_tl(cpu_tmp4, 0);
+ tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[1], count, cpu_tmp4,
+ cpu_tmp4, cpu_T[1]);
+ tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+ break;
}
- tcg_gen_mov_tl(t1, cpu_tmp4);
- gen_set_label(label1);
/* store */
if (op1 == OR_TMP0) {
- gen_op_st_v(ot + s->mem_index, t0, a0);
- } else {
- gen_op_mov_reg_v(ot, op1, t0);
- }
-
- /* update eflags */
- gen_update_cc_op(s);
-
- label2 = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label2);
-
- tcg_gen_mov_tl(cpu_cc_src, t1);
- tcg_gen_mov_tl(cpu_cc_dst, t0);
- if (is_right) {
- tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
+ gen_op_st_T0_A0(ot + s->mem_index);
} else {
- tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
+ gen_op_mov_reg_T0(ot, op1);
}
- gen_set_label(label2);
- set_cc_op(s, CC_OP_DYNAMIC); /* cannot predict flags after */
- tcg_temp_free(t0);
- tcg_temp_free(t1);
- tcg_temp_free(t2);
- tcg_temp_free(a0);
+ gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, count, is_right);
+ tcg_temp_free(count);
}
static void gen_shift(DisasContext *s1, int op, int ot, int d, int s)
int l1, l2;
if (s->jmp_opt) {
- gen_update_cc_op(s);
l1 = gen_new_label();
gen_jcc1(s, b, l1);
- set_cc_op(s, CC_OP_DYNAMIC);
-
+
gen_goto_tb(s, 0, next_eip);
gen_set_label(l1);
}
}
-static void gen_setcc(DisasContext *s, int b)
+static void gen_cmovcc1(CPUX86State *env, DisasContext *s, int ot, int b,
+ int modrm, int reg)
{
- int inv, jcc_op, l1;
- TCGv t0;
+ CCPrepare cc;
- if (is_fast_jcc_case(s, b)) {
- /* nominal case: we use a jump */
- /* XXX: make it faster by adding new instructions in TCG */
- t0 = tcg_temp_local_new();
- tcg_gen_movi_tl(t0, 0);
- l1 = gen_new_label();
- gen_jcc1(s, b ^ 1, l1);
- tcg_gen_movi_tl(t0, 1);
- gen_set_label(l1);
- tcg_gen_mov_tl(cpu_T[0], t0);
- tcg_temp_free(t0);
- } else {
- /* slow case: it is more efficient not to generate a jump,
- although it is questionnable whether this optimization is
- worth to */
- inv = b & 1;
- jcc_op = (b >> 1) & 7;
- gen_setcc_slow_T0(s, jcc_op);
- if (inv) {
- tcg_gen_xori_tl(cpu_T[0], cpu_T[0], 1);
- }
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+ cc = gen_prepare_cc(s, b, cpu_T[1]);
+ if (cc.mask != -1) {
+ TCGv t0 = tcg_temp_new();
+ tcg_gen_andi_tl(t0, cc.reg, cc.mask);
+ cc.reg = t0;
+ }
+ if (!cc.use_reg2) {
+ cc.reg2 = tcg_const_tl(cc.imm);
+ }
+
+ tcg_gen_movcond_tl(cc.cond, cpu_T[0], cc.reg, cc.reg2,
+ cpu_T[0], cpu_regs[reg]);
+ gen_op_mov_reg_T0(ot, reg);
+
+ if (cc.mask != -1) {
+ tcg_temp_free(cc.reg);
+ }
+ if (!cc.use_reg2) {
+ tcg_temp_free(cc.reg2);
}
}
direct call to the next block may occur */
static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num)
{
+ gen_update_cc_op(s);
+ set_cc_op(s, CC_OP_DYNAMIC);
if (s->jmp_opt) {
- gen_update_cc_op(s);
gen_goto_tb(s, tb_num, eip);
s->is_jmp = DISAS_TB_JUMP;
} else {
[0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
(SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
- [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */
- [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */
+ /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX. */
+ [0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
+ [0x3a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
/* MMX ops and their SSE extensions */
[0x60] = MMX_OP2(punpcklbw),
#define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 }
#define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 }
#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 }
+#define PCLMULQDQ_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, \
+ CPUID_EXT_PCLMULQDQ }
+#define AESNI_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_AES }
static const struct SSEOpHelper_epp sse_op_table6[256] = {
[0x00] = SSSE3_OP(pshufb),
[0x3f] = SSE41_OP(pmaxud),
[0x40] = SSE41_OP(pmulld),
[0x41] = SSE41_OP(phminposuw),
+ [0xdb] = AESNI_OP(aesimc),
+ [0xdc] = AESNI_OP(aesenc),
+ [0xdd] = AESNI_OP(aesenclast),
+ [0xde] = AESNI_OP(aesdec),
+ [0xdf] = AESNI_OP(aesdeclast),
};
static const struct SSEOpHelper_eppi sse_op_table7[256] = {
[0x40] = SSE41_OP(dpps),
[0x41] = SSE41_OP(dppd),
[0x42] = SSE41_OP(mpsadbw),
+ [0x44] = PCLMULQDQ_OP(pclmulqdq),
[0x60] = SSE42_OP(pcmpestrm),
[0x61] = SSE42_OP(pcmpestri),
[0x62] = SSE42_OP(pcmpistrm),
[0x63] = SSE42_OP(pcmpistri),
+ [0xdf] = AESNI_OP(aeskeygenassist),
};
static void gen_sse(CPUX86State *env, DisasContext *s, int b,
reg = ((modrm >> 3) & 7) | rex_r;
gen_op_mov_reg_T0(OT_LONG, reg);
break;
+
case 0x138:
- if (s->prefix & PREFIX_REPNZ)
- goto crc32;
case 0x038:
b = modrm;
+ if ((b & 0xf0) == 0xf0) {
+ goto do_0f_38_fx;
+ }
modrm = cpu_ldub_code(env, s->pc++);
rm = modrm & 7;
reg = ((modrm >> 3) & 7) | rex_r;
tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, op2_offset +
offsetof(XMMReg, XMM_L(0)));
break;
- case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */
- tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0,
- (s->mem_index >> 2) - 1);
- tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset +
- offsetof(XMMReg, XMM_W(0)));
+ case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */
+ tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0,
+ (s->mem_index >> 2) - 1);
+ tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset +
+ offsetof(XMMReg, XMM_W(0)));
+ break;
+ case 0x2a: /* movntqda */
+ gen_ldo_env_A0(s->mem_index, op1_offset);
+ return;
+ default:
+ gen_ldo_env_A0(s->mem_index, op2_offset);
+ }
+ }
+ } else {
+ op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
+ if (mod == 3) {
+ op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+ } else {
+ op2_offset = offsetof(CPUX86State,mmx_t0);
+ gen_lea_modrm(env, s, modrm, ®_addr, &offset_addr);
+ gen_ldq_env_A0(s->mem_index, op2_offset);
+ }
+ }
+ if (sse_fn_epp == SSE_SPECIAL) {
+ goto illegal_op;
+ }
+
+ tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
+ tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
+ sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
+
+ if (b == 0x17) {
+ set_cc_op(s, CC_OP_EFLAGS);
+ }
+ break;
+
+ case 0x238:
+ case 0x338:
+ do_0f_38_fx:
+ /* Various integer extensions at 0f 38 f[0-f]. */
+ b = modrm | (b1 << 8);
+ modrm = cpu_ldub_code(env, s->pc++);
+ reg = ((modrm >> 3) & 7) | rex_r;
+
+ switch (b) {
+ case 0x3f0: /* crc32 Gd,Eb */
+ case 0x3f1: /* crc32 Gd,Ey */
+ do_crc32:
+ if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) {
+ goto illegal_op;
+ }
+ if ((b & 0xff) == 0xf0) {
+ ot = OT_BYTE;
+ } else if (s->dflag != 2) {
+ ot = (s->prefix & PREFIX_DATA ? OT_WORD : OT_LONG);
+ } else {
+ ot = OT_QUAD;
+ }
+
+ gen_op_mov_TN_reg(OT_LONG, 0, reg);
+ tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ gen_helper_crc32(cpu_T[0], cpu_tmp2_i32,
+ cpu_T[0], tcg_const_i32(8 << ot));
+
+ ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
+ gen_op_mov_reg_T0(ot, reg);
+ break;
+
+ case 0x1f0: /* crc32 or movbe */
+ case 0x1f1:
+ /* For these insns, the f3 prefix is supposed to have priority
+ over the 66 prefix, but that's not what we implement when
+ setting b1 above. */
+ if (s->prefix & PREFIX_REPNZ) {
+ goto do_crc32;
+ }
+ /* FALLTHRU */
+ case 0x0f0: /* movbe Gy,My */
+ case 0x0f1: /* movbe My,Gy */
+ if (!(s->cpuid_ext_features & CPUID_EXT_MOVBE)) {
+ goto illegal_op;
+ }
+ if (s->dflag != 2) {
+ ot = (s->prefix & PREFIX_DATA ? OT_WORD : OT_LONG);
+ } else {
+ ot = OT_QUAD;
+ }
+
+ /* Load the data incoming to the bswap. Note that the TCG
+ implementation of bswap requires the input be zero
+ extended. In the case of the loads, we simply know that
+ gen_op_ld_v via gen_ldst_modrm does that already. */
+ if ((b & 1) == 0) {
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ } else {
+ switch (ot) {
+ case OT_WORD:
+ tcg_gen_ext16u_tl(cpu_T[0], cpu_regs[reg]);
+ break;
+ default:
+ tcg_gen_ext32u_tl(cpu_T[0], cpu_regs[reg]);
+ break;
+ case OT_QUAD:
+ tcg_gen_mov_tl(cpu_T[0], cpu_regs[reg]);
+ break;
+ }
+ }
+
+ switch (ot) {
+ case OT_WORD:
+ tcg_gen_bswap16_tl(cpu_T[0], cpu_T[0]);
+ break;
+ default:
+ tcg_gen_bswap32_tl(cpu_T[0], cpu_T[0]);
+ break;
+#ifdef TARGET_X86_64
+ case OT_QUAD:
+ tcg_gen_bswap64_tl(cpu_T[0], cpu_T[0]);
+ break;
+#endif
+ }
+
+ if ((b & 1) == 0) {
+ gen_op_mov_reg_T0(ot, reg);
+ } else {
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 1);
+ }
+ break;
+
+ case 0x0f2: /* andn Gy, By, Ey */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ tcg_gen_andc_tl(cpu_T[0], cpu_regs[s->vex_v], cpu_T[0]);
+ gen_op_mov_reg_T0(ot, reg);
+ gen_op_update1_cc();
+ set_cc_op(s, CC_OP_LOGICB + ot);
+ break;
+
+ case 0x0f7: /* bextr Gy, Ey, By */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+ {
+ TCGv bound, zero;
+
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ /* Extract START, and shift the operand.
+ Shifts larger than operand size get zeros. */
+ tcg_gen_ext8u_tl(cpu_A0, cpu_regs[s->vex_v]);
+ tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_A0);
+
+ bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+ zero = tcg_const_tl(0);
+ tcg_gen_movcond_tl(TCG_COND_LEU, cpu_T[0], cpu_A0, bound,
+ cpu_T[0], zero);
+ tcg_temp_free(zero);
+
+ /* Extract the LEN into a mask. Lengths larger than
+ operand size get all ones. */
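+ /* e.g. LEN = 8 produces the mask (1 << 8) - 1 = 0xff below. */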
+ tcg_gen_shri_tl(cpu_A0, cpu_regs[s->vex_v], 8);
+ tcg_gen_ext8u_tl(cpu_A0, cpu_A0);
+ tcg_gen_movcond_tl(TCG_COND_LEU, cpu_A0, cpu_A0, bound,
+ cpu_A0, bound);
+ tcg_temp_free(bound);
+ tcg_gen_movi_tl(cpu_T[1], 1);
+ tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_A0);
+ tcg_gen_subi_tl(cpu_T[1], cpu_T[1], 1);
+ tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+
+ gen_op_mov_reg_T0(ot, reg);
+ gen_op_update1_cc();
+ set_cc_op(s, CC_OP_LOGICB + ot);
+ }
+ break;
+
+ case 0x0f5: /* bzhi Gy, Ey, By */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ tcg_gen_ext8u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+ {
+ TCGv bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+ /* Note that since we're using BMILG (in order to get O
+ cleared) we need to store the inverse into C. */
+ tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src,
+ cpu_T[1], bound);
+ tcg_gen_movcond_tl(TCG_COND_GT, cpu_T[1], cpu_T[1],
+ bound, bound, cpu_T[1]);
+ tcg_temp_free(bound);
+ }
+ tcg_gen_movi_tl(cpu_A0, -1);
+ tcg_gen_shl_tl(cpu_A0, cpu_A0, cpu_T[1]);
+ tcg_gen_andc_tl(cpu_T[0], cpu_T[0], cpu_A0);
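+ /* e.g. a start index of 8 gives T0 & ~(-1 << 8), keeping only
+ bits [7:0] of the source. */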
+ gen_op_mov_reg_T0(ot, reg);
+ gen_op_update1_cc();
+ set_cc_op(s, CC_OP_BMILGB + ot);
+ break;
+
+ case 0x3f6: /* mulx By, Gy, rdx, Ey */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ switch (ot) {
+ default:
+ tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+ tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EDX]);
+ tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+ cpu_tmp2_i32, cpu_tmp3_i32);
+ tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], cpu_tmp2_i32);
+ tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp3_i32);
+ break;
+#ifdef TARGET_X86_64
+ case OT_QUAD:
+ tcg_gen_mulu2_i64(cpu_regs[s->vex_v], cpu_regs[reg],
+ cpu_T[0], cpu_regs[R_EDX]);
+ break;
+#endif
+ }
+ break;
+
+ case 0x3f5: /* pdep Gy, By, Ey */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ /* Note that by zero-extending the mask operand, we
+ automatically handle zero-extending the result. */
+ if (s->dflag == 2) {
+ tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+ } else {
+ tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+ }
+ gen_helper_pdep(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+ break;
+
+ case 0x2f5: /* pext Gy, By, Ey */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ /* Note that by zero-extending the mask operand, we
+ automatically handle zero-extending the result. */
+ if (s->dflag == 2) {
+ tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+ } else {
+ tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+ }
+ gen_helper_pext(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+ break;
+
+ case 0x1f6: /* adcx Gy, Ey */
+ case 0x2f6: /* adox Gy, Ey */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) {
+ goto illegal_op;
+ } else {
+ TCGv carry_in, carry_out, zero;
+ int end_op;
+
+ ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+ /* Re-use the carry-out from a previous round. */
+ TCGV_UNUSED(carry_in);
+ carry_out = (b == 0x1f6 ? cpu_cc_dst : cpu_cc_src2);
+ switch (s->cc_op) {
+ case CC_OP_ADCX:
+ if (b == 0x1f6) {
+ carry_in = cpu_cc_dst;
+ end_op = CC_OP_ADCX;
+ } else {
+ end_op = CC_OP_ADCOX;
+ }
+ break;
+ case CC_OP_ADOX:
+ if (b == 0x1f6) {
+ end_op = CC_OP_ADCOX;
+ } else {
+ carry_in = cpu_cc_src2;
+ end_op = CC_OP_ADOX;
+ }
+ break;
+ case CC_OP_ADCOX:
+ end_op = CC_OP_ADCOX;
+ carry_in = carry_out;
+ break;
+ default:
+ end_op = (b == 0x1f6 ? CC_OP_ADCX : CC_OP_ADOX);
+ break;
+ }
+ /* If we can't reuse carry-out, get it out of EFLAGS. */
+ if (TCGV_IS_UNUSED(carry_in)) {
+ if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
+ gen_compute_eflags(s);
+ }
+ carry_in = cpu_tmp0;
+ tcg_gen_shri_tl(carry_in, cpu_cc_src,
+ ctz32(b == 0x1f6 ? CC_C : CC_O));
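+ /* CC_C is bit 0 and CC_O is bit 11 of EFLAGS, so the
+ shift count is 0 or 11 respectively. */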
+ tcg_gen_andi_tl(carry_in, carry_in, 1);
+ }
+
+ switch (ot) {
+#ifdef TARGET_X86_64
+ case OT_LONG:
+ /* If we know TL is 64-bit, and we want a 32-bit
+ result, just do everything in 64-bit arithmetic. */
+ tcg_gen_ext32u_i64(cpu_regs[reg], cpu_regs[reg]);
+ tcg_gen_ext32u_i64(cpu_T[0], cpu_T[0]);
+ tcg_gen_add_i64(cpu_T[0], cpu_T[0], cpu_regs[reg]);
+ tcg_gen_add_i64(cpu_T[0], cpu_T[0], carry_in);
+ tcg_gen_ext32u_i64(cpu_regs[reg], cpu_T[0]);
+ tcg_gen_shri_i64(carry_out, cpu_T[0], 32);
break;
- case 0x2a: /* movntqda */
- gen_ldo_env_A0(s->mem_index, op1_offset);
- return;
+#endif
default:
- gen_ldo_env_A0(s->mem_index, op2_offset);
+ /* Otherwise compute the carry-out in two steps. */
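+ /* dest + src + carry_in can carry out of the word in
+ either addition but never in both, so the two partial
+ carry-outs accumulate safely in carry_out. */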
+ zero = tcg_const_tl(0);
+ tcg_gen_add2_tl(cpu_T[0], carry_out,
+ cpu_T[0], zero,
+ carry_in, zero);
+ tcg_gen_add2_tl(cpu_regs[reg], carry_out,
+ cpu_regs[reg], carry_out,
+ cpu_T[0], zero);
+ tcg_temp_free(zero);
+ break;
}
+ set_cc_op(s, end_op);
}
- } else {
- op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
- if (mod == 3) {
- op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+ break;
+
+ case 0x1f7: /* shlx Gy, Ey, By */
+ case 0x2f7: /* sarx Gy, Ey, By */
+ case 0x3f7: /* shrx Gy, Ey, By */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ if (ot == OT_QUAD) {
+ tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 63);
} else {
- op2_offset = offsetof(CPUX86State,mmx_t0);
- gen_lea_modrm(env, s, modrm, ®_addr, &offset_addr);
- gen_ldq_env_A0(s->mem_index, op2_offset);
+ tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 31);
}
- }
- if (sse_fn_epp == SSE_SPECIAL) {
- goto illegal_op;
- }
-
- tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
- tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
- sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
+ if (b == 0x1f7) {
+ tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+ } else if (b == 0x2f7) {
+ if (ot != OT_QUAD) {
+ tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
+ }
+ tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+ } else {
+ if (ot != OT_QUAD) {
+ tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
+ }
+ tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+ }
+ gen_op_mov_reg_T0(ot, reg);
+ break;
- if (b == 0x17) {
- set_cc_op(s, CC_OP_EFLAGS);
- }
- break;
- case 0x338: /* crc32 */
- crc32:
- b = modrm;
- modrm = cpu_ldub_code(env, s->pc++);
- reg = ((modrm >> 3) & 7) | rex_r;
+ case 0x0f3:
+ case 0x1f3:
+ case 0x2f3:
+ case 0x3f3: /* Group 17 */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+ switch (reg & 7) {
+ case 1: /* blsr By,Ey */
+ tcg_gen_neg_tl(cpu_T[1], cpu_T[0]);
+ tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+ gen_op_mov_reg_T0(ot, s->vex_v);
+ gen_op_update2_cc();
+ set_cc_op(s, CC_OP_BMILGB + ot);
+ break;
- if (b != 0xf0 && b != 0xf1)
- goto illegal_op;
- if (!(s->cpuid_ext_features & CPUID_EXT_SSE42))
- goto illegal_op;
+ case 2: /* blsmsk By,Ey */
+ tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+ tcg_gen_subi_tl(cpu_T[0], cpu_T[0], 1);
+ tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+ set_cc_op(s, CC_OP_BMILGB + ot);
+ break;
- if (b == 0xf0)
- ot = OT_BYTE;
- else if (b == 0xf1 && s->dflag != 2)
- if (s->prefix & PREFIX_DATA)
- ot = OT_WORD;
- else
- ot = OT_LONG;
- else
- ot = OT_QUAD;
+ case 3: /* blsi By, Ey */
+ tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+ tcg_gen_subi_tl(cpu_T[0], cpu_T[0], 1);
+ tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+ set_cc_op(s, CC_OP_BMILGB + ot);
+ break;
- gen_op_mov_TN_reg(OT_LONG, 0, reg);
- tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
- gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
- gen_helper_crc32(cpu_T[0], cpu_tmp2_i32,
- cpu_T[0], tcg_const_i32(8 << ot));
+ default:
+ goto illegal_op;
+ }
+ break;
- ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
- gen_op_mov_reg_T0(ot, reg);
+ default:
+ goto illegal_op;
+ }
break;
+
case 0x03a:
case 0x13a:
b = modrm;
if (mod == 3)
gen_op_mov_TN_reg(OT_LONG, 0, rm);
else
- tcg_gen_qemu_ld8u(cpu_tmp0, cpu_A0,
+ tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0,
(s->mem_index >> 2) - 1);
- tcg_gen_st8_tl(cpu_tmp0, cpu_env, offsetof(CPUX86State,
+ tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
xmm_regs[reg].XMM_B(val & 15)));
break;
case 0x21: /* insertps */
tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
sse_fn_eppi(cpu_env, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
break;
+
+ case 0x33a:
+ /* Various integer extensions at 0f 3a f[0-f]. */
+ b = modrm | (b1 << 8);
+ modrm = cpu_ldub_code(env, s->pc++);
+ reg = ((modrm >> 3) & 7) | rex_r;
+
+ switch (b) {
+ case 0x3f0: /* rorx Gy,Ey, Ib */
+ if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+ || !(s->prefix & PREFIX_VEX)
+ || s->vex_l != 0) {
+ goto illegal_op;
+ }
+ ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ b = cpu_ldub_code(env, s->pc++);
+ if (ot == OT_QUAD) {
+ tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], b & 63);
+ } else {
+ tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+ tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, b & 31);
+ tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+ }
+ gen_op_mov_reg_T0(ot, reg);
+ break;
+
+ default:
+ goto illegal_op;
+ }
+ break;
+
default:
goto illegal_op;
}
}
s->pc = pc_start;
prefixes = 0;
- aflag = s->code32;
- dflag = s->code32;
s->override = -1;
rex_w = -1;
rex_r = 0;
x86_64_hregs = 0;
#endif
s->rip_offset = 0; /* for relative ip address */
+ s->vex_l = 0;
+ s->vex_v = 0;
next_byte:
b = cpu_ldub_code(env, s->pc);
s->pc++;
- /* check prefixes */
+ /* Collect prefixes. */
+ switch (b) {
+ case 0xf3:
+ prefixes |= PREFIX_REPZ;
+ goto next_byte;
+ case 0xf2:
+ prefixes |= PREFIX_REPNZ;
+ goto next_byte;
+ case 0xf0:
+ prefixes |= PREFIX_LOCK;
+ goto next_byte;
+ case 0x2e:
+ s->override = R_CS;
+ goto next_byte;
+ case 0x36:
+ s->override = R_SS;
+ goto next_byte;
+ case 0x3e:
+ s->override = R_DS;
+ goto next_byte;
+ case 0x26:
+ s->override = R_ES;
+ goto next_byte;
+ case 0x64:
+ s->override = R_FS;
+ goto next_byte;
+ case 0x65:
+ s->override = R_GS;
+ goto next_byte;
+ case 0x66:
+ prefixes |= PREFIX_DATA;
+ goto next_byte;
+ case 0x67:
+ prefixes |= PREFIX_ADR;
+ goto next_byte;
#ifdef TARGET_X86_64
- if (CODE64(s)) {
- switch (b) {
- case 0xf3:
- prefixes |= PREFIX_REPZ;
- goto next_byte;
- case 0xf2:
- prefixes |= PREFIX_REPNZ;
- goto next_byte;
- case 0xf0:
- prefixes |= PREFIX_LOCK;
- goto next_byte;
- case 0x2e:
- s->override = R_CS;
- goto next_byte;
- case 0x36:
- s->override = R_SS;
- goto next_byte;
- case 0x3e:
- s->override = R_DS;
- goto next_byte;
- case 0x26:
- s->override = R_ES;
- goto next_byte;
- case 0x64:
- s->override = R_FS;
- goto next_byte;
- case 0x65:
- s->override = R_GS;
- goto next_byte;
- case 0x66:
- prefixes |= PREFIX_DATA;
- goto next_byte;
- case 0x67:
- prefixes |= PREFIX_ADR;
- goto next_byte;
- case 0x40 ... 0x4f:
+ case 0x40 ... 0x4f:
+ if (CODE64(s)) {
/* REX prefix */
rex_w = (b >> 3) & 1;
rex_r = (b & 0x4) << 1;
x86_64_hregs = 1; /* select uniform byte register addressing */
goto next_byte;
}
- if (rex_w == 1) {
- /* 0x66 is ignored if rex.w is set */
- dflag = 2;
- } else {
- if (prefixes & PREFIX_DATA)
- dflag ^= 1;
- }
- if (!(prefixes & PREFIX_ADR))
- aflag = 2;
- } else
+ break;
#endif
- {
- switch (b) {
- case 0xf3:
- prefixes |= PREFIX_REPZ;
- goto next_byte;
- case 0xf2:
- prefixes |= PREFIX_REPNZ;
- goto next_byte;
- case 0xf0:
- prefixes |= PREFIX_LOCK;
- goto next_byte;
- case 0x2e:
- s->override = R_CS;
- goto next_byte;
- case 0x36:
- s->override = R_SS;
- goto next_byte;
- case 0x3e:
- s->override = R_DS;
- goto next_byte;
- case 0x26:
- s->override = R_ES;
- goto next_byte;
- case 0x64:
- s->override = R_FS;
- goto next_byte;
- case 0x65:
- s->override = R_GS;
- goto next_byte;
- case 0x66:
- prefixes |= PREFIX_DATA;
- goto next_byte;
- case 0x67:
- prefixes |= PREFIX_ADR;
- goto next_byte;
+ case 0xc5: /* 2-byte VEX */
+ case 0xc4: /* 3-byte VEX */
+        /* VEX prefixes are only recognized in 32-bit and 64-bit code and
+           never in vm86 mode; otherwise 0xc4/0xc5 decode as LES/LDS. */
+ if (s->code32 && !s->vm86) {
+ static const int pp_prefix[4] = {
+ 0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ
+ };
+ int vex3, vex2 = cpu_ldub_code(env, s->pc);
+
+ if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) {
+ /* 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b,
+ otherwise the instruction is LES or LDS. */
+ break;
+ }
+ s->pc++;
+
+ /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */
+ if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ
+ | PREFIX_LOCK | PREFIX_DATA)) {
+ goto illegal_op;
+ }
+#ifdef TARGET_X86_64
+ if (x86_64_hregs) {
+ goto illegal_op;
+ }
+#endif
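+            /* The 2-byte (c5) payload is R.vvvv.L.pp; the 3-byte (c4)
+               payload is R.X.B.m-mmmm followed by W.vvvv.L.pp.  R, X, B
+               and vvvv are stored inverted in the encoding. */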
+ rex_r = (~vex2 >> 4) & 8;
+ if (b == 0xc5) {
+ vex3 = vex2;
+                b = cpu_ldub_code(env, s->pc++) | 0x100;
+ } else {
+#ifdef TARGET_X86_64
+ s->rex_x = (~vex2 >> 3) & 8;
+ s->rex_b = (~vex2 >> 2) & 8;
+#endif
+ vex3 = cpu_ldub_code(env, s->pc++);
+ rex_w = (vex3 >> 7) & 1;
+ switch (vex2 & 0x1f) {
+ case 0x01: /* Implied 0f leading opcode bytes. */
+ b = cpu_ldub_code(env, s->pc++) | 0x100;
+ break;
+ case 0x02: /* Implied 0f 38 leading opcode bytes. */
+ b = 0x138;
+ break;
+ case 0x03: /* Implied 0f 3a leading opcode bytes. */
+ b = 0x13a;
+ break;
+ default: /* Reserved for future use. */
+ goto illegal_op;
+ }
+ }
+ s->vex_v = (~vex3 >> 3) & 0xf;
+ s->vex_l = (vex3 >> 2) & 1;
+ prefixes |= pp_prefix[vex3 & 3] | PREFIX_VEX;
}
- if (prefixes & PREFIX_DATA)
+ break;
+ }
+
+ /* Post-process prefixes. */
+ if (CODE64(s)) {
+ /* In 64-bit mode, the default data size is 32-bit. Select 64-bit
+ data with rex_w, and 16-bit data with 0x66; rex_w takes precedence
+ over 0x66 if both are present. */
+ dflag = (rex_w > 0 ? 2 : prefixes & PREFIX_DATA ? 0 : 1);
+ /* In 64-bit mode, 0x67 selects 32-bit addressing. */
+ aflag = (prefixes & PREFIX_ADR ? 1 : 2);
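+        /* E.g. 0x66 combined with REX.W still gives a 64-bit operand
+           size; 0x66 alone selects 16-bit data. */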
+ } else {
+ /* In 16/32-bit mode, 0x66 selects the opposite data size. */
+ dflag = s->code32;
+ if (prefixes & PREFIX_DATA) {
dflag ^= 1;
- if (prefixes & PREFIX_ADR)
+ }
+ /* In 16/32-bit mode, 0x67 selects the opposite addressing. */
+ aflag = s->code32;
+ if (prefixes & PREFIX_ADR) {
aflag ^= 1;
+ }
}
s->prefix = prefixes;
} else if (op == OP_XORL && rm == reg) {
xor_zero:
/* xor reg, reg optimisation */
+ set_cc_op(s, CC_OP_CLR);
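+            /* CC_OP_CLR encodes the known flags of xor reg,reg: Z and P
+               set, the rest clear, with no cc_dst/cc_src needed. */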
gen_op_movl_T0_0();
- set_cc_op(s, CC_OP_LOGICB + ot);
gen_op_mov_reg_T0(ot, reg);
- gen_op_update1_cc();
break;
} else {
opreg = rm;
break;
default:
case OT_LONG:
-#ifdef TARGET_X86_64
- gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
- tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
- tcg_gen_ext32u_tl(cpu_T[1], cpu_T[1]);
- tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
- gen_op_mov_reg_T0(OT_LONG, R_EAX);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
- gen_op_mov_reg_T0(OT_LONG, R_EDX);
- tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-#else
- {
- TCGv_i64 t0, t1;
- t0 = tcg_temp_new_i64();
- t1 = tcg_temp_new_i64();
- gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
- tcg_gen_extu_i32_i64(t0, cpu_T[0]);
- tcg_gen_extu_i32_i64(t1, cpu_T[1]);
- tcg_gen_mul_i64(t0, t0, t1);
- tcg_gen_trunc_i64_i32(cpu_T[0], t0);
- gen_op_mov_reg_T0(OT_LONG, R_EAX);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_shri_i64(t0, t0, 32);
- tcg_gen_trunc_i64_i32(cpu_T[0], t0);
- gen_op_mov_reg_T0(OT_LONG, R_EDX);
- tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
- }
-#endif
+ tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+ tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+ tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+ cpu_tmp2_i32, cpu_tmp3_i32);
+ tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+ tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+ tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
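+            /* CC_OP_MULL later derives CF and OF from cc_src, i.e. they
+               are set iff the high half of the product in EDX is
+               non-zero. */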
set_cc_op(s, CC_OP_MULL);
break;
#ifdef TARGET_X86_64
case OT_QUAD:
- gen_helper_mulq_EAX_T0(cpu_env, cpu_T[0]);
+ tcg_gen_mulu2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+ cpu_T[0], cpu_regs[R_EAX]);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+ tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
set_cc_op(s, CC_OP_MULQ);
break;
#endif
break;
default:
case OT_LONG:
-#ifdef TARGET_X86_64
- gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
- tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
- tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
- tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
- gen_op_mov_reg_T0(OT_LONG, R_EAX);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
- tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
- tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
- gen_op_mov_reg_T0(OT_LONG, R_EDX);
-#else
- {
- TCGv_i64 t0, t1;
- t0 = tcg_temp_new_i64();
- t1 = tcg_temp_new_i64();
- gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
- tcg_gen_ext_i32_i64(t0, cpu_T[0]);
- tcg_gen_ext_i32_i64(t1, cpu_T[1]);
- tcg_gen_mul_i64(t0, t0, t1);
- tcg_gen_trunc_i64_i32(cpu_T[0], t0);
- gen_op_mov_reg_T0(OT_LONG, R_EAX);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
- tcg_gen_shri_i64(t0, t0, 32);
- tcg_gen_trunc_i64_i32(cpu_T[0], t0);
- gen_op_mov_reg_T0(OT_LONG, R_EDX);
- tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
- }
-#endif
+ tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+ tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+ tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+ cpu_tmp2_i32, cpu_tmp3_i32);
+ tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+ tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+ tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+ tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+ tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
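+            /* cc_src = sign(low half) - high half, which is non-zero
+               exactly when EDX is not the sign extension of EAX: the
+               IMUL overflow/carry condition. */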
set_cc_op(s, CC_OP_MULL);
break;
#ifdef TARGET_X86_64
case OT_QUAD:
- gen_helper_imulq_EAX_T0(cpu_env, cpu_T[0]);
+ tcg_gen_muls2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+ cpu_T[0], cpu_regs[R_EAX]);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+ tcg_gen_sari_tl(cpu_cc_src, cpu_regs[R_EAX], 63);
+ tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_regs[R_EDX]);
set_cc_op(s, CC_OP_MULQ);
break;
#endif
} else {
gen_op_mov_TN_reg(ot, 1, reg);
}
-
-#ifdef TARGET_X86_64
- if (ot == OT_QUAD) {
- gen_helper_imulq_T0_T1(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
- } else
-#endif
- if (ot == OT_LONG) {
+ switch (ot) {
#ifdef TARGET_X86_64
- tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
- tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
- tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
- tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-#else
- {
- TCGv_i64 t0, t1;
- t0 = tcg_temp_new_i64();
- t1 = tcg_temp_new_i64();
- tcg_gen_ext_i32_i64(t0, cpu_T[0]);
- tcg_gen_ext_i32_i64(t1, cpu_T[1]);
- tcg_gen_mul_i64(t0, t0, t1);
- tcg_gen_trunc_i64_i32(cpu_T[0], t0);
- tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
- tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
- tcg_gen_shri_i64(t0, t0, 32);
- tcg_gen_trunc_i64_i32(cpu_T[1], t0);
- tcg_gen_sub_tl(cpu_cc_src, cpu_T[1], cpu_tmp0);
- }
+ case OT_QUAD:
+ tcg_gen_muls2_i64(cpu_regs[reg], cpu_T[1], cpu_T[0], cpu_T[1]);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+ tcg_gen_sari_tl(cpu_cc_src, cpu_cc_dst, 63);
+ tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_T[1]);
+ break;
#endif
- } else {
+ case OT_LONG:
+ tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+ tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+ tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+ cpu_tmp2_i32, cpu_tmp3_i32);
+ tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp2_i32);
+ tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+ tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+ tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
+ break;
+ default:
tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]);
tcg_gen_ext16s_tl(cpu_T[1], cpu_T[1]);
            /* XXX: use 32 bit mul which could be faster */
            tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
            tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
tcg_gen_ext16s_tl(cpu_tmp0, cpu_T[0]);
tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
+ gen_op_mov_reg_T0(ot, reg);
+ break;
}
- gen_op_mov_reg_T0(ot, reg);
set_cc_op(s, CC_OP_MULB + ot);
break;
case 0x1c0:
rm = 0; /* avoid warning */
}
label1 = gen_new_label();
- tcg_gen_sub_tl(t2, cpu_regs[R_EAX], t0);
+ tcg_gen_mov_tl(t2, cpu_regs[R_EAX]);
+ gen_extu(ot, t0);
gen_extu(ot, t2);
- tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
+ tcg_gen_brcond_tl(TCG_COND_EQ, t2, t0, label1);
label2 = gen_new_label();
if (mod == 3) {
gen_op_mov_reg_v(ot, R_EAX, t0);
}
gen_set_label(label2);
tcg_gen_mov_tl(cpu_cc_src, t0);
- tcg_gen_mov_tl(cpu_cc_dst, t2);
+ tcg_gen_mov_tl(cpu_cc_srcT, t2);
+ tcg_gen_sub_tl(cpu_cc_dst, t2, t0);
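+        /* Record the flags of the implicit CMP: cc_dst = EAX - r/m, with
+           the zero-extended accumulator kept in cc_srcT as the left
+           operand for the lazy flag helpers. */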
set_cc_op(s, CC_OP_SUBB + ot);
tcg_temp_free(t0);
tcg_temp_free(t1);
}
break;
case 0xc4: /* les Gv */
- if (CODE64(s))
- goto illegal_op;
+ /* In CODE64 this is VEX3; see above. */
op = R_ES;
goto do_lxx;
case 0xc5: /* lds Gv */
- if (CODE64(s))
- goto illegal_op;
+ /* In CODE64 this is VEX2; see above. */
op = R_DS;
goto do_lxx;
case 0x1b2: /* lss Gv */
gen_op_mov_TN_reg(ot, 1, reg);
if (shift) {
- val = cpu_ldub_code(env, s->pc++);
- tcg_gen_movi_tl(cpu_T3, val);
+ TCGv imm = tcg_const_tl(cpu_ldub_code(env, s->pc++));
+ gen_shiftd_rm_T1(s, ot, opreg, op, imm);
+ tcg_temp_free(imm);
} else {
- tcg_gen_mov_tl(cpu_T3, cpu_regs[R_ECX]);
+ gen_shiftd_rm_T1(s, ot, opreg, op, cpu_regs[R_ECX]);
}
- gen_shiftd_rm_T1_T3(s, ot, opreg, op);
break;
/************************/
};
op1 = fcmov_cc[op & 3] | (((op >> 3) & 1) ^ 1);
l1 = gen_new_label();
- gen_jcc1(s, op1, l1);
+ gen_jcc1_noeob(s, op1, l1);
gen_helper_fmov_ST0_STN(cpu_env, tcg_const_i32(opreg));
gen_set_label(l1);
}
case 0x190 ... 0x19f: /* setcc Gv */
modrm = cpu_ldub_code(env, s->pc++);
- gen_setcc(s, b);
+ gen_setcc1(s, b, cpu_T[0]);
gen_ldst_modrm(env, s, modrm, OT_BYTE, OR_TMP0, 1);
break;
case 0x140 ... 0x14f: /* cmov Gv, Ev */
- {
- int l1;
- TCGv t0;
-
- ot = dflag + OT_WORD;
- modrm = cpu_ldub_code(env, s->pc++);
- reg = ((modrm >> 3) & 7) | rex_r;
- mod = (modrm >> 6) & 3;
- t0 = tcg_temp_local_new();
- if (mod != 3) {
- gen_lea_modrm(env, s, modrm, ®_addr, &offset_addr);
- gen_op_ld_v(ot + s->mem_index, t0, cpu_A0);
- } else {
- rm = (modrm & 7) | REX_B(s);
- gen_op_mov_v_reg(ot, t0, rm);
- }
-#ifdef TARGET_X86_64
- if (ot == OT_LONG) {
- /* XXX: specific Intel behaviour ? */
- l1 = gen_new_label();
- gen_jcc1(s, b ^ 1, l1);
- tcg_gen_mov_tl(cpu_regs[reg], t0);
- gen_set_label(l1);
- tcg_gen_ext32u_tl(cpu_regs[reg], cpu_regs[reg]);
- } else
-#endif
- {
- l1 = gen_new_label();
- gen_jcc1(s, b ^ 1, l1);
- gen_op_mov_reg_v(ot, reg, t0);
- gen_set_label(l1);
- }
- tcg_temp_free(t0);
- }
+ ot = dflag + OT_WORD;
+ modrm = cpu_ldub_code(env, s->pc++);
+ reg = ((modrm >> 3) & 7) | rex_r;
+ gen_cmovcc1(env, s, ot, b, modrm, reg);
break;
/************************/
tcg_gen_movi_tl(cpu_cc_dst, 0);
}
break;
- case 0x1bc: /* bsf */
- case 0x1bd: /* bsr */
- {
- int label1;
- TCGv t0;
-
- ot = dflag + OT_WORD;
- modrm = cpu_ldub_code(env, s->pc++);
- reg = ((modrm >> 3) & 7) | rex_r;
- gen_ldst_modrm(env, s,modrm, ot, OR_TMP0, 0);
- gen_extu(ot, cpu_T[0]);
- t0 = tcg_temp_local_new();
- tcg_gen_mov_tl(t0, cpu_T[0]);
- if ((b & 1) && (prefixes & PREFIX_REPZ) &&
- (s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
- switch(ot) {
- case OT_WORD: gen_helper_lzcnt(cpu_T[0], t0,
- tcg_const_i32(16)); break;
- case OT_LONG: gen_helper_lzcnt(cpu_T[0], t0,
- tcg_const_i32(32)); break;
- case OT_QUAD: gen_helper_lzcnt(cpu_T[0], t0,
- tcg_const_i32(64)); break;
- }
- gen_op_mov_reg_T0(ot, reg);
+ case 0x1bc: /* bsf / tzcnt */
+ case 0x1bd: /* bsr / lzcnt */
+ ot = dflag + OT_WORD;
+ modrm = cpu_ldub_code(env, s->pc++);
+ reg = ((modrm >> 3) & 7) | rex_r;
+ gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+ gen_extu(ot, cpu_T[0]);
+
+ /* Note that lzcnt and tzcnt are in different extensions. */
+ if ((prefixes & PREFIX_REPZ)
+ && (b & 1
+ ? s->cpuid_ext3_features & CPUID_EXT3_ABM
+ : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+ int size = 8 << ot;
+ tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+ if (b & 1) {
+ /* For lzcnt, reduce the target_ulong result by the
+ number of zeros that we expect to find at the top. */
+ gen_helper_clz(cpu_T[0], cpu_T[0]);
+ tcg_gen_subi_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - size);
} else {
- label1 = gen_new_label();
- tcg_gen_movi_tl(cpu_cc_dst, 0);
- tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, label1);
- if (b & 1) {
- gen_helper_bsr(cpu_T[0], t0);
- } else {
- gen_helper_bsf(cpu_T[0], t0);
- }
- gen_op_mov_reg_T0(ot, reg);
- tcg_gen_movi_tl(cpu_cc_dst, 1);
- gen_set_label(label1);
- set_cc_op(s, CC_OP_LOGICB + ot);
+ /* For tzcnt, a zero input must return the operand size:
+ force all bits outside the operand size to 1. */
+ target_ulong mask = (target_ulong)-2 << (size - 1);
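+                /* E.g. for a 16-bit tzcnt this sets bits 16 and up, so a
+                   zero input makes ctz return 16, the operand size. */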
+ tcg_gen_ori_tl(cpu_T[0], cpu_T[0], mask);
+ gen_helper_ctz(cpu_T[0], cpu_T[0]);
}
- tcg_temp_free(t0);
+ /* For lzcnt/tzcnt, C and Z bits are defined and are
+ related to the result. */
+ gen_op_update1_cc();
+ set_cc_op(s, CC_OP_BMILGB + ot);
+ } else {
+ /* For bsr/bsf, only the Z bit is defined and it is related
+ to the input and not the result. */
+ tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+ set_cc_op(s, CC_OP_LOGICB + ot);
+ if (b & 1) {
+ /* For bsr, return the bit index of the first 1 bit,
+ not the count of leading zeros. */
+ gen_helper_clz(cpu_T[0], cpu_T[0]);
+ tcg_gen_xori_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - 1);
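+                /* clz counts leading zeros over TARGET_LONG_BITS; xoring
+                   with TARGET_LONG_BITS - 1 turns that count into the
+                   index of the highest set bit. */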
+ } else {
+ gen_helper_ctz(cpu_T[0], cpu_T[0]);
+ }
+ /* ??? The manual says that the output is undefined when the
+ input is zero, but real hardware leaves it unchanged, and
+ real programs appear to depend on that. */
+ tcg_gen_movi_tl(cpu_tmp0, 0);
+ tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[0], cpu_cc_dst, cpu_tmp0,
+ cpu_regs[reg], cpu_T[0]);
}
+ gen_op_mov_reg_T0(ot, reg);
break;
/************************/
/* bcd */
cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
cpu_cc_op = tcg_global_mem_new_i32(TCG_AREG0,
offsetof(CPUX86State, cc_op), "cc_op");
- cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
- "cc_src");
cpu_cc_dst = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_dst),
"cc_dst");
+ cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
+ "cc_src");
+ cpu_cc_src2 = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src2),
+ "cc_src2");
#ifdef TARGET_X86_64
cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0,
if (flags & HF_SOFTMMU_MASK) {
dc->mem_index = (cpu_mmu_index(env) + 1) << 2;
}
- dc->cpuid_features = env->cpuid_features;
- dc->cpuid_ext_features = env->cpuid_ext_features;
- dc->cpuid_ext2_features = env->cpuid_ext2_features;
- dc->cpuid_ext3_features = env->cpuid_ext3_features;
- dc->cpuid_7_0_ebx_features = env->cpuid_7_0_ebx_features;
+ dc->cpuid_features = env->features[FEAT_1_EDX];
+ dc->cpuid_ext_features = env->features[FEAT_1_ECX];
+ dc->cpuid_ext2_features = env->features[FEAT_8000_0001_EDX];
+ dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
+ dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
#ifdef TARGET_X86_64
dc->lma = (flags >> HF_LMA_SHIFT) & 1;
dc->code64 = (flags >> HF_CS64_SHIFT) & 1;
cpu_T[0] = tcg_temp_new();
cpu_T[1] = tcg_temp_new();
cpu_A0 = tcg_temp_new();
- cpu_T3 = tcg_temp_new();
cpu_tmp0 = tcg_temp_new();
cpu_tmp1_i64 = tcg_temp_new_i64();
cpu_tmp2_i32 = tcg_temp_new_i32();
cpu_tmp3_i32 = tcg_temp_new_i32();
cpu_tmp4 = tcg_temp_new();
- cpu_tmp5 = tcg_temp_new();
cpu_ptr0 = tcg_temp_new_ptr();
cpu_ptr1 = tcg_temp_new_ptr();
+ cpu_cc_srcT = tcg_temp_local_new();
gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
if (max_insns == 0)
max_insns = CF_COUNT_MASK;
- gen_icount_start();
+ gen_tb_start();
for(;;) {
if (unlikely(!QTAILQ_EMPTY(&env->breakpoints))) {
QTAILQ_FOREACH(bp, &env->breakpoints, entry) {
}
if (tb->cflags & CF_LAST_IO)
gen_io_end();
- gen_icount_end(tb, num_insns);
+ gen_tb_end(tb, num_insns);
*tcg_ctx.gen_opc_ptr = INDEX_op_end;
/* we don't forget to fill the last values */
if (search_pc) {