tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable names of the host registers, indexed by register number.
 * Only built when CONFIG_DEBUG_TCG is defined.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    /*
     * Not conditional on the register width: this keeps the %xmm names at
     * index 16 and up, the same bit positions that ALL_VECTOR_REGS uses
     * for both widths.
     */
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
  43
/*
 * Host registers listed in allocation-preference order: the general
 * registers for the configured width first, then the vector registers.
 * %rsp/%esp never appears in the list.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated
       on Win64; every other host may use the remaining registers.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
  93
/*
 * Registers that carry integer call arguments, in argument order.
 * On 64-bit hosts the first registers differ between Win64 (%rcx, %rdx)
 * and every other ABI (%rdi, %rsi, %rdx, %rcx); %r8 and %r9 follow in
 * both cases.  The array is empty on 32-bit hosts.
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
 111
/*
 * Registers that carry call return values: %eax/%rax always, plus %edx
 * as a second register on 32-bit hosts only.
 */
static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};
 118
/*
 * Constants we accept.  These constraint bits are tested by
 * tcg_target_const_match(); the range checks described below apply to
 * types other than TCG_TYPE_I32, for which S32/U32/I32 always match.
 */
#define TCG_CT_CONST_S32 0x100  /* value equals its sign-extended low 32 bits */
#define TCG_CT_CONST_U32 0x200  /* value equals its zero-extended low 32 bits */
#define TCG_CT_CONST_I32 0x400  /* ~value equals its sign-extended low 32 bits */
#define TCG_CT_CONST_WSZ 0x800  /* value is the operand width: 32 or 64 */
 124
/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two arbitrary call clobbered registers on
   i386.  Note that the x86_64 definitions expand to array elements of
   tcg_target_call_iarg_regs rather than to TCG_REG_* enumerators.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif
 135
/*
 * Register-set bitmasks: bit N stands for register number N.
 * General registers occupy the low bits (16 of them on 64-bit hosts,
 * 8 on 32-bit hosts) and vector registers start at bit 16.
 */
/* The first four registers.  NOTE(review): presumably those that have a
   high-byte form (%ah, %ch, %dh, %bh) -- confirm against the users.  */
#define ALL_BYTEH_REGS         0x0000000fu
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
/* With a REX prefix every general register has a low-byte form.  */
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
/* Without REX only the first four registers have a low-byte form.  */
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
/* The two L-constraint registers, set aside only for CONFIG_SOFTMMU.  */
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif
 151
/* The host compiler should supply <cpuid.h> to enable runtime features
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  Otherwise it is a
   run-time flag when CONFIG_CPUID_H is defined, and constant 0 if not.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  As file-scope objects
   without an initializer they start out false; nothing in this part of the
   file assigns them.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_avx512bw;
bool have_avx512dq;
bool have_avx512vbmi2;
bool have_avx512vl;
bool have_movbe;

/* Flags private to this file: run-time variables when CONFIG_CPUID_H is
   defined, otherwise compile-time constant 0.  */
#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif
 187
/* NOTE(review): presumably the code address that translation blocks return
   to; it is assigned outside the part of the file visible here -- confirm.  */
static const tcg_insn_unit *tb_ret_addr;
 189
 190 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 191                         intptr_t value, intptr_t addend)
 192 {
 193     value += addend;
 194     switch(type) {
 195     case R_386_PC32:
 196         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 197         if (value != (int32_t)value) {
 198             return false;
 199         }
 200         /* FALLTHRU */
 201     case R_386_32:
 202         tcg_patch32(code_ptr, value);
 203         break;
 204     case R_386_PC8:
 205         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 206         if (value != (int8_t)value) {
 207             return false;
 208         }
 209         tcg_patch8(code_ptr, value);
 210         break;
 211     default:
 212         tcg_abort();
 213     }
 214     return true;
 215 }
 216
 217 /* test if a constant matches the constraint */
 218 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 219 {
 220     if (ct & TCG_CT_CONST) {
 221         return 1;
 222     }
 223     if (type == TCG_TYPE_I32) {
 224         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 225             return 1;
 226         }
 227     } else {
 228         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 229             return 1;
 230         }
 231         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 232             return 1;
 233         }
 234         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 235             return 1;
 236         }
 237     }
 238     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 239         return 1;
 240     }
 241     return 0;
 242 }
 243
/* The low three bits of a register number, i.e. the part that is placed in
   a ModRM or SIB field.  Bit 3 is carried separately by the REX, VEX or
   EVEX prefix emitted by the tcg_out_*opc functions.  */
# define LOWREGMASK(x)  ((x) & 7)
 245
/*
 * Prefix flags that are ORed into an opcode value above its low 8 bits.
 * The tcg_out_*opc functions test these flags to choose the prefix and
 * escape bytes, then emit the low 8 bits as the opcode byte itself.
 * The REX-related flags and P_GS are defined as 0 on 32-bit hosts, which
 * turns every test of them into a no-op.
 */
#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
 266
/*
 * Opcode table.  Each value is the opcode byte in its low 8 bits, ORed
 * with the P_* flags that select the required prefix and escape bytes.
 * Entries carrying P_EVEX are routed to tcg_out_evex_opc() by
 * tcg_out_vex_modrm(); the others use the legacy or VEX encoders.
 */
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

/* Group opcodes: the operation is selected by an extension value (see the
   EXT3_* and EXT5_* constants) rather than by the opcode byte alone.  */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 459
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH: the value is shifted
   left by 3 and combined with OPC_ARITH_GvEv, as tgen_arithr() does.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4
 492
/* Condition codes to be added to OPC_JCC_{long,short}.  The same values
   are the "condition code" that OPC_CMOVCC and OPC_SETCC expect.  */
/* Not a real condition code.  NOTE(review): presumably requests an
   unconditional jump -- confirm against the callers.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
 511
/*
 * Map a TCG comparison condition to the matching x86 condition code.
 * Signed comparisons use JL/JGE/JLE/JG and unsigned ones use JB/JAE/JBE/JA.
 * Any index without an explicit initializer reads as 0, which is JCC_JO.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
 524
 525 #if TCG_TARGET_REG_BITS == 64
 526 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 527 {
 528     int rex;
 529
 530     if (opc & P_GS) {
 531         tcg_out8(s, 0x65);
 532     }
 533     if (opc & P_DATA16) {
 534         /* We should never be asking for both 16 and 64-bit operation.  */
 535         tcg_debug_assert((opc & P_REXW) == 0);
 536         tcg_out8(s, 0x66);
 537     }
 538     if (opc & P_SIMDF3) {
 539         tcg_out8(s, 0xf3);
 540     } else if (opc & P_SIMDF2) {
 541         tcg_out8(s, 0xf2);
 542     }
 543
 544     rex = 0;
 545     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 546     rex |= (r & 8) >> 1;                /* REX.R */
 547     rex |= (x & 8) >> 2;                /* REX.X */
 548     rex |= (rm & 8) >> 3;               /* REX.B */
 549
 550     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 551        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 552        as otherwise the encoding indicates %[abcd]h.  Note that the values
 553        that are ORed in merely indicate that the REX byte must be present;
 554        those bits get discarded in output.  */
 555     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 556     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 557
 558     if (rex) {
 559         tcg_out8(s, (uint8_t)(rex | 0x40));
 560     }
 561
 562     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 563         tcg_out8(s, 0x0f);
 564         if (opc & P_EXT38) {
 565             tcg_out8(s, 0x38);
 566         } else if (opc & P_EXT3A) {
 567             tcg_out8(s, 0x3a);
 568         }
 569     }
 570
 571     tcg_out8(s, opc);
 572 }
 573 #else
 574 static void tcg_out_opc(TCGContext *s, int opc)
 575 {
 576     if (opc & P_DATA16) {
 577         tcg_out8(s, 0x66);
 578     }
 579     if (opc & P_SIMDF3) {
 580         tcg_out8(s, 0xf3);
 581     } else if (opc & P_SIMDF2) {
 582         tcg_out8(s, 0xf2);
 583     }
 584     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 585         tcg_out8(s, 0x0f);
 586         if (opc & P_EXT38) {
 587             tcg_out8(s, 0x38);
 588         } else if (opc & P_EXT3A) {
 589             tcg_out8(s, 0x3a);
 590         }
 591     }
 592     tcg_out8(s, opc);
 593 }
 594 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 595    the 32-bit compilation paths.  This method works with all versions of gcc,
 596    whereas relying on optimization may not be able to exclude them.  */
 597 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 598 #endif
 599
 600 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 601 {
 602     tcg_out_opc(s, opc, r, rm, 0);
 603     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 604 }
 605
 606 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 607                             int rm, int index)
 608 {
 609     int tmp;
 610
 611     /* Use the two byte form if possible, which cannot encode
 612        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 613     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 614         && ((rm | index) & 8) == 0) {
 615         /* Two byte VEX prefix.  */
 616         tcg_out8(s, 0xc5);
 617
 618         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 619     } else {
 620         /* Three byte VEX prefix.  */
 621         tcg_out8(s, 0xc4);
 622
 623         /* VEX.m-mmmm */
 624         if (opc & P_EXT3A) {
 625             tmp = 3;
 626         } else if (opc & P_EXT38) {
 627             tmp = 2;
 628         } else if (opc & P_EXT) {
 629             tmp = 1;
 630         } else {
 631             g_assert_not_reached();
 632         }
 633         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 634         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 635         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 636         tcg_out8(s, tmp);
 637
 638         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 639     }
 640
 641     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 642     /* VEX.pp */
 643     if (opc & P_DATA16) {
 644         tmp |= 1;                          /* 0x66 */
 645     } else if (opc & P_SIMDF3) {
 646         tmp |= 2;                          /* 0xf3 */
 647     } else if (opc & P_SIMDF2) {
 648         tmp |= 3;                          /* 0xf2 */
 649     }
 650     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 651     tcg_out8(s, tmp);
 652     tcg_out8(s, opc);
 653 }
 654
 655 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
 656                              int rm, int index)
 657 {
 658     /* The entire 4-byte evex prefix; with R' and V' set. */
 659     uint32_t p = 0x08041062;
 660     int mm, pp;
 661
 662     tcg_debug_assert(have_avx512vl);
 663
 664     /* EVEX.mm */
 665     if (opc & P_EXT3A) {
 666         mm = 3;
 667     } else if (opc & P_EXT38) {
 668         mm = 2;
 669     } else if (opc & P_EXT) {
 670         mm = 1;
 671     } else {
 672         g_assert_not_reached();
 673     }
 674
 675     /* EVEX.pp */
 676     if (opc & P_DATA16) {
 677         pp = 1;                          /* 0x66 */
 678     } else if (opc & P_SIMDF3) {
 679         pp = 2;                          /* 0xf3 */
 680     } else if (opc & P_SIMDF2) {
 681         pp = 3;                          /* 0xf2 */
 682     } else {
 683         pp = 0;
 684     }
 685
 686     p = deposit32(p, 8, 2, mm);
 687     p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
 688     p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
 689     p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
 690     p = deposit32(p, 16, 2, pp);
 691     p = deposit32(p, 19, 4, ~v);
 692     p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
 693     p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
 694
 695     tcg_out32(s, p);
 696     tcg_out8(s, opc);
 697 }
 698
 699 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 700 {
 701     if (opc & P_EVEX) {
 702         tcg_out_evex_opc(s, opc, r, v, rm, 0);
 703     } else {
 704         tcg_out_vex_opc(s, opc, r, v, rm, 0);
 705     }
 706     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 707 }
 708
 709 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 710    We handle either RM and INDEX missing with a negative value.  In 64-bit
 711    mode for absolute addresses, ~RM is the size of the immediate operand
 712    that will follow the instruction.  */
 713
 714 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 715                                int shift, intptr_t offset)
 716 {
 717     int mod, len;
 718
 719     if (index < 0 && rm < 0) {
 720         if (TCG_TARGET_REG_BITS == 64) {
 721             /* Try for a rip-relative addressing mode.  This has replaced
 722                the 32-bit-mode absolute addressing encoding.  */
 723             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 724             intptr_t disp = offset - pc;
 725             if (disp == (int32_t)disp) {
 726                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 727                 tcg_out32(s, disp);
 728                 return;
 729             }
 730
 731             /* Try for an absolute address encoding.  This requires the
 732                use of the MODRM+SIB encoding and is therefore larger than
 733                rip-relative addressing.  */
 734             if (offset == (int32_t)offset) {
 735                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 736                 tcg_out8(s, (4 << 3) | 5);
 737                 tcg_out32(s, offset);
 738                 return;
 739             }
 740
 741             /* ??? The memory isn't directly addressable.  */
 742             g_assert_not_reached();
 743         } else {
 744             /* Absolute address.  */
 745             tcg_out8(s, (r << 3) | 5);
 746             tcg_out32(s, offset);
 747             return;
 748         }
 749     }
 750
 751     /* Find the length of the immediate addend.  Note that the encoding
 752        that would be used for (%ebp) indicates absolute addressing.  */
 753     if (rm < 0) {
 754         mod = 0, len = 4, rm = 5;
 755     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 756         mod = 0, len = 0;
 757     } else if (offset == (int8_t)offset) {
 758         mod = 0x40, len = 1;
 759     } else {
 760         mod = 0x80, len = 4;
 761     }
 762
 763     /* Use a single byte MODRM format if possible.  Note that the encoding
 764        that would be used for %esp is the escape to the two byte form.  */
 765     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 766         /* Single byte MODRM format.  */
 767         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 768     } else {
 769         /* Two byte MODRM+SIB format.  */
 770
 771         /* Note that the encoding that would place %esp into the index
 772            field indicates no index register.  In 64-bit mode, the REX.X
 773            bit counts, so %r12 can be used as the index.  */
 774         if (index < 0) {
 775             index = 4;
 776         } else {
 777             tcg_debug_assert(index != TCG_REG_ESP);
 778         }
 779
 780         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 781         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 782     }
 783
 784     if (len == 1) {
 785         tcg_out8(s, offset);
 786     } else if (len == 4) {
 787         tcg_out32(s, offset);
 788     }
 789 }
 790
 791 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 792                                      int index, int shift, intptr_t offset)
 793 {
 794     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 795     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 796 }
 797
 798 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 799                                          int rm, int index, int shift,
 800                                          intptr_t offset)
 801 {
 802     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 803     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 804 }
 805
 806 /* A simplification of the above with no index or shift.  */
 807 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 808                                         int rm, intptr_t offset)
 809 {
 810     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 811 }
 812
 813 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 814                                             int v, int rm, intptr_t offset)
 815 {
 816     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 817 }
 818
 819 /* Output an opcode with an expected reference to the constant pool.  */
 820 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 821 {
 822     tcg_out_opc(s, opc, r, 0, 0);
 823     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 824     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 825     tcg_out32(s, 0);
 826 }
 827
 828 /* Output an opcode with an expected reference to the constant pool.  */
 829 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 830 {
 831     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 832     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 833     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 834     tcg_out32(s, 0);
 835 }
 836
 837 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 838 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 839 {
 840     /* Propagate an opcode prefix, such as P_REXW.  */
 841     int ext = subop & ~0x7;
 842     subop &= 0x7;
 843
 844     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 845 }
 846
 847 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 848 {
 849     int rexw = 0;
 850
 851     if (arg == ret) {
 852         return true;
 853     }
 854     switch (type) {
 855     case TCG_TYPE_I64:
 856         rexw = P_REXW;
 857         /* fallthru */
 858     case TCG_TYPE_I32:
 859         if (ret < 16) {
 860             if (arg < 16) {
 861                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 862             } else {
 863                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 864             }
 865         } else {
 866             if (arg < 16) {
 867                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 868             } else {
 869                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 870             }
 871         }
 872         break;
 873
 874     case TCG_TYPE_V64:
 875         tcg_debug_assert(ret >= 16 && arg >= 16);
 876         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 877         break;
 878     case TCG_TYPE_V128:
 879         tcg_debug_assert(ret >= 16 && arg >= 16);
 880         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 881         break;
 882     case TCG_TYPE_V256:
 883         tcg_debug_assert(ret >= 16 && arg >= 16);
 884         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 885         break;
 886
 887     default:
 888         g_assert_not_reached();
 889     }
 890     return true;
 891 }
 892
/*
 * Broadcast opcodes used by the vector duplicate emitters when have_avx2
 * is set.  The table is indexed with the vece (element size) argument;
 * the entries are, in order, the byte, word, dword and qword broadcasts.
 */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
 897
 898 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 899                             TCGReg r, TCGReg a)
 900 {
 901     if (have_avx2) {
 902         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 903         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 904     } else {
 905         switch (vece) {
 906         case MO_8:
 907             /* ??? With zero in a register, use PSHUFB.  */
 908             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 909             a = r;
 910             /* FALLTHRU */
 911         case MO_16:
 912             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 913             a = r;
 914             /* FALLTHRU */
 915         case MO_32:
 916             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 917             /* imm8 operand: all output lanes selected from input lane 0.  */
 918             tcg_out8(s, 0);
 919             break;
 920         case MO_64:
 921             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 922             break;
 923         default:
 924             g_assert_not_reached();
 925         }
 926     }
 927     return true;
 928 }
 929
 930 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 931                              TCGReg r, TCGReg base, intptr_t offset)
 932 {
 933     if (have_avx2) {
 934         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 935         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 936                                  r, 0, base, offset);
 937     } else {
 938         switch (vece) {
 939         case MO_64:
 940             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 941             break;
 942         case MO_32:
 943             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 944             break;
 945         case MO_16:
 946             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 947             tcg_out8(s, 0); /* imm8 */
 948             tcg_out_dup_vec(s, type, vece, r, r);
 949             break;
 950         case MO_8:
 951             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 952             tcg_out8(s, 0); /* imm8 */
 953             tcg_out_dup_vec(s, type, vece, r, r);
 954             break;
 955         default:
 956             g_assert_not_reached();
 957         }
 958     }
 959     return true;
 960 }
 961
 962 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 963                              TCGReg ret, int64_t arg)
 964 {
 965     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 966
 967     if (arg == 0) {
 968         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 969         return;
 970     }
 971     if (arg == -1) {
 972         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 973         return;
 974     }
 975
 976     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 977         if (have_avx2) {
 978             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 979         } else {
 980             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 981         }
 982         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 983     } else {
 984         if (type == TCG_TYPE_V64) {
 985             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 986         } else if (have_avx2) {
 987             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 988         } else {
 989             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 990         }
 991         if (TCG_TARGET_REG_BITS == 64) {
 992             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 993         } else {
 994             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 995         }
 996     }
 997 }
 998
 999 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1000                              TCGReg ret, tcg_target_long arg)
1001 {
1002     if (arg == 0) {
1003         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1004         return;
1005     }
1006     if (arg == -1) {
1007         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1008         return;
1009     }
1010
1011     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1012     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1013     if (TCG_TARGET_REG_BITS == 64) {
1014         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1015     } else {
1016         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1017     }
1018 }
1019
1020 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1021                              TCGReg ret, tcg_target_long arg)
1022 {
1023     tcg_target_long diff;
1024
1025     if (arg == 0) {
1026         tgen_arithr(s, ARITH_XOR, ret, ret);
1027         return;
1028     }
1029     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1030         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1031         tcg_out32(s, arg);
1032         return;
1033     }
1034     if (arg == (int32_t)arg) {
1035         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1036         tcg_out32(s, arg);
1037         return;
1038     }
1039
1040     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1041     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1042     if (diff == (int32_t)diff) {
1043         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1044         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1045         tcg_out32(s, diff);
1046         return;
1047     }
1048
1049     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1050     tcg_out64(s, arg);
1051 }
1052
1053 static void tcg_out_movi(TCGContext *s, TCGType type,
1054                          TCGReg ret, tcg_target_long arg)
1055 {
1056     switch (type) {
1057     case TCG_TYPE_I32:
1058 #if TCG_TARGET_REG_BITS == 64
1059     case TCG_TYPE_I64:
1060 #endif
1061         if (ret < 16) {
1062             tcg_out_movi_int(s, type, ret, arg);
1063         } else {
1064             tcg_out_movi_vec(s, type, ret, arg);
1065         }
1066         break;
1067     default:
1068         g_assert_not_reached();
1069     }
1070 }
1071
1072 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1073 {
1074     if (val == (int8_t)val) {
1075         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1076         tcg_out8(s, val);
1077     } else if (val == (int32_t)val) {
1078         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1079         tcg_out32(s, val);
1080     } else {
1081         tcg_abort();
1082     }
1083 }
1084
1085 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1086 {
1087     /* Given the strength of x86 memory ordering, we only need care for
1088        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1089        faster than "mfence", so don't bother with the sse insn.  */
1090     if (a0 & TCG_MO_ST_LD) {
1091         tcg_out8(s, 0xf0);
1092         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1093         tcg_out8(s, 0);
1094     }
1095 }
1096
1097 static inline void tcg_out_push(TCGContext *s, int reg)
1098 {
1099     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1100 }
1101
1102 static inline void tcg_out_pop(TCGContext *s, int reg)
1103 {
1104     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1105 }
1106
1107 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1108                        TCGReg arg1, intptr_t arg2)
1109 {
1110     switch (type) {
1111     case TCG_TYPE_I32:
1112         if (ret < 16) {
1113             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1114         } else {
1115             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1116         }
1117         break;
1118     case TCG_TYPE_I64:
1119         if (ret < 16) {
1120             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1121             break;
1122         }
1123         /* FALLTHRU */
1124     case TCG_TYPE_V64:
1125         /* There is no instruction that can validate 8-byte alignment.  */
1126         tcg_debug_assert(ret >= 16);
1127         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1128         break;
1129     case TCG_TYPE_V128:
1130         /*
1131          * The gvec infrastructure is asserts that v128 vector loads
1132          * and stores use a 16-byte aligned offset.  Validate that the
1133          * final pointer is aligned by using an insn that will SIGSEGV.
1134          */
1135         tcg_debug_assert(ret >= 16);
1136         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1137         break;
1138     case TCG_TYPE_V256:
1139         /*
1140          * The gvec infrastructure only requires 16-byte alignment,
1141          * so here we must use an unaligned load.
1142          */
1143         tcg_debug_assert(ret >= 16);
1144         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1145                                  ret, 0, arg1, arg2);
1146         break;
1147     default:
1148         g_assert_not_reached();
1149     }
1150 }
1151
1152 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1153                        TCGReg arg1, intptr_t arg2)
1154 {
1155     switch (type) {
1156     case TCG_TYPE_I32:
1157         if (arg < 16) {
1158             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1159         } else {
1160             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1161         }
1162         break;
1163     case TCG_TYPE_I64:
1164         if (arg < 16) {
1165             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1166             break;
1167         }
1168         /* FALLTHRU */
1169     case TCG_TYPE_V64:
1170         /* There is no instruction that can validate 8-byte alignment.  */
1171         tcg_debug_assert(arg >= 16);
1172         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1173         break;
1174     case TCG_TYPE_V128:
1175         /*
1176          * The gvec infrastructure is asserts that v128 vector loads
1177          * and stores use a 16-byte aligned offset.  Validate that the
1178          * final pointer is aligned by using an insn that will SIGSEGV.
1179          */
1180         tcg_debug_assert(arg >= 16);
1181         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1182         break;
1183     case TCG_TYPE_V256:
1184         /*
1185          * The gvec infrastructure only requires 16-byte alignment,
1186          * so here we must use an unaligned store.
1187          */
1188         tcg_debug_assert(arg >= 16);
1189         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1190                                  arg, 0, arg1, arg2);
1191         break;
1192     default:
1193         g_assert_not_reached();
1194     }
1195 }
1196
1197 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1198                         TCGReg base, intptr_t ofs)
1199 {
1200     int rexw = 0;
1201     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1202         if (val != (int32_t)val) {
1203             return false;
1204         }
1205         rexw = P_REXW;
1206     } else if (type != TCG_TYPE_I32) {
1207         return false;
1208     }
1209     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1210     tcg_out32(s, val);
1211     return true;
1212 }
1213
1214 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1215 {
1216     /* Propagate an opcode prefix, such as P_DATA16.  */
1217     int ext = subopc & ~0x7;
1218     subopc &= 0x7;
1219
1220     if (count == 1) {
1221         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1222     } else {
1223         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1224         tcg_out8(s, count);
1225     }
1226 }
1227
1228 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1229 {
1230     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1231 }
1232
1233 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1234 {
1235     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1236 }
1237
1238 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1239 {
1240     /* movzbl */
1241     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1242     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1243 }
1244
1245 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1246 {
1247     /* movsbl */
1248     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1249     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1250 }
1251
1252 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1253 {
1254     /* movzwl */
1255     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1256 }
1257
1258 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1259 {
1260     /* movsw[lq] */
1261     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1262 }
1263
1264 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1265 {
1266     /* 32-bit mov zero extends.  */
1267     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1268 }
1269
1270 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1271 {
1272     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1273 }
1274
1275 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1276 {
1277     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1278 }
1279
1280 static void tgen_arithi(TCGContext *s, int c, int r0,
1281                         tcg_target_long val, int cf)
1282 {
1283     int rexw = 0;
1284
1285     if (TCG_TARGET_REG_BITS == 64) {
1286         rexw = c & -8;
1287         c &= 7;
1288     }
1289
1290     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1291        partial flags update stalls on Pentium4 and are not recommended
1292        by current Intel optimization manuals.  */
1293     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1294         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1295         if (TCG_TARGET_REG_BITS == 64) {
1296             /* The single-byte increment encodings are re-tasked as the
1297                REX prefixes.  Use the MODRM encoding.  */
1298             tcg_out_modrm(s, OPC_GRP5 + rexw,
1299                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1300         } else {
1301             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1302         }
1303         return;
1304     }
1305
1306     if (c == ARITH_AND) {
1307         if (TCG_TARGET_REG_BITS == 64) {
1308             if (val == 0xffffffffu) {
1309                 tcg_out_ext32u(s, r0, r0);
1310                 return;
1311             }
1312             if (val == (uint32_t)val) {
1313                 /* AND with no high bits set can use a 32-bit operation.  */
1314                 rexw = 0;
1315             }
1316         }
1317         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1318             tcg_out_ext8u(s, r0, r0);
1319             return;
1320         }
1321         if (val == 0xffffu) {
1322             tcg_out_ext16u(s, r0, r0);
1323             return;
1324         }
1325     }
1326
1327     if (val == (int8_t)val) {
1328         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1329         tcg_out8(s, val);
1330         return;
1331     }
1332     if (rexw == 0 || val == (int32_t)val) {
1333         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1334         tcg_out32(s, val);
1335         return;
1336     }
1337
1338     tcg_abort();
1339 }
1340
1341 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1342 {
1343     if (val != 0) {
1344         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1345     }
1346 }
1347
1348 /* Use SMALL != 0 to force a short forward branch.  */
1349 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1350 {
1351     int32_t val, val1;
1352
1353     if (l->has_value) {
1354         val = tcg_pcrel_diff(s, l->u.value_ptr);
1355         val1 = val - 2;
1356         if ((int8_t)val1 == val1) {
1357             if (opc == -1) {
1358                 tcg_out8(s, OPC_JMP_short);
1359             } else {
1360                 tcg_out8(s, OPC_JCC_short + opc);
1361             }
1362             tcg_out8(s, val1);
1363         } else {
1364             if (small) {
1365                 tcg_abort();
1366             }
1367             if (opc == -1) {
1368                 tcg_out8(s, OPC_JMP_long);
1369                 tcg_out32(s, val - 5);
1370             } else {
1371                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1372                 tcg_out32(s, val - 6);
1373             }
1374         }
1375     } else if (small) {
1376         if (opc == -1) {
1377             tcg_out8(s, OPC_JMP_short);
1378         } else {
1379             tcg_out8(s, OPC_JCC_short + opc);
1380         }
1381         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1382         s->code_ptr += 1;
1383     } else {
1384         if (opc == -1) {
1385             tcg_out8(s, OPC_JMP_long);
1386         } else {
1387             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1388         }
1389         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1390         s->code_ptr += 4;
1391     }
1392 }
1393
1394 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1395                         int const_arg2, int rexw)
1396 {
1397     if (const_arg2) {
1398         if (arg2 == 0) {
1399             /* test r, r */
1400             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1401         } else {
1402             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1403         }
1404     } else {
1405         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1406     }
1407 }
1408
1409 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1410                              TCGArg arg1, TCGArg arg2, int const_arg2,
1411                              TCGLabel *label, int small)
1412 {
1413     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1414     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1415 }
1416
1417 #if TCG_TARGET_REG_BITS == 64
1418 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1419                              TCGArg arg1, TCGArg arg2, int const_arg2,
1420                              TCGLabel *label, int small)
1421 {
1422     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1423     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1424 }
1425 #else
1426 /* XXX: we implement it at the target level to avoid having to
1427    handle cross basic blocks temporaries */
1428 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1429                             const int *const_args, int small)
1430 {
1431     TCGLabel *label_next = gen_new_label();
1432     TCGLabel *label_this = arg_label(args[5]);
1433
1434     switch(args[4]) {
1435     case TCG_COND_EQ:
1436         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1437                          label_next, 1);
1438         tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1439                          label_this, small);
1440         break;
1441     case TCG_COND_NE:
1442         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1443                          label_this, small);
1444         tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1445                          label_this, small);
1446         break;
1447     case TCG_COND_LT:
1448         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1449                          label_this, small);
1450         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1451         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1452                          label_this, small);
1453         break;
1454     case TCG_COND_LE:
1455         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1456                          label_this, small);
1457         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1458         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1459                          label_this, small);
1460         break;
1461     case TCG_COND_GT:
1462         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1463                          label_this, small);
1464         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1465         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1466                          label_this, small);
1467         break;
1468     case TCG_COND_GE:
1469         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1470                          label_this, small);
1471         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1472         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1473                          label_this, small);
1474         break;
1475     case TCG_COND_LTU:
1476         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1477                          label_this, small);
1478         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1479         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1480                          label_this, small);
1481         break;
1482     case TCG_COND_LEU:
1483         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1484                          label_this, small);
1485         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1486         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1487                          label_this, small);
1488         break;
1489     case TCG_COND_GTU:
1490         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1491                          label_this, small);
1492         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1493         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1494                          label_this, small);
1495         break;
1496     case TCG_COND_GEU:
1497         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1498                          label_this, small);
1499         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1500         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1501                          label_this, small);
1502         break;
1503     default:
1504         tcg_abort();
1505     }
1506     tcg_out_label(s, label_next);
1507 }
1508 #endif
1509
1510 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1511                               TCGArg arg1, TCGArg arg2, int const_arg2)
1512 {
1513     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1514     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1515     tcg_out_ext8u(s, dest, dest);
1516 }
1517
1518 #if TCG_TARGET_REG_BITS == 64
1519 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1520                               TCGArg arg1, TCGArg arg2, int const_arg2)
1521 {
1522     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1523     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1524     tcg_out_ext8u(s, dest, dest);
1525 }
1526 #else
1527 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1528                              const int *const_args)
1529 {
1530     TCGArg new_args[6];
1531     TCGLabel *label_true, *label_over;
1532
1533     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1534
1535     if (args[0] == args[1] || args[0] == args[2]
1536         || (!const_args[3] && args[0] == args[3])
1537         || (!const_args[4] && args[0] == args[4])) {
1538         /* When the destination overlaps with one of the argument
1539            registers, don't do anything tricky.  */
1540         label_true = gen_new_label();
1541         label_over = gen_new_label();
1542
1543         new_args[5] = label_arg(label_true);
1544         tcg_out_brcond2(s, new_args, const_args+1, 1);
1545
1546         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1547         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1548         tcg_out_label(s, label_true);
1549
1550         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1551         tcg_out_label(s, label_over);
1552     } else {
1553         /* When the destination does not overlap one of the arguments,
1554            clear the destination first, jump if cond false, and emit an
1555            increment in the true case.  This results in smaller code.  */
1556
1557         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1558
1559         label_over = gen_new_label();
1560         new_args[4] = tcg_invert_cond(new_args[4]);
1561         new_args[5] = label_arg(label_over);
1562         tcg_out_brcond2(s, new_args, const_args+1, 1);
1563
1564         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1565         tcg_out_label(s, label_over);
1566     }
1567 }
1568 #endif
1569
1570 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1571                          TCGReg dest, TCGReg v1)
1572 {
1573     if (have_cmov) {
1574         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1575     } else {
1576         TCGLabel *over = gen_new_label();
1577         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1578         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1579         tcg_out_label(s, over);
1580     }
1581 }
1582
1583 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1584                               TCGReg c1, TCGArg c2, int const_c2,
1585                               TCGReg v1)
1586 {
1587     tcg_out_cmp(s, c1, c2, const_c2, 0);
1588     tcg_out_cmov(s, cond, 0, dest, v1);
1589 }
1590
1591 #if TCG_TARGET_REG_BITS == 64
1592 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1593                               TCGReg c1, TCGArg c2, int const_c2,
1594                               TCGReg v1)
1595 {
1596     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1597     tcg_out_cmov(s, cond, P_REXW, dest, v1);
1598 }
1599 #endif
1600
1601 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1602                         TCGArg arg2, bool const_a2)
1603 {
1604     if (have_bmi1) {
1605         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1606         if (const_a2) {
1607             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1608         } else {
1609             tcg_debug_assert(dest != arg2);
1610             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1611         }
1612     } else {
1613         tcg_debug_assert(dest != arg2);
1614         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1615         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1616     }
1617 }
1618
1619 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1620                         TCGArg arg2, bool const_a2)
1621 {
1622     if (have_lzcnt) {
1623         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1624         if (const_a2) {
1625             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1626         } else {
1627             tcg_debug_assert(dest != arg2);
1628             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1629         }
1630     } else {
1631         tcg_debug_assert(!const_a2);
1632         tcg_debug_assert(dest != arg1);
1633         tcg_debug_assert(dest != arg2);
1634
1635         /* Recall that the output of BSR is the index not the count.  */
1636         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1637         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1638
1639         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1640         tcg_out_cmp(s, arg1, 0, 1, rexw);
1641         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1642     }
1643 }
1644
1645 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1646 {
1647     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1648
1649     if (disp == (int32_t)disp) {
1650         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1651         tcg_out32(s, disp);
1652     } else {
1653         /* rip-relative addressing into the constant pool.
1654            This is 6 + 8 = 14 bytes, as compared to using an
1655            immediate load 10 + 6 = 16 bytes, plus we may
1656            be able to re-use the pool constant for more calls.  */
1657         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1658         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1659         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1660         tcg_out32(s, 0);
1661     }
1662 }
1663
1664 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1665                          const TCGHelperInfo *info)
1666 {
1667     tcg_out_branch(s, 1, dest);
1668 }
1669
1670 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1671 {
1672     tcg_out_branch(s, 0, dest);
1673 }
1674
1675 static void tcg_out_nopn(TCGContext *s, int n)
1676 {
1677     int i;
1678     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1679      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1680      * duplicate prefix, and all of the interesting recent cores can
1681      * decode and discard the duplicates in a single cycle.
1682      */
1683     tcg_debug_assert(n >= 1);
1684     for (i = 1; i < n; ++i) {
1685         tcg_out8(s, 0x66);
1686     }
1687     tcg_out8(s, 0x90);
1688 }
1689
1690 #if defined(CONFIG_SOFTMMU)
1691 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1692  *                                     int mmu_idx, uintptr_t ra)
1693  */
1694 static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1695     [MO_UB]   = helper_ret_ldub_mmu,
1696     [MO_LEUW] = helper_le_lduw_mmu,
1697     [MO_LEUL] = helper_le_ldul_mmu,
1698     [MO_LEUQ] = helper_le_ldq_mmu,
1699     [MO_BEUW] = helper_be_lduw_mmu,
1700     [MO_BEUL] = helper_be_ldul_mmu,
1701     [MO_BEUQ] = helper_be_ldq_mmu,
1702 };
1703
1704 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1705  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1706  */
1707 static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1708     [MO_UB]   = helper_ret_stb_mmu,
1709     [MO_LEUW] = helper_le_stw_mmu,
1710     [MO_LEUL] = helper_le_stl_mmu,
1711     [MO_LEUQ] = helper_le_stq_mmu,
1712     [MO_BEUW] = helper_be_stw_mmu,
1713     [MO_BEUL] = helper_be_stl_mmu,
1714     [MO_BEUQ] = helper_be_stq_mmu,
1715 };
1716
1717 /* Perform the TLB load and compare.
1718
1719    Inputs:
1720    ADDRLO and ADDRHI contain the low and high part of the address.
1721
1722    MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1723
1724    WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1725    This should be offsetof addr_read or addr_write.
1726
1727    Outputs:
1728    LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1729    positions of the displacements of forward jumps to the TLB miss case.
1730
1731    Second argument register is loaded with the low part of the address.
1732    In the TLB hit case, it has been adjusted as indicated by the TLB
1733    and so is a host address.  In the TLB miss case, it continues to
1734    hold a guest address.
1735
1736    First argument register is clobbered.  */
1737
1738 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1739                                     int mem_index, MemOp opc,
1740                                     tcg_insn_unit **label_ptr, int which)
1741 {
1742     const TCGReg r0 = TCG_REG_L0;
1743     const TCGReg r1 = TCG_REG_L1;
1744     TCGType ttype = TCG_TYPE_I32;
1745     TCGType tlbtype = TCG_TYPE_I32;
1746     int trexw = 0, hrexw = 0, tlbrexw = 0;
1747     unsigned a_bits = get_alignment_bits(opc);
1748     unsigned s_bits = opc & MO_SIZE;
1749     unsigned a_mask = (1 << a_bits) - 1;
1750     unsigned s_mask = (1 << s_bits) - 1;
1751     target_ulong tlb_mask;
1752
1753     if (TCG_TARGET_REG_BITS == 64) {
1754         if (TARGET_LONG_BITS == 64) {
1755             ttype = TCG_TYPE_I64;
1756             trexw = P_REXW;
1757         }
1758         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1759             hrexw = P_REXW;
1760             if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1761                 tlbtype = TCG_TYPE_I64;
1762                 tlbrexw = P_REXW;
1763             }
1764         }
1765     }
1766
1767     tcg_out_mov(s, tlbtype, r0, addrlo);
1768     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1769                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1770
1771     tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1772                          TLB_MASK_TABLE_OFS(mem_index) +
1773                          offsetof(CPUTLBDescFast, mask));
1774
1775     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1776                          TLB_MASK_TABLE_OFS(mem_index) +
1777                          offsetof(CPUTLBDescFast, table));
1778
1779     /* If the required alignment is at least as large as the access, simply
1780        copy the address and mask.  For lesser alignments, check that we don't
1781        cross pages for the complete access.  */
1782     if (a_bits >= s_bits) {
1783         tcg_out_mov(s, ttype, r1, addrlo);
1784     } else {
1785         tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1786     }
1787     tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1788     tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1789
1790     /* cmp 0(r0), r1 */
1791     tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1792
1793     /* Prepare for both the fast path add of the tlb addend, and the slow
1794        path function argument setup.  */
1795     tcg_out_mov(s, ttype, r1, addrlo);
1796
1797     /* jne slow_path */
1798     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1799     label_ptr[0] = s->code_ptr;
1800     s->code_ptr += 4;
1801
1802     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1803         /* cmp 4(r0), addrhi */
1804         tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1805
1806         /* jne slow_path */
1807         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1808         label_ptr[1] = s->code_ptr;
1809         s->code_ptr += 4;
1810     }
1811
1812     /* TLB Hit.  */
1813
1814     /* add addend(r0), r1 */
1815     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1816                          offsetof(CPUTLBEntry, addend));
1817 }
1818
1819 /*
1820  * Record the context of a call to the out of line helper code for the slow path
1821  * for a load or store, so that we can later generate the correct helper code
1822  */
1823 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1824                                 MemOpIdx oi,
1825                                 TCGReg datalo, TCGReg datahi,
1826                                 TCGReg addrlo, TCGReg addrhi,
1827                                 tcg_insn_unit *raddr,
1828                                 tcg_insn_unit **label_ptr)
1829 {
1830     TCGLabelQemuLdst *label = new_ldst_label(s);
1831
1832     label->is_ld = is_ld;
1833     label->oi = oi;
1834     label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1835     label->datalo_reg = datalo;
1836     label->datahi_reg = datahi;
1837     label->addrlo_reg = addrlo;
1838     label->addrhi_reg = addrhi;
1839     label->raddr = tcg_splitwx_to_rx(raddr);
1840     label->label_ptr[0] = label_ptr[0];
1841     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1842         label->label_ptr[1] = label_ptr[1];
1843     }
1844 }
1845
1846 /*
1847  * Generate code for the slow path for a load at the end of block
1848  */
1849 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1850 {
1851     MemOpIdx oi = l->oi;
1852     MemOp opc = get_memop(oi);
1853     TCGReg data_reg;
1854     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1855     int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1856
1857     /* resolve label address */
1858     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1859     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1860         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1861     }
1862
1863     if (TCG_TARGET_REG_BITS == 32) {
1864         int ofs = 0;
1865
1866         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1867         ofs += 4;
1868
1869         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1870         ofs += 4;
1871
1872         if (TARGET_LONG_BITS == 64) {
1873             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1874             ofs += 4;
1875         }
1876
1877         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1878         ofs += 4;
1879
1880         tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1881     } else {
1882         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1883         /* The second argument is already loaded with addrlo.  */
1884         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1885         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1886                      (uintptr_t)l->raddr);
1887     }
1888
1889     tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1890
1891     data_reg = l->datalo_reg;
1892     switch (opc & MO_SSIZE) {
1893     case MO_SB:
1894         tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1895         break;
1896     case MO_SW:
1897         tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1898         break;
1899 #if TCG_TARGET_REG_BITS == 64
1900     case MO_SL:
1901         tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1902         break;
1903 #endif
1904     case MO_UB:
1905     case MO_UW:
1906         /* Note that the helpers have zero-extended to tcg_target_long.  */
1907     case MO_UL:
1908         tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1909         break;
1910     case MO_UQ:
1911         if (TCG_TARGET_REG_BITS == 64) {
1912             tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1913         } else if (data_reg == TCG_REG_EDX) {
1914             /* xchg %edx, %eax */
1915             tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1916             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1917         } else {
1918             tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1919             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1920         }
1921         break;
1922     default:
1923         tcg_abort();
1924     }
1925
1926     /* Jump to the code corresponding to next IR of qemu_st */
1927     tcg_out_jmp(s, l->raddr);
1928     return true;
1929 }
1930
1931 /*
1932  * Generate code for the slow path for a store at the end of block
1933  */
1934 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1935 {
1936     MemOpIdx oi = l->oi;
1937     MemOp opc = get_memop(oi);
1938     MemOp s_bits = opc & MO_SIZE;
1939     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1940     TCGReg retaddr;
1941
1942     /* resolve label address */
1943     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1944     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1945         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1946     }
1947
1948     if (TCG_TARGET_REG_BITS == 32) {
1949         int ofs = 0;
1950
1951         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1952         ofs += 4;
1953
1954         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1955         ofs += 4;
1956
1957         if (TARGET_LONG_BITS == 64) {
1958             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1959             ofs += 4;
1960         }
1961
1962         tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1963         ofs += 4;
1964
1965         if (s_bits == MO_64) {
1966             tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1967             ofs += 4;
1968         }
1969
1970         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1971         ofs += 4;
1972
1973         retaddr = TCG_REG_EAX;
1974         tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1975         tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1976     } else {
1977         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1978         /* The second argument is already loaded with addrlo.  */
1979         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1980                     tcg_target_call_iarg_regs[2], l->datalo_reg);
1981         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1982
1983         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1984             retaddr = tcg_target_call_iarg_regs[4];
1985             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1986         } else {
1987             retaddr = TCG_REG_RAX;
1988             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1989             tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1990                        TCG_TARGET_CALL_STACK_OFFSET);
1991         }
1992     }
1993
1994     /* "Tail call" to the helper, with the return address back inline.  */
1995     tcg_out_push(s, retaddr);
1996     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1997     return true;
1998 }
1999 #else
2000
2001 static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
2002                                    TCGReg addrhi, unsigned a_bits)
2003 {
2004     unsigned a_mask = (1 << a_bits) - 1;
2005     TCGLabelQemuLdst *label;
2006
2007     /*
2008      * We are expecting a_bits to max out at 7, so we can usually use testb.
2009      * For i686, we have to use testl for %esi/%edi.
2010      */
2011     if (a_mask <= 0xff && (TCG_TARGET_REG_BITS == 64 || addrlo < 4)) {
2012         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, addrlo);
2013         tcg_out8(s, a_mask);
2014     } else {
2015         tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, addrlo);
2016         tcg_out32(s, a_mask);
2017     }
2018
2019     /* jne slow_path */
2020     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2021
2022     label = new_ldst_label(s);
2023     label->is_ld = is_ld;
2024     label->addrlo_reg = addrlo;
2025     label->addrhi_reg = addrhi;
2026     label->raddr = tcg_splitwx_to_rx(s->code_ptr + 4);
2027     label->label_ptr[0] = s->code_ptr;
2028
2029     s->code_ptr += 4;
2030 }
2031
2032 static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
2033 {
2034     /* resolve label address */
2035     tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4);
2036
2037     if (TCG_TARGET_REG_BITS == 32) {
2038         int ofs = 0;
2039
2040         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
2041         ofs += 4;
2042
2043         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
2044         ofs += 4;
2045         if (TARGET_LONG_BITS == 64) {
2046             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
2047             ofs += 4;
2048         }
2049
2050         tcg_out_pushi(s, (uintptr_t)l->raddr);
2051     } else {
2052         tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
2053                     l->addrlo_reg);
2054         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
2055
2056         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr);
2057         tcg_out_push(s, TCG_REG_RAX);
2058     }
2059
2060     /* "Tail call" to the helper, with the return address back inline. */
2061     tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld
2062                                   : helper_unaligned_st));
2063     return true;
2064 }
2065
2066 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2067 {
2068     return tcg_out_fail_alignment(s, l);
2069 }
2070
2071 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2072 {
2073     return tcg_out_fail_alignment(s, l);
2074 }
2075
2076 #if TCG_TARGET_REG_BITS == 32
2077 # define x86_guest_base_seg     0
2078 # define x86_guest_base_index   -1
2079 # define x86_guest_base_offset  guest_base
2080 #else
2081 static int x86_guest_base_seg;
2082 static int x86_guest_base_index = -1;
2083 static int32_t x86_guest_base_offset;
2084 # if defined(__x86_64__) && defined(__linux__)
2085 #  include <asm/prctl.h>
2086 #  include <sys/prctl.h>
2087 int arch_prctl(int code, unsigned long addr);
2088 static inline int setup_guest_base_seg(void)
2089 {
2090     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2091         return P_GS;
2092     }
2093     return 0;
2094 }
2095 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
2096 #  include <machine/sysarch.h>
2097 static inline int setup_guest_base_seg(void)
2098 {
2099     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2100         return P_GS;
2101     }
2102     return 0;
2103 }
2104 # else
2105 static inline int setup_guest_base_seg(void)
2106 {
2107     return 0;
2108 }
2109 # endif
2110 #endif
2111 #endif /* SOFTMMU */
2112
2113 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2114                                    TCGReg base, int index, intptr_t ofs,
2115                                    int seg, bool is64, MemOp memop)
2116 {
2117     bool use_movbe = false;
2118     int rexw = is64 * P_REXW;
2119     int movop = OPC_MOVL_GvEv;
2120
2121     /* Do big-endian loads with movbe.  */
2122     if (memop & MO_BSWAP) {
2123         tcg_debug_assert(have_movbe);
2124         use_movbe = true;
2125         movop = OPC_MOVBE_GyMy;
2126     }
2127
2128     switch (memop & MO_SSIZE) {
2129     case MO_UB:
2130         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2131                                  base, index, 0, ofs);
2132         break;
2133     case MO_SB:
2134         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2135                                  base, index, 0, ofs);
2136         break;
2137     case MO_UW:
2138         if (use_movbe) {
2139             /* There is no extending movbe; only low 16-bits are modified.  */
2140             if (datalo != base && datalo != index) {
2141                 /* XOR breaks dependency chains.  */
2142                 tgen_arithr(s, ARITH_XOR, datalo, datalo);
2143                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2144                                          datalo, base, index, 0, ofs);
2145             } else {
2146                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2147                                          datalo, base, index, 0, ofs);
2148                 tcg_out_ext16u(s, datalo, datalo);
2149             }
2150         } else {
2151             tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2152                                      base, index, 0, ofs);
2153         }
2154         break;
2155     case MO_SW:
2156         if (use_movbe) {
2157             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2158                                      datalo, base, index, 0, ofs);
2159             tcg_out_ext16s(s, datalo, datalo, rexw);
2160         } else {
2161             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2162                                      datalo, base, index, 0, ofs);
2163         }
2164         break;
2165     case MO_UL:
2166         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2167         break;
2168 #if TCG_TARGET_REG_BITS == 64
2169     case MO_SL:
2170         if (use_movbe) {
2171             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2172                                      base, index, 0, ofs);
2173             tcg_out_ext32s(s, datalo, datalo);
2174         } else {
2175             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2176                                      base, index, 0, ofs);
2177         }
2178         break;
2179 #endif
2180     case MO_UQ:
2181         if (TCG_TARGET_REG_BITS == 64) {
2182             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2183                                      base, index, 0, ofs);
2184         } else {
2185             if (use_movbe) {
2186                 TCGReg t = datalo;
2187                 datalo = datahi;
2188                 datahi = t;
2189             }
2190             if (base != datalo) {
2191                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2192                                          base, index, 0, ofs);
2193                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2194                                          base, index, 0, ofs + 4);
2195             } else {
2196                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2197                                          base, index, 0, ofs + 4);
2198                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2199                                          base, index, 0, ofs);
2200             }
2201         }
2202         break;
2203     default:
2204         g_assert_not_reached();
2205     }
2206 }
2207
2208 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2209    EAX. It will be useful once fixed registers globals are less
2210    common. */
2211 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2212 {
2213     TCGReg datalo, datahi, addrlo;
2214     TCGReg addrhi __attribute__((unused));
2215     MemOpIdx oi;
2216     MemOp opc;
2217 #if defined(CONFIG_SOFTMMU)
2218     int mem_index;
2219     tcg_insn_unit *label_ptr[2];
2220 #else
2221     unsigned a_bits;
2222 #endif
2223
2224     datalo = *args++;
2225     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2226     addrlo = *args++;
2227     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2228     oi = *args++;
2229     opc = get_memop(oi);
2230
2231 #if defined(CONFIG_SOFTMMU)
2232     mem_index = get_mmuidx(oi);
2233
2234     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2235                      label_ptr, offsetof(CPUTLBEntry, addr_read));
2236
2237     /* TLB Hit.  */
2238     tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2239
2240     /* Record the current context of a load into ldst label */
2241     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2242                         s->code_ptr, label_ptr);
2243 #else
2244     a_bits = get_alignment_bits(opc);
2245     if (a_bits) {
2246         tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
2247     }
2248
2249     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2250                            x86_guest_base_offset, x86_guest_base_seg,
2251                            is64, opc);
2252 #endif
2253 }
2254
2255 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2256                                    TCGReg base, int index, intptr_t ofs,
2257                                    int seg, MemOp memop)
2258 {
2259     bool use_movbe = false;
2260     int movop = OPC_MOVL_EvGv;
2261
2262     /*
2263      * Do big-endian stores with movbe or softmmu.
2264      * User-only without movbe will have its swapping done generically.
2265      */
2266     if (memop & MO_BSWAP) {
2267         tcg_debug_assert(have_movbe);
2268         use_movbe = true;
2269         movop = OPC_MOVBE_MyGy;
2270     }
2271
2272     switch (memop & MO_SIZE) {
2273     case MO_8:
2274         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2275         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2276         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2277                                  datalo, base, index, 0, ofs);
2278         break;
2279     case MO_16:
2280         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2281                                  base, index, 0, ofs);
2282         break;
2283     case MO_32:
2284         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2285         break;
2286     case MO_64:
2287         if (TCG_TARGET_REG_BITS == 64) {
2288             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2289                                      base, index, 0, ofs);
2290         } else {
2291             if (use_movbe) {
2292                 TCGReg t = datalo;
2293                 datalo = datahi;
2294                 datahi = t;
2295             }
2296             tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2297                                      base, index, 0, ofs);
2298             tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2299                                      base, index, 0, ofs + 4);
2300         }
2301         break;
2302     default:
2303         g_assert_not_reached();
2304     }
2305 }
2306
2307 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2308 {
2309     TCGReg datalo, datahi, addrlo;
2310     TCGReg addrhi __attribute__((unused));
2311     MemOpIdx oi;
2312     MemOp opc;
2313 #if defined(CONFIG_SOFTMMU)
2314     int mem_index;
2315     tcg_insn_unit *label_ptr[2];
2316 #else
2317     unsigned a_bits;
2318 #endif
2319
2320     datalo = *args++;
2321     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2322     addrlo = *args++;
2323     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2324     oi = *args++;
2325     opc = get_memop(oi);
2326
2327 #if defined(CONFIG_SOFTMMU)
2328     mem_index = get_mmuidx(oi);
2329
2330     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2331                      label_ptr, offsetof(CPUTLBEntry, addr_write));
2332
2333     /* TLB Hit.  */
2334     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2335
2336     /* Record the current context of a store into ldst label */
2337     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2338                         s->code_ptr, label_ptr);
2339 #else
2340     a_bits = get_alignment_bits(opc);
2341     if (a_bits) {
2342         tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
2343     }
2344
2345     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2346                            x86_guest_base_offset, x86_guest_base_seg, opc);
2347 #endif
2348 }
2349
2350 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2351                               const TCGArg args[TCG_MAX_OP_ARGS],
2352                               const int const_args[TCG_MAX_OP_ARGS])
2353 {
2354     TCGArg a0, a1, a2;
2355     int c, const_a2, vexop, rexw = 0;
2356
2357 #if TCG_TARGET_REG_BITS == 64
2358 # define OP_32_64(x) \
2359         case glue(glue(INDEX_op_, x), _i64): \
2360             rexw = P_REXW; /* FALLTHRU */    \
2361         case glue(glue(INDEX_op_, x), _i32)
2362 #else
2363 # define OP_32_64(x) \
2364         case glue(glue(INDEX_op_, x), _i32)
2365 #endif
2366
2367     /* Hoist the loads of the most common arguments.  */
2368     a0 = args[0];
2369     a1 = args[1];
2370     a2 = args[2];
2371     const_a2 = const_args[2];
2372
2373     switch (opc) {
2374     case INDEX_op_exit_tb:
2375         /* Reuse the zeroing that exists for goto_ptr.  */
2376         if (a0 == 0) {
2377             tcg_out_jmp(s, tcg_code_gen_epilogue);
2378         } else {
2379             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2380             tcg_out_jmp(s, tb_ret_addr);
2381         }
2382         break;
2383     case INDEX_op_goto_tb:
2384         if (s->tb_jmp_insn_offset) {
2385             /* direct jump method */
2386             int gap;
2387             /* jump displacement must be aligned for atomic patching;
2388              * see if we need to add extra nops before jump
2389              */
2390             gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2391             if (gap != 1) {
2392                 tcg_out_nopn(s, gap - 1);
2393             }
2394             tcg_out8(s, OPC_JMP_long); /* jmp im */
2395             s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2396             tcg_out32(s, 0);
2397         } else {
2398             /* indirect jump method */
2399             tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2400                                  (intptr_t)(s->tb_jmp_target_addr + a0));
2401         }
2402         set_jmp_reset_offset(s, a0);
2403         break;
2404     case INDEX_op_goto_ptr:
2405         /* jmp to the given host address (could be epilogue) */
2406         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2407         break;
2408     case INDEX_op_br:
2409         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2410         break;
2411     OP_32_64(ld8u):
2412         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2413         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2414         break;
2415     OP_32_64(ld8s):
2416         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2417         break;
2418     OP_32_64(ld16u):
2419         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2420         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2421         break;
2422     OP_32_64(ld16s):
2423         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2424         break;
2425 #if TCG_TARGET_REG_BITS == 64
2426     case INDEX_op_ld32u_i64:
2427 #endif
2428     case INDEX_op_ld_i32:
2429         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2430         break;
2431
2432     OP_32_64(st8):
2433         if (const_args[0]) {
2434             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2435             tcg_out8(s, a0);
2436         } else {
2437             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2438         }
2439         break;
2440     OP_32_64(st16):
2441         if (const_args[0]) {
2442             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2443             tcg_out16(s, a0);
2444         } else {
2445             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2446         }
2447         break;
2448 #if TCG_TARGET_REG_BITS == 64
2449     case INDEX_op_st32_i64:
2450 #endif
2451     case INDEX_op_st_i32:
2452         if (const_args[0]) {
2453             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2454             tcg_out32(s, a0);
2455         } else {
2456             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2457         }
2458         break;
2459
2460     OP_32_64(add):
2461         /* For 3-operand addition, use LEA.  */
2462         if (a0 != a1) {
2463             TCGArg c3 = 0;
2464             if (const_a2) {
2465                 c3 = a2, a2 = -1;
2466             } else if (a0 == a2) {
2467                 /* Watch out for dest = src + dest, since we've removed
2468                    the matching constraint on the add.  */
2469                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2470                 break;
2471             }
2472
2473             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2474             break;
2475         }
2476         c = ARITH_ADD;
2477         goto gen_arith;
2478     OP_32_64(sub):
2479         c = ARITH_SUB;
2480         goto gen_arith;
2481     OP_32_64(and):
2482         c = ARITH_AND;
2483         goto gen_arith;
2484     OP_32_64(or):
2485         c = ARITH_OR;
2486         goto gen_arith;
2487     OP_32_64(xor):
2488         c = ARITH_XOR;
2489         goto gen_arith;
2490     gen_arith:
2491         if (const_a2) {
2492             tgen_arithi(s, c + rexw, a0, a2, 0);
2493         } else {
2494             tgen_arithr(s, c + rexw, a0, a2);
2495         }
2496         break;
2497
2498     OP_32_64(andc):
2499         if (const_a2) {
2500             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2501             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2502         } else {
2503             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2504         }
2505         break;
2506
2507     OP_32_64(mul):
2508         if (const_a2) {
2509             int32_t val;
2510             val = a2;
2511             if (val == (int8_t)val) {
2512                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2513                 tcg_out8(s, val);
2514             } else {
2515                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2516                 tcg_out32(s, val);
2517             }
2518         } else {
2519             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2520         }
2521         break;
2522
2523     OP_32_64(div2):
2524         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2525         break;
2526     OP_32_64(divu2):
2527         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2528         break;
2529
2530     OP_32_64(shl):
2531         /* For small constant 3-operand shift, use LEA.  */
2532         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2533             if (a2 - 1 == 0) {
2534                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2535                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2536             } else {
2537                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2538                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2539             }
2540             break;
2541         }
2542         c = SHIFT_SHL;
2543         vexop = OPC_SHLX;
2544         goto gen_shift_maybe_vex;
2545     OP_32_64(shr):
2546         c = SHIFT_SHR;
2547         vexop = OPC_SHRX;
2548         goto gen_shift_maybe_vex;
2549     OP_32_64(sar):
2550         c = SHIFT_SAR;
2551         vexop = OPC_SARX;
2552         goto gen_shift_maybe_vex;
2553     OP_32_64(rotl):
2554         c = SHIFT_ROL;
2555         goto gen_shift;
2556     OP_32_64(rotr):
2557         c = SHIFT_ROR;
2558         goto gen_shift;
2559     gen_shift_maybe_vex:
2560         if (have_bmi2) {
2561             if (!const_a2) {
2562                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2563                 break;
2564             }
2565             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2566         }
2567         /* FALLTHRU */
2568     gen_shift:
2569         if (const_a2) {
2570             tcg_out_shifti(s, c + rexw, a0, a2);
2571         } else {
2572             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2573         }
2574         break;
2575
2576     OP_32_64(ctz):
2577         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2578         break;
2579     OP_32_64(clz):
2580         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2581         break;
2582     OP_32_64(ctpop):
2583         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2584         break;
2585
2586     case INDEX_op_brcond_i32:
2587         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2588         break;
2589     case INDEX_op_setcond_i32:
2590         tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2591         break;
2592     case INDEX_op_movcond_i32:
2593         tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2594         break;
2595
2596     OP_32_64(bswap16):
2597         if (a2 & TCG_BSWAP_OS) {
2598             /* Output must be sign-extended. */
2599             if (rexw) {
2600                 tcg_out_bswap64(s, a0);
2601                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2602             } else {
2603                 tcg_out_bswap32(s, a0);
2604                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2605             }
2606         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2607             /* Output must be zero-extended, but input isn't. */
2608             tcg_out_bswap32(s, a0);
2609             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2610         } else {
2611             tcg_out_rolw_8(s, a0);
2612         }
2613         break;
2614     OP_32_64(bswap32):
2615         tcg_out_bswap32(s, a0);
2616         if (rexw && (a2 & TCG_BSWAP_OS)) {
2617             tcg_out_ext32s(s, a0, a0);
2618         }
2619         break;
2620
2621     OP_32_64(neg):
2622         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2623         break;
2624     OP_32_64(not):
2625         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2626         break;
2627
2628     OP_32_64(ext8s):
2629         tcg_out_ext8s(s, a0, a1, rexw);
2630         break;
2631     OP_32_64(ext16s):
2632         tcg_out_ext16s(s, a0, a1, rexw);
2633         break;
2634     OP_32_64(ext8u):
2635         tcg_out_ext8u(s, a0, a1);
2636         break;
2637     OP_32_64(ext16u):
2638         tcg_out_ext16u(s, a0, a1);
2639         break;
2640
2641     case INDEX_op_qemu_ld_i32:
2642         tcg_out_qemu_ld(s, args, 0);
2643         break;
2644     case INDEX_op_qemu_ld_i64:
2645         tcg_out_qemu_ld(s, args, 1);
2646         break;
2647     case INDEX_op_qemu_st_i32:
2648     case INDEX_op_qemu_st8_i32:
2649         tcg_out_qemu_st(s, args, 0);
2650         break;
2651     case INDEX_op_qemu_st_i64:
2652         tcg_out_qemu_st(s, args, 1);
2653         break;
2654
2655     OP_32_64(mulu2):
2656         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2657         break;
2658     OP_32_64(muls2):
2659         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2660         break;
2661     OP_32_64(add2):
2662         if (const_args[4]) {
2663             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2664         } else {
2665             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2666         }
2667         if (const_args[5]) {
2668             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2669         } else {
2670             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2671         }
2672         break;
2673     OP_32_64(sub2):
2674         if (const_args[4]) {
2675             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2676         } else {
2677             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2678         }
2679         if (const_args[5]) {
2680             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2681         } else {
2682             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2683         }
2684         break;
2685
2686 #if TCG_TARGET_REG_BITS == 32
2687     case INDEX_op_brcond2_i32:
2688         tcg_out_brcond2(s, args, const_args, 0);
2689         break;
2690     case INDEX_op_setcond2_i32:
2691         tcg_out_setcond2(s, args, const_args);
2692         break;
2693 #else /* TCG_TARGET_REG_BITS == 64 */
2694     case INDEX_op_ld32s_i64:
2695         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2696         break;
2697     case INDEX_op_ld_i64:
2698         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2699         break;
2700     case INDEX_op_st_i64:
2701         if (const_args[0]) {
2702             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2703             tcg_out32(s, a0);
2704         } else {
2705             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2706         }
2707         break;
2708
2709     case INDEX_op_brcond_i64:
2710         tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2711         break;
2712     case INDEX_op_setcond_i64:
2713         tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2714         break;
2715     case INDEX_op_movcond_i64:
2716         tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2717         break;
2718
2719     case INDEX_op_bswap64_i64:
2720         tcg_out_bswap64(s, a0);
2721         break;
2722     case INDEX_op_extu_i32_i64:
2723     case INDEX_op_ext32u_i64:
2724     case INDEX_op_extrl_i64_i32:
2725         tcg_out_ext32u(s, a0, a1);
2726         break;
2727     case INDEX_op_ext_i32_i64:
2728     case INDEX_op_ext32s_i64:
2729         tcg_out_ext32s(s, a0, a1);
2730         break;
2731     case INDEX_op_extrh_i64_i32:
2732         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2733         break;
2734 #endif
2735
2736     OP_32_64(deposit):
2737         if (args[3] == 0 && args[4] == 8) {
2738             /* load bits 0..7 */
2739             tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2740         } else if (args[3] == 8 && args[4] == 8) {
2741             /* load bits 8..15 */
2742             tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2743         } else if (args[3] == 0 && args[4] == 16) {
2744             /* load bits 0..15 */
2745             tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2746         } else {
2747             tcg_abort();
2748         }
2749         break;
2750
2751     case INDEX_op_extract_i64:
2752         if (a2 + args[3] == 32) {
2753             /* This is a 32-bit zero-extending right shift.  */
2754             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2755             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2756             break;
2757         }
2758         /* FALLTHRU */
2759     case INDEX_op_extract_i32:
2760         /* On the off-chance that we can use the high-byte registers.
2761            Otherwise we emit the same ext16 + shift pattern that we
2762            would have gotten from the normal tcg-op.c expansion.  */
2763         tcg_debug_assert(a2 == 8 && args[3] == 8);
2764         if (a1 < 4 && a0 < 8) {
2765             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2766         } else {
2767             tcg_out_ext16u(s, a0, a1);
2768             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2769         }
2770         break;
2771
2772     case INDEX_op_sextract_i32:
2773         /* We don't implement sextract_i64, as we cannot sign-extend to
2774            64-bits without using the REX prefix that explicitly excludes
2775            access to the high-byte registers.  */
2776         tcg_debug_assert(a2 == 8 && args[3] == 8);
2777         if (a1 < 4 && a0 < 8) {
2778             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2779         } else {
2780             tcg_out_ext16s(s, a0, a1, 0);
2781             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2782         }
2783         break;
2784
2785     OP_32_64(extract2):
2786         /* Note that SHRD outputs to the r/m operand.  */
2787         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2788         tcg_out8(s, args[3]);
2789         break;
2790
2791     case INDEX_op_mb:
2792         tcg_out_mb(s, a0);
2793         break;
2794     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2795     case INDEX_op_mov_i64:
2796     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2797     default:
2798         tcg_abort();
2799     }
2800
2801 #undef OP_32_64
2802 }
2803
2804 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2805                            unsigned vecl, unsigned vece,
2806                            const TCGArg args[TCG_MAX_OP_ARGS],
2807                            const int const_args[TCG_MAX_OP_ARGS])
2808 {
2809     static int const add_insn[4] = {
2810         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2811     };
2812     static int const ssadd_insn[4] = {
2813         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2814     };
2815     static int const usadd_insn[4] = {
2816         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2817     };
2818     static int const sub_insn[4] = {
2819         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2820     };
2821     static int const sssub_insn[4] = {
2822         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2823     };
2824     static int const ussub_insn[4] = {
2825         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2826     };
2827     static int const mul_insn[4] = {
2828         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2829     };
2830     static int const shift_imm_insn[4] = {
2831         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2832     };
2833     static int const cmpeq_insn[4] = {
2834         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2835     };
2836     static int const cmpgt_insn[4] = {
2837         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2838     };
2839     static int const punpckl_insn[4] = {
2840         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2841     };
2842     static int const punpckh_insn[4] = {
2843         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2844     };
2845     static int const packss_insn[4] = {
2846         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2847     };
2848     static int const packus_insn[4] = {
2849         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2850     };
2851     static int const smin_insn[4] = {
2852         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2853     };
2854     static int const smax_insn[4] = {
2855         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2856     };
2857     static int const umin_insn[4] = {
2858         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2859     };
2860     static int const umax_insn[4] = {
2861         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2862     };
2863     static int const rotlv_insn[4] = {
2864         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2865     };
2866     static int const rotrv_insn[4] = {
2867         OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2868     };
2869     static int const shlv_insn[4] = {
2870         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2871     };
2872     static int const shrv_insn[4] = {
2873         OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2874     };
2875     static int const sarv_insn[4] = {
2876         OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2877     };
2878     static int const shls_insn[4] = {
2879         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2880     };
2881     static int const shrs_insn[4] = {
2882         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2883     };
2884     static int const sars_insn[4] = {
2885         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2886     };
2887     static int const vpshldi_insn[4] = {
2888         OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2889     };
2890     static int const vpshldv_insn[4] = {
2891         OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2892     };
2893     static int const vpshrdv_insn[4] = {
2894         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2895     };
2896     static int const abs_insn[4] = {
2897         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2898     };
2899
2900     TCGType type = vecl + TCG_TYPE_V64;
2901     int insn, sub;
2902     TCGArg a0, a1, a2, a3;
2903
2904     a0 = args[0];
2905     a1 = args[1];
2906     a2 = args[2];
2907
2908     switch (opc) {
2909     case INDEX_op_add_vec:
2910         insn = add_insn[vece];
2911         goto gen_simd;
2912     case INDEX_op_ssadd_vec:
2913         insn = ssadd_insn[vece];
2914         goto gen_simd;
2915     case INDEX_op_usadd_vec:
2916         insn = usadd_insn[vece];
2917         goto gen_simd;
2918     case INDEX_op_sub_vec:
2919         insn = sub_insn[vece];
2920         goto gen_simd;
2921     case INDEX_op_sssub_vec:
2922         insn = sssub_insn[vece];
2923         goto gen_simd;
2924     case INDEX_op_ussub_vec:
2925         insn = ussub_insn[vece];
2926         goto gen_simd;
2927     case INDEX_op_mul_vec:
2928         insn = mul_insn[vece];
2929         goto gen_simd;
2930     case INDEX_op_and_vec:
2931         insn = OPC_PAND;
2932         goto gen_simd;
2933     case INDEX_op_or_vec:
2934         insn = OPC_POR;
2935         goto gen_simd;
2936     case INDEX_op_xor_vec:
2937         insn = OPC_PXOR;
2938         goto gen_simd;
2939     case INDEX_op_smin_vec:
2940         insn = smin_insn[vece];
2941         goto gen_simd;
2942     case INDEX_op_umin_vec:
2943         insn = umin_insn[vece];
2944         goto gen_simd;
2945     case INDEX_op_smax_vec:
2946         insn = smax_insn[vece];
2947         goto gen_simd;
2948     case INDEX_op_umax_vec:
2949         insn = umax_insn[vece];
2950         goto gen_simd;
2951     case INDEX_op_shlv_vec:
2952         insn = shlv_insn[vece];
2953         goto gen_simd;
2954     case INDEX_op_shrv_vec:
2955         insn = shrv_insn[vece];
2956         goto gen_simd;
2957     case INDEX_op_sarv_vec:
2958         insn = sarv_insn[vece];
2959         goto gen_simd;
2960     case INDEX_op_rotlv_vec:
2961         insn = rotlv_insn[vece];
2962         goto gen_simd;
2963     case INDEX_op_rotrv_vec:
2964         insn = rotrv_insn[vece];
2965         goto gen_simd;
2966     case INDEX_op_shls_vec:
2967         insn = shls_insn[vece];
2968         goto gen_simd;
2969     case INDEX_op_shrs_vec:
2970         insn = shrs_insn[vece];
2971         goto gen_simd;
2972     case INDEX_op_sars_vec:
2973         insn = sars_insn[vece];
2974         goto gen_simd;
2975     case INDEX_op_x86_punpckl_vec:
2976         insn = punpckl_insn[vece];
2977         goto gen_simd;
2978     case INDEX_op_x86_punpckh_vec:
2979         insn = punpckh_insn[vece];
2980         goto gen_simd;
2981     case INDEX_op_x86_packss_vec:
2982         insn = packss_insn[vece];
2983         goto gen_simd;
2984     case INDEX_op_x86_packus_vec:
2985         insn = packus_insn[vece];
2986         goto gen_simd;
2987     case INDEX_op_x86_vpshldv_vec:
2988         insn = vpshldv_insn[vece];
2989         a1 = a2;
2990         a2 = args[3];
2991         goto gen_simd;
2992     case INDEX_op_x86_vpshrdv_vec:
2993         insn = vpshrdv_insn[vece];
2994         a1 = a2;
2995         a2 = args[3];
2996         goto gen_simd;
2997 #if TCG_TARGET_REG_BITS == 32
2998     case INDEX_op_dup2_vec:
2999         /* First merge the two 32-bit inputs to a single 64-bit element. */
3000         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3001         /* Then replicate the 64-bit elements across the rest of the vector. */
3002         if (type != TCG_TYPE_V64) {
3003             tcg_out_dup_vec(s, type, MO_64, a0, a0);
3004         }
3005         break;
3006 #endif
3007     case INDEX_op_abs_vec:
3008         insn = abs_insn[vece];
3009         a2 = a1;
3010         a1 = 0;
3011         goto gen_simd;
3012     gen_simd:
3013         tcg_debug_assert(insn != OPC_UD2);
3014         if (type == TCG_TYPE_V256) {
3015             insn |= P_VEXL;
3016         }
3017         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3018         break;
3019
3020     case INDEX_op_cmp_vec:
3021         sub = args[3];
3022         if (sub == TCG_COND_EQ) {
3023             insn = cmpeq_insn[vece];
3024         } else if (sub == TCG_COND_GT) {
3025             insn = cmpgt_insn[vece];
3026         } else {
3027             g_assert_not_reached();
3028         }
3029         goto gen_simd;
3030
3031     case INDEX_op_andc_vec:
3032         insn = OPC_PANDN;
3033         if (type == TCG_TYPE_V256) {
3034             insn |= P_VEXL;
3035         }
3036         tcg_out_vex_modrm(s, insn, a0, a2, a1);
3037         break;
3038
3039     case INDEX_op_shli_vec:
3040         insn = shift_imm_insn[vece];
3041         sub = 6;
3042         goto gen_shift;
3043     case INDEX_op_shri_vec:
3044         insn = shift_imm_insn[vece];
3045         sub = 2;
3046         goto gen_shift;
3047     case INDEX_op_sari_vec:
3048         if (vece == MO_64) {
3049             insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3050         } else {
3051             insn = shift_imm_insn[vece];
3052         }
3053         sub = 4;
3054         goto gen_shift;
3055     case INDEX_op_rotli_vec:
3056         insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3057         if (vece == MO_64) {
3058             insn |= P_VEXW;
3059         }
3060         sub = 1;
3061         goto gen_shift;
3062     gen_shift:
3063         tcg_debug_assert(vece != MO_8);
3064         if (type == TCG_TYPE_V256) {
3065             insn |= P_VEXL;
3066         }
3067         tcg_out_vex_modrm(s, insn, sub, a0, a1);
3068         tcg_out8(s, a2);
3069         break;
3070
3071     case INDEX_op_ld_vec:
3072         tcg_out_ld(s, type, a0, a1, a2);
3073         break;
3074     case INDEX_op_st_vec:
3075         tcg_out_st(s, type, a0, a1, a2);
3076         break;
3077     case INDEX_op_dupm_vec:
3078         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3079         break;
3080
3081     case INDEX_op_x86_shufps_vec:
3082         insn = OPC_SHUFPS;
3083         sub = args[3];
3084         goto gen_simd_imm8;
3085     case INDEX_op_x86_blend_vec:
3086         if (vece == MO_16) {
3087             insn = OPC_PBLENDW;
3088         } else if (vece == MO_32) {
3089             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3090         } else {
3091             g_assert_not_reached();
3092         }
3093         sub = args[3];
3094         goto gen_simd_imm8;
3095     case INDEX_op_x86_vperm2i128_vec:
3096         insn = OPC_VPERM2I128;
3097         sub = args[3];
3098         goto gen_simd_imm8;
3099     case INDEX_op_x86_vpshldi_vec:
3100         insn = vpshldi_insn[vece];
3101         sub = args[3];
3102         goto gen_simd_imm8;
3103
3104     case INDEX_op_not_vec:
3105         insn = OPC_VPTERNLOGQ;
3106         a2 = a1;
3107         sub = 0x33; /* !B */
3108         goto gen_simd_imm8;
3109     case INDEX_op_nor_vec:
3110         insn = OPC_VPTERNLOGQ;
3111         sub = 0x11; /* norCB */
3112         goto gen_simd_imm8;
3113     case INDEX_op_nand_vec:
3114         insn = OPC_VPTERNLOGQ;
3115         sub = 0x77; /* nandCB */
3116         goto gen_simd_imm8;
3117     case INDEX_op_eqv_vec:
3118         insn = OPC_VPTERNLOGQ;
3119         sub = 0x99; /* xnorCB */
3120         goto gen_simd_imm8;
3121     case INDEX_op_orc_vec:
3122         insn = OPC_VPTERNLOGQ;
3123         sub = 0xdd; /* orB!C */
3124         goto gen_simd_imm8;
3125
3126     case INDEX_op_bitsel_vec:
3127         insn = OPC_VPTERNLOGQ;
3128         a3 = args[3];
3129         if (a0 == a1) {
3130             a1 = a2;
3131             a2 = a3;
3132             sub = 0xca; /* A?B:C */
3133         } else if (a0 == a2) {
3134             a2 = a3;
3135             sub = 0xe2; /* B?A:C */
3136         } else {
3137             tcg_out_mov(s, type, a0, a3);
3138             sub = 0xb8; /* B?C:A */
3139         }
3140         goto gen_simd_imm8;
3141
3142     gen_simd_imm8:
3143         tcg_debug_assert(insn != OPC_UD2);
3144         if (type == TCG_TYPE_V256) {
3145             insn |= P_VEXL;
3146         }
3147         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3148         tcg_out8(s, sub);
3149         break;
3150
3151     case INDEX_op_x86_vpblendvb_vec:
3152         insn = OPC_VPBLENDVB;
3153         if (type == TCG_TYPE_V256) {
3154             insn |= P_VEXL;
3155         }
3156         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3157         tcg_out8(s, args[3] << 4);
3158         break;
3159
3160     case INDEX_op_x86_psrldq_vec:
3161         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3162         tcg_out8(s, a2);
3163         break;
3164
3165     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3166     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3167     default:
3168         g_assert_not_reached();
3169     }
3170 }
3171
3172 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3173 {
3174     switch (op) {
3175     case INDEX_op_goto_ptr:
3176         return C_O0_I1(r);
3177
3178     case INDEX_op_ld8u_i32:
3179     case INDEX_op_ld8u_i64:
3180     case INDEX_op_ld8s_i32:
3181     case INDEX_op_ld8s_i64:
3182     case INDEX_op_ld16u_i32:
3183     case INDEX_op_ld16u_i64:
3184     case INDEX_op_ld16s_i32:
3185     case INDEX_op_ld16s_i64:
3186     case INDEX_op_ld_i32:
3187     case INDEX_op_ld32u_i64:
3188     case INDEX_op_ld32s_i64:
3189     case INDEX_op_ld_i64:
3190         return C_O1_I1(r, r);
3191
3192     case INDEX_op_st8_i32:
3193     case INDEX_op_st8_i64:
3194         return C_O0_I2(qi, r);
3195
3196     case INDEX_op_st16_i32:
3197     case INDEX_op_st16_i64:
3198     case INDEX_op_st_i32:
3199     case INDEX_op_st32_i64:
3200         return C_O0_I2(ri, r);
3201
3202     case INDEX_op_st_i64:
3203         return C_O0_I2(re, r);
3204
3205     case INDEX_op_add_i32:
3206     case INDEX_op_add_i64:
3207         return C_O1_I2(r, r, re);
3208
3209     case INDEX_op_sub_i32:
3210     case INDEX_op_sub_i64:
3211     case INDEX_op_mul_i32:
3212     case INDEX_op_mul_i64:
3213     case INDEX_op_or_i32:
3214     case INDEX_op_or_i64:
3215     case INDEX_op_xor_i32:
3216     case INDEX_op_xor_i64:
3217         return C_O1_I2(r, 0, re);
3218
3219     case INDEX_op_and_i32:
3220     case INDEX_op_and_i64:
3221         return C_O1_I2(r, 0, reZ);
3222
3223     case INDEX_op_andc_i32:
3224     case INDEX_op_andc_i64:
3225         return C_O1_I2(r, r, rI);
3226
3227     case INDEX_op_shl_i32:
3228     case INDEX_op_shl_i64:
3229     case INDEX_op_shr_i32:
3230     case INDEX_op_shr_i64:
3231     case INDEX_op_sar_i32:
3232     case INDEX_op_sar_i64:
3233         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3234
3235     case INDEX_op_rotl_i32:
3236     case INDEX_op_rotl_i64:
3237     case INDEX_op_rotr_i32:
3238     case INDEX_op_rotr_i64:
3239         return C_O1_I2(r, 0, ci);
3240
3241     case INDEX_op_brcond_i32:
3242     case INDEX_op_brcond_i64:
3243         return C_O0_I2(r, re);
3244
3245     case INDEX_op_bswap16_i32:
3246     case INDEX_op_bswap16_i64:
3247     case INDEX_op_bswap32_i32:
3248     case INDEX_op_bswap32_i64:
3249     case INDEX_op_bswap64_i64:
3250     case INDEX_op_neg_i32:
3251     case INDEX_op_neg_i64:
3252     case INDEX_op_not_i32:
3253     case INDEX_op_not_i64:
3254     case INDEX_op_extrh_i64_i32:
3255         return C_O1_I1(r, 0);
3256
3257     case INDEX_op_ext8s_i32:
3258     case INDEX_op_ext8s_i64:
3259     case INDEX_op_ext8u_i32:
3260     case INDEX_op_ext8u_i64:
3261         return C_O1_I1(r, q);
3262
3263     case INDEX_op_ext16s_i32:
3264     case INDEX_op_ext16s_i64:
3265     case INDEX_op_ext16u_i32:
3266     case INDEX_op_ext16u_i64:
3267     case INDEX_op_ext32s_i64:
3268     case INDEX_op_ext32u_i64:
3269     case INDEX_op_ext_i32_i64:
3270     case INDEX_op_extu_i32_i64:
3271     case INDEX_op_extrl_i64_i32:
3272     case INDEX_op_extract_i32:
3273     case INDEX_op_extract_i64:
3274     case INDEX_op_sextract_i32:
3275     case INDEX_op_ctpop_i32:
3276     case INDEX_op_ctpop_i64:
3277         return C_O1_I1(r, r);
3278
3279     case INDEX_op_extract2_i32:
3280     case INDEX_op_extract2_i64:
3281         return C_O1_I2(r, 0, r);
3282
3283     case INDEX_op_deposit_i32:
3284     case INDEX_op_deposit_i64:
3285         return C_O1_I2(Q, 0, Q);
3286
3287     case INDEX_op_setcond_i32:
3288     case INDEX_op_setcond_i64:
3289         return C_O1_I2(q, r, re);
3290
3291     case INDEX_op_movcond_i32:
3292     case INDEX_op_movcond_i64:
3293         return C_O1_I4(r, r, re, r, 0);
3294
3295     case INDEX_op_div2_i32:
3296     case INDEX_op_div2_i64:
3297     case INDEX_op_divu2_i32:
3298     case INDEX_op_divu2_i64:
3299         return C_O2_I3(a, d, 0, 1, r);
3300
3301     case INDEX_op_mulu2_i32:
3302     case INDEX_op_mulu2_i64:
3303     case INDEX_op_muls2_i32:
3304     case INDEX_op_muls2_i64:
3305         return C_O2_I2(a, d, a, r);
3306
3307     case INDEX_op_add2_i32:
3308     case INDEX_op_add2_i64:
3309     case INDEX_op_sub2_i32:
3310     case INDEX_op_sub2_i64:
3311         return C_O2_I4(r, r, 0, 1, re, re);
3312
3313     case INDEX_op_ctz_i32:
3314     case INDEX_op_ctz_i64:
3315         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3316
3317     case INDEX_op_clz_i32:
3318     case INDEX_op_clz_i64:
3319         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3320
3321     case INDEX_op_qemu_ld_i32:
3322         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3323                 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3324
3325     case INDEX_op_qemu_st_i32:
3326         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3327                 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3328     case INDEX_op_qemu_st8_i32:
3329         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3330                 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3331
3332     case INDEX_op_qemu_ld_i64:
3333         return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3334                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3335                 : C_O2_I2(r, r, L, L));
3336
3337     case INDEX_op_qemu_st_i64:
3338         return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3339                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3340                 : C_O0_I4(L, L, L, L));
3341
3342     case INDEX_op_brcond2_i32:
3343         return C_O0_I4(r, r, ri, ri);
3344
3345     case INDEX_op_setcond2_i32:
3346         return C_O1_I4(r, r, r, ri, ri);
3347
3348     case INDEX_op_ld_vec:
3349     case INDEX_op_dupm_vec:
3350         return C_O1_I1(x, r);
3351
3352     case INDEX_op_st_vec:
3353         return C_O0_I2(x, r);
3354
3355     case INDEX_op_add_vec:
3356     case INDEX_op_sub_vec:
3357     case INDEX_op_mul_vec:
3358     case INDEX_op_and_vec:
3359     case INDEX_op_or_vec:
3360     case INDEX_op_xor_vec:
3361     case INDEX_op_andc_vec:
3362     case INDEX_op_orc_vec:
3363     case INDEX_op_nand_vec:
3364     case INDEX_op_nor_vec:
3365     case INDEX_op_eqv_vec:
3366     case INDEX_op_ssadd_vec:
3367     case INDEX_op_usadd_vec:
3368     case INDEX_op_sssub_vec:
3369     case INDEX_op_ussub_vec:
3370     case INDEX_op_smin_vec:
3371     case INDEX_op_umin_vec:
3372     case INDEX_op_smax_vec:
3373     case INDEX_op_umax_vec:
3374     case INDEX_op_shlv_vec:
3375     case INDEX_op_shrv_vec:
3376     case INDEX_op_sarv_vec:
3377     case INDEX_op_rotlv_vec:
3378     case INDEX_op_rotrv_vec:
3379     case INDEX_op_shls_vec:
3380     case INDEX_op_shrs_vec:
3381     case INDEX_op_sars_vec:
3382     case INDEX_op_cmp_vec:
3383     case INDEX_op_x86_shufps_vec:
3384     case INDEX_op_x86_blend_vec:
3385     case INDEX_op_x86_packss_vec:
3386     case INDEX_op_x86_packus_vec:
3387     case INDEX_op_x86_vperm2i128_vec:
3388     case INDEX_op_x86_punpckl_vec:
3389     case INDEX_op_x86_punpckh_vec:
3390     case INDEX_op_x86_vpshldi_vec:
3391 #if TCG_TARGET_REG_BITS == 32
3392     case INDEX_op_dup2_vec:
3393 #endif
3394         return C_O1_I2(x, x, x);
3395
3396     case INDEX_op_abs_vec:
3397     case INDEX_op_dup_vec:
3398     case INDEX_op_not_vec:
3399     case INDEX_op_shli_vec:
3400     case INDEX_op_shri_vec:
3401     case INDEX_op_sari_vec:
3402     case INDEX_op_rotli_vec:
3403     case INDEX_op_x86_psrldq_vec:
3404         return C_O1_I1(x, x);
3405
3406     case INDEX_op_x86_vpshldv_vec:
3407     case INDEX_op_x86_vpshrdv_vec:
3408         return C_O1_I3(x, 0, x, x);
3409
3410     case INDEX_op_bitsel_vec:
3411     case INDEX_op_x86_vpblendvb_vec:
3412         return C_O1_I3(x, x, x, x);
3413
3414     default:
3415         g_assert_not_reached();
3416     }
3417 }
3418
3419 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3420 {
3421     switch (opc) {
3422     case INDEX_op_add_vec:
3423     case INDEX_op_sub_vec:
3424     case INDEX_op_and_vec:
3425     case INDEX_op_or_vec:
3426     case INDEX_op_xor_vec:
3427     case INDEX_op_andc_vec:
3428     case INDEX_op_orc_vec:
3429     case INDEX_op_nand_vec:
3430     case INDEX_op_nor_vec:
3431     case INDEX_op_eqv_vec:
3432     case INDEX_op_not_vec:
3433     case INDEX_op_bitsel_vec:
3434         return 1;
3435     case INDEX_op_cmp_vec:
3436     case INDEX_op_cmpsel_vec:
3437         return -1;
3438
3439     case INDEX_op_rotli_vec:
3440         return have_avx512vl && vece >= MO_32 ? 1 : -1;
3441
3442     case INDEX_op_shli_vec:
3443     case INDEX_op_shri_vec:
3444         /* We must expand the operation for MO_8.  */
3445         return vece == MO_8 ? -1 : 1;
3446
3447     case INDEX_op_sari_vec:
3448         switch (vece) {
3449         case MO_8:
3450             return -1;
3451         case MO_16:
3452         case MO_32:
3453             return 1;
3454         case MO_64:
3455             if (have_avx512vl) {
3456                 return 1;
3457             }
3458             /*
3459              * We can emulate this for MO_64, but it does not pay off
3460              * unless we're producing at least 4 values.
3461              */
3462             return type >= TCG_TYPE_V256 ? -1 : 0;
3463         }
3464         return 0;
3465
3466     case INDEX_op_shls_vec:
3467     case INDEX_op_shrs_vec:
3468         return vece >= MO_16;
3469     case INDEX_op_sars_vec:
3470         switch (vece) {
3471         case MO_16:
3472         case MO_32:
3473             return 1;
3474         case MO_64:
3475             return have_avx512vl;
3476         }
3477         return 0;
3478     case INDEX_op_rotls_vec:
3479         return vece >= MO_16 ? -1 : 0;
3480
3481     case INDEX_op_shlv_vec:
3482     case INDEX_op_shrv_vec:
3483         switch (vece) {
3484         case MO_16:
3485             return have_avx512bw;
3486         case MO_32:
3487         case MO_64:
3488             return have_avx2;
3489         }
3490         return 0;
3491     case INDEX_op_sarv_vec:
3492         switch (vece) {
3493         case MO_16:
3494             return have_avx512bw;
3495         case MO_32:
3496             return have_avx2;
3497         case MO_64:
3498             return have_avx512vl;
3499         }
3500         return 0;
3501     case INDEX_op_rotlv_vec:
3502     case INDEX_op_rotrv_vec:
3503         switch (vece) {
3504         case MO_16:
3505             return have_avx512vbmi2 ? -1 : 0;
3506         case MO_32:
3507         case MO_64:
3508             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3509         }
3510         return 0;
3511
3512     case INDEX_op_mul_vec:
3513         switch (vece) {
3514         case MO_8:
3515             return -1;
3516         case MO_64:
3517             return have_avx512dq;
3518         }
3519         return 1;
3520
3521     case INDEX_op_ssadd_vec:
3522     case INDEX_op_usadd_vec:
3523     case INDEX_op_sssub_vec:
3524     case INDEX_op_ussub_vec:
3525         return vece <= MO_16;
3526     case INDEX_op_smin_vec:
3527     case INDEX_op_smax_vec:
3528     case INDEX_op_umin_vec:
3529     case INDEX_op_umax_vec:
3530     case INDEX_op_abs_vec:
3531         return vece <= MO_32 || have_avx512vl;
3532
3533     default:
3534         return 0;
3535     }
3536 }
3537
3538 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3539                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3540 {
3541     TCGv_vec t1, t2;
3542
3543     tcg_debug_assert(vece == MO_8);
3544
3545     t1 = tcg_temp_new_vec(type);
3546     t2 = tcg_temp_new_vec(type);
3547
3548     /*
3549      * Unpack to W, shift, and repack.  Tricky bits:
3550      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3551      *     i.e. duplicate in other half of the 16-bit lane.
3552      * (2) For right-shift, add 8 so that the high half of the lane
3553      *     becomes zero.  For left-shift, and left-rotate, we must
3554      *     shift up and down again.
3555      * (3) Step 2 leaves high half zero such that PACKUSWB
3556      *     (pack with unsigned saturation) does not modify
3557      *     the quantity.
3558      */
3559     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3560               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3561     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3562               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3563
3564     if (opc != INDEX_op_rotli_vec) {
3565         imm += 8;
3566     }
3567     if (opc == INDEX_op_shri_vec) {
3568         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3569         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3570     } else {
3571         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3572         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3573         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3574         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3575     }
3576
3577     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3578               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3579     tcg_temp_free_vec(t1);
3580     tcg_temp_free_vec(t2);
3581 }
3582
3583 static void expand_vec_sari(TCGType type, unsigned vece,
3584                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3585 {
3586     TCGv_vec t1, t2;
3587
3588     switch (vece) {
3589     case MO_8:
3590         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3591         t1 = tcg_temp_new_vec(type);
3592         t2 = tcg_temp_new_vec(type);
3593         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3594                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3595         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3596                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3597         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3598         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3599         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3600                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3601         tcg_temp_free_vec(t1);
3602         tcg_temp_free_vec(t2);
3603         break;
3604
3605     case MO_64:
3606         if (imm <= 32) {
3607             /*
3608              * We can emulate a small sign extend by performing an arithmetic
3609              * 32-bit shift and overwriting the high half of a 64-bit logical
3610              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3611              * does not, so we have to bound the smaller shift -- we get the
3612              * same result in the high half either way.
3613              */
3614             t1 = tcg_temp_new_vec(type);
3615             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3616             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3617             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3618                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3619                       tcgv_vec_arg(t1), 0xaa);
3620             tcg_temp_free_vec(t1);
3621         } else {
3622             /* Otherwise we will need to use a compare vs 0 to produce
3623              * the sign-extend, shift and merge.
3624              */
3625             t1 = tcg_const_zeros_vec(type);
3626             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3627             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3628             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3629             tcg_gen_or_vec(MO_64, v0, v0, t1);
3630             tcg_temp_free_vec(t1);
3631         }
3632         break;
3633
3634     default:
3635         g_assert_not_reached();
3636     }
3637 }
3638
3639 static void expand_vec_rotli(TCGType type, unsigned vece,
3640                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3641 {
3642     TCGv_vec t;
3643
3644     if (vece == MO_8) {
3645         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3646         return;
3647     }
3648
3649     if (have_avx512vbmi2) {
3650         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3651                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3652         return;
3653     }
3654
3655     t = tcg_temp_new_vec(type);
3656     tcg_gen_shli_vec(vece, t, v1, imm);
3657     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3658     tcg_gen_or_vec(vece, v0, v0, t);
3659     tcg_temp_free_vec(t);
3660 }
3661
3662 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3663                             TCGv_vec v1, TCGv_vec sh, bool right)
3664 {
3665     TCGv_vec t;
3666
3667     if (have_avx512vbmi2) {
3668         vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3669                   type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3670                   tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3671         return;
3672     }
3673
3674     t = tcg_temp_new_vec(type);
3675     tcg_gen_dupi_vec(vece, t, 8 << vece);
3676     tcg_gen_sub_vec(vece, t, t, sh);
3677     if (right) {
3678         tcg_gen_shlv_vec(vece, t, v1, t);
3679         tcg_gen_shrv_vec(vece, v0, v1, sh);
3680     } else {
3681         tcg_gen_shrv_vec(vece, t, v1, t);
3682         tcg_gen_shlv_vec(vece, v0, v1, sh);
3683     }
3684     tcg_gen_or_vec(vece, v0, v0, t);
3685     tcg_temp_free_vec(t);
3686 }
3687
3688 static void expand_vec_rotls(TCGType type, unsigned vece,
3689                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3690 {
3691     TCGv_vec t = tcg_temp_new_vec(type);
3692
3693     tcg_debug_assert(vece != MO_8);
3694
3695     if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3696         tcg_gen_dup_i32_vec(vece, t, lsh);
3697         if (vece >= MO_32) {
3698             tcg_gen_rotlv_vec(vece, v0, v1, t);
3699         } else {
3700             expand_vec_rotv(type, vece, v0, v1, t, false);
3701         }
3702     } else {
3703         TCGv_i32 rsh = tcg_temp_new_i32();
3704
3705         tcg_gen_neg_i32(rsh, lsh);
3706         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3707         tcg_gen_shls_vec(vece, t, v1, lsh);
3708         tcg_gen_shrs_vec(vece, v0, v1, rsh);
3709         tcg_gen_or_vec(vece, v0, v0, t);
3710
3711         tcg_temp_free_i32(rsh);
3712     }
3713
3714     tcg_temp_free_vec(t);
3715 }
3716
/*
 * Expand a byte-element multiply V0 = V1 * V2 via 16-bit multiplies.
 * Only VECE == MO_8 is handled; TYPE must be V64, V128 or V256.
 */
static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4, zero;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits of each
     * word before using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
    switch (type) {
    case TCG_TYPE_V64:
        /* All work is done in V128 temporaries; only the low unpack is used. */
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        /* Pack t1 with itself. */
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        /* t1/t2 hold the low unpacks, t3/t4 the high unpacks. */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        /* NOTE(review): the zero constant is V128 even when type is V256. */
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        /* Recombine the low (t1) and high (t3) products into bytes. */
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}
3782
/*
 * Emit a vector compare of V1 against V2 under COND into V0, using only
 * the EQ and GT forms of cmp_vec (see the assertion below).
 *
 * Other conditions are reduced to EQ/GT by a combination of operand
 * swapping, biasing, unsigned min/max, and logical inversion.  The
 * inversion itself is NOT emitted here: the function returns true when
 * the caller still has to invert the mask written to V0.
 */
static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    /* Bit flags describing how COND is reduced to EQ or GT. */
    enum {
        NEED_INV  = 1,   /* test the inverse condition; caller inverts */
        NEED_SWAP = 2,   /* exchange the two operands */
        NEED_BIAS = 4,   /* subtract the sign bit, then compare signed */
        NEED_UMIN = 8,   /* compare v1 == umin(v1, v2) */
        NEED_UMAX = 16,  /* compare v1 == umax(v1, v2) */
    };
    TCGv_vec t1, t2, t3;
    uint8_t fixup;

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_GT:
        fixup = 0;
        break;
    case TCG_COND_NE:
    case TCG_COND_LE:
        fixup = NEED_INV;
        break;
    case TCG_COND_LT:
        fixup = NEED_SWAP;
        break;
    case TCG_COND_GE:
        fixup = NEED_SWAP | NEED_INV;
        break;
    /*
     * Unsigned conditions: use umin/umax when tcg_can_emit_vec_op reports
     * them for this type and element size, otherwise bias the operands.
     */
    case TCG_COND_LEU:
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
            fixup = NEED_UMIN;
        } else {
            fixup = NEED_BIAS | NEED_INV;
        }
        break;
    case TCG_COND_GTU:
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
            fixup = NEED_UMIN | NEED_INV;
        } else {
            fixup = NEED_BIAS;
        }
        break;
    case TCG_COND_GEU:
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
            fixup = NEED_UMAX;
        } else {
            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
        }
        break;
    case TCG_COND_LTU:
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
            fixup = NEED_UMAX | NEED_INV;
        } else {
            fixup = NEED_BIAS | NEED_SWAP;
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }
    if (fixup & NEED_SWAP) {
        /* t1 is only scratch storage for the swap here. */
        t1 = v1, v1 = v2, v2 = t1;
        cond = tcg_swap_cond(cond);
    }

    /* From here on, non-NULL t1/t2 mark temporaries that must be freed. */
    t1 = t2 = NULL;
    if (fixup & (NEED_UMIN | NEED_UMAX)) {
        t1 = tcg_temp_new_vec(type);
        if (fixup & NEED_UMIN) {
            tcg_gen_umin_vec(vece, t1, v1, v2);
        } else {
            tcg_gen_umax_vec(vece, t1, v1, v2);
        }
        /* The remaining test is whether v1 equals the min/max. */
        v2 = t1;
        cond = TCG_COND_EQ;
    } else if (fixup & NEED_BIAS) {
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        /* t3 is a constant with only the top bit of each element set. */
        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
    }

    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse.  */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    /* t2 is only ever allocated together with t1. */
    if (t1) {
        tcg_temp_free_vec(t1);
        if (t2) {
            tcg_temp_free_vec(t2);
        }
    }
    /* Nonzero means the caller must invert the result. */
    return fixup & NEED_INV;
}
3885
3886 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3887                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3888 {
3889     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3890         tcg_gen_not_vec(vece, v0, v0);
3891     }
3892 }
3893
3894 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3895                               TCGv_vec c1, TCGv_vec c2,
3896                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3897 {
3898     TCGv_vec t = tcg_temp_new_vec(type);
3899
3900     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3901         /* Invert the sense of the compare by swapping arguments.  */
3902         TCGv_vec x;
3903         x = v3, v3 = v4, v4 = x;
3904     }
3905     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3906               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3907               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3908     tcg_temp_free_vec(t);
3909 }
3910
3911 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3912                        TCGArg a0, ...)
3913 {
3914     va_list va;
3915     TCGArg a2;
3916     TCGv_vec v0, v1, v2, v3, v4;
3917
3918     va_start(va, a0);
3919     v0 = temp_tcgv_vec(arg_temp(a0));
3920     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3921     a2 = va_arg(va, TCGArg);
3922
3923     switch (opc) {
3924     case INDEX_op_shli_vec:
3925     case INDEX_op_shri_vec:
3926         expand_vec_shi(type, vece, opc, v0, v1, a2);
3927         break;
3928
3929     case INDEX_op_sari_vec:
3930         expand_vec_sari(type, vece, v0, v1, a2);
3931         break;
3932
3933     case INDEX_op_rotli_vec:
3934         expand_vec_rotli(type, vece, v0, v1, a2);
3935         break;
3936
3937     case INDEX_op_rotls_vec:
3938         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3939         break;
3940
3941     case INDEX_op_rotlv_vec:
3942         v2 = temp_tcgv_vec(arg_temp(a2));
3943         expand_vec_rotv(type, vece, v0, v1, v2, false);
3944         break;
3945     case INDEX_op_rotrv_vec:
3946         v2 = temp_tcgv_vec(arg_temp(a2));
3947         expand_vec_rotv(type, vece, v0, v1, v2, true);
3948         break;
3949
3950     case INDEX_op_mul_vec:
3951         v2 = temp_tcgv_vec(arg_temp(a2));
3952         expand_vec_mul(type, vece, v0, v1, v2);
3953         break;
3954
3955     case INDEX_op_cmp_vec:
3956         v2 = temp_tcgv_vec(arg_temp(a2));
3957         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3958         break;
3959
3960     case INDEX_op_cmpsel_vec:
3961         v2 = temp_tcgv_vec(arg_temp(a2));
3962         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3963         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3964         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3965         break;
3966
3967     default:
3968         break;
3969     }
3970
3971     va_end(va);
3972 }
3973
/*
 * Registers saved by the prologue and restored by the epilogue.
 *
 * tcg_target_qemu_prologue pushes them in array order and pops them in
 * reverse order.  The register entries in debug_frame.fde_reg_ofs must
 * follow the same ordering, so keep the two in sync when editing.
 */
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    /* Saved in addition on Win64 hosts. */
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};
3993
/*
 * Compute the frame size via macros so that the same values are shared
 * between tcg_target_qemu_prologue and the debug_frame unwind data used
 * by tcg_register_jit.
 */

/*
 * Bytes occupied by the pushed callee-saved registers plus one extra
 * register-sized slot (cf. the %rip/%eip entry that precedes the saved
 * registers in debug_frame).
 */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * Total frame: PUSH_SIZE plus the static call argument area plus the TCG
 * temp buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4007
/*
 * Generate global QEMU prologue and epilogue code.
 *
 * The emitted prologue saves the callee-saved registers, loads TCG_AREG0,
 * reserves the stack frame and jumps to the translation block.  The
 * epilogue is emitted directly after it; its two entry points are recorded
 * in tcg_code_gen_epilogue (return value forced to 0) and tb_ret_addr.
 */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    /* The pushes below cover PUSH_SIZE; only the remainder is subtracted. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    /*
     * 32-bit host: both values are read from the stack, above the saved
     * registers and one further slot (presumably the return address, so
     * these would be the first and second stack arguments -- confirm
     * against the caller).
     */
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    /* The offset includes stack_addend because ESP was just lowered. */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
    /*
     * Pick how guest_base is applied: a segment if setup_guest_base_seg()
     * provides one, else a 32-bit displacement if the value fits, else a
     * dedicated (reserved) index register loaded here.
     */
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base_seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base_offset = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base_index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
        }
    }
# endif
    /* 64-bit host: first call argument -> AREG0, second is the jump target. */
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    /* Release the frame reserved in the prologue. */
    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    /* Emit VZEROUPPER before returning when AVX2 is in use. */
    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    /* Restore the callee-saved registers in reverse push order. */
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}
4075
/*
 * Fill COUNT bytes starting at P with 0x90, the x86 one-byte NOP opcode.
 * NOTE(review): COUNT is passed to memset as a byte count; this equals an
 * instruction-unit count only if tcg_insn_unit is one byte wide -- confirm.
 */
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}
4080
4081 static void tcg_target_init(TCGContext *s)
4082 {
4083 #ifdef CONFIG_CPUID_H
4084     unsigned a, b, c, d, b7 = 0, c7 = 0;
4085     unsigned max = __get_cpuid_max(0, 0);
4086
4087     if (max >= 7) {
4088         /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
4089         __cpuid_count(7, 0, a, b7, c7, d);
4090         have_bmi1 = (b7 & bit_BMI) != 0;
4091         have_bmi2 = (b7 & bit_BMI2) != 0;
4092     }
4093
4094     if (max >= 1) {
4095         __cpuid(1, a, b, c, d);
4096 #ifndef have_cmov
4097         /* For 32-bit, 99% certainty that we're running on hardware that
4098            supports cmov, but we still need to check.  In case cmov is not
4099            available, we'll use a small forward branch.  */
4100         have_cmov = (d & bit_CMOV) != 0;
4101 #endif
4102
4103         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
4104            need to probe for it.  */
4105         have_movbe = (c & bit_MOVBE) != 0;
4106         have_popcnt = (c & bit_POPCNT) != 0;
4107
4108         /* There are a number of things we must check before we can be
4109            sure of not hitting invalid opcode.  */
4110         if (c & bit_OSXSAVE) {
4111             unsigned xcrl, xcrh;
4112             /* The xgetbv instruction is not available to older versions of
4113              * the assembler, so we encode the instruction manually.
4114              */
4115             asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
4116             if ((xcrl & 6) == 6) {
4117                 have_avx1 = (c & bit_AVX) != 0;
4118                 have_avx2 = (b7 & bit_AVX2) != 0;
4119
4120                 /*
4121                  * There are interesting instructions in AVX512, so long
4122                  * as we have AVX512VL, which indicates support for EVEX
4123                  * on sizes smaller than 512 bits.  We are required to
4124                  * check that OPMASK and all extended ZMM state are enabled
4125                  * even if we're not using them -- the insns will fault.
4126                  */
4127                 if ((xcrl & 0xe0) == 0xe0
4128                     && (b7 & bit_AVX512F)
4129                     && (b7 & bit_AVX512VL)) {
4130                     have_avx512vl = true;
4131                     have_avx512bw = (b7 & bit_AVX512BW) != 0;
4132                     have_avx512dq = (b7 & bit_AVX512DQ) != 0;
4133                     have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
4134                 }
4135             }
4136         }
4137     }
4138
4139     max = __get_cpuid_max(0x8000000, 0);
4140     if (max >= 1) {
4141         __cpuid(0x80000001, a, b, c, d);
4142         /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
4143         have_lzcnt = (c & bit_LZCNT) != 0;
4144     }
4145 #endif /* CONFIG_CPUID_H */
4146
4147     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4148     if (TCG_TARGET_REG_BITS == 64) {
4149         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4150     }
4151     if (have_avx1) {
4152         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4153         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4154     }
4155     if (have_avx2) {
4156         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4157     }
4158
4159     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4160     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4161     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4162     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4163     if (TCG_TARGET_REG_BITS == 64) {
4164 #if !defined(_WIN64)
4165         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4166         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4167 #endif
4168         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4169         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4170         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4171         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4172     }
4173
4174     s->reserved_regs = 0;
4175     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4176 }
4177
/*
 * Unwind description handed to tcg_register_jit_int: the common header
 * followed by the raw call-frame instruction bytes.
 */
typedef struct {
    DebugFrameHeader h;
    /* DW_CFA_def_cfa: opcode, register, and a two-byte uleb128 FRAME_SIZE. */
    uint8_t fde_def_cfa[4];
    /*
     * DW_CFA_offset pairs (register, factored offset).  Sized for the
     * seven pairs of the 64-bit initialiser; the 32-bit initialiser
     * fills only ten bytes and leaves the rest zero.
     */
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
/* Two uleb128 bytes carry 14 payload bits, hence the 1 << 14 limit. */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4186
/*
 * Static unwind data for the generated prologue, one variant per host
 * word size.  Nothing is defined (and ELF_HOST_MACHINE stays undefined)
 * on hosts without ELF.
 *
 * In fde_reg_ofs each pair is "0x80 | register number" followed by an
 * offset that is scaled by cie.data_align, which yields the byte offsets
 * given in the per-line comments.
 */
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
4246
#if defined(ELF_HOST_MACHINE)
/*
 * Register the code buffer BUF of BUF_SIZE bytes together with the static
 * debug_frame unwind data.  Only built when ELF_HOST_MACHINE is defined,
 * i.e. when a debug_frame exists for this host.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif