Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.12-20180410' into staging

[qemu.git] / tcg / tcg.h
diff --git a/tcg/tcg.h b/tcg/tcg.h

index 83da5fb3f0d937e3700a75014675de6a415115e7..30896ca304ed6fc851cf6919f1c32d1ec2b2aa2c 100644 (file)
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -26,9 +26,30 @@
  #define TCG_H
  
  #include "qemu-common.h"
+#include "cpu.h"
+#include "exec/tb-context.h"
  #include "qemu/bitops.h"
+#include "qemu/queue.h"
+#include "tcg-mo.h"
  #include "tcg-target.h"
  
+/* XXX: make safe guess about sizes */
+#define MAX_OP_PER_INSTR 266
+
+#if HOST_LONG_BITS == 32
+#define MAX_OPC_PARAM_PER_ARG 2
+#else
+#define MAX_OPC_PARAM_PER_ARG 1
+#endif
+#define MAX_OPC_PARAM_IARGS 6
+#define MAX_OPC_PARAM_OARGS 1
+#define MAX_OPC_PARAM_ARGS (MAX_OPC_PARAM_IARGS + MAX_OPC_PARAM_OARGS)
+
+/* A Call op needs up to 4 + 2N parameters on 32-bit archs,
+ * and up to 4 + N parameters on 64-bit archs
+ * (N = number of input arguments + output arguments).  */
+#define MAX_OPC_PARAM (4 + (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_ARGS))
+
  #define CPU_TEMP_BUF_NLONGS 128
  
  /* Default target word size to pointer size.  */
@@ -56,6 +77,15 @@ typedef uint64_t tcg_target_ulong;
  #error unsupported
  #endif
  
+/* Oversized TCG guests make things like MTTCG hard
+ * as we can't use atomics for cputlb updates.
+ */
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+#define TCG_OVERSIZED_GUEST 1
+#else
+#define TCG_OVERSIZED_GUEST 0
+#endif
+
  #if TCG_TARGET_NB_REGS <= 32
  typedef uint32_t TCGRegSet;
  #elif TCG_TARGET_NB_REGS <= 64
@@ -88,7 +118,12 @@ typedef uint64_t TCGRegSet;
  #define TCG_TARGET_HAS_eqv_i64          0
  #define TCG_TARGET_HAS_nand_i64         0
  #define TCG_TARGET_HAS_nor_i64          0
+#define TCG_TARGET_HAS_clz_i64          0
+#define TCG_TARGET_HAS_ctz_i64          0
+#define TCG_TARGET_HAS_ctpop_i64        0
  #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_extract_i64      0
+#define TCG_TARGET_HAS_sextract_i64     0
  #define TCG_TARGET_HAS_movcond_i64      0
  #define TCG_TARGET_HAS_add2_i64         0
  #define TCG_TARGET_HAS_sub2_i64         0
@@ -107,6 +142,12 @@ typedef uint64_t TCGRegSet;
  #ifndef TCG_TARGET_deposit_i64_valid
  #define TCG_TARGET_deposit_i64_valid(ofs, len) 1
  #endif
+#ifndef TCG_TARGET_extract_i32_valid
+#define TCG_TARGET_extract_i32_valid(ofs, len) 1
+#endif
+#ifndef TCG_TARGET_extract_i64_valid
+#define TCG_TARGET_extract_i64_valid(ofs, len) 1
+#endif
  
  /* Only one of DIV or DIV2 should be defined.  */
  #if defined(TCG_TARGET_HAS_div_i32)
@@ -129,6 +170,31 @@ typedef uint64_t TCGRegSet;
  # error "Missing unsigned widening multiply"
  #endif
  
+#if !defined(TCG_TARGET_HAS_v64) \
+    && !defined(TCG_TARGET_HAS_v128) \
+    && !defined(TCG_TARGET_HAS_v256)
+#define TCG_TARGET_MAYBE_vec            0
+#define TCG_TARGET_HAS_neg_vec          0
+#define TCG_TARGET_HAS_not_vec          0
+#define TCG_TARGET_HAS_andc_vec         0
+#define TCG_TARGET_HAS_orc_vec          0
+#define TCG_TARGET_HAS_shi_vec          0
+#define TCG_TARGET_HAS_shs_vec          0
+#define TCG_TARGET_HAS_shv_vec          0
+#define TCG_TARGET_HAS_mul_vec          0
+#else
+#define TCG_TARGET_MAYBE_vec            1
+#endif
+#ifndef TCG_TARGET_HAS_v64
+#define TCG_TARGET_HAS_v64              0
+#endif
+#ifndef TCG_TARGET_HAS_v128
+#define TCG_TARGET_HAS_v128             0
+#endif
+#ifndef TCG_TARGET_HAS_v256
+#define TCG_TARGET_HAS_v256             0
+#endif
+
  #ifndef TARGET_INSN_START_EXTRA_WORDS
  # define TARGET_INSN_START_WORDS 1
  #else
@@ -142,16 +208,9 @@ typedef enum TCGOpcode {
      NB_OPS,
  } TCGOpcode;
  
-#define tcg_regset_clear(d) (d) = 0
-#define tcg_regset_set(d, s) (d) = (s)
-#define tcg_regset_set32(d, reg, val32) (d) |= (val32) << (reg)
-#define tcg_regset_set_reg(d, r) (d) |= 1L << (r)
-#define tcg_regset_reset_reg(d, r) (d) &= ~(1L << (r))
-#define tcg_regset_test_reg(d, r) (((d) >> (r)) & 1)
-#define tcg_regset_or(d, a, b) (d) = (a) | (b)
-#define tcg_regset_and(d, a, b) (d) = (a) & (b)
-#define tcg_regset_andnot(d, a, b) (d) = (a) & ~(b)
-#define tcg_regset_not(d, a) (d) = ~(a)
+#define tcg_regset_set_reg(d, r)   ((d) |= (TCGRegSet)1 << (r))
+#define tcg_regset_reset_reg(d, r) ((d) &= ~((TCGRegSet)1 << (r)))
+#define tcg_regset_test_reg(d, r)  (((d) >> (r)) & 1)
  
  #ifndef TCG_TARGET_INSN_UNIT_SIZE
  # error "Missing TCG_TARGET_INSN_UNIT_SIZE"
@@ -168,6 +227,15 @@ typedef uint64_t tcg_insn_unit;
  #endif
  
  
+#if defined CONFIG_DEBUG_TCG || defined QEMU_STATIC_ANALYSIS
+# define tcg_debug_assert(X) do { assert(X); } while (0)
+#elif QEMU_GNUC_PREREQ(4, 5)
+# define tcg_debug_assert(X) \
+    do { if (!(X)) { __builtin_unreachable(); } } while (0)
+#else
+# define tcg_debug_assert(X) do { (void)(X); } while (0)
+#endif
+
  typedef struct TCGRelocation {
      struct TCGRelocation *next;
      int type;
@@ -203,6 +271,11 @@ typedef struct TCGPool {
  typedef enum TCGType {
      TCG_TYPE_I32,
      TCG_TYPE_I64,
+
+    TCG_TYPE_V64,
+    TCG_TYPE_V128,
+    TCG_TYPE_V256,
+
      TCG_TYPE_COUNT, /* number of different types */
  
      /* An alias for the size of the host register.  */
@@ -252,10 +325,25 @@ typedef enum TCGMemOp {
  #endif
  
      /* MO_UNALN accesses are never checked for alignment.
-       MO_ALIGN accesses will result in a call to the CPU's
-       do_unaligned_access hook if the guest address is not aligned.
-       The default depends on whether the target CPU defines ALIGNED_ONLY.  */
-    MO_AMASK = 16,
+     * MO_ALIGN accesses will result in a call to the CPU's
+     * do_unaligned_access hook if the guest address is not aligned.
+     * The default depends on whether the target CPU defines ALIGNED_ONLY.
+     *
+     * Some architectures (e.g. ARMv8) need the address which is aligned
+     * to a size more than the size of the memory access.
+     * Some architectures (e.g. SPARCv9) need an address which is aligned,
+     * but less strictly than the natural alignment.
+     *
+     * MO_ALIGN supposes the alignment size is the size of a memory access.
+     *
+     * There are three options:
+     * - unaligned access permitted (MO_UNALN).
+     * - an alignment to the size of an access (MO_ALIGN);
+     * - an alignment to a specified size, which may be more or less than
+     *   the access size (MO_ALIGN_x where 'x' is a size in bytes);
+     */
+    MO_ASHIFT = 4,
+    MO_AMASK = 7 << MO_ASHIFT,
  #ifdef ALIGNED_ONLY
      MO_ALIGN = 0,
      MO_UNALN = MO_AMASK,
@@ -263,6 +351,12 @@ typedef enum TCGMemOp {
      MO_ALIGN = MO_AMASK,
      MO_UNALN = 0,
  #endif
+    MO_ALIGN_2  = 1 << MO_ASHIFT,
+    MO_ALIGN_4  = 2 << MO_ASHIFT,
+    MO_ALIGN_8  = 3 << MO_ASHIFT,
+    MO_ALIGN_16 = 4 << MO_ASHIFT,
+    MO_ALIGN_32 = 5 << MO_ASHIFT,
+    MO_ALIGN_64 = 6 << MO_ASHIFT,
  
      /* Combinations of the above, for ease of use.  */
      MO_UB    = MO_8,
@@ -294,69 +388,78 @@ typedef enum TCGMemOp {
      MO_SSIZE = MO_SIZE | MO_SIGN,
  } TCGMemOp;
  
+/**
+ * get_alignment_bits
+ * @memop: TCGMemOp value
+ *
+ * Extract the alignment size from the memop.
+ */
+static inline unsigned get_alignment_bits(TCGMemOp memop)
+{
+    unsigned a = memop & MO_AMASK;
+
+    if (a == MO_UNALN) {
+        /* No alignment required.  */
+        a = 0;
+    } else if (a == MO_ALIGN) {
+        /* A natural alignment requirement.  */
+        a = memop & MO_SIZE;
+    } else {
+        /* A specific alignment requirement.  */
+        a = a >> MO_ASHIFT;
+    }
+#if defined(CONFIG_SOFTMMU)
+    /* The requested alignment cannot overlap the TLB flags.  */
+    tcg_debug_assert((TLB_FLAGS_MASK & ((1 << a) - 1)) == 0);
+#endif
+    return a;
+}
+
  typedef tcg_target_ulong TCGArg;
  
-/* Define a type and accessor macros for variables.  Using pointer types
-   is nice because it gives some level of type safely.  Converting to and
-   from intptr_t rather than int reduces the number of sign-extension
-   instructions that get implied on 64-bit hosts.  Users of tcg_gen_* don't
-   need to know about any of this, and should treat TCGv as an opaque type.
-   In addition we do typechecking for different types of variables.  TCGv_i32
-   and TCGv_i64 are 32/64-bit variables respectively.  TCGv and TCGv_ptr
-   are aliases for target_ulong and host pointer sized values respectively.  */
+/* Define type and accessor macros for TCG variables.
+
+   TCG variables are the inputs and outputs of TCG ops, as described
+   in tcg/README. Target CPU front-end code uses these types to deal
+   with TCG variables as it emits TCG code via the tcg_gen_* functions.
+   They come in several flavours:
+    * TCGv_i32 : 32 bit integer type
+    * TCGv_i64 : 64 bit integer type
+    * TCGv_ptr : a host pointer type
+    * TCGv_vec : a host vector type; the exact size is not exposed
+                 to the CPU front-end code.
+    * TCGv : an integer type the same size as target_ulong
+             (an alias for either TCGv_i32 or TCGv_i64)
+   The compiler's type checking will complain if you mix them
+   up and pass the wrong sized TCGv to a function.
+
+   Users of tcg_gen_* don't need to know about any of the internal
+   details of these, and should treat them as opaque types.
+   You won't be able to look inside them in a debugger either.
+
+   Internal implementation details follow:
+
+   Note that there is no definition of the structs TCGv_i32_d etc anywhere.
+   This is deliberate, because the values we store in variables of type
+   TCGv_i32 are not really pointers-to-structures. They're just small
+   integers, but keeping them in pointer types like this means that the
+   compiler will complain if you accidentally pass a TCGv_i32 to a
+   function which takes a TCGv_i64, and so on. Only the internals of
+   TCG need to care about the actual contents of the types.  */
  
  typedef struct TCGv_i32_d *TCGv_i32;
  typedef struct TCGv_i64_d *TCGv_i64;
  typedef struct TCGv_ptr_d *TCGv_ptr;
-
-static inline TCGv_i32 QEMU_ARTIFICIAL MAKE_TCGV_I32(intptr_t i)
-{
-    return (TCGv_i32)i;
-}
-
-static inline TCGv_i64 QEMU_ARTIFICIAL MAKE_TCGV_I64(intptr_t i)
-{
-    return (TCGv_i64)i;
-}
-
-static inline TCGv_ptr QEMU_ARTIFICIAL MAKE_TCGV_PTR(intptr_t i)
-{
-    return (TCGv_ptr)i;
-}
-
-static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
-{
-    return (intptr_t)t;
-}
-
-static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I64(TCGv_i64 t)
-{
-    return (intptr_t)t;
-}
-
-static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
-{
-    return (intptr_t)t;
-}
-
-#if TCG_TARGET_REG_BITS == 32
-#define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
-#define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
+typedef struct TCGv_vec_d *TCGv_vec;
+typedef TCGv_ptr TCGv_env;
+#if TARGET_LONG_BITS == 32
+#define TCGv TCGv_i32
+#elif TARGET_LONG_BITS == 64
+#define TCGv TCGv_i64
+#else
+#error Unhandled TARGET_LONG_BITS value
  #endif
  
-#define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
-#define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
-#define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
-
-/* Dummy definition to avoid compiler warnings.  */
-#define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
-#define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
-#define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
-
-#define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
-#define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
-#define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
-
  /* call flags */
  /* Helper does not read globals (either directly or through an exception). It
     implies TCG_CALL_NO_WRITE_GLOBALS. */
@@ -373,9 +476,8 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
  #define TCG_CALL_NO_RWG_SE      (TCG_CALL_NO_RWG | TCG_CALL_NO_SE)
  #define TCG_CALL_NO_WG_SE       (TCG_CALL_NO_WG | TCG_CALL_NO_SE)
  
-/* used to align parameters */
-#define TCG_CALL_DUMMY_TCGV     MAKE_TCGV_I32(-1)
-#define TCG_CALL_DUMMY_ARG      ((TCGArg)(-1))
+/* Used to align parameters.  See the comment before tcgv_i32_temp.  */
+#define TCG_CALL_DUMMY_ARG      ((TCGArg)0)
  
  /* Conditions.  Note that these are laid out for easy manipulation by
     the functions below:
@@ -419,6 +521,12 @@ static inline TCGCond tcg_unsigned_cond(TCGCond c)
      return c & 2 ? (TCGCond)(c ^ 6) : c;
  }
  
+/* Create a "signed" version of an "unsigned" comparison.  */
+static inline TCGCond tcg_signed_cond(TCGCond c)
+{
+    return c & 4 ? (TCGCond)(c ^ 6) : c;
+}
+
  /* Must a comparison be considered unsigned?  */
  static inline bool is_unsigned_cond(TCGCond c)
  {
@@ -453,17 +561,29 @@ typedef struct TCGTemp {
      TCGType base_type:8;
      TCGType type:8;
      unsigned int fixed_reg:1;
+    unsigned int indirect_reg:1;
+    unsigned int indirect_base:1;
      unsigned int mem_coherent:1;
      unsigned int mem_allocated:1;
-    unsigned int temp_local:1; /* If true, the temp is saved across
-                                  basic blocks. Otherwise, it is not
-                                  preserved across basic blocks. */
-    unsigned int temp_allocated:1; /* never used for code gen */
+    /* If true, the temp is saved across both basic blocks and
+       translation blocks.  */
+    unsigned int temp_global:1;
+    /* If true, the temp is saved across basic blocks but dead
+       at the end of translation blocks.  If false, the temp is
+       dead at the end of basic blocks.  */
+    unsigned int temp_local:1;
+    unsigned int temp_allocated:1;
  
      tcg_target_long val;
      struct TCGTemp *mem_base;
      intptr_t mem_offset;
      const char *name;
+
+    /* Pass-specific information that can be stored for a temporary.
+       One word worth of integer data, and one pointer to data
+       allocated separately.  */
+    uintptr_t state;
+    void *state_ptr;
  } TCGTemp;
  
  typedef struct TCGContext TCGContext;
@@ -472,57 +592,43 @@ typedef struct TCGTempSet {
      unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)];
  } TCGTempSet;
  
-typedef struct TCGOp {
-    TCGOpcode opc   : 8;
+/* While we limit helpers to 6 arguments, for 32-bit hosts, with padding,
+   this imples a max of 6*2 (64-bit in) + 2 (64-bit out) = 14 operands.
+   There are never more than 2 outputs, which means that we can store all
+   dead + sync data within 16 bits.  */
+#define DEAD_ARG  4
+#define SYNC_ARG  1
+typedef uint16_t TCGLifeData;
  
-    /* The number of out and in parameter for a call.  */
-    unsigned callo  : 2;
-    unsigned calli  : 6;
+/* The layout here is designed to avoid a bitfield crossing of
+   a 32-bit boundary, which would cause GCC to add extra padding.  */
+typedef struct TCGOp {
+    TCGOpcode opc   : 8;        /*  8 */
  
-    /* Index of the arguments for this op, or -1 for zero-operand ops.  */
-    signed args     : 16;
+    /* Parameters for this opcode.  See below.  */
+    unsigned param1 : 4;        /* 12 */
+    unsigned param2 : 4;        /* 16 */
  
-    /* Index of the prex/next op, or -1 for the end of the list.  */
-    signed prev     : 16;
-    signed next     : 16;
-} TCGOp;
+    /* Lifetime data of the operands.  */
+    unsigned life   : 16;       /* 32 */
  
-QEMU_BUILD_BUG_ON(NB_OPS > 0xff);
-QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff);
-QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff);
+    /* Next and previous opcodes.  */
+    QTAILQ_ENTRY(TCGOp) link;
  
-struct TCGContext {
-    uint8_t *pool_cur, *pool_end;
-    TCGPool *pool_first, *pool_current, *pool_first_large;
-    int nb_labels;
-    int nb_globals;
-    int nb_temps;
+    /* Arguments for the opcode.  */
+    TCGArg args[MAX_OPC_PARAM];
+} TCGOp;
  
-    /* goto_tb support */
-    tcg_insn_unit *code_buf;
-    uintptr_t *tb_next;
-    uint16_t *tb_next_offset;
-    uint16_t *tb_jmp_offset; /* != NULL if USE_DIRECT_JUMP */
-
-    /* liveness analysis */
-    uint16_t *op_dead_args; /* for each operation, each bit tells if the
-                               corresponding argument is dead */
-    uint8_t *op_sync_args;  /* for each operation, each bit tells if the
-                               corresponding output argument needs to be
-                               sync to memory. */
-    
-    TCGRegSet reserved_regs;
-    intptr_t current_frame_offset;
-    intptr_t frame_start;
-    intptr_t frame_end;
-    TCGTemp *frame_temp;
+#define TCGOP_CALLI(X)    (X)->param1
+#define TCGOP_CALLO(X)    (X)->param2
  
-    tcg_insn_unit *code_ptr;
+#define TCGOP_VECL(X)     (X)->param1
+#define TCGOP_VECE(X)     (X)->param2
  
-    GHashTable *helpers;
+/* Make sure operands fit in the bitfields above.  */
+QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
  
-#ifdef CONFIG_PROFILER
-    /* profiling info */
+typedef struct TCGProfile {
      int64_t tb_count1;
      int64_t tb_count;
      int64_t op_count; /* total insn count */
@@ -539,6 +645,34 @@ struct TCGContext {
      int64_t opt_time;
      int64_t restore_count;
      int64_t restore_time;
+    int64_t table_op_count[NB_OPS];
+} TCGProfile;
+
+struct TCGContext {
+    uint8_t *pool_cur, *pool_end;
+    TCGPool *pool_first, *pool_current, *pool_first_large;
+    int nb_labels;
+    int nb_globals;
+    int nb_temps;
+    int nb_indirects;
+
+    /* goto_tb support */
+    tcg_insn_unit *code_buf;
+    uint16_t *tb_jmp_reset_offset; /* tb->jmp_reset_offset */
+    uintptr_t *tb_jmp_insn_offset; /* tb->jmp_target_arg if direct_jump */
+    uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_arg if !direct_jump */
+
+    TCGRegSet reserved_regs;
+    uint32_t tb_cflags; /* cflags of the current TB */
+    intptr_t current_frame_offset;
+    intptr_t frame_start;
+    intptr_t frame_end;
+    TCGTemp *frame_temp;
+
+    tcg_insn_unit *code_ptr;
+
+#ifdef CONFIG_PROFILER
+    TCGProfile prof;
  #endif
  
  #ifdef CONFIG_DEBUG_TCG
@@ -546,76 +680,199 @@ struct TCGContext {
      int goto_tb_issue_mask;
  #endif
  
-    int gen_first_op_idx;
-    int gen_last_op_idx;
-    int gen_next_op_idx;
-    int gen_next_parm_idx;
-
      /* Code generation.  Note that we specifically do not use tcg_insn_unit
         here, because there's too much arithmetic throughout that relies
         on addition and subtraction working on bytes.  Rely on the GCC
         extension that allows arithmetic on void*.  */
-    int code_gen_max_blocks;
      void *code_gen_prologue;
+    void *code_gen_epilogue;
      void *code_gen_buffer;
      size_t code_gen_buffer_size;
      void *code_gen_ptr;
+    void *data_gen_ptr;
  
      /* Threshold to flush the translated code buffer.  */
      void *code_gen_highwater;
  
-    TBContext tb_ctx;
+    /* Track which vCPU triggers events */
+    CPUState *cpu;                      /* *_trans */
  
-    /* The TCGBackendData structure is private to tcg-target.c.  */
-    struct TCGBackendData *be;
+    /* These structures are private to tcg-target.inc.c.  */
+#ifdef TCG_TARGET_NEED_LDST_LABELS
+    struct TCGLabelQemuLdst *ldst_labels;
+#endif
+#ifdef TCG_TARGET_NEED_POOL_LABELS
+    struct TCGLabelPoolData *pool_labels;
+#endif
+
+    TCGLabel *exitreq_label;
  
      TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
      TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
  
+    QTAILQ_HEAD(TCGOpHead, TCGOp) ops, free_ops;
+
      /* Tells which temporary holds a given register.
         It does not take into account fixed registers */
      TCGTemp *reg_to_temp[TCG_TARGET_NB_REGS];
  
-    TCGOp gen_op_buf[OPC_BUF_SIZE];
-    TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE];
-
      uint16_t gen_insn_end_off[TCG_MAX_INSNS];
      target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
  };
  
-extern TCGContext tcg_ctx;
+extern TCGContext tcg_init_ctx;
+extern __thread TCGContext *tcg_ctx;
+extern TCGv_env cpu_env;
+
+static inline size_t temp_idx(TCGTemp *ts)
+{
+    ptrdiff_t n = ts - tcg_ctx->temps;
+    tcg_debug_assert(n >= 0 && n < tcg_ctx->nb_temps);
+    return n;
+}
+
+static inline TCGArg temp_arg(TCGTemp *ts)
+{
+    return (uintptr_t)ts;
+}
+
+static inline TCGTemp *arg_temp(TCGArg a)
+{
+    return (TCGTemp *)(uintptr_t)a;
+}
+
+/* Using the offset of a temporary, relative to TCGContext, rather than
+   its index means that we don't use 0.  That leaves offset 0 free for
+   a NULL representation without having to leave index 0 unused.  */
+static inline TCGTemp *tcgv_i32_temp(TCGv_i32 v)
+{
+    uintptr_t o = (uintptr_t)v;
+    TCGTemp *t = (void *)tcg_ctx + o;
+    tcg_debug_assert(offsetof(TCGContext, temps[temp_idx(t)]) == o);
+    return t;
+}
+
+static inline TCGTemp *tcgv_i64_temp(TCGv_i64 v)
+{
+    return tcgv_i32_temp((TCGv_i32)v);
+}
+
+static inline TCGTemp *tcgv_ptr_temp(TCGv_ptr v)
+{
+    return tcgv_i32_temp((TCGv_i32)v);
+}
+
+static inline TCGTemp *tcgv_vec_temp(TCGv_vec v)
+{
+    return tcgv_i32_temp((TCGv_i32)v);
+}
+
+static inline TCGArg tcgv_i32_arg(TCGv_i32 v)
+{
+    return temp_arg(tcgv_i32_temp(v));
+}
+
+static inline TCGArg tcgv_i64_arg(TCGv_i64 v)
+{
+    return temp_arg(tcgv_i64_temp(v));
+}
+
+static inline TCGArg tcgv_ptr_arg(TCGv_ptr v)
+{
+    return temp_arg(tcgv_ptr_temp(v));
+}
+
+static inline TCGArg tcgv_vec_arg(TCGv_vec v)
+{
+    return temp_arg(tcgv_vec_temp(v));
+}
+
+static inline TCGv_i32 temp_tcgv_i32(TCGTemp *t)
+{
+    (void)temp_idx(t); /* trigger embedded assert */
+    return (TCGv_i32)((void *)t - (void *)tcg_ctx);
+}
+
+static inline TCGv_i64 temp_tcgv_i64(TCGTemp *t)
+{
+    return (TCGv_i64)temp_tcgv_i32(t);
+}
+
+static inline TCGv_ptr temp_tcgv_ptr(TCGTemp *t)
+{
+    return (TCGv_ptr)temp_tcgv_i32(t);
+}
+
+static inline TCGv_vec temp_tcgv_vec(TCGTemp *t)
+{
+    return (TCGv_vec)temp_tcgv_i32(t);
+}
+
+#if TCG_TARGET_REG_BITS == 32
+static inline TCGv_i32 TCGV_LOW(TCGv_i64 t)
+{
+    return temp_tcgv_i32(tcgv_i64_temp(t));
+}
+
+static inline TCGv_i32 TCGV_HIGH(TCGv_i64 t)
+{
+    return temp_tcgv_i32(tcgv_i64_temp(t) + 1);
+}
+#endif
+
+static inline void tcg_set_insn_param(TCGOp *op, int arg, TCGArg v)
+{
+    op->args[arg] = v;
+}
+
+static inline void tcg_set_insn_start_param(TCGOp *op, int arg, target_ulong v)
+{
+#if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+    tcg_set_insn_param(op, arg, v);
+#else
+    tcg_set_insn_param(op, arg * 2, v);
+    tcg_set_insn_param(op, arg * 2 + 1, v >> 32);
+#endif
+}
  
-/* The number of opcodes emitted so far.  */
-static inline int tcg_op_buf_count(void)
+/* The last op that was emitted.  */
+static inline TCGOp *tcg_last_op(void)
  {
-    return tcg_ctx.gen_next_op_idx;
+    return QTAILQ_LAST(&tcg_ctx->ops, TCGOpHead);
  }
  
  /* Test for whether to terminate the TB for using too many opcodes.  */
  static inline bool tcg_op_buf_full(void)
  {
-    return tcg_op_buf_count() >= OPC_MAX_SIZE;
+    return false;
  }
  
  /* pool based memory allocation */
  
+/* user-mode: tb_lock must be held for tcg_malloc_internal. */
  void *tcg_malloc_internal(TCGContext *s, int size);
  void tcg_pool_reset(TCGContext *s);
-void tcg_pool_delete(TCGContext *s);
+TranslationBlock *tcg_tb_alloc(TCGContext *s);
+
+void tcg_region_init(void);
+void tcg_region_reset_all(void);
  
-void tb_lock(void);
-void tb_unlock(void);
-void tb_lock_reset(void);
+size_t tcg_code_size(void);
+size_t tcg_code_capacity(void);
  
+/* user-mode: Called with tb_lock held.  */
  static inline void *tcg_malloc(int size)
  {
-    TCGContext *s = &tcg_ctx;
+    TCGContext *s = tcg_ctx;
      uint8_t *ptr, *ptr_end;
-    size = (size + sizeof(long) - 1) & ~(sizeof(long) - 1);
+
+    /* ??? This is a weak placeholder for minimum malloc alignment.  */
+    size = QEMU_ALIGN_UP(size, 8);
+
      ptr = s->pool_cur;
      ptr_end = ptr + size;
      if (unlikely(ptr_end > s->pool_end)) {
-        return tcg_malloc_internal(&tcg_ctx, size);
+        return tcg_malloc_internal(tcg_ctx, size);
      } else {
          s->pool_cur = ptr_end;
          return ptr;
@@ -623,29 +880,31 @@ static inline void *tcg_malloc(int size)
  }
  
  void tcg_context_init(TCGContext *s);
+void tcg_register_thread(void);
  void tcg_prologue_init(TCGContext *s);
  void tcg_func_start(TCGContext *s);
  
-int tcg_gen_code(TCGContext *s, tcg_insn_unit *gen_code_buf);
+int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
  
  void tcg_set_frame(TCGContext *s, TCGReg reg, intptr_t start, intptr_t size);
  
-int tcg_global_mem_new_internal(TCGType, TCGv_ptr, intptr_t, const char *);
-
-TCGv_i32 tcg_global_reg_new_i32(TCGReg reg, const char *name);
-TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char *name);
+TCGTemp *tcg_global_mem_new_internal(TCGType, TCGv_ptr,
+                                     intptr_t, const char *);
  
  TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
  TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_vec tcg_temp_new_vec(TCGType type);
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
  
  void tcg_temp_free_i32(TCGv_i32 arg);
  void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_vec(TCGv_vec arg);
  
  static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
                                                const char *name)
  {
-    int idx = tcg_global_mem_new_internal(TCG_TYPE_I32, reg, offset, name);
-    return MAKE_TCGV_I32(idx);
+    TCGTemp *t = tcg_global_mem_new_internal(TCG_TYPE_I32, reg, offset, name);
+    return temp_tcgv_i32(t);
  }
  
  static inline TCGv_i32 tcg_temp_new_i32(void)
@@ -661,8 +920,8 @@ static inline TCGv_i32 tcg_temp_local_new_i32(void)
  static inline TCGv_i64 tcg_global_mem_new_i64(TCGv_ptr reg, intptr_t offset,
                                                const char *name)
  {
-    int idx = tcg_global_mem_new_internal(TCG_TYPE_I64, reg, offset, name);
-    return MAKE_TCGV_I64(idx);
+    TCGTemp *t = tcg_global_mem_new_internal(TCG_TYPE_I64, reg, offset, name);
+    return temp_tcgv_i64(t);
  }
  
  static inline TCGv_i64 tcg_temp_new_i64(void)
@@ -693,6 +952,7 @@ void tcg_dump_op_count(FILE *f, fprintf_function cpu_fprintf);
  
  #define TCG_CT_ALIAS  0x80
  #define TCG_CT_IALIAS 0x40
+#define TCG_CT_NEWREG 0x20 /* output requires a new register */
  #define TCG_CT_REG    0x01
  #define TCG_CT_CONST  0x02 /* any constant of register size */
  
@@ -720,6 +980,8 @@ enum {
      /* Instruction is optional and not implemented by the host, or insn
         is generic and should not be implemened by the host.  */
      TCG_OPF_NOT_PRESENT  = 0x10,
+    /* Instruction operands are vectors.  */
+    TCG_OPF_VECTOR       = 0x20,
  };
  
  typedef struct TCGOpDef {
@@ -747,55 +1009,48 @@ do {\
      abort();\
  } while (0)
  
-#ifdef CONFIG_DEBUG_TCG
-# define tcg_debug_assert(X) do { assert(X); } while (0)
-#elif QEMU_GNUC_PREREQ(4, 5)
-# define tcg_debug_assert(X) \
-    do { if (!(X)) { __builtin_unreachable(); } } while (0)
-#else
-# define tcg_debug_assert(X) do { (void)(X); } while (0)
-#endif
-
-void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs);
-
  #if UINTPTR_MAX == UINT32_MAX
-#define TCGV_NAT_TO_PTR(n) MAKE_TCGV_PTR(GET_TCGV_I32(n))
-#define TCGV_PTR_TO_NAT(n) MAKE_TCGV_I32(GET_TCGV_PTR(n))
+static inline TCGv_ptr TCGV_NAT_TO_PTR(TCGv_i32 n) { return (TCGv_ptr)n; }
+static inline TCGv_i32 TCGV_PTR_TO_NAT(TCGv_ptr n) { return (TCGv_i32)n; }
  
  #define tcg_const_ptr(V) TCGV_NAT_TO_PTR(tcg_const_i32((intptr_t)(V)))
-#define tcg_global_reg_new_ptr(R, N) \
-    TCGV_NAT_TO_PTR(tcg_global_reg_new_i32((R), (N)))
  #define tcg_global_mem_new_ptr(R, O, N) \
      TCGV_NAT_TO_PTR(tcg_global_mem_new_i32((R), (O), (N)))
  #define tcg_temp_new_ptr() TCGV_NAT_TO_PTR(tcg_temp_new_i32())
  #define tcg_temp_free_ptr(T) tcg_temp_free_i32(TCGV_PTR_TO_NAT(T))
  #else
-#define TCGV_NAT_TO_PTR(n) MAKE_TCGV_PTR(GET_TCGV_I64(n))
-#define TCGV_PTR_TO_NAT(n) MAKE_TCGV_I64(GET_TCGV_PTR(n))
+static inline TCGv_ptr TCGV_NAT_TO_PTR(TCGv_i64 n) { return (TCGv_ptr)n; }
+static inline TCGv_i64 TCGV_PTR_TO_NAT(TCGv_ptr n) { return (TCGv_i64)n; }
  
  #define tcg_const_ptr(V) TCGV_NAT_TO_PTR(tcg_const_i64((intptr_t)(V)))
-#define tcg_global_reg_new_ptr(R, N) \
-    TCGV_NAT_TO_PTR(tcg_global_reg_new_i64((R), (N)))
  #define tcg_global_mem_new_ptr(R, O, N) \
      TCGV_NAT_TO_PTR(tcg_global_mem_new_i64((R), (O), (N)))
  #define tcg_temp_new_ptr() TCGV_NAT_TO_PTR(tcg_temp_new_i64())
  #define tcg_temp_free_ptr(T) tcg_temp_free_i64(TCGV_PTR_TO_NAT(T))
  #endif
  
-void tcg_gen_callN(TCGContext *s, void *func,
-                   TCGArg ret, int nargs, TCGArg *args);
+bool tcg_op_supported(TCGOpcode op);
  
+void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args);
+
+TCGOp *tcg_emit_op(TCGOpcode opc);
  void tcg_op_remove(TCGContext *s, TCGOp *op);
+TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
+TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
+
  void tcg_optimize(TCGContext *s);
  
  /* only used for debugging purposes */
  void tcg_dump_ops(TCGContext *s);
  
-void dump_ops(const uint16_t *opc_buf, const TCGArg *opparam_buf);
  TCGv_i32 tcg_const_i32(int32_t val);
  TCGv_i64 tcg_const_i64(int64_t val);
  TCGv_i32 tcg_const_local_i32(int32_t val);
  TCGv_i64 tcg_const_local_i64(int64_t val);
+TCGv_vec tcg_const_zeros_vec(TCGType);
+TCGv_vec tcg_const_ones_vec(TCGType);
+TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec);
+TCGv_vec tcg_const_ones_vec_matching(TCGv_vec);
  
  TCGLabel *gen_new_label(void);
  
@@ -909,7 +1164,7 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi)
  
  /**
   * tcg_qemu_tb_exec:
- * @env: CPUArchState * for the CPU
+ * @env: pointer to CPUArchState for the CPU
   * @tb_ptr: address of generated code for the TB to execute
   *
   * Start executing code from a given translation block.
@@ -920,30 +1175,31 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi)
   * which has not yet been directly linked, or an asynchronous
   * event such as an interrupt needs handling.
   *
- * The return value is a pointer to the next TB to execute
- * (if known; otherwise zero). This pointer is assumed to be
- * 4-aligned, and the bottom two bits are used to return further
- * information:
+ * Return: The return value is the value passed to the corresponding
+ * tcg_gen_exit_tb() at translation time of the last TB attempted to execute.
+ * The value is either zero or a 4-byte aligned pointer to that TB combined
+ * with additional information in its two least significant bits. The
+ * additional information is encoded as follows:
   *  0, 1: the link between this TB and the next is via the specified
   *        TB index (0 or 1). That is, we left the TB via (the equivalent
   *        of) "goto_tb <index>". The main loop uses this to determine
   *        how to link the TB just executed to the next.
   *  2:    we are using instruction counting code generation, and we
   *        did not start executing this TB because the instruction counter
- *        would hit zero midway through it. In this case the next-TB pointer
+ *        would hit zero midway through it. In this case the pointer
   *        returned is the TB we were about to execute, and the caller must
   *        arrange to execute the remaining count of instructions.
   *  3:    we stopped because the CPU's exit_request flag was set
   *        (usually meaning that there is an interrupt that needs to be
- *        handled). The next-TB pointer returned is the TB we were
- *        about to execute when we noticed the pending exit request.
+ *        handled). The pointer returned is the TB we were about to execute
+ *        when we noticed the pending exit request.
   *
   * If the bottom two bits indicate an exit-via-index then the CPU
   * state is correctly synchronised and ready for execution of the next
   * TB (and in particular the guest PC is the address to execute next).
   * Otherwise, we gave up on execution of this TB before it started, and
   * the caller must fix up the CPU state by calling the CPU's
- * synchronize_from_tb() method with the next-TB pointer we return (falling
+ * synchronize_from_tb() method with the TB pointer we return (falling
   * back to calling the CPU's set_pc method with tb->pb if no
   * synchronize_from_tb() method exists).
   *
@@ -954,18 +1210,44 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi)
  #define TB_EXIT_MASK 3
  #define TB_EXIT_IDX0 0
  #define TB_EXIT_IDX1 1
-#define TB_EXIT_ICOUNT_EXPIRED 2
  #define TB_EXIT_REQUESTED 3
  
  #ifdef HAVE_TCG_QEMU_TB_EXEC
  uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
  #else
  # define tcg_qemu_tb_exec(env, tb_ptr) \
-    ((uintptr_t (*)(void *, void *))tcg_ctx.code_gen_prologue)(env, tb_ptr)
+    ((uintptr_t (*)(void *, void *))tcg_ctx->code_gen_prologue)(env, tb_ptr)
  #endif
  
  void tcg_register_jit(void *buf, size_t buf_size);
  
+#if TCG_TARGET_MAYBE_vec
+/* Return zero if the tuple (opc, type, vece) is unsupportable;
+   return > 0 if it is directly supportable;
+   return < 0 if we must call tcg_expand_vec_op.  */
+int tcg_can_emit_vec_op(TCGOpcode, TCGType, unsigned);
+#else
+static inline int tcg_can_emit_vec_op(TCGOpcode o, TCGType t, unsigned ve)
+{
+    return 0;
+}
+#endif
+
+/* Expand the tuple (opc, type, vece) on the given arguments.  */
+void tcg_expand_vec_op(TCGOpcode, TCGType, unsigned, TCGArg, ...);
+
+/* Replicate a constant C accoring to the log2 of the element size.  */
+uint64_t dup_const(unsigned vece, uint64_t c);
+
+#define dup_const(VECE, C)                                         \
+    (__builtin_constant_p(VECE)                                    \
+     ? (  (VECE) == MO_8  ? 0x0101010101010101ull * (uint8_t)(C)   \
+        : (VECE) == MO_16 ? 0x0001000100010001ull * (uint16_t)(C)  \
+        : (VECE) == MO_32 ? 0x0000000100000001ull * (uint32_t)(C)  \
+        : dup_const(VECE, C))                                      \
+     : dup_const(VECE, C))
+
+
  /*
   * Memory helpers that will be used by TCG generated code.
   */
@@ -1057,6 +1339,90 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
  # define helper_ret_ldq_cmmu  helper_le_ldq_cmmu
  #endif
  
+uint32_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
+                                    uint32_t cmpv, uint32_t newv,
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgw_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgl_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint64_t helper_atomic_cmpxchgq_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint64_t cmpv, uint64_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgw_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgl_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint64_t helper_atomic_cmpxchgq_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint64_t cmpv, uint64_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+
+#define GEN_ATOMIC_HELPER(NAME, TYPE, SUFFIX)         \
+TYPE helper_atomic_ ## NAME ## SUFFIX ## _mmu         \
+    (CPUArchState *env, target_ulong addr, TYPE val,  \
+     TCGMemOpIdx oi, uintptr_t retaddr);
+
+#ifdef CONFIG_ATOMIC64
+#define GEN_ATOMIC_HELPER_ALL(NAME)          \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, b)     \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, w_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, w_be)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_be)  \
+    GEN_ATOMIC_HELPER(NAME, uint64_t, q_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint64_t, q_be)
+#else
+#define GEN_ATOMIC_HELPER_ALL(NAME)          \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, b)     \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, w_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, w_be)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_be)
+#endif
+
+GEN_ATOMIC_HELPER_ALL(fetch_add)
+GEN_ATOMIC_HELPER_ALL(fetch_sub)
+GEN_ATOMIC_HELPER_ALL(fetch_and)
+GEN_ATOMIC_HELPER_ALL(fetch_or)
+GEN_ATOMIC_HELPER_ALL(fetch_xor)
+
+GEN_ATOMIC_HELPER_ALL(add_fetch)
+GEN_ATOMIC_HELPER_ALL(sub_fetch)
+GEN_ATOMIC_HELPER_ALL(and_fetch)
+GEN_ATOMIC_HELPER_ALL(or_fetch)
+GEN_ATOMIC_HELPER_ALL(xor_fetch)
+
+GEN_ATOMIC_HELPER_ALL(xchg)
+
+#undef GEN_ATOMIC_HELPER_ALL
+#undef GEN_ATOMIC_HELPER
  #endif /* CONFIG_SOFTMMU */
  
+#ifdef CONFIG_ATOMIC128
+#include "qemu/int128.h"
+
+/* These aren't really a "proper" helpers because TCG cannot manage Int128.
+   However, use the same format as the others, for use by the backends. */
+Int128 helper_atomic_cmpxchgo_le_mmu(CPUArchState *env, target_ulong addr,
+                                     Int128 cmpv, Int128 newv,
+                                     TCGMemOpIdx oi, uintptr_t retaddr);
+Int128 helper_atomic_cmpxchgo_be_mmu(CPUArchState *env, target_ulong addr,
+                                     Int128 cmpv, Int128 newv,
+                                     TCGMemOpIdx oi, uintptr_t retaddr);
+
+Int128 helper_atomic_ldo_le_mmu(CPUArchState *env, target_ulong addr,
+                                TCGMemOpIdx oi, uintptr_t retaddr);
+Int128 helper_atomic_ldo_be_mmu(CPUArchState *env, target_ulong addr,
+                                TCGMemOpIdx oi, uintptr_t retaddr);
+void helper_atomic_sto_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
+                              TCGMemOpIdx oi, uintptr_t retaddr);
+void helper_atomic_sto_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
+                              TCGMemOpIdx oi, uintptr_t retaddr);
+
+#endif /* CONFIG_ATOMIC128 */
+
  #endif /* TCG_H */