tcg: Add CONFIG_ATOMIC64

[qemu.git] / tcg / tcg.h
diff --git a/tcg/tcg.h b/tcg/tcg.h

index 8fd12528cb8f1017df4ca59a52fd27741d95dec5..b34b5fbc2fa7321b4946145251ea697a6dc5043b 100644 (file)
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -26,9 +26,32 @@
  #define TCG_H
  
  #include "qemu-common.h"
+#include "cpu.h"
+#include "exec/tb-context.h"
  #include "qemu/bitops.h"
  #include "tcg-target.h"
  
+/* XXX: make safe guess about sizes */
+#define MAX_OP_PER_INSTR 266
+
+#if HOST_LONG_BITS == 32
+#define MAX_OPC_PARAM_PER_ARG 2
+#else
+#define MAX_OPC_PARAM_PER_ARG 1
+#endif
+#define MAX_OPC_PARAM_IARGS 5
+#define MAX_OPC_PARAM_OARGS 1
+#define MAX_OPC_PARAM_ARGS (MAX_OPC_PARAM_IARGS + MAX_OPC_PARAM_OARGS)
+
+/* A Call op needs up to 4 + 2N parameters on 32-bit archs,
+ * and up to 4 + N parameters on 64-bit archs
+ * (N = number of input arguments + output arguments).  */
+#define MAX_OPC_PARAM (4 + (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_ARGS))
+#define OPC_BUF_SIZE 640
+#define OPC_MAX_SIZE (OPC_BUF_SIZE - MAX_OP_PER_INSTR)
+
+#define OPPARAM_BUF_SIZE (OPC_BUF_SIZE * MAX_OPC_PARAM)
+
  #define CPU_TEMP_BUF_NLONGS 128
  
  /* Default target word size to pointer size.  */
@@ -168,6 +191,15 @@ typedef uint64_t tcg_insn_unit;
  #endif
  
  
+#if defined CONFIG_DEBUG_TCG || defined QEMU_STATIC_ANALYSIS
+# define tcg_debug_assert(X) do { assert(X); } while (0)
+#elif QEMU_GNUC_PREREQ(4, 5)
+# define tcg_debug_assert(X) \
+    do { if (!(X)) { __builtin_unreachable(); } } while (0)
+#else
+# define tcg_debug_assert(X) do { (void)(X); } while (0)
+#endif
+
  typedef struct TCGRelocation {
      struct TCGRelocation *next;
      int type;
@@ -252,10 +284,25 @@ typedef enum TCGMemOp {
  #endif
  
      /* MO_UNALN accesses are never checked for alignment.
-       MO_ALIGN accesses will result in a call to the CPU's
-       do_unaligned_access hook if the guest address is not aligned.
-       The default depends on whether the target CPU defines ALIGNED_ONLY.  */
-    MO_AMASK = 16,
+     * MO_ALIGN accesses will result in a call to the CPU's
+     * do_unaligned_access hook if the guest address is not aligned.
+     * The default depends on whether the target CPU defines ALIGNED_ONLY.
+     *
+     * Some architectures (e.g. ARMv8) need an address which is aligned
+     * to a size larger than the size of the memory access.
+     * Some architectures (e.g. SPARCv9) need an address which is aligned,
+     * but less strictly than the natural alignment.
+     *
+     * MO_ALIGN assumes the alignment size is the size of the memory access.
+     *
+     * There are three options:
+     * - unaligned access permitted (MO_UNALN);
+     * - an alignment to the size of an access (MO_ALIGN);
+     * - an alignment to a specified size, which may be more or less than
+     *   the access size (MO_ALIGN_x where 'x' is a size in bytes).
+     */
+    MO_ASHIFT = 4,
+    MO_AMASK = 7 << MO_ASHIFT,
  #ifdef ALIGNED_ONLY
      MO_ALIGN = 0,
      MO_UNALN = MO_AMASK,
@@ -263,6 +310,12 @@ typedef enum TCGMemOp {
      MO_ALIGN = MO_AMASK,
      MO_UNALN = 0,
  #endif
+    MO_ALIGN_2  = 1 << MO_ASHIFT,
+    MO_ALIGN_4  = 2 << MO_ASHIFT,
+    MO_ALIGN_8  = 3 << MO_ASHIFT,
+    MO_ALIGN_16 = 4 << MO_ASHIFT,
+    MO_ALIGN_32 = 5 << MO_ASHIFT,
+    MO_ALIGN_64 = 6 << MO_ASHIFT,
  
      /* Combinations of the above, for ease of use.  */
      MO_UB    = MO_8,
@@ -294,6 +347,33 @@ typedef enum TCGMemOp {
      MO_SSIZE = MO_SIZE | MO_SIGN,
  } TCGMemOp;
  
+/**
+ * get_alignment_bits
+ * @memop: TCGMemOp value
+ *
+ * Extract the required alignment from the memop, as log2 of the byte size.
+ */
+static inline unsigned get_alignment_bits(TCGMemOp memop)
+{
+    unsigned a = memop & MO_AMASK;
+
+    if (a == MO_UNALN) {
+        /* No alignment required.  */
+        a = 0;
+    } else if (a == MO_ALIGN) {
+        /* A natural alignment requirement.  */
+        a = memop & MO_SIZE;
+    } else {
+        /* A specific alignment requirement.  */
+        a = a >> MO_ASHIFT;
+    }
+#if defined(CONFIG_SOFTMMU)
+    /* The requested alignment cannot overlap the TLB flags.  */
+    tcg_debug_assert((TLB_FLAGS_MASK & ((1 << a) - 1)) == 0);
+#endif
+    return a;
+}
+
  typedef tcg_target_ulong TCGArg;
  
  /* Define a type and accessor macros for variables.  Using pointer types
@@ -308,6 +388,14 @@ typedef tcg_target_ulong TCGArg;
  typedef struct TCGv_i32_d *TCGv_i32;
  typedef struct TCGv_i64_d *TCGv_i64;
  typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef TCGv_ptr TCGv_env;
+#if TARGET_LONG_BITS == 32
+#define TCGv TCGv_i32
+#elif TARGET_LONG_BITS == 64
+#define TCGv TCGv_i64
+#else
+#error Unhandled TARGET_LONG_BITS value
+#endif
  
  static inline TCGv_i32 QEMU_ARTIFICIAL MAKE_TCGV_I32(intptr_t i)
  {
@@ -377,6 +465,23 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
  #define TCG_CALL_DUMMY_TCGV     MAKE_TCGV_I32(-1)
  #define TCG_CALL_DUMMY_ARG      ((TCGArg)(-1))
  
+typedef enum {
+    /* Used to indicate the type of accesses on which ordering
+       is to be ensured.  Modeled after SPARC barriers.  */
+    TCG_MO_LD_LD  = 0x01,
+    TCG_MO_ST_LD  = 0x02,
+    TCG_MO_LD_ST  = 0x04,
+    TCG_MO_ST_ST  = 0x08,
+    TCG_MO_ALL    = 0x0F,  /* OR of the above */
+
+    /* Used to indicate the kind of ordering which is to be ensured by the
+       instruction.  These types are derived from x86/aarch64 instructions.
+       It should be noted that these are different from C11 semantics.  */
+    TCG_BAR_LDAQ  = 0x10,  /* Following ops will not come forward */
+    TCG_BAR_STRL  = 0x20,  /* Previous ops will not be delayed */
+    TCG_BAR_SC    = 0x30,  /* No ops cross barrier; OR of the above */
+} TCGBar;
+
  /* Conditions.  Note that these are laid out for easy manipulation by
     the functions below:
       bit 0 is used for inverting;
@@ -448,12 +553,13 @@ typedef enum TCGTempVal {
  } TCGTempVal;
  
  typedef struct TCGTemp {
-    unsigned int reg:8;
-    unsigned int mem_reg:8;
+    TCGReg reg:8;
      TCGTempVal val_type:8;
      TCGType base_type:8;
      TCGType type:8;
      unsigned int fixed_reg:1;
+    unsigned int indirect_reg:1;
+    unsigned int indirect_base:1;
      unsigned int mem_coherent:1;
      unsigned int mem_allocated:1;
      unsigned int temp_local:1; /* If true, the temp is saved across
@@ -462,6 +568,7 @@ typedef struct TCGTemp {
      unsigned int temp_allocated:1; /* never used for code gen */
  
      tcg_target_long val;
+    struct TCGTemp *mem_base;
      intptr_t mem_offset;
      const char *name;
  } TCGTemp;
@@ -472,24 +579,41 @@ typedef struct TCGTempSet {
      unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)];
  } TCGTempSet;
  
+/* While we limit helpers to 6 arguments, for 32-bit hosts, with padding,
+   this implies a max of 6*2 (64-bit in) + 2 (64-bit out) = 14 operands.
+   There are never more than 2 outputs, which means that we can store all
+   dead + sync data within 16 bits.  */
+#define DEAD_ARG  4
+#define SYNC_ARG  1
+typedef uint16_t TCGLifeData;
+
+/* The layout here is designed to avoid crossing of a 32-bit boundary.
+   If we do so, gcc adds padding, expanding the size to 12.  */
  typedef struct TCGOp {
-    TCGOpcode opc   : 8;
+    TCGOpcode opc   : 8;        /*  8 */
+
+    /* Index of the prev/next op, or 0 for the end of the list.  */
+    unsigned prev   : 10;       /* 18 */
+    unsigned next   : 10;       /* 28 */
  
      /* The number of out and in parameter for a call.  */
-    unsigned callo  : 2;
-    unsigned calli  : 6;
+    unsigned calli  : 4;        /* 32 */
+    unsigned callo  : 2;        /* 34 */
  
-    /* Index of the arguments for this op, or -1 for zero-operand ops.  */
-    signed args     : 16;
+    /* Index of the arguments for this op, or 0 for zero-operand ops.  */
+    unsigned args   : 14;       /* 48 */
  
-    /* Index of the prex/next op, or -1 for the end of the list.  */
-    signed prev     : 16;
-    signed next     : 16;
+    /* Lifetime data of the operands.  */
+    unsigned life   : 16;       /* 64 */
  } TCGOp;
  
-QEMU_BUILD_BUG_ON(NB_OPS > 0xff);
-QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff);
-QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff);
+/* Make sure operands fit in the bitfields above.  */
+QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
+QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 10));
+QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 14));
+
+/* Make sure that we don't overflow 64 bits without noticing.  */
+QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
  
  struct TCGContext {
      uint8_t *pool_cur, *pool_end;
@@ -497,25 +621,19 @@ struct TCGContext {
      int nb_labels;
      int nb_globals;
      int nb_temps;
+    int nb_indirects;
  
      /* goto_tb support */
      tcg_insn_unit *code_buf;
-    uintptr_t *tb_next;
-    uint16_t *tb_next_offset;
-    uint16_t *tb_jmp_offset; /* != NULL if USE_DIRECT_JUMP */
-
-    /* liveness analysis */
-    uint16_t *op_dead_args; /* for each operation, each bit tells if the
-                               corresponding argument is dead */
-    uint8_t *op_sync_args;  /* for each operation, each bit tells if the
-                               corresponding output argument needs to be
-                               sync to memory. */
-    
+    uint16_t *tb_jmp_reset_offset; /* tb->jmp_reset_offset */
+    uint16_t *tb_jmp_insn_offset; /* tb->jmp_insn_offset if USE_DIRECT_JUMP */
+    uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */
+
      TCGRegSet reserved_regs;
      intptr_t current_frame_offset;
      intptr_t frame_start;
      intptr_t frame_end;
-    int frame_reg;
+    TCGTemp *frame_temp;
  
      tcg_insn_unit *code_ptr;
  
@@ -532,6 +650,7 @@ struct TCGContext {
      int64_t del_op_count;
      int64_t code_in_len;
      int64_t code_out_len;
+    int64_t search_out_len;
      int64_t interm_time;
      int64_t code_time;
      int64_t la_time;
@@ -545,8 +664,6 @@ struct TCGContext {
      int goto_tb_issue_mask;
  #endif
  
-    int gen_first_op_idx;
-    int gen_last_op_idx;
      int gen_next_op_idx;
      int gen_next_parm_idx;
  
@@ -558,33 +675,42 @@ struct TCGContext {
      void *code_gen_prologue;
      void *code_gen_buffer;
      size_t code_gen_buffer_size;
-    /* threshold to flush the translated code buffer */
-    size_t code_gen_buffer_max_size;
      void *code_gen_ptr;
  
+    /* Threshold to flush the translated code buffer.  */
+    void *code_gen_highwater;
+
      TBContext tb_ctx;
  
-    /* The TCGBackendData structure is private to tcg-target.c.  */
+    /* Track which vCPU triggers events */
+    CPUState *cpu;                      /* *_trans */
+    TCGv_env tcg_env;                   /* *_exec  */
+
+    /* The TCGBackendData structure is private to tcg-target.inc.c.  */
      struct TCGBackendData *be;
  
      TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
      TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
  
-    /* tells in which temporary a given register is. It does not take
-       into account fixed registers */
-    int reg_to_temp[TCG_TARGET_NB_REGS];
+    /* Tells which temporary holds a given register.
+       It does not take into account fixed registers */
+    TCGTemp *reg_to_temp[TCG_TARGET_NB_REGS];
  
      TCGOp gen_op_buf[OPC_BUF_SIZE];
      TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE];
  
-    target_ulong gen_opc_pc[OPC_BUF_SIZE];
-    uint16_t gen_opc_icount[OPC_BUF_SIZE];
-    uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
-
-    target_ulong gen_opc_data[TARGET_INSN_START_WORDS];
+    uint16_t gen_insn_end_off[TCG_MAX_INSNS];
+    target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
  };
  
  extern TCGContext tcg_ctx;
+extern bool parallel_cpus;
+
+static inline void tcg_set_insn_param(int op_idx, int arg, TCGArg v)
+{
+    int op_argi = tcg_ctx.gen_op_buf[op_idx].args;
+    tcg_ctx.gen_opparam_buf[op_argi + arg] = v;
+}
  
  /* The number of opcodes emitted so far.  */
  static inline int tcg_op_buf_count(void)
@@ -602,7 +728,6 @@ static inline bool tcg_op_buf_full(void)
  
  void *tcg_malloc_internal(TCGContext *s, int size);
  void tcg_pool_reset(TCGContext *s);
-void tcg_pool_delete(TCGContext *s);
  
  void tb_lock(void);
  void tb_unlock(void);
@@ -627,39 +752,54 @@ void tcg_context_init(TCGContext *s);
  void tcg_prologue_init(TCGContext *s);
  void tcg_func_start(TCGContext *s);
  
-int tcg_gen_code(TCGContext *s, tcg_insn_unit *gen_code_buf);
-int tcg_gen_code_search_pc(TCGContext *s, tcg_insn_unit *gen_code_buf,
-                           long offset);
+int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
+
+void tcg_set_frame(TCGContext *s, TCGReg reg, intptr_t start, intptr_t size);
+
+int tcg_global_mem_new_internal(TCGType, TCGv_ptr, intptr_t, const char *);
  
-void tcg_set_frame(TCGContext *s, int reg, intptr_t start, intptr_t size);
+TCGv_i32 tcg_global_reg_new_i32(TCGReg reg, const char *name);
+TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char *name);
  
-TCGv_i32 tcg_global_reg_new_i32(int reg, const char *name);
-TCGv_i32 tcg_global_mem_new_i32(int reg, intptr_t offset, const char *name);
  TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
+TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+
+void tcg_temp_free_i32(TCGv_i32 arg);
+void tcg_temp_free_i64(TCGv_i64 arg);
+
+static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
+                                              const char *name)
+{
+    int idx = tcg_global_mem_new_internal(TCG_TYPE_I32, reg, offset, name);
+    return MAKE_TCGV_I32(idx);
+}
+
  static inline TCGv_i32 tcg_temp_new_i32(void)
  {
      return tcg_temp_new_internal_i32(0);
  }
+
  static inline TCGv_i32 tcg_temp_local_new_i32(void)
  {
      return tcg_temp_new_internal_i32(1);
  }
-void tcg_temp_free_i32(TCGv_i32 arg);
-char *tcg_get_arg_str_i32(TCGContext *s, char *buf, int buf_size, TCGv_i32 arg);
  
-TCGv_i64 tcg_global_reg_new_i64(int reg, const char *name);
-TCGv_i64 tcg_global_mem_new_i64(int reg, intptr_t offset, const char *name);
-TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+static inline TCGv_i64 tcg_global_mem_new_i64(TCGv_ptr reg, intptr_t offset,
+                                              const char *name)
+{
+    int idx = tcg_global_mem_new_internal(TCG_TYPE_I64, reg, offset, name);
+    return MAKE_TCGV_I64(idx);
+}
+
  static inline TCGv_i64 tcg_temp_new_i64(void)
  {
      return tcg_temp_new_internal_i64(0);
  }
+
  static inline TCGv_i64 tcg_temp_local_new_i64(void)
  {
      return tcg_temp_new_internal_i64(1);
  }
-void tcg_temp_free_i64(TCGv_i64 arg);
-char *tcg_get_arg_str_i64(TCGContext *s, char *buf, int buf_size, TCGv_i64 arg);
  
  #if defined(CONFIG_DEBUG_TCG)
  /* If you call tcg_clear_temp_count() at the start of a section of
@@ -733,15 +873,6 @@ do {\
      abort();\
  } while (0)
  
-#ifdef CONFIG_DEBUG_TCG
-# define tcg_debug_assert(X) do { assert(X); } while (0)
-#elif QEMU_GNUC_PREREQ(4, 5)
-# define tcg_debug_assert(X) \
-    do { if (!(X)) { __builtin_unreachable(); } } while (0)
-#else
-# define tcg_debug_assert(X) do { (void)(X); } while (0)
-#endif
-
  void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs);
  
  #if UINTPTR_MAX == UINT32_MAX
@@ -772,12 +903,14 @@ void tcg_gen_callN(TCGContext *s, void *func,
                     TCGArg ret, int nargs, TCGArg *args);
  
  void tcg_op_remove(TCGContext *s, TCGOp *op);
+TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
+TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
+
  void tcg_optimize(TCGContext *s);
  
  /* only used for debugging purposes */
  void tcg_dump_ops(TCGContext *s);
  
-void dump_ops(const uint16_t *opc_buf, const TCGArg *opparam_buf);
  TCGv_i32 tcg_const_i32(int32_t val);
  TCGv_i64 tcg_const_i64(int64_t val);
  TCGv_i32 tcg_const_local_i32(int32_t val);
@@ -895,7 +1028,7 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi)
  
  /**
   * tcg_qemu_tb_exec:
- * @env: CPUArchState * for the CPU
+ * @env: pointer to CPUArchState for the CPU
   * @tb_ptr: address of generated code for the TB to execute
   *
   * Start executing code from a given translation block.
@@ -906,30 +1039,31 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi)
   * which has not yet been directly linked, or an asynchronous
   * event such as an interrupt needs handling.
   *
- * The return value is a pointer to the next TB to execute
- * (if known; otherwise zero). This pointer is assumed to be
- * 4-aligned, and the bottom two bits are used to return further
- * information:
+ * Return: The return value is the value passed to the corresponding
+ * tcg_gen_exit_tb() at translation time of the last TB attempted to execute.
+ * The value is either zero or a 4-byte aligned pointer to that TB combined
+ * with additional information in its two least significant bits. The
+ * additional information is encoded as follows:
   *  0, 1: the link between this TB and the next is via the specified
   *        TB index (0 or 1). That is, we left the TB via (the equivalent
   *        of) "goto_tb <index>". The main loop uses this to determine
   *        how to link the TB just executed to the next.
   *  2:    we are using instruction counting code generation, and we
   *        did not start executing this TB because the instruction counter
- *        would hit zero midway through it. In this case the next-TB pointer
+ *        would hit zero midway through it. In this case the pointer
   *        returned is the TB we were about to execute, and the caller must
   *        arrange to execute the remaining count of instructions.
   *  3:    we stopped because the CPU's exit_request flag was set
   *        (usually meaning that there is an interrupt that needs to be
- *        handled). The next-TB pointer returned is the TB we were
- *        about to execute when we noticed the pending exit request.
+ *        handled). The pointer returned is the TB we were about to execute
+ *        when we noticed the pending exit request.
   *
   * If the bottom two bits indicate an exit-via-index then the CPU
   * state is correctly synchronised and ready for execution of the next
   * TB (and in particular the guest PC is the address to execute next).
   * Otherwise, we gave up on execution of this TB before it started, and
   * the caller must fix up the CPU state by calling the CPU's
- * synchronize_from_tb() method with the next-TB pointer we return (falling
+ * synchronize_from_tb() method with the TB pointer we return (falling
   * back to calling the CPU's set_pc method with tb->pb if no
   * synchronize_from_tb() method exists).
   *
@@ -1043,6 +1177,90 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
  # define helper_ret_ldq_cmmu  helper_le_ldq_cmmu
  #endif
  
+uint32_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
+                                    uint32_t cmpv, uint32_t newv,
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgw_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgl_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint64_t helper_atomic_cmpxchgq_le_mmu(CPUArchState *env, target_ulong addr,
+                                       uint64_t cmpv, uint64_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgw_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint32_t helper_atomic_cmpxchgl_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint32_t cmpv, uint32_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+uint64_t helper_atomic_cmpxchgq_be_mmu(CPUArchState *env, target_ulong addr,
+                                       uint64_t cmpv, uint64_t newv,
+                                       TCGMemOpIdx oi, uintptr_t retaddr);
+
+#define GEN_ATOMIC_HELPER(NAME, TYPE, SUFFIX)         \
+TYPE helper_atomic_ ## NAME ## SUFFIX ## _mmu         \
+    (CPUArchState *env, target_ulong addr, TYPE val,  \
+     TCGMemOpIdx oi, uintptr_t retaddr);
+
+#ifdef CONFIG_ATOMIC64
+#define GEN_ATOMIC_HELPER_ALL(NAME)          \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, b)     \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, w_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, w_be)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_be)  \
+    GEN_ATOMIC_HELPER(NAME, uint64_t, q_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint64_t, q_be)
+#else
+#define GEN_ATOMIC_HELPER_ALL(NAME)          \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, b)     \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, w_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, w_be)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_le)  \
+    GEN_ATOMIC_HELPER(NAME, uint32_t, l_be)
+#endif
+
+GEN_ATOMIC_HELPER_ALL(fetch_add)
+GEN_ATOMIC_HELPER_ALL(fetch_sub)
+GEN_ATOMIC_HELPER_ALL(fetch_and)
+GEN_ATOMIC_HELPER_ALL(fetch_or)
+GEN_ATOMIC_HELPER_ALL(fetch_xor)
+
+GEN_ATOMIC_HELPER_ALL(add_fetch)
+GEN_ATOMIC_HELPER_ALL(sub_fetch)
+GEN_ATOMIC_HELPER_ALL(and_fetch)
+GEN_ATOMIC_HELPER_ALL(or_fetch)
+GEN_ATOMIC_HELPER_ALL(xor_fetch)
+
+GEN_ATOMIC_HELPER_ALL(xchg)
+
+#undef GEN_ATOMIC_HELPER_ALL
+#undef GEN_ATOMIC_HELPER
  #endif /* CONFIG_SOFTMMU */
  
+#ifdef CONFIG_ATOMIC128
+#include "qemu/int128.h"
+
+/* These aren't really "proper" helpers because TCG cannot manage Int128.
+   However, use the same format as the others, for use by the backends. */
+Int128 helper_atomic_cmpxchgo_le_mmu(CPUArchState *env, target_ulong addr,
+                                     Int128 cmpv, Int128 newv,
+                                     TCGMemOpIdx oi, uintptr_t retaddr);
+Int128 helper_atomic_cmpxchgo_be_mmu(CPUArchState *env, target_ulong addr,
+                                     Int128 cmpv, Int128 newv,
+                                     TCGMemOpIdx oi, uintptr_t retaddr);
+
+Int128 helper_atomic_ldo_le_mmu(CPUArchState *env, target_ulong addr,
+                                TCGMemOpIdx oi, uintptr_t retaddr);
+Int128 helper_atomic_ldo_be_mmu(CPUArchState *env, target_ulong addr,
+                                TCGMemOpIdx oi, uintptr_t retaddr);
+void helper_atomic_sto_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
+                              TCGMemOpIdx oi, uintptr_t retaddr);
+void helper_atomic_sto_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
+                              TCGMemOpIdx oi, uintptr_t retaddr);
+
+#endif /* CONFIG_ATOMIC128 */
+
  #endif /* TCG_H */