Merge remote-tracking branch 'remotes/mjt/tags/trivial-patches-2014-02-02' into staging

[qemu.git] / exec.c
diff --git a/exec.c b/exec.c

index e3ac1a18fdc8a2899d180fe621eddf4ff915d645..9ad0a4b04568b3334f7691636437842c9f0d7b74 100644 (file)
--- a/exec.c
+++ b/exec.c
@@ -31,6 +31,7 @@
  #include "hw/qdev.h"
  #include "qemu/osdep.h"
  #include "sysemu/kvm.h"
+#include "sysemu/sysemu.h"
  #include "hw/xen/xen.h"
  #include "qemu/timer.h"
  #include "qemu/config-file.h"
@@ -49,12 +50,15 @@
  #include "translate-all.h"
  
  #include "exec/memory-internal.h"
+#include "exec/ram_addr.h"
+#include "qemu/cache-utils.h"
+
+#include "qemu/range.h"
  
  //#define DEBUG_SUBPAGE
  
  #if !defined(CONFIG_USER_ONLY)
-int phys_ram_fd;
-static int in_migration;
+static bool in_migration;
  
  RAMList ram_list = { .blocks = QTAILQ_HEAD_INITIALIZER(ram_list.blocks) };
  
@@ -69,10 +73,10 @@ static MemoryRegion io_mem_unassigned;
  
  #endif
  
-CPUArchState *first_cpu;
+struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
  /* current CPU in the current thread. It is only valid inside
     cpu_exec() */
-DEFINE_TLS(CPUArchState *,cpu_single_env);
+DEFINE_TLS(CPUState *, current_cpu);
  /* 0 = Do not count executed instructions.
     1 = Precise instruction counting.
     2 = Adaptive rate instruction counting.  */
@@ -83,17 +87,39 @@ int use_icount;
  typedef struct PhysPageEntry PhysPageEntry;
  
  struct PhysPageEntry {
-    uint16_t is_leaf : 1;
-     /* index into phys_sections (is_leaf) or phys_map_nodes (!is_leaf) */
-    uint16_t ptr : 15;
+    /* Number of levels to skip to reach the next node (P_L2_BITS bits per level). 0 for a leaf. */
+    uint32_t skip : 6;
+     /* index into PhysPageMap.sections (!skip) or PhysPageMap.nodes (skip) */
+    uint32_t ptr : 26;
  };
  
+#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
+
+/* Size of the L2 (and L3, etc) page tables.  */
+#define ADDR_SPACE_BITS 64
+
+#define P_L2_BITS 9
+#define P_L2_SIZE (1 << P_L2_BITS)
+
+#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
+
+typedef PhysPageEntry Node[P_L2_SIZE];
+
+typedef struct PhysPageMap {
+    unsigned sections_nb;
+    unsigned sections_nb_alloc;
+    unsigned nodes_nb;
+    unsigned nodes_nb_alloc;
+    Node *nodes;
+    MemoryRegionSection *sections;
+} PhysPageMap;
+
  struct AddressSpaceDispatch {
      /* This is a multi-level map on the physical address space.
       * The bottom level has pointers to MemoryRegionSections.
       */
      PhysPageEntry phys_map;
-    MemoryListener listener;
+    PhysPageMap map;
      AddressSpace *as;
  };
  
@@ -105,85 +131,73 @@ typedef struct subpage_t {
      uint16_t sub_section[TARGET_PAGE_SIZE];
  } subpage_t;
  
-static MemoryRegionSection *phys_sections;
-static unsigned phys_sections_nb, phys_sections_nb_alloc;
-static uint16_t phys_section_unassigned;
-static uint16_t phys_section_notdirty;
-static uint16_t phys_section_rom;
-static uint16_t phys_section_watch;
-
-/* Simple allocator for PhysPageEntry nodes */
-static PhysPageEntry (*phys_map_nodes)[L2_SIZE];
-static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc;
-
-#define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
+#define PHYS_SECTION_UNASSIGNED 0
+#define PHYS_SECTION_NOTDIRTY 1
+#define PHYS_SECTION_ROM 2
+#define PHYS_SECTION_WATCH 3
  
  static void io_mem_init(void);
  static void memory_map_init(void);
-static void *qemu_safe_ram_ptr(ram_addr_t addr);
  
  static MemoryRegion io_mem_watch;
  #endif
  
  #if !defined(CONFIG_USER_ONLY)
  
-static void phys_map_node_reserve(unsigned nodes)
+static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
  {
-    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
-        typedef PhysPageEntry Node[L2_SIZE];
-        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
-        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
-                                      phys_map_nodes_nb + nodes);
-        phys_map_nodes = g_renew(Node, phys_map_nodes,
-                                 phys_map_nodes_nb_alloc);
+    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
+        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc * 2, 16);
+        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
+        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
      }
  }
  
-static uint16_t phys_map_node_alloc(void)
+static uint32_t phys_map_node_alloc(PhysPageMap *map)
  {
      unsigned i;
-    uint16_t ret;
+    uint32_t ret;
  
-    ret = phys_map_nodes_nb++;
+    ret = map->nodes_nb++;
      assert(ret != PHYS_MAP_NODE_NIL);
-    assert(ret != phys_map_nodes_nb_alloc);
-    for (i = 0; i < L2_SIZE; ++i) {
-        phys_map_nodes[ret][i].is_leaf = 0;
-        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
+    assert(ret != map->nodes_nb_alloc);
+    for (i = 0; i < P_L2_SIZE; ++i) {
+        map->nodes[ret][i].skip = 1;
+        map->nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
      }
      return ret;
  }
  
-static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
-                                hwaddr *nb, uint16_t leaf,
+static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
+                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                  int level)
  {
      PhysPageEntry *p;
      int i;
-    hwaddr step = (hwaddr)1 << (level * L2_BITS);
+    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
  
-    if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
-        lp->ptr = phys_map_node_alloc();
-        p = phys_map_nodes[lp->ptr];
+    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
+        lp->ptr = phys_map_node_alloc(map);
+        p = map->nodes[lp->ptr];
          if (level == 0) {
-            for (i = 0; i < L2_SIZE; i++) {
-                p[i].is_leaf = 1;
-                p[i].ptr = phys_section_unassigned;
+            for (i = 0; i < P_L2_SIZE; i++) {
+                p[i].skip = 0;
+                p[i].ptr = PHYS_SECTION_UNASSIGNED;
              }
          }
      } else {
-        p = phys_map_nodes[lp->ptr];
+        p = map->nodes[lp->ptr];
      }
-    lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
+    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
  
-    while (*nb && lp < &p[L2_SIZE]) {
+    while (*nb && lp < &p[P_L2_SIZE]) {
          if ((*index & (step - 1)) == 0 && *nb >= step) {
-            lp->is_leaf = true;
+            lp->skip = 0;
              lp->ptr = leaf;
              *index += step;
              *nb -= step;
          } else {
-            phys_page_set_level(lp, index, nb, leaf, level - 1);
+            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
          }
          ++lp;
      }
@@ -194,25 +208,95 @@ static void phys_page_set(AddressSpaceDispatch *d,
                            uint16_t leaf)
  {
      /* Wildly overreserve - it doesn't matter much. */
-    phys_map_node_reserve(3 * P_L2_LEVELS);
+    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
  
-    phys_page_set_level(&d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
+    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
  }
  
-static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr index)
+/* Compact a non leaf page entry. Simply detect that the entry has a single child,
+ * and update our entry so we can skip it and go directly to the destination.
+ */
+static void phys_page_compact(PhysPageEntry *lp, Node *nodes, unsigned long *compacted)
  {
-    PhysPageEntry lp = d->phys_map;
+    unsigned valid_ptr = P_L2_SIZE;
+    int valid = 0;
      PhysPageEntry *p;
      int i;
  
-    for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
+    if (lp->ptr == PHYS_MAP_NODE_NIL) {
+        return;
+    }
+
+    p = nodes[lp->ptr];
+    for (i = 0; i < P_L2_SIZE; i++) {
+        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
+            continue;
+        }
+
+        valid_ptr = i;
+        valid++;
+        if (p[i].skip) {
+            phys_page_compact(&p[i], nodes, compacted);
+        }
+    }
+
+    /* We can only compress if there's only one child. */
+    if (valid != 1) {
+        return;
+    }
+
+    assert(valid_ptr < P_L2_SIZE);
+
+    /* Don't compress if it won't fit in the # of bits we have. */
+    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
+        return;
+    }
+
+    lp->ptr = p[valid_ptr].ptr;
+    if (!p[valid_ptr].skip) {
+        /* If our only child is a leaf, make this a leaf. */
+        /* By design, we should have made this node a leaf to begin with so we
+         * should never reach here.
+         * But since it's so simple to handle this, let's do it just in case we
+         * change this rule.
+         */
+        lp->skip = 0;
+    } else {
+        lp->skip += p[valid_ptr].skip;
+    }
+}
+
+static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
+{
+    DECLARE_BITMAP(compacted, nodes_nb);
+
+    if (d->phys_map.skip) {
+        phys_page_compact(&d->phys_map, d->map.nodes, compacted);
+    }
+}
+
+static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
+                                           Node *nodes, MemoryRegionSection *sections)
+{
+    PhysPageEntry *p;
+    hwaddr index = addr >> TARGET_PAGE_BITS;
+    int i;
+
+    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
          if (lp.ptr == PHYS_MAP_NODE_NIL) {
-            return &phys_sections[phys_section_unassigned];
+            return &sections[PHYS_SECTION_UNASSIGNED];
          }
-        p = phys_map_nodes[lp.ptr];
-        lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
+        p = nodes[lp.ptr];
+        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
+    }
+
+    if (sections[lp.ptr].size.hi ||
+        range_covers_byte(sections[lp.ptr].offset_within_address_space,
+                          sections[lp.ptr].size.lo, addr)) {
+        return &sections[lp.ptr];
+    } else {
+        return &sections[PHYS_SECTION_UNASSIGNED];
      }
-    return &phys_sections[lp.ptr];
  }
  
  bool memory_region_is_unassigned(MemoryRegion *mr)
@@ -221,36 +305,38 @@ bool memory_region_is_unassigned(MemoryRegion *mr)
          && mr != &io_mem_watch;
  }
  
-static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
+static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                          hwaddr addr,
                                                          bool resolve_subpage)
  {
      MemoryRegionSection *section;
      subpage_t *subpage;
  
-    section = phys_page_find(as->dispatch, addr >> TARGET_PAGE_BITS);
+    section = phys_page_find(d->phys_map, addr, d->map.nodes, d->map.sections);
      if (resolve_subpage && section->mr->subpage) {
          subpage = container_of(section->mr, subpage_t, iomem);
-        section = &phys_sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
+        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
      }
      return section;
  }
  
  static MemoryRegionSection *
-address_space_translate_internal(AddressSpace *as, hwaddr addr, hwaddr *xlat,
+address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
                                   hwaddr *plen, bool resolve_subpage)
  {
      MemoryRegionSection *section;
-    Int128 diff;
+    Int128 diff, diff_page;
  
-    section = address_space_lookup_region(as, addr, resolve_subpage);
+    section = address_space_lookup_region(d, addr, resolve_subpage);
      /* Compute offset within MemoryRegionSection */
      addr -= section->offset_within_address_space;
  
      /* Compute offset within MemoryRegion */
      *xlat = addr + section->offset_within_region;
  
+    diff_page = int128_make64(((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr);
      diff = int128_sub(section->mr->size, int128_make64(addr));
+    diff = int128_min(diff, diff_page);
      *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
      return section;
  }
@@ -265,7 +351,7 @@ MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
      hwaddr len = *plen;
  
      for (;;) {
-        section = address_space_translate_internal(as, addr, &addr, plen, true);
+        section = address_space_translate_internal(as->dispatch, addr, &addr, &len, true);
          mr = section->mr;
  
          if (!mr->iommu_ops) {
@@ -294,7 +380,7 @@ address_space_translate_for_iotlb(AddressSpace *as, hwaddr addr, hwaddr *xlat,
                                    hwaddr *plen)
  {
      MemoryRegionSection *section;
-    section = address_space_translate_internal(as, addr, xlat, plen, false);
+    section = address_space_translate_internal(as->dispatch, addr, xlat, plen, false);
  
      assert(!section->mr->iommu_ops);
      return section;
@@ -341,45 +427,29 @@ const VMStateDescription vmstate_cpu_common = {
  
  CPUState *qemu_get_cpu(int index)
  {
-    CPUArchState *env = first_cpu;
-    CPUState *cpu = NULL;
+    CPUState *cpu;
  
-    while (env) {
-        cpu = ENV_GET_CPU(env);
+    CPU_FOREACH(cpu) {
          if (cpu->cpu_index == index) {
-            break;
+            return cpu;
          }
-        env = env->next_cpu;
      }
  
-    return env ? cpu : NULL;
-}
-
-void qemu_for_each_cpu(void (*func)(CPUState *cpu, void *data), void *data)
-{
-    CPUArchState *env = first_cpu;
-
-    while (env) {
-        func(ENV_GET_CPU(env), data);
-        env = env->next_cpu;
-    }
+    return NULL;
  }
  
  void cpu_exec_init(CPUArchState *env)
  {
      CPUState *cpu = ENV_GET_CPU(env);
      CPUClass *cc = CPU_GET_CLASS(cpu);
-    CPUArchState **penv;
+    CPUState *some_cpu;
      int cpu_index;
  
  #if defined(CONFIG_USER_ONLY)
      cpu_list_lock();
  #endif
-    env->next_cpu = NULL;
-    penv = &first_cpu;
      cpu_index = 0;
-    while (*penv != NULL) {
-        penv = &(*penv)->next_cpu;
+    CPU_FOREACH(some_cpu) {
          cpu_index++;
      }
      cpu->cpu_index = cpu_index;
@@ -389,15 +459,18 @@ void cpu_exec_init(CPUArchState *env)
  #ifndef CONFIG_USER_ONLY
      cpu->thread_id = qemu_get_thread_id();
  #endif
-    *penv = env;
+    QTAILQ_INSERT_TAIL(&cpus, cpu, node);
  #if defined(CONFIG_USER_ONLY)
      cpu_list_unlock();
  #endif
-    vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
+    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
+        vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
+    }
  #if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY)
      register_savevm(NULL, "cpu", cpu_index, CPU_SAVE_VERSION,
                      cpu_save, cpu_load, env);
      assert(cc->vmsd == NULL);
+    assert(qdev_get_vmsd(DEVICE(cpu)) == NULL);
  #endif
      if (cc->vmsd != NULL) {
          vmstate_register(NULL, cpu_index, cc->vmsd, cpu);
@@ -406,15 +479,17 @@ void cpu_exec_init(CPUArchState *env)
  
  #if defined(TARGET_HAS_ICE)
  #if defined(CONFIG_USER_ONLY)
-static void breakpoint_invalidate(CPUArchState *env, target_ulong pc)
+static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
  {
      tb_invalidate_phys_page_range(pc, pc + 1, 0);
  }
  #else
-static void breakpoint_invalidate(CPUArchState *env, target_ulong pc)
+static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
  {
-    tb_invalidate_phys_addr(cpu_get_phys_page_debug(env, pc) |
-            (pc & ~TARGET_PAGE_MASK));
+    hwaddr phys = cpu_get_phys_page_debug(cpu, pc);
+    if (phys != -1) {
+        tb_invalidate_phys_addr(phys | (pc & ~TARGET_PAGE_MASK));
+    }
  }
  #endif
  #endif /* TARGET_HAS_ICE */
@@ -516,15 +591,17 @@ int cpu_breakpoint_insert(CPUArchState *env, target_ulong pc, int flags,
      bp->flags = flags;
  
      /* keep all GDB-injected breakpoints in front */
-    if (flags & BP_GDB)
+    if (flags & BP_GDB) {
          QTAILQ_INSERT_HEAD(&env->breakpoints, bp, entry);
-    else
+    } else {
          QTAILQ_INSERT_TAIL(&env->breakpoints, bp, entry);
+    }
  
-    breakpoint_invalidate(env, pc);
+    breakpoint_invalidate(ENV_GET_CPU(env), pc);
  
-    if (breakpoint)
+    if (breakpoint) {
          *breakpoint = bp;
+    }
      return 0;
  #else
      return -ENOSYS;
@@ -555,7 +632,7 @@ void cpu_breakpoint_remove_by_ref(CPUArchState *env, CPUBreakpoint *breakpoint)
  #if defined(TARGET_HAS_ICE)
      QTAILQ_REMOVE(&env->breakpoints, breakpoint, entry);
  
-    breakpoint_invalidate(env, breakpoint->pc);
+    breakpoint_invalidate(ENV_GET_CPU(env), breakpoint->pc);
  
      g_free(breakpoint);
  #endif
@@ -576,16 +653,17 @@ void cpu_breakpoint_remove_all(CPUArchState *env, int mask)
  
  /* enable or disable single step mode. EXCP_DEBUG is returned by the
     CPU loop after each instruction */
-void cpu_single_step(CPUArchState *env, int enabled)
+void cpu_single_step(CPUState *cpu, int enabled)
  {
  #if defined(TARGET_HAS_ICE)
-    if (env->singlestep_enabled != enabled) {
-        env->singlestep_enabled = enabled;
-        if (kvm_enabled())
-            kvm_update_guest_debug(env, 0);
-        else {
+    if (cpu->singlestep_enabled != enabled) {
+        cpu->singlestep_enabled = enabled;
+        if (kvm_enabled()) {
+            kvm_update_guest_debug(cpu, 0);
+        } else {
              /* must flush all the translated code to avoid inconsistencies */
              /* XXX: only flush what is necessary */
+            CPUArchState *env = cpu->env_ptr;
              tb_flush(env);
          }
      }
@@ -608,7 +686,7 @@ void cpu_abort(CPUArchState *env, const char *fmt, ...)
          qemu_log("qemu: fatal: ");
          qemu_log_vprintf(fmt, ap2);
          qemu_log("\n");
-        log_cpu_state(env, CPU_DUMP_FPU | CPU_DUMP_CCOP);
+        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
          qemu_log_flush();
          qemu_log_close();
      }
@@ -625,81 +703,61 @@ void cpu_abort(CPUArchState *env, const char *fmt, ...)
      abort();
  }
  
-CPUArchState *cpu_copy(CPUArchState *env)
+#if !defined(CONFIG_USER_ONLY)
+static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
  {
-    CPUArchState *new_env = cpu_init(env->cpu_model_str);
-    CPUArchState *next_cpu = new_env->next_cpu;
-#if defined(TARGET_HAS_ICE)
-    CPUBreakpoint *bp;
-    CPUWatchpoint *wp;
-#endif
-
-    memcpy(new_env, env, sizeof(CPUArchState));
-
-    /* Preserve chaining. */
-    new_env->next_cpu = next_cpu;
+    RAMBlock *block;
  
-    /* Clone all break/watchpoints.
-       Note: Once we support ptrace with hw-debug register access, make sure
-       BP_CPU break/watchpoints are handled correctly on clone. */
-    QTAILQ_INIT(&env->breakpoints);
-    QTAILQ_INIT(&env->watchpoints);
-#if defined(TARGET_HAS_ICE)
-    QTAILQ_FOREACH(bp, &env->breakpoints, entry) {
-        cpu_breakpoint_insert(new_env, bp->pc, bp->flags, NULL);
+    /* The list is protected by the iothread lock here.  */
+    block = ram_list.mru_block;
+    if (block && addr - block->offset < block->length) {
+        goto found;
      }
-    QTAILQ_FOREACH(wp, &env->watchpoints, entry) {
-        cpu_watchpoint_insert(new_env, wp->vaddr, (~wp->len_mask) + 1,
-                              wp->flags, NULL);
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        if (addr - block->offset < block->length) {
+            goto found;
+        }
      }
-#endif
-
-    return new_env;
-}
  
-#if !defined(CONFIG_USER_ONLY)
-static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t end,
-                                      uintptr_t length)
-{
-    uintptr_t start1;
-
-    /* we modify the TLB cache so that the dirty bit will be set again
-       when accessing the range */
-    start1 = (uintptr_t)qemu_safe_ram_ptr(start);
-    /* Check that we don't span multiple blocks - this breaks the
-       address comparisons below.  */
-    if ((uintptr_t)qemu_safe_ram_ptr(end - 1) - start1
-            != (end - 1) - start) {
-        abort();
-    }
-    cpu_tlb_reset_dirty_all(start1, length);
+    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
+    abort();
  
+found:
+    ram_list.mru_block = block;
+    return block;
  }
  
-/* Note: start and end must be within the same ram block.  */
-void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
-                                     int dirty_flags)
+static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
  {
-    uintptr_t length;
+    ram_addr_t start1;
+    RAMBlock *block;
+    ram_addr_t end;
  
+    end = TARGET_PAGE_ALIGN(start + length);
      start &= TARGET_PAGE_MASK;
-    end = TARGET_PAGE_ALIGN(end);
  
-    length = end - start;
+    block = qemu_get_ram_block(start);
+    assert(block == qemu_get_ram_block(end - 1));
+    start1 = (uintptr_t)block->host + (start - block->offset);
+    cpu_tlb_reset_dirty_all(start1, length);
+}
+
+/* Note: start and end must be within the same ram block.  */
+void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t length,
+                                     unsigned client)
+{
      if (length == 0)
          return;
-    cpu_physical_memory_mask_dirty_range(start, length, dirty_flags);
+    cpu_physical_memory_clear_dirty_range(start, length, client);
  
      if (tcg_enabled()) {
-        tlb_reset_dirty_range_all(start, end, length);
+        tlb_reset_dirty_range_all(start, length);
      }
  }
  
-static int cpu_physical_memory_set_dirty_tracking(int enable)
+static void cpu_physical_memory_set_dirty_tracking(bool enable)
  {
-    int ret = 0;
      in_migration = enable;
-    return ret;
  }
  
  hwaddr memory_region_section_get_iotlb(CPUArchState *env,
@@ -717,12 +775,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
          iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
              + xlat;
          if (!section->readonly) {
-            iotlb |= phys_section_notdirty;
+            iotlb |= PHYS_SECTION_NOTDIRTY;
          } else {
-            iotlb |= phys_section_rom;
+            iotlb |= PHYS_SECTION_ROM;
          }
      } else {
-        iotlb = section - phys_sections;
+        iotlb = section - address_space_memory.dispatch->map.sections;
          iotlb += xlat;
      }
  
@@ -732,7 +790,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
          if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
              /* Avoid trapping reads of pages with a write breakpoint. */
              if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
-                iotlb = phys_section_watch + paddr;
+                iotlb = PHYS_SECTION_WATCH + paddr;
                  *address |= TLB_MMIO;
                  break;
              }
@@ -749,22 +807,35 @@ static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                               uint16_t section);
  static subpage_t *subpage_init(AddressSpace *as, hwaddr base);
  
-static uint16_t phys_section_add(MemoryRegionSection *section)
+static void *(*phys_mem_alloc)(size_t size) = qemu_anon_ram_alloc;
+
+/*
+ * Set a custom physical guest memory allocator.
+ * Accelerators with unusual needs may need this.  Hopefully, we can
+ * get rid of it eventually.
+ */
+void phys_mem_set_alloc(void *(*alloc)(size_t))
+{
+    phys_mem_alloc = alloc;
+}
+
+static uint16_t phys_section_add(PhysPageMap *map,
+                                 MemoryRegionSection *section)
  {
      /* The physical section number is ORed with a page-aligned
       * pointer to produce the iotlb entries.  Thus it should
       * never overflow into the page-aligned value.
       */
-    assert(phys_sections_nb < TARGET_PAGE_SIZE);
+    assert(map->sections_nb < TARGET_PAGE_SIZE);
  
-    if (phys_sections_nb == phys_sections_nb_alloc) {
-        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
-        phys_sections = g_renew(MemoryRegionSection, phys_sections,
-                                phys_sections_nb_alloc);
+    if (map->sections_nb == map->sections_nb_alloc) {
+        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
+        map->sections = g_renew(MemoryRegionSection, map->sections,
+                                map->sections_nb_alloc);
      }
-    phys_sections[phys_sections_nb] = *section;
+    map->sections[map->sections_nb] = *section;
      memory_region_ref(section->mr);
-    return phys_sections_nb++;
+    return map->sections_nb++;
  }
  
  static void phys_section_destroy(MemoryRegion *mr)
@@ -778,13 +849,14 @@ static void phys_section_destroy(MemoryRegion *mr)
      }
  }
  
-static void phys_sections_clear(void)
+static void phys_sections_free(PhysPageMap *map)
  {
-    while (phys_sections_nb > 0) {
-        MemoryRegionSection *section = &phys_sections[--phys_sections_nb];
+    while (map->sections_nb > 0) {
+        MemoryRegionSection *section = &map->sections[--map->sections_nb];
          phys_section_destroy(section->mr);
      }
-    phys_map_nodes_nb = 0;
+    g_free(map->sections);
+    g_free(map->nodes);
  }
  
  static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
@@ -792,7 +864,8 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti
      subpage_t *subpage;
      hwaddr base = section->offset_within_address_space
          & TARGET_PAGE_MASK;
-    MemoryRegionSection *existing = phys_page_find(d, base >> TARGET_PAGE_BITS);
+    MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
+                                                   d->map.nodes, d->map.sections);
      MemoryRegionSection subsection = {
          .offset_within_address_space = base,
          .size = int128_make64(TARGET_PAGE_SIZE),
@@ -805,13 +878,14 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti
          subpage = subpage_init(d->as, base);
          subsection.mr = &subpage->iomem;
          phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
-                      phys_section_add(&subsection));
+                      phys_section_add(&d->map, &subsection));
      } else {
          subpage = container_of(existing->mr, subpage_t, iomem);
      }
      start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
      end = start + int128_get64(section->size) - 1;
-    subpage_register(subpage, start, end, phys_section_add(section));
+    subpage_register(subpage, start, end,
+                     phys_section_add(&d->map, section));
  }
  
  
@@ -819,7 +893,7 @@ static void register_multipage(AddressSpaceDispatch *d,
                                 MemoryRegionSection *section)
  {
      hwaddr start_addr = section->offset_within_address_space;
-    uint16_t section_index = phys_section_add(section);
+    uint16_t section_index = phys_section_add(&d->map, section);
      uint64_t num_pages = int128_get64(int128_rshift(section->size,
                                                      TARGET_PAGE_BITS));
  
@@ -829,7 +903,8 @@ static void register_multipage(AddressSpaceDispatch *d,
  
  static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
  {
-    AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
+    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
+    AddressSpaceDispatch *d = as->next_dispatch;
      MemoryRegionSection now = *section, remain = *section;
      Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
  
@@ -849,7 +924,7 @@ static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
          now = remain;
          if (int128_lt(remain.size, page_size)) {
              register_subpage(d, &now);
-        } else if (remain.offset_within_region & ~TARGET_PAGE_MASK) {
+        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
              now.size = page_size;
              register_subpage(d, &now);
          } else {
@@ -875,7 +950,7 @@ void qemu_mutex_unlock_ramlist(void)
      qemu_mutex_unlock(&ram_list.mutex);
  }
  
-#if defined(__linux__) && !defined(TARGET_S390X)
+#ifdef __linux__
  
  #include <sys/vfs.h>
  
@@ -901,6 +976,13 @@ static long gethugepagesize(const char *path)
      return fs.f_bsize;
  }
  
+static sigjmp_buf sigjump;
+
+static void sigbus_handler(int signal)
+{
+    siglongjmp(sigjump, 1);
+}
+
  static void *file_ram_alloc(RAMBlock *block,
                              ram_addr_t memory,
                              const char *path)
@@ -910,9 +992,6 @@ static void *file_ram_alloc(RAMBlock *block,
      char *c;
      void *area;
      int fd;
-#ifdef MAP_POPULATE
-    int flags;
-#endif
      unsigned long hpagesize;
  
      hpagesize = gethugepagesize(path);
@@ -960,24 +1039,63 @@ static void *file_ram_alloc(RAMBlock *block,
      if (ftruncate(fd, memory))
          perror("ftruncate");
  
-#ifdef MAP_POPULATE
-    /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case
-     * MAP_PRIVATE is requested.  For mem_prealloc we mmap as MAP_SHARED
-     * to sidestep this quirk.
-     */
-    flags = mem_prealloc ? MAP_POPULATE | MAP_SHARED : MAP_PRIVATE;
-    area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0);
-#else
      area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-#endif
      if (area == MAP_FAILED) {
          perror("file_ram_alloc: can't mmap RAM pages");
          close(fd);
          return (NULL);
      }
+
+    if (mem_prealloc) {
+        int ret, i;
+        struct sigaction act, oldact;
+        sigset_t set, oldset;
+
+        memset(&act, 0, sizeof(act));
+        act.sa_handler = &sigbus_handler;
+        act.sa_flags = 0;
+
+        ret = sigaction(SIGBUS, &act, &oldact);
+        if (ret) {
+            perror("file_ram_alloc: failed to install signal handler");
+            exit(1);
+        }
+
+        /* unblock SIGBUS */
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
+
+        if (sigsetjmp(sigjump, 1)) {
+            fprintf(stderr, "file_ram_alloc: failed to preallocate pages\n");
+            exit(1);
+        }
+
+        /* MAP_POPULATE silently ignores failures */
+        for (i = 0; i < (memory/hpagesize); i++) {
+            memset(area + (hpagesize*i), 0, 1);
+        }
+
+        ret = sigaction(SIGBUS, &oldact, NULL);
+        if (ret) {
+            perror("file_ram_alloc: failed to reinstall signal handler");
+            exit(1);
+        }
+
+        pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+    }
+
      block->fd = fd;
      return area;
  }
+#else
+static void *file_ram_alloc(RAMBlock *block,
+                            ram_addr_t memory,
+                            const char *path)
+{
+    fprintf(stderr, "-mem-path not supported on this host\n");
+    exit(1);
+}
  #endif
  
  static ram_addr_t find_ram_offset(ram_addr_t size)
@@ -1029,12 +1147,10 @@ ram_addr_t last_ram_offset(void)
  static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
  {
      int ret;
-    QemuOpts *machine_opts;
  
      /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
-    machine_opts = qemu_opts_find(qemu_find_opts("machine"), 0);
-    if (machine_opts &&
-        !qemu_opt_get_bool(machine_opts, "dump-guest-core", true)) {
+    if (!qemu_opt_get_bool(qemu_get_machine_opts(),
+                           "dump-guest-core", true)) {
          ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
          if (ret) {
              perror("qemu_madvise");
@@ -1081,10 +1197,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
  
  static int memory_try_enable_merging(void *addr, size_t len)
  {
-    QemuOpts *opts;
-
-    opts = qemu_opts_find(qemu_find_opts("machine"), 0);
-    if (opts && !qemu_opt_get_bool(opts, "mem-merge", true)) {
+    if (!qemu_opt_get_bool(qemu_get_machine_opts(), "mem-merge", true)) {
          /* disabled by the user */
          return 0;
      }
@@ -1096,9 +1209,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                     MemoryRegion *mr)
  {
      RAMBlock *block, *new_block;
+    ram_addr_t old_ram_size, new_ram_size;
+
+    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
  
      size = TARGET_PAGE_ALIGN(size);
      new_block = g_malloc0(sizeof(*new_block));
+    new_block->fd = -1;
  
      /* This assumes the iothread lock is taken here too.  */
      qemu_mutex_lock_ramlist();
@@ -1107,26 +1224,32 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
      if (host) {
          new_block->host = host;
          new_block->flags |= RAM_PREALLOC_MASK;
+    } else if (xen_enabled()) {
+        if (mem_path) {
+            fprintf(stderr, "-mem-path not supported with Xen\n");
+            exit(1);
+        }
+        xen_ram_alloc(new_block->offset, size, mr);
      } else {
          if (mem_path) {
-#if defined (__linux__) && !defined(TARGET_S390X)
+            if (phys_mem_alloc != qemu_anon_ram_alloc) {
+                /*
+                 * file_ram_alloc() needs to allocate just like
+                 * phys_mem_alloc, but we haven't bothered to provide
+                 * a hook there.
+                 */
+                fprintf(stderr,
+                        "-mem-path not supported with this accelerator\n");
+                exit(1);
+            }
              new_block->host = file_ram_alloc(new_block, size, mem_path);
+        }
+        if (!new_block->host) {
+            new_block->host = phys_mem_alloc(size);
              if (!new_block->host) {
-                new_block->host = qemu_anon_ram_alloc(size);
-                memory_try_enable_merging(new_block->host, size);
-            }
-#else
-            fprintf(stderr, "-mem-path option unsupported\n");
-            exit(1);
-#endif
-        } else {
-            if (xen_enabled()) {
-                xen_ram_alloc(new_block->offset, size, mr);
-            } else if (kvm_enabled()) {
-                /* some s390/kvm configurations have special constraints */
-                new_block->host = kvm_ram_alloc(size);
-            } else {
-                new_block->host = qemu_anon_ram_alloc(size);
+                fprintf(stderr, "Cannot set up guest memory '%s': %s\n",
+                        new_block->mr->name, strerror(errno));
+                exit(1);
              }
              memory_try_enable_merging(new_block->host, size);
          }
@@ -1149,14 +1272,21 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
      ram_list.version++;
      qemu_mutex_unlock_ramlist();
  
-    ram_list.phys_dirty = g_realloc(ram_list.phys_dirty,
-                                       last_ram_offset() >> TARGET_PAGE_BITS);
-    memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
-           0, size >> TARGET_PAGE_BITS);
-    cpu_physical_memory_set_dirty_range(new_block->offset, size, 0xff);
+    new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    if (new_ram_size > old_ram_size) {
+        int i;
+        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
+            ram_list.dirty_memory[i] =
+                bitmap_zero_extend(ram_list.dirty_memory[i],
+                                   old_ram_size, new_ram_size);
+       }
+    }
+    cpu_physical_memory_set_dirty_range(new_block->offset, size);
  
      qemu_ram_setup_dump(new_block->host, size);
      qemu_madvise(new_block->host, size, QEMU_MADV_HUGEPAGE);
+    qemu_madvise(new_block->host, size, QEMU_MADV_DONTFORK);
  
      if (kvm_enabled())
          kvm_setup_guest_memory(new_block->host, size);
@@ -1200,23 +1330,15 @@ void qemu_ram_free(ram_addr_t addr)
              ram_list.version++;
              if (block->flags & RAM_PREALLOC_MASK) {
                  ;
-            } else if (mem_path) {
-#if defined (__linux__) && !defined(TARGET_S390X)
-                if (block->fd) {
-                    munmap(block->host, block->length);
-                    close(block->fd);
-                } else {
-                    qemu_anon_ram_free(block->host, block->length);
-                }
-#else
-                abort();
+            } else if (xen_enabled()) {
+                xen_invalidate_map_cache_entry(block->host);
+#ifndef _WIN32
+            } else if (block->fd >= 0) {
+                munmap(block->host, block->length);
+                close(block->fd);
  #endif
              } else {
-                if (xen_enabled()) {
-                    xen_invalidate_map_cache_entry(block->host);
-                } else {
-                    qemu_anon_ram_free(block->host, block->length);
-                }
+                qemu_anon_ram_free(block->host, block->length);
              }
              g_free(block);
              break;
@@ -1240,38 +1362,31 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
              vaddr = block->host + offset;
              if (block->flags & RAM_PREALLOC_MASK) {
                  ;
+            } else if (xen_enabled()) {
+                abort();
              } else {
                  flags = MAP_FIXED;
                  munmap(vaddr, length);
-                if (mem_path) {
-#if defined(__linux__) && !defined(TARGET_S390X)
-                    if (block->fd) {
+                if (block->fd >= 0) {
  #ifdef MAP_POPULATE
-                        flags |= mem_prealloc ? MAP_POPULATE | MAP_SHARED :
-                            MAP_PRIVATE;
+                    flags |= mem_prealloc ? MAP_POPULATE | MAP_SHARED :
+                        MAP_PRIVATE;
  #else
-                        flags |= MAP_PRIVATE;
-#endif
-                        area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
-                                    flags, block->fd, offset);
-                    } else {
-                        flags |= MAP_PRIVATE | MAP_ANONYMOUS;
-                        area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
-                                    flags, -1, 0);
-                    }
-#else
-                    abort();
+                    flags |= MAP_PRIVATE;
  #endif
+                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
+                                flags, block->fd, offset);
                  } else {
-#if defined(TARGET_S390X) && defined(CONFIG_KVM)
-                    flags |= MAP_SHARED | MAP_ANONYMOUS;
-                    area = mmap(vaddr, length, PROT_EXEC|PROT_READ|PROT_WRITE,
-                                flags, -1, 0);
-#else
+                    /*
+                     * Remap needs to match alloc.  Accelerators that
+                     * set phys_mem_alloc never remap.  If they did,
+                     * we'd need a remap hook here.
+                     */
+                    assert(phys_mem_alloc == qemu_anon_ram_alloc);
+
                      flags |= MAP_PRIVATE | MAP_ANONYMOUS;
                      area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                  flags, -1, 0);
-#endif
                  }
                  if (area != vaddr) {
                      fprintf(stderr, "Could not remap addr: "
@@ -1298,24 +1413,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
   */
  void *qemu_get_ram_ptr(ram_addr_t addr)
  {
-    RAMBlock *block;
+    RAMBlock *block = qemu_get_ram_block(addr);
  
-    /* The list is protected by the iothread lock here.  */
-    block = ram_list.mru_block;
-    if (block && addr - block->offset < block->length) {
-        goto found;
-    }
-    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (addr - block->offset < block->length) {
-            goto found;
-        }
-    }
-
-    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
-    abort();
-
-found:
-    ram_list.mru_block = block;
      if (xen_enabled()) {
          /* We need to check if the requested address is in the RAM
           * because we don't want to map the entire memory in QEMU.
@@ -1331,43 +1430,9 @@ found:
      return block->host + (addr - block->offset);
  }
  
-/* Return a host pointer to ram allocated with qemu_ram_alloc.  Same as
- * qemu_get_ram_ptr but do not touch ram_list.mru_block.
- *
- * ??? Is this still necessary?
- */
-static void *qemu_safe_ram_ptr(ram_addr_t addr)
-{
-    RAMBlock *block;
-
-    /* The list is protected by the iothread lock here.  */
-    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (addr - block->offset < block->length) {
-            if (xen_enabled()) {
-                /* We need to check if the requested address is in the RAM
-                 * because we don't want to map the entire memory in QEMU.
-                 * In that case just map until the end of the page.
-                 */
-                if (block->offset == 0) {
-                    return xen_map_cache(addr, 0, 0);
-                } else if (block->host == NULL) {
-                    block->host =
-                        xen_map_cache(block->offset, block->length, 1);
-                }
-            }
-            return block->host + (addr - block->offset);
-        }
-    }
-
-    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
-    abort();
-
-    return NULL;
-}
-
  /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
   * but takes a size argument */
-static void *qemu_ram_ptr_length(ram_addr_t addr, ram_addr_t *size)
+static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
  {
      if (*size == 0) {
          return NULL;
@@ -1390,14 +1455,21 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, ram_addr_t *size)
      }
  }
  
-int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
  {
      RAMBlock *block;
      uint8_t *host = ptr;
  
      if (xen_enabled()) {
          *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        return 0;
+        return qemu_get_ram_block(*ram_addr)->mr;
+    }
+
+    block = ram_list.mru_block;
+    if (block && block->host && host - block->host < block->length) {
+        goto found;
      }
  
      QTAILQ_FOREACH(block, &ram_list.blocks, next) {
@@ -1406,35 +1478,22 @@ int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
              continue;
          }
          if (host - block->host < block->length) {
-            *ram_addr = block->offset + (host - block->host);
-            return 0;
+            goto found;
          }
      }
  
-    return -1;
-}
-
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
-{
-    ram_addr_t ram_addr;
+    return NULL;
  
-    if (qemu_ram_addr_from_host(ptr, &ram_addr)) {
-        fprintf(stderr, "Bad ram pointer %p\n", ptr);
-        abort();
-    }
-    return ram_addr;
+found:
+    *ram_addr = block->offset + (host - block->host);
+    return block->mr;
  }
  
  static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
                                 uint64_t val, unsigned size)
  {
-    int dirty_flags;
-    dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
-    if (!(dirty_flags & CODE_DIRTY_FLAG)) {
+    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
          tb_invalidate_phys_page_fast(ram_addr, size);
-        dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
      }
      switch (size) {
      case 1:
@@ -1449,12 +1508,14 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
      default:
          abort();
      }
-    dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
-    cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags);
+    cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_MIGRATION);
+    cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_VGA);
      /* we remove the notdirty callback only if the code has been
         flushed */
-    if (dirty_flags == 0xff)
-        tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
+    if (!cpu_physical_memory_is_clean(ram_addr)) {
+        CPUArchState *env = current_cpu->env_ptr;
+        tlb_set_dirty(env, env->mem_io_vaddr);
+    }
  }
  
  static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
@@ -1472,7 +1533,7 @@ static const MemoryRegionOps notdirty_mem_ops = {
  /* Generate a debug exception if a watchpoint has been hit.  */
  static void check_watchpoint(int offset, int len_mask, int flags)
  {
-    CPUArchState *env = cpu_single_env;
+    CPUArchState *env = current_cpu->env_ptr;
      target_ulong pc, cs_base;
      target_ulong vaddr;
      CPUWatchpoint *wp;
@@ -1554,7 +1615,7 @@ static uint64_t subpage_read(void *opaque, hwaddr addr,
      uint8_t buf[4];
  
  #if defined(DEBUG_SUBPAGE)
-    printf("%s: subpage %p len %d addr " TARGET_FMT_plx "\n", __func__,
+    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
             subpage, len, addr);
  #endif
      address_space_read(subpage->as, addr + subpage->base, buf, len);
@@ -1577,7 +1638,7 @@ static void subpage_write(void *opaque, hwaddr addr,
      uint8_t buf[4];
  
  #if defined(DEBUG_SUBPAGE)
-    printf("%s: subpage %p len %d addr " TARGET_FMT_plx
+    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
             " value %"PRIx64"\n",
             __func__, subpage, len, addr, value);
  #endif
@@ -1598,16 +1659,16 @@ static void subpage_write(void *opaque, hwaddr addr,
  }
  
  static bool subpage_accepts(void *opaque, hwaddr addr,
-                            unsigned size, bool is_write)
+                            unsigned len, bool is_write)
  {
+    /*
+     * opaque is the subpage_t.  addr is relative to it: subpage->base is
+     * added before the question is forwarded to
+     * address_space_access_valid() on the subpage's address space.
+     */
      subpage_t *subpage = opaque;
  #if defined(DEBUG_SUBPAGE)
-    printf("%s: subpage %p %c len %d addr " TARGET_FMT_plx "\n",
+    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
             __func__, subpage, is_write ? 'w' : 'r', len, addr);
  #endif
  
      return address_space_access_valid(subpage->as, addr + subpage->base,
-                                      size, is_write);
+                                      len, is_write);
  }
  
  static const MemoryRegionOps subpage_ops = {
@@ -1627,8 +1688,8 @@ static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
      idx = SUBPAGE_IDX(start);
      eidx = SUBPAGE_IDX(end);
  #if defined(DEBUG_SUBPAGE)
-    printf("%s: %p start %08x end %08x idx %08x eidx %08x mem %ld\n", __func__,
-           mmio, start, end, idx, eidx, memory);
+    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
+           __func__, mmio, start, end, idx, eidx, section);
  #endif
      for (; idx <= eidx; idx++) {
          mmio->sub_section[idx] = section;
@@ -1649,15 +1710,15 @@ static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
                            "subpage", TARGET_PAGE_SIZE);
      mmio->iomem.subpage = true;
  #if defined(DEBUG_SUBPAGE)
-    printf("%s: %p base " TARGET_FMT_plx " len %08x %d\n", __func__,
-           mmio, base, TARGET_PAGE_SIZE, subpage_memory);
+    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
+           mmio, base, TARGET_PAGE_SIZE);
  #endif
-    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, phys_section_unassigned);
+    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
  
      return mmio;
  }
  
-static uint16_t dummy_section(MemoryRegion *mr)
+static uint16_t dummy_section(PhysPageMap *map, MemoryRegion *mr)
  {
      MemoryRegionSection section = {
          .mr = mr,
@@ -1666,12 +1727,13 @@ static uint16_t dummy_section(MemoryRegion *mr)
          .size = int128_2_64(),
      };
  
-    return phys_section_add(&section);
+    return phys_section_add(map, &section);
  }
  
  MemoryRegion *iotlb_to_region(hwaddr index)
  {
+    /*
+     * Only the bits of index outside TARGET_PAGE_MASK select the section.
+     * The lookup always uses the current dispatch of address_space_memory,
+     * whichever address space the caller is working with.
+     */
-    return phys_sections[index & ~TARGET_PAGE_MASK].mr;
+    return address_space_memory.dispatch->map.sections[
+           index & ~TARGET_PAGE_MASK].mr;
  }
  
  static void io_mem_init(void)
@@ -1687,44 +1749,65 @@ static void io_mem_init(void)
  
  static void mem_begin(MemoryListener *listener)
  {
+    /*
+     * Listener .begin hook: allocate a fresh, zero-filled dispatch for the
+     * AddressSpace that embeds this listener and park it in
+     * as->next_dispatch.  as->dispatch itself is not modified here.
+     */
-    AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
+    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
+    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
+    uint16_t n;
  
-    d->phys_map.ptr = PHYS_MAP_NODE_NIL;
+    /*
+     * The four fixed sections are added first, in this order, and each
+     * assert checks that dummy_section() handed back the expected
+     * PHYS_SECTION_* index.
+     */
+    n = dummy_section(&d->map, &io_mem_unassigned);
+    assert(n == PHYS_SECTION_UNASSIGNED);
+    n = dummy_section(&d->map, &io_mem_notdirty);
+    assert(n == PHYS_SECTION_NOTDIRTY);
+    n = dummy_section(&d->map, &io_mem_rom);
+    assert(n == PHYS_SECTION_ROM);
+    n = dummy_section(&d->map, &io_mem_watch);
+    assert(n == PHYS_SECTION_WATCH);
+
+    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
+    d->as = as;
+    as->next_dispatch = d;
  }
  
-static void core_begin(MemoryListener *listener)
+/*
+ * Listener .commit hook: run phys_page_compact_all() on the dispatch that
+ * mem_begin() stored in as->next_dispatch, install it as as->dispatch, and
+ * then free the dispatch it replaces (sections first, then the struct).
+ * Nothing is freed when there was no previous dispatch.
+ */
+static void mem_commit(MemoryListener *listener)
  {
-    phys_sections_clear();
-    phys_section_unassigned = dummy_section(&io_mem_unassigned);
-    phys_section_notdirty = dummy_section(&io_mem_notdirty);
-    phys_section_rom = dummy_section(&io_mem_rom);
-    phys_section_watch = dummy_section(&io_mem_watch);
+    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
+    AddressSpaceDispatch *cur = as->dispatch;
+    AddressSpaceDispatch *next = as->next_dispatch;
+
+    phys_page_compact_all(next, next->map.nodes_nb);
+
+    as->dispatch = next;
+
+    if (cur) {
+        phys_sections_free(&cur->map);
+        g_free(cur);
+    }
  }
  
  static void tcg_commit(MemoryListener *listener)
  {
+    /*
+     * Calls tlb_flush(env, 1) for every CPU visited by CPU_FOREACH, using
+     * each CPU's env_ptr.  The listener argument is not used.
+     */
-    CPUArchState *env;
+    CPUState *cpu;
  
      /* since each CPU stores ram addresses in its TLB cache, we must
         reset the modified entries */
      /* XXX: slow ! */
-    for(env = first_cpu; env != NULL; env = env->next_cpu) {
+    CPU_FOREACH(cpu) {
+        CPUArchState *env = cpu->env_ptr;
+
          tlb_flush(env, 1);
      }
  }
  
  static void core_log_global_start(MemoryListener *listener)
  {
+    /* log_global_start hook of core_memory_listener: tracking on. */
-    cpu_physical_memory_set_dirty_tracking(1);
+    cpu_physical_memory_set_dirty_tracking(true);
  }
  
  static void core_log_global_stop(MemoryListener *listener)
  {
+    /* log_global_stop hook of core_memory_listener: tracking off. */
-    cpu_physical_memory_set_dirty_tracking(0);
+    cpu_physical_memory_set_dirty_tracking(false);
  }
  
  static MemoryListener core_memory_listener = {
-    .begin = core_begin,
      .log_global_start = core_log_global_start,
      .log_global_stop = core_log_global_stop,
      .priority = 1,
@@ -1736,25 +1819,22 @@ static MemoryListener tcg_memory_listener = {
  
  void address_space_init_dispatch(AddressSpace *as)
  {
+    /*
+     * Start with no dispatch at all and register the listener embedded in
+     * the AddressSpace.  The dispatch is allocated by mem_begin() and
+     * installed into as->dispatch by mem_commit().
+     */
-    AddressSpaceDispatch *d = g_new(AddressSpaceDispatch, 1);
-
-    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 0 };
-    d->listener = (MemoryListener) {
+    as->dispatch = NULL;
+    as->dispatch_listener = (MemoryListener) {
          .begin = mem_begin,
+        .commit = mem_commit,
          .region_add = mem_add,
          .region_nop = mem_add,
          .priority = 0,
      };
-    d->as = as;
-    as->dispatch = d;
-    memory_listener_register(&d->listener, as);
+    memory_listener_register(&as->dispatch_listener, as);
  }
  
  void address_space_destroy_dispatch(AddressSpace *as)
  {
+    /*
+     * Unregister the dispatch listener, free the current dispatch and
+     * clear as->dispatch.  as->next_dispatch is not touched.
+     *
+     * NOTE(review): mem_commit() calls phys_sections_free(&cur->map)
+     * before g_free() when it retires a dispatch; this path only
+     * g_free()s d, so whatever d->map owns looks leaked -- confirm.
+     */
      AddressSpaceDispatch *d = as->dispatch;
  
-    memory_listener_unregister(&d->listener);
+    memory_listener_unregister(&as->dispatch_listener);
      g_free(d);
      as->dispatch = NULL;
  }
@@ -1762,15 +1842,19 @@ void address_space_destroy_dispatch(AddressSpace *as)
  static void memory_map_init(void)
  {
+    /*
+     * Create the two root regions and their address spaces: "system"
+     * (size UINT64_MAX) behind address_space_memory, and a 65536-byte
+     * "io" region using unassigned_io_ops behind address_space_io.
+     */
      system_memory = g_malloc(sizeof(*system_memory));
-    memory_region_init(system_memory, NULL, "system", INT64_MAX);
+
+    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
      address_space_init(&address_space_memory, system_memory, "memory");
  
      system_io = g_malloc(sizeof(*system_io));
-    memory_region_init(system_io, NULL, "io", 65536);
+    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
+                          65536);
      address_space_init(&address_space_io, system_io, "I/O");
  
      memory_listener_register(&core_memory_listener, &address_space_memory);
-    memory_listener_register(&tcg_memory_listener, &address_space_memory);
+    /* The TCG listener is registered only when tcg_enabled() is true. */
+    if (tcg_enabled()) {
+        memory_listener_register(&tcg_memory_listener, &address_space_memory);
+    }
  }
  
  MemoryRegion *get_system_memory(void)
@@ -1787,7 +1871,7 @@ MemoryRegion *get_system_io(void)
  
  /* physical memory access (slow version, mainly for debug) */
  #if defined(CONFIG_USER_ONLY)
-int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
+int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                          uint8_t *buf, int len, int is_write)
  {
      int l, flags;
@@ -1831,11 +1915,12 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
  static void invalidate_and_set_dirty(hwaddr addr,
                                       hwaddr length)
  {
+    /*
+     * Bookkeeping for length bytes written at ram address addr.  The
+     * clean test and the two flag updates look at addr only; the
+     * translation-block invalidation and the Xen notification are given
+     * the whole range.  DIRTY_MEMORY_CODE is not set here.
+     */
-    if (!cpu_physical_memory_is_dirty(addr)) {
+    if (cpu_physical_memory_is_clean(addr)) {
          /* invalidate code */
          tb_invalidate_phys_page_range(addr, addr + length, 0);
          /* set dirty bit */
-        cpu_physical_memory_set_dirty_flags(addr, (0xff & ~CODE_DIRTY_FLAG));
+        cpu_physical_memory_set_dirty_flag(addr, DIRTY_MEMORY_VGA);
+        cpu_physical_memory_set_dirty_flag(addr, DIRTY_MEMORY_MIGRATION);
      }
      xen_modified_memory(addr, length);
  }
@@ -1852,15 +1937,33 @@ static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
      return false;
  }
  
-static inline int memory_access_size(MemoryRegion *mr, int l, hwaddr addr)
+/*
+ * Choose the size of the next single access to mr at addr when l bytes are
+ * still wanted.  The result is never larger than l, never larger than the
+ * region's valid.max_access_size (4 when that field is 0) and, unless the
+ * region sets impl.unaligned, never larger than the alignment of addr.
+ */
+static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
  {
-    if (l >= 4 && (((addr & 3) == 0 || mr->ops->impl.unaligned))) {
-        return 4;
+    unsigned access_size_max = mr->ops->valid.max_access_size;
+
+    /* Regions are assumed to support 1-4 byte accesses unless
+       otherwise specified.  */
+    if (access_size_max == 0) {
+        access_size_max = 4;
      }
-    if (l >= 2 && (((addr & 1) == 0) || mr->ops->impl.unaligned)) {
-        return 2;
+
+    /* Bound the maximum access by the alignment of the address.  */
+    if (!mr->ops->impl.unaligned) {
+        /*
+         * addr & -addr keeps only the lowest set bit of addr.  A result
+         * of 0 is treated below as "alignment imposes no limit".
+         */
+        unsigned align_size_max = addr & -addr;
+        if (align_size_max != 0 && align_size_max < access_size_max) {
+            access_size_max = align_size_max;
+        }
      }
-    return 1;
+
+    /* Don't attempt accesses larger than the maximum.  */
+    if (l > access_size_max) {
+        l = access_size_max;
+    }
+    /*
+     * l & (l - 1) is non-zero when more than one bit is set.  Presumably
+     * qemu_fls() is the 1-based index of the highest set bit, which makes
+     * this a round-down to a power of two -- confirm.
+     */
+    if (l & (l - 1)) {
+        l = 1 << (qemu_fls(l) - 1);
+    }
+
+    return l;
  }
  
  bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
@@ -1880,20 +1983,31 @@ bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
          if (is_write) {
              if (!memory_access_is_direct(mr, is_write)) {
                  l = memory_access_size(mr, l, addr1);
-                /* XXX: could force cpu_single_env to NULL to avoid
+                /* XXX: could force current_cpu to NULL to avoid
                     potential bugs */
-                if (l == 4) {
+                switch (l) {
+                case 8:
+                    /* 64 bit write access */
+                    val = ldq_p(buf);
+                    error |= io_mem_write(mr, addr1, val, 8);
+                    break;
+                case 4:
                      /* 32 bit write access */
                      val = ldl_p(buf);
                      error |= io_mem_write(mr, addr1, val, 4);
-                } else if (l == 2) {
+                    break;
+                case 2:
                      /* 16 bit write access */
                      val = lduw_p(buf);
                      error |= io_mem_write(mr, addr1, val, 2);
-                } else {
+                    break;
+                case 1:
                      /* 8 bit write access */
                      val = ldub_p(buf);
                      error |= io_mem_write(mr, addr1, val, 1);
+                    break;
+                default:
+                    abort();
                  }
              } else {
                  addr1 += memory_region_get_ram_addr(mr);
@@ -1906,18 +2020,29 @@ bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
              if (!memory_access_is_direct(mr, is_write)) {
                  /* I/O case */
                  l = memory_access_size(mr, l, addr1);
-                if (l == 4) {
+                switch (l) {
+                case 8:
+                    /* 64 bit read access */
+                    error |= io_mem_read(mr, addr1, &val, 8);
+                    stq_p(buf, val);
+                    break;
+                case 4:
                      /* 32 bit read access */
                      error |= io_mem_read(mr, addr1, &val, 4);
                      stl_p(buf, val);
-                } else if (l == 2) {
+                    break;
+                case 2:
                      /* 16 bit read access */
                      error |= io_mem_read(mr, addr1, &val, 2);
                      stw_p(buf, val);
-                } else {
+                    break;
+                case 1:
                      /* 8 bit read access */
                      error |= io_mem_read(mr, addr1, &val, 1);
                      stb_p(buf, val);
+                    break;
+                default:
+                    abort();
                  }
              } else {
                  /* RAM case */
@@ -1951,9 +2076,13 @@ void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
      address_space_rw(&address_space_memory, addr, buf, len, is_write);
  }
  
-/* used for ROM loading : can write in RAM and ROM */
-void cpu_physical_memory_write_rom(hwaddr addr,
-                                   const uint8_t *buf, int len)
+/* What cpu_physical_memory_write_rom_internal() does in its ROM/RAM case. */
+enum write_rom_type {
+    WRITE_DATA,     /* memcpy() buf in, then invalidate_and_set_dirty() */
+    FLUSH_CACHE,    /* flush_icache_range() on the host mapping only */
+};
+
+static inline void cpu_physical_memory_write_rom_internal(
+    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
  {
      hwaddr l;
      uint8_t *ptr;
@@ -1972,8 +2101,15 @@ void cpu_physical_memory_write_rom(hwaddr addr,
              addr1 += memory_region_get_ram_addr(mr);
              /* ROM/RAM case */
              ptr = qemu_get_ram_ptr(addr1);
-            memcpy(ptr, buf, l);
-            invalidate_and_set_dirty(addr1, l);
+            switch (type) {
+            case WRITE_DATA:
+                memcpy(ptr, buf, l);
+                invalidate_and_set_dirty(addr1, l);
+                break;
+            case FLUSH_CACHE:
+                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
+                break;
+            }
          }
          len -= l;
          buf += l;
@@ -1981,7 +2117,30 @@ void cpu_physical_memory_write_rom(hwaddr addr,
      }
  }
  
+/*
+ * Used for ROM loading: can write to both RAM and ROM.  Thin wrapper that
+ * runs cpu_physical_memory_write_rom_internal() in WRITE_DATA mode for the
+ * len bytes of buf destined for guest physical address addr.
+ */
+void cpu_physical_memory_write_rom(hwaddr addr,
+                                   const uint8_t *buf, int len)
+{
+    cpu_physical_memory_write_rom_internal(addr, buf, len, WRITE_DATA);
+}
+
+/*
+ * Flush the host instruction cache for len bytes of guest physical memory
+ * starting at start, by running cpu_physical_memory_write_rom_internal()
+ * in FLUSH_CACHE mode.  Returns without doing anything when TCG is enabled.
+ */
+void cpu_flush_icache_range(hwaddr start, int len)
+{
+    /*
+     * This function should do the same thing as an icache flush that was
+     * triggered from within the guest. For TCG we are always cache coherent,
+     * so there is no need to flush anything. For KVM / Xen we need to flush
+     * the host's instruction cache at least.
+     */
+    if (tcg_enabled()) {
+        return;
+    }
+
+    /*
+     * The FLUSH_CACHE case does not read the buffer, hence NULL.
+     * NOTE(review): the walker still executes "buf += l" on every
+     * iteration, i.e. pointer arithmetic on NULL -- harmless in practice
+     * but formally undefined; confirm whether it should be guarded.
+     */
+    cpu_physical_memory_write_rom_internal(start, NULL, len, FLUSH_CACHE);
+}
+
  typedef struct {
+    MemoryRegion *mr;
      void *buffer;
      hwaddr addr;
      hwaddr len;
@@ -2061,47 +2220,58 @@ void *address_space_map(AddressSpace *as,
                          bool is_write)
  {
      hwaddr len = *plen;
-    hwaddr todo = 0;
-    hwaddr l, xlat;
-    MemoryRegion *mr;
-    ram_addr_t raddr = RAM_ADDR_MAX;
-    ram_addr_t rlen;
-    void *ret;
+    hwaddr done = 0;
+    hwaddr l, xlat, base;
+    MemoryRegion *mr, *this_mr;
+    ram_addr_t raddr;
  
-    while (len > 0) {
-        l = len;
-        mr = address_space_translate(as, addr, &xlat, &l, is_write);
-
-        if (!memory_access_is_direct(mr, is_write)) {
-            if (todo || bounce.buffer) {
-                break;
-            }
-            bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);
-            bounce.addr = addr;
-            bounce.len = l;
-            if (!is_write) {
-                address_space_read(as, addr, bounce.buffer, l);
-            }
+    if (len == 0) {
+        return NULL;
+    }
  
-            *plen = l;
-            return bounce.buffer;
+    l = len;
+    mr = address_space_translate(as, addr, &xlat, &l, is_write);
+    if (!memory_access_is_direct(mr, is_write)) {
+        if (bounce.buffer) {
+            return NULL;
          }
-        if (!todo) {
-            raddr = memory_region_get_ram_addr(mr) + xlat;
-        } else {
-            if (memory_region_get_ram_addr(mr) + xlat != raddr + todo) {
-                break;
-            }
+        /* Avoid unbounded allocations */
+        l = MIN(l, TARGET_PAGE_SIZE);
+        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
+        bounce.addr = addr;
+        bounce.len = l;
+
+        memory_region_ref(mr);
+        bounce.mr = mr;
+        if (!is_write) {
+            address_space_read(as, addr, bounce.buffer, l);
          }
  
+        *plen = l;
+        return bounce.buffer;
+    }
+
+    base = xlat;
+    raddr = memory_region_get_ram_addr(mr);
+
+    for (;;) {
          len -= l;
          addr += l;
-        todo += l;
+        done += l;
+        if (len == 0) {
+            break;
+        }
+
+        l = len;
+        this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
+        if (this_mr != mr || xlat != base + done) {
+            break;
+        }
      }
-    rlen = todo;
-    ret = qemu_ram_ptr_length(raddr, &rlen);
-    *plen = rlen;
-    return ret;
+
+    memory_region_ref(mr);
+    *plen = done;
+    return qemu_ram_ptr_length(raddr + base, plen);
  }
  
  /* Unmaps a memory region previously mapped by address_space_map().
@@ -2112,8 +2282,12 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                           int is_write, hwaddr access_len)
  {
      if (buffer != bounce.buffer) {
+        MemoryRegion *mr;
+        ram_addr_t addr1;
+
+        mr = qemu_ram_addr_from_host(buffer, &addr1);
+        assert(mr != NULL);
          if (is_write) {
-            ram_addr_t addr1 = qemu_ram_addr_from_host_nofail(buffer);
              while (access_len) {
                  unsigned l;
                  l = TARGET_PAGE_SIZE;
@@ -2127,6 +2301,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
          if (xen_enabled()) {
              xen_invalidate_map_cache_entry(buffer);
          }
+        memory_region_unref(mr);
          return;
      }
      if (is_write) {
@@ -2134,6 +2309,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
      }
      qemu_vfree(bounce.buffer);
      bounce.buffer = NULL;
+    memory_region_unref(bounce.mr);
      cpu_notify_map_clients();
  }
  
@@ -2355,12 +2531,13 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
          stl_p(ptr, val);
  
          if (unlikely(in_migration)) {
-            if (!cpu_physical_memory_is_dirty(addr1)) {
+            if (cpu_physical_memory_is_clean(addr1)) {
                  /* invalidate code */
                  tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
                  /* set dirty bit */
-                cpu_physical_memory_set_dirty_flags(
-                    addr1, (0xff & ~CODE_DIRTY_FLAG));
+                cpu_physical_memory_set_dirty_flag(addr1,
+                                                   DIRTY_MEMORY_MIGRATION);
+                cpu_physical_memory_set_dirty_flag(addr1, DIRTY_MEMORY_VGA);
              }
          }
      }
@@ -2505,7 +2682,7 @@ void stq_be_phys(hwaddr addr, uint64_t val)
  }
  
  /* virtual memory access for debug (includes writing to ROM) */
-int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
+int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                          uint8_t *buf, int len, int is_write)
  {
      int l;
@@ -2514,7 +2691,7 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
  
      while (len > 0) {
          page = addr & TARGET_PAGE_MASK;
-        phys_addr = cpu_get_phys_page_debug(env, page);
+        phys_addr = cpu_get_phys_page_debug(cpu, page);
          /* if no physical page mapped, return an error */
          if (phys_addr == -1)
              return -1;