display: add separate config option for bochs-display

[qemu.git] / exec.c
diff --git a/exec.c b/exec.c

index 03238a3449d99860c4cee407254540ff6a740dbb..d0821e69aadaa4af9317ad36332ea570136c3f65 100644 (file)
--- a/exec.c
+++ b/exec.c
@@ -18,8 +18,6 @@
   */
  #include "qemu/osdep.h"
  #include "qapi/error.h"
-#ifndef _WIN32
-#endif
  
  #include "qemu/cutils.h"
  #include "cpu.h"
@@ -51,7 +49,6 @@
  #include "trace-root.h"
  
  #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
-#include <fcntl.h>
  #include <linux/falloc.h>
  #endif
  
@@ -90,18 +87,6 @@ AddressSpace address_space_memory;
  
  MemoryRegion io_mem_rom, io_mem_notdirty;
  static MemoryRegion io_mem_unassigned;
-
-/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
-#define RAM_PREALLOC   (1 << 0)
-
-/* RAM is mmap-ed with MAP_SHARED */
-#define RAM_SHARED     (1 << 1)
-
-/* Only a portion of RAM (used_length) is actually used, and migrated.
- * This used_length size can change across reboots.
- */
-#define RAM_RESIZEABLE (1 << 2)
-
  #endif
  
  #ifdef TARGET_PAGE_BITS_VARY
@@ -397,12 +382,6 @@ static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
      }
  }
  
-bool memory_region_is_unassigned(MemoryRegion *mr)
-{
-    return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
-        && mr != &io_mem_watch;
-}
-
  /* Called from RCU critical section */
  static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                          hwaddr addr,
@@ -459,6 +438,79 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x
      return section;
  }
  
+/**
+ * address_space_translate_iommu - translate an address through an IOMMU
+ * memory region and then through the target address space.
+ *
+ * @iommu_mr: the IOMMU memory region that we start the translation from
+ * @xlat: on entry, the address to be translated through the IOMMU; on
+ *        return, the translated address offset within the destination
+ *        memory region. It cannot be %NULL.
+ * @plen_out: valid read/write length of the translated address. It
+ *            cannot be %NULL.
+ * @page_mask_out: page mask for the translated address. This
+ *            should only be meaningful for IOMMU translated
+ *            addresses, since there may be huge pages that this bit
+ *            would tell. It can be %NULL if we don't care about it.
+ * @is_write: whether the translation operation is for write
+ * @is_mmio: whether this can be MMIO, set true if it can
+ * @target_as: the address space targeted by the IOMMU
+ * @attrs: transaction attributes
+ *
+ * This function is called from RCU critical section.  It is the common
+ * part of flatview_do_translate and address_space_translate_cached.
+ *
+ * Returns a copy of the section reached once no further IOMMU is in the
+ * way.  If an IOMMU does not grant the requested access, a section whose
+ * .mr is &io_mem_unassigned is returned instead; in that case
+ * @page_mask_out is left unwritten.
+ */
+static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
+                                                         hwaddr *xlat,
+                                                         hwaddr *plen_out,
+                                                         hwaddr *page_mask_out,
+                                                         bool is_write,
+                                                         bool is_mmio,
+                                                         AddressSpace **target_as,
+                                                         MemTxAttrs attrs)
+{
+    MemoryRegionSection *section;
+    hwaddr page_mask = (hwaddr)-1;
+
+    do {
+        hwaddr addr = *xlat;
+        IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
+        int iommu_idx = 0;
+        IOMMUTLBEntry iotlb;
+
+        if (imrc->attrs_to_index) {
+            iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
+        }
+
+        iotlb = imrc->translate(iommu_mr, addr, is_write ?
+                                IOMMU_WO : IOMMU_RO, iommu_idx);
+
+        if (!(iotlb.perm & (1 << is_write))) {
+            goto unassigned;
+        }
+
+        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
+                | (addr & iotlb.addr_mask));
+        page_mask &= iotlb.addr_mask;
+        *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
+        *target_as = iotlb.target_as;
+
+        section = address_space_translate_internal(
+                address_space_to_dispatch(iotlb.target_as), addr, xlat,
+                plen_out, is_mmio);
+
+        iommu_mr = memory_region_get_iommu(section->mr);
+    } while (unlikely(iommu_mr));
+
+    if (page_mask_out) {
+        *page_mask_out = page_mask;
+    }
+    return *section;
+
+unassigned:
+    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
+}
+
  /**
   * flatview_do_translate - translate an address in FlatView
   *
@@ -474,6 +526,8 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x
   *            would tell. It can be @NULL if we don't care about it.
   * @is_write: whether the translation operation is for write
   * @is_mmio: whether this can be MMIO, set true if it can
+ * @target_as: the address space targeted by the IOMMU
+ * @attrs: memory transaction attributes
   *
   * This function is called from RCU critical section
   */
@@ -484,68 +538,39 @@ static MemoryRegionSection flatview_do_translate(FlatView *fv,
                                                   hwaddr *page_mask_out,
                                                   bool is_write,
                                                   bool is_mmio,
-                                                 AddressSpace **target_as)
+                                                 AddressSpace **target_as,
+                                                 MemTxAttrs attrs)
  {
-    IOMMUTLBEntry iotlb;
      MemoryRegionSection *section;
      IOMMUMemoryRegion *iommu_mr;
-    IOMMUMemoryRegionClass *imrc;
-    hwaddr page_mask = (hwaddr)(-1);
      hwaddr plen = (hwaddr)(-1);
  
-    if (plen_out) {
-        plen = *plen_out;
+    if (!plen_out) {
+        plen_out = &plen;
      }
  
-    for (;;) {
-        section = address_space_translate_internal(
-                flatview_to_dispatch(fv), addr, &addr,
-                &plen, is_mmio);
+    section = address_space_translate_internal(
+            flatview_to_dispatch(fv), addr, xlat,
+            plen_out, is_mmio);
  
-        iommu_mr = memory_region_get_iommu(section->mr);
-        if (!iommu_mr) {
-            break;
-        }
-        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
-
-        iotlb = imrc->translate(iommu_mr, addr, is_write ?
-                                IOMMU_WO : IOMMU_RO);
-        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
-                | (addr & iotlb.addr_mask));
-        page_mask &= iotlb.addr_mask;
-        plen = MIN(plen, (addr | iotlb.addr_mask) - addr + 1);
-        if (!(iotlb.perm & (1 << is_write))) {
-            goto translate_fail;
-        }
-
-        fv = address_space_to_flatview(iotlb.target_as);
-        *target_as = iotlb.target_as;
-    }
-
-    *xlat = addr;
-
-    if (page_mask == (hwaddr)(-1)) {
-        /* Not behind an IOMMU, use default page size. */
-        page_mask = ~TARGET_PAGE_MASK;
+    iommu_mr = memory_region_get_iommu(section->mr);
+    if (unlikely(iommu_mr)) {
+        return address_space_translate_iommu(iommu_mr, xlat,
+                                             plen_out, page_mask_out,
+                                             is_write, is_mmio,
+                                             target_as, attrs);
      }
-
      if (page_mask_out) {
-        *page_mask_out = page_mask;
-    }
-
-    if (plen_out) {
-        *plen_out = plen;
+        /* Not behind an IOMMU, use default page size. */
+        *page_mask_out = ~TARGET_PAGE_MASK;
      }
  
      return *section;
-
-translate_fail:
-    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
  }
  
  /* Called from RCU critical section */
  IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
-                                            bool is_write)
+                                            bool is_write, MemTxAttrs attrs)
  {
      MemoryRegionSection section;
      hwaddr xlat, page_mask;
@@ -555,7 +580,8 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
       * but page mask.
       */
      section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
-                                    NULL, &page_mask, is_write, false, &as);
+                                    NULL, &page_mask, is_write, false, &as,
+                                    attrs);
  
      /* Illegal translation */
      if (section.mr == &io_mem_unassigned) {
@@ -581,7 +607,8 @@ iotlb_fail:
  
  /* Called from RCU critical section */
  MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
-                                 hwaddr *plen, bool is_write)
+                                 hwaddr *plen, bool is_write,
+                                 MemTxAttrs attrs)
  {
      MemoryRegion *mr;
      MemoryRegionSection section;
@@ -589,7 +616,7 @@ MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
  
      /* This can be MMIO, so setup MMIO bit. */
      section = flatview_do_translate(fv, addr, xlat, plen, NULL,
-                                    is_write, true, &as);
+                                    is_write, true, &as, attrs);
      mr = section.mr;
  
      if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
@@ -600,18 +627,144 @@ MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
      return mr;
  }
  
+typedef struct TCGIOMMUNotifier {
+    IOMMUNotifier n;    /* notifier registered on @mr; the unmap callback
+                         * recovers this struct from it via container_of */
+    MemoryRegion *mr;   /* IOMMU memory region @n is registered on */
+    CPUState *cpu;      /* CPU whose TLB is flushed when @n fires */
+    int iommu_idx;      /* IOMMU index this entry was registered for */
+    bool active;        /* set on (re)registration, cleared after a flush */
+} TCGIOMMUNotifier;
+
+static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+    TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
+
+    if (!notifier->active) { /* not re-armed since the last flush */
+        return;
+    }
+    tlb_flush(notifier->cpu);
+    notifier->active = false;
+    /* We leave the notifier struct on the list to avoid reallocating it later.
+     * Generally the number of IOMMUs a CPU deals with will be small.
+     * In any case we can't unregister the iommu notifier from a notify
+     * callback.
+     *
+     * Note that @iotlb is not examined: the flush above is requested
+     * whatever range the IOMMU reported.
+     */
+}
+
+static void tcg_register_iommu_notifier(CPUState *cpu,
+                                        IOMMUMemoryRegion *iommu_mr,
+                                        int iommu_idx)
+{
+    /* Make sure this CPU has an IOMMU notifier registered for this
+     * IOMMU/IOMMU index combination, so that we can flush its TLB
+     * when the IOMMU tells us the mappings we've cached have changed.
+     *
+     * @cpu: CPU whose iommu_notifiers array is searched and extended
+     * @iommu_mr: IOMMU memory region the notifier is registered on
+     * @iommu_idx: IOMMU index the notifier is registered for
+     */
+    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
+    TCGIOMMUNotifier *notifier;
+    int i, j;
+
+    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
+        notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i);
+        if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
+            break;
+        }
+    }
+    if (i == cpu->iommu_notifiers->len) {
+        /* Not found, add a new entry at the end of the array.
+         *
+         * The notifiers handed to the IOMMUs are embedded in the array
+         * elements, and growing a GArray may move its storage.  Take
+         * the existing entries off their IOMMUs before resizing and put
+         * them back at their new addresses afterwards, so that no IOMMU
+         * is left holding a pointer into freed memory.
+         */
+        for (j = 0; j < i; j++) {
+            notifier = &g_array_index(cpu->iommu_notifiers,
+                                      TCGIOMMUNotifier, j);
+            memory_region_unregister_iommu_notifier(notifier->mr,
+                                                    &notifier->n);
+        }
+        cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
+        for (j = 0; j < i; j++) {
+            notifier = &g_array_index(cpu->iommu_notifiers,
+                                      TCGIOMMUNotifier, j);
+            memory_region_register_iommu_notifier(notifier->mr,
+                                                  &notifier->n);
+        }
+        notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i);
+
+        notifier->mr = mr;
+        notifier->iommu_idx = iommu_idx;
+        notifier->cpu = cpu;
+        /* Rather than trying to register interest in the specific part
+         * of the iommu's address space that we've accessed and then
+         * expand it later as subsequent accesses touch more of it, we
+         * just register interest in the whole thing, on the assumption
+         * that iommu reconfiguration will be rare.
+         */
+        iommu_notifier_init(&notifier->n,
+                            tcg_iommu_unmap_notify,
+                            IOMMU_NOTIFIER_UNMAP,
+                            0,
+                            HWADDR_MAX,
+                            iommu_idx);
+        memory_region_register_iommu_notifier(notifier->mr, &notifier->n);
+    }
+
+    /* (Re-)arm the entry so the next unmap notification flushes the TLB */
+    if (!notifier->active) {
+        notifier->active = true;
+    }
+}
+
+static void tcg_iommu_free_notifier_list(CPUState *cpu)
+{
+    /* Destroy the CPU's notifier list: unregister every entry from the
+     * memory region it was registered on (entries are unregistered
+     * whether or not they are currently active), then free the array
+     * together with its element storage.
+     */
+    int i;
+    TCGIOMMUNotifier *notifier;
+
+    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
+        notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i);
+        memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
+    }
+    g_array_free(cpu->iommu_notifiers, true);
+}
+
  /* Called from RCU critical section */
  MemoryRegionSection *
  address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
-                                  hwaddr *xlat, hwaddr *plen)
+                                  hwaddr *xlat, hwaddr *plen,
+                                  MemTxAttrs attrs, int *prot)
  {
      MemoryRegionSection *section;
+    IOMMUMemoryRegion *iommu_mr;
+    IOMMUMemoryRegionClass *imrc;
+    IOMMUTLBEntry iotlb;
+    int iommu_idx;
      AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
  
-    section = address_space_translate_internal(d, addr, xlat, plen, false);
+    for (;;) {
+        section = address_space_translate_internal(d, addr, &addr, plen, false);
+
+        iommu_mr = memory_region_get_iommu(section->mr);
+        if (!iommu_mr) {
+            break;
+        }
+
+        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
+
+        iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
+        tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
+        /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
+         * doesn't short-cut its translation table walk.
+         */
+        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
+        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
+                | (addr & iotlb.addr_mask));
+        /* Update the caller's prot bits to remove permissions the IOMMU
+         * is giving us a failure response for. If we get down to no
+         * permissions left at all we can give up now.
+         */
+        if (!(iotlb.perm & IOMMU_RO)) {
+            *prot &= ~(PAGE_READ | PAGE_EXEC);
+        }
+        if (!(iotlb.perm & IOMMU_WO)) {
+            *prot &= ~PAGE_WRITE;
+        }
+
+        if (!*prot) {
+            goto translate_fail;
+        }
+
+        d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
+    }
  
      assert(!memory_region_is_iommu(section->mr));
+    *xlat = addr;
      return section;
+
+translate_fail:
+    return &d->map.sections[PHYS_SECTION_UNASSIGNED];
  }
  #endif
  
@@ -626,6 +779,13 @@ static int cpu_common_post_load(void *opaque, int version_id)
      cpu->interrupt_request &= ~0x01;
      tlb_flush(cpu);
  
+    /* loadvm has just updated the content of RAM, bypassing the
+     * usual mechanisms that ensure we flush TBs for writes to
+     * memory we've translated code from. So we must flush all TBs,
+     * which will now be stale.
+     */
+    tb_flush(cpu);
+
      return 0;
  }
  
@@ -708,9 +868,17 @@ CPUState *qemu_get_cpu(int index)
  }
  
  #if !defined(CONFIG_USER_ONLY)
-void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
+void cpu_address_space_init(CPUState *cpu, int asidx,
+                            const char *prefix, MemoryRegion *mr)
  {
      CPUAddressSpace *newas;
+    AddressSpace *as = g_new0(AddressSpace, 1);
+    char *as_name;
+
+    assert(mr);
+    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
+    address_space_init(as, mr, as_name);
+    g_free(as_name);
  
      /* Target code should have set num_ases before calling us */
      assert(asidx < cpu->num_ases);
@@ -755,6 +923,9 @@ void cpu_exec_unrealizefn(CPUState *cpu)
      if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
          vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
      }
+#ifndef CONFIG_USER_ONLY
+    tcg_iommu_free_notifier_list(cpu);
+#endif
  }
  
  Property cpu_common_props[] = {
@@ -802,19 +973,69 @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp)
      if (cc->vmsd != NULL) {
          vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
      }
+
+    cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier));
  #endif
  }
  
+const char *parse_cpu_model(const char *cpu_model)
+{
+    ObjectClass *oc;
+    CPUClass *cc;
+    gchar **model_pieces;
+    const char *cpu_type;
+
+    model_pieces = g_strsplit(cpu_model, ",", 2);
+
+    oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
+    if (oc == NULL) {
+        error_report("unable to find CPU model '%s'", model_pieces[0]);
+        g_strfreev(model_pieces);
+        exit(EXIT_FAILURE);
+    }
+
+    cpu_type = object_class_get_name(oc);
+    cc = CPU_CLASS(oc);
+    cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
+    g_strfreev(model_pieces);
+    return cpu_type;
+}
+
  #if defined(CONFIG_USER_ONLY)
-static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
+void tb_invalidate_phys_addr(target_ulong addr)
  {
      mmap_lock();
-    tb_lock();
-    tb_invalidate_phys_page_range(pc, pc + 1, 0);
-    tb_unlock();
+    tb_invalidate_phys_page_range(addr, addr + 1, 0);
      mmap_unlock();
  }
+
+static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
+{
+    tb_invalidate_phys_addr(pc);
+}
  #else
+void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
+{
+    ram_addr_t ram_addr;
+    MemoryRegion *mr;
+    hwaddr l = 1;
+
+    if (!tcg_enabled()) {
+        return;
+    }
+
+    rcu_read_lock();
+    mr = address_space_translate(as, addr, &addr, &l, false, attrs);
+    if (!(memory_region_is_ram(mr)
+          || memory_region_is_romd(mr))) {
+        rcu_read_unlock();
+        return;
+    }
+    ram_addr = memory_region_get_ram_addr(mr) + addr;
+    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
+    rcu_read_unlock();
+}
+
  static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
  {
      MemTxAttrs attrs;
@@ -823,7 +1044,7 @@ static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
      if (phys != -1) {
          /* Locks grabbed by tb_invalidate_phys_addr */
          tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
-                                phys | (pc & ~TARGET_PAGE_MASK));
+                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
      }
  }
  #endif
@@ -1049,6 +1270,7 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...)
          struct sigaction act;
          sigfillset(&act.sa_mask);
          act.sa_handler = SIG_DFL;
+        act.sa_flags = 0;
          sigaction(SIGABRT, &act, NULL);
      }
  #endif
@@ -1102,6 +1324,7 @@ static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
      RAMBlock *block;
      ram_addr_t end;
  
+    assert(tcg_enabled());
      end = TARGET_PAGE_ALIGN(start + length);
      start &= TARGET_PAGE_MASK;
  
@@ -1273,7 +1496,7 @@ static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                               uint16_t section);
  static subpage_t *subpage_init(FlatView *fv, hwaddr base);
  
-static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
+static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
                                 qemu_anon_ram_alloc;
  
  /*
@@ -1281,7 +1504,7 @@ static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
   * Accelerators with unusual needs may need this.  Hopefully, we can
   * get rid of it eventually.
   */
-void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
+void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared))
  {
      phys_mem_alloc = alloc;
  }
@@ -1448,18 +1671,13 @@ void ram_block_dump(Monitor *mon)
   */
  static int find_max_supported_pagesize(Object *obj, void *opaque)
  {
-    char *mem_path;
      long *hpsize_min = opaque;
  
      if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
-        mem_path = object_property_get_str(obj, "mem-path", NULL);
-        if (mem_path) {
-            long hpsize = qemu_mempath_getpagesize(mem_path);
-            if (hpsize < *hpsize_min) {
-                *hpsize_min = hpsize;
-            }
-        } else {
-            *hpsize_min = getpagesize();
+        long hpsize = host_memory_backend_pagesize(MEMORY_BACKEND(obj));
+
+        if (hpsize < *hpsize_min) {
+            *hpsize_min = hpsize;
          }
      }
  
@@ -1472,11 +1690,7 @@ long qemu_getrampagesize(void)
      long mainrampagesize;
      Object *memdev_root;
  
-    if (mem_path) {
-        mainrampagesize = qemu_mempath_getpagesize(mem_path);
-    } else {
-        mainrampagesize = getpagesize();
-    }
+    mainrampagesize = qemu_mempath_getpagesize(mem_path);
  
      /* it's possible we have memory-backend objects with
       * hugepage-backed RAM. these may get mapped into system
@@ -1520,7 +1734,7 @@ long qemu_getrampagesize(void)
  }
  #endif
  
-#ifdef __linux__
+#ifdef CONFIG_POSIX
  static int64_t get_file_size(int fd)
  {
      int64_t size = lseek(fd, 0, SEEK_END);
@@ -1600,7 +1814,17 @@ static void *file_ram_alloc(RAMBlock *block,
      void *area;
  
      block->page_size = qemu_fd_getpagesize(fd);
-    block->mr->align = block->page_size;
+    if (block->mr->align % block->page_size) {
+        error_setg(errp, "alignment 0x%" PRIx64
+                   " must be multiples of page size 0x%zx",
+                   block->mr->align, block->page_size);
+        return NULL;
+    } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
+        error_setg(errp, "alignment 0x%" PRIx64
+                   " must be a power of two", block->mr->align);
+        return NULL;
+    }
+    block->mr->align = MAX(block->page_size, block->mr->align);
  #if defined(__s390x__)
      if (kvm_enabled()) {
          block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
@@ -1655,7 +1879,10 @@ static void *file_ram_alloc(RAMBlock *block,
  }
  #endif
  
-/* Called with the ramlist lock held.  */
+/* Allocate space within the ram_addr_t space that governs the
+ * dirty bitmaps.
+ * Called with the ramlist lock held.
+ */
  static ram_addr_t find_ram_offset(ram_addr_t size)
  {
      RAMBlock *block, *next_block;
@@ -1668,19 +1895,33 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
      }
  
      RAMBLOCK_FOREACH(block) {
-        ram_addr_t end, next = RAM_ADDR_MAX;
+        ram_addr_t candidate, next = RAM_ADDR_MAX;
  
-        end = block->offset + block->max_length;
+        /* Align blocks to start on a 'long' in the bitmap
+         * which makes the bitmap sync'ing take the fast path.
+         */
+        candidate = block->offset + block->max_length;
+        candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
  
+        /* Search for the closest following block
+         * and find the gap.
+         */
          RAMBLOCK_FOREACH(next_block) {
-            if (next_block->offset >= end) {
+            if (next_block->offset >= candidate) {
                  next = MIN(next, next_block->offset);
              }
          }
-        if (next - end >= size && next - end < mingap) {
-            offset = end;
-            mingap = next - end;
+
+        /* If it fits remember our place and remember the size
+         * of gap, but keep going so that we might find a smaller
+         * gap to fill so avoiding fragmentation.
+         */
+        if (next - candidate >= size && next - candidate < mingap) {
+            offset = candidate;
+            mingap = next - candidate;
          }
+
+        trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
      }
  
      if (offset == RAM_ADDR_MAX) {
@@ -1689,10 +1930,12 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
          abort();
      }
  
+    trace_find_ram_offset(size, offset);
+
      return offset;
  }
  
-unsigned long last_ram_page(void)
+static unsigned long last_ram_page(void)
  {
      RAMBlock *block;
      ram_addr_t last = 0;
@@ -1730,6 +1973,32 @@ bool qemu_ram_is_shared(RAMBlock *rb)
      return rb->flags & RAM_SHARED;
  }
  
+/* Note: Only set at the start of postcopy */
+bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
+{
+    return rb->flags & RAM_UF_ZEROPAGE;
+}
+
+void qemu_ram_set_uf_zeroable(RAMBlock *rb)
+{
+    rb->flags |= RAM_UF_ZEROPAGE;
+}
+
+bool qemu_ram_is_migratable(RAMBlock *rb)
+{
+    return rb->flags & RAM_MIGRATABLE;
+}
+
+void qemu_ram_set_migratable(RAMBlock *rb)
+{
+    rb->flags |= RAM_MIGRATABLE;
+}
+
+void qemu_ram_unset_migratable(RAMBlock *rb)
+{
+    rb->flags &= ~RAM_MIGRATABLE;
+}
+
  /* Called with iothread lock held.  */
  void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
  {
@@ -1884,7 +2153,7 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
      }
  }
  
-static void ram_block_add(RAMBlock *new_block, Error **errp)
+static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
  {
      RAMBlock *block;
      RAMBlock *last_block = NULL;
@@ -1907,7 +2176,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
              }
          } else {
              new_block->host = phys_mem_alloc(new_block->max_length,
-                                             &new_block->mr->align);
+                                             &new_block->mr->align, shared);
              if (!new_block->host) {
                  error_setg_errno(errp, errno,
                                   "cannot set up guest memory '%s'",
@@ -1961,15 +2230,18 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
      }
  }
  
-#ifdef __linux__
+#ifdef CONFIG_POSIX
  RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
-                                 bool share, int fd,
+                                 uint32_t ram_flags, int fd,
                                   Error **errp)
  {
      RAMBlock *new_block;
      Error *local_err = NULL;
      int64_t file_size;
  
+    /* Only these RAM flags are supported for now. */
+    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
+
      if (xen_enabled()) {
          error_setg(errp, "-mem-path not supported with Xen");
          return NULL;
@@ -2005,14 +2277,14 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
      new_block->mr = mr;
      new_block->used_length = size;
      new_block->max_length = size;
-    new_block->flags = share ? RAM_SHARED : 0;
+    new_block->flags = ram_flags;
      new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
      if (!new_block->host) {
          g_free(new_block);
          return NULL;
      }
  
-    ram_block_add(new_block, &local_err);
+    ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
      if (local_err) {
          g_free(new_block);
          error_propagate(errp, local_err);
@@ -2024,7 +2296,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
  
  
  RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
-                                   bool share, const char *mem_path,
+                                   uint32_t ram_flags, const char *mem_path,
                                     Error **errp)
  {
      int fd;
@@ -2036,7 +2308,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
          return NULL;
      }
  
-    block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
+    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
      if (!block) {
          if (created) {
              unlink(mem_path);
@@ -2054,7 +2326,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
                                    void (*resized)(const char*,
                                                    uint64_t length,
                                                    void *host),
-                                  void *host, bool resizeable,
+                                  void *host, bool resizeable, bool share,
                                    MemoryRegion *mr, Error **errp)
  {
      RAMBlock *new_block;
@@ -2077,7 +2349,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
      if (resizeable) {
          new_block->flags |= RAM_RESIZEABLE;
      }
-    ram_block_add(new_block, &local_err);
+    ram_block_add(new_block, &local_err, share);
      if (local_err) {
          g_free(new_block);
          error_propagate(errp, local_err);
@@ -2089,12 +2361,15 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
  RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                     MemoryRegion *mr, Error **errp)
  {
-    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
+    return qemu_ram_alloc_internal(size, size, NULL, host, false,
+                                   false, mr, errp);
  }
  
-RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
+RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
+                         MemoryRegion *mr, Error **errp)
  {
-    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
+    return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
+                                   share, mr, errp);
  }
  
  RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
@@ -2103,7 +2378,8 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                                       void *host),
                                       MemoryRegion *mr, Error **errp)
  {
-    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
+    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
+                                   false, mr, errp);
  }
  
  static void reclaim_ramblock(RAMBlock *block)
@@ -2179,9 +2455,9 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
                                  flags, -1, 0);
                  }
                  if (area != vaddr) {
-                    fprintf(stderr, "Could not remap addr: "
-                            RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
-                            length, addr);
+                    error_report("Could not remap addr: "
+                                 RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
+                                 length, addr);
                      exit(1);
                  }
                  memory_try_enable_merging(vaddr, length);
@@ -2256,6 +2532,16 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
      return ramblock_ptr(block, addr);
  }
  
+/* Return the offset of a hostpointer within a ramblock */
+ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
+{
+    ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
+    assert((uintptr_t)host >= (uintptr_t)rb->host);
+    assert(res < rb->max_length);
+
+    return res;
+}
+
  /*
   * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
   * in that RAMBlock.
@@ -2365,21 +2651,22 @@ void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
      ndi->ram_addr = ram_addr;
      ndi->mem_vaddr = mem_vaddr;
      ndi->size = size;
-    ndi->locked = false;
+    ndi->pages = NULL;
  
      assert(tcg_enabled());
      if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
-        ndi->locked = true;
-        tb_lock();
-        tb_invalidate_phys_page_fast(ram_addr, size);
+        ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
+        tb_invalidate_phys_page_fast(ndi->pages, ram_addr, size);
      }
  }
  
  /* Called within RCU critical section. */
  void memory_notdirty_write_complete(NotDirtyInfo *ndi)
  {
-    if (ndi->locked) {
-        tb_unlock();
+    if (ndi->pages) {
+        assert(tcg_enabled());
+        page_collection_unlock(ndi->pages);
+        ndi->pages = NULL;
      }
  
      /* Set both VGA and migration bits for simplicity and to remove
@@ -2403,27 +2690,13 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
      memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
                           ram_addr, size);
  
-    switch (size) {
-    case 1:
-        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
-        break;
-    case 2:
-        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
-        break;
-    case 4:
-        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
-        break;
-    case 8:
-        stq_p(qemu_map_ram_ptr(NULL, ram_addr), val);
-        break;
-    default:
-        abort();
-    }
+    stn_p(qemu_map_ram_ptr(NULL, ram_addr), size, val);
      memory_notdirty_write_complete(&ndi);
  }
  
  static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
-                                 unsigned size, bool is_write)
+                                 unsigned size, bool is_write,
+                                 MemTxAttrs attrs)
  {
      return is_write;
  }
@@ -2480,18 +2753,16 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
                  }
                  cpu->watchpoint_hit = wp;
  
-                /* Both tb_lock and iothread_mutex will be reset when
-                 * cpu_loop_exit or cpu_loop_exit_noexc longjmp
-                 * back into the cpu_exec main loop.
-                 */
-                tb_lock();
+                mmap_lock();
                  tb_check_watchpoint(cpu);
                  if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                      cpu->exception_index = EXCP_DEBUG;
+                    mmap_unlock();
                      cpu_loop_exit(cpu);
                  } else {
                      /* Force execution of one insn next time.  */
                      cpu->cflags_next_tb = 1 | curr_cflags();
+                    mmap_unlock();
                      cpu_loop_exit_noexc(cpu);
                  }
              }
@@ -2575,10 +2846,12 @@ static const MemoryRegionOps watch_mem_ops = {
      },
  };
  
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+                                      MemTxAttrs attrs, uint8_t *buf, int len);
  static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                    const uint8_t *buf, int len);
  static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
-                                  bool is_write);
+                                  bool is_write, MemTxAttrs attrs);
  
  static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
                                  unsigned len, MemTxAttrs attrs)
@@ -2595,22 +2868,8 @@ static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
      if (res) {
          return res;
      }
-    switch (len) {
-    case 1:
-        *data = ldub_p(buf);
-        return MEMTX_OK;
-    case 2:
-        *data = lduw_p(buf);
-        return MEMTX_OK;
-    case 4:
-        *data = ldl_p(buf);
-        return MEMTX_OK;
-    case 8:
-        *data = ldq_p(buf);
-        return MEMTX_OK;
-    default:
-        abort();
-    }
+    *data = ldn_p(buf, len);
+    return MEMTX_OK;
  }
  
  static MemTxResult subpage_write(void *opaque, hwaddr addr,
@@ -2624,27 +2883,13 @@ static MemTxResult subpage_write(void *opaque, hwaddr addr,
             " value %"PRIx64"\n",
             __func__, subpage, len, addr, value);
  #endif
-    switch (len) {
-    case 1:
-        stb_p(buf, value);
-        break;
-    case 2:
-        stw_p(buf, value);
-        break;
-    case 4:
-        stl_p(buf, value);
-        break;
-    case 8:
-        stq_p(buf, value);
-        break;
-    default:
-        abort();
-    }
+    stn_p(buf, len, value);
      return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
  }
  
  static bool subpage_accepts(void *opaque, hwaddr addr,
-                            unsigned len, bool is_write)
+                            unsigned len, bool is_write,
+                            MemTxAttrs attrs)
  {
      subpage_t *subpage = opaque;
  #if defined(DEBUG_SUBPAGE)
@@ -2653,7 +2898,7 @@ static bool subpage_accepts(void *opaque, hwaddr addr,
  #endif
  
      return flatview_access_valid(subpage->fv, addr + subpage->base,
-                                 len, is_write);
+                                 len, is_write, attrs);
  }
  
  static const MemoryRegionOps subpage_ops = {
@@ -2720,19 +2965,53 @@ static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
      return phys_section_add(map, &section);
  }
  
-MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
+static void readonly_mem_write(void *opaque, hwaddr addr, uint64_t val,
+                               unsigned size)
+{
+    /* Intentionally empty: a write aimed at ROM is silently discarded. */
+}
+
+/* Access filter for the read-only region ops: only writes are accepted. */
+static bool readonly_mem_accepts(void *opaque, hwaddr addr, unsigned size,
+                                 bool is_write, MemTxAttrs attrs)
+{
+    return is_write;
+}
+
+/* Only used for writes; reads are special cased to access host ram directly.
+ * .accepts is inside .valid = {} since that would override .valid.accepts.
+ */
+static const MemoryRegionOps readonly_mem_ops = {
+    .write = readonly_mem_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = false,
+        .accepts = readonly_mem_accepts,
+    },
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = false,
+    },
+};
+
+MemoryRegionSection *iotlb_to_section(CPUState *cpu,
+                                      hwaddr index, MemTxAttrs attrs)
  {
      int asidx = cpu_asidx_from_attrs(cpu, attrs);
      CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
      AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
      MemoryRegionSection *sections = d->map.sections;
  
-    return sections[index & ~TARGET_PAGE_MASK].mr;
+    return &sections[index & ~TARGET_PAGE_MASK];
  }
  
  static void io_mem_init(void)
  {
-    memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
+    memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
+                          NULL, NULL, UINT64_MAX);
      memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                            NULL, UINT64_MAX);
  
@@ -2777,6 +3056,7 @@ static void tcg_commit(MemoryListener *listener)
      CPUAddressSpace *cpuas;
      AddressSpaceDispatch *d;
  
+    assert(tcg_enabled());
      /* since each CPU stores ram addresses in its TLB cache, we must
         reset the modified entries */
      cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
@@ -2874,9 +3154,7 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
      }
      if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
          assert(tcg_enabled());
-        tb_lock();
          tb_invalidate_phys_range(addr, addr + length);
-        tb_unlock();
          dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
      }
      cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
@@ -2950,34 +3228,8 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
              l = memory_access_size(mr, l, addr1);
              /* XXX: could force current_cpu to NULL to avoid
                 potential bugs */
-            switch (l) {
-            case 8:
-                /* 64 bit write access */
-                val = ldq_p(buf);
-                result |= memory_region_dispatch_write(mr, addr1, val, 8,
-                                                       attrs);
-                break;
-            case 4:
-                /* 32 bit write access */
-                val = (uint32_t)ldl_p(buf);
-                result |= memory_region_dispatch_write(mr, addr1, val, 4,
-                                                       attrs);
-                break;
-            case 2:
-                /* 16 bit write access */
-                val = lduw_p(buf);
-                result |= memory_region_dispatch_write(mr, addr1, val, 2,
-                                                       attrs);
-                break;
-            case 1:
-                /* 8 bit write access */
-                val = ldub_p(buf);
-                result |= memory_region_dispatch_write(mr, addr1, val, 1,
-                                                       attrs);
-                break;
-            default:
-                abort();
-            }
+            val = ldn_p(buf, l);
+            result |= memory_region_dispatch_write(mr, addr1, val, l, attrs);
          } else {
              /* RAM case */
              ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
@@ -2999,12 +3251,13 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
          }
  
          l = len;
-        mr = flatview_translate(fv, addr, &addr1, &l, true);
+        mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
      }
  
      return result;
  }
  
+/* Called from RCU critical section.  */
  static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                    const uint8_t *buf, int len)
  {
@@ -3013,25 +3266,14 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
      MemoryRegion *mr;
      MemTxResult result = MEMTX_OK;
  
-    if (len > 0) {
-        rcu_read_lock();
-        l = len;
-        mr = flatview_translate(fv, addr, &addr1, &l, true);
-        result = flatview_write_continue(fv, addr, attrs, buf, len,
-                                         addr1, l, mr);
-        rcu_read_unlock();
-    }
+    l = len;
+    mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
+    result = flatview_write_continue(fv, addr, attrs, buf, len,
+                                     addr1, l, mr);
  
      return result;
  }
  
-MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
-                                              MemTxAttrs attrs,
-                                              const uint8_t *buf, int len)
-{
-    return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len);
-}
-
  /* Called within RCU critical section.  */
  MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
                                     MemTxAttrs attrs, uint8_t *buf,
@@ -3048,34 +3290,8 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
              /* I/O case */
              release_lock |= prepare_mmio_access(mr);
              l = memory_access_size(mr, l, addr1);
-            switch (l) {
-            case 8:
-                /* 64 bit read access */
-                result |= memory_region_dispatch_read(mr, addr1, &val, 8,
-                                                      attrs);
-                stq_p(buf, val);
-                break;
-            case 4:
-                /* 32 bit read access */
-                result |= memory_region_dispatch_read(mr, addr1, &val, 4,
-                                                      attrs);
-                stl_p(buf, val);
-                break;
-            case 2:
-                /* 16 bit read access */
-                result |= memory_region_dispatch_read(mr, addr1, &val, 2,
-                                                      attrs);
-                stw_p(buf, val);
-                break;
-            case 1:
-                /* 8 bit read access */
-                result |= memory_region_dispatch_read(mr, addr1, &val, 1,
-                                                      attrs);
-                stb_p(buf, val);
-                break;
-            default:
-                abort();
-            }
+            result |= memory_region_dispatch_read(mr, addr1, &val, l, attrs);
+            stn_p(buf, l, val);
          } else {
              /* RAM case */
              ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
@@ -3096,48 +3312,67 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
          }
  
          l = len;
-        mr = flatview_translate(fv, addr, &addr1, &l, false);
+        mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
      }
  
      return result;
  }
  
-MemTxResult flatview_read_full(FlatView *fv, hwaddr addr,
-                               MemTxAttrs attrs, uint8_t *buf, int len)
+/* Called from RCU critical section.  */
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+                                 MemTxAttrs attrs, uint8_t *buf, int len)
  {
      hwaddr l;
      hwaddr addr1;
      MemoryRegion *mr;
+
+    l = len;
+    mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
+    return flatview_read_continue(fv, addr, attrs, buf, len,
+                                  addr1, l, mr);
+}
+
+MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
+                                    MemTxAttrs attrs, uint8_t *buf, int len)
+{
      MemTxResult result = MEMTX_OK;
+    FlatView *fv;
  
      if (len > 0) {
          rcu_read_lock();
-        l = len;
-        mr = flatview_translate(fv, addr, &addr1, &l, false);
-        result = flatview_read_continue(fv, addr, attrs, buf, len,
-                                        addr1, l, mr);
+        fv = address_space_to_flatview(as);
+        result = flatview_read(fv, addr, attrs, buf, len);
          rcu_read_unlock();
      }
  
      return result;
  }
  
-static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
-                               uint8_t *buf, int len, bool is_write)
+MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
+                                MemTxAttrs attrs,
+                                const uint8_t *buf, int len)
  {
-    if (is_write) {
-        return flatview_write(fv, addr, attrs, (uint8_t *)buf, len);
-    } else {
-        return flatview_read(fv, addr, attrs, (uint8_t *)buf, len);
+    MemTxResult result = MEMTX_OK;
+    FlatView *fv;
+
+    if (len > 0) {
+        rcu_read_lock();
+        fv = address_space_to_flatview(as);
+        result = flatview_write(fv, addr, attrs, buf, len);
+        rcu_read_unlock();
      }
+
+    return result;
  }
  
-MemTxResult address_space_rw(AddressSpace *as, hwaddr addr,
-                             MemTxAttrs attrs, uint8_t *buf,
-                             int len, bool is_write)
+MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
+                             uint8_t *buf, int len, bool is_write)
  {
-    return flatview_rw(address_space_to_flatview(as),
-                       addr, attrs, buf, len, is_write);
+    if (is_write) {
+        return address_space_write(as, addr, attrs, buf, len);
+    } else {
+        return address_space_read_full(as, addr, attrs, buf, len);
+    }
  }
  
  void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
@@ -3163,7 +3398,8 @@ static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
      rcu_read_lock();
      while (len > 0) {
          l = len;
-        mr = address_space_translate(as, addr, &addr1, &l, true);
+        mr = address_space_translate(as, addr, &addr1, &l, true,
+                                     MEMTXATTRS_UNSPECIFIED);
  
          if (!(memory_region_is_ram(mr) ||
                memory_region_is_romd(mr))) {
@@ -3298,19 +3534,17 @@ static void cpu_notify_map_clients(void)
  }
  
  static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
-                                  bool is_write)
+                                  bool is_write, MemTxAttrs attrs)
  {
      MemoryRegion *mr;
      hwaddr l, xlat;
  
-    rcu_read_lock();
      while (len > 0) {
          l = len;
-        mr = flatview_translate(fv, addr, &xlat, &l, is_write);
+        mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
          if (!memory_access_is_direct(mr, is_write)) {
              l = memory_access_size(mr, l, addr);
-            if (!memory_region_access_valid(mr, xlat, l, is_write)) {
-                rcu_read_unlock();
+            if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
                  return false;
              }
          }
@@ -3318,22 +3552,28 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
          len -= l;
          addr += l;
      }
-    rcu_read_unlock();
      return true;
  }
  
  bool address_space_access_valid(AddressSpace *as, hwaddr addr,
-                                int len, bool is_write)
+                                int len, bool is_write,
+                                MemTxAttrs attrs)
  {
-    return flatview_access_valid(address_space_to_flatview(as),
-                                 addr, len, is_write);
+    FlatView *fv;
+    bool result;
+
+    rcu_read_lock();
+    fv = address_space_to_flatview(as);
+    result = flatview_access_valid(fv, addr, len, is_write, attrs);
+    rcu_read_unlock();
+    return result;
  }
  
  static hwaddr
  flatview_extend_translation(FlatView *fv, hwaddr addr,
-                                 hwaddr target_len,
-                                 MemoryRegion *mr, hwaddr base, hwaddr len,
-                                 bool is_write)
+                            hwaddr target_len,
+                            MemoryRegion *mr, hwaddr base, hwaddr len,
+                            bool is_write, MemTxAttrs attrs)
  {
      hwaddr done = 0;
      hwaddr xlat;
@@ -3349,7 +3589,7 @@ flatview_extend_translation(FlatView *fv, hwaddr addr,
  
          len = target_len;
          this_mr = flatview_translate(fv, addr, &xlat,
-                                                   &len, is_write);
+                                     &len, is_write, attrs);
          if (this_mr != mr || xlat != base + done) {
              return done;
          }
@@ -3366,13 +3606,14 @@ flatview_extend_translation(FlatView *fv, hwaddr addr,
  void *address_space_map(AddressSpace *as,
                          hwaddr addr,
                          hwaddr *plen,
-                        bool is_write)
+                        bool is_write,
+                        MemTxAttrs attrs)
  {
      hwaddr len = *plen;
      hwaddr l, xlat;
      MemoryRegion *mr;
      void *ptr;
-    FlatView *fv = address_space_to_flatview(as);
+    FlatView *fv;
  
      if (len == 0) {
          return NULL;
@@ -3380,7 +3621,8 @@ void *address_space_map(AddressSpace *as,
  
      l = len;
      rcu_read_lock();
-    mr = flatview_translate(fv, addr, &xlat, &l, is_write);
+    fv = address_space_to_flatview(as);
+    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
  
      if (!memory_access_is_direct(mr, is_write)) {
          if (atomic_xchg(&bounce.in_use, true)) {
@@ -3408,7 +3650,7 @@ void *address_space_map(AddressSpace *as,
  
      memory_region_ref(mr);
      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
-                                             l, is_write);
+                                        l, is_write, attrs);
      ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
      rcu_read_unlock();
  
@@ -3452,7 +3694,8 @@ void *cpu_physical_memory_map(hwaddr addr,
                                hwaddr *plen,
                                int is_write)
  {
-    return address_space_map(&address_space_memory, addr, plen, is_write);
+    return address_space_map(&address_space_memory, addr, plen, is_write,
+                             MEMTXATTRS_UNSPECIFIED);
  }
  
  void cpu_physical_memory_unmap(void *buffer, hwaddr len,
@@ -3465,9 +3708,6 @@ void cpu_physical_memory_unmap(void *buffer, hwaddr len,
  #define ARG1                     as
  #define SUFFIX
  #define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
-#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
-#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
-#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
  #define RCU_READ_LOCK(...)       rcu_read_lock()
  #define RCU_READ_UNLOCK(...)     rcu_read_unlock()
  #include "memory_ldst.inc.c"
@@ -3478,33 +3718,134 @@ int64_t address_space_cache_init(MemoryRegionCache *cache,
                                   hwaddr len,
                                   bool is_write)
  {
-    cache->len = len;
-    cache->as = as;
-    cache->xlat = addr;
-    return len;
+    AddressSpaceDispatch *d;
+    hwaddr l;
+    MemoryRegion *mr;
+
+    assert(len > 0);
+
+    l = len;
+    cache->fv = address_space_get_flatview(as);
+    d = flatview_to_dispatch(cache->fv);
+    cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
+
+    mr = cache->mrs.mr;
+    memory_region_ref(mr);
+    if (memory_access_is_direct(mr, is_write)) {
+        /* We don't care about the memory attributes here as we're only
+         * doing this if we found actual RAM, which behaves the same
+         * regardless of attributes; so UNSPECIFIED is fine.
+         */
+        l = flatview_extend_translation(cache->fv, addr, len, mr,
+                                        cache->xlat, l, is_write,
+                                        MEMTXATTRS_UNSPECIFIED);
+        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
+    } else {
+        cache->ptr = NULL;
+    }
+
+    cache->len = l;
+    cache->is_write = is_write;
+    return l;
  }
  
  void address_space_cache_invalidate(MemoryRegionCache *cache,
                                      hwaddr addr,
                                      hwaddr access_len)
  {
+    assert(cache->is_write);
+    if (likely(cache->ptr)) {
+        invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
+    }
  }
  
  void address_space_cache_destroy(MemoryRegionCache *cache)
  {
-    cache->as = NULL;
+    if (!cache->mrs.mr) {
+        return;
+    }
+
+    if (xen_enabled()) {
+        xen_invalidate_map_cache_entry(cache->ptr);
+    }
+    memory_region_unref(cache->mrs.mr);
+    flatview_unref(cache->fv);
+    cache->mrs.mr = NULL;
+    cache->fv = NULL;
+}
+
+/* Called from RCU critical section.  Counterpart of
+ * address_space_translate for a range previously set up by
+ * address_space_cache_init; asserts that the cache holds no direct
+ * host pointer, i.e. the target is an MMIO or IOMMU region.
+ */
+static inline MemoryRegion *address_space_translate_cached(
+    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
+    hwaddr *plen, bool is_write, MemTxAttrs attrs)
+{
+    MemoryRegion *region;
+    IOMMUMemoryRegion *iommu;
+    AddressSpace *unused_as;
+
+    assert(!cache->ptr);
+    /* Offset @addr by the translation recorded in the cache. */
+    *xlat = cache->xlat + addr;
+
+    region = cache->mrs.mr;
+    iommu = memory_region_get_iommu(region);
+    if (iommu) {
+        /* IOMMU region: keep translating through it.  */
+        return address_space_translate_iommu(iommu, xlat, plen, NULL,
+                                             is_write, true, &unused_as,
+                                             attrs).mr;
+    }
+
+    /* MMIO region.  */
+    return region;
+}
+
+/* Called from RCU critical section: out-of-line MMIO/IOMMU path used by
+ * address_space_read_cached.  NOTE(review): @addr is cache-relative, yet
+ * flatview_read_continue() retranslates leftovers via cache->fv - confirm.
+ */
+void
+address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
+                                   void *buf, int len)
+{
+    hwaddr xlat_addr;
+    hwaddr chunk = len;
+    MemoryRegion *target;
+
+    target = address_space_translate_cached(cache, addr, &xlat_addr, &chunk,
+                                            false, MEMTXATTRS_UNSPECIFIED);
+    flatview_read_continue(cache->fv, addr, MEMTXATTRS_UNSPECIFIED, buf, len,
+                           xlat_addr, chunk, target);
+}
+
+/* Called from RCU critical section: out-of-line MMIO/IOMMU path used by
+ * address_space_write_cached.  NOTE(review): @addr is cache-relative, yet
+ * flatview_write_continue() retranslates leftovers via cache->fv - confirm.
+ */
+void
+address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
+                                    const void *buf, int len)
+{
+    hwaddr xlat_addr;
+    hwaddr chunk = len;
+    MemoryRegion *target;
+
+    target = address_space_translate_cached(cache, addr, &xlat_addr, &chunk,
+                                            true, MEMTXATTRS_UNSPECIFIED);
+    flatview_write_continue(cache->fv, addr, MEMTXATTRS_UNSPECIFIED, buf, len,
+                            xlat_addr, chunk, target);
  }
  
  #define ARG1_DECL                MemoryRegionCache *cache
  #define ARG1                     cache
-#define SUFFIX                   _cached
-#define TRANSLATE(addr, ...)     \
-    address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
-#define IS_DIRECT(mr, is_write)  true
-#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
-#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
-#define RCU_READ_LOCK()          rcu_read_lock()
-#define RCU_READ_UNLOCK()        rcu_read_unlock()
+#define SUFFIX                   _cached_slow
+#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
+#define RCU_READ_LOCK()          ((void)0)
+#define RCU_READ_UNLOCK()        ((void)0)
  #include "memory_ldst.inc.c"
  
  /* virtual memory access for debug (includes writing to ROM) */
@@ -3588,7 +3929,8 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr)
  
      rcu_read_lock();
      mr = address_space_translate(&address_space_memory,
-                                 phys_addr, &phys_addr, &l, false);
+                                 phys_addr, &phys_addr, &l, false,
+                                 MEMTXATTRS_UNSPECIFIED);
  
      res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
      rcu_read_unlock();
@@ -3612,6 +3954,26 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
      return ret;
  }
  
+/* Call @func on each migratable RAMBlock; stop at first non-zero result */
+int qemu_ram_foreach_migratable_block(RAMBlockIterFunc func, void *opaque)
+{
+    RAMBlock *rb;
+    int rc = 0;
+
+    rcu_read_lock();
+    RAMBLOCK_FOREACH(rb) {
+        if (qemu_ram_is_migratable(rb)) {
+            rc = func(rb->idstr, rb->host, rb->offset, rb->used_length,
+                      opaque);
+            if (rc != 0) {
+                break;
+            }
+        }
+    }
+    rcu_read_unlock();
+    return rc;
+}
+
  /*
   * Unmap pages of memory from start to start+length such that
   * they a) read as 0, b) Trigger whatever fault mechanism
@@ -3633,6 +3995,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
      }
  
      if ((start + length) <= rb->used_length) {
+        bool need_madvise, need_fallocate;
          uint8_t *host_endaddr = host_startaddr + length;
          if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
              error_report("ram_block_discard_range: Unaligned end address: %p",
@@ -3642,29 +4005,60 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
  
          errno = ENOTSUP; /* If we are missing MADVISE etc */
  
-        if (rb->page_size == qemu_host_page_size) {
-#if defined(CONFIG_MADVISE)
-            /* Note: We need the madvise MADV_DONTNEED behaviour of definitely
-             * freeing the page.
-             */
-            ret = madvise(host_startaddr, length, MADV_DONTNEED);
-#endif
-        } else {
-            /* Huge page case  - unfortunately it can't do DONTNEED, but
-             * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
-             * huge page file.
+        /* The logic here is messy;
+         *    madvise DONTNEED fails for hugepages
+         *    fallocate works on hugepages and shmem
+         */
+        need_madvise = (rb->page_size == qemu_host_page_size);
+        need_fallocate = rb->fd != -1;
+        if (need_fallocate) {
+            /* For a file, this causes the area of the file to be zero'd
+             * if read, and for hugetlbfs also causes it to be unmapped
+             * so a userfault will trigger.
               */
  #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
              ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                              start, length);
+            if (ret) {
+                ret = -errno;
+                error_report("ram_block_discard_range: Failed to fallocate "
+                             "%s:%" PRIx64 " +%zx (%d)",
+                             rb->idstr, start, length, ret);
+                goto err;
+            }
+#else
+            ret = -ENOSYS;
+            error_report("ram_block_discard_range: fallocate not available/file"
+                         "%s:%" PRIx64 " +%zx (%d)",
+                         rb->idstr, start, length, ret);
+            goto err;
  #endif
          }
-        if (ret) {
-            ret = -errno;
-            error_report("ram_block_discard_range: Failed to discard range "
+        if (need_madvise) {
+            /* For normal RAM this causes it to be unmapped,
+             * for shared memory it causes the local mapping to disappear
+             * and to fall back on the file contents (which we just
+             * fallocate'd away).
+             */
+#if defined(CONFIG_MADVISE)
+            ret =  madvise(host_startaddr, length, MADV_DONTNEED);
+            if (ret) {
+                ret = -errno;
+                error_report("ram_block_discard_range: Failed to discard range "
+                             "%s:%" PRIx64 " +%zx (%d)",
+                             rb->idstr, start, length, ret);
+                goto err;
+            }
+#else
+            ret = -ENOSYS;
+            error_report("ram_block_discard_range: MADVISE not available"
                           "%s:%" PRIx64 " +%zx (%d)",
                           rb->idstr, start, length, ret);
+            goto err;
+#endif
          }
+        trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
+                                      need_madvise, need_fallocate, ret);
      } else {
          error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
                       "/%zx/" RAM_ADDR_FMT")",
@@ -3675,6 +4069,11 @@ err:
      return ret;
  }
  
+bool ramblock_is_pmem(RAMBlock *rb)
+{
+    return (rb->flags & RAM_PMEM) != 0; /* is the RAM_PMEM flag set? */
+}
+
  #endif
  
  void page_size_init(void)