char: useless NULL check

[qemu.git] / exec.c
diff --git a/exec.c b/exec.c

index f2bed92b64a1a5909d055935fc00c95093f9df51..eac6085760e9d7e205e43fdb6a90e935f09c5e86 100644 (file)
--- a/exec.c
+++ b/exec.c
@@ -42,9 +42,17 @@
  #include "exec/memory.h"
  #include "exec/ioport.h"
  #include "sysemu/dma.h"
+#include "sysemu/numa.h"
+#include "sysemu/hw_accel.h"
  #include "exec/address-spaces.h"
  #include "sysemu/xen-mapcache.h"
-#include "trace.h"
+#include "trace-root.h"
+
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+#include <fcntl.h>
+#include <linux/falloc.h>
+#endif
+
  #endif
  #include "exec/cpu-all.h"
  #include "qemu/rcu_queue.h"
@@ -215,6 +223,12 @@ struct CPUAddressSpace {
      MemoryListener tcg_as_listener;
  };
  
+struct DirtyBitmapSnapshot { /* dirty bits copied out of ram_list.dirty_memory */
+    ram_addr_t start;       /* first RAM address covered by the snapshot */
+    ram_addr_t end;         /* end of the covered range (exclusive) */
+    unsigned long dirty[];  /* one bit per target page; bit 0 is the page at start */
+};
+
  #endif
  
  #if !defined(CONFIG_USER_ONLY)
@@ -1053,6 +1067,75 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
      return dirty;
  }
  
+DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
+     (ram_addr_t start, ram_addr_t length, unsigned client)
+{   /* Copy, then clear, client's dirty bits covering start..start+length */
+    DirtyMemoryBlocks *blocks;
+    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
+    ram_addr_t first = QEMU_ALIGN_DOWN(start, align); /* widened range start */
+    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align); /* widened end */
+    DirtyBitmapSnapshot *snap;
+    unsigned long page, end, dest;
+
+    snap = g_malloc0(sizeof(*snap) +
+                     ((last - first) >> (TARGET_PAGE_BITS + 3))); /* 1 bit/page */
+    snap->start = first;
+    snap->end   = last;
+
+    page = first >> TARGET_PAGE_BITS; /* first page number to copy */
+    end  = last  >> TARGET_PAGE_BITS; /* one past the last page to copy */
+    dest = 0; /* element index into snap->dirty */
+
+    rcu_read_lock();
+
+    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
+
+    while (page < end) { /* one iteration per dirty-memory block touched */
+        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE; /* block number */
+        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; /* page in block */
+        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
+
+        assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
+        assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
+        offset >>= BITS_PER_LEVEL; /* page offset -> index into blocks[idx] */
+
+        bitmap_copy_and_clear_atomic(snap->dirty + dest,
+                                     blocks->blocks[idx] + offset,
+                                     num);
+        page += num;
+        dest += num >> BITS_PER_LEVEL; /* advance by the elements just filled */
+    }
+
+    rcu_read_unlock();
+
+    if (tcg_enabled()) {
+        tlb_reset_dirty_range_all(start, length); /* caller's range, not widened */
+    }
+
+    return snap; /* NOTE(review): presumably the caller g_free()s it - confirm */
+}
+
+bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
+                                            ram_addr_t start,
+                                            ram_addr_t length)
+{
+    unsigned long bit, stop;
+
+    assert(start >= snap->start);
+    assert(start + length <= snap->end);
+
+    /* Bit numbers are target pages counted from snap->start */
+    bit  = (start - snap->start) >> TARGET_PAGE_BITS;
+    stop = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
+
+    for (; bit < stop; bit++) {
+        if (test_bit(bit, snap->dirty)) {
+            return true;
+        }
+    }
+    return false;
+}
+
  /* Called from RCU critical section */
  hwaddr memory_region_section_get_iotlb(CPUState *cpu,
                                         MemoryRegionSection *section,
@@ -1250,6 +1333,87 @@ void qemu_mutex_unlock_ramlist(void)
      qemu_mutex_unlock(&ram_list.mutex);
  }
  
+#ifdef __linux__
+/*
+ * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
+ * may or may not name the same files / on the same filesystem now as
+ * when we actually open and map them.  Iterate over the file
+ * descriptors instead, and use qemu_fd_getpagesize().
+ */
+static int find_max_supported_pagesize(Object *obj, void *opaque)
+{
+    char *mem_path;
+    long *hpsize_min = opaque; /* running minimum page size, updated in place */
+
+    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
+        mem_path = object_property_get_str(obj, "mem-path", NULL);
+        if (mem_path) {
+            long hpsize = qemu_mempath_getpagesize(mem_path);
+            if (hpsize < *hpsize_min) {
+                *hpsize_min = hpsize;
+            }
+            g_free(mem_path); /* the property getter returns an allocated copy */
+        } else {
+            *hpsize_min = getpagesize(); /* no mem-path: normal host page size */
+        }
+    }
+    return 0; /* objects that are not memory backends are simply ignored */
+}
+
+long qemu_getrampagesize(void)
+{
+    static bool warned;
+    long backend_min = LONG_MAX;
+    long main_size;
+    Object *objects;
+
+    /* Page size of main RAM: taken from the global mem_path when one is set,
+     * otherwise the normal host page size.
+     */
+    main_size = mem_path ? qemu_mempath_getpagesize(mem_path) : getpagesize();
+
+    /* Memory-backend objects may be backed by huge pages and can end up
+     * mapped into the system address space (-numa, memory hotplug).  A
+     * page size is only usable if it holds for all of that memory, so
+     * take the minimum over every backend under /objects; a backend
+     * without huge pages counts as normal pages.
+     */
+    objects = object_resolve_path("/objects", NULL);
+    if (objects) {
+        object_child_foreach(objects, find_max_supported_pagesize,
+                             &backend_min);
+    }
+
+    if (backend_min == LONG_MAX) {
+        /* No memory backend was found ==> report main RAM page size */
+        return main_size;
+    }
+
+    if (backend_min <= main_size) {
+        return backend_min;
+    }
+
+    /* The backends use larger pages than main RAM.  If NUMA is disabled
+     * or node 0 has no memory-backend, at least one node uses "normal"
+     * RAM, so the smaller main RAM page size has to be reported instead.
+     */
+    if (nb_numa_nodes != 0 && numa_info[0].node_memdev != NULL) {
+        return backend_min;
+    }
+
+    if (!warned) {
+        error_report("Huge page support disabled (n/a for main memory).");
+        warned = true;
+    }
+    return main_size;
+}
+#else
+long qemu_getrampagesize(void)
+{   /* Variant built when __linux__ is not defined */
+    return getpagesize(); /* always the normal host page size */
+}
+#endif
+
  #ifdef __linux__
  static int64_t get_file_size(int fd)
  {
@@ -1379,7 +1543,7 @@ static void *file_ram_alloc(RAMBlock *block,
      }
  
      if (mem_prealloc) {
-        os_mem_prealloc(fd, area, memory, errp);
+        os_mem_prealloc(fd, area, memory, smp_cpus, errp);
          if (errp && *errp) {
              goto error;
          }
@@ -1439,7 +1603,7 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
      return offset;
  }
  
-ram_addr_t last_ram_offset(void)
+unsigned long last_ram_page(void)
  {
      RAMBlock *block;
      ram_addr_t last = 0;
@@ -1449,7 +1613,7 @@ ram_addr_t last_ram_offset(void)
          last = MAX(last, block->offset + block->max_length);
      }
      rcu_read_unlock();
-    return last;
+    return last >> TARGET_PAGE_BITS;
  }
  
  static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
@@ -1472,6 +1636,11 @@ const char *qemu_ram_get_idstr(RAMBlock *rb)
      return rb->idstr;
  }
  
+bool qemu_ram_is_shared(RAMBlock *rb)
+{
+    return (rb->flags & RAM_SHARED) != 0; /* true iff RAM_SHARED is set */
+}
+
  /* Called with iothread lock held.  */
  void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
  {
@@ -1518,6 +1687,19 @@ size_t qemu_ram_pagesize(RAMBlock *rb)
      return rb->page_size;
  }
  
+/* Largest page size used by any block in ram_list.blocks (0 if it is empty) */
+size_t qemu_ram_pagesize_largest(void)
+{
+    size_t biggest = 0;
+    RAMBlock *rb;
+
+    QLIST_FOREACH_RCU(rb, &ram_list.blocks, next) {
+        size_t cur = qemu_ram_pagesize(rb);
+        biggest = cur > biggest ? cur : biggest;
+    }
+    return biggest;
+}
+
  static int memory_try_enable_merging(void *addr, size_t len)
  {
      if (!machine_mem_merge(current_machine)) {
@@ -1620,7 +1802,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
      ram_addr_t old_ram_size, new_ram_size;
      Error *err = NULL;
  
-    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
+    old_ram_size = last_ram_page();
  
      qemu_mutex_lock_ramlist();
      new_block->offset = find_ram_offset(new_block->max_length);
@@ -1651,7 +1833,6 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
      new_ram_size = MAX(old_ram_size,
                (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
      if (new_ram_size > old_ram_size) {
-        migration_bitmap_extend(old_ram_size, new_ram_size);
          dirty_memory_extend(old_ram_size, new_ram_size);
      }
      /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
@@ -2115,6 +2296,7 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
          return;
      }
      vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
+    vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
      QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
          if (cpu_watchpoint_address_matches(wp, vaddr, len)
              && (wp->flags & flags)) {
@@ -2133,9 +2315,9 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
                  }
                  cpu->watchpoint_hit = wp;
  
-                /* The tb_lock will be reset when cpu_loop_exit or
-                 * cpu_loop_exit_noexc longjmp back into the cpu_exec
-                 * main loop.
+                /* Both tb_lock and iothread_mutex will be reset when
+                 * cpu_loop_exit or cpu_loop_exit_noexc longjmp
+                 * back into the cpu_exec main loop.
                   */
                  tb_lock();
                  tb_check_watchpoint(cpu);
@@ -2370,8 +2552,14 @@ static void io_mem_init(void)
      memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
      memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                            NULL, UINT64_MAX);
+
+    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
+     * which can be called without the iothread mutex.
+     */
      memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
                            NULL, UINT64_MAX);
+    memory_region_clear_global_locking(&io_mem_notdirty);
+
      memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
                            NULL, UINT64_MAX);
  }
@@ -2630,7 +2818,7 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
                  break;
              case 4:
                  /* 32 bit write access */
-                val = ldl_p(buf);
+                val = (uint32_t)ldl_p(buf);
                  result |= memory_region_dispatch_write(mr, addr1, val, 4,
                                                         attrs);
                  break;
@@ -3122,74 +3310,33 @@ int64_t address_space_cache_init(MemoryRegionCache *cache,
                                   hwaddr len,
                                   bool is_write)
  {
-    hwaddr l, xlat;
-    MemoryRegion *mr;
-    void *ptr;
-
-    assert(len > 0);
-
-    l = len;
-    mr = address_space_translate(as, addr, &xlat, &l, is_write);
-    if (!memory_access_is_direct(mr, is_write)) {
-        return -EINVAL;
-    }
-
-    l = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write);
-    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, &l);
-
-    cache->xlat = xlat;
-    cache->is_write = is_write;
-    cache->mr = mr;
-    cache->ptr = ptr;
-    cache->len = l;
-    memory_region_ref(cache->mr);
-
-    return l;
+    cache->len = len;
+    cache->as = as;
+    cache->xlat = addr;
+    return len;
  }
  
  void address_space_cache_invalidate(MemoryRegionCache *cache,
                                      hwaddr addr,
                                      hwaddr access_len)
  {
-    assert(cache->is_write);
-    invalidate_and_set_dirty(cache->mr, addr + cache->xlat, access_len);
  }
  
  void address_space_cache_destroy(MemoryRegionCache *cache)
  {
-    if (!cache->mr) {
-        return;
-    }
-
-    if (xen_enabled()) {
-        xen_invalidate_map_cache_entry(cache->ptr);
-    }
-    memory_region_unref(cache->mr);
-}
-
-/* Called from RCU critical section.  This function has the same
- * semantics as address_space_translate, but it only works on a
- * predefined range of a MemoryRegion that was mapped with
- * address_space_cache_init.
- */
-static inline MemoryRegion *address_space_translate_cached(
-    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
-    hwaddr *plen, bool is_write)
-{
-    assert(addr < cache->len && *plen <= cache->len - addr);
-    *xlat = addr + cache->xlat;
-    return cache->mr;
+    cache->as = NULL;
  }
  
  #define ARG1_DECL                MemoryRegionCache *cache
  #define ARG1                     cache
  #define SUFFIX                   _cached
-#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
+#define TRANSLATE(addr, ...)     \
+    address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
  #define IS_DIRECT(mr, is_write)  true
-#define MAP_RAM(mr, ofs)         (cache->ptr + (ofs - cache->xlat))
-#define INVALIDATE(mr, ofs, len) ((void)0)
-#define RCU_READ_LOCK()          ((void)0)
-#define RCU_READ_UNLOCK()        ((void)0)
+#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
+#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
+#define RCU_READ_LOCK()          rcu_read_lock()
+#define RCU_READ_UNLOCK()        rcu_read_unlock()
  #include "memory_ldst.inc.c"
  
  /* virtual memory access for debug (includes writing to ROM) */
@@ -3200,6 +3347,7 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
      hwaddr phys_addr;
      target_ulong page;
  
+    cpu_synchronize_state(cpu);
      while (len > 0) {
          int asidx;
          MemTxAttrs attrs;
@@ -3233,9 +3381,9 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
   * Allows code that needs to deal with migration bitmaps etc to still be built
   * target independent.
   */
-size_t qemu_target_page_bits(void)
+size_t qemu_target_page_size(void)
  {
-    return TARGET_PAGE_BITS;
+    return TARGET_PAGE_SIZE;
  }
  
  #endif
@@ -3286,4 +3434,68 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
      rcu_read_unlock();
      return ret;
  }
+
+/*
+ * Unmap pages of memory from start to start+length such that
+ * they a) read as 0, b) trigger whatever fault mechanism
+ * the OS provides for postcopy.
+ * The pages must be unmapped by the end of the function.
+ * The host addresses of both ends must be aligned to rb->page_size.
+ * Returns: 0 on success, non-zero (-1 or a negative errno) on failure
+ */
+int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
+{
+    int ret = -1; /* result of every path that fails before a syscall */
+
+    uint8_t *host_startaddr = rb->host + start; /* start is an offset into rb */
+
+    if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
+        error_report("ram_block_discard_range: Unaligned start address: %p",
+                     host_startaddr);
+        goto err;
+    }
+
+    if ((start + length) <= rb->used_length) { /* range must fit in the block */
+        uint8_t *host_endaddr = host_startaddr + length;
+        if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
+            error_report("ram_block_discard_range: Unaligned end address: %p",
+                         host_endaddr);
+            goto err;
+        }
+
+        errno = ENOTSUP; /* If we are missing MADVISE etc */
+
+        if (rb->page_size == qemu_host_page_size) { /* normal-sized pages */
+#if defined(CONFIG_MADVISE)
+            /* Note: We need the madvise MADV_DONTNEED behaviour of definitely
+             * freeing the page.
+             */
+            ret = madvise(host_startaddr, length, MADV_DONTNEED);
+#endif
+        } else {
+            /* Huge page case  - unfortunately it can't do DONTNEED, but
+             * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
+             * huge page file.
+             */
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                            start, length);
+#endif
+        }
+        if (ret) { /* call failed, or was compiled out and ret is still -1 */
+            ret = -errno; /* -ENOTSUP when no discard call was compiled in */
+            error_report("ram_block_discard_range: Failed to discard range "
+                         "%s:%" PRIx64 " +%zx (%d)",
+                         rb->idstr, start, length, ret);
+        }
+    } else { /* overrun: reported here, ret keeps its initial -1 */
+        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
+                     "/%zx/" RAM_ADDR_FMT")",
+                     rb->idstr, start, length, rb->used_length);
+    }
+
+err:
+    return ret;
+}
+
  #endif