X-Git-Url: https://repo.jachan.dev/qemu.git/blobdiff_plain/863e9621c51a7544f1a2ae78387749145adaf450..9a76bd783d0421962e8c65bb853a57eef4897720:/exec.c diff --git a/exec.c b/exec.c index e63c5a1bd4..01ac21e3cd 100644 --- a/exec.c +++ b/exec.c @@ -24,8 +24,10 @@ #include "qemu/cutils.h" #include "cpu.h" #include "exec/exec-all.h" +#include "exec/target_page.h" #include "tcg.h" #include "hw/qdev-core.h" +#include "hw/qdev-properties.h" #if !defined(CONFIG_USER_ONLY) #include "hw/boards.h" #include "hw/xen/xen.h" @@ -42,9 +44,17 @@ #include "exec/memory.h" #include "exec/ioport.h" #include "sysemu/dma.h" +#include "sysemu/numa.h" +#include "sysemu/hw_accel.h" #include "exec/address-spaces.h" #include "sysemu/xen-mapcache.h" -#include "trace.h" +#include "trace-root.h" + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE +#include +#include +#endif + #endif #include "exec/cpu-all.h" #include "qemu/rcu_queue.h" @@ -63,6 +73,8 @@ #include "qemu/mmap-alloc.h" #endif +#include "monitor/monitor.h" + //#define DEBUG_SUBPAGE #if !defined(CONFIG_USER_ONLY) @@ -93,6 +105,11 @@ static MemoryRegion io_mem_unassigned; #endif +#ifdef TARGET_PAGE_BITS_VARY +int target_page_bits; +bool target_page_bits_decided; +#endif + struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus); /* current CPU in the current thread. It is only valid inside cpu_exec() */ @@ -102,8 +119,42 @@ __thread CPUState *current_cpu; 2 = Adaptive rate instruction counting. */ int use_icount; +uintptr_t qemu_host_page_size; +intptr_t qemu_host_page_mask; +uintptr_t qemu_real_host_page_size; +intptr_t qemu_real_host_page_mask; + +bool set_preferred_target_page_bits(int bits) +{ + /* The target page size is the lowest common denominator for all + * the CPUs in the system, so we can only make it smaller, never + * larger. And we can't make it smaller once we've committed to + * a particular size. + */ +#ifdef TARGET_PAGE_BITS_VARY + assert(bits >= TARGET_PAGE_BITS_MIN); + if (target_page_bits == 0 || target_page_bits > bits) { + if (target_page_bits_decided) { + return false; + } + target_page_bits = bits; + } +#endif + return true; +} + #if !defined(CONFIG_USER_ONLY) +static void finalize_target_page_bits(void) +{ +#ifdef TARGET_PAGE_BITS_VARY + if (target_page_bits == 0) { + target_page_bits = TARGET_PAGE_BITS_MIN; + } + target_page_bits_decided = true; +#endif +} + typedef struct PhysPageEntry PhysPageEntry; struct PhysPageEntry { @@ -153,7 +204,7 @@ typedef struct subpage_t { MemoryRegion iomem; AddressSpace *as; hwaddr base; - uint16_t sub_section[TARGET_PAGE_SIZE]; + uint16_t sub_section[]; } subpage_t; #define PHYS_SECTION_UNASSIGNED 0 @@ -181,6 +232,12 @@ struct CPUAddressSpace { MemoryListener tcg_as_listener; }; +struct DirtyBitmapSnapshot { + ram_addr_t start; + ram_addr_t end; + unsigned long dirty[]; +}; + #endif #if !defined(CONFIG_USER_ONLY) @@ -318,15 +375,16 @@ static inline bool section_covers_addr(const MemoryRegionSection *section, /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means * the section must cover the entire address space. 
*/ - return section->size.hi || + return int128_gethi(section->size) || range_covers_byte(section->offset_within_address_space, - section->size.lo, addr); + int128_getlo(section->size), addr); } -static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr, - Node *nodes, MemoryRegionSection *sections) +static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr) { - PhysPageEntry *p; + PhysPageEntry lp = d->phys_map, *p; + Node *nodes = d->map.nodes; + MemoryRegionSection *sections = d->map.sections; hwaddr index = addr >> TARGET_PAGE_BITS; int i; @@ -364,8 +422,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d, section_covers_addr(section, addr)) { update = false; } else { - section = phys_page_find(d->phys_map, addr, d->map.nodes, - d->map.sections); + section = phys_page_find(d, addr); update = true; } if (resolve_subpage && section->mr->subpage) { @@ -415,41 +472,112 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x } /* Called from RCU critical section */ -MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr, - hwaddr *xlat, hwaddr *plen, - bool is_write) +static MemoryRegionSection address_space_do_translate(AddressSpace *as, + hwaddr addr, + hwaddr *xlat, + hwaddr *plen, + bool is_write, + bool is_mmio) { IOMMUTLBEntry iotlb; MemoryRegionSection *section; - MemoryRegion *mr; + IOMMUMemoryRegion *iommu_mr; + IOMMUMemoryRegionClass *imrc; for (;;) { AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch); - section = address_space_translate_internal(d, addr, &addr, plen, true); - mr = section->mr; + section = address_space_translate_internal(d, addr, &addr, plen, is_mmio); - if (!mr->iommu_ops) { + iommu_mr = memory_region_get_iommu(section->mr); + if (!iommu_mr) { break; } + imrc = memory_region_get_iommu_class_nocheck(iommu_mr); - iotlb = mr->iommu_ops->translate(mr, addr, is_write); + iotlb = imrc->translate(iommu_mr, addr, is_write ? + IOMMU_WO : IOMMU_RO); addr = ((iotlb.translated_addr & ~iotlb.addr_mask) | (addr & iotlb.addr_mask)); *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1); if (!(iotlb.perm & (1 << is_write))) { - mr = &io_mem_unassigned; - break; + goto translate_fail; } as = iotlb.target_as; } + *xlat = addr; + + return *section; + +translate_fail: + return (MemoryRegionSection) { .mr = &io_mem_unassigned }; +} + +/* Called from RCU critical section */ +IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, + bool is_write) +{ + MemoryRegionSection section; + hwaddr xlat, plen; + + /* Try to get maximum page mask during translation. */ + plen = (hwaddr)-1; + + /* This can never be MMIO. */ + section = address_space_do_translate(as, addr, &xlat, &plen, + is_write, false); + + /* Illegal translation */ + if (section.mr == &io_mem_unassigned) { + goto iotlb_fail; + } + + /* Convert memory region offset into address space offset */ + xlat += section.offset_within_address_space - + section.offset_within_region; + + if (plen == (hwaddr)-1) { + /* + * We use default page size here. Logically it only happens + * for identity mappings. + */ + plen = TARGET_PAGE_SIZE; + } + + /* Convert to address mask */ + plen -= 1; + + return (IOMMUTLBEntry) { + .target_as = section.address_space, + .iova = addr & ~plen, + .translated_addr = xlat & ~plen, + .addr_mask = plen, + /* IOTLBs are for DMAs, and DMA only allows on RAMs. 
*/ + .perm = IOMMU_RW, + }; + +iotlb_fail: + return (IOMMUTLBEntry) {0}; +} + +/* Called from RCU critical section */ +MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr, + hwaddr *xlat, hwaddr *plen, + bool is_write) +{ + MemoryRegion *mr; + MemoryRegionSection section; + + /* This can be MMIO, so setup MMIO bit. */ + section = address_space_do_translate(as, addr, xlat, plen, is_write, true); + mr = section.mr; + if (xen_enabled() && memory_access_is_direct(mr, is_write)) { hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr; *plen = MIN(page, *plen); } - *xlat = addr; return mr; } @@ -459,11 +587,11 @@ address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, hwaddr *xlat, hwaddr *plen) { MemoryRegionSection *section; - AddressSpaceDispatch *d = cpu->cpu_ases[asidx].memory_dispatch; + AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch); section = address_space_translate_internal(d, addr, xlat, plen, false); - assert(!section->mr->iommu_ops); + assert(!memory_region_is_iommu(section->mr)); return section; } #endif @@ -477,7 +605,7 @@ static int cpu_common_post_load(void *opaque, int version_id) /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the version_id is increased. */ cpu->interrupt_request &= ~0x01; - tlb_flush(cpu, 1); + tlb_flush(cpu); return 0; } @@ -596,7 +724,7 @@ AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) } #endif -void cpu_exec_exit(CPUState *cpu) +void cpu_exec_unrealizefn(CPUState *cpu) { CPUClass *cc = CPU_GET_CLASS(cpu); @@ -610,31 +738,35 @@ void cpu_exec_exit(CPUState *cpu) } } -void cpu_exec_init(CPUState *cpu, Error **errp) -{ - CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu); - Error *local_err ATTRIBUTE_UNUSED = NULL; - - cpu->as = NULL; - cpu->num_ases = 0; - +Property cpu_common_props[] = { #ifndef CONFIG_USER_ONLY - cpu->thread_id = qemu_get_thread_id(); - - /* This is a softmmu CPU object, so create a property for it + /* Create a memory property for softmmu CPU object, * so users can wire up its memory. (This can't go in qom/cpu.c * because that file is compiled only once for both user-mode * and system builds.) The default if no link is set up is to use * the system address space. 
*/ - object_property_add_link(OBJECT(cpu), "memory", TYPE_MEMORY_REGION, - (Object **)&cpu->memory, - qdev_prop_allow_set_link_before_realize, - OBJ_PROP_LINK_UNREF_ON_RELEASE, - &error_abort); + DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION, + MemoryRegion *), +#endif + DEFINE_PROP_END_OF_LIST(), +}; + +void cpu_exec_initfn(CPUState *cpu) +{ + cpu->as = NULL; + cpu->num_ases = 0; + +#ifndef CONFIG_USER_ONLY + cpu->thread_id = qemu_get_thread_id(); cpu->memory = system_memory; object_ref(OBJECT(cpu->memory)); #endif +} + +void cpu_exec_realizefn(CPUState *cpu, Error **errp) +{ + CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu); cpu_list_add(cpu); @@ -651,7 +783,11 @@ void cpu_exec_init(CPUState *cpu, Error **errp) #if defined(CONFIG_USER_ONLY) static void breakpoint_invalidate(CPUState *cpu, target_ulong pc) { + mmap_lock(); + tb_lock(); tb_invalidate_phys_page_range(pc, pc + 1, 0); + tb_unlock(); + mmap_unlock(); } #else static void breakpoint_invalidate(CPUState *cpu, target_ulong pc) @@ -660,6 +796,7 @@ static void breakpoint_invalidate(CPUState *cpu, target_ulong pc) hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs); int asidx = cpu_asidx_from_attrs(cpu, attrs); if (phys != -1) { + /* Locks grabbed by tb_invalidate_phys_addr */ tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as, phys | (pc & ~TARGET_PAGE_MASK)); } @@ -870,11 +1007,13 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...) fprintf(stderr, "\n"); cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP); if (qemu_log_separate()) { + qemu_log_lock(); qemu_log("qemu: fatal: "); qemu_log_vprintf(fmt, ap2); qemu_log("\n"); log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP); qemu_log_flush(); + qemu_log_unlock(); qemu_log_close(); } va_end(ap2); @@ -901,7 +1040,7 @@ static RAMBlock *qemu_get_ram_block(ram_addr_t addr) if (block && addr - block->offset < block->max_length) { return block; } - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { if (addr - block->offset < block->max_length) { goto found; } @@ -990,6 +1129,75 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start, return dirty; } +DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty + (ram_addr_t start, ram_addr_t length, unsigned client) +{ + DirtyMemoryBlocks *blocks; + unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL); + ram_addr_t first = QEMU_ALIGN_DOWN(start, align); + ram_addr_t last = QEMU_ALIGN_UP(start + length, align); + DirtyBitmapSnapshot *snap; + unsigned long page, end, dest; + + snap = g_malloc0(sizeof(*snap) + + ((last - first) >> (TARGET_PAGE_BITS + 3))); + snap->start = first; + snap->end = last; + + page = first >> TARGET_PAGE_BITS; + end = last >> TARGET_PAGE_BITS; + dest = 0; + + rcu_read_lock(); + + blocks = atomic_rcu_read(&ram_list.dirty_memory[client]); + + while (page < end) { + unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE; + unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; + unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset); + + assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL))); + assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL))); + offset >>= BITS_PER_LEVEL; + + bitmap_copy_and_clear_atomic(snap->dirty + dest, + blocks->blocks[idx] + offset, + num); + page += num; + dest += num >> BITS_PER_LEVEL; + } + + rcu_read_unlock(); + + if (tcg_enabled()) { + tlb_reset_dirty_range_all(start, length); + } + + return snap; +} + +bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap, + 
ram_addr_t start, + ram_addr_t length) +{ + unsigned long page, end; + + assert(start >= snap->start); + assert(start + length <= snap->end); + + end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS; + page = (start - snap->start) >> TARGET_PAGE_BITS; + + while (page < end) { + if (test_bit(page, snap->dirty)) { + return true; + } + page++; + } + return false; +} + /* Called from RCU critical section */ hwaddr memory_region_section_get_iotlb(CPUState *cpu, MemoryRegionSection *section, @@ -1100,8 +1308,7 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti subpage_t *subpage; hwaddr base = section->offset_within_address_space & TARGET_PAGE_MASK; - MemoryRegionSection *existing = phys_page_find(d->phys_map, base, - d->map.nodes, d->map.sections); + MemoryRegionSection *existing = phys_page_find(d, base); MemoryRegionSection subsection = { .offset_within_address_space = base, .size = int128_make64(TARGET_PAGE_SIZE), @@ -1187,25 +1394,128 @@ void qemu_mutex_unlock_ramlist(void) qemu_mutex_unlock(&ram_list.mutex); } +void ram_block_dump(Monitor *mon) +{ + RAMBlock *block; + char *psize; + + rcu_read_lock(); + monitor_printf(mon, "%24s %8s %18s %18s %18s\n", + "Block Name", "PSize", "Offset", "Used", "Total"); + RAMBLOCK_FOREACH(block) { + psize = size_to_str(block->page_size); + monitor_printf(mon, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64 + " 0x%016" PRIx64 "\n", block->idstr, psize, + (uint64_t)block->offset, + (uint64_t)block->used_length, + (uint64_t)block->max_length); + g_free(psize); + } + rcu_read_unlock(); +} + #ifdef __linux__ -static void *file_ram_alloc(RAMBlock *block, - ram_addr_t memory, - const char *path, - Error **errp) +/* + * FIXME TOCTTOU: this iterates over memory backends' mem-path, which + * may or may not name the same files / on the same filesystem now as + * when we actually open and map them. Iterate over the file + * descriptors instead, and use qemu_fd_getpagesize(). + */ +static int find_max_supported_pagesize(Object *obj, void *opaque) +{ + char *mem_path; + long *hpsize_min = opaque; + + if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) { + mem_path = object_property_get_str(obj, "mem-path", NULL); + if (mem_path) { + long hpsize = qemu_mempath_getpagesize(mem_path); + if (hpsize < *hpsize_min) { + *hpsize_min = hpsize; + } + } else { + *hpsize_min = getpagesize(); + } + } + + return 0; +} + +long qemu_getrampagesize(void) +{ + long hpsize = LONG_MAX; + long mainrampagesize; + Object *memdev_root; + + if (mem_path) { + mainrampagesize = qemu_mempath_getpagesize(mem_path); + } else { + mainrampagesize = getpagesize(); + } + + /* it's possible we have memory-backend objects with + * hugepage-backed RAM. these may get mapped into system + * address space via -numa parameters or memory hotplug + * hooks. we want to take these into account, but we + * also want to make sure these supported hugepage + * sizes are applicable across the entire range of memory + * we may boot from, so we take the min across all + * backends, and assume normal pages in cases where a + * backend isn't backed by hugepages. 
+ */ + memdev_root = object_resolve_path("/objects", NULL); + if (memdev_root) { + object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize); + } + if (hpsize == LONG_MAX) { + /* No additional memory regions found ==> Report main RAM page size */ + return mainrampagesize; + } + + /* If NUMA is disabled or the NUMA nodes are not backed with a + * memory-backend, then there is at least one node using "normal" RAM, + * so if its page size is smaller we have got to report that size instead. + */ + if (hpsize > mainrampagesize && + (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL)) { + static bool warned; + if (!warned) { + error_report("Huge page support disabled (n/a for main memory)."); + warned = true; + } + return mainrampagesize; + } + + return hpsize; +} +#else +long qemu_getrampagesize(void) +{ + return getpagesize(); +} +#endif + +#ifdef __linux__ +static int64_t get_file_size(int fd) +{ + int64_t size = lseek(fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; +} + +static int file_ram_open(const char *path, + const char *region_name, + bool *created, + Error **errp) { - bool unlink_on_error = false; char *filename; char *sanitized_name; char *c; - void *area = MAP_FAILED; int fd = -1; - if (kvm_enabled() && !kvm_has_sync_mmu()) { - error_setg(errp, - "host lacks kvm mmu notifiers, -mem-path unsupported"); - return NULL; - } - + *created = false; for (;;) { fd = open(path, O_RDWR); if (fd >= 0) { @@ -1216,13 +1526,13 @@ static void *file_ram_alloc(RAMBlock *block, /* @path names a file that doesn't exist, create it */ fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644); if (fd >= 0) { - unlink_on_error = true; + *created = true; break; } } else if (errno == EISDIR) { /* @path names a directory, create a file there */ /* Make name safe to use with mkstemp by replacing '/' with '_'. */ - sanitized_name = g_strdup(memory_region_name(block->mr)); + sanitized_name = g_strdup(region_name); for (c = sanitized_name; *c != '\0'; c++) { if (*c == '/') { *c = '_'; @@ -1245,7 +1555,7 @@ static void *file_ram_alloc(RAMBlock *block, error_setg_errno(errp, errno, "can't open backing store %s for guest RAM", path); - goto error; + return -1; } /* * Try again on EINTR and EEXIST. The latter happens when @@ -1253,14 +1563,30 @@ static void *file_ram_alloc(RAMBlock *block, */ } + return fd; +} + +static void *file_ram_alloc(RAMBlock *block, + ram_addr_t memory, + int fd, + bool truncate, + Error **errp) +{ + void *area; + block->page_size = qemu_fd_getpagesize(fd); - block->mr->align = MAX(block->page_size, QEMU_VMALLOC_ALIGN); + block->mr->align = block->page_size; +#if defined(__s390x__) + if (kvm_enabled()) { + block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN); + } +#endif if (memory < block->page_size) { error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to " "or larger than page size 0x%zx", memory, block->page_size); - goto error; + return NULL; } memory = ROUND_UP(memory, block->page_size); @@ -1270,8 +1596,16 @@ static void *file_ram_alloc(RAMBlock *block, * hosts, so don't bother bailing out on errors. * If anything goes wrong with it under other filesystems, * mmap will fail. + * + * Do not truncate the non-empty backend file to avoid corrupting + * the existing data in the file. Disabling shrinking is not + * enough. For example, the current vNVDIMM implementation stores + * the guest NVDIMM labels at the end of the backend file. If the + * backend file is later extended, QEMU will not be able to find + * those labels. 
Therefore, extending the non-empty backend file + * is disabled as well. */ - if (ftruncate(fd, memory)) { + if (truncate && ftruncate(fd, memory)) { perror("ftruncate"); } @@ -1280,30 +1614,19 @@ static void *file_ram_alloc(RAMBlock *block, if (area == MAP_FAILED) { error_setg_errno(errp, errno, "unable to map backing store for guest RAM"); - goto error; + return NULL; } if (mem_prealloc) { - os_mem_prealloc(fd, area, memory, errp); + os_mem_prealloc(fd, area, memory, smp_cpus, errp); if (errp && *errp) { - goto error; + qemu_ram_munmap(area, memory); + return NULL; } } block->fd = fd; return area; - -error: - if (area != MAP_FAILED) { - qemu_ram_munmap(area, memory); - } - if (unlink_on_error) { - unlink(path); - } - if (fd != -1) { - close(fd); - } - return NULL; } #endif @@ -1319,12 +1642,12 @@ static ram_addr_t find_ram_offset(ram_addr_t size) return 0; } - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { ram_addr_t end, next = RAM_ADDR_MAX; end = block->offset + block->max_length; - QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(next_block) { if (next_block->offset >= end) { next = MIN(next, next_block->offset); } @@ -1344,17 +1667,17 @@ static ram_addr_t find_ram_offset(ram_addr_t size) return offset; } -ram_addr_t last_ram_offset(void) +unsigned long last_ram_page(void) { RAMBlock *block; ram_addr_t last = 0; rcu_read_lock(); - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { last = MAX(last, block->offset + block->max_length); } rcu_read_unlock(); - return last; + return last >> TARGET_PAGE_BITS; } static void qemu_ram_setup_dump(void *addr, ram_addr_t size) @@ -1377,6 +1700,11 @@ const char *qemu_ram_get_idstr(RAMBlock *rb) return rb->idstr; } +bool qemu_ram_is_shared(RAMBlock *rb) +{ + return rb->flags & RAM_SHARED; +} + /* Called with iothread lock held. */ void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev) { @@ -1395,7 +1723,7 @@ void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev) pstrcat(new_block->idstr, sizeof(new_block->idstr), name); rcu_read_lock(); - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { if (block != new_block && !strcmp(block->idstr, new_block->idstr)) { fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n", @@ -1423,6 +1751,19 @@ size_t qemu_ram_pagesize(RAMBlock *rb) return rb->page_size; } +/* Returns the largest size of page in use */ +size_t qemu_ram_pagesize_largest(void) +{ + RAMBlock *block; + size_t largest = 0; + + RAMBLOCK_FOREACH(block) { + largest = MAX(largest, qemu_ram_pagesize(block)); + } + + return largest; +} + static int memory_try_enable_merging(void *addr, size_t len) { if (!machine_mem_merge(current_machine)) { @@ -1525,7 +1866,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) ram_addr_t old_ram_size, new_ram_size; Error *err = NULL; - old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS; + old_ram_size = last_ram_page(); qemu_mutex_lock_ramlist(); new_block->offset = find_ram_offset(new_block->max_length); @@ -1556,14 +1897,13 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) new_ram_size = MAX(old_ram_size, (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS); if (new_ram_size > old_ram_size) { - migration_bitmap_extend(old_ram_size, new_ram_size); dirty_memory_extend(old_ram_size, new_ram_size); } /* Keep the list sorted from biggest to smallest block. 
Unlike QTAILQ, * QLIST (which has an RCU-friendly variant) does not have insertion at * tail, so save the last element in last_block. */ - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { last_block = block; if (block->max_length < new_block->max_length) { break; @@ -1592,22 +1932,30 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE); /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */ qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK); + ram_block_notify_add(new_block->host, new_block->max_length); } } #ifdef __linux__ -RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, - bool share, const char *mem_path, - Error **errp) +RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, + bool share, int fd, + Error **errp) { RAMBlock *new_block; Error *local_err = NULL; + int64_t file_size; if (xen_enabled()) { error_setg(errp, "-mem-path not supported with Xen"); return NULL; } + if (kvm_enabled() && !kvm_has_sync_mmu()) { + error_setg(errp, + "host lacks kvm mmu notifiers, -mem-path unsupported"); + return NULL; + } + if (phys_mem_alloc != qemu_anon_ram_alloc) { /* * file_ram_alloc() needs to allocate just like @@ -1620,13 +1968,20 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, } size = HOST_PAGE_ALIGN(size); + file_size = get_file_size(fd); + if (file_size > 0 && file_size < size) { + error_setg(errp, "backing store %s size 0x%" PRIx64 + " does not match 'size' option 0x" RAM_ADDR_FMT, + mem_path, file_size, size); + return NULL; + } + new_block = g_malloc0(sizeof(*new_block)); new_block->mr = mr; new_block->used_length = size; new_block->max_length = size; new_block->flags = share ? RAM_SHARED : 0; - new_block->host = file_ram_alloc(new_block, size, - mem_path, errp); + new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp); if (!new_block->host) { g_free(new_block); return NULL; @@ -1639,6 +1994,33 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, return NULL; } return new_block; + +} + + +RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, + bool share, const char *mem_path, + Error **errp) +{ + int fd; + bool created; + RAMBlock *block; + + fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp); + if (fd < 0) { + return NULL; + } + + block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp); + if (!block) { + if (created) { + unlink(mem_path); + } + close(fd); + return NULL; + } + + return block; } #endif @@ -1722,6 +2104,10 @@ void qemu_ram_free(RAMBlock *block) return; } + if (block->host) { + ram_block_notify_remove(block->host, block->max_length); + } + qemu_mutex_lock_ramlist(); QLIST_REMOVE_RCU(block, next); ram_list.mru_block = NULL; @@ -1740,7 +2126,7 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) int flags; void *area, *vaddr; - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { offset = addr - block->offset; if (offset < block->max_length) { vaddr = ramblock_ptr(block, offset); @@ -1803,10 +2189,10 @@ void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr) * In that case just map until the end of the page. 
*/ if (block->offset == 0) { - return xen_map_cache(addr, 0, 0); + return xen_map_cache(addr, 0, 0, false); } - block->host = xen_map_cache(block->offset, block->max_length, 1); + block->host = xen_map_cache(block->offset, block->max_length, 1, false); } return ramblock_ptr(block, addr); } @@ -1836,10 +2222,10 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr, * In that case just map the requested area. */ if (block->offset == 0) { - return xen_map_cache(addr, *size, 1); + return xen_map_cache(addr, *size, 1, true); } - block->host = xen_map_cache(block->offset, block->max_length, 1); + block->host = xen_map_cache(block->offset, block->max_length, 1, true); } return ramblock_ptr(block, addr); @@ -1886,7 +2272,7 @@ RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, goto found; } - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { /* This case append when the block is not mapped. */ if (block->host == NULL) { continue; @@ -1919,7 +2305,7 @@ RAMBlock *qemu_ram_block_by_name(const char *name) { RAMBlock *block; - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { if (!strcmp(name, block->idstr)) { return block; } @@ -1947,7 +2333,12 @@ ram_addr_t qemu_ram_addr_from_host(void *ptr) static void notdirty_mem_write(void *opaque, hwaddr ram_addr, uint64_t val, unsigned size) { + bool locked = false; + + assert(tcg_enabled()); if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) { + locked = true; + tb_lock(); tb_invalidate_phys_page_fast(ram_addr, size); } switch (size) { @@ -1963,6 +2354,11 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr, default: abort(); } + + if (locked) { + tb_unlock(); + } + /* Set both VGA and migration bits for simplicity and to remove * the notdirty callback faster. */ @@ -1998,6 +2394,7 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags) CPUWatchpoint *wp; uint32_t cpu_flags; + assert(tcg_enabled()); if (cpu->watchpoint_hit) { /* We re-entered the check after replacing the TB. Now raise * the debug interrupt so that is will trigger after the @@ -2006,6 +2403,7 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags) return; } vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset; + vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len); QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) { if (cpu_watchpoint_address_matches(wp, vaddr, len) && (wp->flags & flags)) { @@ -2023,6 +2421,12 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags) continue; } cpu->watchpoint_hit = wp; + + /* Both tb_lock and iothread_mutex will be reset when + * cpu_loop_exit or cpu_loop_exit_noexc longjmp + * back into the cpu_exec main loop. 
+ */ + tb_lock(); tb_check_watchpoint(cpu); if (wp->flags & BP_STOP_BEFORE_ACCESS) { cpu->exception_index = EXCP_DEBUG; @@ -2210,8 +2614,7 @@ static subpage_t *subpage_init(AddressSpace *as, hwaddr base) { subpage_t *mmio; - mmio = g_malloc0(sizeof(subpage_t)); - + mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t)); mmio->as = as; mmio->base = base; memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio, @@ -2256,8 +2659,14 @@ static void io_mem_init(void) memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX); memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX); + + /* io_mem_notdirty calls tb_invalidate_phys_page_fast, + * which can be called without the iothread mutex. + */ memory_region_init_io(&io_mem_notdirty, NULL, ¬dirty_mem_ops, NULL, NULL, UINT64_MAX); + memory_region_clear_global_locking(&io_mem_notdirty); + memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL, NULL, UINT64_MAX); } @@ -2316,8 +2725,8 @@ static void tcg_commit(MemoryListener *listener) * may have split the RCU critical section. */ d = atomic_rcu_read(&cpuas->as->dispatch); - cpuas->memory_dispatch = d; - tlb_flush(cpuas->cpu, 1); + atomic_rcu_set(&cpuas->memory_dispatch, d); + tlb_flush(cpuas->cpu); } void address_space_init_dispatch(AddressSpace *as) @@ -2431,7 +2840,10 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask); } if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) { + assert(tcg_enabled()); + tb_lock(); tb_invalidate_phys_range(addr, addr + length); + tb_unlock(); dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE); } cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask); @@ -2514,7 +2926,7 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr, break; case 4: /* 32 bit write access */ - val = ldl_p(buf); + val = (uint32_t)ldl_p(buf); result |= memory_region_dispatch_write(mr, addr1, val, 4, attrs); break; @@ -2535,7 +2947,7 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr, } } else { /* RAM case */ - ptr = qemu_map_ram_ptr(mr->ram_block, addr1); + ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l); memcpy(ptr, buf, l); invalidate_and_set_dirty(mr, addr1, l); } @@ -2626,7 +3038,7 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, } } else { /* RAM case */ - ptr = qemu_map_ram_ptr(mr->ram_block, addr1); + ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l); memcpy(buf, ptr, l); } @@ -2803,6 +3215,14 @@ void cpu_register_map_client(QEMUBH *bh) void cpu_exec_init_all(void) { qemu_mutex_init(&ram_list.mutex); + /* The data structures we set up here depend on knowing the page size, + * so no more changes can be made after this point. + * In an ideal world, nothing we did before we had finished the + * machine setup would care about the target page size, and we could + * do this much later, rather than requiring board models to state + * up front what their requirements are. 
+ */ + finalize_target_page_bits(); io_mem_init(); memory_map_init(); qemu_mutex_init(&map_client_list_lock); @@ -2841,6 +3261,7 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_ if (!memory_access_is_direct(mr, is_write)) { l = memory_access_size(mr, l, addr); if (!memory_region_access_valid(mr, xlat, l, is_write)) { + rcu_read_unlock(); return false; } } @@ -2852,6 +3273,31 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_ return true; } +static hwaddr +address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_len, + MemoryRegion *mr, hwaddr base, hwaddr len, + bool is_write) +{ + hwaddr done = 0; + hwaddr xlat; + MemoryRegion *this_mr; + + for (;;) { + target_len -= len; + addr += len; + done += len; + if (target_len == 0) { + return done; + } + + len = target_len; + this_mr = address_space_translate(as, addr, &xlat, &len, is_write); + if (this_mr != mr || xlat != base + done) { + return done; + } + } +} + /* Map a physical memory region into a host virtual address. * May map a subset of the requested range, given by and returned in *plen. * May return NULL if resources needed to perform the mapping are exhausted. @@ -2865,9 +3311,8 @@ void *address_space_map(AddressSpace *as, bool is_write) { hwaddr len = *plen; - hwaddr done = 0; - hwaddr l, xlat, base; - MemoryRegion *mr, *this_mr; + hwaddr l, xlat; + MemoryRegion *mr; void *ptr; if (len == 0) { @@ -2901,26 +3346,10 @@ void *address_space_map(AddressSpace *as, return bounce.buffer; } - base = xlat; - - for (;;) { - len -= l; - addr += l; - done += l; - if (len == 0) { - break; - } - - l = len; - this_mr = address_space_translate(as, addr, &xlat, &l, is_write); - if (this_mr != mr || xlat != base + done) { - break; - } - } memory_region_ref(mr); - *plen = done; - ptr = qemu_ram_ptr_length(mr->ram_block, base, plen); + *plen = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write); + ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen); rcu_read_unlock(); return ptr; @@ -2972,597 +3401,51 @@ void cpu_physical_memory_unmap(void *buffer, hwaddr len, return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len); } -/* warning: addr must be aligned */ -static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, - MemTxResult *result, - enum device_endian endian) -{ - uint8_t *ptr; - uint64_t val; - MemoryRegion *mr; - hwaddr l = 4; - hwaddr addr1; - MemTxResult r; - bool release_lock = false; - - rcu_read_lock(); - mr = address_space_translate(as, addr, &addr1, &l, false); - if (l < 4 || !memory_access_is_direct(mr, false)) { - release_lock |= prepare_mmio_access(mr); - - /* I/O case */ - r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs); -#if defined(TARGET_WORDS_BIGENDIAN) - if (endian == DEVICE_LITTLE_ENDIAN) { - val = bswap32(val); - } -#else - if (endian == DEVICE_BIG_ENDIAN) { - val = bswap32(val); - } -#endif - } else { - /* RAM case */ - ptr = qemu_map_ram_ptr(mr->ram_block, addr1); - switch (endian) { - case DEVICE_LITTLE_ENDIAN: - val = ldl_le_p(ptr); - break; - case DEVICE_BIG_ENDIAN: - val = ldl_be_p(ptr); - break; - default: - val = ldl_p(ptr); - break; - } - r = MEMTX_OK; - } - if (result) { - *result = r; - } - if (release_lock) { - qemu_mutex_unlock_iothread(); - } - rcu_read_unlock(); - return val; -} - -uint32_t address_space_ldl(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_ldl_internal(as, addr, 
attrs, result, - DEVICE_NATIVE_ENDIAN); -} - -uint32_t address_space_ldl_le(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_ldl_internal(as, addr, attrs, result, - DEVICE_LITTLE_ENDIAN); -} - -uint32_t address_space_ldl_be(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_ldl_internal(as, addr, attrs, result, - DEVICE_BIG_ENDIAN); -} - -uint32_t ldl_phys(AddressSpace *as, hwaddr addr) -{ - return address_space_ldl(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); -} - -uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr) -{ - return address_space_ldl_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); -} - -uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr) -{ - return address_space_ldl_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); -} - -/* warning: addr must be aligned */ -static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, - MemTxResult *result, - enum device_endian endian) -{ - uint8_t *ptr; - uint64_t val; - MemoryRegion *mr; - hwaddr l = 8; - hwaddr addr1; - MemTxResult r; - bool release_lock = false; - - rcu_read_lock(); - mr = address_space_translate(as, addr, &addr1, &l, - false); - if (l < 8 || !memory_access_is_direct(mr, false)) { - release_lock |= prepare_mmio_access(mr); - - /* I/O case */ - r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs); -#if defined(TARGET_WORDS_BIGENDIAN) - if (endian == DEVICE_LITTLE_ENDIAN) { - val = bswap64(val); - } -#else - if (endian == DEVICE_BIG_ENDIAN) { - val = bswap64(val); - } -#endif - } else { - /* RAM case */ - ptr = qemu_map_ram_ptr(mr->ram_block, addr1); - switch (endian) { - case DEVICE_LITTLE_ENDIAN: - val = ldq_le_p(ptr); - break; - case DEVICE_BIG_ENDIAN: - val = ldq_be_p(ptr); - break; - default: - val = ldq_p(ptr); - break; - } - r = MEMTX_OK; - } - if (result) { - *result = r; - } - if (release_lock) { - qemu_mutex_unlock_iothread(); - } - rcu_read_unlock(); - return val; -} - -uint64_t address_space_ldq(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_ldq_internal(as, addr, attrs, result, - DEVICE_NATIVE_ENDIAN); -} - -uint64_t address_space_ldq_le(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_ldq_internal(as, addr, attrs, result, - DEVICE_LITTLE_ENDIAN); -} - -uint64_t address_space_ldq_be(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_ldq_internal(as, addr, attrs, result, - DEVICE_BIG_ENDIAN); -} +#define ARG1_DECL AddressSpace *as +#define ARG1 as +#define SUFFIX +#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__) +#define IS_DIRECT(mr, is_write) memory_access_is_direct(mr, is_write) +#define MAP_RAM(mr, ofs) qemu_map_ram_ptr((mr)->ram_block, ofs) +#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len) +#define RCU_READ_LOCK(...) rcu_read_lock() +#define RCU_READ_UNLOCK(...) 
rcu_read_unlock() +#include "memory_ldst.inc.c" -uint64_t ldq_phys(AddressSpace *as, hwaddr addr) +int64_t address_space_cache_init(MemoryRegionCache *cache, + AddressSpace *as, + hwaddr addr, + hwaddr len, + bool is_write) { - return address_space_ldq(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); + cache->len = len; + cache->as = as; + cache->xlat = addr; + return len; } -uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr) +void address_space_cache_invalidate(MemoryRegionCache *cache, + hwaddr addr, + hwaddr access_len) { - return address_space_ldq_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); } -uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr) +void address_space_cache_destroy(MemoryRegionCache *cache) { - return address_space_ldq_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); + cache->as = NULL; } -/* XXX: optimize */ -uint32_t address_space_ldub(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - uint8_t val; - MemTxResult r; - - r = address_space_rw(as, addr, attrs, &val, 1, 0); - if (result) { - *result = r; - } - return val; -} - -uint32_t ldub_phys(AddressSpace *as, hwaddr addr) -{ - return address_space_ldub(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); -} - -/* warning: addr must be aligned */ -static inline uint32_t address_space_lduw_internal(AddressSpace *as, - hwaddr addr, - MemTxAttrs attrs, - MemTxResult *result, - enum device_endian endian) -{ - uint8_t *ptr; - uint64_t val; - MemoryRegion *mr; - hwaddr l = 2; - hwaddr addr1; - MemTxResult r; - bool release_lock = false; - - rcu_read_lock(); - mr = address_space_translate(as, addr, &addr1, &l, - false); - if (l < 2 || !memory_access_is_direct(mr, false)) { - release_lock |= prepare_mmio_access(mr); - - /* I/O case */ - r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs); -#if defined(TARGET_WORDS_BIGENDIAN) - if (endian == DEVICE_LITTLE_ENDIAN) { - val = bswap16(val); - } -#else - if (endian == DEVICE_BIG_ENDIAN) { - val = bswap16(val); - } -#endif - } else { - /* RAM case */ - ptr = qemu_map_ram_ptr(mr->ram_block, addr1); - switch (endian) { - case DEVICE_LITTLE_ENDIAN: - val = lduw_le_p(ptr); - break; - case DEVICE_BIG_ENDIAN: - val = lduw_be_p(ptr); - break; - default: - val = lduw_p(ptr); - break; - } - r = MEMTX_OK; - } - if (result) { - *result = r; - } - if (release_lock) { - qemu_mutex_unlock_iothread(); - } - rcu_read_unlock(); - return val; -} - -uint32_t address_space_lduw(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_lduw_internal(as, addr, attrs, result, - DEVICE_NATIVE_ENDIAN); -} - -uint32_t address_space_lduw_le(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_lduw_internal(as, addr, attrs, result, - DEVICE_LITTLE_ENDIAN); -} - -uint32_t address_space_lduw_be(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, MemTxResult *result) -{ - return address_space_lduw_internal(as, addr, attrs, result, - DEVICE_BIG_ENDIAN); -} - -uint32_t lduw_phys(AddressSpace *as, hwaddr addr) -{ - return address_space_lduw(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); -} - -uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr) -{ - return address_space_lduw_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); -} - -uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr) -{ - return address_space_lduw_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL); -} - -/* warning: addr must be aligned. The ram page is not masked as dirty - and the code inside is not invalidated. 
It is useful if the dirty - bits are used to track modified PTEs */ -void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - uint8_t *ptr; - MemoryRegion *mr; - hwaddr l = 4; - hwaddr addr1; - MemTxResult r; - uint8_t dirty_log_mask; - bool release_lock = false; - - rcu_read_lock(); - mr = address_space_translate(as, addr, &addr1, &l, - true); - if (l < 4 || !memory_access_is_direct(mr, true)) { - release_lock |= prepare_mmio_access(mr); - - r = memory_region_dispatch_write(mr, addr1, val, 4, attrs); - } else { - ptr = qemu_map_ram_ptr(mr->ram_block, addr1); - stl_p(ptr, val); - - dirty_log_mask = memory_region_get_dirty_log_mask(mr); - dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE); - cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr, - 4, dirty_log_mask); - r = MEMTX_OK; - } - if (result) { - *result = r; - } - if (release_lock) { - qemu_mutex_unlock_iothread(); - } - rcu_read_unlock(); -} - -void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val) -{ - address_space_stl_notdirty(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -/* warning: addr must be aligned */ -static inline void address_space_stl_internal(AddressSpace *as, - hwaddr addr, uint32_t val, - MemTxAttrs attrs, - MemTxResult *result, - enum device_endian endian) -{ - uint8_t *ptr; - MemoryRegion *mr; - hwaddr l = 4; - hwaddr addr1; - MemTxResult r; - bool release_lock = false; - - rcu_read_lock(); - mr = address_space_translate(as, addr, &addr1, &l, - true); - if (l < 4 || !memory_access_is_direct(mr, true)) { - release_lock |= prepare_mmio_access(mr); - -#if defined(TARGET_WORDS_BIGENDIAN) - if (endian == DEVICE_LITTLE_ENDIAN) { - val = bswap32(val); - } -#else - if (endian == DEVICE_BIG_ENDIAN) { - val = bswap32(val); - } -#endif - r = memory_region_dispatch_write(mr, addr1, val, 4, attrs); - } else { - /* RAM case */ - ptr = qemu_map_ram_ptr(mr->ram_block, addr1); - switch (endian) { - case DEVICE_LITTLE_ENDIAN: - stl_le_p(ptr, val); - break; - case DEVICE_BIG_ENDIAN: - stl_be_p(ptr, val); - break; - default: - stl_p(ptr, val); - break; - } - invalidate_and_set_dirty(mr, addr1, 4); - r = MEMTX_OK; - } - if (result) { - *result = r; - } - if (release_lock) { - qemu_mutex_unlock_iothread(); - } - rcu_read_unlock(); -} - -void address_space_stl(AddressSpace *as, hwaddr addr, uint32_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - address_space_stl_internal(as, addr, val, attrs, result, - DEVICE_NATIVE_ENDIAN); -} - -void address_space_stl_le(AddressSpace *as, hwaddr addr, uint32_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - address_space_stl_internal(as, addr, val, attrs, result, - DEVICE_LITTLE_ENDIAN); -} - -void address_space_stl_be(AddressSpace *as, hwaddr addr, uint32_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - address_space_stl_internal(as, addr, val, attrs, result, - DEVICE_BIG_ENDIAN); -} - -void stl_phys(AddressSpace *as, hwaddr addr, uint32_t val) -{ - address_space_stl(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val) -{ - address_space_stl_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val) -{ - address_space_stl_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -/* XXX: optimize */ -void address_space_stb(AddressSpace *as, hwaddr addr, uint32_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - uint8_t v = val; - MemTxResult r; - - r = address_space_rw(as, 
addr, attrs, &v, 1, 1); - if (result) { - *result = r; - } -} - -void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val) -{ - address_space_stb(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -/* warning: addr must be aligned */ -static inline void address_space_stw_internal(AddressSpace *as, - hwaddr addr, uint32_t val, - MemTxAttrs attrs, - MemTxResult *result, - enum device_endian endian) -{ - uint8_t *ptr; - MemoryRegion *mr; - hwaddr l = 2; - hwaddr addr1; - MemTxResult r; - bool release_lock = false; - - rcu_read_lock(); - mr = address_space_translate(as, addr, &addr1, &l, true); - if (l < 2 || !memory_access_is_direct(mr, true)) { - release_lock |= prepare_mmio_access(mr); - -#if defined(TARGET_WORDS_BIGENDIAN) - if (endian == DEVICE_LITTLE_ENDIAN) { - val = bswap16(val); - } -#else - if (endian == DEVICE_BIG_ENDIAN) { - val = bswap16(val); - } -#endif - r = memory_region_dispatch_write(mr, addr1, val, 2, attrs); - } else { - /* RAM case */ - ptr = qemu_map_ram_ptr(mr->ram_block, addr1); - switch (endian) { - case DEVICE_LITTLE_ENDIAN: - stw_le_p(ptr, val); - break; - case DEVICE_BIG_ENDIAN: - stw_be_p(ptr, val); - break; - default: - stw_p(ptr, val); - break; - } - invalidate_and_set_dirty(mr, addr1, 2); - r = MEMTX_OK; - } - if (result) { - *result = r; - } - if (release_lock) { - qemu_mutex_unlock_iothread(); - } - rcu_read_unlock(); -} - -void address_space_stw(AddressSpace *as, hwaddr addr, uint32_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - address_space_stw_internal(as, addr, val, attrs, result, - DEVICE_NATIVE_ENDIAN); -} - -void address_space_stw_le(AddressSpace *as, hwaddr addr, uint32_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - address_space_stw_internal(as, addr, val, attrs, result, - DEVICE_LITTLE_ENDIAN); -} - -void address_space_stw_be(AddressSpace *as, hwaddr addr, uint32_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - address_space_stw_internal(as, addr, val, attrs, result, - DEVICE_BIG_ENDIAN); -} - -void stw_phys(AddressSpace *as, hwaddr addr, uint32_t val) -{ - address_space_stw(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val) -{ - address_space_stw_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val) -{ - address_space_stw_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -/* XXX: optimize */ -void address_space_stq(AddressSpace *as, hwaddr addr, uint64_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - MemTxResult r; - val = tswap64(val); - r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1); - if (result) { - *result = r; - } -} - -void address_space_stq_le(AddressSpace *as, hwaddr addr, uint64_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - MemTxResult r; - val = cpu_to_le64(val); - r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1); - if (result) { - *result = r; - } -} -void address_space_stq_be(AddressSpace *as, hwaddr addr, uint64_t val, - MemTxAttrs attrs, MemTxResult *result) -{ - MemTxResult r; - val = cpu_to_be64(val); - r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1); - if (result) { - *result = r; - } -} - -void stq_phys(AddressSpace *as, hwaddr addr, uint64_t val) -{ - address_space_stq(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val) -{ - address_space_stq_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} - -void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val) -{ - 
address_space_stq_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL); -} +#define ARG1_DECL MemoryRegionCache *cache +#define ARG1 cache +#define SUFFIX _cached +#define TRANSLATE(addr, ...) \ + address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__) +#define IS_DIRECT(mr, is_write) true +#define MAP_RAM(mr, ofs) qemu_map_ram_ptr((mr)->ram_block, ofs) +#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len) +#define RCU_READ_LOCK() rcu_read_lock() +#define RCU_READ_UNLOCK() rcu_read_unlock() +#include "memory_ldst.inc.c" /* virtual memory access for debug (includes writing to ROM) */ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, @@ -3572,6 +3455,7 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, hwaddr phys_addr; target_ulong page; + cpu_synchronize_state(cpu); while (len > 0) { int asidx; MemTxAttrs attrs; @@ -3605,11 +3489,20 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, * Allows code that needs to deal with migration bitmaps etc to still be built * target independent. */ -size_t qemu_target_page_bits(void) +size_t qemu_target_page_size(void) +{ + return TARGET_PAGE_SIZE; +} + +int qemu_target_page_bits(void) { return TARGET_PAGE_BITS; } +int qemu_target_page_bits_min(void) +{ + return TARGET_PAGE_BITS_MIN; +} #endif /* @@ -3648,7 +3541,7 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) int ret = 0; rcu_read_lock(); - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + RAMBLOCK_FOREACH(block) { ret = func(block->idstr, block->host, block->offset, block->used_length, opaque); if (ret) { @@ -3658,4 +3551,83 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) rcu_read_unlock(); return ret; } + +/* + * Unmap pages of memory from start to start+length such that + * they a) read as 0, b) Trigger whatever fault mechanism + * the OS provides for postcopy. + * The pages must be unmapped by the end of the function. + * Returns: 0 on success, none-0 on failure + * + */ +int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) +{ + int ret = -1; + + uint8_t *host_startaddr = rb->host + start; + + if ((uintptr_t)host_startaddr & (rb->page_size - 1)) { + error_report("ram_block_discard_range: Unaligned start address: %p", + host_startaddr); + goto err; + } + + if ((start + length) <= rb->used_length) { + uint8_t *host_endaddr = host_startaddr + length; + if ((uintptr_t)host_endaddr & (rb->page_size - 1)) { + error_report("ram_block_discard_range: Unaligned end address: %p", + host_endaddr); + goto err; + } + + errno = ENOTSUP; /* If we are missing MADVISE etc */ + + if (rb->page_size == qemu_host_page_size) { +#if defined(CONFIG_MADVISE) + /* Note: We need the madvise MADV_DONTNEED behaviour of definitely + * freeing the page. + */ + ret = madvise(host_startaddr, length, MADV_DONTNEED); +#endif + } else { + /* Huge page case - unfortunately it can't do DONTNEED, but + * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the + * huge page file. 
+ */ +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + start, length); +#endif + } + if (ret) { + ret = -errno; + error_report("ram_block_discard_range: Failed to discard range " + "%s:%" PRIx64 " +%zx (%d)", + rb->idstr, start, length, ret); + } + } else { + error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64 + "/%zx/" RAM_ADDR_FMT")", + rb->idstr, start, length, rb->used_length); + } + +err: + return ret; +} + #endif + +void page_size_init(void) +{ + /* NOTE: we can always suppose that qemu_host_page_size >= + TARGET_PAGE_SIZE */ + qemu_real_host_page_size = getpagesize(); + qemu_real_host_page_mask = -(intptr_t)qemu_real_host_page_size; + if (qemu_host_page_size == 0) { + qemu_host_page_size = qemu_real_host_page_size; + } + if (qemu_host_page_size < TARGET_PAGE_SIZE) { + qemu_host_page_size = TARGET_PAGE_SIZE; + } + qemu_host_page_mask = -(intptr_t)qemu_host_page_size; +}