X-Git-Url: https://repo.jachan.dev/qemu.git/blobdiff_plain/dfde4e6e1a868f60033ece0590b1f75e6c57fa16..31db5b3638553e616eba3391dbff88f77b8a5bc9:/exec.c

diff --git a/exec.c b/exec.c
index e3ac1a18fd..9ad0a4b045 100644
--- a/exec.c
+++ b/exec.c
@@ -31,6 +31,7 @@
 #include "hw/qdev.h"
 #include "qemu/osdep.h"
 #include "sysemu/kvm.h"
+#include "sysemu/sysemu.h"
 #include "hw/xen/xen.h"
 #include "qemu/timer.h"
 #include "qemu/config-file.h"
@@ -49,12 +50,15 @@
 #include "translate-all.h"
 
 #include "exec/memory-internal.h"
+#include "exec/ram_addr.h"
+#include "qemu/cache-utils.h"
+
+#include "qemu/range.h"
 
 //#define DEBUG_SUBPAGE
 
 #if !defined(CONFIG_USER_ONLY)
-int phys_ram_fd;
-static int in_migration;
+static bool in_migration;
 
 RAMList ram_list = { .blocks = QTAILQ_HEAD_INITIALIZER(ram_list.blocks) };
 
@@ -69,10 +73,10 @@ static MemoryRegion io_mem_unassigned;
 
 #endif
 
-CPUArchState *first_cpu;
+struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
 /* current CPU in the current thread. It is only valid
    inside cpu_exec() */
-DEFINE_TLS(CPUArchState *,cpu_single_env);
+DEFINE_TLS(CPUState *, current_cpu);
 /* 0 = Do not count executed instructions.
    1 = Precise instruction counting.
    2 = Adaptive rate instruction counting.  */
@@ -83,17 +87,39 @@ int use_icount;
 typedef struct PhysPageEntry PhysPageEntry;
 
 struct PhysPageEntry {
-    uint16_t is_leaf : 1;
-    /* index into phys_sections (is_leaf) or phys_map_nodes (!is_leaf) */
-    uint16_t ptr : 15;
+    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
+    uint32_t skip : 6;
+    /* index into phys_sections (!skip) or phys_map_nodes (skip) */
+    uint32_t ptr : 26;
 };
 
+#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
+
+/* Size of the L2 (and L3, etc) page tables.  */
+#define ADDR_SPACE_BITS 64
+
+#define P_L2_BITS 9
+#define P_L2_SIZE (1 << P_L2_BITS)
+
+#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
+
+typedef PhysPageEntry Node[P_L2_SIZE];
+
+typedef struct PhysPageMap {
+    unsigned sections_nb;
+    unsigned sections_nb_alloc;
+    unsigned nodes_nb;
+    unsigned nodes_nb_alloc;
+    Node *nodes;
+    MemoryRegionSection *sections;
+} PhysPageMap;
+
 struct AddressSpaceDispatch {
     /* This is a multi-level map on the physical address space.
      * The bottom level has pointers to MemoryRegionSections.
      */
     PhysPageEntry phys_map;
-    MemoryListener listener;
+    PhysPageMap map;
     AddressSpace *as;
 };
 
@@ -105,85 +131,73 @@ typedef struct subpage_t {
     uint16_t sub_section[TARGET_PAGE_SIZE];
 } subpage_t;
 
-static MemoryRegionSection *phys_sections;
-static unsigned phys_sections_nb, phys_sections_nb_alloc;
-static uint16_t phys_section_unassigned;
-static uint16_t phys_section_notdirty;
-static uint16_t phys_section_rom;
-static uint16_t phys_section_watch;
-
-/* Simple allocator for PhysPageEntry nodes */
-static PhysPageEntry (*phys_map_nodes)[L2_SIZE];
-static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc;
-
-#define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
+#define PHYS_SECTION_UNASSIGNED 0
+#define PHYS_SECTION_NOTDIRTY 1
+#define PHYS_SECTION_ROM 2
+#define PHYS_SECTION_WATCH 3
 
 static void io_mem_init(void);
 static void memory_map_init(void);
-static void *qemu_safe_ram_ptr(ram_addr_t addr);
 
 static MemoryRegion io_mem_watch;
 #endif
 
 #if !defined(CONFIG_USER_ONLY)
 
-static void phys_map_node_reserve(unsigned nodes)
+static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 {
-    if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
-        typedef PhysPageEntry Node[L2_SIZE];
-        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
-        phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
-                                      phys_map_nodes_nb + nodes);
-        phys_map_nodes = g_renew(Node, phys_map_nodes,
-                                 phys_map_nodes_nb_alloc);
+    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
+        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc * 2, 16);
+        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
+        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
     }
 }
 
-static uint16_t phys_map_node_alloc(void)
+static uint32_t phys_map_node_alloc(PhysPageMap *map)
 {
     unsigned i;
-    uint16_t ret;
+    uint32_t ret;
 
-    ret = phys_map_nodes_nb++;
+    ret = map->nodes_nb++;
     assert(ret != PHYS_MAP_NODE_NIL);
-    assert(ret != phys_map_nodes_nb_alloc);
-    for (i = 0; i < L2_SIZE; ++i) {
-        phys_map_nodes[ret][i].is_leaf = 0;
-        phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
+    assert(ret != map->nodes_nb_alloc);
+    for (i = 0; i < P_L2_SIZE; ++i) {
+        map->nodes[ret][i].skip = 1;
+        map->nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
     }
     return ret;
 }
 
-static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
-                                hwaddr *nb, uint16_t leaf,
+static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
+                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                 int level)
 {
     PhysPageEntry *p;
     int i;
-    hwaddr step = (hwaddr)1 << (level * L2_BITS);
+    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 
-    if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
-        lp->ptr = phys_map_node_alloc();
-        p = phys_map_nodes[lp->ptr];
+    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
+        lp->ptr = phys_map_node_alloc(map);
+        p = map->nodes[lp->ptr];
         if (level == 0) {
-            for (i = 0; i < L2_SIZE; i++) {
-                p[i].is_leaf = 1;
-                p[i].ptr = phys_section_unassigned;
+            for (i = 0; i < P_L2_SIZE; i++) {
+                p[i].skip = 0;
+                p[i].ptr = PHYS_SECTION_UNASSIGNED;
             }
         }
     } else {
-        p = phys_map_nodes[lp->ptr];
+        p = map->nodes[lp->ptr];
    }
-    lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
+    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 
-    while (*nb && lp < &p[L2_SIZE]) {
+    while (*nb && lp < &p[P_L2_SIZE]) {
         if ((*index & (step - 1)) == 0 && *nb >= step) {
-            lp->is_leaf = true;
+            lp->skip = 0;
             lp->ptr = leaf;
             *index += step;
             *nb -= step;
         } else {
-            phys_page_set_level(lp, index, nb, leaf, level - 1);
+            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
         }
         ++lp;
     }
@@ -194,25 +208,95 @@ static void phys_page_set(AddressSpaceDispatch *d,
                           uint16_t leaf)
 {
     /* Wildly overreserve - it doesn't matter much. */
-    phys_map_node_reserve(3 * P_L2_LEVELS);
+    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 
-    phys_page_set_level(&d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
+    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 }
 
-static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr index)
+/* Compact a non-leaf page entry. Simply detect that the entry has a single child,
+ * and update our entry so we can skip it and go directly to the destination.
+ */
+static void phys_page_compact(PhysPageEntry *lp, Node *nodes, unsigned long *compacted)
 {
-    PhysPageEntry lp = d->phys_map;
+    unsigned valid_ptr = P_L2_SIZE;
+    int valid = 0;
     PhysPageEntry *p;
     int i;
 
-    for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
+    if (lp->ptr == PHYS_MAP_NODE_NIL) {
+        return;
+    }
+
+    p = nodes[lp->ptr];
+    for (i = 0; i < P_L2_SIZE; i++) {
+        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
+            continue;
+        }
+
+        valid_ptr = i;
+        valid++;
+        if (p[i].skip) {
+            phys_page_compact(&p[i], nodes, compacted);
+        }
+    }
+
+    /* We can only compress if there's only one child. */
+    if (valid != 1) {
+        return;
+    }
+
+    assert(valid_ptr < P_L2_SIZE);
+
+    /* Don't compress if it won't fit in the # of bits we have. */
+    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
+        return;
+    }
+
+    lp->ptr = p[valid_ptr].ptr;
+    if (!p[valid_ptr].skip) {
+        /* If our only child is a leaf, make this a leaf. */
+        /* By design, we should have made this node a leaf to begin with so we
+         * should never reach here.
+         * But since it's so simple to handle this, let's do it just in case we
+         * change this rule.
+         */
+        lp->skip = 0;
+    } else {
+        lp->skip += p[valid_ptr].skip;
+    }
+}
+
+static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
+{
+    DECLARE_BITMAP(compacted, nodes_nb);
+
+    if (d->phys_map.skip) {
+        phys_page_compact(&d->phys_map, d->map.nodes, compacted);
+    }
+}
+
+static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
+                                           Node *nodes, MemoryRegionSection *sections)
+{
+    PhysPageEntry *p;
+    hwaddr index = addr >> TARGET_PAGE_BITS;
+    int i;
+
+    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
         if (lp.ptr == PHYS_MAP_NODE_NIL) {
-            return &phys_sections[phys_section_unassigned];
+            return &sections[PHYS_SECTION_UNASSIGNED];
         }
-        p = phys_map_nodes[lp.ptr];
-        lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
+        p = nodes[lp.ptr];
+        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
+    }
+
+    if (sections[lp.ptr].size.hi ||
+        range_covers_byte(sections[lp.ptr].offset_within_address_space,
+                          sections[lp.ptr].size.lo, addr)) {
+        return &sections[lp.ptr];
+    } else {
+        return &sections[PHYS_SECTION_UNASSIGNED];
     }
-    return &phys_sections[lp.ptr];
 }
 
 bool memory_region_is_unassigned(MemoryRegion *mr)
@@ -221,36 +305,38 @@ bool memory_region_is_unassigned(MemoryRegion *mr)
         && mr != &io_mem_watch;
 }
 
-static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
+static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                         hwaddr addr,
                                                         bool resolve_subpage)
 {
     MemoryRegionSection *section;
     subpage_t *subpage;
 
-    section = phys_page_find(as->dispatch, addr >> TARGET_PAGE_BITS);
+    section = phys_page_find(d->phys_map, addr, d->map.nodes, d->map.sections);
     if (resolve_subpage && section->mr->subpage) {
         subpage = container_of(section->mr, subpage_t, iomem);
-        section = &phys_sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
+        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
     }
     return section;
 }
 
 static MemoryRegionSection *
-address_space_translate_internal(AddressSpace *as, hwaddr addr, hwaddr *xlat,
+address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
                                  hwaddr *plen, bool resolve_subpage)
 {
     MemoryRegionSection *section;
-    Int128 diff;
+    Int128 diff, diff_page;
 
-    section = address_space_lookup_region(as, addr, resolve_subpage);
+    section = address_space_lookup_region(d, addr, resolve_subpage);
     /* Compute offset within MemoryRegionSection */
     addr -= section->offset_within_address_space;
 
     /* Compute offset within MemoryRegion */
     *xlat = addr + section->offset_within_region;
 
+    diff_page = int128_make64(((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr);
     diff = int128_sub(section->mr->size, int128_make64(addr));
+    diff = int128_min(diff, diff_page);
     *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
     return section;
 }
@@ -265,7 +351,7 @@ MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
     hwaddr len = *plen;
 
     for (;;) {
-        section = address_space_translate_internal(as, addr, &addr, plen, true);
+        section = address_space_translate_internal(as->dispatch, addr, &addr, &len, true);
         mr = section->mr;
 
         if (!mr->iommu_ops) {
@@ -294,7 +380,7 @@ address_space_translate_for_iotlb(AddressSpace *as, hwaddr addr, hwaddr *xlat,
                                   hwaddr *plen)
 {
     MemoryRegionSection *section;
-    section = address_space_translate_internal(as, addr, xlat, plen, false);
+    section = address_space_translate_internal(as->dispatch, addr, xlat, plen, false);
 
     assert(!section->mr->iommu_ops);
     return section;
@@ -341,45 +427,29 @@ const VMStateDescription vmstate_cpu_common = {
 
 CPUState *qemu_get_cpu(int index)
 {
-    CPUArchState *env = first_cpu;
-    CPUState *cpu = NULL;
+    CPUState *cpu;
 
-    while (env) {
-        cpu = ENV_GET_CPU(env);
+    CPU_FOREACH(cpu) {
         if (cpu->cpu_index == index) {
-            break;
+            return cpu;
         }
-        env = env->next_cpu;
     }
 
-    return env ? cpu : NULL;
-}
-
-void qemu_for_each_cpu(void (*func)(CPUState *cpu, void *data), void *data)
-{
-    CPUArchState *env = first_cpu;
-
-    while (env) {
-        func(ENV_GET_CPU(env), data);
-        env = env->next_cpu;
-    }
+    return NULL;
 }
 
 void cpu_exec_init(CPUArchState *env)
 {
     CPUState *cpu = ENV_GET_CPU(env);
     CPUClass *cc = CPU_GET_CLASS(cpu);
-    CPUArchState **penv;
+    CPUState *some_cpu;
     int cpu_index;
 
 #if defined(CONFIG_USER_ONLY)
     cpu_list_lock();
 #endif
-    env->next_cpu = NULL;
-    penv = &first_cpu;
     cpu_index = 0;
-    while (*penv != NULL) {
-        penv = &(*penv)->next_cpu;
+    CPU_FOREACH(some_cpu) {
         cpu_index++;
     }
     cpu->cpu_index = cpu_index;
@@ -389,15 +459,18 @@ void cpu_exec_init(CPUArchState *env)
 #ifndef CONFIG_USER_ONLY
     cpu->thread_id = qemu_get_thread_id();
 #endif
-    *penv = env;
+    QTAILQ_INSERT_TAIL(&cpus, cpu, node);
 #if defined(CONFIG_USER_ONLY)
     cpu_list_unlock();
 #endif
-    vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
+    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
+        vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
+    }
 #if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY)
     register_savevm(NULL, "cpu", cpu_index, CPU_SAVE_VERSION,
                     cpu_save, cpu_load, env);
     assert(cc->vmsd == NULL);
+    assert(qdev_get_vmsd(DEVICE(cpu)) == NULL);
 #endif
     if (cc->vmsd != NULL) {
         vmstate_register(NULL, cpu_index, cc->vmsd, cpu);
@@ -406,15 +479,17 @@ void cpu_exec_init(CPUArchState *env)
 
 #if defined(TARGET_HAS_ICE)
 #if defined(CONFIG_USER_ONLY)
-static void breakpoint_invalidate(CPUArchState *env, target_ulong pc)
+static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 {
     tb_invalidate_phys_page_range(pc, pc + 1, 0);
 }
 #else
-static void breakpoint_invalidate(CPUArchState *env, target_ulong pc)
+static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 {
-    tb_invalidate_phys_addr(cpu_get_phys_page_debug(env, pc) |
-                            (pc & ~TARGET_PAGE_MASK));
+    hwaddr phys = cpu_get_phys_page_debug(cpu, pc);
+    if (phys != -1) {
+        tb_invalidate_phys_addr(phys | (pc & ~TARGET_PAGE_MASK));
+    }
 }
 #endif
 #endif /* TARGET_HAS_ICE */
@@ -516,15 +591,17 @@ int cpu_breakpoint_insert(CPUArchState *env, target_ulong pc, int flags,
     bp->flags = flags;
 
     /* keep all GDB-injected breakpoints in front */
-    if (flags & BP_GDB)
+    if (flags & BP_GDB) {
         QTAILQ_INSERT_HEAD(&env->breakpoints, bp, entry);
-    else
+    } else {
         QTAILQ_INSERT_TAIL(&env->breakpoints, bp, entry);
+    }
 
-    breakpoint_invalidate(env, pc);
+    breakpoint_invalidate(ENV_GET_CPU(env), pc);
 
-    if (breakpoint)
+    if (breakpoint) {
         *breakpoint = bp;
+    }
     return 0;
 #else
     return -ENOSYS;
@@ -555,7 +632,7 @@ void cpu_breakpoint_remove_by_ref(CPUArchState *env, CPUBreakpoint *breakpoint)
 #if defined(TARGET_HAS_ICE)
     QTAILQ_REMOVE(&env->breakpoints, breakpoint, entry);
 
-    breakpoint_invalidate(env, breakpoint->pc);
+    breakpoint_invalidate(ENV_GET_CPU(env), breakpoint->pc);
 
     g_free(breakpoint);
 #endif
@@ -576,16 +653,17 @@ void cpu_breakpoint_remove_all(CPUArchState *env, int mask)
 
 /* enable or disable single step mode. EXCP_DEBUG is returned by the
    CPU loop after each instruction */
-void cpu_single_step(CPUArchState *env, int enabled)
+void cpu_single_step(CPUState *cpu, int enabled)
 {
 #if defined(TARGET_HAS_ICE)
-    if (env->singlestep_enabled != enabled) {
-        env->singlestep_enabled = enabled;
-        if (kvm_enabled())
-            kvm_update_guest_debug(env, 0);
-        else {
+    if (cpu->singlestep_enabled != enabled) {
+        cpu->singlestep_enabled = enabled;
+        if (kvm_enabled()) {
+            kvm_update_guest_debug(cpu, 0);
+        } else {
             /* must flush all the translated code to avoid inconsistencies */
             /* XXX: only flush what is necessary */
+            CPUArchState *env = cpu->env_ptr;
             tb_flush(env);
         }
     }
@@ -608,7 +686,7 @@ void cpu_abort(CPUArchState *env, const char *fmt, ...)
         qemu_log("qemu: fatal: ");
         qemu_log_vprintf(fmt, ap2);
         qemu_log("\n");
-        log_cpu_state(env, CPU_DUMP_FPU | CPU_DUMP_CCOP);
+        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
         qemu_log_flush();
         qemu_log_close();
     }
@@ -625,81 +703,61 @@ void cpu_abort(CPUArchState *env, const char *fmt, ...)
     abort();
 }
 
-CPUArchState *cpu_copy(CPUArchState *env)
+#if !defined(CONFIG_USER_ONLY)
+static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
 {
-    CPUArchState *new_env = cpu_init(env->cpu_model_str);
-    CPUArchState *next_cpu = new_env->next_cpu;
-#if defined(TARGET_HAS_ICE)
-    CPUBreakpoint *bp;
-    CPUWatchpoint *wp;
-#endif
-
-    memcpy(new_env, env, sizeof(CPUArchState));
-
-    /* Preserve chaining. */
-    new_env->next_cpu = next_cpu;
+    RAMBlock *block;
 
-    /* Clone all break/watchpoints.
-       Note: Once we support ptrace with hw-debug register access, make sure
-       BP_CPU break/watchpoints are handled correctly on clone. */
-    QTAILQ_INIT(&env->breakpoints);
-    QTAILQ_INIT(&env->watchpoints);
-#if defined(TARGET_HAS_ICE)
-    QTAILQ_FOREACH(bp, &env->breakpoints, entry) {
-        cpu_breakpoint_insert(new_env, bp->pc, bp->flags, NULL);
+    /* The list is protected by the iothread lock here.  */
+    block = ram_list.mru_block;
+    if (block && addr - block->offset < block->length) {
+        goto found;
     }
-    QTAILQ_FOREACH(wp, &env->watchpoints, entry) {
-        cpu_watchpoint_insert(new_env, wp->vaddr, (~wp->len_mask) + 1,
-                              wp->flags, NULL);
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        if (addr - block->offset < block->length) {
+            goto found;
+        }
     }
-#endif
-
-    return new_env;
-}
 
-#if !defined(CONFIG_USER_ONLY)
-static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t end,
-                                      uintptr_t length)
-{
-    uintptr_t start1;
-
-    /* we modify the TLB cache so that the dirty bit will be set again
-       when accessing the range */
-    start1 = (uintptr_t)qemu_safe_ram_ptr(start);
-    /* Check that we don't span multiple blocks - this breaks the
-       address comparisons below.  */
-    if ((uintptr_t)qemu_safe_ram_ptr(end - 1) - start1
-            != (end - 1) - start) {
-        abort();
-    }
-    cpu_tlb_reset_dirty_all(start1, length);
+    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
+    abort();
+
+found:
+    ram_list.mru_block = block;
+    return block;
 }
 
-/* Note: start and end must be within the same ram block.  */
-void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
-                                     int dirty_flags)
+static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
 {
-    uintptr_t length;
+    ram_addr_t start1;
+    RAMBlock *block;
+    ram_addr_t end;
 
+    end = TARGET_PAGE_ALIGN(start + length);
     start &= TARGET_PAGE_MASK;
-    end = TARGET_PAGE_ALIGN(end);
-    length = end - start;
+    block = qemu_get_ram_block(start);
+    assert(block == qemu_get_ram_block(end - 1));
+    start1 = (uintptr_t)block->host + (start - block->offset);
+    cpu_tlb_reset_dirty_all(start1, length);
+}
+
+/* Note: start and end must be within the same ram block.  */
+void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t length,
+                                     unsigned client)
+{
     if (length == 0)
         return;
-    cpu_physical_memory_mask_dirty_range(start, length, dirty_flags);
+    cpu_physical_memory_clear_dirty_range(start, length, client);
 
     if (tcg_enabled()) {
-        tlb_reset_dirty_range_all(start, end, length);
+        tlb_reset_dirty_range_all(start, length);
     }
 }
 
-static int cpu_physical_memory_set_dirty_tracking(int enable)
+static void cpu_physical_memory_set_dirty_tracking(bool enable)
 {
-    int ret = 0;
     in_migration = enable;
-    return ret;
 }
 
 hwaddr memory_region_section_get_iotlb(CPUArchState *env,
@@ -717,12 +775,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
         iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
             + xlat;
         if (!section->readonly) {
-            iotlb |= phys_section_notdirty;
+            iotlb |= PHYS_SECTION_NOTDIRTY;
         } else {
-            iotlb |= phys_section_rom;
+            iotlb |= PHYS_SECTION_ROM;
         }
     } else {
-        iotlb = section - phys_sections;
+        iotlb = section - address_space_memory.dispatch->map.sections;
         iotlb += xlat;
     }
 
@@ -732,7 +790,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
         if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
             /* Avoid trapping reads of pages with a write breakpoint. */
             if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
-                iotlb = phys_section_watch + paddr;
+                iotlb = PHYS_SECTION_WATCH + paddr;
                 *address |= TLB_MMIO;
                 break;
             }
@@ -749,22 +807,35 @@ static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                              uint16_t section);
 static subpage_t *subpage_init(AddressSpace *as, hwaddr base);
 
-static uint16_t phys_section_add(MemoryRegionSection *section)
+static void *(*phys_mem_alloc)(size_t size) = qemu_anon_ram_alloc;
+
+/*
+ * Set a custom physical guest memory allocator.
+ * Accelerators with unusual needs may need this.  Hopefully, we can
+ * get rid of it eventually.
+ */
+void phys_mem_set_alloc(void *(*alloc)(size_t))
+{
+    phys_mem_alloc = alloc;
+}
+
+static uint16_t phys_section_add(PhysPageMap *map,
+                                 MemoryRegionSection *section)
 {
     /* The physical section number is ORed with a page-aligned
      * pointer to produce the iotlb entries.  Thus it should
     * never overflow into the page-aligned value.
     */
-    assert(phys_sections_nb < TARGET_PAGE_SIZE);
+    assert(map->sections_nb < TARGET_PAGE_SIZE);
 
-    if (phys_sections_nb == phys_sections_nb_alloc) {
-        phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
-        phys_sections = g_renew(MemoryRegionSection, phys_sections,
-                                phys_sections_nb_alloc);
+    if (map->sections_nb == map->sections_nb_alloc) {
+        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
+        map->sections = g_renew(MemoryRegionSection, map->sections,
+                                map->sections_nb_alloc);
     }
-    phys_sections[phys_sections_nb] = *section;
+    map->sections[map->sections_nb] = *section;
     memory_region_ref(section->mr);
-    return phys_sections_nb++;
+    return map->sections_nb++;
 }
 
 static void phys_section_destroy(MemoryRegion *mr)
@@ -778,13 +849,14 @@ static void phys_section_destroy(MemoryRegion *mr)
     }
 }
 
-static void phys_sections_clear(void)
+static void phys_sections_free(PhysPageMap *map)
 {
-    while (phys_sections_nb > 0) {
-        MemoryRegionSection *section = &phys_sections[--phys_sections_nb];
+    while (map->sections_nb > 0) {
+        MemoryRegionSection *section = &map->sections[--map->sections_nb];
         phys_section_destroy(section->mr);
     }
-    phys_map_nodes_nb = 0;
+    g_free(map->sections);
+    g_free(map->nodes);
 }
 
 static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
@@ -792,7 +864,8 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSecti
     subpage_t *subpage;
     hwaddr base = section->offset_within_address_space
         & TARGET_PAGE_MASK;
-    MemoryRegionSection *existing = phys_page_find(d, base >> TARGET_PAGE_BITS);
+    MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
+                                                   d->map.nodes, d->map.sections);
     MemoryRegionSection subsection = {
         .offset_within_address_space = base,
         .size = int128_make64(TARGET_PAGE_SIZE),
@@ -805,13 +878,14 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSecti
         subpage = subpage_init(d->as, base);
         subsection.mr = &subpage->iomem;
         phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
-                      phys_section_add(&subsection));
+                      phys_section_add(&d->map, &subsection));
     } else {
         subpage = container_of(existing->mr, subpage_t, iomem);
     }
     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
     end = start + int128_get64(section->size) - 1;
-    subpage_register(subpage, start, end, phys_section_add(section));
+    subpage_register(subpage, start, end,
+                     phys_section_add(&d->map, section));
 }
 
 
 static void register_multipage(AddressSpaceDispatch *d,
                                MemoryRegionSection *section)
 {
     hwaddr start_addr = section->offset_within_address_space;
-    uint16_t section_index = phys_section_add(section);
+    uint16_t section_index = phys_section_add(&d->map, section);
     uint64_t num_pages = int128_get64(int128_rshift(section->size,
                                                     TARGET_PAGE_BITS));
 
@@ -829,7 +903,8 @@ static void register_multipage(AddressSpaceDispatch *d,
 
 static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
 {
-    AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
+    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
+    AddressSpaceDispatch *d = as->next_dispatch;
     MemoryRegionSection now = *section, remain = *section;
     Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
 
@@ -849,7 +924,7 @@ static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
         now = remain;
         if (int128_lt(remain.size, page_size)) {
             register_subpage(d, &now);
-        } else if (remain.offset_within_region & ~TARGET_PAGE_MASK) {
+        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
             now.size = page_size;
             register_subpage(d, &now);
         } else {
@@ -875,7 +950,7 @@ void qemu_mutex_unlock_ramlist(void)
     qemu_mutex_unlock(&ram_list.mutex);
 }
 
-#if defined(__linux__) && !defined(TARGET_S390X)
+#ifdef __linux__
 
 #include <sys/vfs.h>
 
@@ -901,6 +976,13 @@ static long gethugepagesize(const char *path)
     return fs.f_bsize;
 }
 
+static sigjmp_buf sigjump;
+
+static void sigbus_handler(int signal)
+{
+    siglongjmp(sigjump, 1);
+}
+
 static void *file_ram_alloc(RAMBlock *block,
                             ram_addr_t memory,
                             const char *path)
@@ -910,9 +992,6 @@ static void *file_ram_alloc(RAMBlock *block,
     char *c;
     void *area;
     int fd;
-#ifdef MAP_POPULATE
-    int flags;
-#endif
     unsigned long hpagesize;
 
     hpagesize = gethugepagesize(path);
@@ -960,24 +1039,63 @@ static void *file_ram_alloc(RAMBlock *block,
     if (ftruncate(fd, memory))
         perror("ftruncate");
 
-#ifdef MAP_POPULATE
-    /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case
-     * MAP_PRIVATE is requested.  For mem_prealloc we mmap as MAP_SHARED
-     * to sidestep this quirk.
-     */
-    flags = mem_prealloc ? MAP_POPULATE | MAP_SHARED : MAP_PRIVATE;
-    area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0);
-#else
     area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-#endif
     if (area == MAP_FAILED) {
         perror("file_ram_alloc: can't mmap RAM pages");
         close(fd);
         return (NULL);
     }
+
+    if (mem_prealloc) {
+        int ret, i;
+        struct sigaction act, oldact;
+        sigset_t set, oldset;
+
+        memset(&act, 0, sizeof(act));
+        act.sa_handler = &sigbus_handler;
+        act.sa_flags = 0;
+
+        ret = sigaction(SIGBUS, &act, &oldact);
+        if (ret) {
+            perror("file_ram_alloc: failed to install signal handler");
+            exit(1);
+        }
+
+        /* unblock SIGBUS */
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
+
+        if (sigsetjmp(sigjump, 1)) {
+            fprintf(stderr, "file_ram_alloc: failed to preallocate pages\n");
+            exit(1);
+        }
+
+        /* MAP_POPULATE silently ignores failures */
+        for (i = 0; i < (memory/hpagesize); i++) {
+            memset(area + (hpagesize*i), 0, 1);
+        }
+
+        ret = sigaction(SIGBUS, &oldact, NULL);
+        if (ret) {
+            perror("file_ram_alloc: failed to reinstall signal handler");
+            exit(1);
+        }
+
+        pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+    }
+
     block->fd = fd;
     return area;
 }
+#else
+static void *file_ram_alloc(RAMBlock *block,
+                            ram_addr_t memory,
+                            const char *path)
+{
+    fprintf(stderr, "-mem-path not supported on this host\n");
+    exit(1);
+}
 #endif
 
 static ram_addr_t find_ram_offset(ram_addr_t size)
@@ -1029,12 +1147,10 @@ ram_addr_t last_ram_offset(void)
 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
 {
     int ret;
-    QemuOpts *machine_opts;
 
     /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
-    machine_opts = qemu_opts_find(qemu_find_opts("machine"), 0);
-    if (machine_opts &&
-        !qemu_opt_get_bool(machine_opts, "dump-guest-core", true)) {
+    if (!qemu_opt_get_bool(qemu_get_machine_opts(),
+                           "dump-guest-core", true)) {
         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
         if (ret) {
             perror("qemu_madvise");
@@ -1081,10 +1197,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
 
 static int memory_try_enable_merging(void *addr, size_t len)
 {
-    QemuOpts *opts;
-
-    opts = qemu_opts_find(qemu_find_opts("machine"), 0);
-    if (opts && !qemu_opt_get_bool(opts, "mem-merge", true)) {
+    if (!qemu_opt_get_bool(qemu_get_machine_opts(), "mem-merge", true)) {
         /* disabled by the user */
         return 0;
     }
@@ -1096,9 +1209,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                    MemoryRegion *mr)
 {
     RAMBlock *block, *new_block;
+    ram_addr_t old_ram_size, new_ram_size;
+
+    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
 
     size = TARGET_PAGE_ALIGN(size);
     new_block = g_malloc0(sizeof(*new_block));
+    new_block->fd = -1;
 
     /* This assumes the iothread lock is taken here too.  */
     qemu_mutex_lock_ramlist();
@@ -1107,26 +1224,32 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
     if (host) {
         new_block->host = host;
         new_block->flags |= RAM_PREALLOC_MASK;
+    } else if (xen_enabled()) {
+        if (mem_path) {
+            fprintf(stderr, "-mem-path not supported with Xen\n");
+            exit(1);
+        }
+        xen_ram_alloc(new_block->offset, size, mr);
     } else {
         if (mem_path) {
-#if defined (__linux__) && !defined(TARGET_S390X)
+            if (phys_mem_alloc != qemu_anon_ram_alloc) {
+                /*
+                 * file_ram_alloc() needs to allocate just like
+                 * phys_mem_alloc, but we haven't bothered to provide
+                 * a hook there.
+                 */
+                fprintf(stderr,
+                        "-mem-path not supported with this accelerator\n");
+                exit(1);
+            }
             new_block->host = file_ram_alloc(new_block, size, mem_path);
+        }
+        if (!new_block->host) {
+            new_block->host = phys_mem_alloc(size);
             if (!new_block->host) {
-                new_block->host = qemu_anon_ram_alloc(size);
-                memory_try_enable_merging(new_block->host, size);
-            }
-#else
-            fprintf(stderr, "-mem-path option unsupported\n");
-            exit(1);
-#endif
-        } else {
-            if (xen_enabled()) {
-                xen_ram_alloc(new_block->offset, size, mr);
-            } else if (kvm_enabled()) {
-                /* some s390/kvm configurations have special constraints */
-                new_block->host = kvm_ram_alloc(size);
-            } else {
-                new_block->host = qemu_anon_ram_alloc(size);
+                fprintf(stderr, "Cannot set up guest memory '%s': %s\n",
+                        new_block->mr->name, strerror(errno));
+                exit(1);
             }
             memory_try_enable_merging(new_block->host, size);
         }
@@ -1149,14 +1272,21 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
     ram_list.version++;
     qemu_mutex_unlock_ramlist();
 
-    ram_list.phys_dirty = g_realloc(ram_list.phys_dirty,
-                                    last_ram_offset() >> TARGET_PAGE_BITS);
-    memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
-           0, size >> TARGET_PAGE_BITS);
-    cpu_physical_memory_set_dirty_range(new_block->offset, size, 0xff);
+    new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    if (new_ram_size > old_ram_size) {
+        int i;
+        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
+            ram_list.dirty_memory[i] =
+                bitmap_zero_extend(ram_list.dirty_memory[i],
+                                   old_ram_size, new_ram_size);
+        }
+    }
+    cpu_physical_memory_set_dirty_range(new_block->offset, size);
 
     qemu_ram_setup_dump(new_block->host, size);
     qemu_madvise(new_block->host, size, QEMU_MADV_HUGEPAGE);
+    qemu_madvise(new_block->host, size, QEMU_MADV_DONTFORK);
 
     if (kvm_enabled())
         kvm_setup_guest_memory(new_block->host, size);
@@ -1200,23 +1330,15 @@ void qemu_ram_free(ram_addr_t addr)
             ram_list.version++;
             if (block->flags & RAM_PREALLOC_MASK) {
                 ;
-            } else if (mem_path) {
-#if defined (__linux__) && !defined(TARGET_S390X)
-                if (block->fd) {
-                    munmap(block->host, block->length);
-                    close(block->fd);
-                } else {
-                    qemu_anon_ram_free(block->host, block->length);
-                }
-#else
-                abort();
+            } else if (xen_enabled()) {
+                xen_invalidate_map_cache_entry(block->host);
+#ifndef _WIN32
+            } else if (block->fd >= 0) {
+                munmap(block->host, block->length);
+                close(block->fd);
 #endif
             } else {
-                if (xen_enabled()) {
-                    xen_invalidate_map_cache_entry(block->host);
-                } else {
-                    qemu_anon_ram_free(block->host, block->length);
-                }
+                qemu_anon_ram_free(block->host, block->length);
             }
             g_free(block);
             break;
@@ -1240,38 +1362,31 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
                 vaddr = block->host + offset;
                 if (block->flags & RAM_PREALLOC_MASK) {
                     ;
+                } else if (xen_enabled()) {
+                    abort();
                 } else {
                     flags = MAP_FIXED;
                     munmap(vaddr, length);
-                    if (mem_path) {
-#if defined(__linux__) && !defined(TARGET_S390X)
-                        if (block->fd) {
+                    if (block->fd >= 0) {
 #ifdef MAP_POPULATE
-                            flags |= mem_prealloc ? MAP_POPULATE | MAP_SHARED :
-                                MAP_PRIVATE;
+                        flags |= mem_prealloc ? MAP_POPULATE | MAP_SHARED :
+                            MAP_PRIVATE;
 #else
-                            flags |= MAP_PRIVATE;
-#endif
-                            area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
-                                        flags, block->fd, offset);
-                        } else {
-                            flags |= MAP_PRIVATE | MAP_ANONYMOUS;
-                            area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
-                                        flags, -1, 0);
-                        }
-#else
-                        abort();
+                        flags |= MAP_PRIVATE;
 #endif
+                        area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
+                                    flags, block->fd, offset);
                     } else {
-#if defined(TARGET_S390X) && defined(CONFIG_KVM)
-                        flags |= MAP_SHARED | MAP_ANONYMOUS;
-                        area = mmap(vaddr, length, PROT_EXEC|PROT_READ|PROT_WRITE,
-                                    flags, -1, 0);
-#else
+                        /*
+                         * Remap needs to match alloc.  Accelerators that
+                         * set phys_mem_alloc never remap.  If they did,
+                         * we'd need a remap hook here.
+                         */
+                        assert(phys_mem_alloc == qemu_anon_ram_alloc);
+
+                        flags |= MAP_PRIVATE | MAP_ANONYMOUS;
                         area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                     flags, -1, 0);
-#endif
                     }
                     if (area != vaddr) {
                         fprintf(stderr, "Could not remap addr: "
@@ -1298,24 +1413,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
  */
 void *qemu_get_ram_ptr(ram_addr_t addr)
 {
-    RAMBlock *block;
+    RAMBlock *block = qemu_get_ram_block(addr);
 
-    /* The list is protected by the iothread lock here.  */
-    block = ram_list.mru_block;
-    if (block && addr - block->offset < block->length) {
-        goto found;
-    }
-    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (addr - block->offset < block->length) {
-            goto found;
-        }
-    }
-
-    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
-    abort();
-
-found:
-    ram_list.mru_block = block;
     if (xen_enabled()) {
         /* We need to check if the requested address is in the RAM
          * because we don't want to map the entire memory in QEMU.
@@ -1331,43 +1430,9 @@ found:
     return block->host + (addr - block->offset);
 }
 
-/* Return a host pointer to ram allocated with qemu_ram_alloc.  Same as
- * qemu_get_ram_ptr but do not touch ram_list.mru_block.
- *
- * ??? Is this still necessary?
- */
-static void *qemu_safe_ram_ptr(ram_addr_t addr)
-{
-    RAMBlock *block;
-
-    /* The list is protected by the iothread lock here.  */
-    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (addr - block->offset < block->length) {
-            if (xen_enabled()) {
-                /* We need to check if the requested address is in the RAM
-                 * because we don't want to map the entire memory in QEMU.
-                 * In that case just map until the end of the page.
-                 */
-                if (block->offset == 0) {
-                    return xen_map_cache(addr, 0, 0);
-                } else if (block->host == NULL) {
-                    block->host =
-                        xen_map_cache(block->offset, block->length, 1);
-                }
-            }
-            return block->host + (addr - block->offset);
-        }
-    }
-
-    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
-    abort();
-
-    return NULL;
-}
-
 /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
  * but takes a size argument */
-static void *qemu_ram_ptr_length(ram_addr_t addr, ram_addr_t *size)
+static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
 {
     if (*size == 0) {
         return NULL;
@@ -1390,14 +1455,21 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, ram_addr_t *size)
     }
 }
 
-int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     if (xen_enabled()) {
         *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        return 0;
+        return qemu_get_ram_block(*ram_addr)->mr;
+    }
+
+    block = ram_list.mru_block;
+    if (block && block->host && host - block->host < block->length) {
+        goto found;
     }
 
     QTAILQ_FOREACH(block, &ram_list.blocks, next) {
@@ -1406,35 +1478,22 @@ int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
             continue;
         }
         if (host - block->host < block->length) {
-            *ram_addr = block->offset + (host - block->host);
-            return 0;
+            goto found;
         }
     }
 
-    return -1;
-}
-
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
-{
-    ram_addr_t ram_addr;
+    return NULL;
 
-    if (qemu_ram_addr_from_host(ptr, &ram_addr)) {
-        fprintf(stderr, "Bad ram pointer %p\n", ptr);
-        abort();
-    }
-    return ram_addr;
+found:
+    *ram_addr = block->offset + (host - block->host);
+    return block->mr;
 }
 
 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
                                uint64_t val, unsigned size)
 {
-    int dirty_flags;
-    dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
-    if (!(dirty_flags & CODE_DIRTY_FLAG)) {
+    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
         tb_invalidate_phys_page_fast(ram_addr, size);
-        dirty_flags = cpu_physical_memory_get_dirty_flags(ram_addr);
     }
     switch (size) {
     case 1:
@@ -1449,12 +1508,14 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
     default:
         abort();
     }
-    dirty_flags |= (0xff & ~CODE_DIRTY_FLAG);
-    cpu_physical_memory_set_dirty_flags(ram_addr, dirty_flags);
+    cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_MIGRATION);
+    cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_VGA);
     /* we remove the notdirty callback only if the code has been
       flushed */
-    if (dirty_flags == 0xff)
-        tlb_set_dirty(cpu_single_env, cpu_single_env->mem_io_vaddr);
+    if (!cpu_physical_memory_is_clean(ram_addr)) {
+        CPUArchState *env = current_cpu->env_ptr;
+        tlb_set_dirty(env, env->mem_io_vaddr);
+    }
 }
 
 static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
@@ -1472,7 +1533,7 @@ static const MemoryRegionOps notdirty_mem_ops = {
 
 /* Generate a debug exception if a watchpoint has been hit.  */
 static void check_watchpoint(int offset, int len_mask, int flags)
 {
-    CPUArchState *env = cpu_single_env;
+    CPUArchState *env = current_cpu->env_ptr;
     target_ulong pc, cs_base;
     target_ulong vaddr;
     CPUWatchpoint *wp;
@@ -1554,7 +1615,7 @@ static uint64_t subpage_read(void *opaque, hwaddr addr,
     uint8_t buf[4];
 
 #if defined(DEBUG_SUBPAGE)
-    printf("%s: subpage %p len %d addr " TARGET_FMT_plx "\n", __func__,
+    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
           subpage, len, addr);
 #endif
     address_space_read(subpage->as, addr + subpage->base, buf, len);
@@ -1577,7 +1638,7 @@ static void subpage_write(void *opaque, hwaddr addr,
     uint8_t buf[4];
 
 #if defined(DEBUG_SUBPAGE)
-    printf("%s: subpage %p len %d addr " TARGET_FMT_plx
+    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
            " value %"PRIx64"\n",
           __func__, subpage, len, addr, value);
 #endif
@@ -1598,16 +1659,16 @@ static void subpage_write(void *opaque, hwaddr addr,
 }
 
 static bool subpage_accepts(void *opaque, hwaddr addr,
-                            unsigned size, bool is_write)
+                            unsigned len, bool is_write)
 {
     subpage_t *subpage = opaque;
 #if defined(DEBUG_SUBPAGE)
-    printf("%s: subpage %p %c len %d addr " TARGET_FMT_plx "\n",
+    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
           __func__, subpage, is_write ? 'w' : 'r', len, addr);
 #endif
 
     return address_space_access_valid(subpage->as, addr + subpage->base,
-                                      size, is_write);
+                                      len, is_write);
 }
 
 static const MemoryRegionOps subpage_ops = {
@@ -1627,8 +1688,8 @@ static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
     idx = SUBPAGE_IDX(start);
     eidx = SUBPAGE_IDX(end);
 #if defined(DEBUG_SUBPAGE)
-    printf("%s: %p start %08x end %08x idx %08x eidx %08x mem %ld\n", __func__,
-           mmio, start, end, idx, eidx, memory);
+    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
+           __func__, mmio, start, end, idx, eidx, section);
 #endif
     for (; idx <= eidx; idx++) {
         mmio->sub_section[idx] = section;
@@ -1649,15 +1710,15 @@ static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
                           "subpage", TARGET_PAGE_SIZE);
     mmio->iomem.subpage = true;
 #if defined(DEBUG_SUBPAGE)
-    printf("%s: %p base " TARGET_FMT_plx " len %08x %d\n", __func__,
-           mmio, base, TARGET_PAGE_SIZE, subpage_memory);
+    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
+           mmio, base, TARGET_PAGE_SIZE);
 #endif
-    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, phys_section_unassigned);
+    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
 
     return mmio;
 }
 
-static uint16_t dummy_section(MemoryRegion *mr)
+static uint16_t dummy_section(PhysPageMap *map, MemoryRegion *mr)
 {
     MemoryRegionSection section = {
         .mr = mr,
@@ -1666,12 +1727,13 @@ static uint16_t dummy_section(MemoryRegion *mr)
         .size = int128_2_64(),
     };
 
-    return phys_section_add(&section);
+    return phys_section_add(map, &section);
 }
 
 MemoryRegion *iotlb_to_region(hwaddr index)
 {
-    return phys_sections[index & ~TARGET_PAGE_MASK].mr;
+    return address_space_memory.dispatch->map.sections[
+           index & ~TARGET_PAGE_MASK].mr;
 }
 
 static void io_mem_init(void)
@@ -1687,44 +1749,65 @@ static void io_mem_init(void)
 
 static void mem_begin(MemoryListener *listener)
 {
-    AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
+    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
+    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
+    uint16_t n;
 
-    d->phys_map.ptr = PHYS_MAP_NODE_NIL;
+    n = dummy_section(&d->map, &io_mem_unassigned);
+    assert(n == PHYS_SECTION_UNASSIGNED);
+    n = dummy_section(&d->map, &io_mem_notdirty);
+    assert(n == PHYS_SECTION_NOTDIRTY);
+    n = dummy_section(&d->map, &io_mem_rom);
+    assert(n == PHYS_SECTION_ROM);
+    n = dummy_section(&d->map, &io_mem_watch);
+    assert(n == PHYS_SECTION_WATCH);
+
+    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
+    d->as = as;
+    as->next_dispatch = d;
 }
 
-static void core_begin(MemoryListener *listener)
+static void mem_commit(MemoryListener *listener)
 {
-    phys_sections_clear();
-    phys_section_unassigned = dummy_section(&io_mem_unassigned);
-    phys_section_notdirty = dummy_section(&io_mem_notdirty);
-    phys_section_rom = dummy_section(&io_mem_rom);
-    phys_section_watch = dummy_section(&io_mem_watch);
+    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
+    AddressSpaceDispatch *cur = as->dispatch;
+    AddressSpaceDispatch *next = as->next_dispatch;
+
+    phys_page_compact_all(next, next->map.nodes_nb);
+
+    as->dispatch = next;
+
+    if (cur) {
+        phys_sections_free(&cur->map);
+        g_free(cur);
+    }
 }
 
 static void tcg_commit(MemoryListener *listener)
 {
-    CPUArchState *env;
+    CPUState *cpu;
 
     /* since each CPU stores ram addresses in its TLB cache, we must
        reset the modified entries */
     /* XXX: slow ! */
-    for(env = first_cpu; env != NULL; env = env->next_cpu) {
+    CPU_FOREACH(cpu) {
+        CPUArchState *env = cpu->env_ptr;
+
         tlb_flush(env, 1);
     }
 }
 
 static void core_log_global_start(MemoryListener *listener)
 {
-    cpu_physical_memory_set_dirty_tracking(1);
+    cpu_physical_memory_set_dirty_tracking(true);
 }
 
 static void core_log_global_stop(MemoryListener *listener)
 {
-    cpu_physical_memory_set_dirty_tracking(0);
+    cpu_physical_memory_set_dirty_tracking(false);
 }
 
 static MemoryListener core_memory_listener = {
-    .begin = core_begin,
     .log_global_start = core_log_global_start,
     .log_global_stop = core_log_global_stop,
     .priority = 1,
@@ -1736,25 +1819,22 @@ static MemoryListener tcg_memory_listener = {
 
 void address_space_init_dispatch(AddressSpace *as)
 {
-    AddressSpaceDispatch *d = g_new(AddressSpaceDispatch, 1);
-
-    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 0 };
-    d->listener = (MemoryListener) {
+    as->dispatch = NULL;
+    as->dispatch_listener = (MemoryListener) {
        .begin = mem_begin,
+        .commit = mem_commit,
        .region_add = mem_add,
        .region_nop = mem_add,
        .priority = 0,
    };
-    d->as = as;
-    as->dispatch = d;
-    memory_listener_register(&d->listener, as);
+    memory_listener_register(&as->dispatch_listener, as);
 }
 
 void address_space_destroy_dispatch(AddressSpace *as)
 {
     AddressSpaceDispatch *d = as->dispatch;
 
-    memory_listener_unregister(&d->listener);
+    memory_listener_unregister(&as->dispatch_listener);
     g_free(d);
     as->dispatch = NULL;
 }
@@ -1762,15 +1842,19 @@ void address_space_destroy_dispatch(AddressSpace *as)
 static void memory_map_init(void)
 {
     system_memory = g_malloc(sizeof(*system_memory));
-    memory_region_init(system_memory, NULL, "system", INT64_MAX);
+
+    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
     address_space_init(&address_space_memory, system_memory, "memory");
 
     system_io = g_malloc(sizeof(*system_io));
-    memory_region_init(system_io, NULL, "io", 65536);
+    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
+                          65536);
     address_space_init(&address_space_io, system_io, "I/O");
 
     memory_listener_register(&core_memory_listener, &address_space_memory);
-    memory_listener_register(&tcg_memory_listener, &address_space_memory);
+    if (tcg_enabled()) {
+        memory_listener_register(&tcg_memory_listener, &address_space_memory);
+    }
 }
 
 MemoryRegion *get_system_memory(void)
@@ -1787,7 +1871,7 @@ MemoryRegion *get_system_io(void)
 
 /* physical memory access (slow version, mainly for debug) */
 #if defined(CONFIG_USER_ONLY)
-int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
+int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                         uint8_t *buf, int len, int is_write)
 {
     int l, flags;
@@ -1831,11 +1915,12 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
 
 static void invalidate_and_set_dirty(hwaddr addr, hwaddr length)
 {
-    if (!cpu_physical_memory_is_dirty(addr)) {
+    if (cpu_physical_memory_is_clean(addr)) {
         /* invalidate code */
         tb_invalidate_phys_page_range(addr, addr + length, 0);
         /* set dirty bit */
-        cpu_physical_memory_set_dirty_flags(addr, (0xff & ~CODE_DIRTY_FLAG));
+        cpu_physical_memory_set_dirty_flag(addr, DIRTY_MEMORY_VGA);
+        cpu_physical_memory_set_dirty_flag(addr, DIRTY_MEMORY_MIGRATION);
     }
     xen_modified_memory(addr, length);
 }
@@ -1852,15 +1937,33 @@ static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
     return false;
 }
 
-static inline int memory_access_size(MemoryRegion *mr, int l, hwaddr addr)
+static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
 {
-    if (l >= 4 && (((addr & 3) == 0 || mr->ops->impl.unaligned))) {
-        return 4;
+    unsigned access_size_max = mr->ops->valid.max_access_size;
+
+    /* Regions are assumed to support 1-4 byte accesses unless
+       otherwise specified.  */
+    if (access_size_max == 0) {
+        access_size_max = 4;
     }
-    if (l >= 2 && (((addr & 1) == 0) || mr->ops->impl.unaligned)) {
-        return 2;
+
+    /* Bound the maximum access by the alignment of the address.  */
+    if (!mr->ops->impl.unaligned) {
+        unsigned align_size_max = addr & -addr;
+        if (align_size_max != 0 && align_size_max < access_size_max) {
+            access_size_max = align_size_max;
+        }
    }
-    return 1;
+
+    /* Don't attempt accesses larger than the maximum.  */
+    if (l > access_size_max) {
+        l = access_size_max;
+    }
+    if (l & (l - 1)) {
+        l = 1 << (qemu_fls(l) - 1);
+    }
+
+    return l;
 }
 
 bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
@@ -1880,20 +1983,31 @@ bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
         if (is_write) {
             if (!memory_access_is_direct(mr, is_write)) {
                 l = memory_access_size(mr, l, addr1);
-                /* XXX: could force cpu_single_env to NULL to avoid
+                /* XXX: could force current_cpu to NULL to avoid
                    potential bugs */
-                if (l == 4) {
+                switch (l) {
+                case 8:
+                    /* 64 bit write access */
+                    val = ldq_p(buf);
+                    error |= io_mem_write(mr, addr1, val, 8);
+                    break;
+                case 4:
                     /* 32 bit write access */
                     val = ldl_p(buf);
                     error |= io_mem_write(mr, addr1, val, 4);
-                } else if (l == 2) {
+                    break;
+                case 2:
                     /* 16 bit write access */
                     val = lduw_p(buf);
                     error |= io_mem_write(mr, addr1, val, 2);
-                } else {
+                    break;
+                case 1:
                     /* 8 bit write access */
                     val = ldub_p(buf);
                     error |= io_mem_write(mr, addr1, val, 1);
+                    break;
+                default:
+                    abort();
                 }
             } else {
                 addr1 += memory_region_get_ram_addr(mr);
@@ -1906,18 +2020,29 @@ bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
             if (!memory_access_is_direct(mr, is_write)) {
                 /* I/O case */
                 l = memory_access_size(mr, l, addr1);
-                if (l == 4) {
+                switch (l) {
+                case 8:
+                    /* 64 bit read access */
+                    error |= io_mem_read(mr, addr1, &val, 8);
+                    stq_p(buf, val);
+                    break;
+                case 4:
                     /* 32 bit read access */
                     error |= io_mem_read(mr, addr1, &val, 4);
                     stl_p(buf, val);
-                } else if (l == 2) {
+                    break;
+                case 2:
                     /* 16 bit read access */
                     error |= io_mem_read(mr, addr1, &val, 2);
                     stw_p(buf, val);
-                } else {
+                    break;
+                case 1:
                     /* 8 bit read access */
                     error |= io_mem_read(mr, addr1, &val, 1);
                     stb_p(buf, val);
+                    break;
+                default:
+                    abort();
                }
             } else {
                 /* RAM case */
@@ -1951,9 +2076,13 @@ void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
     address_space_rw(&address_space_memory, addr, buf, len, is_write);
 }
 
-/* used for ROM loading : can write in RAM and ROM */
-void cpu_physical_memory_write_rom(hwaddr addr,
-                                   const uint8_t *buf, int len)
+enum write_rom_type {
+    WRITE_DATA,
+    FLUSH_CACHE,
+};
+
+static inline void cpu_physical_memory_write_rom_internal(
+    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
 {
     hwaddr l;
     uint8_t *ptr;
@@ -1972,8 +2101,15 @@ void cpu_physical_memory_write_rom(hwaddr addr,
             addr1 += memory_region_get_ram_addr(mr);
             /* ROM/RAM case */
             ptr = qemu_get_ram_ptr(addr1);
-            memcpy(ptr, buf, l);
-            invalidate_and_set_dirty(addr1, l);
+            switch (type) {
+            case WRITE_DATA:
+                memcpy(ptr, buf, l);
+                invalidate_and_set_dirty(addr1, l);
+                break;
+            case FLUSH_CACHE:
+                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
+                break;
+            }
         }
         len -= l;
         buf += l;
@@ -1981,7 +2117,30 @@ void cpu_physical_memory_write_rom(hwaddr addr,
     }
 }
 
+/* used for ROM loading : can write in RAM and ROM */
+void cpu_physical_memory_write_rom(hwaddr addr,
+                                   const uint8_t *buf, int len)
+{
+    cpu_physical_memory_write_rom_internal(addr, buf, len, WRITE_DATA);
+}
+
+void cpu_flush_icache_range(hwaddr start, int len)
+{
+    /*
+     * This function should do the same thing as an icache flush that was
+     * triggered from within the guest. For TCG we are always cache coherent,
+     * so there is no need to flush anything. For KVM / Xen we need to flush
+     * the host's instruction cache at least.
+     */
+    if (tcg_enabled()) {
+        return;
+    }
+
+    cpu_physical_memory_write_rom_internal(start, NULL, len, FLUSH_CACHE);
+}
+
 typedef struct {
+    MemoryRegion *mr;
     void *buffer;
     hwaddr addr;
     hwaddr len;
 } BounceBuffer;
@@ -2061,47 +2220,58 @@ void *address_space_map(AddressSpace *as,
                         bool is_write)
 {
     hwaddr len = *plen;
-    hwaddr todo = 0;
-    hwaddr l, xlat;
-    MemoryRegion *mr;
-    ram_addr_t raddr = RAM_ADDR_MAX;
-    ram_addr_t rlen;
-    void *ret;
+    hwaddr done = 0;
+    hwaddr l, xlat, base;
+    MemoryRegion *mr, *this_mr;
+    ram_addr_t raddr;
 
-    while (len > 0) {
-        l = len;
-        mr = address_space_translate(as, addr, &xlat, &l, is_write);
-
-        if (!memory_access_is_direct(mr, is_write)) {
-            if (todo || bounce.buffer) {
-                break;
-            }
-            bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);
-            bounce.addr = addr;
-            bounce.len = l;
-            if (!is_write) {
-                address_space_read(as, addr, bounce.buffer, l);
-            }
+    if (len == 0) {
+        return NULL;
+    }
 
-            *plen = l;
-            return bounce.buffer;
+    l = len;
+    mr = address_space_translate(as, addr, &xlat, &l, is_write);
+    if (!memory_access_is_direct(mr, is_write)) {
+        if (bounce.buffer) {
+            return NULL;
        }
-        if (!todo) {
-            raddr = memory_region_get_ram_addr(mr) + xlat;
-        } else {
-            if (memory_region_get_ram_addr(mr) + xlat != raddr + todo) {
-                break;
-            }
+        /* Avoid unbounded allocations */
+        l = MIN(l, TARGET_PAGE_SIZE);
+        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
+        bounce.addr = addr;
+        bounce.len = l;
+
+        memory_region_ref(mr);
+        bounce.mr = mr;
+        if (!is_write) {
+            address_space_read(as, addr, bounce.buffer, l);
        }
 
+        *plen = l;
+        return bounce.buffer;
+    }
+
+    base = xlat;
+    raddr = memory_region_get_ram_addr(mr);
+
+    for (;;) {
         len -= l;
         addr += l;
-        todo += l;
+        done += l;
+        if (len == 0) {
+            break;
+        }
+
+        l = len;
+        this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
+        if (this_mr != mr || xlat != base + done) {
+            break;
+        }
    }
-    rlen = todo;
-    ret = qemu_ram_ptr_length(raddr, &rlen);
-    *plen = rlen;
-    return ret;
+
+    memory_region_ref(mr);
+    *plen = done;
+    return qemu_ram_ptr_length(raddr + base, plen);
 }
 
 /* Unmaps a memory region previously mapped by address_space_map().
@@ -2112,8 +2282,12 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                          int is_write, hwaddr access_len)
 {
     if (buffer != bounce.buffer) {
+        MemoryRegion *mr;
+        ram_addr_t addr1;
+
+        mr = qemu_ram_addr_from_host(buffer, &addr1);
+        assert(mr != NULL);
         if (is_write) {
-            ram_addr_t addr1 = qemu_ram_addr_from_host_nofail(buffer);
             while (access_len) {
                 unsigned l;
                 l = TARGET_PAGE_SIZE;
@@ -2127,6 +2301,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
         if (xen_enabled()) {
             xen_invalidate_map_cache_entry(buffer);
         }
+        memory_region_unref(mr);
         return;
    }
    if (is_write) {
@@ -2134,6 +2309,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
    }
    qemu_vfree(bounce.buffer);
    bounce.buffer = NULL;
+    memory_region_unref(bounce.mr);
    cpu_notify_map_clients();
 }
 
@@ -2355,12 +2531,13 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
         stl_p(ptr, val);
 
         if (unlikely(in_migration)) {
-            if (!cpu_physical_memory_is_dirty(addr1)) {
+            if (cpu_physical_memory_is_clean(addr1)) {
                 /* invalidate code */
                 tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
                 /* set dirty bit */
-                cpu_physical_memory_set_dirty_flags(
-                    addr1, (0xff & ~CODE_DIRTY_FLAG));
+                cpu_physical_memory_set_dirty_flag(addr1,
+                                                   DIRTY_MEMORY_MIGRATION);
+                cpu_physical_memory_set_dirty_flag(addr1, DIRTY_MEMORY_VGA);
            }
        }
    }
@@ -2505,7 +2682,7 @@ void stq_be_phys(hwaddr addr, uint64_t val)
 }
 
 /* virtual memory access for debug (includes writing to ROM) */
-int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
+int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                         uint8_t *buf, int len, int is_write)
 {
     int l;
@@ -2514,7 +2691,7 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
 
     while (len > 0) {
         page = addr & TARGET_PAGE_MASK;
-        phys_addr = cpu_get_phys_page_debug(env, page);
+        phys_addr = cpu_get_phys_page_debug(cpu, page);
         /* if no physical page mapped, return an error */
         if (phys_addr == -1)
             return -1;
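
The core of the radix-tree rework in the hunks above — the new skip/ptr bit-fields in
PhysPageEntry and the phys_page_find() walk — boils down to the following minimal,
self-contained sketch. The types and constants here are simplified stand-ins for
illustration (4K pages assumed), not the exact QEMU definitions, and the
MemoryRegionSection bounds check shown in the diff is omitted:

/*
 * Sketch of the skip-compressed radix walk implemented by the new
 * phys_page_find() above.  Each entry's 'skip' field says how many
 * levels a single hop consumes; 0 marks a leaf whose 'ptr' is a
 * section index rather than a node index.
 */
#include <stdint.h>

#define P_L2_BITS   9
#define P_L2_SIZE   (1 << P_L2_BITS)
#define P_L2_LEVELS 6   /* (64 - 12 - 1) / P_L2_BITS + 1, with 4K pages */
#define NODE_NIL    (((uint32_t)~0) >> 6)

typedef struct {
    uint32_t skip : 6;   /* levels consumed by this hop; 0 marks a leaf */
    uint32_t ptr  : 26;  /* next node index, or section index at a leaf */
} Entry;

typedef Entry Node[P_L2_SIZE];

/* Return the section index for a page, or 'unassigned' if unmapped. */
static uint32_t find_section(Entry root, uint64_t page_index,
                             Node *nodes, uint32_t unassigned)
{
    Entry lp = root;
    int i;

    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0; ) {
        if (lp.ptr == NODE_NIL) {
            return unassigned;
        }
        /* Consume 'skip' levels at once, then pick the child by the
           9-bit slice of the page index that belongs to level i. */
        lp = nodes[lp.ptr][(page_index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }
    return lp.ptr;
}

Because a hop can consume several levels, chains of single-child interior nodes can be
collapsed after the map is built; that is what phys_page_compact() does when
mem_commit() invokes phys_page_compact_all() in the hunks above.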