Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Virtual page mapping
	3	*
	4	* Copyright (c) 2003 Fabrice Bellard
	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
	18	*/
	19	#include "qemu/osdep.h"
	20	#include "qapi/error.h"
	21	#ifndef _WIN32
	22	#endif
	23
	24	#include "qemu/cutils.h"
	25	#include "cpu.h"
	26	#include "exec/exec-all.h"
	27	#include "exec/target_page.h"
	28	#include "tcg.h"
	29	#include "hw/qdev-core.h"
	30	#if !defined(CONFIG_USER_ONLY)
	31	#include "hw/boards.h"
	32	#include "hw/xen/xen.h"
	33	#endif
	34	#include "sysemu/kvm.h"
	35	#include "sysemu/sysemu.h"
	36	#include "qemu/timer.h"
	37	#include "qemu/config-file.h"
	38	#include "qemu/error-report.h"
	39	#if defined(CONFIG_USER_ONLY)
	40	#include "qemu.h"
	41	#else /* !CONFIG_USER_ONLY */
	42	#include "hw/hw.h"
	43	#include "exec/memory.h"
	44	#include "exec/ioport.h"
	45	#include "sysemu/dma.h"
	46	#include "sysemu/numa.h"
	47	#include "sysemu/hw_accel.h"
	48	#include "exec/address-spaces.h"
	49	#include "sysemu/xen-mapcache.h"
	50	#include "trace-root.h"
	51
	52	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	53	#include <fcntl.h>
	54	#include <linux/falloc.h>
	55	#endif
	56
	57	#endif
	58	#include "exec/cpu-all.h"
	59	#include "qemu/rcu_queue.h"
	60	#include "qemu/main-loop.h"
	61	#include "translate-all.h"
	62	#include "sysemu/replay.h"
	63
	64	#include "exec/memory-internal.h"
	65	#include "exec/ram_addr.h"
	66	#include "exec/log.h"
	67
	68	#include "migration/vmstate.h"
	69
	70	#include "qemu/range.h"
	71	#ifndef _WIN32
	72	#include "qemu/mmap-alloc.h"
	73	#endif
	74
	75	#include "monitor/monitor.h"
	76
	77	//#define DEBUG_SUBPAGE
	78
	79	#if !defined(CONFIG_USER_ONLY)
	80	/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
	81	* are protected by the ramlist lock.
	82	*/
	83	RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
	84
	85	static MemoryRegion *system_memory;
	86	static MemoryRegion *system_io;
	87
	88	AddressSpace address_space_io;
	89	AddressSpace address_space_memory;
	90
	91	MemoryRegion io_mem_rom, io_mem_notdirty;
	92	static MemoryRegion io_mem_unassigned;
	93
	94	/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
	95	#define RAM_PREALLOC (1 << 0)
	96
	97	/* RAM is mmap-ed with MAP_SHARED */
	98	#define RAM_SHARED (1 << 1)
	99
	100	/* Only a portion of RAM (used_length) is actually used, and migrated.
	101	* This used_length size can change across reboots.
	102	*/
	103	#define RAM_RESIZEABLE (1 << 2)
	104
	105	#endif
	106
	107	#ifdef TARGET_PAGE_BITS_VARY
	108	int target_page_bits;
	109	bool target_page_bits_decided;
	110	#endif
	111
	112	struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
	113	/* current CPU in the current thread. It is only valid inside
	114	cpu_exec() */
	115	__thread CPUState *current_cpu;
	116	/* 0 = Do not count executed instructions.
	117	1 = Precise instruction counting.
	118	2 = Adaptive rate instruction counting. */
	119	int use_icount;
	120
	121	uintptr_t qemu_host_page_size;
	122	intptr_t qemu_host_page_mask;
	123	uintptr_t qemu_real_host_page_size;
	124	intptr_t qemu_real_host_page_mask;
	125
	126	bool set_preferred_target_page_bits(int bits)
	127	{
	128	/* The target page size is the lowest common denominator for all
	129	* the CPUs in the system, so we can only make it smaller, never
	130	* larger. And we can't make it smaller once we've committed to
	131	* a particular size.
	132	*/
	133	#ifdef TARGET_PAGE_BITS_VARY
	134	assert(bits >= TARGET_PAGE_BITS_MIN);
	135	if (target_page_bits == 0 \|\| target_page_bits > bits) {
	136	if (target_page_bits_decided) {
	137	return false;
	138	}
	139	target_page_bits = bits;
	140	}
	141	#endif
	142	return true;
	143	}
	144
	145	#if !defined(CONFIG_USER_ONLY)
	146
	147	static void finalize_target_page_bits(void)
	148	{
	149	#ifdef TARGET_PAGE_BITS_VARY
	150	if (target_page_bits == 0) {
	151	target_page_bits = TARGET_PAGE_BITS_MIN;
	152	}
	153	target_page_bits_decided = true;
	154	#endif
	155	}
	156
	157	typedef struct PhysPageEntry PhysPageEntry;
	158
	159	struct PhysPageEntry {
	160	/* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
	161	uint32_t skip : 6;
	162	/* index into phys_sections (!skip) or phys_map_nodes (skip) */
	163	uint32_t ptr : 26;
	164	};
	165
	166	#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
	167
	168	/* Size of the L2 (and L3, etc) page tables. */
	169	#define ADDR_SPACE_BITS 64
	170
	171	#define P_L2_BITS 9
	172	#define P_L2_SIZE (1 << P_L2_BITS)
	173
	174	#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
	175
	176	typedef PhysPageEntry Node[P_L2_SIZE];
	177
	178	typedef struct PhysPageMap {
	179	struct rcu_head rcu;
	180
	181	unsigned sections_nb;
	182	unsigned sections_nb_alloc;
	183	unsigned nodes_nb;
	184	unsigned nodes_nb_alloc;
	185	Node *nodes;
	186	MemoryRegionSection *sections;
	187	} PhysPageMap;
	188
	189	struct AddressSpaceDispatch {
	190	struct rcu_head rcu;
	191
	192	MemoryRegionSection *mru_section;
	193	/* This is a multi-level map on the physical address space.
	194	* The bottom level has pointers to MemoryRegionSections.
	195	*/
	196	PhysPageEntry phys_map;
	197	PhysPageMap map;
	198	AddressSpace *as;
	199	};
	200
	201	#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
	202	typedef struct subpage_t {
	203	MemoryRegion iomem;
	204	AddressSpace *as;
	205	hwaddr base;
	206	uint16_t sub_section[];
	207	} subpage_t;
	208
	209	#define PHYS_SECTION_UNASSIGNED 0
	210	#define PHYS_SECTION_NOTDIRTY 1
	211	#define PHYS_SECTION_ROM 2
	212	#define PHYS_SECTION_WATCH 3
	213
	214	static void io_mem_init(void);
	215	static void memory_map_init(void);
	216	static void tcg_commit(MemoryListener *listener);
	217
	218	static MemoryRegion io_mem_watch;
	219
	220	/**
	221	* CPUAddressSpace: all the information a CPU needs about an AddressSpace
	222	* @cpu: the CPU whose AddressSpace this is
	223	* @as: the AddressSpace itself
	224	* @memory_dispatch: its dispatch pointer (cached, RCU protected)
	225	* @tcg_as_listener: listener for tracking changes to the AddressSpace
	226	*/
	227	struct CPUAddressSpace {
	228	CPUState *cpu;
	229	AddressSpace *as;
	230	struct AddressSpaceDispatch *memory_dispatch;
	231	MemoryListener tcg_as_listener;
	232	};
	233
	234	struct DirtyBitmapSnapshot {
	235	ram_addr_t start;
	236	ram_addr_t end;
	237	unsigned long dirty[];
	238	};
	239
	240	#endif
	241
	242	#if !defined(CONFIG_USER_ONLY)
	243
	244	static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
	245	{
	246	static unsigned alloc_hint = 16;
	247	if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
	248	map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
	249	map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
	250	map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
	251	alloc_hint = map->nodes_nb_alloc;
	252	}
	253	}
	254
	255	static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
	256	{
	257	unsigned i;
	258	uint32_t ret;
	259	PhysPageEntry e;
	260	PhysPageEntry *p;
	261
	262	ret = map->nodes_nb++;
	263	p = map->nodes[ret];
	264	assert(ret != PHYS_MAP_NODE_NIL);
	265	assert(ret != map->nodes_nb_alloc);
	266
	267	e.skip = leaf ? 0 : 1;
	268	e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
	269	for (i = 0; i < P_L2_SIZE; ++i) {
	270	memcpy(&p[i], &e, sizeof(e));
	271	}
	272	return ret;
	273	}
	274
	275	static void phys_page_set_level(PhysPageMap map, PhysPageEntry lp,
	276	hwaddr index, hwaddr nb, uint16_t leaf,
	277	int level)
	278	{
	279	PhysPageEntry *p;
	280	hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
	281
	282	if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
	283	lp->ptr = phys_map_node_alloc(map, level == 0);
	284	}
	285	p = map->nodes[lp->ptr];
	286	lp = &p[(index >> (level P_L2_BITS)) & (P_L2_SIZE - 1)];
	287
	288	while (*nb && lp < &p[P_L2_SIZE]) {
	289	if ((index & (step - 1)) == 0 && nb >= step) {
	290	lp->skip = 0;
	291	lp->ptr = leaf;
	292	*index += step;
	293	*nb -= step;
	294	} else {
	295	phys_page_set_level(map, lp, index, nb, leaf, level - 1);
	296	}
	297	++lp;
	298	}
	299	}
	300
	301	static void phys_page_set(AddressSpaceDispatch *d,
	302	hwaddr index, hwaddr nb,
	303	uint16_t leaf)
	304	{
	305	/* Wildly overreserve - it doesn't matter much. */
	306	phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
	307
	308	phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
	309	}
	310
	311	/* Compact a non leaf page entry. Simply detect that the entry has a single child,
	312	* and update our entry so we can skip it and go directly to the destination.
	313	*/
	314	static void phys_page_compact(PhysPageEntry lp, Node nodes)
	315	{
	316	unsigned valid_ptr = P_L2_SIZE;
	317	int valid = 0;
	318	PhysPageEntry *p;
	319	int i;
	320
	321	if (lp->ptr == PHYS_MAP_NODE_NIL) {
	322	return;
	323	}
	324
	325	p = nodes[lp->ptr];
	326	for (i = 0; i < P_L2_SIZE; i++) {
	327	if (p[i].ptr == PHYS_MAP_NODE_NIL) {
	328	continue;
	329	}
	330
	331	valid_ptr = i;
	332	valid++;
	333	if (p[i].skip) {
	334	phys_page_compact(&p[i], nodes);
	335	}
	336	}
	337
	338	/* We can only compress if there's only one child. */
	339	if (valid != 1) {
	340	return;
	341	}
	342
	343	assert(valid_ptr < P_L2_SIZE);
	344
	345	/* Don't compress if it won't fit in the # of bits we have. */
	346	if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
	347	return;
	348	}
	349
	350	lp->ptr = p[valid_ptr].ptr;
	351	if (!p[valid_ptr].skip) {
	352	/* If our only child is a leaf, make this a leaf. */
	353	/* By design, we should have made this node a leaf to begin with so we
	354	* should never reach here.
	355	* But since it's so simple to handle this, let's do it just in case we
	356	* change this rule.
	357	*/
	358	lp->skip = 0;
	359	} else {
	360	lp->skip += p[valid_ptr].skip;
	361	}
	362	}
	363
	364	static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
	365	{
	366	if (d->phys_map.skip) {
	367	phys_page_compact(&d->phys_map, d->map.nodes);
	368	}
	369	}
	370
	371	static inline bool section_covers_addr(const MemoryRegionSection *section,
	372	hwaddr addr)
	373	{
	374	/* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
	375	* the section must cover the entire address space.
	376	*/
	377	return int128_gethi(section->size) \|\|
	378	range_covers_byte(section->offset_within_address_space,
	379	int128_getlo(section->size), addr);
	380	}
	381
	382	static MemoryRegionSection phys_page_find(AddressSpaceDispatch d, hwaddr addr)
	383	{
	384	PhysPageEntry lp = d->phys_map, *p;
	385	Node *nodes = d->map.nodes;
	386	MemoryRegionSection *sections = d->map.sections;
	387	hwaddr index = addr >> TARGET_PAGE_BITS;
	388	int i;
	389
	390	for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
	391	if (lp.ptr == PHYS_MAP_NODE_NIL) {
	392	return &sections[PHYS_SECTION_UNASSIGNED];
	393	}
	394	p = nodes[lp.ptr];
	395	lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
	396	}
	397
	398	if (section_covers_addr(&sections[lp.ptr], addr)) {
	399	return &sections[lp.ptr];
	400	} else {
	401	return &sections[PHYS_SECTION_UNASSIGNED];
	402	}
	403	}
	404
	405	bool memory_region_is_unassigned(MemoryRegion *mr)
	406	{
	407	return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
	408	&& mr != &io_mem_watch;
	409	}
	410
	411	/* Called from RCU critical section */
	412	static MemoryRegionSection address_space_lookup_region(AddressSpaceDispatch d,
	413	hwaddr addr,
	414	bool resolve_subpage)
	415	{
	416	MemoryRegionSection *section = atomic_read(&d->mru_section);
	417	subpage_t *subpage;
	418	bool update;
	419
	420	if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
	421	section_covers_addr(section, addr)) {
	422	update = false;
	423	} else {
	424	section = phys_page_find(d, addr);
	425	update = true;
	426	}
	427	if (resolve_subpage && section->mr->subpage) {
	428	subpage = container_of(section->mr, subpage_t, iomem);
	429	section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
	430	}
	431	if (update) {
	432	atomic_set(&d->mru_section, section);
	433	}
	434	return section;
	435	}
	436
	437	/* Called from RCU critical section */
	438	static MemoryRegionSection *
	439	address_space_translate_internal(AddressSpaceDispatch d, hwaddr addr, hwaddr xlat,
	440	hwaddr *plen, bool resolve_subpage)
	441	{
	442	MemoryRegionSection *section;
	443	MemoryRegion *mr;
	444	Int128 diff;
	445
	446	section = address_space_lookup_region(d, addr, resolve_subpage);
	447	/* Compute offset within MemoryRegionSection */
	448	addr -= section->offset_within_address_space;
	449
	450	/* Compute offset within MemoryRegion */
	451	*xlat = addr + section->offset_within_region;
	452
	453	mr = section->mr;
	454
	455	/* MMIO registers can be expected to perform full-width accesses based only
	456	* on their address, without considering adjacent registers that could
	457	* decode to completely different MemoryRegions. When such registers
	458	* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
	459	* regions overlap wildly. For this reason we cannot clamp the accesses
	460	* here.
	461	*
	462	* If the length is small (as is the case for address_space_ldl/stl),
	463	* everything works fine. If the incoming length is large, however,
	464	* the caller really has to do the clamping through memory_access_size.
	465	*/
	466	if (memory_region_is_ram(mr)) {
	467	diff = int128_sub(section->size, int128_make64(addr));
	468	plen = int128_get64(int128_min(diff, int128_make64(plen)));
	469	}
	470	return section;
	471	}
	472
	473	/* Called from RCU critical section */
	474	static MemoryRegionSection address_space_do_translate(AddressSpace *as,
	475	hwaddr addr,
	476	hwaddr *xlat,
	477	hwaddr *plen,
	478	bool is_write,
	479	bool is_mmio)
	480	{
	481	IOMMUTLBEntry iotlb;
	482	MemoryRegionSection *section;
	483	IOMMUMemoryRegion *iommu_mr;
	484	IOMMUMemoryRegionClass *imrc;
	485
	486	for (;;) {
	487	AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
	488	section = address_space_translate_internal(d, addr, &addr, plen, is_mmio);
	489
	490	iommu_mr = memory_region_get_iommu(section->mr);
	491	if (!iommu_mr) {
	492	break;
	493	}
	494	imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	495
	496	iotlb = imrc->translate(iommu_mr, addr, is_write ?
	497	IOMMU_WO : IOMMU_RO);
	498	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	499	\| (addr & iotlb.addr_mask));
	500	plen = MIN(plen, (addr \| iotlb.addr_mask) - addr + 1);
	501	if (!(iotlb.perm & (1 << is_write))) {
	502	goto translate_fail;
	503	}
	504
	505	as = iotlb.target_as;
	506	}
	507
	508	*xlat = addr;
	509
	510	return *section;
	511
	512	translate_fail:
	513	return (MemoryRegionSection) { .mr = &io_mem_unassigned };
	514	}
	515
	516	/* Called from RCU critical section */
	517	IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
	518	bool is_write)
	519	{
	520	MemoryRegionSection section;
	521	hwaddr xlat, plen;
	522
	523	/* Try to get maximum page mask during translation. */
	524	plen = (hwaddr)-1;
	525
	526	/* This can never be MMIO. */
	527	section = address_space_do_translate(as, addr, &xlat, &plen,
	528	is_write, false);
	529
	530	/* Illegal translation */
	531	if (section.mr == &io_mem_unassigned) {
	532	goto iotlb_fail;
	533	}
	534
	535	/* Convert memory region offset into address space offset */
	536	xlat += section.offset_within_address_space -
	537	section.offset_within_region;
	538
	539	if (plen == (hwaddr)-1) {
	540	/*
	541	* We use default page size here. Logically it only happens
	542	* for identity mappings.
	543	*/
	544	plen = TARGET_PAGE_SIZE;
	545	}
	546
	547	/* Convert to address mask */
	548	plen -= 1;
	549
	550	return (IOMMUTLBEntry) {
	551	.target_as = section.address_space,
	552	.iova = addr & ~plen,
	553	.translated_addr = xlat & ~plen,
	554	.addr_mask = plen,
	555	/* IOTLBs are for DMAs, and DMA only allows on RAMs. */
	556	.perm = IOMMU_RW,
	557	};
	558
	559	iotlb_fail:
	560	return (IOMMUTLBEntry) {0};
	561	}
	562
	563	/* Called from RCU critical section */
	564	MemoryRegion address_space_translate(AddressSpace as, hwaddr addr,
	565	hwaddr xlat, hwaddr plen,
	566	bool is_write)
	567	{
	568	MemoryRegion *mr;
	569	MemoryRegionSection section;
	570
	571	/* This can be MMIO, so setup MMIO bit. */
	572	section = address_space_do_translate(as, addr, xlat, plen, is_write, true);
	573	mr = section.mr;
	574
	575	if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
	576	hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
	577	plen = MIN(page, plen);
	578	}
	579
	580	return mr;
	581	}
	582
	583	/* Called from RCU critical section */
	584	MemoryRegionSection *
	585	address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
	586	hwaddr xlat, hwaddr plen)
	587	{
	588	MemoryRegionSection *section;
	589	AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
	590
	591	section = address_space_translate_internal(d, addr, xlat, plen, false);
	592
	593	assert(!memory_region_is_iommu(section->mr));
	594	return section;
	595	}
	596	#endif
	597
	598	#if !defined(CONFIG_USER_ONLY)
	599
	600	static int cpu_common_post_load(void *opaque, int version_id)
	601	{
	602	CPUState *cpu = opaque;
	603
	604	/* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
	605	version_id is increased. */
	606	cpu->interrupt_request &= ~0x01;
	607	tlb_flush(cpu);
	608
	609	return 0;
	610	}
	611
	612	static int cpu_common_pre_load(void *opaque)
	613	{
	614	CPUState *cpu = opaque;
	615
	616	cpu->exception_index = -1;
	617
	618	return 0;
	619	}
	620
	621	static bool cpu_common_exception_index_needed(void *opaque)
	622	{
	623	CPUState *cpu = opaque;
	624
	625	return tcg_enabled() && cpu->exception_index != -1;
	626	}
	627
	628	static const VMStateDescription vmstate_cpu_common_exception_index = {
	629	.name = "cpu_common/exception_index",
	630	.version_id = 1,
	631	.minimum_version_id = 1,
	632	.needed = cpu_common_exception_index_needed,
	633	.fields = (VMStateField[]) {
	634	VMSTATE_INT32(exception_index, CPUState),
	635	VMSTATE_END_OF_LIST()
	636	}
	637	};
	638
	639	static bool cpu_common_crash_occurred_needed(void *opaque)
	640	{
	641	CPUState *cpu = opaque;
	642
	643	return cpu->crash_occurred;
	644	}
	645
	646	static const VMStateDescription vmstate_cpu_common_crash_occurred = {
	647	.name = "cpu_common/crash_occurred",
	648	.version_id = 1,
	649	.minimum_version_id = 1,
	650	.needed = cpu_common_crash_occurred_needed,
	651	.fields = (VMStateField[]) {
	652	VMSTATE_BOOL(crash_occurred, CPUState),
	653	VMSTATE_END_OF_LIST()
	654	}
	655	};
	656
	657	const VMStateDescription vmstate_cpu_common = {
	658	.name = "cpu_common",
	659	.version_id = 1,
	660	.minimum_version_id = 1,
	661	.pre_load = cpu_common_pre_load,
	662	.post_load = cpu_common_post_load,
	663	.fields = (VMStateField[]) {
	664	VMSTATE_UINT32(halted, CPUState),
	665	VMSTATE_UINT32(interrupt_request, CPUState),
	666	VMSTATE_END_OF_LIST()
	667	},
	668	.subsections = (const VMStateDescription*[]) {
	669	&vmstate_cpu_common_exception_index,
	670	&vmstate_cpu_common_crash_occurred,
	671	NULL
	672	}
	673	};
	674
	675	#endif
	676
	677	CPUState *qemu_get_cpu(int index)
	678	{
	679	CPUState *cpu;
	680
	681	CPU_FOREACH(cpu) {
	682	if (cpu->cpu_index == index) {
	683	return cpu;
	684	}
	685	}
	686
	687	return NULL;
	688	}
	689
	690	#if !defined(CONFIG_USER_ONLY)
	691	void cpu_address_space_init(CPUState cpu, AddressSpace as, int asidx)
	692	{
	693	CPUAddressSpace *newas;
	694
	695	/* Target code should have set num_ases before calling us */
	696	assert(asidx < cpu->num_ases);
	697
	698	if (asidx == 0) {
	699	/* address space 0 gets the convenience alias */
	700	cpu->as = as;
	701	}
	702
	703	/* KVM cannot currently support multiple address spaces. */
	704	assert(asidx == 0 \|\| !kvm_enabled());
	705
	706	if (!cpu->cpu_ases) {
	707	cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
	708	}
	709
	710	newas = &cpu->cpu_ases[asidx];
	711	newas->cpu = cpu;
	712	newas->as = as;
	713	if (tcg_enabled()) {
	714	newas->tcg_as_listener.commit = tcg_commit;
	715	memory_listener_register(&newas->tcg_as_listener, as);
	716	}
	717	}
	718
	719	AddressSpace cpu_get_address_space(CPUState cpu, int asidx)
	720	{
	721	/* Return the AddressSpace corresponding to the specified index */
	722	return cpu->cpu_ases[asidx].as;
	723	}
	724	#endif
	725
	726	void cpu_exec_unrealizefn(CPUState *cpu)
	727	{
	728	CPUClass *cc = CPU_GET_CLASS(cpu);
	729
	730	cpu_list_remove(cpu);
	731
	732	if (cc->vmsd != NULL) {
	733	vmstate_unregister(NULL, cc->vmsd, cpu);
	734	}
	735	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	736	vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
	737	}
	738	}
	739
	740	void cpu_exec_initfn(CPUState *cpu)
	741	{
	742	cpu->as = NULL;
	743	cpu->num_ases = 0;
	744
	745	#ifndef CONFIG_USER_ONLY
	746	cpu->thread_id = qemu_get_thread_id();
	747
	748	/* This is a softmmu CPU object, so create a property for it
	749	* so users can wire up its memory. (This can't go in qom/cpu.c
	750	* because that file is compiled only once for both user-mode
	751	* and system builds.) The default if no link is set up is to use
	752	* the system address space.
	753	*/
	754	object_property_add_link(OBJECT(cpu), "memory", TYPE_MEMORY_REGION,
	755	(Object **)&cpu->memory,
	756	qdev_prop_allow_set_link_before_realize,
	757	OBJ_PROP_LINK_UNREF_ON_RELEASE,
	758	&error_abort);
	759	cpu->memory = system_memory;
	760	object_ref(OBJECT(cpu->memory));
	761	#endif
	762	}
	763
	764	void cpu_exec_realizefn(CPUState cpu, Error *errp)
	765	{
	766	CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu);
	767
	768	cpu_list_add(cpu);
	769
	770	#ifndef CONFIG_USER_ONLY
	771	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	772	vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
	773	}
	774	if (cc->vmsd != NULL) {
	775	vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
	776	}
	777	#endif
	778	}
	779
	780	#if defined(CONFIG_USER_ONLY)
	781	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	782	{
	783	mmap_lock();
	784	tb_lock();
	785	tb_invalidate_phys_page_range(pc, pc + 1, 0);
	786	tb_unlock();
	787	mmap_unlock();
	788	}
	789	#else
	790	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	791	{
	792	MemTxAttrs attrs;
	793	hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
	794	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	795	if (phys != -1) {
	796	/* Locks grabbed by tb_invalidate_phys_addr */
	797	tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
	798	phys \| (pc & ~TARGET_PAGE_MASK));
	799	}
	800	}
	801	#endif
	802
	803	#if defined(CONFIG_USER_ONLY)
	804	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	805
	806	{
	807	}
	808
	809	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	810	int flags)
	811	{
	812	return -ENOSYS;
	813	}
	814
	815	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	816	{
	817	}
	818
	819	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	820	int flags, CPUWatchpoint **watchpoint)
	821	{
	822	return -ENOSYS;
	823	}
	824	#else
	825	/* Add a watchpoint. */
	826	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	827	int flags, CPUWatchpoint **watchpoint)
	828	{
	829	CPUWatchpoint *wp;
	830
	831	/* forbid ranges which are empty or run off the end of the address space */
	832	if (len == 0 \|\| (addr + len - 1) < addr) {
	833	error_report("tried to set invalid watchpoint at %"
	834	VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
	835	return -EINVAL;
	836	}
	837	wp = g_malloc(sizeof(*wp));
	838
	839	wp->vaddr = addr;
	840	wp->len = len;
	841	wp->flags = flags;
	842
	843	/* keep all GDB-injected watchpoints in front */
	844	if (flags & BP_GDB) {
	845	QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
	846	} else {
	847	QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
	848	}
	849
	850	tlb_flush_page(cpu, addr);
	851
	852	if (watchpoint)
	853	*watchpoint = wp;
	854	return 0;
	855	}
	856
	857	/* Remove a specific watchpoint. */
	858	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	859	int flags)
	860	{
	861	CPUWatchpoint *wp;
	862
	863	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	864	if (addr == wp->vaddr && len == wp->len
	865	&& flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
	866	cpu_watchpoint_remove_by_ref(cpu, wp);
	867	return 0;
	868	}
	869	}
	870	return -ENOENT;
	871	}
	872
	873	/* Remove a specific watchpoint by reference. */
	874	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	875	{
	876	QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
	877
	878	tlb_flush_page(cpu, watchpoint->vaddr);
	879
	880	g_free(watchpoint);
	881	}
	882
	883	/* Remove all matching watchpoints. */
	884	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	885	{
	886	CPUWatchpoint wp, next;
	887
	888	QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
	889	if (wp->flags & mask) {
	890	cpu_watchpoint_remove_by_ref(cpu, wp);
	891	}
	892	}
	893	}
	894
	895	/* Return true if this watchpoint address matches the specified
	896	* access (ie the address range covered by the watchpoint overlaps
	897	* partially or completely with the address range covered by the
	898	* access).
	899	*/
	900	static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
	901	vaddr addr,
	902	vaddr len)
	903	{
	904	/* We know the lengths are non-zero, but a little caution is
	905	* required to avoid errors in the case where the range ends
	906	* exactly at the top of the address space and so addr + len
	907	* wraps round to zero.
	908	*/
	909	vaddr wpend = wp->vaddr + wp->len - 1;
	910	vaddr addrend = addr + len - 1;
	911
	912	return !(addr > wpend \|\| wp->vaddr > addrend);
	913	}
	914
	915	#endif
	916
	917	/* Add a breakpoint. */
	918	int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
	919	CPUBreakpoint **breakpoint)
	920	{
	921	CPUBreakpoint *bp;
	922
	923	bp = g_malloc(sizeof(*bp));
	924
	925	bp->pc = pc;
	926	bp->flags = flags;
	927
	928	/* keep all GDB-injected breakpoints in front */
	929	if (flags & BP_GDB) {
	930	QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
	931	} else {
	932	QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
	933	}
	934
	935	breakpoint_invalidate(cpu, pc);
	936
	937	if (breakpoint) {
	938	*breakpoint = bp;
	939	}
	940	return 0;
	941	}
	942
	943	/* Remove a specific breakpoint. */
	944	int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
	945	{
	946	CPUBreakpoint *bp;
	947
	948	QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
	949	if (bp->pc == pc && bp->flags == flags) {
	950	cpu_breakpoint_remove_by_ref(cpu, bp);
	951	return 0;
	952	}
	953	}
	954	return -ENOENT;
	955	}
	956
	957	/* Remove a specific breakpoint by reference. */
	958	void cpu_breakpoint_remove_by_ref(CPUState cpu, CPUBreakpoint breakpoint)
	959	{
	960	QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
	961
	962	breakpoint_invalidate(cpu, breakpoint->pc);
	963
	964	g_free(breakpoint);
	965	}
	966
	967	/* Remove all matching breakpoints. */
	968	void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
	969	{
	970	CPUBreakpoint bp, next;
	971
	972	QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
	973	if (bp->flags & mask) {
	974	cpu_breakpoint_remove_by_ref(cpu, bp);
	975	}
	976	}
	977	}
	978
	979	/* enable or disable single step mode. EXCP_DEBUG is returned by the
	980	CPU loop after each instruction */
	981	void cpu_single_step(CPUState *cpu, int enabled)
	982	{
	983	if (cpu->singlestep_enabled != enabled) {
	984	cpu->singlestep_enabled = enabled;
	985	if (kvm_enabled()) {
	986	kvm_update_guest_debug(cpu, 0);
	987	} else {
	988	/* must flush all the translated code to avoid inconsistencies */
	989	/* XXX: only flush what is necessary */
	990	tb_flush(cpu);
	991	}
	992	}
	993	}
	994
	995	void cpu_abort(CPUState cpu, const char fmt, ...)
	996	{
	997	va_list ap;
	998	va_list ap2;
	999
	1000	va_start(ap, fmt);
	1001	va_copy(ap2, ap);
	1002	fprintf(stderr, "qemu: fatal: ");
	1003	vfprintf(stderr, fmt, ap);
	1004	fprintf(stderr, "\n");
	1005	cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1006	if (qemu_log_separate()) {
	1007	qemu_log_lock();
	1008	qemu_log("qemu: fatal: ");
	1009	qemu_log_vprintf(fmt, ap2);
	1010	qemu_log("\n");
	1011	log_cpu_state(cpu, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1012	qemu_log_flush();
	1013	qemu_log_unlock();
	1014	qemu_log_close();
	1015	}
	1016	va_end(ap2);
	1017	va_end(ap);
	1018	replay_finish();
	1019	#if defined(CONFIG_USER_ONLY)
	1020	{
	1021	struct sigaction act;
	1022	sigfillset(&act.sa_mask);
	1023	act.sa_handler = SIG_DFL;
	1024	sigaction(SIGABRT, &act, NULL);
	1025	}
	1026	#endif
	1027	abort();
	1028	}
	1029
	1030	#if !defined(CONFIG_USER_ONLY)
	1031	/* Called from RCU critical section */
	1032	static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
	1033	{
	1034	RAMBlock *block;
	1035
	1036	block = atomic_rcu_read(&ram_list.mru_block);
	1037	if (block && addr - block->offset < block->max_length) {
	1038	return block;
	1039	}
	1040	RAMBLOCK_FOREACH(block) {
	1041	if (addr - block->offset < block->max_length) {
	1042	goto found;
	1043	}
	1044	}
	1045
	1046	fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
	1047	abort();
	1048
	1049	found:
	1050	/* It is safe to write mru_block outside the iothread lock. This
	1051	* is what happens:
	1052	*
	1053	* mru_block = xxx
	1054	* rcu_read_unlock()
	1055	* xxx removed from list
	1056	* rcu_read_lock()
	1057	* read mru_block
	1058	* mru_block = NULL;
	1059	* call_rcu(reclaim_ramblock, xxx);
	1060	* rcu_read_unlock()
	1061	*
	1062	* atomic_rcu_set is not needed here. The block was already published
	1063	* when it was placed into the list. Here we're just making an extra
	1064	* copy of the pointer.
	1065	*/
	1066	ram_list.mru_block = block;
	1067	return block;
	1068	}
	1069
	1070	static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
	1071	{
	1072	CPUState *cpu;
	1073	ram_addr_t start1;
	1074	RAMBlock *block;
	1075	ram_addr_t end;
	1076
	1077	end = TARGET_PAGE_ALIGN(start + length);
	1078	start &= TARGET_PAGE_MASK;
	1079
	1080	rcu_read_lock();
	1081	block = qemu_get_ram_block(start);
	1082	assert(block == qemu_get_ram_block(end - 1));
	1083	start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
	1084	CPU_FOREACH(cpu) {
	1085	tlb_reset_dirty(cpu, start1, length);
	1086	}
	1087	rcu_read_unlock();
	1088	}
	1089
	1090	/* Note: start and end must be within the same ram block. */
	1091	bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
	1092	ram_addr_t length,
	1093	unsigned client)
	1094	{
	1095	DirtyMemoryBlocks *blocks;
	1096	unsigned long end, page;
	1097	bool dirty = false;
	1098
	1099	if (length == 0) {
	1100	return false;
	1101	}
	1102
	1103	end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
	1104	page = start >> TARGET_PAGE_BITS;
	1105
	1106	rcu_read_lock();
	1107
	1108	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1109
	1110	while (page < end) {
	1111	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1112	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1113	unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
	1114
	1115	dirty \|= bitmap_test_and_clear_atomic(blocks->blocks[idx],
	1116	offset, num);
	1117	page += num;
	1118	}
	1119
	1120	rcu_read_unlock();
	1121
	1122	if (dirty && tcg_enabled()) {
	1123	tlb_reset_dirty_range_all(start, length);
	1124	}
	1125
	1126	return dirty;
	1127	}
	1128
	1129	DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
	1130	(ram_addr_t start, ram_addr_t length, unsigned client)
	1131	{
	1132	DirtyMemoryBlocks *blocks;
	1133	unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
	1134	ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
	1135	ram_addr_t last = QEMU_ALIGN_UP(start + length, align);
	1136	DirtyBitmapSnapshot *snap;
	1137	unsigned long page, end, dest;
	1138
	1139	snap = g_malloc0(sizeof(*snap) +
	1140	((last - first) >> (TARGET_PAGE_BITS + 3)));
	1141	snap->start = first;
	1142	snap->end = last;
	1143
	1144	page = first >> TARGET_PAGE_BITS;
	1145	end = last >> TARGET_PAGE_BITS;
	1146	dest = 0;
	1147
	1148	rcu_read_lock();
	1149
	1150	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1151
	1152	while (page < end) {
	1153	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1154	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1155	unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
	1156
	1157	assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
	1158	assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));
	1159	offset >>= BITS_PER_LEVEL;
	1160
	1161	bitmap_copy_and_clear_atomic(snap->dirty + dest,
	1162	blocks->blocks[idx] + offset,
	1163	num);
	1164	page += num;
	1165	dest += num >> BITS_PER_LEVEL;
	1166	}
	1167
	1168	rcu_read_unlock();
	1169
	1170	if (tcg_enabled()) {
	1171	tlb_reset_dirty_range_all(start, length);
	1172	}
	1173
	1174	return snap;
	1175	}
	1176
	1177	bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
	1178	ram_addr_t start,
	1179	ram_addr_t length)
	1180	{
	1181	unsigned long page, end;
	1182
	1183	assert(start >= snap->start);
	1184	assert(start + length <= snap->end);
	1185
	1186	end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
	1187	page = (start - snap->start) >> TARGET_PAGE_BITS;
	1188
	1189	while (page < end) {
	1190	if (test_bit(page, snap->dirty)) {
	1191	return true;
	1192	}
	1193	page++;
	1194	}
	1195	return false;
	1196	}
	1197
	1198	/* Called from RCU critical section */
	1199	hwaddr memory_region_section_get_iotlb(CPUState *cpu,
	1200	MemoryRegionSection *section,
	1201	target_ulong vaddr,
	1202	hwaddr paddr, hwaddr xlat,
	1203	int prot,
	1204	target_ulong *address)
	1205	{
	1206	hwaddr iotlb;
	1207	CPUWatchpoint *wp;
	1208
	1209	if (memory_region_is_ram(section->mr)) {
	1210	/* Normal RAM. */
	1211	iotlb = memory_region_get_ram_addr(section->mr) + xlat;
	1212	if (!section->readonly) {
	1213	iotlb \|= PHYS_SECTION_NOTDIRTY;
	1214	} else {
	1215	iotlb \|= PHYS_SECTION_ROM;
	1216	}
	1217	} else {
	1218	AddressSpaceDispatch *d;
	1219
	1220	d = atomic_rcu_read(&section->address_space->dispatch);
	1221	iotlb = section - d->map.sections;
	1222	iotlb += xlat;
	1223	}
	1224
	1225	/* Make accesses to pages with watchpoints go via the
	1226	watchpoint trap routines. */
	1227	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1228	if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
	1229	/* Avoid trapping reads of pages with a write breakpoint. */
	1230	if ((prot & PAGE_WRITE) \|\| (wp->flags & BP_MEM_READ)) {
	1231	iotlb = PHYS_SECTION_WATCH + paddr;
	1232	*address \|= TLB_MMIO;
	1233	break;
	1234	}
	1235	}
	1236	}
	1237
	1238	return iotlb;
	1239	}
	1240	#endif /* defined(CONFIG_USER_ONLY) */
	1241
	1242	#if !defined(CONFIG_USER_ONLY)
	1243
	1244	static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
	1245	uint16_t section);
	1246	static subpage_t subpage_init(AddressSpace as, hwaddr base);
	1247
	1248	static void (phys_mem_alloc)(size_t size, uint64_t *align) =
	1249	qemu_anon_ram_alloc;
	1250
	1251	/*
	1252	* Set a custom physical guest memory alloator.
	1253	* Accelerators with unusual needs may need this. Hopefully, we can
	1254	* get rid of it eventually.
	1255	*/
	1256	void phys_mem_set_alloc(void (alloc)(size_t, uint64_t *align))
	1257	{
	1258	phys_mem_alloc = alloc;
	1259	}
	1260
	1261	static uint16_t phys_section_add(PhysPageMap *map,
	1262	MemoryRegionSection *section)
	1263	{
	1264	/* The physical section number is ORed with a page-aligned
	1265	* pointer to produce the iotlb entries. Thus it should
	1266	* never overflow into the page-aligned value.
	1267	*/
	1268	assert(map->sections_nb < TARGET_PAGE_SIZE);
	1269
	1270	if (map->sections_nb == map->sections_nb_alloc) {
	1271	map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
	1272	map->sections = g_renew(MemoryRegionSection, map->sections,
	1273	map->sections_nb_alloc);
	1274	}
	1275	map->sections[map->sections_nb] = *section;
	1276	memory_region_ref(section->mr);
	1277	return map->sections_nb++;
	1278	}
	1279
	1280	static void phys_section_destroy(MemoryRegion *mr)
	1281	{
	1282	bool have_sub_page = mr->subpage;
	1283
	1284	memory_region_unref(mr);
	1285
	1286	if (have_sub_page) {
	1287	subpage_t *subpage = container_of(mr, subpage_t, iomem);
	1288	object_unref(OBJECT(&subpage->iomem));
	1289	g_free(subpage);
	1290	}
	1291	}
	1292
	1293	static void phys_sections_free(PhysPageMap *map)
	1294	{
	1295	while (map->sections_nb > 0) {
	1296	MemoryRegionSection *section = &map->sections[--map->sections_nb];
	1297	phys_section_destroy(section->mr);
	1298	}
	1299	g_free(map->sections);
	1300	g_free(map->nodes);
	1301	}
	1302
	1303	static void register_subpage(AddressSpaceDispatch d, MemoryRegionSection section)
	1304	{
	1305	subpage_t *subpage;
	1306	hwaddr base = section->offset_within_address_space
	1307	& TARGET_PAGE_MASK;
	1308	MemoryRegionSection *existing = phys_page_find(d, base);
	1309	MemoryRegionSection subsection = {
	1310	.offset_within_address_space = base,
	1311	.size = int128_make64(TARGET_PAGE_SIZE),
	1312	};
	1313	hwaddr start, end;
	1314
	1315	assert(existing->mr->subpage \|\| existing->mr == &io_mem_unassigned);
	1316
	1317	if (!(existing->mr->subpage)) {
	1318	subpage = subpage_init(d->as, base);
	1319	subsection.address_space = d->as;
	1320	subsection.mr = &subpage->iomem;
	1321	phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
	1322	phys_section_add(&d->map, &subsection));
	1323	} else {
	1324	subpage = container_of(existing->mr, subpage_t, iomem);
	1325	}
	1326	start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
	1327	end = start + int128_get64(section->size) - 1;
	1328	subpage_register(subpage, start, end,
	1329	phys_section_add(&d->map, section));
	1330	}
	1331
	1332
	1333	static void register_multipage(AddressSpaceDispatch *d,
	1334	MemoryRegionSection *section)
	1335	{
	1336	hwaddr start_addr = section->offset_within_address_space;
	1337	uint16_t section_index = phys_section_add(&d->map, section);
	1338	uint64_t num_pages = int128_get64(int128_rshift(section->size,
	1339	TARGET_PAGE_BITS));
	1340
	1341	assert(num_pages);
	1342	phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
	1343	}
	1344
	1345	static void mem_add(MemoryListener listener, MemoryRegionSection section)
	1346	{
	1347	AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
	1348	AddressSpaceDispatch *d = as->next_dispatch;
	1349	MemoryRegionSection now = section, remain = section;
	1350	Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
	1351
	1352	if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1353	uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
	1354	- now.offset_within_address_space;
	1355
	1356	now.size = int128_min(int128_make64(left), now.size);
	1357	register_subpage(d, &now);
	1358	} else {
	1359	now.size = int128_zero();
	1360	}
	1361	while (int128_ne(remain.size, now.size)) {
	1362	remain.size = int128_sub(remain.size, now.size);
	1363	remain.offset_within_address_space += int128_get64(now.size);
	1364	remain.offset_within_region += int128_get64(now.size);
	1365	now = remain;
	1366	if (int128_lt(remain.size, page_size)) {
	1367	register_subpage(d, &now);
	1368	} else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1369	now.size = page_size;
	1370	register_subpage(d, &now);
	1371	} else {
	1372	now.size = int128_and(now.size, int128_neg(page_size));
	1373	register_multipage(d, &now);
	1374	}
	1375	}
	1376	}
	1377
	/* Flush KVM's coalesced MMIO buffer; does nothing unless KVM is enabled. */
	void qemu_flush_coalesced_mmio_buffer(void)
	{
	    if (!kvm_enabled()) {
	        return;
	    }
	    kvm_flush_coalesced_mmio_buffer();
	}
	1383
	1384	void qemu_mutex_lock_ramlist(void)
	1385	{
	1386	qemu_mutex_lock(&ram_list.mutex);
	1387	}
	1388
	1389	void qemu_mutex_unlock_ramlist(void)
	1390	{
	1391	qemu_mutex_unlock(&ram_list.mutex);
	1392	}
	1393
	1394	void ram_block_dump(Monitor *mon)
	1395	{
	1396	RAMBlock *block;
	1397	char *psize;
	1398
	1399	rcu_read_lock();
	1400	monitor_printf(mon, "%24s %8s %18s %18s %18s\n",
	1401	"Block Name", "PSize", "Offset", "Used", "Total");
	1402	RAMBLOCK_FOREACH(block) {
	1403	psize = size_to_str(block->page_size);
	1404	monitor_printf(mon, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
	1405	" 0x%016" PRIx64 "\n", block->idstr, psize,
	1406	(uint64_t)block->offset,
	1407	(uint64_t)block->used_length,
	1408	(uint64_t)block->max_length);
	1409	g_free(psize);
	1410	}
	1411	rcu_read_unlock();
	1412	}
	1413
	1414	#ifdef __linux__
	1415	/*
	1416	* FIXME TOCTTOU: this iterates over memory backends' mem-path, which
	1417	* may or may not name the same files / on the same filesystem now as
	1418	* when we actually open and map them. Iterate over the file
	1419	* descriptors instead, and use qemu_fd_getpagesize().
	1420	*/
	1421	static int find_max_supported_pagesize(Object obj, void opaque)
	1422	{
	1423	char *mem_path;
	1424	long *hpsize_min = opaque;
	1425
	1426	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	1427	mem_path = object_property_get_str(obj, "mem-path", NULL);
	1428	if (mem_path) {
	1429	long hpsize = qemu_mempath_getpagesize(mem_path);
	1430	if (hpsize < *hpsize_min) {
	1431	*hpsize_min = hpsize;
	1432	}
	1433	} else {
	1434	*hpsize_min = getpagesize();
	1435	}
	1436	}
	1437
	1438	return 0;
	1439	}
	1440
	1441	long qemu_getrampagesize(void)
	1442	{
	1443	long hpsize = LONG_MAX;
	1444	long mainrampagesize;
	1445	Object *memdev_root;
	1446
	1447	if (mem_path) {
	1448	mainrampagesize = qemu_mempath_getpagesize(mem_path);
	1449	} else {
	1450	mainrampagesize = getpagesize();
	1451	}
	1452
	1453	/* it's possible we have memory-backend objects with
	1454	* hugepage-backed RAM. these may get mapped into system
	1455	* address space via -numa parameters or memory hotplug
	1456	* hooks. we want to take these into account, but we
	1457	* also want to make sure these supported hugepage
	1458	* sizes are applicable across the entire range of memory
	1459	* we may boot from, so we take the min across all
	1460	* backends, and assume normal pages in cases where a
	1461	* backend isn't backed by hugepages.
	1462	*/
	1463	memdev_root = object_resolve_path("/objects", NULL);
	1464	if (memdev_root) {
	1465	object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
	1466	}
	1467	if (hpsize == LONG_MAX) {
	1468	/* No additional memory regions found ==> Report main RAM page size */
	1469	return mainrampagesize;
	1470	}
	1471
	1472	/* If NUMA is disabled or the NUMA nodes are not backed with a
	1473	* memory-backend, then there is at least one node using "normal" RAM,
	1474	* so if its page size is smaller we have got to report that size instead.
	1475	*/
	1476	if (hpsize > mainrampagesize &&
	1477	(nb_numa_nodes == 0 \|\| numa_info[0].node_memdev == NULL)) {
	1478	static bool warned;
	1479	if (!warned) {
	1480	error_report("Huge page support disabled (n/a for main memory).");
	1481	warned = true;
	1482	}
	1483	return mainrampagesize;
	1484	}
	1485
	1486	return hpsize;
	1487	}
	1488	#else
	/* Fallback used when __linux__ is not defined: the host page size. */
	long qemu_getrampagesize(void)
	{
	    long host_page_size = getpagesize();

	    return host_page_size;
	}
	1493	#endif
	1494
	1495	#ifdef __linux__
	/*
	 * Return the offset lseek() reports for the end of @fd, or -errno if the
	 * seek fails.  Note that this moves the descriptor's file offset.
	 */
	static int64_t get_file_size(int fd)
	{
	    int64_t end_offset = lseek(fd, 0, SEEK_END);

	    return end_offset < 0 ? -errno : end_offset;
	}
	1504
	1505	static int file_ram_open(const char *path,
	1506	const char *region_name,
	1507	bool *created,
	1508	Error **errp)
	1509	{
	1510	char *filename;
	1511	char *sanitized_name;
	1512	char *c;
	1513	int fd = -1;
	1514
	1515	*created = false;
	1516	for (;;) {
	1517	fd = open(path, O_RDWR);
	1518	if (fd >= 0) {
	1519	/* @path names an existing file, use it */
	1520	break;
	1521	}
	1522	if (errno == ENOENT) {
	1523	/* @path names a file that doesn't exist, create it */
	1524	fd = open(path, O_RDWR \| O_CREAT \| O_EXCL, 0644);
	1525	if (fd >= 0) {
	1526	*created = true;
	1527	break;
	1528	}
	1529	} else if (errno == EISDIR) {
	1530	/* @path names a directory, create a file there */
	1531	/* Make name safe to use with mkstemp by replacing '/' with '_'. */
	1532	sanitized_name = g_strdup(region_name);
	1533	for (c = sanitized_name; *c != '\0'; c++) {
	1534	if (*c == '/') {
	1535	*c = '_';
	1536	}
	1537	}
	1538
	1539	filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
	1540	sanitized_name);
	1541	g_free(sanitized_name);
	1542
	1543	fd = mkstemp(filename);
	1544	if (fd >= 0) {
	1545	unlink(filename);
	1546	g_free(filename);
	1547	break;
	1548	}
	1549	g_free(filename);
	1550	}
	1551	if (errno != EEXIST && errno != EINTR) {
	1552	error_setg_errno(errp, errno,
	1553	"can't open backing store %s for guest RAM",
	1554	path);
	1555	return -1;
	1556	}
	1557	/*
	1558	* Try again on EINTR and EEXIST. The latter happens when
	1559	* something else creates the file between our two open().
	1560	*/
	1561	}
	1562
	1563	return fd;
	1564	}
	1565
	1566	static void file_ram_alloc(RAMBlock block,
	1567	ram_addr_t memory,
	1568	int fd,
	1569	bool truncate,
	1570	Error **errp)
	1571	{
	1572	void *area;
	1573
	1574	block->page_size = qemu_fd_getpagesize(fd);
	1575	block->mr->align = block->page_size;
	1576	#if defined(__s390x__)
	1577	if (kvm_enabled()) {
	1578	block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
	1579	}
	1580	#endif
	1581
	1582	if (memory < block->page_size) {
	1583	error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
	1584	"or larger than page size 0x%zx",
	1585	memory, block->page_size);
	1586	return NULL;
	1587	}
	1588
	1589	memory = ROUND_UP(memory, block->page_size);
	1590
	1591	/*
	1592	* ftruncate is not supported by hugetlbfs in older
	1593	* hosts, so don't bother bailing out on errors.
	1594	* If anything goes wrong with it under other filesystems,
	1595	* mmap will fail.
	1596	*
	1597	* Do not truncate the non-empty backend file to avoid corrupting
	1598	* the existing data in the file. Disabling shrinking is not
	1599	* enough. For example, the current vNVDIMM implementation stores
	1600	* the guest NVDIMM labels at the end of the backend file. If the
	1601	* backend file is later extended, QEMU will not be able to find
	1602	* those labels. Therefore, extending the non-empty backend file
	1603	* is disabled as well.
	1604	*/
	1605	if (truncate && ftruncate(fd, memory)) {
	1606	perror("ftruncate");
	1607	}
	1608
	1609	area = qemu_ram_mmap(fd, memory, block->mr->align,
	1610	block->flags & RAM_SHARED);
	1611	if (area == MAP_FAILED) {
	1612	error_setg_errno(errp, errno,
	1613	"unable to map backing store for guest RAM");
	1614	return NULL;
	1615	}
	1616
	1617	if (mem_prealloc) {
	1618	os_mem_prealloc(fd, area, memory, smp_cpus, errp);
	1619	if (errp && *errp) {
	1620	qemu_ram_munmap(area, memory);
	1621	return NULL;
	1622	}
	1623	}
	1624
	1625	block->fd = fd;
	1626	return area;
	1627	}
	1628	#endif
	1629
	1630	/* Called with the ramlist lock held. */
	1631	static ram_addr_t find_ram_offset(ram_addr_t size)
	1632	{
	1633	RAMBlock block, next_block;
	1634	ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
	1635
	1636	assert(size != 0); /* it would hand out same offset multiple times */
	1637
	1638	if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
	1639	return 0;
	1640	}
	1641
	1642	RAMBLOCK_FOREACH(block) {
	1643	ram_addr_t end, next = RAM_ADDR_MAX;
	1644
	1645	end = block->offset + block->max_length;
	1646
	1647	RAMBLOCK_FOREACH(next_block) {
	1648	if (next_block->offset >= end) {
	1649	next = MIN(next, next_block->offset);
	1650	}
	1651	}
	1652	if (next - end >= size && next - end < mingap) {
	1653	offset = end;
	1654	mingap = next - end;
	1655	}
	1656	}
	1657
	1658	if (offset == RAM_ADDR_MAX) {
	1659	fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
	1660	(uint64_t)size);
	1661	abort();
	1662	}
	1663
	1664	return offset;
	1665	}
	1666
	1667	unsigned long last_ram_page(void)
	1668	{
	1669	RAMBlock *block;
	1670	ram_addr_t last = 0;
	1671
	1672	rcu_read_lock();
	1673	RAMBLOCK_FOREACH(block) {
	1674	last = MAX(last, block->offset + block->max_length);
	1675	}
	1676	rcu_read_unlock();
	1677	return last >> TARGET_PAGE_BITS;
	1678	}
	1679
	1680	static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
	1681	{
	1682	int ret;
	1683
	1684	/* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
	1685	if (!machine_dump_guest_core(current_machine)) {
	1686	ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
	1687	if (ret) {
	1688	perror("qemu_madvise");
	1689	fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
	1690	"but dump_guest_core=off specified\n");
	1691	}
	1692	}
	1693	}
	1694
	1695	const char qemu_ram_get_idstr(RAMBlock rb)
	1696	{
	1697	return rb->idstr;
	1698	}
	1699
	1700	bool qemu_ram_is_shared(RAMBlock *rb)
	1701	{
	1702	return rb->flags & RAM_SHARED;
	1703	}
	1704
	1705	/* Called with iothread lock held. */
	1706	void qemu_ram_set_idstr(RAMBlock new_block, const char name, DeviceState *dev)
	1707	{
	1708	RAMBlock *block;
	1709
	1710	assert(new_block);
	1711	assert(!new_block->idstr[0]);
	1712
	1713	if (dev) {
	1714	char *id = qdev_get_dev_path(dev);
	1715	if (id) {
	1716	snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
	1717	g_free(id);
	1718	}
	1719	}
	1720	pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
	1721
	1722	rcu_read_lock();
	1723	RAMBLOCK_FOREACH(block) {
	1724	if (block != new_block &&
	1725	!strcmp(block->idstr, new_block->idstr)) {
	1726	fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
	1727	new_block->idstr);
	1728	abort();
	1729	}
	1730	}
	1731	rcu_read_unlock();
	1732	}
	1733
	1734	/* Called with iothread lock held. */
	1735	void qemu_ram_unset_idstr(RAMBlock *block)
	1736	{
	1737	/* FIXME: arch_init.c assumes that this is not called throughout
	1738	* migration. Ignore the problem since hot-unplug during migration
	1739	* does not work anyway.
	1740	*/
	1741	if (block) {
	1742	memset(block->idstr, 0, sizeof(block->idstr));
	1743	}
	1744	}
	1745
	1746	size_t qemu_ram_pagesize(RAMBlock *rb)
	1747	{
	1748	return rb->page_size;
	1749	}
	1750
	1751	/* Returns the largest size of page in use */
	1752	size_t qemu_ram_pagesize_largest(void)
	1753	{
	1754	RAMBlock *block;
	1755	size_t largest = 0;
	1756
	1757	RAMBLOCK_FOREACH(block) {
	1758	largest = MAX(largest, qemu_ram_pagesize(block));
	1759	}
	1760
	1761	return largest;
	1762	}
	1763
	1764	static int memory_try_enable_merging(void *addr, size_t len)
	1765	{
	1766	if (!machine_mem_merge(current_machine)) {
	1767	/* disabled by the user */
	1768	return 0;
	1769	}
	1770
	1771	return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
	1772	}
	1773
	1774	/* Only legal before guest might have detected the memory size: e.g. on
	1775	* incoming migration, or right after reset.
	1776	*
	1777	* As memory core doesn't know how is memory accessed, it is up to
	1778	* resize callback to update device state and/or add assertions to detect
	1779	* misuse, if necessary.
	1780	*/
	1781	int qemu_ram_resize(RAMBlock block, ram_addr_t newsize, Error *errp)
	1782	{
	1783	assert(block);
	1784
	1785	newsize = HOST_PAGE_ALIGN(newsize);
	1786
	1787	if (block->used_length == newsize) {
	1788	return 0;
	1789	}
	1790
	1791	if (!(block->flags & RAM_RESIZEABLE)) {
	1792	error_setg_errno(errp, EINVAL,
	1793	"Length mismatch: %s: 0x" RAM_ADDR_FMT
	1794	" in != 0x" RAM_ADDR_FMT, block->idstr,
	1795	newsize, block->used_length);
	1796	return -EINVAL;
	1797	}
	1798
	1799	if (block->max_length < newsize) {
	1800	error_setg_errno(errp, EINVAL,
	1801	"Length too large: %s: 0x" RAM_ADDR_FMT
	1802	" > 0x" RAM_ADDR_FMT, block->idstr,
	1803	newsize, block->max_length);
	1804	return -EINVAL;
	1805	}
	1806
	1807	cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
	1808	block->used_length = newsize;
	1809	cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
	1810	DIRTY_CLIENTS_ALL);
	1811	memory_region_set_size(block->mr, newsize);
	1812	if (block->resized) {
	1813	block->resized(block->idstr, newsize, block->host);
	1814	}
	1815	return 0;
	1816	}
	1817
	1818	/* Called with ram_list.mutex held */
	1819	static void dirty_memory_extend(ram_addr_t old_ram_size,
	1820	ram_addr_t new_ram_size)
	1821	{
	1822	ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
	1823	DIRTY_MEMORY_BLOCK_SIZE);
	1824	ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
	1825	DIRTY_MEMORY_BLOCK_SIZE);
	1826	int i;
	1827
	1828	/* Only need to extend if block count increased */
	1829	if (new_num_blocks <= old_num_blocks) {
	1830	return;
	1831	}
	1832
	1833	for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
	1834	DirtyMemoryBlocks *old_blocks;
	1835	DirtyMemoryBlocks *new_blocks;
	1836	int j;
	1837
	1838	old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
	1839	new_blocks = g_malloc(sizeof(*new_blocks) +
	1840	sizeof(new_blocks->blocks[0]) * new_num_blocks);
	1841
	1842	if (old_num_blocks) {
	1843	memcpy(new_blocks->blocks, old_blocks->blocks,
	1844	old_num_blocks * sizeof(old_blocks->blocks[0]));
	1845	}
	1846
	1847	for (j = old_num_blocks; j < new_num_blocks; j++) {
	1848	new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
	1849	}
	1850
	1851	atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
	1852
	1853	if (old_blocks) {
	1854	g_free_rcu(old_blocks, rcu);
	1855	}
	1856	}
	1857	}
	1858
	1859	static void ram_block_add(RAMBlock new_block, Error *errp)
	1860	{
	1861	RAMBlock *block;
	1862	RAMBlock *last_block = NULL;
	1863	ram_addr_t old_ram_size, new_ram_size;
	1864	Error *err = NULL;
	1865
	1866	old_ram_size = last_ram_page();
	1867
	1868	qemu_mutex_lock_ramlist();
	1869	new_block->offset = find_ram_offset(new_block->max_length);
	1870
	1871	if (!new_block->host) {
	1872	if (xen_enabled()) {
	1873	xen_ram_alloc(new_block->offset, new_block->max_length,
	1874	new_block->mr, &err);
	1875	if (err) {
	1876	error_propagate(errp, err);
	1877	qemu_mutex_unlock_ramlist();
	1878	return;
	1879	}
	1880	} else {
	1881	new_block->host = phys_mem_alloc(new_block->max_length,
	1882	&new_block->mr->align);
	1883	if (!new_block->host) {
	1884	error_setg_errno(errp, errno,
	1885	"cannot set up guest memory '%s'",
	1886	memory_region_name(new_block->mr));
	1887	qemu_mutex_unlock_ramlist();
	1888	return;
	1889	}
	1890	memory_try_enable_merging(new_block->host, new_block->max_length);
	1891	}
	1892	}
	1893
	1894	new_ram_size = MAX(old_ram_size,
	1895	(new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
	1896	if (new_ram_size > old_ram_size) {
	1897	dirty_memory_extend(old_ram_size, new_ram_size);
	1898	}
	1899	/* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
	1900	* QLIST (which has an RCU-friendly variant) does not have insertion at
	1901	* tail, so save the last element in last_block.
	1902	*/
	1903	RAMBLOCK_FOREACH(block) {
	1904	last_block = block;
	1905	if (block->max_length < new_block->max_length) {
	1906	break;
	1907	}
	1908	}
	1909	if (block) {
	1910	QLIST_INSERT_BEFORE_RCU(block, new_block, next);
	1911	} else if (last_block) {
	1912	QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
	1913	} else { /* list is empty */
	1914	QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
	1915	}
	1916	ram_list.mru_block = NULL;
	1917
	1918	/* Write list before version */
	1919	smp_wmb();
	1920	ram_list.version++;
	1921	qemu_mutex_unlock_ramlist();
	1922
	1923	cpu_physical_memory_set_dirty_range(new_block->offset,
	1924	new_block->used_length,
	1925	DIRTY_CLIENTS_ALL);
	1926
	1927	if (new_block->host) {
	1928	qemu_ram_setup_dump(new_block->host, new_block->max_length);
	1929	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
	1930	/* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
	1931	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
	1932	ram_block_notify_add(new_block->host, new_block->max_length);
	1933	}
	1934	}
	1935
	1936	#ifdef __linux__
	1937	RAMBlock qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion mr,
	1938	bool share, int fd,
	1939	Error **errp)
	1940	{
	1941	RAMBlock *new_block;
	1942	Error *local_err = NULL;
	1943	int64_t file_size;
	1944
	1945	if (xen_enabled()) {
	1946	error_setg(errp, "-mem-path not supported with Xen");
	1947	return NULL;
	1948	}
	1949
	1950	if (kvm_enabled() && !kvm_has_sync_mmu()) {
	1951	error_setg(errp,
	1952	"host lacks kvm mmu notifiers, -mem-path unsupported");
	1953	return NULL;
	1954	}
	1955
	1956	if (phys_mem_alloc != qemu_anon_ram_alloc) {
	1957	/*
	1958	* file_ram_alloc() needs to allocate just like
	1959	* phys_mem_alloc, but we haven't bothered to provide
	1960	* a hook there.
	1961	*/
	1962	error_setg(errp,
	1963	"-mem-path not supported with this accelerator");
	1964	return NULL;
	1965	}
	1966
	1967	size = HOST_PAGE_ALIGN(size);
	1968	file_size = get_file_size(fd);
	1969	if (file_size > 0 && file_size < size) {
	1970	error_setg(errp, "backing store %s size 0x%" PRIx64
	1971	" does not match 'size' option 0x" RAM_ADDR_FMT,
	1972	mem_path, file_size, size);
	1973	return NULL;
	1974	}
	1975
	1976	new_block = g_malloc0(sizeof(*new_block));
	1977	new_block->mr = mr;
	1978	new_block->used_length = size;
	1979	new_block->max_length = size;
	1980	new_block->flags = share ? RAM_SHARED : 0;
	1981	new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
	1982	if (!new_block->host) {
	1983	g_free(new_block);
	1984	return NULL;
	1985	}
	1986
	1987	ram_block_add(new_block, &local_err);
	1988	if (local_err) {
	1989	g_free(new_block);
	1990	error_propagate(errp, local_err);
	1991	return NULL;
	1992	}
	1993	return new_block;
	1994
	1995	}
	1996
	1997
	1998	RAMBlock qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion mr,
	1999	bool share, const char *mem_path,
	2000	Error **errp)
	2001	{
	2002	int fd;
	2003	bool created;
	2004	RAMBlock *block;
	2005
	2006	fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
	2007	if (fd < 0) {
	2008	return NULL;
	2009	}
	2010
	2011	block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
	2012	if (!block) {
	2013	if (created) {
	2014	unlink(mem_path);
	2015	}
	2016	close(fd);
	2017	return NULL;
	2018	}
	2019
	2020	return block;
	2021	}
	2022	#endif
	2023
	2024	static
	2025	RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
	2026	void (resized)(const char,
	2027	uint64_t length,
	2028	void *host),
	2029	void *host, bool resizeable,
	2030	MemoryRegion mr, Error *errp)
	2031	{
	2032	RAMBlock *new_block;
	2033	Error *local_err = NULL;
	2034
	2035	size = HOST_PAGE_ALIGN(size);
	2036	max_size = HOST_PAGE_ALIGN(max_size);
	2037	new_block = g_malloc0(sizeof(*new_block));
	2038	new_block->mr = mr;
	2039	new_block->resized = resized;
	2040	new_block->used_length = size;
	2041	new_block->max_length = max_size;
	2042	assert(max_size >= size);
	2043	new_block->fd = -1;
	2044	new_block->page_size = getpagesize();
	2045	new_block->host = host;
	2046	if (host) {
	2047	new_block->flags \|= RAM_PREALLOC;
	2048	}
	2049	if (resizeable) {
	2050	new_block->flags \|= RAM_RESIZEABLE;
	2051	}
	2052	ram_block_add(new_block, &local_err);
	2053	if (local_err) {
	2054	g_free(new_block);
	2055	error_propagate(errp, local_err);
	2056	return NULL;
	2057	}
	2058	return new_block;
	2059	}
	2060
	2061	RAMBlock qemu_ram_alloc_from_ptr(ram_addr_t size, void host,
	2062	MemoryRegion mr, Error *errp)
	2063	{
	2064	return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
	2065	}
	2066
	2067	RAMBlock qemu_ram_alloc(ram_addr_t size, MemoryRegion mr, Error **errp)
	2068	{
	2069	return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
	2070	}
	2071
	2072	RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
	2073	void (resized)(const char,
	2074	uint64_t length,
	2075	void *host),
	2076	MemoryRegion mr, Error *errp)
	2077	{
	2078	return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
	2079	}
	2080
	2081	static void reclaim_ramblock(RAMBlock *block)
	2082	{
	2083	if (block->flags & RAM_PREALLOC) {
	2084	;
	2085	} else if (xen_enabled()) {
	2086	xen_invalidate_map_cache_entry(block->host);
	2087	#ifndef _WIN32
	2088	} else if (block->fd >= 0) {
	2089	qemu_ram_munmap(block->host, block->max_length);
	2090	close(block->fd);
	2091	#endif
	2092	} else {
	2093	qemu_anon_ram_free(block->host, block->max_length);
	2094	}
	2095	g_free(block);
	2096	}
	2097
	2098	void qemu_ram_free(RAMBlock *block)
	2099	{
	2100	if (!block) {
	2101	return;
	2102	}
	2103
	2104	if (block->host) {
	2105	ram_block_notify_remove(block->host, block->max_length);
	2106	}
	2107
	2108	qemu_mutex_lock_ramlist();
	2109	QLIST_REMOVE_RCU(block, next);
	2110	ram_list.mru_block = NULL;
	2111	/* Write list before version */
	2112	smp_wmb();
	2113	ram_list.version++;
	2114	call_rcu(block, reclaim_ramblock, rcu);
	2115	qemu_mutex_unlock_ramlist();
	2116	}
	2117
	2118	#ifndef _WIN32
	2119	void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
	2120	{
	2121	RAMBlock *block;
	2122	ram_addr_t offset;
	2123	int flags;
	2124	void area, vaddr;
	2125
	2126	RAMBLOCK_FOREACH(block) {
	2127	offset = addr - block->offset;
	2128	if (offset < block->max_length) {
	2129	vaddr = ramblock_ptr(block, offset);
	2130	if (block->flags & RAM_PREALLOC) {
	2131	;
	2132	} else if (xen_enabled()) {
	2133	abort();
	2134	} else {
	2135	flags = MAP_FIXED;
	2136	if (block->fd >= 0) {
	2137	flags \|= (block->flags & RAM_SHARED ?
	2138	MAP_SHARED : MAP_PRIVATE);
	2139	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2140	flags, block->fd, offset);
	2141	} else {
	2142	/*
	2143	* Remap needs to match alloc. Accelerators that
	2144	* set phys_mem_alloc never remap. If they did,
	2145	* we'd need a remap hook here.
	2146	*/
	2147	assert(phys_mem_alloc == qemu_anon_ram_alloc);
	2148
	2149	flags \|= MAP_PRIVATE \| MAP_ANONYMOUS;
	2150	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2151	flags, -1, 0);
	2152	}
	2153	if (area != vaddr) {
	2154	fprintf(stderr, "Could not remap addr: "
	2155	RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
	2156	length, addr);
	2157	exit(1);
	2158	}
	2159	memory_try_enable_merging(vaddr, length);
	2160	qemu_ram_setup_dump(vaddr, length);
	2161	}
	2162	}
	2163	}
	2164	}
	2165	#endif /* !_WIN32 */
	2166
	2167	/* Return a host pointer to ram allocated with qemu_ram_alloc.
	2168	* This should not be used for general purpose DMA. Use address_space_map
	2169	* or address_space_rw instead. For local memory (e.g. video ram) that the
	2170	* device owns, use memory_region_get_ram_ptr.
	2171	*
	2172	* Called within RCU critical section.
	2173	*/
	2174	void qemu_map_ram_ptr(RAMBlock ram_block, ram_addr_t addr)
	2175	{
	2176	RAMBlock *block = ram_block;
	2177
	2178	if (block == NULL) {
	2179	block = qemu_get_ram_block(addr);
	2180	addr -= block->offset;
	2181	}
	2182
	2183	if (xen_enabled() && block->host == NULL) {
	2184	/* We need to check if the requested address is in the RAM
	2185	* because we don't want to map the entire memory in QEMU.
	2186	* In that case just map until the end of the page.
	2187	*/
	2188	if (block->offset == 0) {
	2189	return xen_map_cache(addr, 0, 0, false);
	2190	}
	2191
	2192	block->host = xen_map_cache(block->offset, block->max_length, 1, false);
	2193	}
	2194	return ramblock_ptr(block, addr);
	2195	}
	2196
	2197	/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
	2198	* but takes a size argument.
	2199	*
	2200	* Called within RCU critical section.
	2201	*/
	2202	static void qemu_ram_ptr_length(RAMBlock ram_block, ram_addr_t addr,
	2203	hwaddr *size)
	2204	{
	2205	RAMBlock *block = ram_block;
	2206	if (*size == 0) {
	2207	return NULL;
	2208	}
	2209
	2210	if (block == NULL) {
	2211	block = qemu_get_ram_block(addr);
	2212	addr -= block->offset;
	2213	}
	2214	size = MIN(size, block->max_length - addr);
	2215
	2216	if (xen_enabled() && block->host == NULL) {
	2217	/* We need to check if the requested address is in the RAM
	2218	* because we don't want to map the entire memory in QEMU.
	2219	* In that case just map the requested area.
	2220	*/
	2221	if (block->offset == 0) {
	2222	return xen_map_cache(addr, *size, 1, true);
	2223	}
	2224
	2225	block->host = xen_map_cache(block->offset, block->max_length, 1, true);
	2226	}
	2227
	2228	return ramblock_ptr(block, addr);
	2229	}
	2230
	2231	/*
	2232	* Translates a host ptr back to a RAMBlock, a ram_addr and an offset
	2233	* in that RAMBlock.
	2234	*
	2235	* ptr: Host pointer to look up
	2236	* round_offset: If true round the result offset down to a page boundary
	2237	* *ram_addr: set to result ram_addr
	2238	* *offset: set to result offset within the RAMBlock
	2239	*
	2240	* Returns: RAMBlock (or NULL if not found)
	2241	*
	2242	* By the time this function returns, the returned pointer is not protected
	2243	* by RCU anymore. If the caller is not within an RCU critical section and
	2244	* does not hold the iothread lock, it must have other means of protecting the
	2245	* pointer, such as a reference to the region that includes the incoming
	2246	* ram_addr_t.
	2247	*/
	2248	RAMBlock qemu_ram_block_from_host(void ptr, bool round_offset,
	2249	ram_addr_t *offset)
	2250	{
	2251	RAMBlock *block;
	2252	uint8_t *host = ptr;
	2253
	2254	if (xen_enabled()) {
	2255	ram_addr_t ram_addr;
	2256	rcu_read_lock();
	2257	ram_addr = xen_ram_addr_from_mapcache(ptr);
	2258	block = qemu_get_ram_block(ram_addr);
	2259	if (block) {
	2260	*offset = ram_addr - block->offset;
	2261	}
	2262	rcu_read_unlock();
	2263	return block;
	2264	}
	2265
	2266	rcu_read_lock();
	2267	block = atomic_rcu_read(&ram_list.mru_block);
	2268	if (block && block->host && host - block->host < block->max_length) {
	2269	goto found;
	2270	}
	2271
	2272	RAMBLOCK_FOREACH(block) {
	2273	/* This case append when the block is not mapped. */
	2274	if (block->host == NULL) {
	2275	continue;
	2276	}
	2277	if (host - block->host < block->max_length) {
	2278	goto found;
	2279	}
	2280	}
	2281
	2282	rcu_read_unlock();
	2283	return NULL;
	2284
	2285	found:
	2286	*offset = (host - block->host);
	2287	if (round_offset) {
	2288	*offset &= TARGET_PAGE_MASK;
	2289	}
	2290	rcu_read_unlock();
	2291	return block;
	2292	}
	2293
	2294	/*
	2295	* Finds the named RAMBlock
	2296	*
	2297	* name: The name of RAMBlock to find
	2298	*
	2299	* Returns: RAMBlock (or NULL if not found)
	2300	*/
	2301	RAMBlock qemu_ram_block_by_name(const char name)
	2302	{
	2303	RAMBlock *block;
	2304
	2305	RAMBLOCK_FOREACH(block) {
	2306	if (!strcmp(name, block->idstr)) {
	2307	return block;
	2308	}
	2309	}
	2310
	2311	return NULL;
	2312	}
	2313
	2314	/* Some of the softmmu routines need to translate from a host pointer
	2315	(typically a TLB entry) back to a ram offset. */
	2316	ram_addr_t qemu_ram_addr_from_host(void *ptr)
	2317	{
	2318	RAMBlock *block;
	2319	ram_addr_t offset;
	2320
	2321	block = qemu_ram_block_from_host(ptr, false, &offset);
	2322	if (!block) {
	2323	return RAM_ADDR_INVALID;
	2324	}
	2325
	2326	return block->offset + offset;
	2327	}
	2328
	2329	/* Called within RCU critical section. */
	2330	static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
	2331	uint64_t val, unsigned size)
	2332	{
	2333	bool locked = false;
	2334
	2335	assert(tcg_enabled());
	2336	if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
	2337	locked = true;
	2338	tb_lock();
	2339	tb_invalidate_phys_page_fast(ram_addr, size);
	2340	}
	2341	switch (size) {
	2342	case 1:
	2343	stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2344	break;
	2345	case 2:
	2346	stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2347	break;
	2348	case 4:
	2349	stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2350	break;
	2351	default:
	2352	abort();
	2353	}
	2354
	2355	if (locked) {
	2356	tb_unlock();
	2357	}
	2358
	2359	/* Set both VGA and migration bits for simplicity and to remove
	2360	* the notdirty callback faster.
	2361	*/
	2362	cpu_physical_memory_set_dirty_range(ram_addr, size,
	2363	DIRTY_CLIENTS_NOCODE);
	2364	/* we remove the notdirty callback only if the code has been
	2365	flushed */
	2366	if (!cpu_physical_memory_is_clean(ram_addr)) {
	2367	tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
	2368	}
	2369	}
	2370
	2371	static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
	2372	unsigned size, bool is_write)
	2373	{
	2374	return is_write;
	2375	}
	2376
	2377	static const MemoryRegionOps notdirty_mem_ops = {
	2378	.write = notdirty_mem_write,
	2379	.valid.accepts = notdirty_mem_accepts,
	2380	.endianness = DEVICE_NATIVE_ENDIAN,
	2381	};
	2382
	2383	/* Generate a debug exception if a watchpoint has been hit. */
	2384	static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
	2385	{
	2386	CPUState *cpu = current_cpu;
	2387	CPUClass *cc = CPU_GET_CLASS(cpu);
	2388	CPUArchState *env = cpu->env_ptr;
	2389	target_ulong pc, cs_base;
	2390	target_ulong vaddr;
	2391	CPUWatchpoint *wp;
	2392	uint32_t cpu_flags;
	2393
	2394	assert(tcg_enabled());
	2395	if (cpu->watchpoint_hit) {
	2396	/* We re-entered the check after replacing the TB. Now raise
	2397	* the debug interrupt so that is will trigger after the
	2398	* current instruction. */
	2399	cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
	2400	return;
	2401	}
	2402	vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
	2403	vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
	2404	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	2405	if (cpu_watchpoint_address_matches(wp, vaddr, len)
	2406	&& (wp->flags & flags)) {
	2407	if (flags == BP_MEM_READ) {
	2408	wp->flags \|= BP_WATCHPOINT_HIT_READ;
	2409	} else {
	2410	wp->flags \|= BP_WATCHPOINT_HIT_WRITE;
	2411	}
	2412	wp->hitaddr = vaddr;
	2413	wp->hitattrs = attrs;
	2414	if (!cpu->watchpoint_hit) {
	2415	if (wp->flags & BP_CPU &&
	2416	!cc->debug_check_watchpoint(cpu, wp)) {
	2417	wp->flags &= ~BP_WATCHPOINT_HIT;
	2418	continue;
	2419	}
	2420	cpu->watchpoint_hit = wp;
	2421
	2422	/* Both tb_lock and iothread_mutex will be reset when
	2423	* cpu_loop_exit or cpu_loop_exit_noexc longjmp
	2424	* back into the cpu_exec main loop.
	2425	*/
	2426	tb_lock();
	2427	tb_check_watchpoint(cpu);
	2428	if (wp->flags & BP_STOP_BEFORE_ACCESS) {
	2429	cpu->exception_index = EXCP_DEBUG;
	2430	cpu_loop_exit(cpu);
	2431	} else {
	2432	cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
	2433	tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
	2434	cpu_loop_exit_noexc(cpu);
	2435	}
	2436	}
	2437	} else {
	2438	wp->flags &= ~BP_WATCHPOINT_HIT;
	2439	}
	2440	}
	2441	}
	2442
	2443	/* Watchpoint access routines. Watchpoints are inserted using TLB tricks,
	2444	so these check for a hit then pass through to the normal out-of-line
	2445	phys routines. */
	2446	static MemTxResult watch_mem_read(void opaque, hwaddr addr, uint64_t pdata,
	2447	unsigned size, MemTxAttrs attrs)
	2448	{
	2449	MemTxResult res;
	2450	uint64_t data;
	2451	int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
	2452	AddressSpace *as = current_cpu->cpu_ases[asidx].as;
	2453
	2454	check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
	2455	switch (size) {
	2456	case 1:
	2457	data = address_space_ldub(as, addr, attrs, &res);
	2458	break;
	2459	case 2:
	2460	data = address_space_lduw(as, addr, attrs, &res);
	2461	break;
	2462	case 4:
	2463	data = address_space_ldl(as, addr, attrs, &res);
	2464	break;
	2465	default: abort();
	2466	}
	2467	*pdata = data;
	2468	return res;
	2469	}
	2470
	2471	static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
	2472	uint64_t val, unsigned size,
	2473	MemTxAttrs attrs)
	2474	{
	2475	MemTxResult res;
	2476	int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
	2477	AddressSpace *as = current_cpu->cpu_ases[asidx].as;
	2478
	2479	check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
	2480	switch (size) {
	2481	case 1:
	2482	address_space_stb(as, addr, val, attrs, &res);
	2483	break;
	2484	case 2:
	2485	address_space_stw(as, addr, val, attrs, &res);
	2486	break;
	2487	case 4:
	2488	address_space_stl(as, addr, val, attrs, &res);
	2489	break;
	2490	default: abort();
	2491	}
	2492	return res;
	2493	}
	2494
	2495	static const MemoryRegionOps watch_mem_ops = {
	2496	.read_with_attrs = watch_mem_read,
	2497	.write_with_attrs = watch_mem_write,
	2498	.endianness = DEVICE_NATIVE_ENDIAN,
	2499	};
	2500
	2501	static MemTxResult subpage_read(void opaque, hwaddr addr, uint64_t data,
	2502	unsigned len, MemTxAttrs attrs)
	2503	{
	2504	subpage_t *subpage = opaque;
	2505	uint8_t buf[8];
	2506	MemTxResult res;
	2507
	2508	#if defined(DEBUG_SUBPAGE)
	2509	printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
	2510	subpage, len, addr);
	2511	#endif
	2512	res = address_space_read(subpage->as, addr + subpage->base,
	2513	attrs, buf, len);
	2514	if (res) {
	2515	return res;
	2516	}
	2517	switch (len) {
	2518	case 1:
	2519	*data = ldub_p(buf);
	2520	return MEMTX_OK;
	2521	case 2:
	2522	*data = lduw_p(buf);
	2523	return MEMTX_OK;
	2524	case 4:
	2525	*data = ldl_p(buf);
	2526	return MEMTX_OK;
	2527	case 8:
	2528	*data = ldq_p(buf);
	2529	return MEMTX_OK;
	2530	default:
	2531	abort();
	2532	}
	2533	}
	2534
	2535	static MemTxResult subpage_write(void *opaque, hwaddr addr,
	2536	uint64_t value, unsigned len, MemTxAttrs attrs)
	2537	{
	2538	subpage_t *subpage = opaque;
	2539	uint8_t buf[8];
	2540
	2541	#if defined(DEBUG_SUBPAGE)
	2542	printf("%s: subpage %p len %u addr " TARGET_FMT_plx
	2543	" value %"PRIx64"\n",
	2544	__func__, subpage, len, addr, value);
	2545	#endif
	2546	switch (len) {
	2547	case 1:
	2548	stb_p(buf, value);
	2549	break;
	2550	case 2:
	2551	stw_p(buf, value);
	2552	break;
	2553	case 4:
	2554	stl_p(buf, value);
	2555	break;
	2556	case 8:
	2557	stq_p(buf, value);
	2558	break;
	2559	default:
	2560	abort();
	2561	}
	2562	return address_space_write(subpage->as, addr + subpage->base,
	2563	attrs, buf, len);
	2564	}
	2565
	2566	static bool subpage_accepts(void *opaque, hwaddr addr,
	2567	unsigned len, bool is_write)
	2568	{
	2569	subpage_t *subpage = opaque;
	2570	#if defined(DEBUG_SUBPAGE)
	2571	printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
	2572	__func__, subpage, is_write ? 'w' : 'r', len, addr);
	2573	#endif
	2574
	2575	return address_space_access_valid(subpage->as, addr + subpage->base,
	2576	len, is_write);
	2577	}
	2578
	2579	static const MemoryRegionOps subpage_ops = {
	2580	.read_with_attrs = subpage_read,
	2581	.write_with_attrs = subpage_write,
	2582	.impl.min_access_size = 1,
	2583	.impl.max_access_size = 8,
	2584	.valid.min_access_size = 1,
	2585	.valid.max_access_size = 8,
	2586	.valid.accepts = subpage_accepts,
	2587	.endianness = DEVICE_NATIVE_ENDIAN,
	2588	};
	2589
	2590	static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
	2591	uint16_t section)
	2592	{
	2593	int idx, eidx;
	2594
	2595	if (start >= TARGET_PAGE_SIZE \|\| end >= TARGET_PAGE_SIZE)
	2596	return -1;
	2597	idx = SUBPAGE_IDX(start);
	2598	eidx = SUBPAGE_IDX(end);
	2599	#if defined(DEBUG_SUBPAGE)
	2600	printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
	2601	__func__, mmio, start, end, idx, eidx, section);
	2602	#endif
	2603	for (; idx <= eidx; idx++) {
	2604	mmio->sub_section[idx] = section;
	2605	}
	2606
	2607	return 0;
	2608	}
	2609
	2610	static subpage_t subpage_init(AddressSpace as, hwaddr base)
	2611	{
	2612	subpage_t *mmio;
	2613
	2614	mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
	2615	mmio->as = as;
	2616	mmio->base = base;
	2617	memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
	2618	NULL, TARGET_PAGE_SIZE);
	2619	mmio->iomem.subpage = true;
	2620	#if defined(DEBUG_SUBPAGE)
	2621	printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
	2622	mmio, base, TARGET_PAGE_SIZE);
	2623	#endif
	2624	subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
	2625
	2626	return mmio;
	2627	}
	2628
	2629	static uint16_t dummy_section(PhysPageMap map, AddressSpace as,
	2630	MemoryRegion *mr)
	2631	{
	2632	assert(as);
	2633	MemoryRegionSection section = {
	2634	.address_space = as,
	2635	.mr = mr,
	2636	.offset_within_address_space = 0,
	2637	.offset_within_region = 0,
	2638	.size = int128_2_64(),
	2639	};
	2640
	2641	return phys_section_add(map, &section);
	2642	}
	2643
	2644	MemoryRegion iotlb_to_region(CPUState cpu, hwaddr index, MemTxAttrs attrs)
	2645	{
	2646	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	2647	CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
	2648	AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
	2649	MemoryRegionSection *sections = d->map.sections;
	2650
	2651	return sections[index & ~TARGET_PAGE_MASK].mr;
	2652	}
	2653
	2654	static void io_mem_init(void)
	2655	{
	2656	memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
	2657	memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
	2658	NULL, UINT64_MAX);
	2659
	2660	/* io_mem_notdirty calls tb_invalidate_phys_page_fast,
	2661	* which can be called without the iothread mutex.
	2662	*/
	2663	memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
	2664	NULL, UINT64_MAX);
	2665	memory_region_clear_global_locking(&io_mem_notdirty);
	2666
	2667	memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
	2668	NULL, UINT64_MAX);
	2669	}
	2670
	2671	static void mem_begin(MemoryListener *listener)
	2672	{
	2673	AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
	2674	AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
	2675	uint16_t n;
	2676
	2677	n = dummy_section(&d->map, as, &io_mem_unassigned);
	2678	assert(n == PHYS_SECTION_UNASSIGNED);
	2679	n = dummy_section(&d->map, as, &io_mem_notdirty);
	2680	assert(n == PHYS_SECTION_NOTDIRTY);
	2681	n = dummy_section(&d->map, as, &io_mem_rom);
	2682	assert(n == PHYS_SECTION_ROM);
	2683	n = dummy_section(&d->map, as, &io_mem_watch);
	2684	assert(n == PHYS_SECTION_WATCH);
	2685
	2686	d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
	2687	d->as = as;
	2688	as->next_dispatch = d;
	2689	}
	2690
	2691	static void address_space_dispatch_free(AddressSpaceDispatch *d)
	2692	{
	2693	phys_sections_free(&d->map);
	2694	g_free(d);
	2695	}
	2696
	2697	static void mem_commit(MemoryListener *listener)
	2698	{
	2699	AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
	2700	AddressSpaceDispatch *cur = as->dispatch;
	2701	AddressSpaceDispatch *next = as->next_dispatch;
	2702
	2703	phys_page_compact_all(next, next->map.nodes_nb);
	2704
	2705	atomic_rcu_set(&as->dispatch, next);
	2706	if (cur) {
	2707	call_rcu(cur, address_space_dispatch_free, rcu);
	2708	}
	2709	}
	2710
	2711	static void tcg_commit(MemoryListener *listener)
	2712	{
	2713	CPUAddressSpace *cpuas;
	2714	AddressSpaceDispatch *d;
	2715
	2716	/* since each CPU stores ram addresses in its TLB cache, we must
	2717	reset the modified entries */
	2718	cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
	2719	cpu_reloading_memory_map();
	2720	/* The CPU and TLB are protected by the iothread lock.
	2721	* We reload the dispatch pointer now because cpu_reloading_memory_map()
	2722	* may have split the RCU critical section.
	2723	*/
	2724	d = atomic_rcu_read(&cpuas->as->dispatch);
	2725	atomic_rcu_set(&cpuas->memory_dispatch, d);
	2726	tlb_flush(cpuas->cpu);
	2727	}
	2728
	2729	void address_space_init_dispatch(AddressSpace *as)
	2730	{
	2731	as->dispatch = NULL;
	2732	as->dispatch_listener = (MemoryListener) {
	2733	.begin = mem_begin,
	2734	.commit = mem_commit,
	2735	.region_add = mem_add,
	2736	.region_nop = mem_add,
	2737	.priority = 0,
	2738	};
	2739	memory_listener_register(&as->dispatch_listener, as);
	2740	}
	2741
	2742	void address_space_unregister(AddressSpace *as)
	2743	{
	2744	memory_listener_unregister(&as->dispatch_listener);
	2745	}
	2746
	2747	void address_space_destroy_dispatch(AddressSpace *as)
	2748	{
	2749	AddressSpaceDispatch *d = as->dispatch;
	2750
	2751	atomic_rcu_set(&as->dispatch, NULL);
	2752	if (d) {
	2753	call_rcu(d, address_space_dispatch_free, rcu);
	2754	}
	2755	}
	2756
	2757	static void memory_map_init(void)
	2758	{
	2759	system_memory = g_malloc(sizeof(*system_memory));
	2760
	2761	memory_region_init(system_memory, NULL, "system", UINT64_MAX);
	2762	address_space_init(&address_space_memory, system_memory, "memory");
	2763
	2764	system_io = g_malloc(sizeof(*system_io));
	2765	memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
	2766	65536);
	2767	address_space_init(&address_space_io, system_io, "I/O");
	2768	}
	2769
	2770	MemoryRegion *get_system_memory(void)
	2771	{
	2772	return system_memory;
	2773	}
	2774
	2775	MemoryRegion *get_system_io(void)
	2776	{
	2777	return system_io;
	2778	}
	2779
	2780	#endif /* !defined(CONFIG_USER_ONLY) */
	2781
	2782	/* physical memory access (slow version, mainly for debug) */
	2783	#if defined(CONFIG_USER_ONLY)
	2784	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	2785	uint8_t *buf, int len, int is_write)
	2786	{
	2787	int l, flags;
	2788	target_ulong page;
	2789	void * p;
	2790
	2791	while (len > 0) {
	2792	page = addr & TARGET_PAGE_MASK;
	2793	l = (page + TARGET_PAGE_SIZE) - addr;
	2794	if (l > len)
	2795	l = len;
	2796	flags = page_get_flags(page);
	2797	if (!(flags & PAGE_VALID))
	2798	return -1;
	2799	if (is_write) {
	2800	if (!(flags & PAGE_WRITE))
	2801	return -1;
	2802	/* XXX: this code should not depend on lock_user */
	2803	if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
	2804	return -1;
	2805	memcpy(p, buf, l);
	2806	unlock_user(p, addr, l);
	2807	} else {
	2808	if (!(flags & PAGE_READ))
	2809	return -1;
	2810	/* XXX: this code should not depend on lock_user */
	2811	if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
	2812	return -1;
	2813	memcpy(buf, p, l);
	2814	unlock_user(p, addr, 0);
	2815	}
	2816	len -= l;
	2817	buf += l;
	2818	addr += l;
	2819	}
	2820	return 0;
	2821	}
	2822
	2823	#else
	2824
	2825	static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
	2826	hwaddr length)
	2827	{
	2828	uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
	2829	addr += memory_region_get_ram_addr(mr);
	2830
	2831	/* No early return if dirty_log_mask is or becomes 0, because
	2832	* cpu_physical_memory_set_dirty_range will still call
	2833	* xen_modified_memory.
	2834	*/
	2835	if (dirty_log_mask) {
	2836	dirty_log_mask =
	2837	cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
	2838	}
	2839	if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
	2840	assert(tcg_enabled());
	2841	tb_lock();
	2842	tb_invalidate_phys_range(addr, addr + length);
	2843	tb_unlock();
	2844	dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
	2845	}
	2846	cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
	2847	}
	2848
	2849	static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
	2850	{
	2851	unsigned access_size_max = mr->ops->valid.max_access_size;
	2852
	2853	/* Regions are assumed to support 1-4 byte accesses unless
	2854	otherwise specified. */
	2855	if (access_size_max == 0) {
	2856	access_size_max = 4;
	2857	}
	2858
	2859	/* Bound the maximum access by the alignment of the address. */
	2860	if (!mr->ops->impl.unaligned) {
	2861	unsigned align_size_max = addr & -addr;
	2862	if (align_size_max != 0 && align_size_max < access_size_max) {
	2863	access_size_max = align_size_max;
	2864	}
	2865	}
	2866
	2867	/* Don't attempt accesses larger than the maximum. */
	2868	if (l > access_size_max) {
	2869	l = access_size_max;
	2870	}
	2871	l = pow2floor(l);
	2872
	2873	return l;
	2874	}
	2875
	2876	static bool prepare_mmio_access(MemoryRegion *mr)
	2877	{
	2878	bool unlocked = !qemu_mutex_iothread_locked();
	2879	bool release_lock = false;
	2880
	2881	if (unlocked && mr->global_locking) {
	2882	qemu_mutex_lock_iothread();
	2883	unlocked = false;
	2884	release_lock = true;
	2885	}
	2886	if (mr->flush_coalesced_mmio) {
	2887	if (unlocked) {
	2888	qemu_mutex_lock_iothread();
	2889	}
	2890	qemu_flush_coalesced_mmio_buffer();
	2891	if (unlocked) {
	2892	qemu_mutex_unlock_iothread();
	2893	}
	2894	}
	2895
	2896	return release_lock;
	2897	}
	2898
	2899	/* Called within RCU critical section. */
	2900	static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
	2901	MemTxAttrs attrs,
	2902	const uint8_t *buf,
	2903	int len, hwaddr addr1,
	2904	hwaddr l, MemoryRegion *mr)
	2905	{
	2906	uint8_t *ptr;
	2907	uint64_t val;
	2908	MemTxResult result = MEMTX_OK;
	2909	bool release_lock = false;
	2910
	2911	for (;;) {
	2912	if (!memory_access_is_direct(mr, true)) {
	2913	release_lock \|= prepare_mmio_access(mr);
	2914	l = memory_access_size(mr, l, addr1);
	2915	/* XXX: could force current_cpu to NULL to avoid
	2916	potential bugs */
	2917	switch (l) {
	2918	case 8:
	2919	/* 64 bit write access */
	2920	val = ldq_p(buf);
	2921	result \|= memory_region_dispatch_write(mr, addr1, val, 8,
	2922	attrs);
	2923	break;
	2924	case 4:
	2925	/* 32 bit write access */
	2926	val = (uint32_t)ldl_p(buf);
	2927	result \|= memory_region_dispatch_write(mr, addr1, val, 4,
	2928	attrs);
	2929	break;
	2930	case 2:
	2931	/* 16 bit write access */
	2932	val = lduw_p(buf);
	2933	result \|= memory_region_dispatch_write(mr, addr1, val, 2,
	2934	attrs);
	2935	break;
	2936	case 1:
	2937	/* 8 bit write access */
	2938	val = ldub_p(buf);
	2939	result \|= memory_region_dispatch_write(mr, addr1, val, 1,
	2940	attrs);
	2941	break;
	2942	default:
	2943	abort();
	2944	}
	2945	} else {
	2946	/* RAM case */
	2947	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l);
	2948	memcpy(ptr, buf, l);
	2949	invalidate_and_set_dirty(mr, addr1, l);
	2950	}
	2951
	2952	if (release_lock) {
	2953	qemu_mutex_unlock_iothread();
	2954	release_lock = false;
	2955	}
	2956
	2957	len -= l;
	2958	buf += l;
	2959	addr += l;
	2960
	2961	if (!len) {
	2962	break;
	2963	}
	2964
	2965	l = len;
	2966	mr = address_space_translate(as, addr, &addr1, &l, true);
	2967	}
	2968
	2969	return result;
	2970	}
	2971
	2972	MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
	2973	const uint8_t *buf, int len)
	2974	{
	2975	hwaddr l;
	2976	hwaddr addr1;
	2977	MemoryRegion *mr;
	2978	MemTxResult result = MEMTX_OK;
	2979
	2980	if (len > 0) {
	2981	rcu_read_lock();
	2982	l = len;
	2983	mr = address_space_translate(as, addr, &addr1, &l, true);
	2984	result = address_space_write_continue(as, addr, attrs, buf, len,
	2985	addr1, l, mr);
	2986	rcu_read_unlock();
	2987	}
	2988
	2989	return result;
	2990	}
	2991
	2992	/* Called within RCU critical section. */
	2993	MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
	2994	MemTxAttrs attrs, uint8_t *buf,
	2995	int len, hwaddr addr1, hwaddr l,
	2996	MemoryRegion *mr)
	2997	{
	2998	uint8_t *ptr;
	2999	uint64_t val;
	3000	MemTxResult result = MEMTX_OK;
	3001	bool release_lock = false;
	3002
	3003	for (;;) {
	3004	if (!memory_access_is_direct(mr, false)) {
	3005	/* I/O case */
	3006	release_lock \|= prepare_mmio_access(mr);
	3007	l = memory_access_size(mr, l, addr1);
	3008	switch (l) {
	3009	case 8:
	3010	/* 64 bit read access */
	3011	result \|= memory_region_dispatch_read(mr, addr1, &val, 8,
	3012	attrs);
	3013	stq_p(buf, val);
	3014	break;
	3015	case 4:
	3016	/* 32 bit read access */
	3017	result \|= memory_region_dispatch_read(mr, addr1, &val, 4,
	3018	attrs);
	3019	stl_p(buf, val);
	3020	break;
	3021	case 2:
	3022	/* 16 bit read access */
	3023	result \|= memory_region_dispatch_read(mr, addr1, &val, 2,
	3024	attrs);
	3025	stw_p(buf, val);
	3026	break;
	3027	case 1:
	3028	/* 8 bit read access */
	3029	result \|= memory_region_dispatch_read(mr, addr1, &val, 1,
	3030	attrs);
	3031	stb_p(buf, val);
	3032	break;
	3033	default:
	3034	abort();
	3035	}
	3036	} else {
	3037	/* RAM case */
	3038	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l);
	3039	memcpy(buf, ptr, l);
	3040	}
	3041
	3042	if (release_lock) {
	3043	qemu_mutex_unlock_iothread();
	3044	release_lock = false;
	3045	}
	3046
	3047	len -= l;
	3048	buf += l;
	3049	addr += l;
	3050
	3051	if (!len) {
	3052	break;
	3053	}
	3054
	3055	l = len;
	3056	mr = address_space_translate(as, addr, &addr1, &l, false);
	3057	}
	3058
	3059	return result;
	3060	}
	3061
	3062	MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
	3063	MemTxAttrs attrs, uint8_t *buf, int len)
	3064	{
	3065	hwaddr l;
	3066	hwaddr addr1;
	3067	MemoryRegion *mr;
	3068	MemTxResult result = MEMTX_OK;
	3069
	3070	if (len > 0) {
	3071	rcu_read_lock();
	3072	l = len;
	3073	mr = address_space_translate(as, addr, &addr1, &l, false);
	3074	result = address_space_read_continue(as, addr, attrs, buf, len,
	3075	addr1, l, mr);
	3076	rcu_read_unlock();
	3077	}
	3078
	3079	return result;
	3080	}
	3081
	3082	MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
	3083	uint8_t *buf, int len, bool is_write)
	3084	{
	3085	if (is_write) {
	3086	return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
	3087	} else {
	3088	return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
	3089	}
	3090	}
	3091
	3092	void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
	3093	int len, int is_write)
	3094	{
	3095	address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
	3096	buf, len, is_write);
	3097	}
	3098
	3099	enum write_rom_type {
	3100	WRITE_DATA,
	3101	FLUSH_CACHE,
	3102	};
	3103
	3104	static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
	3105	hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
	3106	{
	3107	hwaddr l;
	3108	uint8_t *ptr;
	3109	hwaddr addr1;
	3110	MemoryRegion *mr;
	3111
	3112	rcu_read_lock();
	3113	while (len > 0) {
	3114	l = len;
	3115	mr = address_space_translate(as, addr, &addr1, &l, true);
	3116
	3117	if (!(memory_region_is_ram(mr) \|\|
	3118	memory_region_is_romd(mr))) {
	3119	l = memory_access_size(mr, l, addr1);
	3120	} else {
	3121	/* ROM/RAM case */
	3122	ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
	3123	switch (type) {
	3124	case WRITE_DATA:
	3125	memcpy(ptr, buf, l);
	3126	invalidate_and_set_dirty(mr, addr1, l);
	3127	break;
	3128	case FLUSH_CACHE:
	3129	flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
	3130	break;
	3131	}
	3132	}
	3133	len -= l;
	3134	buf += l;
	3135	addr += l;
	3136	}
	3137	rcu_read_unlock();
	3138	}
	3139
	3140	/* used for ROM loading : can write in RAM and ROM */
	3141	void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
	3142	const uint8_t *buf, int len)
	3143	{
	3144	cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
	3145	}
	3146
	3147	void cpu_flush_icache_range(hwaddr start, int len)
	3148	{
	3149	/*
	3150	* This function should do the same thing as an icache flush that was
	3151	* triggered from within the guest. For TCG we are always cache coherent,
	3152	* so there is no need to flush anything. For KVM / Xen we need to flush
	3153	* the host's instruction cache at least.
	3154	*/
	3155	if (tcg_enabled()) {
	3156	return;
	3157	}
	3158
	3159	cpu_physical_memory_write_rom_internal(&address_space_memory,
	3160	start, NULL, len, FLUSH_CACHE);
	3161	}
	3162
	3163	typedef struct {
	3164	MemoryRegion *mr;
	3165	void *buffer;
	3166	hwaddr addr;
	3167	hwaddr len;
	3168	bool in_use;
	3169	} BounceBuffer;
	3170
	3171	static BounceBuffer bounce;
	3172
	3173	typedef struct MapClient {
	3174	QEMUBH *bh;
	3175	QLIST_ENTRY(MapClient) link;
	3176	} MapClient;
	3177
	3178	QemuMutex map_client_list_lock;
	3179	static QLIST_HEAD(map_client_list, MapClient) map_client_list
	3180	= QLIST_HEAD_INITIALIZER(map_client_list);
	3181
	3182	static void cpu_unregister_map_client_do(MapClient *client)
	3183	{
	3184	QLIST_REMOVE(client, link);
	3185	g_free(client);
	3186	}
	3187
	3188	static void cpu_notify_map_clients_locked(void)
	3189	{
	3190	MapClient *client;
	3191
	3192	while (!QLIST_EMPTY(&map_client_list)) {
	3193	client = QLIST_FIRST(&map_client_list);
	3194	qemu_bh_schedule(client->bh);
	3195	cpu_unregister_map_client_do(client);
	3196	}
	3197	}
	3198
	3199	void cpu_register_map_client(QEMUBH *bh)
	3200	{
	3201	MapClient client = g_malloc(sizeof(client));
	3202
	3203	qemu_mutex_lock(&map_client_list_lock);
	3204	client->bh = bh;
	3205	QLIST_INSERT_HEAD(&map_client_list, client, link);
	3206	if (!atomic_read(&bounce.in_use)) {
	3207	cpu_notify_map_clients_locked();
	3208	}
	3209	qemu_mutex_unlock(&map_client_list_lock);
	3210	}
	3211
	3212	void cpu_exec_init_all(void)
	3213	{
	3214	qemu_mutex_init(&ram_list.mutex);
	3215	/* The data structures we set up here depend on knowing the page size,
	3216	* so no more changes can be made after this point.
	3217	* In an ideal world, nothing we did before we had finished the
	3218	* machine setup would care about the target page size, and we could
	3219	* do this much later, rather than requiring board models to state
	3220	* up front what their requirements are.
	3221	*/
	3222	finalize_target_page_bits();
	3223	io_mem_init();
	3224	memory_map_init();
	3225	qemu_mutex_init(&map_client_list_lock);
	3226	}
	3227
	3228	void cpu_unregister_map_client(QEMUBH *bh)
	3229	{
	3230	MapClient *client;
	3231
	3232	qemu_mutex_lock(&map_client_list_lock);
	3233	QLIST_FOREACH(client, &map_client_list, link) {
	3234	if (client->bh == bh) {
	3235	cpu_unregister_map_client_do(client);
	3236	break;
	3237	}
	3238	}
	3239	qemu_mutex_unlock(&map_client_list_lock);
	3240	}
	3241
	3242	static void cpu_notify_map_clients(void)
	3243	{
	3244	qemu_mutex_lock(&map_client_list_lock);
	3245	cpu_notify_map_clients_locked();
	3246	qemu_mutex_unlock(&map_client_list_lock);
	3247	}
	3248
	3249	bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
	3250	{
	3251	MemoryRegion *mr;
	3252	hwaddr l, xlat;
	3253
	3254	rcu_read_lock();
	3255	while (len > 0) {
	3256	l = len;
	3257	mr = address_space_translate(as, addr, &xlat, &l, is_write);
	3258	if (!memory_access_is_direct(mr, is_write)) {
	3259	l = memory_access_size(mr, l, addr);
	3260	if (!memory_region_access_valid(mr, xlat, l, is_write)) {
	3261	rcu_read_unlock();
	3262	return false;
	3263	}
	3264	}
	3265
	3266	len -= l;
	3267	addr += l;
	3268	}
	3269	rcu_read_unlock();
	3270	return true;
	3271	}
	3272
	3273	static hwaddr
	3274	address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_len,
	3275	MemoryRegion *mr, hwaddr base, hwaddr len,
	3276	bool is_write)
	3277	{
	3278	hwaddr done = 0;
	3279	hwaddr xlat;
	3280	MemoryRegion *this_mr;
	3281
	3282	for (;;) {
	3283	target_len -= len;
	3284	addr += len;
	3285	done += len;
	3286	if (target_len == 0) {
	3287	return done;
	3288	}
	3289
	3290	len = target_len;
	3291	this_mr = address_space_translate(as, addr, &xlat, &len, is_write);
	3292	if (this_mr != mr \|\| xlat != base + done) {
	3293	return done;
	3294	}
	3295	}
	3296	}
	3297
	3298	/* Map a physical memory region into a host virtual address.
	3299	* May map a subset of the requested range, given by and returned in *plen.
	3300	* May return NULL if resources needed to perform the mapping are exhausted.
	3301	* Use only for reads OR writes - not for read-modify-write operations.
	3302	* Use cpu_register_map_client() to know when retrying the map operation is
	3303	* likely to succeed.
	3304	*/
	3305	void address_space_map(AddressSpace as,
	3306	hwaddr addr,
	3307	hwaddr *plen,
	3308	bool is_write)
	3309	{
	3310	hwaddr len = *plen;
	3311	hwaddr l, xlat;
	3312	MemoryRegion *mr;
	3313	void *ptr;
	3314
	3315	if (len == 0) {
	3316	return NULL;
	3317	}
	3318
	3319	l = len;
	3320	rcu_read_lock();
	3321	mr = address_space_translate(as, addr, &xlat, &l, is_write);
	3322
	3323	if (!memory_access_is_direct(mr, is_write)) {
	3324	if (atomic_xchg(&bounce.in_use, true)) {
	3325	rcu_read_unlock();
	3326	return NULL;
	3327	}
	3328	/* Avoid unbounded allocations */
	3329	l = MIN(l, TARGET_PAGE_SIZE);
	3330	bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
	3331	bounce.addr = addr;
	3332	bounce.len = l;
	3333
	3334	memory_region_ref(mr);
	3335	bounce.mr = mr;
	3336	if (!is_write) {
	3337	address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED,
	3338	bounce.buffer, l);
	3339	}
	3340
	3341	rcu_read_unlock();
	3342	*plen = l;
	3343	return bounce.buffer;
	3344	}
	3345
	3346
	3347	memory_region_ref(mr);
	3348	*plen = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write);
	3349	ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen);
	3350	rcu_read_unlock();
	3351
	3352	return ptr;
	3353	}
	3354
	3355	/* Unmaps a memory region previously mapped by address_space_map().
	3356	* Will also mark the memory as dirty if is_write == 1. access_len gives
	3357	* the amount of memory that was actually read or written by the caller.
	3358	*/
	3359	void address_space_unmap(AddressSpace as, void buffer, hwaddr len,
	3360	int is_write, hwaddr access_len)
	3361	{
	3362	if (buffer != bounce.buffer) {
	3363	MemoryRegion *mr;
	3364	ram_addr_t addr1;
	3365
	3366	mr = memory_region_from_host(buffer, &addr1);
	3367	assert(mr != NULL);
	3368	if (is_write) {
	3369	invalidate_and_set_dirty(mr, addr1, access_len);
	3370	}
	3371	if (xen_enabled()) {
	3372	xen_invalidate_map_cache_entry(buffer);
	3373	}
	3374	memory_region_unref(mr);
	3375	return;
	3376	}
	3377	if (is_write) {
	3378	address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
	3379	bounce.buffer, access_len);
	3380	}
	3381	qemu_vfree(bounce.buffer);
	3382	bounce.buffer = NULL;
	3383	memory_region_unref(bounce.mr);
	3384	atomic_mb_set(&bounce.in_use, false);
	3385	cpu_notify_map_clients();
	3386	}
	3387
	3388	void *cpu_physical_memory_map(hwaddr addr,
	3389	hwaddr *plen,
	3390	int is_write)
	3391	{
	3392	return address_space_map(&address_space_memory, addr, plen, is_write);
	3393	}
	3394
	3395	void cpu_physical_memory_unmap(void *buffer, hwaddr len,
	3396	int is_write, hwaddr access_len)
	3397	{
	3398	return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
	3399	}
	3400
	3401	#define ARG1_DECL AddressSpace *as
	3402	#define ARG1 as
	3403	#define SUFFIX
	3404	#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
	3405	#define IS_DIRECT(mr, is_write) memory_access_is_direct(mr, is_write)
	3406	#define MAP_RAM(mr, ofs) qemu_map_ram_ptr((mr)->ram_block, ofs)
	3407	#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
	3408	#define RCU_READ_LOCK(...) rcu_read_lock()
	3409	#define RCU_READ_UNLOCK(...) rcu_read_unlock()
	3410	#include "memory_ldst.inc.c"
	3411
	3412	int64_t address_space_cache_init(MemoryRegionCache *cache,
	3413	AddressSpace *as,
	3414	hwaddr addr,
	3415	hwaddr len,
	3416	bool is_write)
	3417	{
	3418	cache->len = len;
	3419	cache->as = as;
	3420	cache->xlat = addr;
	3421	return len;
	3422	}
	3423
	3424	void address_space_cache_invalidate(MemoryRegionCache *cache,
	3425	hwaddr addr,
	3426	hwaddr access_len)
	3427	{
	3428	}
	3429
	3430	void address_space_cache_destroy(MemoryRegionCache *cache)
	3431	{
	3432	cache->as = NULL;
	3433	}
	3434
	3435	#define ARG1_DECL MemoryRegionCache *cache
	3436	#define ARG1 cache
	3437	#define SUFFIX _cached
	3438	#define TRANSLATE(addr, ...) \
	3439	address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
	3440	#define IS_DIRECT(mr, is_write) true
	3441	#define MAP_RAM(mr, ofs) qemu_map_ram_ptr((mr)->ram_block, ofs)
	3442	#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
	3443	#define RCU_READ_LOCK() rcu_read_lock()
	3444	#define RCU_READ_UNLOCK() rcu_read_unlock()
	3445	#include "memory_ldst.inc.c"
	3446
	3447	/* virtual memory access for debug (includes writing to ROM) */
	3448	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3449	uint8_t *buf, int len, int is_write)
	3450	{
	3451	int l;
	3452	hwaddr phys_addr;
	3453	target_ulong page;
	3454
	3455	cpu_synchronize_state(cpu);
	3456	while (len > 0) {
	3457	int asidx;
	3458	MemTxAttrs attrs;
	3459
	3460	page = addr & TARGET_PAGE_MASK;
	3461	phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
	3462	asidx = cpu_asidx_from_attrs(cpu, attrs);
	3463	/* if no physical page mapped, return an error */
	3464	if (phys_addr == -1)
	3465	return -1;
	3466	l = (page + TARGET_PAGE_SIZE) - addr;
	3467	if (l > len)
	3468	l = len;
	3469	phys_addr += (addr & ~TARGET_PAGE_MASK);
	3470	if (is_write) {
	3471	cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
	3472	phys_addr, buf, l);
	3473	} else {
	3474	address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
	3475	MEMTXATTRS_UNSPECIFIED,
	3476	buf, l, 0);
	3477	}
	3478	len -= l;
	3479	buf += l;
	3480	addr += l;
	3481	}
	3482	return 0;
	3483	}
	3484
	3485	/*
	3486	* Allows code that needs to deal with migration bitmaps etc to still be built
	3487	* target independent.
	3488	*/
	3489	size_t qemu_target_page_size(void)
	3490	{
	3491	return TARGET_PAGE_SIZE;
	3492	}
	3493
	3494	int qemu_target_page_bits(void)
	3495	{
	3496	return TARGET_PAGE_BITS;
	3497	}
	3498
	3499	int qemu_target_page_bits_min(void)
	3500	{
	3501	return TARGET_PAGE_BITS_MIN;
	3502	}
	3503	#endif
	3504
	3505	/*
	3506	* A helper function for the _utterly broken_ virtio device model to find out if
	3507	* it's running on a big endian machine. Don't do this at home kids!
	3508	*/
	3509	bool target_words_bigendian(void);
	3510	bool target_words_bigendian(void)
	3511	{
	3512	#if defined(TARGET_WORDS_BIGENDIAN)
	3513	return true;
	3514	#else
	3515	return false;
	3516	#endif
	3517	}
	3518
	3519	#ifndef CONFIG_USER_ONLY
	3520	bool cpu_physical_memory_is_io(hwaddr phys_addr)
	3521	{
	3522	MemoryRegion*mr;
	3523	hwaddr l = 1;
	3524	bool res;
	3525
	3526	rcu_read_lock();
	3527	mr = address_space_translate(&address_space_memory,
	3528	phys_addr, &phys_addr, &l, false);
	3529
	3530	res = !(memory_region_is_ram(mr) \|\| memory_region_is_romd(mr));
	3531	rcu_read_unlock();
	3532	return res;
	3533	}
	3534
	3535	int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
	3536	{
	3537	RAMBlock *block;
	3538	int ret = 0;
	3539
	3540	rcu_read_lock();
	3541	RAMBLOCK_FOREACH(block) {
	3542	ret = func(block->idstr, block->host, block->offset,
	3543	block->used_length, opaque);
	3544	if (ret) {
	3545	break;
	3546	}
	3547	}
	3548	rcu_read_unlock();
	3549	return ret;
	3550	}
	3551
	3552	/*
	3553	* Unmap pages of memory from start to start+length such that
	3554	* they a) read as 0, b) Trigger whatever fault mechanism
	3555	* the OS provides for postcopy.
	3556	* The pages must be unmapped by the end of the function.
	3557	* Returns: 0 on success, none-0 on failure
	3558	*
	3559	*/
	3560	int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
	3561	{
	3562	int ret = -1;
	3563
	3564	uint8_t *host_startaddr = rb->host + start;
	3565
	3566	if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
	3567	error_report("ram_block_discard_range: Unaligned start address: %p",
	3568	host_startaddr);
	3569	goto err;
	3570	}
	3571
	3572	if ((start + length) <= rb->used_length) {
	3573	uint8_t *host_endaddr = host_startaddr + length;
	3574	if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
	3575	error_report("ram_block_discard_range: Unaligned end address: %p",
	3576	host_endaddr);
	3577	goto err;
	3578	}
	3579
	3580	errno = ENOTSUP; /* If we are missing MADVISE etc */
	3581
	3582	if (rb->page_size == qemu_host_page_size) {
	3583	#if defined(CONFIG_MADVISE)
	3584	/* Note: We need the madvise MADV_DONTNEED behaviour of definitely
	3585	* freeing the page.
	3586	*/
	3587	ret = madvise(host_startaddr, length, MADV_DONTNEED);
	3588	#endif
	3589	} else {
	3590	/* Huge page case - unfortunately it can't do DONTNEED, but
	3591	* it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
	3592	* huge page file.
	3593	*/
	3594	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	3595	ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
	3596	start, length);
	3597	#endif
	3598	}
	3599	if (ret) {
	3600	ret = -errno;
	3601	error_report("ram_block_discard_range: Failed to discard range "
	3602	"%s:%" PRIx64 " +%zx (%d)",
	3603	rb->idstr, start, length, ret);
	3604	}
	3605	} else {
	3606	error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
	3607	"/%zx/" RAM_ADDR_FMT")",
	3608	rb->idstr, start, length, rb->used_length);
	3609	}
	3610
	3611	err:
	3612	return ret;
	3613	}
	3614
	3615	#endif
	3616
	3617	void page_size_init(void)
	3618	{
	3619	/* NOTE: we can always suppose that qemu_host_page_size >=
	3620	TARGET_PAGE_SIZE */
	3621	qemu_real_host_page_size = getpagesize();
	3622	qemu_real_host_page_mask = -(intptr_t)qemu_real_host_page_size;
	3623	if (qemu_host_page_size == 0) {
	3624	qemu_host_page_size = qemu_real_host_page_size;
	3625	}
	3626	if (qemu_host_page_size < TARGET_PAGE_SIZE) {
	3627	qemu_host_page_size = TARGET_PAGE_SIZE;
	3628	}
	3629	qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
	3630	}