Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Virtual page mapping
	3	*
	4	* Copyright (c) 2003 Fabrice Bellard
	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
	18	*/
	19	#include "qemu/osdep.h"
	20	#include "qapi/error.h"
	21
	22	#include "qemu/cutils.h"
	23	#include "cpu.h"
	24	#include "exec/exec-all.h"
	25	#include "exec/target_page.h"
	26	#include "tcg.h"
	27	#include "hw/qdev-core.h"
	28	#include "hw/qdev-properties.h"
	29	#if !defined(CONFIG_USER_ONLY)
	30	#include "hw/boards.h"
	31	#include "hw/xen/xen.h"
	32	#endif
	33	#include "sysemu/kvm.h"
	34	#include "sysemu/sysemu.h"
	35	#include "qemu/timer.h"
	36	#include "qemu/config-file.h"
	37	#include "qemu/error-report.h"
	38	#if defined(CONFIG_USER_ONLY)
	39	#include "qemu.h"
	40	#else /* !CONFIG_USER_ONLY */
	41	#include "hw/hw.h"
	42	#include "exec/memory.h"
	43	#include "exec/ioport.h"
	44	#include "sysemu/dma.h"
	45	#include "sysemu/numa.h"
	46	#include "sysemu/hw_accel.h"
	47	#include "exec/address-spaces.h"
	48	#include "sysemu/xen-mapcache.h"
	49	#include "trace-root.h"
	50
	51	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	52	#include <linux/falloc.h>
	53	#endif
	54
	55	#endif
	56	#include "qemu/rcu_queue.h"
	57	#include "qemu/main-loop.h"
	58	#include "translate-all.h"
	59	#include "sysemu/replay.h"
	60
	61	#include "exec/memory-internal.h"
	62	#include "exec/ram_addr.h"
	63	#include "exec/log.h"
	64
	65	#include "migration/vmstate.h"
	66
	67	#include "qemu/range.h"
	68	#ifndef _WIN32
	69	#include "qemu/mmap-alloc.h"
	70	#endif
	71
	72	#include "monitor/monitor.h"
	73
	74	//#define DEBUG_SUBPAGE
	75
	76	#if !defined(CONFIG_USER_ONLY)
	77	/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
	78	* are protected by the ramlist lock.
	79	*/
	80	RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
	81
	82	static MemoryRegion *system_memory;
	83	static MemoryRegion *system_io;
	84
	85	AddressSpace address_space_io;
	86	AddressSpace address_space_memory;
	87
	88	MemoryRegion io_mem_rom, io_mem_notdirty;
	89	static MemoryRegion io_mem_unassigned;
	90
	/* RAMBlock flag bits. */

	/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
	#define RAM_PREALLOC (1 << 0)

	/* RAM is mmap-ed with MAP_SHARED */
	#define RAM_SHARED (1 << 1)

	/* Only a portion of RAM (used_length) is actually used, and migrated.
	 * This used_length size can change across reboots.
	 */
	#define RAM_RESIZEABLE (1 << 2)

	/* UFFDIO_ZEROPAGE is available on this RAMBlock to atomically
	 * zero the page and wake waiting processes.
	 * (Set during postcopy)
	 */
	#define RAM_UF_ZEROPAGE (1 << 3)
	107	#endif
	108
	109	#ifdef TARGET_PAGE_BITS_VARY
	110	int target_page_bits;
	111	bool target_page_bits_decided;
	112	#endif
	113
	114	struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
	115	/* current CPU in the current thread. It is only valid inside
	116	cpu_exec() */
	117	__thread CPUState *current_cpu;
	118	/* 0 = Do not count executed instructions.
	119	1 = Precise instruction counting.
	120	2 = Adaptive rate instruction counting. */
	121	int use_icount;
	122
	123	uintptr_t qemu_host_page_size;
	124	intptr_t qemu_host_page_mask;
	125
	/* Request a target page size of @bits bits.
	 *
	 * Returns false only when the request would shrink the page size after
	 * the size has already been committed; returns true otherwise (always
	 * true when TARGET_PAGE_BITS_VARY is not defined).
	 */
	bool set_preferred_target_page_bits(int bits)
	{
	    /* The target page size is the lowest common denominator for all
	     * the CPUs in the system, so we can only make it smaller, never
	     * larger. And we can't make it smaller once we've committed to
	     * a particular size.
	     */
	#ifdef TARGET_PAGE_BITS_VARY
	    assert(bits >= TARGET_PAGE_BITS_MIN);
	    if (target_page_bits == 0 || target_page_bits > bits) {
	        if (target_page_bits_decided) {
	            return false;
	        }
	        target_page_bits = bits;
	    }
	#endif
	    return true;
	}
	144
	145	#if !defined(CONFIG_USER_ONLY)
	146
	/* Commit the target page size: fall back to TARGET_PAGE_BITS_MIN when no
	 * size was requested, then mark the value as decided.  No-op unless
	 * TARGET_PAGE_BITS_VARY is defined.
	 */
	static void finalize_target_page_bits(void)
	{
	#ifdef TARGET_PAGE_BITS_VARY
	    if (target_page_bits == 0) {
	        target_page_bits = TARGET_PAGE_BITS_MIN;
	    }
	    target_page_bits_decided = true;
	#endif
	}
	156
	typedef struct PhysPageEntry PhysPageEntry;

	/* One slot of the physical page radix tree, packed into 32 bits. */
	struct PhysPageEntry {
	    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
	    uint32_t skip : 6;
	     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
	    uint32_t ptr : 26;
	};

	/* All-ones value of the 26-bit ptr field: "no node allocated". */
	#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

	/* Size of the L2 (and L3, etc) page tables.  */
	#define ADDR_SPACE_BITS 64

	#define P_L2_BITS 9
	#define P_L2_SIZE (1 << P_L2_BITS)

	/* Number of levels needed to cover the page-index bits, rounded up. */
	#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

	/* One radix-tree node: P_L2_SIZE entries. */
	typedef PhysPageEntry Node[P_L2_SIZE];
	177
	178	typedef struct PhysPageMap {
	179	struct rcu_head rcu;
	180
	181	unsigned sections_nb;
	182	unsigned sections_nb_alloc;
	183	unsigned nodes_nb;
	184	unsigned nodes_nb_alloc;
	185	Node *nodes;
	186	MemoryRegionSection *sections;
	187	} PhysPageMap;
	188
	189	struct AddressSpaceDispatch {
	190	MemoryRegionSection *mru_section;
	191	/* This is a multi-level map on the physical address space.
	192	* The bottom level has pointers to MemoryRegionSections.
	193	*/
	194	PhysPageEntry phys_map;
	195	PhysPageMap map;
	196	};
	197
	198	#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
	199	typedef struct subpage_t {
	200	MemoryRegion iomem;
	201	FlatView *fv;
	202	hwaddr base;
	203	uint16_t sub_section[];
	204	} subpage_t;
	205
	206	#define PHYS_SECTION_UNASSIGNED 0
	207	#define PHYS_SECTION_NOTDIRTY 1
	208	#define PHYS_SECTION_ROM 2
	209	#define PHYS_SECTION_WATCH 3
	210
	211	static void io_mem_init(void);
	212	static void memory_map_init(void);
	213	static void tcg_commit(MemoryListener *listener);
	214
	215	static MemoryRegion io_mem_watch;
	216
	217	/**
	218	* CPUAddressSpace: all the information a CPU needs about an AddressSpace
	219	* @cpu: the CPU whose AddressSpace this is
	220	* @as: the AddressSpace itself
	221	* @memory_dispatch: its dispatch pointer (cached, RCU protected)
	222	* @tcg_as_listener: listener for tracking changes to the AddressSpace
	223	*/
	224	struct CPUAddressSpace {
	225	CPUState *cpu;
	226	AddressSpace *as;
	227	struct AddressSpaceDispatch *memory_dispatch;
	228	MemoryListener tcg_as_listener;
	229	};
	230
	231	struct DirtyBitmapSnapshot {
	232	ram_addr_t start;
	233	ram_addr_t end;
	234	unsigned long dirty[];
	235	};
	236
	237	#endif
	238
	239	#if !defined(CONFIG_USER_ONLY)
	240
	241	static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
	242	{
	243	static unsigned alloc_hint = 16;
	244	if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
	245	map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
	246	map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
	247	map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
	248	alloc_hint = map->nodes_nb_alloc;
	249	}
	250	}
	251
	252	static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
	253	{
	254	unsigned i;
	255	uint32_t ret;
	256	PhysPageEntry e;
	257	PhysPageEntry *p;
	258
	259	ret = map->nodes_nb++;
	260	p = map->nodes[ret];
	261	assert(ret != PHYS_MAP_NODE_NIL);
	262	assert(ret != map->nodes_nb_alloc);
	263
	264	e.skip = leaf ? 0 : 1;
	265	e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
	266	for (i = 0; i < P_L2_SIZE; ++i) {
	267	memcpy(&p[i], &e, sizeof(e));
	268	}
	269	return ret;
	270	}
	271
	272	static void phys_page_set_level(PhysPageMap map, PhysPageEntry lp,
	273	hwaddr index, hwaddr nb, uint16_t leaf,
	274	int level)
	275	{
	276	PhysPageEntry *p;
	277	hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
	278
	279	if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
	280	lp->ptr = phys_map_node_alloc(map, level == 0);
	281	}
	282	p = map->nodes[lp->ptr];
	283	lp = &p[(index >> (level P_L2_BITS)) & (P_L2_SIZE - 1)];
	284
	285	while (*nb && lp < &p[P_L2_SIZE]) {
	286	if ((index & (step - 1)) == 0 && nb >= step) {
	287	lp->skip = 0;
	288	lp->ptr = leaf;
	289	*index += step;
	290	*nb -= step;
	291	} else {
	292	phys_page_set_level(map, lp, index, nb, leaf, level - 1);
	293	}
	294	++lp;
	295	}
	296	}
	297
	298	static void phys_page_set(AddressSpaceDispatch *d,
	299	hwaddr index, hwaddr nb,
	300	uint16_t leaf)
	301	{
	302	/* Wildly overreserve - it doesn't matter much. */
	303	phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
	304
	305	phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
	306	}
	307
	308	/* Compact a non leaf page entry. Simply detect that the entry has a single child,
	309	* and update our entry so we can skip it and go directly to the destination.
	310	*/
	311	static void phys_page_compact(PhysPageEntry lp, Node nodes)
	312	{
	313	unsigned valid_ptr = P_L2_SIZE;
	314	int valid = 0;
	315	PhysPageEntry *p;
	316	int i;
	317
	318	if (lp->ptr == PHYS_MAP_NODE_NIL) {
	319	return;
	320	}
	321
	322	p = nodes[lp->ptr];
	323	for (i = 0; i < P_L2_SIZE; i++) {
	324	if (p[i].ptr == PHYS_MAP_NODE_NIL) {
	325	continue;
	326	}
	327
	328	valid_ptr = i;
	329	valid++;
	330	if (p[i].skip) {
	331	phys_page_compact(&p[i], nodes);
	332	}
	333	}
	334
	335	/* We can only compress if there's only one child. */
	336	if (valid != 1) {
	337	return;
	338	}
	339
	340	assert(valid_ptr < P_L2_SIZE);
	341
	342	/* Don't compress if it won't fit in the # of bits we have. */
	343	if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
	344	return;
	345	}
	346
	347	lp->ptr = p[valid_ptr].ptr;
	348	if (!p[valid_ptr].skip) {
	349	/* If our only child is a leaf, make this a leaf. */
	350	/* By design, we should have made this node a leaf to begin with so we
	351	* should never reach here.
	352	* But since it's so simple to handle this, let's do it just in case we
	353	* change this rule.
	354	*/
	355	lp->skip = 0;
	356	} else {
	357	lp->skip += p[valid_ptr].skip;
	358	}
	359	}
	360
	361	void address_space_dispatch_compact(AddressSpaceDispatch *d)
	362	{
	363	if (d->phys_map.skip) {
	364	phys_page_compact(&d->phys_map, d->map.nodes);
	365	}
	366	}
	367
	368	static inline bool section_covers_addr(const MemoryRegionSection *section,
	369	hwaddr addr)
	370	{
	371	/* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
	372	* the section must cover the entire address space.
	373	*/
	374	return int128_gethi(section->size) \|\|
	375	range_covers_byte(section->offset_within_address_space,
	376	int128_getlo(section->size), addr);
	377	}
	378
	379	static MemoryRegionSection phys_page_find(AddressSpaceDispatch d, hwaddr addr)
	380	{
	381	PhysPageEntry lp = d->phys_map, *p;
	382	Node *nodes = d->map.nodes;
	383	MemoryRegionSection *sections = d->map.sections;
	384	hwaddr index = addr >> TARGET_PAGE_BITS;
	385	int i;
	386
	387	for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
	388	if (lp.ptr == PHYS_MAP_NODE_NIL) {
	389	return &sections[PHYS_SECTION_UNASSIGNED];
	390	}
	391	p = nodes[lp.ptr];
	392	lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
	393	}
	394
	395	if (section_covers_addr(&sections[lp.ptr], addr)) {
	396	return &sections[lp.ptr];
	397	} else {
	398	return &sections[PHYS_SECTION_UNASSIGNED];
	399	}
	400	}
	401
	402	bool memory_region_is_unassigned(MemoryRegion *mr)
	403	{
	404	return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
	405	&& mr != &io_mem_watch;
	406	}
	407
	408	/* Called from RCU critical section */
	409	static MemoryRegionSection address_space_lookup_region(AddressSpaceDispatch d,
	410	hwaddr addr,
	411	bool resolve_subpage)
	412	{
	413	MemoryRegionSection *section = atomic_read(&d->mru_section);
	414	subpage_t *subpage;
	415
	416	if (!section \|\| section == &d->map.sections[PHYS_SECTION_UNASSIGNED] \|\|
	417	!section_covers_addr(section, addr)) {
	418	section = phys_page_find(d, addr);
	419	atomic_set(&d->mru_section, section);
	420	}
	421	if (resolve_subpage && section->mr->subpage) {
	422	subpage = container_of(section->mr, subpage_t, iomem);
	423	section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
	424	}
	425	return section;
	426	}
	427
	428	/* Called from RCU critical section */
	429	static MemoryRegionSection *
	430	address_space_translate_internal(AddressSpaceDispatch d, hwaddr addr, hwaddr xlat,
	431	hwaddr *plen, bool resolve_subpage)
	432	{
	433	MemoryRegionSection *section;
	434	MemoryRegion *mr;
	435	Int128 diff;
	436
	437	section = address_space_lookup_region(d, addr, resolve_subpage);
	438	/* Compute offset within MemoryRegionSection */
	439	addr -= section->offset_within_address_space;
	440
	441	/* Compute offset within MemoryRegion */
	442	*xlat = addr + section->offset_within_region;
	443
	444	mr = section->mr;
	445
	446	/* MMIO registers can be expected to perform full-width accesses based only
	447	* on their address, without considering adjacent registers that could
	448	* decode to completely different MemoryRegions. When such registers
	449	* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
	450	* regions overlap wildly. For this reason we cannot clamp the accesses
	451	* here.
	452	*
	453	* If the length is small (as is the case for address_space_ldl/stl),
	454	* everything works fine. If the incoming length is large, however,
	455	* the caller really has to do the clamping through memory_access_size.
	456	*/
	457	if (memory_region_is_ram(mr)) {
	458	diff = int128_sub(section->size, int128_make64(addr));
	459	plen = int128_get64(int128_min(diff, int128_make64(plen)));
	460	}
	461	return section;
	462	}
	463
	464	/**
	465	* flatview_do_translate - translate an address in FlatView
	466	*
	467	* @fv: the flat view that we want to translate on
	468	* @addr: the address to be translated in above address space
	469	* @xlat: the translated address offset within memory region. It
	470	* cannot be @NULL.
	471	* @plen_out: valid read/write length of the translated address. It
	472	* can be @NULL when we don't care about it.
	473	* @page_mask_out: page mask for the translated address. This
	474	* should only be meaningful for IOMMU translated
	475	* addresses, since there may be huge pages that this bit
	476	* would tell. It can be @NULL if we don't care about it.
	477	* @is_write: whether the translation operation is for write
	478	* @is_mmio: whether this can be MMIO, set true if it can
	479	*
	480	* This function is called from RCU critical section
	481	*/
	482	static MemoryRegionSection flatview_do_translate(FlatView *fv,
	483	hwaddr addr,
	484	hwaddr *xlat,
	485	hwaddr *plen_out,
	486	hwaddr *page_mask_out,
	487	bool is_write,
	488	bool is_mmio,
	489	AddressSpace **target_as)
	490	{
	491	IOMMUTLBEntry iotlb;
	492	MemoryRegionSection *section;
	493	IOMMUMemoryRegion *iommu_mr;
	494	IOMMUMemoryRegionClass *imrc;
	495	hwaddr page_mask = (hwaddr)(-1);
	496	hwaddr plen = (hwaddr)(-1);
	497
	498	if (plen_out) {
	499	plen = *plen_out;
	500	}
	501
	502	for (;;) {
	503	section = address_space_translate_internal(
	504	flatview_to_dispatch(fv), addr, &addr,
	505	&plen, is_mmio);
	506
	507	iommu_mr = memory_region_get_iommu(section->mr);
	508	if (!iommu_mr) {
	509	break;
	510	}
	511	imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	512
	513	iotlb = imrc->translate(iommu_mr, addr, is_write ?
	514	IOMMU_WO : IOMMU_RO);
	515	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	516	\| (addr & iotlb.addr_mask));
	517	page_mask &= iotlb.addr_mask;
	518	plen = MIN(plen, (addr \| iotlb.addr_mask) - addr + 1);
	519	if (!(iotlb.perm & (1 << is_write))) {
	520	goto translate_fail;
	521	}
	522
	523	fv = address_space_to_flatview(iotlb.target_as);
	524	*target_as = iotlb.target_as;
	525	}
	526
	527	*xlat = addr;
	528
	529	if (page_mask == (hwaddr)(-1)) {
	530	/* Not behind an IOMMU, use default page size. */
	531	page_mask = ~TARGET_PAGE_MASK;
	532	}
	533
	534	if (page_mask_out) {
	535	*page_mask_out = page_mask;
	536	}
	537
	538	if (plen_out) {
	539	*plen_out = plen;
	540	}
	541
	542	return *section;
	543
	544	translate_fail:
	545	return (MemoryRegionSection) { .mr = &io_mem_unassigned };
	546	}
	547
	548	/* Called from RCU critical section */
	549	IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
	550	bool is_write)
	551	{
	552	MemoryRegionSection section;
	553	hwaddr xlat, page_mask;
	554
	555	/*
	556	* This can never be MMIO, and we don't really care about plen,
	557	* but page mask.
	558	*/
	559	section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
	560	NULL, &page_mask, is_write, false, &as);
	561
	562	/* Illegal translation */
	563	if (section.mr == &io_mem_unassigned) {
	564	goto iotlb_fail;
	565	}
	566
	567	/* Convert memory region offset into address space offset */
	568	xlat += section.offset_within_address_space -
	569	section.offset_within_region;
	570
	571	return (IOMMUTLBEntry) {
	572	.target_as = as,
	573	.iova = addr & ~page_mask,
	574	.translated_addr = xlat & ~page_mask,
	575	.addr_mask = page_mask,
	576	/* IOTLBs are for DMAs, and DMA only allows on RAMs. */
	577	.perm = IOMMU_RW,
	578	};
	579
	580	iotlb_fail:
	581	return (IOMMUTLBEntry) {0};
	582	}
	583
	584	/* Called from RCU critical section */
	585	MemoryRegion flatview_translate(FlatView fv, hwaddr addr, hwaddr *xlat,
	586	hwaddr *plen, bool is_write)
	587	{
	588	MemoryRegion *mr;
	589	MemoryRegionSection section;
	590	AddressSpace *as = NULL;
	591
	592	/* This can be MMIO, so setup MMIO bit. */
	593	section = flatview_do_translate(fv, addr, xlat, plen, NULL,
	594	is_write, true, &as);
	595	mr = section.mr;
	596
	597	if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
	598	hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
	599	plen = MIN(page, plen);
	600	}
	601
	602	return mr;
	603	}
	604
	605	/* Called from RCU critical section */
	606	MemoryRegionSection *
	607	address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
	608	hwaddr xlat, hwaddr plen)
	609	{
	610	MemoryRegionSection *section;
	611	AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
	612
	613	section = address_space_translate_internal(d, addr, xlat, plen, false);
	614
	615	assert(!memory_region_is_iommu(section->mr));
	616	return section;
	617	}
	618	#endif
	619
	620	#if !defined(CONFIG_USER_ONLY)
	621
	622	static int cpu_common_post_load(void *opaque, int version_id)
	623	{
	624	CPUState *cpu = opaque;
	625
	626	/* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
	627	version_id is increased. */
	628	cpu->interrupt_request &= ~0x01;
	629	tlb_flush(cpu);
	630
	631	/* loadvm has just updated the content of RAM, bypassing the
	632	* usual mechanisms that ensure we flush TBs for writes to
	633	* memory we've translated code from. So we must flush all TBs,
	634	* which will now be stale.
	635	*/
	636	tb_flush(cpu);
	637
	638	return 0;
	639	}
	640
	641	static int cpu_common_pre_load(void *opaque)
	642	{
	643	CPUState *cpu = opaque;
	644
	645	cpu->exception_index = -1;
	646
	647	return 0;
	648	}
	649
	650	static bool cpu_common_exception_index_needed(void *opaque)
	651	{
	652	CPUState *cpu = opaque;
	653
	654	return tcg_enabled() && cpu->exception_index != -1;
	655	}
	656
	657	static const VMStateDescription vmstate_cpu_common_exception_index = {
	658	.name = "cpu_common/exception_index",
	659	.version_id = 1,
	660	.minimum_version_id = 1,
	661	.needed = cpu_common_exception_index_needed,
	662	.fields = (VMStateField[]) {
	663	VMSTATE_INT32(exception_index, CPUState),
	664	VMSTATE_END_OF_LIST()
	665	}
	666	};
	667
	668	static bool cpu_common_crash_occurred_needed(void *opaque)
	669	{
	670	CPUState *cpu = opaque;
	671
	672	return cpu->crash_occurred;
	673	}
	674
	675	static const VMStateDescription vmstate_cpu_common_crash_occurred = {
	676	.name = "cpu_common/crash_occurred",
	677	.version_id = 1,
	678	.minimum_version_id = 1,
	679	.needed = cpu_common_crash_occurred_needed,
	680	.fields = (VMStateField[]) {
	681	VMSTATE_BOOL(crash_occurred, CPUState),
	682	VMSTATE_END_OF_LIST()
	683	}
	684	};
	685
	686	const VMStateDescription vmstate_cpu_common = {
	687	.name = "cpu_common",
	688	.version_id = 1,
	689	.minimum_version_id = 1,
	690	.pre_load = cpu_common_pre_load,
	691	.post_load = cpu_common_post_load,
	692	.fields = (VMStateField[]) {
	693	VMSTATE_UINT32(halted, CPUState),
	694	VMSTATE_UINT32(interrupt_request, CPUState),
	695	VMSTATE_END_OF_LIST()
	696	},
	697	.subsections = (const VMStateDescription*[]) {
	698	&vmstate_cpu_common_exception_index,
	699	&vmstate_cpu_common_crash_occurred,
	700	NULL
	701	}
	702	};
	703
	704	#endif
	705
	706	CPUState *qemu_get_cpu(int index)
	707	{
	708	CPUState *cpu;
	709
	710	CPU_FOREACH(cpu) {
	711	if (cpu->cpu_index == index) {
	712	return cpu;
	713	}
	714	}
	715
	716	return NULL;
	717	}
	718
	719	#if !defined(CONFIG_USER_ONLY)
	720	void cpu_address_space_init(CPUState *cpu, int asidx,
	721	const char prefix, MemoryRegion mr)
	722	{
	723	CPUAddressSpace *newas;
	724	AddressSpace *as = g_new0(AddressSpace, 1);
	725	char *as_name;
	726
	727	assert(mr);
	728	as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
	729	address_space_init(as, mr, as_name);
	730	g_free(as_name);
	731
	732	/* Target code should have set num_ases before calling us */
	733	assert(asidx < cpu->num_ases);
	734
	735	if (asidx == 0) {
	736	/* address space 0 gets the convenience alias */
	737	cpu->as = as;
	738	}
	739
	740	/* KVM cannot currently support multiple address spaces. */
	741	assert(asidx == 0 \|\| !kvm_enabled());
	742
	743	if (!cpu->cpu_ases) {
	744	cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
	745	}
	746
	747	newas = &cpu->cpu_ases[asidx];
	748	newas->cpu = cpu;
	749	newas->as = as;
	750	if (tcg_enabled()) {
	751	newas->tcg_as_listener.commit = tcg_commit;
	752	memory_listener_register(&newas->tcg_as_listener, as);
	753	}
	754	}
	755
	756	AddressSpace cpu_get_address_space(CPUState cpu, int asidx)
	757	{
	758	/* Return the AddressSpace corresponding to the specified index */
	759	return cpu->cpu_ases[asidx].as;
	760	}
	761	#endif
	762
	763	void cpu_exec_unrealizefn(CPUState *cpu)
	764	{
	765	CPUClass *cc = CPU_GET_CLASS(cpu);
	766
	767	cpu_list_remove(cpu);
	768
	769	if (cc->vmsd != NULL) {
	770	vmstate_unregister(NULL, cc->vmsd, cpu);
	771	}
	772	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	773	vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
	774	}
	775	}
	776
	777	Property cpu_common_props[] = {
	778	#ifndef CONFIG_USER_ONLY
	779	/* Create a memory property for softmmu CPU object,
	780	* so users can wire up its memory. (This can't go in qom/cpu.c
	781	* because that file is compiled only once for both user-mode
	782	* and system builds.) The default if no link is set up is to use
	783	* the system address space.
	784	*/
	785	DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
	786	MemoryRegion *),
	787	#endif
	788	DEFINE_PROP_END_OF_LIST(),
	789	};
	790
	791	void cpu_exec_initfn(CPUState *cpu)
	792	{
	793	cpu->as = NULL;
	794	cpu->num_ases = 0;
	795
	796	#ifndef CONFIG_USER_ONLY
	797	cpu->thread_id = qemu_get_thread_id();
	798	cpu->memory = system_memory;
	799	object_ref(OBJECT(cpu->memory));
	800	#endif
	801	}
	802
	803	void cpu_exec_realizefn(CPUState cpu, Error *errp)
	804	{
	805	CPUClass *cc = CPU_GET_CLASS(cpu);
	806	static bool tcg_target_initialized;
	807
	808	cpu_list_add(cpu);
	809
	810	if (tcg_enabled() && !tcg_target_initialized) {
	811	tcg_target_initialized = true;
	812	cc->tcg_initialize();
	813	}
	814
	815	#ifndef CONFIG_USER_ONLY
	816	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	817	vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
	818	}
	819	if (cc->vmsd != NULL) {
	820	vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
	821	}
	822	#endif
	823	}
	824
	825	const char parse_cpu_model(const char cpu_model)
	826	{
	827	ObjectClass *oc;
	828	CPUClass *cc;
	829	gchar **model_pieces;
	830	const char *cpu_type;
	831
	832	model_pieces = g_strsplit(cpu_model, ",", 2);
	833
	834	oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
	835	if (oc == NULL) {
	836	error_report("unable to find CPU model '%s'", model_pieces[0]);
	837	g_strfreev(model_pieces);
	838	exit(EXIT_FAILURE);
	839	}
	840
	841	cpu_type = object_class_get_name(oc);
	842	cc = CPU_CLASS(oc);
	843	cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
	844	g_strfreev(model_pieces);
	845	return cpu_type;
	846	}
	847
	848	#if defined(CONFIG_USER_ONLY)
	849	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	850	{
	851	mmap_lock();
	852	tb_lock();
	853	tb_invalidate_phys_page_range(pc, pc + 1, 0);
	854	tb_unlock();
	855	mmap_unlock();
	856	}
	857	#else
	858	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	859	{
	860	MemTxAttrs attrs;
	861	hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
	862	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	863	if (phys != -1) {
	864	/* Locks grabbed by tb_invalidate_phys_addr */
	865	tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
	866	phys \| (pc & ~TARGET_PAGE_MASK));
	867	}
	868	}
	869	#endif
	870
	871	#if defined(CONFIG_USER_ONLY)
	872	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	873
	874	{
	875	}
	876
	877	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	878	int flags)
	879	{
	880	return -ENOSYS;
	881	}
	882
	883	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	884	{
	885	}
	886
	887	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	888	int flags, CPUWatchpoint **watchpoint)
	889	{
	890	return -ENOSYS;
	891	}
	892	#else
	893	/* Add a watchpoint. */
	894	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	895	int flags, CPUWatchpoint **watchpoint)
	896	{
	897	CPUWatchpoint *wp;
	898
	899	/* forbid ranges which are empty or run off the end of the address space */
	900	if (len == 0 \|\| (addr + len - 1) < addr) {
	901	error_report("tried to set invalid watchpoint at %"
	902	VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
	903	return -EINVAL;
	904	}
	905	wp = g_malloc(sizeof(*wp));
	906
	907	wp->vaddr = addr;
	908	wp->len = len;
	909	wp->flags = flags;
	910
	911	/* keep all GDB-injected watchpoints in front */
	912	if (flags & BP_GDB) {
	913	QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
	914	} else {
	915	QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
	916	}
	917
	918	tlb_flush_page(cpu, addr);
	919
	920	if (watchpoint)
	921	*watchpoint = wp;
	922	return 0;
	923	}
	924
	925	/* Remove a specific watchpoint. */
	926	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	927	int flags)
	928	{
	929	CPUWatchpoint *wp;
	930
	931	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	932	if (addr == wp->vaddr && len == wp->len
	933	&& flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
	934	cpu_watchpoint_remove_by_ref(cpu, wp);
	935	return 0;
	936	}
	937	}
	938	return -ENOENT;
	939	}
	940
	941	/* Remove a specific watchpoint by reference. */
	942	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	943	{
	944	QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
	945
	946	tlb_flush_page(cpu, watchpoint->vaddr);
	947
	948	g_free(watchpoint);
	949	}
	950
	951	/* Remove all matching watchpoints. */
	952	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	953	{
	954	CPUWatchpoint wp, next;
	955
	956	QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
	957	if (wp->flags & mask) {
	958	cpu_watchpoint_remove_by_ref(cpu, wp);
	959	}
	960	}
	961	}
	962
	963	/* Return true if this watchpoint address matches the specified
	964	* access (ie the address range covered by the watchpoint overlaps
	965	* partially or completely with the address range covered by the
	966	* access).
	967	*/
	968	static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
	969	vaddr addr,
	970	vaddr len)
	971	{
	972	/* We know the lengths are non-zero, but a little caution is
	973	* required to avoid errors in the case where the range ends
	974	* exactly at the top of the address space and so addr + len
	975	* wraps round to zero.
	976	*/
	977	vaddr wpend = wp->vaddr + wp->len - 1;
	978	vaddr addrend = addr + len - 1;
	979
	980	return !(addr > wpend \|\| wp->vaddr > addrend);
	981	}
	982
	983	#endif
	984
	985	/* Add a breakpoint. */
	986	int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
	987	CPUBreakpoint **breakpoint)
	988	{
	989	CPUBreakpoint *bp;
	990
	991	bp = g_malloc(sizeof(*bp));
	992
	993	bp->pc = pc;
	994	bp->flags = flags;
	995
	996	/* keep all GDB-injected breakpoints in front */
	997	if (flags & BP_GDB) {
	998	QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
	999	} else {
	1000	QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
	1001	}
	1002
	1003	breakpoint_invalidate(cpu, pc);
	1004
	1005	if (breakpoint) {
	1006	*breakpoint = bp;
	1007	}
	1008	return 0;
	1009	}
	1010
	1011	/* Remove a specific breakpoint. */
	1012	int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
	1013	{
	1014	CPUBreakpoint *bp;
	1015
	1016	QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
	1017	if (bp->pc == pc && bp->flags == flags) {
	1018	cpu_breakpoint_remove_by_ref(cpu, bp);
	1019	return 0;
	1020	}
	1021	}
	1022	return -ENOENT;
	1023	}
	1024
	1025	/* Remove a specific breakpoint by reference. */
	1026	void cpu_breakpoint_remove_by_ref(CPUState cpu, CPUBreakpoint breakpoint)
	1027	{
	1028	QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
	1029
	1030	breakpoint_invalidate(cpu, breakpoint->pc);
	1031
	1032	g_free(breakpoint);
	1033	}
	1034
	1035	/* Remove all matching breakpoints. */
	1036	void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
	1037	{
	1038	CPUBreakpoint bp, next;
	1039
	1040	QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
	1041	if (bp->flags & mask) {
	1042	cpu_breakpoint_remove_by_ref(cpu, bp);
	1043	}
	1044	}
	1045	}
	1046
	1047	/* enable or disable single step mode. EXCP_DEBUG is returned by the
	1048	CPU loop after each instruction */
	1049	void cpu_single_step(CPUState *cpu, int enabled)
	1050	{
	1051	if (cpu->singlestep_enabled != enabled) {
	1052	cpu->singlestep_enabled = enabled;
	1053	if (kvm_enabled()) {
	1054	kvm_update_guest_debug(cpu, 0);
	1055	} else {
	1056	/* must flush all the translated code to avoid inconsistencies */
	1057	/* XXX: only flush what is necessary */
	1058	tb_flush(cpu);
	1059	}
	1060	}
	1061	}
	1062
	1063	void cpu_abort(CPUState cpu, const char fmt, ...)
	1064	{
	1065	va_list ap;
	1066	va_list ap2;
	1067
	1068	va_start(ap, fmt);
	1069	va_copy(ap2, ap);
	1070	fprintf(stderr, "qemu: fatal: ");
	1071	vfprintf(stderr, fmt, ap);
	1072	fprintf(stderr, "\n");
	1073	cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1074	if (qemu_log_separate()) {
	1075	qemu_log_lock();
	1076	qemu_log("qemu: fatal: ");
	1077	qemu_log_vprintf(fmt, ap2);
	1078	qemu_log("\n");
	1079	log_cpu_state(cpu, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1080	qemu_log_flush();
	1081	qemu_log_unlock();
	1082	qemu_log_close();
	1083	}
	1084	va_end(ap2);
	1085	va_end(ap);
	1086	replay_finish();
	1087	#if defined(CONFIG_USER_ONLY)
	1088	{
	1089	struct sigaction act;
	1090	sigfillset(&act.sa_mask);
	1091	act.sa_handler = SIG_DFL;
	1092	sigaction(SIGABRT, &act, NULL);
	1093	}
	1094	#endif
	1095	abort();
	1096	}
	1097
	1098	#if !defined(CONFIG_USER_ONLY)
	1099	/* Called from RCU critical section */
	1100	static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
	1101	{
	1102	RAMBlock *block;
	1103
	1104	block = atomic_rcu_read(&ram_list.mru_block);
	1105	if (block && addr - block->offset < block->max_length) {
	1106	return block;
	1107	}
	1108	RAMBLOCK_FOREACH(block) {
	1109	if (addr - block->offset < block->max_length) {
	1110	goto found;
	1111	}
	1112	}
	1113
	1114	fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
	1115	abort();
	1116
	1117	found:
	1118	/* It is safe to write mru_block outside the iothread lock. This
	1119	* is what happens:
	1120	*
	1121	* mru_block = xxx
	1122	* rcu_read_unlock()
	1123	* xxx removed from list
	1124	* rcu_read_lock()
	1125	* read mru_block
	1126	* mru_block = NULL;
	1127	* call_rcu(reclaim_ramblock, xxx);
	1128	* rcu_read_unlock()
	1129	*
	1130	* atomic_rcu_set is not needed here. The block was already published
	1131	* when it was placed into the list. Here we're just making an extra
	1132	* copy of the pointer.
	1133	*/
	1134	ram_list.mru_block = block;
	1135	return block;
	1136	}
	1137
	1138	static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
	1139	{
	1140	CPUState *cpu;
	1141	ram_addr_t start1;
	1142	RAMBlock *block;
	1143	ram_addr_t end;
	1144
	1145	end = TARGET_PAGE_ALIGN(start + length);
	1146	start &= TARGET_PAGE_MASK;
	1147
	1148	rcu_read_lock();
	1149	block = qemu_get_ram_block(start);
	1150	assert(block == qemu_get_ram_block(end - 1));
	1151	start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
	1152	CPU_FOREACH(cpu) {
	1153	tlb_reset_dirty(cpu, start1, length);
	1154	}
	1155	rcu_read_unlock();
	1156	}
	1157
	1158	/* Note: start and end must be within the same ram block. */
	1159	bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
	1160	ram_addr_t length,
	1161	unsigned client)
	1162	{
	1163	DirtyMemoryBlocks *blocks;
	1164	unsigned long end, page;
	1165	bool dirty = false;
	1166
	1167	if (length == 0) {
	1168	return false;
	1169	}
	1170
	1171	end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
	1172	page = start >> TARGET_PAGE_BITS;
	1173
	1174	rcu_read_lock();
	1175
	1176	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1177
	1178	while (page < end) {
	1179	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1180	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1181	unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
	1182
	1183	dirty \|= bitmap_test_and_clear_atomic(blocks->blocks[idx],
	1184	offset, num);
	1185	page += num;
	1186	}
	1187
	1188	rcu_read_unlock();
	1189
	1190	if (dirty && tcg_enabled()) {
	1191	tlb_reset_dirty_range_all(start, length);
	1192	}
	1193
	1194	return dirty;
	1195	}
	1196
	1197	DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
	1198	(ram_addr_t start, ram_addr_t length, unsigned client)
	1199	{
	1200	DirtyMemoryBlocks *blocks;
	1201	unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
	1202	ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
	1203	ram_addr_t last = QEMU_ALIGN_UP(start + length, align);
	1204	DirtyBitmapSnapshot *snap;
	1205	unsigned long page, end, dest;
	1206
	1207	snap = g_malloc0(sizeof(*snap) +
	1208	((last - first) >> (TARGET_PAGE_BITS + 3)));
	1209	snap->start = first;
	1210	snap->end = last;
	1211
	1212	page = first >> TARGET_PAGE_BITS;
	1213	end = last >> TARGET_PAGE_BITS;
	1214	dest = 0;
	1215
	1216	rcu_read_lock();
	1217
	1218	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1219
	1220	while (page < end) {
	1221	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1222	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1223	unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
	1224
	1225	assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
	1226	assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));
	1227	offset >>= BITS_PER_LEVEL;
	1228
	1229	bitmap_copy_and_clear_atomic(snap->dirty + dest,
	1230	blocks->blocks[idx] + offset,
	1231	num);
	1232	page += num;
	1233	dest += num >> BITS_PER_LEVEL;
	1234	}
	1235
	1236	rcu_read_unlock();
	1237
	1238	if (tcg_enabled()) {
	1239	tlb_reset_dirty_range_all(start, length);
	1240	}
	1241
	1242	return snap;
	1243	}
	1244
	1245	bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
	1246	ram_addr_t start,
	1247	ram_addr_t length)
	1248	{
	1249	unsigned long page, end;
	1250
	1251	assert(start >= snap->start);
	1252	assert(start + length <= snap->end);
	1253
	1254	end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
	1255	page = (start - snap->start) >> TARGET_PAGE_BITS;
	1256
	1257	while (page < end) {
	1258	if (test_bit(page, snap->dirty)) {
	1259	return true;
	1260	}
	1261	page++;
	1262	}
	1263	return false;
	1264	}
	1265
	1266	/* Called from RCU critical section */
	1267	hwaddr memory_region_section_get_iotlb(CPUState *cpu,
	1268	MemoryRegionSection *section,
	1269	target_ulong vaddr,
	1270	hwaddr paddr, hwaddr xlat,
	1271	int prot,
	1272	target_ulong *address)
	1273	{
	1274	hwaddr iotlb;
	1275	CPUWatchpoint *wp;
	1276
	1277	if (memory_region_is_ram(section->mr)) {
	1278	/* Normal RAM. */
	1279	iotlb = memory_region_get_ram_addr(section->mr) + xlat;
	1280	if (!section->readonly) {
	1281	iotlb \|= PHYS_SECTION_NOTDIRTY;
	1282	} else {
	1283	iotlb \|= PHYS_SECTION_ROM;
	1284	}
	1285	} else {
	1286	AddressSpaceDispatch *d;
	1287
	1288	d = flatview_to_dispatch(section->fv);
	1289	iotlb = section - d->map.sections;
	1290	iotlb += xlat;
	1291	}
	1292
	1293	/* Make accesses to pages with watchpoints go via the
	1294	watchpoint trap routines. */
	1295	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1296	if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
	1297	/* Avoid trapping reads of pages with a write breakpoint. */
	1298	if ((prot & PAGE_WRITE) \|\| (wp->flags & BP_MEM_READ)) {
	1299	iotlb = PHYS_SECTION_WATCH + paddr;
	1300	*address \|= TLB_MMIO;
	1301	break;
	1302	}
	1303	}
	1304	}
	1305
	1306	return iotlb;
	1307	}
	1308	#endif /* !defined(CONFIG_USER_ONLY) */
	1309
	1310	#if !defined(CONFIG_USER_ONLY)
	1311
	1312	static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
	1313	uint16_t section);
	1314	static subpage_t subpage_init(FlatView fv, hwaddr base);
	1315
	1316	static void (phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
	1317	qemu_anon_ram_alloc;
	1318
	1319	/*
	1320	* Set a custom physical guest memory alloator.
	1321	* Accelerators with unusual needs may need this. Hopefully, we can
	1322	* get rid of it eventually.
	1323	*/
	1324	void phys_mem_set_alloc(void (alloc)(size_t, uint64_t *align, bool shared))
	1325	{
	1326	phys_mem_alloc = alloc;
	1327	}
	1328
	1329	static uint16_t phys_section_add(PhysPageMap *map,
	1330	MemoryRegionSection *section)
	1331	{
	1332	/* The physical section number is ORed with a page-aligned
	1333	* pointer to produce the iotlb entries. Thus it should
	1334	* never overflow into the page-aligned value.
	1335	*/
	1336	assert(map->sections_nb < TARGET_PAGE_SIZE);
	1337
	1338	if (map->sections_nb == map->sections_nb_alloc) {
	1339	map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
	1340	map->sections = g_renew(MemoryRegionSection, map->sections,
	1341	map->sections_nb_alloc);
	1342	}
	1343	map->sections[map->sections_nb] = *section;
	1344	memory_region_ref(section->mr);
	1345	return map->sections_nb++;
	1346	}
	1347
	1348	static void phys_section_destroy(MemoryRegion *mr)
	1349	{
	1350	bool have_sub_page = mr->subpage;
	1351
	1352	memory_region_unref(mr);
	1353
	1354	if (have_sub_page) {
	1355	subpage_t *subpage = container_of(mr, subpage_t, iomem);
	1356	object_unref(OBJECT(&subpage->iomem));
	1357	g_free(subpage);
	1358	}
	1359	}
	1360
	1361	static void phys_sections_free(PhysPageMap *map)
	1362	{
	1363	while (map->sections_nb > 0) {
	1364	MemoryRegionSection *section = &map->sections[--map->sections_nb];
	1365	phys_section_destroy(section->mr);
	1366	}
	1367	g_free(map->sections);
	1368	g_free(map->nodes);
	1369	}
	1370
	1371	static void register_subpage(FlatView fv, MemoryRegionSection section)
	1372	{
	1373	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1374	subpage_t *subpage;
	1375	hwaddr base = section->offset_within_address_space
	1376	& TARGET_PAGE_MASK;
	1377	MemoryRegionSection *existing = phys_page_find(d, base);
	1378	MemoryRegionSection subsection = {
	1379	.offset_within_address_space = base,
	1380	.size = int128_make64(TARGET_PAGE_SIZE),
	1381	};
	1382	hwaddr start, end;
	1383
	1384	assert(existing->mr->subpage \|\| existing->mr == &io_mem_unassigned);
	1385
	1386	if (!(existing->mr->subpage)) {
	1387	subpage = subpage_init(fv, base);
	1388	subsection.fv = fv;
	1389	subsection.mr = &subpage->iomem;
	1390	phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
	1391	phys_section_add(&d->map, &subsection));
	1392	} else {
	1393	subpage = container_of(existing->mr, subpage_t, iomem);
	1394	}
	1395	start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
	1396	end = start + int128_get64(section->size) - 1;
	1397	subpage_register(subpage, start, end,
	1398	phys_section_add(&d->map, section));
	1399	}
	1400
	1401
	1402	static void register_multipage(FlatView *fv,
	1403	MemoryRegionSection *section)
	1404	{
	1405	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1406	hwaddr start_addr = section->offset_within_address_space;
	1407	uint16_t section_index = phys_section_add(&d->map, section);
	1408	uint64_t num_pages = int128_get64(int128_rshift(section->size,
	1409	TARGET_PAGE_BITS));
	1410
	1411	assert(num_pages);
	1412	phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
	1413	}
	1414
	1415	void flatview_add_to_dispatch(FlatView fv, MemoryRegionSection section)
	1416	{
	1417	MemoryRegionSection now = section, remain = section;
	1418	Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
	1419
	1420	if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1421	uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
	1422	- now.offset_within_address_space;
	1423
	1424	now.size = int128_min(int128_make64(left), now.size);
	1425	register_subpage(fv, &now);
	1426	} else {
	1427	now.size = int128_zero();
	1428	}
	1429	while (int128_ne(remain.size, now.size)) {
	1430	remain.size = int128_sub(remain.size, now.size);
	1431	remain.offset_within_address_space += int128_get64(now.size);
	1432	remain.offset_within_region += int128_get64(now.size);
	1433	now = remain;
	1434	if (int128_lt(remain.size, page_size)) {
	1435	register_subpage(fv, &now);
	1436	} else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1437	now.size = page_size;
	1438	register_subpage(fv, &now);
	1439	} else {
	1440	now.size = int128_and(now.size, int128_neg(page_size));
	1441	register_multipage(fv, &now);
	1442	}
	1443	}
	1444	}
	1445
	/*
	 * Flush the KVM coalesced MMIO buffer.  Does nothing unless KVM is
	 * enabled.
	 */
	void qemu_flush_coalesced_mmio_buffer(void)
	{
	    if (!kvm_enabled()) {
	        return;
	    }

	    kvm_flush_coalesced_mmio_buffer();
	}
	1451
	1452	void qemu_mutex_lock_ramlist(void)
	1453	{
	1454	qemu_mutex_lock(&ram_list.mutex);
	1455	}
	1456
	1457	void qemu_mutex_unlock_ramlist(void)
	1458	{
	1459	qemu_mutex_unlock(&ram_list.mutex);
	1460	}
	1461
	1462	void ram_block_dump(Monitor *mon)
	1463	{
	1464	RAMBlock *block;
	1465	char *psize;
	1466
	1467	rcu_read_lock();
	1468	monitor_printf(mon, "%24s %8s %18s %18s %18s\n",
	1469	"Block Name", "PSize", "Offset", "Used", "Total");
	1470	RAMBLOCK_FOREACH(block) {
	1471	psize = size_to_str(block->page_size);
	1472	monitor_printf(mon, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
	1473	" 0x%016" PRIx64 "\n", block->idstr, psize,
	1474	(uint64_t)block->offset,
	1475	(uint64_t)block->used_length,
	1476	(uint64_t)block->max_length);
	1477	g_free(psize);
	1478	}
	1479	rcu_read_unlock();
	1480	}
	1481
	1482	#ifdef __linux__
	1483	/*
	1484	* FIXME TOCTTOU: this iterates over memory backends' mem-path, which
	1485	* may or may not name the same files / on the same filesystem now as
	1486	* when we actually open and map them. Iterate over the file
	1487	* descriptors instead, and use qemu_fd_getpagesize().
	1488	*/
	1489	static int find_max_supported_pagesize(Object obj, void opaque)
	1490	{
	1491	char *mem_path;
	1492	long *hpsize_min = opaque;
	1493
	1494	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	1495	mem_path = object_property_get_str(obj, "mem-path", NULL);
	1496	if (mem_path) {
	1497	long hpsize = qemu_mempath_getpagesize(mem_path);
	1498	g_free(mem_path);
	1499	if (hpsize < *hpsize_min) {
	1500	*hpsize_min = hpsize;
	1501	}
	1502	} else {
	1503	*hpsize_min = getpagesize();
	1504	}
	1505	}
	1506
	1507	return 0;
	1508	}
	1509
	1510	long qemu_getrampagesize(void)
	1511	{
	1512	long hpsize = LONG_MAX;
	1513	long mainrampagesize;
	1514	Object *memdev_root;
	1515
	1516	if (mem_path) {
	1517	mainrampagesize = qemu_mempath_getpagesize(mem_path);
	1518	} else {
	1519	mainrampagesize = getpagesize();
	1520	}
	1521
	1522	/* it's possible we have memory-backend objects with
	1523	* hugepage-backed RAM. these may get mapped into system
	1524	* address space via -numa parameters or memory hotplug
	1525	* hooks. we want to take these into account, but we
	1526	* also want to make sure these supported hugepage
	1527	* sizes are applicable across the entire range of memory
	1528	* we may boot from, so we take the min across all
	1529	* backends, and assume normal pages in cases where a
	1530	* backend isn't backed by hugepages.
	1531	*/
	1532	memdev_root = object_resolve_path("/objects", NULL);
	1533	if (memdev_root) {
	1534	object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
	1535	}
	1536	if (hpsize == LONG_MAX) {
	1537	/* No additional memory regions found ==> Report main RAM page size */
	1538	return mainrampagesize;
	1539	}
	1540
	1541	/* If NUMA is disabled or the NUMA nodes are not backed with a
	1542	* memory-backend, then there is at least one node using "normal" RAM,
	1543	* so if its page size is smaller we have got to report that size instead.
	1544	*/
	1545	if (hpsize > mainrampagesize &&
	1546	(nb_numa_nodes == 0 \|\| numa_info[0].node_memdev == NULL)) {
	1547	static bool warned;
	1548	if (!warned) {
	1549	error_report("Huge page support disabled (n/a for main memory).");
	1550	warned = true;
	1551	}
	1552	return mainrampagesize;
	1553	}
	1554
	1555	return hpsize;
	1556	}
	1557	#else
	/* Fallback for non-Linux builds: report getpagesize(). */
	long qemu_getrampagesize(void)
	{
	    return getpagesize();
	}
	1562	#endif
	1563
	1564	#ifdef __linux__
	/*
	 * Return the size of the file behind @fd, obtained by seeking to its
	 * end, or -errno if lseek() fails.  Note that on success the file
	 * offset of @fd is left at the end of the file.
	 */
	static int64_t get_file_size(int fd)
	{
	    const int64_t end_pos = lseek(fd, 0, SEEK_END);

	    return end_pos < 0 ? -errno : end_pos;
	}
	1573
	1574	static int file_ram_open(const char *path,
	1575	const char *region_name,
	1576	bool *created,
	1577	Error **errp)
	1578	{
	1579	char *filename;
	1580	char *sanitized_name;
	1581	char *c;
	1582	int fd = -1;
	1583
	1584	*created = false;
	1585	for (;;) {
	1586	fd = open(path, O_RDWR);
	1587	if (fd >= 0) {
	1588	/* @path names an existing file, use it */
	1589	break;
	1590	}
	1591	if (errno == ENOENT) {
	1592	/* @path names a file that doesn't exist, create it */
	1593	fd = open(path, O_RDWR \| O_CREAT \| O_EXCL, 0644);
	1594	if (fd >= 0) {
	1595	*created = true;
	1596	break;
	1597	}
	1598	} else if (errno == EISDIR) {
	1599	/* @path names a directory, create a file there */
	1600	/* Make name safe to use with mkstemp by replacing '/' with '_'. */
	1601	sanitized_name = g_strdup(region_name);
	1602	for (c = sanitized_name; *c != '\0'; c++) {
	1603	if (*c == '/') {
	1604	*c = '_';
	1605	}
	1606	}
	1607
	1608	filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
	1609	sanitized_name);
	1610	g_free(sanitized_name);
	1611
	1612	fd = mkstemp(filename);
	1613	if (fd >= 0) {
	1614	unlink(filename);
	1615	g_free(filename);
	1616	break;
	1617	}
	1618	g_free(filename);
	1619	}
	1620	if (errno != EEXIST && errno != EINTR) {
	1621	error_setg_errno(errp, errno,
	1622	"can't open backing store %s for guest RAM",
	1623	path);
	1624	return -1;
	1625	}
	1626	/*
	1627	* Try again on EINTR and EEXIST. The latter happens when
	1628	* something else creates the file between our two open().
	1629	*/
	1630	}
	1631
	1632	return fd;
	1633	}
	1634
	1635	static void file_ram_alloc(RAMBlock block,
	1636	ram_addr_t memory,
	1637	int fd,
	1638	bool truncate,
	1639	Error **errp)
	1640	{
	1641	void *area;
	1642
	1643	block->page_size = qemu_fd_getpagesize(fd);
	1644	if (block->mr->align % block->page_size) {
	1645	error_setg(errp, "alignment 0x%" PRIx64
	1646	" must be multiples of page size 0x%zx",
	1647	block->mr->align, block->page_size);
	1648	return NULL;
	1649	}
	1650	block->mr->align = MAX(block->page_size, block->mr->align);
	1651	#if defined(__s390x__)
	1652	if (kvm_enabled()) {
	1653	block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
	1654	}
	1655	#endif
	1656
	1657	if (memory < block->page_size) {
	1658	error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
	1659	"or larger than page size 0x%zx",
	1660	memory, block->page_size);
	1661	return NULL;
	1662	}
	1663
	1664	memory = ROUND_UP(memory, block->page_size);
	1665
	1666	/*
	1667	* ftruncate is not supported by hugetlbfs in older
	1668	* hosts, so don't bother bailing out on errors.
	1669	* If anything goes wrong with it under other filesystems,
	1670	* mmap will fail.
	1671	*
	1672	* Do not truncate the non-empty backend file to avoid corrupting
	1673	* the existing data in the file. Disabling shrinking is not
	1674	* enough. For example, the current vNVDIMM implementation stores
	1675	* the guest NVDIMM labels at the end of the backend file. If the
	1676	* backend file is later extended, QEMU will not be able to find
	1677	* those labels. Therefore, extending the non-empty backend file
	1678	* is disabled as well.
	1679	*/
	1680	if (truncate && ftruncate(fd, memory)) {
	1681	perror("ftruncate");
	1682	}
	1683
	1684	area = qemu_ram_mmap(fd, memory, block->mr->align,
	1685	block->flags & RAM_SHARED);
	1686	if (area == MAP_FAILED) {
	1687	error_setg_errno(errp, errno,
	1688	"unable to map backing store for guest RAM");
	1689	return NULL;
	1690	}
	1691
	1692	if (mem_prealloc) {
	1693	os_mem_prealloc(fd, area, memory, smp_cpus, errp);
	1694	if (errp && *errp) {
	1695	qemu_ram_munmap(area, memory);
	1696	return NULL;
	1697	}
	1698	}
	1699
	1700	block->fd = fd;
	1701	return area;
	1702	}
	1703	#endif
	1704
	1705	/* Allocate space within the ram_addr_t space that governs the
	1706	* dirty bitmaps.
	1707	* Called with the ramlist lock held.
	1708	*/
	1709	static ram_addr_t find_ram_offset(ram_addr_t size)
	1710	{
	1711	RAMBlock block, next_block;
	1712	ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
	1713
	1714	assert(size != 0); /* it would hand out same offset multiple times */
	1715
	1716	if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
	1717	return 0;
	1718	}
	1719
	1720	RAMBLOCK_FOREACH(block) {
	1721	ram_addr_t candidate, next = RAM_ADDR_MAX;
	1722
	1723	/* Align blocks to start on a 'long' in the bitmap
	1724	* which makes the bitmap sync'ing take the fast path.
	1725	*/
	1726	candidate = block->offset + block->max_length;
	1727	candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
	1728
	1729	/* Search for the closest following block
	1730	* and find the gap.
	1731	*/
	1732	RAMBLOCK_FOREACH(next_block) {
	1733	if (next_block->offset >= candidate) {
	1734	next = MIN(next, next_block->offset);
	1735	}
	1736	}
	1737
	1738	/* If it fits remember our place and remember the size
	1739	* of gap, but keep going so that we might find a smaller
	1740	* gap to fill so avoiding fragmentation.
	1741	*/
	1742	if (next - candidate >= size && next - candidate < mingap) {
	1743	offset = candidate;
	1744	mingap = next - candidate;
	1745	}
	1746
	1747	trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
	1748	}
	1749
	1750	if (offset == RAM_ADDR_MAX) {
	1751	fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
	1752	(uint64_t)size);
	1753	abort();
	1754	}
	1755
	1756	trace_find_ram_offset(size, offset);
	1757
	1758	return offset;
	1759	}
	1760
	1761	unsigned long last_ram_page(void)
	1762	{
	1763	RAMBlock *block;
	1764	ram_addr_t last = 0;
	1765
	1766	rcu_read_lock();
	1767	RAMBLOCK_FOREACH(block) {
	1768	last = MAX(last, block->offset + block->max_length);
	1769	}
	1770	rcu_read_unlock();
	1771	return last >> TARGET_PAGE_BITS;
	1772	}
	1773
	1774	static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
	1775	{
	1776	int ret;
	1777
	1778	/* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
	1779	if (!machine_dump_guest_core(current_machine)) {
	1780	ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
	1781	if (ret) {
	1782	perror("qemu_madvise");
	1783	fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
	1784	"but dump_guest_core=off specified\n");
	1785	}
	1786	}
	1787	}
	1788
	1789	const char qemu_ram_get_idstr(RAMBlock rb)
	1790	{
	1791	return rb->idstr;
	1792	}
	1793
	1794	bool qemu_ram_is_shared(RAMBlock *rb)
	1795	{
	1796	return rb->flags & RAM_SHARED;
	1797	}
	1798
	1799	/* Note: Only set at the start of postcopy */
	1800	bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
	1801	{
	1802	return rb->flags & RAM_UF_ZEROPAGE;
	1803	}
	1804
	1805	void qemu_ram_set_uf_zeroable(RAMBlock *rb)
	1806	{
	1807	rb->flags \|= RAM_UF_ZEROPAGE;
	1808	}
	1809
	1810	/* Called with iothread lock held. */
	1811	void qemu_ram_set_idstr(RAMBlock new_block, const char name, DeviceState *dev)
	1812	{
	1813	RAMBlock *block;
	1814
	1815	assert(new_block);
	1816	assert(!new_block->idstr[0]);
	1817
	1818	if (dev) {
	1819	char *id = qdev_get_dev_path(dev);
	1820	if (id) {
	1821	snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
	1822	g_free(id);
	1823	}
	1824	}
	1825	pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
	1826
	1827	rcu_read_lock();
	1828	RAMBLOCK_FOREACH(block) {
	1829	if (block != new_block &&
	1830	!strcmp(block->idstr, new_block->idstr)) {
	1831	fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
	1832	new_block->idstr);
	1833	abort();
	1834	}
	1835	}
	1836	rcu_read_unlock();
	1837	}
	1838
	1839	/* Called with iothread lock held. */
	1840	void qemu_ram_unset_idstr(RAMBlock *block)
	1841	{
	1842	/* FIXME: arch_init.c assumes that this is not called throughout
	1843	* migration. Ignore the problem since hot-unplug during migration
	1844	* does not work anyway.
	1845	*/
	1846	if (block) {
	1847	memset(block->idstr, 0, sizeof(block->idstr));
	1848	}
	1849	}
	1850
	1851	size_t qemu_ram_pagesize(RAMBlock *rb)
	1852	{
	1853	return rb->page_size;
	1854	}
	1855
	1856	/* Returns the largest size of page in use */
	1857	size_t qemu_ram_pagesize_largest(void)
	1858	{
	1859	RAMBlock *block;
	1860	size_t largest = 0;
	1861
	1862	RAMBLOCK_FOREACH(block) {
	1863	largest = MAX(largest, qemu_ram_pagesize(block));
	1864	}
	1865
	1866	return largest;
	1867	}
	1868
	1869	static int memory_try_enable_merging(void *addr, size_t len)
	1870	{
	1871	if (!machine_mem_merge(current_machine)) {
	1872	/* disabled by the user */
	1873	return 0;
	1874	}
	1875
	1876	return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
	1877	}
	1878
	1879	/* Only legal before guest might have detected the memory size: e.g. on
	1880	* incoming migration, or right after reset.
	1881	*
	1882	* As memory core doesn't know how is memory accessed, it is up to
	1883	* resize callback to update device state and/or add assertions to detect
	1884	* misuse, if necessary.
	1885	*/
	1886	int qemu_ram_resize(RAMBlock block, ram_addr_t newsize, Error *errp)
	1887	{
	1888	assert(block);
	1889
	1890	newsize = HOST_PAGE_ALIGN(newsize);
	1891
	1892	if (block->used_length == newsize) {
	1893	return 0;
	1894	}
	1895
	1896	if (!(block->flags & RAM_RESIZEABLE)) {
	1897	error_setg_errno(errp, EINVAL,
	1898	"Length mismatch: %s: 0x" RAM_ADDR_FMT
	1899	" in != 0x" RAM_ADDR_FMT, block->idstr,
	1900	newsize, block->used_length);
	1901	return -EINVAL;
	1902	}
	1903
	1904	if (block->max_length < newsize) {
	1905	error_setg_errno(errp, EINVAL,
	1906	"Length too large: %s: 0x" RAM_ADDR_FMT
	1907	" > 0x" RAM_ADDR_FMT, block->idstr,
	1908	newsize, block->max_length);
	1909	return -EINVAL;
	1910	}
	1911
	1912	cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
	1913	block->used_length = newsize;
	1914	cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
	1915	DIRTY_CLIENTS_ALL);
	1916	memory_region_set_size(block->mr, newsize);
	1917	if (block->resized) {
	1918	block->resized(block->idstr, newsize, block->host);
	1919	}
	1920	return 0;
	1921	}
	1922
	1923	/* Called with ram_list.mutex held */
	1924	static void dirty_memory_extend(ram_addr_t old_ram_size,
	1925	ram_addr_t new_ram_size)
	1926	{
	1927	ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
	1928	DIRTY_MEMORY_BLOCK_SIZE);
	1929	ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
	1930	DIRTY_MEMORY_BLOCK_SIZE);
	1931	int i;
	1932
	1933	/* Only need to extend if block count increased */
	1934	if (new_num_blocks <= old_num_blocks) {
	1935	return;
	1936	}
	1937
	1938	for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
	1939	DirtyMemoryBlocks *old_blocks;
	1940	DirtyMemoryBlocks *new_blocks;
	1941	int j;
	1942
	1943	old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
	1944	new_blocks = g_malloc(sizeof(*new_blocks) +
	1945	sizeof(new_blocks->blocks[0]) * new_num_blocks);
	1946
	1947	if (old_num_blocks) {
	1948	memcpy(new_blocks->blocks, old_blocks->blocks,
	1949	old_num_blocks * sizeof(old_blocks->blocks[0]));
	1950	}
	1951
	1952	for (j = old_num_blocks; j < new_num_blocks; j++) {
	1953	new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
	1954	}
	1955
	1956	atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
	1957
	1958	if (old_blocks) {
	1959	g_free_rcu(old_blocks, rcu);
	1960	}
	1961	}
	1962	}
	1963
	1964	static void ram_block_add(RAMBlock new_block, Error *errp, bool shared)
	1965	{
	1966	RAMBlock *block;
	1967	RAMBlock *last_block = NULL;
	1968	ram_addr_t old_ram_size, new_ram_size;
	1969	Error *err = NULL;
	1970
	1971	old_ram_size = last_ram_page();
	1972
	1973	qemu_mutex_lock_ramlist();
	1974	new_block->offset = find_ram_offset(new_block->max_length);
	1975
	1976	if (!new_block->host) {
	1977	if (xen_enabled()) {
	1978	xen_ram_alloc(new_block->offset, new_block->max_length,
	1979	new_block->mr, &err);
	1980	if (err) {
	1981	error_propagate(errp, err);
	1982	qemu_mutex_unlock_ramlist();
	1983	return;
	1984	}
	1985	} else {
	1986	new_block->host = phys_mem_alloc(new_block->max_length,
	1987	&new_block->mr->align, shared);
	1988	if (!new_block->host) {
	1989	error_setg_errno(errp, errno,
	1990	"cannot set up guest memory '%s'",
	1991	memory_region_name(new_block->mr));
	1992	qemu_mutex_unlock_ramlist();
	1993	return;
	1994	}
	1995	memory_try_enable_merging(new_block->host, new_block->max_length);
	1996	}
	1997	}
	1998
	1999	new_ram_size = MAX(old_ram_size,
	2000	(new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
	2001	if (new_ram_size > old_ram_size) {
	2002	dirty_memory_extend(old_ram_size, new_ram_size);
	2003	}
	2004	/* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
	2005	* QLIST (which has an RCU-friendly variant) does not have insertion at
	2006	* tail, so save the last element in last_block.
	2007	*/
	2008	RAMBLOCK_FOREACH(block) {
	2009	last_block = block;
	2010	if (block->max_length < new_block->max_length) {
	2011	break;
	2012	}
	2013	}
	2014	if (block) {
	2015	QLIST_INSERT_BEFORE_RCU(block, new_block, next);
	2016	} else if (last_block) {
	2017	QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
	2018	} else { /* list is empty */
	2019	QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
	2020	}
	2021	ram_list.mru_block = NULL;
	2022
	2023	/* Write list before version */
	2024	smp_wmb();
	2025	ram_list.version++;
	2026	qemu_mutex_unlock_ramlist();
	2027
	2028	cpu_physical_memory_set_dirty_range(new_block->offset,
	2029	new_block->used_length,
	2030	DIRTY_CLIENTS_ALL);
	2031
	2032	if (new_block->host) {
	2033	qemu_ram_setup_dump(new_block->host, new_block->max_length);
	2034	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
	2035	/* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
	2036	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
	2037	ram_block_notify_add(new_block->host, new_block->max_length);
	2038	}
	2039	}
	2040
	2041	#ifdef __linux__
	2042	RAMBlock qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion mr,
	2043	bool share, int fd,
	2044	Error **errp)
	2045	{
	2046	RAMBlock *new_block;
	2047	Error *local_err = NULL;
	2048	int64_t file_size;
	2049
	2050	if (xen_enabled()) {
	2051	error_setg(errp, "-mem-path not supported with Xen");
	2052	return NULL;
	2053	}
	2054
	2055	if (kvm_enabled() && !kvm_has_sync_mmu()) {
	2056	error_setg(errp,
	2057	"host lacks kvm mmu notifiers, -mem-path unsupported");
	2058	return NULL;
	2059	}
	2060
	2061	if (phys_mem_alloc != qemu_anon_ram_alloc) {
	2062	/*
	2063	* file_ram_alloc() needs to allocate just like
	2064	* phys_mem_alloc, but we haven't bothered to provide
	2065	* a hook there.
	2066	*/
	2067	error_setg(errp,
	2068	"-mem-path not supported with this accelerator");
	2069	return NULL;
	2070	}
	2071
	2072	size = HOST_PAGE_ALIGN(size);
	2073	file_size = get_file_size(fd);
	2074	if (file_size > 0 && file_size < size) {
	2075	error_setg(errp, "backing store %s size 0x%" PRIx64
	2076	" does not match 'size' option 0x" RAM_ADDR_FMT,
	2077	mem_path, file_size, size);
	2078	return NULL;
	2079	}
	2080
	2081	new_block = g_malloc0(sizeof(*new_block));
	2082	new_block->mr = mr;
	2083	new_block->used_length = size;
	2084	new_block->max_length = size;
	2085	new_block->flags = share ? RAM_SHARED : 0;
	2086	new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
	2087	if (!new_block->host) {
	2088	g_free(new_block);
	2089	return NULL;
	2090	}
	2091
	2092	ram_block_add(new_block, &local_err, share);
	2093	if (local_err) {
	2094	g_free(new_block);
	2095	error_propagate(errp, local_err);
	2096	return NULL;
	2097	}
	2098	return new_block;
	2099
	2100	}
	2101
	2102
	2103	RAMBlock qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion mr,
	2104	bool share, const char *mem_path,
	2105	Error **errp)
	2106	{
	2107	int fd;
	2108	bool created;
	2109	RAMBlock *block;
	2110
	2111	fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
	2112	if (fd < 0) {
	2113	return NULL;
	2114	}
	2115
	2116	block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
	2117	if (!block) {
	2118	if (created) {
	2119	unlink(mem_path);
	2120	}
	2121	close(fd);
	2122	return NULL;
	2123	}
	2124
	2125	return block;
	2126	}
	2127	#endif
	2128
	2129	static
	2130	RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
	2131	void (resized)(const char,
	2132	uint64_t length,
	2133	void *host),
	2134	void *host, bool resizeable, bool share,
	2135	MemoryRegion mr, Error *errp)
	2136	{
	2137	RAMBlock *new_block;
	2138	Error *local_err = NULL;
	2139
	2140	size = HOST_PAGE_ALIGN(size);
	2141	max_size = HOST_PAGE_ALIGN(max_size);
	2142	new_block = g_malloc0(sizeof(*new_block));
	2143	new_block->mr = mr;
	2144	new_block->resized = resized;
	2145	new_block->used_length = size;
	2146	new_block->max_length = max_size;
	2147	assert(max_size >= size);
	2148	new_block->fd = -1;
	2149	new_block->page_size = getpagesize();
	2150	new_block->host = host;
	2151	if (host) {
	2152	new_block->flags \|= RAM_PREALLOC;
	2153	}
	2154	if (resizeable) {
	2155	new_block->flags \|= RAM_RESIZEABLE;
	2156	}
	2157	ram_block_add(new_block, &local_err, share);
	2158	if (local_err) {
	2159	g_free(new_block);
	2160	error_propagate(errp, local_err);
	2161	return NULL;
	2162	}
	2163	return new_block;
	2164	}
	2165
	2166	RAMBlock qemu_ram_alloc_from_ptr(ram_addr_t size, void host,
	2167	MemoryRegion mr, Error *errp)
	2168	{
	2169	return qemu_ram_alloc_internal(size, size, NULL, host, false,
	2170	false, mr, errp);
	2171	}
	2172
	2173	RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
	2174	MemoryRegion mr, Error *errp)
	2175	{
	2176	return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
	2177	share, mr, errp);
	2178	}
	2179
	2180	RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
	2181	void (resized)(const char,
	2182	uint64_t length,
	2183	void *host),
	2184	MemoryRegion mr, Error *errp)
	2185	{
	2186	return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
	2187	false, mr, errp);
	2188	}
	2189
	2190	static void reclaim_ramblock(RAMBlock *block)
	2191	{
	2192	if (block->flags & RAM_PREALLOC) {
	2193	;
	2194	} else if (xen_enabled()) {
	2195	xen_invalidate_map_cache_entry(block->host);
	2196	#ifndef _WIN32
	2197	} else if (block->fd >= 0) {
	2198	qemu_ram_munmap(block->host, block->max_length);
	2199	close(block->fd);
	2200	#endif
	2201	} else {
	2202	qemu_anon_ram_free(block->host, block->max_length);
	2203	}
	2204	g_free(block);
	2205	}
	2206
	2207	void qemu_ram_free(RAMBlock *block)
	2208	{
	2209	if (!block) {
	2210	return;
	2211	}
	2212
	2213	if (block->host) {
	2214	ram_block_notify_remove(block->host, block->max_length);
	2215	}
	2216
	2217	qemu_mutex_lock_ramlist();
	2218	QLIST_REMOVE_RCU(block, next);
	2219	ram_list.mru_block = NULL;
	2220	/* Write list before version */
	2221	smp_wmb();
	2222	ram_list.version++;
	2223	call_rcu(block, reclaim_ramblock, rcu);
	2224	qemu_mutex_unlock_ramlist();
	2225	}
	2226
	2227	#ifndef _WIN32
	2228	void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
	2229	{
	2230	RAMBlock *block;
	2231	ram_addr_t offset;
	2232	int flags;
	2233	void area, vaddr;
	2234
	2235	RAMBLOCK_FOREACH(block) {
	2236	offset = addr - block->offset;
	2237	if (offset < block->max_length) {
	2238	vaddr = ramblock_ptr(block, offset);
	2239	if (block->flags & RAM_PREALLOC) {
	2240	;
	2241	} else if (xen_enabled()) {
	2242	abort();
	2243	} else {
	2244	flags = MAP_FIXED;
	2245	if (block->fd >= 0) {
	2246	flags \|= (block->flags & RAM_SHARED ?
	2247	MAP_SHARED : MAP_PRIVATE);
	2248	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2249	flags, block->fd, offset);
	2250	} else {
	2251	/*
	2252	* Remap needs to match alloc. Accelerators that
	2253	* set phys_mem_alloc never remap. If they did,
	2254	* we'd need a remap hook here.
	2255	*/
	2256	assert(phys_mem_alloc == qemu_anon_ram_alloc);
	2257
	2258	flags \|= MAP_PRIVATE \| MAP_ANONYMOUS;
	2259	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2260	flags, -1, 0);
	2261	}
	2262	if (area != vaddr) {
	2263	error_report("Could not remap addr: "
	2264	RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
	2265	length, addr);
	2266	exit(1);
	2267	}
	2268	memory_try_enable_merging(vaddr, length);
	2269	qemu_ram_setup_dump(vaddr, length);
	2270	}
	2271	}
	2272	}
	2273	}
	2274	#endif /* !_WIN32 */
	2275
	2276	/* Return a host pointer to ram allocated with qemu_ram_alloc.
	2277	* This should not be used for general purpose DMA. Use address_space_map
	2278	* or address_space_rw instead. For local memory (e.g. video ram) that the
	2279	* device owns, use memory_region_get_ram_ptr.
	2280	*
	2281	* Called within RCU critical section.
	2282	*/
	2283	void qemu_map_ram_ptr(RAMBlock ram_block, ram_addr_t addr)
	2284	{
	2285	RAMBlock *block = ram_block;
	2286
	2287	if (block == NULL) {
	2288	block = qemu_get_ram_block(addr);
	2289	addr -= block->offset;
	2290	}
	2291
	2292	if (xen_enabled() && block->host == NULL) {
	2293	/* We need to check if the requested address is in the RAM
	2294	* because we don't want to map the entire memory in QEMU.
	2295	* In that case just map until the end of the page.
	2296	*/
	2297	if (block->offset == 0) {
	2298	return xen_map_cache(addr, 0, 0, false);
	2299	}
	2300
	2301	block->host = xen_map_cache(block->offset, block->max_length, 1, false);
	2302	}
	2303	return ramblock_ptr(block, addr);
	2304	}
	2305
	2306	/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
	2307	* but takes a size argument.
	2308	*
	2309	* Called within RCU critical section.
	2310	*/
	2311	static void qemu_ram_ptr_length(RAMBlock ram_block, ram_addr_t addr,
	2312	hwaddr *size, bool lock)
	2313	{
	2314	RAMBlock *block = ram_block;
	2315	if (*size == 0) {
	2316	return NULL;
	2317	}
	2318
	2319	if (block == NULL) {
	2320	block = qemu_get_ram_block(addr);
	2321	addr -= block->offset;
	2322	}
	2323	size = MIN(size, block->max_length - addr);
	2324
	2325	if (xen_enabled() && block->host == NULL) {
	2326	/* We need to check if the requested address is in the RAM
	2327	* because we don't want to map the entire memory in QEMU.
	2328	* In that case just map the requested area.
	2329	*/
	2330	if (block->offset == 0) {
	2331	return xen_map_cache(addr, *size, lock, lock);
	2332	}
	2333
	2334	block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
	2335	}
	2336
	2337	return ramblock_ptr(block, addr);
	2338	}
	2339
	2340	/* Return the offset of a hostpointer within a ramblock */
	2341	ram_addr_t qemu_ram_block_host_offset(RAMBlock rb, void host)
	2342	{
	2343	ram_addr_t res = (uint8_t )host - (uint8_t )rb->host;
	2344	assert((uintptr_t)host >= (uintptr_t)rb->host);
	2345	assert(res < rb->max_length);
	2346
	2347	return res;
	2348	}
	2349
	2350	/*
	2351	* Translates a host ptr back to a RAMBlock, a ram_addr and an offset
	2352	* in that RAMBlock.
	2353	*
	2354	* ptr: Host pointer to look up
	2355	* round_offset: If true round the result offset down to a page boundary
	2356	* *ram_addr: set to result ram_addr
	2357	* *offset: set to result offset within the RAMBlock
	2358	*
	2359	* Returns: RAMBlock (or NULL if not found)
	2360	*
	2361	* By the time this function returns, the returned pointer is not protected
	2362	* by RCU anymore. If the caller is not within an RCU critical section and
	2363	* does not hold the iothread lock, it must have other means of protecting the
	2364	* pointer, such as a reference to the region that includes the incoming
	2365	* ram_addr_t.
	2366	*/
	2367	RAMBlock qemu_ram_block_from_host(void ptr, bool round_offset,
	2368	ram_addr_t *offset)
	2369	{
	2370	RAMBlock *block;
	2371	uint8_t *host = ptr;
	2372
	2373	if (xen_enabled()) {
	2374	ram_addr_t ram_addr;
	2375	rcu_read_lock();
	2376	ram_addr = xen_ram_addr_from_mapcache(ptr);
	2377	block = qemu_get_ram_block(ram_addr);
	2378	if (block) {
	2379	*offset = ram_addr - block->offset;
	2380	}
	2381	rcu_read_unlock();
	2382	return block;
	2383	}
	2384
	2385	rcu_read_lock();
	2386	block = atomic_rcu_read(&ram_list.mru_block);
	2387	if (block && block->host && host - block->host < block->max_length) {
	2388	goto found;
	2389	}
	2390
	2391	RAMBLOCK_FOREACH(block) {
	2392	/* This case append when the block is not mapped. */
	2393	if (block->host == NULL) {
	2394	continue;
	2395	}
	2396	if (host - block->host < block->max_length) {
	2397	goto found;
	2398	}
	2399	}
	2400
	2401	rcu_read_unlock();
	2402	return NULL;
	2403
	2404	found:
	2405	*offset = (host - block->host);
	2406	if (round_offset) {
	2407	*offset &= TARGET_PAGE_MASK;
	2408	}
	2409	rcu_read_unlock();
	2410	return block;
	2411	}
	2412
	2413	/*
	2414	* Finds the named RAMBlock
	2415	*
	2416	* name: The name of RAMBlock to find
	2417	*
	2418	* Returns: RAMBlock (or NULL if not found)
	2419	*/
	2420	RAMBlock qemu_ram_block_by_name(const char name)
	2421	{
	2422	RAMBlock *block;
	2423
	2424	RAMBLOCK_FOREACH(block) {
	2425	if (!strcmp(name, block->idstr)) {
	2426	return block;
	2427	}
	2428	}
	2429
	2430	return NULL;
	2431	}
	2432
	2433	/* Some of the softmmu routines need to translate from a host pointer
	2434	(typically a TLB entry) back to a ram offset. */
	2435	ram_addr_t qemu_ram_addr_from_host(void *ptr)
	2436	{
	2437	RAMBlock *block;
	2438	ram_addr_t offset;
	2439
	2440	block = qemu_ram_block_from_host(ptr, false, &offset);
	2441	if (!block) {
	2442	return RAM_ADDR_INVALID;
	2443	}
	2444
	2445	return block->offset + offset;
	2446	}
	2447
	2448	/* Called within RCU critical section. */
	2449	void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
	2450	CPUState *cpu,
	2451	vaddr mem_vaddr,
	2452	ram_addr_t ram_addr,
	2453	unsigned size)
	2454	{
	2455	ndi->cpu = cpu;
	2456	ndi->ram_addr = ram_addr;
	2457	ndi->mem_vaddr = mem_vaddr;
	2458	ndi->size = size;
	2459	ndi->locked = false;
	2460
	2461	assert(tcg_enabled());
	2462	if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
	2463	ndi->locked = true;
	2464	tb_lock();
	2465	tb_invalidate_phys_page_fast(ram_addr, size);
	2466	}
	2467	}
	2468
	2469	/* Called within RCU critical section. */
	2470	void memory_notdirty_write_complete(NotDirtyInfo *ndi)
	2471	{
	2472	if (ndi->locked) {
	2473	tb_unlock();
	2474	}
	2475
	2476	/* Set both VGA and migration bits for simplicity and to remove
	2477	* the notdirty callback faster.
	2478	*/
	2479	cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
	2480	DIRTY_CLIENTS_NOCODE);
	2481	/* we remove the notdirty callback only if the code has been
	2482	flushed */
	2483	if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
	2484	tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
	2485	}
	2486	}
	2487
	2488	/* Called within RCU critical section. */
	2489	static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
	2490	uint64_t val, unsigned size)
	2491	{
	2492	NotDirtyInfo ndi;
	2493
	2494	memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
	2495	ram_addr, size);
	2496
	2497	switch (size) {
	2498	case 1:
	2499	stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2500	break;
	2501	case 2:
	2502	stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2503	break;
	2504	case 4:
	2505	stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2506	break;
	2507	case 8:
	2508	stq_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2509	break;
	2510	default:
	2511	abort();
	2512	}
	2513	memory_notdirty_write_complete(&ndi);
	2514	}
	2515
	2516	static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
	2517	unsigned size, bool is_write)
	2518	{
	2519	return is_write;
	2520	}
	2521
	2522	static const MemoryRegionOps notdirty_mem_ops = {
	2523	.write = notdirty_mem_write,
	2524	.valid.accepts = notdirty_mem_accepts,
	2525	.endianness = DEVICE_NATIVE_ENDIAN,
	2526	.valid = {
	2527	.min_access_size = 1,
	2528	.max_access_size = 8,
	2529	.unaligned = false,
	2530	},
	2531	.impl = {
	2532	.min_access_size = 1,
	2533	.max_access_size = 8,
	2534	.unaligned = false,
	2535	},
	2536	};
	2537
	2538	/* Generate a debug exception if a watchpoint has been hit. */
	2539	static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
	2540	{
	2541	CPUState *cpu = current_cpu;
	2542	CPUClass *cc = CPU_GET_CLASS(cpu);
	2543	target_ulong vaddr;
	2544	CPUWatchpoint *wp;
	2545
	2546	assert(tcg_enabled());
	2547	if (cpu->watchpoint_hit) {
	2548	/* We re-entered the check after replacing the TB. Now raise
	2549	* the debug interrupt so that is will trigger after the
	2550	* current instruction. */
	2551	cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
	2552	return;
	2553	}
	2554	vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
	2555	vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
	2556	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	2557	if (cpu_watchpoint_address_matches(wp, vaddr, len)
	2558	&& (wp->flags & flags)) {
	2559	if (flags == BP_MEM_READ) {
	2560	wp->flags \|= BP_WATCHPOINT_HIT_READ;
	2561	} else {
	2562	wp->flags \|= BP_WATCHPOINT_HIT_WRITE;
	2563	}
	2564	wp->hitaddr = vaddr;
	2565	wp->hitattrs = attrs;
	2566	if (!cpu->watchpoint_hit) {
	2567	if (wp->flags & BP_CPU &&
	2568	!cc->debug_check_watchpoint(cpu, wp)) {
	2569	wp->flags &= ~BP_WATCHPOINT_HIT;
	2570	continue;
	2571	}
	2572	cpu->watchpoint_hit = wp;
	2573
	2574	/* Both tb_lock and iothread_mutex will be reset when
	2575	* cpu_loop_exit or cpu_loop_exit_noexc longjmp
	2576	* back into the cpu_exec main loop.
	2577	*/
	2578	tb_lock();
	2579	tb_check_watchpoint(cpu);
	2580	if (wp->flags & BP_STOP_BEFORE_ACCESS) {
	2581	cpu->exception_index = EXCP_DEBUG;
	2582	cpu_loop_exit(cpu);
	2583	} else {
	2584	/* Force execution of one insn next time. */
	2585	cpu->cflags_next_tb = 1 \| curr_cflags();
	2586	cpu_loop_exit_noexc(cpu);
	2587	}
	2588	}
	2589	} else {
	2590	wp->flags &= ~BP_WATCHPOINT_HIT;
	2591	}
	2592	}
	2593	}
	2594
	2595	/* Watchpoint access routines. Watchpoints are inserted using TLB tricks,
	2596	so these check for a hit then pass through to the normal out-of-line
	2597	phys routines. */
	2598	static MemTxResult watch_mem_read(void opaque, hwaddr addr, uint64_t pdata,
	2599	unsigned size, MemTxAttrs attrs)
	2600	{
	2601	MemTxResult res;
	2602	uint64_t data;
	2603	int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
	2604	AddressSpace *as = current_cpu->cpu_ases[asidx].as;
	2605
	2606	check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
	2607	switch (size) {
	2608	case 1:
	2609	data = address_space_ldub(as, addr, attrs, &res);
	2610	break;
	2611	case 2:
	2612	data = address_space_lduw(as, addr, attrs, &res);
	2613	break;
	2614	case 4:
	2615	data = address_space_ldl(as, addr, attrs, &res);
	2616	break;
	2617	case 8:
	2618	data = address_space_ldq(as, addr, attrs, &res);
	2619	break;
	2620	default: abort();
	2621	}
	2622	*pdata = data;
	2623	return res;
	2624	}
	2625
	2626	static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
	2627	uint64_t val, unsigned size,
	2628	MemTxAttrs attrs)
	2629	{
	2630	MemTxResult res;
	2631	int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
	2632	AddressSpace *as = current_cpu->cpu_ases[asidx].as;
	2633
	2634	check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
	2635	switch (size) {
	2636	case 1:
	2637	address_space_stb(as, addr, val, attrs, &res);
	2638	break;
	2639	case 2:
	2640	address_space_stw(as, addr, val, attrs, &res);
	2641	break;
	2642	case 4:
	2643	address_space_stl(as, addr, val, attrs, &res);
	2644	break;
	2645	case 8:
	2646	address_space_stq(as, addr, val, attrs, &res);
	2647	break;
	2648	default: abort();
	2649	}
	2650	return res;
	2651	}
	2652
	2653	static const MemoryRegionOps watch_mem_ops = {
	2654	.read_with_attrs = watch_mem_read,
	2655	.write_with_attrs = watch_mem_write,
	2656	.endianness = DEVICE_NATIVE_ENDIAN,
	2657	.valid = {
	2658	.min_access_size = 1,
	2659	.max_access_size = 8,
	2660	.unaligned = false,
	2661	},
	2662	.impl = {
	2663	.min_access_size = 1,
	2664	.max_access_size = 8,
	2665	.unaligned = false,
	2666	},
	2667	};
	2668
	2669	static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
	2670	MemTxAttrs attrs, uint8_t *buf, int len);
	2671	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	2672	const uint8_t *buf, int len);
	2673	static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
	2674	bool is_write);
	2675
	2676	static MemTxResult subpage_read(void opaque, hwaddr addr, uint64_t data,
	2677	unsigned len, MemTxAttrs attrs)
	2678	{
	2679	subpage_t *subpage = opaque;
	2680	uint8_t buf[8];
	2681	MemTxResult res;
	2682
	2683	#if defined(DEBUG_SUBPAGE)
	2684	printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
	2685	subpage, len, addr);
	2686	#endif
	2687	res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
	2688	if (res) {
	2689	return res;
	2690	}
	2691	switch (len) {
	2692	case 1:
	2693	*data = ldub_p(buf);
	2694	return MEMTX_OK;
	2695	case 2:
	2696	*data = lduw_p(buf);
	2697	return MEMTX_OK;
	2698	case 4:
	2699	*data = ldl_p(buf);
	2700	return MEMTX_OK;
	2701	case 8:
	2702	*data = ldq_p(buf);
	2703	return MEMTX_OK;
	2704	default:
	2705	abort();
	2706	}
	2707	}
	2708
	2709	static MemTxResult subpage_write(void *opaque, hwaddr addr,
	2710	uint64_t value, unsigned len, MemTxAttrs attrs)
	2711	{
	2712	subpage_t *subpage = opaque;
	2713	uint8_t buf[8];
	2714
	2715	#if defined(DEBUG_SUBPAGE)
	2716	printf("%s: subpage %p len %u addr " TARGET_FMT_plx
	2717	" value %"PRIx64"\n",
	2718	__func__, subpage, len, addr, value);
	2719	#endif
	2720	switch (len) {
	2721	case 1:
	2722	stb_p(buf, value);
	2723	break;
	2724	case 2:
	2725	stw_p(buf, value);
	2726	break;
	2727	case 4:
	2728	stl_p(buf, value);
	2729	break;
	2730	case 8:
	2731	stq_p(buf, value);
	2732	break;
	2733	default:
	2734	abort();
	2735	}
	2736	return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
	2737	}
	2738
	2739	static bool subpage_accepts(void *opaque, hwaddr addr,
	2740	unsigned len, bool is_write)
	2741	{
	2742	subpage_t *subpage = opaque;
	2743	#if defined(DEBUG_SUBPAGE)
	2744	printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
	2745	__func__, subpage, is_write ? 'w' : 'r', len, addr);
	2746	#endif
	2747
	2748	return flatview_access_valid(subpage->fv, addr + subpage->base,
	2749	len, is_write);
	2750	}
	2751
	2752	static const MemoryRegionOps subpage_ops = {
	2753	.read_with_attrs = subpage_read,
	2754	.write_with_attrs = subpage_write,
	2755	.impl.min_access_size = 1,
	2756	.impl.max_access_size = 8,
	2757	.valid.min_access_size = 1,
	2758	.valid.max_access_size = 8,
	2759	.valid.accepts = subpage_accepts,
	2760	.endianness = DEVICE_NATIVE_ENDIAN,
	2761	};
	2762
	2763	static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
	2764	uint16_t section)
	2765	{
	2766	int idx, eidx;
	2767
	2768	if (start >= TARGET_PAGE_SIZE \|\| end >= TARGET_PAGE_SIZE)
	2769	return -1;
	2770	idx = SUBPAGE_IDX(start);
	2771	eidx = SUBPAGE_IDX(end);
	2772	#if defined(DEBUG_SUBPAGE)
	2773	printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
	2774	__func__, mmio, start, end, idx, eidx, section);
	2775	#endif
	2776	for (; idx <= eidx; idx++) {
	2777	mmio->sub_section[idx] = section;
	2778	}
	2779
	2780	return 0;
	2781	}
	2782
	2783	static subpage_t subpage_init(FlatView fv, hwaddr base)
	2784	{
	2785	subpage_t *mmio;
	2786
	2787	mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
	2788	mmio->fv = fv;
	2789	mmio->base = base;
	2790	memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
	2791	NULL, TARGET_PAGE_SIZE);
	2792	mmio->iomem.subpage = true;
	2793	#if defined(DEBUG_SUBPAGE)
	2794	printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
	2795	mmio, base, TARGET_PAGE_SIZE);
	2796	#endif
	2797	subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
	2798
	2799	return mmio;
	2800	}
	2801
	2802	static uint16_t dummy_section(PhysPageMap map, FlatView fv, MemoryRegion *mr)
	2803	{
	2804	assert(fv);
	2805	MemoryRegionSection section = {
	2806	.fv = fv,
	2807	.mr = mr,
	2808	.offset_within_address_space = 0,
	2809	.offset_within_region = 0,
	2810	.size = int128_2_64(),
	2811	};
	2812
	2813	return phys_section_add(map, &section);
	2814	}
	2815
	2816	static void readonly_mem_write(void *opaque, hwaddr addr,
	2817	uint64_t val, unsigned size)
	2818	{
	2819	/* Ignore any write to ROM. */
	2820	}
	2821
	2822	static bool readonly_mem_accepts(void *opaque, hwaddr addr,
	2823	unsigned size, bool is_write)
	2824	{
	2825	return is_write;
	2826	}
	2827
	2828	/* This will only be used for writes, because reads are special cased
	2829	* to directly access the underlying host ram.
	2830	*/
	2831	static const MemoryRegionOps readonly_mem_ops = {
	2832	.write = readonly_mem_write,
	2833	.valid.accepts = readonly_mem_accepts,
	2834	.endianness = DEVICE_NATIVE_ENDIAN,
	2835	.valid = {
	2836	.min_access_size = 1,
	2837	.max_access_size = 8,
	2838	.unaligned = false,
	2839	},
	2840	.impl = {
	2841	.min_access_size = 1,
	2842	.max_access_size = 8,
	2843	.unaligned = false,
	2844	},
	2845	};
	2846
	2847	MemoryRegion iotlb_to_region(CPUState cpu, hwaddr index, MemTxAttrs attrs)
	2848	{
	2849	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	2850	CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
	2851	AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
	2852	MemoryRegionSection *sections = d->map.sections;
	2853
	2854	return sections[index & ~TARGET_PAGE_MASK].mr;
	2855	}
	2856
	2857	static void io_mem_init(void)
	2858	{
	2859	memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
	2860	NULL, NULL, UINT64_MAX);
	2861	memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
	2862	NULL, UINT64_MAX);
	2863
	2864	/* io_mem_notdirty calls tb_invalidate_phys_page_fast,
	2865	* which can be called without the iothread mutex.
	2866	*/
	2867	memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
	2868	NULL, UINT64_MAX);
	2869	memory_region_clear_global_locking(&io_mem_notdirty);
	2870
	2871	memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
	2872	NULL, UINT64_MAX);
	2873	}
	2874
	2875	AddressSpaceDispatch address_space_dispatch_new(FlatView fv)
	2876	{
	2877	AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
	2878	uint16_t n;
	2879
	2880	n = dummy_section(&d->map, fv, &io_mem_unassigned);
	2881	assert(n == PHYS_SECTION_UNASSIGNED);
	2882	n = dummy_section(&d->map, fv, &io_mem_notdirty);
	2883	assert(n == PHYS_SECTION_NOTDIRTY);
	2884	n = dummy_section(&d->map, fv, &io_mem_rom);
	2885	assert(n == PHYS_SECTION_ROM);
	2886	n = dummy_section(&d->map, fv, &io_mem_watch);
	2887	assert(n == PHYS_SECTION_WATCH);
	2888
	2889	d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
	2890
	2891	return d;
	2892	}
	2893
	2894	void address_space_dispatch_free(AddressSpaceDispatch *d)
	2895	{
	2896	phys_sections_free(&d->map);
	2897	g_free(d);
	2898	}
	2899
	2900	static void tcg_commit(MemoryListener *listener)
	2901	{
	2902	CPUAddressSpace *cpuas;
	2903	AddressSpaceDispatch *d;
	2904
	2905	/* since each CPU stores ram addresses in its TLB cache, we must
	2906	reset the modified entries */
	2907	cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
	2908	cpu_reloading_memory_map();
	2909	/* The CPU and TLB are protected by the iothread lock.
	2910	* We reload the dispatch pointer now because cpu_reloading_memory_map()
	2911	* may have split the RCU critical section.
	2912	*/
	2913	d = address_space_to_dispatch(cpuas->as);
	2914	atomic_rcu_set(&cpuas->memory_dispatch, d);
	2915	tlb_flush(cpuas->cpu);
	2916	}
	2917
	2918	static void memory_map_init(void)
	2919	{
	2920	system_memory = g_malloc(sizeof(*system_memory));
	2921
	2922	memory_region_init(system_memory, NULL, "system", UINT64_MAX);
	2923	address_space_init(&address_space_memory, system_memory, "memory");
	2924
	2925	system_io = g_malloc(sizeof(*system_io));
	2926	memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
	2927	65536);
	2928	address_space_init(&address_space_io, system_io, "I/O");
	2929	}
	2930
	2931	MemoryRegion *get_system_memory(void)
	2932	{
	2933	return system_memory;
	2934	}
	2935
	2936	MemoryRegion *get_system_io(void)
	2937	{
	2938	return system_io;
	2939	}
	2940
	2941	#endif /* !defined(CONFIG_USER_ONLY) */
	2942
	2943	/* physical memory access (slow version, mainly for debug) */
	2944	#if defined(CONFIG_USER_ONLY)
	2945	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	2946	uint8_t *buf, int len, int is_write)
	2947	{
	2948	int l, flags;
	2949	target_ulong page;
	2950	void * p;
	2951
	2952	while (len > 0) {
	2953	page = addr & TARGET_PAGE_MASK;
	2954	l = (page + TARGET_PAGE_SIZE) - addr;
	2955	if (l > len)
	2956	l = len;
	2957	flags = page_get_flags(page);
	2958	if (!(flags & PAGE_VALID))
	2959	return -1;
	2960	if (is_write) {
	2961	if (!(flags & PAGE_WRITE))
	2962	return -1;
	2963	/* XXX: this code should not depend on lock_user */
	2964	if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
	2965	return -1;
	2966	memcpy(p, buf, l);
	2967	unlock_user(p, addr, l);
	2968	} else {
	2969	if (!(flags & PAGE_READ))
	2970	return -1;
	2971	/* XXX: this code should not depend on lock_user */
	2972	if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
	2973	return -1;
	2974	memcpy(buf, p, l);
	2975	unlock_user(p, addr, 0);
	2976	}
	2977	len -= l;
	2978	buf += l;
	2979	addr += l;
	2980	}
	2981	return 0;
	2982	}
	2983
	2984	#else
	2985
	2986	static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
	2987	hwaddr length)
	2988	{
	2989	uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
	2990	addr += memory_region_get_ram_addr(mr);
	2991
	2992	/* No early return if dirty_log_mask is or becomes 0, because
	2993	* cpu_physical_memory_set_dirty_range will still call
	2994	* xen_modified_memory.
	2995	*/
	2996	if (dirty_log_mask) {
	2997	dirty_log_mask =
	2998	cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
	2999	}
	3000	if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
	3001	assert(tcg_enabled());
	3002	tb_lock();
	3003	tb_invalidate_phys_range(addr, addr + length);
	3004	tb_unlock();
	3005	dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
	3006	}
	3007	cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
	3008	}
	3009
	3010	static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
	3011	{
	3012	unsigned access_size_max = mr->ops->valid.max_access_size;
	3013
	3014	/* Regions are assumed to support 1-4 byte accesses unless
	3015	otherwise specified. */
	3016	if (access_size_max == 0) {
	3017	access_size_max = 4;
	3018	}
	3019
	3020	/* Bound the maximum access by the alignment of the address. */
	3021	if (!mr->ops->impl.unaligned) {
	3022	unsigned align_size_max = addr & -addr;
	3023	if (align_size_max != 0 && align_size_max < access_size_max) {
	3024	access_size_max = align_size_max;
	3025	}
	3026	}
	3027
	3028	/* Don't attempt accesses larger than the maximum. */
	3029	if (l > access_size_max) {
	3030	l = access_size_max;
	3031	}
	3032	l = pow2floor(l);
	3033
	3034	return l;
	3035	}
	3036
	3037	static bool prepare_mmio_access(MemoryRegion *mr)
	3038	{
	3039	bool unlocked = !qemu_mutex_iothread_locked();
	3040	bool release_lock = false;
	3041
	3042	if (unlocked && mr->global_locking) {
	3043	qemu_mutex_lock_iothread();
	3044	unlocked = false;
	3045	release_lock = true;
	3046	}
	3047	if (mr->flush_coalesced_mmio) {
	3048	if (unlocked) {
	3049	qemu_mutex_lock_iothread();
	3050	}
	3051	qemu_flush_coalesced_mmio_buffer();
	3052	if (unlocked) {
	3053	qemu_mutex_unlock_iothread();
	3054	}
	3055	}
	3056
	3057	return release_lock;
	3058	}
	3059
	3060	/* Called within RCU critical section. */
	3061	static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
	3062	MemTxAttrs attrs,
	3063	const uint8_t *buf,
	3064	int len, hwaddr addr1,
	3065	hwaddr l, MemoryRegion *mr)
	3066	{
	3067	uint8_t *ptr;
	3068	uint64_t val;
	3069	MemTxResult result = MEMTX_OK;
	3070	bool release_lock = false;
	3071
	3072	for (;;) {
	3073	if (!memory_access_is_direct(mr, true)) {
	3074	release_lock \|= prepare_mmio_access(mr);
	3075	l = memory_access_size(mr, l, addr1);
	3076	/* XXX: could force current_cpu to NULL to avoid
	3077	potential bugs */
	3078	switch (l) {
	3079	case 8:
	3080	/* 64 bit write access */
	3081	val = ldq_p(buf);
	3082	result \|= memory_region_dispatch_write(mr, addr1, val, 8,
	3083	attrs);
	3084	break;
	3085	case 4:
	3086	/* 32 bit write access */
	3087	val = (uint32_t)ldl_p(buf);
	3088	result \|= memory_region_dispatch_write(mr, addr1, val, 4,
	3089	attrs);
	3090	break;
	3091	case 2:
	3092	/* 16 bit write access */
	3093	val = lduw_p(buf);
	3094	result \|= memory_region_dispatch_write(mr, addr1, val, 2,
	3095	attrs);
	3096	break;
	3097	case 1:
	3098	/* 8 bit write access */
	3099	val = ldub_p(buf);
	3100	result \|= memory_region_dispatch_write(mr, addr1, val, 1,
	3101	attrs);
	3102	break;
	3103	default:
	3104	abort();
	3105	}
	3106	} else {
	3107	/* RAM case */
	3108	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3109	memcpy(ptr, buf, l);
	3110	invalidate_and_set_dirty(mr, addr1, l);
	3111	}
	3112
	3113	if (release_lock) {
	3114	qemu_mutex_unlock_iothread();
	3115	release_lock = false;
	3116	}
	3117
	3118	len -= l;
	3119	buf += l;
	3120	addr += l;
	3121
	3122	if (!len) {
	3123	break;
	3124	}
	3125
	3126	l = len;
	3127	mr = flatview_translate(fv, addr, &addr1, &l, true);
	3128	}
	3129
	3130	return result;
	3131	}
	3132
	3133	/* Called from RCU critical section. */
	3134	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	3135	const uint8_t *buf, int len)
	3136	{
	3137	hwaddr l;
	3138	hwaddr addr1;
	3139	MemoryRegion *mr;
	3140	MemTxResult result = MEMTX_OK;
	3141
	3142	l = len;
	3143	mr = flatview_translate(fv, addr, &addr1, &l, true);
	3144	result = flatview_write_continue(fv, addr, attrs, buf, len,
	3145	addr1, l, mr);
	3146
	3147	return result;
	3148	}
	3149
	3150	/* Called within RCU critical section. */
	3151	MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
	3152	MemTxAttrs attrs, uint8_t *buf,
	3153	int len, hwaddr addr1, hwaddr l,
	3154	MemoryRegion *mr)
	3155	{
	3156	uint8_t *ptr;
	3157	uint64_t val;
	3158	MemTxResult result = MEMTX_OK;
	3159	bool release_lock = false;
	3160
	3161	for (;;) {
	3162	if (!memory_access_is_direct(mr, false)) {
	3163	/* I/O case */
	3164	release_lock \|= prepare_mmio_access(mr);
	3165	l = memory_access_size(mr, l, addr1);
	3166	switch (l) {
	3167	case 8:
	3168	/* 64 bit read access */
	3169	result \|= memory_region_dispatch_read(mr, addr1, &val, 8,
	3170	attrs);
	3171	stq_p(buf, val);
	3172	break;
	3173	case 4:
	3174	/* 32 bit read access */
	3175	result \|= memory_region_dispatch_read(mr, addr1, &val, 4,
	3176	attrs);
	3177	stl_p(buf, val);
	3178	break;
	3179	case 2:
	3180	/* 16 bit read access */
	3181	result \|= memory_region_dispatch_read(mr, addr1, &val, 2,
	3182	attrs);
	3183	stw_p(buf, val);
	3184	break;
	3185	case 1:
	3186	/* 8 bit read access */
	3187	result \|= memory_region_dispatch_read(mr, addr1, &val, 1,
	3188	attrs);
	3189	stb_p(buf, val);
	3190	break;
	3191	default:
	3192	abort();
	3193	}
	3194	} else {
	3195	/* RAM case */
	3196	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3197	memcpy(buf, ptr, l);
	3198	}
	3199
	3200	if (release_lock) {
	3201	qemu_mutex_unlock_iothread();
	3202	release_lock = false;
	3203	}
	3204
	3205	len -= l;
	3206	buf += l;
	3207	addr += l;
	3208
	3209	if (!len) {
	3210	break;
	3211	}
	3212
	3213	l = len;
	3214	mr = flatview_translate(fv, addr, &addr1, &l, false);
	3215	}
	3216
	3217	return result;
	3218	}
	3219
	3220	/* Called from RCU critical section. */
	3221	static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
	3222	MemTxAttrs attrs, uint8_t *buf, int len)
	3223	{
	3224	hwaddr l;
	3225	hwaddr addr1;
	3226	MemoryRegion *mr;
	3227
	3228	l = len;
	3229	mr = flatview_translate(fv, addr, &addr1, &l, false);
	3230	return flatview_read_continue(fv, addr, attrs, buf, len,
	3231	addr1, l, mr);
	3232	}
	3233
	3234	MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
	3235	MemTxAttrs attrs, uint8_t *buf, int len)
	3236	{
	3237	MemTxResult result = MEMTX_OK;
	3238	FlatView *fv;
	3239
	3240	if (len > 0) {
	3241	rcu_read_lock();
	3242	fv = address_space_to_flatview(as);
	3243	result = flatview_read(fv, addr, attrs, buf, len);
	3244	rcu_read_unlock();
	3245	}
	3246
	3247	return result;
	3248	}
	3249
	3250	MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
	3251	MemTxAttrs attrs,
	3252	const uint8_t *buf, int len)
	3253	{
	3254	MemTxResult result = MEMTX_OK;
	3255	FlatView *fv;
	3256
	3257	if (len > 0) {
	3258	rcu_read_lock();
	3259	fv = address_space_to_flatview(as);
	3260	result = flatview_write(fv, addr, attrs, buf, len);
	3261	rcu_read_unlock();
	3262	}
	3263
	3264	return result;
	3265	}
	3266
	3267	MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
	3268	uint8_t *buf, int len, bool is_write)
	3269	{
	3270	if (is_write) {
	3271	return address_space_write(as, addr, attrs, buf, len);
	3272	} else {
	3273	return address_space_read_full(as, addr, attrs, buf, len);
	3274	}
	3275	}
	3276
	3277	void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
	3278	int len, int is_write)
	3279	{
	3280	address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
	3281	buf, len, is_write);
	3282	}
	3283
	/* Operation selected in cpu_physical_memory_write_rom_internal(). */
	enum write_rom_type {
	    WRITE_DATA = 0,     /* copy the caller's buffer into RAM/ROM */
	    FLUSH_CACHE = 1,    /* flush the host icache over the range */
	};
	3288
	3289	static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
	3290	hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
	3291	{
	3292	hwaddr l;
	3293	uint8_t *ptr;
	3294	hwaddr addr1;
	3295	MemoryRegion *mr;
	3296
	3297	rcu_read_lock();
	3298	while (len > 0) {
	3299	l = len;
	3300	mr = address_space_translate(as, addr, &addr1, &l, true);
	3301
	3302	if (!(memory_region_is_ram(mr) \|\|
	3303	memory_region_is_romd(mr))) {
	3304	l = memory_access_size(mr, l, addr1);
	3305	} else {
	3306	/* ROM/RAM case */
	3307	ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
	3308	switch (type) {
	3309	case WRITE_DATA:
	3310	memcpy(ptr, buf, l);
	3311	invalidate_and_set_dirty(mr, addr1, l);
	3312	break;
	3313	case FLUSH_CACHE:
	3314	flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
	3315	break;
	3316	}
	3317	}
	3318	len -= l;
	3319	buf += l;
	3320	addr += l;
	3321	}
	3322	rcu_read_unlock();
	3323	}
	3324
	3325	/* used for ROM loading : can write in RAM and ROM */
	3326	void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
	3327	const uint8_t *buf, int len)
	3328	{
	3329	cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
	3330	}
	3331
	3332	void cpu_flush_icache_range(hwaddr start, int len)
	3333	{
	3334	/*
	3335	* This function should do the same thing as an icache flush that was
	3336	* triggered from within the guest. For TCG we are always cache coherent,
	3337	* so there is no need to flush anything. For KVM / Xen we need to flush
	3338	* the host's instruction cache at least.
	3339	*/
	3340	if (tcg_enabled()) {
	3341	return;
	3342	}
	3343
	3344	cpu_physical_memory_write_rom_internal(&address_space_memory,
	3345	start, NULL, len, FLUSH_CACHE);
	3346	}
	3347
	3348	typedef struct {
	3349	MemoryRegion *mr;
	3350	void *buffer;
	3351	hwaddr addr;
	3352	hwaddr len;
	3353	bool in_use;
	3354	} BounceBuffer;
	3355
	3356	static BounceBuffer bounce;
	3357
	3358	typedef struct MapClient {
	3359	QEMUBH *bh;
	3360	QLIST_ENTRY(MapClient) link;
	3361	} MapClient;
	3362
	3363	QemuMutex map_client_list_lock;
	3364	static QLIST_HEAD(map_client_list, MapClient) map_client_list
	3365	= QLIST_HEAD_INITIALIZER(map_client_list);
	3366
	3367	static void cpu_unregister_map_client_do(MapClient *client)
	3368	{
	3369	QLIST_REMOVE(client, link);
	3370	g_free(client);
	3371	}
	3372
	3373	static void cpu_notify_map_clients_locked(void)
	3374	{
	3375	MapClient *client;
	3376
	3377	while (!QLIST_EMPTY(&map_client_list)) {
	3378	client = QLIST_FIRST(&map_client_list);
	3379	qemu_bh_schedule(client->bh);
	3380	cpu_unregister_map_client_do(client);
	3381	}
	3382	}
	3383
	3384	void cpu_register_map_client(QEMUBH *bh)
	3385	{
	3386	MapClient client = g_malloc(sizeof(client));
	3387
	3388	qemu_mutex_lock(&map_client_list_lock);
	3389	client->bh = bh;
	3390	QLIST_INSERT_HEAD(&map_client_list, client, link);
	3391	if (!atomic_read(&bounce.in_use)) {
	3392	cpu_notify_map_clients_locked();
	3393	}
	3394	qemu_mutex_unlock(&map_client_list_lock);
	3395	}
	3396
	3397	void cpu_exec_init_all(void)
	3398	{
	3399	qemu_mutex_init(&ram_list.mutex);
	3400	/* The data structures we set up here depend on knowing the page size,
	3401	* so no more changes can be made after this point.
	3402	* In an ideal world, nothing we did before we had finished the
	3403	* machine setup would care about the target page size, and we could
	3404	* do this much later, rather than requiring board models to state
	3405	* up front what their requirements are.
	3406	*/
	3407	finalize_target_page_bits();
	3408	io_mem_init();
	3409	memory_map_init();
	3410	qemu_mutex_init(&map_client_list_lock);
	3411	}
	3412
	3413	void cpu_unregister_map_client(QEMUBH *bh)
	3414	{
	3415	MapClient *client;
	3416
	3417	qemu_mutex_lock(&map_client_list_lock);
	3418	QLIST_FOREACH(client, &map_client_list, link) {
	3419	if (client->bh == bh) {
	3420	cpu_unregister_map_client_do(client);
	3421	break;
	3422	}
	3423	}
	3424	qemu_mutex_unlock(&map_client_list_lock);
	3425	}
	3426
	3427	static void cpu_notify_map_clients(void)
	3428	{
	3429	qemu_mutex_lock(&map_client_list_lock);
	3430	cpu_notify_map_clients_locked();
	3431	qemu_mutex_unlock(&map_client_list_lock);
	3432	}
	3433
	3434	static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
	3435	bool is_write)
	3436	{
	3437	MemoryRegion *mr;
	3438	hwaddr l, xlat;
	3439
	3440	while (len > 0) {
	3441	l = len;
	3442	mr = flatview_translate(fv, addr, &xlat, &l, is_write);
	3443	if (!memory_access_is_direct(mr, is_write)) {
	3444	l = memory_access_size(mr, l, addr);
	3445	if (!memory_region_access_valid(mr, xlat, l, is_write)) {
	3446	return false;
	3447	}
	3448	}
	3449
	3450	len -= l;
	3451	addr += l;
	3452	}
	3453	return true;
	3454	}
	3455
	3456	bool address_space_access_valid(AddressSpace *as, hwaddr addr,
	3457	int len, bool is_write)
	3458	{
	3459	FlatView *fv;
	3460	bool result;
	3461
	3462	rcu_read_lock();
	3463	fv = address_space_to_flatview(as);
	3464	result = flatview_access_valid(fv, addr, len, is_write);
	3465	rcu_read_unlock();
	3466	return result;
	3467	}
	3468
	3469	static hwaddr
	3470	flatview_extend_translation(FlatView *fv, hwaddr addr,
	3471	hwaddr target_len,
	3472	MemoryRegion *mr, hwaddr base, hwaddr len,
	3473	bool is_write)
	3474	{
	3475	hwaddr done = 0;
	3476	hwaddr xlat;
	3477	MemoryRegion *this_mr;
	3478
	3479	for (;;) {
	3480	target_len -= len;
	3481	addr += len;
	3482	done += len;
	3483	if (target_len == 0) {
	3484	return done;
	3485	}
	3486
	3487	len = target_len;
	3488	this_mr = flatview_translate(fv, addr, &xlat,
	3489	&len, is_write);
	3490	if (this_mr != mr \|\| xlat != base + done) {
	3491	return done;
	3492	}
	3493	}
	3494	}
	3495
	3496	/* Map a physical memory region into a host virtual address.
	3497	* May map a subset of the requested range, given by and returned in *plen.
	3498	* May return NULL if resources needed to perform the mapping are exhausted.
	3499	* Use only for reads OR writes - not for read-modify-write operations.
	3500	* Use cpu_register_map_client() to know when retrying the map operation is
	3501	* likely to succeed.
	3502	*/
	3503	void address_space_map(AddressSpace as,
	3504	hwaddr addr,
	3505	hwaddr *plen,
	3506	bool is_write)
	3507	{
	3508	hwaddr len = *plen;
	3509	hwaddr l, xlat;
	3510	MemoryRegion *mr;
	3511	void *ptr;
	3512	FlatView *fv;
	3513
	3514	if (len == 0) {
	3515	return NULL;
	3516	}
	3517
	3518	l = len;
	3519	rcu_read_lock();
	3520	fv = address_space_to_flatview(as);
	3521	mr = flatview_translate(fv, addr, &xlat, &l, is_write);
	3522
	3523	if (!memory_access_is_direct(mr, is_write)) {
	3524	if (atomic_xchg(&bounce.in_use, true)) {
	3525	rcu_read_unlock();
	3526	return NULL;
	3527	}
	3528	/* Avoid unbounded allocations */
	3529	l = MIN(l, TARGET_PAGE_SIZE);
	3530	bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
	3531	bounce.addr = addr;
	3532	bounce.len = l;
	3533
	3534	memory_region_ref(mr);
	3535	bounce.mr = mr;
	3536	if (!is_write) {
	3537	flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
	3538	bounce.buffer, l);
	3539	}
	3540
	3541	rcu_read_unlock();
	3542	*plen = l;
	3543	return bounce.buffer;
	3544	}
	3545
	3546
	3547	memory_region_ref(mr);
	3548	*plen = flatview_extend_translation(fv, addr, len, mr, xlat,
	3549	l, is_write);
	3550	ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
	3551	rcu_read_unlock();
	3552
	3553	return ptr;
	3554	}
	3555
	3556	/* Unmaps a memory region previously mapped by address_space_map().
	3557	* Will also mark the memory as dirty if is_write == 1. access_len gives
	3558	* the amount of memory that was actually read or written by the caller.
	3559	*/
	3560	void address_space_unmap(AddressSpace as, void buffer, hwaddr len,
	3561	int is_write, hwaddr access_len)
	3562	{
	3563	if (buffer != bounce.buffer) {
	3564	MemoryRegion *mr;
	3565	ram_addr_t addr1;
	3566
	3567	mr = memory_region_from_host(buffer, &addr1);
	3568	assert(mr != NULL);
	3569	if (is_write) {
	3570	invalidate_and_set_dirty(mr, addr1, access_len);
	3571	}
	3572	if (xen_enabled()) {
	3573	xen_invalidate_map_cache_entry(buffer);
	3574	}
	3575	memory_region_unref(mr);
	3576	return;
	3577	}
	3578	if (is_write) {
	3579	address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
	3580	bounce.buffer, access_len);
	3581	}
	3582	qemu_vfree(bounce.buffer);
	3583	bounce.buffer = NULL;
	3584	memory_region_unref(bounce.mr);
	3585	atomic_mb_set(&bounce.in_use, false);
	3586	cpu_notify_map_clients();
	3587	}
	3588
	3589	void *cpu_physical_memory_map(hwaddr addr,
	3590	hwaddr *plen,
	3591	int is_write)
	3592	{
	3593	return address_space_map(&address_space_memory, addr, plen, is_write);
	3594	}
	3595
	3596	void cpu_physical_memory_unmap(void *buffer, hwaddr len,
	3597	int is_write, hwaddr access_len)
	3598	{
	3599	return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
	3600	}
	3601
	3602	#define ARG1_DECL AddressSpace *as
	3603	#define ARG1 as
	3604	#define SUFFIX
	3605	#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
	3606	#define IS_DIRECT(mr, is_write) memory_access_is_direct(mr, is_write)
	3607	#define MAP_RAM(mr, ofs) qemu_map_ram_ptr((mr)->ram_block, ofs)
	3608	#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
	3609	#define RCU_READ_LOCK(...) rcu_read_lock()
	3610	#define RCU_READ_UNLOCK(...) rcu_read_unlock()
	3611	#include "memory_ldst.inc.c"
	3612
	3613	int64_t address_space_cache_init(MemoryRegionCache *cache,
	3614	AddressSpace *as,
	3615	hwaddr addr,
	3616	hwaddr len,
	3617	bool is_write)
	3618	{
	3619	cache->len = len;
	3620	cache->as = as;
	3621	cache->xlat = addr;
	3622	return len;
	3623	}
	3624
	3625	void address_space_cache_invalidate(MemoryRegionCache *cache,
	3626	hwaddr addr,
	3627	hwaddr access_len)
	3628	{
	3629	}
	3630
	3631	void address_space_cache_destroy(MemoryRegionCache *cache)
	3632	{
	3633	cache->as = NULL;
	3634	}
	3635
	3636	#define ARG1_DECL MemoryRegionCache *cache
	3637	#define ARG1 cache
	3638	#define SUFFIX _cached
	3639	#define TRANSLATE(addr, ...) \
	3640	address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
	3641	#define IS_DIRECT(mr, is_write) true
	3642	#define MAP_RAM(mr, ofs) qemu_map_ram_ptr((mr)->ram_block, ofs)
	3643	#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
	3644	#define RCU_READ_LOCK() rcu_read_lock()
	3645	#define RCU_READ_UNLOCK() rcu_read_unlock()
	3646	#include "memory_ldst.inc.c"
	3647
	3648	/* virtual memory access for debug (includes writing to ROM) */
	3649	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3650	uint8_t *buf, int len, int is_write)
	3651	{
	3652	int l;
	3653	hwaddr phys_addr;
	3654	target_ulong page;
	3655
	3656	cpu_synchronize_state(cpu);
	3657	while (len > 0) {
	3658	int asidx;
	3659	MemTxAttrs attrs;
	3660
	3661	page = addr & TARGET_PAGE_MASK;
	3662	phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
	3663	asidx = cpu_asidx_from_attrs(cpu, attrs);
	3664	/* if no physical page mapped, return an error */
	3665	if (phys_addr == -1)
	3666	return -1;
	3667	l = (page + TARGET_PAGE_SIZE) - addr;
	3668	if (l > len)
	3669	l = len;
	3670	phys_addr += (addr & ~TARGET_PAGE_MASK);
	3671	if (is_write) {
	3672	cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
	3673	phys_addr, buf, l);
	3674	} else {
	3675	address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
	3676	MEMTXATTRS_UNSPECIFIED,
	3677	buf, l, 0);
	3678	}
	3679	len -= l;
	3680	buf += l;
	3681	addr += l;
	3682	}
	3683	return 0;
	3684	}
	3685
	3686	/*
	3687	* Allows code that needs to deal with migration bitmaps etc to still be built
	3688	* target independent.
	3689	*/
	3690	size_t qemu_target_page_size(void)
	3691	{
	3692	return TARGET_PAGE_SIZE;
	3693	}
	3694
	3695	int qemu_target_page_bits(void)
	3696	{
	3697	return TARGET_PAGE_BITS;
	3698	}
	3699
	3700	int qemu_target_page_bits_min(void)
	3701	{
	3702	return TARGET_PAGE_BITS_MIN;
	3703	}
	3704	#endif
	3705
	3706	/*
	3707	* A helper function for the _utterly broken_ virtio device model to find out if
	3708	* it's running on a big endian machine. Don't do this at home kids!
	3709	*/
	3710	bool target_words_bigendian(void);
	3711	bool target_words_bigendian(void)
	3712	{
	3713	#if defined(TARGET_WORDS_BIGENDIAN)
	3714	return true;
	3715	#else
	3716	return false;
	3717	#endif
	3718	}
	3719
	3720	#ifndef CONFIG_USER_ONLY
	3721	bool cpu_physical_memory_is_io(hwaddr phys_addr)
	3722	{
	3723	MemoryRegion*mr;
	3724	hwaddr l = 1;
	3725	bool res;
	3726
	3727	rcu_read_lock();
	3728	mr = address_space_translate(&address_space_memory,
	3729	phys_addr, &phys_addr, &l, false);
	3730
	3731	res = !(memory_region_is_ram(mr) \|\| memory_region_is_romd(mr));
	3732	rcu_read_unlock();
	3733	return res;
	3734	}
	3735
	3736	int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
	3737	{
	3738	RAMBlock *block;
	3739	int ret = 0;
	3740
	3741	rcu_read_lock();
	3742	RAMBLOCK_FOREACH(block) {
	3743	ret = func(block->idstr, block->host, block->offset,
	3744	block->used_length, opaque);
	3745	if (ret) {
	3746	break;
	3747	}
	3748	}
	3749	rcu_read_unlock();
	3750	return ret;
	3751	}
	3752
	3753	/*
	3754	* Unmap pages of memory from start to start+length such that
	3755	* they a) read as 0, b) Trigger whatever fault mechanism
	3756	* the OS provides for postcopy.
	3757	* The pages must be unmapped by the end of the function.
	3758	* Returns: 0 on success, none-0 on failure
	3759	*
	3760	*/
	3761	int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
	3762	{
	3763	int ret = -1;
	3764
	3765	uint8_t *host_startaddr = rb->host + start;
	3766
	3767	if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
	3768	error_report("ram_block_discard_range: Unaligned start address: %p",
	3769	host_startaddr);
	3770	goto err;
	3771	}
	3772
	3773	if ((start + length) <= rb->used_length) {
	3774	bool need_madvise, need_fallocate;
	3775	uint8_t *host_endaddr = host_startaddr + length;
	3776	if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
	3777	error_report("ram_block_discard_range: Unaligned end address: %p",
	3778	host_endaddr);
	3779	goto err;
	3780	}
	3781
	3782	errno = ENOTSUP; /* If we are missing MADVISE etc */
	3783
	3784	/* The logic here is messy;
	3785	* madvise DONTNEED fails for hugepages
	3786	* fallocate works on hugepages and shmem
	3787	*/
	3788	need_madvise = (rb->page_size == qemu_host_page_size);
	3789	need_fallocate = rb->fd != -1;
	3790	if (need_fallocate) {
	3791	/* For a file, this causes the area of the file to be zero'd
	3792	* if read, and for hugetlbfs also causes it to be unmapped
	3793	* so a userfault will trigger.
	3794	*/
	3795	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	3796	ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
	3797	start, length);
	3798	if (ret) {
	3799	ret = -errno;
	3800	error_report("ram_block_discard_range: Failed to fallocate "
	3801	"%s:%" PRIx64 " +%zx (%d)",
	3802	rb->idstr, start, length, ret);
	3803	goto err;
	3804	}
	3805	#else
	3806	ret = -ENOSYS;
	3807	error_report("ram_block_discard_range: fallocate not available/file"
	3808	"%s:%" PRIx64 " +%zx (%d)",
	3809	rb->idstr, start, length, ret);
	3810	goto err;
	3811	#endif
	3812	}
	3813	if (need_madvise) {
	3814	/* For normal RAM this causes it to be unmapped,
	3815	* for shared memory it causes the local mapping to disappear
	3816	* and to fall back on the file contents (which we just
	3817	* fallocate'd away).
	3818	*/
	3819	#if defined(CONFIG_MADVISE)
	3820	ret = madvise(host_startaddr, length, MADV_DONTNEED);
	3821	if (ret) {
	3822	ret = -errno;
	3823	error_report("ram_block_discard_range: Failed to discard range "
	3824	"%s:%" PRIx64 " +%zx (%d)",
	3825	rb->idstr, start, length, ret);
	3826	goto err;
	3827	}
	3828	#else
	3829	ret = -ENOSYS;
	3830	error_report("ram_block_discard_range: MADVISE not available"
	3831	"%s:%" PRIx64 " +%zx (%d)",
	3832	rb->idstr, start, length, ret);
	3833	goto err;
	3834	#endif
	3835	}
	3836	trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
	3837	need_madvise, need_fallocate, ret);
	3838	} else {
	3839	error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
	3840	"/%zx/" RAM_ADDR_FMT")",
	3841	rb->idstr, start, length, rb->used_length);
	3842	}
	3843
	3844	err:
	3845	return ret;
	3846	}
	3847
	3848	#endif
	3849
	3850	void page_size_init(void)
	3851	{
	3852	/* NOTE: we can always suppose that qemu_host_page_size >=
	3853	TARGET_PAGE_SIZE */
	3854	if (qemu_host_page_size == 0) {
	3855	qemu_host_page_size = qemu_real_host_page_size;
	3856	}
	3857	if (qemu_host_page_size < TARGET_PAGE_SIZE) {
	3858	qemu_host_page_size = TARGET_PAGE_SIZE;
	3859	}
	3860	qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
	3861	}
	3862
	3863	#if !defined(CONFIG_USER_ONLY)
	3864
	3865	static void mtree_print_phys_entries(fprintf_function mon, void *f,
	3866	int start, int end, int skip, int ptr)
	3867	{
	3868	if (start == end - 1) {
	3869	mon(f, "\t%3d ", start);
	3870	} else {
	3871	mon(f, "\t%3d..%-3d ", start, end - 1);
	3872	}
	3873	mon(f, " skip=%d ", skip);
	3874	if (ptr == PHYS_MAP_NODE_NIL) {
	3875	mon(f, " ptr=NIL");
	3876	} else if (!skip) {
	3877	mon(f, " ptr=#%d", ptr);
	3878	} else {
	3879	mon(f, " ptr=[%d]", ptr);
	3880	}
	3881	mon(f, "\n");
	3882	}
	3883
	3884	#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
	3885	int128_sub((size), int128_one())) : 0)
	3886
	3887	void mtree_print_dispatch(fprintf_function mon, void *f,
	3888	AddressSpaceDispatch d, MemoryRegion root)
	3889	{
	3890	int i;
	3891
	3892	mon(f, " Dispatch\n");
	3893	mon(f, " Physical sections\n");
	3894
	3895	for (i = 0; i < d->map.sections_nb; ++i) {
	3896	MemoryRegionSection *s = d->map.sections + i;
	3897	const char *names[] = { " [unassigned]", " [not dirty]",
	3898	" [ROM]", " [watch]" };
	3899
	3900	mon(f, " #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx " %s%s%s%s%s",
	3901	i,
	3902	s->offset_within_address_space,
	3903	s->offset_within_address_space + MR_SIZE(s->mr->size),
	3904	s->mr->name ? s->mr->name : "(noname)",
	3905	i < ARRAY_SIZE(names) ? names[i] : "",
	3906	s->mr == root ? " [ROOT]" : "",
	3907	s == d->mru_section ? " [MRU]" : "",
	3908	s->mr->is_iommu ? " [iommu]" : "");
	3909
	3910	if (s->mr->alias) {
	3911	mon(f, " alias=%s", s->mr->alias->name ?
	3912	s->mr->alias->name : "noname");
	3913	}
	3914	mon(f, "\n");
	3915	}
	3916
	3917	mon(f, " Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
	3918	P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
	3919	for (i = 0; i < d->map.nodes_nb; ++i) {
	3920	int j, jprev;
	3921	PhysPageEntry prev;
	3922	Node *n = d->map.nodes + i;
	3923
	3924	mon(f, " [%d]\n", i);
	3925
	3926	for (j = 0, jprev = 0, prev = n[0]; j < ARRAY_SIZE(n); ++j) {
	3927	PhysPageEntry pe = n + j;
	3928
	3929	if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
	3930	continue;
	3931	}
	3932
	3933	mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
	3934
	3935	jprev = j;
	3936	prev = *pe;
	3937	}
	3938
	3939	if (jprev != ARRAY_SIZE(*n)) {
	3940	mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
	3941	}
	3942	}
	3943	}
	3944
	3945	#endif