Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Virtual page mapping
	3	*
	4	* Copyright (c) 2003 Fabrice Bellard
	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
	18	*/
	19
	20	#include "qemu/osdep.h"
	21	#include "qemu-common.h"
	22	#include "qapi/error.h"
	23
	24	#include "qemu/cutils.h"
	25	#include "cpu.h"
	26	#include "exec/exec-all.h"
	27	#include "exec/target_page.h"
	28	#include "tcg.h"
	29	#include "hw/qdev-core.h"
	30	#include "hw/qdev-properties.h"
	31	#if !defined(CONFIG_USER_ONLY)
	32	#include "hw/boards.h"
	33	#include "hw/xen/xen.h"
	34	#endif
	35	#include "sysemu/kvm.h"
	36	#include "sysemu/sysemu.h"
	37	#include "sysemu/tcg.h"
	38	#include "qemu/timer.h"
	39	#include "qemu/config-file.h"
	40	#include "qemu/error-report.h"
	41	#include "qemu/qemu-print.h"
	42	#if defined(CONFIG_USER_ONLY)
	43	#include "qemu.h"
	44	#else /* !CONFIG_USER_ONLY */
	45	#include "exec/memory.h"
	46	#include "exec/ioport.h"
	47	#include "sysemu/dma.h"
	48	#include "sysemu/hostmem.h"
	49	#include "sysemu/hw_accel.h"
	50	#include "exec/address-spaces.h"
	51	#include "sysemu/xen-mapcache.h"
	52	#include "trace-root.h"
	53
	54	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	55	#include <linux/falloc.h>
	56	#endif
	57
	58	#endif
	59	#include "qemu/rcu_queue.h"
	60	#include "qemu/main-loop.h"
	61	#include "translate-all.h"
	62	#include "sysemu/replay.h"
	63
	64	#include "exec/memory-internal.h"
	65	#include "exec/ram_addr.h"
	66	#include "exec/log.h"
	67
	68	#include "migration/vmstate.h"
	69
	70	#include "qemu/range.h"
	71	#ifndef _WIN32
	72	#include "qemu/mmap-alloc.h"
	73	#endif
	74
	75	#include "monitor/monitor.h"
	76
	77	//#define DEBUG_SUBPAGE
	78
	79	#if !defined(CONFIG_USER_ONLY)
	80	/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
	81	* are protected by the ramlist lock.
	82	*/
	83	RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
	84
	85	static MemoryRegion *system_memory;
	86	static MemoryRegion *system_io;
	87
	88	AddressSpace address_space_io;
	89	AddressSpace address_space_memory;
	90
	91	static MemoryRegion io_mem_unassigned;
	92	#endif
	93
	94	#ifdef TARGET_PAGE_BITS_VARY
	95	int target_page_bits;
	96	bool target_page_bits_decided;
	97	#endif
	98
	99	CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
	100
	101	/* current CPU in the current thread. It is only valid inside
	102	cpu_exec() */
	103	__thread CPUState *current_cpu;
	104	/* 0 = Do not count executed instructions.
	105	1 = Precise instruction counting.
	106	2 = Adaptive rate instruction counting. */
	107	int use_icount;
	108
	109	uintptr_t qemu_host_page_size;
	110	intptr_t qemu_host_page_mask;
	111
	/*
	 * Record a preferred target page size, expressed as a bit count.
	 *
	 * When TARGET_PAGE_BITS_VARY is not defined this is a no-op that
	 * returns true.  Otherwise the stored value is only ever lowered, and
	 * only while it has not been marked as decided.
	 *
	 * Returns false when a smaller size is requested after the page size
	 * was decided, true otherwise.
	 */
	bool set_preferred_target_page_bits(int bits)
	{
	    /* The target page size is the lowest common denominator for all
	     * the CPUs in the system, so we can only make it smaller, never
	     * larger. And we can't make it smaller once we've committed to
	     * a particular size.
	     */
	#ifdef TARGET_PAGE_BITS_VARY
	    assert(bits >= TARGET_PAGE_BITS_MIN);
	    if (target_page_bits == 0 || target_page_bits > bits) {
	        if (target_page_bits_decided) {
	            return false;
	        }
	        target_page_bits = bits;
	    }
	#endif
	    return true;
	}
	130
	131	#if !defined(CONFIG_USER_ONLY)
	132
	/*
	 * Commit the target page size: fall back to TARGET_PAGE_BITS_MIN if no
	 * value was recorded, then mark the size as decided.  Does nothing
	 * unless TARGET_PAGE_BITS_VARY is defined.
	 */
	static void finalize_target_page_bits(void)
	{
	#ifdef TARGET_PAGE_BITS_VARY
	    if (!target_page_bits) {
	        target_page_bits = TARGET_PAGE_BITS_MIN;
	    }
	    target_page_bits_decided = true;
	#endif
	}
	142
	143	typedef struct PhysPageEntry PhysPageEntry;
	144
	/* One 32-bit entry of the multi-level physical page map. */
	struct PhysPageEntry {
	    /* Number of levels to skip to reach the next node (in units of
	     * L2_SIZE); 0 marks a leaf entry.
	     */
	    uint32_t skip : 6;
	    /* Leaf (!skip): index into phys_sections.
	     * Non-leaf (skip): index into phys_map_nodes.
	     */
	    uint32_t ptr : 26;
	};
	151
	152	#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
	153
	154	/* Size of the L2 (and L3, etc) page tables. */
	155	#define ADDR_SPACE_BITS 64
	156
	157	#define P_L2_BITS 9
	158	#define P_L2_SIZE (1 << P_L2_BITS)
	159
	160	#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
	161
	162	typedef PhysPageEntry Node[P_L2_SIZE];
	163
	164	typedef struct PhysPageMap {
	165	struct rcu_head rcu;
	166
	167	unsigned sections_nb;
	168	unsigned sections_nb_alloc;
	169	unsigned nodes_nb;
	170	unsigned nodes_nb_alloc;
	171	Node *nodes;
	172	MemoryRegionSection *sections;
	173	} PhysPageMap;
	174
	175	struct AddressSpaceDispatch {
	176	MemoryRegionSection *mru_section;
	177	/* This is a multi-level map on the physical address space.
	178	* The bottom level has pointers to MemoryRegionSections.
	179	*/
	180	PhysPageEntry phys_map;
	181	PhysPageMap map;
	182	};
	183
	184	#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
	185	typedef struct subpage_t {
	186	MemoryRegion iomem;
	187	FlatView *fv;
	188	hwaddr base;
	189	uint16_t sub_section[];
	190	} subpage_t;
	191
	192	#define PHYS_SECTION_UNASSIGNED 0
	193
	194	static void io_mem_init(void);
	195	static void memory_map_init(void);
	196	static void tcg_log_global_after_sync(MemoryListener *listener);
	197	static void tcg_commit(MemoryListener *listener);
	198
	199	/**
	200	* CPUAddressSpace: all the information a CPU needs about an AddressSpace
	201	* @cpu: the CPU whose AddressSpace this is
	202	* @as: the AddressSpace itself
	203	* @memory_dispatch: its dispatch pointer (cached, RCU protected)
	204	* @tcg_as_listener: listener for tracking changes to the AddressSpace
	205	*/
	206	struct CPUAddressSpace {
	207	CPUState *cpu;
	208	AddressSpace *as;
	209	struct AddressSpaceDispatch *memory_dispatch;
	210	MemoryListener tcg_as_listener;
	211	};
	212
	213	struct DirtyBitmapSnapshot {
	214	ram_addr_t start;
	215	ram_addr_t end;
	216	unsigned long dirty[];
	217	};
	218
	219	#endif
	220
	221	#if !defined(CONFIG_USER_ONLY)
	222
	223	static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
	224	{
	225	static unsigned alloc_hint = 16;
	226	if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
	227	map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
	228	map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
	229	alloc_hint = map->nodes_nb_alloc;
	230	}
	231	}
	232
	233	static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
	234	{
	235	unsigned i;
	236	uint32_t ret;
	237	PhysPageEntry e;
	238	PhysPageEntry *p;
	239
	240	ret = map->nodes_nb++;
	241	p = map->nodes[ret];
	242	assert(ret != PHYS_MAP_NODE_NIL);
	243	assert(ret != map->nodes_nb_alloc);
	244
	245	e.skip = leaf ? 0 : 1;
	246	e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
	247	for (i = 0; i < P_L2_SIZE; ++i) {
	248	memcpy(&p[i], &e, sizeof(e));
	249	}
	250	return ret;
	251	}
	252
	253	static void phys_page_set_level(PhysPageMap map, PhysPageEntry lp,
	254	hwaddr index, uint64_t nb, uint16_t leaf,
	255	int level)
	256	{
	257	PhysPageEntry *p;
	258	hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
	259
	260	if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
	261	lp->ptr = phys_map_node_alloc(map, level == 0);
	262	}
	263	p = map->nodes[lp->ptr];
	264	lp = &p[(index >> (level P_L2_BITS)) & (P_L2_SIZE - 1)];
	265
	266	while (*nb && lp < &p[P_L2_SIZE]) {
	267	if ((index & (step - 1)) == 0 && nb >= step) {
	268	lp->skip = 0;
	269	lp->ptr = leaf;
	270	*index += step;
	271	*nb -= step;
	272	} else {
	273	phys_page_set_level(map, lp, index, nb, leaf, level - 1);
	274	}
	275	++lp;
	276	}
	277	}
	278
	279	static void phys_page_set(AddressSpaceDispatch *d,
	280	hwaddr index, uint64_t nb,
	281	uint16_t leaf)
	282	{
	283	/* Wildly overreserve - it doesn't matter much. */
	284	phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
	285
	286	phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
	287	}
	288
	289	/* Compact a non leaf page entry. Simply detect that the entry has a single child,
	290	* and update our entry so we can skip it and go directly to the destination.
	291	*/
	292	static void phys_page_compact(PhysPageEntry lp, Node nodes)
	293	{
	294	unsigned valid_ptr = P_L2_SIZE;
	295	int valid = 0;
	296	PhysPageEntry *p;
	297	int i;
	298
	299	if (lp->ptr == PHYS_MAP_NODE_NIL) {
	300	return;
	301	}
	302
	303	p = nodes[lp->ptr];
	304	for (i = 0; i < P_L2_SIZE; i++) {
	305	if (p[i].ptr == PHYS_MAP_NODE_NIL) {
	306	continue;
	307	}
	308
	309	valid_ptr = i;
	310	valid++;
	311	if (p[i].skip) {
	312	phys_page_compact(&p[i], nodes);
	313	}
	314	}
	315
	316	/* We can only compress if there's only one child. */
	317	if (valid != 1) {
	318	return;
	319	}
	320
	321	assert(valid_ptr < P_L2_SIZE);
	322
	323	/* Don't compress if it won't fit in the # of bits we have. */
	324	if (P_L2_LEVELS >= (1 << 6) &&
	325	lp->skip + p[valid_ptr].skip >= (1 << 6)) {
	326	return;
	327	}
	328
	329	lp->ptr = p[valid_ptr].ptr;
	330	if (!p[valid_ptr].skip) {
	331	/* If our only child is a leaf, make this a leaf. */
	332	/* By design, we should have made this node a leaf to begin with so we
	333	* should never reach here.
	334	* But since it's so simple to handle this, let's do it just in case we
	335	* change this rule.
	336	*/
	337	lp->skip = 0;
	338	} else {
	339	lp->skip += p[valid_ptr].skip;
	340	}
	341	}
	342
	343	void address_space_dispatch_compact(AddressSpaceDispatch *d)
	344	{
	345	if (d->phys_map.skip) {
	346	phys_page_compact(&d->phys_map, d->map.nodes);
	347	}
	348	}
	349
	350	static inline bool section_covers_addr(const MemoryRegionSection *section,
	351	hwaddr addr)
	352	{
	353	/* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
	354	* the section must cover the entire address space.
	355	*/
	356	return int128_gethi(section->size) \|\|
	357	range_covers_byte(section->offset_within_address_space,
	358	int128_getlo(section->size), addr);
	359	}
	360
	361	static MemoryRegionSection phys_page_find(AddressSpaceDispatch d, hwaddr addr)
	362	{
	363	PhysPageEntry lp = d->phys_map, *p;
	364	Node *nodes = d->map.nodes;
	365	MemoryRegionSection *sections = d->map.sections;
	366	hwaddr index = addr >> TARGET_PAGE_BITS;
	367	int i;
	368
	369	for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
	370	if (lp.ptr == PHYS_MAP_NODE_NIL) {
	371	return &sections[PHYS_SECTION_UNASSIGNED];
	372	}
	373	p = nodes[lp.ptr];
	374	lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
	375	}
	376
	377	if (section_covers_addr(&sections[lp.ptr], addr)) {
	378	return &sections[lp.ptr];
	379	} else {
	380	return &sections[PHYS_SECTION_UNASSIGNED];
	381	}
	382	}
	383
	384	/* Called from RCU critical section */
	385	static MemoryRegionSection address_space_lookup_region(AddressSpaceDispatch d,
	386	hwaddr addr,
	387	bool resolve_subpage)
	388	{
	389	MemoryRegionSection *section = atomic_read(&d->mru_section);
	390	subpage_t *subpage;
	391
	392	if (!section \|\| section == &d->map.sections[PHYS_SECTION_UNASSIGNED] \|\|
	393	!section_covers_addr(section, addr)) {
	394	section = phys_page_find(d, addr);
	395	atomic_set(&d->mru_section, section);
	396	}
	397	if (resolve_subpage && section->mr->subpage) {
	398	subpage = container_of(section->mr, subpage_t, iomem);
	399	section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
	400	}
	401	return section;
	402	}
	403
	404	/* Called from RCU critical section */
	405	static MemoryRegionSection *
	406	address_space_translate_internal(AddressSpaceDispatch d, hwaddr addr, hwaddr xlat,
	407	hwaddr *plen, bool resolve_subpage)
	408	{
	409	MemoryRegionSection *section;
	410	MemoryRegion *mr;
	411	Int128 diff;
	412
	413	section = address_space_lookup_region(d, addr, resolve_subpage);
	414	/* Compute offset within MemoryRegionSection */
	415	addr -= section->offset_within_address_space;
	416
	417	/* Compute offset within MemoryRegion */
	418	*xlat = addr + section->offset_within_region;
	419
	420	mr = section->mr;
	421
	422	/* MMIO registers can be expected to perform full-width accesses based only
	423	* on their address, without considering adjacent registers that could
	424	* decode to completely different MemoryRegions. When such registers
	425	* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
	426	* regions overlap wildly. For this reason we cannot clamp the accesses
	427	* here.
	428	*
	429	* If the length is small (as is the case for address_space_ldl/stl),
	430	* everything works fine. If the incoming length is large, however,
	431	* the caller really has to do the clamping through memory_access_size.
	432	*/
	433	if (memory_region_is_ram(mr)) {
	434	diff = int128_sub(section->size, int128_make64(addr));
	435	plen = int128_get64(int128_min(diff, int128_make64(plen)));
	436	}
	437	return section;
	438	}
	439
	440	/**
	441	* address_space_translate_iommu - translate an address through an IOMMU
	442	* memory region and then through the target address space.
	443	*
	444	* @iommu_mr: the IOMMU memory region that we start the translation from
	445	* @addr: the address to be translated through the MMU
	446	* @xlat: the translated address offset within the destination memory region.
	447	* It cannot be %NULL.
	448	* @plen_out: valid read/write length of the translated address. It
	449	* cannot be %NULL.
	450	* @page_mask_out: page mask for the translated address. This
	451	* should only be meaningful for IOMMU translated
	452	* addresses, since there may be huge pages that this bit
	453	* would tell. It can be %NULL if we don't care about it.
	454	* @is_write: whether the translation operation is for write
	455	* @is_mmio: whether this can be MMIO, set true if it can
	456	* @target_as: the address space targeted by the IOMMU
	457	* @attrs: transaction attributes
	458	*
	459	* This function is called from RCU critical section. It is the common
	460	* part of flatview_do_translate and address_space_translate_cached.
	461	*/
	462	static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
	463	hwaddr *xlat,
	464	hwaddr *plen_out,
	465	hwaddr *page_mask_out,
	466	bool is_write,
	467	bool is_mmio,
	468	AddressSpace **target_as,
	469	MemTxAttrs attrs)
	470	{
	471	MemoryRegionSection *section;
	472	hwaddr page_mask = (hwaddr)-1;
	473
	474	do {
	475	hwaddr addr = *xlat;
	476	IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	477	int iommu_idx = 0;
	478	IOMMUTLBEntry iotlb;
	479
	480	if (imrc->attrs_to_index) {
	481	iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
	482	}
	483
	484	iotlb = imrc->translate(iommu_mr, addr, is_write ?
	485	IOMMU_WO : IOMMU_RO, iommu_idx);
	486
	487	if (!(iotlb.perm & (1 << is_write))) {
	488	goto unassigned;
	489	}
	490
	491	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	492	\| (addr & iotlb.addr_mask));
	493	page_mask &= iotlb.addr_mask;
	494	plen_out = MIN(plen_out, (addr \| iotlb.addr_mask) - addr + 1);
	495	*target_as = iotlb.target_as;
	496
	497	section = address_space_translate_internal(
	498	address_space_to_dispatch(iotlb.target_as), addr, xlat,
	499	plen_out, is_mmio);
	500
	501	iommu_mr = memory_region_get_iommu(section->mr);
	502	} while (unlikely(iommu_mr));
	503
	504	if (page_mask_out) {
	505	*page_mask_out = page_mask;
	506	}
	507	return *section;
	508
	509	unassigned:
	510	return (MemoryRegionSection) { .mr = &io_mem_unassigned };
	511	}
	512
	513	/**
	514	* flatview_do_translate - translate an address in FlatView
	515	*
	516	* @fv: the flat view that we want to translate on
	517	* @addr: the address to be translated in above address space
	518	* @xlat: the translated address offset within memory region. It
	519	* cannot be @NULL.
	520	* @plen_out: valid read/write length of the translated address. It
	521	* can be @NULL when we don't care about it.
	522	* @page_mask_out: page mask for the translated address. This
	523	* should only be meaningful for IOMMU translated
	524	* addresses, since there may be huge pages that this bit
	525	* would tell. It can be @NULL if we don't care about it.
	526	* @is_write: whether the translation operation is for write
	527	* @is_mmio: whether this can be MMIO, set true if it can
	528	* @target_as: the address space targeted by the IOMMU
	529	* @attrs: memory transaction attributes
	530	*
	531	* This function is called from RCU critical section
	532	*/
	533	static MemoryRegionSection flatview_do_translate(FlatView *fv,
	534	hwaddr addr,
	535	hwaddr *xlat,
	536	hwaddr *plen_out,
	537	hwaddr *page_mask_out,
	538	bool is_write,
	539	bool is_mmio,
	540	AddressSpace **target_as,
	541	MemTxAttrs attrs)
	542	{
	543	MemoryRegionSection *section;
	544	IOMMUMemoryRegion *iommu_mr;
	545	hwaddr plen = (hwaddr)(-1);
	546
	547	if (!plen_out) {
	548	plen_out = &plen;
	549	}
	550
	551	section = address_space_translate_internal(
	552	flatview_to_dispatch(fv), addr, xlat,
	553	plen_out, is_mmio);
	554
	555	iommu_mr = memory_region_get_iommu(section->mr);
	556	if (unlikely(iommu_mr)) {
	557	return address_space_translate_iommu(iommu_mr, xlat,
	558	plen_out, page_mask_out,
	559	is_write, is_mmio,
	560	target_as, attrs);
	561	}
	562	if (page_mask_out) {
	563	/* Not behind an IOMMU, use default page size. */
	564	*page_mask_out = ~TARGET_PAGE_MASK;
	565	}
	566
	567	return *section;
	568	}
	569
	570	/* Called from RCU critical section */
	571	IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
	572	bool is_write, MemTxAttrs attrs)
	573	{
	574	MemoryRegionSection section;
	575	hwaddr xlat, page_mask;
	576
	577	/*
	578	* This can never be MMIO, and we don't really care about plen,
	579	* but page mask.
	580	*/
	581	section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
	582	NULL, &page_mask, is_write, false, &as,
	583	attrs);
	584
	585	/* Illegal translation */
	586	if (section.mr == &io_mem_unassigned) {
	587	goto iotlb_fail;
	588	}
	589
	590	/* Convert memory region offset into address space offset */
	591	xlat += section.offset_within_address_space -
	592	section.offset_within_region;
	593
	594	return (IOMMUTLBEntry) {
	595	.target_as = as,
	596	.iova = addr & ~page_mask,
	597	.translated_addr = xlat & ~page_mask,
	598	.addr_mask = page_mask,
	599	/* IOTLBs are for DMAs, and DMA only allows on RAMs. */
	600	.perm = IOMMU_RW,
	601	};
	602
	603	iotlb_fail:
	604	return (IOMMUTLBEntry) {0};
	605	}
	606
	607	/* Called from RCU critical section */
	608	MemoryRegion flatview_translate(FlatView fv, hwaddr addr, hwaddr *xlat,
	609	hwaddr *plen, bool is_write,
	610	MemTxAttrs attrs)
	611	{
	612	MemoryRegion *mr;
	613	MemoryRegionSection section;
	614	AddressSpace *as = NULL;
	615
	616	/* This can be MMIO, so setup MMIO bit. */
	617	section = flatview_do_translate(fv, addr, xlat, plen, NULL,
	618	is_write, true, &as, attrs);
	619	mr = section.mr;
	620
	621	if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
	622	hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
	623	plen = MIN(page, plen);
	624	}
	625
	626	return mr;
	627	}
	628
	629	typedef struct TCGIOMMUNotifier {
	630	IOMMUNotifier n;
	631	MemoryRegion *mr;
	632	CPUState *cpu;
	633	int iommu_idx;
	634	bool active;
	635	} TCGIOMMUNotifier;
	636
	637	static void tcg_iommu_unmap_notify(IOMMUNotifier n, IOMMUTLBEntry iotlb)
	638	{
	639	TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
	640
	641	if (!notifier->active) {
	642	return;
	643	}
	644	tlb_flush(notifier->cpu);
	645	notifier->active = false;
	646	/* We leave the notifier struct on the list to avoid reallocating it later.
	647	* Generally the number of IOMMUs a CPU deals with will be small.
	648	* In any case we can't unregister the iommu notifier from a notify
	649	* callback.
	650	*/
	651	}
	652
	653	static void tcg_register_iommu_notifier(CPUState *cpu,
	654	IOMMUMemoryRegion *iommu_mr,
	655	int iommu_idx)
	656	{
	657	/* Make sure this CPU has an IOMMU notifier registered for this
	658	* IOMMU/IOMMU index combination, so that we can flush its TLB
	659	* when the IOMMU tells us the mappings we've cached have changed.
	660	*/
	661	MemoryRegion *mr = MEMORY_REGION(iommu_mr);
	662	TCGIOMMUNotifier *notifier;
	663	Error *err = NULL;
	664	int i, ret;
	665
	666	for (i = 0; i < cpu->iommu_notifiers->len; i++) {
	667	notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
	668	if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
	669	break;
	670	}
	671	}
	672	if (i == cpu->iommu_notifiers->len) {
	673	/* Not found, add a new entry at the end of the array */
	674	cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
	675	notifier = g_new0(TCGIOMMUNotifier, 1);
	676	g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
	677
	678	notifier->mr = mr;
	679	notifier->iommu_idx = iommu_idx;
	680	notifier->cpu = cpu;
	681	/* Rather than trying to register interest in the specific part
	682	* of the iommu's address space that we've accessed and then
	683	* expand it later as subsequent accesses touch more of it, we
	684	* just register interest in the whole thing, on the assumption
	685	* that iommu reconfiguration will be rare.
	686	*/
	687	iommu_notifier_init(&notifier->n,
	688	tcg_iommu_unmap_notify,
	689	IOMMU_NOTIFIER_UNMAP,
	690	0,
	691	HWADDR_MAX,
	692	iommu_idx);
	693	ret = memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
	694	&err);
	695	if (ret) {
	696	error_report_err(err);
	697	exit(1);
	698	}
	699	}
	700
	701	if (!notifier->active) {
	702	notifier->active = true;
	703	}
	704	}
	705
	706	static void tcg_iommu_free_notifier_list(CPUState *cpu)
	707	{
	708	/* Destroy the CPU's notifier list */
	709	int i;
	710	TCGIOMMUNotifier *notifier;
	711
	712	for (i = 0; i < cpu->iommu_notifiers->len; i++) {
	713	notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
	714	memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
	715	g_free(notifier);
	716	}
	717	g_array_free(cpu->iommu_notifiers, true);
	718	}
	719
	720	/* Called from RCU critical section */
	721	MemoryRegionSection *
	722	address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
	723	hwaddr xlat, hwaddr plen,
	724	MemTxAttrs attrs, int *prot)
	725	{
	726	MemoryRegionSection *section;
	727	IOMMUMemoryRegion *iommu_mr;
	728	IOMMUMemoryRegionClass *imrc;
	729	IOMMUTLBEntry iotlb;
	730	int iommu_idx;
	731	AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
	732
	733	for (;;) {
	734	section = address_space_translate_internal(d, addr, &addr, plen, false);
	735
	736	iommu_mr = memory_region_get_iommu(section->mr);
	737	if (!iommu_mr) {
	738	break;
	739	}
	740
	741	imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	742
	743	iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
	744	tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
	745	/* We need all the permissions, so pass IOMMU_NONE so the IOMMU
	746	* doesn't short-cut its translation table walk.
	747	*/
	748	iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
	749	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	750	\| (addr & iotlb.addr_mask));
	751	/* Update the caller's prot bits to remove permissions the IOMMU
	752	* is giving us a failure response for. If we get down to no
	753	* permissions left at all we can give up now.
	754	*/
	755	if (!(iotlb.perm & IOMMU_RO)) {
	756	*prot &= ~(PAGE_READ \| PAGE_EXEC);
	757	}
	758	if (!(iotlb.perm & IOMMU_WO)) {
	759	*prot &= ~PAGE_WRITE;
	760	}
	761
	762	if (!*prot) {
	763	goto translate_fail;
	764	}
	765
	766	d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
	767	}
	768
	769	assert(!memory_region_is_iommu(section->mr));
	770	*xlat = addr;
	771	return section;
	772
	773	translate_fail:
	774	return &d->map.sections[PHYS_SECTION_UNASSIGNED];
	775	}
	776	#endif
	777
	778	#if !defined(CONFIG_USER_ONLY)
	779
	780	static int cpu_common_post_load(void *opaque, int version_id)
	781	{
	782	CPUState *cpu = opaque;
	783
	784	/* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
	785	version_id is increased. */
	786	cpu->interrupt_request &= ~0x01;
	787	tlb_flush(cpu);
	788
	789	/* loadvm has just updated the content of RAM, bypassing the
	790	* usual mechanisms that ensure we flush TBs for writes to
	791	* memory we've translated code from. So we must flush all TBs,
	792	* which will now be stale.
	793	*/
	794	tb_flush(cpu);
	795
	796	return 0;
	797	}
	798
	799	static int cpu_common_pre_load(void *opaque)
	800	{
	801	CPUState *cpu = opaque;
	802
	803	cpu->exception_index = -1;
	804
	805	return 0;
	806	}
	807
	808	static bool cpu_common_exception_index_needed(void *opaque)
	809	{
	810	CPUState *cpu = opaque;
	811
	812	return tcg_enabled() && cpu->exception_index != -1;
	813	}
	814
	815	static const VMStateDescription vmstate_cpu_common_exception_index = {
	816	.name = "cpu_common/exception_index",
	817	.version_id = 1,
	818	.minimum_version_id = 1,
	819	.needed = cpu_common_exception_index_needed,
	820	.fields = (VMStateField[]) {
	821	VMSTATE_INT32(exception_index, CPUState),
	822	VMSTATE_END_OF_LIST()
	823	}
	824	};
	825
	826	static bool cpu_common_crash_occurred_needed(void *opaque)
	827	{
	828	CPUState *cpu = opaque;
	829
	830	return cpu->crash_occurred;
	831	}
	832
	833	static const VMStateDescription vmstate_cpu_common_crash_occurred = {
	834	.name = "cpu_common/crash_occurred",
	835	.version_id = 1,
	836	.minimum_version_id = 1,
	837	.needed = cpu_common_crash_occurred_needed,
	838	.fields = (VMStateField[]) {
	839	VMSTATE_BOOL(crash_occurred, CPUState),
	840	VMSTATE_END_OF_LIST()
	841	}
	842	};
	843
	844	const VMStateDescription vmstate_cpu_common = {
	845	.name = "cpu_common",
	846	.version_id = 1,
	847	.minimum_version_id = 1,
	848	.pre_load = cpu_common_pre_load,
	849	.post_load = cpu_common_post_load,
	850	.fields = (VMStateField[]) {
	851	VMSTATE_UINT32(halted, CPUState),
	852	VMSTATE_UINT32(interrupt_request, CPUState),
	853	VMSTATE_END_OF_LIST()
	854	},
	855	.subsections = (const VMStateDescription*[]) {
	856	&vmstate_cpu_common_exception_index,
	857	&vmstate_cpu_common_crash_occurred,
	858	NULL
	859	}
	860	};
	861
	862	#endif
	863
	864	CPUState *qemu_get_cpu(int index)
	865	{
	866	CPUState *cpu;
	867
	868	CPU_FOREACH(cpu) {
	869	if (cpu->cpu_index == index) {
	870	return cpu;
	871	}
	872	}
	873
	874	return NULL;
	875	}
	876
	877	#if !defined(CONFIG_USER_ONLY)
	878	void cpu_address_space_init(CPUState *cpu, int asidx,
	879	const char prefix, MemoryRegion mr)
	880	{
	881	CPUAddressSpace *newas;
	882	AddressSpace *as = g_new0(AddressSpace, 1);
	883	char *as_name;
	884
	885	assert(mr);
	886	as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
	887	address_space_init(as, mr, as_name);
	888	g_free(as_name);
	889
	890	/* Target code should have set num_ases before calling us */
	891	assert(asidx < cpu->num_ases);
	892
	893	if (asidx == 0) {
	894	/* address space 0 gets the convenience alias */
	895	cpu->as = as;
	896	}
	897
	898	/* KVM cannot currently support multiple address spaces. */
	899	assert(asidx == 0 \|\| !kvm_enabled());
	900
	901	if (!cpu->cpu_ases) {
	902	cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
	903	}
	904
	905	newas = &cpu->cpu_ases[asidx];
	906	newas->cpu = cpu;
	907	newas->as = as;
	908	if (tcg_enabled()) {
	909	newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
	910	newas->tcg_as_listener.commit = tcg_commit;
	911	memory_listener_register(&newas->tcg_as_listener, as);
	912	}
	913	}
	914
	915	AddressSpace cpu_get_address_space(CPUState cpu, int asidx)
	916	{
	917	/* Return the AddressSpace corresponding to the specified index */
	918	return cpu->cpu_ases[asidx].as;
	919	}
	920	#endif
	921
	922	void cpu_exec_unrealizefn(CPUState *cpu)
	923	{
	924	CPUClass *cc = CPU_GET_CLASS(cpu);
	925
	926	cpu_list_remove(cpu);
	927
	928	if (cc->vmsd != NULL) {
	929	vmstate_unregister(NULL, cc->vmsd, cpu);
	930	}
	931	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	932	vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
	933	}
	934	#ifndef CONFIG_USER_ONLY
	935	tcg_iommu_free_notifier_list(cpu);
	936	#endif
	937	}
	938
	939	Property cpu_common_props[] = {
	940	#ifndef CONFIG_USER_ONLY
	941	/* Create a memory property for softmmu CPU object,
	942	* so users can wire up its memory. (This can't go in hw/core/cpu.c
	943	* because that file is compiled only once for both user-mode
	944	* and system builds.) The default if no link is set up is to use
	945	* the system address space.
	946	*/
	947	DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
	948	MemoryRegion *),
	949	#endif
	950	DEFINE_PROP_END_OF_LIST(),
	951	};
	952
	953	void cpu_exec_initfn(CPUState *cpu)
	954	{
	955	cpu->as = NULL;
	956	cpu->num_ases = 0;
	957
	958	#ifndef CONFIG_USER_ONLY
	959	cpu->thread_id = qemu_get_thread_id();
	960	cpu->memory = system_memory;
	961	object_ref(OBJECT(cpu->memory));
	962	#endif
	963	}
	964
	965	void cpu_exec_realizefn(CPUState cpu, Error *errp)
	966	{
	967	CPUClass *cc = CPU_GET_CLASS(cpu);
	968	static bool tcg_target_initialized;
	969
	970	cpu_list_add(cpu);
	971
	972	if (tcg_enabled() && !tcg_target_initialized) {
	973	tcg_target_initialized = true;
	974	cc->tcg_initialize();
	975	}
	976	tlb_init(cpu);
	977
	978	#ifndef CONFIG_USER_ONLY
	979	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	980	vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
	981	}
	982	if (cc->vmsd != NULL) {
	983	vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
	984	}
	985
	986	cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
	987	#endif
	988	}
	989
	990	const char parse_cpu_option(const char cpu_option)
	991	{
	992	ObjectClass *oc;
	993	CPUClass *cc;
	994	gchar **model_pieces;
	995	const char *cpu_type;
	996
	997	model_pieces = g_strsplit(cpu_option, ",", 2);
	998	if (!model_pieces[0]) {
	999	error_report("-cpu option cannot be empty");
	1000	exit(1);
	1001	}
	1002
	1003	oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
	1004	if (oc == NULL) {
	1005	error_report("unable to find CPU model '%s'", model_pieces[0]);
	1006	g_strfreev(model_pieces);
	1007	exit(EXIT_FAILURE);
	1008	}
	1009
	1010	cpu_type = object_class_get_name(oc);
	1011	cc = CPU_CLASS(oc);
	1012	cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
	1013	g_strfreev(model_pieces);
	1014	return cpu_type;
	1015	}
	1016
	1017	#if defined(CONFIG_USER_ONLY)
	1018	void tb_invalidate_phys_addr(target_ulong addr)
	1019	{
	1020	mmap_lock();
	1021	tb_invalidate_phys_page_range(addr, addr + 1);
	1022	mmap_unlock();
	1023	}
	1024
	1025	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	1026	{
	1027	tb_invalidate_phys_addr(pc);
	1028	}
	1029	#else
	1030	void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
	1031	{
	1032	ram_addr_t ram_addr;
	1033	MemoryRegion *mr;
	1034	hwaddr l = 1;
	1035
	1036	if (!tcg_enabled()) {
	1037	return;
	1038	}
	1039
	1040	RCU_READ_LOCK_GUARD();
	1041	mr = address_space_translate(as, addr, &addr, &l, false, attrs);
	1042	if (!(memory_region_is_ram(mr)
	1043	\|\| memory_region_is_romd(mr))) {
	1044	return;
	1045	}
	1046	ram_addr = memory_region_get_ram_addr(mr) + addr;
	1047	tb_invalidate_phys_page_range(ram_addr, ram_addr + 1);
	1048	}
	1049
	1050	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	1051	{
	1052	MemTxAttrs attrs;
	1053	hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
	1054	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	1055	if (phys != -1) {
	1056	/* Locks grabbed by tb_invalidate_phys_addr */
	1057	tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
	1058	phys \| (pc & ~TARGET_PAGE_MASK), attrs);
	1059	}
	1060	}
	1061	#endif
	1062
	1063	#ifndef CONFIG_USER_ONLY
	1064	/* Add a watchpoint. */
	1065	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	1066	int flags, CPUWatchpoint **watchpoint)
	1067	{
	1068	CPUWatchpoint *wp;
	1069
	1070	/* forbid ranges which are empty or run off the end of the address space */
	1071	if (len == 0 \|\| (addr + len - 1) < addr) {
	1072	error_report("tried to set invalid watchpoint at %"
	1073	VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
	1074	return -EINVAL;
	1075	}
	1076	wp = g_malloc(sizeof(*wp));
	1077
	1078	wp->vaddr = addr;
	1079	wp->len = len;
	1080	wp->flags = flags;
	1081
	1082	/* keep all GDB-injected watchpoints in front */
	1083	if (flags & BP_GDB) {
	1084	QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
	1085	} else {
	1086	QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
	1087	}
	1088
	1089	tlb_flush_page(cpu, addr);
	1090
	1091	if (watchpoint)
	1092	*watchpoint = wp;
	1093	return 0;
	1094	}
	1095
	1096	/* Remove a specific watchpoint. */
	1097	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	1098	int flags)
	1099	{
	1100	CPUWatchpoint *wp;
	1101
	1102	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1103	if (addr == wp->vaddr && len == wp->len
	1104	&& flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
	1105	cpu_watchpoint_remove_by_ref(cpu, wp);
	1106	return 0;
	1107	}
	1108	}
	1109	return -ENOENT;
	1110	}
	1111
	1112	/* Remove a specific watchpoint by reference. */
	1113	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	1114	{
	1115	QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
	1116
	1117	tlb_flush_page(cpu, watchpoint->vaddr);
	1118
	1119	g_free(watchpoint);
	1120	}
	1121
	1122	/* Remove all matching watchpoints. */
	1123	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	1124	{
	1125	CPUWatchpoint wp, next;
	1126
	1127	QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
	1128	if (wp->flags & mask) {
	1129	cpu_watchpoint_remove_by_ref(cpu, wp);
	1130	}
	1131	}
	1132	}
	1133
	1134	/* Return true if this watchpoint address matches the specified
	1135	* access (ie the address range covered by the watchpoint overlaps
	1136	* partially or completely with the address range covered by the
	1137	* access).
	1138	*/
	1139	static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
	1140	vaddr addr, vaddr len)
	1141	{
	1142	/* We know the lengths are non-zero, but a little caution is
	1143	* required to avoid errors in the case where the range ends
	1144	* exactly at the top of the address space and so addr + len
	1145	* wraps round to zero.
	1146	*/
	1147	vaddr wpend = wp->vaddr + wp->len - 1;
	1148	vaddr addrend = addr + len - 1;
	1149
	1150	return !(addr > wpend \|\| wp->vaddr > addrend);
	1151	}
	1152
	1153	/* Return flags for watchpoints that match addr + prot. */
	1154	int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
	1155	{
	1156	CPUWatchpoint *wp;
	1157	int ret = 0;
	1158
	1159	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1160	if (watchpoint_address_matches(wp, addr, TARGET_PAGE_SIZE)) {
	1161	ret \|= wp->flags;
	1162	}
	1163	}
	1164	return ret;
	1165	}
	1166	#endif /* !CONFIG_USER_ONLY */
	1167
	1168	/* Add a breakpoint. */
	1169	int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
	1170	CPUBreakpoint **breakpoint)
	1171	{
	1172	CPUBreakpoint *bp;
	1173
	1174	bp = g_malloc(sizeof(*bp));
	1175
	1176	bp->pc = pc;
	1177	bp->flags = flags;
	1178
	1179	/* keep all GDB-injected breakpoints in front */
	1180	if (flags & BP_GDB) {
	1181	QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
	1182	} else {
	1183	QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
	1184	}
	1185
	1186	breakpoint_invalidate(cpu, pc);
	1187
	1188	if (breakpoint) {
	1189	*breakpoint = bp;
	1190	}
	1191	return 0;
	1192	}
	1193
	1194	/* Remove a specific breakpoint. */
	1195	int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
	1196	{
	1197	CPUBreakpoint *bp;
	1198
	1199	QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
	1200	if (bp->pc == pc && bp->flags == flags) {
	1201	cpu_breakpoint_remove_by_ref(cpu, bp);
	1202	return 0;
	1203	}
	1204	}
	1205	return -ENOENT;
	1206	}
	1207
	1208	/* Remove a specific breakpoint by reference. */
	1209	void cpu_breakpoint_remove_by_ref(CPUState cpu, CPUBreakpoint breakpoint)
	1210	{
	1211	QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
	1212
	1213	breakpoint_invalidate(cpu, breakpoint->pc);
	1214
	1215	g_free(breakpoint);
	1216	}
	1217
	1218	/* Remove all matching breakpoints. */
	1219	void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
	1220	{
	1221	CPUBreakpoint bp, next;
	1222
	1223	QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
	1224	if (bp->flags & mask) {
	1225	cpu_breakpoint_remove_by_ref(cpu, bp);
	1226	}
	1227	}
	1228	}
	1229
	1230	/* enable or disable single step mode. EXCP_DEBUG is returned by the
	1231	CPU loop after each instruction */
	1232	void cpu_single_step(CPUState *cpu, int enabled)
	1233	{
	1234	if (cpu->singlestep_enabled != enabled) {
	1235	cpu->singlestep_enabled = enabled;
	1236	if (kvm_enabled()) {
	1237	kvm_update_guest_debug(cpu, 0);
	1238	} else {
	1239	/* must flush all the translated code to avoid inconsistencies */
	1240	/* XXX: only flush what is necessary */
	1241	tb_flush(cpu);
	1242	}
	1243	}
	1244	}
	1245
	1246	void cpu_abort(CPUState cpu, const char fmt, ...)
	1247	{
	1248	va_list ap;
	1249	va_list ap2;
	1250
	1251	va_start(ap, fmt);
	1252	va_copy(ap2, ap);
	1253	fprintf(stderr, "qemu: fatal: ");
	1254	vfprintf(stderr, fmt, ap);
	1255	fprintf(stderr, "\n");
	1256	cpu_dump_state(cpu, stderr, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1257	if (qemu_log_separate()) {
	1258	qemu_log_lock();
	1259	qemu_log("qemu: fatal: ");
	1260	qemu_log_vprintf(fmt, ap2);
	1261	qemu_log("\n");
	1262	log_cpu_state(cpu, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1263	qemu_log_flush();
	1264	qemu_log_unlock();
	1265	qemu_log_close();
	1266	}
	1267	va_end(ap2);
	1268	va_end(ap);
	1269	replay_finish();
	1270	#if defined(CONFIG_USER_ONLY)
	1271	{
	1272	struct sigaction act;
	1273	sigfillset(&act.sa_mask);
	1274	act.sa_handler = SIG_DFL;
	1275	act.sa_flags = 0;
	1276	sigaction(SIGABRT, &act, NULL);
	1277	}
	1278	#endif
	1279	abort();
	1280	}
	1281
	1282	#if !defined(CONFIG_USER_ONLY)
	1283	/* Called from RCU critical section */
	1284	static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
	1285	{
	1286	RAMBlock *block;
	1287
	1288	block = atomic_rcu_read(&ram_list.mru_block);
	1289	if (block && addr - block->offset < block->max_length) {
	1290	return block;
	1291	}
	1292	RAMBLOCK_FOREACH(block) {
	1293	if (addr - block->offset < block->max_length) {
	1294	goto found;
	1295	}
	1296	}
	1297
	1298	fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
	1299	abort();
	1300
	1301	found:
	1302	/* It is safe to write mru_block outside the iothread lock. This
	1303	* is what happens:
	1304	*
	1305	* mru_block = xxx
	1306	* rcu_read_unlock()
	1307	* xxx removed from list
	1308	* rcu_read_lock()
	1309	* read mru_block
	1310	* mru_block = NULL;
	1311	* call_rcu(reclaim_ramblock, xxx);
	1312	* rcu_read_unlock()
	1313	*
	1314	* atomic_rcu_set is not needed here. The block was already published
	1315	* when it was placed into the list. Here we're just making an extra
	1316	* copy of the pointer.
	1317	*/
	1318	ram_list.mru_block = block;
	1319	return block;
	1320	}
	1321
	1322	static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
	1323	{
	1324	CPUState *cpu;
	1325	ram_addr_t start1;
	1326	RAMBlock *block;
	1327	ram_addr_t end;
	1328
	1329	assert(tcg_enabled());
	1330	end = TARGET_PAGE_ALIGN(start + length);
	1331	start &= TARGET_PAGE_MASK;
	1332
	1333	RCU_READ_LOCK_GUARD();
	1334	block = qemu_get_ram_block(start);
	1335	assert(block == qemu_get_ram_block(end - 1));
	1336	start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
	1337	CPU_FOREACH(cpu) {
	1338	tlb_reset_dirty(cpu, start1, length);
	1339	}
	1340	}
	1341
	1342	/* Note: start and end must be within the same ram block. */
	1343	bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
	1344	ram_addr_t length,
	1345	unsigned client)
	1346	{
	1347	DirtyMemoryBlocks *blocks;
	1348	unsigned long end, page;
	1349	bool dirty = false;
	1350	RAMBlock *ramblock;
	1351	uint64_t mr_offset, mr_size;
	1352
	1353	if (length == 0) {
	1354	return false;
	1355	}
	1356
	1357	end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
	1358	page = start >> TARGET_PAGE_BITS;
	1359
	1360	WITH_RCU_READ_LOCK_GUARD() {
	1361	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1362	ramblock = qemu_get_ram_block(start);
	1363	/* Range sanity check on the ramblock */
	1364	assert(start >= ramblock->offset &&
	1365	start + length <= ramblock->offset + ramblock->used_length);
	1366
	1367	while (page < end) {
	1368	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1369	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1370	unsigned long num = MIN(end - page,
	1371	DIRTY_MEMORY_BLOCK_SIZE - offset);
	1372
	1373	dirty \|= bitmap_test_and_clear_atomic(blocks->blocks[idx],
	1374	offset, num);
	1375	page += num;
	1376	}
	1377
	1378	mr_offset = (ram_addr_t)(page << TARGET_PAGE_BITS) - ramblock->offset;
	1379	mr_size = (end - page) << TARGET_PAGE_BITS;
	1380	memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
	1381	}
	1382
	1383	if (dirty && tcg_enabled()) {
	1384	tlb_reset_dirty_range_all(start, length);
	1385	}
	1386
	1387	return dirty;
	1388	}
	1389
	1390	DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
	1391	(MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
	1392	{
	1393	DirtyMemoryBlocks *blocks;
	1394	ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
	1395	unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
	1396	ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
	1397	ram_addr_t last = QEMU_ALIGN_UP(start + length, align);
	1398	DirtyBitmapSnapshot *snap;
	1399	unsigned long page, end, dest;
	1400
	1401	snap = g_malloc0(sizeof(*snap) +
	1402	((last - first) >> (TARGET_PAGE_BITS + 3)));
	1403	snap->start = first;
	1404	snap->end = last;
	1405
	1406	page = first >> TARGET_PAGE_BITS;
	1407	end = last >> TARGET_PAGE_BITS;
	1408	dest = 0;
	1409
	1410	WITH_RCU_READ_LOCK_GUARD() {
	1411	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1412
	1413	while (page < end) {
	1414	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1415	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1416	unsigned long num = MIN(end - page,
	1417	DIRTY_MEMORY_BLOCK_SIZE - offset);
	1418
	1419	assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
	1420	assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));
	1421	offset >>= BITS_PER_LEVEL;
	1422
	1423	bitmap_copy_and_clear_atomic(snap->dirty + dest,
	1424	blocks->blocks[idx] + offset,
	1425	num);
	1426	page += num;
	1427	dest += num >> BITS_PER_LEVEL;
	1428	}
	1429	}
	1430
	1431	if (tcg_enabled()) {
	1432	tlb_reset_dirty_range_all(start, length);
	1433	}
	1434
	1435	memory_region_clear_dirty_bitmap(mr, offset, length);
	1436
	1437	return snap;
	1438	}
	1439
	1440	bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
	1441	ram_addr_t start,
	1442	ram_addr_t length)
	1443	{
	1444	unsigned long page, end;
	1445
	1446	assert(start >= snap->start);
	1447	assert(start + length <= snap->end);
	1448
	1449	end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
	1450	page = (start - snap->start) >> TARGET_PAGE_BITS;
	1451
	1452	while (page < end) {
	1453	if (test_bit(page, snap->dirty)) {
	1454	return true;
	1455	}
	1456	page++;
	1457	}
	1458	return false;
	1459	}
	1460
	1461	/* Called from RCU critical section */
	1462	hwaddr memory_region_section_get_iotlb(CPUState *cpu,
	1463	MemoryRegionSection *section)
	1464	{
	1465	AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
	1466	return section - d->map.sections;
	1467	}
	1468	#endif /* !defined(CONFIG_USER_ONLY) */
	1469
	1470	#if !defined(CONFIG_USER_ONLY)
	1471
	1472	static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
	1473	uint16_t section);
	1474	static subpage_t subpage_init(FlatView fv, hwaddr base);
	1475
	1476	static void (phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
	1477	qemu_anon_ram_alloc;
	1478
	1479	/*
	1480	* Set a custom physical guest memory alloator.
	1481	* Accelerators with unusual needs may need this. Hopefully, we can
	1482	* get rid of it eventually.
	1483	*/
	1484	void phys_mem_set_alloc(void (alloc)(size_t, uint64_t *align, bool shared))
	1485	{
	1486	phys_mem_alloc = alloc;
	1487	}
	1488
	1489	static uint16_t phys_section_add(PhysPageMap *map,
	1490	MemoryRegionSection *section)
	1491	{
	1492	/* The physical section number is ORed with a page-aligned
	1493	* pointer to produce the iotlb entries. Thus it should
	1494	* never overflow into the page-aligned value.
	1495	*/
	1496	assert(map->sections_nb < TARGET_PAGE_SIZE);
	1497
	1498	if (map->sections_nb == map->sections_nb_alloc) {
	1499	map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
	1500	map->sections = g_renew(MemoryRegionSection, map->sections,
	1501	map->sections_nb_alloc);
	1502	}
	1503	map->sections[map->sections_nb] = *section;
	1504	memory_region_ref(section->mr);
	1505	return map->sections_nb++;
	1506	}
	1507
	1508	static void phys_section_destroy(MemoryRegion *mr)
	1509	{
	1510	bool have_sub_page = mr->subpage;
	1511
	1512	memory_region_unref(mr);
	1513
	1514	if (have_sub_page) {
	1515	subpage_t *subpage = container_of(mr, subpage_t, iomem);
	1516	object_unref(OBJECT(&subpage->iomem));
	1517	g_free(subpage);
	1518	}
	1519	}
	1520
	1521	static void phys_sections_free(PhysPageMap *map)
	1522	{
	1523	while (map->sections_nb > 0) {
	1524	MemoryRegionSection *section = &map->sections[--map->sections_nb];
	1525	phys_section_destroy(section->mr);
	1526	}
	1527	g_free(map->sections);
	1528	g_free(map->nodes);
	1529	}
	1530
	1531	static void register_subpage(FlatView fv, MemoryRegionSection section)
	1532	{
	1533	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1534	subpage_t *subpage;
	1535	hwaddr base = section->offset_within_address_space
	1536	& TARGET_PAGE_MASK;
	1537	MemoryRegionSection *existing = phys_page_find(d, base);
	1538	MemoryRegionSection subsection = {
	1539	.offset_within_address_space = base,
	1540	.size = int128_make64(TARGET_PAGE_SIZE),
	1541	};
	1542	hwaddr start, end;
	1543
	1544	assert(existing->mr->subpage \|\| existing->mr == &io_mem_unassigned);
	1545
	1546	if (!(existing->mr->subpage)) {
	1547	subpage = subpage_init(fv, base);
	1548	subsection.fv = fv;
	1549	subsection.mr = &subpage->iomem;
	1550	phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
	1551	phys_section_add(&d->map, &subsection));
	1552	} else {
	1553	subpage = container_of(existing->mr, subpage_t, iomem);
	1554	}
	1555	start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
	1556	end = start + int128_get64(section->size) - 1;
	1557	subpage_register(subpage, start, end,
	1558	phys_section_add(&d->map, section));
	1559	}
	1560
	1561
	1562	static void register_multipage(FlatView *fv,
	1563	MemoryRegionSection *section)
	1564	{
	1565	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1566	hwaddr start_addr = section->offset_within_address_space;
	1567	uint16_t section_index = phys_section_add(&d->map, section);
	1568	uint64_t num_pages = int128_get64(int128_rshift(section->size,
	1569	TARGET_PAGE_BITS));
	1570
	1571	assert(num_pages);
	1572	phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
	1573	}
	1574
	1575	/*
	1576	* The range in section may look like this:
	1577	*
	1578	* \|s\|PPPPPPP\|s\|
	1579	*
	1580	* where s stands for subpage and P for page.
	1581	*/
	1582	void flatview_add_to_dispatch(FlatView fv, MemoryRegionSection section)
	1583	{
	1584	MemoryRegionSection remain = *section;
	1585	Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
	1586
	1587	/* register first subpage */
	1588	if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1589	uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
	1590	- remain.offset_within_address_space;
	1591
	1592	MemoryRegionSection now = remain;
	1593	now.size = int128_min(int128_make64(left), now.size);
	1594	register_subpage(fv, &now);
	1595	if (int128_eq(remain.size, now.size)) {
	1596	return;
	1597	}
	1598	remain.size = int128_sub(remain.size, now.size);
	1599	remain.offset_within_address_space += int128_get64(now.size);
	1600	remain.offset_within_region += int128_get64(now.size);
	1601	}
	1602
	1603	/* register whole pages */
	1604	if (int128_ge(remain.size, page_size)) {
	1605	MemoryRegionSection now = remain;
	1606	now.size = int128_and(now.size, int128_neg(page_size));
	1607	register_multipage(fv, &now);
	1608	if (int128_eq(remain.size, now.size)) {
	1609	return;
	1610	}
	1611	remain.size = int128_sub(remain.size, now.size);
	1612	remain.offset_within_address_space += int128_get64(now.size);
	1613	remain.offset_within_region += int128_get64(now.size);
	1614	}
	1615
	1616	/* register last subpage */
	1617	register_subpage(fv, &remain);
	1618	}
	1619
	/* Flush KVM's coalesced MMIO buffer; a no-op when KVM is not enabled. */
	void qemu_flush_coalesced_mmio_buffer(void)
	{
	    if (!kvm_enabled()) {
	        return;
	    }
	    kvm_flush_coalesced_mmio_buffer();
	}
	1625
	1626	void qemu_mutex_lock_ramlist(void)
	1627	{
	1628	qemu_mutex_lock(&ram_list.mutex);
	1629	}
	1630
	1631	void qemu_mutex_unlock_ramlist(void)
	1632	{
	1633	qemu_mutex_unlock(&ram_list.mutex);
	1634	}
	1635
	1636	void ram_block_dump(Monitor *mon)
	1637	{
	1638	RAMBlock *block;
	1639	char *psize;
	1640
	1641	RCU_READ_LOCK_GUARD();
	1642	monitor_printf(mon, "%24s %8s %18s %18s %18s\n",
	1643	"Block Name", "PSize", "Offset", "Used", "Total");
	1644	RAMBLOCK_FOREACH(block) {
	1645	psize = size_to_str(block->page_size);
	1646	monitor_printf(mon, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
	1647	" 0x%016" PRIx64 "\n", block->idstr, psize,
	1648	(uint64_t)block->offset,
	1649	(uint64_t)block->used_length,
	1650	(uint64_t)block->max_length);
	1651	g_free(psize);
	1652	}
	1653	}
	1654
	1655	#ifdef __linux__
	1656	/*
	1657	* FIXME TOCTTOU: this iterates over memory backends' mem-path, which
	1658	* may or may not name the same files / on the same filesystem now as
	1659	* when we actually open and map them. Iterate over the file
	1660	* descriptors instead, and use qemu_fd_getpagesize().
	1661	*/
	1662	static int find_min_backend_pagesize(Object obj, void opaque)
	1663	{
	1664	long *hpsize_min = opaque;
	1665
	1666	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	1667	HostMemoryBackend *backend = MEMORY_BACKEND(obj);
	1668	long hpsize = host_memory_backend_pagesize(backend);
	1669
	1670	if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
	1671	*hpsize_min = hpsize;
	1672	}
	1673	}
	1674
	1675	return 0;
	1676	}
	1677
	1678	static int find_max_backend_pagesize(Object obj, void opaque)
	1679	{
	1680	long *hpsize_max = opaque;
	1681
	1682	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	1683	HostMemoryBackend *backend = MEMORY_BACKEND(obj);
	1684	long hpsize = host_memory_backend_pagesize(backend);
	1685
	1686	if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
	1687	*hpsize_max = hpsize;
	1688	}
	1689	}
	1690
	1691	return 0;
	1692	}
	1693
	1694	/*
	1695	* TODO: We assume right now that all mapped host memory backends are
	1696	* used as RAM, however some might be used for different purposes.
	1697	*/
	1698	long qemu_minrampagesize(void)
	1699	{
	1700	long hpsize = LONG_MAX;
	1701	long mainrampagesize;
	1702	Object *memdev_root;
	1703	MachineState *ms = MACHINE(qdev_get_machine());
	1704
	1705	mainrampagesize = qemu_mempath_getpagesize(mem_path);
	1706
	1707	/* it's possible we have memory-backend objects with
	1708	* hugepage-backed RAM. these may get mapped into system
	1709	* address space via -numa parameters or memory hotplug
	1710	* hooks. we want to take these into account, but we
	1711	* also want to make sure these supported hugepage
	1712	* sizes are applicable across the entire range of memory
	1713	* we may boot from, so we take the min across all
	1714	* backends, and assume normal pages in cases where a
	1715	* backend isn't backed by hugepages.
	1716	*/
	1717	memdev_root = object_resolve_path("/objects", NULL);
	1718	if (memdev_root) {
	1719	object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
	1720	}
	1721	if (hpsize == LONG_MAX) {
	1722	/* No additional memory regions found ==> Report main RAM page size */
	1723	return mainrampagesize;
	1724	}
	1725
	1726	/* If NUMA is disabled or the NUMA nodes are not backed with a
	1727	* memory-backend, then there is at least one node using "normal" RAM,
	1728	* so if its page size is smaller we have got to report that size instead.
	1729	*/
	1730	if (hpsize > mainrampagesize &&
	1731	(ms->numa_state == NULL \|\|
	1732	ms->numa_state->num_nodes == 0 \|\|
	1733	ms->numa_state->nodes[0].node_memdev == NULL)) {
	1734	static bool warned;
	1735	if (!warned) {
	1736	error_report("Huge page support disabled (n/a for main memory).");
	1737	warned = true;
	1738	}
	1739	return mainrampagesize;
	1740	}
	1741
	1742	return hpsize;
	1743	}
	1744
	1745	long qemu_maxrampagesize(void)
	1746	{
	1747	long pagesize = qemu_mempath_getpagesize(mem_path);
	1748	Object *memdev_root = object_resolve_path("/objects", NULL);
	1749
	1750	if (memdev_root) {
	1751	object_child_foreach(memdev_root, find_max_backend_pagesize,
	1752	&pagesize);
	1753	}
	1754	return pagesize;
	1755	}
	1756	#else
	/* Non-Linux fallback: the minimum RAM page size is the host page size. */
	long qemu_minrampagesize(void)
	{
	    const long host_page = getpagesize();

	    return host_page;
	}
	/* Non-Linux fallback: the maximum RAM page size is the host page size. */
	long qemu_maxrampagesize(void)
	{
	    const long host_page = getpagesize();

	    return host_page;
	}
	1765	#endif
	1766
	1767	#ifdef CONFIG_POSIX
	1768	static int64_t get_file_size(int fd)
	1769	{
	1770	int64_t size;
	1771	#if defined(__linux__)
	1772	struct stat st;
	1773
	1774	if (fstat(fd, &st) < 0) {
	1775	return -errno;
	1776	}
	1777
	1778	/* Special handling for devdax character devices */
	1779	if (S_ISCHR(st.st_mode)) {
	1780	g_autofree char *subsystem_path = NULL;
	1781	g_autofree char *subsystem = NULL;
	1782
	1783	subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
	1784	major(st.st_rdev), minor(st.st_rdev));
	1785	subsystem = g_file_read_link(subsystem_path, NULL);
	1786
	1787	if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
	1788	g_autofree char *size_path = NULL;
	1789	g_autofree char *size_str = NULL;
	1790
	1791	size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
	1792	major(st.st_rdev), minor(st.st_rdev));
	1793
	1794	if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
	1795	return g_ascii_strtoll(size_str, NULL, 0);
	1796	}
	1797	}
	1798	}
	1799	#endif /* defined(__linux__) */
	1800
	1801	/* st.st_size may be zero for special files yet lseek(2) works */
	1802	size = lseek(fd, 0, SEEK_END);
	1803	if (size < 0) {
	1804	return -errno;
	1805	}
	1806	return size;
	1807	}
	1808
	1809	static int file_ram_open(const char *path,
	1810	const char *region_name,
	1811	bool *created,
	1812	Error **errp)
	1813	{
	1814	char *filename;
	1815	char *sanitized_name;
	1816	char *c;
	1817	int fd = -1;
	1818
	1819	*created = false;
	1820	for (;;) {
	1821	fd = open(path, O_RDWR);
	1822	if (fd >= 0) {
	1823	/* @path names an existing file, use it */
	1824	break;
	1825	}
	1826	if (errno == ENOENT) {
	1827	/* @path names a file that doesn't exist, create it */
	1828	fd = open(path, O_RDWR \| O_CREAT \| O_EXCL, 0644);
	1829	if (fd >= 0) {
	1830	*created = true;
	1831	break;
	1832	}
	1833	} else if (errno == EISDIR) {
	1834	/* @path names a directory, create a file there */
	1835	/* Make name safe to use with mkstemp by replacing '/' with '_'. */
	1836	sanitized_name = g_strdup(region_name);
	1837	for (c = sanitized_name; *c != '\0'; c++) {
	1838	if (*c == '/') {
	1839	*c = '_';
	1840	}
	1841	}
	1842
	1843	filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
	1844	sanitized_name);
	1845	g_free(sanitized_name);
	1846
	1847	fd = mkstemp(filename);
	1848	if (fd >= 0) {
	1849	unlink(filename);
	1850	g_free(filename);
	1851	break;
	1852	}
	1853	g_free(filename);
	1854	}
	1855	if (errno != EEXIST && errno != EINTR) {
	1856	error_setg_errno(errp, errno,
	1857	"can't open backing store %s for guest RAM",
	1858	path);
	1859	return -1;
	1860	}
	1861	/*
	1862	* Try again on EINTR and EEXIST. The latter happens when
	1863	* something else creates the file between our two open().
	1864	*/
	1865	}
	1866
	1867	return fd;
	1868	}
	1869
	1870	static void file_ram_alloc(RAMBlock block,
	1871	ram_addr_t memory,
	1872	int fd,
	1873	bool truncate,
	1874	Error **errp)
	1875	{
	1876	MachineState *ms = MACHINE(qdev_get_machine());
	1877	void *area;
	1878
	1879	block->page_size = qemu_fd_getpagesize(fd);
	1880	if (block->mr->align % block->page_size) {
	1881	error_setg(errp, "alignment 0x%" PRIx64
	1882	" must be multiples of page size 0x%zx",
	1883	block->mr->align, block->page_size);
	1884	return NULL;
	1885	} else if (block->mr->align && !is_power_of_2(block->mr->align)) {
	1886	error_setg(errp, "alignment 0x%" PRIx64
	1887	" must be a power of two", block->mr->align);
	1888	return NULL;
	1889	}
	1890	block->mr->align = MAX(block->page_size, block->mr->align);
	1891	#if defined(__s390x__)
	1892	if (kvm_enabled()) {
	1893	block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
	1894	}
	1895	#endif
	1896
	1897	if (memory < block->page_size) {
	1898	error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
	1899	"or larger than page size 0x%zx",
	1900	memory, block->page_size);
	1901	return NULL;
	1902	}
	1903
	1904	memory = ROUND_UP(memory, block->page_size);
	1905
	1906	/*
	1907	* ftruncate is not supported by hugetlbfs in older
	1908	* hosts, so don't bother bailing out on errors.
	1909	* If anything goes wrong with it under other filesystems,
	1910	* mmap will fail.
	1911	*
	1912	* Do not truncate the non-empty backend file to avoid corrupting
	1913	* the existing data in the file. Disabling shrinking is not
	1914	* enough. For example, the current vNVDIMM implementation stores
	1915	* the guest NVDIMM labels at the end of the backend file. If the
	1916	* backend file is later extended, QEMU will not be able to find
	1917	* those labels. Therefore, extending the non-empty backend file
	1918	* is disabled as well.
	1919	*/
	1920	if (truncate && ftruncate(fd, memory)) {
	1921	perror("ftruncate");
	1922	}
	1923
	1924	area = qemu_ram_mmap(fd, memory, block->mr->align,
	1925	block->flags & RAM_SHARED, block->flags & RAM_PMEM);
	1926	if (area == MAP_FAILED) {
	1927	error_setg_errno(errp, errno,
	1928	"unable to map backing store for guest RAM");
	1929	return NULL;
	1930	}
	1931
	1932	if (mem_prealloc) {
	1933	os_mem_prealloc(fd, area, memory, ms->smp.cpus, errp);
	1934	if (errp && *errp) {
	1935	qemu_ram_munmap(fd, area, memory);
	1936	return NULL;
	1937	}
	1938	}
	1939
	1940	block->fd = fd;
	1941	return area;
	1942	}
	1943	#endif
	1944
	1945	/* Allocate space within the ram_addr_t space that governs the
	1946	* dirty bitmaps.
	1947	* Called with the ramlist lock held.
	1948	*/
	1949	static ram_addr_t find_ram_offset(ram_addr_t size)
	1950	{
	1951	RAMBlock block, next_block;
	1952	ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
	1953
	1954	assert(size != 0); /* it would hand out same offset multiple times */
	1955
	1956	if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
	1957	return 0;
	1958	}
	1959
	1960	RAMBLOCK_FOREACH(block) {
	1961	ram_addr_t candidate, next = RAM_ADDR_MAX;
	1962
	1963	/* Align blocks to start on a 'long' in the bitmap
	1964	* which makes the bitmap sync'ing take the fast path.
	1965	*/
	1966	candidate = block->offset + block->max_length;
	1967	candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
	1968
	1969	/* Search for the closest following block
	1970	* and find the gap.
	1971	*/
	1972	RAMBLOCK_FOREACH(next_block) {
	1973	if (next_block->offset >= candidate) {
	1974	next = MIN(next, next_block->offset);
	1975	}
	1976	}
	1977
	1978	/* If it fits remember our place and remember the size
	1979	* of gap, but keep going so that we might find a smaller
	1980	* gap to fill so avoiding fragmentation.
	1981	*/
	1982	if (next - candidate >= size && next - candidate < mingap) {
	1983	offset = candidate;
	1984	mingap = next - candidate;
	1985	}
	1986
	1987	trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
	1988	}
	1989
	1990	if (offset == RAM_ADDR_MAX) {
	1991	fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
	1992	(uint64_t)size);
	1993	abort();
	1994	}
	1995
	1996	trace_find_ram_offset(size, offset);
	1997
	1998	return offset;
	1999	}
	2000
	2001	static unsigned long last_ram_page(void)
	2002	{
	2003	RAMBlock *block;
	2004	ram_addr_t last = 0;
	2005
	2006	RCU_READ_LOCK_GUARD();
	2007	RAMBLOCK_FOREACH(block) {
	2008	last = MAX(last, block->offset + block->max_length);
	2009	}
	2010	return last >> TARGET_PAGE_BITS;
	2011	}
	2012
	2013	static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
	2014	{
	2015	int ret;
	2016
	2017	/* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
	2018	if (!machine_dump_guest_core(current_machine)) {
	2019	ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
	2020	if (ret) {
	2021	perror("qemu_madvise");
	2022	fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
	2023	"but dump_guest_core=off specified\n");
	2024	}
	2025	}
	2026	}
	2027
	2028	const char qemu_ram_get_idstr(RAMBlock rb)
	2029	{
	2030	return rb->idstr;
	2031	}
	2032
	2033	void qemu_ram_get_host_addr(RAMBlock rb)
	2034	{
	2035	return rb->host;
	2036	}
	2037
	2038	ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
	2039	{
	2040	return rb->offset;
	2041	}
	2042
	2043	ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
	2044	{
	2045	return rb->used_length;
	2046	}
	2047
	2048	bool qemu_ram_is_shared(RAMBlock *rb)
	2049	{
	2050	return rb->flags & RAM_SHARED;
	2051	}
	2052
	2053	/* Note: Only set at the start of postcopy */
	2054	bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
	2055	{
	2056	return rb->flags & RAM_UF_ZEROPAGE;
	2057	}
	2058
	2059	void qemu_ram_set_uf_zeroable(RAMBlock *rb)
	2060	{
	2061	rb->flags \|= RAM_UF_ZEROPAGE;
	2062	}
	2063
	2064	bool qemu_ram_is_migratable(RAMBlock *rb)
	2065	{
	2066	return rb->flags & RAM_MIGRATABLE;
	2067	}
	2068
	2069	void qemu_ram_set_migratable(RAMBlock *rb)
	2070	{
	2071	rb->flags \|= RAM_MIGRATABLE;
	2072	}
	2073
	2074	void qemu_ram_unset_migratable(RAMBlock *rb)
	2075	{
	2076	rb->flags &= ~RAM_MIGRATABLE;
	2077	}
	2078
	2079	/* Called with iothread lock held. */
	2080	void qemu_ram_set_idstr(RAMBlock new_block, const char name, DeviceState *dev)
	2081	{
	2082	RAMBlock *block;
	2083
	2084	assert(new_block);
	2085	assert(!new_block->idstr[0]);
	2086
	2087	if (dev) {
	2088	char *id = qdev_get_dev_path(dev);
	2089	if (id) {
	2090	snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
	2091	g_free(id);
	2092	}
	2093	}
	2094	pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
	2095
	2096	RCU_READ_LOCK_GUARD();
	2097	RAMBLOCK_FOREACH(block) {
	2098	if (block != new_block &&
	2099	!strcmp(block->idstr, new_block->idstr)) {
	2100	fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
	2101	new_block->idstr);
	2102	abort();
	2103	}
	2104	}
	2105	}
	2106
	2107	/* Called with iothread lock held. */
	2108	void qemu_ram_unset_idstr(RAMBlock *block)
	2109	{
	2110	/* FIXME: arch_init.c assumes that this is not called throughout
	2111	* migration. Ignore the problem since hot-unplug during migration
	2112	* does not work anyway.
	2113	*/
	2114	if (block) {
	2115	memset(block->idstr, 0, sizeof(block->idstr));
	2116	}
	2117	}
	2118
	2119	size_t qemu_ram_pagesize(RAMBlock *rb)
	2120	{
	2121	return rb->page_size;
	2122	}
	2123
	2124	/* Returns the largest size of page in use */
	2125	size_t qemu_ram_pagesize_largest(void)
	2126	{
	2127	RAMBlock *block;
	2128	size_t largest = 0;
	2129
	2130	RAMBLOCK_FOREACH(block) {
	2131	largest = MAX(largest, qemu_ram_pagesize(block));
	2132	}
	2133
	2134	return largest;
	2135	}
	2136
	2137	static int memory_try_enable_merging(void *addr, size_t len)
	2138	{
	2139	if (!machine_mem_merge(current_machine)) {
	2140	/* disabled by the user */
	2141	return 0;
	2142	}
	2143
	2144	return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
	2145	}
	2146
	2147	/* Only legal before guest might have detected the memory size: e.g. on
	2148	* incoming migration, or right after reset.
	2149	*
	2150	* As memory core doesn't know how is memory accessed, it is up to
	2151	* resize callback to update device state and/or add assertions to detect
	2152	* misuse, if necessary.
	2153	*/
	2154	int qemu_ram_resize(RAMBlock block, ram_addr_t newsize, Error *errp)
	2155	{
	2156	assert(block);
	2157
	2158	newsize = HOST_PAGE_ALIGN(newsize);
	2159
	2160	if (block->used_length == newsize) {
	2161	return 0;
	2162	}
	2163
	2164	if (!(block->flags & RAM_RESIZEABLE)) {
	2165	error_setg_errno(errp, EINVAL,
	2166	"Length mismatch: %s: 0x" RAM_ADDR_FMT
	2167	" in != 0x" RAM_ADDR_FMT, block->idstr,
	2168	newsize, block->used_length);
	2169	return -EINVAL;
	2170	}
	2171
	2172	if (block->max_length < newsize) {
	2173	error_setg_errno(errp, EINVAL,
	2174	"Length too large: %s: 0x" RAM_ADDR_FMT
	2175	" > 0x" RAM_ADDR_FMT, block->idstr,
	2176	newsize, block->max_length);
	2177	return -EINVAL;
	2178	}
	2179
	2180	cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
	2181	block->used_length = newsize;
	2182	cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
	2183	DIRTY_CLIENTS_ALL);
	2184	memory_region_set_size(block->mr, newsize);
	2185	if (block->resized) {
	2186	block->resized(block->idstr, newsize, block->host);
	2187	}
	2188	return 0;
	2189	}
	2190
	2191	/* Called with ram_list.mutex held */
	2192	static void dirty_memory_extend(ram_addr_t old_ram_size,
	2193	ram_addr_t new_ram_size)
	2194	{
	2195	ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
	2196	DIRTY_MEMORY_BLOCK_SIZE);
	2197	ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
	2198	DIRTY_MEMORY_BLOCK_SIZE);
	2199	int i;
	2200
	2201	/* Only need to extend if block count increased */
	2202	if (new_num_blocks <= old_num_blocks) {
	2203	return;
	2204	}
	2205
	2206	for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
	2207	DirtyMemoryBlocks *old_blocks;
	2208	DirtyMemoryBlocks *new_blocks;
	2209	int j;
	2210
	2211	old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
	2212	new_blocks = g_malloc(sizeof(*new_blocks) +
	2213	sizeof(new_blocks->blocks[0]) * new_num_blocks);
	2214
	2215	if (old_num_blocks) {
	2216	memcpy(new_blocks->blocks, old_blocks->blocks,
	2217	old_num_blocks * sizeof(old_blocks->blocks[0]));
	2218	}
	2219
	2220	for (j = old_num_blocks; j < new_num_blocks; j++) {
	2221	new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
	2222	}
	2223
	2224	atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
	2225
	2226	if (old_blocks) {
	2227	g_free_rcu(old_blocks, rcu);
	2228	}
	2229	}
	2230	}
	2231
	2232	static void ram_block_add(RAMBlock new_block, Error *errp, bool shared)
	2233	{
	2234	RAMBlock *block;
	2235	RAMBlock *last_block = NULL;
	2236	ram_addr_t old_ram_size, new_ram_size;
	2237	Error *err = NULL;
	2238
	2239	old_ram_size = last_ram_page();
	2240
	2241	qemu_mutex_lock_ramlist();
	2242	new_block->offset = find_ram_offset(new_block->max_length);
	2243
	2244	if (!new_block->host) {
	2245	if (xen_enabled()) {
	2246	xen_ram_alloc(new_block->offset, new_block->max_length,
	2247	new_block->mr, &err);
	2248	if (err) {
	2249	error_propagate(errp, err);
	2250	qemu_mutex_unlock_ramlist();
	2251	return;
	2252	}
	2253	} else {
	2254	new_block->host = phys_mem_alloc(new_block->max_length,
	2255	&new_block->mr->align, shared);
	2256	if (!new_block->host) {
	2257	error_setg_errno(errp, errno,
	2258	"cannot set up guest memory '%s'",
	2259	memory_region_name(new_block->mr));
	2260	qemu_mutex_unlock_ramlist();
	2261	return;
	2262	}
	2263	memory_try_enable_merging(new_block->host, new_block->max_length);
	2264	}
	2265	}
	2266
	2267	new_ram_size = MAX(old_ram_size,
	2268	(new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
	2269	if (new_ram_size > old_ram_size) {
	2270	dirty_memory_extend(old_ram_size, new_ram_size);
	2271	}
	2272	/* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
	2273	* QLIST (which has an RCU-friendly variant) does not have insertion at
	2274	* tail, so save the last element in last_block.
	2275	*/
	2276	RAMBLOCK_FOREACH(block) {
	2277	last_block = block;
	2278	if (block->max_length < new_block->max_length) {
	2279	break;
	2280	}
	2281	}
	2282	if (block) {
	2283	QLIST_INSERT_BEFORE_RCU(block, new_block, next);
	2284	} else if (last_block) {
	2285	QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
	2286	} else { /* list is empty */
	2287	QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
	2288	}
	2289	ram_list.mru_block = NULL;
	2290
	2291	/* Write list before version */
	2292	smp_wmb();
	2293	ram_list.version++;
	2294	qemu_mutex_unlock_ramlist();
	2295
	2296	cpu_physical_memory_set_dirty_range(new_block->offset,
	2297	new_block->used_length,
	2298	DIRTY_CLIENTS_ALL);
	2299
	2300	if (new_block->host) {
	2301	qemu_ram_setup_dump(new_block->host, new_block->max_length);
	2302	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
	2303	/* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
	2304	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
	2305	ram_block_notify_add(new_block->host, new_block->max_length);
	2306	}
	2307	}
	2308
	2309	#ifdef CONFIG_POSIX
	2310	RAMBlock qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion mr,
	2311	uint32_t ram_flags, int fd,
	2312	Error **errp)
	2313	{
	2314	RAMBlock *new_block;
	2315	Error *local_err = NULL;
	2316	int64_t file_size;
	2317
	2318	/* Just support these ram flags by now. */
	2319	assert((ram_flags & ~(RAM_SHARED \| RAM_PMEM)) == 0);
	2320
	2321	if (xen_enabled()) {
	2322	error_setg(errp, "-mem-path not supported with Xen");
	2323	return NULL;
	2324	}
	2325
	2326	if (kvm_enabled() && !kvm_has_sync_mmu()) {
	2327	error_setg(errp,
	2328	"host lacks kvm mmu notifiers, -mem-path unsupported");
	2329	return NULL;
	2330	}
	2331
	2332	if (phys_mem_alloc != qemu_anon_ram_alloc) {
	2333	/*
	2334	* file_ram_alloc() needs to allocate just like
	2335	* phys_mem_alloc, but we haven't bothered to provide
	2336	* a hook there.
	2337	*/
	2338	error_setg(errp,
	2339	"-mem-path not supported with this accelerator");
	2340	return NULL;
	2341	}
	2342
	2343	size = HOST_PAGE_ALIGN(size);
	2344	file_size = get_file_size(fd);
	2345	if (file_size > 0 && file_size < size) {
	2346	error_setg(errp, "backing store %s size 0x%" PRIx64
	2347	" does not match 'size' option 0x" RAM_ADDR_FMT,
	2348	mem_path, file_size, size);
	2349	return NULL;
	2350	}
	2351
	2352	new_block = g_malloc0(sizeof(*new_block));
	2353	new_block->mr = mr;
	2354	new_block->used_length = size;
	2355	new_block->max_length = size;
	2356	new_block->flags = ram_flags;
	2357	new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
	2358	if (!new_block->host) {
	2359	g_free(new_block);
	2360	return NULL;
	2361	}
	2362
	2363	ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
	2364	if (local_err) {
	2365	g_free(new_block);
	2366	error_propagate(errp, local_err);
	2367	return NULL;
	2368	}
	2369	return new_block;
	2370
	2371	}
	2372
	2373
	2374	RAMBlock qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion mr,
	2375	uint32_t ram_flags, const char *mem_path,
	2376	Error **errp)
	2377	{
	2378	int fd;
	2379	bool created;
	2380	RAMBlock *block;
	2381
	2382	fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
	2383	if (fd < 0) {
	2384	return NULL;
	2385	}
	2386
	2387	block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
	2388	if (!block) {
	2389	if (created) {
	2390	unlink(mem_path);
	2391	}
	2392	close(fd);
	2393	return NULL;
	2394	}
	2395
	2396	return block;
	2397	}
	2398	#endif
	2399
	2400	static
	2401	RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
	2402	void (resized)(const char,
	2403	uint64_t length,
	2404	void *host),
	2405	void *host, bool resizeable, bool share,
	2406	MemoryRegion mr, Error *errp)
	2407	{
	2408	RAMBlock *new_block;
	2409	Error *local_err = NULL;
	2410
	2411	size = HOST_PAGE_ALIGN(size);
	2412	max_size = HOST_PAGE_ALIGN(max_size);
	2413	new_block = g_malloc0(sizeof(*new_block));
	2414	new_block->mr = mr;
	2415	new_block->resized = resized;
	2416	new_block->used_length = size;
	2417	new_block->max_length = max_size;
	2418	assert(max_size >= size);
	2419	new_block->fd = -1;
	2420	new_block->page_size = getpagesize();
	2421	new_block->host = host;
	2422	if (host) {
	2423	new_block->flags \|= RAM_PREALLOC;
	2424	}
	2425	if (resizeable) {
	2426	new_block->flags \|= RAM_RESIZEABLE;
	2427	}
	2428	ram_block_add(new_block, &local_err, share);
	2429	if (local_err) {
	2430	g_free(new_block);
	2431	error_propagate(errp, local_err);
	2432	return NULL;
	2433	}
	2434	return new_block;
	2435	}
	2436
	2437	RAMBlock qemu_ram_alloc_from_ptr(ram_addr_t size, void host,
	2438	MemoryRegion mr, Error *errp)
	2439	{
	2440	return qemu_ram_alloc_internal(size, size, NULL, host, false,
	2441	false, mr, errp);
	2442	}
	2443
	2444	RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
	2445	MemoryRegion mr, Error *errp)
	2446	{
	2447	return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
	2448	share, mr, errp);
	2449	}
	2450
	2451	RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
	2452	void (resized)(const char,
	2453	uint64_t length,
	2454	void *host),
	2455	MemoryRegion mr, Error *errp)
	2456	{
	2457	return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
	2458	false, mr, errp);
	2459	}
	2460
	2461	static void reclaim_ramblock(RAMBlock *block)
	2462	{
	2463	if (block->flags & RAM_PREALLOC) {
	2464	;
	2465	} else if (xen_enabled()) {
	2466	xen_invalidate_map_cache_entry(block->host);
	2467	#ifndef _WIN32
	2468	} else if (block->fd >= 0) {
	2469	qemu_ram_munmap(block->fd, block->host, block->max_length);
	2470	close(block->fd);
	2471	#endif
	2472	} else {
	2473	qemu_anon_ram_free(block->host, block->max_length);
	2474	}
	2475	g_free(block);
	2476	}
	2477
	2478	void qemu_ram_free(RAMBlock *block)
	2479	{
	2480	if (!block) {
	2481	return;
	2482	}
	2483
	2484	if (block->host) {
	2485	ram_block_notify_remove(block->host, block->max_length);
	2486	}
	2487
	2488	qemu_mutex_lock_ramlist();
	2489	QLIST_REMOVE_RCU(block, next);
	2490	ram_list.mru_block = NULL;
	2491	/* Write list before version */
	2492	smp_wmb();
	2493	ram_list.version++;
	2494	call_rcu(block, reclaim_ramblock, rcu);
	2495	qemu_mutex_unlock_ramlist();
	2496	}
	2497
	2498	#ifndef _WIN32
	2499	void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
	2500	{
	2501	RAMBlock *block;
	2502	ram_addr_t offset;
	2503	int flags;
	2504	void area, vaddr;
	2505
	2506	RAMBLOCK_FOREACH(block) {
	2507	offset = addr - block->offset;
	2508	if (offset < block->max_length) {
	2509	vaddr = ramblock_ptr(block, offset);
	2510	if (block->flags & RAM_PREALLOC) {
	2511	;
	2512	} else if (xen_enabled()) {
	2513	abort();
	2514	} else {
	2515	flags = MAP_FIXED;
	2516	if (block->fd >= 0) {
	2517	flags \|= (block->flags & RAM_SHARED ?
	2518	MAP_SHARED : MAP_PRIVATE);
	2519	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2520	flags, block->fd, offset);
	2521	} else {
	2522	/*
	2523	* Remap needs to match alloc. Accelerators that
	2524	* set phys_mem_alloc never remap. If they did,
	2525	* we'd need a remap hook here.
	2526	*/
	2527	assert(phys_mem_alloc == qemu_anon_ram_alloc);
	2528
	2529	flags \|= MAP_PRIVATE \| MAP_ANONYMOUS;
	2530	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2531	flags, -1, 0);
	2532	}
	2533	if (area != vaddr) {
	2534	error_report("Could not remap addr: "
	2535	RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
	2536	length, addr);
	2537	exit(1);
	2538	}
	2539	memory_try_enable_merging(vaddr, length);
	2540	qemu_ram_setup_dump(vaddr, length);
	2541	}
	2542	}
	2543	}
	2544	}
	2545	#endif /* !_WIN32 */
	2546
	2547	/* Return a host pointer to ram allocated with qemu_ram_alloc.
	2548	* This should not be used for general purpose DMA. Use address_space_map
	2549	* or address_space_rw instead. For local memory (e.g. video ram) that the
	2550	* device owns, use memory_region_get_ram_ptr.
	2551	*
	2552	* Called within RCU critical section.
	2553	*/
	2554	void qemu_map_ram_ptr(RAMBlock ram_block, ram_addr_t addr)
	2555	{
	2556	RAMBlock *block = ram_block;
	2557
	2558	if (block == NULL) {
	2559	block = qemu_get_ram_block(addr);
	2560	addr -= block->offset;
	2561	}
	2562
	2563	if (xen_enabled() && block->host == NULL) {
	2564	/* We need to check if the requested address is in the RAM
	2565	* because we don't want to map the entire memory in QEMU.
	2566	* In that case just map until the end of the page.
	2567	*/
	2568	if (block->offset == 0) {
	2569	return xen_map_cache(addr, 0, 0, false);
	2570	}
	2571
	2572	block->host = xen_map_cache(block->offset, block->max_length, 1, false);
	2573	}
	2574	return ramblock_ptr(block, addr);
	2575	}
	2576
	2577	/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
	2578	* but takes a size argument.
	2579	*
	2580	* Called within RCU critical section.
	2581	*/
	2582	static void qemu_ram_ptr_length(RAMBlock ram_block, ram_addr_t addr,
	2583	hwaddr *size, bool lock)
	2584	{
	2585	RAMBlock *block = ram_block;
	2586	if (*size == 0) {
	2587	return NULL;
	2588	}
	2589
	2590	if (block == NULL) {
	2591	block = qemu_get_ram_block(addr);
	2592	addr -= block->offset;
	2593	}
	2594	size = MIN(size, block->max_length - addr);
	2595
	2596	if (xen_enabled() && block->host == NULL) {
	2597	/* We need to check if the requested address is in the RAM
	2598	* because we don't want to map the entire memory in QEMU.
	2599	* In that case just map the requested area.
	2600	*/
	2601	if (block->offset == 0) {
	2602	return xen_map_cache(addr, *size, lock, lock);
	2603	}
	2604
	2605	block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
	2606	}
	2607
	2608	return ramblock_ptr(block, addr);
	2609	}
	2610
	2611	/* Return the offset of a hostpointer within a ramblock */
	2612	ram_addr_t qemu_ram_block_host_offset(RAMBlock rb, void host)
	2613	{
	2614	ram_addr_t res = (uint8_t )host - (uint8_t )rb->host;
	2615	assert((uintptr_t)host >= (uintptr_t)rb->host);
	2616	assert(res < rb->max_length);
	2617
	2618	return res;
	2619	}
	2620
	2621	/*
	2622	* Translates a host ptr back to a RAMBlock, a ram_addr and an offset
	2623	* in that RAMBlock.
	2624	*
	2625	* ptr: Host pointer to look up
	2626	* round_offset: If true round the result offset down to a page boundary
	2627	* *ram_addr: set to result ram_addr
	2628	* *offset: set to result offset within the RAMBlock
	2629	*
	2630	* Returns: RAMBlock (or NULL if not found)
	2631	*
	2632	* By the time this function returns, the returned pointer is not protected
	2633	* by RCU anymore. If the caller is not within an RCU critical section and
	2634	* does not hold the iothread lock, it must have other means of protecting the
	2635	* pointer, such as a reference to the region that includes the incoming
	2636	* ram_addr_t.
	2637	*/
	2638	RAMBlock qemu_ram_block_from_host(void ptr, bool round_offset,
	2639	ram_addr_t *offset)
	2640	{
	2641	RAMBlock *block;
	2642	uint8_t *host = ptr;
	2643
	2644	if (xen_enabled()) {
	2645	ram_addr_t ram_addr;
	2646	RCU_READ_LOCK_GUARD();
	2647	ram_addr = xen_ram_addr_from_mapcache(ptr);
	2648	block = qemu_get_ram_block(ram_addr);
	2649	if (block) {
	2650	*offset = ram_addr - block->offset;
	2651	}
	2652	return block;
	2653	}
	2654
	2655	RCU_READ_LOCK_GUARD();
	2656	block = atomic_rcu_read(&ram_list.mru_block);
	2657	if (block && block->host && host - block->host < block->max_length) {
	2658	goto found;
	2659	}
	2660
	2661	RAMBLOCK_FOREACH(block) {
	2662	/* This case append when the block is not mapped. */
	2663	if (block->host == NULL) {
	2664	continue;
	2665	}
	2666	if (host - block->host < block->max_length) {
	2667	goto found;
	2668	}
	2669	}
	2670
	2671	return NULL;
	2672
	2673	found:
	2674	*offset = (host - block->host);
	2675	if (round_offset) {
	2676	*offset &= TARGET_PAGE_MASK;
	2677	}
	2678	return block;
	2679	}
	2680
	2681	/*
	2682	* Finds the named RAMBlock
	2683	*
	2684	* name: The name of RAMBlock to find
	2685	*
	2686	* Returns: RAMBlock (or NULL if not found)
	2687	*/
	2688	RAMBlock qemu_ram_block_by_name(const char name)
	2689	{
	2690	RAMBlock *block;
	2691
	2692	RAMBLOCK_FOREACH(block) {
	2693	if (!strcmp(name, block->idstr)) {
	2694	return block;
	2695	}
	2696	}
	2697
	2698	return NULL;
	2699	}
	2700
	2701	/* Some of the softmmu routines need to translate from a host pointer
	2702	(typically a TLB entry) back to a ram offset. */
	2703	ram_addr_t qemu_ram_addr_from_host(void *ptr)
	2704	{
	2705	RAMBlock *block;
	2706	ram_addr_t offset;
	2707
	2708	block = qemu_ram_block_from_host(ptr, false, &offset);
	2709	if (!block) {
	2710	return RAM_ADDR_INVALID;
	2711	}
	2712
	2713	return block->offset + offset;
	2714	}
	2715
	2716	/* Generate a debug exception if a watchpoint has been hit. */
	2717	void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
	2718	MemTxAttrs attrs, int flags, uintptr_t ra)
	2719	{
	2720	CPUClass *cc = CPU_GET_CLASS(cpu);
	2721	CPUWatchpoint *wp;
	2722
	2723	assert(tcg_enabled());
	2724	if (cpu->watchpoint_hit) {
	2725	/*
	2726	* We re-entered the check after replacing the TB.
	2727	* Now raise the debug interrupt so that it will
	2728	* trigger after the current instruction.
	2729	*/
	2730	qemu_mutex_lock_iothread();
	2731	cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
	2732	qemu_mutex_unlock_iothread();
	2733	return;
	2734	}
	2735
	2736	addr = cc->adjust_watchpoint_address(cpu, addr, len);
	2737	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	2738	if (watchpoint_address_matches(wp, addr, len)
	2739	&& (wp->flags & flags)) {
	2740	if (flags == BP_MEM_READ) {
	2741	wp->flags \|= BP_WATCHPOINT_HIT_READ;
	2742	} else {
	2743	wp->flags \|= BP_WATCHPOINT_HIT_WRITE;
	2744	}
	2745	wp->hitaddr = MAX(addr, wp->vaddr);
	2746	wp->hitattrs = attrs;
	2747	if (!cpu->watchpoint_hit) {
	2748	if (wp->flags & BP_CPU &&
	2749	!cc->debug_check_watchpoint(cpu, wp)) {
	2750	wp->flags &= ~BP_WATCHPOINT_HIT;
	2751	continue;
	2752	}
	2753	cpu->watchpoint_hit = wp;
	2754
	2755	mmap_lock();
	2756	tb_check_watchpoint(cpu, ra);
	2757	if (wp->flags & BP_STOP_BEFORE_ACCESS) {
	2758	cpu->exception_index = EXCP_DEBUG;
	2759	mmap_unlock();
	2760	cpu_loop_exit_restore(cpu, ra);
	2761	} else {
	2762	/* Force execution of one insn next time. */
	2763	cpu->cflags_next_tb = 1 \| curr_cflags();
	2764	mmap_unlock();
	2765	if (ra) {
	2766	cpu_restore_state(cpu, ra, true);
	2767	}
	2768	cpu_loop_exit_noexc(cpu);
	2769	}
	2770	}
	2771	} else {
	2772	wp->flags &= ~BP_WATCHPOINT_HIT;
	2773	}
	2774	}
	2775	}
	2776
	2777	static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
	2778	MemTxAttrs attrs, uint8_t *buf, hwaddr len);
	2779	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	2780	const uint8_t *buf, hwaddr len);
	2781	static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
	2782	bool is_write, MemTxAttrs attrs);
	2783
	2784	static MemTxResult subpage_read(void opaque, hwaddr addr, uint64_t data,
	2785	unsigned len, MemTxAttrs attrs)
	2786	{
	2787	subpage_t *subpage = opaque;
	2788	uint8_t buf[8];
	2789	MemTxResult res;
	2790
	2791	#if defined(DEBUG_SUBPAGE)
	2792	printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
	2793	subpage, len, addr);
	2794	#endif
	2795	res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
	2796	if (res) {
	2797	return res;
	2798	}
	2799	*data = ldn_p(buf, len);
	2800	return MEMTX_OK;
	2801	}
	2802
	2803	static MemTxResult subpage_write(void *opaque, hwaddr addr,
	2804	uint64_t value, unsigned len, MemTxAttrs attrs)
	2805	{
	2806	subpage_t *subpage = opaque;
	2807	uint8_t buf[8];
	2808
	2809	#if defined(DEBUG_SUBPAGE)
	2810	printf("%s: subpage %p len %u addr " TARGET_FMT_plx
	2811	" value %"PRIx64"\n",
	2812	__func__, subpage, len, addr, value);
	2813	#endif
	2814	stn_p(buf, len, value);
	2815	return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
	2816	}
	2817
	2818	static bool subpage_accepts(void *opaque, hwaddr addr,
	2819	unsigned len, bool is_write,
	2820	MemTxAttrs attrs)
	2821	{
	2822	subpage_t *subpage = opaque;
	2823	#if defined(DEBUG_SUBPAGE)
	2824	printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
	2825	__func__, subpage, is_write ? 'w' : 'r', len, addr);
	2826	#endif
	2827
	2828	return flatview_access_valid(subpage->fv, addr + subpage->base,
	2829	len, is_write, attrs);
	2830	}
	2831
	2832	static const MemoryRegionOps subpage_ops = {
	2833	.read_with_attrs = subpage_read,
	2834	.write_with_attrs = subpage_write,
	2835	.impl.min_access_size = 1,
	2836	.impl.max_access_size = 8,
	2837	.valid.min_access_size = 1,
	2838	.valid.max_access_size = 8,
	2839	.valid.accepts = subpage_accepts,
	2840	.endianness = DEVICE_NATIVE_ENDIAN,
	2841	};
	2842
	2843	static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
	2844	uint16_t section)
	2845	{
	2846	int idx, eidx;
	2847
	2848	if (start >= TARGET_PAGE_SIZE \|\| end >= TARGET_PAGE_SIZE)
	2849	return -1;
	2850	idx = SUBPAGE_IDX(start);
	2851	eidx = SUBPAGE_IDX(end);
	2852	#if defined(DEBUG_SUBPAGE)
	2853	printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
	2854	__func__, mmio, start, end, idx, eidx, section);
	2855	#endif
	2856	for (; idx <= eidx; idx++) {
	2857	mmio->sub_section[idx] = section;
	2858	}
	2859
	2860	return 0;
	2861	}
	2862
	2863	static subpage_t subpage_init(FlatView fv, hwaddr base)
	2864	{
	2865	subpage_t *mmio;
	2866
	2867	/* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
	2868	mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
	2869	mmio->fv = fv;
	2870	mmio->base = base;
	2871	memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
	2872	NULL, TARGET_PAGE_SIZE);
	2873	mmio->iomem.subpage = true;
	2874	#if defined(DEBUG_SUBPAGE)
	2875	printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
	2876	mmio, base, TARGET_PAGE_SIZE);
	2877	#endif
	2878
	2879	return mmio;
	2880	}
	2881
	2882	static uint16_t dummy_section(PhysPageMap map, FlatView fv, MemoryRegion *mr)
	2883	{
	2884	assert(fv);
	2885	MemoryRegionSection section = {
	2886	.fv = fv,
	2887	.mr = mr,
	2888	.offset_within_address_space = 0,
	2889	.offset_within_region = 0,
	2890	.size = int128_2_64(),
	2891	};
	2892
	2893	return phys_section_add(map, &section);
	2894	}
	2895
	2896	MemoryRegionSection iotlb_to_section(CPUState cpu,
	2897	hwaddr index, MemTxAttrs attrs)
	2898	{
	2899	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	2900	CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
	2901	AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
	2902	MemoryRegionSection *sections = d->map.sections;
	2903
	2904	return &sections[index & ~TARGET_PAGE_MASK];
	2905	}
	2906
	2907	static void io_mem_init(void)
	2908	{
	2909	memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
	2910	NULL, UINT64_MAX);
	2911	}
	2912
	2913	AddressSpaceDispatch address_space_dispatch_new(FlatView fv)
	2914	{
	2915	AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
	2916	uint16_t n;
	2917
	2918	n = dummy_section(&d->map, fv, &io_mem_unassigned);
	2919	assert(n == PHYS_SECTION_UNASSIGNED);
	2920
	2921	d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
	2922
	2923	return d;
	2924	}
	2925
	2926	void address_space_dispatch_free(AddressSpaceDispatch *d)
	2927	{
	2928	phys_sections_free(&d->map);
	2929	g_free(d);
	2930	}
	2931
	2932	static void do_nothing(CPUState *cpu, run_on_cpu_data d)
	2933	{
	2934	}
	2935
	2936	static void tcg_log_global_after_sync(MemoryListener *listener)
	2937	{
	2938	CPUAddressSpace *cpuas;
	2939
	2940	/* Wait for the CPU to end the current TB. This avoids the following
	2941	* incorrect race:
	2942	*
	2943	* vCPU migration
	2944	* ---------------------- -------------------------
	2945	* TLB check -> slow path
	2946	* notdirty_mem_write
	2947	* write to RAM
	2948	* mark dirty
	2949	* clear dirty flag
	2950	* TLB check -> fast path
	2951	* read memory
	2952	* write to RAM
	2953	*
	2954	* by pushing the migration thread's memory read after the vCPU thread has
	2955	* written the memory.
	2956	*/
	2957	if (replay_mode == REPLAY_MODE_NONE) {
	2958	/*
	2959	* VGA can make calls to this function while updating the screen.
	2960	* In record/replay mode this causes a deadlock, because
	2961	* run_on_cpu waits for rr mutex. Therefore no races are possible
	2962	* in this case and no need for making run_on_cpu when
	2963	* record/replay is not enabled.
	2964	*/
	2965	cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
	2966	run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
	2967	}
	2968	}
	2969
	2970	static void tcg_commit(MemoryListener *listener)
	2971	{
	2972	CPUAddressSpace *cpuas;
	2973	AddressSpaceDispatch *d;
	2974
	2975	assert(tcg_enabled());
	2976	/* since each CPU stores ram addresses in its TLB cache, we must
	2977	reset the modified entries */
	2978	cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
	2979	cpu_reloading_memory_map();
	2980	/* The CPU and TLB are protected by the iothread lock.
	2981	* We reload the dispatch pointer now because cpu_reloading_memory_map()
	2982	* may have split the RCU critical section.
	2983	*/
	2984	d = address_space_to_dispatch(cpuas->as);
	2985	atomic_rcu_set(&cpuas->memory_dispatch, d);
	2986	tlb_flush(cpuas->cpu);
	2987	}
	2988
	2989	static void memory_map_init(void)
	2990	{
	2991	system_memory = g_malloc(sizeof(*system_memory));
	2992
	2993	memory_region_init(system_memory, NULL, "system", UINT64_MAX);
	2994	address_space_init(&address_space_memory, system_memory, "memory");
	2995
	2996	system_io = g_malloc(sizeof(*system_io));
	2997	memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
	2998	65536);
	2999	address_space_init(&address_space_io, system_io, "I/O");
	3000	}
	3001
	3002	MemoryRegion *get_system_memory(void)
	3003	{
	3004	return system_memory;
	3005	}
	3006
	3007	MemoryRegion *get_system_io(void)
	3008	{
	3009	return system_io;
	3010	}
	3011
	3012	#endif /* !defined(CONFIG_USER_ONLY) */
	3013
	3014	/* physical memory access (slow version, mainly for debug) */
	3015	#if defined(CONFIG_USER_ONLY)
	3016	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3017	uint8_t *buf, target_ulong len, int is_write)
	3018	{
	3019	int flags;
	3020	target_ulong l, page;
	3021	void * p;
	3022
	3023	while (len > 0) {
	3024	page = addr & TARGET_PAGE_MASK;
	3025	l = (page + TARGET_PAGE_SIZE) - addr;
	3026	if (l > len)
	3027	l = len;
	3028	flags = page_get_flags(page);
	3029	if (!(flags & PAGE_VALID))
	3030	return -1;
	3031	if (is_write) {
	3032	if (!(flags & PAGE_WRITE))
	3033	return -1;
	3034	/* XXX: this code should not depend on lock_user */
	3035	if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
	3036	return -1;
	3037	memcpy(p, buf, l);
	3038	unlock_user(p, addr, l);
	3039	} else {
	3040	if (!(flags & PAGE_READ))
	3041	return -1;
	3042	/* XXX: this code should not depend on lock_user */
	3043	if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
	3044	return -1;
	3045	memcpy(buf, p, l);
	3046	unlock_user(p, addr, 0);
	3047	}
	3048	len -= l;
	3049	buf += l;
	3050	addr += l;
	3051	}
	3052	return 0;
	3053	}
	3054
	3055	#else
	3056
	3057	static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
	3058	hwaddr length)
	3059	{
	3060	uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
	3061	addr += memory_region_get_ram_addr(mr);
	3062
	3063	/* No early return if dirty_log_mask is or becomes 0, because
	3064	* cpu_physical_memory_set_dirty_range will still call
	3065	* xen_modified_memory.
	3066	*/
	3067	if (dirty_log_mask) {
	3068	dirty_log_mask =
	3069	cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
	3070	}
	3071	if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
	3072	assert(tcg_enabled());
	3073	tb_invalidate_phys_range(addr, addr + length);
	3074	dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
	3075	}
	3076	cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
	3077	}
	3078
	3079	void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
	3080	{
	3081	/*
	3082	* In principle this function would work on other memory region types too,
	3083	* but the ROM device use case is the only one where this operation is
	3084	* necessary. Other memory regions should use the
	3085	* address_space_read/write() APIs.
	3086	*/
	3087	assert(memory_region_is_romd(mr));
	3088
	3089	invalidate_and_set_dirty(mr, addr, size);
	3090	}
	3091
	3092	static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
	3093	{
	3094	unsigned access_size_max = mr->ops->valid.max_access_size;
	3095
	3096	/* Regions are assumed to support 1-4 byte accesses unless
	3097	otherwise specified. */
	3098	if (access_size_max == 0) {
	3099	access_size_max = 4;
	3100	}
	3101
	3102	/* Bound the maximum access by the alignment of the address. */
	3103	if (!mr->ops->impl.unaligned) {
	3104	unsigned align_size_max = addr & -addr;
	3105	if (align_size_max != 0 && align_size_max < access_size_max) {
	3106	access_size_max = align_size_max;
	3107	}
	3108	}
	3109
	3110	/* Don't attempt accesses larger than the maximum. */
	3111	if (l > access_size_max) {
	3112	l = access_size_max;
	3113	}
	3114	l = pow2floor(l);
	3115
	3116	return l;
	3117	}
	3118
	3119	static bool prepare_mmio_access(MemoryRegion *mr)
	3120	{
	3121	bool unlocked = !qemu_mutex_iothread_locked();
	3122	bool release_lock = false;
	3123
	3124	if (unlocked && mr->global_locking) {
	3125	qemu_mutex_lock_iothread();
	3126	unlocked = false;
	3127	release_lock = true;
	3128	}
	3129	if (mr->flush_coalesced_mmio) {
	3130	if (unlocked) {
	3131	qemu_mutex_lock_iothread();
	3132	}
	3133	qemu_flush_coalesced_mmio_buffer();
	3134	if (unlocked) {
	3135	qemu_mutex_unlock_iothread();
	3136	}
	3137	}
	3138
	3139	return release_lock;
	3140	}
	3141
	3142	/* Called within RCU critical section. */
	3143	static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
	3144	MemTxAttrs attrs,
	3145	const uint8_t *buf,
	3146	hwaddr len, hwaddr addr1,
	3147	hwaddr l, MemoryRegion *mr)
	3148	{
	3149	uint8_t *ptr;
	3150	uint64_t val;
	3151	MemTxResult result = MEMTX_OK;
	3152	bool release_lock = false;
	3153
	3154	for (;;) {
	3155	if (!memory_access_is_direct(mr, true)) {
	3156	release_lock \|= prepare_mmio_access(mr);
	3157	l = memory_access_size(mr, l, addr1);
	3158	/* XXX: could force current_cpu to NULL to avoid
	3159	potential bugs */
	3160	val = ldn_he_p(buf, l);
	3161	result \|= memory_region_dispatch_write(mr, addr1, val,
	3162	size_memop(l), attrs);
	3163	} else {
	3164	/* RAM case */
	3165	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3166	memcpy(ptr, buf, l);
	3167	invalidate_and_set_dirty(mr, addr1, l);
	3168	}
	3169
	3170	if (release_lock) {
	3171	qemu_mutex_unlock_iothread();
	3172	release_lock = false;
	3173	}
	3174
	3175	len -= l;
	3176	buf += l;
	3177	addr += l;
	3178
	3179	if (!len) {
	3180	break;
	3181	}
	3182
	3183	l = len;
	3184	mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
	3185	}
	3186
	3187	return result;
	3188	}
	3189
	3190	/* Called from RCU critical section. */
	3191	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	3192	const uint8_t *buf, hwaddr len)
	3193	{
	3194	hwaddr l;
	3195	hwaddr addr1;
	3196	MemoryRegion *mr;
	3197	MemTxResult result = MEMTX_OK;
	3198
	3199	l = len;
	3200	mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
	3201	result = flatview_write_continue(fv, addr, attrs, buf, len,
	3202	addr1, l, mr);
	3203
	3204	return result;
	3205	}
	3206
	3207	/* Called within RCU critical section. */
	3208	MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
	3209	MemTxAttrs attrs, uint8_t *buf,
	3210	hwaddr len, hwaddr addr1, hwaddr l,
	3211	MemoryRegion *mr)
	3212	{
	3213	uint8_t *ptr;
	3214	uint64_t val;
	3215	MemTxResult result = MEMTX_OK;
	3216	bool release_lock = false;
	3217
	3218	for (;;) {
	3219	if (!memory_access_is_direct(mr, false)) {
	3220	/* I/O case */
	3221	release_lock \|= prepare_mmio_access(mr);
	3222	l = memory_access_size(mr, l, addr1);
	3223	result \|= memory_region_dispatch_read(mr, addr1, &val,
	3224	size_memop(l), attrs);
	3225	stn_he_p(buf, l, val);
	3226	} else {
	3227	/* RAM case */
	3228	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3229	memcpy(buf, ptr, l);
	3230	}
	3231
	3232	if (release_lock) {
	3233	qemu_mutex_unlock_iothread();
	3234	release_lock = false;
	3235	}
	3236
	3237	len -= l;
	3238	buf += l;
	3239	addr += l;
	3240
	3241	if (!len) {
	3242	break;
	3243	}
	3244
	3245	l = len;
	3246	mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
	3247	}
	3248
	3249	return result;
	3250	}
	3251
	3252	/* Called from RCU critical section. */
	3253	static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
	3254	MemTxAttrs attrs, uint8_t *buf, hwaddr len)
	3255	{
	3256	hwaddr l;
	3257	hwaddr addr1;
	3258	MemoryRegion *mr;
	3259
	3260	l = len;
	3261	mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
	3262	return flatview_read_continue(fv, addr, attrs, buf, len,
	3263	addr1, l, mr);
	3264	}
	3265
	3266	MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
	3267	MemTxAttrs attrs, uint8_t *buf, hwaddr len)
	3268	{
	3269	MemTxResult result = MEMTX_OK;
	3270	FlatView *fv;
	3271
	3272	if (len > 0) {
	3273	RCU_READ_LOCK_GUARD();
	3274	fv = address_space_to_flatview(as);
	3275	result = flatview_read(fv, addr, attrs, buf, len);
	3276	}
	3277
	3278	return result;
	3279	}
	3280
	3281	MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
	3282	MemTxAttrs attrs,
	3283	const uint8_t *buf, hwaddr len)
	3284	{
	3285	MemTxResult result = MEMTX_OK;
	3286	FlatView *fv;
	3287
	3288	if (len > 0) {
	3289	RCU_READ_LOCK_GUARD();
	3290	fv = address_space_to_flatview(as);
	3291	result = flatview_write(fv, addr, attrs, buf, len);
	3292	}
	3293
	3294	return result;
	3295	}
	3296
	3297	MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
	3298	uint8_t *buf, hwaddr len, bool is_write)
	3299	{
	3300	if (is_write) {
	3301	return address_space_write(as, addr, attrs, buf, len);
	3302	} else {
	3303	return address_space_read_full(as, addr, attrs, buf, len);
	3304	}
	3305	}
	3306
	3307	void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
	3308	hwaddr len, int is_write)
	3309	{
	3310	address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
	3311	buf, len, is_write);
	3312	}
	3313
	3314	enum write_rom_type {
	3315	WRITE_DATA,
	3316	FLUSH_CACHE,
	3317	};
	3318
	3319	static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
	3320	hwaddr addr,
	3321	MemTxAttrs attrs,
	3322	const uint8_t *buf,
	3323	hwaddr len,
	3324	enum write_rom_type type)
	3325	{
	3326	hwaddr l;
	3327	uint8_t *ptr;
	3328	hwaddr addr1;
	3329	MemoryRegion *mr;
	3330
	3331	RCU_READ_LOCK_GUARD();
	3332	while (len > 0) {
	3333	l = len;
	3334	mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
	3335
	3336	if (!(memory_region_is_ram(mr) \|\|
	3337	memory_region_is_romd(mr))) {
	3338	l = memory_access_size(mr, l, addr1);
	3339	} else {
	3340	/* ROM/RAM case */
	3341	ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
	3342	switch (type) {
	3343	case WRITE_DATA:
	3344	memcpy(ptr, buf, l);
	3345	invalidate_and_set_dirty(mr, addr1, l);
	3346	break;
	3347	case FLUSH_CACHE:
	3348	flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
	3349	break;
	3350	}
	3351	}
	3352	len -= l;
	3353	buf += l;
	3354	addr += l;
	3355	}
	3356	return MEMTX_OK;
	3357	}
	3358
	3359	/* used for ROM loading : can write in RAM and ROM */
	3360	MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
	3361	MemTxAttrs attrs,
	3362	const uint8_t *buf, hwaddr len)
	3363	{
	3364	return address_space_write_rom_internal(as, addr, attrs,
	3365	buf, len, WRITE_DATA);
	3366	}
	3367
	3368	void cpu_flush_icache_range(hwaddr start, hwaddr len)
	3369	{
	3370	/*
	3371	* This function should do the same thing as an icache flush that was
	3372	* triggered from within the guest. For TCG we are always cache coherent,
	3373	* so there is no need to flush anything. For KVM / Xen we need to flush
	3374	* the host's instruction cache at least.
	3375	*/
	3376	if (tcg_enabled()) {
	3377	return;
	3378	}
	3379
	3380	address_space_write_rom_internal(&address_space_memory,
	3381	start, MEMTXATTRS_UNSPECIFIED,
	3382	NULL, len, FLUSH_CACHE);
	3383	}
	3384
	3385	typedef struct {
	3386	MemoryRegion *mr;
	3387	void *buffer;
	3388	hwaddr addr;
	3389	hwaddr len;
	3390	bool in_use;
	3391	} BounceBuffer;
	3392
	3393	static BounceBuffer bounce;
	3394
	3395	typedef struct MapClient {
	3396	QEMUBH *bh;
	3397	QLIST_ENTRY(MapClient) link;
	3398	} MapClient;
	3399
	3400	QemuMutex map_client_list_lock;
	3401	static QLIST_HEAD(, MapClient) map_client_list
	3402	= QLIST_HEAD_INITIALIZER(map_client_list);
	3403
	3404	static void cpu_unregister_map_client_do(MapClient *client)
	3405	{
	3406	QLIST_REMOVE(client, link);
	3407	g_free(client);
	3408	}
	3409
	3410	static void cpu_notify_map_clients_locked(void)
	3411	{
	3412	MapClient *client;
	3413
	3414	while (!QLIST_EMPTY(&map_client_list)) {
	3415	client = QLIST_FIRST(&map_client_list);
	3416	qemu_bh_schedule(client->bh);
	3417	cpu_unregister_map_client_do(client);
	3418	}
	3419	}
	3420
	3421	void cpu_register_map_client(QEMUBH *bh)
	3422	{
	3423	MapClient client = g_malloc(sizeof(client));
	3424
	3425	qemu_mutex_lock(&map_client_list_lock);
	3426	client->bh = bh;
	3427	QLIST_INSERT_HEAD(&map_client_list, client, link);
	3428	if (!atomic_read(&bounce.in_use)) {
	3429	cpu_notify_map_clients_locked();
	3430	}
	3431	qemu_mutex_unlock(&map_client_list_lock);
	3432	}
	3433
	3434	void cpu_exec_init_all(void)
	3435	{
	3436	qemu_mutex_init(&ram_list.mutex);
	3437	/* The data structures we set up here depend on knowing the page size,
	3438	* so no more changes can be made after this point.
	3439	* In an ideal world, nothing we did before we had finished the
	3440	* machine setup would care about the target page size, and we could
	3441	* do this much later, rather than requiring board models to state
	3442	* up front what their requirements are.
	3443	*/
	3444	finalize_target_page_bits();
	3445	io_mem_init();
	3446	memory_map_init();
	3447	qemu_mutex_init(&map_client_list_lock);
	3448	}
	3449
	3450	void cpu_unregister_map_client(QEMUBH *bh)
	3451	{
	3452	MapClient *client;
	3453
	3454	qemu_mutex_lock(&map_client_list_lock);
	3455	QLIST_FOREACH(client, &map_client_list, link) {
	3456	if (client->bh == bh) {
	3457	cpu_unregister_map_client_do(client);
	3458	break;
	3459	}
	3460	}
	3461	qemu_mutex_unlock(&map_client_list_lock);
	3462	}
	3463
	3464	static void cpu_notify_map_clients(void)
	3465	{
	3466	qemu_mutex_lock(&map_client_list_lock);
	3467	cpu_notify_map_clients_locked();
	3468	qemu_mutex_unlock(&map_client_list_lock);
	3469	}
	3470
	3471	static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
	3472	bool is_write, MemTxAttrs attrs)
	3473	{
	3474	MemoryRegion *mr;
	3475	hwaddr l, xlat;
	3476
	3477	while (len > 0) {
	3478	l = len;
	3479	mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
	3480	if (!memory_access_is_direct(mr, is_write)) {
	3481	l = memory_access_size(mr, l, addr);
	3482	if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
	3483	return false;
	3484	}
	3485	}
	3486
	3487	len -= l;
	3488	addr += l;
	3489	}
	3490	return true;
	3491	}
	3492
	3493	bool address_space_access_valid(AddressSpace *as, hwaddr addr,
	3494	hwaddr len, bool is_write,
	3495	MemTxAttrs attrs)
	3496	{
	3497	FlatView *fv;
	3498	bool result;
	3499
	3500	RCU_READ_LOCK_GUARD();
	3501	fv = address_space_to_flatview(as);
	3502	result = flatview_access_valid(fv, addr, len, is_write, attrs);
	3503	return result;
	3504	}
	3505
	3506	static hwaddr
	3507	flatview_extend_translation(FlatView *fv, hwaddr addr,
	3508	hwaddr target_len,
	3509	MemoryRegion *mr, hwaddr base, hwaddr len,
	3510	bool is_write, MemTxAttrs attrs)
	3511	{
	3512	hwaddr done = 0;
	3513	hwaddr xlat;
	3514	MemoryRegion *this_mr;
	3515
	3516	for (;;) {
	3517	target_len -= len;
	3518	addr += len;
	3519	done += len;
	3520	if (target_len == 0) {
	3521	return done;
	3522	}
	3523
	3524	len = target_len;
	3525	this_mr = flatview_translate(fv, addr, &xlat,
	3526	&len, is_write, attrs);
	3527	if (this_mr != mr \|\| xlat != base + done) {
	3528	return done;
	3529	}
	3530	}
	3531	}
	3532
	3533	/* Map a physical memory region into a host virtual address.
	3534	* May map a subset of the requested range, given by and returned in *plen.
	3535	* May return NULL if resources needed to perform the mapping are exhausted.
	3536	* Use only for reads OR writes - not for read-modify-write operations.
	3537	* Use cpu_register_map_client() to know when retrying the map operation is
	3538	* likely to succeed.
	3539	*/
	3540	void address_space_map(AddressSpace as,
	3541	hwaddr addr,
	3542	hwaddr *plen,
	3543	bool is_write,
	3544	MemTxAttrs attrs)
	3545	{
	3546	hwaddr len = *plen;
	3547	hwaddr l, xlat;
	3548	MemoryRegion *mr;
	3549	void *ptr;
	3550	FlatView *fv;
	3551
	3552	if (len == 0) {
	3553	return NULL;
	3554	}
	3555
	3556	l = len;
	3557	RCU_READ_LOCK_GUARD();
	3558	fv = address_space_to_flatview(as);
	3559	mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
	3560
	3561	if (!memory_access_is_direct(mr, is_write)) {
	3562	if (atomic_xchg(&bounce.in_use, true)) {
	3563	return NULL;
	3564	}
	3565	/* Avoid unbounded allocations */
	3566	l = MIN(l, TARGET_PAGE_SIZE);
	3567	bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
	3568	bounce.addr = addr;
	3569	bounce.len = l;
	3570
	3571	memory_region_ref(mr);
	3572	bounce.mr = mr;
	3573	if (!is_write) {
	3574	flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
	3575	bounce.buffer, l);
	3576	}
	3577
	3578	*plen = l;
	3579	return bounce.buffer;
	3580	}
	3581
	3582
	3583	memory_region_ref(mr);
	3584	*plen = flatview_extend_translation(fv, addr, len, mr, xlat,
	3585	l, is_write, attrs);
	3586	ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
	3587
	3588	return ptr;
	3589	}
	3590
	3591	/* Unmaps a memory region previously mapped by address_space_map().
	3592	* Will also mark the memory as dirty if is_write == 1. access_len gives
	3593	* the amount of memory that was actually read or written by the caller.
	3594	*/
	3595	void address_space_unmap(AddressSpace as, void buffer, hwaddr len,
	3596	int is_write, hwaddr access_len)
	3597	{
	3598	if (buffer != bounce.buffer) {
	3599	MemoryRegion *mr;
	3600	ram_addr_t addr1;
	3601
	3602	mr = memory_region_from_host(buffer, &addr1);
	3603	assert(mr != NULL);
	3604	if (is_write) {
	3605	invalidate_and_set_dirty(mr, addr1, access_len);
	3606	}
	3607	if (xen_enabled()) {
	3608	xen_invalidate_map_cache_entry(buffer);
	3609	}
	3610	memory_region_unref(mr);
	3611	return;
	3612	}
	3613	if (is_write) {
	3614	address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
	3615	bounce.buffer, access_len);
	3616	}
	3617	qemu_vfree(bounce.buffer);
	3618	bounce.buffer = NULL;
	3619	memory_region_unref(bounce.mr);
	3620	atomic_mb_set(&bounce.in_use, false);
	3621	cpu_notify_map_clients();
	3622	}
	3623
	3624	void *cpu_physical_memory_map(hwaddr addr,
	3625	hwaddr *plen,
	3626	int is_write)
	3627	{
	3628	return address_space_map(&address_space_memory, addr, plen, is_write,
	3629	MEMTXATTRS_UNSPECIFIED);
	3630	}
	3631
	3632	void cpu_physical_memory_unmap(void *buffer, hwaddr len,
	3633	int is_write, hwaddr access_len)
	3634	{
	3635	return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
	3636	}
	3637
	3638	#define ARG1_DECL AddressSpace *as
	3639	#define ARG1 as
	3640	#define SUFFIX
	3641	#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
	3642	#define RCU_READ_LOCK(...) rcu_read_lock()
	3643	#define RCU_READ_UNLOCK(...) rcu_read_unlock()
	3644	#include "memory_ldst.inc.c"
	3645
	3646	int64_t address_space_cache_init(MemoryRegionCache *cache,
	3647	AddressSpace *as,
	3648	hwaddr addr,
	3649	hwaddr len,
	3650	bool is_write)
	3651	{
	3652	AddressSpaceDispatch *d;
	3653	hwaddr l;
	3654	MemoryRegion *mr;
	3655
	3656	assert(len > 0);
	3657
	3658	l = len;
	3659	cache->fv = address_space_get_flatview(as);
	3660	d = flatview_to_dispatch(cache->fv);
	3661	cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
	3662
	3663	mr = cache->mrs.mr;
	3664	memory_region_ref(mr);
	3665	if (memory_access_is_direct(mr, is_write)) {
	3666	/* We don't care about the memory attributes here as we're only
	3667	* doing this if we found actual RAM, which behaves the same
	3668	* regardless of attributes; so UNSPECIFIED is fine.
	3669	*/
	3670	l = flatview_extend_translation(cache->fv, addr, len, mr,
	3671	cache->xlat, l, is_write,
	3672	MEMTXATTRS_UNSPECIFIED);
	3673	cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
	3674	} else {
	3675	cache->ptr = NULL;
	3676	}
	3677
	3678	cache->len = l;
	3679	cache->is_write = is_write;
	3680	return l;
	3681	}
	3682
	3683	void address_space_cache_invalidate(MemoryRegionCache *cache,
	3684	hwaddr addr,
	3685	hwaddr access_len)
	3686	{
	3687	assert(cache->is_write);
	3688	if (likely(cache->ptr)) {
	3689	invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
	3690	}
	3691	}
	3692
	3693	void address_space_cache_destroy(MemoryRegionCache *cache)
	3694	{
	3695	if (!cache->mrs.mr) {
	3696	return;
	3697	}
	3698
	3699	if (xen_enabled()) {
	3700	xen_invalidate_map_cache_entry(cache->ptr);
	3701	}
	3702	memory_region_unref(cache->mrs.mr);
	3703	flatview_unref(cache->fv);
	3704	cache->mrs.mr = NULL;
	3705	cache->fv = NULL;
	3706	}
	3707
	3708	/* Called from RCU critical section. This function has the same
	3709	* semantics as address_space_translate, but it only works on a
	3710	* predefined range of a MemoryRegion that was mapped with
	3711	* address_space_cache_init.
	3712	*/
	3713	static inline MemoryRegion *address_space_translate_cached(
	3714	MemoryRegionCache cache, hwaddr addr, hwaddr xlat,
	3715	hwaddr *plen, bool is_write, MemTxAttrs attrs)
	3716	{
	3717	MemoryRegionSection section;
	3718	MemoryRegion *mr;
	3719	IOMMUMemoryRegion *iommu_mr;
	3720	AddressSpace *target_as;
	3721
	3722	assert(!cache->ptr);
	3723	*xlat = addr + cache->xlat;
	3724
	3725	mr = cache->mrs.mr;
	3726	iommu_mr = memory_region_get_iommu(mr);
	3727	if (!iommu_mr) {
	3728	/* MMIO region. */
	3729	return mr;
	3730	}
	3731
	3732	section = address_space_translate_iommu(iommu_mr, xlat, plen,
	3733	NULL, is_write, true,
	3734	&target_as, attrs);
	3735	return section.mr;
	3736	}
	3737
	3738	/* Called from RCU critical section. address_space_read_cached uses this
	3739	* out of line function when the target is an MMIO or IOMMU region.
	3740	*/
	3741	void
	3742	address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
	3743	void *buf, hwaddr len)
	3744	{
	3745	hwaddr addr1, l;
	3746	MemoryRegion *mr;
	3747
	3748	l = len;
	3749	mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
	3750	MEMTXATTRS_UNSPECIFIED);
	3751	flatview_read_continue(cache->fv,
	3752	addr, MEMTXATTRS_UNSPECIFIED, buf, len,
	3753	addr1, l, mr);
	3754	}
	3755
	3756	/* Called from RCU critical section. address_space_write_cached uses this
	3757	* out of line function when the target is an MMIO or IOMMU region.
	3758	*/
	3759	void
	3760	address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
	3761	const void *buf, hwaddr len)
	3762	{
	3763	hwaddr addr1, l;
	3764	MemoryRegion *mr;
	3765
	3766	l = len;
	3767	mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
	3768	MEMTXATTRS_UNSPECIFIED);
	3769	flatview_write_continue(cache->fv,
	3770	addr, MEMTXATTRS_UNSPECIFIED, buf, len,
	3771	addr1, l, mr);
	3772	}
	3773
	3774	#define ARG1_DECL MemoryRegionCache *cache
	3775	#define ARG1 cache
	3776	#define SUFFIX _cached_slow
	3777	#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__)
	3778	#define RCU_READ_LOCK() ((void)0)
	3779	#define RCU_READ_UNLOCK() ((void)0)
	3780	#include "memory_ldst.inc.c"
	3781
	3782	/* virtual memory access for debug (includes writing to ROM) */
	3783	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3784	uint8_t *buf, target_ulong len, int is_write)
	3785	{
	3786	hwaddr phys_addr;
	3787	target_ulong l, page;
	3788
	3789	cpu_synchronize_state(cpu);
	3790	while (len > 0) {
	3791	int asidx;
	3792	MemTxAttrs attrs;
	3793
	3794	page = addr & TARGET_PAGE_MASK;
	3795	phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
	3796	asidx = cpu_asidx_from_attrs(cpu, attrs);
	3797	/* if no physical page mapped, return an error */
	3798	if (phys_addr == -1)
	3799	return -1;
	3800	l = (page + TARGET_PAGE_SIZE) - addr;
	3801	if (l > len)
	3802	l = len;
	3803	phys_addr += (addr & ~TARGET_PAGE_MASK);
	3804	if (is_write) {
	3805	address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
	3806	attrs, buf, l);
	3807	} else {
	3808	address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
	3809	attrs, buf, l, 0);
	3810	}
	3811	len -= l;
	3812	buf += l;
	3813	addr += l;
	3814	}
	3815	return 0;
	3816	}
	3817
	3818	/*
	3819	* Allows code that needs to deal with migration bitmaps etc to still be built
	3820	* target independent.
	3821	*/
	3822	size_t qemu_target_page_size(void)
	3823	{
	3824	return TARGET_PAGE_SIZE;
	3825	}
	3826
	3827	int qemu_target_page_bits(void)
	3828	{
	3829	return TARGET_PAGE_BITS;
	3830	}
	3831
	3832	int qemu_target_page_bits_min(void)
	3833	{
	3834	return TARGET_PAGE_BITS_MIN;
	3835	}
	3836	#endif
	3837
	3838	bool target_words_bigendian(void)
	3839	{
	3840	#if defined(TARGET_WORDS_BIGENDIAN)
	3841	return true;
	3842	#else
	3843	return false;
	3844	#endif
	3845	}
	3846
	3847	#ifndef CONFIG_USER_ONLY
	3848	bool cpu_physical_memory_is_io(hwaddr phys_addr)
	3849	{
	3850	MemoryRegion*mr;
	3851	hwaddr l = 1;
	3852	bool res;
	3853
	3854	RCU_READ_LOCK_GUARD();
	3855	mr = address_space_translate(&address_space_memory,
	3856	phys_addr, &phys_addr, &l, false,
	3857	MEMTXATTRS_UNSPECIFIED);
	3858
	3859	res = !(memory_region_is_ram(mr) \|\| memory_region_is_romd(mr));
	3860	return res;
	3861	}
	3862
	3863	int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
	3864	{
	3865	RAMBlock *block;
	3866	int ret = 0;
	3867
	3868	RCU_READ_LOCK_GUARD();
	3869	RAMBLOCK_FOREACH(block) {
	3870	ret = func(block, opaque);
	3871	if (ret) {
	3872	break;
	3873	}
	3874	}
	3875	return ret;
	3876	}
	3877
	3878	/*
	3879	* Unmap pages of memory from start to start+length such that
	3880	* they a) read as 0, b) Trigger whatever fault mechanism
	3881	* the OS provides for postcopy.
	3882	* The pages must be unmapped by the end of the function.
	3883	* Returns: 0 on success, none-0 on failure
	3884	*
	3885	*/
	3886	int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
	3887	{
	3888	int ret = -1;
	3889
	3890	uint8_t *host_startaddr = rb->host + start;
	3891
	3892	if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
	3893	error_report("ram_block_discard_range: Unaligned start address: %p",
	3894	host_startaddr);
	3895	goto err;
	3896	}
	3897
	3898	if ((start + length) <= rb->used_length) {
	3899	bool need_madvise, need_fallocate;
	3900	uint8_t *host_endaddr = host_startaddr + length;
	3901	if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
	3902	error_report("ram_block_discard_range: Unaligned end address: %p",
	3903	host_endaddr);
	3904	goto err;
	3905	}
	3906
	3907	errno = ENOTSUP; /* If we are missing MADVISE etc */
	3908
	3909	/* The logic here is messy;
	3910	* madvise DONTNEED fails for hugepages
	3911	* fallocate works on hugepages and shmem
	3912	*/
	3913	need_madvise = (rb->page_size == qemu_host_page_size);
	3914	need_fallocate = rb->fd != -1;
	3915	if (need_fallocate) {
	3916	/* For a file, this causes the area of the file to be zero'd
	3917	* if read, and for hugetlbfs also causes it to be unmapped
	3918	* so a userfault will trigger.
	3919	*/
	3920	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	3921	ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
	3922	start, length);
	3923	if (ret) {
	3924	ret = -errno;
	3925	error_report("ram_block_discard_range: Failed to fallocate "
	3926	"%s:%" PRIx64 " +%zx (%d)",
	3927	rb->idstr, start, length, ret);
	3928	goto err;
	3929	}
	3930	#else
	3931	ret = -ENOSYS;
	3932	error_report("ram_block_discard_range: fallocate not available/file"
	3933	"%s:%" PRIx64 " +%zx (%d)",
	3934	rb->idstr, start, length, ret);
	3935	goto err;
	3936	#endif
	3937	}
	3938	if (need_madvise) {
	3939	/* For normal RAM this causes it to be unmapped,
	3940	* for shared memory it causes the local mapping to disappear
	3941	* and to fall back on the file contents (which we just
	3942	* fallocate'd away).
	3943	*/
	3944	#if defined(CONFIG_MADVISE)
	3945	ret = madvise(host_startaddr, length, MADV_DONTNEED);
	3946	if (ret) {
	3947	ret = -errno;
	3948	error_report("ram_block_discard_range: Failed to discard range "
	3949	"%s:%" PRIx64 " +%zx (%d)",
	3950	rb->idstr, start, length, ret);
	3951	goto err;
	3952	}
	3953	#else
	3954	ret = -ENOSYS;
	3955	error_report("ram_block_discard_range: MADVISE not available"
	3956	"%s:%" PRIx64 " +%zx (%d)",
	3957	rb->idstr, start, length, ret);
	3958	goto err;
	3959	#endif
	3960	}
	3961	trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
	3962	need_madvise, need_fallocate, ret);
	3963	} else {
	3964	error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
	3965	"/%zx/" RAM_ADDR_FMT")",
	3966	rb->idstr, start, length, rb->used_length);
	3967	}
	3968
	3969	err:
	3970	return ret;
	3971	}
	3972
	3973	bool ramblock_is_pmem(RAMBlock *rb)
	3974	{
	3975	return rb->flags & RAM_PMEM;
	3976	}
	3977
	3978	#endif
	3979
	3980	void page_size_init(void)
	3981	{
	3982	/* NOTE: we can always suppose that qemu_host_page_size >=
	3983	TARGET_PAGE_SIZE */
	3984	if (qemu_host_page_size == 0) {
	3985	qemu_host_page_size = qemu_real_host_page_size;
	3986	}
	3987	if (qemu_host_page_size < TARGET_PAGE_SIZE) {
	3988	qemu_host_page_size = TARGET_PAGE_SIZE;
	3989	}
	3990	qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
	3991	}
	3992
	3993	#if !defined(CONFIG_USER_ONLY)
	3994
	3995	static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
	3996	{
	3997	if (start == end - 1) {
	3998	qemu_printf("\t%3d ", start);
	3999	} else {
	4000	qemu_printf("\t%3d..%-3d ", start, end - 1);
	4001	}
	4002	qemu_printf(" skip=%d ", skip);
	4003	if (ptr == PHYS_MAP_NODE_NIL) {
	4004	qemu_printf(" ptr=NIL");
	4005	} else if (!skip) {
	4006	qemu_printf(" ptr=#%d", ptr);
	4007	} else {
	4008	qemu_printf(" ptr=[%d]", ptr);
	4009	}
	4010	qemu_printf("\n");
	4011	}
	4012
	4013	#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
	4014	int128_sub((size), int128_one())) : 0)
	4015
	4016	void mtree_print_dispatch(AddressSpaceDispatch d, MemoryRegion root)
	4017	{
	4018	int i;
	4019
	4020	qemu_printf(" Dispatch\n");
	4021	qemu_printf(" Physical sections\n");
	4022
	4023	for (i = 0; i < d->map.sections_nb; ++i) {
	4024	MemoryRegionSection *s = d->map.sections + i;
	4025	const char *names[] = { " [unassigned]", " [not dirty]",
	4026	" [ROM]", " [watch]" };
	4027
	4028	qemu_printf(" #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx
	4029	" %s%s%s%s%s",
	4030	i,
	4031	s->offset_within_address_space,
	4032	s->offset_within_address_space + MR_SIZE(s->mr->size),
	4033	s->mr->name ? s->mr->name : "(noname)",
	4034	i < ARRAY_SIZE(names) ? names[i] : "",
	4035	s->mr == root ? " [ROOT]" : "",
	4036	s == d->mru_section ? " [MRU]" : "",
	4037	s->mr->is_iommu ? " [iommu]" : "");
	4038
	4039	if (s->mr->alias) {
	4040	qemu_printf(" alias=%s", s->mr->alias->name ?
	4041	s->mr->alias->name : "noname");
	4042	}
	4043	qemu_printf("\n");
	4044	}
	4045
	4046	qemu_printf(" Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
	4047	P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
	4048	for (i = 0; i < d->map.nodes_nb; ++i) {
	4049	int j, jprev;
	4050	PhysPageEntry prev;
	4051	Node *n = d->map.nodes + i;
	4052
	4053	qemu_printf(" [%d]\n", i);
	4054
	4055	for (j = 0, jprev = 0, prev = n[0]; j < ARRAY_SIZE(n); ++j) {
	4056	PhysPageEntry pe = n + j;
	4057
	4058	if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
	4059	continue;
	4060	}
	4061
	4062	mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
	4063
	4064	jprev = j;
	4065	prev = *pe;
	4066	}
	4067
	4068	if (jprev != ARRAY_SIZE(*n)) {
	4069	mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
	4070	}
	4071	}
	4072	}
	4073
	4074	#endif