Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Virtual page mapping
	3	*
	4	* Copyright (c) 2003 Fabrice Bellard
	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
	18	*/
	19
	20	#include "qemu/osdep.h"
	21	#include "qemu-common.h"
	22	#include "qapi/error.h"
	23
	24	#include "qemu/cutils.h"
	25	#include "cpu.h"
	26	#include "exec/exec-all.h"
	27	#include "exec/target_page.h"
	28	#include "tcg.h"
	29	#include "hw/qdev-core.h"
	30	#include "hw/qdev-properties.h"
	31	#if !defined(CONFIG_USER_ONLY)
	32	#include "hw/boards.h"
	33	#include "hw/xen/xen.h"
	34	#endif
	35	#include "sysemu/kvm.h"
	36	#include "sysemu/sysemu.h"
	37	#include "sysemu/tcg.h"
	38	#include "qemu/timer.h"
	39	#include "qemu/config-file.h"
	40	#include "qemu/error-report.h"
	41	#include "qemu/qemu-print.h"
	42	#if defined(CONFIG_USER_ONLY)
	43	#include "qemu.h"
	44	#else /* !CONFIG_USER_ONLY */
	45	#include "exec/memory.h"
	46	#include "exec/ioport.h"
	47	#include "sysemu/dma.h"
	48	#include "sysemu/hostmem.h"
	49	#include "sysemu/hw_accel.h"
	50	#include "exec/address-spaces.h"
	51	#include "sysemu/xen-mapcache.h"
	52	#include "trace-root.h"
	53
	54	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	55	#include <linux/falloc.h>
	56	#endif
	57
	58	#endif
	59	#include "qemu/rcu_queue.h"
	60	#include "qemu/main-loop.h"
	61	#include "translate-all.h"
	62	#include "sysemu/replay.h"
	63
	64	#include "exec/memory-internal.h"
	65	#include "exec/ram_addr.h"
	66	#include "exec/log.h"
	67
	68	#include "qemu/pmem.h"
	69
	70	#include "migration/vmstate.h"
	71
	72	#include "qemu/range.h"
	73	#ifndef _WIN32
	74	#include "qemu/mmap-alloc.h"
	75	#endif
	76
	77	#include "monitor/monitor.h"
	78
	79	//#define DEBUG_SUBPAGE
	80
	81	#if !defined(CONFIG_USER_ONLY)
	82	/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
	83	* are protected by the ramlist lock.
	84	*/
	85	RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
	86
	87	static MemoryRegion *system_memory;
	88	static MemoryRegion *system_io;
	89
	90	AddressSpace address_space_io;
	91	AddressSpace address_space_memory;
	92
	93	static MemoryRegion io_mem_unassigned;
	94	#endif
	95
	96	CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
	97
	98	/* current CPU in the current thread. It is only valid inside
	99	cpu_exec() */
	100	__thread CPUState *current_cpu;
	101	/* 0 = Do not count executed instructions.
	102	1 = Precise instruction counting.
	103	2 = Adaptive rate instruction counting. */
	104	int use_icount;
	105
	106	uintptr_t qemu_host_page_size;
	107	intptr_t qemu_host_page_mask;
	108
	109	#if !defined(CONFIG_USER_ONLY)
	110
	typedef struct PhysPageEntry PhysPageEntry;

	/*
	 * One slot of the multi-level physical page map.  The two bit-fields
	 * together occupy 32 bits (6 + 26).
	 */
	struct PhysPageEntry {
	    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
	    uint32_t skip : 6;
	    /* index into phys_sections (!skip) or phys_map_nodes (skip) */
	    uint32_t ptr : 26;
	};

	/* All-ones value of the 26-bit ptr field, used as "no node here". */
	#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

	/* Size of the L2 (and L3, etc) page tables. */
	#define ADDR_SPACE_BITS 64

	#define P_L2_BITS 9
	#define P_L2_SIZE (1 << P_L2_BITS)

	/* Number of levels needed to cover ADDR_SPACE_BITS - TARGET_PAGE_BITS bits. */
	#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

	/* One node of the map: P_L2_SIZE entries. */
	typedef PhysPageEntry Node[P_L2_SIZE];
	131
	132	typedef struct PhysPageMap {
	133	struct rcu_head rcu;
	134
	135	unsigned sections_nb;
	136	unsigned sections_nb_alloc;
	137	unsigned nodes_nb;
	138	unsigned nodes_nb_alloc;
	139	Node *nodes;
	140	MemoryRegionSection *sections;
	141	} PhysPageMap;
	142
	143	struct AddressSpaceDispatch {
	144	MemoryRegionSection *mru_section;
	145	/* This is a multi-level map on the physical address space.
	146	* The bottom level has pointers to MemoryRegionSections.
	147	*/
	148	PhysPageEntry phys_map;
	149	PhysPageMap map;
	150	};
	151
	152	#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
	153	typedef struct subpage_t {
	154	MemoryRegion iomem;
	155	FlatView *fv;
	156	hwaddr base;
	157	uint16_t sub_section[];
	158	} subpage_t;
	159
	160	#define PHYS_SECTION_UNASSIGNED 0
	161
	162	static void io_mem_init(void);
	163	static void memory_map_init(void);
	164	static void tcg_log_global_after_sync(MemoryListener *listener);
	165	static void tcg_commit(MemoryListener *listener);
	166
	167	/**
	168	* CPUAddressSpace: all the information a CPU needs about an AddressSpace
	169	* @cpu: the CPU whose AddressSpace this is
	170	* @as: the AddressSpace itself
	171	* @memory_dispatch: its dispatch pointer (cached, RCU protected)
	172	* @tcg_as_listener: listener for tracking changes to the AddressSpace
	173	*/
	174	struct CPUAddressSpace {
	175	CPUState *cpu;
	176	AddressSpace *as;
	177	struct AddressSpaceDispatch *memory_dispatch;
	178	MemoryListener tcg_as_listener;
	179	};
	180
	181	struct DirtyBitmapSnapshot {
	182	ram_addr_t start;
	183	ram_addr_t end;
	184	unsigned long dirty[];
	185	};
	186
	187	#endif
	188
	189	#if !defined(CONFIG_USER_ONLY)
	190
	191	static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
	192	{
	193	static unsigned alloc_hint = 16;
	194	if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
	195	map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
	196	map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
	197	alloc_hint = map->nodes_nb_alloc;
	198	}
	199	}
	200
	201	static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
	202	{
	203	unsigned i;
	204	uint32_t ret;
	205	PhysPageEntry e;
	206	PhysPageEntry *p;
	207
	208	ret = map->nodes_nb++;
	209	p = map->nodes[ret];
	210	assert(ret != PHYS_MAP_NODE_NIL);
	211	assert(ret != map->nodes_nb_alloc);
	212
	213	e.skip = leaf ? 0 : 1;
	214	e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
	215	for (i = 0; i < P_L2_SIZE; ++i) {
	216	memcpy(&p[i], &e, sizeof(e));
	217	}
	218	return ret;
	219	}
	220
	221	static void phys_page_set_level(PhysPageMap map, PhysPageEntry lp,
	222	hwaddr index, uint64_t nb, uint16_t leaf,
	223	int level)
	224	{
	225	PhysPageEntry *p;
	226	hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
	227
	228	if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
	229	lp->ptr = phys_map_node_alloc(map, level == 0);
	230	}
	231	p = map->nodes[lp->ptr];
	232	lp = &p[(index >> (level P_L2_BITS)) & (P_L2_SIZE - 1)];
	233
	234	while (*nb && lp < &p[P_L2_SIZE]) {
	235	if ((index & (step - 1)) == 0 && nb >= step) {
	236	lp->skip = 0;
	237	lp->ptr = leaf;
	238	*index += step;
	239	*nb -= step;
	240	} else {
	241	phys_page_set_level(map, lp, index, nb, leaf, level - 1);
	242	}
	243	++lp;
	244	}
	245	}
	246
	247	static void phys_page_set(AddressSpaceDispatch *d,
	248	hwaddr index, uint64_t nb,
	249	uint16_t leaf)
	250	{
	251	/* Wildly overreserve - it doesn't matter much. */
	252	phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
	253
	254	phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
	255	}
	256
	257	/* Compact a non leaf page entry. Simply detect that the entry has a single child,
	258	* and update our entry so we can skip it and go directly to the destination.
	259	*/
	260	static void phys_page_compact(PhysPageEntry lp, Node nodes)
	261	{
	262	unsigned valid_ptr = P_L2_SIZE;
	263	int valid = 0;
	264	PhysPageEntry *p;
	265	int i;
	266
	267	if (lp->ptr == PHYS_MAP_NODE_NIL) {
	268	return;
	269	}
	270
	271	p = nodes[lp->ptr];
	272	for (i = 0; i < P_L2_SIZE; i++) {
	273	if (p[i].ptr == PHYS_MAP_NODE_NIL) {
	274	continue;
	275	}
	276
	277	valid_ptr = i;
	278	valid++;
	279	if (p[i].skip) {
	280	phys_page_compact(&p[i], nodes);
	281	}
	282	}
	283
	284	/* We can only compress if there's only one child. */
	285	if (valid != 1) {
	286	return;
	287	}
	288
	289	assert(valid_ptr < P_L2_SIZE);
	290
	291	/* Don't compress if it won't fit in the # of bits we have. */
	292	if (P_L2_LEVELS >= (1 << 6) &&
	293	lp->skip + p[valid_ptr].skip >= (1 << 6)) {
	294	return;
	295	}
	296
	297	lp->ptr = p[valid_ptr].ptr;
	298	if (!p[valid_ptr].skip) {
	299	/* If our only child is a leaf, make this a leaf. */
	300	/* By design, we should have made this node a leaf to begin with so we
	301	* should never reach here.
	302	* But since it's so simple to handle this, let's do it just in case we
	303	* change this rule.
	304	*/
	305	lp->skip = 0;
	306	} else {
	307	lp->skip += p[valid_ptr].skip;
	308	}
	309	}
	310
	311	void address_space_dispatch_compact(AddressSpaceDispatch *d)
	312	{
	313	if (d->phys_map.skip) {
	314	phys_page_compact(&d->phys_map, d->map.nodes);
	315	}
	316	}
	317
	318	static inline bool section_covers_addr(const MemoryRegionSection *section,
	319	hwaddr addr)
	320	{
	321	/* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
	322	* the section must cover the entire address space.
	323	*/
	324	return int128_gethi(section->size) \|\|
	325	range_covers_byte(section->offset_within_address_space,
	326	int128_getlo(section->size), addr);
	327	}
	328
	329	static MemoryRegionSection phys_page_find(AddressSpaceDispatch d, hwaddr addr)
	330	{
	331	PhysPageEntry lp = d->phys_map, *p;
	332	Node *nodes = d->map.nodes;
	333	MemoryRegionSection *sections = d->map.sections;
	334	hwaddr index = addr >> TARGET_PAGE_BITS;
	335	int i;
	336
	337	for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
	338	if (lp.ptr == PHYS_MAP_NODE_NIL) {
	339	return &sections[PHYS_SECTION_UNASSIGNED];
	340	}
	341	p = nodes[lp.ptr];
	342	lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
	343	}
	344
	345	if (section_covers_addr(&sections[lp.ptr], addr)) {
	346	return &sections[lp.ptr];
	347	} else {
	348	return &sections[PHYS_SECTION_UNASSIGNED];
	349	}
	350	}
	351
	352	/* Called from RCU critical section */
	353	static MemoryRegionSection address_space_lookup_region(AddressSpaceDispatch d,
	354	hwaddr addr,
	355	bool resolve_subpage)
	356	{
	357	MemoryRegionSection *section = atomic_read(&d->mru_section);
	358	subpage_t *subpage;
	359
	360	if (!section \|\| section == &d->map.sections[PHYS_SECTION_UNASSIGNED] \|\|
	361	!section_covers_addr(section, addr)) {
	362	section = phys_page_find(d, addr);
	363	atomic_set(&d->mru_section, section);
	364	}
	365	if (resolve_subpage && section->mr->subpage) {
	366	subpage = container_of(section->mr, subpage_t, iomem);
	367	section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
	368	}
	369	return section;
	370	}
	371
	372	/* Called from RCU critical section */
	373	static MemoryRegionSection *
	374	address_space_translate_internal(AddressSpaceDispatch d, hwaddr addr, hwaddr xlat,
	375	hwaddr *plen, bool resolve_subpage)
	376	{
	377	MemoryRegionSection *section;
	378	MemoryRegion *mr;
	379	Int128 diff;
	380
	381	section = address_space_lookup_region(d, addr, resolve_subpage);
	382	/* Compute offset within MemoryRegionSection */
	383	addr -= section->offset_within_address_space;
	384
	385	/* Compute offset within MemoryRegion */
	386	*xlat = addr + section->offset_within_region;
	387
	388	mr = section->mr;
	389
	390	/* MMIO registers can be expected to perform full-width accesses based only
	391	* on their address, without considering adjacent registers that could
	392	* decode to completely different MemoryRegions. When such registers
	393	* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
	394	* regions overlap wildly. For this reason we cannot clamp the accesses
	395	* here.
	396	*
	397	* If the length is small (as is the case for address_space_ldl/stl),
	398	* everything works fine. If the incoming length is large, however,
	399	* the caller really has to do the clamping through memory_access_size.
	400	*/
	401	if (memory_region_is_ram(mr)) {
	402	diff = int128_sub(section->size, int128_make64(addr));
	403	plen = int128_get64(int128_min(diff, int128_make64(plen)));
	404	}
	405	return section;
	406	}
	407
	408	/**
	409	* address_space_translate_iommu - translate an address through an IOMMU
	410	* memory region and then through the target address space.
	411	*
	412	* @iommu_mr: the IOMMU memory region that we start the translation from
	413	* @addr: the address to be translated through the MMU
	414	* @xlat: the translated address offset within the destination memory region.
	415	* It cannot be %NULL.
	416	* @plen_out: valid read/write length of the translated address. It
	417	* cannot be %NULL.
	418	* @page_mask_out: page mask for the translated address. This
	419	* should only be meaningful for IOMMU translated
	420	* addresses, since there may be huge pages that this bit
	421	* would tell. It can be %NULL if we don't care about it.
	422	* @is_write: whether the translation operation is for write
	423	* @is_mmio: whether this can be MMIO, set true if it can
	424	* @target_as: the address space targeted by the IOMMU
	425	* @attrs: transaction attributes
	426	*
	427	* This function is called from RCU critical section. It is the common
	428	* part of flatview_do_translate and address_space_translate_cached.
	429	*/
	430	static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
	431	hwaddr *xlat,
	432	hwaddr *plen_out,
	433	hwaddr *page_mask_out,
	434	bool is_write,
	435	bool is_mmio,
	436	AddressSpace **target_as,
	437	MemTxAttrs attrs)
	438	{
	439	MemoryRegionSection *section;
	440	hwaddr page_mask = (hwaddr)-1;
	441
	442	do {
	443	hwaddr addr = *xlat;
	444	IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	445	int iommu_idx = 0;
	446	IOMMUTLBEntry iotlb;
	447
	448	if (imrc->attrs_to_index) {
	449	iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
	450	}
	451
	452	iotlb = imrc->translate(iommu_mr, addr, is_write ?
	453	IOMMU_WO : IOMMU_RO, iommu_idx);
	454
	455	if (!(iotlb.perm & (1 << is_write))) {
	456	goto unassigned;
	457	}
	458
	459	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	460	\| (addr & iotlb.addr_mask));
	461	page_mask &= iotlb.addr_mask;
	462	plen_out = MIN(plen_out, (addr \| iotlb.addr_mask) - addr + 1);
	463	*target_as = iotlb.target_as;
	464
	465	section = address_space_translate_internal(
	466	address_space_to_dispatch(iotlb.target_as), addr, xlat,
	467	plen_out, is_mmio);
	468
	469	iommu_mr = memory_region_get_iommu(section->mr);
	470	} while (unlikely(iommu_mr));
	471
	472	if (page_mask_out) {
	473	*page_mask_out = page_mask;
	474	}
	475	return *section;
	476
	477	unassigned:
	478	return (MemoryRegionSection) { .mr = &io_mem_unassigned };
	479	}
	480
	481	/**
	482	* flatview_do_translate - translate an address in FlatView
	483	*
	484	* @fv: the flat view that we want to translate on
	485	* @addr: the address to be translated in above address space
	486	* @xlat: the translated address offset within memory region. It
	487	* cannot be @NULL.
	488	* @plen_out: valid read/write length of the translated address. It
	489	* can be @NULL when we don't care about it.
	490	* @page_mask_out: page mask for the translated address. This
	491	* should only be meaningful for IOMMU translated
	492	* addresses, since there may be huge pages that this bit
	493	* would tell. It can be @NULL if we don't care about it.
	494	* @is_write: whether the translation operation is for write
	495	* @is_mmio: whether this can be MMIO, set true if it can
	496	* @target_as: the address space targeted by the IOMMU
	497	* @attrs: memory transaction attributes
	498	*
	499	* This function is called from RCU critical section
	500	*/
	501	static MemoryRegionSection flatview_do_translate(FlatView *fv,
	502	hwaddr addr,
	503	hwaddr *xlat,
	504	hwaddr *plen_out,
	505	hwaddr *page_mask_out,
	506	bool is_write,
	507	bool is_mmio,
	508	AddressSpace **target_as,
	509	MemTxAttrs attrs)
	510	{
	511	MemoryRegionSection *section;
	512	IOMMUMemoryRegion *iommu_mr;
	513	hwaddr plen = (hwaddr)(-1);
	514
	515	if (!plen_out) {
	516	plen_out = &plen;
	517	}
	518
	519	section = address_space_translate_internal(
	520	flatview_to_dispatch(fv), addr, xlat,
	521	plen_out, is_mmio);
	522
	523	iommu_mr = memory_region_get_iommu(section->mr);
	524	if (unlikely(iommu_mr)) {
	525	return address_space_translate_iommu(iommu_mr, xlat,
	526	plen_out, page_mask_out,
	527	is_write, is_mmio,
	528	target_as, attrs);
	529	}
	530	if (page_mask_out) {
	531	/* Not behind an IOMMU, use default page size. */
	532	*page_mask_out = ~TARGET_PAGE_MASK;
	533	}
	534
	535	return *section;
	536	}
	537
	538	/* Called from RCU critical section */
	539	IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
	540	bool is_write, MemTxAttrs attrs)
	541	{
	542	MemoryRegionSection section;
	543	hwaddr xlat, page_mask;
	544
	545	/*
	546	* This can never be MMIO, and we don't really care about plen,
	547	* but page mask.
	548	*/
	549	section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
	550	NULL, &page_mask, is_write, false, &as,
	551	attrs);
	552
	553	/* Illegal translation */
	554	if (section.mr == &io_mem_unassigned) {
	555	goto iotlb_fail;
	556	}
	557
	558	/* Convert memory region offset into address space offset */
	559	xlat += section.offset_within_address_space -
	560	section.offset_within_region;
	561
	562	return (IOMMUTLBEntry) {
	563	.target_as = as,
	564	.iova = addr & ~page_mask,
	565	.translated_addr = xlat & ~page_mask,
	566	.addr_mask = page_mask,
	567	/* IOTLBs are for DMAs, and DMA only allows on RAMs. */
	568	.perm = IOMMU_RW,
	569	};
	570
	571	iotlb_fail:
	572	return (IOMMUTLBEntry) {0};
	573	}
	574
	575	/* Called from RCU critical section */
	576	MemoryRegion flatview_translate(FlatView fv, hwaddr addr, hwaddr *xlat,
	577	hwaddr *plen, bool is_write,
	578	MemTxAttrs attrs)
	579	{
	580	MemoryRegion *mr;
	581	MemoryRegionSection section;
	582	AddressSpace *as = NULL;
	583
	584	/* This can be MMIO, so setup MMIO bit. */
	585	section = flatview_do_translate(fv, addr, xlat, plen, NULL,
	586	is_write, true, &as, attrs);
	587	mr = section.mr;
	588
	589	if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
	590	hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
	591	plen = MIN(page, plen);
	592	}
	593
	594	return mr;
	595	}
	596
	597	typedef struct TCGIOMMUNotifier {
	598	IOMMUNotifier n;
	599	MemoryRegion *mr;
	600	CPUState *cpu;
	601	int iommu_idx;
	602	bool active;
	603	} TCGIOMMUNotifier;
	604
	605	static void tcg_iommu_unmap_notify(IOMMUNotifier n, IOMMUTLBEntry iotlb)
	606	{
	607	TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
	608
	609	if (!notifier->active) {
	610	return;
	611	}
	612	tlb_flush(notifier->cpu);
	613	notifier->active = false;
	614	/* We leave the notifier struct on the list to avoid reallocating it later.
	615	* Generally the number of IOMMUs a CPU deals with will be small.
	616	* In any case we can't unregister the iommu notifier from a notify
	617	* callback.
	618	*/
	619	}
	620
	621	static void tcg_register_iommu_notifier(CPUState *cpu,
	622	IOMMUMemoryRegion *iommu_mr,
	623	int iommu_idx)
	624	{
	625	/* Make sure this CPU has an IOMMU notifier registered for this
	626	* IOMMU/IOMMU index combination, so that we can flush its TLB
	627	* when the IOMMU tells us the mappings we've cached have changed.
	628	*/
	629	MemoryRegion *mr = MEMORY_REGION(iommu_mr);
	630	TCGIOMMUNotifier *notifier;
	631	Error *err = NULL;
	632	int i, ret;
	633
	634	for (i = 0; i < cpu->iommu_notifiers->len; i++) {
	635	notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
	636	if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
	637	break;
	638	}
	639	}
	640	if (i == cpu->iommu_notifiers->len) {
	641	/* Not found, add a new entry at the end of the array */
	642	cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
	643	notifier = g_new0(TCGIOMMUNotifier, 1);
	644	g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
	645
	646	notifier->mr = mr;
	647	notifier->iommu_idx = iommu_idx;
	648	notifier->cpu = cpu;
	649	/* Rather than trying to register interest in the specific part
	650	* of the iommu's address space that we've accessed and then
	651	* expand it later as subsequent accesses touch more of it, we
	652	* just register interest in the whole thing, on the assumption
	653	* that iommu reconfiguration will be rare.
	654	*/
	655	iommu_notifier_init(&notifier->n,
	656	tcg_iommu_unmap_notify,
	657	IOMMU_NOTIFIER_UNMAP,
	658	0,
	659	HWADDR_MAX,
	660	iommu_idx);
	661	ret = memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
	662	&err);
	663	if (ret) {
	664	error_report_err(err);
	665	exit(1);
	666	}
	667	}
	668
	669	if (!notifier->active) {
	670	notifier->active = true;
	671	}
	672	}
	673
	674	static void tcg_iommu_free_notifier_list(CPUState *cpu)
	675	{
	676	/* Destroy the CPU's notifier list */
	677	int i;
	678	TCGIOMMUNotifier *notifier;
	679
	680	for (i = 0; i < cpu->iommu_notifiers->len; i++) {
	681	notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
	682	memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
	683	g_free(notifier);
	684	}
	685	g_array_free(cpu->iommu_notifiers, true);
	686	}
	687
	688	/* Called from RCU critical section */
	689	MemoryRegionSection *
	690	address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
	691	hwaddr xlat, hwaddr plen,
	692	MemTxAttrs attrs, int *prot)
	693	{
	694	MemoryRegionSection *section;
	695	IOMMUMemoryRegion *iommu_mr;
	696	IOMMUMemoryRegionClass *imrc;
	697	IOMMUTLBEntry iotlb;
	698	int iommu_idx;
	699	AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
	700
	701	for (;;) {
	702	section = address_space_translate_internal(d, addr, &addr, plen, false);
	703
	704	iommu_mr = memory_region_get_iommu(section->mr);
	705	if (!iommu_mr) {
	706	break;
	707	}
	708
	709	imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	710
	711	iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
	712	tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
	713	/* We need all the permissions, so pass IOMMU_NONE so the IOMMU
	714	* doesn't short-cut its translation table walk.
	715	*/
	716	iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
	717	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	718	\| (addr & iotlb.addr_mask));
	719	/* Update the caller's prot bits to remove permissions the IOMMU
	720	* is giving us a failure response for. If we get down to no
	721	* permissions left at all we can give up now.
	722	*/
	723	if (!(iotlb.perm & IOMMU_RO)) {
	724	*prot &= ~(PAGE_READ \| PAGE_EXEC);
	725	}
	726	if (!(iotlb.perm & IOMMU_WO)) {
	727	*prot &= ~PAGE_WRITE;
	728	}
	729
	730	if (!*prot) {
	731	goto translate_fail;
	732	}
	733
	734	d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
	735	}
	736
	737	assert(!memory_region_is_iommu(section->mr));
	738	*xlat = addr;
	739	return section;
	740
	741	translate_fail:
	742	return &d->map.sections[PHYS_SECTION_UNASSIGNED];
	743	}
	744	#endif
	745
	746	#if !defined(CONFIG_USER_ONLY)
	747
	748	static int cpu_common_post_load(void *opaque, int version_id)
	749	{
	750	CPUState *cpu = opaque;
	751
	752	/* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
	753	version_id is increased. */
	754	cpu->interrupt_request &= ~0x01;
	755	tlb_flush(cpu);
	756
	757	/* loadvm has just updated the content of RAM, bypassing the
	758	* usual mechanisms that ensure we flush TBs for writes to
	759	* memory we've translated code from. So we must flush all TBs,
	760	* which will now be stale.
	761	*/
	762	tb_flush(cpu);
	763
	764	return 0;
	765	}
	766
	767	static int cpu_common_pre_load(void *opaque)
	768	{
	769	CPUState *cpu = opaque;
	770
	771	cpu->exception_index = -1;
	772
	773	return 0;
	774	}
	775
	776	static bool cpu_common_exception_index_needed(void *opaque)
	777	{
	778	CPUState *cpu = opaque;
	779
	780	return tcg_enabled() && cpu->exception_index != -1;
	781	}
	782
	783	static const VMStateDescription vmstate_cpu_common_exception_index = {
	784	.name = "cpu_common/exception_index",
	785	.version_id = 1,
	786	.minimum_version_id = 1,
	787	.needed = cpu_common_exception_index_needed,
	788	.fields = (VMStateField[]) {
	789	VMSTATE_INT32(exception_index, CPUState),
	790	VMSTATE_END_OF_LIST()
	791	}
	792	};
	793
	794	static bool cpu_common_crash_occurred_needed(void *opaque)
	795	{
	796	CPUState *cpu = opaque;
	797
	798	return cpu->crash_occurred;
	799	}
	800
	801	static const VMStateDescription vmstate_cpu_common_crash_occurred = {
	802	.name = "cpu_common/crash_occurred",
	803	.version_id = 1,
	804	.minimum_version_id = 1,
	805	.needed = cpu_common_crash_occurred_needed,
	806	.fields = (VMStateField[]) {
	807	VMSTATE_BOOL(crash_occurred, CPUState),
	808	VMSTATE_END_OF_LIST()
	809	}
	810	};
	811
	812	const VMStateDescription vmstate_cpu_common = {
	813	.name = "cpu_common",
	814	.version_id = 1,
	815	.minimum_version_id = 1,
	816	.pre_load = cpu_common_pre_load,
	817	.post_load = cpu_common_post_load,
	818	.fields = (VMStateField[]) {
	819	VMSTATE_UINT32(halted, CPUState),
	820	VMSTATE_UINT32(interrupt_request, CPUState),
	821	VMSTATE_END_OF_LIST()
	822	},
	823	.subsections = (const VMStateDescription*[]) {
	824	&vmstate_cpu_common_exception_index,
	825	&vmstate_cpu_common_crash_occurred,
	826	NULL
	827	}
	828	};
	829
	830	#endif
	831
	832	CPUState *qemu_get_cpu(int index)
	833	{
	834	CPUState *cpu;
	835
	836	CPU_FOREACH(cpu) {
	837	if (cpu->cpu_index == index) {
	838	return cpu;
	839	}
	840	}
	841
	842	return NULL;
	843	}
	844
	845	#if !defined(CONFIG_USER_ONLY)
	846	void cpu_address_space_init(CPUState *cpu, int asidx,
	847	const char prefix, MemoryRegion mr)
	848	{
	849	CPUAddressSpace *newas;
	850	AddressSpace *as = g_new0(AddressSpace, 1);
	851	char *as_name;
	852
	853	assert(mr);
	854	as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
	855	address_space_init(as, mr, as_name);
	856	g_free(as_name);
	857
	858	/* Target code should have set num_ases before calling us */
	859	assert(asidx < cpu->num_ases);
	860
	861	if (asidx == 0) {
	862	/* address space 0 gets the convenience alias */
	863	cpu->as = as;
	864	}
	865
	866	/* KVM cannot currently support multiple address spaces. */
	867	assert(asidx == 0 \|\| !kvm_enabled());
	868
	869	if (!cpu->cpu_ases) {
	870	cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
	871	}
	872
	873	newas = &cpu->cpu_ases[asidx];
	874	newas->cpu = cpu;
	875	newas->as = as;
	876	if (tcg_enabled()) {
	877	newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
	878	newas->tcg_as_listener.commit = tcg_commit;
	879	memory_listener_register(&newas->tcg_as_listener, as);
	880	}
	881	}
	882
	883	AddressSpace cpu_get_address_space(CPUState cpu, int asidx)
	884	{
	885	/* Return the AddressSpace corresponding to the specified index */
	886	return cpu->cpu_ases[asidx].as;
	887	}
	888	#endif
	889
	890	void cpu_exec_unrealizefn(CPUState *cpu)
	891	{
	892	CPUClass *cc = CPU_GET_CLASS(cpu);
	893
	894	cpu_list_remove(cpu);
	895
	896	if (cc->vmsd != NULL) {
	897	vmstate_unregister(NULL, cc->vmsd, cpu);
	898	}
	899	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	900	vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
	901	}
	902	#ifndef CONFIG_USER_ONLY
	903	tcg_iommu_free_notifier_list(cpu);
	904	#endif
	905	}
	906
	907	Property cpu_common_props[] = {
	908	#ifndef CONFIG_USER_ONLY
	909	/* Create a memory property for softmmu CPU object,
	910	* so users can wire up its memory. (This can't go in hw/core/cpu.c
	911	* because that file is compiled only once for both user-mode
	912	* and system builds.) The default if no link is set up is to use
	913	* the system address space.
	914	*/
	915	DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
	916	MemoryRegion *),
	917	#endif
	918	DEFINE_PROP_END_OF_LIST(),
	919	};
	920
	921	void cpu_exec_initfn(CPUState *cpu)
	922	{
	923	cpu->as = NULL;
	924	cpu->num_ases = 0;
	925
	926	#ifndef CONFIG_USER_ONLY
	927	cpu->thread_id = qemu_get_thread_id();
	928	cpu->memory = system_memory;
	929	object_ref(OBJECT(cpu->memory));
	930	#endif
	931	}
	932
	933	void cpu_exec_realizefn(CPUState cpu, Error *errp)
	934	{
	935	CPUClass *cc = CPU_GET_CLASS(cpu);
	936	static bool tcg_target_initialized;
	937
	938	cpu_list_add(cpu);
	939
	940	if (tcg_enabled() && !tcg_target_initialized) {
	941	tcg_target_initialized = true;
	942	cc->tcg_initialize();
	943	}
	944	tlb_init(cpu);
	945
	946	qemu_plugin_vcpu_init_hook(cpu);
	947
	948	#ifndef CONFIG_USER_ONLY
	949	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	950	vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
	951	}
	952	if (cc->vmsd != NULL) {
	953	vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
	954	}
	955
	956	cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
	957	#endif
	958	}
	959
	960	const char parse_cpu_option(const char cpu_option)
	961	{
	962	ObjectClass *oc;
	963	CPUClass *cc;
	964	gchar **model_pieces;
	965	const char *cpu_type;
	966
	967	model_pieces = g_strsplit(cpu_option, ",", 2);
	968	if (!model_pieces[0]) {
	969	error_report("-cpu option cannot be empty");
	970	exit(1);
	971	}
	972
	973	oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
	974	if (oc == NULL) {
	975	error_report("unable to find CPU model '%s'", model_pieces[0]);
	976	g_strfreev(model_pieces);
	977	exit(EXIT_FAILURE);
	978	}
	979
	980	cpu_type = object_class_get_name(oc);
	981	cc = CPU_CLASS(oc);
	982	cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
	983	g_strfreev(model_pieces);
	984	return cpu_type;
	985	}
	986
	987	#if defined(CONFIG_USER_ONLY)
	988	void tb_invalidate_phys_addr(target_ulong addr)
	989	{
	990	mmap_lock();
	991	tb_invalidate_phys_page_range(addr, addr + 1);
	992	mmap_unlock();
	993	}
	994
	995	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	996	{
	997	tb_invalidate_phys_addr(pc);
	998	}
	999	#else
	1000	void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
	1001	{
	1002	ram_addr_t ram_addr;
	1003	MemoryRegion *mr;
	1004	hwaddr l = 1;
	1005
	1006	if (!tcg_enabled()) {
	1007	return;
	1008	}
	1009
	1010	RCU_READ_LOCK_GUARD();
	1011	mr = address_space_translate(as, addr, &addr, &l, false, attrs);
	1012	if (!(memory_region_is_ram(mr)
	1013	\|\| memory_region_is_romd(mr))) {
	1014	return;
	1015	}
	1016	ram_addr = memory_region_get_ram_addr(mr) + addr;
	1017	tb_invalidate_phys_page_range(ram_addr, ram_addr + 1);
	1018	}
	1019
	1020	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	1021	{
	1022	MemTxAttrs attrs;
	1023	hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
	1024	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	1025	if (phys != -1) {
	1026	/* Locks grabbed by tb_invalidate_phys_addr */
	1027	tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
	1028	phys \| (pc & ~TARGET_PAGE_MASK), attrs);
	1029	}
	1030	}
	1031	#endif
	1032
	1033	#ifndef CONFIG_USER_ONLY
	1034	/* Add a watchpoint. */
	1035	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	1036	int flags, CPUWatchpoint **watchpoint)
	1037	{
	1038	CPUWatchpoint *wp;
	1039
	1040	/* forbid ranges which are empty or run off the end of the address space */
	1041	if (len == 0 \|\| (addr + len - 1) < addr) {
	1042	error_report("tried to set invalid watchpoint at %"
	1043	VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
	1044	return -EINVAL;
	1045	}
	1046	wp = g_malloc(sizeof(*wp));
	1047
	1048	wp->vaddr = addr;
	1049	wp->len = len;
	1050	wp->flags = flags;
	1051
	1052	/* keep all GDB-injected watchpoints in front */
	1053	if (flags & BP_GDB) {
	1054	QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
	1055	} else {
	1056	QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
	1057	}
	1058
	1059	tlb_flush_page(cpu, addr);
	1060
	1061	if (watchpoint)
	1062	*watchpoint = wp;
	1063	return 0;
	1064	}
	1065
	1066	/* Remove a specific watchpoint. */
	1067	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	1068	int flags)
	1069	{
	1070	CPUWatchpoint *wp;
	1071
	1072	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1073	if (addr == wp->vaddr && len == wp->len
	1074	&& flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
	1075	cpu_watchpoint_remove_by_ref(cpu, wp);
	1076	return 0;
	1077	}
	1078	}
	1079	return -ENOENT;
	1080	}
	1081
	1082	/* Remove a specific watchpoint by reference. */
	1083	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	1084	{
	1085	QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
	1086
	1087	tlb_flush_page(cpu, watchpoint->vaddr);
	1088
	1089	g_free(watchpoint);
	1090	}
	1091
	1092	/* Remove all matching watchpoints. */
	1093	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	1094	{
	1095	CPUWatchpoint wp, next;
	1096
	1097	QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
	1098	if (wp->flags & mask) {
	1099	cpu_watchpoint_remove_by_ref(cpu, wp);
	1100	}
	1101	}
	1102	}
	1103
	1104	/* Return true if this watchpoint address matches the specified
	1105	* access (ie the address range covered by the watchpoint overlaps
	1106	* partially or completely with the address range covered by the
	1107	* access).
	1108	*/
	1109	static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
	1110	vaddr addr, vaddr len)
	1111	{
	1112	/* We know the lengths are non-zero, but a little caution is
	1113	* required to avoid errors in the case where the range ends
	1114	* exactly at the top of the address space and so addr + len
	1115	* wraps round to zero.
	1116	*/
	1117	vaddr wpend = wp->vaddr + wp->len - 1;
	1118	vaddr addrend = addr + len - 1;
	1119
	1120	return !(addr > wpend \|\| wp->vaddr > addrend);
	1121	}
	1122
	1123	/* Return flags for watchpoints that match addr + prot. */
	1124	int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
	1125	{
	1126	CPUWatchpoint *wp;
	1127	int ret = 0;
	1128
	1129	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1130	if (watchpoint_address_matches(wp, addr, TARGET_PAGE_SIZE)) {
	1131	ret \|= wp->flags;
	1132	}
	1133	}
	1134	return ret;
	1135	}
	1136	#endif /* !CONFIG_USER_ONLY */
	1137
	1138	/* Add a breakpoint. */
	1139	int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
	1140	CPUBreakpoint **breakpoint)
	1141	{
	1142	CPUBreakpoint *bp;
	1143
	1144	bp = g_malloc(sizeof(*bp));
	1145
	1146	bp->pc = pc;
	1147	bp->flags = flags;
	1148
	1149	/* keep all GDB-injected breakpoints in front */
	1150	if (flags & BP_GDB) {
	1151	QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
	1152	} else {
	1153	QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
	1154	}
	1155
	1156	breakpoint_invalidate(cpu, pc);
	1157
	1158	if (breakpoint) {
	1159	*breakpoint = bp;
	1160	}
	1161	return 0;
	1162	}
	1163
	1164	/* Remove a specific breakpoint. */
	1165	int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
	1166	{
	1167	CPUBreakpoint *bp;
	1168
	1169	QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
	1170	if (bp->pc == pc && bp->flags == flags) {
	1171	cpu_breakpoint_remove_by_ref(cpu, bp);
	1172	return 0;
	1173	}
	1174	}
	1175	return -ENOENT;
	1176	}
	1177
	1178	/* Remove a specific breakpoint by reference. */
	1179	void cpu_breakpoint_remove_by_ref(CPUState cpu, CPUBreakpoint breakpoint)
	1180	{
	1181	QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
	1182
	1183	breakpoint_invalidate(cpu, breakpoint->pc);
	1184
	1185	g_free(breakpoint);
	1186	}
	1187
	1188	/* Remove all matching breakpoints. */
	1189	void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
	1190	{
	1191	CPUBreakpoint bp, next;
	1192
	1193	QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
	1194	if (bp->flags & mask) {
	1195	cpu_breakpoint_remove_by_ref(cpu, bp);
	1196	}
	1197	}
	1198	}
	1199
	1200	/* enable or disable single step mode. EXCP_DEBUG is returned by the
	1201	CPU loop after each instruction */
	1202	void cpu_single_step(CPUState *cpu, int enabled)
	1203	{
	1204	if (cpu->singlestep_enabled != enabled) {
	1205	cpu->singlestep_enabled = enabled;
	1206	if (kvm_enabled()) {
	1207	kvm_update_guest_debug(cpu, 0);
	1208	} else {
	1209	/* must flush all the translated code to avoid inconsistencies */
	1210	/* XXX: only flush what is necessary */
	1211	tb_flush(cpu);
	1212	}
	1213	}
	1214	}
	1215
	1216	void cpu_abort(CPUState cpu, const char fmt, ...)
	1217	{
	1218	va_list ap;
	1219	va_list ap2;
	1220
	1221	va_start(ap, fmt);
	1222	va_copy(ap2, ap);
	1223	fprintf(stderr, "qemu: fatal: ");
	1224	vfprintf(stderr, fmt, ap);
	1225	fprintf(stderr, "\n");
	1226	cpu_dump_state(cpu, stderr, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1227	if (qemu_log_separate()) {
	1228	qemu_log_lock();
	1229	qemu_log("qemu: fatal: ");
	1230	qemu_log_vprintf(fmt, ap2);
	1231	qemu_log("\n");
	1232	log_cpu_state(cpu, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1233	qemu_log_flush();
	1234	qemu_log_unlock();
	1235	qemu_log_close();
	1236	}
	1237	va_end(ap2);
	1238	va_end(ap);
	1239	replay_finish();
	1240	#if defined(CONFIG_USER_ONLY)
	1241	{
	1242	struct sigaction act;
	1243	sigfillset(&act.sa_mask);
	1244	act.sa_handler = SIG_DFL;
	1245	act.sa_flags = 0;
	1246	sigaction(SIGABRT, &act, NULL);
	1247	}
	1248	#endif
	1249	abort();
	1250	}
	1251
	1252	#if !defined(CONFIG_USER_ONLY)
	1253	/* Called from RCU critical section */
	1254	static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
	1255	{
	1256	RAMBlock *block;
	1257
	1258	block = atomic_rcu_read(&ram_list.mru_block);
	1259	if (block && addr - block->offset < block->max_length) {
	1260	return block;
	1261	}
	1262	RAMBLOCK_FOREACH(block) {
	1263	if (addr - block->offset < block->max_length) {
	1264	goto found;
	1265	}
	1266	}
	1267
	1268	fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
	1269	abort();
	1270
	1271	found:
	1272	/* It is safe to write mru_block outside the iothread lock. This
	1273	* is what happens:
	1274	*
	1275	* mru_block = xxx
	1276	* rcu_read_unlock()
	1277	* xxx removed from list
	1278	* rcu_read_lock()
	1279	* read mru_block
	1280	* mru_block = NULL;
	1281	* call_rcu(reclaim_ramblock, xxx);
	1282	* rcu_read_unlock()
	1283	*
	1284	* atomic_rcu_set is not needed here. The block was already published
	1285	* when it was placed into the list. Here we're just making an extra
	1286	* copy of the pointer.
	1287	*/
	1288	ram_list.mru_block = block;
	1289	return block;
	1290	}
	1291
	1292	static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
	1293	{
	1294	CPUState *cpu;
	1295	ram_addr_t start1;
	1296	RAMBlock *block;
	1297	ram_addr_t end;
	1298
	1299	assert(tcg_enabled());
	1300	end = TARGET_PAGE_ALIGN(start + length);
	1301	start &= TARGET_PAGE_MASK;
	1302
	1303	RCU_READ_LOCK_GUARD();
	1304	block = qemu_get_ram_block(start);
	1305	assert(block == qemu_get_ram_block(end - 1));
	1306	start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
	1307	CPU_FOREACH(cpu) {
	1308	tlb_reset_dirty(cpu, start1, length);
	1309	}
	1310	}
	1311
	1312	/* Note: start and end must be within the same ram block. */
	1313	bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
	1314	ram_addr_t length,
	1315	unsigned client)
	1316	{
	1317	DirtyMemoryBlocks *blocks;
	1318	unsigned long end, page;
	1319	bool dirty = false;
	1320	RAMBlock *ramblock;
	1321	uint64_t mr_offset, mr_size;
	1322
	1323	if (length == 0) {
	1324	return false;
	1325	}
	1326
	1327	end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
	1328	page = start >> TARGET_PAGE_BITS;
	1329
	1330	WITH_RCU_READ_LOCK_GUARD() {
	1331	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1332	ramblock = qemu_get_ram_block(start);
	1333	/* Range sanity check on the ramblock */
	1334	assert(start >= ramblock->offset &&
	1335	start + length <= ramblock->offset + ramblock->used_length);
	1336
	1337	while (page < end) {
	1338	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1339	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1340	unsigned long num = MIN(end - page,
	1341	DIRTY_MEMORY_BLOCK_SIZE - offset);
	1342
	1343	dirty \|= bitmap_test_and_clear_atomic(blocks->blocks[idx],
	1344	offset, num);
	1345	page += num;
	1346	}
	1347
	1348	mr_offset = (ram_addr_t)(page << TARGET_PAGE_BITS) - ramblock->offset;
	1349	mr_size = (end - page) << TARGET_PAGE_BITS;
	1350	memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
	1351	}
	1352
	1353	if (dirty && tcg_enabled()) {
	1354	tlb_reset_dirty_range_all(start, length);
	1355	}
	1356
	1357	return dirty;
	1358	}
	1359
	1360	DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
	1361	(MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
	1362	{
	1363	DirtyMemoryBlocks *blocks;
	1364	ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
	1365	unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
	1366	ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
	1367	ram_addr_t last = QEMU_ALIGN_UP(start + length, align);
	1368	DirtyBitmapSnapshot *snap;
	1369	unsigned long page, end, dest;
	1370
	1371	snap = g_malloc0(sizeof(*snap) +
	1372	((last - first) >> (TARGET_PAGE_BITS + 3)));
	1373	snap->start = first;
	1374	snap->end = last;
	1375
	1376	page = first >> TARGET_PAGE_BITS;
	1377	end = last >> TARGET_PAGE_BITS;
	1378	dest = 0;
	1379
	1380	WITH_RCU_READ_LOCK_GUARD() {
	1381	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1382
	1383	while (page < end) {
	1384	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1385	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1386	unsigned long num = MIN(end - page,
	1387	DIRTY_MEMORY_BLOCK_SIZE - offset);
	1388
	1389	assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
	1390	assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));
	1391	offset >>= BITS_PER_LEVEL;
	1392
	1393	bitmap_copy_and_clear_atomic(snap->dirty + dest,
	1394	blocks->blocks[idx] + offset,
	1395	num);
	1396	page += num;
	1397	dest += num >> BITS_PER_LEVEL;
	1398	}
	1399	}
	1400
	1401	if (tcg_enabled()) {
	1402	tlb_reset_dirty_range_all(start, length);
	1403	}
	1404
	1405	memory_region_clear_dirty_bitmap(mr, offset, length);
	1406
	1407	return snap;
	1408	}
	1409
	1410	bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
	1411	ram_addr_t start,
	1412	ram_addr_t length)
	1413	{
	1414	unsigned long page, end;
	1415
	1416	assert(start >= snap->start);
	1417	assert(start + length <= snap->end);
	1418
	1419	end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
	1420	page = (start - snap->start) >> TARGET_PAGE_BITS;
	1421
	1422	while (page < end) {
	1423	if (test_bit(page, snap->dirty)) {
	1424	return true;
	1425	}
	1426	page++;
	1427	}
	1428	return false;
	1429	}
	1430
	1431	/* Called from RCU critical section */
	1432	hwaddr memory_region_section_get_iotlb(CPUState *cpu,
	1433	MemoryRegionSection *section)
	1434	{
	1435	AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
	1436	return section - d->map.sections;
	1437	}
#endif /* !defined(CONFIG_USER_ONLY) */
	1439
	1440	#if !defined(CONFIG_USER_ONLY)
	1441
	1442	static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
	1443	uint16_t section);
	1444	static subpage_t subpage_init(FlatView fv, hwaddr base);
	1445
	1446	static void (phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
	1447	qemu_anon_ram_alloc;
	1448
	1449	/*
	1450	* Set a custom physical guest memory alloator.
	1451	* Accelerators with unusual needs may need this. Hopefully, we can
	1452	* get rid of it eventually.
	1453	*/
	1454	void phys_mem_set_alloc(void (alloc)(size_t, uint64_t *align, bool shared))
	1455	{
	1456	phys_mem_alloc = alloc;
	1457	}
	1458
	1459	static uint16_t phys_section_add(PhysPageMap *map,
	1460	MemoryRegionSection *section)
	1461	{
	1462	/* The physical section number is ORed with a page-aligned
	1463	* pointer to produce the iotlb entries. Thus it should
	1464	* never overflow into the page-aligned value.
	1465	*/
	1466	assert(map->sections_nb < TARGET_PAGE_SIZE);
	1467
	1468	if (map->sections_nb == map->sections_nb_alloc) {
	1469	map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
	1470	map->sections = g_renew(MemoryRegionSection, map->sections,
	1471	map->sections_nb_alloc);
	1472	}
	1473	map->sections[map->sections_nb] = *section;
	1474	memory_region_ref(section->mr);
	1475	return map->sections_nb++;
	1476	}
	1477
	1478	static void phys_section_destroy(MemoryRegion *mr)
	1479	{
	1480	bool have_sub_page = mr->subpage;
	1481
	1482	memory_region_unref(mr);
	1483
	1484	if (have_sub_page) {
	1485	subpage_t *subpage = container_of(mr, subpage_t, iomem);
	1486	object_unref(OBJECT(&subpage->iomem));
	1487	g_free(subpage);
	1488	}
	1489	}
	1490
	1491	static void phys_sections_free(PhysPageMap *map)
	1492	{
	1493	while (map->sections_nb > 0) {
	1494	MemoryRegionSection *section = &map->sections[--map->sections_nb];
	1495	phys_section_destroy(section->mr);
	1496	}
	1497	g_free(map->sections);
	1498	g_free(map->nodes);
	1499	}
	1500
	1501	static void register_subpage(FlatView fv, MemoryRegionSection section)
	1502	{
	1503	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1504	subpage_t *subpage;
	1505	hwaddr base = section->offset_within_address_space
	1506	& TARGET_PAGE_MASK;
	1507	MemoryRegionSection *existing = phys_page_find(d, base);
	1508	MemoryRegionSection subsection = {
	1509	.offset_within_address_space = base,
	1510	.size = int128_make64(TARGET_PAGE_SIZE),
	1511	};
	1512	hwaddr start, end;
	1513
	1514	assert(existing->mr->subpage \|\| existing->mr == &io_mem_unassigned);
	1515
	1516	if (!(existing->mr->subpage)) {
	1517	subpage = subpage_init(fv, base);
	1518	subsection.fv = fv;
	1519	subsection.mr = &subpage->iomem;
	1520	phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
	1521	phys_section_add(&d->map, &subsection));
	1522	} else {
	1523	subpage = container_of(existing->mr, subpage_t, iomem);
	1524	}
	1525	start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
	1526	end = start + int128_get64(section->size) - 1;
	1527	subpage_register(subpage, start, end,
	1528	phys_section_add(&d->map, section));
	1529	}
	1530
	1531
	1532	static void register_multipage(FlatView *fv,
	1533	MemoryRegionSection *section)
	1534	{
	1535	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1536	hwaddr start_addr = section->offset_within_address_space;
	1537	uint16_t section_index = phys_section_add(&d->map, section);
	1538	uint64_t num_pages = int128_get64(int128_rshift(section->size,
	1539	TARGET_PAGE_BITS));
	1540
	1541	assert(num_pages);
	1542	phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
	1543	}
	1544
	1545	/*
	1546	* The range in section may look like this:
	1547	*
	1548	* \|s\|PPPPPPP\|s\|
	1549	*
	1550	* where s stands for subpage and P for page.
	1551	*/
	1552	void flatview_add_to_dispatch(FlatView fv, MemoryRegionSection section)
	1553	{
	1554	MemoryRegionSection remain = *section;
	1555	Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
	1556
	1557	/* register first subpage */
	1558	if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1559	uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
	1560	- remain.offset_within_address_space;
	1561
	1562	MemoryRegionSection now = remain;
	1563	now.size = int128_min(int128_make64(left), now.size);
	1564	register_subpage(fv, &now);
	1565	if (int128_eq(remain.size, now.size)) {
	1566	return;
	1567	}
	1568	remain.size = int128_sub(remain.size, now.size);
	1569	remain.offset_within_address_space += int128_get64(now.size);
	1570	remain.offset_within_region += int128_get64(now.size);
	1571	}
	1572
	1573	/* register whole pages */
	1574	if (int128_ge(remain.size, page_size)) {
	1575	MemoryRegionSection now = remain;
	1576	now.size = int128_and(now.size, int128_neg(page_size));
	1577	register_multipage(fv, &now);
	1578	if (int128_eq(remain.size, now.size)) {
	1579	return;
	1580	}
	1581	remain.size = int128_sub(remain.size, now.size);
	1582	remain.offset_within_address_space += int128_get64(now.size);
	1583	remain.offset_within_region += int128_get64(now.size);
	1584	}
	1585
	1586	/* register last subpage */
	1587	register_subpage(fv, &remain);
	1588	}
	1589
	1590	void qemu_flush_coalesced_mmio_buffer(void)
	1591	{
	1592	if (kvm_enabled())
	1593	kvm_flush_coalesced_mmio_buffer();
	1594	}
	1595
	1596	void qemu_mutex_lock_ramlist(void)
	1597	{
	1598	qemu_mutex_lock(&ram_list.mutex);
	1599	}
	1600
	1601	void qemu_mutex_unlock_ramlist(void)
	1602	{
	1603	qemu_mutex_unlock(&ram_list.mutex);
	1604	}
	1605
	1606	void ram_block_dump(Monitor *mon)
	1607	{
	1608	RAMBlock *block;
	1609	char *psize;
	1610
	1611	RCU_READ_LOCK_GUARD();
	1612	monitor_printf(mon, "%24s %8s %18s %18s %18s\n",
	1613	"Block Name", "PSize", "Offset", "Used", "Total");
	1614	RAMBLOCK_FOREACH(block) {
	1615	psize = size_to_str(block->page_size);
	1616	monitor_printf(mon, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
	1617	" 0x%016" PRIx64 "\n", block->idstr, psize,
	1618	(uint64_t)block->offset,
	1619	(uint64_t)block->used_length,
	1620	(uint64_t)block->max_length);
	1621	g_free(psize);
	1622	}
	1623	}
	1624
	1625	#ifdef __linux__
	1626	/*
	1627	* FIXME TOCTTOU: this iterates over memory backends' mem-path, which
	1628	* may or may not name the same files / on the same filesystem now as
	1629	* when we actually open and map them. Iterate over the file
	1630	* descriptors instead, and use qemu_fd_getpagesize().
	1631	*/
	1632	static int find_min_backend_pagesize(Object obj, void opaque)
	1633	{
	1634	long *hpsize_min = opaque;
	1635
	1636	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	1637	HostMemoryBackend *backend = MEMORY_BACKEND(obj);
	1638	long hpsize = host_memory_backend_pagesize(backend);
	1639
	1640	if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
	1641	*hpsize_min = hpsize;
	1642	}
	1643	}
	1644
	1645	return 0;
	1646	}
	1647
	1648	static int find_max_backend_pagesize(Object obj, void opaque)
	1649	{
	1650	long *hpsize_max = opaque;
	1651
	1652	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	1653	HostMemoryBackend *backend = MEMORY_BACKEND(obj);
	1654	long hpsize = host_memory_backend_pagesize(backend);
	1655
	1656	if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
	1657	*hpsize_max = hpsize;
	1658	}
	1659	}
	1660
	1661	return 0;
	1662	}
	1663
	1664	/*
	1665	* TODO: We assume right now that all mapped host memory backends are
	1666	* used as RAM, however some might be used for different purposes.
	1667	*/
	1668	long qemu_minrampagesize(void)
	1669	{
	1670	long hpsize = LONG_MAX;
	1671	long mainrampagesize;
	1672	Object *memdev_root;
	1673	MachineState *ms = MACHINE(qdev_get_machine());
	1674
	1675	mainrampagesize = qemu_mempath_getpagesize(mem_path);
	1676
	1677	/* it's possible we have memory-backend objects with
	1678	* hugepage-backed RAM. these may get mapped into system
	1679	* address space via -numa parameters or memory hotplug
	1680	* hooks. we want to take these into account, but we
	1681	* also want to make sure these supported hugepage
	1682	* sizes are applicable across the entire range of memory
	1683	* we may boot from, so we take the min across all
	1684	* backends, and assume normal pages in cases where a
	1685	* backend isn't backed by hugepages.
	1686	*/
	1687	memdev_root = object_resolve_path("/objects", NULL);
	1688	if (memdev_root) {
	1689	object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
	1690	}
	1691	if (hpsize == LONG_MAX) {
	1692	/* No additional memory regions found ==> Report main RAM page size */
	1693	return mainrampagesize;
	1694	}
	1695
	1696	/* If NUMA is disabled or the NUMA nodes are not backed with a
	1697	* memory-backend, then there is at least one node using "normal" RAM,
	1698	* so if its page size is smaller we have got to report that size instead.
	1699	*/
	1700	if (hpsize > mainrampagesize &&
	1701	(ms->numa_state == NULL \|\|
	1702	ms->numa_state->num_nodes == 0 \|\|
	1703	ms->numa_state->nodes[0].node_memdev == NULL)) {
	1704	static bool warned;
	1705	if (!warned) {
	1706	error_report("Huge page support disabled (n/a for main memory).");
	1707	warned = true;
	1708	}
	1709	return mainrampagesize;
	1710	}
	1711
	1712	return hpsize;
	1713	}
	1714
	1715	long qemu_maxrampagesize(void)
	1716	{
	1717	long pagesize = qemu_mempath_getpagesize(mem_path);
	1718	Object *memdev_root = object_resolve_path("/objects", NULL);
	1719
	1720	if (memdev_root) {
	1721	object_child_foreach(memdev_root, find_max_backend_pagesize,
	1722	&pagesize);
	1723	}
	1724	return pagesize;
	1725	}
	1726	#else
	1727	long qemu_minrampagesize(void)
	1728	{
	1729	return qemu_real_host_page_size;
	1730	}
	1731	long qemu_maxrampagesize(void)
	1732	{
	1733	return qemu_real_host_page_size;
	1734	}
	1735	#endif
	1736
	1737	#ifdef CONFIG_POSIX
	1738	static int64_t get_file_size(int fd)
	1739	{
	1740	int64_t size;
	1741	#if defined(__linux__)
	1742	struct stat st;
	1743
	1744	if (fstat(fd, &st) < 0) {
	1745	return -errno;
	1746	}
	1747
	1748	/* Special handling for devdax character devices */
	1749	if (S_ISCHR(st.st_mode)) {
	1750	g_autofree char *subsystem_path = NULL;
	1751	g_autofree char *subsystem = NULL;
	1752
	1753	subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
	1754	major(st.st_rdev), minor(st.st_rdev));
	1755	subsystem = g_file_read_link(subsystem_path, NULL);
	1756
	1757	if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
	1758	g_autofree char *size_path = NULL;
	1759	g_autofree char *size_str = NULL;
	1760
	1761	size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
	1762	major(st.st_rdev), minor(st.st_rdev));
	1763
	1764	if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
	1765	return g_ascii_strtoll(size_str, NULL, 0);
	1766	}
	1767	}
	1768	}
	1769	#endif /* defined(__linux__) */
	1770
	1771	/* st.st_size may be zero for special files yet lseek(2) works */
	1772	size = lseek(fd, 0, SEEK_END);
	1773	if (size < 0) {
	1774	return -errno;
	1775	}
	1776	return size;
	1777	}
	1778
	1779	static int file_ram_open(const char *path,
	1780	const char *region_name,
	1781	bool *created,
	1782	Error **errp)
	1783	{
	1784	char *filename;
	1785	char *sanitized_name;
	1786	char *c;
	1787	int fd = -1;
	1788
	1789	*created = false;
	1790	for (;;) {
	1791	fd = open(path, O_RDWR);
	1792	if (fd >= 0) {
	1793	/* @path names an existing file, use it */
	1794	break;
	1795	}
	1796	if (errno == ENOENT) {
	1797	/* @path names a file that doesn't exist, create it */
	1798	fd = open(path, O_RDWR \| O_CREAT \| O_EXCL, 0644);
	1799	if (fd >= 0) {
	1800	*created = true;
	1801	break;
	1802	}
	1803	} else if (errno == EISDIR) {
	1804	/* @path names a directory, create a file there */
	1805	/* Make name safe to use with mkstemp by replacing '/' with '_'. */
	1806	sanitized_name = g_strdup(region_name);
	1807	for (c = sanitized_name; *c != '\0'; c++) {
	1808	if (*c == '/') {
	1809	*c = '_';
	1810	}
	1811	}
	1812
	1813	filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
	1814	sanitized_name);
	1815	g_free(sanitized_name);
	1816
	1817	fd = mkstemp(filename);
	1818	if (fd >= 0) {
	1819	unlink(filename);
	1820	g_free(filename);
	1821	break;
	1822	}
	1823	g_free(filename);
	1824	}
	1825	if (errno != EEXIST && errno != EINTR) {
	1826	error_setg_errno(errp, errno,
	1827	"can't open backing store %s for guest RAM",
	1828	path);
	1829	return -1;
	1830	}
	1831	/*
	1832	* Try again on EINTR and EEXIST. The latter happens when
	1833	* something else creates the file between our two open().
	1834	*/
	1835	}
	1836
	1837	return fd;
	1838	}
	1839
	1840	static void file_ram_alloc(RAMBlock block,
	1841	ram_addr_t memory,
	1842	int fd,
	1843	bool truncate,
	1844	Error **errp)
	1845	{
	1846	Error *err = NULL;
	1847	MachineState *ms = MACHINE(qdev_get_machine());
	1848	void *area;
	1849
	1850	block->page_size = qemu_fd_getpagesize(fd);
	1851	if (block->mr->align % block->page_size) {
	1852	error_setg(errp, "alignment 0x%" PRIx64
	1853	" must be multiples of page size 0x%zx",
	1854	block->mr->align, block->page_size);
	1855	return NULL;
	1856	} else if (block->mr->align && !is_power_of_2(block->mr->align)) {
	1857	error_setg(errp, "alignment 0x%" PRIx64
	1858	" must be a power of two", block->mr->align);
	1859	return NULL;
	1860	}
	1861	block->mr->align = MAX(block->page_size, block->mr->align);
	1862	#if defined(__s390x__)
	1863	if (kvm_enabled()) {
	1864	block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
	1865	}
	1866	#endif
	1867
	1868	if (memory < block->page_size) {
	1869	error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
	1870	"or larger than page size 0x%zx",
	1871	memory, block->page_size);
	1872	return NULL;
	1873	}
	1874
	1875	memory = ROUND_UP(memory, block->page_size);
	1876
	1877	/*
	1878	* ftruncate is not supported by hugetlbfs in older
	1879	* hosts, so don't bother bailing out on errors.
	1880	* If anything goes wrong with it under other filesystems,
	1881	* mmap will fail.
	1882	*
	1883	* Do not truncate the non-empty backend file to avoid corrupting
	1884	* the existing data in the file. Disabling shrinking is not
	1885	* enough. For example, the current vNVDIMM implementation stores
	1886	* the guest NVDIMM labels at the end of the backend file. If the
	1887	* backend file is later extended, QEMU will not be able to find
	1888	* those labels. Therefore, extending the non-empty backend file
	1889	* is disabled as well.
	1890	*/
	1891	if (truncate && ftruncate(fd, memory)) {
	1892	perror("ftruncate");
	1893	}
	1894
	1895	area = qemu_ram_mmap(fd, memory, block->mr->align,
	1896	block->flags & RAM_SHARED, block->flags & RAM_PMEM);
	1897	if (area == MAP_FAILED) {
	1898	error_setg_errno(errp, errno,
	1899	"unable to map backing store for guest RAM");
	1900	return NULL;
	1901	}
	1902
	1903	if (mem_prealloc) {
	1904	os_mem_prealloc(fd, area, memory, ms->smp.cpus, &err);
	1905	if (err) {
	1906	error_propagate(errp, err);
	1907	qemu_ram_munmap(fd, area, memory);
	1908	return NULL;
	1909	}
	1910	}
	1911
	1912	block->fd = fd;
	1913	return area;
	1914	}
	1915	#endif
	1916
	1917	/* Allocate space within the ram_addr_t space that governs the
	1918	* dirty bitmaps.
	1919	* Called with the ramlist lock held.
	1920	*/
	1921	static ram_addr_t find_ram_offset(ram_addr_t size)
	1922	{
	1923	RAMBlock block, next_block;
	1924	ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
	1925
	1926	assert(size != 0); /* it would hand out same offset multiple times */
	1927
	1928	if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
	1929	return 0;
	1930	}
	1931
	1932	RAMBLOCK_FOREACH(block) {
	1933	ram_addr_t candidate, next = RAM_ADDR_MAX;
	1934
	1935	/* Align blocks to start on a 'long' in the bitmap
	1936	* which makes the bitmap sync'ing take the fast path.
	1937	*/
	1938	candidate = block->offset + block->max_length;
	1939	candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
	1940
	1941	/* Search for the closest following block
	1942	* and find the gap.
	1943	*/
	1944	RAMBLOCK_FOREACH(next_block) {
	1945	if (next_block->offset >= candidate) {
	1946	next = MIN(next, next_block->offset);
	1947	}
	1948	}
	1949
	1950	/* If it fits remember our place and remember the size
	1951	* of gap, but keep going so that we might find a smaller
	1952	* gap to fill so avoiding fragmentation.
	1953	*/
	1954	if (next - candidate >= size && next - candidate < mingap) {
	1955	offset = candidate;
	1956	mingap = next - candidate;
	1957	}
	1958
	1959	trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
	1960	}
	1961
	1962	if (offset == RAM_ADDR_MAX) {
	1963	fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
	1964	(uint64_t)size);
	1965	abort();
	1966	}
	1967
	1968	trace_find_ram_offset(size, offset);
	1969
	1970	return offset;
	1971	}
	1972
	1973	static unsigned long last_ram_page(void)
	1974	{
	1975	RAMBlock *block;
	1976	ram_addr_t last = 0;
	1977
	1978	RCU_READ_LOCK_GUARD();
	1979	RAMBLOCK_FOREACH(block) {
	1980	last = MAX(last, block->offset + block->max_length);
	1981	}
	1982	return last >> TARGET_PAGE_BITS;
	1983	}
	1984
	1985	static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
	1986	{
	1987	int ret;
	1988
	1989	/* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
	1990	if (!machine_dump_guest_core(current_machine)) {
	1991	ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
	1992	if (ret) {
	1993	perror("qemu_madvise");
	1994	fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
	1995	"but dump_guest_core=off specified\n");
	1996	}
	1997	}
	1998	}
	1999
	2000	const char qemu_ram_get_idstr(RAMBlock rb)
	2001	{
	2002	return rb->idstr;
	2003	}
	2004
	2005	void qemu_ram_get_host_addr(RAMBlock rb)
	2006	{
	2007	return rb->host;
	2008	}
	2009
	2010	ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
	2011	{
	2012	return rb->offset;
	2013	}
	2014
	2015	ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
	2016	{
	2017	return rb->used_length;
	2018	}
	2019
	2020	bool qemu_ram_is_shared(RAMBlock *rb)
	2021	{
	2022	return rb->flags & RAM_SHARED;
	2023	}
	2024
	2025	/* Note: Only set at the start of postcopy */
	2026	bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
	2027	{
	2028	return rb->flags & RAM_UF_ZEROPAGE;
	2029	}
	2030
	2031	void qemu_ram_set_uf_zeroable(RAMBlock *rb)
	2032	{
	2033	rb->flags \|= RAM_UF_ZEROPAGE;
	2034	}
	2035
	2036	bool qemu_ram_is_migratable(RAMBlock *rb)
	2037	{
	2038	return rb->flags & RAM_MIGRATABLE;
	2039	}
	2040
	2041	void qemu_ram_set_migratable(RAMBlock *rb)
	2042	{
	2043	rb->flags \|= RAM_MIGRATABLE;
	2044	}
	2045
	2046	void qemu_ram_unset_migratable(RAMBlock *rb)
	2047	{
	2048	rb->flags &= ~RAM_MIGRATABLE;
	2049	}
	2050
	2051	/* Called with iothread lock held. */
	2052	void qemu_ram_set_idstr(RAMBlock new_block, const char name, DeviceState *dev)
	2053	{
	2054	RAMBlock *block;
	2055
	2056	assert(new_block);
	2057	assert(!new_block->idstr[0]);
	2058
	2059	if (dev) {
	2060	char *id = qdev_get_dev_path(dev);
	2061	if (id) {
	2062	snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
	2063	g_free(id);
	2064	}
	2065	}
	2066	pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
	2067
	2068	RCU_READ_LOCK_GUARD();
	2069	RAMBLOCK_FOREACH(block) {
	2070	if (block != new_block &&
	2071	!strcmp(block->idstr, new_block->idstr)) {
	2072	fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
	2073	new_block->idstr);
	2074	abort();
	2075	}
	2076	}
	2077	}
	2078
	2079	/* Called with iothread lock held. */
	2080	void qemu_ram_unset_idstr(RAMBlock *block)
	2081	{
	2082	/* FIXME: arch_init.c assumes that this is not called throughout
	2083	* migration. Ignore the problem since hot-unplug during migration
	2084	* does not work anyway.
	2085	*/
	2086	if (block) {
	2087	memset(block->idstr, 0, sizeof(block->idstr));
	2088	}
	2089	}
	2090
	2091	size_t qemu_ram_pagesize(RAMBlock *rb)
	2092	{
	2093	return rb->page_size;
	2094	}
	2095
	2096	/* Returns the largest size of page in use */
	2097	size_t qemu_ram_pagesize_largest(void)
	2098	{
	2099	RAMBlock *block;
	2100	size_t largest = 0;
	2101
	2102	RAMBLOCK_FOREACH(block) {
	2103	largest = MAX(largest, qemu_ram_pagesize(block));
	2104	}
	2105
	2106	return largest;
	2107	}
	2108
	2109	static int memory_try_enable_merging(void *addr, size_t len)
	2110	{
	2111	if (!machine_mem_merge(current_machine)) {
	2112	/* disabled by the user */
	2113	return 0;
	2114	}
	2115
	2116	return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
	2117	}
	2118
	2119	/* Only legal before guest might have detected the memory size: e.g. on
	2120	* incoming migration, or right after reset.
	2121	*
	2122	* As memory core doesn't know how is memory accessed, it is up to
	2123	* resize callback to update device state and/or add assertions to detect
	2124	* misuse, if necessary.
	2125	*/
	2126	int qemu_ram_resize(RAMBlock block, ram_addr_t newsize, Error *errp)
	2127	{
	2128	assert(block);
	2129
	2130	newsize = HOST_PAGE_ALIGN(newsize);
	2131
	2132	if (block->used_length == newsize) {
	2133	return 0;
	2134	}
	2135
	2136	if (!(block->flags & RAM_RESIZEABLE)) {
	2137	error_setg_errno(errp, EINVAL,
	2138	"Length mismatch: %s: 0x" RAM_ADDR_FMT
	2139	" in != 0x" RAM_ADDR_FMT, block->idstr,
	2140	newsize, block->used_length);
	2141	return -EINVAL;
	2142	}
	2143
	2144	if (block->max_length < newsize) {
	2145	error_setg_errno(errp, EINVAL,
	2146	"Length too large: %s: 0x" RAM_ADDR_FMT
	2147	" > 0x" RAM_ADDR_FMT, block->idstr,
	2148	newsize, block->max_length);
	2149	return -EINVAL;
	2150	}
	2151
	2152	cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
	2153	block->used_length = newsize;
	2154	cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
	2155	DIRTY_CLIENTS_ALL);
	2156	memory_region_set_size(block->mr, newsize);
	2157	if (block->resized) {
	2158	block->resized(block->idstr, newsize, block->host);
	2159	}
	2160	return 0;
	2161	}
	2162
	2163	/*
	2164	* Trigger sync on the given ram block for range [start, start + length]
	2165	* with the backing store if one is available.
	2166	* Otherwise no-op.
	2167	* @Note: this is supposed to be a synchronous op.
	2168	*/
	2169	void qemu_ram_writeback(RAMBlock *block, ram_addr_t start, ram_addr_t length)
	2170	{
	2171	void *addr = ramblock_ptr(block, start);
	2172
	2173	/* The requested range should fit in within the block range */
	2174	g_assert((start + length) <= block->used_length);
	2175
	2176	#ifdef CONFIG_LIBPMEM
	2177	/* The lack of support for pmem should not block the sync */
	2178	if (ramblock_is_pmem(block)) {
	2179	pmem_persist(addr, length);
	2180	return;
	2181	}
	2182	#endif
	2183	if (block->fd >= 0) {
	2184	/**
	2185	* Case there is no support for PMEM or the memory has not been
	2186	* specified as persistent (or is not one) - use the msync.
	2187	* Less optimal but still achieves the same goal
	2188	*/
	2189	if (qemu_msync(addr, length, block->fd)) {
	2190	warn_report("%s: failed to sync memory range: start: "
	2191	RAM_ADDR_FMT " length: " RAM_ADDR_FMT,
	2192	__func__, start, length);
	2193	}
	2194	}
	2195	}
	2196
	2197	/* Called with ram_list.mutex held */
	2198	static void dirty_memory_extend(ram_addr_t old_ram_size,
	2199	ram_addr_t new_ram_size)
	2200	{
	2201	ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
	2202	DIRTY_MEMORY_BLOCK_SIZE);
	2203	ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
	2204	DIRTY_MEMORY_BLOCK_SIZE);
	2205	int i;
	2206
	2207	/* Only need to extend if block count increased */
	2208	if (new_num_blocks <= old_num_blocks) {
	2209	return;
	2210	}
	2211
	2212	for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
	2213	DirtyMemoryBlocks *old_blocks;
	2214	DirtyMemoryBlocks *new_blocks;
	2215	int j;
	2216
	2217	old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
	2218	new_blocks = g_malloc(sizeof(*new_blocks) +
	2219	sizeof(new_blocks->blocks[0]) * new_num_blocks);
	2220
	2221	if (old_num_blocks) {
	2222	memcpy(new_blocks->blocks, old_blocks->blocks,
	2223	old_num_blocks * sizeof(old_blocks->blocks[0]));
	2224	}
	2225
	2226	for (j = old_num_blocks; j < new_num_blocks; j++) {
	2227	new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
	2228	}
	2229
	2230	atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
	2231
	2232	if (old_blocks) {
	2233	g_free_rcu(old_blocks, rcu);
	2234	}
	2235	}
	2236	}
	2237
	2238	static void ram_block_add(RAMBlock new_block, Error *errp, bool shared)
	2239	{
	2240	RAMBlock *block;
	2241	RAMBlock *last_block = NULL;
	2242	ram_addr_t old_ram_size, new_ram_size;
	2243	Error *err = NULL;
	2244
	2245	old_ram_size = last_ram_page();
	2246
	2247	qemu_mutex_lock_ramlist();
	2248	new_block->offset = find_ram_offset(new_block->max_length);
	2249
	2250	if (!new_block->host) {
	2251	if (xen_enabled()) {
	2252	xen_ram_alloc(new_block->offset, new_block->max_length,
	2253	new_block->mr, &err);
	2254	if (err) {
	2255	error_propagate(errp, err);
	2256	qemu_mutex_unlock_ramlist();
	2257	return;
	2258	}
	2259	} else {
	2260	new_block->host = phys_mem_alloc(new_block->max_length,
	2261	&new_block->mr->align, shared);
	2262	if (!new_block->host) {
	2263	error_setg_errno(errp, errno,
	2264	"cannot set up guest memory '%s'",
	2265	memory_region_name(new_block->mr));
	2266	qemu_mutex_unlock_ramlist();
	2267	return;
	2268	}
	2269	memory_try_enable_merging(new_block->host, new_block->max_length);
	2270	}
	2271	}
	2272
	2273	new_ram_size = MAX(old_ram_size,
	2274	(new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
	2275	if (new_ram_size > old_ram_size) {
	2276	dirty_memory_extend(old_ram_size, new_ram_size);
	2277	}
	2278	/* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
	2279	* QLIST (which has an RCU-friendly variant) does not have insertion at
	2280	* tail, so save the last element in last_block.
	2281	*/
	2282	RAMBLOCK_FOREACH(block) {
	2283	last_block = block;
	2284	if (block->max_length < new_block->max_length) {
	2285	break;
	2286	}
	2287	}
	2288	if (block) {
	2289	QLIST_INSERT_BEFORE_RCU(block, new_block, next);
	2290	} else if (last_block) {
	2291	QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
	2292	} else { /* list is empty */
	2293	QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
	2294	}
	2295	ram_list.mru_block = NULL;
	2296
	2297	/* Write list before version */
	2298	smp_wmb();
	2299	ram_list.version++;
	2300	qemu_mutex_unlock_ramlist();
	2301
	2302	cpu_physical_memory_set_dirty_range(new_block->offset,
	2303	new_block->used_length,
	2304	DIRTY_CLIENTS_ALL);
	2305
	2306	if (new_block->host) {
	2307	qemu_ram_setup_dump(new_block->host, new_block->max_length);
	2308	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
	2309	/* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
	2310	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
	2311	ram_block_notify_add(new_block->host, new_block->max_length);
	2312	}
	2313	}
	2314
	2315	#ifdef CONFIG_POSIX
	2316	RAMBlock qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion mr,
	2317	uint32_t ram_flags, int fd,
	2318	Error **errp)
	2319	{
	2320	RAMBlock *new_block;
	2321	Error *local_err = NULL;
	2322	int64_t file_size;
	2323
	2324	/* Just support these ram flags by now. */
	2325	assert((ram_flags & ~(RAM_SHARED \| RAM_PMEM)) == 0);
	2326
	2327	if (xen_enabled()) {
	2328	error_setg(errp, "-mem-path not supported with Xen");
	2329	return NULL;
	2330	}
	2331
	2332	if (kvm_enabled() && !kvm_has_sync_mmu()) {
	2333	error_setg(errp,
	2334	"host lacks kvm mmu notifiers, -mem-path unsupported");
	2335	return NULL;
	2336	}
	2337
	2338	if (phys_mem_alloc != qemu_anon_ram_alloc) {
	2339	/*
	2340	* file_ram_alloc() needs to allocate just like
	2341	* phys_mem_alloc, but we haven't bothered to provide
	2342	* a hook there.
	2343	*/
	2344	error_setg(errp,
	2345	"-mem-path not supported with this accelerator");
	2346	return NULL;
	2347	}
	2348
	2349	size = HOST_PAGE_ALIGN(size);
	2350	file_size = get_file_size(fd);
	2351	if (file_size > 0 && file_size < size) {
	2352	error_setg(errp, "backing store %s size 0x%" PRIx64
	2353	" does not match 'size' option 0x" RAM_ADDR_FMT,
	2354	mem_path, file_size, size);
	2355	return NULL;
	2356	}
	2357
	2358	new_block = g_malloc0(sizeof(*new_block));
	2359	new_block->mr = mr;
	2360	new_block->used_length = size;
	2361	new_block->max_length = size;
	2362	new_block->flags = ram_flags;
	2363	new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
	2364	if (!new_block->host) {
	2365	g_free(new_block);
	2366	return NULL;
	2367	}
	2368
	2369	ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
	2370	if (local_err) {
	2371	g_free(new_block);
	2372	error_propagate(errp, local_err);
	2373	return NULL;
	2374	}
	2375	return new_block;
	2376
	2377	}
	2378
	2379
	2380	RAMBlock qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion mr,
	2381	uint32_t ram_flags, const char *mem_path,
	2382	Error **errp)
	2383	{
	2384	int fd;
	2385	bool created;
	2386	RAMBlock *block;
	2387
	2388	fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
	2389	if (fd < 0) {
	2390	return NULL;
	2391	}
	2392
	2393	block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
	2394	if (!block) {
	2395	if (created) {
	2396	unlink(mem_path);
	2397	}
	2398	close(fd);
	2399	return NULL;
	2400	}
	2401
	2402	return block;
	2403	}
	2404	#endif
	2405
	2406	static
	2407	RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
	2408	void (resized)(const char,
	2409	uint64_t length,
	2410	void *host),
	2411	void *host, bool resizeable, bool share,
	2412	MemoryRegion mr, Error *errp)
	2413	{
	2414	RAMBlock *new_block;
	2415	Error *local_err = NULL;
	2416
	2417	size = HOST_PAGE_ALIGN(size);
	2418	max_size = HOST_PAGE_ALIGN(max_size);
	2419	new_block = g_malloc0(sizeof(*new_block));
	2420	new_block->mr = mr;
	2421	new_block->resized = resized;
	2422	new_block->used_length = size;
	2423	new_block->max_length = max_size;
	2424	assert(max_size >= size);
	2425	new_block->fd = -1;
	2426	new_block->page_size = qemu_real_host_page_size;
	2427	new_block->host = host;
	2428	if (host) {
	2429	new_block->flags \|= RAM_PREALLOC;
	2430	}
	2431	if (resizeable) {
	2432	new_block->flags \|= RAM_RESIZEABLE;
	2433	}
	2434	ram_block_add(new_block, &local_err, share);
	2435	if (local_err) {
	2436	g_free(new_block);
	2437	error_propagate(errp, local_err);
	2438	return NULL;
	2439	}
	2440	return new_block;
	2441	}
	2442
	2443	RAMBlock qemu_ram_alloc_from_ptr(ram_addr_t size, void host,
	2444	MemoryRegion mr, Error *errp)
	2445	{
	2446	return qemu_ram_alloc_internal(size, size, NULL, host, false,
	2447	false, mr, errp);
	2448	}
	2449
	2450	RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
	2451	MemoryRegion mr, Error *errp)
	2452	{
	2453	return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
	2454	share, mr, errp);
	2455	}
	2456
	2457	RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
	2458	void (resized)(const char,
	2459	uint64_t length,
	2460	void *host),
	2461	MemoryRegion mr, Error *errp)
	2462	{
	2463	return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
	2464	false, mr, errp);
	2465	}
	2466
	2467	static void reclaim_ramblock(RAMBlock *block)
	2468	{
	2469	if (block->flags & RAM_PREALLOC) {
	2470	;
	2471	} else if (xen_enabled()) {
	2472	xen_invalidate_map_cache_entry(block->host);
	2473	#ifndef _WIN32
	2474	} else if (block->fd >= 0) {
	2475	qemu_ram_munmap(block->fd, block->host, block->max_length);
	2476	close(block->fd);
	2477	#endif
	2478	} else {
	2479	qemu_anon_ram_free(block->host, block->max_length);
	2480	}
	2481	g_free(block);
	2482	}
	2483
	2484	void qemu_ram_free(RAMBlock *block)
	2485	{
	2486	if (!block) {
	2487	return;
	2488	}
	2489
	2490	if (block->host) {
	2491	ram_block_notify_remove(block->host, block->max_length);
	2492	}
	2493
	2494	qemu_mutex_lock_ramlist();
	2495	QLIST_REMOVE_RCU(block, next);
	2496	ram_list.mru_block = NULL;
	2497	/* Write list before version */
	2498	smp_wmb();
	2499	ram_list.version++;
	2500	call_rcu(block, reclaim_ramblock, rcu);
	2501	qemu_mutex_unlock_ramlist();
	2502	}
	2503
	2504	#ifndef _WIN32
	2505	void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
	2506	{
	2507	RAMBlock *block;
	2508	ram_addr_t offset;
	2509	int flags;
	2510	void area, vaddr;
	2511
	2512	RAMBLOCK_FOREACH(block) {
	2513	offset = addr - block->offset;
	2514	if (offset < block->max_length) {
	2515	vaddr = ramblock_ptr(block, offset);
	2516	if (block->flags & RAM_PREALLOC) {
	2517	;
	2518	} else if (xen_enabled()) {
	2519	abort();
	2520	} else {
	2521	flags = MAP_FIXED;
	2522	if (block->fd >= 0) {
	2523	flags \|= (block->flags & RAM_SHARED ?
	2524	MAP_SHARED : MAP_PRIVATE);
	2525	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2526	flags, block->fd, offset);
	2527	} else {
	2528	/*
	2529	* Remap needs to match alloc. Accelerators that
	2530	* set phys_mem_alloc never remap. If they did,
	2531	* we'd need a remap hook here.
	2532	*/
	2533	assert(phys_mem_alloc == qemu_anon_ram_alloc);
	2534
	2535	flags \|= MAP_PRIVATE \| MAP_ANONYMOUS;
	2536	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2537	flags, -1, 0);
	2538	}
	2539	if (area != vaddr) {
	2540	error_report("Could not remap addr: "
	2541	RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
	2542	length, addr);
	2543	exit(1);
	2544	}
	2545	memory_try_enable_merging(vaddr, length);
	2546	qemu_ram_setup_dump(vaddr, length);
	2547	}
	2548	}
	2549	}
	2550	}
	2551	#endif /* !_WIN32 */
	2552
	2553	/* Return a host pointer to ram allocated with qemu_ram_alloc.
	2554	* This should not be used for general purpose DMA. Use address_space_map
	2555	* or address_space_rw instead. For local memory (e.g. video ram) that the
	2556	* device owns, use memory_region_get_ram_ptr.
	2557	*
	2558	* Called within RCU critical section.
	2559	*/
	2560	void qemu_map_ram_ptr(RAMBlock ram_block, ram_addr_t addr)
	2561	{
	2562	RAMBlock *block = ram_block;
	2563
	2564	if (block == NULL) {
	2565	block = qemu_get_ram_block(addr);
	2566	addr -= block->offset;
	2567	}
	2568
	2569	if (xen_enabled() && block->host == NULL) {
	2570	/* We need to check if the requested address is in the RAM
	2571	* because we don't want to map the entire memory in QEMU.
	2572	* In that case just map until the end of the page.
	2573	*/
	2574	if (block->offset == 0) {
	2575	return xen_map_cache(addr, 0, 0, false);
	2576	}
	2577
	2578	block->host = xen_map_cache(block->offset, block->max_length, 1, false);
	2579	}
	2580	return ramblock_ptr(block, addr);
	2581	}
	2582
	2583	/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
	2584	* but takes a size argument.
	2585	*
	2586	* Called within RCU critical section.
	2587	*/
	2588	static void qemu_ram_ptr_length(RAMBlock ram_block, ram_addr_t addr,
	2589	hwaddr *size, bool lock)
	2590	{
	2591	RAMBlock *block = ram_block;
	2592	if (*size == 0) {
	2593	return NULL;
	2594	}
	2595
	2596	if (block == NULL) {
	2597	block = qemu_get_ram_block(addr);
	2598	addr -= block->offset;
	2599	}
	2600	size = MIN(size, block->max_length - addr);
	2601
	2602	if (xen_enabled() && block->host == NULL) {
	2603	/* We need to check if the requested address is in the RAM
	2604	* because we don't want to map the entire memory in QEMU.
	2605	* In that case just map the requested area.
	2606	*/
	2607	if (block->offset == 0) {
	2608	return xen_map_cache(addr, *size, lock, lock);
	2609	}
	2610
	2611	block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
	2612	}
	2613
	2614	return ramblock_ptr(block, addr);
	2615	}
	2616
	2617	/* Return the offset of a hostpointer within a ramblock */
	2618	ram_addr_t qemu_ram_block_host_offset(RAMBlock rb, void host)
	2619	{
	2620	ram_addr_t res = (uint8_t )host - (uint8_t )rb->host;
	2621	assert((uintptr_t)host >= (uintptr_t)rb->host);
	2622	assert(res < rb->max_length);
	2623
	2624	return res;
	2625	}
	2626
	2627	/*
	2628	* Translates a host ptr back to a RAMBlock, a ram_addr and an offset
	2629	* in that RAMBlock.
	2630	*
	2631	* ptr: Host pointer to look up
	2632	* round_offset: If true round the result offset down to a page boundary
	2633	* *ram_addr: set to result ram_addr
	2634	* *offset: set to result offset within the RAMBlock
	2635	*
	2636	* Returns: RAMBlock (or NULL if not found)
	2637	*
	2638	* By the time this function returns, the returned pointer is not protected
	2639	* by RCU anymore. If the caller is not within an RCU critical section and
	2640	* does not hold the iothread lock, it must have other means of protecting the
	2641	* pointer, such as a reference to the region that includes the incoming
	2642	* ram_addr_t.
	2643	*/
	2644	RAMBlock qemu_ram_block_from_host(void ptr, bool round_offset,
	2645	ram_addr_t *offset)
	2646	{
	2647	RAMBlock *block;
	2648	uint8_t *host = ptr;
	2649
	2650	if (xen_enabled()) {
	2651	ram_addr_t ram_addr;
	2652	RCU_READ_LOCK_GUARD();
	2653	ram_addr = xen_ram_addr_from_mapcache(ptr);
	2654	block = qemu_get_ram_block(ram_addr);
	2655	if (block) {
	2656	*offset = ram_addr - block->offset;
	2657	}
	2658	return block;
	2659	}
	2660
	2661	RCU_READ_LOCK_GUARD();
	2662	block = atomic_rcu_read(&ram_list.mru_block);
	2663	if (block && block->host && host - block->host < block->max_length) {
	2664	goto found;
	2665	}
	2666
	2667	RAMBLOCK_FOREACH(block) {
	2668	/* This case append when the block is not mapped. */
	2669	if (block->host == NULL) {
	2670	continue;
	2671	}
	2672	if (host - block->host < block->max_length) {
	2673	goto found;
	2674	}
	2675	}
	2676
	2677	return NULL;
	2678
	2679	found:
	2680	*offset = (host - block->host);
	2681	if (round_offset) {
	2682	*offset &= TARGET_PAGE_MASK;
	2683	}
	2684	return block;
	2685	}
	2686
	2687	/*
	2688	* Finds the named RAMBlock
	2689	*
	2690	* name: The name of RAMBlock to find
	2691	*
	2692	* Returns: RAMBlock (or NULL if not found)
	2693	*/
	2694	RAMBlock qemu_ram_block_by_name(const char name)
	2695	{
	2696	RAMBlock *block;
	2697
	2698	RAMBLOCK_FOREACH(block) {
	2699	if (!strcmp(name, block->idstr)) {
	2700	return block;
	2701	}
	2702	}
	2703
	2704	return NULL;
	2705	}
	2706
	2707	/* Some of the softmmu routines need to translate from a host pointer
	2708	(typically a TLB entry) back to a ram offset. */
	2709	ram_addr_t qemu_ram_addr_from_host(void *ptr)
	2710	{
	2711	RAMBlock *block;
	2712	ram_addr_t offset;
	2713
	2714	block = qemu_ram_block_from_host(ptr, false, &offset);
	2715	if (!block) {
	2716	return RAM_ADDR_INVALID;
	2717	}
	2718
	2719	return block->offset + offset;
	2720	}
	2721
	2722	/* Generate a debug exception if a watchpoint has been hit. */
	2723	void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
	2724	MemTxAttrs attrs, int flags, uintptr_t ra)
	2725	{
	2726	CPUClass *cc = CPU_GET_CLASS(cpu);
	2727	CPUWatchpoint *wp;
	2728
	2729	assert(tcg_enabled());
	2730	if (cpu->watchpoint_hit) {
	2731	/*
	2732	* We re-entered the check after replacing the TB.
	2733	* Now raise the debug interrupt so that it will
	2734	* trigger after the current instruction.
	2735	*/
	2736	qemu_mutex_lock_iothread();
	2737	cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
	2738	qemu_mutex_unlock_iothread();
	2739	return;
	2740	}
	2741
	2742	addr = cc->adjust_watchpoint_address(cpu, addr, len);
	2743	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	2744	if (watchpoint_address_matches(wp, addr, len)
	2745	&& (wp->flags & flags)) {
	2746	if (flags == BP_MEM_READ) {
	2747	wp->flags \|= BP_WATCHPOINT_HIT_READ;
	2748	} else {
	2749	wp->flags \|= BP_WATCHPOINT_HIT_WRITE;
	2750	}
	2751	wp->hitaddr = MAX(addr, wp->vaddr);
	2752	wp->hitattrs = attrs;
	2753	if (!cpu->watchpoint_hit) {
	2754	if (wp->flags & BP_CPU &&
	2755	!cc->debug_check_watchpoint(cpu, wp)) {
	2756	wp->flags &= ~BP_WATCHPOINT_HIT;
	2757	continue;
	2758	}
	2759	cpu->watchpoint_hit = wp;
	2760
	2761	mmap_lock();
	2762	tb_check_watchpoint(cpu, ra);
	2763	if (wp->flags & BP_STOP_BEFORE_ACCESS) {
	2764	cpu->exception_index = EXCP_DEBUG;
	2765	mmap_unlock();
	2766	cpu_loop_exit_restore(cpu, ra);
	2767	} else {
	2768	/* Force execution of one insn next time. */
	2769	cpu->cflags_next_tb = 1 \| curr_cflags();
	2770	mmap_unlock();
	2771	if (ra) {
	2772	cpu_restore_state(cpu, ra, true);
	2773	}
	2774	cpu_loop_exit_noexc(cpu);
	2775	}
	2776	}
	2777	} else {
	2778	wp->flags &= ~BP_WATCHPOINT_HIT;
	2779	}
	2780	}
	2781	}
	2782
	2783	static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
	2784	MemTxAttrs attrs, uint8_t *buf, hwaddr len);
	2785	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	2786	const uint8_t *buf, hwaddr len);
	2787	static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
	2788	bool is_write, MemTxAttrs attrs);
	2789
	2790	static MemTxResult subpage_read(void opaque, hwaddr addr, uint64_t data,
	2791	unsigned len, MemTxAttrs attrs)
	2792	{
	2793	subpage_t *subpage = opaque;
	2794	uint8_t buf[8];
	2795	MemTxResult res;
	2796
	2797	#if defined(DEBUG_SUBPAGE)
	2798	printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
	2799	subpage, len, addr);
	2800	#endif
	2801	res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
	2802	if (res) {
	2803	return res;
	2804	}
	2805	*data = ldn_p(buf, len);
	2806	return MEMTX_OK;
	2807	}
	2808
	2809	static MemTxResult subpage_write(void *opaque, hwaddr addr,
	2810	uint64_t value, unsigned len, MemTxAttrs attrs)
	2811	{
	2812	subpage_t *subpage = opaque;
	2813	uint8_t buf[8];
	2814
	2815	#if defined(DEBUG_SUBPAGE)
	2816	printf("%s: subpage %p len %u addr " TARGET_FMT_plx
	2817	" value %"PRIx64"\n",
	2818	__func__, subpage, len, addr, value);
	2819	#endif
	2820	stn_p(buf, len, value);
	2821	return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
	2822	}
	2823
	2824	static bool subpage_accepts(void *opaque, hwaddr addr,
	2825	unsigned len, bool is_write,
	2826	MemTxAttrs attrs)
	2827	{
	2828	subpage_t *subpage = opaque;
	2829	#if defined(DEBUG_SUBPAGE)
	2830	printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
	2831	__func__, subpage, is_write ? 'w' : 'r', len, addr);
	2832	#endif
	2833
	2834	return flatview_access_valid(subpage->fv, addr + subpage->base,
	2835	len, is_write, attrs);
	2836	}
	2837
	2838	static const MemoryRegionOps subpage_ops = {
	2839	.read_with_attrs = subpage_read,
	2840	.write_with_attrs = subpage_write,
	2841	.impl.min_access_size = 1,
	2842	.impl.max_access_size = 8,
	2843	.valid.min_access_size = 1,
	2844	.valid.max_access_size = 8,
	2845	.valid.accepts = subpage_accepts,
	2846	.endianness = DEVICE_NATIVE_ENDIAN,
	2847	};
	2848
	2849	static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
	2850	uint16_t section)
	2851	{
	2852	int idx, eidx;
	2853
	2854	if (start >= TARGET_PAGE_SIZE \|\| end >= TARGET_PAGE_SIZE)
	2855	return -1;
	2856	idx = SUBPAGE_IDX(start);
	2857	eidx = SUBPAGE_IDX(end);
	2858	#if defined(DEBUG_SUBPAGE)
	2859	printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
	2860	__func__, mmio, start, end, idx, eidx, section);
	2861	#endif
	2862	for (; idx <= eidx; idx++) {
	2863	mmio->sub_section[idx] = section;
	2864	}
	2865
	2866	return 0;
	2867	}
	2868
	2869	static subpage_t subpage_init(FlatView fv, hwaddr base)
	2870	{
	2871	subpage_t *mmio;
	2872
	2873	/* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
	2874	mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
	2875	mmio->fv = fv;
	2876	mmio->base = base;
	2877	memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
	2878	NULL, TARGET_PAGE_SIZE);
	2879	mmio->iomem.subpage = true;
	2880	#if defined(DEBUG_SUBPAGE)
	2881	printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
	2882	mmio, base, TARGET_PAGE_SIZE);
	2883	#endif
	2884
	2885	return mmio;
	2886	}
	2887
	2888	static uint16_t dummy_section(PhysPageMap map, FlatView fv, MemoryRegion *mr)
	2889	{
	2890	assert(fv);
	2891	MemoryRegionSection section = {
	2892	.fv = fv,
	2893	.mr = mr,
	2894	.offset_within_address_space = 0,
	2895	.offset_within_region = 0,
	2896	.size = int128_2_64(),
	2897	};
	2898
	2899	return phys_section_add(map, &section);
	2900	}
	2901
	2902	MemoryRegionSection iotlb_to_section(CPUState cpu,
	2903	hwaddr index, MemTxAttrs attrs)
	2904	{
	2905	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	2906	CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
	2907	AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
	2908	MemoryRegionSection *sections = d->map.sections;
	2909
	2910	return &sections[index & ~TARGET_PAGE_MASK];
	2911	}
	2912
	2913	static void io_mem_init(void)
	2914	{
	2915	memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
	2916	NULL, UINT64_MAX);
	2917	}
	2918
	2919	AddressSpaceDispatch address_space_dispatch_new(FlatView fv)
	2920	{
	2921	AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
	2922	uint16_t n;
	2923
	2924	n = dummy_section(&d->map, fv, &io_mem_unassigned);
	2925	assert(n == PHYS_SECTION_UNASSIGNED);
	2926
	2927	d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
	2928
	2929	return d;
	2930	}
	2931
	2932	void address_space_dispatch_free(AddressSpaceDispatch *d)
	2933	{
	2934	phys_sections_free(&d->map);
	2935	g_free(d);
	2936	}
	2937
	2938	static void do_nothing(CPUState *cpu, run_on_cpu_data d)
	2939	{
	2940	}
	2941
	2942	static void tcg_log_global_after_sync(MemoryListener *listener)
	2943	{
	2944	CPUAddressSpace *cpuas;
	2945
	2946	/* Wait for the CPU to end the current TB. This avoids the following
	2947	* incorrect race:
	2948	*
	2949	* vCPU migration
	2950	* ---------------------- -------------------------
	2951	* TLB check -> slow path
	2952	* notdirty_mem_write
	2953	* write to RAM
	2954	* mark dirty
	2955	* clear dirty flag
	2956	* TLB check -> fast path
	2957	* read memory
	2958	* write to RAM
	2959	*
	2960	* by pushing the migration thread's memory read after the vCPU thread has
	2961	* written the memory.
	2962	*/
	2963	if (replay_mode == REPLAY_MODE_NONE) {
	2964	/*
	2965	* VGA can make calls to this function while updating the screen.
	2966	* In record/replay mode this causes a deadlock, because
	2967	* run_on_cpu waits for rr mutex. Therefore no races are possible
	2968	* in this case and no need for making run_on_cpu when
	2969	* record/replay is not enabled.
	2970	*/
	2971	cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
	2972	run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
	2973	}
	2974	}
	2975
	2976	static void tcg_commit(MemoryListener *listener)
	2977	{
	2978	CPUAddressSpace *cpuas;
	2979	AddressSpaceDispatch *d;
	2980
	2981	assert(tcg_enabled());
	2982	/* since each CPU stores ram addresses in its TLB cache, we must
	2983	reset the modified entries */
	2984	cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
	2985	cpu_reloading_memory_map();
	2986	/* The CPU and TLB are protected by the iothread lock.
	2987	* We reload the dispatch pointer now because cpu_reloading_memory_map()
	2988	* may have split the RCU critical section.
	2989	*/
	2990	d = address_space_to_dispatch(cpuas->as);
	2991	atomic_rcu_set(&cpuas->memory_dispatch, d);
	2992	tlb_flush(cpuas->cpu);
	2993	}
	2994
	2995	static void memory_map_init(void)
	2996	{
	2997	system_memory = g_malloc(sizeof(*system_memory));
	2998
	2999	memory_region_init(system_memory, NULL, "system", UINT64_MAX);
	3000	address_space_init(&address_space_memory, system_memory, "memory");
	3001
	3002	system_io = g_malloc(sizeof(*system_io));
	3003	memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
	3004	65536);
	3005	address_space_init(&address_space_io, system_io, "I/O");
	3006	}
	3007
	3008	MemoryRegion *get_system_memory(void)
	3009	{
	3010	return system_memory;
	3011	}
	3012
	3013	MemoryRegion *get_system_io(void)
	3014	{
	3015	return system_io;
	3016	}
	3017
	3018	#endif /* !defined(CONFIG_USER_ONLY) */
	3019
	3020	/* physical memory access (slow version, mainly for debug) */
	3021	#if defined(CONFIG_USER_ONLY)
	3022	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3023	uint8_t *buf, target_ulong len, int is_write)
	3024	{
	3025	int flags;
	3026	target_ulong l, page;
	3027	void * p;
	3028
	3029	while (len > 0) {
	3030	page = addr & TARGET_PAGE_MASK;
	3031	l = (page + TARGET_PAGE_SIZE) - addr;
	3032	if (l > len)
	3033	l = len;
	3034	flags = page_get_flags(page);
	3035	if (!(flags & PAGE_VALID))
	3036	return -1;
	3037	if (is_write) {
	3038	if (!(flags & PAGE_WRITE))
	3039	return -1;
	3040	/* XXX: this code should not depend on lock_user */
	3041	if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
	3042	return -1;
	3043	memcpy(p, buf, l);
	3044	unlock_user(p, addr, l);
	3045	} else {
	3046	if (!(flags & PAGE_READ))
	3047	return -1;
	3048	/* XXX: this code should not depend on lock_user */
	3049	if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
	3050	return -1;
	3051	memcpy(buf, p, l);
	3052	unlock_user(p, addr, 0);
	3053	}
	3054	len -= l;
	3055	buf += l;
	3056	addr += l;
	3057	}
	3058	return 0;
	3059	}
	3060
	3061	#else
	3062
	3063	static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
	3064	hwaddr length)
	3065	{
	3066	uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
	3067	addr += memory_region_get_ram_addr(mr);
	3068
	3069	/* No early return if dirty_log_mask is or becomes 0, because
	3070	* cpu_physical_memory_set_dirty_range will still call
	3071	* xen_modified_memory.
	3072	*/
	3073	if (dirty_log_mask) {
	3074	dirty_log_mask =
	3075	cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
	3076	}
	3077	if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
	3078	assert(tcg_enabled());
	3079	tb_invalidate_phys_range(addr, addr + length);
	3080	dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
	3081	}
	3082	cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
	3083	}
	3084
	3085	void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
	3086	{
	3087	/*
	3088	* In principle this function would work on other memory region types too,
	3089	* but the ROM device use case is the only one where this operation is
	3090	* necessary. Other memory regions should use the
	3091	* address_space_read/write() APIs.
	3092	*/
	3093	assert(memory_region_is_romd(mr));
	3094
	3095	invalidate_and_set_dirty(mr, addr, size);
	3096	}
	3097
	3098	static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
	3099	{
	3100	unsigned access_size_max = mr->ops->valid.max_access_size;
	3101
	3102	/* Regions are assumed to support 1-4 byte accesses unless
	3103	otherwise specified. */
	3104	if (access_size_max == 0) {
	3105	access_size_max = 4;
	3106	}
	3107
	3108	/* Bound the maximum access by the alignment of the address. */
	3109	if (!mr->ops->impl.unaligned) {
	3110	unsigned align_size_max = addr & -addr;
	3111	if (align_size_max != 0 && align_size_max < access_size_max) {
	3112	access_size_max = align_size_max;
	3113	}
	3114	}
	3115
	3116	/* Don't attempt accesses larger than the maximum. */
	3117	if (l > access_size_max) {
	3118	l = access_size_max;
	3119	}
	3120	l = pow2floor(l);
	3121
	3122	return l;
	3123	}
	3124
	3125	static bool prepare_mmio_access(MemoryRegion *mr)
	3126	{
	3127	bool unlocked = !qemu_mutex_iothread_locked();
	3128	bool release_lock = false;
	3129
	3130	if (unlocked && mr->global_locking) {
	3131	qemu_mutex_lock_iothread();
	3132	unlocked = false;
	3133	release_lock = true;
	3134	}
	3135	if (mr->flush_coalesced_mmio) {
	3136	if (unlocked) {
	3137	qemu_mutex_lock_iothread();
	3138	}
	3139	qemu_flush_coalesced_mmio_buffer();
	3140	if (unlocked) {
	3141	qemu_mutex_unlock_iothread();
	3142	}
	3143	}
	3144
	3145	return release_lock;
	3146	}
	3147
	3148	/* Called within RCU critical section. */
	3149	static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
	3150	MemTxAttrs attrs,
	3151	const uint8_t *buf,
	3152	hwaddr len, hwaddr addr1,
	3153	hwaddr l, MemoryRegion *mr)
	3154	{
	3155	uint8_t *ptr;
	3156	uint64_t val;
	3157	MemTxResult result = MEMTX_OK;
	3158	bool release_lock = false;
	3159
	3160	for (;;) {
	3161	if (!memory_access_is_direct(mr, true)) {
	3162	release_lock \|= prepare_mmio_access(mr);
	3163	l = memory_access_size(mr, l, addr1);
	3164	/* XXX: could force current_cpu to NULL to avoid
	3165	potential bugs */
	3166	val = ldn_he_p(buf, l);
	3167	result \|= memory_region_dispatch_write(mr, addr1, val,
	3168	size_memop(l), attrs);
	3169	} else {
	3170	/* RAM case */
	3171	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3172	memcpy(ptr, buf, l);
	3173	invalidate_and_set_dirty(mr, addr1, l);
	3174	}
	3175
	3176	if (release_lock) {
	3177	qemu_mutex_unlock_iothread();
	3178	release_lock = false;
	3179	}
	3180
	3181	len -= l;
	3182	buf += l;
	3183	addr += l;
	3184
	3185	if (!len) {
	3186	break;
	3187	}
	3188
	3189	l = len;
	3190	mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
	3191	}
	3192
	3193	return result;
	3194	}
	3195
	3196	/* Called from RCU critical section. */
	3197	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	3198	const uint8_t *buf, hwaddr len)
	3199	{
	3200	hwaddr l;
	3201	hwaddr addr1;
	3202	MemoryRegion *mr;
	3203	MemTxResult result = MEMTX_OK;
	3204
	3205	l = len;
	3206	mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
	3207	result = flatview_write_continue(fv, addr, attrs, buf, len,
	3208	addr1, l, mr);
	3209
	3210	return result;
	3211	}
	3212
	3213	/* Called within RCU critical section. */
	3214	MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
	3215	MemTxAttrs attrs, uint8_t *buf,
	3216	hwaddr len, hwaddr addr1, hwaddr l,
	3217	MemoryRegion *mr)
	3218	{
	3219	uint8_t *ptr;
	3220	uint64_t val;
	3221	MemTxResult result = MEMTX_OK;
	3222	bool release_lock = false;
	3223
	3224	for (;;) {
	3225	if (!memory_access_is_direct(mr, false)) {
	3226	/* I/O case */
	3227	release_lock \|= prepare_mmio_access(mr);
	3228	l = memory_access_size(mr, l, addr1);
	3229	result \|= memory_region_dispatch_read(mr, addr1, &val,
	3230	size_memop(l), attrs);
	3231	stn_he_p(buf, l, val);
	3232	} else {
	3233	/* RAM case */
	3234	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3235	memcpy(buf, ptr, l);
	3236	}
	3237
	3238	if (release_lock) {
	3239	qemu_mutex_unlock_iothread();
	3240	release_lock = false;
	3241	}
	3242
	3243	len -= l;
	3244	buf += l;
	3245	addr += l;
	3246
	3247	if (!len) {
	3248	break;
	3249	}
	3250
	3251	l = len;
	3252	mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
	3253	}
	3254
	3255	return result;
	3256	}
	3257
	3258	/* Called from RCU critical section. */
	3259	static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
	3260	MemTxAttrs attrs, uint8_t *buf, hwaddr len)
	3261	{
	3262	hwaddr l;
	3263	hwaddr addr1;
	3264	MemoryRegion *mr;
	3265
	3266	l = len;
	3267	mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
	3268	return flatview_read_continue(fv, addr, attrs, buf, len,
	3269	addr1, l, mr);
	3270	}
	3271
	3272	MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
	3273	MemTxAttrs attrs, uint8_t *buf, hwaddr len)
	3274	{
	3275	MemTxResult result = MEMTX_OK;
	3276	FlatView *fv;
	3277
	3278	if (len > 0) {
	3279	RCU_READ_LOCK_GUARD();
	3280	fv = address_space_to_flatview(as);
	3281	result = flatview_read(fv, addr, attrs, buf, len);
	3282	}
	3283
	3284	return result;
	3285	}
	3286
	3287	MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
	3288	MemTxAttrs attrs,
	3289	const uint8_t *buf, hwaddr len)
	3290	{
	3291	MemTxResult result = MEMTX_OK;
	3292	FlatView *fv;
	3293
	3294	if (len > 0) {
	3295	RCU_READ_LOCK_GUARD();
	3296	fv = address_space_to_flatview(as);
	3297	result = flatview_write(fv, addr, attrs, buf, len);
	3298	}
	3299
	3300	return result;
	3301	}
	3302
	3303	MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
	3304	uint8_t *buf, hwaddr len, bool is_write)
	3305	{
	3306	if (is_write) {
	3307	return address_space_write(as, addr, attrs, buf, len);
	3308	} else {
	3309	return address_space_read_full(as, addr, attrs, buf, len);
	3310	}
	3311	}
	3312
	3313	void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
	3314	hwaddr len, int is_write)
	3315	{
	3316	address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
	3317	buf, len, is_write);
	3318	}
	3319
	3320	enum write_rom_type {
	3321	WRITE_DATA,
	3322	FLUSH_CACHE,
	3323	};
	3324
	3325	static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
	3326	hwaddr addr,
	3327	MemTxAttrs attrs,
	3328	const uint8_t *buf,
	3329	hwaddr len,
	3330	enum write_rom_type type)
	3331	{
	3332	hwaddr l;
	3333	uint8_t *ptr;
	3334	hwaddr addr1;
	3335	MemoryRegion *mr;
	3336
	3337	RCU_READ_LOCK_GUARD();
	3338	while (len > 0) {
	3339	l = len;
	3340	mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
	3341
	3342	if (!(memory_region_is_ram(mr) \|\|
	3343	memory_region_is_romd(mr))) {
	3344	l = memory_access_size(mr, l, addr1);
	3345	} else {
	3346	/* ROM/RAM case */
	3347	ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
	3348	switch (type) {
	3349	case WRITE_DATA:
	3350	memcpy(ptr, buf, l);
	3351	invalidate_and_set_dirty(mr, addr1, l);
	3352	break;
	3353	case FLUSH_CACHE:
	3354	flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
	3355	break;
	3356	}
	3357	}
	3358	len -= l;
	3359	buf += l;
	3360	addr += l;
	3361	}
	3362	return MEMTX_OK;
	3363	}
	3364
	3365	/* used for ROM loading : can write in RAM and ROM */
	3366	MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
	3367	MemTxAttrs attrs,
	3368	const uint8_t *buf, hwaddr len)
	3369	{
	3370	return address_space_write_rom_internal(as, addr, attrs,
	3371	buf, len, WRITE_DATA);
	3372	}
	3373
	3374	void cpu_flush_icache_range(hwaddr start, hwaddr len)
	3375	{
	3376	/*
	3377	* This function should do the same thing as an icache flush that was
	3378	* triggered from within the guest. For TCG we are always cache coherent,
	3379	* so there is no need to flush anything. For KVM / Xen we need to flush
	3380	* the host's instruction cache at least.
	3381	*/
	3382	if (tcg_enabled()) {
	3383	return;
	3384	}
	3385
	3386	address_space_write_rom_internal(&address_space_memory,
	3387	start, MEMTXATTRS_UNSPECIFIED,
	3388	NULL, len, FLUSH_CACHE);
	3389	}
	3390
	/*
	 * Temporary buffer handed out by address_space_map() when the target
	 * region cannot be accessed directly.  address_space_map() claims it by
	 * atomically setting in_use and address_space_unmap() releases it.
	 */
	3391	typedef struct {
	3392	MemoryRegion *mr; /* region referenced while the buffer is mapped */
	3393	void *buffer; /* host memory returned to the mapper */
	3394	hwaddr addr; /* address that was passed to address_space_map() */
	3395	hwaddr len; /* number of bytes covered by the buffer */
	3396	bool in_use; /* accessed with atomic operations */
	3397	} BounceBuffer;
	3398
	/* The only BounceBuffer instance used in this file. */
	3399	static BounceBuffer bounce;
	3400
	/*
	 * A registered waiter: its bottom half is scheduled by
	 * cpu_notify_map_clients_locked() and the entry is then freed.
	 */
	3401	typedef struct MapClient {
	3402	QEMUBH *bh;
	3403	QLIST_ENTRY(MapClient) link;
	3404	} MapClient;
	3405
	/* Held around every access to map_client_list; set up in cpu_exec_init_all(). */
	3406	QemuMutex map_client_list_lock;
	3407	static QLIST_HEAD(, MapClient) map_client_list
	3408	= QLIST_HEAD_INITIALIZER(map_client_list);
	3409
	3410	static void cpu_unregister_map_client_do(MapClient *client)
	3411	{
	3412	QLIST_REMOVE(client, link);
	3413	g_free(client);
	3414	}
	3415
	3416	static void cpu_notify_map_clients_locked(void)
	3417	{
	3418	MapClient *client;
	3419
	3420	while (!QLIST_EMPTY(&map_client_list)) {
	3421	client = QLIST_FIRST(&map_client_list);
	3422	qemu_bh_schedule(client->bh);
	3423	cpu_unregister_map_client_do(client);
	3424	}
	3425	}
	3426
	3427	void cpu_register_map_client(QEMUBH *bh)
	3428	{
	3429	MapClient client = g_malloc(sizeof(client));
	3430
	3431	qemu_mutex_lock(&map_client_list_lock);
	3432	client->bh = bh;
	3433	QLIST_INSERT_HEAD(&map_client_list, client, link);
	3434	if (!atomic_read(&bounce.in_use)) {
	3435	cpu_notify_map_clients_locked();
	3436	}
	3437	qemu_mutex_unlock(&map_client_list_lock);
	3438	}
	3439
	3440	void cpu_exec_init_all(void)
	3441	{
	3442	qemu_mutex_init(&ram_list.mutex);
	3443	/* The data structures we set up here depend on knowing the page size,
	3444	* so no more changes can be made after this point.
	3445	* In an ideal world, nothing we did before we had finished the
	3446	* machine setup would care about the target page size, and we could
	3447	* do this much later, rather than requiring board models to state
	3448	* up front what their requirements are.
	3449	*/
	3450	finalize_target_page_bits();
	3451	io_mem_init();
	3452	memory_map_init();
	3453	qemu_mutex_init(&map_client_list_lock);
	3454	}
	3455
	3456	void cpu_unregister_map_client(QEMUBH *bh)
	3457	{
	3458	MapClient *client;
	3459
	3460	qemu_mutex_lock(&map_client_list_lock);
	3461	QLIST_FOREACH(client, &map_client_list, link) {
	3462	if (client->bh == bh) {
	3463	cpu_unregister_map_client_do(client);
	3464	break;
	3465	}
	3466	}
	3467	qemu_mutex_unlock(&map_client_list_lock);
	3468	}
	3469
	3470	static void cpu_notify_map_clients(void)
	3471	{
	3472	qemu_mutex_lock(&map_client_list_lock);
	3473	cpu_notify_map_clients_locked();
	3474	qemu_mutex_unlock(&map_client_list_lock);
	3475	}
	3476
	3477	static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
	3478	bool is_write, MemTxAttrs attrs)
	3479	{
	3480	MemoryRegion *mr;
	3481	hwaddr l, xlat;
	3482
	3483	while (len > 0) {
	3484	l = len;
	3485	mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
	3486	if (!memory_access_is_direct(mr, is_write)) {
	3487	l = memory_access_size(mr, l, addr);
	3488	if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
	3489	return false;
	3490	}
	3491	}
	3492
	3493	len -= l;
	3494	addr += l;
	3495	}
	3496	return true;
	3497	}
	3498
	3499	bool address_space_access_valid(AddressSpace *as, hwaddr addr,
	3500	hwaddr len, bool is_write,
	3501	MemTxAttrs attrs)
	3502	{
	3503	FlatView *fv;
	3504	bool result;
	3505
	3506	RCU_READ_LOCK_GUARD();
	3507	fv = address_space_to_flatview(as);
	3508	result = flatview_access_valid(fv, addr, len, is_write, attrs);
	3509	return result;
	3510	}
	3511
	3512	static hwaddr
	3513	flatview_extend_translation(FlatView *fv, hwaddr addr,
	3514	hwaddr target_len,
	3515	MemoryRegion *mr, hwaddr base, hwaddr len,
	3516	bool is_write, MemTxAttrs attrs)
	3517	{
	3518	hwaddr done = 0;
	3519	hwaddr xlat;
	3520	MemoryRegion *this_mr;
	3521
	3522	for (;;) {
	3523	target_len -= len;
	3524	addr += len;
	3525	done += len;
	3526	if (target_len == 0) {
	3527	return done;
	3528	}
	3529
	3530	len = target_len;
	3531	this_mr = flatview_translate(fv, addr, &xlat,
	3532	&len, is_write, attrs);
	3533	if (this_mr != mr \|\| xlat != base + done) {
	3534	return done;
	3535	}
	3536	}
	3537	}
	3538
	3539	/* Map a physical memory region into a host virtual address.
	3540	* May map a subset of the requested range, given by and returned in *plen.
	3541	* May return NULL if resources needed to perform the mapping are exhausted.
	3542	* Use only for reads OR writes - not for read-modify-write operations.
	3543	* Use cpu_register_map_client() to know when retrying the map operation is
	3544	* likely to succeed.
	3545	*/
	3546	void address_space_map(AddressSpace as,
	3547	hwaddr addr,
	3548	hwaddr *plen,
	3549	bool is_write,
	3550	MemTxAttrs attrs)
	3551	{
	3552	hwaddr len = *plen;
	3553	hwaddr l, xlat;
	3554	MemoryRegion *mr;
	3555	void *ptr;
	3556	FlatView *fv;
	3557
	3558	if (len == 0) {
	3559	return NULL;
	3560	}
	3561
	3562	l = len;
	3563	RCU_READ_LOCK_GUARD();
	3564	fv = address_space_to_flatview(as);
	3565	mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
	3566
	3567	if (!memory_access_is_direct(mr, is_write)) {
	3568	if (atomic_xchg(&bounce.in_use, true)) {
	3569	return NULL;
	3570	}
	3571	/* Avoid unbounded allocations */
	3572	l = MIN(l, TARGET_PAGE_SIZE);
	3573	bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
	3574	bounce.addr = addr;
	3575	bounce.len = l;
	3576
	3577	memory_region_ref(mr);
	3578	bounce.mr = mr;
	3579	if (!is_write) {
	3580	flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
	3581	bounce.buffer, l);
	3582	}
	3583
	3584	*plen = l;
	3585	return bounce.buffer;
	3586	}
	3587
	3588
	3589	memory_region_ref(mr);
	3590	*plen = flatview_extend_translation(fv, addr, len, mr, xlat,
	3591	l, is_write, attrs);
	3592	ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
	3593
	3594	return ptr;
	3595	}
	3596
	3597	/* Unmaps a memory region previously mapped by address_space_map().
	3598	* Will also mark the memory as dirty if is_write == 1. access_len gives
	3599	* the amount of memory that was actually read or written by the caller.
	3600	*/
	3601	void address_space_unmap(AddressSpace as, void buffer, hwaddr len,
	3602	int is_write, hwaddr access_len)
	3603	{
	3604	if (buffer != bounce.buffer) {
	3605	MemoryRegion *mr;
	3606	ram_addr_t addr1;
	3607
	3608	mr = memory_region_from_host(buffer, &addr1);
	3609	assert(mr != NULL);
	3610	if (is_write) {
	3611	invalidate_and_set_dirty(mr, addr1, access_len);
	3612	}
	3613	if (xen_enabled()) {
	3614	xen_invalidate_map_cache_entry(buffer);
	3615	}
	3616	memory_region_unref(mr);
	3617	return;
	3618	}
	3619	if (is_write) {
	3620	address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
	3621	bounce.buffer, access_len);
	3622	}
	3623	qemu_vfree(bounce.buffer);
	3624	bounce.buffer = NULL;
	3625	memory_region_unref(bounce.mr);
	3626	atomic_mb_set(&bounce.in_use, false);
	3627	cpu_notify_map_clients();
	3628	}
	3629
	3630	void *cpu_physical_memory_map(hwaddr addr,
	3631	hwaddr *plen,
	3632	int is_write)
	3633	{
	3634	return address_space_map(&address_space_memory, addr, plen, is_write,
	3635	MEMTXATTRS_UNSPECIFIED);
	3636	}
	3637
	3638	void cpu_physical_memory_unmap(void *buffer, hwaddr len,
	3639	int is_write, hwaddr access_len)
	3640	{
	3641	return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
	3642	}
	3643
	/*
	 * Parameters for the inclusion of memory_ldst.inc.c below (its contents
	 * are not visible from here): the first argument is an AddressSpace named
	 * "as", SUFFIX is empty, TRANSLATE() goes through
	 * address_space_translate() and the RCU hooks take and drop the RCU read
	 * lock.
	 * NOTE(review): the same macros are defined again further down, so the
	 * included file presumably #undefs them - confirm.
	 */
	3644	#define ARG1_DECL AddressSpace *as
	3645	#define ARG1 as
	3646	#define SUFFIX
	3647	#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
	3648	#define RCU_READ_LOCK(...) rcu_read_lock()
	3649	#define RCU_READ_UNLOCK(...) rcu_read_unlock()
	3650	#include "memory_ldst.inc.c"
	3651
	3652	int64_t address_space_cache_init(MemoryRegionCache *cache,
	3653	AddressSpace *as,
	3654	hwaddr addr,
	3655	hwaddr len,
	3656	bool is_write)
	3657	{
	3658	AddressSpaceDispatch *d;
	3659	hwaddr l;
	3660	MemoryRegion *mr;
	3661
	3662	assert(len > 0);
	3663
	3664	l = len;
	3665	cache->fv = address_space_get_flatview(as);
	3666	d = flatview_to_dispatch(cache->fv);
	3667	cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
	3668
	3669	mr = cache->mrs.mr;
	3670	memory_region_ref(mr);
	3671	if (memory_access_is_direct(mr, is_write)) {
	3672	/* We don't care about the memory attributes here as we're only
	3673	* doing this if we found actual RAM, which behaves the same
	3674	* regardless of attributes; so UNSPECIFIED is fine.
	3675	*/
	3676	l = flatview_extend_translation(cache->fv, addr, len, mr,
	3677	cache->xlat, l, is_write,
	3678	MEMTXATTRS_UNSPECIFIED);
	3679	cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
	3680	} else {
	3681	cache->ptr = NULL;
	3682	}
	3683
	3684	cache->len = l;
	3685	cache->is_write = is_write;
	3686	return l;
	3687	}
	3688
	3689	void address_space_cache_invalidate(MemoryRegionCache *cache,
	3690	hwaddr addr,
	3691	hwaddr access_len)
	3692	{
	3693	assert(cache->is_write);
	3694	if (likely(cache->ptr)) {
	3695	invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
	3696	}
	3697	}
	3698
	3699	void address_space_cache_destroy(MemoryRegionCache *cache)
	3700	{
	3701	if (!cache->mrs.mr) {
	3702	return;
	3703	}
	3704
	3705	if (xen_enabled()) {
	3706	xen_invalidate_map_cache_entry(cache->ptr);
	3707	}
	3708	memory_region_unref(cache->mrs.mr);
	3709	flatview_unref(cache->fv);
	3710	cache->mrs.mr = NULL;
	3711	cache->fv = NULL;
	3712	}
	3713
	3714	/* Called from RCU critical section. This function has the same
	3715	* semantics as address_space_translate, but it only works on a
	3716	* predefined range of a MemoryRegion that was mapped with
	3717	* address_space_cache_init.
	3718	*/
	3719	static inline MemoryRegion *address_space_translate_cached(
	3720	MemoryRegionCache cache, hwaddr addr, hwaddr xlat,
	3721	hwaddr *plen, bool is_write, MemTxAttrs attrs)
	3722	{
	3723	MemoryRegionSection section;
	3724	MemoryRegion *mr;
	3725	IOMMUMemoryRegion *iommu_mr;
	3726	AddressSpace *target_as;
	3727
	3728	assert(!cache->ptr);
	3729	*xlat = addr + cache->xlat;
	3730
	3731	mr = cache->mrs.mr;
	3732	iommu_mr = memory_region_get_iommu(mr);
	3733	if (!iommu_mr) {
	3734	/* MMIO region. */
	3735	return mr;
	3736	}
	3737
	3738	section = address_space_translate_iommu(iommu_mr, xlat, plen,
	3739	NULL, is_write, true,
	3740	&target_as, attrs);
	3741	return section.mr;
	3742	}
	3743
	3744	/* Called from RCU critical section. address_space_read_cached uses this
	3745	* out of line function when the target is an MMIO or IOMMU region.
	3746	*/
	3747	void
	3748	address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
	3749	void *buf, hwaddr len)
	3750	{
	3751	hwaddr addr1, l;
	3752	MemoryRegion *mr;
	3753
	3754	l = len;
	3755	mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
	3756	MEMTXATTRS_UNSPECIFIED);
	3757	flatview_read_continue(cache->fv,
	3758	addr, MEMTXATTRS_UNSPECIFIED, buf, len,
	3759	addr1, l, mr);
	3760	}
	3761
	3762	/* Called from RCU critical section. address_space_write_cached uses this
	3763	* out of line function when the target is an MMIO or IOMMU region.
	3764	*/
	3765	void
	3766	address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
	3767	const void *buf, hwaddr len)
	3768	{
	3769	hwaddr addr1, l;
	3770	MemoryRegion *mr;
	3771
	3772	l = len;
	3773	mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
	3774	MEMTXATTRS_UNSPECIFIED);
	3775	flatview_write_continue(cache->fv,
	3776	addr, MEMTXATTRS_UNSPECIFIED, buf, len,
	3777	addr1, l, mr);
	3778	}
	3779
	/*
	 * Second inclusion of memory_ldst.inc.c (contents not visible from here),
	 * this time parameterised for a MemoryRegionCache named "cache": names
	 * get the _cached_slow suffix, TRANSLATE() goes through
	 * address_space_translate_cached() and the RCU hooks expand to nothing.
	 */
	3780	#define ARG1_DECL MemoryRegionCache *cache
	3781	#define ARG1 cache
	3782	#define SUFFIX _cached_slow
	3783	#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__)
	3784	#define RCU_READ_LOCK() ((void)0)
	3785	#define RCU_READ_UNLOCK() ((void)0)
	3786	#include "memory_ldst.inc.c"
	3787
	3788	/* virtual memory access for debug (includes writing to ROM) */
	3789	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3790	uint8_t *buf, target_ulong len, int is_write)
	3791	{
	3792	hwaddr phys_addr;
	3793	target_ulong l, page;
	3794
	3795	cpu_synchronize_state(cpu);
	3796	while (len > 0) {
	3797	int asidx;
	3798	MemTxAttrs attrs;
	3799
	3800	page = addr & TARGET_PAGE_MASK;
	3801	phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
	3802	asidx = cpu_asidx_from_attrs(cpu, attrs);
	3803	/* if no physical page mapped, return an error */
	3804	if (phys_addr == -1)
	3805	return -1;
	3806	l = (page + TARGET_PAGE_SIZE) - addr;
	3807	if (l > len)
	3808	l = len;
	3809	phys_addr += (addr & ~TARGET_PAGE_MASK);
	3810	if (is_write) {
	3811	address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
	3812	attrs, buf, l);
	3813	} else {
	3814	address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
	3815	attrs, buf, l, 0);
	3816	}
	3817	len -= l;
	3818	buf += l;
	3819	addr += l;
	3820	}
	3821	return 0;
	3822	}
	3823
	3824	/*
	3825	* Allows code that needs to deal with migration bitmaps etc to still be built
	3826	* target independent.
	3827	*/
	3828	size_t qemu_target_page_size(void)
	3829	{
	3830	return TARGET_PAGE_SIZE;
	3831	}
	3832
	3833	int qemu_target_page_bits(void)
	3834	{
	3835	return TARGET_PAGE_BITS;
	3836	}
	3837
	3838	int qemu_target_page_bits_min(void)
	3839	{
	3840	return TARGET_PAGE_BITS_MIN;
	3841	}
	3842	#endif
	3843
	3844	bool target_words_bigendian(void)
	3845	{
	3846	#if defined(TARGET_WORDS_BIGENDIAN)
	3847	return true;
	3848	#else
	3849	return false;
	3850	#endif
	3851	}
	3852
	3853	#ifndef CONFIG_USER_ONLY
	3854	bool cpu_physical_memory_is_io(hwaddr phys_addr)
	3855	{
	3856	MemoryRegion*mr;
	3857	hwaddr l = 1;
	3858	bool res;
	3859
	3860	RCU_READ_LOCK_GUARD();
	3861	mr = address_space_translate(&address_space_memory,
	3862	phys_addr, &phys_addr, &l, false,
	3863	MEMTXATTRS_UNSPECIFIED);
	3864
	3865	res = !(memory_region_is_ram(mr) \|\| memory_region_is_romd(mr));
	3866	return res;
	3867	}
	3868
	3869	int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
	3870	{
	3871	RAMBlock *block;
	3872	int ret = 0;
	3873
	3874	RCU_READ_LOCK_GUARD();
	3875	RAMBLOCK_FOREACH(block) {
	3876	ret = func(block, opaque);
	3877	if (ret) {
	3878	break;
	3879	}
	3880	}
	3881	return ret;
	3882	}
	3883
	3884	/*
	3885	* Unmap pages of memory from start to start+length such that
	3886	* they a) read as 0, b) Trigger whatever fault mechanism
	3887	* the OS provides for postcopy.
	3888	* The pages must be unmapped by the end of the function.
	3889	* Returns: 0 on success, none-0 on failure
	3890	*
	3891	*/
	3892	int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
	3893	{
	3894	int ret = -1;
	3895
	3896	uint8_t *host_startaddr = rb->host + start;
	3897
	3898	if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
	3899	error_report("ram_block_discard_range: Unaligned start address: %p",
	3900	host_startaddr);
	3901	goto err;
	3902	}
	3903
	3904	if ((start + length) <= rb->used_length) {
	3905	bool need_madvise, need_fallocate;
	3906	uint8_t *host_endaddr = host_startaddr + length;
	3907	if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
	3908	error_report("ram_block_discard_range: Unaligned end address: %p",
	3909	host_endaddr);
	3910	goto err;
	3911	}
	3912
	3913	errno = ENOTSUP; /* If we are missing MADVISE etc */
	3914
	3915	/* The logic here is messy;
	3916	* madvise DONTNEED fails for hugepages
	3917	* fallocate works on hugepages and shmem
	3918	*/
	3919	need_madvise = (rb->page_size == qemu_host_page_size);
	3920	need_fallocate = rb->fd != -1;
	3921	if (need_fallocate) {
	3922	/* For a file, this causes the area of the file to be zero'd
	3923	* if read, and for hugetlbfs also causes it to be unmapped
	3924	* so a userfault will trigger.
	3925	*/
	3926	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	3927	ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
	3928	start, length);
	3929	if (ret) {
	3930	ret = -errno;
	3931	error_report("ram_block_discard_range: Failed to fallocate "
	3932	"%s:%" PRIx64 " +%zx (%d)",
	3933	rb->idstr, start, length, ret);
	3934	goto err;
	3935	}
	3936	#else
	3937	ret = -ENOSYS;
	3938	error_report("ram_block_discard_range: fallocate not available/file"
	3939	"%s:%" PRIx64 " +%zx (%d)",
	3940	rb->idstr, start, length, ret);
	3941	goto err;
	3942	#endif
	3943	}
	3944	if (need_madvise) {
	3945	/* For normal RAM this causes it to be unmapped,
	3946	* for shared memory it causes the local mapping to disappear
	3947	* and to fall back on the file contents (which we just
	3948	* fallocate'd away).
	3949	*/
	3950	#if defined(CONFIG_MADVISE)
	3951	ret = madvise(host_startaddr, length, MADV_DONTNEED);
	3952	if (ret) {
	3953	ret = -errno;
	3954	error_report("ram_block_discard_range: Failed to discard range "
	3955	"%s:%" PRIx64 " +%zx (%d)",
	3956	rb->idstr, start, length, ret);
	3957	goto err;
	3958	}
	3959	#else
	3960	ret = -ENOSYS;
	3961	error_report("ram_block_discard_range: MADVISE not available"
	3962	"%s:%" PRIx64 " +%zx (%d)",
	3963	rb->idstr, start, length, ret);
	3964	goto err;
	3965	#endif
	3966	}
	3967	trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
	3968	need_madvise, need_fallocate, ret);
	3969	} else {
	3970	error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
	3971	"/%zx/" RAM_ADDR_FMT")",
	3972	rb->idstr, start, length, rb->used_length);
	3973	}
	3974
	3975	err:
	3976	return ret;
	3977	}
	3978
	3979	bool ramblock_is_pmem(RAMBlock *rb)
	3980	{
	3981	return rb->flags & RAM_PMEM;
	3982	}
	3983
	3984	#endif
	3985
	3986	void page_size_init(void)
	3987	{
	3988	/* NOTE: we can always suppose that qemu_host_page_size >=
	3989	TARGET_PAGE_SIZE */
	3990	if (qemu_host_page_size == 0) {
	3991	qemu_host_page_size = qemu_real_host_page_size;
	3992	}
	3993	if (qemu_host_page_size < TARGET_PAGE_SIZE) {
	3994	qemu_host_page_size = TARGET_PAGE_SIZE;
	3995	}
	3996	qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
	3997	}
	3998
	3999	#if !defined(CONFIG_USER_ONLY)
	4000
	4001	static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
	4002	{
	4003	if (start == end - 1) {
	4004	qemu_printf("\t%3d ", start);
	4005	} else {
	4006	qemu_printf("\t%3d..%-3d ", start, end - 1);
	4007	}
	4008	qemu_printf(" skip=%d ", skip);
	4009	if (ptr == PHYS_MAP_NODE_NIL) {
	4010	qemu_printf(" ptr=NIL");
	4011	} else if (!skip) {
	4012	qemu_printf(" ptr=#%d", ptr);
	4013	} else {
	4014	qemu_printf(" ptr=[%d]", ptr);
	4015	}
	4016	qemu_printf("\n");
	4017	}
	4018
	/*
	 * For a non-zero Int128 @size: (size - 1) converted to hwaddr; for a zero
	 * @size: 0.  Used below as the offset from a section's start to its end.
	 */
	4019	#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
	4020	int128_sub((size), int128_one())) : 0)
	4021
	4022	void mtree_print_dispatch(AddressSpaceDispatch d, MemoryRegion root)
	4023	{
	4024	int i;
	4025
	4026	qemu_printf(" Dispatch\n");
	4027	qemu_printf(" Physical sections\n");
	4028
	4029	for (i = 0; i < d->map.sections_nb; ++i) {
	4030	MemoryRegionSection *s = d->map.sections + i;
	4031	const char *names[] = { " [unassigned]", " [not dirty]",
	4032	" [ROM]", " [watch]" };
	4033
	4034	qemu_printf(" #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx
	4035	" %s%s%s%s%s",
	4036	i,
	4037	s->offset_within_address_space,
	4038	s->offset_within_address_space + MR_SIZE(s->mr->size),
	4039	s->mr->name ? s->mr->name : "(noname)",
	4040	i < ARRAY_SIZE(names) ? names[i] : "",
	4041	s->mr == root ? " [ROOT]" : "",
	4042	s == d->mru_section ? " [MRU]" : "",
	4043	s->mr->is_iommu ? " [iommu]" : "");
	4044
	4045	if (s->mr->alias) {
	4046	qemu_printf(" alias=%s", s->mr->alias->name ?
	4047	s->mr->alias->name : "noname");
	4048	}
	4049	qemu_printf("\n");
	4050	}
	4051
	4052	qemu_printf(" Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
	4053	P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
	4054	for (i = 0; i < d->map.nodes_nb; ++i) {
	4055	int j, jprev;
	4056	PhysPageEntry prev;
	4057	Node *n = d->map.nodes + i;
	4058
	4059	qemu_printf(" [%d]\n", i);
	4060
	4061	for (j = 0, jprev = 0, prev = n[0]; j < ARRAY_SIZE(n); ++j) {
	4062	PhysPageEntry pe = n + j;
	4063
	4064	if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
	4065	continue;
	4066	}
	4067
	4068	mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
	4069
	4070	jprev = j;
	4071	prev = *pe;
	4072	}
	4073
	4074	if (jprev != ARRAY_SIZE(*n)) {
	4075	mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
	4076	}
	4077	}
	4078	}
	4079
	4080	#endif