Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Virtual page mapping
	3	*
	4	* Copyright (c) 2003 Fabrice Bellard
	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
	18	*/
	19	#include "qemu/osdep.h"
	20	#include "qapi/error.h"
	21
	22	#include "qemu/cutils.h"
	23	#include "cpu.h"
	24	#include "exec/exec-all.h"
	25	#include "exec/target_page.h"
	26	#include "tcg.h"
	27	#include "hw/qdev-core.h"
	28	#include "hw/qdev-properties.h"
	29	#if !defined(CONFIG_USER_ONLY)
	30	#include "hw/boards.h"
	31	#include "hw/xen/xen.h"
	32	#endif
	33	#include "sysemu/kvm.h"
	34	#include "sysemu/sysemu.h"
	35	#include "qemu/timer.h"
	36	#include "qemu/config-file.h"
	37	#include "qemu/error-report.h"
	38	#if defined(CONFIG_USER_ONLY)
	39	#include "qemu.h"
	40	#else /* !CONFIG_USER_ONLY */
	41	#include "hw/hw.h"
	42	#include "exec/memory.h"
	43	#include "exec/ioport.h"
	44	#include "sysemu/dma.h"
	45	#include "sysemu/numa.h"
	46	#include "sysemu/hw_accel.h"
	47	#include "exec/address-spaces.h"
	48	#include "sysemu/xen-mapcache.h"
	49	#include "trace-root.h"
	50
	51	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	52	#include <linux/falloc.h>
	53	#endif
	54
	55	#endif
	56	#include "qemu/rcu_queue.h"
	57	#include "qemu/main-loop.h"
	58	#include "translate-all.h"
	59	#include "sysemu/replay.h"
	60
	61	#include "exec/memory-internal.h"
	62	#include "exec/ram_addr.h"
	63	#include "exec/log.h"
	64
	65	#include "migration/vmstate.h"
	66
	67	#include "qemu/range.h"
	68	#ifndef _WIN32
	69	#include "qemu/mmap-alloc.h"
	70	#endif
	71
	72	#include "monitor/monitor.h"
	73
	74	//#define DEBUG_SUBPAGE
	75
	76	#if !defined(CONFIG_USER_ONLY)
	77	/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
	78	* are protected by the ramlist lock.
	79	*/
	80	RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
	81
	/* Root memory regions. cpu_exec_initfn() uses system_memory as the
	 * default value of a CPU's "memory" link.
	 * NOTE(review): both pointers are assigned outside this part of the file.
	 */
	82	static MemoryRegion *system_memory;
	83	static MemoryRegion *system_io;
	84
	/* Global address spaces.
	 * NOTE(review): presumably built on the root regions above - confirm at
	 * their initialisation site, which is not visible here.
	 */
	85	AddressSpace address_space_io;
	86	AddressSpace address_space_memory;
	87
	/* Special-purpose regions. io_mem_unassigned is the region reported by the
	 * translation helpers when an IOMMU denies an access;
	 * memory_region_is_unassigned() tests against io_mem_rom and io_mem_notdirty.
	 */
	88	MemoryRegion io_mem_rom, io_mem_notdirty;
	89	static MemoryRegion io_mem_unassigned;
	/* RAM block flag bits; each macro is a distinct single bit (bits 0..4).
	 * NOTE(review): presumably OR-ed into a RAMBlock flags word - the users
	 * are not in this part of the file.
	 */
	91	/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
	92	#define RAM_PREALLOC (1 << 0)
	93
	94	/* RAM is mmap-ed with MAP_SHARED */
	95	#define RAM_SHARED (1 << 1)
	96
	97	/* Only a portion of RAM (used_length) is actually used, and migrated.
	98	* This used_length size can change across reboots.
	99	*/
	100	#define RAM_RESIZEABLE (1 << 2)
	101
	102	/* UFFDIO_ZEROPAGE is available on this RAMBlock to atomically
	103	* zero the page and wake waiting processes.
	104	* (Set during postcopy)
	105	*/
	106	#define RAM_UF_ZEROPAGE (1 << 3)
	107
	108	/* RAM can be migrated */
	109	#define RAM_MIGRATABLE (1 << 4)
	110	#endif
	111
	/* Only targets with a variable page size carry this state:
	 * target_page_bits is the page size in bits (0 = not chosen yet) and
	 * target_page_bits_decided is set once it may no longer shrink; see
	 * set_preferred_target_page_bits() and finalize_target_page_bits().
	 */
	112	#ifdef TARGET_PAGE_BITS_VARY
	113	int target_page_bits;
	114	bool target_page_bits_decided;
	115	#endif
	116
	/* Global CPU list, statically initialised empty; walked by CPU_FOREACH()
	 * in qemu_get_cpu().
	 */
	117	struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
	118	/* current CPU in the current thread. It is only valid inside
	119	cpu_exec() */
	120	__thread CPUState *current_cpu;
	121	/* 0 = Do not count executed instructions.
	122	1 = Precise instruction counting.
	123	2 = Adaptive rate instruction counting. */
	124	int use_icount;
	125
	/* Host page size and mask.
	 * NOTE(review): assigned outside this part of the file; the mask being
	 * signed suggests it is the negated size - confirm at the init site.
	 */
	126	uintptr_t qemu_host_page_size;
	127	intptr_t qemu_host_page_mask;
	128
	/*
	 * Record a CPU's preferred target page size, given as a bit count.
	 *
	 * @bits: requested page size in bits; must be >= TARGET_PAGE_BITS_MIN on
	 *        targets built with TARGET_PAGE_BITS_VARY.
	 *
	 * Returns true if the request is compatible with the current setting,
	 * false if it would shrink the page size after it has been finalised.
	 * Without TARGET_PAGE_BITS_VARY this is a no-op that returns true.
	 */
	bool set_preferred_target_page_bits(int bits)
	{
	    /* The target page size is the lowest common denominator for all
	     * the CPUs in the system, so we can only make it smaller, never
	     * larger. And we can't make it smaller once we've committed to
	     * a particular size.
	     */
	#ifdef TARGET_PAGE_BITS_VARY
	    assert(bits >= TARGET_PAGE_BITS_MIN);
	    if (target_page_bits == 0 || target_page_bits > bits) {
	        if (target_page_bits_decided) {
	            return false;
	        }
	        target_page_bits = bits;
	    }
	#endif
	    return true;
	}
	147
	148	#if !defined(CONFIG_USER_ONLY)
	149
	/*
	 * Commit to a target page size: fall back to TARGET_PAGE_BITS_MIN when no
	 * CPU expressed a preference, then mark the size as decided so that
	 * set_preferred_target_page_bits() refuses to shrink it afterwards.
	 * Compiles to nothing unless TARGET_PAGE_BITS_VARY is defined.
	 */
	static void finalize_target_page_bits(void)
	{
	#ifdef TARGET_PAGE_BITS_VARY
	    if (!target_page_bits) {
	        target_page_bits = TARGET_PAGE_BITS_MIN;
	    }
	    target_page_bits_decided = true;
	#endif
	}
	159
	/* One 32-bit slot of the physical page map: either a leaf naming a section
	 * (skip == 0) or a link to a lower-level node (skip != 0).
	 */
	160	typedef struct PhysPageEntry PhysPageEntry;
	161
	162	struct PhysPageEntry {
	163	/* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
	164	uint32_t skip : 6;
	165	/* index into phys_sections (!skip) or phys_map_nodes (skip) */
	166	uint32_t ptr : 26;
	167	};
	168
	/* All-ones value of the 26-bit ptr field (32 bits minus the 6 skip bits);
	 * used as the "no node allocated here" marker.
	 */
	169	#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
	170
	171	/* Size of the L2 (and L3, etc) page tables. */
	172	#define ADDR_SPACE_BITS 64
	173
	/* Each level of the map decodes P_L2_BITS address bits, so a node holds
	 * P_L2_SIZE entries.
	 */
	174	#define P_L2_BITS 9
	175	#define P_L2_SIZE (1 << P_L2_BITS)
	176
	/* Levels needed to decode the (ADDR_SPACE_BITS - TARGET_PAGE_BITS) page
	 * index bits at P_L2_BITS per level, rounded up.
	 */
	177	#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
	178
	/* One node (one level's table) of the map. */
	179	typedef PhysPageEntry Node[P_L2_SIZE];
	180
	/* Backing storage for a physical page map: a growable array of nodes and
	 * an array of sections.
	 * nodes_nb is the number of nodes handed out by phys_map_node_alloc() and
	 * nodes_nb_alloc the capacity grown by phys_map_node_reserve().
	 * NOTE(review): sections_nb / sections_nb_alloc presumably play the same
	 * roles for the sections array - their users are not visible here.
	 */
	181	typedef struct PhysPageMap {
	182	struct rcu_head rcu;
	183
	184	unsigned sections_nb;
	185	unsigned sections_nb_alloc;
	186	unsigned nodes_nb;
	187	unsigned nodes_nb_alloc;
	188	Node *nodes;
	189	MemoryRegionSection *sections;
	190	} PhysPageMap;
	191
	/* Lookup state used to turn an address into a MemoryRegionSection.
	 * mru_section caches the section returned by the last lookup; it is read
	 * and written with atomic_read()/atomic_set() in
	 * address_space_lookup_region(). phys_map is the root entry of the
	 * multi-level map whose nodes and sections live in map.
	 */
	192	struct AddressSpaceDispatch {
	193	MemoryRegionSection *mru_section;
	194	/* This is a multi-level map on the physical address space.
	195	* The bottom level has pointers to MemoryRegionSections.
	196	*/
	197	PhysPageEntry phys_map;
	198	PhysPageMap map;
	199	};
	200
	/* Offset of addr within its target page. */
	201	#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
	/* A page that is split between several sections. sub_section[] is indexed
	 * by SUBPAGE_IDX(addr) and holds an index into the dispatch's sections
	 * array (see address_space_lookup_region()); iomem is the region that
	 * stands in for the page in the map.
	 * NOTE(review): the uses of fv and base are not visible in this part of
	 * the file.
	 */
	202	typedef struct subpage_t {
	203	MemoryRegion iomem;
	204	FlatView *fv;
	205	hwaddr base;
	206	uint16_t sub_section[];
	207	} subpage_t;
	208
	/* Fixed indices into a dispatch's sections array. PHYS_SECTION_UNASSIGNED
	 * is what the lookup helpers return for unmapped addresses and what fresh
	 * leaf nodes are filled with.
	 * NOTE(review): the other three presumably pair with io_mem_notdirty,
	 * io_mem_rom and io_mem_watch - confirm where the sections are registered.
	 */
	209	#define PHYS_SECTION_UNASSIGNED 0
	210	#define PHYS_SECTION_NOTDIRTY 1
	211	#define PHYS_SECTION_ROM 2
	212	#define PHYS_SECTION_WATCH 3
	213
	/* Forward declarations; the definitions are further down the file. */
	214	static void io_mem_init(void);
	215	static void memory_map_init(void);
	216	static void tcg_commit(MemoryListener *listener);
	217
	/* Region tested for in memory_region_is_unassigned(). */
	218	static MemoryRegion io_mem_watch;
	219
	220	/**
	221	* CPUAddressSpace: all the information a CPU needs about an AddressSpace
	222	* @cpu: the CPU whose AddressSpace this is
	223	* @as: the AddressSpace itself
	224	* @memory_dispatch: its dispatch pointer (cached, RCU protected)
	225	* @tcg_as_listener: listener for tracking changes to the AddressSpace
	*
	* cpu_address_space_init() fills in @cpu and @as and, when TCG is enabled,
	* registers @tcg_as_listener with tcg_commit as its commit hook.
	* @memory_dispatch is read with atomic_rcu_read() in
	* address_space_translate_for_iotlb().
	226	*/
	227	struct CPUAddressSpace {
	228	CPUState *cpu;
	229	AddressSpace *as;
	230	struct AddressSpaceDispatch *memory_dispatch;
	231	MemoryListener tcg_as_listener;
	232	};
	233
	/* A [start, end) style pair of RAM addresses followed by a bitmap stored
	 * as a flexible array of unsigned long words.
	 * NOTE(review): presumably one bit per page in the range, and whether end
	 * is inclusive is not established here - the code that fills and reads it
	 * is not in this part of the file.
	 */
	234	struct DirtyBitmapSnapshot {
	235	ram_addr_t start;
	236	ram_addr_t end;
	237	unsigned long dirty[];
	238	};
	239
	240	#endif
	241
	242	#if !defined(CONFIG_USER_ONLY)
	243
	244	static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
	245	{
	246	static unsigned alloc_hint = 16;
	247	if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
	248	map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
	249	map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
	250	map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
	251	alloc_hint = map->nodes_nb_alloc;
	252	}
	253	}
	254
	255	static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
	256	{
	257	unsigned i;
	258	uint32_t ret;
	259	PhysPageEntry e;
	260	PhysPageEntry *p;
	261
	262	ret = map->nodes_nb++;
	263	p = map->nodes[ret];
	264	assert(ret != PHYS_MAP_NODE_NIL);
	265	assert(ret != map->nodes_nb_alloc);
	266
	267	e.skip = leaf ? 0 : 1;
	268	e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
	269	for (i = 0; i < P_L2_SIZE; ++i) {
	270	memcpy(&p[i], &e, sizeof(e));
	271	}
	272	return ret;
	273	}
	274
	275	static void phys_page_set_level(PhysPageMap map, PhysPageEntry lp,
	276	hwaddr index, hwaddr nb, uint16_t leaf,
	277	int level)
	278	{
	279	PhysPageEntry *p;
	280	hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
	281
	282	if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
	283	lp->ptr = phys_map_node_alloc(map, level == 0);
	284	}
	285	p = map->nodes[lp->ptr];
	286	lp = &p[(index >> (level P_L2_BITS)) & (P_L2_SIZE - 1)];
	287
	288	while (*nb && lp < &p[P_L2_SIZE]) {
	289	if ((index & (step - 1)) == 0 && nb >= step) {
	290	lp->skip = 0;
	291	lp->ptr = leaf;
	292	*index += step;
	293	*nb -= step;
	294	} else {
	295	phys_page_set_level(map, lp, index, nb, leaf, level - 1);
	296	}
	297	++lp;
	298	}
	299	}
	300
	301	static void phys_page_set(AddressSpaceDispatch *d,
	302	hwaddr index, hwaddr nb,
	303	uint16_t leaf)
	304	{
	305	/* Wildly overreserve - it doesn't matter much. */
	306	phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
	307
	308	phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
	309	}
	310
	311	/* Compact a non leaf page entry. Simply detect that the entry has a single child,
	312	* and update our entry so we can skip it and go directly to the destination.
	313	*/
	314	static void phys_page_compact(PhysPageEntry lp, Node nodes)
	315	{
	316	unsigned valid_ptr = P_L2_SIZE;
	317	int valid = 0;
	318	PhysPageEntry *p;
	319	int i;
	320
	321	if (lp->ptr == PHYS_MAP_NODE_NIL) {
	322	return;
	323	}
	324
	325	p = nodes[lp->ptr];
	326	for (i = 0; i < P_L2_SIZE; i++) {
	327	if (p[i].ptr == PHYS_MAP_NODE_NIL) {
	328	continue;
	329	}
	330
	331	valid_ptr = i;
	332	valid++;
	333	if (p[i].skip) {
	334	phys_page_compact(&p[i], nodes);
	335	}
	336	}
	337
	338	/* We can only compress if there's only one child. */
	339	if (valid != 1) {
	340	return;
	341	}
	342
	343	assert(valid_ptr < P_L2_SIZE);
	344
	345	/* Don't compress if it won't fit in the # of bits we have. */
	346	if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
	347	return;
	348	}
	349
	350	lp->ptr = p[valid_ptr].ptr;
	351	if (!p[valid_ptr].skip) {
	352	/* If our only child is a leaf, make this a leaf. */
	353	/* By design, we should have made this node a leaf to begin with so we
	354	* should never reach here.
	355	* But since it's so simple to handle this, let's do it just in case we
	356	* change this rule.
	357	*/
	358	lp->skip = 0;
	359	} else {
	360	lp->skip += p[valid_ptr].skip;
	361	}
	362	}
	363
	364	void address_space_dispatch_compact(AddressSpaceDispatch *d)
	365	{
	366	if (d->phys_map.skip) {
	367	phys_page_compact(&d->phys_map, d->map.nodes);
	368	}
	369	}
	370
	371	static inline bool section_covers_addr(const MemoryRegionSection *section,
	372	hwaddr addr)
	373	{
	374	/* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
	375	* the section must cover the entire address space.
	376	*/
	377	return int128_gethi(section->size) \|\|
	378	range_covers_byte(section->offset_within_address_space,
	379	int128_getlo(section->size), addr);
	380	}
	381
	382	static MemoryRegionSection phys_page_find(AddressSpaceDispatch d, hwaddr addr)
	383	{
	384	PhysPageEntry lp = d->phys_map, *p;
	385	Node *nodes = d->map.nodes;
	386	MemoryRegionSection *sections = d->map.sections;
	387	hwaddr index = addr >> TARGET_PAGE_BITS;
	388	int i;
	389
	390	for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
	391	if (lp.ptr == PHYS_MAP_NODE_NIL) {
	392	return &sections[PHYS_SECTION_UNASSIGNED];
	393	}
	394	p = nodes[lp.ptr];
	395	lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
	396	}
	397
	398	if (section_covers_addr(&sections[lp.ptr], addr)) {
	399	return &sections[lp.ptr];
	400	} else {
	401	return &sections[PHYS_SECTION_UNASSIGNED];
	402	}
	403	}
	404
	405	bool memory_region_is_unassigned(MemoryRegion *mr)
	406	{
	407	return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
	408	&& mr != &io_mem_watch;
	409	}
	410
	411	/* Called from RCU critical section */
	412	static MemoryRegionSection address_space_lookup_region(AddressSpaceDispatch d,
	413	hwaddr addr,
	414	bool resolve_subpage)
	415	{
	416	MemoryRegionSection *section = atomic_read(&d->mru_section);
	417	subpage_t *subpage;
	418
	419	if (!section \|\| section == &d->map.sections[PHYS_SECTION_UNASSIGNED] \|\|
	420	!section_covers_addr(section, addr)) {
	421	section = phys_page_find(d, addr);
	422	atomic_set(&d->mru_section, section);
	423	}
	424	if (resolve_subpage && section->mr->subpage) {
	425	subpage = container_of(section->mr, subpage_t, iomem);
	426	section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
	427	}
	428	return section;
	429	}
	430
	431	/* Called from RCU critical section */
	432	static MemoryRegionSection *
	433	address_space_translate_internal(AddressSpaceDispatch d, hwaddr addr, hwaddr xlat,
	434	hwaddr *plen, bool resolve_subpage)
	435	{
	436	MemoryRegionSection *section;
	437	MemoryRegion *mr;
	438	Int128 diff;
	439
	440	section = address_space_lookup_region(d, addr, resolve_subpage);
	441	/* Compute offset within MemoryRegionSection */
	442	addr -= section->offset_within_address_space;
	443
	444	/* Compute offset within MemoryRegion */
	445	*xlat = addr + section->offset_within_region;
	446
	447	mr = section->mr;
	448
	449	/* MMIO registers can be expected to perform full-width accesses based only
	450	* on their address, without considering adjacent registers that could
	451	* decode to completely different MemoryRegions. When such registers
	452	* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
	453	* regions overlap wildly. For this reason we cannot clamp the accesses
	454	* here.
	455	*
	456	* If the length is small (as is the case for address_space_ldl/stl),
	457	* everything works fine. If the incoming length is large, however,
	458	* the caller really has to do the clamping through memory_access_size.
	459	*/
	460	if (memory_region_is_ram(mr)) {
	461	diff = int128_sub(section->size, int128_make64(addr));
	462	plen = int128_get64(int128_min(diff, int128_make64(plen)));
	463	}
	464	return section;
	465	}
	466
	467	/**
	468	* address_space_translate_iommu - translate an address through an IOMMU
	469	* memory region and then through the target address space.
	470	*
	471	* @iommu_mr: the IOMMU memory region that we start the translation from
	472	* @addr: the address to be translated through the MMU
	473	* @xlat: the translated address offset within the destination memory region.
	474	* It cannot be %NULL.
	475	* @plen_out: valid read/write length of the translated address. It
	476	* cannot be %NULL.
	477	* @page_mask_out: page mask for the translated address. This
	478	* should only be meaningful for IOMMU translated
	479	* addresses, since there may be huge pages that this bit
	480	* would tell. It can be %NULL if we don't care about it.
	481	* @is_write: whether the translation operation is for write
	482	* @is_mmio: whether this can be MMIO, set true if it can
	483	* @target_as: the address space targeted by the IOMMU
	484	* @attrs: transaction attributes
	485	*
	486	* This function is called from RCU critical section. It is the common
	487	* part of flatview_do_translate and address_space_translate_cached.
	488	*/
	489	static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
	490	hwaddr *xlat,
	491	hwaddr *plen_out,
	492	hwaddr *page_mask_out,
	493	bool is_write,
	494	bool is_mmio,
	495	AddressSpace **target_as,
	496	MemTxAttrs attrs)
	497	{
	498	MemoryRegionSection *section;
	499	hwaddr page_mask = (hwaddr)-1;
	500
	501	do {
	502	hwaddr addr = *xlat;
	503	IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	504	int iommu_idx = 0;
	505	IOMMUTLBEntry iotlb;
	506
	507	if (imrc->attrs_to_index) {
	508	iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
	509	}
	510
	511	iotlb = imrc->translate(iommu_mr, addr, is_write ?
	512	IOMMU_WO : IOMMU_RO, iommu_idx);
	513
	514	if (!(iotlb.perm & (1 << is_write))) {
	515	goto unassigned;
	516	}
	517
	518	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	519	\| (addr & iotlb.addr_mask));
	520	page_mask &= iotlb.addr_mask;
	521	plen_out = MIN(plen_out, (addr \| iotlb.addr_mask) - addr + 1);
	522	*target_as = iotlb.target_as;
	523
	524	section = address_space_translate_internal(
	525	address_space_to_dispatch(iotlb.target_as), addr, xlat,
	526	plen_out, is_mmio);
	527
	528	iommu_mr = memory_region_get_iommu(section->mr);
	529	} while (unlikely(iommu_mr));
	530
	531	if (page_mask_out) {
	532	*page_mask_out = page_mask;
	533	}
	534	return *section;
	535
	536	unassigned:
	537	return (MemoryRegionSection) { .mr = &io_mem_unassigned };
	538	}
	539
	540	/**
	541	* flatview_do_translate - translate an address in FlatView
	542	*
	543	* @fv: the flat view that we want to translate on
	544	* @addr: the address to be translated in above address space
	545	* @xlat: the translated address offset within memory region. It
	546	* cannot be @NULL.
	547	* @plen_out: valid read/write length of the translated address. It
	548	* can be @NULL when we don't care about it.
	549	* @page_mask_out: page mask for the translated address. This
	550	* should only be meaningful for IOMMU translated
	551	* addresses, since there may be huge pages that this bit
	552	* would tell. It can be @NULL if we don't care about it.
	553	* @is_write: whether the translation operation is for write
	554	* @is_mmio: whether this can be MMIO, set true if it can
	555	* @target_as: the address space targeted by the IOMMU
	556	* @attrs: memory transaction attributes
	557	*
	558	* This function is called from RCU critical section
	559	*/
	560	static MemoryRegionSection flatview_do_translate(FlatView *fv,
	561	hwaddr addr,
	562	hwaddr *xlat,
	563	hwaddr *plen_out,
	564	hwaddr *page_mask_out,
	565	bool is_write,
	566	bool is_mmio,
	567	AddressSpace **target_as,
	568	MemTxAttrs attrs)
	569	{
	570	MemoryRegionSection *section;
	571	IOMMUMemoryRegion *iommu_mr;
	572	hwaddr plen = (hwaddr)(-1);
	573
	574	if (!plen_out) {
	575	plen_out = &plen;
	576	}
	577
	578	section = address_space_translate_internal(
	579	flatview_to_dispatch(fv), addr, xlat,
	580	plen_out, is_mmio);
	581
	582	iommu_mr = memory_region_get_iommu(section->mr);
	583	if (unlikely(iommu_mr)) {
	584	return address_space_translate_iommu(iommu_mr, xlat,
	585	plen_out, page_mask_out,
	586	is_write, is_mmio,
	587	target_as, attrs);
	588	}
	589	if (page_mask_out) {
	590	/* Not behind an IOMMU, use default page size. */
	591	*page_mask_out = ~TARGET_PAGE_MASK;
	592	}
	593
	594	return *section;
	595	}
	596
	597	/* Called from RCU critical section */
	598	IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
	599	bool is_write, MemTxAttrs attrs)
	600	{
	601	MemoryRegionSection section;
	602	hwaddr xlat, page_mask;
	603
	604	/*
	605	* This can never be MMIO, and we don't really care about plen,
	606	* but page mask.
	607	*/
	608	section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
	609	NULL, &page_mask, is_write, false, &as,
	610	attrs);
	611
	612	/* Illegal translation */
	613	if (section.mr == &io_mem_unassigned) {
	614	goto iotlb_fail;
	615	}
	616
	617	/* Convert memory region offset into address space offset */
	618	xlat += section.offset_within_address_space -
	619	section.offset_within_region;
	620
	621	return (IOMMUTLBEntry) {
	622	.target_as = as,
	623	.iova = addr & ~page_mask,
	624	.translated_addr = xlat & ~page_mask,
	625	.addr_mask = page_mask,
	626	/* IOTLBs are for DMAs, and DMA only allows on RAMs. */
	627	.perm = IOMMU_RW,
	628	};
	629
	630	iotlb_fail:
	631	return (IOMMUTLBEntry) {0};
	632	}
	633
	634	/* Called from RCU critical section */
	635	MemoryRegion flatview_translate(FlatView fv, hwaddr addr, hwaddr *xlat,
	636	hwaddr *plen, bool is_write,
	637	MemTxAttrs attrs)
	638	{
	639	MemoryRegion *mr;
	640	MemoryRegionSection section;
	641	AddressSpace *as = NULL;
	642
	643	/* This can be MMIO, so setup MMIO bit. */
	644	section = flatview_do_translate(fv, addr, xlat, plen, NULL,
	645	is_write, true, &as, attrs);
	646	mr = section.mr;
	647
	648	if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
	649	hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
	650	plen = MIN(page, plen);
	651	}
	652
	653	return mr;
	654	}
	655
	/* Per-CPU record of an IOMMU notifier registered on behalf of TCG.
	 * Entries live in cpu->iommu_notifiers and are keyed by (mr, iommu_idx);
	 * see tcg_register_iommu_notifier(). active is set when the CPU starts
	 * relying on the IOMMU and cleared by tcg_iommu_unmap_notify() after it
	 * has flushed the CPU's TLB.
	 */
	656	typedef struct TCGIOMMUNotifier {
	657	IOMMUNotifier n;
	658	MemoryRegion *mr;
	659	CPUState *cpu;
	660	int iommu_idx;
	661	bool active;
	662	} TCGIOMMUNotifier;
	663
	664	static void tcg_iommu_unmap_notify(IOMMUNotifier n, IOMMUTLBEntry iotlb)
	665	{
	666	TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
	667
	668	if (!notifier->active) {
	669	return;
	670	}
	671	tlb_flush(notifier->cpu);
	672	notifier->active = false;
	673	/* We leave the notifier struct on the list to avoid reallocating it later.
	674	* Generally the number of IOMMUs a CPU deals with will be small.
	675	* In any case we can't unregister the iommu notifier from a notify
	676	* callback.
	677	*/
	678	}
	679
	680	static void tcg_register_iommu_notifier(CPUState *cpu,
	681	IOMMUMemoryRegion *iommu_mr,
	682	int iommu_idx)
	683	{
	684	/* Make sure this CPU has an IOMMU notifier registered for this
	685	* IOMMU/IOMMU index combination, so that we can flush its TLB
	686	* when the IOMMU tells us the mappings we've cached have changed.
	687	*/
	688	MemoryRegion *mr = MEMORY_REGION(iommu_mr);
	689	TCGIOMMUNotifier *notifier;
	690	int i;
	691
	692	for (i = 0; i < cpu->iommu_notifiers->len; i++) {
	693	notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i);
	694	if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
	695	break;
	696	}
	697	}
	698	if (i == cpu->iommu_notifiers->len) {
	699	/* Not found, add a new entry at the end of the array */
	700	cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
	701	notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i);
	702
	703	notifier->mr = mr;
	704	notifier->iommu_idx = iommu_idx;
	705	notifier->cpu = cpu;
	706	/* Rather than trying to register interest in the specific part
	707	* of the iommu's address space that we've accessed and then
	708	* expand it later as subsequent accesses touch more of it, we
	709	* just register interest in the whole thing, on the assumption
	710	* that iommu reconfiguration will be rare.
	711	*/
	712	iommu_notifier_init(&notifier->n,
	713	tcg_iommu_unmap_notify,
	714	IOMMU_NOTIFIER_UNMAP,
	715	0,
	716	HWADDR_MAX,
	717	iommu_idx);
	718	memory_region_register_iommu_notifier(notifier->mr, &notifier->n);
	719	}
	720
	721	if (!notifier->active) {
	722	notifier->active = true;
	723	}
	724	}
	725
	726	static void tcg_iommu_free_notifier_list(CPUState *cpu)
	727	{
	728	/* Destroy the CPU's notifier list */
	729	int i;
	730	TCGIOMMUNotifier *notifier;
	731
	732	for (i = 0; i < cpu->iommu_notifiers->len; i++) {
	733	notifier = &g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier, i);
	734	memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
	735	}
	736	g_array_free(cpu->iommu_notifiers, true);
	737	}
	738
	739	/* Called from RCU critical section */
	740	MemoryRegionSection *
	741	address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
	742	hwaddr xlat, hwaddr plen,
	743	MemTxAttrs attrs, int *prot)
	744	{
	745	MemoryRegionSection *section;
	746	IOMMUMemoryRegion *iommu_mr;
	747	IOMMUMemoryRegionClass *imrc;
	748	IOMMUTLBEntry iotlb;
	749	int iommu_idx;
	750	AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
	751
	752	for (;;) {
	753	section = address_space_translate_internal(d, addr, &addr, plen, false);
	754
	755	iommu_mr = memory_region_get_iommu(section->mr);
	756	if (!iommu_mr) {
	757	break;
	758	}
	759
	760	imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	761
	762	iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
	763	tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
	764	/* We need all the permissions, so pass IOMMU_NONE so the IOMMU
	765	* doesn't short-cut its translation table walk.
	766	*/
	767	iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
	768	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	769	\| (addr & iotlb.addr_mask));
	770	/* Update the caller's prot bits to remove permissions the IOMMU
	771	* is giving us a failure response for. If we get down to no
	772	* permissions left at all we can give up now.
	773	*/
	774	if (!(iotlb.perm & IOMMU_RO)) {
	775	*prot &= ~(PAGE_READ \| PAGE_EXEC);
	776	}
	777	if (!(iotlb.perm & IOMMU_WO)) {
	778	*prot &= ~PAGE_WRITE;
	779	}
	780
	781	if (!*prot) {
	782	goto translate_fail;
	783	}
	784
	785	d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
	786	}
	787
	788	assert(!memory_region_is_iommu(section->mr));
	789	*xlat = addr;
	790	return section;
	791
	792	translate_fail:
	793	return &d->map.sections[PHYS_SECTION_UNASSIGNED];
	794	}
	795	#endif
	796
	797	#if !defined(CONFIG_USER_ONLY)
	798
	799	static int cpu_common_post_load(void *opaque, int version_id)
	800	{
	801	CPUState *cpu = opaque;
	802
	803	/* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
	804	version_id is increased. */
	805	cpu->interrupt_request &= ~0x01;
	806	tlb_flush(cpu);
	807
	808	/* loadvm has just updated the content of RAM, bypassing the
	809	* usual mechanisms that ensure we flush TBs for writes to
	810	* memory we've translated code from. So we must flush all TBs,
	811	* which will now be stale.
	812	*/
	813	tb_flush(cpu);
	814
	815	return 0;
	816	}
	817
	818	static int cpu_common_pre_load(void *opaque)
	819	{
	820	CPUState *cpu = opaque;
	821
	822	cpu->exception_index = -1;
	823
	824	return 0;
	825	}
	826
	827	static bool cpu_common_exception_index_needed(void *opaque)
	828	{
	829	CPUState *cpu = opaque;
	830
	831	return tcg_enabled() && cpu->exception_index != -1;
	832	}
	833
	/* Optional subsection of vmstate_cpu_common carrying the 32-bit
	 * CPUState.exception_index; gated by cpu_common_exception_index_needed().
	 */
	834	static const VMStateDescription vmstate_cpu_common_exception_index = {
	835	.name = "cpu_common/exception_index",
	836	.version_id = 1,
	837	.minimum_version_id = 1,
	838	.needed = cpu_common_exception_index_needed,
	839	.fields = (VMStateField[]) {
	840	VMSTATE_INT32(exception_index, CPUState),
	841	VMSTATE_END_OF_LIST()
	842	}
	843	};
	844
	845	static bool cpu_common_crash_occurred_needed(void *opaque)
	846	{
	847	CPUState *cpu = opaque;
	848
	849	return cpu->crash_occurred;
	850	}
	851
	/* Optional subsection of vmstate_cpu_common carrying the boolean
	 * CPUState.crash_occurred; gated by cpu_common_crash_occurred_needed().
	 */
	852	static const VMStateDescription vmstate_cpu_common_crash_occurred = {
	853	.name = "cpu_common/crash_occurred",
	854	.version_id = 1,
	855	.minimum_version_id = 1,
	856	.needed = cpu_common_crash_occurred_needed,
	857	.fields = (VMStateField[]) {
	858	VMSTATE_BOOL(crash_occurred, CPUState),
	859	VMSTATE_END_OF_LIST()
	860	}
	861	};
	862
	/* Migration description of the target-independent CPU state: the halted
	 * and interrupt_request fields, plus the two optional subsections above.
	 * cpu_exec_realizefn() registers it for CPUs whose device has no vmsd of
	 * its own, and cpu_exec_unrealizefn() unregisters it.
	 */
	863	const VMStateDescription vmstate_cpu_common = {
	864	.name = "cpu_common",
	865	.version_id = 1,
	866	.minimum_version_id = 1,
	867	.pre_load = cpu_common_pre_load,
	868	.post_load = cpu_common_post_load,
	869	.fields = (VMStateField[]) {
	870	VMSTATE_UINT32(halted, CPUState),
	871	VMSTATE_UINT32(interrupt_request, CPUState),
	872	VMSTATE_END_OF_LIST()
	873	},
	874	.subsections = (const VMStateDescription*[]) {
	875	&vmstate_cpu_common_exception_index,
	876	&vmstate_cpu_common_crash_occurred,
	877	NULL
	878	}
	879	};
	880
	881	#endif
	882
	883	CPUState *qemu_get_cpu(int index)
	884	{
	885	CPUState *cpu;
	886
	887	CPU_FOREACH(cpu) {
	888	if (cpu->cpu_index == index) {
	889	return cpu;
	890	}
	891	}
	892
	893	return NULL;
	894	}
	895
	896	#if !defined(CONFIG_USER_ONLY)
	897	void cpu_address_space_init(CPUState *cpu, int asidx,
	898	const char prefix, MemoryRegion mr)
	899	{
	900	CPUAddressSpace *newas;
	901	AddressSpace *as = g_new0(AddressSpace, 1);
	902	char *as_name;
	903
	904	assert(mr);
	905	as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
	906	address_space_init(as, mr, as_name);
	907	g_free(as_name);
	908
	909	/* Target code should have set num_ases before calling us */
	910	assert(asidx < cpu->num_ases);
	911
	912	if (asidx == 0) {
	913	/* address space 0 gets the convenience alias */
	914	cpu->as = as;
	915	}
	916
	917	/* KVM cannot currently support multiple address spaces. */
	918	assert(asidx == 0 \|\| !kvm_enabled());
	919
	920	if (!cpu->cpu_ases) {
	921	cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
	922	}
	923
	924	newas = &cpu->cpu_ases[asidx];
	925	newas->cpu = cpu;
	926	newas->as = as;
	927	if (tcg_enabled()) {
	928	newas->tcg_as_listener.commit = tcg_commit;
	929	memory_listener_register(&newas->tcg_as_listener, as);
	930	}
	931	}
	932
	933	AddressSpace cpu_get_address_space(CPUState cpu, int asidx)
	934	{
	935	/* Return the AddressSpace corresponding to the specified index */
	936	return cpu->cpu_ases[asidx].as;
	937	}
	938	#endif
	939
	940	void cpu_exec_unrealizefn(CPUState *cpu)
	941	{
	942	CPUClass *cc = CPU_GET_CLASS(cpu);
	943
	944	cpu_list_remove(cpu);
	945
	946	if (cc->vmsd != NULL) {
	947	vmstate_unregister(NULL, cc->vmsd, cpu);
	948	}
	949	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	950	vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
	951	}
	952	#ifndef CONFIG_USER_ONLY
	953	tcg_iommu_free_notifier_list(cpu);
	954	#endif
	955	}
	956
	/* Properties common to all CPU objects. In system builds this is a single
	 * "memory" link to a MemoryRegion; in user-only builds the list is empty.
	 */
	957	Property cpu_common_props[] = {
	958	#ifndef CONFIG_USER_ONLY
	959	/* Create a memory property for softmmu CPU object,
	960	* so users can wire up its memory. (This can't go in qom/cpu.c
	961	* because that file is compiled only once for both user-mode
	962	* and system builds.) The default if no link is set up is to use
	963	* the system address space.
	964	*/
	965	DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
	966	MemoryRegion *),
	967	#endif
	968	DEFINE_PROP_END_OF_LIST(),
	969	};
	970
	971	void cpu_exec_initfn(CPUState *cpu)
	972	{
	973	cpu->as = NULL;
	974	cpu->num_ases = 0;
	975
	976	#ifndef CONFIG_USER_ONLY
	977	cpu->thread_id = qemu_get_thread_id();
	978	cpu->memory = system_memory;
	979	object_ref(OBJECT(cpu->memory));
	980	#endif
	981	}
	982
	983	void cpu_exec_realizefn(CPUState cpu, Error *errp)
	984	{
	985	CPUClass *cc = CPU_GET_CLASS(cpu);
	986	static bool tcg_target_initialized;
	987
	988	cpu_list_add(cpu);
	989
	990	if (tcg_enabled() && !tcg_target_initialized) {
	991	tcg_target_initialized = true;
	992	cc->tcg_initialize();
	993	}
	994
	995	#ifndef CONFIG_USER_ONLY
	996	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	997	vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
	998	}
	999	if (cc->vmsd != NULL) {
	1000	vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
	1001	}
	1002
	1003	cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier));
	1004	#endif
	1005	}
	1006
	1007	const char parse_cpu_model(const char cpu_model)
	1008	{
	1009	ObjectClass *oc;
	1010	CPUClass *cc;
	1011	gchar **model_pieces;
	1012	const char *cpu_type;
	1013
	1014	model_pieces = g_strsplit(cpu_model, ",", 2);
	1015
	1016	oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
	1017	if (oc == NULL) {
	1018	error_report("unable to find CPU model '%s'", model_pieces[0]);
	1019	g_strfreev(model_pieces);
	1020	exit(EXIT_FAILURE);
	1021	}
	1022
	1023	cpu_type = object_class_get_name(oc);
	1024	cc = CPU_CLASS(oc);
	1025	cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
	1026	g_strfreev(model_pieces);
	1027	return cpu_type;
	1028	}
	1029
	1030	#if defined(CONFIG_USER_ONLY)
	1031	void tb_invalidate_phys_addr(target_ulong addr)
	1032	{
	1033	mmap_lock();
	1034	tb_invalidate_phys_page_range(addr, addr + 1, 0);
	1035	mmap_unlock();
	1036	}
	1037
	1038	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	1039	{
	1040	tb_invalidate_phys_addr(pc);
	1041	}
	1042	#else
	1043	void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
	1044	{
	1045	ram_addr_t ram_addr;
	1046	MemoryRegion *mr;
	1047	hwaddr l = 1;
	1048
	1049	if (!tcg_enabled()) {
	1050	return;
	1051	}
	1052
	1053	rcu_read_lock();
	1054	mr = address_space_translate(as, addr, &addr, &l, false, attrs);
	1055	if (!(memory_region_is_ram(mr)
	1056	\|\| memory_region_is_romd(mr))) {
	1057	rcu_read_unlock();
	1058	return;
	1059	}
	1060	ram_addr = memory_region_get_ram_addr(mr) + addr;
	1061	tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
	1062	rcu_read_unlock();
	1063	}
	1064
	1065	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	1066	{
	1067	MemTxAttrs attrs;
	1068	hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
	1069	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	1070	if (phys != -1) {
	1071	/* Locks grabbed by tb_invalidate_phys_addr */
	1072	tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
	1073	phys \| (pc & ~TARGET_PAGE_MASK), attrs);
	1074	}
	1075	}
	1076	#endif
	1077
	1078	#if defined(CONFIG_USER_ONLY)
	1079	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	1080
	1081	{
	1082	}
	1083
	1084	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	1085	int flags)
	1086	{
	1087	return -ENOSYS;
	1088	}
	1089
	1090	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	1091	{
	1092	}
	1093
	1094	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	1095	int flags, CPUWatchpoint **watchpoint)
	1096	{
	1097	return -ENOSYS;
	1098	}
	1099	#else
	1100	/* Add a watchpoint. */
	1101	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	1102	int flags, CPUWatchpoint **watchpoint)
	1103	{
	1104	CPUWatchpoint *wp;
	1105
	1106	/* forbid ranges which are empty or run off the end of the address space */
	1107	if (len == 0 \|\| (addr + len - 1) < addr) {
	1108	error_report("tried to set invalid watchpoint at %"
	1109	VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
	1110	return -EINVAL;
	1111	}
	1112	wp = g_malloc(sizeof(*wp));
	1113
	1114	wp->vaddr = addr;
	1115	wp->len = len;
	1116	wp->flags = flags;
	1117
	1118	/* keep all GDB-injected watchpoints in front */
	1119	if (flags & BP_GDB) {
	1120	QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
	1121	} else {
	1122	QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
	1123	}
	1124
	1125	tlb_flush_page(cpu, addr);
	1126
	1127	if (watchpoint)
	1128	*watchpoint = wp;
	1129	return 0;
	1130	}
	1131
	1132	/* Remove a specific watchpoint. */
	1133	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	1134	int flags)
	1135	{
	1136	CPUWatchpoint *wp;
	1137
	1138	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1139	if (addr == wp->vaddr && len == wp->len
	1140	&& flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
	1141	cpu_watchpoint_remove_by_ref(cpu, wp);
	1142	return 0;
	1143	}
	1144	}
	1145	return -ENOENT;
	1146	}
	1147
	1148	/* Remove a specific watchpoint by reference. */
	1149	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	1150	{
	1151	QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
	1152
	1153	tlb_flush_page(cpu, watchpoint->vaddr);
	1154
	1155	g_free(watchpoint);
	1156	}
	1157
	1158	/* Remove all matching watchpoints. */
	1159	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	1160	{
	1161	CPUWatchpoint wp, next;
	1162
	1163	QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
	1164	if (wp->flags & mask) {
	1165	cpu_watchpoint_remove_by_ref(cpu, wp);
	1166	}
	1167	}
	1168	}
	1169
	1170	/* Return true if this watchpoint address matches the specified
	1171	* access (ie the address range covered by the watchpoint overlaps
	1172	* partially or completely with the address range covered by the
	1173	* access).
	1174	*/
	1175	static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
	1176	vaddr addr,
	1177	vaddr len)
	1178	{
	1179	/* We know the lengths are non-zero, but a little caution is
	1180	* required to avoid errors in the case where the range ends
	1181	* exactly at the top of the address space and so addr + len
	1182	* wraps round to zero.
	1183	*/
	1184	vaddr wpend = wp->vaddr + wp->len - 1;
	1185	vaddr addrend = addr + len - 1;
	1186
	1187	return !(addr > wpend \|\| wp->vaddr > addrend);
	1188	}
	1189
	1190	#endif
	1191
	1192	/* Add a breakpoint. */
	1193	int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
	1194	CPUBreakpoint **breakpoint)
	1195	{
	1196	CPUBreakpoint *bp;
	1197
	1198	bp = g_malloc(sizeof(*bp));
	1199
	1200	bp->pc = pc;
	1201	bp->flags = flags;
	1202
	1203	/* keep all GDB-injected breakpoints in front */
	1204	if (flags & BP_GDB) {
	1205	QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
	1206	} else {
	1207	QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
	1208	}
	1209
	1210	breakpoint_invalidate(cpu, pc);
	1211
	1212	if (breakpoint) {
	1213	*breakpoint = bp;
	1214	}
	1215	return 0;
	1216	}
	1217
	1218	/* Remove a specific breakpoint. */
	1219	int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
	1220	{
	1221	CPUBreakpoint *bp;
	1222
	1223	QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
	1224	if (bp->pc == pc && bp->flags == flags) {
	1225	cpu_breakpoint_remove_by_ref(cpu, bp);
	1226	return 0;
	1227	}
	1228	}
	1229	return -ENOENT;
	1230	}
	1231
	1232	/* Remove a specific breakpoint by reference. */
	1233	void cpu_breakpoint_remove_by_ref(CPUState cpu, CPUBreakpoint breakpoint)
	1234	{
	1235	QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
	1236
	1237	breakpoint_invalidate(cpu, breakpoint->pc);
	1238
	1239	g_free(breakpoint);
	1240	}
	1241
	1242	/* Remove all matching breakpoints. */
	1243	void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
	1244	{
	1245	CPUBreakpoint bp, next;
	1246
	1247	QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
	1248	if (bp->flags & mask) {
	1249	cpu_breakpoint_remove_by_ref(cpu, bp);
	1250	}
	1251	}
	1252	}
	1253
	1254	/* enable or disable single step mode. EXCP_DEBUG is returned by the
	1255	CPU loop after each instruction */
	1256	void cpu_single_step(CPUState *cpu, int enabled)
	1257	{
	1258	if (cpu->singlestep_enabled != enabled) {
	1259	cpu->singlestep_enabled = enabled;
	1260	if (kvm_enabled()) {
	1261	kvm_update_guest_debug(cpu, 0);
	1262	} else {
	1263	/* must flush all the translated code to avoid inconsistencies */
	1264	/* XXX: only flush what is necessary */
	1265	tb_flush(cpu);
	1266	}
	1267	}
	1268	}
	1269
	1270	void cpu_abort(CPUState cpu, const char fmt, ...)
	1271	{
	1272	va_list ap;
	1273	va_list ap2;
	1274
	1275	va_start(ap, fmt);
	1276	va_copy(ap2, ap);
	1277	fprintf(stderr, "qemu: fatal: ");
	1278	vfprintf(stderr, fmt, ap);
	1279	fprintf(stderr, "\n");
	1280	cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1281	if (qemu_log_separate()) {
	1282	qemu_log_lock();
	1283	qemu_log("qemu: fatal: ");
	1284	qemu_log_vprintf(fmt, ap2);
	1285	qemu_log("\n");
	1286	log_cpu_state(cpu, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1287	qemu_log_flush();
	1288	qemu_log_unlock();
	1289	qemu_log_close();
	1290	}
	1291	va_end(ap2);
	1292	va_end(ap);
	1293	replay_finish();
	1294	#if defined(CONFIG_USER_ONLY)
	1295	{
	1296	struct sigaction act;
	1297	sigfillset(&act.sa_mask);
	1298	act.sa_handler = SIG_DFL;
	1299	act.sa_flags = 0;
	1300	sigaction(SIGABRT, &act, NULL);
	1301	}
	1302	#endif
	1303	abort();
	1304	}
	1305
	1306	#if !defined(CONFIG_USER_ONLY)
	1307	/* Called from RCU critical section */
	1308	static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
	1309	{
	1310	RAMBlock *block;
	1311
	1312	block = atomic_rcu_read(&ram_list.mru_block);
	1313	if (block && addr - block->offset < block->max_length) {
	1314	return block;
	1315	}
	1316	RAMBLOCK_FOREACH(block) {
	1317	if (addr - block->offset < block->max_length) {
	1318	goto found;
	1319	}
	1320	}
	1321
	1322	fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
	1323	abort();
	1324
	1325	found:
	1326	/* It is safe to write mru_block outside the iothread lock. This
	1327	* is what happens:
	1328	*
	1329	* mru_block = xxx
	1330	* rcu_read_unlock()
	1331	* xxx removed from list
	1332	* rcu_read_lock()
	1333	* read mru_block
	1334	* mru_block = NULL;
	1335	* call_rcu(reclaim_ramblock, xxx);
	1336	* rcu_read_unlock()
	1337	*
	1338	* atomic_rcu_set is not needed here. The block was already published
	1339	* when it was placed into the list. Here we're just making an extra
	1340	* copy of the pointer.
	1341	*/
	1342	ram_list.mru_block = block;
	1343	return block;
	1344	}
	1345
	1346	static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
	1347	{
	1348	CPUState *cpu;
	1349	ram_addr_t start1;
	1350	RAMBlock *block;
	1351	ram_addr_t end;
	1352
	1353	assert(tcg_enabled());
	1354	end = TARGET_PAGE_ALIGN(start + length);
	1355	start &= TARGET_PAGE_MASK;
	1356
	1357	rcu_read_lock();
	1358	block = qemu_get_ram_block(start);
	1359	assert(block == qemu_get_ram_block(end - 1));
	1360	start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
	1361	CPU_FOREACH(cpu) {
	1362	tlb_reset_dirty(cpu, start1, length);
	1363	}
	1364	rcu_read_unlock();
	1365	}
	1366
	1367	/* Note: start and end must be within the same ram block. */
	1368	bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
	1369	ram_addr_t length,
	1370	unsigned client)
	1371	{
	1372	DirtyMemoryBlocks *blocks;
	1373	unsigned long end, page;
	1374	bool dirty = false;
	1375
	1376	if (length == 0) {
	1377	return false;
	1378	}
	1379
	1380	end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
	1381	page = start >> TARGET_PAGE_BITS;
	1382
	1383	rcu_read_lock();
	1384
	1385	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1386
	1387	while (page < end) {
	1388	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1389	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1390	unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
	1391
	1392	dirty \|= bitmap_test_and_clear_atomic(blocks->blocks[idx],
	1393	offset, num);
	1394	page += num;
	1395	}
	1396
	1397	rcu_read_unlock();
	1398
	1399	if (dirty && tcg_enabled()) {
	1400	tlb_reset_dirty_range_all(start, length);
	1401	}
	1402
	1403	return dirty;
	1404	}
	1405
	1406	DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
	1407	(ram_addr_t start, ram_addr_t length, unsigned client)
	1408	{
	1409	DirtyMemoryBlocks *blocks;
	1410	unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
	1411	ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
	1412	ram_addr_t last = QEMU_ALIGN_UP(start + length, align);
	1413	DirtyBitmapSnapshot *snap;
	1414	unsigned long page, end, dest;
	1415
	1416	snap = g_malloc0(sizeof(*snap) +
	1417	((last - first) >> (TARGET_PAGE_BITS + 3)));
	1418	snap->start = first;
	1419	snap->end = last;
	1420
	1421	page = first >> TARGET_PAGE_BITS;
	1422	end = last >> TARGET_PAGE_BITS;
	1423	dest = 0;
	1424
	1425	rcu_read_lock();
	1426
	1427	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1428
	1429	while (page < end) {
	1430	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1431	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1432	unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
	1433
	1434	assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
	1435	assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));
	1436	offset >>= BITS_PER_LEVEL;
	1437
	1438	bitmap_copy_and_clear_atomic(snap->dirty + dest,
	1439	blocks->blocks[idx] + offset,
	1440	num);
	1441	page += num;
	1442	dest += num >> BITS_PER_LEVEL;
	1443	}
	1444
	1445	rcu_read_unlock();
	1446
	1447	if (tcg_enabled()) {
	1448	tlb_reset_dirty_range_all(start, length);
	1449	}
	1450
	1451	return snap;
	1452	}
	1453
	1454	bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
	1455	ram_addr_t start,
	1456	ram_addr_t length)
	1457	{
	1458	unsigned long page, end;
	1459
	1460	assert(start >= snap->start);
	1461	assert(start + length <= snap->end);
	1462
	1463	end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
	1464	page = (start - snap->start) >> TARGET_PAGE_BITS;
	1465
	1466	while (page < end) {
	1467	if (test_bit(page, snap->dirty)) {
	1468	return true;
	1469	}
	1470	page++;
	1471	}
	1472	return false;
	1473	}
	1474
	1475	/* Called from RCU critical section */
	1476	hwaddr memory_region_section_get_iotlb(CPUState *cpu,
	1477	MemoryRegionSection *section,
	1478	target_ulong vaddr,
	1479	hwaddr paddr, hwaddr xlat,
	1480	int prot,
	1481	target_ulong *address)
	1482	{
	1483	hwaddr iotlb;
	1484	CPUWatchpoint *wp;
	1485
	1486	if (memory_region_is_ram(section->mr)) {
	1487	/* Normal RAM. */
	1488	iotlb = memory_region_get_ram_addr(section->mr) + xlat;
	1489	if (!section->readonly) {
	1490	iotlb \|= PHYS_SECTION_NOTDIRTY;
	1491	} else {
	1492	iotlb \|= PHYS_SECTION_ROM;
	1493	}
	1494	} else {
	1495	AddressSpaceDispatch *d;
	1496
	1497	d = flatview_to_dispatch(section->fv);
	1498	iotlb = section - d->map.sections;
	1499	iotlb += xlat;
	1500	}
	1501
	1502	/* Make accesses to pages with watchpoints go via the
	1503	watchpoint trap routines. */
	1504	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1505	if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
	1506	/* Avoid trapping reads of pages with a write breakpoint. */
	1507	if ((prot & PAGE_WRITE) \|\| (wp->flags & BP_MEM_READ)) {
	1508	iotlb = PHYS_SECTION_WATCH + paddr;
	1509	*address \|= TLB_MMIO;
	1510	break;
	1511	}
	1512	}
	1513	}
	1514
	1515	return iotlb;
	1516	}
	1517	#endif /* !defined(CONFIG_USER_ONLY) */
	1518
	1519	#if !defined(CONFIG_USER_ONLY)
	1520
	1521	static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
	1522	uint16_t section);
	1523	static subpage_t subpage_init(FlatView fv, hwaddr base);
	1524
	1525	static void (phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
	1526	qemu_anon_ram_alloc;
	1527
	1528	/*
	1529	* Set a custom physical guest memory alloator.
	1530	* Accelerators with unusual needs may need this. Hopefully, we can
	1531	* get rid of it eventually.
	1532	*/
	1533	void phys_mem_set_alloc(void (alloc)(size_t, uint64_t *align, bool shared))
	1534	{
	1535	phys_mem_alloc = alloc;
	1536	}
	1537
	1538	static uint16_t phys_section_add(PhysPageMap *map,
	1539	MemoryRegionSection *section)
	1540	{
	1541	/* The physical section number is ORed with a page-aligned
	1542	* pointer to produce the iotlb entries. Thus it should
	1543	* never overflow into the page-aligned value.
	1544	*/
	1545	assert(map->sections_nb < TARGET_PAGE_SIZE);
	1546
	1547	if (map->sections_nb == map->sections_nb_alloc) {
	1548	map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
	1549	map->sections = g_renew(MemoryRegionSection, map->sections,
	1550	map->sections_nb_alloc);
	1551	}
	1552	map->sections[map->sections_nb] = *section;
	1553	memory_region_ref(section->mr);
	1554	return map->sections_nb++;
	1555	}
	1556
	1557	static void phys_section_destroy(MemoryRegion *mr)
	1558	{
	1559	bool have_sub_page = mr->subpage;
	1560
	1561	memory_region_unref(mr);
	1562
	1563	if (have_sub_page) {
	1564	subpage_t *subpage = container_of(mr, subpage_t, iomem);
	1565	object_unref(OBJECT(&subpage->iomem));
	1566	g_free(subpage);
	1567	}
	1568	}
	1569
	1570	static void phys_sections_free(PhysPageMap *map)
	1571	{
	1572	while (map->sections_nb > 0) {
	1573	MemoryRegionSection *section = &map->sections[--map->sections_nb];
	1574	phys_section_destroy(section->mr);
	1575	}
	1576	g_free(map->sections);
	1577	g_free(map->nodes);
	1578	}
	1579
	1580	static void register_subpage(FlatView fv, MemoryRegionSection section)
	1581	{
	1582	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1583	subpage_t *subpage;
	1584	hwaddr base = section->offset_within_address_space
	1585	& TARGET_PAGE_MASK;
	1586	MemoryRegionSection *existing = phys_page_find(d, base);
	1587	MemoryRegionSection subsection = {
	1588	.offset_within_address_space = base,
	1589	.size = int128_make64(TARGET_PAGE_SIZE),
	1590	};
	1591	hwaddr start, end;
	1592
	1593	assert(existing->mr->subpage \|\| existing->mr == &io_mem_unassigned);
	1594
	1595	if (!(existing->mr->subpage)) {
	1596	subpage = subpage_init(fv, base);
	1597	subsection.fv = fv;
	1598	subsection.mr = &subpage->iomem;
	1599	phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
	1600	phys_section_add(&d->map, &subsection));
	1601	} else {
	1602	subpage = container_of(existing->mr, subpage_t, iomem);
	1603	}
	1604	start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
	1605	end = start + int128_get64(section->size) - 1;
	1606	subpage_register(subpage, start, end,
	1607	phys_section_add(&d->map, section));
	1608	}
	1609
	1610
	1611	static void register_multipage(FlatView *fv,
	1612	MemoryRegionSection *section)
	1613	{
	1614	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1615	hwaddr start_addr = section->offset_within_address_space;
	1616	uint16_t section_index = phys_section_add(&d->map, section);
	1617	uint64_t num_pages = int128_get64(int128_rshift(section->size,
	1618	TARGET_PAGE_BITS));
	1619
	1620	assert(num_pages);
	1621	phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
	1622	}
	1623
	1624	void flatview_add_to_dispatch(FlatView fv, MemoryRegionSection section)
	1625	{
	1626	MemoryRegionSection now = section, remain = section;
	1627	Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
	1628
	1629	if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1630	uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
	1631	- now.offset_within_address_space;
	1632
	1633	now.size = int128_min(int128_make64(left), now.size);
	1634	register_subpage(fv, &now);
	1635	} else {
	1636	now.size = int128_zero();
	1637	}
	1638	while (int128_ne(remain.size, now.size)) {
	1639	remain.size = int128_sub(remain.size, now.size);
	1640	remain.offset_within_address_space += int128_get64(now.size);
	1641	remain.offset_within_region += int128_get64(now.size);
	1642	now = remain;
	1643	if (int128_lt(remain.size, page_size)) {
	1644	register_subpage(fv, &now);
	1645	} else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1646	now.size = page_size;
	1647	register_subpage(fv, &now);
	1648	} else {
	1649	now.size = int128_and(now.size, int128_neg(page_size));
	1650	register_multipage(fv, &now);
	1651	}
	1652	}
	1653	}
	1654
	1655	void qemu_flush_coalesced_mmio_buffer(void)
	1656	{
	1657	if (kvm_enabled())
	1658	kvm_flush_coalesced_mmio_buffer();
	1659	}
	1660
	1661	void qemu_mutex_lock_ramlist(void)
	1662	{
	1663	qemu_mutex_lock(&ram_list.mutex);
	1664	}
	1665
	1666	void qemu_mutex_unlock_ramlist(void)
	1667	{
	1668	qemu_mutex_unlock(&ram_list.mutex);
	1669	}
	1670
	1671	void ram_block_dump(Monitor *mon)
	1672	{
	1673	RAMBlock *block;
	1674	char *psize;
	1675
	1676	rcu_read_lock();
	1677	monitor_printf(mon, "%24s %8s %18s %18s %18s\n",
	1678	"Block Name", "PSize", "Offset", "Used", "Total");
	1679	RAMBLOCK_FOREACH(block) {
	1680	psize = size_to_str(block->page_size);
	1681	monitor_printf(mon, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
	1682	" 0x%016" PRIx64 "\n", block->idstr, psize,
	1683	(uint64_t)block->offset,
	1684	(uint64_t)block->used_length,
	1685	(uint64_t)block->max_length);
	1686	g_free(psize);
	1687	}
	1688	rcu_read_unlock();
	1689	}
	1690
	1691	#ifdef __linux__
	1692	/*
	1693	* FIXME TOCTTOU: this iterates over memory backends' mem-path, which
	1694	* may or may not name the same files / on the same filesystem now as
	1695	* when we actually open and map them. Iterate over the file
	1696	* descriptors instead, and use qemu_fd_getpagesize().
	1697	*/
	1698	static int find_max_supported_pagesize(Object obj, void opaque)
	1699	{
	1700	long *hpsize_min = opaque;
	1701
	1702	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	1703	long hpsize = host_memory_backend_pagesize(MEMORY_BACKEND(obj));
	1704
	1705	if (hpsize < *hpsize_min) {
	1706	*hpsize_min = hpsize;
	1707	}
	1708	}
	1709
	1710	return 0;
	1711	}
	1712
	1713	long qemu_getrampagesize(void)
	1714	{
	1715	long hpsize = LONG_MAX;
	1716	long mainrampagesize;
	1717	Object *memdev_root;
	1718
	1719	mainrampagesize = qemu_mempath_getpagesize(mem_path);
	1720
	1721	/* it's possible we have memory-backend objects with
	1722	* hugepage-backed RAM. these may get mapped into system
	1723	* address space via -numa parameters or memory hotplug
	1724	* hooks. we want to take these into account, but we
	1725	* also want to make sure these supported hugepage
	1726	* sizes are applicable across the entire range of memory
	1727	* we may boot from, so we take the min across all
	1728	* backends, and assume normal pages in cases where a
	1729	* backend isn't backed by hugepages.
	1730	*/
	1731	memdev_root = object_resolve_path("/objects", NULL);
	1732	if (memdev_root) {
	1733	object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
	1734	}
	1735	if (hpsize == LONG_MAX) {
	1736	/* No additional memory regions found ==> Report main RAM page size */
	1737	return mainrampagesize;
	1738	}
	1739
	1740	/* If NUMA is disabled or the NUMA nodes are not backed with a
	1741	* memory-backend, then there is at least one node using "normal" RAM,
	1742	* so if its page size is smaller we have got to report that size instead.
	1743	*/
	1744	if (hpsize > mainrampagesize &&
	1745	(nb_numa_nodes == 0 \|\| numa_info[0].node_memdev == NULL)) {
	1746	static bool warned;
	1747	if (!warned) {
	1748	error_report("Huge page support disabled (n/a for main memory).");
	1749	warned = true;
	1750	}
	1751	return mainrampagesize;
	1752	}
	1753
	1754	return hpsize;
	1755	}
	1756	#else
	1757	long qemu_getrampagesize(void)
	1758	{
	1759	return getpagesize();
	1760	}
	1761	#endif
	1762
	1763	#ifdef __linux__
	1764	static int64_t get_file_size(int fd)
	1765	{
	1766	int64_t size = lseek(fd, 0, SEEK_END);
	1767	if (size < 0) {
	1768	return -errno;
	1769	}
	1770	return size;
	1771	}
	1772
	1773	static int file_ram_open(const char *path,
	1774	const char *region_name,
	1775	bool *created,
	1776	Error **errp)
	1777	{
	1778	char *filename;
	1779	char *sanitized_name;
	1780	char *c;
	1781	int fd = -1;
	1782
	1783	*created = false;
	1784	for (;;) {
	1785	fd = open(path, O_RDWR);
	1786	if (fd >= 0) {
	1787	/* @path names an existing file, use it */
	1788	break;
	1789	}
	1790	if (errno == ENOENT) {
	1791	/* @path names a file that doesn't exist, create it */
	1792	fd = open(path, O_RDWR \| O_CREAT \| O_EXCL, 0644);
	1793	if (fd >= 0) {
	1794	*created = true;
	1795	break;
	1796	}
	1797	} else if (errno == EISDIR) {
	1798	/* @path names a directory, create a file there */
	1799	/* Make name safe to use with mkstemp by replacing '/' with '_'. */
	1800	sanitized_name = g_strdup(region_name);
	1801	for (c = sanitized_name; *c != '\0'; c++) {
	1802	if (*c == '/') {
	1803	*c = '_';
	1804	}
	1805	}
	1806
	1807	filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
	1808	sanitized_name);
	1809	g_free(sanitized_name);
	1810
	1811	fd = mkstemp(filename);
	1812	if (fd >= 0) {
	1813	unlink(filename);
	1814	g_free(filename);
	1815	break;
	1816	}
	1817	g_free(filename);
	1818	}
	1819	if (errno != EEXIST && errno != EINTR) {
	1820	error_setg_errno(errp, errno,
	1821	"can't open backing store %s for guest RAM",
	1822	path);
	1823	return -1;
	1824	}
	1825	/*
	1826	* Try again on EINTR and EEXIST. The latter happens when
	1827	* something else creates the file between our two open().
	1828	*/
	1829	}
	1830
	1831	return fd;
	1832	}
	1833
	1834	static void file_ram_alloc(RAMBlock block,
	1835	ram_addr_t memory,
	1836	int fd,
	1837	bool truncate,
	1838	Error **errp)
	1839	{
	1840	void *area;
	1841
	1842	block->page_size = qemu_fd_getpagesize(fd);
	1843	if (block->mr->align % block->page_size) {
	1844	error_setg(errp, "alignment 0x%" PRIx64
	1845	" must be multiples of page size 0x%zx",
	1846	block->mr->align, block->page_size);
	1847	return NULL;
	1848	} else if (block->mr->align && !is_power_of_2(block->mr->align)) {
	1849	error_setg(errp, "alignment 0x%" PRIx64
	1850	" must be a power of two", block->mr->align);
	1851	return NULL;
	1852	}
	1853	block->mr->align = MAX(block->page_size, block->mr->align);
	1854	#if defined(__s390x__)
	1855	if (kvm_enabled()) {
	1856	block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
	1857	}
	1858	#endif
	1859
	1860	if (memory < block->page_size) {
	1861	error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
	1862	"or larger than page size 0x%zx",
	1863	memory, block->page_size);
	1864	return NULL;
	1865	}
	1866
	1867	memory = ROUND_UP(memory, block->page_size);
	1868
	1869	/*
	1870	* ftruncate is not supported by hugetlbfs in older
	1871	* hosts, so don't bother bailing out on errors.
	1872	* If anything goes wrong with it under other filesystems,
	1873	* mmap will fail.
	1874	*
	1875	* Do not truncate the non-empty backend file to avoid corrupting
	1876	* the existing data in the file. Disabling shrinking is not
	1877	* enough. For example, the current vNVDIMM implementation stores
	1878	* the guest NVDIMM labels at the end of the backend file. If the
	1879	* backend file is later extended, QEMU will not be able to find
	1880	* those labels. Therefore, extending the non-empty backend file
	1881	* is disabled as well.
	1882	*/
	1883	if (truncate && ftruncate(fd, memory)) {
	1884	perror("ftruncate");
	1885	}
	1886
	1887	area = qemu_ram_mmap(fd, memory, block->mr->align,
	1888	block->flags & RAM_SHARED);
	1889	if (area == MAP_FAILED) {
	1890	error_setg_errno(errp, errno,
	1891	"unable to map backing store for guest RAM");
	1892	return NULL;
	1893	}
	1894
	1895	if (mem_prealloc) {
	1896	os_mem_prealloc(fd, area, memory, smp_cpus, errp);
	1897	if (errp && *errp) {
	1898	qemu_ram_munmap(area, memory);
	1899	return NULL;
	1900	}
	1901	}
	1902
	1903	block->fd = fd;
	1904	return area;
	1905	}
	1906	#endif
	1907
	1908	/* Allocate space within the ram_addr_t space that governs the
	1909	* dirty bitmaps.
	1910	* Called with the ramlist lock held.
	1911	*/
	1912	static ram_addr_t find_ram_offset(ram_addr_t size)
	1913	{
	1914	RAMBlock block, next_block;
	1915	ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
	1916
	1917	assert(size != 0); /* it would hand out same offset multiple times */
	1918
	1919	if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
	1920	return 0;
	1921	}
	1922
	1923	RAMBLOCK_FOREACH(block) {
	1924	ram_addr_t candidate, next = RAM_ADDR_MAX;
	1925
	1926	/* Align blocks to start on a 'long' in the bitmap
	1927	* which makes the bitmap sync'ing take the fast path.
	1928	*/
	1929	candidate = block->offset + block->max_length;
	1930	candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
	1931
	1932	/* Search for the closest following block
	1933	* and find the gap.
	1934	*/
	1935	RAMBLOCK_FOREACH(next_block) {
	1936	if (next_block->offset >= candidate) {
	1937	next = MIN(next, next_block->offset);
	1938	}
	1939	}
	1940
	1941	/* If it fits remember our place and remember the size
	1942	* of gap, but keep going so that we might find a smaller
	1943	* gap to fill so avoiding fragmentation.
	1944	*/
	1945	if (next - candidate >= size && next - candidate < mingap) {
	1946	offset = candidate;
	1947	mingap = next - candidate;
	1948	}
	1949
	1950	trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
	1951	}
	1952
	1953	if (offset == RAM_ADDR_MAX) {
	1954	fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
	1955	(uint64_t)size);
	1956	abort();
	1957	}
	1958
	1959	trace_find_ram_offset(size, offset);
	1960
	1961	return offset;
	1962	}
	1963
	1964	static unsigned long last_ram_page(void)
	1965	{
	1966	RAMBlock *block;
	1967	ram_addr_t last = 0;
	1968
	1969	rcu_read_lock();
	1970	RAMBLOCK_FOREACH(block) {
	1971	last = MAX(last, block->offset + block->max_length);
	1972	}
	1973	rcu_read_unlock();
	1974	return last >> TARGET_PAGE_BITS;
	1975	}
	1976
	1977	static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
	1978	{
	1979	int ret;
	1980
	1981	/* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
	1982	if (!machine_dump_guest_core(current_machine)) {
	1983	ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
	1984	if (ret) {
	1985	perror("qemu_madvise");
	1986	fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
	1987	"but dump_guest_core=off specified\n");
	1988	}
	1989	}
	1990	}
	1991
	1992	const char qemu_ram_get_idstr(RAMBlock rb)
	1993	{
	1994	return rb->idstr;
	1995	}
	1996
	1997	bool qemu_ram_is_shared(RAMBlock *rb)
	1998	{
	1999	return rb->flags & RAM_SHARED;
	2000	}
	2001
	2002	/* Note: Only set at the start of postcopy */
	2003	bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
	2004	{
	2005	return rb->flags & RAM_UF_ZEROPAGE;
	2006	}
	2007
	2008	void qemu_ram_set_uf_zeroable(RAMBlock *rb)
	2009	{
	2010	rb->flags \|= RAM_UF_ZEROPAGE;
	2011	}
	2012
	2013	bool qemu_ram_is_migratable(RAMBlock *rb)
	2014	{
	2015	return rb->flags & RAM_MIGRATABLE;
	2016	}
	2017
	2018	void qemu_ram_set_migratable(RAMBlock *rb)
	2019	{
	2020	rb->flags \|= RAM_MIGRATABLE;
	2021	}
	2022
	2023	void qemu_ram_unset_migratable(RAMBlock *rb)
	2024	{
	2025	rb->flags &= ~RAM_MIGRATABLE;
	2026	}
	2027
	2028	/* Called with iothread lock held. */
	2029	void qemu_ram_set_idstr(RAMBlock new_block, const char name, DeviceState *dev)
	2030	{
	2031	RAMBlock *block;
	2032
	2033	assert(new_block);
	2034	assert(!new_block->idstr[0]);
	2035
	2036	if (dev) {
	2037	char *id = qdev_get_dev_path(dev);
	2038	if (id) {
	2039	snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
	2040	g_free(id);
	2041	}
	2042	}
	2043	pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
	2044
	2045	rcu_read_lock();
	2046	RAMBLOCK_FOREACH(block) {
	2047	if (block != new_block &&
	2048	!strcmp(block->idstr, new_block->idstr)) {
	2049	fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
	2050	new_block->idstr);
	2051	abort();
	2052	}
	2053	}
	2054	rcu_read_unlock();
	2055	}
	2056
	2057	/* Called with iothread lock held. */
	2058	void qemu_ram_unset_idstr(RAMBlock *block)
	2059	{
	2060	/* FIXME: arch_init.c assumes that this is not called throughout
	2061	* migration. Ignore the problem since hot-unplug during migration
	2062	* does not work anyway.
	2063	*/
	2064	if (block) {
	2065	memset(block->idstr, 0, sizeof(block->idstr));
	2066	}
	2067	}
	2068
	2069	size_t qemu_ram_pagesize(RAMBlock *rb)
	2070	{
	2071	return rb->page_size;
	2072	}
	2073
	2074	/* Returns the largest size of page in use */
	2075	size_t qemu_ram_pagesize_largest(void)
	2076	{
	2077	RAMBlock *block;
	2078	size_t largest = 0;
	2079
	2080	RAMBLOCK_FOREACH(block) {
	2081	largest = MAX(largest, qemu_ram_pagesize(block));
	2082	}
	2083
	2084	return largest;
	2085	}
	2086
	2087	static int memory_try_enable_merging(void *addr, size_t len)
	2088	{
	2089	if (!machine_mem_merge(current_machine)) {
	2090	/* disabled by the user */
	2091	return 0;
	2092	}
	2093
	2094	return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
	2095	}
	2096
	2097	/* Only legal before guest might have detected the memory size: e.g. on
	2098	* incoming migration, or right after reset.
	2099	*
	2100	* As memory core doesn't know how is memory accessed, it is up to
	2101	* resize callback to update device state and/or add assertions to detect
	2102	* misuse, if necessary.
	2103	*/
	2104	int qemu_ram_resize(RAMBlock block, ram_addr_t newsize, Error *errp)
	2105	{
	2106	assert(block);
	2107
	2108	newsize = HOST_PAGE_ALIGN(newsize);
	2109
	2110	if (block->used_length == newsize) {
	2111	return 0;
	2112	}
	2113
	2114	if (!(block->flags & RAM_RESIZEABLE)) {
	2115	error_setg_errno(errp, EINVAL,
	2116	"Length mismatch: %s: 0x" RAM_ADDR_FMT
	2117	" in != 0x" RAM_ADDR_FMT, block->idstr,
	2118	newsize, block->used_length);
	2119	return -EINVAL;
	2120	}
	2121
	2122	if (block->max_length < newsize) {
	2123	error_setg_errno(errp, EINVAL,
	2124	"Length too large: %s: 0x" RAM_ADDR_FMT
	2125	" > 0x" RAM_ADDR_FMT, block->idstr,
	2126	newsize, block->max_length);
	2127	return -EINVAL;
	2128	}
	2129
	2130	cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
	2131	block->used_length = newsize;
	2132	cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
	2133	DIRTY_CLIENTS_ALL);
	2134	memory_region_set_size(block->mr, newsize);
	2135	if (block->resized) {
	2136	block->resized(block->idstr, newsize, block->host);
	2137	}
	2138	return 0;
	2139	}
	2140
	2141	/* Called with ram_list.mutex held */
	2142	static void dirty_memory_extend(ram_addr_t old_ram_size,
	2143	ram_addr_t new_ram_size)
	2144	{
	2145	ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
	2146	DIRTY_MEMORY_BLOCK_SIZE);
	2147	ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
	2148	DIRTY_MEMORY_BLOCK_SIZE);
	2149	int i;
	2150
	2151	/* Only need to extend if block count increased */
	2152	if (new_num_blocks <= old_num_blocks) {
	2153	return;
	2154	}
	2155
	2156	for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
	2157	DirtyMemoryBlocks *old_blocks;
	2158	DirtyMemoryBlocks *new_blocks;
	2159	int j;
	2160
	2161	old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
	2162	new_blocks = g_malloc(sizeof(*new_blocks) +
	2163	sizeof(new_blocks->blocks[0]) * new_num_blocks);
	2164
	2165	if (old_num_blocks) {
	2166	memcpy(new_blocks->blocks, old_blocks->blocks,
	2167	old_num_blocks * sizeof(old_blocks->blocks[0]));
	2168	}
	2169
	2170	for (j = old_num_blocks; j < new_num_blocks; j++) {
	2171	new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
	2172	}
	2173
	2174	atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
	2175
	2176	if (old_blocks) {
	2177	g_free_rcu(old_blocks, rcu);
	2178	}
	2179	}
	2180	}
	2181
	2182	static void ram_block_add(RAMBlock new_block, Error *errp, bool shared)
	2183	{
	2184	RAMBlock *block;
	2185	RAMBlock *last_block = NULL;
	2186	ram_addr_t old_ram_size, new_ram_size;
	2187	Error *err = NULL;
	2188
	2189	old_ram_size = last_ram_page();
	2190
	2191	qemu_mutex_lock_ramlist();
	2192	new_block->offset = find_ram_offset(new_block->max_length);
	2193
	2194	if (!new_block->host) {
	2195	if (xen_enabled()) {
	2196	xen_ram_alloc(new_block->offset, new_block->max_length,
	2197	new_block->mr, &err);
	2198	if (err) {
	2199	error_propagate(errp, err);
	2200	qemu_mutex_unlock_ramlist();
	2201	return;
	2202	}
	2203	} else {
	2204	new_block->host = phys_mem_alloc(new_block->max_length,
	2205	&new_block->mr->align, shared);
	2206	if (!new_block->host) {
	2207	error_setg_errno(errp, errno,
	2208	"cannot set up guest memory '%s'",
	2209	memory_region_name(new_block->mr));
	2210	qemu_mutex_unlock_ramlist();
	2211	return;
	2212	}
	2213	memory_try_enable_merging(new_block->host, new_block->max_length);
	2214	}
	2215	}
	2216
	2217	new_ram_size = MAX(old_ram_size,
	2218	(new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
	2219	if (new_ram_size > old_ram_size) {
	2220	dirty_memory_extend(old_ram_size, new_ram_size);
	2221	}
	2222	/* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
	2223	* QLIST (which has an RCU-friendly variant) does not have insertion at
	2224	* tail, so save the last element in last_block.
	2225	*/
	2226	RAMBLOCK_FOREACH(block) {
	2227	last_block = block;
	2228	if (block->max_length < new_block->max_length) {
	2229	break;
	2230	}
	2231	}
	2232	if (block) {
	2233	QLIST_INSERT_BEFORE_RCU(block, new_block, next);
	2234	} else if (last_block) {
	2235	QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
	2236	} else { /* list is empty */
	2237	QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
	2238	}
	2239	ram_list.mru_block = NULL;
	2240
	2241	/* Write list before version */
	2242	smp_wmb();
	2243	ram_list.version++;
	2244	qemu_mutex_unlock_ramlist();
	2245
	2246	cpu_physical_memory_set_dirty_range(new_block->offset,
	2247	new_block->used_length,
	2248	DIRTY_CLIENTS_ALL);
	2249
	2250	if (new_block->host) {
	2251	qemu_ram_setup_dump(new_block->host, new_block->max_length);
	2252	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
	2253	/* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
	2254	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
	2255	ram_block_notify_add(new_block->host, new_block->max_length);
	2256	}
	2257	}
	2258
	2259	#ifdef __linux__
	2260	RAMBlock qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion mr,
	2261	bool share, int fd,
	2262	Error **errp)
	2263	{
	2264	RAMBlock *new_block;
	2265	Error *local_err = NULL;
	2266	int64_t file_size;
	2267
	2268	if (xen_enabled()) {
	2269	error_setg(errp, "-mem-path not supported with Xen");
	2270	return NULL;
	2271	}
	2272
	2273	if (kvm_enabled() && !kvm_has_sync_mmu()) {
	2274	error_setg(errp,
	2275	"host lacks kvm mmu notifiers, -mem-path unsupported");
	2276	return NULL;
	2277	}
	2278
	2279	if (phys_mem_alloc != qemu_anon_ram_alloc) {
	2280	/*
	2281	* file_ram_alloc() needs to allocate just like
	2282	* phys_mem_alloc, but we haven't bothered to provide
	2283	* a hook there.
	2284	*/
	2285	error_setg(errp,
	2286	"-mem-path not supported with this accelerator");
	2287	return NULL;
	2288	}
	2289
	2290	size = HOST_PAGE_ALIGN(size);
	2291	file_size = get_file_size(fd);
	2292	if (file_size > 0 && file_size < size) {
	2293	error_setg(errp, "backing store %s size 0x%" PRIx64
	2294	" does not match 'size' option 0x" RAM_ADDR_FMT,
	2295	mem_path, file_size, size);
	2296	return NULL;
	2297	}
	2298
	2299	new_block = g_malloc0(sizeof(*new_block));
	2300	new_block->mr = mr;
	2301	new_block->used_length = size;
	2302	new_block->max_length = size;
	2303	new_block->flags = share ? RAM_SHARED : 0;
	2304	new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
	2305	if (!new_block->host) {
	2306	g_free(new_block);
	2307	return NULL;
	2308	}
	2309
	2310	ram_block_add(new_block, &local_err, share);
	2311	if (local_err) {
	2312	g_free(new_block);
	2313	error_propagate(errp, local_err);
	2314	return NULL;
	2315	}
	2316	return new_block;
	2317
	2318	}
	2319
	2320
	2321	RAMBlock qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion mr,
	2322	bool share, const char *mem_path,
	2323	Error **errp)
	2324	{
	2325	int fd;
	2326	bool created;
	2327	RAMBlock *block;
	2328
	2329	fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
	2330	if (fd < 0) {
	2331	return NULL;
	2332	}
	2333
	2334	block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
	2335	if (!block) {
	2336	if (created) {
	2337	unlink(mem_path);
	2338	}
	2339	close(fd);
	2340	return NULL;
	2341	}
	2342
	2343	return block;
	2344	}
	2345	#endif
	2346
	2347	static
	2348	RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
	2349	void (resized)(const char,
	2350	uint64_t length,
	2351	void *host),
	2352	void *host, bool resizeable, bool share,
	2353	MemoryRegion mr, Error *errp)
	2354	{
	2355	RAMBlock *new_block;
	2356	Error *local_err = NULL;
	2357
	2358	size = HOST_PAGE_ALIGN(size);
	2359	max_size = HOST_PAGE_ALIGN(max_size);
	2360	new_block = g_malloc0(sizeof(*new_block));
	2361	new_block->mr = mr;
	2362	new_block->resized = resized;
	2363	new_block->used_length = size;
	2364	new_block->max_length = max_size;
	2365	assert(max_size >= size);
	2366	new_block->fd = -1;
	2367	new_block->page_size = getpagesize();
	2368	new_block->host = host;
	2369	if (host) {
	2370	new_block->flags \|= RAM_PREALLOC;
	2371	}
	2372	if (resizeable) {
	2373	new_block->flags \|= RAM_RESIZEABLE;
	2374	}
	2375	ram_block_add(new_block, &local_err, share);
	2376	if (local_err) {
	2377	g_free(new_block);
	2378	error_propagate(errp, local_err);
	2379	return NULL;
	2380	}
	2381	return new_block;
	2382	}
	2383
	2384	RAMBlock qemu_ram_alloc_from_ptr(ram_addr_t size, void host,
	2385	MemoryRegion mr, Error *errp)
	2386	{
	2387	return qemu_ram_alloc_internal(size, size, NULL, host, false,
	2388	false, mr, errp);
	2389	}
	2390
	2391	RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
	2392	MemoryRegion mr, Error *errp)
	2393	{
	2394	return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
	2395	share, mr, errp);
	2396	}
	2397
	2398	RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
	2399	void (resized)(const char,
	2400	uint64_t length,
	2401	void *host),
	2402	MemoryRegion mr, Error *errp)
	2403	{
	2404	return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
	2405	false, mr, errp);
	2406	}
	2407
	2408	static void reclaim_ramblock(RAMBlock *block)
	2409	{
	2410	if (block->flags & RAM_PREALLOC) {
	2411	;
	2412	} else if (xen_enabled()) {
	2413	xen_invalidate_map_cache_entry(block->host);
	2414	#ifndef _WIN32
	2415	} else if (block->fd >= 0) {
	2416	qemu_ram_munmap(block->host, block->max_length);
	2417	close(block->fd);
	2418	#endif
	2419	} else {
	2420	qemu_anon_ram_free(block->host, block->max_length);
	2421	}
	2422	g_free(block);
	2423	}
	2424
	2425	void qemu_ram_free(RAMBlock *block)
	2426	{
	2427	if (!block) {
	2428	return;
	2429	}
	2430
	2431	if (block->host) {
	2432	ram_block_notify_remove(block->host, block->max_length);
	2433	}
	2434
	2435	qemu_mutex_lock_ramlist();
	2436	QLIST_REMOVE_RCU(block, next);
	2437	ram_list.mru_block = NULL;
	2438	/* Write list before version */
	2439	smp_wmb();
	2440	ram_list.version++;
	2441	call_rcu(block, reclaim_ramblock, rcu);
	2442	qemu_mutex_unlock_ramlist();
	2443	}
	2444
	2445	#ifndef _WIN32
	2446	void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
	2447	{
	2448	RAMBlock *block;
	2449	ram_addr_t offset;
	2450	int flags;
	2451	void area, vaddr;
	2452
	2453	RAMBLOCK_FOREACH(block) {
	2454	offset = addr - block->offset;
	2455	if (offset < block->max_length) {
	2456	vaddr = ramblock_ptr(block, offset);
	2457	if (block->flags & RAM_PREALLOC) {
	2458	;
	2459	} else if (xen_enabled()) {
	2460	abort();
	2461	} else {
	2462	flags = MAP_FIXED;
	2463	if (block->fd >= 0) {
	2464	flags \|= (block->flags & RAM_SHARED ?
	2465	MAP_SHARED : MAP_PRIVATE);
	2466	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2467	flags, block->fd, offset);
	2468	} else {
	2469	/*
	2470	* Remap needs to match alloc. Accelerators that
	2471	* set phys_mem_alloc never remap. If they did,
	2472	* we'd need a remap hook here.
	2473	*/
	2474	assert(phys_mem_alloc == qemu_anon_ram_alloc);
	2475
	2476	flags \|= MAP_PRIVATE \| MAP_ANONYMOUS;
	2477	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2478	flags, -1, 0);
	2479	}
	2480	if (area != vaddr) {
	2481	error_report("Could not remap addr: "
	2482	RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
	2483	length, addr);
	2484	exit(1);
	2485	}
	2486	memory_try_enable_merging(vaddr, length);
	2487	qemu_ram_setup_dump(vaddr, length);
	2488	}
	2489	}
	2490	}
	2491	}
	2492	#endif /* !_WIN32 */
	2493
	2494	/* Return a host pointer to ram allocated with qemu_ram_alloc.
	2495	* This should not be used for general purpose DMA. Use address_space_map
	2496	* or address_space_rw instead. For local memory (e.g. video ram) that the
	2497	* device owns, use memory_region_get_ram_ptr.
	2498	*
	2499	* Called within RCU critical section.
	2500	*/
	2501	void qemu_map_ram_ptr(RAMBlock ram_block, ram_addr_t addr)
	2502	{
	2503	RAMBlock *block = ram_block;
	2504
	2505	if (block == NULL) {
	2506	block = qemu_get_ram_block(addr);
	2507	addr -= block->offset;
	2508	}
	2509
	2510	if (xen_enabled() && block->host == NULL) {
	2511	/* We need to check if the requested address is in the RAM
	2512	* because we don't want to map the entire memory in QEMU.
	2513	* In that case just map until the end of the page.
	2514	*/
	2515	if (block->offset == 0) {
	2516	return xen_map_cache(addr, 0, 0, false);
	2517	}
	2518
	2519	block->host = xen_map_cache(block->offset, block->max_length, 1, false);
	2520	}
	2521	return ramblock_ptr(block, addr);
	2522	}
	2523
	2524	/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
	2525	* but takes a size argument.
	2526	*
	2527	* Called within RCU critical section.
	2528	*/
	2529	static void qemu_ram_ptr_length(RAMBlock ram_block, ram_addr_t addr,
	2530	hwaddr *size, bool lock)
	2531	{
	2532	RAMBlock *block = ram_block;
	2533	if (*size == 0) {
	2534	return NULL;
	2535	}
	2536
	2537	if (block == NULL) {
	2538	block = qemu_get_ram_block(addr);
	2539	addr -= block->offset;
	2540	}
	2541	size = MIN(size, block->max_length - addr);
	2542
	2543	if (xen_enabled() && block->host == NULL) {
	2544	/* We need to check if the requested address is in the RAM
	2545	* because we don't want to map the entire memory in QEMU.
	2546	* In that case just map the requested area.
	2547	*/
	2548	if (block->offset == 0) {
	2549	return xen_map_cache(addr, *size, lock, lock);
	2550	}
	2551
	2552	block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
	2553	}
	2554
	2555	return ramblock_ptr(block, addr);
	2556	}
	2557
	2558	/* Return the offset of a hostpointer within a ramblock */
	2559	ram_addr_t qemu_ram_block_host_offset(RAMBlock rb, void host)
	2560	{
	2561	ram_addr_t res = (uint8_t )host - (uint8_t )rb->host;
	2562	assert((uintptr_t)host >= (uintptr_t)rb->host);
	2563	assert(res < rb->max_length);
	2564
	2565	return res;
	2566	}
	2567
	2568	/*
	2569	* Translates a host ptr back to a RAMBlock, a ram_addr and an offset
	2570	* in that RAMBlock.
	2571	*
	2572	* ptr: Host pointer to look up
	2573	* round_offset: If true round the result offset down to a page boundary
	2574	* *ram_addr: set to result ram_addr
	2575	* *offset: set to result offset within the RAMBlock
	2576	*
	2577	* Returns: RAMBlock (or NULL if not found)
	2578	*
	2579	* By the time this function returns, the returned pointer is not protected
	2580	* by RCU anymore. If the caller is not within an RCU critical section and
	2581	* does not hold the iothread lock, it must have other means of protecting the
	2582	* pointer, such as a reference to the region that includes the incoming
	2583	* ram_addr_t.
	2584	*/
	2585	RAMBlock qemu_ram_block_from_host(void ptr, bool round_offset,
	2586	ram_addr_t *offset)
	2587	{
	2588	RAMBlock *block;
	2589	uint8_t *host = ptr;
	2590
	2591	if (xen_enabled()) {
	2592	ram_addr_t ram_addr;
	2593	rcu_read_lock();
	2594	ram_addr = xen_ram_addr_from_mapcache(ptr);
	2595	block = qemu_get_ram_block(ram_addr);
	2596	if (block) {
	2597	*offset = ram_addr - block->offset;
	2598	}
	2599	rcu_read_unlock();
	2600	return block;
	2601	}
	2602
	2603	rcu_read_lock();
	2604	block = atomic_rcu_read(&ram_list.mru_block);
	2605	if (block && block->host && host - block->host < block->max_length) {
	2606	goto found;
	2607	}
	2608
	2609	RAMBLOCK_FOREACH(block) {
	2610	/* This case append when the block is not mapped. */
	2611	if (block->host == NULL) {
	2612	continue;
	2613	}
	2614	if (host - block->host < block->max_length) {
	2615	goto found;
	2616	}
	2617	}
	2618
	2619	rcu_read_unlock();
	2620	return NULL;
	2621
	2622	found:
	2623	*offset = (host - block->host);
	2624	if (round_offset) {
	2625	*offset &= TARGET_PAGE_MASK;
	2626	}
	2627	rcu_read_unlock();
	2628	return block;
	2629	}
	2630
	2631	/*
	2632	* Finds the named RAMBlock
	2633	*
	2634	* name: The name of RAMBlock to find
	2635	*
	2636	* Returns: RAMBlock (or NULL if not found)
	2637	*/
	2638	RAMBlock qemu_ram_block_by_name(const char name)
	2639	{
	2640	RAMBlock *block;
	2641
	2642	RAMBLOCK_FOREACH(block) {
	2643	if (!strcmp(name, block->idstr)) {
	2644	return block;
	2645	}
	2646	}
	2647
	2648	return NULL;
	2649	}
	2650
	2651	/* Some of the softmmu routines need to translate from a host pointer
	2652	(typically a TLB entry) back to a ram offset. */
	2653	ram_addr_t qemu_ram_addr_from_host(void *ptr)
	2654	{
	2655	RAMBlock *block;
	2656	ram_addr_t offset;
	2657
	2658	block = qemu_ram_block_from_host(ptr, false, &offset);
	2659	if (!block) {
	2660	return RAM_ADDR_INVALID;
	2661	}
	2662
	2663	return block->offset + offset;
	2664	}
	2665
	2666	/* Called within RCU critical section. */
	2667	void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
	2668	CPUState *cpu,
	2669	vaddr mem_vaddr,
	2670	ram_addr_t ram_addr,
	2671	unsigned size)
	2672	{
	2673	ndi->cpu = cpu;
	2674	ndi->ram_addr = ram_addr;
	2675	ndi->mem_vaddr = mem_vaddr;
	2676	ndi->size = size;
	2677	ndi->pages = NULL;
	2678
	2679	assert(tcg_enabled());
	2680	if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
	2681	ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
	2682	tb_invalidate_phys_page_fast(ndi->pages, ram_addr, size);
	2683	}
	2684	}
	2685
	2686	/* Called within RCU critical section. */
	2687	void memory_notdirty_write_complete(NotDirtyInfo *ndi)
	2688	{
	2689	if (ndi->pages) {
	2690	assert(tcg_enabled());
	2691	page_collection_unlock(ndi->pages);
	2692	ndi->pages = NULL;
	2693	}
	2694
	2695	/* Set both VGA and migration bits for simplicity and to remove
	2696	* the notdirty callback faster.
	2697	*/
	2698	cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
	2699	DIRTY_CLIENTS_NOCODE);
	2700	/* we remove the notdirty callback only if the code has been
	2701	flushed */
	2702	if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
	2703	tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
	2704	}
	2705	}
	2706
	2707	/* Called within RCU critical section. */
	2708	static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
	2709	uint64_t val, unsigned size)
	2710	{
	2711	NotDirtyInfo ndi;
	2712
	2713	memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
	2714	ram_addr, size);
	2715
	2716	stn_p(qemu_map_ram_ptr(NULL, ram_addr), size, val);
	2717	memory_notdirty_write_complete(&ndi);
	2718	}
	2719
	2720	static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
	2721	unsigned size, bool is_write,
	2722	MemTxAttrs attrs)
	2723	{
	2724	return is_write;
	2725	}
	2726
	2727	static const MemoryRegionOps notdirty_mem_ops = {
	2728	.write = notdirty_mem_write,
	2729	.valid.accepts = notdirty_mem_accepts,
	2730	.endianness = DEVICE_NATIVE_ENDIAN,
	2731	.valid = {
	2732	.min_access_size = 1,
	2733	.max_access_size = 8,
	2734	.unaligned = false,
	2735	},
	2736	.impl = {
	2737	.min_access_size = 1,
	2738	.max_access_size = 8,
	2739	.unaligned = false,
	2740	},
	2741	};
	2742
	2743	/* Generate a debug exception if a watchpoint has been hit. */
	2744	static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
	2745	{
	2746	CPUState *cpu = current_cpu;
	2747	CPUClass *cc = CPU_GET_CLASS(cpu);
	2748	target_ulong vaddr;
	2749	CPUWatchpoint *wp;
	2750
	2751	assert(tcg_enabled());
	2752	if (cpu->watchpoint_hit) {
	2753	/* We re-entered the check after replacing the TB. Now raise
	2754	* the debug interrupt so that is will trigger after the
	2755	* current instruction. */
	2756	cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
	2757	return;
	2758	}
	2759	vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
	2760	vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
	2761	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	2762	if (cpu_watchpoint_address_matches(wp, vaddr, len)
	2763	&& (wp->flags & flags)) {
	2764	if (flags == BP_MEM_READ) {
	2765	wp->flags \|= BP_WATCHPOINT_HIT_READ;
	2766	} else {
	2767	wp->flags \|= BP_WATCHPOINT_HIT_WRITE;
	2768	}
	2769	wp->hitaddr = vaddr;
	2770	wp->hitattrs = attrs;
	2771	if (!cpu->watchpoint_hit) {
	2772	if (wp->flags & BP_CPU &&
	2773	!cc->debug_check_watchpoint(cpu, wp)) {
	2774	wp->flags &= ~BP_WATCHPOINT_HIT;
	2775	continue;
	2776	}
	2777	cpu->watchpoint_hit = wp;
	2778
	2779	mmap_lock();
	2780	tb_check_watchpoint(cpu);
	2781	if (wp->flags & BP_STOP_BEFORE_ACCESS) {
	2782	cpu->exception_index = EXCP_DEBUG;
	2783	mmap_unlock();
	2784	cpu_loop_exit(cpu);
	2785	} else {
	2786	/* Force execution of one insn next time. */
	2787	cpu->cflags_next_tb = 1 \| curr_cflags();
	2788	mmap_unlock();
	2789	cpu_loop_exit_noexc(cpu);
	2790	}
	2791	}
	2792	} else {
	2793	wp->flags &= ~BP_WATCHPOINT_HIT;
	2794	}
	2795	}
	2796	}
	2797
	2798	/* Watchpoint access routines. Watchpoints are inserted using TLB tricks,
	2799	so these check for a hit then pass through to the normal out-of-line
	2800	phys routines. */
	2801	static MemTxResult watch_mem_read(void opaque, hwaddr addr, uint64_t pdata,
	2802	unsigned size, MemTxAttrs attrs)
	2803	{
	2804	MemTxResult res;
	2805	uint64_t data;
	2806	int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
	2807	AddressSpace *as = current_cpu->cpu_ases[asidx].as;
	2808
	2809	check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
	2810	switch (size) {
	2811	case 1:
	2812	data = address_space_ldub(as, addr, attrs, &res);
	2813	break;
	2814	case 2:
	2815	data = address_space_lduw(as, addr, attrs, &res);
	2816	break;
	2817	case 4:
	2818	data = address_space_ldl(as, addr, attrs, &res);
	2819	break;
	2820	case 8:
	2821	data = address_space_ldq(as, addr, attrs, &res);
	2822	break;
	2823	default: abort();
	2824	}
	2825	*pdata = data;
	2826	return res;
	2827	}
	2828
	2829	static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
	2830	uint64_t val, unsigned size,
	2831	MemTxAttrs attrs)
	2832	{
	2833	MemTxResult res;
	2834	int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
	2835	AddressSpace *as = current_cpu->cpu_ases[asidx].as;
	2836
	2837	check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
	2838	switch (size) {
	2839	case 1:
	2840	address_space_stb(as, addr, val, attrs, &res);
	2841	break;
	2842	case 2:
	2843	address_space_stw(as, addr, val, attrs, &res);
	2844	break;
	2845	case 4:
	2846	address_space_stl(as, addr, val, attrs, &res);
	2847	break;
	2848	case 8:
	2849	address_space_stq(as, addr, val, attrs, &res);
	2850	break;
	2851	default: abort();
	2852	}
	2853	return res;
	2854	}
	2855
	2856	static const MemoryRegionOps watch_mem_ops = {
	2857	.read_with_attrs = watch_mem_read,
	2858	.write_with_attrs = watch_mem_write,
	2859	.endianness = DEVICE_NATIVE_ENDIAN,
	2860	.valid = {
	2861	.min_access_size = 1,
	2862	.max_access_size = 8,
	2863	.unaligned = false,
	2864	},
	2865	.impl = {
	2866	.min_access_size = 1,
	2867	.max_access_size = 8,
	2868	.unaligned = false,
	2869	},
	2870	};
	2871
	2872	static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
	2873	MemTxAttrs attrs, uint8_t *buf, int len);
	2874	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	2875	const uint8_t *buf, int len);
	2876	static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
	2877	bool is_write, MemTxAttrs attrs);
	2878
	2879	static MemTxResult subpage_read(void opaque, hwaddr addr, uint64_t data,
	2880	unsigned len, MemTxAttrs attrs)
	2881	{
	2882	subpage_t *subpage = opaque;
	2883	uint8_t buf[8];
	2884	MemTxResult res;
	2885
	2886	#if defined(DEBUG_SUBPAGE)
	2887	printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
	2888	subpage, len, addr);
	2889	#endif
	2890	res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
	2891	if (res) {
	2892	return res;
	2893	}
	2894	*data = ldn_p(buf, len);
	2895	return MEMTX_OK;
	2896	}
	2897
	2898	static MemTxResult subpage_write(void *opaque, hwaddr addr,
	2899	uint64_t value, unsigned len, MemTxAttrs attrs)
	2900	{
	2901	subpage_t *subpage = opaque;
	2902	uint8_t buf[8];
	2903
	2904	#if defined(DEBUG_SUBPAGE)
	2905	printf("%s: subpage %p len %u addr " TARGET_FMT_plx
	2906	" value %"PRIx64"\n",
	2907	__func__, subpage, len, addr, value);
	2908	#endif
	2909	stn_p(buf, len, value);
	2910	return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
	2911	}
	2912
	2913	static bool subpage_accepts(void *opaque, hwaddr addr,
	2914	unsigned len, bool is_write,
	2915	MemTxAttrs attrs)
	2916	{
	2917	subpage_t *subpage = opaque;
	2918	#if defined(DEBUG_SUBPAGE)
	2919	printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
	2920	__func__, subpage, is_write ? 'w' : 'r', len, addr);
	2921	#endif
	2922
	2923	return flatview_access_valid(subpage->fv, addr + subpage->base,
	2924	len, is_write, attrs);
	2925	}
	2926
	2927	static const MemoryRegionOps subpage_ops = {
	2928	.read_with_attrs = subpage_read,
	2929	.write_with_attrs = subpage_write,
	2930	.impl.min_access_size = 1,
	2931	.impl.max_access_size = 8,
	2932	.valid.min_access_size = 1,
	2933	.valid.max_access_size = 8,
	2934	.valid.accepts = subpage_accepts,
	2935	.endianness = DEVICE_NATIVE_ENDIAN,
	2936	};
	2937
	2938	static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
	2939	uint16_t section)
	2940	{
	2941	int idx, eidx;
	2942
	2943	if (start >= TARGET_PAGE_SIZE \|\| end >= TARGET_PAGE_SIZE)
	2944	return -1;
	2945	idx = SUBPAGE_IDX(start);
	2946	eidx = SUBPAGE_IDX(end);
	2947	#if defined(DEBUG_SUBPAGE)
	2948	printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
	2949	__func__, mmio, start, end, idx, eidx, section);
	2950	#endif
	2951	for (; idx <= eidx; idx++) {
	2952	mmio->sub_section[idx] = section;
	2953	}
	2954
	2955	return 0;
	2956	}
	2957
	2958	static subpage_t subpage_init(FlatView fv, hwaddr base)
	2959	{
	2960	subpage_t *mmio;
	2961
	2962	mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
	2963	mmio->fv = fv;
	2964	mmio->base = base;
	2965	memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
	2966	NULL, TARGET_PAGE_SIZE);
	2967	mmio->iomem.subpage = true;
	2968	#if defined(DEBUG_SUBPAGE)
	2969	printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
	2970	mmio, base, TARGET_PAGE_SIZE);
	2971	#endif
	2972	subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
	2973
	2974	return mmio;
	2975	}
	2976
	2977	static uint16_t dummy_section(PhysPageMap map, FlatView fv, MemoryRegion *mr)
	2978	{
	2979	assert(fv);
	2980	MemoryRegionSection section = {
	2981	.fv = fv,
	2982	.mr = mr,
	2983	.offset_within_address_space = 0,
	2984	.offset_within_region = 0,
	2985	.size = int128_2_64(),
	2986	};
	2987
	2988	return phys_section_add(map, &section);
	2989	}
	2990
	2991	static void readonly_mem_write(void *opaque, hwaddr addr,
	2992	uint64_t val, unsigned size)
	2993	{
	2994	/* Ignore any write to ROM. */
	2995	}
	2996
	2997	static bool readonly_mem_accepts(void *opaque, hwaddr addr,
	2998	unsigned size, bool is_write,
	2999	MemTxAttrs attrs)
	3000	{
	3001	return is_write;
	3002	}
	3003
	3004	/* This will only be used for writes, because reads are special cased
	3005	* to directly access the underlying host ram.
	3006	*/
	3007	static const MemoryRegionOps readonly_mem_ops = {
	3008	.write = readonly_mem_write,
	3009	.valid.accepts = readonly_mem_accepts,
	3010	.endianness = DEVICE_NATIVE_ENDIAN,
	3011	.valid = {
	3012	.min_access_size = 1,
	3013	.max_access_size = 8,
	3014	.unaligned = false,
	3015	},
	3016	.impl = {
	3017	.min_access_size = 1,
	3018	.max_access_size = 8,
	3019	.unaligned = false,
	3020	},
	3021	};
	3022
	3023	MemoryRegionSection iotlb_to_section(CPUState cpu,
	3024	hwaddr index, MemTxAttrs attrs)
	3025	{
	3026	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	3027	CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
	3028	AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
	3029	MemoryRegionSection *sections = d->map.sections;
	3030
	3031	return &sections[index & ~TARGET_PAGE_MASK];
	3032	}
	3033
	3034	static void io_mem_init(void)
	3035	{
	3036	memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
	3037	NULL, NULL, UINT64_MAX);
	3038	memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
	3039	NULL, UINT64_MAX);
	3040
	3041	/* io_mem_notdirty calls tb_invalidate_phys_page_fast,
	3042	* which can be called without the iothread mutex.
	3043	*/
	3044	memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
	3045	NULL, UINT64_MAX);
	3046	memory_region_clear_global_locking(&io_mem_notdirty);
	3047
	3048	memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
	3049	NULL, UINT64_MAX);
	3050	}
	3051
	3052	AddressSpaceDispatch address_space_dispatch_new(FlatView fv)
	3053	{
	3054	AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
	3055	uint16_t n;
	3056
	3057	n = dummy_section(&d->map, fv, &io_mem_unassigned);
	3058	assert(n == PHYS_SECTION_UNASSIGNED);
	3059	n = dummy_section(&d->map, fv, &io_mem_notdirty);
	3060	assert(n == PHYS_SECTION_NOTDIRTY);
	3061	n = dummy_section(&d->map, fv, &io_mem_rom);
	3062	assert(n == PHYS_SECTION_ROM);
	3063	n = dummy_section(&d->map, fv, &io_mem_watch);
	3064	assert(n == PHYS_SECTION_WATCH);
	3065
	3066	d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
	3067
	3068	return d;
	3069	}
	3070
	3071	void address_space_dispatch_free(AddressSpaceDispatch *d)
	3072	{
	3073	phys_sections_free(&d->map);
	3074	g_free(d);
	3075	}
	3076
	3077	static void tcg_commit(MemoryListener *listener)
	3078	{
	3079	CPUAddressSpace *cpuas;
	3080	AddressSpaceDispatch *d;
	3081
	3082	assert(tcg_enabled());
	3083	/* since each CPU stores ram addresses in its TLB cache, we must
	3084	reset the modified entries */
	3085	cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
	3086	cpu_reloading_memory_map();
	3087	/* The CPU and TLB are protected by the iothread lock.
	3088	* We reload the dispatch pointer now because cpu_reloading_memory_map()
	3089	* may have split the RCU critical section.
	3090	*/
	3091	d = address_space_to_dispatch(cpuas->as);
	3092	atomic_rcu_set(&cpuas->memory_dispatch, d);
	3093	tlb_flush(cpuas->cpu);
	3094	}
	3095
	3096	static void memory_map_init(void)
	3097	{
	3098	system_memory = g_malloc(sizeof(*system_memory));
	3099
	3100	memory_region_init(system_memory, NULL, "system", UINT64_MAX);
	3101	address_space_init(&address_space_memory, system_memory, "memory");
	3102
	3103	system_io = g_malloc(sizeof(*system_io));
	3104	memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
	3105	65536);
	3106	address_space_init(&address_space_io, system_io, "I/O");
	3107	}
	3108
	3109	MemoryRegion *get_system_memory(void)
	3110	{
	3111	return system_memory;
	3112	}
	3113
	3114	MemoryRegion *get_system_io(void)
	3115	{
	3116	return system_io;
	3117	}
	3118
	3119	#endif /* !defined(CONFIG_USER_ONLY) */
	3120
	3121	/* physical memory access (slow version, mainly for debug) */
	3122	#if defined(CONFIG_USER_ONLY)
	3123	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3124	uint8_t *buf, int len, int is_write)
	3125	{
	3126	int l, flags;
	3127	target_ulong page;
	3128	void * p;
	3129
	3130	while (len > 0) {
	3131	page = addr & TARGET_PAGE_MASK;
	3132	l = (page + TARGET_PAGE_SIZE) - addr;
	3133	if (l > len)
	3134	l = len;
	3135	flags = page_get_flags(page);
	3136	if (!(flags & PAGE_VALID))
	3137	return -1;
	3138	if (is_write) {
	3139	if (!(flags & PAGE_WRITE))
	3140	return -1;
	3141	/* XXX: this code should not depend on lock_user */
	3142	if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
	3143	return -1;
	3144	memcpy(p, buf, l);
	3145	unlock_user(p, addr, l);
	3146	} else {
	3147	if (!(flags & PAGE_READ))
	3148	return -1;
	3149	/* XXX: this code should not depend on lock_user */
	3150	if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
	3151	return -1;
	3152	memcpy(buf, p, l);
	3153	unlock_user(p, addr, 0);
	3154	}
	3155	len -= l;
	3156	buf += l;
	3157	addr += l;
	3158	}
	3159	return 0;
	3160	}
	3161
	3162	#else
	3163
	3164	static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
	3165	hwaddr length)
	3166	{
	3167	uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
	3168	addr += memory_region_get_ram_addr(mr);
	3169
	3170	/* No early return if dirty_log_mask is or becomes 0, because
	3171	* cpu_physical_memory_set_dirty_range will still call
	3172	* xen_modified_memory.
	3173	*/
	3174	if (dirty_log_mask) {
	3175	dirty_log_mask =
	3176	cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
	3177	}
	3178	if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
	3179	assert(tcg_enabled());
	3180	tb_invalidate_phys_range(addr, addr + length);
	3181	dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
	3182	}
	3183	cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
	3184	}
	3185
	3186	static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
	3187	{
	3188	unsigned access_size_max = mr->ops->valid.max_access_size;
	3189
	3190	/* Regions are assumed to support 1-4 byte accesses unless
	3191	otherwise specified. */
	3192	if (access_size_max == 0) {
	3193	access_size_max = 4;
	3194	}
	3195
	3196	/* Bound the maximum access by the alignment of the address. */
	3197	if (!mr->ops->impl.unaligned) {
	3198	unsigned align_size_max = addr & -addr;
	3199	if (align_size_max != 0 && align_size_max < access_size_max) {
	3200	access_size_max = align_size_max;
	3201	}
	3202	}
	3203
	3204	/* Don't attempt accesses larger than the maximum. */
	3205	if (l > access_size_max) {
	3206	l = access_size_max;
	3207	}
	3208	l = pow2floor(l);
	3209
	3210	return l;
	3211	}
	3212
	3213	static bool prepare_mmio_access(MemoryRegion *mr)
	3214	{
	3215	bool unlocked = !qemu_mutex_iothread_locked();
	3216	bool release_lock = false;
	3217
	3218	if (unlocked && mr->global_locking) {
	3219	qemu_mutex_lock_iothread();
	3220	unlocked = false;
	3221	release_lock = true;
	3222	}
	3223	if (mr->flush_coalesced_mmio) {
	3224	if (unlocked) {
	3225	qemu_mutex_lock_iothread();
	3226	}
	3227	qemu_flush_coalesced_mmio_buffer();
	3228	if (unlocked) {
	3229	qemu_mutex_unlock_iothread();
	3230	}
	3231	}
	3232
	3233	return release_lock;
	3234	}
	3235
	3236	/* Called within RCU critical section. */
	3237	static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
	3238	MemTxAttrs attrs,
	3239	const uint8_t *buf,
	3240	int len, hwaddr addr1,
	3241	hwaddr l, MemoryRegion *mr)
	3242	{
	3243	uint8_t *ptr;
	3244	uint64_t val;
	3245	MemTxResult result = MEMTX_OK;
	3246	bool release_lock = false;
	3247
	3248	for (;;) {
	3249	if (!memory_access_is_direct(mr, true)) {
	3250	release_lock \|= prepare_mmio_access(mr);
	3251	l = memory_access_size(mr, l, addr1);
	3252	/* XXX: could force current_cpu to NULL to avoid
	3253	potential bugs */
	3254	val = ldn_p(buf, l);
	3255	result \|= memory_region_dispatch_write(mr, addr1, val, l, attrs);
	3256	} else {
	3257	/* RAM case */
	3258	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3259	memcpy(ptr, buf, l);
	3260	invalidate_and_set_dirty(mr, addr1, l);
	3261	}
	3262
	3263	if (release_lock) {
	3264	qemu_mutex_unlock_iothread();
	3265	release_lock = false;
	3266	}
	3267
	3268	len -= l;
	3269	buf += l;
	3270	addr += l;
	3271
	3272	if (!len) {
	3273	break;
	3274	}
	3275
	3276	l = len;
	3277	mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
	3278	}
	3279
	3280	return result;
	3281	}
	3282
	3283	/* Called from RCU critical section. */
	3284	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	3285	const uint8_t *buf, int len)
	3286	{
	3287	hwaddr l;
	3288	hwaddr addr1;
	3289	MemoryRegion *mr;
	3290	MemTxResult result = MEMTX_OK;
	3291
	3292	l = len;
	3293	mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
	3294	result = flatview_write_continue(fv, addr, attrs, buf, len,
	3295	addr1, l, mr);
	3296
	3297	return result;
	3298	}
	3299
	3300	/* Called within RCU critical section. */
	3301	MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
	3302	MemTxAttrs attrs, uint8_t *buf,
	3303	int len, hwaddr addr1, hwaddr l,
	3304	MemoryRegion *mr)
	3305	{
	3306	uint8_t *ptr;
	3307	uint64_t val;
	3308	MemTxResult result = MEMTX_OK;
	3309	bool release_lock = false;
	3310
	3311	for (;;) {
	3312	if (!memory_access_is_direct(mr, false)) {
	3313	/* I/O case */
	3314	release_lock \|= prepare_mmio_access(mr);
	3315	l = memory_access_size(mr, l, addr1);
	3316	result \|= memory_region_dispatch_read(mr, addr1, &val, l, attrs);
	3317	stn_p(buf, l, val);
	3318	} else {
	3319	/* RAM case */
	3320	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3321	memcpy(buf, ptr, l);
	3322	}
	3323
	3324	if (release_lock) {
	3325	qemu_mutex_unlock_iothread();
	3326	release_lock = false;
	3327	}
	3328
	3329	len -= l;
	3330	buf += l;
	3331	addr += l;
	3332
	3333	if (!len) {
	3334	break;
	3335	}
	3336
	3337	l = len;
	3338	mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
	3339	}
	3340
	3341	return result;
	3342	}
	3343
	3344	/* Called from RCU critical section. */
	3345	static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
	3346	MemTxAttrs attrs, uint8_t *buf, int len)
	3347	{
	3348	hwaddr l;
	3349	hwaddr addr1;
	3350	MemoryRegion *mr;
	3351
	3352	l = len;
	3353	mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
	3354	return flatview_read_continue(fv, addr, attrs, buf, len,
	3355	addr1, l, mr);
	3356	}
	3357
	3358	MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
	3359	MemTxAttrs attrs, uint8_t *buf, int len)
	3360	{
	3361	MemTxResult result = MEMTX_OK;
	3362	FlatView *fv;
	3363
	3364	if (len > 0) {
	3365	rcu_read_lock();
	3366	fv = address_space_to_flatview(as);
	3367	result = flatview_read(fv, addr, attrs, buf, len);
	3368	rcu_read_unlock();
	3369	}
	3370
	3371	return result;
	3372	}
	3373
	3374	MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
	3375	MemTxAttrs attrs,
	3376	const uint8_t *buf, int len)
	3377	{
	3378	MemTxResult result = MEMTX_OK;
	3379	FlatView *fv;
	3380
	3381	if (len > 0) {
	3382	rcu_read_lock();
	3383	fv = address_space_to_flatview(as);
	3384	result = flatview_write(fv, addr, attrs, buf, len);
	3385	rcu_read_unlock();
	3386	}
	3387
	3388	return result;
	3389	}
	3390
	3391	MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
	3392	uint8_t *buf, int len, bool is_write)
	3393	{
	3394	if (is_write) {
	3395	return address_space_write(as, addr, attrs, buf, len);
	3396	} else {
	3397	return address_space_read_full(as, addr, attrs, buf, len);
	3398	}
	3399	}
	3400
	3401	void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
	3402	int len, int is_write)
	3403	{
	3404	address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
	3405	buf, len, is_write);
	3406	}
	3407
/* Operation performed by cpu_physical_memory_write_rom_internal(). */
enum write_rom_type {
    WRITE_DATA,     /* copy the caller's buffer into ROM/RAM */
    FLUSH_CACHE,    /* flush the host instruction cache for the range */
};
	3412
	3413	static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
	3414	hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
	3415	{
	3416	hwaddr l;
	3417	uint8_t *ptr;
	3418	hwaddr addr1;
	3419	MemoryRegion *mr;
	3420
	3421	rcu_read_lock();
	3422	while (len > 0) {
	3423	l = len;
	3424	mr = address_space_translate(as, addr, &addr1, &l, true,
	3425	MEMTXATTRS_UNSPECIFIED);
	3426
	3427	if (!(memory_region_is_ram(mr) \|\|
	3428	memory_region_is_romd(mr))) {
	3429	l = memory_access_size(mr, l, addr1);
	3430	} else {
	3431	/* ROM/RAM case */
	3432	ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
	3433	switch (type) {
	3434	case WRITE_DATA:
	3435	memcpy(ptr, buf, l);
	3436	invalidate_and_set_dirty(mr, addr1, l);
	3437	break;
	3438	case FLUSH_CACHE:
	3439	flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
	3440	break;
	3441	}
	3442	}
	3443	len -= l;
	3444	buf += l;
	3445	addr += l;
	3446	}
	3447	rcu_read_unlock();
	3448	}
	3449
	3450	/* used for ROM loading : can write in RAM and ROM */
	3451	void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
	3452	const uint8_t *buf, int len)
	3453	{
	3454	cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
	3455	}
	3456
	3457	void cpu_flush_icache_range(hwaddr start, int len)
	3458	{
	3459	/*
	3460	* This function should do the same thing as an icache flush that was
	3461	* triggered from within the guest. For TCG we are always cache coherent,
	3462	* so there is no need to flush anything. For KVM / Xen we need to flush
	3463	* the host's instruction cache at least.
	3464	*/
	3465	if (tcg_enabled()) {
	3466	return;
	3467	}
	3468
	3469	cpu_physical_memory_write_rom_internal(&address_space_memory,
	3470	start, NULL, len, FLUSH_CACHE);
	3471	}
	3472
/*
 * State of the single bounce buffer that address_space_map() hands out when
 * the target region cannot be accessed directly.
 */
typedef struct {
    MemoryRegion *mr;   /* region kept referenced while the buffer is mapped */
    void *buffer;       /* qemu_memalign() allocation; NULL once unmapped */
    hwaddr addr;        /* address the buffer was mapped for */
    hwaddr len;         /* size of the buffer in bytes */
    bool in_use;        /* claimed with atomic_xchg(), cleared on unmap */
} BounceBuffer;

/* The one and only bounce buffer. */
static BounceBuffer bounce;
	3482
/*
 * A party waiting for the bounce buffer: its bottom half is scheduled by
 * cpu_notify_map_clients_locked().
 */
typedef struct MapClient {
    QEMUBH *bh;
    QLIST_ENTRY(MapClient) link;
} MapClient;

/* Taken around every access to map_client_list in this file. */
QemuMutex map_client_list_lock;
static QLIST_HEAD(map_client_list, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);
	3491
	3492	static void cpu_unregister_map_client_do(MapClient *client)
	3493	{
	3494	QLIST_REMOVE(client, link);
	3495	g_free(client);
	3496	}
	3497
	3498	static void cpu_notify_map_clients_locked(void)
	3499	{
	3500	MapClient *client;
	3501
	3502	while (!QLIST_EMPTY(&map_client_list)) {
	3503	client = QLIST_FIRST(&map_client_list);
	3504	qemu_bh_schedule(client->bh);
	3505	cpu_unregister_map_client_do(client);
	3506	}
	3507	}
	3508
	3509	void cpu_register_map_client(QEMUBH *bh)
	3510	{
	3511	MapClient client = g_malloc(sizeof(client));
	3512
	3513	qemu_mutex_lock(&map_client_list_lock);
	3514	client->bh = bh;
	3515	QLIST_INSERT_HEAD(&map_client_list, client, link);
	3516	if (!atomic_read(&bounce.in_use)) {
	3517	cpu_notify_map_clients_locked();
	3518	}
	3519	qemu_mutex_unlock(&map_client_list_lock);
	3520	}
	3521
/*
 * One-time setup: initialize the RAM list mutex, fix the target page size,
 * set up the I/O memory regions and the memory map, and initialize the map
 * client list lock.
 */
void cpu_exec_init_all(void)
{
    qemu_mutex_init(&ram_list.mutex);
    /* The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();
    io_mem_init();
    memory_map_init();
    qemu_mutex_init(&map_client_list_lock);
}
	3537
	3538	void cpu_unregister_map_client(QEMUBH *bh)
	3539	{
	3540	MapClient *client;
	3541
	3542	qemu_mutex_lock(&map_client_list_lock);
	3543	QLIST_FOREACH(client, &map_client_list, link) {
	3544	if (client->bh == bh) {
	3545	cpu_unregister_map_client_do(client);
	3546	break;
	3547	}
	3548	}
	3549	qemu_mutex_unlock(&map_client_list_lock);
	3550	}
	3551
	3552	static void cpu_notify_map_clients(void)
	3553	{
	3554	qemu_mutex_lock(&map_client_list_lock);
	3555	cpu_notify_map_clients_locked();
	3556	qemu_mutex_unlock(&map_client_list_lock);
	3557	}
	3558
	3559	static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
	3560	bool is_write, MemTxAttrs attrs)
	3561	{
	3562	MemoryRegion *mr;
	3563	hwaddr l, xlat;
	3564
	3565	while (len > 0) {
	3566	l = len;
	3567	mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
	3568	if (!memory_access_is_direct(mr, is_write)) {
	3569	l = memory_access_size(mr, l, addr);
	3570	if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
	3571	return false;
	3572	}
	3573	}
	3574
	3575	len -= l;
	3576	addr += l;
	3577	}
	3578	return true;
	3579	}
	3580
	3581	bool address_space_access_valid(AddressSpace *as, hwaddr addr,
	3582	int len, bool is_write,
	3583	MemTxAttrs attrs)
	3584	{
	3585	FlatView *fv;
	3586	bool result;
	3587
	3588	rcu_read_lock();
	3589	fv = address_space_to_flatview(as);
	3590	result = flatview_access_valid(fv, addr, len, is_write, attrs);
	3591	rcu_read_unlock();
	3592	return result;
	3593	}
	3594
	3595	static hwaddr
	3596	flatview_extend_translation(FlatView *fv, hwaddr addr,
	3597	hwaddr target_len,
	3598	MemoryRegion *mr, hwaddr base, hwaddr len,
	3599	bool is_write, MemTxAttrs attrs)
	3600	{
	3601	hwaddr done = 0;
	3602	hwaddr xlat;
	3603	MemoryRegion *this_mr;
	3604
	3605	for (;;) {
	3606	target_len -= len;
	3607	addr += len;
	3608	done += len;
	3609	if (target_len == 0) {
	3610	return done;
	3611	}
	3612
	3613	len = target_len;
	3614	this_mr = flatview_translate(fv, addr, &xlat,
	3615	&len, is_write, attrs);
	3616	if (this_mr != mr \|\| xlat != base + done) {
	3617	return done;
	3618	}
	3619	}
	3620	}
	3621
	3622	/* Map a physical memory region into a host virtual address.
	3623	* May map a subset of the requested range, given by and returned in *plen.
	3624	* May return NULL if resources needed to perform the mapping are exhausted.
	3625	* Use only for reads OR writes - not for read-modify-write operations.
	3626	* Use cpu_register_map_client() to know when retrying the map operation is
	3627	* likely to succeed.
	3628	*/
	3629	void address_space_map(AddressSpace as,
	3630	hwaddr addr,
	3631	hwaddr *plen,
	3632	bool is_write,
	3633	MemTxAttrs attrs)
	3634	{
	3635	hwaddr len = *plen;
	3636	hwaddr l, xlat;
	3637	MemoryRegion *mr;
	3638	void *ptr;
	3639	FlatView *fv;
	3640
	3641	if (len == 0) {
	3642	return NULL;
	3643	}
	3644
	3645	l = len;
	3646	rcu_read_lock();
	3647	fv = address_space_to_flatview(as);
	3648	mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
	3649
	3650	if (!memory_access_is_direct(mr, is_write)) {
	3651	if (atomic_xchg(&bounce.in_use, true)) {
	3652	rcu_read_unlock();
	3653	return NULL;
	3654	}
	3655	/* Avoid unbounded allocations */
	3656	l = MIN(l, TARGET_PAGE_SIZE);
	3657	bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
	3658	bounce.addr = addr;
	3659	bounce.len = l;
	3660
	3661	memory_region_ref(mr);
	3662	bounce.mr = mr;
	3663	if (!is_write) {
	3664	flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
	3665	bounce.buffer, l);
	3666	}
	3667
	3668	rcu_read_unlock();
	3669	*plen = l;
	3670	return bounce.buffer;
	3671	}
	3672
	3673
	3674	memory_region_ref(mr);
	3675	*plen = flatview_extend_translation(fv, addr, len, mr, xlat,
	3676	l, is_write, attrs);
	3677	ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
	3678	rcu_read_unlock();
	3679
	3680	return ptr;
	3681	}
	3682
	3683	/* Unmaps a memory region previously mapped by address_space_map().
	3684	* Will also mark the memory as dirty if is_write == 1. access_len gives
	3685	* the amount of memory that was actually read or written by the caller.
	3686	*/
	3687	void address_space_unmap(AddressSpace as, void buffer, hwaddr len,
	3688	int is_write, hwaddr access_len)
	3689	{
	3690	if (buffer != bounce.buffer) {
	3691	MemoryRegion *mr;
	3692	ram_addr_t addr1;
	3693
	3694	mr = memory_region_from_host(buffer, &addr1);
	3695	assert(mr != NULL);
	3696	if (is_write) {
	3697	invalidate_and_set_dirty(mr, addr1, access_len);
	3698	}
	3699	if (xen_enabled()) {
	3700	xen_invalidate_map_cache_entry(buffer);
	3701	}
	3702	memory_region_unref(mr);
	3703	return;
	3704	}
	3705	if (is_write) {
	3706	address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
	3707	bounce.buffer, access_len);
	3708	}
	3709	qemu_vfree(bounce.buffer);
	3710	bounce.buffer = NULL;
	3711	memory_region_unref(bounce.mr);
	3712	atomic_mb_set(&bounce.in_use, false);
	3713	cpu_notify_map_clients();
	3714	}
	3715
	3716	void *cpu_physical_memory_map(hwaddr addr,
	3717	hwaddr *plen,
	3718	int is_write)
	3719	{
	3720	return address_space_map(&address_space_memory, addr, plen, is_write,
	3721	MEMTXATTRS_UNSPECIFIED);
	3722	}
	3723
	3724	void cpu_physical_memory_unmap(void *buffer, hwaddr len,
	3725	int is_write, hwaddr access_len)
	3726	{
	3727	return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
	3728	}
	3729
/*
 * Parameters for memory_ldst.inc.c, instantiated here for a plain
 * AddressSpace: no name suffix, translation through
 * address_space_translate(), and real RCU locking.
 * NOTE(review): the same macros are defined again further down without an
 * #undef in this file, so the included file presumably undefines them;
 * confirm in memory_ldst.inc.c.
 */
#define ARG1_DECL AddressSpace *as
#define ARG1 as
#define SUFFIX
#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
#define RCU_READ_LOCK(...) rcu_read_lock()
#define RCU_READ_UNLOCK(...) rcu_read_unlock()
#include "memory_ldst.inc.c"
	3737
	3738	int64_t address_space_cache_init(MemoryRegionCache *cache,
	3739	AddressSpace *as,
	3740	hwaddr addr,
	3741	hwaddr len,
	3742	bool is_write)
	3743	{
	3744	AddressSpaceDispatch *d;
	3745	hwaddr l;
	3746	MemoryRegion *mr;
	3747
	3748	assert(len > 0);
	3749
	3750	l = len;
	3751	cache->fv = address_space_get_flatview(as);
	3752	d = flatview_to_dispatch(cache->fv);
	3753	cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
	3754
	3755	mr = cache->mrs.mr;
	3756	memory_region_ref(mr);
	3757	if (memory_access_is_direct(mr, is_write)) {
	3758	/* We don't care about the memory attributes here as we're only
	3759	* doing this if we found actual RAM, which behaves the same
	3760	* regardless of attributes; so UNSPECIFIED is fine.
	3761	*/
	3762	l = flatview_extend_translation(cache->fv, addr, len, mr,
	3763	cache->xlat, l, is_write,
	3764	MEMTXATTRS_UNSPECIFIED);
	3765	cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
	3766	} else {
	3767	cache->ptr = NULL;
	3768	}
	3769
	3770	cache->len = l;
	3771	cache->is_write = is_write;
	3772	return l;
	3773	}
	3774
	3775	void address_space_cache_invalidate(MemoryRegionCache *cache,
	3776	hwaddr addr,
	3777	hwaddr access_len)
	3778	{
	3779	assert(cache->is_write);
	3780	if (likely(cache->ptr)) {
	3781	invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
	3782	}
	3783	}
	3784
	3785	void address_space_cache_destroy(MemoryRegionCache *cache)
	3786	{
	3787	if (!cache->mrs.mr) {
	3788	return;
	3789	}
	3790
	3791	if (xen_enabled()) {
	3792	xen_invalidate_map_cache_entry(cache->ptr);
	3793	}
	3794	memory_region_unref(cache->mrs.mr);
	3795	flatview_unref(cache->fv);
	3796	cache->mrs.mr = NULL;
	3797	cache->fv = NULL;
	3798	}
	3799
	3800	/* Called from RCU critical section. This function has the same
	3801	* semantics as address_space_translate, but it only works on a
	3802	* predefined range of a MemoryRegion that was mapped with
	3803	* address_space_cache_init.
	3804	*/
	3805	static inline MemoryRegion *address_space_translate_cached(
	3806	MemoryRegionCache cache, hwaddr addr, hwaddr xlat,
	3807	hwaddr *plen, bool is_write, MemTxAttrs attrs)
	3808	{
	3809	MemoryRegionSection section;
	3810	MemoryRegion *mr;
	3811	IOMMUMemoryRegion *iommu_mr;
	3812	AddressSpace *target_as;
	3813
	3814	assert(!cache->ptr);
	3815	*xlat = addr + cache->xlat;
	3816
	3817	mr = cache->mrs.mr;
	3818	iommu_mr = memory_region_get_iommu(mr);
	3819	if (!iommu_mr) {
	3820	/* MMIO region. */
	3821	return mr;
	3822	}
	3823
	3824	section = address_space_translate_iommu(iommu_mr, xlat, plen,
	3825	NULL, is_write, true,
	3826	&target_as, attrs);
	3827	return section.mr;
	3828	}
	3829
	3830	/* Called from RCU critical section. address_space_read_cached uses this
	3831	* out of line function when the target is an MMIO or IOMMU region.
	3832	*/
	3833	void
	3834	address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
	3835	void *buf, int len)
	3836	{
	3837	hwaddr addr1, l;
	3838	MemoryRegion *mr;
	3839
	3840	l = len;
	3841	mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
	3842	MEMTXATTRS_UNSPECIFIED);
	3843	flatview_read_continue(cache->fv,
	3844	addr, MEMTXATTRS_UNSPECIFIED, buf, len,
	3845	addr1, l, mr);
	3846	}
	3847
	3848	/* Called from RCU critical section. address_space_write_cached uses this
	3849	* out of line function when the target is an MMIO or IOMMU region.
	3850	*/
	3851	void
	3852	address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
	3853	const void *buf, int len)
	3854	{
	3855	hwaddr addr1, l;
	3856	MemoryRegion *mr;
	3857
	3858	l = len;
	3859	mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
	3860	MEMTXATTRS_UNSPECIFIED);
	3861	flatview_write_continue(cache->fv,
	3862	addr, MEMTXATTRS_UNSPECIFIED, buf, len,
	3863	addr1, l, mr);
	3864	}
	3865
/*
 * Parameters for memory_ldst.inc.c, instantiated here for a
 * MemoryRegionCache: names get the _cached_slow suffix, translation goes
 * through address_space_translate_cached(), and the RCU lock macros expand
 * to nothing.
 * NOTE(review): presumably the callers already run inside an RCU critical
 * section, which is why no locking is done here; confirm against the users
 * of the generated functions.
 */
#define ARG1_DECL MemoryRegionCache *cache
#define ARG1 cache
#define SUFFIX _cached_slow
#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__)
#define RCU_READ_LOCK() ((void)0)
#define RCU_READ_UNLOCK() ((void)0)
#include "memory_ldst.inc.c"
	3873
	3874	/* virtual memory access for debug (includes writing to ROM) */
	3875	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3876	uint8_t *buf, int len, int is_write)
	3877	{
	3878	int l;
	3879	hwaddr phys_addr;
	3880	target_ulong page;
	3881
	3882	cpu_synchronize_state(cpu);
	3883	while (len > 0) {
	3884	int asidx;
	3885	MemTxAttrs attrs;
	3886
	3887	page = addr & TARGET_PAGE_MASK;
	3888	phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
	3889	asidx = cpu_asidx_from_attrs(cpu, attrs);
	3890	/* if no physical page mapped, return an error */
	3891	if (phys_addr == -1)
	3892	return -1;
	3893	l = (page + TARGET_PAGE_SIZE) - addr;
	3894	if (l > len)
	3895	l = len;
	3896	phys_addr += (addr & ~TARGET_PAGE_MASK);
	3897	if (is_write) {
	3898	cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
	3899	phys_addr, buf, l);
	3900	} else {
	3901	address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
	3902	MEMTXATTRS_UNSPECIFIED,
	3903	buf, l, 0);
	3904	}
	3905	len -= l;
	3906	buf += l;
	3907	addr += l;
	3908	}
	3909	return 0;
	3910	}
	3911
	3912	/*
	3913	* Allows code that needs to deal with migration bitmaps etc to still be built
	3914	* target independent.
	3915	*/
	3916	size_t qemu_target_page_size(void)
	3917	{
	3918	return TARGET_PAGE_SIZE;
	3919	}
	3920
	3921	int qemu_target_page_bits(void)
	3922	{
	3923	return TARGET_PAGE_BITS;
	3924	}
	3925
	3926	int qemu_target_page_bits_min(void)
	3927	{
	3928	return TARGET_PAGE_BITS_MIN;
	3929	}
	3930	#endif
	3931
	3932	/*
	3933	* A helper function for the _utterly broken_ virtio device model to find out if
	3934	* it's running on a big endian machine. Don't do this at home kids!
	3935	*/
	3936	bool target_words_bigendian(void);
	3937	bool target_words_bigendian(void)
	3938	{
	3939	#if defined(TARGET_WORDS_BIGENDIAN)
	3940	return true;
	3941	#else
	3942	return false;
	3943	#endif
	3944	}
	3945
	3946	#ifndef CONFIG_USER_ONLY
	3947	bool cpu_physical_memory_is_io(hwaddr phys_addr)
	3948	{
	3949	MemoryRegion*mr;
	3950	hwaddr l = 1;
	3951	bool res;
	3952
	3953	rcu_read_lock();
	3954	mr = address_space_translate(&address_space_memory,
	3955	phys_addr, &phys_addr, &l, false,
	3956	MEMTXATTRS_UNSPECIFIED);
	3957
	3958	res = !(memory_region_is_ram(mr) \|\| memory_region_is_romd(mr));
	3959	rcu_read_unlock();
	3960	return res;
	3961	}
	3962
	3963	int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
	3964	{
	3965	RAMBlock *block;
	3966	int ret = 0;
	3967
	3968	rcu_read_lock();
	3969	RAMBLOCK_FOREACH(block) {
	3970	ret = func(block->idstr, block->host, block->offset,
	3971	block->used_length, opaque);
	3972	if (ret) {
	3973	break;
	3974	}
	3975	}
	3976	rcu_read_unlock();
	3977	return ret;
	3978	}
	3979
	3980	int qemu_ram_foreach_migratable_block(RAMBlockIterFunc func, void *opaque)
	3981	{
	3982	RAMBlock *block;
	3983	int ret = 0;
	3984
	3985	rcu_read_lock();
	3986	RAMBLOCK_FOREACH(block) {
	3987	if (!qemu_ram_is_migratable(block)) {
	3988	continue;
	3989	}
	3990	ret = func(block->idstr, block->host, block->offset,
	3991	block->used_length, opaque);
	3992	if (ret) {
	3993	break;
	3994	}
	3995	}
	3996	rcu_read_unlock();
	3997	return ret;
	3998	}
	3999
	4000	/*
	4001	* Unmap pages of memory from start to start+length such that
	4002	* they a) read as 0, b) Trigger whatever fault mechanism
	4003	* the OS provides for postcopy.
	4004	* The pages must be unmapped by the end of the function.
	4005	* Returns: 0 on success, none-0 on failure
	4006	*
	4007	*/
	4008	int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
	4009	{
	4010	int ret = -1;
	4011
	4012	uint8_t *host_startaddr = rb->host + start;
	4013
	4014	if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
	4015	error_report("ram_block_discard_range: Unaligned start address: %p",
	4016	host_startaddr);
	4017	goto err;
	4018	}
	4019
	4020	if ((start + length) <= rb->used_length) {
	4021	bool need_madvise, need_fallocate;
	4022	uint8_t *host_endaddr = host_startaddr + length;
	4023	if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
	4024	error_report("ram_block_discard_range: Unaligned end address: %p",
	4025	host_endaddr);
	4026	goto err;
	4027	}
	4028
	4029	errno = ENOTSUP; /* If we are missing MADVISE etc */
	4030
	4031	/* The logic here is messy;
	4032	* madvise DONTNEED fails for hugepages
	4033	* fallocate works on hugepages and shmem
	4034	*/
	4035	need_madvise = (rb->page_size == qemu_host_page_size);
	4036	need_fallocate = rb->fd != -1;
	4037	if (need_fallocate) {
	4038	/* For a file, this causes the area of the file to be zero'd
	4039	* if read, and for hugetlbfs also causes it to be unmapped
	4040	* so a userfault will trigger.
	4041	*/
	4042	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	4043	ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
	4044	start, length);
	4045	if (ret) {
	4046	ret = -errno;
	4047	error_report("ram_block_discard_range: Failed to fallocate "
	4048	"%s:%" PRIx64 " +%zx (%d)",
	4049	rb->idstr, start, length, ret);
	4050	goto err;
	4051	}
	4052	#else
	4053	ret = -ENOSYS;
	4054	error_report("ram_block_discard_range: fallocate not available/file"
	4055	"%s:%" PRIx64 " +%zx (%d)",
	4056	rb->idstr, start, length, ret);
	4057	goto err;
	4058	#endif
	4059	}
	4060	if (need_madvise) {
	4061	/* For normal RAM this causes it to be unmapped,
	4062	* for shared memory it causes the local mapping to disappear
	4063	* and to fall back on the file contents (which we just
	4064	* fallocate'd away).
	4065	*/
	4066	#if defined(CONFIG_MADVISE)
	4067	ret = madvise(host_startaddr, length, MADV_DONTNEED);
	4068	if (ret) {
	4069	ret = -errno;
	4070	error_report("ram_block_discard_range: Failed to discard range "
	4071	"%s:%" PRIx64 " +%zx (%d)",
	4072	rb->idstr, start, length, ret);
	4073	goto err;
	4074	}
	4075	#else
	4076	ret = -ENOSYS;
	4077	error_report("ram_block_discard_range: MADVISE not available"
	4078	"%s:%" PRIx64 " +%zx (%d)",
	4079	rb->idstr, start, length, ret);
	4080	goto err;
	4081	#endif
	4082	}
	4083	trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
	4084	need_madvise, need_fallocate, ret);
	4085	} else {
	4086	error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
	4087	"/%zx/" RAM_ADDR_FMT")",
	4088	rb->idstr, start, length, rb->used_length);
	4089	}
	4090
	4091	err:
	4092	return ret;
	4093	}
	4094
	4095	#endif
	4096
	4097	void page_size_init(void)
	4098	{
	4099	/* NOTE: we can always suppose that qemu_host_page_size >=
	4100	TARGET_PAGE_SIZE */
	4101	if (qemu_host_page_size == 0) {
	4102	qemu_host_page_size = qemu_real_host_page_size;
	4103	}
	4104	if (qemu_host_page_size < TARGET_PAGE_SIZE) {
	4105	qemu_host_page_size = TARGET_PAGE_SIZE;
	4106	}
	4107	qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
	4108	}
	4109
	4110	#if !defined(CONFIG_USER_ONLY)
	4111
	4112	static void mtree_print_phys_entries(fprintf_function mon, void *f,
	4113	int start, int end, int skip, int ptr)
	4114	{
	4115	if (start == end - 1) {
	4116	mon(f, "\t%3d ", start);
	4117	} else {
	4118	mon(f, "\t%3d..%-3d ", start, end - 1);
	4119	}
	4120	mon(f, " skip=%d ", skip);
	4121	if (ptr == PHYS_MAP_NODE_NIL) {
	4122	mon(f, " ptr=NIL");
	4123	} else if (!skip) {
	4124	mon(f, " ptr=#%d", ptr);
	4125	} else {
	4126	mon(f, " ptr=[%d]", ptr);
	4127	}
	4128	mon(f, "\n");
	4129	}
	4130
/*
 * Offset of the last byte of a region whose Int128 size is @size:
 * (size - 1) truncated to 64 bits, or 0 when the size is zero.
 */
#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
                           int128_sub((size), int128_one())) : 0)
	4133
	4134	void mtree_print_dispatch(fprintf_function mon, void *f,
	4135	AddressSpaceDispatch d, MemoryRegion root)
	4136	{
	4137	int i;
	4138
	4139	mon(f, " Dispatch\n");
	4140	mon(f, " Physical sections\n");
	4141
	4142	for (i = 0; i < d->map.sections_nb; ++i) {
	4143	MemoryRegionSection *s = d->map.sections + i;
	4144	const char *names[] = { " [unassigned]", " [not dirty]",
	4145	" [ROM]", " [watch]" };
	4146
	4147	mon(f, " #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx " %s%s%s%s%s",
	4148	i,
	4149	s->offset_within_address_space,
	4150	s->offset_within_address_space + MR_SIZE(s->mr->size),
	4151	s->mr->name ? s->mr->name : "(noname)",
	4152	i < ARRAY_SIZE(names) ? names[i] : "",
	4153	s->mr == root ? " [ROOT]" : "",
	4154	s == d->mru_section ? " [MRU]" : "",
	4155	s->mr->is_iommu ? " [iommu]" : "");
	4156
	4157	if (s->mr->alias) {
	4158	mon(f, " alias=%s", s->mr->alias->name ?
	4159	s->mr->alias->name : "noname");
	4160	}
	4161	mon(f, "\n");
	4162	}
	4163
	4164	mon(f, " Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
	4165	P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
	4166	for (i = 0; i < d->map.nodes_nb; ++i) {
	4167	int j, jprev;
	4168	PhysPageEntry prev;
	4169	Node *n = d->map.nodes + i;
	4170
	4171	mon(f, " [%d]\n", i);
	4172
	4173	for (j = 0, jprev = 0, prev = n[0]; j < ARRAY_SIZE(n); ++j) {
	4174	PhysPageEntry pe = n + j;
	4175
	4176	if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
	4177	continue;
	4178	}
	4179
	4180	mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
	4181
	4182	jprev = j;
	4183	prev = *pe;
	4184	}
	4185
	4186	if (jprev != ARRAY_SIZE(*n)) {
	4187	mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
	4188	}
	4189	}
	4190	}
	4191
	4192	#endif