Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Virtual page mapping
	3	*
	4	* Copyright (c) 2003 Fabrice Bellard
	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
	18	*/
	19	#include "qemu/osdep.h"
	20	#include "qapi/error.h"
	21
	22	#include "qemu/cutils.h"
	23	#include "cpu.h"
	24	#include "exec/exec-all.h"
	25	#include "exec/target_page.h"
	26	#include "tcg.h"
	27	#include "hw/qdev-core.h"
	28	#include "hw/qdev-properties.h"
	29	#if !defined(CONFIG_USER_ONLY)
	30	#include "hw/boards.h"
	31	#include "hw/xen/xen.h"
	32	#endif
	33	#include "sysemu/kvm.h"
	34	#include "sysemu/sysemu.h"
	35	#include "qemu/timer.h"
	36	#include "qemu/config-file.h"
	37	#include "qemu/error-report.h"
	38	#if defined(CONFIG_USER_ONLY)
	39	#include "qemu.h"
	40	#else /* !CONFIG_USER_ONLY */
	41	#include "hw/hw.h"
	42	#include "exec/memory.h"
	43	#include "exec/ioport.h"
	44	#include "sysemu/dma.h"
	45	#include "sysemu/numa.h"
	46	#include "sysemu/hw_accel.h"
	47	#include "exec/address-spaces.h"
	48	#include "sysemu/xen-mapcache.h"
	49	#include "trace-root.h"
	50
	51	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	52	#include <linux/falloc.h>
	53	#endif
	54
	55	#endif
	56	#include "qemu/rcu_queue.h"
	57	#include "qemu/main-loop.h"
	58	#include "translate-all.h"
	59	#include "sysemu/replay.h"
	60
	61	#include "exec/memory-internal.h"
	62	#include "exec/ram_addr.h"
	63	#include "exec/log.h"
	64
	65	#include "migration/vmstate.h"
	66
	67	#include "qemu/range.h"
	68	#ifndef _WIN32
	69	#include "qemu/mmap-alloc.h"
	70	#endif
	71
	72	#include "monitor/monitor.h"
	73
	74	//#define DEBUG_SUBPAGE
	75
	76	#if !defined(CONFIG_USER_ONLY)
	77	/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
	78	* are protected by the ramlist lock.
	79	*/
	80	RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
	81
	82	static MemoryRegion *system_memory; /* default value of cpu->memory, see cpu_exec_initfn() */
	83	static MemoryRegion *system_io;
	84
	85	AddressSpace address_space_io;
	86	AddressSpace address_space_memory;
	87
	88	MemoryRegion io_mem_rom, io_mem_notdirty; /* both are excluded by memory_region_is_unassigned() */
	89	static MemoryRegion io_mem_unassigned; /* .mr of the section returned when flatview_do_translate() fails */
	90
	91	/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
	92	#define RAM_PREALLOC (1 << 0)
	93
	94	/* RAM is mmap-ed with MAP_SHARED */
	95	#define RAM_SHARED (1 << 1)
	96
	97	/* Only a portion of RAM (used_length) is actually used, and migrated.
	98	* This used_length size can change across reboots.
	99	*/
	100	#define RAM_RESIZEABLE (1 << 2)
	101
	102	#endif
	103
	104	#ifdef TARGET_PAGE_BITS_VARY
	105	int target_page_bits; /* 0 until a size is chosen, see set_preferred_target_page_bits() */
	106	bool target_page_bits_decided; /* set by finalize_target_page_bits(); the size cannot shrink afterwards */
	107	#endif
	108
	109	struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
	110	/* current CPU in the current thread. It is only valid inside
	111	cpu_exec() */
	112	__thread CPUState *current_cpu;
	113	/* 0 = Do not count executed instructions.
	114	1 = Precise instruction counting.
	115	2 = Adaptive rate instruction counting. */
	116	int use_icount;
	117
	118	uintptr_t qemu_host_page_size;
	119	intptr_t qemu_host_page_mask; /* NOTE(review): signed on purpose? presumably so the mask sign-extends - confirm */
	120
	121	bool set_preferred_target_page_bits(int bits)
	122	{
	123	/* The target page size is the lowest common denominator for all
	124	* the CPUs in the system, so we can only make it smaller, never
	125	* larger. And we can't make it smaller once we've committed to
	126	* a particular size.
	127	*/
	128	#ifdef TARGET_PAGE_BITS_VARY
	129	assert(bits >= TARGET_PAGE_BITS_MIN);
	130	if (target_page_bits == 0 \|\| target_page_bits > bits) {
	131	if (target_page_bits_decided) {
	132	return false;
	133	}
	134	target_page_bits = bits;
	135	}
	136	#endif
	137	return true;
	138	}
	139
	140	#if !defined(CONFIG_USER_ONLY)
	141
	142	static void finalize_target_page_bits(void)
	143	{
	144	#ifdef TARGET_PAGE_BITS_VARY
	145	if (target_page_bits == 0) {
	146	target_page_bits = TARGET_PAGE_BITS_MIN;
	147	}
	148	target_page_bits_decided = true;
	149	#endif
	150	}
	151
	152	typedef struct PhysPageEntry PhysPageEntry;
	153
	154	struct PhysPageEntry { /* one 32-bit slot of the physical page radix tree */
	155	/* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
	156	uint32_t skip : 6;
	157	/* index into phys_sections (!skip) or phys_map_nodes (skip) */
	158	uint32_t ptr : 26;
	159	};
	160
	161	#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6) /* all-ones value of the 26-bit ptr field: no node allocated */
	162
	163	/* Size of the L2 (and L3, etc) page tables. */
	164	#define ADDR_SPACE_BITS 64
	165
	166	#define P_L2_BITS 9 /* address bits consumed per tree level */
	167	#define P_L2_SIZE (1 << P_L2_BITS) /* entries per Node */
	168
	169	#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1) /* levels needed to cover the page index bits, rounded up */
	170
	171	typedef PhysPageEntry Node[P_L2_SIZE];
	172
	173	typedef struct PhysPageMap { /* backing storage for the radix tree nodes and the section table */
	174	struct rcu_head rcu;
	175
	176	unsigned sections_nb; /* presumably entries used in sections[] - mirrors nodes_nb; confirm */
	177	unsigned sections_nb_alloc; /* presumably allocated capacity of sections[] - confirm */
	178	unsigned nodes_nb; /* next free index in nodes[], bumped by phys_map_node_alloc() */
	179	unsigned nodes_nb_alloc; /* allocated capacity of nodes[], grown by phys_map_node_reserve() */
	180	Node *nodes;
	181	MemoryRegionSection *sections;
	182	} PhysPageMap;
	183
	184	struct AddressSpaceDispatch {
	185	MemoryRegionSection *mru_section; /* last lookup result, read/written atomically in address_space_lookup_region() */
	186	/* This is a multi-level map on the physical address space.
	187	* The bottom level has pointers to MemoryRegionSections.
	188	*/
	189	PhysPageEntry phys_map; /* root entry of the tree */
	190	PhysPageMap map;
	191	};
	192
	193	#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK) /* offset of addr within its target page */
	194	typedef struct subpage_t {
	195	MemoryRegion iomem; /* recovered from section->mr via container_of() in address_space_lookup_region() */
	196	FlatView *fv;
	197	hwaddr base;
	198	uint16_t sub_section[]; /* indexed by SUBPAGE_IDX(addr); each value is an index into map.sections */
	199	} subpage_t;
	200
	201	#define PHYS_SECTION_UNASSIGNED 0 /* index in map.sections returned for unmapped addresses */
	202	#define PHYS_SECTION_NOTDIRTY 1
	203	#define PHYS_SECTION_ROM 2
	204	#define PHYS_SECTION_WATCH 3
	205
	206	static void io_mem_init(void);
	207	static void memory_map_init(void);
	208	static void tcg_commit(MemoryListener *listener); /* installed as the .commit hook in cpu_address_space_init() */
	209
	210	static MemoryRegion io_mem_watch;
	211
	212	/**
	213	* CPUAddressSpace: all the information a CPU needs about an AddressSpace
	214	* @cpu: the CPU whose AddressSpace this is
	215	* @as: the AddressSpace itself
	216	* @memory_dispatch: its dispatch pointer (cached, RCU protected)
	217	* @tcg_as_listener: listener for tracking changes to the AddressSpace
	218	*/
	219	struct CPUAddressSpace {
	220	CPUState *cpu;
	221	AddressSpace *as;
	222	struct AddressSpaceDispatch *memory_dispatch; /* read with atomic_rcu_read() in address_space_translate_for_iotlb() */
	223	MemoryListener tcg_as_listener; /* registered only when tcg_enabled() */
	224	};
	225
	226	struct DirtyBitmapSnapshot { /* presumably a copy of dirty bits covering [start, end) - confirm against its users */
	227	ram_addr_t start;
	228	ram_addr_t end;
	229	unsigned long dirty[];
	230	};
	231
	232	#endif
	233
	234	#if !defined(CONFIG_USER_ONLY)
	235
	236	static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
	237	{
	238	static unsigned alloc_hint = 16;
	239	if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
	240	map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
	241	map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
	242	map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
	243	alloc_hint = map->nodes_nb_alloc;
	244	}
	245	}
	246
	247	static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
	248	{
	249	unsigned i;
	250	uint32_t ret;
	251	PhysPageEntry e;
	252	PhysPageEntry *p;
	253
	254	ret = map->nodes_nb++;
	255	p = map->nodes[ret];
	256	assert(ret != PHYS_MAP_NODE_NIL);
	257	assert(ret != map->nodes_nb_alloc);
	258
	259	e.skip = leaf ? 0 : 1;
	260	e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
	261	for (i = 0; i < P_L2_SIZE; ++i) {
	262	memcpy(&p[i], &e, sizeof(e));
	263	}
	264	return ret;
	265	}
	266
	267	static void phys_page_set_level(PhysPageMap map, PhysPageEntry lp,
	268	hwaddr index, hwaddr nb, uint16_t leaf,
	269	int level)
	270	{
	271	PhysPageEntry *p;
	272	hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
	273
	274	if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
	275	lp->ptr = phys_map_node_alloc(map, level == 0);
	276	}
	277	p = map->nodes[lp->ptr];
	278	lp = &p[(index >> (level P_L2_BITS)) & (P_L2_SIZE - 1)];
	279
	280	while (*nb && lp < &p[P_L2_SIZE]) {
	281	if ((index & (step - 1)) == 0 && nb >= step) {
	282	lp->skip = 0;
	283	lp->ptr = leaf;
	284	*index += step;
	285	*nb -= step;
	286	} else {
	287	phys_page_set_level(map, lp, index, nb, leaf, level - 1);
	288	}
	289	++lp;
	290	}
	291	}
	292
	293	static void phys_page_set(AddressSpaceDispatch *d,
	294	hwaddr index, hwaddr nb,
	295	uint16_t leaf)
	296	{
	297	/* Wildly overreserve - it doesn't matter much. */
	298	phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
	299
	300	phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
	301	}
	302
	303	/* Compact a non leaf page entry. Simply detect that the entry has a single child,
	304	* and update our entry so we can skip it and go directly to the destination.
	305	*/
	306	static void phys_page_compact(PhysPageEntry lp, Node nodes)
	307	{
	308	unsigned valid_ptr = P_L2_SIZE;
	309	int valid = 0;
	310	PhysPageEntry *p;
	311	int i;
	312
	313	if (lp->ptr == PHYS_MAP_NODE_NIL) {
	314	return;
	315	}
	316
	317	p = nodes[lp->ptr];
	318	for (i = 0; i < P_L2_SIZE; i++) {
	319	if (p[i].ptr == PHYS_MAP_NODE_NIL) {
	320	continue;
	321	}
	322
	323	valid_ptr = i;
	324	valid++;
	325	if (p[i].skip) {
	326	phys_page_compact(&p[i], nodes);
	327	}
	328	}
	329
	330	/* We can only compress if there's only one child. */
	331	if (valid != 1) {
	332	return;
	333	}
	334
	335	assert(valid_ptr < P_L2_SIZE);
	336
	337	/* Don't compress if it won't fit in the # of bits we have. */
	338	if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
	339	return;
	340	}
	341
	342	lp->ptr = p[valid_ptr].ptr;
	343	if (!p[valid_ptr].skip) {
	344	/* If our only child is a leaf, make this a leaf. */
	345	/* By design, we should have made this node a leaf to begin with so we
	346	* should never reach here.
	347	* But since it's so simple to handle this, let's do it just in case we
	348	* change this rule.
	349	*/
	350	lp->skip = 0;
	351	} else {
	352	lp->skip += p[valid_ptr].skip;
	353	}
	354	}
	355
	356	void address_space_dispatch_compact(AddressSpaceDispatch *d)
	357	{
	358	if (d->phys_map.skip) {
	359	phys_page_compact(&d->phys_map, d->map.nodes);
	360	}
	361	}
	362
	363	static inline bool section_covers_addr(const MemoryRegionSection *section,
	364	hwaddr addr)
	365	{
	366	/* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
	367	* the section must cover the entire address space.
	368	*/
	369	return int128_gethi(section->size) \|\|
	370	range_covers_byte(section->offset_within_address_space,
	371	int128_getlo(section->size), addr);
	372	}
	373
	374	static MemoryRegionSection phys_page_find(AddressSpaceDispatch d, hwaddr addr)
	375	{
	376	PhysPageEntry lp = d->phys_map, *p;
	377	Node *nodes = d->map.nodes;
	378	MemoryRegionSection *sections = d->map.sections;
	379	hwaddr index = addr >> TARGET_PAGE_BITS;
	380	int i;
	381
	382	for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
	383	if (lp.ptr == PHYS_MAP_NODE_NIL) {
	384	return &sections[PHYS_SECTION_UNASSIGNED];
	385	}
	386	p = nodes[lp.ptr];
	387	lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
	388	}
	389
	390	if (section_covers_addr(&sections[lp.ptr], addr)) {
	391	return &sections[lp.ptr];
	392	} else {
	393	return &sections[PHYS_SECTION_UNASSIGNED];
	394	}
	395	}
	396
	397	bool memory_region_is_unassigned(MemoryRegion *mr)
	398	{
	399	return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
	400	&& mr != &io_mem_watch;
	401	}
	402
	403	/* Called from RCU critical section */
	404	static MemoryRegionSection address_space_lookup_region(AddressSpaceDispatch d,
	405	hwaddr addr,
	406	bool resolve_subpage)
	407	{
	408	MemoryRegionSection *section = atomic_read(&d->mru_section);
	409	subpage_t *subpage;
	410
	411	if (!section \|\| section == &d->map.sections[PHYS_SECTION_UNASSIGNED] \|\|
	412	!section_covers_addr(section, addr)) {
	413	section = phys_page_find(d, addr);
	414	atomic_set(&d->mru_section, section);
	415	}
	416	if (resolve_subpage && section->mr->subpage) {
	417	subpage = container_of(section->mr, subpage_t, iomem);
	418	section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
	419	}
	420	return section;
	421	}
	422
	423	/* Called from RCU critical section */
	424	static MemoryRegionSection *
	425	address_space_translate_internal(AddressSpaceDispatch d, hwaddr addr, hwaddr xlat,
	426	hwaddr *plen, bool resolve_subpage)
	427	{
	428	MemoryRegionSection *section;
	429	MemoryRegion *mr;
	430	Int128 diff;
	431
	432	section = address_space_lookup_region(d, addr, resolve_subpage);
	433	/* Compute offset within MemoryRegionSection */
	434	addr -= section->offset_within_address_space;
	435
	436	/* Compute offset within MemoryRegion */
	437	*xlat = addr + section->offset_within_region;
	438
	439	mr = section->mr;
	440
	441	/* MMIO registers can be expected to perform full-width accesses based only
	442	* on their address, without considering adjacent registers that could
	443	* decode to completely different MemoryRegions. When such registers
	444	* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
	445	* regions overlap wildly. For this reason we cannot clamp the accesses
	446	* here.
	447	*
	448	* If the length is small (as is the case for address_space_ldl/stl),
	449	* everything works fine. If the incoming length is large, however,
	450	* the caller really has to do the clamping through memory_access_size.
	451	*/
	452	if (memory_region_is_ram(mr)) {
	453	diff = int128_sub(section->size, int128_make64(addr));
	454	plen = int128_get64(int128_min(diff, int128_make64(plen)));
	455	}
	456	return section;
	457	}
	458
	459	/**
	460	* flatview_do_translate - translate an address in FlatView
	461	*
	462	* @fv: the flat view that we want to translate on
	463	* @addr: the address to be translated in above address space
	464	* @xlat: the translated address offset within memory region. It
	465	* cannot be @NULL.
	466	* @plen_out: valid read/write length of the translated address. It
	467	* can be @NULL when we don't care about it.
	468	* @page_mask_out: page mask for the translated address. This
	469	* should only be meaningful for IOMMU translated
	470	* addresses, since there may be huge pages that this bit
	471	* would tell. It can be @NULL if we don't care about it.
	472	* @is_write: whether the translation operation is for write
	473	* @is_mmio: whether this can be MMIO, set true if it can
	474	*
	475	* This function is called from RCU critical section
	476	*/
	477	static MemoryRegionSection flatview_do_translate(FlatView *fv,
	478	hwaddr addr,
	479	hwaddr *xlat,
	480	hwaddr *plen_out,
	481	hwaddr *page_mask_out,
	482	bool is_write,
	483	bool is_mmio,
	484	AddressSpace **target_as)
	485	{
	486	IOMMUTLBEntry iotlb;
	487	MemoryRegionSection *section;
	488	IOMMUMemoryRegion *iommu_mr;
	489	IOMMUMemoryRegionClass *imrc;
	490	hwaddr page_mask = (hwaddr)(-1);
	491	hwaddr plen = (hwaddr)(-1);
	492
	493	if (plen_out) {
	494	plen = *plen_out;
	495	}
	496
	497	for (;;) {
	498	section = address_space_translate_internal(
	499	flatview_to_dispatch(fv), addr, &addr,
	500	&plen, is_mmio);
	501
	502	iommu_mr = memory_region_get_iommu(section->mr);
	503	if (!iommu_mr) {
	504	break;
	505	}
	506	imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
	507
	508	iotlb = imrc->translate(iommu_mr, addr, is_write ?
	509	IOMMU_WO : IOMMU_RO);
	510	addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
	511	\| (addr & iotlb.addr_mask));
	512	page_mask &= iotlb.addr_mask;
	513	plen = MIN(plen, (addr \| iotlb.addr_mask) - addr + 1);
	514	if (!(iotlb.perm & (1 << is_write))) {
	515	goto translate_fail;
	516	}
	517
	518	fv = address_space_to_flatview(iotlb.target_as);
	519	*target_as = iotlb.target_as;
	520	}
	521
	522	*xlat = addr;
	523
	524	if (page_mask == (hwaddr)(-1)) {
	525	/* Not behind an IOMMU, use default page size. */
	526	page_mask = ~TARGET_PAGE_MASK;
	527	}
	528
	529	if (page_mask_out) {
	530	*page_mask_out = page_mask;
	531	}
	532
	533	if (plen_out) {
	534	*plen_out = plen;
	535	}
	536
	537	return *section;
	538
	539	translate_fail:
	540	return (MemoryRegionSection) { .mr = &io_mem_unassigned };
	541	}
	542
	543	/* Called from RCU critical section */
	544	IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
	545	bool is_write)
	546	{
	547	MemoryRegionSection section;
	548	hwaddr xlat, page_mask;
	549
	550	/*
	551	* This can never be MMIO, and we don't really care about plen,
	552	* but page mask.
	553	*/
	554	section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
	555	NULL, &page_mask, is_write, false, &as);
	556
	557	/* Illegal translation */
	558	if (section.mr == &io_mem_unassigned) {
	559	goto iotlb_fail;
	560	}
	561
	562	/* Convert memory region offset into address space offset */
	563	xlat += section.offset_within_address_space -
	564	section.offset_within_region;
	565
	566	return (IOMMUTLBEntry) {
	567	.target_as = as,
	568	.iova = addr & ~page_mask,
	569	.translated_addr = xlat & ~page_mask,
	570	.addr_mask = page_mask,
	571	/* IOTLBs are for DMAs, and DMA only allows on RAMs. */
	572	.perm = IOMMU_RW,
	573	};
	574
	575	iotlb_fail:
	576	return (IOMMUTLBEntry) {0};
	577	}
	578
	579	/* Called from RCU critical section */
	580	MemoryRegion flatview_translate(FlatView fv, hwaddr addr, hwaddr *xlat,
	581	hwaddr *plen, bool is_write)
	582	{
	583	MemoryRegion *mr;
	584	MemoryRegionSection section;
	585	AddressSpace *as = NULL;
	586
	587	/* This can be MMIO, so setup MMIO bit. */
	588	section = flatview_do_translate(fv, addr, xlat, plen, NULL,
	589	is_write, true, &as);
	590	mr = section.mr;
	591
	592	if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
	593	hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
	594	plen = MIN(page, plen);
	595	}
	596
	597	return mr;
	598	}
	599
	600	/* Called from RCU critical section */
	601	MemoryRegionSection *
	602	address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
	603	hwaddr xlat, hwaddr plen)
	604	{
	605	MemoryRegionSection *section;
	606	AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
	607
	608	section = address_space_translate_internal(d, addr, xlat, plen, false);
	609
	610	assert(!memory_region_is_iommu(section->mr));
	611	return section;
	612	}
	613	#endif
	614
	615	#if !defined(CONFIG_USER_ONLY)
	616
	617	static int cpu_common_post_load(void *opaque, int version_id)
	618	{
	619	CPUState *cpu = opaque;
	620
	621	/* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
	622	version_id is increased. */
	623	cpu->interrupt_request &= ~0x01;
	624	tlb_flush(cpu);
	625
	626	/* loadvm has just updated the content of RAM, bypassing the
	627	* usual mechanisms that ensure we flush TBs for writes to
	628	* memory we've translated code from. So we must flush all TBs,
	629	* which will now be stale.
	630	*/
	631	tb_flush(cpu);
	632
	633	return 0;
	634	}
	635
	636	static int cpu_common_pre_load(void *opaque)
	637	{
	638	CPUState *cpu = opaque;
	639
	640	cpu->exception_index = -1;
	641
	642	return 0;
	643	}
	644
	645	static bool cpu_common_exception_index_needed(void *opaque)
	646	{
	647	CPUState *cpu = opaque;
	648
	649	return tcg_enabled() && cpu->exception_index != -1;
	650	}
	651
	652	static const VMStateDescription vmstate_cpu_common_exception_index = { /* listed in vmstate_cpu_common.subsections */
	653	.name = "cpu_common/exception_index",
	654	.version_id = 1,
	655	.minimum_version_id = 1,
	656	.needed = cpu_common_exception_index_needed, /* presumably gates whether this subsection is sent - confirm */
	657	.fields = (VMStateField[]) {
	658	VMSTATE_INT32(exception_index, CPUState),
	659	VMSTATE_END_OF_LIST()
	660	}
	661	};
	662
	663	static bool cpu_common_crash_occurred_needed(void *opaque)
	664	{
	665	CPUState *cpu = opaque;
	666
	667	return cpu->crash_occurred;
	668	}
	669
	670	static const VMStateDescription vmstate_cpu_common_crash_occurred = { /* listed in vmstate_cpu_common.subsections */
	671	.name = "cpu_common/crash_occurred",
	672	.version_id = 1,
	673	.minimum_version_id = 1,
	674	.needed = cpu_common_crash_occurred_needed, /* presumably gates whether this subsection is sent - confirm */
	675	.fields = (VMStateField[]) {
	676	VMSTATE_BOOL(crash_occurred, CPUState),
	677	VMSTATE_END_OF_LIST()
	678	}
	679	};
	680
	681	const VMStateDescription vmstate_cpu_common = { /* registered per CPU in cpu_exec_realizefn() when the device has no vmsd of its own */
	682	.name = "cpu_common",
	683	.version_id = 1,
	684	.minimum_version_id = 1,
	685	.pre_load = cpu_common_pre_load, /* resets exception_index to -1 */
	686	.post_load = cpu_common_post_load, /* flushes the TLB and translated code */
	687	.fields = (VMStateField[]) {
	688	VMSTATE_UINT32(halted, CPUState),
	689	VMSTATE_UINT32(interrupt_request, CPUState),
	690	VMSTATE_END_OF_LIST()
	691	},
	692	.subsections = (const VMStateDescription*[]) {
	693	&vmstate_cpu_common_exception_index,
	694	&vmstate_cpu_common_crash_occurred,
	695	NULL
	696	}
	697	};
	698
	699	#endif
	700
	701	CPUState *qemu_get_cpu(int index)
	702	{
	703	CPUState *cpu;
	704
	705	CPU_FOREACH(cpu) {
	706	if (cpu->cpu_index == index) {
	707	return cpu;
	708	}
	709	}
	710
	711	return NULL;
	712	}
	713
	714	#if !defined(CONFIG_USER_ONLY)
	715	void cpu_address_space_init(CPUState *cpu, int asidx,
	716	const char prefix, MemoryRegion mr)
	717	{
	718	CPUAddressSpace *newas;
	719	AddressSpace *as = g_new0(AddressSpace, 1);
	720	char *as_name;
	721
	722	assert(mr);
	723	as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
	724	address_space_init(as, mr, as_name);
	725	g_free(as_name);
	726
	727	/* Target code should have set num_ases before calling us */
	728	assert(asidx < cpu->num_ases);
	729
	730	if (asidx == 0) {
	731	/* address space 0 gets the convenience alias */
	732	cpu->as = as;
	733	}
	734
	735	/* KVM cannot currently support multiple address spaces. */
	736	assert(asidx == 0 \|\| !kvm_enabled());
	737
	738	if (!cpu->cpu_ases) {
	739	cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
	740	}
	741
	742	newas = &cpu->cpu_ases[asidx];
	743	newas->cpu = cpu;
	744	newas->as = as;
	745	if (tcg_enabled()) {
	746	newas->tcg_as_listener.commit = tcg_commit;
	747	memory_listener_register(&newas->tcg_as_listener, as);
	748	}
	749	}
	750
	751	AddressSpace cpu_get_address_space(CPUState cpu, int asidx)
	752	{
	753	/* Return the AddressSpace corresponding to the specified index */
	754	return cpu->cpu_ases[asidx].as;
	755	}
	756	#endif
	757
	758	void cpu_exec_unrealizefn(CPUState *cpu)
	759	{
	760	CPUClass *cc = CPU_GET_CLASS(cpu);
	761
	762	cpu_list_remove(cpu);
	763
	764	if (cc->vmsd != NULL) {
	765	vmstate_unregister(NULL, cc->vmsd, cpu);
	766	}
	767	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	768	vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
	769	}
	770	}
	771
	772	Property cpu_common_props[] = { /* "memory" link exists only in non-user-mode builds */
	773	#ifndef CONFIG_USER_ONLY
	774	/* Create a memory property for softmmu CPU object,
	775	* so users can wire up its memory. (This can't go in qom/cpu.c
	776	* because that file is compiled only once for both user-mode
	777	* and system builds.) The default if no link is set up is to use
	778	* the system address space.
	779	*/
	780	DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
	781	MemoryRegion *),
	782	#endif
	783	DEFINE_PROP_END_OF_LIST(),
	784	};
	785
	786	void cpu_exec_initfn(CPUState *cpu)
	787	{
	788	cpu->as = NULL;
	789	cpu->num_ases = 0;
	790
	791	#ifndef CONFIG_USER_ONLY
	792	cpu->thread_id = qemu_get_thread_id();
	793	cpu->memory = system_memory;
	794	object_ref(OBJECT(cpu->memory));
	795	#endif
	796	}
	797
	798	void cpu_exec_realizefn(CPUState cpu, Error *errp)
	799	{
	800	CPUClass *cc = CPU_GET_CLASS(cpu);
	801	static bool tcg_target_initialized;
	802
	803	cpu_list_add(cpu);
	804
	805	if (tcg_enabled() && !tcg_target_initialized) {
	806	tcg_target_initialized = true;
	807	cc->tcg_initialize();
	808	}
	809
	810	#ifndef CONFIG_USER_ONLY
	811	if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
	812	vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
	813	}
	814	if (cc->vmsd != NULL) {
	815	vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
	816	}
	817	#endif
	818	}
	819
	820	#if defined(CONFIG_USER_ONLY)
	821	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	822	{
	823	mmap_lock();
	824	tb_lock();
	825	tb_invalidate_phys_page_range(pc, pc + 1, 0);
	826	tb_unlock();
	827	mmap_unlock();
	828	}
	829	#else
	830	static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
	831	{
	832	MemTxAttrs attrs;
	833	hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
	834	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	835	if (phys != -1) {
	836	/* Locks grabbed by tb_invalidate_phys_addr */
	837	tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
	838	phys \| (pc & ~TARGET_PAGE_MASK));
	839	}
	840	}
	841	#endif
	842
	843	#if defined(CONFIG_USER_ONLY)
	844	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	845
	846	{
	847	}
	848
	849	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	850	int flags)
	851	{
	852	return -ENOSYS;
	853	}
	854
	855	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	856	{
	857	}
	858
	859	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	860	int flags, CPUWatchpoint **watchpoint)
	861	{
	862	return -ENOSYS;
	863	}
	864	#else
	865	/* Add a watchpoint. */
	866	int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
	867	int flags, CPUWatchpoint **watchpoint)
	868	{
	869	CPUWatchpoint *wp;
	870
	871	/* forbid ranges which are empty or run off the end of the address space */
	872	if (len == 0 \|\| (addr + len - 1) < addr) {
	873	error_report("tried to set invalid watchpoint at %"
	874	VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
	875	return -EINVAL;
	876	}
	877	wp = g_malloc(sizeof(*wp));
	878
	879	wp->vaddr = addr;
	880	wp->len = len;
	881	wp->flags = flags;
	882
	883	/* keep all GDB-injected watchpoints in front */
	884	if (flags & BP_GDB) {
	885	QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
	886	} else {
	887	QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
	888	}
	889
	890	tlb_flush_page(cpu, addr);
	891
	892	if (watchpoint)
	893	*watchpoint = wp;
	894	return 0;
	895	}
	896
	897	/* Remove a specific watchpoint. */
	898	int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
	899	int flags)
	900	{
	901	CPUWatchpoint *wp;
	902
	903	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	904	if (addr == wp->vaddr && len == wp->len
	905	&& flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
	906	cpu_watchpoint_remove_by_ref(cpu, wp);
	907	return 0;
	908	}
	909	}
	910	return -ENOENT;
	911	}
	912
	913	/* Remove a specific watchpoint by reference. */
	914	void cpu_watchpoint_remove_by_ref(CPUState cpu, CPUWatchpoint watchpoint)
	915	{
	916	QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
	917
	918	tlb_flush_page(cpu, watchpoint->vaddr);
	919
	920	g_free(watchpoint);
	921	}
	922
	923	/* Remove all matching watchpoints. */
	924	void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
	925	{
	926	CPUWatchpoint wp, next;
	927
	928	QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
	929	if (wp->flags & mask) {
	930	cpu_watchpoint_remove_by_ref(cpu, wp);
	931	}
	932	}
	933	}
	934
	935	/* Return true if this watchpoint address matches the specified
	936	* access (ie the address range covered by the watchpoint overlaps
	937	* partially or completely with the address range covered by the
	938	* access).
	939	*/
	940	static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
	941	vaddr addr,
	942	vaddr len)
	943	{
	944	/* We know the lengths are non-zero, but a little caution is
	945	* required to avoid errors in the case where the range ends
	946	* exactly at the top of the address space and so addr + len
	947	* wraps round to zero.
	948	*/
	949	vaddr wpend = wp->vaddr + wp->len - 1;
	950	vaddr addrend = addr + len - 1;
	951
	952	return !(addr > wpend \|\| wp->vaddr > addrend);
	953	}
	954
	955	#endif
	956
	957	/* Add a breakpoint. */
	958	int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
	959	CPUBreakpoint **breakpoint)
	960	{
	961	CPUBreakpoint *bp;
	962
	963	bp = g_malloc(sizeof(*bp));
	964
	965	bp->pc = pc;
	966	bp->flags = flags;
	967
	968	/* keep all GDB-injected breakpoints in front */
	969	if (flags & BP_GDB) {
	970	QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
	971	} else {
	972	QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
	973	}
	974
	975	breakpoint_invalidate(cpu, pc);
	976
	977	if (breakpoint) {
	978	*breakpoint = bp;
	979	}
	980	return 0;
	981	}
	982
	983	/* Remove a specific breakpoint. */
	984	int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
	985	{
	986	CPUBreakpoint *bp;
	987
	988	QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
	989	if (bp->pc == pc && bp->flags == flags) {
	990	cpu_breakpoint_remove_by_ref(cpu, bp);
	991	return 0;
	992	}
	993	}
	994	return -ENOENT;
	995	}
	996
	997	/* Remove a specific breakpoint by reference. */
	998	void cpu_breakpoint_remove_by_ref(CPUState cpu, CPUBreakpoint breakpoint)
	999	{
	1000	QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
	1001
	1002	breakpoint_invalidate(cpu, breakpoint->pc);
	1003
	1004	g_free(breakpoint);
	1005	}
	1006
	1007	/* Remove all matching breakpoints. */
	1008	void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
	1009	{
	1010	CPUBreakpoint bp, next;
	1011
	1012	QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
	1013	if (bp->flags & mask) {
	1014	cpu_breakpoint_remove_by_ref(cpu, bp);
	1015	}
	1016	}
	1017	}
	1018
	1019	/* enable or disable single step mode. EXCP_DEBUG is returned by the
	1020	CPU loop after each instruction */
	1021	void cpu_single_step(CPUState *cpu, int enabled)
	1022	{
	1023	if (cpu->singlestep_enabled != enabled) {
	1024	cpu->singlestep_enabled = enabled;
	1025	if (kvm_enabled()) {
	1026	kvm_update_guest_debug(cpu, 0);
	1027	} else {
	1028	/* must flush all the translated code to avoid inconsistencies */
	1029	/* XXX: only flush what is necessary */
	1030	tb_flush(cpu);
	1031	}
	1032	}
	1033	}
	1034
	1035	void cpu_abort(CPUState cpu, const char fmt, ...)
	1036	{
	1037	va_list ap;
	1038	va_list ap2;
	1039
	1040	va_start(ap, fmt);
	1041	va_copy(ap2, ap);
	1042	fprintf(stderr, "qemu: fatal: ");
	1043	vfprintf(stderr, fmt, ap);
	1044	fprintf(stderr, "\n");
	1045	cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1046	if (qemu_log_separate()) {
	1047	qemu_log_lock();
	1048	qemu_log("qemu: fatal: ");
	1049	qemu_log_vprintf(fmt, ap2);
	1050	qemu_log("\n");
	1051	log_cpu_state(cpu, CPU_DUMP_FPU \| CPU_DUMP_CCOP);
	1052	qemu_log_flush();
	1053	qemu_log_unlock();
	1054	qemu_log_close();
	1055	}
	1056	va_end(ap2);
	1057	va_end(ap);
	1058	replay_finish();
	1059	#if defined(CONFIG_USER_ONLY)
	1060	{
	1061	struct sigaction act;
	1062	sigfillset(&act.sa_mask);
	1063	act.sa_handler = SIG_DFL;
	1064	sigaction(SIGABRT, &act, NULL);
	1065	}
	1066	#endif
	1067	abort();
	1068	}
	1069
	1070	#if !defined(CONFIG_USER_ONLY)
	1071	/* Called from RCU critical section */
	1072	static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
	1073	{
	1074	RAMBlock *block;
	1075
	1076	block = atomic_rcu_read(&ram_list.mru_block);
	1077	if (block && addr - block->offset < block->max_length) {
	1078	return block;
	1079	}
	1080	RAMBLOCK_FOREACH(block) {
	1081	if (addr - block->offset < block->max_length) {
	1082	goto found;
	1083	}
	1084	}
	1085
	1086	fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
	1087	abort();
	1088
	1089	found:
	1090	/* It is safe to write mru_block outside the iothread lock. This
	1091	* is what happens:
	1092	*
	1093	* mru_block = xxx
	1094	* rcu_read_unlock()
	1095	* xxx removed from list
	1096	* rcu_read_lock()
	1097	* read mru_block
	1098	* mru_block = NULL;
	1099	* call_rcu(reclaim_ramblock, xxx);
	1100	* rcu_read_unlock()
	1101	*
	1102	* atomic_rcu_set is not needed here. The block was already published
	1103	* when it was placed into the list. Here we're just making an extra
	1104	* copy of the pointer.
	1105	*/
	1106	ram_list.mru_block = block;
	1107	return block;
	1108	}
	1109
	1110	static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
	1111	{
	1112	CPUState *cpu;
	1113	ram_addr_t start1;
	1114	RAMBlock *block;
	1115	ram_addr_t end;
	1116
	1117	end = TARGET_PAGE_ALIGN(start + length);
	1118	start &= TARGET_PAGE_MASK;
	1119
	1120	rcu_read_lock();
	1121	block = qemu_get_ram_block(start);
	1122	assert(block == qemu_get_ram_block(end - 1));
	1123	start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
	1124	CPU_FOREACH(cpu) {
	1125	tlb_reset_dirty(cpu, start1, length);
	1126	}
	1127	rcu_read_unlock();
	1128	}
	1129
	1130	/* Note: start and end must be within the same ram block. */
	1131	bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
	1132	ram_addr_t length,
	1133	unsigned client)
	1134	{
	1135	DirtyMemoryBlocks *blocks;
	1136	unsigned long end, page;
	1137	bool dirty = false;
	1138
	1139	if (length == 0) {
	1140	return false;
	1141	}
	1142
	1143	end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
	1144	page = start >> TARGET_PAGE_BITS;
	1145
	1146	rcu_read_lock();
	1147
	1148	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1149
	1150	while (page < end) {
	1151	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1152	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1153	unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
	1154
	1155	dirty \|= bitmap_test_and_clear_atomic(blocks->blocks[idx],
	1156	offset, num);
	1157	page += num;
	1158	}
	1159
	1160	rcu_read_unlock();
	1161
	1162	if (dirty && tcg_enabled()) {
	1163	tlb_reset_dirty_range_all(start, length);
	1164	}
	1165
	1166	return dirty;
	1167	}
	1168
	1169	DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
	1170	(ram_addr_t start, ram_addr_t length, unsigned client)
	1171	{
	1172	DirtyMemoryBlocks *blocks;
	1173	unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
	1174	ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
	1175	ram_addr_t last = QEMU_ALIGN_UP(start + length, align);
	1176	DirtyBitmapSnapshot *snap;
	1177	unsigned long page, end, dest;
	1178
	1179	snap = g_malloc0(sizeof(*snap) +
	1180	((last - first) >> (TARGET_PAGE_BITS + 3)));
	1181	snap->start = first;
	1182	snap->end = last;
	1183
	1184	page = first >> TARGET_PAGE_BITS;
	1185	end = last >> TARGET_PAGE_BITS;
	1186	dest = 0;
	1187
	1188	rcu_read_lock();
	1189
	1190	blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
	1191
	1192	while (page < end) {
	1193	unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
	1194	unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
	1195	unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
	1196
	1197	assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
	1198	assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));
	1199	offset >>= BITS_PER_LEVEL;
	1200
	1201	bitmap_copy_and_clear_atomic(snap->dirty + dest,
	1202	blocks->blocks[idx] + offset,
	1203	num);
	1204	page += num;
	1205	dest += num >> BITS_PER_LEVEL;
	1206	}
	1207
	1208	rcu_read_unlock();
	1209
	1210	if (tcg_enabled()) {
	1211	tlb_reset_dirty_range_all(start, length);
	1212	}
	1213
	1214	return snap;
	1215	}
	1216
	1217	bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
	1218	ram_addr_t start,
	1219	ram_addr_t length)
	1220	{
	1221	unsigned long page, end;
	1222
	1223	assert(start >= snap->start);
	1224	assert(start + length <= snap->end);
	1225
	1226	end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
	1227	page = (start - snap->start) >> TARGET_PAGE_BITS;
	1228
	1229	while (page < end) {
	1230	if (test_bit(page, snap->dirty)) {
	1231	return true;
	1232	}
	1233	page++;
	1234	}
	1235	return false;
	1236	}
	1237
	1238	/* Called from RCU critical section */
	1239	hwaddr memory_region_section_get_iotlb(CPUState *cpu,
	1240	MemoryRegionSection *section,
	1241	target_ulong vaddr,
	1242	hwaddr paddr, hwaddr xlat,
	1243	int prot,
	1244	target_ulong *address)
	1245	{
	1246	hwaddr iotlb;
	1247	CPUWatchpoint *wp;
	1248
	1249	if (memory_region_is_ram(section->mr)) {
	1250	/* Normal RAM. */
	1251	iotlb = memory_region_get_ram_addr(section->mr) + xlat;
	1252	if (!section->readonly) {
	1253	iotlb \|= PHYS_SECTION_NOTDIRTY;
	1254	} else {
	1255	iotlb \|= PHYS_SECTION_ROM;
	1256	}
	1257	} else {
	1258	AddressSpaceDispatch *d;
	1259
	1260	d = flatview_to_dispatch(section->fv);
	1261	iotlb = section - d->map.sections;
	1262	iotlb += xlat;
	1263	}
	1264
	1265	/* Make accesses to pages with watchpoints go via the
	1266	watchpoint trap routines. */
	1267	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	1268	if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
	1269	/* Avoid trapping reads of pages with a write breakpoint. */
	1270	if ((prot & PAGE_WRITE) \|\| (wp->flags & BP_MEM_READ)) {
	1271	iotlb = PHYS_SECTION_WATCH + paddr;
	1272	*address \|= TLB_MMIO;
	1273	break;
	1274	}
	1275	}
	1276	}
	1277
	1278	return iotlb;
	1279	}
	1280	#endif /* !defined(CONFIG_USER_ONLY) */
	1281
	1282	#if !defined(CONFIG_USER_ONLY)
	1283
	1284	static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
	1285	uint16_t section);
	1286	static subpage_t subpage_init(FlatView fv, hwaddr base);
	1287
	1288	static void (phys_mem_alloc)(size_t size, uint64_t *align) =
	1289	qemu_anon_ram_alloc;
	1290
	1291	/*
	1292	* Set a custom physical guest memory alloator.
	1293	* Accelerators with unusual needs may need this. Hopefully, we can
	1294	* get rid of it eventually.
	1295	*/
	1296	void phys_mem_set_alloc(void (alloc)(size_t, uint64_t *align))
	1297	{
	1298	phys_mem_alloc = alloc;
	1299	}
	1300
	1301	static uint16_t phys_section_add(PhysPageMap *map,
	1302	MemoryRegionSection *section)
	1303	{
	1304	/* The physical section number is ORed with a page-aligned
	1305	* pointer to produce the iotlb entries. Thus it should
	1306	* never overflow into the page-aligned value.
	1307	*/
	1308	assert(map->sections_nb < TARGET_PAGE_SIZE);
	1309
	1310	if (map->sections_nb == map->sections_nb_alloc) {
	1311	map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
	1312	map->sections = g_renew(MemoryRegionSection, map->sections,
	1313	map->sections_nb_alloc);
	1314	}
	1315	map->sections[map->sections_nb] = *section;
	1316	memory_region_ref(section->mr);
	1317	return map->sections_nb++;
	1318	}
	1319
	1320	static void phys_section_destroy(MemoryRegion *mr)
	1321	{
	1322	bool have_sub_page = mr->subpage;
	1323
	1324	memory_region_unref(mr);
	1325
	1326	if (have_sub_page) {
	1327	subpage_t *subpage = container_of(mr, subpage_t, iomem);
	1328	object_unref(OBJECT(&subpage->iomem));
	1329	g_free(subpage);
	1330	}
	1331	}
	1332
	1333	static void phys_sections_free(PhysPageMap *map)
	1334	{
	1335	while (map->sections_nb > 0) {
	1336	MemoryRegionSection *section = &map->sections[--map->sections_nb];
	1337	phys_section_destroy(section->mr);
	1338	}
	1339	g_free(map->sections);
	1340	g_free(map->nodes);
	1341	}
	1342
	1343	static void register_subpage(FlatView fv, MemoryRegionSection section)
	1344	{
	1345	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1346	subpage_t *subpage;
	1347	hwaddr base = section->offset_within_address_space
	1348	& TARGET_PAGE_MASK;
	1349	MemoryRegionSection *existing = phys_page_find(d, base);
	1350	MemoryRegionSection subsection = {
	1351	.offset_within_address_space = base,
	1352	.size = int128_make64(TARGET_PAGE_SIZE),
	1353	};
	1354	hwaddr start, end;
	1355
	1356	assert(existing->mr->subpage \|\| existing->mr == &io_mem_unassigned);
	1357
	1358	if (!(existing->mr->subpage)) {
	1359	subpage = subpage_init(fv, base);
	1360	subsection.fv = fv;
	1361	subsection.mr = &subpage->iomem;
	1362	phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
	1363	phys_section_add(&d->map, &subsection));
	1364	} else {
	1365	subpage = container_of(existing->mr, subpage_t, iomem);
	1366	}
	1367	start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
	1368	end = start + int128_get64(section->size) - 1;
	1369	subpage_register(subpage, start, end,
	1370	phys_section_add(&d->map, section));
	1371	}
	1372
	1373
	1374	static void register_multipage(FlatView *fv,
	1375	MemoryRegionSection *section)
	1376	{
	1377	AddressSpaceDispatch *d = flatview_to_dispatch(fv);
	1378	hwaddr start_addr = section->offset_within_address_space;
	1379	uint16_t section_index = phys_section_add(&d->map, section);
	1380	uint64_t num_pages = int128_get64(int128_rshift(section->size,
	1381	TARGET_PAGE_BITS));
	1382
	1383	assert(num_pages);
	1384	phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
	1385	}
	1386
	1387	void flatview_add_to_dispatch(FlatView fv, MemoryRegionSection section)
	1388	{
	1389	MemoryRegionSection now = section, remain = section;
	1390	Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
	1391
	1392	if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1393	uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
	1394	- now.offset_within_address_space;
	1395
	1396	now.size = int128_min(int128_make64(left), now.size);
	1397	register_subpage(fv, &now);
	1398	} else {
	1399	now.size = int128_zero();
	1400	}
	1401	while (int128_ne(remain.size, now.size)) {
	1402	remain.size = int128_sub(remain.size, now.size);
	1403	remain.offset_within_address_space += int128_get64(now.size);
	1404	remain.offset_within_region += int128_get64(now.size);
	1405	now = remain;
	1406	if (int128_lt(remain.size, page_size)) {
	1407	register_subpage(fv, &now);
	1408	} else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
	1409	now.size = page_size;
	1410	register_subpage(fv, &now);
	1411	} else {
	1412	now.size = int128_and(now.size, int128_neg(page_size));
	1413	register_multipage(fv, &now);
	1414	}
	1415	}
	1416	}
	1417
	/* Flush the coalesced MMIO buffer; does nothing unless KVM is enabled. */
	void qemu_flush_coalesced_mmio_buffer(void)
	{
	    if (!kvm_enabled()) {
	        return;
	    }
	    kvm_flush_coalesced_mmio_buffer();
	}
	1423
	1424	void qemu_mutex_lock_ramlist(void)
	1425	{
	1426	qemu_mutex_lock(&ram_list.mutex);
	1427	}
	1428
	1429	void qemu_mutex_unlock_ramlist(void)
	1430	{
	1431	qemu_mutex_unlock(&ram_list.mutex);
	1432	}
	1433
	1434	void ram_block_dump(Monitor *mon)
	1435	{
	1436	RAMBlock *block;
	1437	char *psize;
	1438
	1439	rcu_read_lock();
	1440	monitor_printf(mon, "%24s %8s %18s %18s %18s\n",
	1441	"Block Name", "PSize", "Offset", "Used", "Total");
	1442	RAMBLOCK_FOREACH(block) {
	1443	psize = size_to_str(block->page_size);
	1444	monitor_printf(mon, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
	1445	" 0x%016" PRIx64 "\n", block->idstr, psize,
	1446	(uint64_t)block->offset,
	1447	(uint64_t)block->used_length,
	1448	(uint64_t)block->max_length);
	1449	g_free(psize);
	1450	}
	1451	rcu_read_unlock();
	1452	}
	1453
	1454	#ifdef __linux__
	1455	/*
	1456	* FIXME TOCTTOU: this iterates over memory backends' mem-path, which
	1457	* may or may not name the same files / on the same filesystem now as
	1458	* when we actually open and map them. Iterate over the file
	1459	* descriptors instead, and use qemu_fd_getpagesize().
	1460	*/
	1461	static int find_max_supported_pagesize(Object obj, void opaque)
	1462	{
	1463	char *mem_path;
	1464	long *hpsize_min = opaque;
	1465
	1466	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	1467	mem_path = object_property_get_str(obj, "mem-path", NULL);
	1468	if (mem_path) {
	1469	long hpsize = qemu_mempath_getpagesize(mem_path);
	1470	if (hpsize < *hpsize_min) {
	1471	*hpsize_min = hpsize;
	1472	}
	1473	} else {
	1474	*hpsize_min = getpagesize();
	1475	}
	1476	}
	1477
	1478	return 0;
	1479	}
	1480
	1481	long qemu_getrampagesize(void)
	1482	{
	1483	long hpsize = LONG_MAX;
	1484	long mainrampagesize;
	1485	Object *memdev_root;
	1486
	1487	if (mem_path) {
	1488	mainrampagesize = qemu_mempath_getpagesize(mem_path);
	1489	} else {
	1490	mainrampagesize = getpagesize();
	1491	}
	1492
	1493	/* it's possible we have memory-backend objects with
	1494	* hugepage-backed RAM. these may get mapped into system
	1495	* address space via -numa parameters or memory hotplug
	1496	* hooks. we want to take these into account, but we
	1497	* also want to make sure these supported hugepage
	1498	* sizes are applicable across the entire range of memory
	1499	* we may boot from, so we take the min across all
	1500	* backends, and assume normal pages in cases where a
	1501	* backend isn't backed by hugepages.
	1502	*/
	1503	memdev_root = object_resolve_path("/objects", NULL);
	1504	if (memdev_root) {
	1505	object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
	1506	}
	1507	if (hpsize == LONG_MAX) {
	1508	/* No additional memory regions found ==> Report main RAM page size */
	1509	return mainrampagesize;
	1510	}
	1511
	1512	/* If NUMA is disabled or the NUMA nodes are not backed with a
	1513	* memory-backend, then there is at least one node using "normal" RAM,
	1514	* so if its page size is smaller we have got to report that size instead.
	1515	*/
	1516	if (hpsize > mainrampagesize &&
	1517	(nb_numa_nodes == 0 \|\| numa_info[0].node_memdev == NULL)) {
	1518	static bool warned;
	1519	if (!warned) {
	1520	error_report("Huge page support disabled (n/a for main memory).");
	1521	warned = true;
	1522	}
	1523	return mainrampagesize;
	1524	}
	1525
	1526	return hpsize;
	1527	}
	1528	#else
	/* Non-Linux build: the RAM page size is simply getpagesize(). */
	long qemu_getrampagesize(void)
	{
	    long host_page_size = getpagesize();

	    return host_page_size;
	}
	1533	#endif
	1534
	1535	#ifdef __linux__
	/*
	 * Return the size of the file behind @fd, found by seeking to its end, or
	 * -errno if lseek() fails.  The file offset of @fd is left at the end.
	 */
	static int64_t get_file_size(int fd)
	{
	    int64_t end_pos = lseek(fd, 0, SEEK_END);

	    return end_pos < 0 ? -errno : end_pos;
	}
	1544
	1545	static int file_ram_open(const char *path,
	1546	const char *region_name,
	1547	bool *created,
	1548	Error **errp)
	1549	{
	1550	char *filename;
	1551	char *sanitized_name;
	1552	char *c;
	1553	int fd = -1;
	1554
	1555	*created = false;
	1556	for (;;) {
	1557	fd = open(path, O_RDWR);
	1558	if (fd >= 0) {
	1559	/* @path names an existing file, use it */
	1560	break;
	1561	}
	1562	if (errno == ENOENT) {
	1563	/* @path names a file that doesn't exist, create it */
	1564	fd = open(path, O_RDWR \| O_CREAT \| O_EXCL, 0644);
	1565	if (fd >= 0) {
	1566	*created = true;
	1567	break;
	1568	}
	1569	} else if (errno == EISDIR) {
	1570	/* @path names a directory, create a file there */
	1571	/* Make name safe to use with mkstemp by replacing '/' with '_'. */
	1572	sanitized_name = g_strdup(region_name);
	1573	for (c = sanitized_name; *c != '\0'; c++) {
	1574	if (*c == '/') {
	1575	*c = '_';
	1576	}
	1577	}
	1578
	1579	filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
	1580	sanitized_name);
	1581	g_free(sanitized_name);
	1582
	1583	fd = mkstemp(filename);
	1584	if (fd >= 0) {
	1585	unlink(filename);
	1586	g_free(filename);
	1587	break;
	1588	}
	1589	g_free(filename);
	1590	}
	1591	if (errno != EEXIST && errno != EINTR) {
	1592	error_setg_errno(errp, errno,
	1593	"can't open backing store %s for guest RAM",
	1594	path);
	1595	return -1;
	1596	}
	1597	/*
	1598	* Try again on EINTR and EEXIST. The latter happens when
	1599	* something else creates the file between our two open().
	1600	*/
	1601	}
	1602
	1603	return fd;
	1604	}
	1605
	1606	static void file_ram_alloc(RAMBlock block,
	1607	ram_addr_t memory,
	1608	int fd,
	1609	bool truncate,
	1610	Error **errp)
	1611	{
	1612	void *area;
	1613
	1614	block->page_size = qemu_fd_getpagesize(fd);
	1615	if (block->mr->align % block->page_size) {
	1616	error_setg(errp, "alignment 0x%" PRIx64
	1617	" must be multiples of page size 0x%zx",
	1618	block->mr->align, block->page_size);
	1619	return NULL;
	1620	}
	1621	block->mr->align = MAX(block->page_size, block->mr->align);
	1622	#if defined(__s390x__)
	1623	if (kvm_enabled()) {
	1624	block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
	1625	}
	1626	#endif
	1627
	1628	if (memory < block->page_size) {
	1629	error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
	1630	"or larger than page size 0x%zx",
	1631	memory, block->page_size);
	1632	return NULL;
	1633	}
	1634
	1635	memory = ROUND_UP(memory, block->page_size);
	1636
	1637	/*
	1638	* ftruncate is not supported by hugetlbfs in older
	1639	* hosts, so don't bother bailing out on errors.
	1640	* If anything goes wrong with it under other filesystems,
	1641	* mmap will fail.
	1642	*
	1643	* Do not truncate the non-empty backend file to avoid corrupting
	1644	* the existing data in the file. Disabling shrinking is not
	1645	* enough. For example, the current vNVDIMM implementation stores
	1646	* the guest NVDIMM labels at the end of the backend file. If the
	1647	* backend file is later extended, QEMU will not be able to find
	1648	* those labels. Therefore, extending the non-empty backend file
	1649	* is disabled as well.
	1650	*/
	1651	if (truncate && ftruncate(fd, memory)) {
	1652	perror("ftruncate");
	1653	}
	1654
	1655	area = qemu_ram_mmap(fd, memory, block->mr->align,
	1656	block->flags & RAM_SHARED);
	1657	if (area == MAP_FAILED) {
	1658	error_setg_errno(errp, errno,
	1659	"unable to map backing store for guest RAM");
	1660	return NULL;
	1661	}
	1662
	1663	if (mem_prealloc) {
	1664	os_mem_prealloc(fd, area, memory, smp_cpus, errp);
	1665	if (errp && *errp) {
	1666	qemu_ram_munmap(area, memory);
	1667	return NULL;
	1668	}
	1669	}
	1670
	1671	block->fd = fd;
	1672	return area;
	1673	}
	1674	#endif
	1675
	1676	/* Allocate space within the ram_addr_t space that governs the
	1677	* dirty bitmaps.
	1678	* Called with the ramlist lock held.
	1679	*/
	1680	static ram_addr_t find_ram_offset(ram_addr_t size)
	1681	{
	1682	RAMBlock block, next_block;
	1683	ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
	1684
	1685	assert(size != 0); /* it would hand out same offset multiple times */
	1686
	1687	if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
	1688	return 0;
	1689	}
	1690
	1691	RAMBLOCK_FOREACH(block) {
	1692	ram_addr_t candidate, next = RAM_ADDR_MAX;
	1693
	1694	/* Align blocks to start on a 'long' in the bitmap
	1695	* which makes the bitmap sync'ing take the fast path.
	1696	*/
	1697	candidate = block->offset + block->max_length;
	1698	candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
	1699
	1700	/* Search for the closest following block
	1701	* and find the gap.
	1702	*/
	1703	RAMBLOCK_FOREACH(next_block) {
	1704	if (next_block->offset >= candidate) {
	1705	next = MIN(next, next_block->offset);
	1706	}
	1707	}
	1708
	1709	/* If it fits remember our place and remember the size
	1710	* of gap, but keep going so that we might find a smaller
	1711	* gap to fill so avoiding fragmentation.
	1712	*/
	1713	if (next - candidate >= size && next - candidate < mingap) {
	1714	offset = candidate;
	1715	mingap = next - candidate;
	1716	}
	1717
	1718	trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
	1719	}
	1720
	1721	if (offset == RAM_ADDR_MAX) {
	1722	fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
	1723	(uint64_t)size);
	1724	abort();
	1725	}
	1726
	1727	trace_find_ram_offset(size, offset);
	1728
	1729	return offset;
	1730	}
	1731
	1732	unsigned long last_ram_page(void)
	1733	{
	1734	RAMBlock *block;
	1735	ram_addr_t last = 0;
	1736
	1737	rcu_read_lock();
	1738	RAMBLOCK_FOREACH(block) {
	1739	last = MAX(last, block->offset + block->max_length);
	1740	}
	1741	rcu_read_unlock();
	1742	return last >> TARGET_PAGE_BITS;
	1743	}
	1744
	1745	static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
	1746	{
	1747	int ret;
	1748
	1749	/* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
	1750	if (!machine_dump_guest_core(current_machine)) {
	1751	ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
	1752	if (ret) {
	1753	perror("qemu_madvise");
	1754	fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
	1755	"but dump_guest_core=off specified\n");
	1756	}
	1757	}
	1758	}
	1759
	1760	const char qemu_ram_get_idstr(RAMBlock rb)
	1761	{
	1762	return rb->idstr;
	1763	}
	1764
	1765	bool qemu_ram_is_shared(RAMBlock *rb)
	1766	{
	1767	return rb->flags & RAM_SHARED;
	1768	}
	1769
	1770	/* Called with iothread lock held. */
	1771	void qemu_ram_set_idstr(RAMBlock new_block, const char name, DeviceState *dev)
	1772	{
	1773	RAMBlock *block;
	1774
	1775	assert(new_block);
	1776	assert(!new_block->idstr[0]);
	1777
	1778	if (dev) {
	1779	char *id = qdev_get_dev_path(dev);
	1780	if (id) {
	1781	snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
	1782	g_free(id);
	1783	}
	1784	}
	1785	pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
	1786
	1787	rcu_read_lock();
	1788	RAMBLOCK_FOREACH(block) {
	1789	if (block != new_block &&
	1790	!strcmp(block->idstr, new_block->idstr)) {
	1791	fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
	1792	new_block->idstr);
	1793	abort();
	1794	}
	1795	}
	1796	rcu_read_unlock();
	1797	}
	1798
	1799	/* Called with iothread lock held. */
	1800	void qemu_ram_unset_idstr(RAMBlock *block)
	1801	{
	1802	/* FIXME: arch_init.c assumes that this is not called throughout
	1803	* migration. Ignore the problem since hot-unplug during migration
	1804	* does not work anyway.
	1805	*/
	1806	if (block) {
	1807	memset(block->idstr, 0, sizeof(block->idstr));
	1808	}
	1809	}
	1810
	1811	size_t qemu_ram_pagesize(RAMBlock *rb)
	1812	{
	1813	return rb->page_size;
	1814	}
	1815
	1816	/* Returns the largest size of page in use */
	1817	size_t qemu_ram_pagesize_largest(void)
	1818	{
	1819	RAMBlock *block;
	1820	size_t largest = 0;
	1821
	1822	RAMBLOCK_FOREACH(block) {
	1823	largest = MAX(largest, qemu_ram_pagesize(block));
	1824	}
	1825
	1826	return largest;
	1827	}
	1828
	1829	static int memory_try_enable_merging(void *addr, size_t len)
	1830	{
	1831	if (!machine_mem_merge(current_machine)) {
	1832	/* disabled by the user */
	1833	return 0;
	1834	}
	1835
	1836	return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
	1837	}
	1838
	1839	/* Only legal before guest might have detected the memory size: e.g. on
	1840	* incoming migration, or right after reset.
	1841	*
	1842	* As memory core doesn't know how is memory accessed, it is up to
	1843	* resize callback to update device state and/or add assertions to detect
	1844	* misuse, if necessary.
	1845	*/
	1846	int qemu_ram_resize(RAMBlock block, ram_addr_t newsize, Error *errp)
	1847	{
	1848	assert(block);
	1849
	1850	newsize = HOST_PAGE_ALIGN(newsize);
	1851
	1852	if (block->used_length == newsize) {
	1853	return 0;
	1854	}
	1855
	1856	if (!(block->flags & RAM_RESIZEABLE)) {
	1857	error_setg_errno(errp, EINVAL,
	1858	"Length mismatch: %s: 0x" RAM_ADDR_FMT
	1859	" in != 0x" RAM_ADDR_FMT, block->idstr,
	1860	newsize, block->used_length);
	1861	return -EINVAL;
	1862	}
	1863
	1864	if (block->max_length < newsize) {
	1865	error_setg_errno(errp, EINVAL,
	1866	"Length too large: %s: 0x" RAM_ADDR_FMT
	1867	" > 0x" RAM_ADDR_FMT, block->idstr,
	1868	newsize, block->max_length);
	1869	return -EINVAL;
	1870	}
	1871
	1872	cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
	1873	block->used_length = newsize;
	1874	cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
	1875	DIRTY_CLIENTS_ALL);
	1876	memory_region_set_size(block->mr, newsize);
	1877	if (block->resized) {
	1878	block->resized(block->idstr, newsize, block->host);
	1879	}
	1880	return 0;
	1881	}
	1882
	1883	/* Called with ram_list.mutex held */
	1884	static void dirty_memory_extend(ram_addr_t old_ram_size,
	1885	ram_addr_t new_ram_size)
	1886	{
	1887	ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
	1888	DIRTY_MEMORY_BLOCK_SIZE);
	1889	ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
	1890	DIRTY_MEMORY_BLOCK_SIZE);
	1891	int i;
	1892
	1893	/* Only need to extend if block count increased */
	1894	if (new_num_blocks <= old_num_blocks) {
	1895	return;
	1896	}
	1897
	1898	for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
	1899	DirtyMemoryBlocks *old_blocks;
	1900	DirtyMemoryBlocks *new_blocks;
	1901	int j;
	1902
	1903	old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
	1904	new_blocks = g_malloc(sizeof(*new_blocks) +
	1905	sizeof(new_blocks->blocks[0]) * new_num_blocks);
	1906
	1907	if (old_num_blocks) {
	1908	memcpy(new_blocks->blocks, old_blocks->blocks,
	1909	old_num_blocks * sizeof(old_blocks->blocks[0]));
	1910	}
	1911
	1912	for (j = old_num_blocks; j < new_num_blocks; j++) {
	1913	new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
	1914	}
	1915
	1916	atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
	1917
	1918	if (old_blocks) {
	1919	g_free_rcu(old_blocks, rcu);
	1920	}
	1921	}
	1922	}
	1923
	1924	static void ram_block_add(RAMBlock new_block, Error *errp)
	1925	{
	1926	RAMBlock *block;
	1927	RAMBlock *last_block = NULL;
	1928	ram_addr_t old_ram_size, new_ram_size;
	1929	Error *err = NULL;
	1930
	1931	old_ram_size = last_ram_page();
	1932
	1933	qemu_mutex_lock_ramlist();
	1934	new_block->offset = find_ram_offset(new_block->max_length);
	1935
	1936	if (!new_block->host) {
	1937	if (xen_enabled()) {
	1938	xen_ram_alloc(new_block->offset, new_block->max_length,
	1939	new_block->mr, &err);
	1940	if (err) {
	1941	error_propagate(errp, err);
	1942	qemu_mutex_unlock_ramlist();
	1943	return;
	1944	}
	1945	} else {
	1946	new_block->host = phys_mem_alloc(new_block->max_length,
	1947	&new_block->mr->align);
	1948	if (!new_block->host) {
	1949	error_setg_errno(errp, errno,
	1950	"cannot set up guest memory '%s'",
	1951	memory_region_name(new_block->mr));
	1952	qemu_mutex_unlock_ramlist();
	1953	return;
	1954	}
	1955	memory_try_enable_merging(new_block->host, new_block->max_length);
	1956	}
	1957	}
	1958
	1959	new_ram_size = MAX(old_ram_size,
	1960	(new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
	1961	if (new_ram_size > old_ram_size) {
	1962	dirty_memory_extend(old_ram_size, new_ram_size);
	1963	}
	1964	/* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
	1965	* QLIST (which has an RCU-friendly variant) does not have insertion at
	1966	* tail, so save the last element in last_block.
	1967	*/
	1968	RAMBLOCK_FOREACH(block) {
	1969	last_block = block;
	1970	if (block->max_length < new_block->max_length) {
	1971	break;
	1972	}
	1973	}
	1974	if (block) {
	1975	QLIST_INSERT_BEFORE_RCU(block, new_block, next);
	1976	} else if (last_block) {
	1977	QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
	1978	} else { /* list is empty */
	1979	QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
	1980	}
	1981	ram_list.mru_block = NULL;
	1982
	1983	/* Write list before version */
	1984	smp_wmb();
	1985	ram_list.version++;
	1986	qemu_mutex_unlock_ramlist();
	1987
	1988	cpu_physical_memory_set_dirty_range(new_block->offset,
	1989	new_block->used_length,
	1990	DIRTY_CLIENTS_ALL);
	1991
	1992	if (new_block->host) {
	1993	qemu_ram_setup_dump(new_block->host, new_block->max_length);
	1994	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
	1995	/* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
	1996	qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
	1997	ram_block_notify_add(new_block->host, new_block->max_length);
	1998	}
	1999	}
	2000
	2001	#ifdef __linux__
	2002	RAMBlock qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion mr,
	2003	bool share, int fd,
	2004	Error **errp)
	2005	{
	2006	RAMBlock *new_block;
	2007	Error *local_err = NULL;
	2008	int64_t file_size;
	2009
	2010	if (xen_enabled()) {
	2011	error_setg(errp, "-mem-path not supported with Xen");
	2012	return NULL;
	2013	}
	2014
	2015	if (kvm_enabled() && !kvm_has_sync_mmu()) {
	2016	error_setg(errp,
	2017	"host lacks kvm mmu notifiers, -mem-path unsupported");
	2018	return NULL;
	2019	}
	2020
	2021	if (phys_mem_alloc != qemu_anon_ram_alloc) {
	2022	/*
	2023	* file_ram_alloc() needs to allocate just like
	2024	* phys_mem_alloc, but we haven't bothered to provide
	2025	* a hook there.
	2026	*/
	2027	error_setg(errp,
	2028	"-mem-path not supported with this accelerator");
	2029	return NULL;
	2030	}
	2031
	2032	size = HOST_PAGE_ALIGN(size);
	2033	file_size = get_file_size(fd);
	2034	if (file_size > 0 && file_size < size) {
	2035	error_setg(errp, "backing store %s size 0x%" PRIx64
	2036	" does not match 'size' option 0x" RAM_ADDR_FMT,
	2037	mem_path, file_size, size);
	2038	return NULL;
	2039	}
	2040
	2041	new_block = g_malloc0(sizeof(*new_block));
	2042	new_block->mr = mr;
	2043	new_block->used_length = size;
	2044	new_block->max_length = size;
	2045	new_block->flags = share ? RAM_SHARED : 0;
	2046	new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
	2047	if (!new_block->host) {
	2048	g_free(new_block);
	2049	return NULL;
	2050	}
	2051
	2052	ram_block_add(new_block, &local_err);
	2053	if (local_err) {
	2054	g_free(new_block);
	2055	error_propagate(errp, local_err);
	2056	return NULL;
	2057	}
	2058	return new_block;
	2059
	2060	}
	2061
	2062
	2063	RAMBlock qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion mr,
	2064	bool share, const char *mem_path,
	2065	Error **errp)
	2066	{
	2067	int fd;
	2068	bool created;
	2069	RAMBlock *block;
	2070
	2071	fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
	2072	if (fd < 0) {
	2073	return NULL;
	2074	}
	2075
	2076	block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
	2077	if (!block) {
	2078	if (created) {
	2079	unlink(mem_path);
	2080	}
	2081	close(fd);
	2082	return NULL;
	2083	}
	2084
	2085	return block;
	2086	}
	2087	#endif
	2088
	2089	static
	2090	RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
	2091	void (resized)(const char,
	2092	uint64_t length,
	2093	void *host),
	2094	void *host, bool resizeable,
	2095	MemoryRegion mr, Error *errp)
	2096	{
	2097	RAMBlock *new_block;
	2098	Error *local_err = NULL;
	2099
	2100	size = HOST_PAGE_ALIGN(size);
	2101	max_size = HOST_PAGE_ALIGN(max_size);
	2102	new_block = g_malloc0(sizeof(*new_block));
	2103	new_block->mr = mr;
	2104	new_block->resized = resized;
	2105	new_block->used_length = size;
	2106	new_block->max_length = max_size;
	2107	assert(max_size >= size);
	2108	new_block->fd = -1;
	2109	new_block->page_size = getpagesize();
	2110	new_block->host = host;
	2111	if (host) {
	2112	new_block->flags \|= RAM_PREALLOC;
	2113	}
	2114	if (resizeable) {
	2115	new_block->flags \|= RAM_RESIZEABLE;
	2116	}
	2117	ram_block_add(new_block, &local_err);
	2118	if (local_err) {
	2119	g_free(new_block);
	2120	error_propagate(errp, local_err);
	2121	return NULL;
	2122	}
	2123	return new_block;
	2124	}
	2125
	2126	RAMBlock qemu_ram_alloc_from_ptr(ram_addr_t size, void host,
	2127	MemoryRegion mr, Error *errp)
	2128	{
	2129	return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
	2130	}
	2131
	2132	RAMBlock qemu_ram_alloc(ram_addr_t size, MemoryRegion mr, Error **errp)
	2133	{
	2134	return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
	2135	}
	2136
	2137	RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
	2138	void (resized)(const char,
	2139	uint64_t length,
	2140	void *host),
	2141	MemoryRegion mr, Error *errp)
	2142	{
	2143	return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
	2144	}
	2145
	2146	static void reclaim_ramblock(RAMBlock *block)
	2147	{
	2148	if (block->flags & RAM_PREALLOC) {
	2149	;
	2150	} else if (xen_enabled()) {
	2151	xen_invalidate_map_cache_entry(block->host);
	2152	#ifndef _WIN32
	2153	} else if (block->fd >= 0) {
	2154	qemu_ram_munmap(block->host, block->max_length);
	2155	close(block->fd);
	2156	#endif
	2157	} else {
	2158	qemu_anon_ram_free(block->host, block->max_length);
	2159	}
	2160	g_free(block);
	2161	}
	2162
	2163	void qemu_ram_free(RAMBlock *block)
	2164	{
	2165	if (!block) {
	2166	return;
	2167	}
	2168
	2169	if (block->host) {
	2170	ram_block_notify_remove(block->host, block->max_length);
	2171	}
	2172
	2173	qemu_mutex_lock_ramlist();
	2174	QLIST_REMOVE_RCU(block, next);
	2175	ram_list.mru_block = NULL;
	2176	/* Write list before version */
	2177	smp_wmb();
	2178	ram_list.version++;
	2179	call_rcu(block, reclaim_ramblock, rcu);
	2180	qemu_mutex_unlock_ramlist();
	2181	}
	2182
	2183	#ifndef _WIN32
	2184	void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
	2185	{
	2186	RAMBlock *block;
	2187	ram_addr_t offset;
	2188	int flags;
	2189	void area, vaddr;
	2190
	2191	RAMBLOCK_FOREACH(block) {
	2192	offset = addr - block->offset;
	2193	if (offset < block->max_length) {
	2194	vaddr = ramblock_ptr(block, offset);
	2195	if (block->flags & RAM_PREALLOC) {
	2196	;
	2197	} else if (xen_enabled()) {
	2198	abort();
	2199	} else {
	2200	flags = MAP_FIXED;
	2201	if (block->fd >= 0) {
	2202	flags \|= (block->flags & RAM_SHARED ?
	2203	MAP_SHARED : MAP_PRIVATE);
	2204	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2205	flags, block->fd, offset);
	2206	} else {
	2207	/*
	2208	* Remap needs to match alloc. Accelerators that
	2209	* set phys_mem_alloc never remap. If they did,
	2210	* we'd need a remap hook here.
	2211	*/
	2212	assert(phys_mem_alloc == qemu_anon_ram_alloc);
	2213
	2214	flags \|= MAP_PRIVATE \| MAP_ANONYMOUS;
	2215	area = mmap(vaddr, length, PROT_READ \| PROT_WRITE,
	2216	flags, -1, 0);
	2217	}
	2218	if (area != vaddr) {
	2219	error_report("Could not remap addr: "
	2220	RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
	2221	length, addr);
	2222	exit(1);
	2223	}
	2224	memory_try_enable_merging(vaddr, length);
	2225	qemu_ram_setup_dump(vaddr, length);
	2226	}
	2227	}
	2228	}
	2229	}
	2230	#endif /* !_WIN32 */
	2231
	2232	/* Return a host pointer to ram allocated with qemu_ram_alloc.
	2233	* This should not be used for general purpose DMA. Use address_space_map
	2234	* or address_space_rw instead. For local memory (e.g. video ram) that the
	2235	* device owns, use memory_region_get_ram_ptr.
	2236	*
	2237	* Called within RCU critical section.
	2238	*/
	2239	void qemu_map_ram_ptr(RAMBlock ram_block, ram_addr_t addr)
	2240	{
	2241	RAMBlock *block = ram_block;
	2242
	2243	if (block == NULL) {
	2244	block = qemu_get_ram_block(addr);
	2245	addr -= block->offset;
	2246	}
	2247
	2248	if (xen_enabled() && block->host == NULL) {
	2249	/* We need to check if the requested address is in the RAM
	2250	* because we don't want to map the entire memory in QEMU.
	2251	* In that case just map until the end of the page.
	2252	*/
	2253	if (block->offset == 0) {
	2254	return xen_map_cache(addr, 0, 0, false);
	2255	}
	2256
	2257	block->host = xen_map_cache(block->offset, block->max_length, 1, false);
	2258	}
	2259	return ramblock_ptr(block, addr);
	2260	}
	2261
	2262	/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
	2263	* but takes a size argument.
	2264	*
	2265	* Called within RCU critical section.
	2266	*/
	2267	static void qemu_ram_ptr_length(RAMBlock ram_block, ram_addr_t addr,
	2268	hwaddr *size, bool lock)
	2269	{
	2270	RAMBlock *block = ram_block;
	2271	if (*size == 0) {
	2272	return NULL;
	2273	}
	2274
	2275	if (block == NULL) {
	2276	block = qemu_get_ram_block(addr);
	2277	addr -= block->offset;
	2278	}
	2279	size = MIN(size, block->max_length - addr);
	2280
	2281	if (xen_enabled() && block->host == NULL) {
	2282	/* We need to check if the requested address is in the RAM
	2283	* because we don't want to map the entire memory in QEMU.
	2284	* In that case just map the requested area.
	2285	*/
	2286	if (block->offset == 0) {
	2287	return xen_map_cache(addr, *size, lock, lock);
	2288	}
	2289
	2290	block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
	2291	}
	2292
	2293	return ramblock_ptr(block, addr);
	2294	}
	2295
	2296	/*
	2297	* Translates a host ptr back to a RAMBlock, a ram_addr and an offset
	2298	* in that RAMBlock.
	2299	*
	2300	* ptr: Host pointer to look up
	2301	* round_offset: If true round the result offset down to a page boundary
	2302	* *ram_addr: set to result ram_addr
	2303	* *offset: set to result offset within the RAMBlock
	2304	*
	2305	* Returns: RAMBlock (or NULL if not found)
	2306	*
	2307	* By the time this function returns, the returned pointer is not protected
	2308	* by RCU anymore. If the caller is not within an RCU critical section and
	2309	* does not hold the iothread lock, it must have other means of protecting the
	2310	* pointer, such as a reference to the region that includes the incoming
	2311	* ram_addr_t.
	2312	*/
	2313	RAMBlock qemu_ram_block_from_host(void ptr, bool round_offset,
	2314	ram_addr_t *offset)
	2315	{
	2316	RAMBlock *block;
	2317	uint8_t *host = ptr;
	2318
	2319	if (xen_enabled()) {
	2320	ram_addr_t ram_addr;
	2321	rcu_read_lock();
	2322	ram_addr = xen_ram_addr_from_mapcache(ptr);
	2323	block = qemu_get_ram_block(ram_addr);
	2324	if (block) {
	2325	*offset = ram_addr - block->offset;
	2326	}
	2327	rcu_read_unlock();
	2328	return block;
	2329	}
	2330
	2331	rcu_read_lock();
	2332	block = atomic_rcu_read(&ram_list.mru_block);
	2333	if (block && block->host && host - block->host < block->max_length) {
	2334	goto found;
	2335	}
	2336
	2337	RAMBLOCK_FOREACH(block) {
	2338	/* This case append when the block is not mapped. */
	2339	if (block->host == NULL) {
	2340	continue;
	2341	}
	2342	if (host - block->host < block->max_length) {
	2343	goto found;
	2344	}
	2345	}
	2346
	2347	rcu_read_unlock();
	2348	return NULL;
	2349
	2350	found:
	2351	*offset = (host - block->host);
	2352	if (round_offset) {
	2353	*offset &= TARGET_PAGE_MASK;
	2354	}
	2355	rcu_read_unlock();
	2356	return block;
	2357	}
	2358
	2359	/*
	2360	* Finds the named RAMBlock
	2361	*
	2362	* name: The name of RAMBlock to find
	2363	*
	2364	* Returns: RAMBlock (or NULL if not found)
	2365	*/
	2366	RAMBlock qemu_ram_block_by_name(const char name)
	2367	{
	2368	RAMBlock *block;
	2369
	2370	RAMBLOCK_FOREACH(block) {
	2371	if (!strcmp(name, block->idstr)) {
	2372	return block;
	2373	}
	2374	}
	2375
	2376	return NULL;
	2377	}
	2378
	2379	/* Some of the softmmu routines need to translate from a host pointer
	2380	(typically a TLB entry) back to a ram offset. */
	2381	ram_addr_t qemu_ram_addr_from_host(void *ptr)
	2382	{
	2383	RAMBlock *block;
	2384	ram_addr_t offset;
	2385
	2386	block = qemu_ram_block_from_host(ptr, false, &offset);
	2387	if (!block) {
	2388	return RAM_ADDR_INVALID;
	2389	}
	2390
	2391	return block->offset + offset;
	2392	}
	2393
	2394	/* Called within RCU critical section. */
	2395	void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
	2396	CPUState *cpu,
	2397	vaddr mem_vaddr,
	2398	ram_addr_t ram_addr,
	2399	unsigned size)
	2400	{
	2401	ndi->cpu = cpu;
	2402	ndi->ram_addr = ram_addr;
	2403	ndi->mem_vaddr = mem_vaddr;
	2404	ndi->size = size;
	2405	ndi->locked = false;
	2406
	2407	assert(tcg_enabled());
	2408	if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
	2409	ndi->locked = true;
	2410	tb_lock();
	2411	tb_invalidate_phys_page_fast(ram_addr, size);
	2412	}
	2413	}
	2414
	2415	/* Called within RCU critical section. */
	2416	void memory_notdirty_write_complete(NotDirtyInfo *ndi)
	2417	{
	2418	if (ndi->locked) {
	2419	tb_unlock();
	2420	}
	2421
	2422	/* Set both VGA and migration bits for simplicity and to remove
	2423	* the notdirty callback faster.
	2424	*/
	2425	cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
	2426	DIRTY_CLIENTS_NOCODE);
	2427	/* we remove the notdirty callback only if the code has been
	2428	flushed */
	2429	if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
	2430	tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
	2431	}
	2432	}
	2433
	2434	/* Called within RCU critical section. */
	2435	static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
	2436	uint64_t val, unsigned size)
	2437	{
	2438	NotDirtyInfo ndi;
	2439
	2440	memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
	2441	ram_addr, size);
	2442
	2443	switch (size) {
	2444	case 1:
	2445	stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2446	break;
	2447	case 2:
	2448	stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2449	break;
	2450	case 4:
	2451	stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2452	break;
	2453	case 8:
	2454	stq_p(qemu_map_ram_ptr(NULL, ram_addr), val);
	2455	break;
	2456	default:
	2457	abort();
	2458	}
	2459	memory_notdirty_write_complete(&ndi);
	2460	}
	2461
	2462	static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
	2463	unsigned size, bool is_write)
	2464	{
	2465	return is_write;
	2466	}
	2467
	2468	static const MemoryRegionOps notdirty_mem_ops = {
	2469	.write = notdirty_mem_write,
	2470	.valid.accepts = notdirty_mem_accepts,
	2471	.endianness = DEVICE_NATIVE_ENDIAN,
	2472	.valid = {
	2473	.min_access_size = 1,
	2474	.max_access_size = 8,
	2475	.unaligned = false,
	2476	},
	2477	.impl = {
	2478	.min_access_size = 1,
	2479	.max_access_size = 8,
	2480	.unaligned = false,
	2481	},
	2482	};
	2483
	2484	/* Generate a debug exception if a watchpoint has been hit. */
	2485	static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
	2486	{
	2487	CPUState *cpu = current_cpu;
	2488	CPUClass *cc = CPU_GET_CLASS(cpu);
	2489	target_ulong vaddr;
	2490	CPUWatchpoint *wp;
	2491
	2492	assert(tcg_enabled());
	2493	if (cpu->watchpoint_hit) {
	2494	/* We re-entered the check after replacing the TB. Now raise
	2495	* the debug interrupt so that is will trigger after the
	2496	* current instruction. */
	2497	cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
	2498	return;
	2499	}
	2500	vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
	2501	vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
	2502	QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
	2503	if (cpu_watchpoint_address_matches(wp, vaddr, len)
	2504	&& (wp->flags & flags)) {
	2505	if (flags == BP_MEM_READ) {
	2506	wp->flags \|= BP_WATCHPOINT_HIT_READ;
	2507	} else {
	2508	wp->flags \|= BP_WATCHPOINT_HIT_WRITE;
	2509	}
	2510	wp->hitaddr = vaddr;
	2511	wp->hitattrs = attrs;
	2512	if (!cpu->watchpoint_hit) {
	2513	if (wp->flags & BP_CPU &&
	2514	!cc->debug_check_watchpoint(cpu, wp)) {
	2515	wp->flags &= ~BP_WATCHPOINT_HIT;
	2516	continue;
	2517	}
	2518	cpu->watchpoint_hit = wp;
	2519
	2520	/* Both tb_lock and iothread_mutex will be reset when
	2521	* cpu_loop_exit or cpu_loop_exit_noexc longjmp
	2522	* back into the cpu_exec main loop.
	2523	*/
	2524	tb_lock();
	2525	tb_check_watchpoint(cpu);
	2526	if (wp->flags & BP_STOP_BEFORE_ACCESS) {
	2527	cpu->exception_index = EXCP_DEBUG;
	2528	cpu_loop_exit(cpu);
	2529	} else {
	2530	/* Force execution of one insn next time. */
	2531	cpu->cflags_next_tb = 1 \| curr_cflags();
	2532	cpu_loop_exit_noexc(cpu);
	2533	}
	2534	}
	2535	} else {
	2536	wp->flags &= ~BP_WATCHPOINT_HIT;
	2537	}
	2538	}
	2539	}
	2540
	2541	/* Watchpoint access routines. Watchpoints are inserted using TLB tricks,
	2542	so these check for a hit then pass through to the normal out-of-line
	2543	phys routines. */
	2544	static MemTxResult watch_mem_read(void opaque, hwaddr addr, uint64_t pdata,
	2545	unsigned size, MemTxAttrs attrs)
	2546	{
	2547	MemTxResult res;
	2548	uint64_t data;
	2549	int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
	2550	AddressSpace *as = current_cpu->cpu_ases[asidx].as;
	2551
	2552	check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
	2553	switch (size) {
	2554	case 1:
	2555	data = address_space_ldub(as, addr, attrs, &res);
	2556	break;
	2557	case 2:
	2558	data = address_space_lduw(as, addr, attrs, &res);
	2559	break;
	2560	case 4:
	2561	data = address_space_ldl(as, addr, attrs, &res);
	2562	break;
	2563	case 8:
	2564	data = address_space_ldq(as, addr, attrs, &res);
	2565	break;
	2566	default: abort();
	2567	}
	2568	*pdata = data;
	2569	return res;
	2570	}
	2571
	2572	static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
	2573	uint64_t val, unsigned size,
	2574	MemTxAttrs attrs)
	2575	{
	2576	MemTxResult res;
	2577	int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
	2578	AddressSpace *as = current_cpu->cpu_ases[asidx].as;
	2579
	2580	check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
	2581	switch (size) {
	2582	case 1:
	2583	address_space_stb(as, addr, val, attrs, &res);
	2584	break;
	2585	case 2:
	2586	address_space_stw(as, addr, val, attrs, &res);
	2587	break;
	2588	case 4:
	2589	address_space_stl(as, addr, val, attrs, &res);
	2590	break;
	2591	case 8:
	2592	address_space_stq(as, addr, val, attrs, &res);
	2593	break;
	2594	default: abort();
	2595	}
	2596	return res;
	2597	}
	2598
	2599	static const MemoryRegionOps watch_mem_ops = {
	2600	.read_with_attrs = watch_mem_read,
	2601	.write_with_attrs = watch_mem_write,
	2602	.endianness = DEVICE_NATIVE_ENDIAN,
	2603	.valid = {
	2604	.min_access_size = 1,
	2605	.max_access_size = 8,
	2606	.unaligned = false,
	2607	},
	2608	.impl = {
	2609	.min_access_size = 1,
	2610	.max_access_size = 8,
	2611	.unaligned = false,
	2612	},
	2613	};
	2614
	2615	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	2616	const uint8_t *buf, int len);
	2617	static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
	2618	bool is_write);
	2619
	2620	static MemTxResult subpage_read(void opaque, hwaddr addr, uint64_t data,
	2621	unsigned len, MemTxAttrs attrs)
	2622	{
	2623	subpage_t *subpage = opaque;
	2624	uint8_t buf[8];
	2625	MemTxResult res;
	2626
	2627	#if defined(DEBUG_SUBPAGE)
	2628	printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
	2629	subpage, len, addr);
	2630	#endif
	2631	res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
	2632	if (res) {
	2633	return res;
	2634	}
	2635	switch (len) {
	2636	case 1:
	2637	*data = ldub_p(buf);
	2638	return MEMTX_OK;
	2639	case 2:
	2640	*data = lduw_p(buf);
	2641	return MEMTX_OK;
	2642	case 4:
	2643	*data = ldl_p(buf);
	2644	return MEMTX_OK;
	2645	case 8:
	2646	*data = ldq_p(buf);
	2647	return MEMTX_OK;
	2648	default:
	2649	abort();
	2650	}
	2651	}
	2652
	2653	static MemTxResult subpage_write(void *opaque, hwaddr addr,
	2654	uint64_t value, unsigned len, MemTxAttrs attrs)
	2655	{
	2656	subpage_t *subpage = opaque;
	2657	uint8_t buf[8];
	2658
	2659	#if defined(DEBUG_SUBPAGE)
	2660	printf("%s: subpage %p len %u addr " TARGET_FMT_plx
	2661	" value %"PRIx64"\n",
	2662	__func__, subpage, len, addr, value);
	2663	#endif
	2664	switch (len) {
	2665	case 1:
	2666	stb_p(buf, value);
	2667	break;
	2668	case 2:
	2669	stw_p(buf, value);
	2670	break;
	2671	case 4:
	2672	stl_p(buf, value);
	2673	break;
	2674	case 8:
	2675	stq_p(buf, value);
	2676	break;
	2677	default:
	2678	abort();
	2679	}
	2680	return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
	2681	}
	2682
	2683	static bool subpage_accepts(void *opaque, hwaddr addr,
	2684	unsigned len, bool is_write)
	2685	{
	2686	subpage_t *subpage = opaque;
	2687	#if defined(DEBUG_SUBPAGE)
	2688	printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
	2689	__func__, subpage, is_write ? 'w' : 'r', len, addr);
	2690	#endif
	2691
	2692	return flatview_access_valid(subpage->fv, addr + subpage->base,
	2693	len, is_write);
	2694	}
	2695
	2696	static const MemoryRegionOps subpage_ops = {
	2697	.read_with_attrs = subpage_read,
	2698	.write_with_attrs = subpage_write,
	2699	.impl.min_access_size = 1,
	2700	.impl.max_access_size = 8,
	2701	.valid.min_access_size = 1,
	2702	.valid.max_access_size = 8,
	2703	.valid.accepts = subpage_accepts,
	2704	.endianness = DEVICE_NATIVE_ENDIAN,
	2705	};
	2706
	2707	static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
	2708	uint16_t section)
	2709	{
	2710	int idx, eidx;
	2711
	2712	if (start >= TARGET_PAGE_SIZE \|\| end >= TARGET_PAGE_SIZE)
	2713	return -1;
	2714	idx = SUBPAGE_IDX(start);
	2715	eidx = SUBPAGE_IDX(end);
	2716	#if defined(DEBUG_SUBPAGE)
	2717	printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
	2718	__func__, mmio, start, end, idx, eidx, section);
	2719	#endif
	2720	for (; idx <= eidx; idx++) {
	2721	mmio->sub_section[idx] = section;
	2722	}
	2723
	2724	return 0;
	2725	}
	2726
	2727	static subpage_t subpage_init(FlatView fv, hwaddr base)
	2728	{
	2729	subpage_t *mmio;
	2730
	2731	mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
	2732	mmio->fv = fv;
	2733	mmio->base = base;
	2734	memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
	2735	NULL, TARGET_PAGE_SIZE);
	2736	mmio->iomem.subpage = true;
	2737	#if defined(DEBUG_SUBPAGE)
	2738	printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
	2739	mmio, base, TARGET_PAGE_SIZE);
	2740	#endif
	2741	subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
	2742
	2743	return mmio;
	2744	}
	2745
	2746	static uint16_t dummy_section(PhysPageMap map, FlatView fv, MemoryRegion *mr)
	2747	{
	2748	assert(fv);
	2749	MemoryRegionSection section = {
	2750	.fv = fv,
	2751	.mr = mr,
	2752	.offset_within_address_space = 0,
	2753	.offset_within_region = 0,
	2754	.size = int128_2_64(),
	2755	};
	2756
	2757	return phys_section_add(map, &section);
	2758	}
	2759
	2760	static void readonly_mem_write(void *opaque, hwaddr addr,
	2761	uint64_t val, unsigned size)
	2762	{
	2763	/* Ignore any write to ROM. */
	2764	}
	2765
	2766	static bool readonly_mem_accepts(void *opaque, hwaddr addr,
	2767	unsigned size, bool is_write)
	2768	{
	2769	return is_write;
	2770	}
	2771
	2772	/* This will only be used for writes, because reads are special cased
	2773	* to directly access the underlying host ram.
	2774	*/
	2775	static const MemoryRegionOps readonly_mem_ops = {
	2776	.write = readonly_mem_write,
	2777	.valid.accepts = readonly_mem_accepts,
	2778	.endianness = DEVICE_NATIVE_ENDIAN,
	2779	.valid = {
	2780	.min_access_size = 1,
	2781	.max_access_size = 8,
	2782	.unaligned = false,
	2783	},
	2784	.impl = {
	2785	.min_access_size = 1,
	2786	.max_access_size = 8,
	2787	.unaligned = false,
	2788	},
	2789	};
	2790
	2791	MemoryRegion iotlb_to_region(CPUState cpu, hwaddr index, MemTxAttrs attrs)
	2792	{
	2793	int asidx = cpu_asidx_from_attrs(cpu, attrs);
	2794	CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
	2795	AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
	2796	MemoryRegionSection *sections = d->map.sections;
	2797
	2798	return sections[index & ~TARGET_PAGE_MASK].mr;
	2799	}
	2800
	2801	static void io_mem_init(void)
	2802	{
	2803	memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
	2804	NULL, NULL, UINT64_MAX);
	2805	memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
	2806	NULL, UINT64_MAX);
	2807
	2808	/* io_mem_notdirty calls tb_invalidate_phys_page_fast,
	2809	* which can be called without the iothread mutex.
	2810	*/
	2811	memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
	2812	NULL, UINT64_MAX);
	2813	memory_region_clear_global_locking(&io_mem_notdirty);
	2814
	2815	memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
	2816	NULL, UINT64_MAX);
	2817	}
	2818
	2819	AddressSpaceDispatch address_space_dispatch_new(FlatView fv)
	2820	{
	2821	AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
	2822	uint16_t n;
	2823
	2824	n = dummy_section(&d->map, fv, &io_mem_unassigned);
	2825	assert(n == PHYS_SECTION_UNASSIGNED);
	2826	n = dummy_section(&d->map, fv, &io_mem_notdirty);
	2827	assert(n == PHYS_SECTION_NOTDIRTY);
	2828	n = dummy_section(&d->map, fv, &io_mem_rom);
	2829	assert(n == PHYS_SECTION_ROM);
	2830	n = dummy_section(&d->map, fv, &io_mem_watch);
	2831	assert(n == PHYS_SECTION_WATCH);
	2832
	2833	d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
	2834
	2835	return d;
	2836	}
	2837
	2838	void address_space_dispatch_free(AddressSpaceDispatch *d)
	2839	{
	2840	phys_sections_free(&d->map);
	2841	g_free(d);
	2842	}
	2843
	2844	static void tcg_commit(MemoryListener *listener)
	2845	{
	2846	CPUAddressSpace *cpuas;
	2847	AddressSpaceDispatch *d;
	2848
	2849	/* since each CPU stores ram addresses in its TLB cache, we must
	2850	reset the modified entries */
	2851	cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
	2852	cpu_reloading_memory_map();
	2853	/* The CPU and TLB are protected by the iothread lock.
	2854	* We reload the dispatch pointer now because cpu_reloading_memory_map()
	2855	* may have split the RCU critical section.
	2856	*/
	2857	d = address_space_to_dispatch(cpuas->as);
	2858	atomic_rcu_set(&cpuas->memory_dispatch, d);
	2859	tlb_flush(cpuas->cpu);
	2860	}
	2861
	2862	static void memory_map_init(void)
	2863	{
	2864	system_memory = g_malloc(sizeof(*system_memory));
	2865
	2866	memory_region_init(system_memory, NULL, "system", UINT64_MAX);
	2867	address_space_init(&address_space_memory, system_memory, "memory");
	2868
	2869	system_io = g_malloc(sizeof(*system_io));
	2870	memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
	2871	65536);
	2872	address_space_init(&address_space_io, system_io, "I/O");
	2873	}
	2874
	2875	MemoryRegion *get_system_memory(void)
	2876	{
	2877	return system_memory;
	2878	}
	2879
	2880	MemoryRegion *get_system_io(void)
	2881	{
	2882	return system_io;
	2883	}
	2884
	2885	#endif /* !defined(CONFIG_USER_ONLY) */
	2886
	2887	/* physical memory access (slow version, mainly for debug) */
	2888	#if defined(CONFIG_USER_ONLY)
	2889	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	2890	uint8_t *buf, int len, int is_write)
	2891	{
	2892	int l, flags;
	2893	target_ulong page;
	2894	void * p;
	2895
	2896	while (len > 0) {
	2897	page = addr & TARGET_PAGE_MASK;
	2898	l = (page + TARGET_PAGE_SIZE) - addr;
	2899	if (l > len)
	2900	l = len;
	2901	flags = page_get_flags(page);
	2902	if (!(flags & PAGE_VALID))
	2903	return -1;
	2904	if (is_write) {
	2905	if (!(flags & PAGE_WRITE))
	2906	return -1;
	2907	/* XXX: this code should not depend on lock_user */
	2908	if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
	2909	return -1;
	2910	memcpy(p, buf, l);
	2911	unlock_user(p, addr, l);
	2912	} else {
	2913	if (!(flags & PAGE_READ))
	2914	return -1;
	2915	/* XXX: this code should not depend on lock_user */
	2916	if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
	2917	return -1;
	2918	memcpy(buf, p, l);
	2919	unlock_user(p, addr, 0);
	2920	}
	2921	len -= l;
	2922	buf += l;
	2923	addr += l;
	2924	}
	2925	return 0;
	2926	}
	2927
	2928	#else
	2929
	2930	static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
	2931	hwaddr length)
	2932	{
	2933	uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
	2934	addr += memory_region_get_ram_addr(mr);
	2935
	2936	/* No early return if dirty_log_mask is or becomes 0, because
	2937	* cpu_physical_memory_set_dirty_range will still call
	2938	* xen_modified_memory.
	2939	*/
	2940	if (dirty_log_mask) {
	2941	dirty_log_mask =
	2942	cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
	2943	}
	2944	if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
	2945	assert(tcg_enabled());
	2946	tb_lock();
	2947	tb_invalidate_phys_range(addr, addr + length);
	2948	tb_unlock();
	2949	dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
	2950	}
	2951	cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
	2952	}
	2953
	2954	static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
	2955	{
	2956	unsigned access_size_max = mr->ops->valid.max_access_size;
	2957
	2958	/* Regions are assumed to support 1-4 byte accesses unless
	2959	otherwise specified. */
	2960	if (access_size_max == 0) {
	2961	access_size_max = 4;
	2962	}
	2963
	2964	/* Bound the maximum access by the alignment of the address. */
	2965	if (!mr->ops->impl.unaligned) {
	2966	unsigned align_size_max = addr & -addr;
	2967	if (align_size_max != 0 && align_size_max < access_size_max) {
	2968	access_size_max = align_size_max;
	2969	}
	2970	}
	2971
	2972	/* Don't attempt accesses larger than the maximum. */
	2973	if (l > access_size_max) {
	2974	l = access_size_max;
	2975	}
	2976	l = pow2floor(l);
	2977
	2978	return l;
	2979	}
	2980
	2981	static bool prepare_mmio_access(MemoryRegion *mr)
	2982	{
	2983	bool unlocked = !qemu_mutex_iothread_locked();
	2984	bool release_lock = false;
	2985
	2986	if (unlocked && mr->global_locking) {
	2987	qemu_mutex_lock_iothread();
	2988	unlocked = false;
	2989	release_lock = true;
	2990	}
	2991	if (mr->flush_coalesced_mmio) {
	2992	if (unlocked) {
	2993	qemu_mutex_lock_iothread();
	2994	}
	2995	qemu_flush_coalesced_mmio_buffer();
	2996	if (unlocked) {
	2997	qemu_mutex_unlock_iothread();
	2998	}
	2999	}
	3000
	3001	return release_lock;
	3002	}
	3003
	3004	/* Called within RCU critical section. */
	3005	static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
	3006	MemTxAttrs attrs,
	3007	const uint8_t *buf,
	3008	int len, hwaddr addr1,
	3009	hwaddr l, MemoryRegion *mr)
	3010	{
	3011	uint8_t *ptr;
	3012	uint64_t val;
	3013	MemTxResult result = MEMTX_OK;
	3014	bool release_lock = false;
	3015
	3016	for (;;) {
	3017	if (!memory_access_is_direct(mr, true)) {
	3018	release_lock \|= prepare_mmio_access(mr);
	3019	l = memory_access_size(mr, l, addr1);
	3020	/* XXX: could force current_cpu to NULL to avoid
	3021	potential bugs */
	3022	switch (l) {
	3023	case 8:
	3024	/* 64 bit write access */
	3025	val = ldq_p(buf);
	3026	result \|= memory_region_dispatch_write(mr, addr1, val, 8,
	3027	attrs);
	3028	break;
	3029	case 4:
	3030	/* 32 bit write access */
	3031	val = (uint32_t)ldl_p(buf);
	3032	result \|= memory_region_dispatch_write(mr, addr1, val, 4,
	3033	attrs);
	3034	break;
	3035	case 2:
	3036	/* 16 bit write access */
	3037	val = lduw_p(buf);
	3038	result \|= memory_region_dispatch_write(mr, addr1, val, 2,
	3039	attrs);
	3040	break;
	3041	case 1:
	3042	/* 8 bit write access */
	3043	val = ldub_p(buf);
	3044	result \|= memory_region_dispatch_write(mr, addr1, val, 1,
	3045	attrs);
	3046	break;
	3047	default:
	3048	abort();
	3049	}
	3050	} else {
	3051	/* RAM case */
	3052	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3053	memcpy(ptr, buf, l);
	3054	invalidate_and_set_dirty(mr, addr1, l);
	3055	}
	3056
	3057	if (release_lock) {
	3058	qemu_mutex_unlock_iothread();
	3059	release_lock = false;
	3060	}
	3061
	3062	len -= l;
	3063	buf += l;
	3064	addr += l;
	3065
	3066	if (!len) {
	3067	break;
	3068	}
	3069
	3070	l = len;
	3071	mr = flatview_translate(fv, addr, &addr1, &l, true);
	3072	}
	3073
	3074	return result;
	3075	}
	3076
	3077	static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	3078	const uint8_t *buf, int len)
	3079	{
	3080	hwaddr l;
	3081	hwaddr addr1;
	3082	MemoryRegion *mr;
	3083	MemTxResult result = MEMTX_OK;
	3084
	3085	if (len > 0) {
	3086	rcu_read_lock();
	3087	l = len;
	3088	mr = flatview_translate(fv, addr, &addr1, &l, true);
	3089	result = flatview_write_continue(fv, addr, attrs, buf, len,
	3090	addr1, l, mr);
	3091	rcu_read_unlock();
	3092	}
	3093
	3094	return result;
	3095	}
	3096
	3097	MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
	3098	MemTxAttrs attrs,
	3099	const uint8_t *buf, int len)
	3100	{
	3101	return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len);
	3102	}
	3103
	3104	/* Called within RCU critical section. */
	3105	MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
	3106	MemTxAttrs attrs, uint8_t *buf,
	3107	int len, hwaddr addr1, hwaddr l,
	3108	MemoryRegion *mr)
	3109	{
	3110	uint8_t *ptr;
	3111	uint64_t val;
	3112	MemTxResult result = MEMTX_OK;
	3113	bool release_lock = false;
	3114
	3115	for (;;) {
	3116	if (!memory_access_is_direct(mr, false)) {
	3117	/* I/O case */
	3118	release_lock \|= prepare_mmio_access(mr);
	3119	l = memory_access_size(mr, l, addr1);
	3120	switch (l) {
	3121	case 8:
	3122	/* 64 bit read access */
	3123	result \|= memory_region_dispatch_read(mr, addr1, &val, 8,
	3124	attrs);
	3125	stq_p(buf, val);
	3126	break;
	3127	case 4:
	3128	/* 32 bit read access */
	3129	result \|= memory_region_dispatch_read(mr, addr1, &val, 4,
	3130	attrs);
	3131	stl_p(buf, val);
	3132	break;
	3133	case 2:
	3134	/* 16 bit read access */
	3135	result \|= memory_region_dispatch_read(mr, addr1, &val, 2,
	3136	attrs);
	3137	stw_p(buf, val);
	3138	break;
	3139	case 1:
	3140	/* 8 bit read access */
	3141	result \|= memory_region_dispatch_read(mr, addr1, &val, 1,
	3142	attrs);
	3143	stb_p(buf, val);
	3144	break;
	3145	default:
	3146	abort();
	3147	}
	3148	} else {
	3149	/* RAM case */
	3150	ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
	3151	memcpy(buf, ptr, l);
	3152	}
	3153
	3154	if (release_lock) {
	3155	qemu_mutex_unlock_iothread();
	3156	release_lock = false;
	3157	}
	3158
	3159	len -= l;
	3160	buf += l;
	3161	addr += l;
	3162
	3163	if (!len) {
	3164	break;
	3165	}
	3166
	3167	l = len;
	3168	mr = flatview_translate(fv, addr, &addr1, &l, false);
	3169	}
	3170
	3171	return result;
	3172	}
	3173
	3174	MemTxResult flatview_read_full(FlatView *fv, hwaddr addr,
	3175	MemTxAttrs attrs, uint8_t *buf, int len)
	3176	{
	3177	hwaddr l;
	3178	hwaddr addr1;
	3179	MemoryRegion *mr;
	3180	MemTxResult result = MEMTX_OK;
	3181
	3182	if (len > 0) {
	3183	rcu_read_lock();
	3184	l = len;
	3185	mr = flatview_translate(fv, addr, &addr1, &l, false);
	3186	result = flatview_read_continue(fv, addr, attrs, buf, len,
	3187	addr1, l, mr);
	3188	rcu_read_unlock();
	3189	}
	3190
	3191	return result;
	3192	}
	3193
	3194	static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
	3195	uint8_t *buf, int len, bool is_write)
	3196	{
	3197	if (is_write) {
	3198	return flatview_write(fv, addr, attrs, (uint8_t *)buf, len);
	3199	} else {
	3200	return flatview_read(fv, addr, attrs, (uint8_t *)buf, len);
	3201	}
	3202	}
	3203
	3204	MemTxResult address_space_rw(AddressSpace *as, hwaddr addr,
	3205	MemTxAttrs attrs, uint8_t *buf,
	3206	int len, bool is_write)
	3207	{
	3208	return flatview_rw(address_space_to_flatview(as),
	3209	addr, attrs, buf, len, is_write);
	3210	}
	3211
	3212	void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
	3213	int len, int is_write)
	3214	{
	3215	address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
	3216	buf, len, is_write);
	3217	}
	3218
	3219	enum write_rom_type {
	3220	WRITE_DATA,
	3221	FLUSH_CACHE,
	3222	};
	3223
	3224	static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
	3225	hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
	3226	{
	3227	hwaddr l;
	3228	uint8_t *ptr;
	3229	hwaddr addr1;
	3230	MemoryRegion *mr;
	3231
	3232	rcu_read_lock();
	3233	while (len > 0) {
	3234	l = len;
	3235	mr = address_space_translate(as, addr, &addr1, &l, true);
	3236
	3237	if (!(memory_region_is_ram(mr) \|\|
	3238	memory_region_is_romd(mr))) {
	3239	l = memory_access_size(mr, l, addr1);
	3240	} else {
	3241	/* ROM/RAM case */
	3242	ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
	3243	switch (type) {
	3244	case WRITE_DATA:
	3245	memcpy(ptr, buf, l);
	3246	invalidate_and_set_dirty(mr, addr1, l);
	3247	break;
	3248	case FLUSH_CACHE:
	3249	flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
	3250	break;
	3251	}
	3252	}
	3253	len -= l;
	3254	buf += l;
	3255	addr += l;
	3256	}
	3257	rcu_read_unlock();
	3258	}
	3259
	3260	/* used for ROM loading : can write in RAM and ROM */
	3261	void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
	3262	const uint8_t *buf, int len)
	3263	{
	3264	cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
	3265	}
	3266
	3267	void cpu_flush_icache_range(hwaddr start, int len)
	3268	{
	3269	/*
	3270	* This function should do the same thing as an icache flush that was
	3271	* triggered from within the guest. For TCG we are always cache coherent,
	3272	* so there is no need to flush anything. For KVM / Xen we need to flush
	3273	* the host's instruction cache at least.
	3274	*/
	3275	if (tcg_enabled()) {
	3276	return;
	3277	}
	3278
	3279	cpu_physical_memory_write_rom_internal(&address_space_memory,
	3280	start, NULL, len, FLUSH_CACHE);
	3281	}
	3282
	3283	typedef struct {
	3284	MemoryRegion *mr;
	3285	void *buffer;
	3286	hwaddr addr;
	3287	hwaddr len;
	3288	bool in_use;
	3289	} BounceBuffer;
	3290
	3291	static BounceBuffer bounce;
	3292
	3293	typedef struct MapClient {
	3294	QEMUBH *bh;
	3295	QLIST_ENTRY(MapClient) link;
	3296	} MapClient;
	3297
	3298	QemuMutex map_client_list_lock;
	3299	static QLIST_HEAD(map_client_list, MapClient) map_client_list
	3300	= QLIST_HEAD_INITIALIZER(map_client_list);
	3301
	3302	static void cpu_unregister_map_client_do(MapClient *client)
	3303	{
	3304	QLIST_REMOVE(client, link);
	3305	g_free(client);
	3306	}
	3307
	3308	static void cpu_notify_map_clients_locked(void)
	3309	{
	3310	MapClient *client;
	3311
	3312	while (!QLIST_EMPTY(&map_client_list)) {
	3313	client = QLIST_FIRST(&map_client_list);
	3314	qemu_bh_schedule(client->bh);
	3315	cpu_unregister_map_client_do(client);
	3316	}
	3317	}
	3318
	3319	void cpu_register_map_client(QEMUBH *bh)
	3320	{
	3321	MapClient client = g_malloc(sizeof(client));
	3322
	3323	qemu_mutex_lock(&map_client_list_lock);
	3324	client->bh = bh;
	3325	QLIST_INSERT_HEAD(&map_client_list, client, link);
	3326	if (!atomic_read(&bounce.in_use)) {
	3327	cpu_notify_map_clients_locked();
	3328	}
	3329	qemu_mutex_unlock(&map_client_list_lock);
	3330	}
	3331
	3332	void cpu_exec_init_all(void)
	3333	{
	3334	qemu_mutex_init(&ram_list.mutex);
	3335	/* The data structures we set up here depend on knowing the page size,
	3336	* so no more changes can be made after this point.
	3337	* In an ideal world, nothing we did before we had finished the
	3338	* machine setup would care about the target page size, and we could
	3339	* do this much later, rather than requiring board models to state
	3340	* up front what their requirements are.
	3341	*/
	3342	finalize_target_page_bits();
	3343	io_mem_init();
	3344	memory_map_init();
	3345	qemu_mutex_init(&map_client_list_lock);
	3346	}
	3347
	3348	void cpu_unregister_map_client(QEMUBH *bh)
	3349	{
	3350	MapClient *client;
	3351
	3352	qemu_mutex_lock(&map_client_list_lock);
	3353	QLIST_FOREACH(client, &map_client_list, link) {
	3354	if (client->bh == bh) {
	3355	cpu_unregister_map_client_do(client);
	3356	break;
	3357	}
	3358	}
	3359	qemu_mutex_unlock(&map_client_list_lock);
	3360	}
	3361
	3362	static void cpu_notify_map_clients(void)
	3363	{
	3364	qemu_mutex_lock(&map_client_list_lock);
	3365	cpu_notify_map_clients_locked();
	3366	qemu_mutex_unlock(&map_client_list_lock);
	3367	}
	3368
	3369	static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
	3370	bool is_write)
	3371	{
	3372	MemoryRegion *mr;
	3373	hwaddr l, xlat;
	3374
	3375	rcu_read_lock();
	3376	while (len > 0) {
	3377	l = len;
	3378	mr = flatview_translate(fv, addr, &xlat, &l, is_write);
	3379	if (!memory_access_is_direct(mr, is_write)) {
	3380	l = memory_access_size(mr, l, addr);
	3381	if (!memory_region_access_valid(mr, xlat, l, is_write)) {
	3382	rcu_read_unlock();
	3383	return false;
	3384	}
	3385	}
	3386
	3387	len -= l;
	3388	addr += l;
	3389	}
	3390	rcu_read_unlock();
	3391	return true;
	3392	}
	3393
	3394	bool address_space_access_valid(AddressSpace *as, hwaddr addr,
	3395	int len, bool is_write)
	3396	{
	3397	return flatview_access_valid(address_space_to_flatview(as),
	3398	addr, len, is_write);
	3399	}
	3400
	3401	static hwaddr
	3402	flatview_extend_translation(FlatView *fv, hwaddr addr,
	3403	hwaddr target_len,
	3404	MemoryRegion *mr, hwaddr base, hwaddr len,
	3405	bool is_write)
	3406	{
	3407	hwaddr done = 0;
	3408	hwaddr xlat;
	3409	MemoryRegion *this_mr;
	3410
	3411	for (;;) {
	3412	target_len -= len;
	3413	addr += len;
	3414	done += len;
	3415	if (target_len == 0) {
	3416	return done;
	3417	}
	3418
	3419	len = target_len;
	3420	this_mr = flatview_translate(fv, addr, &xlat,
	3421	&len, is_write);
	3422	if (this_mr != mr \|\| xlat != base + done) {
	3423	return done;
	3424	}
	3425	}
	3426	}
	3427
	3428	/* Map a physical memory region into a host virtual address.
	3429	* May map a subset of the requested range, given by and returned in *plen.
	3430	* May return NULL if resources needed to perform the mapping are exhausted.
	3431	* Use only for reads OR writes - not for read-modify-write operations.
	3432	* Use cpu_register_map_client() to know when retrying the map operation is
	3433	* likely to succeed.
	3434	*/
	3435	void address_space_map(AddressSpace as,
	3436	hwaddr addr,
	3437	hwaddr *plen,
	3438	bool is_write)
	3439	{
	3440	hwaddr len = *plen;
	3441	hwaddr l, xlat;
	3442	MemoryRegion *mr;
	3443	void *ptr;
	3444	FlatView *fv = address_space_to_flatview(as);
	3445
	3446	if (len == 0) {
	3447	return NULL;
	3448	}
	3449
	3450	l = len;
	3451	rcu_read_lock();
	3452	mr = flatview_translate(fv, addr, &xlat, &l, is_write);
	3453
	3454	if (!memory_access_is_direct(mr, is_write)) {
	3455	if (atomic_xchg(&bounce.in_use, true)) {
	3456	rcu_read_unlock();
	3457	return NULL;
	3458	}
	3459	/* Avoid unbounded allocations */
	3460	l = MIN(l, TARGET_PAGE_SIZE);
	3461	bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
	3462	bounce.addr = addr;
	3463	bounce.len = l;
	3464
	3465	memory_region_ref(mr);
	3466	bounce.mr = mr;
	3467	if (!is_write) {
	3468	flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
	3469	bounce.buffer, l);
	3470	}
	3471
	3472	rcu_read_unlock();
	3473	*plen = l;
	3474	return bounce.buffer;
	3475	}
	3476
	3477
	3478	memory_region_ref(mr);
	3479	*plen = flatview_extend_translation(fv, addr, len, mr, xlat,
	3480	l, is_write);
	3481	ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
	3482	rcu_read_unlock();
	3483
	3484	return ptr;
	3485	}
	3486
	3487	/* Unmaps a memory region previously mapped by address_space_map().
	3488	* Will also mark the memory as dirty if is_write == 1. access_len gives
	3489	* the amount of memory that was actually read or written by the caller.
	3490	*/
	3491	void address_space_unmap(AddressSpace as, void buffer, hwaddr len,
	3492	int is_write, hwaddr access_len)
	3493	{
	3494	if (buffer != bounce.buffer) {
	3495	MemoryRegion *mr;
	3496	ram_addr_t addr1;
	3497
	3498	mr = memory_region_from_host(buffer, &addr1);
	3499	assert(mr != NULL);
	3500	if (is_write) {
	3501	invalidate_and_set_dirty(mr, addr1, access_len);
	3502	}
	3503	if (xen_enabled()) {
	3504	xen_invalidate_map_cache_entry(buffer);
	3505	}
	3506	memory_region_unref(mr);
	3507	return;
	3508	}
	3509	if (is_write) {
	3510	address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
	3511	bounce.buffer, access_len);
	3512	}
	3513	qemu_vfree(bounce.buffer);
	3514	bounce.buffer = NULL;
	3515	memory_region_unref(bounce.mr);
	3516	atomic_mb_set(&bounce.in_use, false);
	3517	cpu_notify_map_clients();
	3518	}
	3519
	3520	void *cpu_physical_memory_map(hwaddr addr,
	3521	hwaddr *plen,
	3522	int is_write)
	3523	{
	3524	return address_space_map(&address_space_memory, addr, plen, is_write);
	3525	}
	3526
	3527	void cpu_physical_memory_unmap(void *buffer, hwaddr len,
	3528	int is_write, hwaddr access_len)
	3529	{
	3530	return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
	3531	}
	3532
	3533	#define ARG1_DECL AddressSpace *as
	3534	#define ARG1 as
	3535	#define SUFFIX
	3536	#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
	3537	#define IS_DIRECT(mr, is_write) memory_access_is_direct(mr, is_write)
	3538	#define MAP_RAM(mr, ofs) qemu_map_ram_ptr((mr)->ram_block, ofs)
	3539	#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
	3540	#define RCU_READ_LOCK(...) rcu_read_lock()
	3541	#define RCU_READ_UNLOCK(...) rcu_read_unlock()
	3542	#include "memory_ldst.inc.c"
	3543
	3544	int64_t address_space_cache_init(MemoryRegionCache *cache,
	3545	AddressSpace *as,
	3546	hwaddr addr,
	3547	hwaddr len,
	3548	bool is_write)
	3549	{
	3550	cache->len = len;
	3551	cache->as = as;
	3552	cache->xlat = addr;
	3553	return len;
	3554	}
	3555
	3556	void address_space_cache_invalidate(MemoryRegionCache *cache,
	3557	hwaddr addr,
	3558	hwaddr access_len)
	3559	{
	3560	}
	3561
	3562	void address_space_cache_destroy(MemoryRegionCache *cache)
	3563	{
	3564	cache->as = NULL;
	3565	}
	3566
	3567	#define ARG1_DECL MemoryRegionCache *cache
	3568	#define ARG1 cache
	3569	#define SUFFIX _cached
	3570	#define TRANSLATE(addr, ...) \
	3571	address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
	3572	#define IS_DIRECT(mr, is_write) true
	3573	#define MAP_RAM(mr, ofs) qemu_map_ram_ptr((mr)->ram_block, ofs)
	3574	#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
	3575	#define RCU_READ_LOCK() rcu_read_lock()
	3576	#define RCU_READ_UNLOCK() rcu_read_unlock()
	3577	#include "memory_ldst.inc.c"
	3578
	3579	/* virtual memory access for debug (includes writing to ROM) */
	3580	int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
	3581	uint8_t *buf, int len, int is_write)
	3582	{
	3583	int l;
	3584	hwaddr phys_addr;
	3585	target_ulong page;
	3586
	3587	cpu_synchronize_state(cpu);
	3588	while (len > 0) {
	3589	int asidx;
	3590	MemTxAttrs attrs;
	3591
	3592	page = addr & TARGET_PAGE_MASK;
	3593	phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
	3594	asidx = cpu_asidx_from_attrs(cpu, attrs);
	3595	/* if no physical page mapped, return an error */
	3596	if (phys_addr == -1)
	3597	return -1;
	3598	l = (page + TARGET_PAGE_SIZE) - addr;
	3599	if (l > len)
	3600	l = len;
	3601	phys_addr += (addr & ~TARGET_PAGE_MASK);
	3602	if (is_write) {
	3603	cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
	3604	phys_addr, buf, l);
	3605	} else {
	3606	address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
	3607	MEMTXATTRS_UNSPECIFIED,
	3608	buf, l, 0);
	3609	}
	3610	len -= l;
	3611	buf += l;
	3612	addr += l;
	3613	}
	3614	return 0;
	3615	}
	3616
	3617	/*
	3618	* Allows code that needs to deal with migration bitmaps etc to still be built
	3619	* target independent.
	3620	*/
	3621	size_t qemu_target_page_size(void)
	3622	{
	3623	return TARGET_PAGE_SIZE;
	3624	}
	3625
	3626	int qemu_target_page_bits(void)
	3627	{
	3628	return TARGET_PAGE_BITS;
	3629	}
	3630
	3631	int qemu_target_page_bits_min(void)
	3632	{
	3633	return TARGET_PAGE_BITS_MIN;
	3634	}
	3635	#endif
	3636
	3637	/*
	3638	* A helper function for the _utterly broken_ virtio device model to find out if
	3639	* it's running on a big endian machine. Don't do this at home kids!
	3640	*/
	3641	bool target_words_bigendian(void);
	3642	bool target_words_bigendian(void)
	3643	{
	3644	#if defined(TARGET_WORDS_BIGENDIAN)
	3645	return true;
	3646	#else
	3647	return false;
	3648	#endif
	3649	}
	3650
	3651	#ifndef CONFIG_USER_ONLY
	3652	bool cpu_physical_memory_is_io(hwaddr phys_addr)
	3653	{
	3654	MemoryRegion*mr;
	3655	hwaddr l = 1;
	3656	bool res;
	3657
	3658	rcu_read_lock();
	3659	mr = address_space_translate(&address_space_memory,
	3660	phys_addr, &phys_addr, &l, false);
	3661
	3662	res = !(memory_region_is_ram(mr) \|\| memory_region_is_romd(mr));
	3663	rcu_read_unlock();
	3664	return res;
	3665	}
	3666
	3667	int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
	3668	{
	3669	RAMBlock *block;
	3670	int ret = 0;
	3671
	3672	rcu_read_lock();
	3673	RAMBLOCK_FOREACH(block) {
	3674	ret = func(block->idstr, block->host, block->offset,
	3675	block->used_length, opaque);
	3676	if (ret) {
	3677	break;
	3678	}
	3679	}
	3680	rcu_read_unlock();
	3681	return ret;
	3682	}
	3683
	3684	/*
	3685	* Unmap pages of memory from start to start+length such that
	3686	* they a) read as 0, b) Trigger whatever fault mechanism
	3687	* the OS provides for postcopy.
	3688	* The pages must be unmapped by the end of the function.
	3689	* Returns: 0 on success, none-0 on failure
	3690	*
	3691	*/
	3692	int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
	3693	{
	3694	int ret = -1;
	3695
	3696	uint8_t *host_startaddr = rb->host + start;
	3697
	3698	if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
	3699	error_report("ram_block_discard_range: Unaligned start address: %p",
	3700	host_startaddr);
	3701	goto err;
	3702	}
	3703
	3704	if ((start + length) <= rb->used_length) {
	3705	uint8_t *host_endaddr = host_startaddr + length;
	3706	if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
	3707	error_report("ram_block_discard_range: Unaligned end address: %p",
	3708	host_endaddr);
	3709	goto err;
	3710	}
	3711
	3712	errno = ENOTSUP; /* If we are missing MADVISE etc */
	3713
	3714	if (rb->page_size == qemu_host_page_size) {
	3715	#if defined(CONFIG_MADVISE)
	3716	/* Note: We need the madvise MADV_DONTNEED behaviour of definitely
	3717	* freeing the page.
	3718	*/
	3719	ret = madvise(host_startaddr, length, MADV_DONTNEED);
	3720	#endif
	3721	} else {
	3722	/* Huge page case - unfortunately it can't do DONTNEED, but
	3723	* it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
	3724	* huge page file.
	3725	*/
	3726	#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
	3727	ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
	3728	start, length);
	3729	#endif
	3730	}
	3731	if (ret) {
	3732	ret = -errno;
	3733	error_report("ram_block_discard_range: Failed to discard range "
	3734	"%s:%" PRIx64 " +%zx (%d)",
	3735	rb->idstr, start, length, ret);
	3736	}
	3737	} else {
	3738	error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
	3739	"/%zx/" RAM_ADDR_FMT")",
	3740	rb->idstr, start, length, rb->used_length);
	3741	}
	3742
	3743	err:
	3744	return ret;
	3745	}
	3746
	3747	#endif
	3748
	3749	void page_size_init(void)
	3750	{
	3751	/* NOTE: we can always suppose that qemu_host_page_size >=
	3752	TARGET_PAGE_SIZE */
	3753	if (qemu_host_page_size == 0) {
	3754	qemu_host_page_size = qemu_real_host_page_size;
	3755	}
	3756	if (qemu_host_page_size < TARGET_PAGE_SIZE) {
	3757	qemu_host_page_size = TARGET_PAGE_SIZE;
	3758	}
	3759	qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
	3760	}
	3761
	3762	#if !defined(CONFIG_USER_ONLY)
	3763
	3764	static void mtree_print_phys_entries(fprintf_function mon, void *f,
	3765	int start, int end, int skip, int ptr)
	3766	{
	3767	if (start == end - 1) {
	3768	mon(f, "\t%3d ", start);
	3769	} else {
	3770	mon(f, "\t%3d..%-3d ", start, end - 1);
	3771	}
	3772	mon(f, " skip=%d ", skip);
	3773	if (ptr == PHYS_MAP_NODE_NIL) {
	3774	mon(f, " ptr=NIL");
	3775	} else if (!skip) {
	3776	mon(f, " ptr=#%d", ptr);
	3777	} else {
	3778	mon(f, " ptr=[%d]", ptr);
	3779	}
	3780	mon(f, "\n");
	3781	}
	3782
	3783	#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
	3784	int128_sub((size), int128_one())) : 0)
	3785
	3786	void mtree_print_dispatch(fprintf_function mon, void *f,
	3787	AddressSpaceDispatch d, MemoryRegion root)
	3788	{
	3789	int i;
	3790
	3791	mon(f, " Dispatch\n");
	3792	mon(f, " Physical sections\n");
	3793
	3794	for (i = 0; i < d->map.sections_nb; ++i) {
	3795	MemoryRegionSection *s = d->map.sections + i;
	3796	const char *names[] = { " [unassigned]", " [not dirty]",
	3797	" [ROM]", " [watch]" };
	3798
	3799	mon(f, " #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx " %s%s%s%s%s",
	3800	i,
	3801	s->offset_within_address_space,
	3802	s->offset_within_address_space + MR_SIZE(s->mr->size),
	3803	s->mr->name ? s->mr->name : "(noname)",
	3804	i < ARRAY_SIZE(names) ? names[i] : "",
	3805	s->mr == root ? " [ROOT]" : "",
	3806	s == d->mru_section ? " [MRU]" : "",
	3807	s->mr->is_iommu ? " [iommu]" : "");
	3808
	3809	if (s->mr->alias) {
	3810	mon(f, " alias=%s", s->mr->alias->name ?
	3811	s->mr->alias->name : "noname");
	3812	}
	3813	mon(f, "\n");
	3814	}
	3815
	3816	mon(f, " Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
	3817	P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
	3818	for (i = 0; i < d->map.nodes_nb; ++i) {
	3819	int j, jprev;
	3820	PhysPageEntry prev;
	3821	Node *n = d->map.nodes + i;
	3822
	3823	mon(f, " [%d]\n", i);
	3824
	3825	for (j = 0, jprev = 0, prev = n[0]; j < ARRAY_SIZE(n); ++j) {
	3826	PhysPageEntry pe = n + j;
	3827
	3828	if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
	3829	continue;
	3830	}
	3831
	3832	mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
	3833
	3834	jprev = j;
	3835	prev = *pe;
	3836	}
	3837
	3838	if (jprev != ARRAY_SIZE(*n)) {
	3839	mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
	3840	}
	3841	}
	3842	}
	3843
	3844	#endif