Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* QEMU KVM support
	3	*
	4	* Copyright IBM, Corp. 2008
	5	* Red Hat, Inc. 2008
	6	*
	7	* Authors:
	8	* Anthony Liguori <[email protected]>
	9	* Glauber Costa <[email protected]>
	10	*
	11	* This work is licensed under the terms of the GNU GPL, version 2 or later.
	12	* See the COPYING file in the top-level directory.
	13	*
	14	*/
	15
	16	#include "qemu/osdep.h"
	17	#include <sys/ioctl.h>
	18
	19	#include <linux/kvm.h>
	20
	21	#include "qemu-common.h"
	22	#include "qemu/atomic.h"
	23	#include "qemu/option.h"
	24	#include "qemu/config-file.h"
	25	#include "qemu/error-report.h"
	26	#include "hw/hw.h"
	27	#include "hw/pci/msi.h"
	28	#include "hw/pci/msix.h"
	29	#include "hw/s390x/adapter.h"
	30	#include "exec/gdbstub.h"
	31	#include "sysemu/kvm_int.h"
	32	#include "sysemu/cpus.h"
	33	#include "qemu/bswap.h"
	34	#include "exec/memory.h"
	35	#include "exec/ram_addr.h"
	36	#include "exec/address-spaces.h"
	37	#include "qemu/event_notifier.h"
	38	#include "trace-root.h"
	39	#include "hw/irq.h"
	40
	41	#include "hw/boards.h"
	42
	43	/* This check must be after config-host.h is included */
	44	#ifdef CONFIG_EVENTFD
	45	#include <sys/eventfd.h>
	46	#endif
	47
	/* Run-time host page size: every use of PAGE_SIZE expands to a getpagesize() call. */
	48	/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
	49	* need to use the real host PAGE_SIZE, as that's what KVM will use.
	50	*/
	51	#define PAGE_SIZE getpagesize()
	52
	/* Uncomment the next line to make DPRINTF() print to stderr. */
	53	//#define DEBUG_KVM
	54
	55	#ifdef DEBUG_KVM
	56	#define DPRINTF(fmt, ...) \
	57	do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
	58	#else
	/* Without DEBUG_KVM, DPRINTF() is an empty statement; its arguments are dropped. */
	59	#define DPRINTF(fmt, ...) \
	60	do { } while (0)
	61	#endif
	62
	/* Number of buckets in KVMState.msi_hashtab. */
	63	#define KVM_MSI_HASHTAB_SIZE 256
	64
	/*
	 * A vCPU recorded by kvm_destroy_vcpu() so that kvm_get_vcpu() can hand
	 * its descriptor out again instead of issuing KVM_CREATE_VCPU.
	 */
	65	struct KVMParkedVcpu {
	66	unsigned long vcpu_id; /* value of kvm_arch_vcpu_id() at park time */
	67	int kvm_fd; /* copied from cpu->kvm_fd */
	68	QLIST_ENTRY(KVMParkedVcpu) node; /* link in KVMState.kvm_parked_vcpus */
	69	};
	70
	/*
	 * Per-accelerator KVM state. A single instance is reachable through the
	 * global kvm_state pointer and through KVM_STATE(machine->accelerator).
	 */
	71	struct KVMState
	72	{
	73	AccelState parent_obj;
	74
	75	int nr_slots; /* number of entries in each KVMMemoryListener.slots array */
	76	int fd; /* NOTE(review): presumably the /dev/kvm descriptor - confirm */
	77	int vmfd; /* NOTE(review): presumably the VM descriptor - confirm */
	78	int coalesced_mmio; /* non-zero enables coalesced MMIO; also the page offset of the ring inside kvm_run */
	79	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; /* set by the first kvm_init_vcpu() */
	80	bool coalesced_flush_in_progress;
	81	int broken_set_mem_region; /* enables the slot-fragmentation workaround in kvm_set_phys_mem() */
	82	int vcpu_events;
	83	int robust_singlestep;
	84	int debugregs;
	85	#ifdef KVM_CAP_SET_GUEST_DEBUG
	86	struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
	87	#endif
	88	int many_ioeventfds;
	89	int intx_set_mask;
	90	/* The man page (and posix) say ioctl numbers are signed int, but
	91	* they're not. Linux, glibc and *BSD all treat ioctl numbers as
	92	* unsigned, and treating them as signed here can break things */
	93	unsigned irq_set_ioctl; /* request passed to kvm_vm_ioctl() by kvm_set_irq() */
	94	unsigned int sigmask_len;
	95	GHashTable *gsimap;
	96	#ifdef KVM_CAP_IRQ_ROUTING
	97	struct kvm_irq_routing *irq_routes; /* routing table handed to KVM_SET_GSI_ROUTING */
	98	int nr_allocated_irq_routes; /* capacity of irq_routes->entries */
	99	unsigned long *used_gsi_bitmap; /* one bit per GSI; set while a route uses it */
	100	unsigned int gsi_count; /* number of bits in used_gsi_bitmap */
	101	QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; /* MSI routes bucketed by kvm_hash_msi() */
	102	#endif
	103	KVMMemoryListener memory_listener;
	104	QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; /* vCPUs parked by kvm_destroy_vcpu() */
	105	};
	106
	/* Global accelerator state; most helpers in this file read it directly. */
	107	KVMState *kvm_state;
	/*
	 * Feature switches. NOTE(review): presumably set during KVM initialisation
	 * outside this excerpt - confirm.
	 */
	108	bool kvm_kernel_irqchip;
	109	bool kvm_split_irqchip;
	110	bool kvm_async_interrupts_allowed;
	111	bool kvm_halt_in_kernel_allowed;
	112	bool kvm_eventfds_allowed;
	113	bool kvm_irqfds_allowed;
	114	bool kvm_resamplefds_allowed;
	115	bool kvm_msi_via_irqfd_allowed;
	116	bool kvm_gsi_routing_allowed;
	117	bool kvm_gsi_direct_mapping;
	118	bool kvm_allowed;
	119	bool kvm_readonly_mem_allowed; /* gates use of KVM_MEM_READONLY in kvm_mem_flags() */
	120	bool kvm_vm_attributes_allowed;
	121	bool kvm_direct_msi_allowed; /* when true, the MSI route hash table is not used */
	122	bool kvm_ioeventfd_any_length_allowed;
	123	bool kvm_msi_use_devid;
	124	static bool kvm_immediate_exit;
	125
	/* Capability list terminated by KVM_CAP_LAST_INFO. */
	126	static const KVMCapabilityInfo kvm_required_capabilites[] = {
	127	KVM_CAP_INFO(USER_MEMORY),
	128	KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
	129	KVM_CAP_LAST_INFO
	130	};
	131
	132	int kvm_get_max_memslots(void)
	133	{
	134	KVMState *s = KVM_STATE(current_machine->accelerator);
	135
	136	return s->nr_slots;
	137	}
	138
	139	static KVMSlot kvm_get_free_slot(KVMMemoryListener kml)
	140	{
	141	KVMState *s = kvm_state;
	142	int i;
	143
	144	for (i = 0; i < s->nr_slots; i++) {
	145	if (kml->slots[i].memory_size == 0) {
	146	return &kml->slots[i];
	147	}
	148	}
	149
	150	return NULL;
	151	}
	152
	153	bool kvm_has_free_slot(MachineState *ms)
	154	{
	155	KVMState *s = KVM_STATE(ms->accelerator);
	156
	157	return kvm_get_free_slot(&s->memory_listener);
	158	}
	159
	160	static KVMSlot kvm_alloc_slot(KVMMemoryListener kml)
	161	{
	162	KVMSlot *slot = kvm_get_free_slot(kml);
	163
	164	if (slot) {
	165	return slot;
	166	}
	167
	168	fprintf(stderr, "%s: no free slot available\n", __func__);
	169	abort();
	170	}
	171
	172	static KVMSlot kvm_lookup_matching_slot(KVMMemoryListener kml,
	173	hwaddr start_addr,
	174	hwaddr end_addr)
	175	{
	176	KVMState *s = kvm_state;
	177	int i;
	178
	179	for (i = 0; i < s->nr_slots; i++) {
	180	KVMSlot *mem = &kml->slots[i];
	181
	182	if (start_addr == mem->start_addr &&
	183	end_addr == mem->start_addr + mem->memory_size) {
	184	return mem;
	185	}
	186	}
	187
	188	return NULL;
	189	}
	190
	191	/*
	192	* Find overlapping slot with lowest start address
	193	*/
	194	static KVMSlot kvm_lookup_overlapping_slot(KVMMemoryListener kml,
	195	hwaddr start_addr,
	196	hwaddr end_addr)
	197	{
	198	KVMState *s = kvm_state;
	199	KVMSlot *found = NULL;
	200	int i;
	201
	202	for (i = 0; i < s->nr_slots; i++) {
	203	KVMSlot *mem = &kml->slots[i];
	204
	205	if (mem->memory_size == 0 \|\|
	206	(found && found->start_addr < mem->start_addr)) {
	207	continue;
	208	}
	209
	210	if (end_addr > mem->start_addr &&
	211	start_addr < mem->start_addr + mem->memory_size) {
	212	found = mem;
	213	}
	214	}
	215
	216	return found;
	217	}
	218
	219	int kvm_physical_memory_addr_from_host(KVMState s, void ram,
	220	hwaddr *phys_addr)
	221	{
	222	KVMMemoryListener *kml = &s->memory_listener;
	223	int i;
	224
	225	for (i = 0; i < s->nr_slots; i++) {
	226	KVMSlot *mem = &kml->slots[i];
	227
	228	if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
	229	*phys_addr = mem->start_addr + (ram - mem->ram);
	230	return 1;
	231	}
	232	}
	233
	234	return 0;
	235	}
	236
	237	static int kvm_set_user_memory_region(KVMMemoryListener kml, KVMSlot slot)
	238	{
	239	KVMState *s = kvm_state;
	240	struct kvm_userspace_memory_region mem;
	241
	242	mem.slot = slot->slot \| (kml->as_id << 16);
	243	mem.guest_phys_addr = slot->start_addr;
	244	mem.userspace_addr = (unsigned long)slot->ram;
	245	mem.flags = slot->flags;
	246
	247	if (slot->memory_size && mem.flags & KVM_MEM_READONLY) {
	248	/* Set the slot size to 0 before setting the slot to the desired
	249	* value. This is needed based on KVM commit 75d61fbc. */
	250	mem.memory_size = 0;
	251	kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
	252	}
	253	mem.memory_size = slot->memory_size;
	254	return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
	255	}
	256
	257	int kvm_destroy_vcpu(CPUState *cpu)
	258	{
	259	KVMState *s = kvm_state;
	260	long mmap_size;
	261	struct KVMParkedVcpu *vcpu = NULL;
	262	int ret = 0;
	263
	264	DPRINTF("kvm_destroy_vcpu\n");
	265
	266	mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
	267	if (mmap_size < 0) {
	268	ret = mmap_size;
	269	DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
	270	goto err;
	271	}
	272
	273	ret = munmap(cpu->kvm_run, mmap_size);
	274	if (ret < 0) {
	275	goto err;
	276	}
	277
	278	vcpu = g_malloc0(sizeof(*vcpu));
	279	vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
	280	vcpu->kvm_fd = cpu->kvm_fd;
	281	QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
	282	err:
	283	return ret;
	284	}
	285
	286	static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
	287	{
	288	struct KVMParkedVcpu *cpu;
	289
	290	QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
	291	if (cpu->vcpu_id == vcpu_id) {
	292	int kvm_fd;
	293
	294	QLIST_REMOVE(cpu, node);
	295	kvm_fd = cpu->kvm_fd;
	296	g_free(cpu);
	297	return kvm_fd;
	298	}
	299	}
	300
	301	return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
	302	}
	303
	304	int kvm_init_vcpu(CPUState *cpu)
	305	{
	306	KVMState *s = kvm_state;
	307	long mmap_size;
	308	int ret;
	309
	310	DPRINTF("kvm_init_vcpu\n");
	311
	312	ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
	313	if (ret < 0) {
	314	DPRINTF("kvm_create_vcpu failed\n");
	315	goto err;
	316	}
	317
	318	cpu->kvm_fd = ret;
	319	cpu->kvm_state = s;
	320	cpu->kvm_vcpu_dirty = true;
	321
	322	mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
	323	if (mmap_size < 0) {
	324	ret = mmap_size;
	325	DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
	326	goto err;
	327	}
	328
	329	cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ \| PROT_WRITE, MAP_SHARED,
	330	cpu->kvm_fd, 0);
	331	if (cpu->kvm_run == MAP_FAILED) {
	332	ret = -errno;
	333	DPRINTF("mmap'ing vcpu state failed\n");
	334	goto err;
	335	}
	336
	337	if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
	338	s->coalesced_mmio_ring =
	339	(void )cpu->kvm_run + s->coalesced_mmio PAGE_SIZE;
	340	}
	341
	342	ret = kvm_arch_init_vcpu(cpu);
	343	err:
	344	return ret;
	345	}
	346
	347	/*
	348	* dirty pages logging control
	349	*/
	350
	351	static int kvm_mem_flags(MemoryRegion *mr)
	352	{
	353	bool readonly = mr->readonly \|\| memory_region_is_romd(mr);
	354	int flags = 0;
	355
	356	if (memory_region_get_dirty_log_mask(mr) != 0) {
	357	flags \|= KVM_MEM_LOG_DIRTY_PAGES;
	358	}
	359	if (readonly && kvm_readonly_mem_allowed) {
	360	flags \|= KVM_MEM_READONLY;
	361	}
	362	return flags;
	363	}
	364
	365	static int kvm_slot_update_flags(KVMMemoryListener kml, KVMSlot mem,
	366	MemoryRegion *mr)
	367	{
	368	int old_flags;
	369
	370	old_flags = mem->flags;
	371	mem->flags = kvm_mem_flags(mr);
	372
	373	/* If nothing changed effectively, no need to issue ioctl */
	374	if (mem->flags == old_flags) {
	375	return 0;
	376	}
	377
	378	return kvm_set_user_memory_region(kml, mem);
	379	}
	380
	381	static int kvm_section_update_flags(KVMMemoryListener *kml,
	382	MemoryRegionSection *section)
	383	{
	384	hwaddr phys_addr = section->offset_within_address_space;
	385	ram_addr_t size = int128_get64(section->size);
	386	KVMSlot *mem = kvm_lookup_matching_slot(kml, phys_addr, phys_addr + size);
	387
	388	if (mem == NULL) {
	389	return 0;
	390	} else {
	391	return kvm_slot_update_flags(kml, mem, section->mr);
	392	}
	393	}
	394
	395	static void kvm_log_start(MemoryListener *listener,
	396	MemoryRegionSection *section,
	397	int old, int new)
	398	{
	399	KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
	400	int r;
	401
	402	if (old != 0) {
	403	return;
	404	}
	405
	406	r = kvm_section_update_flags(kml, section);
	407	if (r < 0) {
	408	abort();
	409	}
	410	}
	411
	412	static void kvm_log_stop(MemoryListener *listener,
	413	MemoryRegionSection *section,
	414	int old, int new)
	415	{
	416	KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
	417	int r;
	418
	419	if (new != 0) {
	420	return;
	421	}
	422
	423	r = kvm_section_update_flags(kml, section);
	424	if (r < 0) {
	425	abort();
	426	}
	427	}
	428
	429	/* get kvm's dirty pages bitmap and update qemu's */
	430	static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
	431	unsigned long *bitmap)
	432	{
	433	ram_addr_t start = section->offset_within_region +
	434	memory_region_get_ram_addr(section->mr);
	435	ram_addr_t pages = int128_get64(section->size) / getpagesize();
	436
	437	cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
	438	return 0;
	439	}
	440
	/* Round x up to the next multiple of y; y must be a power of two. */
	#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
	442
	443	/**
	444	* kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
	445	* This function updates qemu's dirty bitmap using
	446	* memory_region_set_dirty(). This means all bits are set
	447	* to dirty.
	448	*
	449	* @start_add: start of logged region.
	450	* @end_addr: end of logged region.
	451	*/
	452	static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
	453	MemoryRegionSection *section)
	454	{
	455	KVMState *s = kvm_state;
	456	unsigned long size, allocated_size = 0;
	457	struct kvm_dirty_log d = {};
	458	KVMSlot *mem;
	459	int ret = 0;
	460	hwaddr start_addr = section->offset_within_address_space;
	461	hwaddr end_addr = start_addr + int128_get64(section->size);
	462
	463	d.dirty_bitmap = NULL;
	464	while (start_addr < end_addr) {
	465	mem = kvm_lookup_overlapping_slot(kml, start_addr, end_addr);
	466	if (mem == NULL) {
	467	break;
	468	}
	469
	470	/* XXX bad kernel interface alert
	471	* For dirty bitmap, kernel allocates array of size aligned to
	472	* bits-per-long. But for case when the kernel is 64bits and
	473	* the userspace is 32bits, userspace can't align to the same
	474	* bits-per-long, since sizeof(long) is different between kernel
	475	* and user space. This way, userspace will provide buffer which
	476	* may be 4 bytes less than the kernel will use, resulting in
	477	* userspace memory corruption (which is not detectable by valgrind
	478	* too, in most cases).
	479	* So for now, let's align to 64 instead of HOST_LONG_BITS here, in
	480	* a hope that sizeof(long) won't become >8 any time soon.
	481	*/
	482	size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
	483	/HOST_LONG_BITS/ 64) / 8;
	484	if (!d.dirty_bitmap) {
	485	d.dirty_bitmap = g_malloc(size);
	486	} else if (size > allocated_size) {
	487	d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
	488	}
	489	allocated_size = size;
	490	memset(d.dirty_bitmap, 0, allocated_size);
	491
	492	d.slot = mem->slot \| (kml->as_id << 16);
	493	if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
	494	DPRINTF("ioctl failed %d\n", errno);
	495	ret = -1;
	496	break;
	497	}
	498
	499	kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
	500	start_addr = mem->start_addr + mem->memory_size;
	501	}
	502	g_free(d.dirty_bitmap);
	503
	504	return ret;
	505	}
	506
	507	static void kvm_coalesce_mmio_region(MemoryListener *listener,
	508	MemoryRegionSection *secion,
	509	hwaddr start, hwaddr size)
	510	{
	511	KVMState *s = kvm_state;
	512
	513	if (s->coalesced_mmio) {
	514	struct kvm_coalesced_mmio_zone zone;
	515
	516	zone.addr = start;
	517	zone.size = size;
	518	zone.pad = 0;
	519
	520	(void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
	521	}
	522	}
	523
	524	static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
	525	MemoryRegionSection *secion,
	526	hwaddr start, hwaddr size)
	527	{
	528	KVMState *s = kvm_state;
	529
	530	if (s->coalesced_mmio) {
	531	struct kvm_coalesced_mmio_zone zone;
	532
	533	zone.addr = start;
	534	zone.size = size;
	535	zone.pad = 0;
	536
	537	(void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
	538	}
	539	}
	540
	541	int kvm_check_extension(KVMState *s, unsigned int extension)
	542	{
	543	int ret;
	544
	545	ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
	546	if (ret < 0) {
	547	ret = 0;
	548	}
	549
	550	return ret;
	551	}
	552
	553	int kvm_vm_check_extension(KVMState *s, unsigned int extension)
	554	{
	555	int ret;
	556
	557	ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
	558	if (ret < 0) {
	559	/* VM wide version not implemented, use global one instead */
	560	ret = kvm_check_extension(s, extension);
	561	}
	562
	563	return ret;
	564	}
	565
	/*
	 * Convert an ioeventfd match value from target to host byte order.
	 *
	 * When exactly one of HOST_WORDS_BIGENDIAN and TARGET_WORDS_BIGENDIAN is
	 * defined, 2- and 4-byte values are byte-swapped. In every other case
	 * @val is returned unchanged.
	 */
	static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
	{
	#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
	    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
	     * endianness, but the memory core hands them in target endianness.
	     * For example, PPC is always treated as big-endian even if running
	     * on KVM and on PPC64LE. Correct here.
	     */
	    if (size == 2) {
	        val = bswap16(val);
	    } else if (size == 4) {
	        val = bswap32(val);
	    }
	#endif
	    return val;
	}
	585
	586	static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
	587	bool assign, uint32_t size, bool datamatch)
	588	{
	589	int ret;
	590	struct kvm_ioeventfd iofd = {
	591	.datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
	592	.addr = addr,
	593	.len = size,
	594	.flags = 0,
	595	.fd = fd,
	596	};
	597
	598	if (!kvm_enabled()) {
	599	return -ENOSYS;
	600	}
	601
	602	if (datamatch) {
	603	iofd.flags \|= KVM_IOEVENTFD_FLAG_DATAMATCH;
	604	}
	605	if (!assign) {
	606	iofd.flags \|= KVM_IOEVENTFD_FLAG_DEASSIGN;
	607	}
	608
	609	ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
	610
	611	if (ret < 0) {
	612	return -errno;
	613	}
	614
	615	return 0;
	616	}
	617
	618	static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
	619	bool assign, uint32_t size, bool datamatch)
	620	{
	621	struct kvm_ioeventfd kick = {
	622	.datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
	623	.addr = addr,
	624	.flags = KVM_IOEVENTFD_FLAG_PIO,
	625	.len = size,
	626	.fd = fd,
	627	};
	628	int r;
	629	if (!kvm_enabled()) {
	630	return -ENOSYS;
	631	}
	632	if (datamatch) {
	633	kick.flags \|= KVM_IOEVENTFD_FLAG_DATAMATCH;
	634	}
	635	if (!assign) {
	636	kick.flags \|= KVM_IOEVENTFD_FLAG_DEASSIGN;
	637	}
	638	r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
	639	if (r < 0) {
	640	return r;
	641	}
	642	return 0;
	643	}
	644
	645
	/*
	 * Probe whether the host accepts more than six ioeventfds.
	 *
	 * Returns 1 when seven probe ioeventfds could all be assigned, 0
	 * otherwise (always 0 without CONFIG_EVENTFD). Every probe descriptor is
	 * deassigned and closed before returning.
	 */
	static int kvm_check_many_ioeventfds(void)
	{
	    /* Userspace can use ioeventfd for io notification. This requires a host
	     * that supports eventfd(2) and an I/O thread; since eventfd does not
	     * support SIGIO it cannot interrupt the vcpu.
	     *
	     * Older kernels have a 6 device limit on the KVM io bus. Find out so we
	     * can avoid creating too many ioeventfds.
	     */
	#if !defined(CONFIG_EVENTFD)
	    return 0;
	#else
	    int probes[7];
	    int count = 0;
	    int all_assigned;

	    while (count < ARRAY_SIZE(probes)) {
	        probes[count] = eventfd(0, EFD_CLOEXEC);
	        if (probes[count] < 0) {
	            break;
	        }
	        if (kvm_set_ioeventfd_pio(probes[count], 0, count, true, 2, true) < 0) {
	            close(probes[count]);
	            break;
	        }
	        count++;
	    }

	    /* Decide whether many devices are supported or not */
	    all_assigned = (count == ARRAY_SIZE(probes));

	    /* Undo the probes in reverse order of creation. */
	    for (count = count - 1; count >= 0; count--) {
	        kvm_set_ioeventfd_pio(probes[count], 0, count, false, 2, true);
	        close(probes[count]);
	    }
	    return all_assigned;
	#endif
	}
	682
	683	static const KVMCapabilityInfo *
	684	kvm_check_extension_list(KVMState s, const KVMCapabilityInfo list)
	685	{
	686	while (list->name) {
	687	if (!kvm_check_extension(s, list->value)) {
	688	return list;
	689	}
	690	list++;
	691	}
	692	return NULL;
	693	}
	694
	695	static void kvm_set_phys_mem(KVMMemoryListener *kml,
	696	MemoryRegionSection *section, bool add)
	697	{
	698	KVMState *s = kvm_state;
	699	KVMSlot *mem, old;
	700	int err;
	701	MemoryRegion *mr = section->mr;
	702	bool writeable = !mr->readonly && !mr->rom_device;
	703	hwaddr start_addr = section->offset_within_address_space;
	704	ram_addr_t size = int128_get64(section->size);
	705	void *ram = NULL;
	706	unsigned delta;
	707
	708	/* kvm works in page size chunks, but the function may be called
	709	with sub-page size and unaligned start address. Pad the start
	710	address to next and truncate size to previous page boundary. */
	711	delta = qemu_real_host_page_size - (start_addr & ~qemu_real_host_page_mask);
	712	delta &= ~qemu_real_host_page_mask;
	713	if (delta > size) {
	714	return;
	715	}
	716	start_addr += delta;
	717	size -= delta;
	718	size &= qemu_real_host_page_mask;
	719	if (!size \|\| (start_addr & ~qemu_real_host_page_mask)) {
	720	return;
	721	}
	722
	723	if (!memory_region_is_ram(mr)) {
	724	if (writeable \|\| !kvm_readonly_mem_allowed) {
	725	return;
	726	} else if (!mr->romd_mode) {
	727	/* If the memory device is not in romd_mode, then we actually want
	728	* to remove the kvm memory slot so all accesses will trap. */
	729	add = false;
	730	}
	731	}
	732
	733	ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta;
	734
	735	while (1) {
	736	mem = kvm_lookup_overlapping_slot(kml, start_addr, start_addr + size);
	737	if (!mem) {
	738	break;
	739	}
	740
	741	if (add && start_addr >= mem->start_addr &&
	742	(start_addr + size <= mem->start_addr + mem->memory_size) &&
	743	(ram - start_addr == mem->ram - mem->start_addr)) {
	744	/* The new slot fits into the existing one and comes with
	745	* identical parameters - update flags and done. */
	746	kvm_slot_update_flags(kml, mem, mr);
	747	return;
	748	}
	749
	750	old = *mem;
	751
	752	if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
	753	kvm_physical_sync_dirty_bitmap(kml, section);
	754	}
	755
	756	/* unregister the overlapping slot */
	757	mem->memory_size = 0;
	758	err = kvm_set_user_memory_region(kml, mem);
	759	if (err) {
	760	fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
	761	__func__, strerror(-err));
	762	abort();
	763	}
	764
	765	/* Workaround for older KVM versions: we can't join slots, even not by
	766	* unregistering the previous ones and then registering the larger
	767	* slot. We have to maintain the existing fragmentation. Sigh.
	768	*
	769	* This workaround assumes that the new slot starts at the same
	770	* address as the first existing one. If not or if some overlapping
	771	* slot comes around later, we will fail (not seen in practice so far)
	772	* - and actually require a recent KVM version. */
	773	if (s->broken_set_mem_region &&
	774	old.start_addr == start_addr && old.memory_size < size && add) {
	775	mem = kvm_alloc_slot(kml);
	776	mem->memory_size = old.memory_size;
	777	mem->start_addr = old.start_addr;
	778	mem->ram = old.ram;
	779	mem->flags = kvm_mem_flags(mr);
	780
	781	err = kvm_set_user_memory_region(kml, mem);
	782	if (err) {
	783	fprintf(stderr, "%s: error updating slot: %s\n", __func__,
	784	strerror(-err));
	785	abort();
	786	}
	787
	788	start_addr += old.memory_size;
	789	ram += old.memory_size;
	790	size -= old.memory_size;
	791	continue;
	792	}
	793
	794	/* register prefix slot */
	795	if (old.start_addr < start_addr) {
	796	mem = kvm_alloc_slot(kml);
	797	mem->memory_size = start_addr - old.start_addr;
	798	mem->start_addr = old.start_addr;
	799	mem->ram = old.ram;
	800	mem->flags = kvm_mem_flags(mr);
	801
	802	err = kvm_set_user_memory_region(kml, mem);
	803	if (err) {
	804	fprintf(stderr, "%s: error registering prefix slot: %s\n",
	805	__func__, strerror(-err));
	806	#ifdef TARGET_PPC
	807	fprintf(stderr, "%s: This is probably because your kernel's " \
	808	"PAGE_SIZE is too big. Please try to use 4k " \
	809	"PAGE_SIZE!\n", __func__);
	810	#endif
	811	abort();
	812	}
	813	}
	814
	815	/* register suffix slot */
	816	if (old.start_addr + old.memory_size > start_addr + size) {
	817	ram_addr_t size_delta;
	818
	819	mem = kvm_alloc_slot(kml);
	820	mem->start_addr = start_addr + size;
	821	size_delta = mem->start_addr - old.start_addr;
	822	mem->memory_size = old.memory_size - size_delta;
	823	mem->ram = old.ram + size_delta;
	824	mem->flags = kvm_mem_flags(mr);
	825
	826	err = kvm_set_user_memory_region(kml, mem);
	827	if (err) {
	828	fprintf(stderr, "%s: error registering suffix slot: %s\n",
	829	__func__, strerror(-err));
	830	abort();
	831	}
	832	}
	833	}
	834
	835	/* in case the KVM bug workaround already "consumed" the new slot */
	836	if (!size) {
	837	return;
	838	}
	839	if (!add) {
	840	return;
	841	}
	842	mem = kvm_alloc_slot(kml);
	843	mem->memory_size = size;
	844	mem->start_addr = start_addr;
	845	mem->ram = ram;
	846	mem->flags = kvm_mem_flags(mr);
	847
	848	err = kvm_set_user_memory_region(kml, mem);
	849	if (err) {
	850	fprintf(stderr, "%s: error registering slot: %s\n", __func__,
	851	strerror(-err));
	852	abort();
	853	}
	854	}
	855
	856	static void kvm_region_add(MemoryListener *listener,
	857	MemoryRegionSection *section)
	858	{
	859	KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
	860
	861	memory_region_ref(section->mr);
	862	kvm_set_phys_mem(kml, section, true);
	863	}
	864
	865	static void kvm_region_del(MemoryListener *listener,
	866	MemoryRegionSection *section)
	867	{
	868	KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
	869
	870	kvm_set_phys_mem(kml, section, false);
	871	memory_region_unref(section->mr);
	872	}
	873
	874	static void kvm_log_sync(MemoryListener *listener,
	875	MemoryRegionSection *section)
	876	{
	877	KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
	878	int r;
	879
	880	r = kvm_physical_sync_dirty_bitmap(kml, section);
	881	if (r < 0) {
	882	abort();
	883	}
	884	}
	885
	886	static void kvm_mem_ioeventfd_add(MemoryListener *listener,
	887	MemoryRegionSection *section,
	888	bool match_data, uint64_t data,
	889	EventNotifier *e)
	890	{
	891	int fd = event_notifier_get_fd(e);
	892	int r;
	893
	894	r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
	895	data, true, int128_get64(section->size),
	896	match_data);
	897	if (r < 0) {
	898	fprintf(stderr, "%s: error adding ioeventfd: %s\n",
	899	__func__, strerror(-r));
	900	abort();
	901	}
	902	}
	903
	904	static void kvm_mem_ioeventfd_del(MemoryListener *listener,
	905	MemoryRegionSection *section,
	906	bool match_data, uint64_t data,
	907	EventNotifier *e)
	908	{
	909	int fd = event_notifier_get_fd(e);
	910	int r;
	911
	912	r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
	913	data, false, int128_get64(section->size),
	914	match_data);
	915	if (r < 0) {
	916	abort();
	917	}
	918	}
	919
	920	static void kvm_io_ioeventfd_add(MemoryListener *listener,
	921	MemoryRegionSection *section,
	922	bool match_data, uint64_t data,
	923	EventNotifier *e)
	924	{
	925	int fd = event_notifier_get_fd(e);
	926	int r;
	927
	928	r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
	929	data, true, int128_get64(section->size),
	930	match_data);
	931	if (r < 0) {
	932	fprintf(stderr, "%s: error adding ioeventfd: %s\n",
	933	__func__, strerror(-r));
	934	abort();
	935	}
	936	}
	937
	938	static void kvm_io_ioeventfd_del(MemoryListener *listener,
	939	MemoryRegionSection *section,
	940	bool match_data, uint64_t data,
	941	EventNotifier *e)
	942
	943	{
	944	int fd = event_notifier_get_fd(e);
	945	int r;
	946
	947	r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
	948	data, false, int128_get64(section->size),
	949	match_data);
	950	if (r < 0) {
	951	abort();
	952	}
	953	}
	954
	955	void kvm_memory_listener_register(KVMState s, KVMMemoryListener kml,
	956	AddressSpace *as, int as_id)
	957	{
	958	int i;
	959
	960	kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
	961	kml->as_id = as_id;
	962
	963	for (i = 0; i < s->nr_slots; i++) {
	964	kml->slots[i].slot = i;
	965	}
	966
	967	kml->listener.region_add = kvm_region_add;
	968	kml->listener.region_del = kvm_region_del;
	969	kml->listener.log_start = kvm_log_start;
	970	kml->listener.log_stop = kvm_log_stop;
	971	kml->listener.log_sync = kvm_log_sync;
	972	kml->listener.priority = 10;
	973
	974	memory_listener_register(&kml->listener, as);
	975	}
	976
	/*
	 * Listener that only handles eventfd add/del, through the port-I/O
	 * ioeventfd callbacks. NOTE(review): presumably registered on the I/O
	 * address space outside this excerpt - confirm.
	 */
	977	static MemoryListener kvm_io_listener = {
	978	.eventfd_add = kvm_io_ioeventfd_add,
	979	.eventfd_del = kvm_io_ioeventfd_del,
	980	.priority = 10,
	981	};
	982
	983	static void kvm_handle_interrupt(CPUState *cpu, int mask)
	984	{
	985	cpu->interrupt_request \|= mask;
	986
	987	if (!qemu_cpu_is_self(cpu)) {
	988	qemu_cpu_kick(cpu);
	989	}
	990	}
	991
	992	int kvm_set_irq(KVMState *s, int irq, int level)
	993	{
	994	struct kvm_irq_level event;
	995	int ret;
	996
	997	assert(kvm_async_interrupts_enabled());
	998
	999	event.level = level;
	1000	event.irq = irq;
	1001	ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
	1002	if (ret < 0) {
	1003	perror("kvm_set_irq");
	1004	abort();
	1005	}
	1006
	1007	return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
	1008	}
	1009
	1010	#ifdef KVM_CAP_IRQ_ROUTING
	/* One cached MSI route: the kernel routing entry plus its hash-bucket link. */
	1011	typedef struct KVMMSIRoute {
	1012	struct kvm_irq_routing_entry kroute; /* entry also added to KVMState.irq_routes */
	1013	QTAILQ_ENTRY(KVMMSIRoute) entry; /* link in KVMState.msi_hashtab[] */
	1014	} KVMMSIRoute;
	1015
	1016	static void set_gsi(KVMState *s, unsigned int gsi)
	1017	{
	1018	set_bit(gsi, s->used_gsi_bitmap);
	1019	}
	1020
	1021	static void clear_gsi(KVMState *s, unsigned int gsi)
	1022	{
	1023	clear_bit(gsi, s->used_gsi_bitmap);
	1024	}
	1025
	1026	void kvm_init_irq_routing(KVMState *s)
	1027	{
	1028	int gsi_count, i;
	1029
	1030	gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
	1031	if (gsi_count > 0) {
	1032	/* Round up so we can search ints using ffs */
	1033	s->used_gsi_bitmap = bitmap_new(gsi_count);
	1034	s->gsi_count = gsi_count;
	1035	}
	1036
	1037	s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
	1038	s->nr_allocated_irq_routes = 0;
	1039
	1040	if (!kvm_direct_msi_allowed) {
	1041	for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
	1042	QTAILQ_INIT(&s->msi_hashtab[i]);
	1043	}
	1044	}
	1045
	1046	kvm_arch_init_irq_routing(s);
	1047	}
	1048
	1049	void kvm_irqchip_commit_routes(KVMState *s)
	1050	{
	1051	int ret;
	1052
	1053	if (kvm_gsi_direct_mapping()) {
	1054	return;
	1055	}
	1056
	1057	if (!kvm_gsi_routing_enabled()) {
	1058	return;
	1059	}
	1060
	1061	s->irq_routes->flags = 0;
	1062	trace_kvm_irqchip_commit_routes();
	1063	ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
	1064	assert(ret == 0);
	1065	}
	1066
	1067	static void kvm_add_routing_entry(KVMState *s,
	1068	struct kvm_irq_routing_entry *entry)
	1069	{
	1070	struct kvm_irq_routing_entry *new;
	1071	int n, size;
	1072
	1073	if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
	1074	n = s->nr_allocated_irq_routes * 2;
	1075	if (n < 64) {
	1076	n = 64;
	1077	}
	1078	size = sizeof(struct kvm_irq_routing);
	1079	size += n * sizeof(*new);
	1080	s->irq_routes = g_realloc(s->irq_routes, size);
	1081	s->nr_allocated_irq_routes = n;
	1082	}
	1083	n = s->irq_routes->nr++;
	1084	new = &s->irq_routes->entries[n];
	1085
	1086	new = entry;
	1087
	1088	set_gsi(s, entry->gsi);
	1089	}
	1090
	1091	static int kvm_update_routing_entry(KVMState *s,
	1092	struct kvm_irq_routing_entry *new_entry)
	1093	{
	1094	struct kvm_irq_routing_entry *entry;
	1095	int n;
	1096
	1097	for (n = 0; n < s->irq_routes->nr; n++) {
	1098	entry = &s->irq_routes->entries[n];
	1099	if (entry->gsi != new_entry->gsi) {
	1100	continue;
	1101	}
	1102
	1103	if(!memcmp(entry, new_entry, sizeof *entry)) {
	1104	return 0;
	1105	}
	1106
	1107	entry = new_entry;
	1108
	1109	return 0;
	1110	}
	1111
	1112	return -ESRCH;
	1113	}
	1114
	1115	void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
	1116	{
	1117	struct kvm_irq_routing_entry e = {};
	1118
	1119	assert(pin < s->gsi_count);
	1120
	1121	e.gsi = irq;
	1122	e.type = KVM_IRQ_ROUTING_IRQCHIP;
	1123	e.flags = 0;
	1124	e.u.irqchip.irqchip = irqchip;
	1125	e.u.irqchip.pin = pin;
	1126	kvm_add_routing_entry(s, &e);
	1127	}
	1128
	1129	void kvm_irqchip_release_virq(KVMState *s, int virq)
	1130	{
	1131	struct kvm_irq_routing_entry *e;
	1132	int i;
	1133
	1134	if (kvm_gsi_direct_mapping()) {
	1135	return;
	1136	}
	1137
	1138	for (i = 0; i < s->irq_routes->nr; i++) {
	1139	e = &s->irq_routes->entries[i];
	1140	if (e->gsi == virq) {
	1141	s->irq_routes->nr--;
	1142	*e = s->irq_routes->entries[s->irq_routes->nr];
	1143	}
	1144	}
	1145	clear_gsi(s, virq);
	1146	kvm_arch_release_virq_post(virq);
	1147	}
	1148
	/*
	 * Hash an MSI data word to a bucket index: the low 8 bits of @data.
	 */
	static unsigned int kvm_hash_msi(uint32_t data)
	{
	    /* This is optimized for IA32 MSI layout. However, no other arch shall
	     * repeat the mistake of not providing a direct MSI injection API. */
	    const uint32_t low_byte = 0xff;

	    return data & low_byte;
	}
	1155
	1156	static void kvm_flush_dynamic_msi_routes(KVMState *s)
	1157	{
	1158	KVMMSIRoute route, next;
	1159	unsigned int hash;
	1160
	1161	for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
	1162	QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
	1163	kvm_irqchip_release_virq(s, route->kroute.gsi);
	1164	QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
	1165	g_free(route);
	1166	}
	1167	}
	1168	}
	1169
	1170	static int kvm_irqchip_get_virq(KVMState *s)
	1171	{
	1172	int next_virq;
	1173
	1174	/*
	1175	* PIC and IOAPIC share the first 16 GSI numbers, thus the available
	1176	* GSI numbers are more than the number of IRQ route. Allocating a GSI
	1177	* number can succeed even though a new route entry cannot be added.
	1178	* When this happens, flush dynamic MSI entries to free IRQ route entries.
	1179	*/
	1180	if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
	1181	kvm_flush_dynamic_msi_routes(s);
	1182	}
	1183
	1184	/* Return the lowest unused GSI in the bitmap */
	1185	next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
	1186	if (next_virq >= s->gsi_count) {
	1187	return -ENOSPC;
	1188	} else {
	1189	return next_virq;
	1190	}
	1191	}
	1192
	1193	static KVMMSIRoute kvm_lookup_msi_route(KVMState s, MSIMessage msg)
	1194	{
	1195	unsigned int hash = kvm_hash_msi(msg.data);
	1196	KVMMSIRoute *route;
	1197
	1198	QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
	1199	if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
	1200	route->kroute.u.msi.address_hi == (msg.address >> 32) &&
	1201	route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
	1202	return route;
	1203	}
	1204	}
	1205	return NULL;
	1206	}
	1207
	1208	int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
	1209	{
	1210	struct kvm_msi msi;
	1211	KVMMSIRoute *route;
	1212
	1213	if (kvm_direct_msi_allowed) {
	1214	msi.address_lo = (uint32_t)msg.address;
	1215	msi.address_hi = msg.address >> 32;
	1216	msi.data = le32_to_cpu(msg.data);
	1217	msi.flags = 0;
	1218	memset(msi.pad, 0, sizeof(msi.pad));
	1219
	1220	return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
	1221	}
	1222
	1223	route = kvm_lookup_msi_route(s, msg);
	1224	if (!route) {
	1225	int virq;
	1226
	1227	virq = kvm_irqchip_get_virq(s);
	1228	if (virq < 0) {
	1229	return virq;
	1230	}
	1231
	1232	route = g_malloc0(sizeof(KVMMSIRoute));
	1233	route->kroute.gsi = virq;
	1234	route->kroute.type = KVM_IRQ_ROUTING_MSI;
	1235	route->kroute.flags = 0;
	1236	route->kroute.u.msi.address_lo = (uint32_t)msg.address;
	1237	route->kroute.u.msi.address_hi = msg.address >> 32;
	1238	route->kroute.u.msi.data = le32_to_cpu(msg.data);
	1239
	1240	kvm_add_routing_entry(s, &route->kroute);
	1241	kvm_irqchip_commit_routes(s);
	1242
	1243	QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
	1244	entry);
	1245	}
	1246
	1247	assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
	1248
	1249	return kvm_set_irq(s, route->kroute.gsi, 1);
	1250	}
	1251
	1252	int kvm_irqchip_add_msi_route(KVMState s, int vector, PCIDevice dev)
	1253	{
	1254	struct kvm_irq_routing_entry kroute = {};
	1255	int virq;
	1256	MSIMessage msg = {0, 0};
	1257
	1258	if (dev) {
	1259	msg = pci_get_msi_message(dev, vector);
	1260	}
	1261
	1262	if (kvm_gsi_direct_mapping()) {
	1263	return kvm_arch_msi_data_to_gsi(msg.data);
	1264	}
	1265
	1266	if (!kvm_gsi_routing_enabled()) {
	1267	return -ENOSYS;
	1268	}
	1269
	1270	virq = kvm_irqchip_get_virq(s);
	1271	if (virq < 0) {
	1272	return virq;
	1273	}
	1274
	1275	kroute.gsi = virq;
	1276	kroute.type = KVM_IRQ_ROUTING_MSI;
	1277	kroute.flags = 0;
	1278	kroute.u.msi.address_lo = (uint32_t)msg.address;
	1279	kroute.u.msi.address_hi = msg.address >> 32;
	1280	kroute.u.msi.data = le32_to_cpu(msg.data);
	1281	if (kvm_msi_devid_required()) {
	1282	kroute.flags = KVM_MSI_VALID_DEVID;
	1283	kroute.u.msi.devid = pci_requester_id(dev);
	1284	}
	1285	if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
	1286	kvm_irqchip_release_virq(s, virq);
	1287	return -EINVAL;
	1288	}
	1289
	1290	trace_kvm_irqchip_add_msi_route(virq);
	1291
	1292	kvm_add_routing_entry(s, &kroute);
	1293	kvm_arch_add_msi_route_post(&kroute, vector, dev);
	1294	kvm_irqchip_commit_routes(s);
	1295
	1296	return virq;
	1297	}
	1298
	1299	int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
	1300	PCIDevice *dev)
	1301	{
	1302	struct kvm_irq_routing_entry kroute = {};
	1303
	1304	if (kvm_gsi_direct_mapping()) {
	1305	return 0;
	1306	}
	1307
	1308	if (!kvm_irqchip_in_kernel()) {
	1309	return -ENOSYS;
	1310	}
	1311
	1312	kroute.gsi = virq;
	1313	kroute.type = KVM_IRQ_ROUTING_MSI;
	1314	kroute.flags = 0;
	1315	kroute.u.msi.address_lo = (uint32_t)msg.address;
	1316	kroute.u.msi.address_hi = msg.address >> 32;
	1317	kroute.u.msi.data = le32_to_cpu(msg.data);
	1318	if (kvm_msi_devid_required()) {
	1319	kroute.flags = KVM_MSI_VALID_DEVID;
	1320	kroute.u.msi.devid = pci_requester_id(dev);
	1321	}
	1322	if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
	1323	return -EINVAL;
	1324	}
	1325
	1326	trace_kvm_irqchip_update_msi_route(virq);
	1327
	1328	return kvm_update_routing_entry(s, &kroute);
	1329	}
	1330
	1331	static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
	1332	bool assign)
	1333	{
	1334	struct kvm_irqfd irqfd = {
	1335	.fd = fd,
	1336	.gsi = virq,
	1337	.flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
	1338	};
	1339
	1340	if (rfd != -1) {
	1341	irqfd.flags \|= KVM_IRQFD_FLAG_RESAMPLE;
	1342	irqfd.resamplefd = rfd;
	1343	}
	1344
	1345	if (!kvm_irqfds_enabled()) {
	1346	return -ENOSYS;
	1347	}
	1348
	1349	return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
	1350	}
	1351
	1352	int kvm_irqchip_add_adapter_route(KVMState s, AdapterInfo adapter)
	1353	{
	1354	struct kvm_irq_routing_entry kroute = {};
	1355	int virq;
	1356
	1357	if (!kvm_gsi_routing_enabled()) {
	1358	return -ENOSYS;
	1359	}
	1360
	1361	virq = kvm_irqchip_get_virq(s);
	1362	if (virq < 0) {
	1363	return virq;
	1364	}
	1365
	1366	kroute.gsi = virq;
	1367	kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
	1368	kroute.flags = 0;
	1369	kroute.u.adapter.summary_addr = adapter->summary_addr;
	1370	kroute.u.adapter.ind_addr = adapter->ind_addr;
	1371	kroute.u.adapter.summary_offset = adapter->summary_offset;
	1372	kroute.u.adapter.ind_offset = adapter->ind_offset;
	1373	kroute.u.adapter.adapter_id = adapter->adapter_id;
	1374
	1375	kvm_add_routing_entry(s, &kroute);
	1376
	1377	return virq;
	1378	}
	1379
	1380	int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
	1381	{
	1382	struct kvm_irq_routing_entry kroute = {};
	1383	int virq;
	1384
	1385	if (!kvm_gsi_routing_enabled()) {
	1386	return -ENOSYS;
	1387	}
	1388	if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
	1389	return -ENOSYS;
	1390	}
	1391	virq = kvm_irqchip_get_virq(s);
	1392	if (virq < 0) {
	1393	return virq;
	1394	}
	1395
	1396	kroute.gsi = virq;
	1397	kroute.type = KVM_IRQ_ROUTING_HV_SINT;
	1398	kroute.flags = 0;
	1399	kroute.u.hv_sint.vcpu = vcpu;
	1400	kroute.u.hv_sint.sint = sint;
	1401
	1402	kvm_add_routing_entry(s, &kroute);
	1403	kvm_irqchip_commit_routes(s);
	1404
	1405	return virq;
	1406	}
	1407
	1408	#else /* !KVM_CAP_IRQ_ROUTING */
	1409
	1410	void kvm_init_irq_routing(KVMState *s)
	1411	{
	1412	}
	1413
	1414	void kvm_irqchip_release_virq(KVMState *s, int virq)
	1415	{
	1416	}
	1417
	1418	int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
	1419	{
	1420	abort();
	1421	}
	1422
	1423	int kvm_irqchip_add_msi_route(KVMState s, int vector, PCIDevice dev)
	1424	{
	1425	return -ENOSYS;
	1426	}
	1427
	1428	int kvm_irqchip_add_adapter_route(KVMState s, AdapterInfo adapter)
	1429	{
	1430	return -ENOSYS;
	1431	}
	1432
	1433	int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
	1434	{
	1435	return -ENOSYS;
	1436	}
	1437
	1438	static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
	1439	{
	1440	abort();
	1441	}
	1442
	1443	int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
	1444	{
	1445	return -ENOSYS;
	1446	}
	1447	#endif /* !KVM_CAP_IRQ_ROUTING */
	1448
	1449	int kvm_irqchip_add_irqfd_notifier_gsi(KVMState s, EventNotifier n,
	1450	EventNotifier *rn, int virq)
	1451	{
	1452	return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
	1453	rn ? event_notifier_get_fd(rn) : -1, virq, true);
	1454	}
	1455
	1456	int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState s, EventNotifier n,
	1457	int virq)
	1458	{
	1459	return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
	1460	false);
	1461	}
	1462
	1463	int kvm_irqchip_add_irqfd_notifier(KVMState s, EventNotifier n,
	1464	EventNotifier *rn, qemu_irq irq)
	1465	{
	1466	gpointer key, gsi;
	1467	gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
	1468
	1469	if (!found) {
	1470	return -ENXIO;
	1471	}
	1472	return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
	1473	}
	1474
	1475	int kvm_irqchip_remove_irqfd_notifier(KVMState s, EventNotifier n,
	1476	qemu_irq irq)
	1477	{
	1478	gpointer key, gsi;
	1479	gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
	1480
	1481	if (!found) {
	1482	return -ENXIO;
	1483	}
	1484	return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
	1485	}
	1486
	1487	void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
	1488	{
	1489	g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
	1490	}
	1491
	1492	static void kvm_irqchip_create(MachineState machine, KVMState s)
	1493	{
	1494	int ret;
	1495
	1496	if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
	1497	;
	1498	} else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
	1499	ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
	1500	if (ret < 0) {
	1501	fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
	1502	exit(1);
	1503	}
	1504	} else {
	1505	return;
	1506	}
	1507
	1508	/* First probe and see if there's a arch-specific hook to create the
	1509	* in-kernel irqchip for us */
	1510	ret = kvm_arch_irqchip_create(machine, s);
	1511	if (ret == 0) {
	1512	if (machine_kernel_irqchip_split(machine)) {
	1513	perror("Split IRQ chip mode not supported.");
	1514	exit(1);
	1515	} else {
	1516	ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
	1517	}
	1518	}
	1519	if (ret < 0) {
	1520	fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
	1521	exit(1);
	1522	}
	1523
	1524	kvm_kernel_irqchip = true;
	1525	/* If we have an in-kernel IRQ chip then we must have asynchronous
	1526	* interrupt delivery (though the reverse is not necessarily true)
	1527	*/
	1528	kvm_async_interrupts_allowed = true;
	1529	kvm_halt_in_kernel_allowed = true;
	1530
	1531	kvm_init_irq_routing(s);
	1532
	1533	s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
	1534	}
	1535
	1536	/* Find number of supported CPUs using the recommended
	1537	* procedure from the kernel API documentation to cope with
	1538	* older kernels that may be missing capabilities.
	1539	*/
	1540	static int kvm_recommended_vcpus(KVMState *s)
	1541	{
	1542	int ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
	1543	return (ret) ? ret : 4;
	1544	}
	1545
	1546	static int kvm_max_vcpus(KVMState *s)
	1547	{
	1548	int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
	1549	return (ret) ? ret : kvm_recommended_vcpus(s);
	1550	}
	1551
	1552	static int kvm_max_vcpu_id(KVMState *s)
	1553	{
	1554	int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
	1555	return (ret) ? ret : kvm_max_vcpus(s);
	1556	}
	1557
	1558	bool kvm_vcpu_id_is_valid(int vcpu_id)
	1559	{
	1560	KVMState *s = KVM_STATE(current_machine->accelerator);
	1561	return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
	1562	}
	1563
	1564	static int kvm_init(MachineState *ms)
	1565	{
	1566	MachineClass *mc = MACHINE_GET_CLASS(ms);
	1567	static const char upgrade_note[] =
	1568	"Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
	1569	"(see http://sourceforge.net/projects/kvm).\n";
	1570	struct {
	1571	const char *name;
	1572	int num;
	1573	} num_cpus[] = {
	1574	{ "SMP", smp_cpus },
	1575	{ "hotpluggable", max_cpus },
	1576	{ NULL, }
	1577	}, *nc = num_cpus;
	1578	int soft_vcpus_limit, hard_vcpus_limit;
	1579	KVMState *s;
	1580	const KVMCapabilityInfo *missing_cap;
	1581	int ret;
	1582	int type = 0;
	1583	const char *kvm_type;
	1584
	1585	s = KVM_STATE(ms->accelerator);
	1586
	1587	/*
	1588	* On systems where the kernel can support different base page
	1589	* sizes, host page size may be different from TARGET_PAGE_SIZE,
	1590	* even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum
	1591	* page size for the system though.
	1592	*/
	1593	assert(TARGET_PAGE_SIZE <= getpagesize());
	1594
	1595	s->sigmask_len = 8;
	1596
	1597	#ifdef KVM_CAP_SET_GUEST_DEBUG
	1598	QTAILQ_INIT(&s->kvm_sw_breakpoints);
	1599	#endif
	1600	QLIST_INIT(&s->kvm_parked_vcpus);
	1601	s->vmfd = -1;
	1602	s->fd = qemu_open("/dev/kvm", O_RDWR);
	1603	if (s->fd == -1) {
	1604	fprintf(stderr, "Could not access KVM kernel module: %m\n");
	1605	ret = -errno;
	1606	goto err;
	1607	}
	1608
	1609	ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
	1610	if (ret < KVM_API_VERSION) {
	1611	if (ret >= 0) {
	1612	ret = -EINVAL;
	1613	}
	1614	fprintf(stderr, "kvm version too old\n");
	1615	goto err;
	1616	}
	1617
	1618	if (ret > KVM_API_VERSION) {
	1619	ret = -EINVAL;
	1620	fprintf(stderr, "kvm version not supported\n");
	1621	goto err;
	1622	}
	1623
	1624	kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
	1625	s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
	1626
	1627	/* If unspecified, use the default value */
	1628	if (!s->nr_slots) {
	1629	s->nr_slots = 32;
	1630	}
	1631
	1632	/* check the vcpu limits */
	1633	soft_vcpus_limit = kvm_recommended_vcpus(s);
	1634	hard_vcpus_limit = kvm_max_vcpus(s);
	1635
	1636	while (nc->name) {
	1637	if (nc->num > soft_vcpus_limit) {
	1638	fprintf(stderr,
	1639	"Warning: Number of %s cpus requested (%d) exceeds "
	1640	"the recommended cpus supported by KVM (%d)\n",
	1641	nc->name, nc->num, soft_vcpus_limit);
	1642
	1643	if (nc->num > hard_vcpus_limit) {
	1644	fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
	1645	"the maximum cpus supported by KVM (%d)\n",
	1646	nc->name, nc->num, hard_vcpus_limit);
	1647	exit(1);
	1648	}
	1649	}
	1650	nc++;
	1651	}
	1652
	1653	kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
	1654	if (mc->kvm_type) {
	1655	type = mc->kvm_type(kvm_type);
	1656	} else if (kvm_type) {
	1657	ret = -EINVAL;
	1658	fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
	1659	goto err;
	1660	}
	1661
	1662	do {
	1663	ret = kvm_ioctl(s, KVM_CREATE_VM, type);
	1664	} while (ret == -EINTR);
	1665
	1666	if (ret < 0) {
	1667	fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
	1668	strerror(-ret));
	1669
	1670	#ifdef TARGET_S390X
	1671	if (ret == -EINVAL) {
	1672	fprintf(stderr,
	1673	"Host kernel setup problem detected. Please verify:\n");
	1674	fprintf(stderr, "- for kernels supporting the switch_amode or"
	1675	" user_mode parameters, whether\n");
	1676	fprintf(stderr,
	1677	" user space is running in primary address space\n");
	1678	fprintf(stderr,
	1679	"- for kernels supporting the vm.allocate_pgste sysctl, "
	1680	"whether it is enabled\n");
	1681	}
	1682	#endif
	1683	goto err;
	1684	}
	1685
	1686	s->vmfd = ret;
	1687	missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
	1688	if (!missing_cap) {
	1689	missing_cap =
	1690	kvm_check_extension_list(s, kvm_arch_required_capabilities);
	1691	}
	1692	if (missing_cap) {
	1693	ret = -EINVAL;
	1694	fprintf(stderr, "kvm does not support %s\n%s",
	1695	missing_cap->name, upgrade_note);
	1696	goto err;
	1697	}
	1698
	1699	s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
	1700
	1701	s->broken_set_mem_region = 1;
	1702	ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
	1703	if (ret > 0) {
	1704	s->broken_set_mem_region = 0;
	1705	}
	1706
	1707	#ifdef KVM_CAP_VCPU_EVENTS
	1708	s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
	1709	#endif
	1710
	1711	s->robust_singlestep =
	1712	kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
	1713
	1714	#ifdef KVM_CAP_DEBUGREGS
	1715	s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
	1716	#endif
	1717
	1718	#ifdef KVM_CAP_IRQ_ROUTING
	1719	kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
	1720	#endif
	1721
	1722	s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
	1723
	1724	s->irq_set_ioctl = KVM_IRQ_LINE;
	1725	if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
	1726	s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
	1727	}
	1728
	1729	#ifdef KVM_CAP_READONLY_MEM
	1730	kvm_readonly_mem_allowed =
	1731	(kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
	1732	#endif
	1733
	1734	kvm_eventfds_allowed =
	1735	(kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
	1736
	1737	kvm_irqfds_allowed =
	1738	(kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
	1739
	1740	kvm_resamplefds_allowed =
	1741	(kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
	1742
	1743	kvm_vm_attributes_allowed =
	1744	(kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
	1745
	1746	kvm_ioeventfd_any_length_allowed =
	1747	(kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
	1748
	1749	ret = kvm_arch_init(ms, s);
	1750	if (ret < 0) {
	1751	goto err;
	1752	}
	1753
	1754	if (machine_kernel_irqchip_allowed(ms)) {
	1755	kvm_irqchip_create(ms, s);
	1756	}
	1757
	1758	kvm_state = s;
	1759
	1760	if (kvm_eventfds_allowed) {
	1761	s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
	1762	s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
	1763	}
	1764	s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region;
	1765	s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region;
	1766
	1767	kvm_memory_listener_register(s, &s->memory_listener,
	1768	&address_space_memory, 0);
	1769	memory_listener_register(&kvm_io_listener,
	1770	&address_space_io);
	1771
	1772	s->many_ioeventfds = kvm_check_many_ioeventfds();
	1773
	1774	cpu_interrupt_handler = kvm_handle_interrupt;
	1775
	1776	return 0;
	1777
	1778	err:
	1779	assert(ret < 0);
	1780	if (s->vmfd >= 0) {
	1781	close(s->vmfd);
	1782	}
	1783	if (s->fd != -1) {
	1784	close(s->fd);
	1785	}
	1786	g_free(s->memory_listener.slots);
	1787
	1788	return ret;
	1789	}
	1790
	1791	void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
	1792	{
	1793	s->sigmask_len = sigmask_len;
	1794	}
	1795
	1796	static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
	1797	int size, uint32_t count)
	1798	{
	1799	int i;
	1800	uint8_t *ptr = data;
	1801
	1802	for (i = 0; i < count; i++) {
	1803	address_space_rw(&address_space_io, port, attrs,
	1804	ptr, size,
	1805	direction == KVM_EXIT_IO_OUT);
	1806	ptr += size;
	1807	}
	1808	}
	1809
	1810	static int kvm_handle_internal_error(CPUState cpu, struct kvm_run run)
	1811	{
	1812	fprintf(stderr, "KVM internal error. Suberror: %d\n",
	1813	run->internal.suberror);
	1814
	1815	if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
	1816	int i;
	1817
	1818	for (i = 0; i < run->internal.ndata; ++i) {
	1819	fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
	1820	i, (uint64_t)run->internal.data[i]);
	1821	}
	1822	}
	1823	if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
	1824	fprintf(stderr, "emulation failure\n");
	1825	if (!kvm_arch_stop_on_emulation_error(cpu)) {
	1826	cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
	1827	return EXCP_INTERRUPT;
	1828	}
	1829	}
	1830	/* FIXME: Should trigger a qmp message to let management know
	1831	* something went wrong.
	1832	*/
	1833	return -1;
	1834	}
	1835
	1836	void kvm_flush_coalesced_mmio_buffer(void)
	1837	{
	1838	KVMState *s = kvm_state;
	1839
	1840	if (s->coalesced_flush_in_progress) {
	1841	return;
	1842	}
	1843
	1844	s->coalesced_flush_in_progress = true;
	1845
	1846	if (s->coalesced_mmio_ring) {
	1847	struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
	1848	while (ring->first != ring->last) {
	1849	struct kvm_coalesced_mmio *ent;
	1850
	1851	ent = &ring->coalesced_mmio[ring->first];
	1852
	1853	cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
	1854	smp_wmb();
	1855	ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
	1856	}
	1857	}
	1858
	1859	s->coalesced_flush_in_progress = false;
	1860	}
	1861
	1862	static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
	1863	{
	1864	if (!cpu->kvm_vcpu_dirty) {
	1865	kvm_arch_get_registers(cpu);
	1866	cpu->kvm_vcpu_dirty = true;
	1867	}
	1868	}
	1869
	1870	void kvm_cpu_synchronize_state(CPUState *cpu)
	1871	{
	1872	if (!cpu->kvm_vcpu_dirty) {
	1873	run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
	1874	}
	1875	}
	1876
	1877	static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
	1878	{
	1879	kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
	1880	cpu->kvm_vcpu_dirty = false;
	1881	}
	1882
	1883	void kvm_cpu_synchronize_post_reset(CPUState *cpu)
	1884	{
	1885	run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
	1886	}
	1887
	1888	static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
	1889	{
	1890	kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
	1891	cpu->kvm_vcpu_dirty = false;
	1892	}
	1893
	1894	void kvm_cpu_synchronize_post_init(CPUState *cpu)
	1895	{
	1896	run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
	1897	}
	1898
	1899	#ifdef KVM_HAVE_MCE_INJECTION
	/*
	 * Thread-local SIGBUS record consumed by kvm_cpu_exec(): when
	 * have_sigbus_pending is set after KVM_RUN, the saved code and address are
	 * passed to kvm_arch_on_sigbus_vcpu() and the flag is cleared.  The code
	 * that fills these in is outside this section.
	 */
	1900	static __thread void *pending_sigbus_addr;
	1901	static __thread int pending_sigbus_code;
	1902	static __thread bool have_sigbus_pending;
	1903	#endif
	1904
	1905	static void kvm_cpu_kick(CPUState *cpu)
	1906	{
	1907	atomic_set(&cpu->kvm_run->immediate_exit, 1);
	1908	}
	1909
	1910	static void kvm_cpu_kick_self(void)
	1911	{
	1912	if (kvm_immediate_exit) {
	1913	kvm_cpu_kick(current_cpu);
	1914	} else {
	1915	qemu_cpu_kick_self();
	1916	}
	1917	}
	1918
	1919	static void kvm_eat_signals(CPUState *cpu)
	1920	{
	1921	struct timespec ts = { 0, 0 };
	1922	siginfo_t siginfo;
	1923	sigset_t waitset;
	1924	sigset_t chkset;
	1925	int r;
	1926
	1927	if (kvm_immediate_exit) {
	1928	atomic_set(&cpu->kvm_run->immediate_exit, 0);
	1929	/* Write kvm_run->immediate_exit before the cpu->exit_request
	1930	* write in kvm_cpu_exec.
	1931	*/
	1932	smp_wmb();
	1933	return;
	1934	}
	1935
	1936	sigemptyset(&waitset);
	1937	sigaddset(&waitset, SIG_IPI);
	1938
	1939	do {
	1940	r = sigtimedwait(&waitset, &siginfo, &ts);
	1941	if (r == -1 && !(errno == EAGAIN \|\| errno == EINTR)) {
	1942	perror("sigtimedwait");
	1943	exit(1);
	1944	}
	1945
	1946	r = sigpending(&chkset);
	1947	if (r == -1) {
	1948	perror("sigpending");
	1949	exit(1);
	1950	}
	1951	} while (sigismember(&chkset, SIG_IPI));
	1952	}
	1953
	1954	int kvm_cpu_exec(CPUState *cpu)
	1955	{
	1956	struct kvm_run *run = cpu->kvm_run;
	1957	int ret, run_ret;
	1958
	1959	DPRINTF("kvm_cpu_exec()\n");
	1960
	1961	if (kvm_arch_process_async_events(cpu)) {
	1962	atomic_set(&cpu->exit_request, 0);
	1963	return EXCP_HLT;
	1964	}
	1965
	1966	qemu_mutex_unlock_iothread();
	1967
	1968	do {
	1969	MemTxAttrs attrs;
	1970
	1971	if (cpu->kvm_vcpu_dirty) {
	1972	kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
	1973	cpu->kvm_vcpu_dirty = false;
	1974	}
	1975
	1976	kvm_arch_pre_run(cpu, run);
	1977	if (atomic_read(&cpu->exit_request)) {
	1978	DPRINTF("interrupt exit requested\n");
	1979	/*
	1980	* KVM requires us to reenter the kernel after IO exits to complete
	1981	* instruction emulation. This self-signal will ensure that we
	1982	* leave ASAP again.
	1983	*/
	1984	kvm_cpu_kick_self();
	1985	}
	1986
	1987	/* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
	1988	* Matching barrier in kvm_eat_signals.
	1989	*/
	1990	smp_rmb();
	1991
	1992	run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
	1993
	1994	attrs = kvm_arch_post_run(cpu, run);
	1995
	1996	#ifdef KVM_HAVE_MCE_INJECTION
	1997	if (unlikely(have_sigbus_pending)) {
	1998	qemu_mutex_lock_iothread();
	1999	kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
	2000	pending_sigbus_addr);
	2001	have_sigbus_pending = false;
	2002	qemu_mutex_unlock_iothread();
	2003	}
	2004	#endif
	2005
	2006	if (run_ret < 0) {
	2007	if (run_ret == -EINTR \|\| run_ret == -EAGAIN) {
	2008	DPRINTF("io window exit\n");
	2009	kvm_eat_signals(cpu);
	2010	ret = EXCP_INTERRUPT;
	2011	break;
	2012	}
	2013	fprintf(stderr, "error: kvm run failed %s\n",
	2014	strerror(-run_ret));
	2015	#ifdef TARGET_PPC
	2016	if (run_ret == -EBUSY) {
	2017	fprintf(stderr,
	2018	"This is probably because your SMT is enabled.\n"
	2019	"VCPU can only run on primary threads with all "
	2020	"secondary threads offline.\n");
	2021	}
	2022	#endif
	2023	ret = -1;
	2024	break;
	2025	}
	2026
	2027	trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
	2028	switch (run->exit_reason) {
	2029	case KVM_EXIT_IO:
	2030	DPRINTF("handle_io\n");
	2031	/* Called outside BQL */
	2032	kvm_handle_io(run->io.port, attrs,
	2033	(uint8_t *)run + run->io.data_offset,
	2034	run->io.direction,
	2035	run->io.size,
	2036	run->io.count);
	2037	ret = 0;
	2038	break;
	2039	case KVM_EXIT_MMIO:
	2040	DPRINTF("handle_mmio\n");
	2041	/* Called outside BQL */
	2042	address_space_rw(&address_space_memory,
	2043	run->mmio.phys_addr, attrs,
	2044	run->mmio.data,
	2045	run->mmio.len,
	2046	run->mmio.is_write);
	2047	ret = 0;
	2048	break;
	2049	case KVM_EXIT_IRQ_WINDOW_OPEN:
	2050	DPRINTF("irq_window_open\n");
	2051	ret = EXCP_INTERRUPT;
	2052	break;
	2053	case KVM_EXIT_SHUTDOWN:
	2054	DPRINTF("shutdown\n");
	2055	qemu_system_reset_request();
	2056	ret = EXCP_INTERRUPT;
	2057	break;
	2058	case KVM_EXIT_UNKNOWN:
	2059	fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
	2060	(uint64_t)run->hw.hardware_exit_reason);
	2061	ret = -1;
	2062	break;
	2063	case KVM_EXIT_INTERNAL_ERROR:
	2064	ret = kvm_handle_internal_error(cpu, run);
	2065	break;
	2066	case KVM_EXIT_SYSTEM_EVENT:
	2067	switch (run->system_event.type) {
	2068	case KVM_SYSTEM_EVENT_SHUTDOWN:
	2069	qemu_system_shutdown_request();
	2070	ret = EXCP_INTERRUPT;
	2071	break;
	2072	case KVM_SYSTEM_EVENT_RESET:
	2073	qemu_system_reset_request();
	2074	ret = EXCP_INTERRUPT;
	2075	break;
	2076	case KVM_SYSTEM_EVENT_CRASH:
	2077	kvm_cpu_synchronize_state(cpu);
	2078	qemu_mutex_lock_iothread();
	2079	qemu_system_guest_panicked(cpu_get_crash_info(cpu));
	2080	qemu_mutex_unlock_iothread();
	2081	ret = 0;
	2082	break;
	2083	default:
	2084	DPRINTF("kvm_arch_handle_exit\n");
	2085	ret = kvm_arch_handle_exit(cpu, run);
	2086	break;
	2087	}
	2088	break;
	2089	default:
	2090	DPRINTF("kvm_arch_handle_exit\n");
	2091	ret = kvm_arch_handle_exit(cpu, run);
	2092	break;
	2093	}
	2094	} while (ret == 0);
	2095
	2096	qemu_mutex_lock_iothread();
	2097
	2098	if (ret < 0) {
	2099	cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
	2100	vm_stop(RUN_STATE_INTERNAL_ERROR);
	2101	}
	2102
	2103	atomic_set(&cpu->exit_request, 0);
	2104	return ret;
	2105	}
	2106
	2107	int kvm_ioctl(KVMState *s, int type, ...)
	2108	{
	2109	int ret;
	2110	void *arg;
	2111	va_list ap;
	2112
	2113	va_start(ap, type);
	2114	arg = va_arg(ap, void *);
	2115	va_end(ap);
	2116
	2117	trace_kvm_ioctl(type, arg);
	2118	ret = ioctl(s->fd, type, arg);
	2119	if (ret == -1) {
	2120	ret = -errno;
	2121	}
	2122	return ret;
	2123	}
	2124
	2125	int kvm_vm_ioctl(KVMState *s, int type, ...)
	2126	{
	2127	int ret;
	2128	void *arg;
	2129	va_list ap;
	2130
	2131	va_start(ap, type);
	2132	arg = va_arg(ap, void *);
	2133	va_end(ap);
	2134
	2135	trace_kvm_vm_ioctl(type, arg);
	2136	ret = ioctl(s->vmfd, type, arg);
	2137	if (ret == -1) {
	2138	ret = -errno;
	2139	}
	2140	return ret;
	2141	}
	2142
	2143	int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
	2144	{
	2145	int ret;
	2146	void *arg;
	2147	va_list ap;
	2148
	2149	va_start(ap, type);
	2150	arg = va_arg(ap, void *);
	2151	va_end(ap);
	2152
	2153	trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
	2154	ret = ioctl(cpu->kvm_fd, type, arg);
	2155	if (ret == -1) {
	2156	ret = -errno;
	2157	}
	2158	return ret;
	2159	}
	2160
	/*
	 * Issue ioctl @type on device fd @fd.  Exactly one variadic argument is
	 * read, as a pointer, and forwarded.
	 *
	 * Returns the ioctl result, with a -1 failure converted to -errno.
	 */
	int kvm_device_ioctl(int fd, int type, ...)
	{
	    va_list args;
	    void *payload;
	    int rc;

	    va_start(args, type);
	    payload = va_arg(args, void *);
	    va_end(args);

	    trace_kvm_device_ioctl(fd, type, payload);
	    rc = ioctl(fd, type, payload);

	    return rc == -1 ? -errno : rc;
	}
	2178
	2179	int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
	2180	{
	2181	int ret;
	2182	struct kvm_device_attr attribute = {
	2183	.group = group,
	2184	.attr = attr,
	2185	};
	2186
	2187	if (!kvm_vm_attributes_allowed) {
	2188	return 0;
	2189	}
	2190
	2191	ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
	2192	/* kvm returns 0 on success for HAS_DEVICE_ATTR */
	2193	return ret ? 0 : 1;
	2194	}
	2195
	2196	int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
	2197	{
	2198	struct kvm_device_attr attribute = {
	2199	.group = group,
	2200	.attr = attr,
	2201	.flags = 0,
	2202	};
	2203
	2204	return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
	2205	}
	2206
	2207	void kvm_device_access(int fd, int group, uint64_t attr,
	2208	void *val, bool write)
	2209	{
	2210	struct kvm_device_attr kvmattr;
	2211	int err;
	2212
	2213	kvmattr.flags = 0;
	2214	kvmattr.group = group;
	2215	kvmattr.attr = attr;
	2216	kvmattr.addr = (uintptr_t)val;
	2217
	2218	err = kvm_device_ioctl(fd,
	2219	write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
	2220	&kvmattr);
	2221	if (err < 0) {
	2222	error_report("KVM_%s_DEVICE_ATTR failed: %s",
	2223	write ? "SET" : "GET", strerror(-err));
	2224	error_printf("Group %d attr 0x%016" PRIx64 "\n", group, attr);
	2225	abort();
	2226	}
	2227	}
	2228
	2229	/* Return 1 on success, 0 on failure */
	2230	int kvm_has_sync_mmu(void)
	2231	{
	2232	return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
	2233	}
	2234
	2235	int kvm_has_vcpu_events(void)
	2236	{
	2237	return kvm_state->vcpu_events;
	2238	}
	2239
	2240	int kvm_has_robust_singlestep(void)
	2241	{
	2242	return kvm_state->robust_singlestep;
	2243	}
	2244
	2245	int kvm_has_debugregs(void)
	2246	{
	2247	return kvm_state->debugregs;
	2248	}
	2249
	2250	int kvm_has_many_ioeventfds(void)
	2251	{
	2252	if (!kvm_enabled()) {
	2253	return 0;
	2254	}
	2255	return kvm_state->many_ioeventfds;
	2256	}
	2257
	/*
	 * Result of probing KVM_CAP_IRQ_ROUTING on the global kvm_state; always
	 * false when the headers do not define that capability.
	 */
	int kvm_has_gsi_routing(void)
	{
	#ifndef KVM_CAP_IRQ_ROUTING
	    return false;
	#else
	    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
	#endif
	}
	2266
	2267	int kvm_has_intx_set_mask(void)
	2268	{
	2269	return kvm_state->intx_set_mask;
	2270	}
	2271
	2272	#ifdef KVM_CAP_SET_GUEST_DEBUG
	2273	struct kvm_sw_breakpoint kvm_find_sw_breakpoint(CPUState cpu,
	2274	target_ulong pc)
	2275	{
	2276	struct kvm_sw_breakpoint *bp;
	2277
	2278	QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
	2279	if (bp->pc == pc) {
	2280	return bp;
	2281	}
	2282	}
	2283	return NULL;
	2284	}
	2285
	2286	int kvm_sw_breakpoints_active(CPUState *cpu)
	2287	{
	2288	return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
	2289	}
	2290
	/*
	 * Argument bundle handed to kvm_invoke_set_guest_debug() through
	 * run_on_cpu(): the request in @dbg and the ioctl result in @err.
	 */
	2291	struct kvm_set_guest_debug_data {
	2292	struct kvm_guest_debug dbg; /* passed to KVM_SET_GUEST_DEBUG */
	2293	int err; /* result of the KVM_SET_GUEST_DEBUG ioctl */
	2294	};
	2295
	2296	static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
	2297	{
	2298	struct kvm_set_guest_debug_data *dbg_data =
	2299	(struct kvm_set_guest_debug_data *) data.host_ptr;
	2300
	2301	dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
	2302	&dbg_data->dbg);
	2303	}
	2304
	2305	int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
	2306	{
	2307	struct kvm_set_guest_debug_data data;
	2308
	2309	data.dbg.control = reinject_trap;
	2310
	2311	if (cpu->singlestep_enabled) {
	2312	data.dbg.control \|= KVM_GUESTDBG_ENABLE \| KVM_GUESTDBG_SINGLESTEP;
	2313	}
	2314	kvm_arch_update_guest_debug(cpu, &data.dbg);
	2315
	2316	run_on_cpu(cpu, kvm_invoke_set_guest_debug,
	2317	RUN_ON_CPU_HOST_PTR(&data));
	2318	return data.err;
	2319	}
	2320
	2321	int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
	2322	target_ulong len, int type)
	2323	{
	2324	struct kvm_sw_breakpoint *bp;
	2325	int err;
	2326
	2327	if (type == GDB_BREAKPOINT_SW) {
	2328	bp = kvm_find_sw_breakpoint(cpu, addr);
	2329	if (bp) {
	2330	bp->use_count++;
	2331	return 0;
	2332	}
	2333
	2334	bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
	2335	bp->pc = addr;
	2336	bp->use_count = 1;
	2337	err = kvm_arch_insert_sw_breakpoint(cpu, bp);
	2338	if (err) {
	2339	g_free(bp);
	2340	return err;
	2341	}
	2342
	2343	QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
	2344	} else {
	2345	err = kvm_arch_insert_hw_breakpoint(addr, len, type);
	2346	if (err) {
	2347	return err;
	2348	}
	2349	}
	2350
	2351	CPU_FOREACH(cpu) {
	2352	err = kvm_update_guest_debug(cpu, 0);
	2353	if (err) {
	2354	return err;
	2355	}
	2356	}
	2357	return 0;
	2358	}
	2359
	2360	int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
	2361	target_ulong len, int type)
	2362	{
	2363	struct kvm_sw_breakpoint *bp;
	2364	int err;
	2365
	2366	if (type == GDB_BREAKPOINT_SW) {
	2367	bp = kvm_find_sw_breakpoint(cpu, addr);
	2368	if (!bp) {
	2369	return -ENOENT;
	2370	}
	2371
	2372	if (bp->use_count > 1) {
	2373	bp->use_count--;
	2374	return 0;
	2375	}
	2376
	2377	err = kvm_arch_remove_sw_breakpoint(cpu, bp);
	2378	if (err) {
	2379	return err;
	2380	}
	2381
	2382	QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
	2383	g_free(bp);
	2384	} else {
	2385	err = kvm_arch_remove_hw_breakpoint(addr, len, type);
	2386	if (err) {
	2387	return err;
	2388	}
	2389	}
	2390
	2391	CPU_FOREACH(cpu) {
	2392	err = kvm_update_guest_debug(cpu, 0);
	2393	if (err) {
	2394	return err;
	2395	}
	2396	}
	2397	return 0;
	2398	}
	2399
	2400	void kvm_remove_all_breakpoints(CPUState *cpu)
	2401	{
	2402	struct kvm_sw_breakpoint bp, next;
	2403	KVMState *s = cpu->kvm_state;
	2404	CPUState *tmpcpu;
	2405
	2406	QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
	2407	if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
	2408	/* Try harder to find a CPU that currently sees the breakpoint. */
	2409	CPU_FOREACH(tmpcpu) {
	2410	if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
	2411	break;
	2412	}
	2413	}
	2414	}
	2415	QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
	2416	g_free(bp);
	2417	}
	2418	kvm_arch_remove_all_hw_breakpoints();
	2419
	2420	CPU_FOREACH(cpu) {
	2421	kvm_update_guest_debug(cpu, 0);
	2422	}
	2423	}
	2424
	2425	#else /* !KVM_CAP_SET_GUEST_DEBUG */
	2426
	2427	int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
	2428	{
	2429	return -EINVAL;
	2430	}
	2431
	2432	int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
	2433	target_ulong len, int type)
	2434	{
	2435	return -EINVAL;
	2436	}
	2437
	2438	int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
	2439	target_ulong len, int type)
	2440	{
	2441	return -EINVAL;
	2442	}
	2443
	2444	void kvm_remove_all_breakpoints(CPUState *cpu)
	2445	{
	2446	}
	2447	#endif /* !KVM_CAP_SET_GUEST_DEBUG */
	2448
	2449	static int kvm_set_signal_mask(CPUState cpu, const sigset_t sigset)
	2450	{
	2451	KVMState *s = kvm_state;
	2452	struct kvm_signal_mask *sigmask;
	2453	int r;
	2454
	2455	sigmask = g_malloc(sizeof(sigmask) + sizeof(sigset));
	2456
	2457	sigmask->len = s->sigmask_len;
	2458	memcpy(sigmask->sigset, sigset, sizeof(*sigset));
	2459	r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
	2460	g_free(sigmask);
	2461
	2462	return r;
	2463	}
	2464
	2465	static void kvm_ipi_signal(int sig)
	2466	{
	2467	if (current_cpu) {
	2468	assert(kvm_immediate_exit);
	2469	kvm_cpu_kick(current_cpu);
	2470	}
	2471	}
	2472
	2473	void kvm_init_cpu_signals(CPUState *cpu)
	2474	{
	2475	int r;
	2476	sigset_t set;
	2477	struct sigaction sigact;
	2478
	2479	memset(&sigact, 0, sizeof(sigact));
	2480	sigact.sa_handler = kvm_ipi_signal;
	2481	sigaction(SIG_IPI, &sigact, NULL);
	2482
	2483	pthread_sigmask(SIG_BLOCK, NULL, &set);
	2484	#if defined KVM_HAVE_MCE_INJECTION
	2485	sigdelset(&set, SIGBUS);
	2486	pthread_sigmask(SIG_SETMASK, &set, NULL);
	2487	#endif
	2488	sigdelset(&set, SIG_IPI);
	2489	if (kvm_immediate_exit) {
	2490	r = pthread_sigmask(SIG_SETMASK, &set, NULL);
	2491	} else {
	2492	r = kvm_set_signal_mask(cpu, &set);
	2493	}
	2494	if (r) {
	2495	fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
	2496	exit(1);
	2497	}
	2498	}
	2499
	2500	/* Called asynchronously in VCPU thread. */
	2501	int kvm_on_sigbus_vcpu(CPUState cpu, int code, void addr)
	2502	{
	2503	#ifdef KVM_HAVE_MCE_INJECTION
	2504	if (have_sigbus_pending) {
	2505	return 1;
	2506	}
	2507	have_sigbus_pending = true;
	2508	pending_sigbus_addr = addr;
	2509	pending_sigbus_code = code;
	2510	atomic_set(&cpu->exit_request, 1);
	2511	return 0;
	2512	#else
	2513	return 1;
	2514	#endif
	2515	}
	2516
	/* Called synchronously (via signalfd) in main thread.
	 *
	 * With KVM_HAVE_MCE_INJECTION the event is forwarded to
	 * kvm_arch_on_sigbus_vcpu() on first_cpu and 0 is returned; without it
	 * the function just returns 1.
	 */
	int kvm_on_sigbus(int code, void *addr)
	{
	#ifndef KVM_HAVE_MCE_INJECTION
	    return 1;
	#else
	    /* An action-required MCE kills the process when SIGBUS is blocked.
	     * That is the situation in the I/O thread, where MCEs are handled
	     * via signalfd, so only action-optional events can arrive here.
	     */
	    assert(code != BUS_MCEERR_AR);
	    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
	    return 0;
	#endif
	}
	2532
	2533	int kvm_create_device(KVMState *s, uint64_t type, bool test)
	2534	{
	2535	int ret;
	2536	struct kvm_create_device create_dev;
	2537
	2538	create_dev.type = type;
	2539	create_dev.fd = -1;
	2540	create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
	2541
	2542	if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
	2543	return -ENOTSUP;
	2544	}
	2545
	2546	ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
	2547	if (ret) {
	2548	return ret;
	2549	}
	2550
	2551	return test ? 0 : create_dev.fd;
	2552	}
	2553
	2554	bool kvm_device_supported(int vmfd, uint64_t type)
	2555	{
	2556	struct kvm_create_device create_dev = {
	2557	.type = type,
	2558	.fd = -1,
	2559	.flags = KVM_CREATE_DEVICE_TEST,
	2560	};
	2561
	2562	if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
	2563	return false;
	2564	}
	2565
	2566	return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
	2567	}
	2568
	2569	int kvm_set_one_reg(CPUState cs, uint64_t id, void source)
	2570	{
	2571	struct kvm_one_reg reg;
	2572	int r;
	2573
	2574	reg.id = id;
	2575	reg.addr = (uintptr_t) source;
	2576	r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
	2577	if (r) {
	2578	trace_kvm_failed_reg_set(id, strerror(-r));
	2579	}
	2580	return r;
	2581	}
	2582
	2583	int kvm_get_one_reg(CPUState cs, uint64_t id, void target)
	2584	{
	2585	struct kvm_one_reg reg;
	2586	int r;
	2587
	2588	reg.id = id;
	2589	reg.addr = (uintptr_t) target;
	2590	r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
	2591	if (r) {
	2592	trace_kvm_failed_reg_get(id, strerror(-r));
	2593	}
	2594	return r;
	2595	}
	2596
	2597	static void kvm_accel_class_init(ObjectClass oc, void data)
	2598	{
	2599	AccelClass *ac = ACCEL_CLASS(oc);
	2600	ac->name = "KVM";
	2601	ac->init_machine = kvm_init;
	2602	ac->allowed = &kvm_allowed;
	2603	}
	2604
	2605	static const TypeInfo kvm_accel_type = {
	2606	.name = TYPE_KVM_ACCEL,
	2607	.parent = TYPE_ACCEL,
	2608	.class_init = kvm_accel_class_init,
	2609	.instance_size = sizeof(KVMState),
	2610	};
	2611
	2612	static void kvm_type_init(void)
	2613	{
	2614	type_register_static(&kvm_accel_type);
	2615	}
	2616
	2617	type_init(kvm_type_init);