#include "qemu/osdep.h"
#include <sys/ioctl.h>
-#include <sys/mman.h>
#include <linux/kvm.h>
#include "qemu/error-report.h"
#include "hw/hw.h"
#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "exec/gdbstub.h"
#include "sysemu/kvm_int.h"
+#include "sysemu/cpus.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "exec/address-spaces.h"
#include "qemu/event_notifier.h"
-#include "trace.h"
+#include "trace-root.h"
#include "hw/irq.h"
#include "hw/boards.h"
#define KVM_MSI_HASHTAB_SIZE 256
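+/* KVM cannot destroy an individual vCPU once it has been created, so when a
+ * CPU is unplugged its vCPU fd is "parked" here and reused if a vCPU with
+ * the same id is created again (see kvm_destroy_vcpu() and kvm_get_vcpu()).
+ */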
+struct KVMParkedVcpu {
+ unsigned long vcpu_id;
+ int kvm_fd;
+ QLIST_ENTRY(KVMParkedVcpu) node;
+};
+
struct KVMState
{
AccelState parent_obj;
QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
KVMMemoryListener memory_listener;
+ QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
};
KVMState *kvm_state;
bool kvm_vm_attributes_allowed;
bool kvm_direct_msi_allowed;
bool kvm_ioeventfd_any_length_allowed;
+bool kvm_msi_use_devid;
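+/* True when the host supports KVM_CAP_IMMEDIATE_EXIT; vCPUs are then kicked
+ * out of KVM_RUN by setting kvm_run->immediate_exit rather than by sending
+ * SIG_IPI.  See kvm_cpu_kick_self() and kvm_eat_signals().
+ */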
+static bool kvm_immediate_exit;
static const KVMCapabilityInfo kvm_required_capabilites[] = {
KVM_CAP_INFO(USER_MEMORY),
KVM_CAP_LAST_INFO
};
+int kvm_get_max_memslots(void)
+{
+ KVMState *s = KVM_STATE(current_machine->accelerator);
+
+ return s->nr_slots;
+}
+
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
KVMState *s = kvm_state;
return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}
+int kvm_destroy_vcpu(CPUState *cpu)
+{
+ KVMState *s = kvm_state;
+ long mmap_size;
+ struct KVMParkedVcpu *vcpu = NULL;
+ int ret = 0;
+
+ DPRINTF("kvm_destroy_vcpu\n");
+
+ mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
+ if (mmap_size < 0) {
+ ret = mmap_size;
+ DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
+ goto err;
+ }
+
+ ret = munmap(cpu->kvm_run, mmap_size);
+ if (ret < 0) {
+ goto err;
+ }
+
+ vcpu = g_malloc0(sizeof(*vcpu));
+ vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
+ vcpu->kvm_fd = cpu->kvm_fd;
+ QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+err:
+ return ret;
+}
+
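+/* Reuse a parked vCPU fd for this id if one exists; otherwise ask the kernel
+ * for a new one with KVM_CREATE_VCPU.
+ */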
+static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
+{
+ struct KVMParkedVcpu *cpu;
+
+ QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+ if (cpu->vcpu_id == vcpu_id) {
+ int kvm_fd;
+
+ QLIST_REMOVE(cpu, node);
+ kvm_fd = cpu->kvm_fd;
+ g_free(cpu);
+ return kvm_fd;
+ }
+ }
+
+ return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
+}
+
int kvm_init_vcpu(CPUState *cpu)
{
KVMState *s = kvm_state;
DPRINTF("kvm_init_vcpu\n");
- ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
+ ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
if (ret < 0) {
DPRINTF("kvm_create_vcpu failed\n");
goto err;
{
int ret;
+ if (kvm_gsi_direct_mapping()) {
+ return;
+ }
+
+ if (!kvm_gsi_routing_enabled()) {
+ return;
+ }
+
s->irq_routes->flags = 0;
+ trace_kvm_irqchip_commit_routes();
ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
assert(ret == 0);
}
*entry = *new_entry;
- kvm_irqchip_commit_routes(s);
-
return 0;
}
}
}
clear_gsi(s, virq);
+ kvm_arch_release_virq_post(virq);
}
static unsigned int kvm_hash_msi(uint32_t data)
return kvm_set_irq(s, route->kroute.gsi, 1);
}
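+/* Build the MSI route from the device's message for the given vector.  When
+ * dev is NULL the route starts out with an empty message and can be filled
+ * in later via kvm_irqchip_update_msi_route().
+ */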
-int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg, PCIDevice *dev)
+int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
struct kvm_irq_routing_entry kroute = {};
int virq;
+ MSIMessage msg = {0, 0};
+
+ if (dev) {
+ msg = pci_get_msi_message(dev, vector);
+ }
if (kvm_gsi_direct_mapping()) {
return kvm_arch_msi_data_to_gsi(msg.data);
kroute.u.msi.address_lo = (uint32_t)msg.address;
kroute.u.msi.address_hi = msg.address >> 32;
kroute.u.msi.data = le32_to_cpu(msg.data);
+ if (kvm_msi_devid_required()) {
+ kroute.flags = KVM_MSI_VALID_DEVID;
+ kroute.u.msi.devid = pci_requester_id(dev);
+ }
if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
kvm_irqchip_release_virq(s, virq);
return -EINVAL;
}
+ trace_kvm_irqchip_add_msi_route(virq);
+
kvm_add_routing_entry(s, &kroute);
+ kvm_arch_add_msi_route_post(&kroute, vector, dev);
kvm_irqchip_commit_routes(s);
return virq;
kroute.u.msi.address_lo = (uint32_t)msg.address;
kroute.u.msi.address_hi = msg.address >> 32;
kroute.u.msi.data = le32_to_cpu(msg.data);
+ if (kvm_msi_devid_required()) {
+ kroute.flags = KVM_MSI_VALID_DEVID;
+ kroute.u.msi.devid = pci_requester_id(dev);
+ }
if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
return -EINVAL;
}
+ trace_kvm_irqchip_update_msi_route(virq);
+
return kvm_update_routing_entry(s, &kroute);
}
abort();
}
-int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
+int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
return -ENOSYS;
}
return (ret) ? ret : kvm_recommended_vcpus(s);
}
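+/* The maximum vCPU id can be larger than the maximum number of vCPUs on
+ * targets with a sparse vCPU id space; fall back to kvm_max_vcpus() when
+ * KVM_CAP_MAX_VCPU_ID is not reported.
+ */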
+static int kvm_max_vcpu_id(KVMState *s)
+{
+ int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
+ return (ret) ? ret : kvm_max_vcpus(s);
+}
+
+bool kvm_vcpu_id_is_valid(int vcpu_id)
+{
+ KVMState *s = KVM_STATE(current_machine->accelerator);
+ return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
+}
+
static int kvm_init(MachineState *ms)
{
MachineClass *mc = MACHINE_GET_CLASS(ms);
#ifdef KVM_CAP_SET_GUEST_DEBUG
QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
+ QLIST_INIT(&s->kvm_parked_vcpus);
s->vmfd = -1;
s->fd = qemu_open("/dev/kvm", O_RDWR);
if (s->fd == -1) {
goto err;
}
+ kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
/* If unspecified, use the default value */
s->coalesced_flush_in_progress = false;
}
-static void do_kvm_cpu_synchronize_state(void *arg)
+static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
- CPUState *cpu = arg;
-
if (!cpu->kvm_vcpu_dirty) {
kvm_arch_get_registers(cpu);
cpu->kvm_vcpu_dirty = true;
void kvm_cpu_synchronize_state(CPUState *cpu)
{
if (!cpu->kvm_vcpu_dirty) {
- run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
+ run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
}
-static void do_kvm_cpu_synchronize_post_reset(void *arg)
+static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
- CPUState *cpu = arg;
-
kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
cpu->kvm_vcpu_dirty = false;
}
void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
- run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, cpu);
+ run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
-static void do_kvm_cpu_synchronize_post_init(void *arg)
+static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
- CPUState *cpu = arg;
-
kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
cpu->kvm_vcpu_dirty = false;
}
void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
- run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, cpu);
+ run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
+}
+
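+/* Called before loading a snapshot: mark the vCPU dirty so the register
+ * values restored by loadvm are pushed to KVM by the next
+ * kvm_arch_put_registers() instead of being overwritten by a fetch from the
+ * kernel.
+ */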
+static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
+{
+ cpu->kvm_vcpu_dirty = true;
+}
+
+void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
+{
+ run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
+}
+
+#ifdef KVM_HAVE_MCE_INJECTION
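+/* A SIGBUS taken in the vCPU thread (e.g. a machine check hitting guest
+ * memory) is only recorded here; the actual injection happens in
+ * kvm_cpu_exec() after KVM_RUN returns, with the iothread lock held.
+ */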
+static __thread void *pending_sigbus_addr;
+static __thread int pending_sigbus_code;
+static __thread bool have_sigbus_pending;
+#endif
+
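+/* Force a vCPU out of KVM_RUN.  kvm_cpu_kick() requires
+ * KVM_CAP_IMMEDIATE_EXIT; kvm_cpu_kick_self() falls back to the legacy
+ * SIG_IPI self-signal when the capability is absent.
+ */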
+static void kvm_cpu_kick(CPUState *cpu)
+{
+ atomic_set(&cpu->kvm_run->immediate_exit, 1);
+}
+
+static void kvm_cpu_kick_self(void)
+{
+ if (kvm_immediate_exit) {
+ kvm_cpu_kick(current_cpu);
+ } else {
+ qemu_cpu_kick_self();
+ }
+}
+
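+/* Consume a pending kick: either clear kvm_run->immediate_exit or drain any
+ * queued SIG_IPI so it is not left pending.
+ */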
+static void kvm_eat_signals(CPUState *cpu)
+{
+ struct timespec ts = { 0, 0 };
+ siginfo_t siginfo;
+ sigset_t waitset;
+ sigset_t chkset;
+ int r;
+
+ if (kvm_immediate_exit) {
+ atomic_set(&cpu->kvm_run->immediate_exit, 0);
+ /* Write kvm_run->immediate_exit before the cpu->exit_request
+ * write in kvm_cpu_exec.
+ */
+ smp_wmb();
+ return;
+ }
+
+ sigemptyset(&waitset);
+ sigaddset(&waitset, SIG_IPI);
+
+ do {
+ r = sigtimedwait(&waitset, &siginfo, &ts);
+ if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
+ perror("sigtimedwait");
+ exit(1);
+ }
+
+ r = sigpending(&chkset);
+ if (r == -1) {
+ perror("sigpending");
+ exit(1);
+ }
+ } while (sigismember(&chkset, SIG_IPI));
}
int kvm_cpu_exec(CPUState *cpu)
DPRINTF("kvm_cpu_exec()\n");
if (kvm_arch_process_async_events(cpu)) {
- cpu->exit_request = 0;
+ atomic_set(&cpu->exit_request, 0);
return EXCP_HLT;
}
}
kvm_arch_pre_run(cpu, run);
- if (cpu->exit_request) {
+ if (atomic_read(&cpu->exit_request)) {
DPRINTF("interrupt exit requested\n");
/*
* KVM requires us to reenter the kernel after IO exits to complete
* instruction emulation. This self-signal will ensure that we
* leave ASAP again.
*/
- qemu_cpu_kick_self();
+ kvm_cpu_kick_self();
}
+ /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
+ * Matching barrier in kvm_eat_signals.
+ */
+ smp_rmb();
+
run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
attrs = kvm_arch_post_run(cpu, run);
+#ifdef KVM_HAVE_MCE_INJECTION
+ if (unlikely(have_sigbus_pending)) {
+ qemu_mutex_lock_iothread();
+ kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
+ pending_sigbus_addr);
+ have_sigbus_pending = false;
+ qemu_mutex_unlock_iothread();
+ }
+#endif
+
if (run_ret < 0) {
if (run_ret == -EINTR || run_ret == -EAGAIN) {
DPRINTF("io window exit\n");
+ kvm_eat_signals(cpu);
ret = EXCP_INTERRUPT;
break;
}
break;
case KVM_EXIT_SHUTDOWN:
DPRINTF("shutdown\n");
- qemu_system_reset_request();
+ qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
ret = EXCP_INTERRUPT;
break;
case KVM_EXIT_UNKNOWN:
case KVM_EXIT_SYSTEM_EVENT:
switch (run->system_event.type) {
case KVM_SYSTEM_EVENT_SHUTDOWN:
- qemu_system_shutdown_request();
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
ret = EXCP_INTERRUPT;
break;
case KVM_SYSTEM_EVENT_RESET:
- qemu_system_reset_request();
+ qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
ret = EXCP_INTERRUPT;
break;
case KVM_SYSTEM_EVENT_CRASH:
+ kvm_cpu_synchronize_state(cpu);
qemu_mutex_lock_iothread();
- qemu_system_guest_panicked();
+ qemu_system_guest_panicked(cpu_get_crash_info(cpu));
qemu_mutex_unlock_iothread();
ret = 0;
break;
vm_stop(RUN_STATE_INTERNAL_ERROR);
}
- cpu->exit_request = 0;
+ atomic_set(&cpu->exit_request, 0);
return ret;
}
if (err < 0) {
error_report("KVM_%s_DEVICE_ATTR failed: %s",
write ? "SET" : "GET", strerror(-err));
- error_printf("Group %d attr 0x%016" PRIx64, group, attr);
+ error_printf("Group %d attr 0x%016" PRIx64 "\n", group, attr);
abort();
}
}
+/* Return 1 if the host kernel advertises KVM_CAP_SYNC_MMU, 0 otherwise */
int kvm_has_sync_mmu(void)
{
return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
return kvm_state->intx_set_mask;
}
-void kvm_setup_guest_memory(void *start, size_t size)
-{
- if (!kvm_has_sync_mmu()) {
- int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
-
- if (ret) {
- perror("qemu_madvise");
- fprintf(stderr,
- "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
- exit(1);
- }
- }
-}
-
#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
target_ulong pc)
struct kvm_set_guest_debug_data {
struct kvm_guest_debug dbg;
- CPUState *cpu;
int err;
};
-static void kvm_invoke_set_guest_debug(void *data)
+static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
- struct kvm_set_guest_debug_data *dbg_data = data;
+ struct kvm_set_guest_debug_data *dbg_data =
+ (struct kvm_set_guest_debug_data *) data.host_ptr;
- dbg_data->err = kvm_vcpu_ioctl(dbg_data->cpu, KVM_SET_GUEST_DEBUG,
+ dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
&dbg_data->dbg);
}
data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
}
kvm_arch_update_guest_debug(cpu, &data.dbg);
- data.cpu = cpu;
- run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data);
+ run_on_cpu(cpu, kvm_invoke_set_guest_debug,
+ RUN_ON_CPU_HOST_PTR(&data));
return data.err;
}
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
-int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
+static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
KVMState *s = kvm_state;
struct kvm_signal_mask *sigmask;
int r;
- if (!sigset) {
- return kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, NULL);
- }
-
sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
sigmask->len = s->sigmask_len;
return r;
}
+
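+/* SIG_IPI handler.  It only runs when KVM_CAP_IMMEDIATE_EXIT is in use (the
+ * legacy path keeps SIG_IPI blocked outside KVM_RUN) and simply turns the
+ * signal into an immediate-exit request.
+ */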
+static void kvm_ipi_signal(int sig)
+{
+ if (current_cpu) {
+ assert(kvm_immediate_exit);
+ kvm_cpu_kick(current_cpu);
+ }
+}
+
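+/* Per-vCPU signal setup: install the SIG_IPI handler, then either unblock
+ * SIG_IPI in this thread (immediate-exit case) or hand the mask to
+ * KVM_SET_SIGNAL_MASK so SIG_IPI is only delivered while in KVM_RUN.
+ */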
+void kvm_init_cpu_signals(CPUState *cpu)
+{
+ int r;
+ sigset_t set;
+ struct sigaction sigact;
+
+ memset(&sigact, 0, sizeof(sigact));
+ sigact.sa_handler = kvm_ipi_signal;
+ sigaction(SIG_IPI, &sigact, NULL);
+
+ pthread_sigmask(SIG_BLOCK, NULL, &set);
+#if defined KVM_HAVE_MCE_INJECTION
+ sigdelset(&set, SIGBUS);
+ pthread_sigmask(SIG_SETMASK, &set, NULL);
+#endif
+ sigdelset(&set, SIG_IPI);
+ if (kvm_immediate_exit) {
+ r = pthread_sigmask(SIG_SETMASK, &set, NULL);
+ } else {
+ r = kvm_set_signal_mask(cpu, &set);
+ }
+ if (r) {
+ fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
+ exit(1);
+ }
+}
+
+/* Called asynchronously in VCPU thread. */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
- return kvm_arch_on_sigbus_vcpu(cpu, code, addr);
+#ifdef KVM_HAVE_MCE_INJECTION
+ if (have_sigbus_pending) {
+ return 1;
+ }
+ have_sigbus_pending = true;
+ pending_sigbus_addr = addr;
+ pending_sigbus_code = code;
+ atomic_set(&cpu->exit_request, 1);
+ return 0;
+#else
+ return 1;
+#endif
}
+/* Called synchronously (via signalfd) in main thread. */
int kvm_on_sigbus(int code, void *addr)
{
- return kvm_arch_on_sigbus(code, addr);
+#ifdef KVM_HAVE_MCE_INJECTION
+ /* Action required MCE kills the process if SIGBUS is blocked. Because
+ * that's what happens in the I/O thread, where we handle MCE via signalfd,
+ * we can only get action optional here.
+ */
+ assert(code != BUS_MCEERR_AR);
+ kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
+ return 0;
+#else
+ return 1;
+#endif
}
int kvm_create_device(KVMState *s, uint64_t type, bool test)