2 * ARM implementation of KVM hooks
4 * Copyright Christoffer Dall 2009-2010
6 * This work is licensed under the terms of the GNU GPL, version 2 or later.
7 * See the COPYING file in the top-level directory.
11 #include "qemu/osdep.h"
12 #include <sys/ioctl.h>
14 #include <linux/kvm.h>
16 #include "qemu-common.h"
17 #include "qemu/timer.h"
18 #include "qemu/error-report.h"
19 #include "qemu/main-loop.h"
20 #include "sysemu/sysemu.h"
21 #include "sysemu/kvm.h"
22 #include "sysemu/kvm_int.h"
26 #include "internals.h"
27 #include "hw/pci/pci.h"
28 #include "exec/memattrs.h"
29 #include "exec/address-spaces.h"
30 #include "hw/boards.h"
/* Table of KVM capabilities for this architecture (required ones, judging
 * by the name).  Only the opening line of the initializer is visible in
 * this excerpt; its entries are not shown.
 */
34 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
/* File-scope state shared by the functions in this file:
 *   cap_has_mp_state          - result of the KVM_CAP_MP_STATE check made in
 *                               kvm_arch_init()
 *   cap_has_inject_serror_esr - result of the KVM_CAP_ARM_INJECT_SERROR_ESR
 *                               check made in kvm_arm_init_serror_injection()
 *   arm_host_cpu_features     - filled by kvm_arm_get_host_cpu_features() in
 *                               kvm_arm_set_cpu_features_from_host() and
 *                               reused once its dtb_compatible member is set
 */
38 static bool cap_has_mp_state;
39 static bool cap_has_inject_serror_esr;
41 static ARMHostCPUFeatures arm_host_cpu_features;
43 int kvm_arm_vcpu_init(CPUState *cs)
45 ARMCPU *cpu = ARM_CPU(cs);
46 struct kvm_vcpu_init init;
48 init.target = cpu->kvm_target;
49 memcpy(init.features, cpu->kvm_init_features, sizeof(init.features));
51 return kvm_vcpu_ioctl(cs, KVM_ARM_VCPU_INIT, &init);
54 void kvm_arm_init_serror_injection(CPUState *cs)
56 cap_has_inject_serror_esr = kvm_check_extension(cs->kvm_state,
57 KVM_CAP_ARM_INJECT_SERROR_ESR);
/* Create a scratch VM and VCPU on the host.
 *
 * Visible steps: open /dev/kvm, create a VM (KVM_CREATE_VM) and a VCPU
 * (KVM_CREATE_VCPU).  VCPU initialization can be skipped at the caller's
 * request.  Otherwise the kernel is asked for its preferred target
 * (KVM_ARM_PREFERRED_TARGET) and the VCPU is initialized with it; if that
 * path is not taken and @cpus_to_try is non-NULL, each target in the list
 * (terminated by QEMU_KVM_ARM_TARGET_NONE) is tried with zeroed feature
 * words.  A NULL @cpus_to_try is treated like an empty list.
 *
 * NOTE(review): the error checks, the success path and the return
 * statements are not visible in this excerpt.
 */
60 bool kvm_arm_create_scratch_host_vcpu(const uint32_t *cpus_to_try,
62 struct kvm_vcpu_init *init)
64 int ret, kvmfd = -1, vmfd = -1, cpufd = -1;
66 kvmfd = qemu_open("/dev/kvm", O_RDWR);
70 vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
74 cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
80 /* Caller doesn't want the VCPU to be initialized, so skip it */
84 ret = ioctl(vmfd, KVM_ARM_PREFERRED_TARGET, init);
86 ret = ioctl(cpufd, KVM_ARM_VCPU_INIT, init);
90 } else if (cpus_to_try) {
91 /* Old kernel which doesn't know about the
92 * PREFERRED_TARGET ioctl: we know it will only support
93 * creating one kind of guest CPU which is its preferred
96 while (*cpus_to_try != QEMU_KVM_ARM_TARGET_NONE) {
97 init->target = *cpus_to_try++;
98 memset(init->features, 0, sizeof(init->features));
99 ret = ioctl(cpufd, KVM_ARM_VCPU_INIT, init);
108 /* Treat a NULL cpus_to_try argument the same as an empty
109 * list, which means we will fail the call since this must
110 * be an old kernel which doesn't support PREFERRED_TARGET.
/* Tear down a scratch VCPU described by @fdarray (per the function name).
 * The visible loop walks indices 2 down to 0; the loop body is not visible
 * in this excerpt -- presumably it closes each descriptor, TODO confirm.
 */
136 void kvm_arm_destroy_scratch_host_vcpu(int *fdarray)
140 for (i = 2; i >= 0; i--) {
/* Copy the probed host CPU features onto @cpu.
 *
 * arm_host_cpu_features.dtb_compatible being set is used as the "already
 * probed" marker, so kvm_arm_get_host_cpu_features() is only called while it
 * is still unset.  If KVM is not enabled or the probe fails, @cpu is marked
 * with QEMU_KVM_ARM_TARGET_NONE and host_cpu_probe_failed so the error can
 * be reported later (the early exit after flagging is not visible in this
 * excerpt).  The final four assignments copy the cached target,
 * dtb_compatible, isar and features values into @cpu and its env.
 */
145 void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu)
147 CPUARMState *env = &cpu->env;
149 if (!arm_host_cpu_features.dtb_compatible) {
150 if (!kvm_enabled() ||
151 !kvm_arm_get_host_cpu_features(&arm_host_cpu_features)) {
152 /* We can't report this error yet, so flag that we need to
153 * in arm_cpu_realizefn().
155 cpu->kvm_target = QEMU_KVM_ARM_TARGET_NONE;
156 cpu->host_cpu_probe_failed = true;
161 cpu->kvm_target = arm_host_cpu_features.target;
162 cpu->dtb_compatible = arm_host_cpu_features.dtb_compatible;
163 cpu->isar = arm_host_cpu_features.isar;
164 env->features = arm_host_cpu_features.features;
167 int kvm_arm_get_max_vm_ipa_size(MachineState *ms)
169 KVMState *s = KVM_STATE(ms->accelerator);
172 ret = kvm_check_extension(s, KVM_CAP_ARM_VM_IPA_SIZE);
173 return ret > 0 ? ret : 40;
/* Architecture-specific KVM setup.
 *
 * Enables asynchronous interrupt delivery and in-kernel halt, and records
 * in cap_has_mp_state whether the host reports KVM_CAP_MP_STATE.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
176 int kvm_arch_init(MachineState *ms, KVMState *s)
178 /* For ARM interrupt delivery is always asynchronous,
179 * whether we are using an in-kernel VGIC or not.
181 kvm_async_interrupts_allowed = true;
184 * PSCI wakes up secondary cores, so we always need to
185 * have vCPUs waiting in kernel space
187 kvm_halt_in_kernel_allowed = true;
189 cap_has_mp_state = kvm_check_extension(s, KVM_CAP_MP_STATE);
194 unsigned long kvm_arch_vcpu_id(CPUState *cpu)
196 return cpu->cpu_index;
/* We track all the KVM devices which need their memory addresses
 * passed to the kernel in a list of these structures.
 * When board init is complete we run through the list and
 * tell the kernel the base addresses of the memory regions.
 * We use a MemoryListener to track mapping and unmapping of
 * the regions during board creation, so the board models don't
 * need to do anything special for the KVM case.
 *
 * Sometimes the address must be OR'ed with some other fields
 * (for example for KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION).
 * @kda_addr_ormask aims at storing the value of those fields.
 */
/* One tracked KVM device.  Fields visible in this excerpt:
 *   kda             - argument block for KVM_ARM_SET_DEVICE_ADDR; kda.addr
 *                     holds the region's offset within the address space
 *   kdattr          - attribute passed to KVM_SET_DEVICE_ATTR
 *   kda_addr_ormask - bits OR'ed into the address on the device-attr path
 *   entries         - QSLIST linkage into kvm_devices_head
 * Other members used elsewhere in this file (mr, dev_fd) are not visible
 * here.
 */
211 typedef struct KVMDevice {
212 struct kvm_arm_device_addr kda;
213 struct kvm_device_attr kdattr;
214 uint64_t kda_addr_ormask;
216 QSLIST_ENTRY(KVMDevice) entries;
/* Head of the singly-linked list of registered KVMDevice entries. */
220 static QSLIST_HEAD(, KVMDevice) kvm_devices_head;
222 static void kvm_arm_devlistener_add(MemoryListener *listener,
223 MemoryRegionSection *section)
227 QSLIST_FOREACH(kd, &kvm_devices_head, entries) {
228 if (section->mr == kd->mr) {
229 kd->kda.addr = section->offset_within_address_space;
/* MemoryListener region_del hook: walks the tracked devices looking for the
 * one whose memory region is the one in @section.
 * NOTE(review): the action taken on a match is not visible in this excerpt.
 */
234 static void kvm_arm_devlistener_del(MemoryListener *listener,
235 MemoryRegionSection *section)
239 QSLIST_FOREACH(kd, &kvm_devices_head, entries) {
240 if (section->mr == kd->mr) {
/* Listener that routes region add/del events to kvm_arm_devlistener_add()
 * and kvm_arm_devlistener_del().
 */
246 static MemoryListener devlistener = {
247 .region_add = kvm_arm_devlistener_add,
248 .region_del = kvm_arm_devlistener_del,
/* Pass the address recorded for @kd to the kernel.
 *
 * When @kd has a device fd (dev_fd >= 0) the address, OR'ed with
 * kda_addr_ormask, is handed over through KVM_SET_DEVICE_ATTR.  The other
 * visible call uses the KVM_ARM_SET_DEVICE_ADDR VM ioctl with kd->kda.
 * A failure message is printed to stderr; what follows the message is not
 * visible in this excerpt.
 */
251 static void kvm_arm_set_device_addr(KVMDevice *kd)
253 struct kvm_device_attr *attr = &kd->kdattr;
256 /* If the device control API is available and we have a device fd on the
257 * KVMDevice struct, let's use the newer API
259 if (kd->dev_fd >= 0) {
260 uint64_t addr = kd->kda.addr;
262 addr |= kd->kda_addr_ormask;
263 attr->addr = (uintptr_t)&addr;
264 ret = kvm_device_ioctl(kd->dev_fd, KVM_SET_DEVICE_ATTR, attr);
266 ret = kvm_vm_ioctl(kvm_state, KVM_ARM_SET_DEVICE_ADDR, &kd->kda);
270 fprintf(stderr, "Failed to set device address: %s\n",
/* Machine-init-done notifier callback.
 *
 * Walks the tracked devices; each one whose kda.addr is not -1 has its
 * address pushed to the kernel via kvm_arm_set_device_addr().  Every entry
 * then drops its memory region reference and is removed from the list head.
 * Once the list has been processed the memory listener is unregistered.
 */
276 static void kvm_arm_machine_init_done(Notifier *notifier, void *data)
280 QSLIST_FOREACH_SAFE(kd, &kvm_devices_head, entries, tkd) {
281 if (kd->kda.addr != -1) {
282 kvm_arm_set_device_addr(kd);
284 memory_region_unref(kd->mr);
285 QSLIST_REMOVE_HEAD(&kvm_devices_head, entries);
288 memory_listener_unregister(&devlistener);
/* Notifier whose callback is kvm_arm_machine_init_done(). */
291 static Notifier notify = {
292 .notify = kvm_arm_machine_init_done,
295 void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group,
296 uint64_t attr, int dev_fd, uint64_t addr_ormask)
300 if (!kvm_irqchip_in_kernel()) {
304 if (QSLIST_EMPTY(&kvm_devices_head)) {
305 memory_listener_register(&devlistener, &address_space_memory);
306 qemu_add_machine_init_done_notifier(¬ify);
308 kd = g_new0(KVMDevice, 1);
312 kd->kdattr.flags = 0;
313 kd->kdattr.group = group;
314 kd->kdattr.attr = attr;
316 kd->kda_addr_ormask = addr_ormask;
317 QSLIST_INSERT_HEAD(&kvm_devices_head, kd, entries);
318 memory_region_ref(kd->mr);
/* qsort() comparator ordering uint64_t values in ascending order.
 *
 * Returns 1 if *a > *b, -1 if *a < *b and 0 if they are equal.  The result
 * is built from comparisons rather than a subtraction, so it is correct for
 * the full 64-bit range.
 */
static int compare_u64(const void *a, const void *b)
{
    const uint64_t lhs = *(const uint64_t *)a;
    const uint64_t rhs = *(const uint64_t *)b;

    return (lhs > rhs) - (lhs < rhs);
}
/* Initialize the ARMCPU cpreg list according to the kernel's
 * definition of what CPU registers it knows about (and throw away
 * the previous TCG-created cpreg list).
 */
/* Rebuild @cpu's cpreg arrays from the register list reported by KVM.
 *
 * Steps visible in this excerpt:
 *  - KVM_GET_REG_LIST is issued twice: once on a stack kvm_reg_list, then
 *    on a heap buffer sized from rl.n.
 *  - The returned list is sorted with compare_u64.
 *  - A first pass over the list checks kvm_arm_reg_syncs_via_cpreg_list()
 *    and complains about register sizes other than U32/U64.
 *  - The four cpreg arrays are resized to arraylen and both length fields
 *    are set to it.
 *  - A second pass stores the register indexes into cpreg_indexes.
 *  - write_kvmstate_to_list() performs the initial read of the values.
 * NOTE(review): the error checks after each ioctl, the counting of
 * arraylen, the cleanup and the return statements are not visible here.
 */
336 int kvm_arm_init_cpreg_list(ARMCPU *cpu)
338 struct kvm_reg_list rl;
339 struct kvm_reg_list *rlp;
340 int i, ret, arraylen;
341 CPUState *cs = CPU(cpu);
344 ret = kvm_vcpu_ioctl(cs, KVM_GET_REG_LIST, &rl);
348 rlp = g_malloc(sizeof(struct kvm_reg_list) + rl.n * sizeof(uint64_t));
350 ret = kvm_vcpu_ioctl(cs, KVM_GET_REG_LIST, rlp);
354 /* Sort the list we get back from the kernel, since cpreg_tuples
355 * must be in strictly ascending order.
357 qsort(&rlp->reg, rlp->n, sizeof(rlp->reg[0]), compare_u64);
359 for (i = 0, arraylen = 0; i < rlp->n; i++) {
360 if (!kvm_arm_reg_syncs_via_cpreg_list(rlp->reg[i])) {
363 switch (rlp->reg[i] & KVM_REG_SIZE_MASK) {
364 case KVM_REG_SIZE_U32:
365 case KVM_REG_SIZE_U64:
368 fprintf(stderr, "Can't handle size of register in kernel list\n");
376 cpu->cpreg_indexes = g_renew(uint64_t, cpu->cpreg_indexes, arraylen);
377 cpu->cpreg_values = g_renew(uint64_t, cpu->cpreg_values, arraylen);
378 cpu->cpreg_vmstate_indexes = g_renew(uint64_t, cpu->cpreg_vmstate_indexes,
380 cpu->cpreg_vmstate_values = g_renew(uint64_t, cpu->cpreg_vmstate_values,
382 cpu->cpreg_array_len = arraylen;
383 cpu->cpreg_vmstate_array_len = arraylen;
385 for (i = 0, arraylen = 0; i < rlp->n; i++) {
386 uint64_t regidx = rlp->reg[i];
387 if (!kvm_arm_reg_syncs_via_cpreg_list(regidx)) {
390 cpu->cpreg_indexes[arraylen] = regidx;
393 assert(cpu->cpreg_array_len == arraylen);
395 if (!write_kvmstate_to_list(cpu)) {
396 /* Shouldn't happen unless kernel is inconsistent about
397 * what registers exist.
399 fprintf(stderr, "Initial read of kernel register state failed\n");
/* Read every register listed in cpu->cpreg_indexes from KVM into
 * cpu->cpreg_values using KVM_GET_ONE_REG.
 *
 * 32-bit registers are read into a temporary (v32) and then stored in the
 * 64-bit slot; 64-bit registers are read directly into the slot.
 * NOTE(review): the handling of ioctl failures and the return value are
 * not visible in this excerpt.
 */
409 bool write_kvmstate_to_list(ARMCPU *cpu)
411 CPUState *cs = CPU(cpu);
415 for (i = 0; i < cpu->cpreg_array_len; i++) {
416 struct kvm_one_reg r;
417 uint64_t regidx = cpu->cpreg_indexes[i];
423 switch (regidx & KVM_REG_SIZE_MASK) {
424 case KVM_REG_SIZE_U32:
425 r.addr = (uintptr_t)&v32;
426 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
428 cpu->cpreg_values[i] = v32;
431 case KVM_REG_SIZE_U64:
432 r.addr = (uintptr_t)(cpu->cpreg_values + i);
433 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
/* Write the values in cpu->cpreg_values back to KVM with KVM_SET_ONE_REG.
 *
 * Each register's kvm_arm_cpreg_level() is compared against @level; what
 * happens when it is greater is not visible here (presumably the register
 * is skipped -- TODO confirm).  32-bit registers are written from a
 * temporary (v32), 64-bit registers directly from their slot.
 * NOTE(review): the handling of a failed ioctl and the return value are not
 * visible in this excerpt.
 */
445 bool write_list_to_kvmstate(ARMCPU *cpu, int level)
447 CPUState *cs = CPU(cpu);
451 for (i = 0; i < cpu->cpreg_array_len; i++) {
452 struct kvm_one_reg r;
453 uint64_t regidx = cpu->cpreg_indexes[i];
457 if (kvm_arm_cpreg_level(regidx) > level) {
462 switch (regidx & KVM_REG_SIZE_MASK) {
463 case KVM_REG_SIZE_U32:
464 v32 = cpu->cpreg_values[i];
465 r.addr = (uintptr_t)&v32;
467 case KVM_REG_SIZE_U64:
468 r.addr = (uintptr_t)(cpu->cpreg_values + i);
473 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
475 /* We might fail for "unknown register" and also for
476 * "you tried to set a register which is constant with
477 * a different value from what it actually contains".
/* Reset @cpu's KVM state.
 *
 * Re-runs kvm_arm_vcpu_init() so the kernel resets the registers, reads the
 * reset values back with write_kvmstate_to_list(), and copies them into the
 * CPU state with write_list_to_cpustate().  Failures of the first two steps
 * print a message to stderr; what follows each message is not visible in
 * this excerpt.
 */
485 void kvm_arm_reset_vcpu(ARMCPU *cpu)
489 /* Re-init VCPU so that all registers are set to
490 * their respective reset values.
492 ret = kvm_arm_vcpu_init(CPU(cpu));
494 fprintf(stderr, "kvm_arm_vcpu_init failed: %s\n", strerror(-ret));
497 if (!write_kvmstate_to_list(cpu)) {
498 fprintf(stderr, "write_kvmstate_to_list failed\n");
502 * Sync the reset values also into the CPUState. This is necessary
503 * because the next thing we do will be a kvm_arch_put_registers()
504 * which will update the list values from the CPUState before copying
505 * the list values back to KVM. It's OK to ignore failure returns here
506 * for the same reason we do so in kvm_arch_get_registers().
508 write_list_to_cpustate(cpu);
/* Update KVM's MP_STATE based on what QEMU thinks it is */
/* Only acts when cap_has_mp_state is set.  A power_state of PSCI_OFF is
 * sent as KVM_MP_STATE_STOPPED, anything else as KVM_MP_STATE_RUNNABLE,
 * using KVM_SET_MP_STATE.  A failure is reported on stderr.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
514 int kvm_arm_sync_mpstate_to_kvm(ARMCPU *cpu)
516 if (cap_has_mp_state) {
517 struct kvm_mp_state mp_state = {
518 .mp_state = (cpu->power_state == PSCI_OFF) ?
519 KVM_MP_STATE_STOPPED : KVM_MP_STATE_RUNNABLE
521 int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
523 fprintf(stderr, "%s: failed to set MP_STATE %d/%s\n",
524 __func__, ret, strerror(-ret));
/* Sync the KVM MP_STATE into QEMU */
/* Only acts when cap_has_mp_state is set.  Fetches the state with
 * KVM_GET_MP_STATE and derives cpu->power_state from whether it equals
 * KVM_MP_STATE_STOPPED.  A failure is reported on stderr.
 * NOTE(review): the two power_state values chosen and the return
 * statements are not visible in this excerpt.
 */
535 int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu)
537 if (cap_has_mp_state) {
538 struct kvm_mp_state mp_state;
539 int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MP_STATE, &mp_state);
541 fprintf(stderr, "%s: failed to get MP_STATE %d/%s\n",
542 __func__, ret, strerror(-ret));
545 cpu->power_state = (mp_state.mp_state == KVM_MP_STATE_STOPPED) ?
/* Push the pending-SError state from @cpu's env to KVM.
 *
 * Builds a zeroed kvm_vcpu_events, copies env->serror.pending into it and,
 * when cap_has_inject_serror_esr is set, also the has_esr flag and the esr
 * value, then issues KVM_SET_VCPU_EVENTS.
 * NOTE(review): the behaviour when kvm_has_vcpu_events() is false and the
 * return statements are not visible in this excerpt.
 */
552 int kvm_put_vcpu_events(ARMCPU *cpu)
554 CPUARMState *env = &cpu->env;
555 struct kvm_vcpu_events events;
558 if (!kvm_has_vcpu_events()) {
562 memset(&events, 0, sizeof(events));
563 events.exception.serror_pending = env->serror.pending;
565 /* Inject SError to guest with specified syndrome if host kernel
566 * supports it, otherwise inject SError without syndrome.
568 if (cap_has_inject_serror_esr) {
569 events.exception.serror_has_esr = env->serror.has_esr;
570 events.exception.serror_esr = env->serror.esr;
573 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
575 error_report("failed to put vcpu events");
/* Fetch the SError state from KVM into @cpu's env.
 *
 * Issues KVM_GET_VCPU_EVENTS on a zeroed kvm_vcpu_events and copies the
 * serror_pending, serror_has_esr and serror_esr fields into env->serror.
 * NOTE(review): the behaviour when kvm_has_vcpu_events() is false and the
 * return statements are not visible in this excerpt.
 */
581 int kvm_get_vcpu_events(ARMCPU *cpu)
583 CPUARMState *env = &cpu->env;
584 struct kvm_vcpu_events events;
587 if (!kvm_has_vcpu_events()) {
591 memset(&events, 0, sizeof(events));
592 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
594 error_report("failed to get vcpu events");
598 env->serror.pending = events.exception.serror_pending;
599 env->serror.has_esr = events.exception.serror_has_esr;
600 env->serror.esr = events.exception.serror_esr;
/* Arch hook taking the vCPU and its kvm_run structure (run before entering
 * the guest, judging by the name).  The body is not visible in this excerpt.
 */
605 void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
/* Arch hook that syncs device interrupt levels reported in @run.
 *
 * With an in-kernel irqchip nothing is done.  Otherwise, if
 * run->s.regs.device_irq_level differs from the cached
 * cpu->device_irq_level, the changed bits are computed and, with the
 * iothread lock held, each known changed line (virtual timer, physical
 * timer, PMU) is forwarded to its qemu_irq with the new level.  Remaining
 * unknown bits are logged as LOG_UNIMP, and the cached level is updated.
 *
 * Both visible return statements yield MEMTXATTRS_UNSPECIFIED.
 */
609 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
612 uint32_t switched_level;
614 if (kvm_irqchip_in_kernel()) {
616 * We only need to sync timer states with user-space interrupt
617 * controllers, so return early and save cycles if we don't.
619 return MEMTXATTRS_UNSPECIFIED;
624 /* Synchronize our shadowed in-kernel device irq lines with the kvm ones */
625 if (run->s.regs.device_irq_level != cpu->device_irq_level) {
626 switched_level = cpu->device_irq_level ^ run->s.regs.device_irq_level;
628 qemu_mutex_lock_iothread();
630 if (switched_level & KVM_ARM_DEV_EL1_VTIMER) {
631 qemu_set_irq(cpu->gt_timer_outputs[GTIMER_VIRT],
632 !!(run->s.regs.device_irq_level &
633 KVM_ARM_DEV_EL1_VTIMER));
634 switched_level &= ~KVM_ARM_DEV_EL1_VTIMER;
637 if (switched_level & KVM_ARM_DEV_EL1_PTIMER) {
638 qemu_set_irq(cpu->gt_timer_outputs[GTIMER_PHYS],
639 !!(run->s.regs.device_irq_level &
640 KVM_ARM_DEV_EL1_PTIMER));
641 switched_level &= ~KVM_ARM_DEV_EL1_PTIMER;
644 if (switched_level & KVM_ARM_DEV_PMU) {
645 qemu_set_irq(cpu->pmu_interrupt,
646 !!(run->s.regs.device_irq_level & KVM_ARM_DEV_PMU));
647 switched_level &= ~KVM_ARM_DEV_PMU;
650 if (switched_level) {
651 qemu_log_mask(LOG_UNIMP, "%s: unhandled in-kernel device IRQ %x\n",
652 __func__, switched_level);
655 /* We also mark unknown levels as processed to not waste cycles */
656 cpu->device_irq_level = run->s.regs.device_irq_level;
657 qemu_mutex_unlock_iothread();
660 return MEMTXATTRS_UNSPECIFIED;
/* Dispatch on run->exit_reason.
 *
 * One visible case passes run->debug.arch to kvm_arm_handle_debug(); exit
 * reasons with no handler are logged as LOG_UNIMP.
 * NOTE(review): the case labels and the return values are not visible in
 * this excerpt.
 */
664 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
668 switch (run->exit_reason) {
670 if (kvm_arm_handle_debug(cs, &run->debug.arch)) {
672 } /* otherwise return to guest */
675 qemu_log_mask(LOG_UNIMP, "%s: un-handled exit reason %d\n",
676 __func__, run->exit_reason);
/* Arch hook returning a bool for @cs; the body (and therefore the value
 * returned) is not visible in this excerpt.
 */
682 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
/* Arch hook returning an int for @cs; the body is not visible in this
 * excerpt.
 */
687 int kvm_arch_process_async_events(CPUState *cs)
/* The #ifdef protections are until 32bit headers are imported and can
 * be removed once both 32 and 64 bit reach feature parity.
 */
/* Fill in @dbg for guest debugging.
 *
 * When software breakpoints are active, KVM_GUESTDBG_ENABLE and
 * KVM_GUESTDBG_USE_SW_BP are set in dbg->control.  When hardware debug is
 * active, KVM_GUESTDBG_ENABLE and KVM_GUESTDBG_USE_HW are set and the
 * hardware debug data is copied into dbg->arch.  Each part is compiled only
 * if the corresponding KVM_GUESTDBG_* macro is defined.
 */
695 void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
697 #ifdef KVM_GUESTDBG_USE_SW_BP
698 if (kvm_sw_breakpoints_active(cs)) {
699 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
702 #ifdef KVM_GUESTDBG_USE_HW
703 if (kvm_arm_hw_debug_active(cs)) {
704 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW;
705 kvm_arm_copy_hw_debug_data(&dbg->arch);
/* Arch hook for IRQ routing setup (per its name); the body is not visible
 * in this excerpt.
 */
710 void kvm_arch_init_irq_routing(KVMState *s)
714 int kvm_arch_irqchip_create(MachineState *ms, KVMState *s)
716 if (machine_kernel_irqchip_split(ms)) {
717 perror("-machine kernel_irqchip=split is not supported on ARM.");
721 /* If we can create the VGIC using the newer device control API, we
722 * let the device do this when it initializes itself, otherwise we
723 * fall back to the old API */
724 return kvm_check_extension(s, KVM_CAP_DEVICE_CTRL);
/* Probe which VGIC device type the kernel can create.
 *
 * Calls kvm_create_device() with KVM_DEV_TYPE_ARM_VGIC_V3 first and, if
 * that does not return 0, with KVM_DEV_TYPE_ARM_VGIC_V2 (third argument
 * true in both calls).
 * NOTE(review): the values returned for each outcome are not visible in
 * this excerpt.
 */
727 int kvm_arm_vgic_probe(void)
729 if (kvm_create_device(kvm_state,
730 KVM_DEV_TYPE_ARM_VGIC_V3, true) == 0) {
732 } else if (kvm_create_device(kvm_state,
733 KVM_DEV_TYPE_ARM_VGIC_V2, true) == 0) {
/* Rewrite the MSI address in @route when @dev sits behind an IOMMU.
 *
 * The device's IOMMU address space is compared against
 * address_space_memory; for any other address space @address is translated
 * with address_space_translate(), the resulting region is looked up with
 * memory_region_find(), and the section's offset within its address space
 * is written to route->u.msi.address_lo/address_hi as the doorbell address.
 * NOTE(review): the early-exit paths, any locking around the translation
 * and the return values are not visible in this excerpt.
 */
740 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
741 uint64_t address, uint32_t data, PCIDevice *dev)
743 AddressSpace *as = pci_device_iommu_address_space(dev);
744 hwaddr xlat, len, doorbell_gpa;
745 MemoryRegionSection mrs;
749 if (as == &address_space_memory) {
753 /* MSI doorbell address is translated by an IOMMU */
756 mr = address_space_translate(as, address, &xlat, &len, true,
757 MEMTXATTRS_UNSPECIFIED);
761 mrs = memory_region_find(mr, xlat, 1);
766 doorbell_gpa = mrs.offset_within_address_space;
767 memory_region_unref(mrs.mr);
769 route->u.msi.address_lo = doorbell_gpa;
770 route->u.msi.address_hi = doorbell_gpa >> 32;
772 trace_kvm_arm_fixup_msi_route(address, doorbell_gpa);
/* Arch hook taking an MSI routing entry, a vector and the PCI device; the
 * body is not visible in this excerpt.
 */
781 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
782 int vector, PCIDevice *dev)
/* Arch hook taking a virq number; the body is not visible in this excerpt. */
787 int kvm_arch_release_virq_post(int virq)
/* Convert an MSI data word into a GSI number.
 *
 * The result is @data minus 32, truncated to its low 16 bits.  The
 * subtraction is done in unsigned 32-bit arithmetic, so values below 32
 * wrap around before the mask is applied.
 */
int kvm_arch_msi_data_to_gsi(uint32_t data)
{
    uint32_t rebased = data - 32;

    return rebased & 0xffff;
}