2 * vfio based device assignment support - platform devices
4 * Copyright Linaro Limited, 2014
10 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory.
13 * Based on vfio based PCI device assignment support:
14 * Copyright Red Hat, Inc. 2012
17 #include "qemu/osdep.h"
18 #include "qapi/error.h"
19 #include <sys/ioctl.h>
20 #include <linux/vfio.h>
22 #include "hw/vfio/vfio-platform.h"
23 #include "migration/vmstate.h"
24 #include "qemu/error-report.h"
25 #include "qemu/lockable.h"
26 #include "qemu/main-loop.h"
27 #include "qemu/module.h"
28 #include "qemu/range.h"
29 #include "exec/memory.h"
30 #include "exec/address-spaces.h"
31 #include "qemu/queue.h"
32 #include "hw/sysbus.h"
35 #include "hw/platform-bus.h"
36 #include "hw/qdev-properties.h"
37 #include "sysemu/kvm.h"
40 * Functions used regardless of the injection method
/* Report whether the IRQ's flags have the VFIO_IRQ_INFO_AUTOMASKED bit set. */
43 static inline bool vfio_irq_is_automasked(VFIOINTp *intp)
/* The masked flag value is converted to bool by the return type. */
45 return intp->flags & VFIO_IRQ_INFO_AUTOMASKED;
49 * vfio_init_intp - allocate, initialize the IRQ struct pointer
50 * and add it into the list of IRQs
51 * @vbasedev: the VFIO device handle
52 * @info: irq info struct retrieved from VFIO driver
/*
 * Allocates a zeroed VFIOINTp for the IRQ described by @info, creates its
 * trigger eventfd notifier (and a second, unmask notifier when the IRQ is
 * auto-masked) and inserts it at the head of the platform device intp_list.
 * A failing event_notifier_init() is reported through @errp.
 */
55 static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev,
56 struct vfio_irq_info info, Error **errp)
/* Recover the enclosing platform device from its embedded VFIODevice. */
59 VFIOPlatformDevice *vdev =
60 container_of(vbasedev, VFIOPlatformDevice, vbasedev);
61 SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
64 intp = g_malloc0(sizeof(*intp));
/* Copy the driver-reported index and flags; start inactive, not accelerated. */
66 intp->pin = info.index;
67 intp->flags = info.flags;
68 intp->state = VFIO_IRQ_INACTIVE;
69 intp->kvm_accel = false;
/* Hand intp->qemuirq to the sysbus device as one of its IRQ lines. */
71 sysbus_init_irq(sbdev, &intp->qemuirq);
73 /* Get an eventfd for trigger */
74 intp->interrupt = g_malloc0(sizeof(EventNotifier));
75 ret = event_notifier_init(intp->interrupt, 0);
/* Failure handling: release the notifier storage and report the error. */
77 g_free(intp->interrupt);
79 error_setg_errno(errp, -ret,
80 "failed to initialize trigger eventfd notifier");
/* Only auto-masked IRQs get a second notifier, stored in intp->unmask. */
83 if (vfio_irq_is_automasked(intp)) {
84 /* Get an eventfd for resample/unmask */
85 intp->unmask = g_malloc0(sizeof(EventNotifier));
86 ret = event_notifier_init(intp->unmask, 0);
/*
 * NOTE(review): the trigger notifier was successfully initialized above but
 * only a g_free() of its storage is visible on this failure path; confirm
 * whether it also needs event_notifier_cleanup() to avoid leaking the fd.
 */
88 g_free(intp->interrupt);
91 error_setg_errno(errp, -ret,
92 "failed to initialize resample eventfd notifier");
/* Publish the new IRQ struct on the device list. */
97 QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
102 * vfio_set_trigger_eventfd - set VFIO eventfd handling
104 * @intp: IRQ struct handle
105 * @handler: handler to be called on eventfd signaling
107 * Setup VFIO signaling and attach an optional user-side handler
/*
 * Registers @handler (NULL is passed by the irqfd path) as fd handler of the
 * trigger eventfd with @intp as opaque, then asks VFIO to signal IRQ index
 * intp->pin, sub-index 0, through that fd (VFIO_IRQ_SET_ACTION_TRIGGER).
 */
110 static int vfio_set_trigger_eventfd(VFIOINTp *intp,
111 eventfd_user_side_handler_t handler)
113 VFIODevice *vbasedev = &intp->vdev->vbasedev;
114 int32_t fd = event_notifier_get_fd(intp->interrupt);
/* The handler is installed before the kernel is told to use the fd. */
118 qemu_set_fd_handler(fd, (IOHandler *)handler, NULL, intp);
120 ret = vfio_set_irq_signaling(vbasedev, intp->pin, 0,
121 VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err);
/*
 * Presumably the failure path: report the error prefixed with the device
 * name and unregister the fd handler installed above.
 */
123 error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
124 qemu_set_fd_handler(fd, NULL, NULL, NULL);
131 * Functions only used when eventfds are handled on user-side
136 * vfio_mmap_set_enabled - enable/disable the fast path mode
137 * @vdev: the VFIO platform device
138 * @enabled: the target mmap state
140 * enabled = true ~ fast path = MMIO region is mmaped (no KVM TRAP);
141 * enabled = false ~ slow path = MMIO region is trapped and region callbacks
142 * are called; the slow path makes it possible to trap the device IRQ status register reset
/*
 * Applies @enabled to the mmaps of every region of the device, i.e. to
 * vdev->regions[0 .. vbasedev.num_regions - 1].
 */
145 static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled)
149 for (i = 0; i < vdev->vbasedev.num_regions; i++) {
150 vfio_region_mmaps_set_enabled(vdev->regions[i], enabled);
155 * vfio_intp_mmap_enable - timer function, restores the fast path
156 * if there is no more active IRQ
157 * @opaque: actually points to the VFIO platform device
159 * Called on mmap timer timeout, this function checks whether the
160 * IRQ is still active and, if not, restores the fast path.
161 * By construction, a single eventfd is handled at a time.
162 * If the IRQ is still active, the timer is re-programmed.
/*
 * Timer callback; @opaque is the VFIOPlatformDevice. With intp_mutex held it
 * looks for an IRQ in VFIO_IRQ_ACTIVE state: if one is found the mmap timer
 * is re-armed, otherwise the region mmaps are enabled again.
 */
164 static void vfio_intp_mmap_enable(void *opaque)
167 VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque;
/* The IRQ list and states are only inspected with the mutex held. */
169 qemu_mutex_lock(&vdev->intp_mutex);
170 QLIST_FOREACH(tmp, &vdev->intp_list, next) {
171 if (tmp->state == VFIO_IRQ_ACTIVE) {
172 trace_vfio_platform_intp_mmap_enable(tmp->pin);
173 /* re-program the timer to check active status later */
174 timer_mod(vdev->mmap_timer,
175 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
/* The mutex is released on the still-active path as well. */
177 qemu_mutex_unlock(&vdev->intp_mutex);
/* Presumably reached only when no IRQ was found active: restore mmaps. */
181 vfio_mmap_set_enabled(vdev, true);
182 qemu_mutex_unlock(&vdev->intp_mutex);
186 * vfio_intp_inject_pending_lockheld - Injects a pending IRQ
187 * @opaque: opaque pointer, in practice the VFIOINTp handle
189 * The function is called on a previous IRQ completion, from
190 * vfio_platform_eoi, while the intp_mutex is locked.
191 * Also in such situation, the slow path already is set and
192 * the mmap timer was already programmed.
/*
 * Marks @intp as VFIO_IRQ_ACTIVE and raises its qemu IRQ line. The function
 * takes no lock itself; its only visible caller, vfio_platform_eoi(),
 * invokes it between locking and unlocking intp_mutex.
 */
194 static void vfio_intp_inject_pending_lockheld(VFIOINTp *intp)
196 trace_vfio_platform_intp_inject_pending_lockheld(intp->pin,
197 event_notifier_get_fd(intp->interrupt));
/* State is updated before the line is raised. */
199 intp->state = VFIO_IRQ_ACTIVE;
201 /* trigger the virtual IRQ */
202 qemu_set_irq(intp->qemuirq, 1);
206 * vfio_intp_interrupt - The user-side eventfd handler
207 * @opaque: opaque pointer which in practice is the VFIOINTp handle
209 * the function is entered in event handler context:
210 * the vIRQ is injected into the guest if there is no other active
/*
 * Eventfd handler installed by vfio_start_eventfd_injection(). Under
 * intp_mutex it either queues @intp as pending (when another IRQ of the
 * device is already active or pending) or makes it active: the eventfd is
 * cleared, region mmaps are disabled, the qemu IRQ is raised and, when
 * mmap_timeout is non-zero, the mmap timer is (re)armed.
 */
213 static void vfio_intp_interrupt(VFIOINTp *intp)
217 VFIOPlatformDevice *vdev = intp->vdev;
218 bool delay_handling = false;
/* Scoped lock guard: the mutex is held for the rest of the function. */
220 QEMU_LOCK_GUARD(&vdev->intp_mutex);
/* Only an inactive IRQ needs the scan for competing active/pending IRQs. */
221 if (intp->state == VFIO_IRQ_INACTIVE) {
222 QLIST_FOREACH(tmp, &vdev->intp_list, next) {
223 if (tmp->state == VFIO_IRQ_ACTIVE ||
224 tmp->state == VFIO_IRQ_PENDING) {
225 delay_handling = true;
/* Deferred case: mark pending, queue at the tail and clear the eventfd. */
230 if (delay_handling) {
232 * the new IRQ gets a pending status and is pushed in
235 intp->state = VFIO_IRQ_PENDING;
236 trace_vfio_intp_interrupt_set_pending(intp->pin);
237 QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
239 event_notifier_test_and_clear(intp->interrupt);
/* Immediate case: the IRQ is injected right away. */
243 trace_vfio_platform_intp_interrupt(intp->pin,
244 event_notifier_get_fd(intp->interrupt));
246 ret = event_notifier_test_and_clear(intp->interrupt);
/* A clearing problem is only logged; injection still proceeds below. */
248 error_report("Error when clearing fd=%d (ret = %d)",
249 event_notifier_get_fd(intp->interrupt), ret);
252 intp->state = VFIO_IRQ_ACTIVE;
/* Disable the region mmaps so that guest MMIO accesses are trapped. */
255 vfio_mmap_set_enabled(vdev, false);
257 /* trigger the virtual IRQ */
258 qemu_set_irq(intp->qemuirq, 1);
261 * Schedule the mmap timer which will restore fastpath when no IRQ
/* A zero mmap_timeout leaves the timer unarmed. */
264 if (vdev->mmap_timeout) {
265 timer_mod(vdev->mmap_timer,
266 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
272 * vfio_platform_eoi - IRQ completion routine
273 * @vbasedev: the VFIO device handle
275 * De-asserts the active virtual IRQ and unmasks the physical IRQ
276 * (effective for level sensitive IRQ auto-masked by the VFIO driver).
277 * Then it handles next pending IRQ if any.
278 * eoi function is called on the first access to any MMIO region
279 * after an IRQ was triggered, trapped since slow path was set.
280 * It is assumed this access corresponds to the IRQ status
281 * register reset. With such a mechanism, a single IRQ can be
282 * handled at a time since there is no way to know which IRQ
283 * was completed by the guest (we would need additional details
284 * about the IRQ status register mask).
/*
 * End-of-interrupt hook (installed as .vfio_eoi in vfio_platform_ops). With
 * intp_mutex held it completes the IRQ found in VFIO_IRQ_ACTIVE state:
 * marks it inactive, lowers its qemu IRQ and, for auto-masked IRQs, unmasks
 * the physical IRQ. It then injects the head of pending_intp_queue, if any.
 */
286 static void vfio_platform_eoi(VFIODevice *vbasedev)
289 VFIOPlatformDevice *vdev =
290 container_of(vbasedev, VFIOPlatformDevice, vbasedev);
292 qemu_mutex_lock(&vdev->intp_mutex);
/* Look for the IRQ currently being serviced. */
293 QLIST_FOREACH(intp, &vdev->intp_list, next) {
294 if (intp->state == VFIO_IRQ_ACTIVE) {
295 trace_vfio_platform_eoi(intp->pin,
296 event_notifier_get_fd(intp->interrupt));
297 intp->state = VFIO_IRQ_INACTIVE;
299 /* deassert the virtual IRQ */
300 qemu_set_irq(intp->qemuirq, 0);
302 if (vfio_irq_is_automasked(intp)) {
303 /* unmasks the physical level-sensitive IRQ */
304 vfio_unmask_single_irqindex(vbasedev, intp->pin);
307 /* a single IRQ can be active at a time */
311 /* in case there are pending IRQs, handle the first one */
312 if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) {
/* The head is injected first and only then removed from the queue. */
313 intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue);
314 vfio_intp_inject_pending_lockheld(intp);
315 QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext);
317 qemu_mutex_unlock(&vdev->intp_mutex);
321 * vfio_start_eventfd_injection - starts the virtual IRQ injection using
322 * user-side handled eventfds
323 * @sbdev: the sysbus device handle
324 * @irq: the qemu irq handle
/*
 * Finds the VFIOINTp whose qemuirq equals @irq in the device intp_list and
 * installs vfio_intp_interrupt() as handler of its trigger eventfd.
 * What is done when vfio_set_trigger_eventfd() fails is not visible here.
 */
327 static void vfio_start_eventfd_injection(SysBusDevice *sbdev, qemu_irq irq)
329 VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev);
/* Match the IRQ struct by comparing qemu_irq handles. */
332 QLIST_FOREACH(intp, &vdev->intp_list, next) {
333 if (intp->qemuirq == irq) {
/* A non-zero return from vfio_set_trigger_eventfd() selects this branch. */
339 if (vfio_set_trigger_eventfd(intp, vfio_intp_interrupt)) {
345 * Functions used for irqfd
349 * vfio_set_resample_eventfd - sets the resamplefd for an IRQ
350 * @intp: the IRQ struct handle
351 * programs the VFIO driver to unmask this IRQ when the
352 * intp->unmask eventfd is triggered
/*
 * Clears any fd handler on the intp->unmask eventfd, then registers that fd
 * with VFIO for VFIO_IRQ_SET_ACTION_UNMASK on IRQ index intp->pin,
 * sub-index 0. Errors are reported with the device name prefix.
 */
354 static int vfio_set_resample_eventfd(VFIOINTp *intp)
356 int32_t fd = event_notifier_get_fd(intp->unmask);
357 VFIODevice *vbasedev = &intp->vdev->vbasedev;
/* No handler is attached to this fd on the QEMU side. */
361 qemu_set_fd_handler(fd, NULL, NULL, NULL);
362 ret = vfio_set_irq_signaling(vbasedev, intp->pin, 0,
363 VFIO_IRQ_SET_ACTION_UNMASK, fd, &err);
/* Presumably executed only when the call above failed. */
365 error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
371 * vfio_start_irqfd_injection - starts the virtual IRQ injection using
374 * @sbdev: the sysbus device handle
375 * @irq: the qemu irq handle
377 * In case the irqfd setup fails, we fall back to userspace-handled eventfd
/*
 * IRQ connection hook (set as sbc->connect_irq_notifier in class_init).
 * Tries to set up irqfd-based injection for @irq: adds a KVM irqfd notifier
 * for the trigger/unmask eventfds, programs the VFIO trigger eventfd without
 * a user-side handler and, for auto-masked IRQs, the resample eventfd.
 * On success intp->kvm_accel is set. The last visible statement falls back
 * to vfio_start_eventfd_injection().
 */
379 static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq)
381 VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev);
/* irqfd needs KVM irqfd and resamplefd support and the x-irqfd property. */
384 if (!kvm_irqfds_enabled() || !kvm_resamplefds_enabled() ||
385 !vdev->irqfd_allowed) {
/* Locate the IRQ struct bound to this qemu_irq. */
389 QLIST_FOREACH(intp, &vdev->intp_list, next) {
390 if (intp->qemuirq == irq) {
/* Let KVM consume the trigger eventfd; intp->unmask is the resample fd. */
396 if (kvm_irqchip_add_irqfd_notifier(kvm_state, intp->interrupt,
397 intp->unmask, irq) < 0) {
/* NULL handler: QEMU does not handle the trigger eventfd itself here. */
401 if (vfio_set_trigger_eventfd(intp, NULL) < 0) {
404 if (vfio_irq_is_automasked(intp)) {
405 if (vfio_set_resample_eventfd(intp) < 0) {
408 trace_vfio_platform_start_level_irqfd_injection(intp->pin,
409 event_notifier_get_fd(intp->interrupt),
410 event_notifier_get_fd(intp->unmask));
412 trace_vfio_platform_start_edge_irqfd_injection(intp->pin,
413 event_notifier_get_fd(intp->interrupt));
416 intp->kvm_accel = true;
/* Presumably error unwinding: drop the KVM irqfd notifier added above. */
420 kvm_irqchip_remove_irqfd_notifier(kvm_state, intp->interrupt, irq);
/* Fallback to user-side eventfd handling. */
423 vfio_start_eventfd_injection(sbdev, irq);
/*
 * Reset-need hook (installed as .vfio_compute_needs_reset in
 * vfio_platform_ops): unconditionally flags the device as needing a reset.
 */
429 static void vfio_platform_compute_needs_reset(VFIODevice *vbasedev)
431 vbasedev->needs_reset = true;
/* Hook installed as .vfio_hot_reset_multi in vfio_platform_ops. */
434 /* not implemented yet */
435 static int vfio_platform_hot_reset_multi(VFIODevice *vbasedev)
441 * vfio_populate_device - Allocate and populate MMIO region
442 * and IRQ structs according to driver returned information
443 * @vbasedev: the VFIO device handle
444 * @errp: error object
/*
 * Rejects devices lacking VFIO_DEVICE_FLAGS_PLATFORM, then allocates and
 * sets up one VFIORegion per device region, creates the mmap timer,
 * initializes the pending IRQ queue and builds one VFIOINTp per device IRQ
 * from the VFIO_DEVICE_GET_IRQ_INFO data. The trailing statements undo this
 * work (timer, IRQ list entries, regions); presumably they form the error
 * unwinding path.
 */
447 static int vfio_populate_device(VFIODevice *vbasedev, Error **errp)
449 VFIOINTp *intp, *tmp;
451 VFIOPlatformDevice *vdev =
452 container_of(vbasedev, VFIOPlatformDevice, vbasedev);
454 if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PLATFORM)) {
455 error_setg(errp, "this isn't a platform device");
/* Zero-filled array of region pointers, one slot per region. */
459 vdev->regions = g_new0(VFIORegion *, vbasedev->num_regions);
461 for (i = 0; i < vbasedev->num_regions; i++) {
/*
 * NOTE(review): the region name format ends with a newline, which looks
 * unintended for a name string; confirm before changing it.
 */
462 char *name = g_strdup_printf("VFIO %s region %d\n", vbasedev->name, i);
464 vdev->regions[i] = g_new0(VFIORegion, 1);
465 ret = vfio_region_setup(OBJECT(vdev), vbasedev,
466 vdev->regions[i], i, name);
469 error_setg_errno(errp, -ret, "failed to get region %d info", i);
/* Timer whose callback re-enables the region mmaps. */
474 vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
475 vfio_intp_mmap_enable, vdev);
477 QSIMPLEQ_INIT(&vdev->pending_intp_queue);
479 for (i = 0; i < vbasedev->num_irqs; i++) {
480 struct vfio_irq_info irq = { .argsz = sizeof(irq) };
/*
 * NOTE(review): ioctl() reports failure as -1 with the cause in errno, so
 * -ret below would be 1 rather than the real error code; confirm whether
 * errno should be passed instead.
 */
483 ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
485 error_setg_errno(errp, -ret, "failed to get device irq info");
488 trace_vfio_platform_populate_interrupts(irq.index,
491 intp = vfio_init_intp(vbasedev, irq, errp);
/* Teardown: stop the timer and unlink every IRQ struct from the list. */
500 timer_del(vdev->mmap_timer);
501 QLIST_FOREACH_SAFE(intp, &vdev->intp_list, next, tmp) {
502 QLIST_REMOVE(intp, next);
/* Teardown: finalize the regions that were allocated, then free them. */
506 for (i = 0; i < vbasedev->num_regions; i++) {
507 if (vdev->regions[i]) {
508 vfio_region_finalize(vdev->regions[i]);
510 g_free(vdev->regions[i]);
512 g_free(vdev->regions);
516 /* specialized functions for VFIO Platform devices */
/* Callback table assigned to vbasedev->ops in vfio_platform_realize(). */
517 static VFIODeviceOps vfio_platform_ops = {
518 .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
519 .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
520 .vfio_eoi = vfio_platform_eoi,
524 * vfio_base_device_init - perform preliminary VFIO setup
525 * @vbasedev: the VFIO device handle
526 * @errp: error object
528 * Implement the VFIO command sequence that allows discovering
529 * assigned device resources: group extraction, device
530 * fd retrieval, resource query.
531 * Precondition: the device name must be initialized
/*
 * Resolves the host device (from sysfsdev, or from the name under
 * /sys/bus/platform/devices), reads its iommu_group link to obtain the
 * group id, gets the VFIO group, refuses a device name already present in
 * that group, then gets the device and populates its regions and IRQs.
 * Errors are reported through @errp.
 */
533 static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
536 VFIODevice *vbasedev_iter;
537 char *tmp, group_path[PATH_MAX], *group_name;
543 /* @sysfsdev takes precedence over @host */
544 if (vbasedev->sysfsdev) {
/* The device name is replaced by the last component of the sysfs path. */
545 g_free(vbasedev->name);
546 vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
/* Without sysfsdev, a name is mandatory and must not contain a slash. */
548 if (!vbasedev->name || strchr(vbasedev->name, '/')) {
549 error_setg(errp, "wrong host device name");
553 vbasedev->sysfsdev = g_strdup_printf("/sys/bus/platform/devices/%s",
/* The sysfs entry must exist. */
557 if (stat(vbasedev->sysfsdev, &st) < 0) {
558 error_setg_errno(errp, errno,
559 "failed to get the sysfs host device file status");
563 tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
/*
 * NOTE(review): readlink() does not NUL-terminate its output; the line that
 * terminates group_path is not visible here, confirm it exists before
 * basename() and sscanf() are applied to the buffer.
 */
564 len = readlink(tmp, group_path, sizeof(group_path));
/* A result filling the whole buffer is treated as a too-long name. */
567 if (len < 0 || len >= sizeof(group_path)) {
568 ret = len < 0 ? -errno : -ENAMETOOLONG;
569 error_setg_errno(errp, -ret, "no iommu_group found");
/* The group id is the decimal number that names the link target. */
575 group_name = basename(group_path);
576 if (sscanf(group_name, "%d", &groupid) != 1) {
/*
 * NOTE(review): a non-matching sscanf() is not documented to set errno, so
 * the errno reported here may be stale; confirm.
 */
577 error_setg_errno(errp, errno, "failed to read %s", group_path);
581 trace_vfio_platform_base_device_init(vbasedev->name, groupid);
583 group = vfio_get_group(groupid, &address_space_memory, errp);
/* Refuse to attach a device whose name is already in the group. */
588 QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
589 if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
590 error_setg(errp, "device is already attached");
591 vfio_put_group(group);
595 ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
/* The group reference is dropped on the visible failure paths. */
597 vfio_put_group(group);
601 ret = vfio_populate_device(vbasedev, errp);
603 vfio_put_group(group);
610 * vfio_platform_realize - the device realize function
611 * @dev: device state pointer
614 * initialize the device, its memory regions and IRQ structures
615 * IRQs are started separately
/*
 * Device realize hook (set as dc->realize in class_init). Tags the
 * VFIODevice as a platform device, installs vfio_platform_ops, initializes
 * intp_mutex, runs vfio_base_device_init(), loads the device-tree
 * "compatible" strings from sysfs into vdev->compat and exposes every region
 * as a sysbus MMIO region. Errors go to @errp with a device prefix.
 */
617 static void vfio_platform_realize(DeviceState *dev, Error **errp)
619 VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
620 SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
621 VFIODevice *vbasedev = &vdev->vbasedev;
624 vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
626 vbasedev->ops = &vfio_platform_ops;
628 qemu_mutex_init(&vdev->intp_mutex);
/* The trace uses sysfsdev when set and the device name otherwise. */
630 trace_vfio_platform_realize(vbasedev->sysfsdev ?
631 vbasedev->sysfsdev : vbasedev->name,
634 ret = vfio_base_device_init(vbasedev, errp);
/* Read the whole of_node/compatible file of the host device. */
645 path = g_strdup_printf("%s/of_node/compatible", vbasedev->sysfsdev);
646 if (!g_file_get_contents(path, &contents, &length, &gerr)) {
647 error_setg(errp, "%s", gerr->message);
/*
 * vdev->compat keeps the raw buffer. The loop counts entries in num_compat,
 * each entry spanning strlen() + 1 bytes (assumes NUL-separated strings;
 * the loop body is only partly visible here).
 */
653 vdev->compat = contents;
654 for (vdev->num_compat = 0; length; vdev->num_compat++) {
655 size_t skip = strlen(contents) + 1;
661 for (i = 0; i < vbasedev->num_regions; i++) {
/* A non-zero vfio_region_mmap() result only triggers a warning. */
662 if (vfio_region_mmap(vdev->regions[i])) {
663 warn_report("%s mmap unsupported, performance may be slow",
664 memory_region_name(vdev->regions[i]->mem));
666 sysbus_init_mmio(sbdev, vdev->regions[i]->mem);
/* Prefix the error with the device name when it is known. */
673 if (vdev->vbasedev.name) {
674 error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
676 error_prepend(errp, "vfio error: ");
/* VMState description assigned to dc->vmsd in vfio_platform_class_init(). */
680 static const VMStateDescription vfio_platform_vmstate = {
681 .name = "vfio-platform",
/*
 * qdev properties of the device, attached to the class in
 * vfio_platform_class_init() through device_class_set_props().
 */
685 static Property vfio_platform_dev_properties[] = {
/* "host" and "sysfsdev" are the two ways of naming the host device. */
686 DEFINE_PROP_STRING("host", VFIOPlatformDevice, vbasedev.name),
687 DEFINE_PROP_STRING("sysfsdev", VFIOPlatformDevice, vbasedev.sysfsdev),
/* Stored in vbasedev.no_mmap; defaults to false. */
688 DEFINE_PROP_BOOL("x-no-mmap", VFIOPlatformDevice, vbasedev.no_mmap, false),
689 DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
/* Stored in irqfd_allowed, tested by vfio_start_irqfd_injection(). */
691 DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true),
692 DEFINE_PROP_END_OF_LIST(),
/*
 * Class initializer: wires the realize hook, properties, vmstate and
 * description into the DeviceClass, sets the sysbus IRQ connection notifier
 * and marks the device as user creatable in the MISC category.
 */
695 static void vfio_platform_class_init(ObjectClass *klass, void *data)
697 DeviceClass *dc = DEVICE_CLASS(klass);
698 SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass);
700 dc->realize = vfio_platform_realize;
701 device_class_set_props(dc, vfio_platform_dev_properties);
702 dc->vmsd = &vfio_platform_vmstate;
703 dc->desc = "VFIO-based platform device assignment";
/* IRQ connection first tries irqfd injection, see that function. */
704 sbc->connect_irq_notifier = vfio_start_irqfd_injection;
705 set_bit(DEVICE_CATEGORY_MISC, dc->categories);
706 /* Supported by TYPE_VIRT_MACHINE */
707 dc->user_creatable = true;
/*
 * QOM type description: TYPE_VFIO_PLATFORM derives from TYPE_SYS_BUS_DEVICE,
 * with VFIOPlatformDevice instances and a VFIOPlatformDeviceClass class.
 */
710 static const TypeInfo vfio_platform_dev_info = {
711 .name = TYPE_VFIO_PLATFORM,
712 .parent = TYPE_SYS_BUS_DEVICE,
713 .instance_size = sizeof(VFIOPlatformDevice),
714 .class_init = vfio_platform_class_init,
715 .class_size = sizeof(VFIOPlatformDeviceClass),
/* Registers the vfio-platform type description with the type system. */
718 static void register_vfio_platform_dev_type(void)
720 type_register_static(&vfio_platform_dev_info);
/* Hook the registration function into type initialization. */
723 type_init(register_vfio_platform_dev_type)