/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	bool				noiommu;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions.  Any use case other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev(), or removing
 * that symmetric reference after vfio_del_group_dev(), should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */

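/*
 * Illustrative sketch (not part of the original file): a VFIO bus driver
 * would typically pair these helpers in its probe and remove paths.  The
 * names my_vfio_dev_ops and my_data below are hypothetical placeholders.
 *
 *	probe(dev):
 *		group = vfio_iommu_group_get(dev);
 *		if (!group)
 *			return -EINVAL;
 *		ret = vfio_add_group_dev(dev, &my_vfio_dev_ops, my_data);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *		return ret;
 *
 *	remove(dev):
 *		my_data = vfio_del_group_dev(dev);
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 */
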
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have iommu_ops on its
	 * bus.  We use iommu_present() again in the main code to detect
	 * these fake groups.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	ret = iommu_group_add_device(group, dev);
	iommu_group_put(group);
	if (ret)
		return NULL;

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (!iommu_present(dev->bus))
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_iommu_present(struct device *dev, void *unused)
{
	return iommu_present(dev->bus) ? 1 : 0;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_for_each_dev(iommu_group, NULL,
					vfio_iommu_present) ? -EINVAL : 0;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif	/* CONFIG_VFIO_NOIOMMU */

/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */

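/*
 * In short: a vfio_device reference implies a vfio_group reference, and
 * each container user counted in a group holds a vfio_container reference
 * (see vfio_group_set_container()), so whichever fd is closed last tears
 * the chain down.
 */
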
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}

/**
 * Group objects - create, release, get, put, search
 */

static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
					    bool iommu_present)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
	group->noiommu = !iommu_present;

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * Blocking notifiers acquire a rwsem around registering and hold
	 * it around the callback.  Therefore, we need to register outside
	 * of vfio.group_lock to avoid A-B/B-A contention.  Our callback
	 * won't do anything unless it can find the group in vfio.group_list,
	 * so there is no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}

/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep on the mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

/**
 * Device objects - create, release, get, put, search
 */

static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}

static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, i.e. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}

/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}

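/*
 * Concrete example of the rules above (illustrative): a group whose
 * devices are bound to vfio-pci and pci-stub is viable; if any device in
 * the group is instead bound to an ordinary host driver, the whole group
 * is non-viable.
 */
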
/*
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	vfio_group_put(group);
	return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group, iommu_present(dev->bus));
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);

/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	long ret;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			ret = wait_event_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
		} else {
			ret = wait_event_interruptible_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
			if (ret == -ERESTARTSYS) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (ret <= 0);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);

/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu, and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		/* module reference holds the driver we're working on */
		mutex_unlock(&vfio.iommu_drivers_lock);

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			goto skip_drivers_unlock;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (!ret) {
			container->iommu_driver = driver;
			container->iommu_data = data;
		} else {
			driver->ops->release(data);
			module_put(driver->ops->owner);
		}

		goto skip_drivers_unlock;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
skip_drivers_unlock:
	up_write(&container->group_lock);

	return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		down_read(&container->group_lock);

		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);

		up_read(&container->group_lock);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	up_read(&container->group_lock);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that it still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;
	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;
	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

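/*
 * Userspace usage sketch (illustrative, not part of the original file):
 * the container and group ioctls above are typically driven as below; the
 * group number and device name are examples only.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (status.flags & VFIO_GROUP_FLAGS_VIABLE) {
 *		ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *		ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *		device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD,
 *			       "0000:06:0d.0");
 *	}
 */
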
#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};

/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};

/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * on the group fd and checks that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks pass, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */

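/*
 * Illustrative sketch of the protocol above from the external user's
 * (e.g. KVM's) side; error handling is omitted and group_fd is a file
 * descriptor received from userspace.
 *
 *	struct file *filep = fget(group_fd);
 *	struct vfio_group *grp = vfio_group_get_external_user(filep);
 *
 *	if (!IS_ERR(grp)) {
 *		int iommu_id = vfio_external_user_iommu_id(grp);
 *		...use iommu_id to match the group...
 *		vfio_group_put_external_user(grp);
 *	}
 *	fput(filep);
 */
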
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	if (!atomic_inc_not_zero(&group->container_users))
		return ERR_PTR(-EINVAL);

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return ERR_PTR(-EPERM);
	}

	if (!group->container->iommu_driver ||
	    !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return ERR_PTR(-EINVAL);
	}

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	/*
	 * Dissolve the container user before dropping our group reference;
	 * vfio_group_put() may free the group, making the reverse order a
	 * use-after-free.
	 */
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);

/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

	/*
	 * Attempt to load known iommu-drivers.  This gives us a working
	 * environment without the user needing to explicitly load iommu
	 * drivers.
	 */
	request_module_nowait("vfio_iommu_type1");
	request_module_nowait("vfio_iommu_spapr_tce");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif

	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");