// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;
#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
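
/*
 * Illustrative use (hedged; not code from this file): a driver whose
 * functions must be reset together can pick a shared token, such as the
 * parent PCI device, as the set_id before registering:
 *
 *	vfio_assign_device_set(&vdev->vdev, pci_physfn(pdev));
 *
 * All devices registered with the same set_id end up in one vfio_device_set
 * and share its mutex; "vdev" and "pdev" above are hypothetical variables.
 */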
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
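
/*
 * Illustrative use (hedged; not code from this file): a driver typically
 * calls this from an open_device()/close_device() path, where dev_set->lock
 * is already held, to decide whether a set-wide action such as a bus reset
 * is safe. "vdev" and do_set_wide_reset() below are hypothetical:
 *
 *	if (vfio_device_set_open_count(vdev->vdev.dev_set) == 1)
 *		do_set_wide_reset(vdev);
 */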
/*
 * Device objects - create, release, get, put, search
 */

/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after success return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
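
/*
 * Illustrative driver-side sketch (hedged; names are hypothetical): the
 * vfio_device is embedded in the driver's own structure and allocated via
 * the vfio_alloc_device() wrapper mentioned above:
 *
 *	struct my_vfio_dev {
 *		struct vfio_device vdev;	// core device, embedded
 *		void __iomem *bar;		// driver private data
 *	};
 *
 *	my = vfio_alloc_device(my_vfio_dev, vdev, dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	// on failure after this point: vfio_put_device(&my->vdev);
 */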
/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Error to alloc index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	ret = device_add(&device->device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_group_unregister(device);

	/* Balances device_add in register path */
	device_del(&device->device);

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#ifdef CONFIG_HAVE_KVM
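/*
 * KVM is reached through symbol_get()/symbol_put() rather than direct calls
 * so that vfio does not carry a hard module dependency on kvm: the symbol
 * references are taken only while a device is associated with a KVM instance
 * and are dropped again in vfio_device_put_kvm().
 */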
void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		goto clear;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}
static int vfio_device_first_open(struct vfio_device *device,
				  struct iommufd_ctx *iommufd)
{
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_iommufd_bind(device, iommufd);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_device_last_close(struct vfio_device *device,
				   struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
{
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_device_first_open(device, iommufd);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_device_close(struct vfio_device *device,
		       struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_device_last_close(device, iommufd);
	device->open_count--;
}
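
/*
 * Note: open_count and the first-open/last-close transitions above are all
 * serialized by dev_set->lock, so concurrent opens of devices sharing a set
 * cannot race the driver's open_device()/close_device() callbacks.
 */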
/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	vfio_device_group_close(device);

	vfio_device_put_registration(device);

	return 0;
}
/**
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 * RUNNING -> RUNNING_P2P
	 * RUNNING_P2P -> RUNNING
	 * RUNNING_P2P -> STOP
	 * STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * arcs:
	 * RUNNING -> PRE_COPY
	 * PRE_COPY -> RUNNING
	 * PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 * PRE_COPY -> RUNNING
	 * PRE_COPY -> PRE_COPY_P2P
	 * PRE_COPY_P2P -> PRE_COPY
	 * PRE_COPY_P2P -> RUNNING_P2P
	 * PRE_COPY_P2P -> STOP_COPY
	 * RUNNING -> PRE_COPY
	 * RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 * PRE_COPY -> RUNNING -> RUNNING_P2P
	 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 * PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 * RESUMING -> STOP -> RUNNING_P2P
	 * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 * RESUMING -> STOP -> STOP_COPY
	 * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 * RUNNING -> RUNNING_P2P -> STOP
	 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 * RUNNING_P2P -> RUNNING -> PRE_COPY
	 * RUNNING_P2P -> STOP -> RESUMING
	 * RUNNING_P2P -> STOP -> STOP_COPY
	 * STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 * STOP -> RUNNING_P2P -> RUNNING
	 * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 * STOP_COPY -> STOP -> RESUMING
	 * STOP_COPY -> STOP -> RUNNING_P2P
	 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 * STOP_COPY -> PRE_COPY
	 * STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};
	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
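
/*
 * Illustrative driver-side pattern (hedged; my_step_device() is a
 * hypothetical helper): migration_set_state() implementations usually walk
 * the FSM one precomputed step at a time until the requested state is
 * reached:
 *
 *	while (cur != new_state) {
 *		enum vfio_device_mig_state next;
 *
 *		if (vfio_mig_get_next_state(vdev, cur, new_state, &next))
 *			return ERR_PTR(-EINVAL);
 *		filp = my_step_device(vdev, cur, next);
 *		if (IS_ERR(filp))
 *			return filp;
 *		cur = next;
 *	}
 */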
/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
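
/*
 * DMA dirty-page tracking is driven by three device features handled below:
 * DMA_LOGGING_START supplies a page size plus an array of IOVA ranges (at
 * most LOG_MAX_RANGES of them), DMA_LOGGING_REPORT reads back and clears the
 * dirty bitmap for one range, and DMA_LOGGING_STOP tears the tracker down.
 */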
static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}
static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EINVAL;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);

	return ret;
}
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
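
/*
 * Illustrative userspace call (hedged; not part of this file): querying the
 * migration capability through VFIO_DEVICE_FEATURE on an open device fd:
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_migration mig;
 *	} get = {
 *		.hdr.argsz = sizeof(get),
 *		.hdr.flags = VFIO_DEVICE_FEATURE_GET |
 *			     VFIO_DEVICE_FEATURE_MIGRATION,
 *	};
 *
 *	if (!ioctl(device_fd, VFIO_DEVICE_FEATURE, &get))
 *		printf("migration flags 0x%llx\n",
 *		       (unsigned long long)get.mig.flags);
 */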
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;
	int ret;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.mmap = vfio_device_fops_mmap,
};
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
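
/*
 * Illustrative caller pattern (hedged; MY_CAP_ID and "info" are
 * placeholders): capabilities are appended while building the chain, then
 * the chained offsets are shifted by the size of the fixed ioctl argument
 * just before copying the buffer out to userspace:
 *
 *	cap = vfio_info_cap_add(&caps, sizeof(*cap), MY_CAP_ID, 1);
 *	if (IS_ERR(cap))
 *		return PTR_ERR(cap);
 *	...
 *	if (caps.size) {
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user(arg + sizeof(info), caps.buf, caps.size))
 *			return -EFAULT;
 *	}
 *	kfree(caps.buf);
 */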
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (hdr->argsz - minsz < hdr->count * size)
		return -EINVAL;

	*data_size = hdr->count * size;

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
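
/*
 * Illustrative caller pattern (hedged): bus drivers implementing
 * VFIO_DEVICE_SET_IRQS typically validate the header first and only then
 * copy in the variable-length payload:
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */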
/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 * pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
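
/*
 * Illustrative emulated-IOMMU driver usage (hedged; "my", "gfn" and the
 * error handling are simplified placeholders): pin one page for CPU access,
 * then unpin it again:
 *
 *	struct page *page;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&my->vdev, gfn << PAGE_SHIFT, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret;
 *	// ... access page contents, e.g. via kmap_local_page(page) ...
 *	vfio_unpin_pages(&my->vdev, gfn << PAGE_SHIFT, 1);
 */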
/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);
/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]  : VFIO device
 * @iova [in]    : base IOVA of a user space buffer
 * @data [in]    : pointer to kernel buffer
 * @len [in]     : kernel buffer length
 * @write        : indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
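
/*
 * Illustrative mediated-driver call (hedged; "my_desc" and desc_iova are
 * hypothetical): read a guest descriptor into a local buffer without
 * pinning the backing pages:
 *
 *	struct my_desc desc;
 *
 *	ret = vfio_dma_rw(&my->vdev, desc_iova, &desc, sizeof(desc), false);
 */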
/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}
module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");