// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2018-2020 Intel Corporation.
 * Copyright (C) 2020 Red Hat, Inc.
 *
 * Thanks Michael S. Tsirkin for the valuable comments and
 * suggestions. And thanks to Cunming Liang and Zhihong Wang for all
 * their support.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/uuid.h>
#include <linux/vdpa.h>
#include <linux/nospec.h>
#include <linux/vhost.h>

#include "vhost.h"

enum {
	VHOST_VDPA_BACKEND_FEATURES =
	(1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) |
	(1ULL << VHOST_BACKEND_F_IOTLB_BATCH) |
	(1ULL << VHOST_BACKEND_F_IOTLB_ASID),
};

#define VHOST_VDPA_DEV_MAX (1U << MINORBITS)

#define VHOST_VDPA_IOTLB_BUCKETS 16
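
/*
 * Each address space identifier (ASID) used by the device owns exactly
 * one vhost_iotlb.  The per-ASID structures are kept in a small hash
 * table keyed by asid % VHOST_VDPA_IOTLB_BUCKETS and are only valid for
 * ASIDs below the number of address spaces (nas) advertised by the
 * vdpa device.
 */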
struct vhost_vdpa_as {
	struct hlist_node hash_link;
	struct vhost_iotlb iotlb;
	u32 id;
};
struct vhost_vdpa {
	struct vhost_dev vdev;
	struct iommu_domain *domain;
	struct vhost_virtqueue *vqs;
	struct completion completion;
	struct vdpa_device *vdpa;
	struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
	struct device dev;
	struct cdev cdev;
	atomic_t opened;
	u32 nvqs;
	int virtio_id;
	int minor;
	struct eventfd_ctx *config_ctx;
	int in_batch;
	struct vdpa_iova_range range;
	u32 batch_asid;
};
static DEFINE_IDA(vhost_vdpa_ida);

static dev_t vhost_vdpa_major;
static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v,
				   struct vhost_iotlb *iotlb, u64 start,
				   u64 last, u32 asid);
static inline u32 iotlb_to_asid(struct vhost_iotlb *iotlb)
{
	struct vhost_vdpa_as *as = container_of(iotlb, struct
						vhost_vdpa_as, iotlb);
	return as->id;
}
static struct vhost_vdpa_as *asid_to_as(struct vhost_vdpa *v, u32 asid)
{
	struct hlist_head *head = &v->as[asid % VHOST_VDPA_IOTLB_BUCKETS];
	struct vhost_vdpa_as *as;

	hlist_for_each_entry(as, head, hash_link)
		if (as->id == asid)
			return as;

	return NULL;
}
static struct vhost_iotlb *asid_to_iotlb(struct vhost_vdpa *v, u32 asid)
{
	struct vhost_vdpa_as *as = asid_to_as(v, asid);

	if (!as)
		return NULL;

	return &as->iotlb;
}
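
/*
 * Allocate the address space for @asid on first use.  Returns NULL if
 * the ASID is already allocated, out of range (>= vdpa->nas) or if
 * memory allocation fails; the new iotlb starts out empty.
 */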
static struct vhost_vdpa_as *vhost_vdpa_alloc_as(struct vhost_vdpa *v, u32 asid)
{
	struct hlist_head *head = &v->as[asid % VHOST_VDPA_IOTLB_BUCKETS];
	struct vhost_vdpa_as *as;

	if (asid_to_as(v, asid))
		return NULL;

	if (asid >= v->vdpa->nas)
		return NULL;

	as = kmalloc(sizeof(*as), GFP_KERNEL);
	if (!as)
		return NULL;

	vhost_iotlb_init(&as->iotlb, 0, 0);
	as->id = asid;
	hlist_add_head(&as->hash_link, head);

	return as;
}
static struct vhost_vdpa_as *vhost_vdpa_find_alloc_as(struct vhost_vdpa *v,
						      u32 asid)
{
	struct vhost_vdpa_as *as = asid_to_as(v, asid);

	if (as)
		return as;

	return vhost_vdpa_alloc_as(v, asid);
}
static void vhost_vdpa_reset_map(struct vhost_vdpa *v, u32 asid)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (ops->reset_map)
		ops->reset_map(vdpa, asid);
}
static int vhost_vdpa_remove_as(struct vhost_vdpa *v, u32 asid)
{
	struct vhost_vdpa_as *as = asid_to_as(v, asid);

	if (!as)
		return -EINVAL;

	hlist_del(&as->hash_link);
	vhost_vdpa_iotlb_unmap(v, &as->iotlb, 0ULL, 0ULL - 1, asid);
	/*
	 * Devices with vendor specific IOMMU may need to restore
	 * iotlb to the initial or default state, which cannot be
	 * cleaned up in the all range unmap call above. Give them
	 * a chance to clean up or reset the map to the desired
	 * state.
	 */
	vhost_vdpa_reset_map(v, asid);
	kfree(as);

	return 0;
}
static void handle_vq_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
	const struct vdpa_config_ops *ops = v->vdpa->config;

	ops->kick_vq(v->vdpa, vq - v->vqs);
}
static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
{
	struct vhost_virtqueue *vq = private;
	struct eventfd_ctx *call_ctx = vq->call_ctx.ctx;

	if (call_ctx)
		eventfd_signal(call_ctx, 1);

	return IRQ_HANDLED;
}
static irqreturn_t vhost_vdpa_config_cb(void *private)
{
	struct vhost_vdpa *v = private;
	struct eventfd_ctx *config_ctx = v->config_ctx;

	if (config_ctx)
		eventfd_signal(config_ctx, 1);

	return IRQ_HANDLED;
}
static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
{
	struct vhost_virtqueue *vq = &v->vqs[qid];
	const struct vdpa_config_ops *ops = v->vdpa->config;
	struct vdpa_device *vdpa = v->vdpa;
	int ret, irq;

	if (!ops->get_vq_irq)
		return;

	irq = ops->get_vq_irq(vdpa, qid);
	if (irq < 0)
		return;

	irq_bypass_unregister_producer(&vq->call_ctx.producer);
	if (!vq->call_ctx.ctx)
		return;

	vq->call_ctx.producer.token = vq->call_ctx.ctx;
	vq->call_ctx.producer.irq = irq;
	ret = irq_bypass_register_producer(&vq->call_ctx.producer);
	if (unlikely(ret))
		dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n",
			 qid, vq->call_ctx.producer.token, ret);
}
static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
{
	struct vhost_virtqueue *vq = &v->vqs[qid];

	irq_bypass_unregister_producer(&vq->call_ctx.producer);
}
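
/*
 * Reset the device through vdpa_reset().  When the backend has not
 * negotiated VHOST_BACKEND_F_IOTLB_PERSIST, ask the parent driver to
 * also clean its map (VDPA_RESET_F_CLEAN_MAP) so that a reset behaves
 * the way it did before persistent mappings were introduced.
 */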
static int _compat_vdpa_reset(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	u32 flags = 0;

	if (v->vdev.vqs)
		flags |= !vhost_backend_has_feature(v->vdev.vqs[0],
						    VHOST_BACKEND_F_IOTLB_PERSIST) ?
			 VDPA_RESET_F_CLEAN_MAP : 0;

	return vdpa_reset(vdpa, flags);
}
static int vhost_vdpa_reset(struct vhost_vdpa *v)
{
	v->in_batch = 0;

	return _compat_vdpa_reset(v);
}
static long vhost_vdpa_bind_mm(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (!vdpa->use_va || !ops->bind_mm)
		return 0;

	return ops->bind_mm(vdpa, v->vdev.mm);
}
static void vhost_vdpa_unbind_mm(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (!vdpa->use_va || !ops->unbind_mm)
		return;

	ops->unbind_mm(vdpa);
}
static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u32 device_id;

	device_id = ops->get_device_id(vdpa);

	if (copy_to_user(argp, &device_id, sizeof(device_id)))
		return -EFAULT;

	return 0;
}
static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u8 status;

	status = ops->get_status(vdpa);

	if (copy_to_user(statusp, &status, sizeof(status)))
		return -EFAULT;

	return 0;
}
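
/*
 * VHOST_VDPA_SET_STATUS: status bits may only be added, never removed,
 * unless the new status is 0 (a reset).  IRQ bypass producers are torn
 * down before DRIVER_OK is cleared and set up again once DRIVER_OK is
 * (re)established, so interrupt offloading tracks the running state.
 */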
static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u8 status, status_old;
	u32 nvqs = v->nvqs;
	int ret;
	u16 i;

	if (copy_from_user(&status, statusp, sizeof(status)))
		return -EFAULT;

	status_old = ops->get_status(vdpa);

	/*
	 * Userspace shouldn't remove status bits unless reset the
	 * status to 0.
	 */
	if (status != 0 && (status_old & ~status) != 0)
		return -EINVAL;

	if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK))
		for (i = 0; i < nvqs; i++)
			vhost_vdpa_unsetup_vq_irq(v, i);

	if (status == 0) {
		ret = _compat_vdpa_reset(v);
		if (ret)
			return ret;
	} else
		vdpa_set_status(vdpa, status);

	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK))
		for (i = 0; i < nvqs; i++)
			vhost_vdpa_setup_vq_irq(v, i);

	return 0;
}
static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
				      struct vhost_vdpa_config *c)
{
	struct vdpa_device *vdpa = v->vdpa;
	size_t size = vdpa->config->get_config_size(vdpa);

	if (c->len == 0 || c->off > size)
		return -EINVAL;

	if (c->len > size - c->off)
		return -E2BIG;

	return 0;
}
static long vhost_vdpa_get_config(struct vhost_vdpa *v,
				  struct vhost_vdpa_config __user *c)
{
	struct vdpa_device *vdpa = v->vdpa;
	struct vhost_vdpa_config config;
	unsigned long size = offsetof(struct vhost_vdpa_config, buf);
	u8 *buf;

	if (copy_from_user(&config, c, size))
		return -EFAULT;
	if (vhost_vdpa_config_validate(v, &config))
		return -EINVAL;
	buf = kvzalloc(config.len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	vdpa_get_config(vdpa, config.off, buf, config.len);

	if (copy_to_user(c->buf, buf, config.len)) {
		kvfree(buf);
		return -EFAULT;
	}

	kvfree(buf);
	return 0;
}
static long vhost_vdpa_set_config(struct vhost_vdpa *v,
				  struct vhost_vdpa_config __user *c)
{
	struct vdpa_device *vdpa = v->vdpa;
	struct vhost_vdpa_config config;
	unsigned long size = offsetof(struct vhost_vdpa_config, buf);
	u8 *buf;

	if (copy_from_user(&config, c, size))
		return -EFAULT;
	if (vhost_vdpa_config_validate(v, &config))
		return -EINVAL;

	buf = vmemdup_user(c->buf, config.len);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	vdpa_set_config(vdpa, config.off, buf, config.len);

	kvfree(buf);
	return 0;
}
static bool vhost_vdpa_can_suspend(const struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	return ops->suspend;
}
static bool vhost_vdpa_can_resume(const struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	return ops->resume;
}
static bool vhost_vdpa_has_desc_group(const struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	return ops->get_vq_desc_group;
}
static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u64 features;

	features = ops->get_device_features(vdpa);

	if (copy_to_user(featurep, &features, sizeof(features)))
		return -EFAULT;

	return 0;
}
static u64 vhost_vdpa_get_backend_features(const struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (!ops->get_backend_features)
		return 0;
	else
		return ops->get_backend_features(vdpa);
}
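
/*
 * Mappings survive a device reset when the device uses the platform
 * IOMMU (no set_map/dma_map ops), when it can explicitly restore the
 * default map via reset_map, or when it advertises
 * VHOST_BACKEND_F_IOTLB_PERSIST in its backend features.
 */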
static bool vhost_vdpa_has_persistent_map(const struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	return (!ops->set_map && !ops->dma_map) || ops->reset_map ||
	       vhost_vdpa_get_backend_features(v) & BIT_ULL(VHOST_BACKEND_F_IOTLB_PERSIST);
}
static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vhost_dev *d = &v->vdev;
	u64 actual_features;
	u64 features;
	int i;

	/*
	 * It's not allowed to change the features after they have
	 * been negotiated.
	 */
	if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
		return -EBUSY;

	if (copy_from_user(&features, featurep, sizeof(features)))
		return -EFAULT;

	if (vdpa_set_features(vdpa, features))
		return -EINVAL;

	/* let the vqs know what has been configured */
	actual_features = ops->get_driver_features(vdpa);
	for (i = 0; i < d->nvqs; ++i) {
		struct vhost_virtqueue *vq = d->vqs[i];

		mutex_lock(&vq->mutex);
		vq->acked_features = actual_features;
		mutex_unlock(&vq->mutex);
	}

	return 0;
}
static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u16 num;

	num = ops->get_vq_num_max(vdpa);

	if (copy_to_user(argp, &num, sizeof(num)))
		return -EFAULT;

	return 0;
}
static void vhost_vdpa_config_put(struct vhost_vdpa *v)
{
	if (v->config_ctx) {
		eventfd_ctx_put(v->config_ctx);
		v->config_ctx = NULL;
	}
}
static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp)
{
	struct vdpa_callback cb;
	int fd;
	struct eventfd_ctx *ctx;

	cb.callback = vhost_vdpa_config_cb;
	cb.private = v;
	if (copy_from_user(&fd, argp, sizeof(fd)))
		return -EFAULT;

	ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
	swap(ctx, v->config_ctx);

	if (!IS_ERR_OR_NULL(ctx))
		eventfd_ctx_put(ctx);

	if (IS_ERR(v->config_ctx)) {
		long ret = PTR_ERR(v->config_ctx);

		v->config_ctx = NULL;
		return ret;
	}

	v->vdpa->config->set_config_cb(v->vdpa, &cb);

	return 0;
}
static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp)
{
	struct vhost_vdpa_iova_range range = {
		.first = v->range.first,
		.last = v->range.last,
	};

	if (copy_to_user(argp, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}
static long vhost_vdpa_get_config_size(struct vhost_vdpa *v, u32 __user *argp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u32 size;

	size = ops->get_config_size(vdpa);

	if (copy_to_user(argp, &size, sizeof(size)))
		return -EFAULT;

	return 0;
}
static long vhost_vdpa_get_vqs_count(struct vhost_vdpa *v, u32 __user *argp)
{
	struct vdpa_device *vdpa = v->vdpa;

	if (copy_to_user(argp, &vdpa->nvqs, sizeof(vdpa->nvqs)))
		return -EFAULT;

	return 0;
}
/* After a successful return of this ioctl the device must not process more
 * virtqueue descriptors. The device can answer to read or writes of config
 * fields as if it were not suspended. In particular, writing to "queue_enable"
 * with a value of 1 will not make the device start processing buffers.
 */
static long vhost_vdpa_suspend(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (!ops->suspend)
		return -EOPNOTSUPP;

	return ops->suspend(vdpa);
}
/* After a successful return of this ioctl the device resumes processing
 * virtqueue descriptors. The device becomes fully operational the same way it
 * was before it was suspended.
 */
static long vhost_vdpa_resume(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (!ops->resume)
		return -EOPNOTSUPP;

	return ops->resume(vdpa);
}
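
/*
 * Per-virtqueue ioctls.  The vdpa specific commands (vring enable,
 * group, descriptor group, group ASID, vring base) are translated
 * directly into vdpa_config_ops calls; everything else is forwarded to
 * the generic vhost_vring_ioctl() and the result is then pushed down to
 * the parent driver.  For packed rings the avail/used wrap counters are
 * carried in bit 15 of last_avail_idx/last_used_idx, with the ring
 * index itself in the low 15 bits.
 */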
static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
				   void __user *argp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vdpa_vq_state vq_state;
	struct vdpa_callback cb;
	struct vhost_virtqueue *vq;
	struct vhost_vring_state s;
	u32 idx;
	long r;

	r = get_user(idx, (u32 __user *)argp);
	if (r < 0)
		return r;

	if (idx >= v->nvqs)
		return -ENOBUFS;

	idx = array_index_nospec(idx, v->nvqs);
	vq = &v->vqs[idx];

	switch (cmd) {
	case VHOST_VDPA_SET_VRING_ENABLE:
		if (copy_from_user(&s, argp, sizeof(s)))
			return -EFAULT;
		ops->set_vq_ready(vdpa, idx, s.num);
		return 0;
	case VHOST_VDPA_GET_VRING_GROUP:
		if (!ops->get_vq_group)
			return -EOPNOTSUPP;
		s.index = idx;
		s.num = ops->get_vq_group(vdpa, idx);
		if (s.num >= vdpa->ngroups)
			return -EIO;
		else if (copy_to_user(argp, &s, sizeof(s)))
			return -EFAULT;
		return 0;
	case VHOST_VDPA_GET_VRING_DESC_GROUP:
		if (!vhost_vdpa_has_desc_group(v))
			return -EOPNOTSUPP;
		s.index = idx;
		s.num = ops->get_vq_desc_group(vdpa, idx);
		if (s.num >= vdpa->ngroups)
			return -EIO;
		else if (copy_to_user(argp, &s, sizeof(s)))
			return -EFAULT;
		return 0;
	case VHOST_VDPA_SET_GROUP_ASID:
		if (copy_from_user(&s, argp, sizeof(s)))
			return -EFAULT;
		if (s.num >= vdpa->nas)
			return -EINVAL;
		if (!ops->set_group_asid)
			return -EOPNOTSUPP;
		return ops->set_group_asid(vdpa, idx, s.num);
	case VHOST_GET_VRING_BASE:
		r = ops->get_vq_state(v->vdpa, idx, &vq_state);
		if (r)
			return r;

		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
			vq->last_avail_idx = vq_state.packed.last_avail_idx |
					     (vq_state.packed.last_avail_counter << 15);
			vq->last_used_idx = vq_state.packed.last_used_idx |
					    (vq_state.packed.last_used_counter << 15);
		} else {
			vq->last_avail_idx = vq_state.split.avail_index;
		}
		break;
	}

	r = vhost_vring_ioctl(&v->vdev, cmd, argp);
	if (r)
		return r;

	switch (cmd) {
	case VHOST_SET_VRING_ADDR:
		if (ops->set_vq_address(vdpa, idx,
					(u64)(uintptr_t)vq->desc,
					(u64)(uintptr_t)vq->avail,
					(u64)(uintptr_t)vq->used))
			r = -EINVAL;
		break;

	case VHOST_SET_VRING_BASE:
		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
			vq_state.packed.last_avail_idx = vq->last_avail_idx & 0x7fff;
			vq_state.packed.last_avail_counter = !!(vq->last_avail_idx & 0x8000);
			vq_state.packed.last_used_idx = vq->last_used_idx & 0x7fff;
			vq_state.packed.last_used_counter = !!(vq->last_used_idx & 0x8000);
		} else {
			vq_state.split.avail_index = vq->last_avail_idx;
		}
		r = ops->set_vq_state(vdpa, idx, &vq_state);
		break;

	case VHOST_SET_VRING_CALL:
		if (vq->call_ctx.ctx) {
			cb.callback = vhost_vdpa_virtqueue_cb;
			cb.private = vq;
			cb.trigger = vq->call_ctx.ctx;
		} else {
			cb.callback = NULL;
			cb.private = NULL;
			cb.trigger = NULL;
		}
		ops->set_vq_cb(vdpa, idx, &cb);
		vhost_vdpa_setup_vq_irq(v, idx);
		break;

	case VHOST_SET_VRING_NUM:
		ops->set_vq_num(vdpa, idx, vq->num);
		break;
	}

	return r;
}
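
/*
 * Top level ioctl handler.  VHOST_SET_BACKEND_FEATURES is validated and
 * applied before taking the vhost_dev mutex; every other command is
 * dispatched under d->mutex, falling back first to vhost_dev_ioctl()
 * and then to the per-vring handler above.
 */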
static long vhost_vdpa_unlocked_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vhost_vdpa *v = filep->private_data;
	struct vhost_dev *d = &v->vdev;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	u64 features;
	long r = 0;

	if (cmd == VHOST_SET_BACKEND_FEATURES) {
		if (copy_from_user(&features, featurep, sizeof(features)))
			return -EFAULT;
		if (features & ~(VHOST_VDPA_BACKEND_FEATURES |
				 BIT_ULL(VHOST_BACKEND_F_DESC_ASID) |
				 BIT_ULL(VHOST_BACKEND_F_IOTLB_PERSIST) |
				 BIT_ULL(VHOST_BACKEND_F_SUSPEND) |
				 BIT_ULL(VHOST_BACKEND_F_RESUME) |
				 BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK)))
			return -EOPNOTSUPP;
		if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) &&
		     !vhost_vdpa_can_suspend(v))
			return -EOPNOTSUPP;
		if ((features & BIT_ULL(VHOST_BACKEND_F_RESUME)) &&
		     !vhost_vdpa_can_resume(v))
			return -EOPNOTSUPP;
		if ((features & BIT_ULL(VHOST_BACKEND_F_DESC_ASID)) &&
		    !(features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)))
			return -EINVAL;
		if ((features & BIT_ULL(VHOST_BACKEND_F_DESC_ASID)) &&
		     !vhost_vdpa_has_desc_group(v))
			return -EOPNOTSUPP;
		if ((features & BIT_ULL(VHOST_BACKEND_F_IOTLB_PERSIST)) &&
		     !vhost_vdpa_has_persistent_map(v))
			return -EOPNOTSUPP;
		vhost_set_backend_features(&v->vdev, features);
		return 0;
	}

	mutex_lock(&d->mutex);

	switch (cmd) {
	case VHOST_VDPA_GET_DEVICE_ID:
		r = vhost_vdpa_get_device_id(v, argp);
		break;
	case VHOST_VDPA_GET_STATUS:
		r = vhost_vdpa_get_status(v, argp);
		break;
	case VHOST_VDPA_SET_STATUS:
		r = vhost_vdpa_set_status(v, argp);
		break;
	case VHOST_VDPA_GET_CONFIG:
		r = vhost_vdpa_get_config(v, argp);
		break;
	case VHOST_VDPA_SET_CONFIG:
		r = vhost_vdpa_set_config(v, argp);
		break;
	case VHOST_GET_FEATURES:
		r = vhost_vdpa_get_features(v, argp);
		break;
	case VHOST_SET_FEATURES:
		r = vhost_vdpa_set_features(v, argp);
		break;
	case VHOST_VDPA_GET_VRING_NUM:
		r = vhost_vdpa_get_vring_num(v, argp);
		break;
	case VHOST_VDPA_GET_GROUP_NUM:
		if (copy_to_user(argp, &v->vdpa->ngroups,
				 sizeof(v->vdpa->ngroups)))
			r = -EFAULT;
		break;
	case VHOST_VDPA_GET_AS_NUM:
		if (copy_to_user(argp, &v->vdpa->nas, sizeof(v->vdpa->nas)))
			r = -EFAULT;
		break;
	case VHOST_SET_LOG_BASE:
	case VHOST_SET_LOG_FD:
		r = -ENOIOCTLCMD;
		break;
	case VHOST_VDPA_SET_CONFIG_CALL:
		r = vhost_vdpa_set_config_call(v, argp);
		break;
	case VHOST_GET_BACKEND_FEATURES:
		features = VHOST_VDPA_BACKEND_FEATURES;
		if (vhost_vdpa_can_suspend(v))
			features |= BIT_ULL(VHOST_BACKEND_F_SUSPEND);
		if (vhost_vdpa_can_resume(v))
			features |= BIT_ULL(VHOST_BACKEND_F_RESUME);
		if (vhost_vdpa_has_desc_group(v))
			features |= BIT_ULL(VHOST_BACKEND_F_DESC_ASID);
		if (vhost_vdpa_has_persistent_map(v))
			features |= BIT_ULL(VHOST_BACKEND_F_IOTLB_PERSIST);
		features |= vhost_vdpa_get_backend_features(v);
		if (copy_to_user(featurep, &features, sizeof(features)))
			r = -EFAULT;
		break;
	case VHOST_VDPA_GET_IOVA_RANGE:
		r = vhost_vdpa_get_iova_range(v, argp);
		break;
	case VHOST_VDPA_GET_CONFIG_SIZE:
		r = vhost_vdpa_get_config_size(v, argp);
		break;
	case VHOST_VDPA_GET_VQS_COUNT:
		r = vhost_vdpa_get_vqs_count(v, argp);
		break;
	case VHOST_VDPA_SUSPEND:
		r = vhost_vdpa_suspend(v);
		break;
	case VHOST_VDPA_RESUME:
		r = vhost_vdpa_resume(v);
		break;
	default:
		r = vhost_dev_ioctl(&v->vdev, cmd, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vdpa_vring_ioctl(v, cmd, argp);
		break;
	}

	if (r)
		goto out;

	switch (cmd) {
	case VHOST_SET_OWNER:
		r = vhost_vdpa_bind_mm(v);
		if (r)
			vhost_dev_reset_owner(d, NULL);
		break;
	}
out:
	mutex_unlock(&d->mutex);
	return r;
}
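
/*
 * Unmapping has three backends: devices with their own DMA translation
 * get dma_unmap(), devices driven through set_map() are refreshed with
 * the updated iotlb by their callers, and everything else goes through
 * the platform IOMMU domain.
 */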
static void vhost_vdpa_general_unmap(struct vhost_vdpa *v,
				     struct vhost_iotlb_map *map, u32 asid)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (ops->dma_map) {
		ops->dma_unmap(vdpa, asid, map->start, map->size);
	} else if (ops->set_map == NULL) {
		iommu_unmap(v->domain, map->start, map->size);
	}
}
static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
				u64 start, u64 last, u32 asid)
{
	struct vhost_dev *dev = &v->vdev;
	struct vhost_iotlb_map *map;
	struct page *page;
	unsigned long pfn, pinned;

	while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
		pinned = PFN_DOWN(map->size);
		for (pfn = PFN_DOWN(map->addr);
		     pinned > 0; pfn++, pinned--) {
			page = pfn_to_page(pfn);
			if (map->perm & VHOST_ACCESS_WO)
				set_page_dirty_lock(page);
			unpin_user_page(page);
		}
		atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
		vhost_vdpa_general_unmap(v, map, asid);
		vhost_iotlb_map_free(iotlb, map);
	}
}
static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
				u64 start, u64 last, u32 asid)
{
	struct vhost_iotlb_map *map;
	struct vdpa_map_file *map_file;

	while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
		map_file = (struct vdpa_map_file *)map->opaque;
		fput(map_file->file);
		kfree(map_file);
		vhost_vdpa_general_unmap(v, map, asid);
		vhost_iotlb_map_free(iotlb, map);
	}
}
static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v,
				   struct vhost_iotlb *iotlb, u64 start,
				   u64 last, u32 asid)
{
	struct vdpa_device *vdpa = v->vdpa;

	if (vdpa->use_va)
		return vhost_vdpa_va_unmap(v, iotlb, start, last, asid);

	return vhost_vdpa_pa_unmap(v, iotlb, start, last, asid);
}
static int perm_to_iommu_flags(u32 perm)
{
	int flags = 0;

	switch (perm) {
	case VHOST_ACCESS_WO:
		flags |= IOMMU_WRITE;
		break;
	case VHOST_ACCESS_RO:
		flags |= IOMMU_READ;
		break;
	case VHOST_ACCESS_RW:
		flags |= (IOMMU_WRITE | IOMMU_READ);
		break;
	default:
		WARN(1, "invalid vhost IOTLB permission\n");
		break;
	}

	return flags | IOMMU_CACHE;
}
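
/*
 * Insert a range into the per-ASID iotlb and propagate it to the
 * device: through dma_map(), through a full set_map() refresh (only at
 * batch end when batching), or by programming the platform IOMMU
 * domain.  Pinned pages are accounted against mm->pinned_vm for
 * physical-address (non use_va) devices.
 */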
static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
			  u64 iova, u64 size, u64 pa, u32 perm, void *opaque)
{
	struct vhost_dev *dev = &v->vdev;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u32 asid = iotlb_to_asid(iotlb);
	int r = 0;

	r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
				      pa, perm, opaque);
	if (r)
		return r;

	if (ops->dma_map) {
		r = ops->dma_map(vdpa, asid, iova, size, pa, perm, opaque);
	} else if (ops->set_map) {
		if (!v->in_batch)
			r = ops->set_map(vdpa, asid, iotlb);
	} else {
		r = iommu_map(v->domain, iova, pa, size,
			      perm_to_iommu_flags(perm), GFP_KERNEL);
	}
	if (r) {
		vhost_iotlb_del_range(iotlb, iova, iova + size - 1);
		return r;
	}

	if (!vdpa->use_va)
		atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);

	return 0;
}
static void vhost_vdpa_unmap(struct vhost_vdpa *v,
			     struct vhost_iotlb *iotlb,
			     u64 iova, u64 size)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u32 asid = iotlb_to_asid(iotlb);

	vhost_vdpa_iotlb_unmap(v, iotlb, iova, iova + size - 1, asid);

	if (ops->set_map) {
		if (!v->in_batch)
			ops->set_map(vdpa, asid, iotlb);
	}
}
static int vhost_vdpa_va_map(struct vhost_vdpa *v,
			     struct vhost_iotlb *iotlb,
			     u64 iova, u64 size, u64 uaddr, u32 perm)
{
	struct vhost_dev *dev = &v->vdev;
	u64 offset, map_size, map_iova = iova;
	struct vdpa_map_file *map_file;
	struct vm_area_struct *vma;
	int ret = 0;

	mmap_read_lock(dev->mm);

	while (size) {
		vma = find_vma(dev->mm, uaddr);
		if (!vma) {
			ret = -EINVAL;
			break;
		}
		map_size = min(size, vma->vm_end - uaddr);
		if (!(vma->vm_file && (vma->vm_flags & VM_SHARED) &&
			!(vma->vm_flags & (VM_IO | VM_PFNMAP))))
			goto next;

		map_file = kzalloc(sizeof(*map_file), GFP_KERNEL);
		if (!map_file) {
			ret = -ENOMEM;
			break;
		}
		offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
		map_file->offset = offset;
		map_file->file = get_file(vma->vm_file);
		ret = vhost_vdpa_map(v, iotlb, map_iova, map_size, uaddr,
				     perm, map_file);
		if (ret) {
			fput(map_file->file);
			kfree(map_file);
			break;
		}
next:
		size -= map_size;
		uaddr += map_size;
		map_iova += map_size;
	}
	if (ret)
		vhost_vdpa_unmap(v, iotlb, iova, map_iova - iova);

	mmap_read_unlock(dev->mm);

	return ret;
}
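
/*
 * Physical-address path: pin the user pages in chunks (one page worth
 * of struct page pointers at a time), coalesce physically contiguous
 * runs into single vhost_vdpa_map() calls and charge the pinned pages
 * against RLIMIT_MEMLOCK.  On failure, every page that was pinned but
 * not handed to vhost_vdpa_map() is unpinned again.
 */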
static int vhost_vdpa_pa_map(struct vhost_vdpa *v,
			     struct vhost_iotlb *iotlb,
			     u64 iova, u64 size, u64 uaddr, u32 perm)
{
	struct vhost_dev *dev = &v->vdev;
	struct page **page_list;
	unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
	unsigned int gup_flags = FOLL_LONGTERM;
	unsigned long npages, cur_base, map_pfn, last_pfn = 0;
	unsigned long lock_limit, sz2pin, nchunks, i;
	u64 start = iova;
	long pinned;
	int ret = 0;

	/* Limit the use of memory for bookkeeping */
	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	if (perm & VHOST_ACCESS_WO)
		gup_flags |= FOLL_WRITE;

	npages = PFN_UP(size + (iova & ~PAGE_MASK));
	if (!npages) {
		ret = -EINVAL;
		goto free;
	}

	mmap_read_lock(dev->mm);

	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
	if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
		ret = -ENOMEM;
		goto unlock;
	}

	cur_base = uaddr & PAGE_MASK;
	iova &= PAGE_MASK;
	nchunks = 0;

	while (npages) {
		sz2pin = min_t(unsigned long, npages, list_size);
		pinned = pin_user_pages(cur_base, sz2pin,
					gup_flags, page_list);
		if (sz2pin != pinned) {
			if (pinned < 0) {
				ret = pinned;
			} else {
				unpin_user_pages(page_list, pinned);
				ret = -ENOMEM;
			}
			goto out;
		}
		nchunks++;

		if (!last_pfn)
			map_pfn = page_to_pfn(page_list[0]);

		for (i = 0; i < pinned; i++) {
			unsigned long this_pfn = page_to_pfn(page_list[i]);
			u64 csize;

			if (last_pfn && (this_pfn != last_pfn + 1)) {
				/* Pin a contiguous chunk of memory */
				csize = PFN_PHYS(last_pfn - map_pfn + 1);
				ret = vhost_vdpa_map(v, iotlb, iova, csize,
						     PFN_PHYS(map_pfn),
						     perm, NULL);
				if (ret) {
					/*
					 * Unpin the pages that are left unmapped
					 * from this point on in the current
					 * page_list. The remaining outstanding
					 * ones which may stride across several
					 * chunks will be covered in the common
					 * error path subsequently.
					 */
					unpin_user_pages(&page_list[i],
							 pinned - i);
					goto out;
				}

				map_pfn = this_pfn;
				iova += csize;
				nchunks = 0;
			}

			last_pfn = this_pfn;
		}

		cur_base += PFN_PHYS(pinned);
		npages -= pinned;
	}

	/* Pin the rest chunk */
	ret = vhost_vdpa_map(v, iotlb, iova, PFN_PHYS(last_pfn - map_pfn + 1),
			     PFN_PHYS(map_pfn), perm, NULL);
out:
	if (ret) {
		if (nchunks) {
			unsigned long pfn;

			/*
			 * Unpin the outstanding pages which are yet to be
			 * mapped but haven't due to vdpa_map() or
			 * pin_user_pages() failure.
			 *
			 * Mapped pages are accounted in vdpa_map(), hence
			 * the corresponding unpinning will be handled by
			 * vdpa_unmap().
			 */
			for (pfn = map_pfn; pfn <= last_pfn; pfn++)
				unpin_user_page(pfn_to_page(pfn));
		}
		vhost_vdpa_unmap(v, iotlb, start, size);
	}
unlock:
	mmap_read_unlock(dev->mm);
free:
	free_page((unsigned long)page_list);
	return ret;
}
static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
					   struct vhost_iotlb *iotlb,
					   struct vhost_iotlb_msg *msg)
{
	struct vdpa_device *vdpa = v->vdpa;

	if (msg->iova < v->range.first || !msg->size ||
	    msg->iova > U64_MAX - msg->size + 1 ||
	    msg->iova + msg->size - 1 > v->range.last)
		return -EINVAL;

	if (vhost_iotlb_itree_first(iotlb, msg->iova,
				    msg->iova + msg->size - 1))
		return -EEXIST;

	if (vdpa->use_va)
		return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
					 msg->uaddr, msg->perm);

	return vhost_vdpa_pa_map(v, iotlb, msg->iova, msg->size, msg->uaddr,
				 msg->perm);
}
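
/*
 * Entry point for VHOST_IOTLB_MSG_V2 messages.  UPDATE and BATCH_BEGIN
 * lazily create the target address space; INVALIDATE tears mappings
 * down; BATCH_BEGIN/BATCH_END bracket a series of updates so that
 * set_map() capable devices are refreshed only once, at BATCH_END.
 */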
static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, u32 asid,
					struct vhost_iotlb_msg *msg)
{
	struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev);
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vhost_iotlb *iotlb = NULL;
	struct vhost_vdpa_as *as = NULL;
	int r = 0;

	mutex_lock(&dev->mutex);

	r = vhost_dev_check_owner(dev);
	if (r)
		goto unlock;

	if (msg->type == VHOST_IOTLB_UPDATE ||
	    msg->type == VHOST_IOTLB_BATCH_BEGIN) {
		as = vhost_vdpa_find_alloc_as(v, asid);
		if (!as) {
			dev_err(&v->dev, "can't find and alloc asid %d\n",
				asid);
			r = -EINVAL;
			goto unlock;
		}
		iotlb = &as->iotlb;
	} else
		iotlb = asid_to_iotlb(v, asid);

	if ((v->in_batch && v->batch_asid != asid) || !iotlb) {
		if (v->in_batch && v->batch_asid != asid) {
			dev_info(&v->dev, "batch id %d asid %d\n",
				 v->batch_asid, asid);
		}
		if (!iotlb)
			dev_err(&v->dev, "no iotlb for asid %d\n", asid);
		r = -EINVAL;
		goto unlock;
	}

	switch (msg->type) {
	case VHOST_IOTLB_UPDATE:
		r = vhost_vdpa_process_iotlb_update(v, iotlb, msg);
		break;
	case VHOST_IOTLB_INVALIDATE:
		vhost_vdpa_unmap(v, iotlb, msg->iova, msg->size);
		break;
	case VHOST_IOTLB_BATCH_BEGIN:
		v->batch_asid = asid;
		v->in_batch = true;
		break;
	case VHOST_IOTLB_BATCH_END:
		if (v->in_batch && ops->set_map)
			ops->set_map(vdpa, asid, iotlb);
		v->in_batch = false;
		break;
	default:
		r = -EINVAL;
		break;
	}
unlock:
	mutex_unlock(&dev->mutex);

	return r;
}
static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
					 struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vhost_vdpa *v = file->private_data;
	struct vhost_dev *dev = &v->vdev;

	return vhost_chr_write_iter(dev, from);
}
static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct device *dma_dev = vdpa_get_dma_dev(vdpa);
	const struct bus_type *bus;
	int ret;

	/* Device wants to do DMA by itself */
	if (ops->set_map || ops->dma_map)
		return 0;

	bus = dma_dev->bus;
	if (!bus)
		return -EFAULT;

	if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY)) {
		dev_warn_once(&v->dev,
			      "Failed to allocate domain, device is not IOMMU cache coherent capable\n");
		return -ENOTSUPP;
	}

	v->domain = iommu_domain_alloc(bus);
	if (!v->domain)
		return -EIO;

	ret = iommu_attach_device(v->domain, dma_dev);
	if (ret)
		goto err_attach;

	return 0;

err_attach:
	iommu_domain_free(v->domain);
	return ret;
}
static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	struct device *dma_dev = vdpa_get_dma_dev(vdpa);

	if (v->domain) {
		iommu_detach_device(v->domain, dma_dev);
		iommu_domain_free(v->domain);
	}

	v->domain = NULL;
}
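
/*
 * The usable IOVA range reported to userspace comes, in order of
 * preference, from the parent device (get_iova_range), from the
 * platform IOMMU aperture, or defaults to the whole 64-bit space.
 */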
static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
{
	struct vdpa_iova_range *range = &v->range;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (ops->get_iova_range) {
		*range = ops->get_iova_range(vdpa);
	} else if (v->domain && v->domain->geometry.force_aperture) {
		range->first = v->domain->geometry.aperture_start;
		range->last = v->domain->geometry.aperture_end;
	} else {
		range->first = 0;
		range->last = ULLONG_MAX;
	}
}
static void vhost_vdpa_cleanup(struct vhost_vdpa *v)
{
	struct vhost_vdpa_as *as;
	u32 asid;

	for (asid = 0; asid < v->vdpa->nas; asid++) {
		as = asid_to_as(v, asid);
		if (as)
			vhost_vdpa_remove_as(v, asid);
	}

	vhost_vdpa_free_domain(v);
	vhost_dev_cleanup(&v->vdev);
	kfree(v->vdev.vqs);
	v->vdev.vqs = NULL;
}
static int vhost_vdpa_open(struct inode *inode, struct file *filep)
{
	struct vhost_vdpa *v;
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	int r, opened;
	u32 i, nvqs;

	v = container_of(inode->i_cdev, struct vhost_vdpa, cdev);

	opened = atomic_cmpxchg(&v->opened, 0, 1);
	if (opened)
		return -EBUSY;

	nvqs = v->nvqs;
	r = vhost_vdpa_reset(v);
	if (r)
		goto err;

	vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		r = -ENOMEM;
		goto err;
	}

	dev = &v->vdev;
	for (i = 0; i < nvqs; i++) {
		vqs[i] = &v->vqs[i];
		vqs[i]->handle_kick = handle_vq_kick;
	}
	vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
		       vhost_vdpa_process_iotlb_msg);

	r = vhost_vdpa_alloc_domain(v);
	if (r)
		goto err_alloc_domain;

	vhost_vdpa_set_iova_range(v);

	filep->private_data = v;

	return 0;

err_alloc_domain:
	vhost_vdpa_cleanup(v);
err:
	atomic_dec(&v->opened);
	return r;
}
static void vhost_vdpa_clean_irq(struct vhost_vdpa *v)
{
	u32 i;

	for (i = 0; i < v->nvqs; i++)
		vhost_vdpa_unsetup_vq_irq(v, i);
}
static int vhost_vdpa_release(struct inode *inode, struct file *filep)
{
	struct vhost_vdpa *v = filep->private_data;
	struct vhost_dev *d = &v->vdev;

	mutex_lock(&d->mutex);
	filep->private_data = NULL;
	vhost_vdpa_clean_irq(v);
	vhost_vdpa_reset(v);
	vhost_dev_stop(&v->vdev);
	vhost_vdpa_unbind_mm(v);
	vhost_vdpa_config_put(v);
	vhost_vdpa_cleanup(v);
	mutex_unlock(&d->mutex);

	atomic_dec(&v->opened);
	complete(&v->completion);

	return 0;
}
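
/*
 * Doorbell mapping (CONFIG_MMU only): userspace can mmap() one page per
 * virtqueue, selected by vm_pgoff, to write the device's notification
 * area directly.  The fault handler below remaps the doorbell PFN with
 * uncached attributes.
 */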
#ifdef CONFIG_MMU
static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf)
{
	struct vhost_vdpa *v = vmf->vma->vm_file->private_data;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vdpa_notification_area notify;
	struct vm_area_struct *vma = vmf->vma;
	u16 index = vma->vm_pgoff;

	notify = ops->get_vq_notification(vdpa, index);

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	if (remap_pfn_range(vma, vmf->address & PAGE_MASK,
			    PFN_DOWN(notify.addr), PAGE_SIZE,
			    vma->vm_page_prot))
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}
static const struct vm_operations_struct vhost_vdpa_vm_ops = {
	.fault = vhost_vdpa_fault,
};
static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct vhost_vdpa *v = vma->vm_file->private_data;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vdpa_notification_area notify;
	unsigned long index = vma->vm_pgoff;

	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (vma->vm_flags & VM_READ)
		return -EINVAL;
	if (index > 65535)
		return -EINVAL;
	if (!ops->get_vq_notification)
		return -ENOTSUPP;

	/* To be safe and easily modelled by userspace, we only
	 * support the doorbell which sits on the page boundary and
	 * does not share the page with other registers.
	 */
	notify = ops->get_vq_notification(vdpa, index);
	if (notify.addr & (PAGE_SIZE - 1))
		return -EINVAL;
	if (vma->vm_end - vma->vm_start != notify.size)
		return -ENOTSUPP;

	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &vhost_vdpa_vm_ops;
	return 0;
}
#endif /* CONFIG_MMU */
static const struct file_operations vhost_vdpa_fops = {
	.owner = THIS_MODULE,
	.open = vhost_vdpa_open,
	.release = vhost_vdpa_release,
	.write_iter = vhost_vdpa_chr_write_iter,
	.unlocked_ioctl = vhost_vdpa_unlocked_ioctl,
#ifdef CONFIG_MMU
	.mmap = vhost_vdpa_mmap,
#endif /* CONFIG_MMU */
	.compat_ioctl = compat_ptr_ioctl,
};
static void vhost_vdpa_release_dev(struct device *device)
{
	struct vhost_vdpa *v =
	       container_of(device, struct vhost_vdpa, dev);

	ida_simple_remove(&vhost_vdpa_ida, v->minor);
	kfree(v->vqs);
	kfree(v);
}
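
/*
 * Bus probe for the vdpa bus.  Devices that rely on the platform IOMMU
 * (no set_map/dma_map ops) are limited to a single virtqueue group and
 * a single address space, since only one IOMMU domain backs them.  Each
 * bound device is exposed as a character device named "vhost-vdpa-<minor>"
 * (typically /dev/vhost-vdpa-<minor>).
 */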
static int vhost_vdpa_probe(struct vdpa_device *vdpa)
{
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vhost_vdpa *v;
	int minor;
	int i, r;

	/* We can't support platform IOMMU device with more than 1
	 * group or address space
	 */
	if (!ops->set_map && !ops->dma_map &&
	    (vdpa->ngroups > 1 || vdpa->nas > 1))
		return -EOPNOTSUPP;

	v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!v)
		return -ENOMEM;

	minor = ida_simple_get(&vhost_vdpa_ida, 0,
			       VHOST_VDPA_DEV_MAX, GFP_KERNEL);
	if (minor < 0) {
		kfree(v);
		return minor;
	}

	atomic_set(&v->opened, 0);
	v->minor = minor;
	v->vdpa = vdpa;
	v->nvqs = vdpa->nvqs;
	v->virtio_id = ops->get_device_id(vdpa);

	device_initialize(&v->dev);
	v->dev.release = vhost_vdpa_release_dev;
	v->dev.parent = &vdpa->dev;
	v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor);
	v->vqs = kmalloc_array(v->nvqs, sizeof(struct vhost_virtqueue),
			       GFP_KERNEL);
	if (!v->vqs) {
		r = -ENOMEM;
		goto err;
	}

	r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
	if (r)
		goto err;

	cdev_init(&v->cdev, &vhost_vdpa_fops);
	v->cdev.owner = THIS_MODULE;

	r = cdev_device_add(&v->cdev, &v->dev);
	if (r)
		goto err;

	init_completion(&v->completion);
	vdpa_set_drvdata(vdpa, v);

	for (i = 0; i < VHOST_VDPA_IOTLB_BUCKETS; i++)
		INIT_HLIST_HEAD(&v->as[i]);

	return 0;

err:
	put_device(&v->dev);
	ida_simple_remove(&vhost_vdpa_ida, v->minor);
	return r;
}
static void vhost_vdpa_remove(struct vdpa_device *vdpa)
{
	struct vhost_vdpa *v = vdpa_get_drvdata(vdpa);
	int opened;

	cdev_device_del(&v->cdev, &v->dev);

	do {
		opened = atomic_cmpxchg(&v->opened, 0, 1);
		if (!opened)
			break;
		wait_for_completion(&v->completion);
	} while (1);

	put_device(&v->dev);
}
static struct vdpa_driver vhost_vdpa_driver = {
	.driver = {
		.name = "vhost_vdpa",
	},
	.probe = vhost_vdpa_probe,
	.remove = vhost_vdpa_remove,
};
static int __init vhost_vdpa_init(void)
{
	int r;

	r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX,
				"vhost-vdpa");
	if (r)
		goto err_alloc_chrdev;

	r = vdpa_register_driver(&vhost_vdpa_driver);
	if (r)
		goto err_vdpa_register_driver;

	return 0;

err_vdpa_register_driver:
	unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
err_alloc_chrdev:
	return r;
}
module_init(vhost_vdpa_init);

static void __exit vhost_vdpa_exit(void)
{
	vdpa_unregister_driver(&vhost_vdpa_driver);
	unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
}
module_exit(vhost_vdpa_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Intel Corporation");
MODULE_DESCRIPTION("vDPA-based vhost backend for virtio");