/*
 * Copyright Red Hat, Inc. 2010
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include <linux/vhost.h>
#include "exec/address-spaces.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "migration/blocker.h"
#include "sysemu/dma.h"
/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
#define VHOST_OPS_DEBUG(fmt, ...) \
    do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                      strerror(errno), errno); } while (0)
#else
#define VHOST_OPS_DEBUG(fmt, ...) \
    do { } while (0)
#endif
static struct vhost_log *vhost_log;
static struct vhost_log *vhost_log_shm;

static unsigned int used_memslots;
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);
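
/*
 * Report whether another memory slot can still be added without exceeding
 * the most restrictive backend limit across all registered vhost devices.
 */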
bool vhost_has_free_slot(void)
    unsigned int slots_limit = ~0U;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        slots_limit = MIN(slots_limit, r);
    return slots_limit > used_memslots;
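
/*
 * Walk the dirty log chunks that cover the intersection of a memory
 * section with the given range and mark the corresponding pages dirty
 * in QEMU's memory API.
 */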
static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
    vhost_log_chunk_t *log = dev->log->log;
    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);

    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (; from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
            addr += VHOST_LOG_CHUNK;
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
        log = atomic_xchg(from, 0);
            hwaddr section_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        addr += VHOST_LOG_CHUNK;
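
/*
 * Sync the dirty bitmap for the part of a section that falls into the
 * [first, last] range, covering both guest memory regions and the used
 * rings of every virtqueue.
 */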
static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
    if (!dev->log_enabled || !dev->started) {

    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr,
                              reg->guest_phys_addr,
                              range_get_last(reg->guest_phys_addr,
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
                              range_get_last(vq->used_phys, vq->used_size));

static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
/* Assign/unassign. Keep an unsorted array of non-overlapping
 * memory regions in dev->mem. */
static void vhost_dev_unassign_memory(struct vhost_dev *dev,
    int from, to, n = dev->mem->nregions;
    /* Track overlapping/split regions for sanity checking. */
    int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;

    for (from = 0, to = 0; from < n; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;

        /* clone old region */
        memcpy(reg, dev->mem->regions + from, sizeof *reg);

        /* No overlap is simple */
        if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,

        /* Split only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        memlast = range_get_last(start_addr, size);

        /* Remove whole region */
        if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
            --dev->mem->nregions;

        if (memlast >= reglast) {
            reg->memory_size = start_addr - reg->guest_phys_addr;
            assert(reg->memory_size);
            assert(!overlap_end);

        if (start_addr <= reg->guest_phys_addr) {
            change = memlast + 1 - reg->guest_phys_addr;
            reg->memory_size -= change;
            reg->guest_phys_addr += change;
            reg->userspace_addr += change;
            assert(reg->memory_size);
            assert(!overlap_start);

        /* This only happens if supplied region
         * is in the middle of an existing one. Thus it can not
         * overlap with any other existing region. */
        assert(!overlap_start);
        assert(!overlap_end);
        assert(!overlap_middle);
        /* Split region: shrink first part, shift second part. */
        memcpy(dev->mem->regions + n, reg, sizeof *reg);
        reg->memory_size = start_addr - reg->guest_phys_addr;
        assert(reg->memory_size);
        change = memlast + 1 - reg->guest_phys_addr;
        reg = dev->mem->regions + n;
        reg->memory_size -= change;
        assert(reg->memory_size);
        reg->guest_phys_addr += change;
        reg->userspace_addr += change;
        /* Never add more than 1 region */
        assert(dev->mem->nregions == n);
        ++dev->mem->nregions;
/* Called after unassign, so no regions overlap the given range. */
static void vhost_dev_assign_memory(struct vhost_dev *dev,
    struct vhost_memory_region *merged = NULL;
    for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
        struct vhost_memory_region *reg = dev->mem->regions + to;
        uint64_t prlast, urlast;
        uint64_t pmlast, umlast;

        /* clone old region */
        memcpy(reg, dev->mem->regions + from, sizeof *reg);
        prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
        pmlast = range_get_last(start_addr, size);
        urlast = range_get_last(reg->userspace_addr, reg->memory_size);
        umlast = range_get_last(uaddr, size);

        /* check for overlapping regions: should never happen. */
        assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
        /* Not an adjacent or overlapping region - do not merge. */
        if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
            (pmlast + 1 != reg->guest_phys_addr ||
             umlast + 1 != reg->userspace_addr)) {

        if (dev->vhost_ops->vhost_backend_can_merge &&
            !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,

        u = MIN(uaddr, reg->userspace_addr);
        s = MIN(start_addr, reg->guest_phys_addr);
        e = MAX(pmlast, prlast);
        uaddr = merged->userspace_addr = u;
        start_addr = merged->guest_phys_addr = s;
        size = merged->memory_size = e - s + 1;
        assert(merged->memory_size);

        struct vhost_memory_region *reg = dev->mem->regions + to;
        memset(reg, 0, sizeof *reg);
        reg->memory_size = size;
        assert(reg->memory_size);
        reg->guest_phys_addr = start_addr;
        reg->userspace_addr = uaddr;

    assert(to <= dev->mem->nregions + 1);
    dev->mem->nregions = to;
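
/*
 * Compute how many dirty-log chunks are needed to cover every guest
 * memory region and every virtqueue used ring.
 */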
static uint64_t vhost_get_log_size(struct vhost_dev *dev)
    uint64_t log_size = 0;
    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;
        uint64_t last = vq->used_phys + vq->used_size - 1;
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
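
/*
 * Allocate a dirty log of the requested size. When the backend needs a
 * shared log (e.g. vhost-user), back it with a sealed memfd so the fd can
 * be handed to the backend process; otherwise plain heap memory is enough.
 */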
static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));

    log = g_new0(struct vhost_log, 1);
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
            error_report_err(err);
        memset(log->log, 0, logsize);
        log->log = g_malloc0(logsize);

static struct vhost_log *vhost_log_get(uint64_t size, bool share)
    struct vhost_log *log = share ? vhost_log_shm : vhost_log;

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);

static void vhost_log_put(struct vhost_dev *dev, bool sync)
    struct vhost_log *log = dev->log;

    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        if (vhost_log == log) {
        } else if (vhost_log_shm == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
            vhost_log_shm = NULL;

static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
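
/*
 * Switch the device to a dirty log of the new size, telling the backend
 * about the new log before the old one is released.
 */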
static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;

    /* Inform the backend of the log switch; this must be done before
       releasing the current log, to ensure no logging is lost. */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
        VHOST_OPS_DEBUG("vhost_set_log_base failed");
    vhost_log_put(dev, true);
    dev->log_size = size;

static int vhost_dev_has_iommu(struct vhost_dev *dev)
    VirtIODevice *vdev = dev->vdev;
    return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
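
/*
 * With a vIOMMU in front of the device, ring addresses handed to vhost
 * are IOVAs that the backend translates through its own device IOTLB,
 * so they are passed through unmapped; otherwise they are mapped as
 * guest physical addresses.
 */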
static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                              hwaddr *plen, int is_write)
    if (!vhost_dev_has_iommu(dev)) {
        return cpu_physical_memory_map(addr, plen, is_write);
        return (void *)(uintptr_t)addr;

static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
                               hwaddr len, int is_write,
    if (!vhost_dev_has_iommu(dev)) {
        cpu_physical_memory_unmap(buffer, len, is_write, access_len);

static int vhost_verify_ring_part_mapping(struct vhost_dev *dev,
    if (!ranges_overlap(start_addr, size, part_addr, part_size)) {
    p = vhost_memory_map(dev, part_addr, &l, 1);
    if (!p || l != part_size) {
    vhost_memory_unmap(dev, p, l, 0, 0);

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
    const char *part_name[] = {
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        r = vhost_verify_ring_part_mapping(dev, vq->desc, vq->desc_phys,
                                           vq->desc_size, start_addr, size);
        r = vhost_verify_ring_part_mapping(dev, vq->avail, vq->avail_phys,
                                           vq->avail_size, start_addr, size);
        r = vhost_verify_ring_part_mapping(dev, vq->used, vq->used_phys,
                                           vq->used_size, start_addr, size);
            error_report("Unable to map %s for ring %d", part_name[j], i);
        } else if (r == -EBUSY) {
            error_report("%s relocated for ring %d", part_name[j], i);

static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
    int i, n = dev->mem->nregions;
    for (i = 0; i < n; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,

static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
    struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);

    reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
    memlast = range_get_last(start_addr, size);

    /* Need to extend region? */
    if (start_addr < reg->guest_phys_addr || memlast > reglast) {
    /* userspace_addr changed? */
    return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
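
/*
 * Add or remove a memory section from the table handed to the vhost
 * backend, merging adjacent regions where possible and recording the
 * changed range so that vhost_commit() can push a single update.
 */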
static void vhost_set_memory(MemoryListener *listener,
                             MemoryRegionSection *section,
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
    hwaddr start_addr = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
        memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
    int s = offsetof(struct vhost_memory, regions) +
        (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];

    dev->mem = g_realloc(dev->mem, s);

    /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
            /* Region exists with same address. Nothing to do. */
        if (!vhost_dev_find_reg(dev, start_addr, size)) {
            /* Removing region that we don't access. Nothing to do. */

        vhost_dev_unassign_memory(dev, start_addr, size);
        /* Add given mapping, merging adjacent regions if any */
        vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
        /* Remove old mapping for this memory, if any. */
        vhost_dev_unassign_memory(dev, start_addr, size);
    dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
    dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
    dev->memory_changed = true;
    used_memslots = dev->mem->nregions;

static bool vhost_section(MemoryRegionSection *section)
    return memory_region_is_ram(section->mr) &&
        !memory_region_is_rom(section->mr);

static void vhost_begin(MemoryListener *listener)
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
    dev->mem_changed_end_addr = 0;
    dev->mem_changed_start_addr = -1;
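
/*
 * Flush the accumulated memory-map changes to the backend: verify the
 * rings are still mapped, grow the dirty log before the table update
 * when more space is needed, and shrink it only afterwards so that no
 * dirty pages can be missed while the new table takes effect.
 */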
static void vhost_commit(MemoryListener *listener)
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
    hwaddr start_addr = 0;

    if (!dev->memory_changed) {
    if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {

    start_addr = dev->mem_changed_start_addr;
    size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;

    r = vhost_verify_ring_mappings(dev, start_addr, size);

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
            VHOST_OPS_DEBUG("vhost_set_mem_table failed");
        dev->memory_changed = false;

    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    dev->memory_changed = false;
static void vhost_region_add(MemoryListener *listener,
                             MemoryRegionSection *section)
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,

    if (!vhost_section(section)) {

    ++dev->n_mem_sections;
    dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
                                dev->n_mem_sections);
    dev->mem_sections[dev->n_mem_sections - 1] = *section;
    memory_region_ref(section->mr);
    vhost_set_memory(listener, section, true);

static void vhost_region_del(MemoryListener *listener,
                             MemoryRegionSection *section)
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,

    if (!vhost_section(section)) {

    vhost_set_memory(listener, section, false);
    memory_region_unref(section->mr);
    for (i = 0; i < dev->n_mem_sections; ++i) {
        if (dev->mem_sections[i].offset_within_address_space
            == section->offset_within_address_space) {
            --dev->n_mem_sections;
            memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
                    (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));

static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
    struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
    struct vhost_dev *hdev = iommu->hdev;
    hwaddr iova = iotlb->iova + iommu->iommu_offset;

    if (vhost_backend_invalidate_device_iotlb(hdev, iova,
                                              iotlb->addr_mask + 1)) {
        error_report("Failed to invalidate device iotlb");
static void vhost_iommu_region_add(MemoryListener *listener,
                                   MemoryRegionSection *section)
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
    struct vhost_iommu *iommu;

    if (!memory_region_is_iommu(section->mr)) {

    iommu = g_malloc0(sizeof(*iommu));
    end = int128_add(int128_make64(section->offset_within_region),
    end = int128_sub(end, int128_one());
    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
                        IOMMU_NOTIFIER_UNMAP,
                        section->offset_within_region,
    iommu->mr = section->mr;
    iommu->iommu_offset = section->offset_within_address_space -
                          section->offset_within_region;
    memory_region_register_iommu_notifier(section->mr, &iommu->n);
    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
    /* TODO: can replay help performance here? */

static void vhost_iommu_region_del(MemoryListener *listener,
                                   MemoryRegionSection *section)
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
    struct vhost_iommu *iommu;

    if (!memory_region_is_iommu(section->mr)) {

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        if (iommu->mr == section->mr &&
            iommu->n.start == section->offset_within_region) {
            memory_region_unregister_iommu_notifier(iommu->mr,
            QLIST_REMOVE(iommu, iommu_next);

static void vhost_region_nop(MemoryListener *listener,
                             MemoryRegionSection *section)
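
/*
 * Tell the backend where the descriptor, avail and used rings of one
 * virtqueue live in QEMU's address space, and whether writes to the
 * used ring must be recorded in the dirty log.
 */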
static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
    struct vhost_vring_addr addr = {
        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
        .used_user_addr = (uint64_t)(unsigned long)vq->used,
        .log_guest_addr = vq->used_phys,
        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
    int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
        VHOST_OPS_DEBUG("vhost_set_vring_addr failed");

static int vhost_dev_set_features(struct vhost_dev *dev,
    uint64_t features = dev->acked_features;
        features |= 0x1ULL << VHOST_F_LOG_ALL;
    r = dev->vhost_ops->vhost_set_features(dev, features);
        VHOST_OPS_DEBUG("vhost_set_features failed");
    return r < 0 ? -errno : 0;

static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
    r = vhost_dev_set_features(dev, enable_log);
    for (i = 0; i < dev->nvqs; ++i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
    for (; i >= 0; --i) {
        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
        vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
    vhost_dev_set_features(dev, dev->log_enabled);
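
/*
 * Called when migration starts or stops: toggle VHOST_F_LOG_ALL in the
 * backend and attach or release the dirty log so that pages written by
 * the backend are reported to QEMU's migration bitmap.
 */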
static int vhost_migration_log(MemoryListener *listener, int enable)
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
    if (!!enable == dev->log_enabled) {
        dev->log_enabled = enable;
        r = vhost_dev_set_log(dev, false);
        vhost_log_put(dev, false);
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
    dev->log_enabled = enable;

static void vhost_log_global_start(MemoryListener *listener)
    r = vhost_migration_log(listener, true);

static void vhost_log_global_stop(MemoryListener *listener)
    r = vhost_migration_log(listener, false);

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
    /* FIXME: implement */

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
    /* FIXME: implement */

/* The vhost driver natively knows how to handle the vrings of non
 * cross-endian legacy devices and modern devices. Only legacy devices
 * exposed to a bi-endian guest may require the vhost driver to use a
 * specific endianness.
 */
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
#ifdef HOST_WORDS_BIGENDIAN
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
    VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
    if (errno == ENOTTY) {
        error_report("vhost does not support cross-endian");

static int vhost_memory_region_lookup(struct vhost_dev *hdev,
                                      uint64_t gpa, uint64_t *uaddr,
    for (i = 0; i < hdev->mem->nregions; i++) {
        struct vhost_memory_region *reg = hdev->mem->regions + i;

        if (gpa >= reg->guest_phys_addr &&
            reg->guest_phys_addr + reg->memory_size > gpa) {
            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
            *len = reg->guest_phys_addr + reg->memory_size - gpa;
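
/*
 * Resolve an IOTLB miss reported by the backend: translate the IOVA
 * through the virtio device's DMA address space, find the matching HVA
 * in the vhost memory table, and push the resulting entry back to the
 * backend's device IOTLB.
 */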
int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
    if (iotlb.target_as != NULL) {
        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
            error_report("Failed to look up the translated address "
                         "%"PRIx64, iotlb.translated_addr);
        len = MIN(iotlb.addr_mask + 1, len);
        iova = iova & ~iotlb.addr_mask;
        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
            error_report("Failed to update device iotlb");
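
/*
 * Bring one virtqueue up in the backend: program its size and last-avail
 * index, map and register the descriptor, avail and used rings, and wire
 * up the kick eventfd so the backend sees guest notifications directly.
 */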
static int vhost_virtqueue_start(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
        VHOST_OPS_DEBUG("vhost_set_vring_num failed");

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
        VHOST_OPS_DEBUG("vhost_set_vring_base failed");

    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),

    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a = virtio_queue_get_desc_addr(vdev, idx);
    vq->desc = vhost_memory_map(dev, a, &l, 0);
    if (!vq->desc || l != s) {
        goto fail_alloc_desc;
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = vhost_memory_map(dev, a, &l, 0);
    if (!vq->avail || l != s) {
        goto fail_alloc_avail;
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = vhost_memory_map(dev, a, &l, 1);
    if (!vq->used || l != s) {
        goto fail_alloc_used;

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
        VHOST_OPS_DEBUG("vhost_set_vring_kick failed");

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    /* Init vring in unmasked state, unless guest_notifier_mask
     * will do it later.
     */
    if (!vdev->use_guest_notifier_mask) {
        /* TODO: check and handle errors. */
        vhost_virtqueue_mask(dev, vdev, idx, false);

    if (k->query_guest_notifiers &&
        k->query_guest_notifiers(qbus->parent) &&
        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);

    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,

    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
        VHOST_OPS_DEBUG("vhost VQ %d ring restore failed: %d", idx, r);
        /* Connection to the backend is broken, so let's sync the internal
         * last avail idx to the device used idx.
         */
        virtio_queue_restore_last_avail_idx(vdev, idx);
        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    virtio_queue_invalidate_signalled_used(vdev, idx);
    virtio_queue_update_used_idx(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native, as legacy devices expect it by default.
     */
    if (vhost_needs_vring_endian(vdev)) {
        vhost_virtqueue_set_vring_endian_legacy(dev,
                                                !virtio_is_big_endian(vdev),

    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       1, virtio_queue_get_used_size(vdev, idx));
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, virtio_queue_get_avail_size(vdev, idx));
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, virtio_queue_get_desc_size(vdev, idx));
static void vhost_eventfd_add(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)

static void vhost_eventfd_del(MemoryListener *listener,
                              MemoryRegionSection *section,
                              bool match_data, uint64_t data, EventNotifier *e)

static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
                                                int n, uint32_t timeout)
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,

    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {

    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
        VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");

static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    int r = event_notifier_init(&vq->masked_notifier, 0);

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
    event_notifier_cleanup(&vq->masked_notifier);

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
    event_notifier_cleanup(&vq->masked_notifier);
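
/*
 * Initialize a vhost device: attach the backend, negotiate features,
 * initialize every virtqueue, register the memory listeners and, when
 * the backend cannot log dirty pages, install a migration blocker.
 */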
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout)
    int i, r, n_initialized_vqs = 0;
    Error *local_err = NULL;

    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);

    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque);

    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        error_report("vhost backend memory slots limit is less"
                     " than current number of present memory slots");

    r = hdev->vhost_ops->vhost_set_owner(hdev);
        VHOST_OPS_DEBUG("vhost_set_owner failed");

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
        VHOST_OPS_DEBUG("vhost_get_features failed");

    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);

    if (busyloop_timeout) {
        for (i = 0; i < hdev->nvqs; ++i) {
            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,

    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_add,
        .region_del = vhost_region_del,
        .region_nop = vhost_region_nop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .eventfd_add = vhost_eventfd_add,
        .eventfd_del = vhost_eventfd_del,

    hdev->iommu_listener = (MemoryListener) {
        .region_add = vhost_iommu_region_add,
        .region_del = vhost_iommu_region_del,

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");

    if (hdev->migration_blocker != NULL) {
        r = migrate_add_blocker(hdev->migration_blocker, &local_err);
            error_report_err(local_err);
            error_free(hdev->migration_blocker);

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log_enabled = false;
    hdev->started = false;
    hdev->memory_changed = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

        vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
    hdev->nvqs = n_initialized_vqs;
    vhost_dev_cleanup(hdev);
void vhost_dev_cleanup(struct vhost_dev *hdev)
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        QLIST_REMOVE(hdev, entry);
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);

    memset(hdev, 0, sizeof(struct vhost_dev));

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));

    /* We will pass the notifiers to the kernel, make sure that QEMU
     * doesn't interfere.
     */
    r = virtio_device_grab_ioeventfd(vdev);
        error_report("binding does not support host notifiers");

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
        e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
            error_report("vhost VQ %d notifier cleanup error: %d", i, -r);
    virtio_device_release_ioeventfd(vdev);

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely set up when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
    virtio_device_release_ioeventfd(vdev);
/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");

uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;

int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
                         uint32_t config_len)
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_get_config) {
        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len);

int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
                         uint32_t offset, uint32_t size, uint32_t flags)
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_set_config) {
        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,

void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
    assert(hdev->vhost_ops);
    hdev->config_ops = ops;
/* Host notifiers must be enabled at this point. */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    hdev->started = true;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
        VHOST_OPS_DEBUG("vhost_set_mem_table failed");

    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  hdev->vq_index + i);

    if (hdev->log_enabled) {
        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
            VHOST_OPS_DEBUG("vhost_set_log_base failed");

    if (vhost_dev_has_iommu(hdev)) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /* Update used ring information for the IOTLB to work correctly;
         * the vhost-kernel code requires this. */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            vhost_device_iotlb_miss(hdev, vq->used_phys, true);

    vhost_log_put(hdev, false);
        vhost_virtqueue_stop(hdev,
                             hdev->vq_index + i);
    hdev->started = false;
/* Host notifiers must be enabled at this point. */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             hdev->vq_index + i);

    if (vhost_dev_has_iommu(hdev)) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        memory_listener_unregister(&hdev->iommu_listener);
    vhost_log_put(hdev, true);
    hdev->started = false;

int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);