/* Copyright 2012 Red Hat, Inc.
 * Copyright IBM, Corp. 2012
 *
 * Based on Linux 2.6.39 vhost code:
 * Copyright (C) 2009 Red Hat, Inc.
 * Copyright (C) 2006 Rusty Russell IBM Corporation
 *
 * Author: Michael S. Tsirkin <[email protected]>
 *         Stefan Hajnoczi <[email protected]>
 *
 * Inspiration, some code, and most witty comments come from
 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include "trace.h"
#include "hw/hw.h"
#include "exec/memory.h"
#include "exec/address-spaces.h"
#include "hw/virtio/virtio-access.h"
#include "hw/virtio/dataplane/vring.h"
#include "hw/virtio/dataplane/vring-accessors.h"
#include "qemu/error-report.h"

/* vring_map can be coupled with vring_unmap or (if you still have the
 * value returned in *mr) memory_region_unref.
 */
static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len,
                       bool is_write)
{
    MemoryRegionSection section = memory_region_find(get_system_memory(),
                                                     phys, len);

    if (!section.mr || int128_get64(section.size) < len) {
        goto out;
    }
    if (is_write && section.readonly) {
        goto out;
    }
    if (!memory_region_is_ram(section.mr)) {
        goto out;
    }

    /* Ignore regions with dirty logging, we cannot mark them dirty */
    if (memory_region_get_dirty_log_mask(section.mr)) {
        goto out;
    }

    *mr = section.mr;
    return memory_region_get_ram_ptr(section.mr) + section.offset_within_region;

out:
    memory_region_unref(section.mr);
    *mr = NULL;
    return NULL;
}

/* Drop the reference taken by memory_region_find() in vring_map(). The
 * MemoryRegion is recovered from the host pointer via
 * qemu_ram_addr_from_host(); is_write is currently unused. */
static void vring_unmap(void *buffer, bool is_write)
{
    ram_addr_t addr;
    MemoryRegion *mr;

    mr = qemu_ram_addr_from_host(buffer, &addr);
    memory_region_unref(mr);
}

/* Map the guest's vring to host memory */
bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
{
    struct vring *vr = &vring->vr;
    hwaddr addr;
    hwaddr size;
    void *ptr;

    vring->broken = false;
    vr->num = virtio_queue_get_num(vdev, n);

    addr = virtio_queue_get_desc_addr(vdev, n);
    size = virtio_queue_get_desc_size(vdev, n);
    /* Map the descriptor area as read-only */
    ptr = vring_map(&vring->mr_desc, addr, size, false);
    if (!ptr) {
        error_report("Failed to map 0x%" HWADDR_PRIx " bytes for vring desc "
                     "at 0x%" HWADDR_PRIx,
                     size, addr);
        goto out_err_desc;
    }
    vr->desc = ptr;

    addr = virtio_queue_get_avail_addr(vdev, n);
    size = virtio_queue_get_avail_size(vdev, n);
    /* Add the size of the used_event_idx that trails the avail ring */
    size += sizeof(uint16_t);
    /* Map the driver (avail) area as read-only */
    ptr = vring_map(&vring->mr_avail, addr, size, false);
    if (!ptr) {
        error_report("Failed to map 0x%" HWADDR_PRIx " bytes for vring avail "
                     "at 0x%" HWADDR_PRIx,
                     size, addr);
        goto out_err_avail;
    }
    vr->avail = ptr;

    addr = virtio_queue_get_used_addr(vdev, n);
    size = virtio_queue_get_used_size(vdev, n);
    /* Add the size of the avail_event_idx that trails the used ring */
    size += sizeof(uint16_t);
    /* Map the device (used) area as read-write */
    ptr = vring_map(&vring->mr_used, addr, size, true);
    if (!ptr) {
        error_report("Failed to map 0x%" HWADDR_PRIx " bytes for vring used "
                     "at 0x%" HWADDR_PRIx,
                     size, addr);
        goto out_err_used;
    }
    vr->used = ptr;

    vring->last_avail_idx = virtio_queue_get_last_avail_idx(vdev, n);
    vring->last_used_idx = vring_get_used_idx(vdev, vring);
    vring->signalled_used = 0;
    vring->signalled_used_valid = false;

    trace_vring_setup(virtio_queue_get_ring_addr(vdev, n),
                      vring->vr.desc, vring->vr.avail, vring->vr.used);
    return true;

out_err_used:
    memory_region_unref(vring->mr_avail);
out_err_avail:
    memory_region_unref(vring->mr_desc);
out_err_desc:
    vring->broken = true;
    return false;
}

void vring_teardown(Vring *vring, VirtIODevice *vdev, int n)
{
    /* Hand the ring state back to the core virtio code so the device can
     * be restarted or migrated from where dataplane left off. */
    virtio_queue_set_last_avail_idx(vdev, n, vring->last_avail_idx);
    virtio_queue_invalidate_signalled_used(vdev, n);

    memory_region_unref(vring->mr_desc);
    memory_region_unref(vring->mr_avail);
    memory_region_unref(vring->mr_used);
}

/* Disable guest->host notifies */
void vring_disable_notification(VirtIODevice *vdev, Vring *vring)
{
    if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY);
    }
}
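
/* With VIRTIO_RING_F_EVENT_IDX the guest bases its decision to notify on the
 * avail_event field rather than on VRING_USED_F_NO_NOTIFY, so there is
 * nothing to set here: leaving avail_event stale effectively suppresses
 * notifications until vring_enable_notification() advances it again.
 */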

/* Enable guest->host notifies
 *
 * Return true if the vring is empty, false if there are more requests.
 */
bool vring_enable_notification(VirtIODevice *vdev, Vring *vring)
{
    if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_avail_event(&vring->vr) = vring->vr.avail->idx;
    } else {
        vring_clear_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY);
    }
    smp_mb(); /* ensure update is seen before reading avail_idx */
    return !vring_more_avail(vdev, vring);
}
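
/* A typical dataplane loop re-checks for requests after re-enabling
 * notifications to close the race with the guest. A sketch, where
 * process_request() and the surrounding names are hypothetical:
 *
 *   do {
 *       vring_disable_notification(vdev, vring);
 *       while (vring_pop(vdev, vring, elem) >= 0) {
 *           process_request(elem);
 *       }
 *   } while (!vring_enable_notification(vdev, vring));
 *
 * Without the final emptiness check, a buffer made available between the
 * last vring_pop() and re-enabling notifications could be missed.
 */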

/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
{
    uint16_t old, new;
    bool v;

    /* Flush out used index updates. This is paired
     * with the barrier that the Guest executes when enabling
     * interrupts. */
    smp_mb();

    if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        unlikely(!vring_more_avail(vdev, vring))) {
        return true;
    }

    if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_get_avail_flags(vdev, vring) &
                 VRING_AVAIL_F_NO_INTERRUPT);
    }
    old = vring->signalled_used;
    v = vring->signalled_used_valid;
    new = vring->signalled_used = vring->last_used_idx;
    vring->signalled_used_valid = true;

    if (unlikely(!v)) {
        return true;
    }

    return vring_need_event(virtio_tswap16(vdev, vring_used_event(&vring->vr)),
                            new, old);
}
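
/* vring_need_event() implements the event-idx rule from the virtio spec:
 *
 *   (uint16_t)(new - event - 1) < (uint16_t)(new - old)
 *
 * i.e. notify only if the guest's used_event index was passed between the
 * previously signalled used idx (old) and the current one (new). For
 * example, with event = 5, old = 4, new = 6: (6 - 5 - 1) = 0 < (6 - 4) = 2,
 * so notify; with event = 8 the left side wraps to a huge unsigned value
 * and no interrupt is raised.
 */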

static int get_desc(Vring *vring, VirtQueueElement *elem,
                    struct vring_desc *desc)
{
    unsigned *num;
    struct iovec *iov;
    hwaddr *addr;
    MemoryRegion *mr;

    if (desc->flags & VRING_DESC_F_WRITE) {
        num = &elem->in_num;
        iov = &elem->in_sg[*num];
        addr = &elem->in_addr[*num];
    } else {
        num = &elem->out_num;
        iov = &elem->out_sg[*num];
        addr = &elem->out_addr[*num];

        /* If it's an output descriptor, they're all supposed
         * to come before any input descriptors. */
        if (unlikely(elem->in_num)) {
            error_report("Descriptor has out after in");
            return -EFAULT;
        }
    }

    /* Stop for now if there are not enough iovecs available. */
    if (*num >= VIRTQUEUE_MAX_SIZE) {
        error_report("Invalid SG num: %u", *num);
        return -EFAULT;
    }

    /* TODO handle non-contiguous memory across region boundaries */
    iov->iov_base = vring_map(&mr, desc->addr, desc->len,
                              desc->flags & VRING_DESC_F_WRITE);
    if (!iov->iov_base) {
        error_report("Failed to map descriptor addr %#" PRIx64 " len %u",
                     (uint64_t)desc->addr, desc->len);
        return -EFAULT;
    }

    /* The MemoryRegion is looked up again and unref'ed later, leave the
     * ref in place. */
    iov->iov_len = desc->len;
    *addr = desc->addr;
    *num += 1;
    return 0;
}

static void copy_in_vring_desc(VirtIODevice *vdev,
                               const struct vring_desc *guest,
                               struct vring_desc *host)
{
    host->addr = virtio_ldq_p(vdev, &guest->addr);
    host->len = virtio_ldl_p(vdev, &guest->len);
    host->flags = virtio_lduw_p(vdev, &guest->flags);
    host->next = virtio_lduw_p(vdev, &guest->next);
}
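
/* The virtio_ld*_p() accessors convert from ring byte order: for legacy
 * devices that is the guest's native endianness, for modern
 * (VIRTIO_F_VERSION_1) devices it is always little-endian. Copying into a
 * host-endian struct once up front lets the rest of the code use the
 * fields directly.
 */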

/* This is stolen from linux/drivers/vhost/vhost.c. */
static int get_indirect(VirtIODevice *vdev, Vring *vring,
                        VirtQueueElement *elem, struct vring_desc *indirect)
{
    struct vring_desc desc;
    unsigned int i = 0, count, found = 0;
    int ret;

    /* Sanity check */
    if (unlikely(indirect->len % sizeof(desc))) {
        error_report("Invalid length in indirect descriptor: "
                     "len %#x not a multiple of %#zx",
                     indirect->len, sizeof(desc));
        vring->broken = true;
        return -EFAULT;
    }

    count = indirect->len / sizeof(desc);
    /* Buffers are chained via a 16 bit next field, so
     * we can have at most 2^16 of these. */
    if (unlikely(count > USHRT_MAX + 1)) {
        error_report("Indirect buffer length too big: %d", indirect->len);
        vring->broken = true;
        return -EFAULT;
    }

    do {
        struct vring_desc *desc_ptr;
        MemoryRegion *mr;

        /* Translate indirect descriptor */
        desc_ptr = vring_map(&mr,
                             indirect->addr + found * sizeof(desc),
                             sizeof(desc), false);
        if (!desc_ptr) {
            error_report("Failed to map indirect descriptor "
                         "addr %#" PRIx64 " len %zu",
                         (uint64_t)indirect->addr + found * sizeof(desc),
                         sizeof(desc));
            vring->broken = true;
            return -EFAULT;
        }
        copy_in_vring_desc(vdev, desc_ptr, &desc);
        memory_region_unref(mr);

        /* Ensure descriptor has been loaded before accessing fields */
        barrier(); /* read_barrier_depends(); */

        if (unlikely(++found > count)) {
            error_report("Loop detected: last one at %u "
                         "indirect size %u", i, count);
            vring->broken = true;
            return -EFAULT;
        }

        if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
            error_report("Nested indirect descriptor");
            vring->broken = true;
            return -EFAULT;
        }

        ret = get_desc(vring, elem, &desc);
        if (ret < 0) {
            vring->broken |= (ret == -EFAULT);
            return ret;
        }
        i = desc.next;
    } while (desc.flags & VRING_DESC_F_NEXT);
    return 0;
}
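
/* An indirect descriptor packs a whole chain into a separate table in guest
 * memory, so a request consumes only one slot in the main descriptor table:
 *
 *   main table                indirect table (desc.addr, desc.len bytes)
 *   [ flags=INDIRECT ] ---->  [ out hdr ][ out data ][ in status ] ...
 *
 * The found > count check above bounds the walk, so a guest that builds a
 * cycle with the next fields cannot make us loop forever.
 */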

static void vring_unmap_element(VirtQueueElement *elem)
{
    int i;

    /* This assumes that the iovecs, if changed, are never moved past
     * the end of the valid area. This is true if iovec manipulations
     * are done with iov_discard_front and iov_discard_back.
     */
    for (i = 0; i < elem->out_num; i++) {
        vring_unmap(elem->out_sg[i].iov_base, false);
    }

    for (i = 0; i < elem->in_num; i++) {
        vring_unmap(elem->in_sg[i].iov_base, true);
    }
}

/* This looks in the virtqueue for the first available buffer and converts
 * it to an iovec for convenient access. Since descriptors consist of some
 * number of output then some number of input descriptors, it's actually two
 * iovecs, but we pack them into one and note how many of each there were.
 *
 * This function returns the head descriptor number on success, or a
 * negative error code on failure: -EAGAIN if no buffer is currently
 * available, -EFAULT if the vring is broken.
 *
 * Stolen from linux/drivers/vhost/vhost.c.
 */
int vring_pop(VirtIODevice *vdev, Vring *vring,
              VirtQueueElement *elem)
{
    struct vring_desc desc;
    unsigned int i, head, found = 0, num = vring->vr.num;
    uint16_t avail_idx, last_avail_idx;
    int ret;

    /* Initialize elem so it can be safely unmapped */
    elem->in_num = elem->out_num = 0;

    /* If there was a fatal error then refuse operation */
    if (vring->broken) {
        ret = -EFAULT;
        goto out;
    }

    /* Check it isn't doing very strange things with descriptor numbers. */
    last_avail_idx = vring->last_avail_idx;
    avail_idx = vring_get_avail_idx(vdev, vring);
    barrier(); /* load indices now and not again later */

    if (unlikely((uint16_t)(avail_idx - last_avail_idx) > num)) {
        error_report("Guest moved avail index from %u to %u",
                     last_avail_idx, avail_idx);
        ret = -EFAULT;
        goto out;
    }

    /* If there's nothing new since last we looked. */
    if (avail_idx == last_avail_idx) {
        ret = -EAGAIN;
        goto out;
    }

    /* Only get avail ring entries after they have been exposed by guest. */
    smp_rmb();

    /* Grab the next descriptor number they're advertising, and increment
     * the index we've seen. */
    head = vring_get_avail_ring(vdev, vring, last_avail_idx % num);

    elem->index = head;

    /* If their number is silly, that's an error. */
    if (unlikely(head >= num)) {
        error_report("Guest says index %u > %u is available", head, num);
        ret = -EFAULT;
        goto out;
    }

    i = head;
    do {
        if (unlikely(i >= num)) {
            error_report("Desc index is %u > %u, head = %u", i, num, head);
            ret = -EFAULT;
            goto out;
        }
        if (unlikely(++found > num)) {
            error_report("Loop detected: last one at %u vq size %u head %u",
                         i, num, head);
            ret = -EFAULT;
            goto out;
        }
        copy_in_vring_desc(vdev, &vring->vr.desc[i], &desc);

        /* Ensure descriptor is loaded before accessing fields */
        barrier();

        if (desc.flags & VRING_DESC_F_INDIRECT) {
            ret = get_indirect(vdev, vring, elem, &desc);
            if (ret < 0) {
                goto out;
            }
            continue;
        }

        ret = get_desc(vring, elem, &desc);
        if (ret < 0) {
            goto out;
        }

        i = desc.next;
    } while (desc.flags & VRING_DESC_F_NEXT);

    /* On success, increment avail index. */
    vring->last_avail_idx++;
    if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_avail_event(&vring->vr) =
            virtio_tswap16(vdev, vring->last_avail_idx);
    }

    return head;

out:
    assert(ret < 0);
    if (ret == -EFAULT) {
        vring->broken = true;
    }
    vring_unmap_element(elem);
    return ret;
}

/* After we've used one of their buffers, we tell them about it.
 *
 * Stolen from linux/drivers/vhost/vhost.c.
 */
void vring_push(VirtIODevice *vdev, Vring *vring, VirtQueueElement *elem,
                int len)
{
    unsigned int head = elem->index;
    uint16_t new;

    vring_unmap_element(elem);

    /* Don't touch vring if a fatal error occurred */
    if (vring->broken) {
        return;
    }

    /* The virtqueue contains a ring of used buffers. Get a pointer to the
     * next entry in that used ring. */
    vring_set_used_ring_id(vdev, vring, vring->last_used_idx % vring->vr.num,
                           head);
    vring_set_used_ring_len(vdev, vring, vring->last_used_idx % vring->vr.num,
                            len);

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    new = ++vring->last_used_idx;
    vring_set_used_idx(vdev, vring, new);

    /* If the used index has wrapped past signalled_used since the last
     * notification, the cached value is stale; invalidate it so that
     * vring_should_notify() forces an interrupt. */
    if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) {
        vring->signalled_used_valid = false;
    }
}