/*
 * Dedicated thread for virtio-blk I/O processing
 *
 * Copyright 2012 IBM, Corp.
 * Copyright 2012 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Stefan Hajnoczi <[email protected]>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "trace.h"
#include "qemu/iov.h"
#include "event-poll.h"
#include "qemu/thread.h"
#include "qemu/error-report.h"
#include "vring.h"
#include "ioq.h"
#include "migration/migration.h"
#include "block/block.h"
#include "hw/virtio-blk.h"
#include "hw/dataplane/virtio-blk.h"

enum {
    SEG_MAX = 126,                  /* maximum number of I/O segments */
    VRING_MAX = SEG_MAX + 2,        /* maximum number of vring descriptors */
    REQ_MAX = VRING_MAX,            /* maximum number of requests in the vring,
                                     * is VRING_MAX / 2 with traditional and
                                     * VRING_MAX with indirect descriptors */
};

typedef struct {
    struct iocb iocb;               /* Linux AIO control block */
    QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
    unsigned int head;              /* vring descriptor index */
    struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
    QEMUIOVector *read_qiov;        /* for read completion w/ bounce buffer */
} VirtIOBlockRequest;

struct VirtIOBlockDataPlane {
    bool started;
    bool stopping;
    QEMUBH *start_bh;
    QemuThread thread;

    VirtIOBlkConf *blk;
    int fd;                         /* image file descriptor */

    VirtIODevice *vdev;
    Vring vring;                    /* virtqueue vring */
    EventNotifier *guest_notifier;  /* irq */

    EventPoll event_poll;           /* event poller */
    EventHandler io_handler;        /* Linux AIO completion handler */
    EventHandler notify_handler;    /* virtqueue notify handler */

    IOQueue ioqueue;                /* Linux AIO queue (should really be per
                                       dataplane thread) */
    VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
                                             queue */

    unsigned int num_reqs;

    Error *migration_blocker;
};

/* Raise an interrupt to signal guest, if necessary */
static void notify_guest(VirtIOBlockDataPlane *s)
{
    if (!vring_should_notify(s->vdev, &s->vring)) {
        return;
    }

    event_notifier_set(s->guest_notifier);
}

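/* Complete one Linux AIO request: translate the AIO return value into a
 * virtio_blk_inhdr status, copy the status into the guest-visible inhdr
 * iovecs, and push the used descriptor onto the vring.  Runs as the
 * ioq_run_completion() callback.
 */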
static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    struct virtio_blk_inhdr hdr;
    int len;

    if (likely(ret >= 0)) {
        hdr.status = VIRTIO_BLK_S_OK;
        len = ret;
    } else {
        hdr.status = VIRTIO_BLK_S_IOERR;
        len = 0;
    }

    trace_virtio_blk_data_plane_complete_request(s, req->head, ret);

    if (req->read_qiov) {
        assert(req->bounce_iov);
        qemu_iovec_from_buf(req->read_qiov, 0, req->bounce_iov->iov_base, len);
        qemu_iovec_destroy(req->read_qiov);
        g_slice_free(QEMUIOVector, req->read_qiov);
    }

    if (req->bounce_iov) {
        qemu_vfree(req->bounce_iov->iov_base);
        g_slice_free(struct iovec, req->bounce_iov);
    }

    qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(req->inhdr);
    g_slice_free(QEMUIOVector, req->inhdr);

    /* According to the virtio specification len should be the number of bytes
     * written to, but for virtio-blk it seems to be the number of bytes
     * transferred plus the status bytes.
     */
    vring_push(&s->vring, req->head, len + sizeof(hdr));

    s->num_reqs--;
}

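/* Complete a request without performing any disk I/O, e.g. for unsupported
 * commands or flush failures.  Only the status byte is written back.
 */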
static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
                                   QEMUIOVector *inhdr, unsigned char status)
{
    struct virtio_blk_inhdr hdr = {
        .status = status,
    };

    qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(inhdr);
    g_slice_free(QEMUIOVector, inhdr);

    vring_push(&s->vring, head, sizeof(hdr));
    notify_guest(s);
}

/* Get disk serial number */
static void do_get_id_cmd(VirtIOBlockDataPlane *s,
                          struct iovec *iov, unsigned int iov_cnt,
                          unsigned int head, QEMUIOVector *inhdr)
{
    char id[VIRTIO_BLK_ID_BYTES];

    /* Serial number not NUL-terminated if it fills the whole buffer */
    strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
    iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
    complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
}

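/* Queue a read or write request.  Linux AIO on an O_DIRECT file descriptor
 * (cache=none is required, see virtio_blk_data_plane_create()) needs aligned
 * buffers, so unaligned guest buffers are redirected through a bounce buffer;
 * for reads the original iovecs are remembered so data can be copied back on
 * completion.
 */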
static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
                       struct iovec *iov, unsigned int iov_cnt,
                       long long offset, unsigned int head,
                       QEMUIOVector *inhdr)
{
    struct iocb *iocb;
    QEMUIOVector qiov;
    struct iovec *bounce_iov = NULL;
    QEMUIOVector *read_qiov = NULL;

    qemu_iovec_init_external(&qiov, iov, iov_cnt);
    if (!bdrv_qiov_is_aligned(s->blk->conf.bs, &qiov)) {
        void *bounce_buffer = qemu_blockalign(s->blk->conf.bs, qiov.size);

        if (read) {
            /* Need to copy back from bounce buffer on completion */
            read_qiov = g_slice_new(QEMUIOVector);
            qemu_iovec_init(read_qiov, iov_cnt);
            qemu_iovec_concat_iov(read_qiov, iov, iov_cnt, 0, qiov.size);
        } else {
            qemu_iovec_to_buf(&qiov, 0, bounce_buffer, qiov.size);
        }

        /* Redirect I/O to aligned bounce buffer */
        bounce_iov = g_slice_new(struct iovec);
        bounce_iov->iov_base = bounce_buffer;
        bounce_iov->iov_len = qiov.size;
        iov = bounce_iov;
        iov_cnt = 1;
    }

    iocb = ioq_rdwr(&s->ioqueue, read, iov, iov_cnt, offset);

    /* Fill in virtio block metadata needed for completion */
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    req->head = head;
    req->inhdr = inhdr;
    req->bounce_iov = bounce_iov;
    req->read_qiov = read_qiov;
    return 0;
}

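/* Parse a request from its vring descriptors: copy out the virtio_blk_outhdr,
 * set aside the trailing virtio_blk_inhdr status byte for completion, and
 * dispatch on the request type.  Returns 0 on success, -EFAULT if the
 * descriptor layout is malformed.
 */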
static int process_request(IOQueue *ioq, struct iovec iov[],
                           unsigned int out_num, unsigned int in_num,
                           unsigned int head)
{
    VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
    struct iovec *in_iov = &iov[out_num];
    struct virtio_blk_outhdr outhdr;
    QEMUIOVector *inhdr;
    size_t in_size;

    /* Copy in outhdr */
    if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
                            sizeof(outhdr)) != sizeof(outhdr))) {
        error_report("virtio-blk request outhdr too short");
        return -EFAULT;
    }
    iov_discard_front(&iov, &out_num, sizeof(outhdr));

    /* Grab inhdr for later */
    in_size = iov_size(in_iov, in_num);
    if (in_size < sizeof(struct virtio_blk_inhdr)) {
        error_report("virtio-blk request inhdr too short");
        return -EFAULT;
    }
    inhdr = g_slice_new(QEMUIOVector);
    qemu_iovec_init(inhdr, 1);
    qemu_iovec_concat_iov(inhdr, in_iov, in_num,
                          in_size - sizeof(struct virtio_blk_inhdr),
                          sizeof(struct virtio_blk_inhdr));
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    /* TODO Linux sets the barrier bit even when not advertised! */
    outhdr.type &= ~VIRTIO_BLK_T_BARRIER;

    switch (outhdr.type) {
    case VIRTIO_BLK_T_IN:
        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
        return 0;

    case VIRTIO_BLK_T_OUT:
        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
        return 0;

    case VIRTIO_BLK_T_SCSI_CMD:
        /* TODO support SCSI commands */
        complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
        return 0;

    case VIRTIO_BLK_T_FLUSH:
        /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
        if (qemu_fdatasync(s->fd) < 0) {
            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
        } else {
            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
        }
        return 0;

    case VIRTIO_BLK_T_GET_ID:
        do_get_id_cmd(s, in_iov, in_num, head, inhdr);
        return 0;

    default:
        error_report("virtio-blk unsupported request type %#x", outhdr.type);
        qemu_iovec_destroy(inhdr);
        g_slice_free(QEMUIOVector, inhdr);
        return -EFAULT;
    }
}

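/* Virtqueue notify handler, invoked when the guest kicks the queue.  Drains
 * the vring, queues the resulting Linux AIO requests, and submits them in a
 * single batch at the end.
 */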
static void handle_notify(EventHandler *handler)
{
    VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
                                           notify_handler);

    /* There is one array of iovecs into which all new requests are extracted
     * from the vring.  Requests are read from the vring and the translated
     * descriptors are written to the iovecs array.  The iovecs do not have to
     * persist across handle_notify() calls because the kernel copies the
     * iovecs on io_submit().
     *
     * Handling io_submit() EAGAIN may require storing the requests across
     * handle_notify() calls until the kernel has sufficient resources to
     * accept more I/O.  This is not implemented yet.
     */
    struct iovec iovec[VRING_MAX];
    struct iovec *end = &iovec[VRING_MAX];
    struct iovec *iov = iovec;

    /* When a request is read from the vring, the index of the first descriptor
     * (aka head) is returned so that the completed request can be pushed onto
     * the vring later.
     *
     * The number of hypervisor read-only iovecs is out_num.  The number of
     * hypervisor write-only iovecs is in_num.
     */
    int head;
    unsigned int out_num = 0, in_num = 0;
    unsigned int num_queued;

    for (;;) {
        /* Disable guest->host notifies to avoid unnecessary vmexits */
        vring_disable_notification(s->vdev, &s->vring);

        for (;;) {
            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
            if (head < 0) {
                break; /* no more requests */
            }

            trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
                                                        head);

            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
                vring_set_broken(&s->vring);
                break;
            }
            iov += out_num + in_num;
        }

        if (likely(head == -EAGAIN)) { /* vring emptied */
            /* Re-enable guest->host notifies and stop processing the vring.
             * But if the guest has snuck in more descriptors, keep processing.
             */
            if (vring_enable_notification(s->vdev, &s->vring)) {
                break;
            }
        } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
            /* Since there are no iovecs[] left, stop processing for now.  Do
             * not re-enable guest->host notifies since the I/O completion
             * handler knows to check for more vring descriptors anyway.
             */
            break;
        }
    }

    num_queued = ioq_num_queued(&s->ioqueue);
    if (num_queued > 0) {
        s->num_reqs += num_queued;

        int rc = ioq_submit(&s->ioqueue);
        if (unlikely(rc < 0)) {
            fprintf(stderr, "ioq_submit failed %d\n", rc);
            exit(1);
        }
    }
}

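/* Linux AIO completion handler.  Reaps finished requests, raises the guest
 * interrupt if needed, and resumes vring processing in case descriptors were
 * left unprocessed when iovecs ran out.
 */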
static void handle_io(EventHandler *handler)
{
    VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
                                           io_handler);

    if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
        notify_guest(s);
    }

    /* If there were more requests than iovecs, the vring will not be empty yet
     * so check again.  There should now be enough resources to process more
     * requests.
     */
    if (unlikely(vring_more_avail(&s->vring))) {
        handle_notify(&s->notify_handler);
    }
}

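/* Event loop of the dataplane thread.  Keeps polling until a stop has been
 * requested and all in-flight Linux AIO requests have completed.
 */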
static void *data_plane_thread(void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;

    do {
        event_poll(&s->event_poll);
    } while (!s->stopping || s->num_reqs > 0);
    return NULL;
}

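/* Bottom half that spawns the dataplane thread; see the comment in
 * virtio_blk_data_plane_start() for why creation is deferred to a BH.
 */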
static void start_data_plane_bh(void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;

    qemu_bh_delete(s->start_bh);
    s->start_bh = NULL;
    qemu_thread_create(&s->thread, data_plane_thread,
                       s, QEMU_THREAD_JOINABLE);
}

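/* Validate the configuration and allocate dataplane state.  Returns false if
 * the device or drive is incompatible with x-data-plane (scsi=on,
 * config-wce=on, or an image that cannot provide a raw Linux AIO file
 * descriptor).
 */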
bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
                                  VirtIOBlockDataPlane **dataplane)
{
    VirtIOBlockDataPlane *s;
    int fd;

    *dataplane = NULL;

    if (!blk->data_plane) {
        return true;
    }

    if (blk->scsi) {
        error_report("device is incompatible with x-data-plane, use scsi=off");
        return false;
    }

    if (blk->config_wce) {
        error_report("device is incompatible with x-data-plane, "
                     "use config-wce=off");
        return false;
    }

    fd = raw_get_aio_fd(blk->conf.bs);
    if (fd < 0) {
        error_report("drive is incompatible with x-data-plane, "
                     "use format=raw,cache=none,aio=native");
        return false;
    }

    s = g_new0(VirtIOBlockDataPlane, 1);
    s->vdev = vdev;
    s->fd = fd;
    s->blk = blk;

    /* Prevent block operations that conflict with data plane thread */
    bdrv_set_in_use(blk->conf.bs, 1);

    error_setg(&s->migration_blocker,
               "x-data-plane does not support migration");
    migrate_add_blocker(s->migration_blocker);

    *dataplane = s;
    return true;
}

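/* Stop the dataplane if necessary and release its resources; a NULL pointer
 * is accepted and ignored.
 */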
void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
{
    if (!s) {
        return;
    }

    virtio_blk_data_plane_stop(s);
    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
    bdrv_set_in_use(s->blk->conf.bs, 0);
    g_free(s);
}

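/* Map the vring, set up guest/host notifiers and the Linux AIO queue, then
 * schedule creation of the dataplane thread from a bottom half.
 */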
void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
{
    VirtQueue *vq;
    int i;

    if (s->started) {
        return;
    }

    vq = virtio_get_queue(s->vdev, 0);
    if (!vring_setup(&s->vring, s->vdev, 0)) {
        return;
    }

    event_poll_init(&s->event_poll);

    /* Set up guest notifier (irq) */
    if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1,
                                              true) != 0) {
        fprintf(stderr, "virtio-blk failed to set guest notifier, "
                "ensure -enable-kvm is set\n");
        exit(1);
    }
    s->guest_notifier = virtio_queue_get_guest_notifier(vq);

    /* Set up virtqueue notify */
    if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
                                            0, true) != 0) {
        fprintf(stderr, "virtio-blk failed to set host notifier\n");
        exit(1);
    }
    event_poll_add(&s->event_poll, &s->notify_handler,
                   virtio_queue_get_host_notifier(vq),
                   handle_notify);

    /* Set up ioqueue */
    ioq_init(&s->ioqueue, s->fd, REQ_MAX);
    for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
        ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
    }
    event_poll_add(&s->event_poll, &s->io_handler,
                   ioq_get_notifier(&s->ioqueue), handle_io);

    s->started = true;
    trace_virtio_blk_data_plane_start(s);

    /* Kick right away to begin processing requests already in vring */
    event_notifier_set(virtio_queue_get_host_notifier(vq));

    /* Spawn thread in BH so it inherits iothread cpusets */
    s->start_bh = qemu_bh_new(start_data_plane_bh, s);
    qemu_bh_schedule(s->start_bh);
}

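/* Ask the dataplane thread to finish in-flight requests and exit, then tear
 * down notifiers, the I/O queue, and the vring mapping.  Safe to call when
 * already stopped or never started.
 */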
void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
{
    if (!s->started || s->stopping) {
        return;
    }
    s->stopping = true;
    trace_virtio_blk_data_plane_stop(s);

    /* Stop thread or cancel pending thread creation BH */
    if (s->start_bh) {
        qemu_bh_delete(s->start_bh);
        s->start_bh = NULL;
    } else {
        event_poll_notify(&s->event_poll);
        qemu_thread_join(&s->thread);
    }

    ioq_cleanup(&s->ioqueue);

    s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false);

    event_poll_cleanup(&s->event_poll);

    /* Clean up guest notifier (irq) */
    s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1, false);

    vring_teardown(&s->vring);
    s->started = false;
    s->stopping = false;
}