/*
 * Dedicated thread for virtio-blk I/O processing
 *
 * Copyright 2012 IBM, Corp.
 * Copyright 2012 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Stefan Hajnoczi <[email protected]>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "trace.h"
#include "qemu/iov.h"
#include "event-poll.h"
#include "qemu/thread.h"
#include "vring.h"
#include "ioq.h"
#include "migration/migration.h"
#include "hw/virtio-blk.h"
#include "hw/dataplane/virtio-blk.h"

enum {
    SEG_MAX = 126,                  /* maximum number of I/O segments */
    VRING_MAX = SEG_MAX + 2,        /* maximum number of vring descriptors */
    REQ_MAX = VRING_MAX,            /* maximum number of requests in the vring,
                                     * is VRING_MAX / 2 with traditional and
                                     * VRING_MAX with indirect descriptors */
};

typedef struct {
    struct iocb iocb;               /* Linux AIO control block */
    QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
    unsigned int head;              /* vring descriptor index */
} VirtIOBlockRequest;

struct VirtIOBlockDataPlane {
    bool started;
    QEMUBH *start_bh;
    QemuThread thread;

    VirtIOBlkConf *blk;
    int fd;                         /* image file descriptor */

    VirtIODevice *vdev;
    Vring vring;                    /* virtqueue vring */
    EventNotifier *guest_notifier;  /* irq */

    EventPoll event_poll;           /* event poller */
    EventHandler io_handler;        /* Linux AIO completion handler */
    EventHandler notify_handler;    /* virtqueue notify handler */

    IOQueue ioqueue;                /* Linux AIO queue (should really be per
                                       dataplane thread) */
    VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
                                             queue */

    unsigned int num_reqs;

    Error *migration_blocker;
};

/* Raise an interrupt to signal guest, if necessary */
static void notify_guest(VirtIOBlockDataPlane *s)
{
    if (!vring_should_notify(s->vdev, &s->vring)) {
        return;
    }

    event_notifier_set(s->guest_notifier);
}

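/* Complete one request: fill in the virtio_blk_inhdr status byte and push the
 * used descriptor back onto the vring.  Called by the Linux AIO completion
 * handler for each finished iocb.
 */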
static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    struct virtio_blk_inhdr hdr;
    int len;

    if (likely(ret >= 0)) {
        hdr.status = VIRTIO_BLK_S_OK;
        len = ret;
    } else {
        hdr.status = VIRTIO_BLK_S_IOERR;
        len = 0;
    }

    trace_virtio_blk_data_plane_complete_request(s, req->head, ret);

    qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(req->inhdr);
    g_slice_free(QEMUIOVector, req->inhdr);

    /* According to the virtio specification len should be the number of bytes
     * written to, but for virtio-blk it seems to be the number of bytes
     * transferred plus the status bytes.
     */
    vring_push(&s->vring, req->head, len + sizeof(hdr));

    s->num_reqs--;
}

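/* Complete a request without submitting I/O, e.g. for unsupported or emulated
 * commands.  Notifies the guest immediately.
 */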
static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
                                   QEMUIOVector *inhdr, unsigned char status)
{
    struct virtio_blk_inhdr hdr = {
        .status = status,
    };

    qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(inhdr);
    g_slice_free(QEMUIOVector, inhdr);

    vring_push(&s->vring, head, sizeof(hdr));
    notify_guest(s);
}

/* Get disk serial number */
static void do_get_id_cmd(VirtIOBlockDataPlane *s,
                          struct iovec *iov, unsigned int iov_cnt,
                          unsigned int head, QEMUIOVector *inhdr)
{
    char id[VIRTIO_BLK_ID_BYTES];

    /* Serial number not NUL-terminated when shorter than buffer */
    strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
    iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
    complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
}

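/* Parse one virtio-blk request and either queue its I/O for submission to
 * Linux AIO or complete it immediately.  Returns 0 on success or -EFAULT if
 * the request layout is malformed.
 */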
static int process_request(IOQueue *ioq, struct iovec iov[],
                           unsigned int out_num, unsigned int in_num,
                           unsigned int head)
{
    VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
    struct iovec *in_iov = &iov[out_num];
    struct virtio_blk_outhdr outhdr;
    QEMUIOVector *inhdr;
    size_t in_size;
    struct iocb *iocb;

    /* Copy in outhdr */
    if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
                            sizeof(outhdr)) != sizeof(outhdr))) {
        error_report("virtio-blk request outhdr too short");
        return -EFAULT;
    }
    iov_discard_front(&iov, &out_num, sizeof(outhdr));

    /* Grab inhdr for later */
    in_size = iov_size(in_iov, in_num);
    if (in_size < sizeof(struct virtio_blk_inhdr)) {
        error_report("virtio-blk request inhdr too short");
        return -EFAULT;
    }
    inhdr = g_slice_new(QEMUIOVector);
    qemu_iovec_init(inhdr, 1);
    qemu_iovec_concat_iov(inhdr, in_iov, in_num,
                          in_size - sizeof(struct virtio_blk_inhdr),
                          sizeof(struct virtio_blk_inhdr));
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    /* TODO Linux sets the barrier bit even when not advertised! */
    outhdr.type &= ~VIRTIO_BLK_T_BARRIER;

    switch (outhdr.type) {
    case VIRTIO_BLK_T_IN:
        iocb = ioq_rdwr(ioq, true, in_iov, in_num, outhdr.sector * 512);
        break;

    case VIRTIO_BLK_T_OUT:
        iocb = ioq_rdwr(ioq, false, iov, out_num, outhdr.sector * 512);
        break;

    case VIRTIO_BLK_T_SCSI_CMD:
        /* TODO support SCSI commands */
        complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
        return 0;

    case VIRTIO_BLK_T_FLUSH:
        /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
        if (qemu_fdatasync(s->fd) < 0) {
            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
        } else {
            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
        }
        return 0;

    case VIRTIO_BLK_T_GET_ID:
        do_get_id_cmd(s, in_iov, in_num, head, inhdr);
        return 0;

    default:
        error_report("virtio-blk unsupported request type %#x", outhdr.type);
        qemu_iovec_destroy(inhdr);
        g_slice_free(QEMUIOVector, inhdr);
        return -EFAULT;
    }

    /* Fill in virtio block metadata needed for completion */
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    req->head = head;
    req->inhdr = inhdr;
    return 0;
}

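/* Virtqueue notify handler: drain new requests from the vring and submit the
 * resulting Linux AIO requests in a single batch.
 */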
static void handle_notify(EventHandler *handler)
{
    VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
                                           notify_handler);

    /* There is one array of iovecs into which all new requests are extracted
     * from the vring.  Requests are read from the vring and the translated
     * descriptors are written to the iovecs array.  The iovecs do not have to
     * persist across handle_notify() calls because the kernel copies the
     * iovecs on io_submit().
     *
     * Handling io_submit() EAGAIN may require storing the requests across
     * handle_notify() calls until the kernel has sufficient resources to
     * accept more I/O.  This is not implemented yet.
     */
    struct iovec iovec[VRING_MAX];
    struct iovec *end = &iovec[VRING_MAX];
    struct iovec *iov = iovec;

    /* When a request is read from the vring, the index of the first descriptor
     * (aka head) is returned so that the completed request can be pushed onto
     * the vring later.
     *
     * The number of hypervisor read-only iovecs is out_num.  The number of
     * hypervisor write-only iovecs is in_num.
     */
    int head;
    unsigned int out_num = 0, in_num = 0;
    unsigned int num_queued;

    for (;;) {
        /* Disable guest->host notifies to avoid unnecessary vmexits */
        vring_disable_notification(s->vdev, &s->vring);

        for (;;) {
            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
            if (head < 0) {
                break; /* no more requests */
            }

            trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
                                                        head);

            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
                vring_set_broken(&s->vring);
                break;
            }
            iov += out_num + in_num;
        }

        if (likely(head == -EAGAIN)) { /* vring emptied */
            /* Re-enable guest->host notifies and stop processing the vring.
             * But if the guest has snuck in more descriptors, keep processing.
             */
            if (vring_enable_notification(s->vdev, &s->vring)) {
                break;
            }
        } else { /* head == -ENOBUFS or fatal error, iovec[] is depleted */
            /* Since there are no iovecs left, stop processing for now.  Do
             * not re-enable guest->host notifies since the I/O completion
             * handler knows to check for more vring descriptors anyway.
             */
            break;
        }
    }

    num_queued = ioq_num_queued(&s->ioqueue);
    if (num_queued > 0) {
        s->num_reqs += num_queued;

        int rc = ioq_submit(&s->ioqueue);
        if (unlikely(rc < 0)) {
            fprintf(stderr, "ioq_submit failed %d\n", rc);
            exit(1);
        }
    }
}

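/* Linux AIO completion handler: complete finished requests, notify the guest,
 * and resume vring processing if requests were previously starved of iovecs.
 */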
static void handle_io(EventHandler *handler)
{
    VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
                                           io_handler);

    if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
        notify_guest(s);
    }

    /* If there were more requests than iovecs, the vring will not be empty yet
     * so check again.  There should now be enough resources to process more
     * requests.
     */
    if (unlikely(vring_more_avail(&s->vring))) {
        handle_notify(&s->notify_handler);
    }
}

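/* Event loop run by the dataplane thread.  Keeps polling until the device has
 * been stopped and all in-flight requests have completed.
 */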
static void *data_plane_thread(void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;

    do {
        event_poll(&s->event_poll);
    } while (s->started || s->num_reqs > 0);
    return NULL;
}

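/* Deferred thread creation, run as a bottom half in the iothread so the new
 * thread inherits the iothread cpusets (see virtio_blk_data_plane_start()).
 */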
static void start_data_plane_bh(void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;

    qemu_bh_delete(s->start_bh);
    s->start_bh = NULL;
    qemu_thread_create(&s->thread, data_plane_thread,
                       s, QEMU_THREAD_JOINABLE);
}

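/* Allocate *dataplane if the device and drive configuration support it.
 * Returns false on a fatal configuration error, true otherwise (including
 * when x-data-plane is simply not enabled).
 */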
bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
                                  VirtIOBlockDataPlane **dataplane)
{
    VirtIOBlockDataPlane *s;
    int fd;

    *dataplane = NULL;

    if (!blk->data_plane) {
        return true;
    }

    if (blk->scsi) {
        error_report("device is incompatible with x-data-plane, use scsi=off");
        return false;
    }

    if (blk->config_wce) {
        error_report("device is incompatible with x-data-plane, "
                     "use config-wce=off");
        return false;
    }

    fd = raw_get_aio_fd(blk->conf.bs);
    if (fd < 0) {
        error_report("drive is incompatible with x-data-plane, "
                     "use format=raw,cache=none,aio=native");
        return false;
    }

    s = g_new0(VirtIOBlockDataPlane, 1);
    s->vdev = vdev;
    s->fd = fd;
    s->blk = blk;

    /* Prevent block operations that conflict with data plane thread */
    bdrv_set_in_use(blk->conf.bs, 1);

    error_setg(&s->migration_blocker,
               "x-data-plane does not support migration");
    migrate_add_blocker(s->migration_blocker);

    *dataplane = s;
    return true;
}

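/* Free the dataplane state, stopping the thread first if necessary.  Safe to
 * call with s == NULL.
 */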
void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
{
    if (!s) {
        return;
    }

    virtio_blk_data_plane_stop(s);
    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
    bdrv_set_in_use(s->blk->conf.bs, 0);
    g_free(s);
}

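/* Set up the vring, guest/host notifiers, and Linux AIO queue, then schedule
 * creation of the dataplane thread.  No-op if already started.
 */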
void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
{
    VirtQueue *vq;
    int i;

    if (s->started) {
        return;
    }

    vq = virtio_get_queue(s->vdev, 0);
    if (!vring_setup(&s->vring, s->vdev, 0)) {
        return;
    }

    event_poll_init(&s->event_poll);

    /* Set up guest notifier (irq) */
    if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque,
                                              true) != 0) {
        fprintf(stderr, "virtio-blk failed to set guest notifier, "
                "ensure -enable-kvm is set\n");
        exit(1);
    }
    s->guest_notifier = virtio_queue_get_guest_notifier(vq);

    /* Set up virtqueue notify */
    if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
                                            0, true) != 0) {
        fprintf(stderr, "virtio-blk failed to set host notifier\n");
        exit(1);
    }
    event_poll_add(&s->event_poll, &s->notify_handler,
                   virtio_queue_get_host_notifier(vq),
                   handle_notify);

    /* Set up ioqueue */
    ioq_init(&s->ioqueue, s->fd, REQ_MAX);
    for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
        ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
    }
    event_poll_add(&s->event_poll, &s->io_handler,
                   ioq_get_notifier(&s->ioqueue), handle_io);

    s->started = true;
    trace_virtio_blk_data_plane_start(s);

    /* Kick right away to begin processing requests already in vring */
    event_notifier_set(virtio_queue_get_host_notifier(vq));

    /* Spawn thread in BH so it inherits iothread cpusets */
    s->start_bh = qemu_bh_new(start_data_plane_bh, s);
    qemu_bh_schedule(s->start_bh);
}

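/* Stop the dataplane thread (or cancel its pending creation) and tear down
 * notifiers, the ioqueue, and the vring.  No-op if not started.
 */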
void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
{
    if (!s->started) {
        return;
    }
    s->started = false;
    trace_virtio_blk_data_plane_stop(s);

    /* Stop thread or cancel pending thread creation BH */
    if (s->start_bh) {
        qemu_bh_delete(s->start_bh);
        s->start_bh = NULL;
    } else {
        event_poll_notify(&s->event_poll);
        qemu_thread_join(&s->thread);
    }

    ioq_cleanup(&s->ioqueue);

    s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false);

    event_poll_cleanup(&s->event_poll);

    /* Clean up guest notifier (irq) */
    s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, false);

    vring_teardown(&s->vring);
}