2 * Copyright (c) 2018 Citrix Systems Inc.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; under version 2 of the License.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, see <http://www.gnu.org/licenses/>.
17 * Contributions after 2012-01-13 are licensed under the terms of the
18 * GNU GPL, version 2 or (at your option) any later version.
21 #include "qemu/osdep.h"
22 #include "qemu/error-report.h"
23 #include "qapi/error.h"
25 #include "hw/xen/xen_common.h"
26 #include "hw/block/xen_blkif.h"
27 #include "sysemu/block-backend.h"
28 #include "sysemu/iothread.h"
29 #include "xen-block.h"
/*
 * Per-request bookkeeping for one block request taken from the shared ring.
 * NOTE(review): only two members are visible in this excerpt; the other
 * members used by the functions below (req, status, start, size, buf, v,
 * acct, presync, aio_inflight, aio_errors) are declared on lines not shown.
 */
31 typedef struct XenBlockRequest {
/* Owning dataplane; set at allocation and restored after reset_request(). */
41 XenBlockDataPlane *dataplane;
/* Linkage used for the dataplane's inflight list and freelist. */
42 QLIST_ENTRY(XenBlockRequest) list;
/*
 * State of one Xen block dataplane: the shared ring, its event channel and
 * the pools of request structures.
 * NOTE(review): members referenced elsewhere in this file (xendev, blk, ctx,
 * bh, iothread, sring, protocol, more_work, requests_total) are declared on
 * lines not visible in this excerpt.
 */
46 struct XenBlockDataPlane {
/* Bound in xen_block_dataplane_start(), unbound in xen_block_dataplane_stop(). */
48 XenEventChannel *event_channel;
/* Private copy of the caller's grant references for the shared ring. */
49 unsigned int *ring_ref;
/* Number of entries in ring_ref. */
50 unsigned int nr_ring_ref;
/* Back-ring views (native / x86_32_part / x86_64_part / common) of the ring. */
53 blkif_back_rings_t rings;
/* Requests handed out by xen_block_start_request() and not yet released. */
55 QLIST_HEAD(inflight_head, XenBlockRequest) inflight;
/* Idle request structures kept for re-use. */
56 QLIST_HEAD(freelist_head, XenBlockRequest) freelist;
/* Count of requests currently accounted as in flight. */
58 int requests_inflight;
/* Upper bound on allocated requests; derived from the ring size at start. */
59 unsigned int max_requests;
/*
 * Wipe the per-use state of a request so the structure can be re-used.
 *
 * Clears the copied ring request, the AIO counters, the owner pointer, the
 * list linkage and the accounting data, then resets the I/O vector with
 * qemu_iovec_reset(). Callers that still need request->dataplane must
 * restore it afterwards (see xen_block_release_request()).
 * NOTE(review): some statements of this function are not visible in this
 * excerpt.
 */
66 static void reset_request(XenBlockRequest *request)
68 memset(&request->req, 0, sizeof(request->req));
74 request->aio_inflight = 0;
75 request->aio_errors = 0;
77 request->dataplane = NULL;
78 memset(&request->list, 0, sizeof(request->list));
79 memset(&request->acct, 0, sizeof(request->acct));
81 qemu_iovec_reset(&request->v);
/*
 * Obtain a request structure and account it as in flight.
 *
 * A structure is taken from the freelist when one is available; otherwise a
 * new one is allocated together with its page-aligned data buffer and I/O
 * vector. Either way the request is linked onto the inflight list and
 * requests_inflight is incremented.
 *
 * NOTE(review): the branch taken when requests_total has reached
 * max_requests, and the return statements, are on lines not shown here.
 * xen_block_handle_requests() checks the result against NULL, so presumably
 * NULL is returned when the cap is hit - confirm.
 */
84 static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane)
86 XenBlockRequest *request = NULL;
88 if (QLIST_EMPTY(&dataplane->freelist)) {
/* Nothing to re-use: only allocate while below the configured cap. */
89 if (dataplane->requests_total >= dataplane->max_requests) {
92 /* allocate new struct */
93 request = g_malloc0(sizeof(*request));
94 request->dataplane = dataplane;
96 * We cannot need more pages per requests than this, and since we
97 * re-use requests, allocate the memory once here. It will be freed
98 * xen_block_dataplane_destroy() when the request list is freed.
100 request->buf = qemu_memalign(XC_PAGE_SIZE,
101 BLKIF_MAX_SEGMENTS_PER_REQUEST *
103 dataplane->requests_total++;
104 qemu_iovec_init(&request->v, 1);
106 /* get one from freelist */
107 request = QLIST_FIRST(&dataplane->freelist);
108 QLIST_REMOVE(request, list);
/* Common tail: the request is now tracked as in flight. */
110 QLIST_INSERT_HEAD(&dataplane->inflight, request, list);
111 dataplane->requests_inflight++;
/*
 * Unlink a request from the list it is currently on and decrement the
 * in-flight counter. The request is not put on the freelist here.
 *
 * NOTE(review): xen_block_release_request() performs the same QLIST_REMOVE
 * and the same decrement. xen_block_complete_aio() appears to call both on
 * one request, which would unlink twice and decrement requests_inflight
 * twice - verify against the full function.
 */
117 static void xen_block_finish_request(XenBlockRequest *request)
119 XenBlockDataPlane *dataplane = request->dataplane;
121 QLIST_REMOVE(request, list);
122 dataplane->requests_inflight--;
/*
 * Recycle a request: unlink it, reset its state, put it on the freelist and
 * decrement the in-flight counter.
 *
 * NOTE(review): this assumes the request is still linked and still counted
 * as in flight. If xen_block_finish_request() already ran on the same
 * request, the QLIST_REMOVE and the decrement below happen a second time -
 * verify the callers.
 */
125 static void xen_block_release_request(XenBlockRequest *request)
/* Saved because reset_request() clears request->dataplane. */
127 XenBlockDataPlane *dataplane = request->dataplane;
129 QLIST_REMOVE(request, list);
130 reset_request(request);
131 request->dataplane = dataplane;
132 QLIST_INSERT_HEAD(&dataplane->freelist, request, list);
133 dataplane->requests_inflight--;
137 * translate request into iovec + start offset
138 * do sanity checks along the way
/*
 * Validates request->req and derives request->start (byte offset) and
 * request->size (total byte length over all segments).
 *
 * Checks performed, each reported through error_report():
 *  - the operation is a known one;
 *  - anything other than BLKIF_OP_READ is rejected on a read-only backend;
 *  - nr_segments does not exceed BLKIF_MAX_SEGMENTS_PER_REQUEST;
 *  - for every segment, first_sect <= last_sect and the segment does not
 *    cross a page;
 *  - start + size does not run past blk_getlength().
 *
 * On failure request->status is set to BLKIF_RSP_ERROR.
 * NOTE(review): the return statements and error label are not visible here;
 * xen_block_handle_requests() treats a non-zero return as failure.
 */
140 static int xen_block_parse_request(XenBlockRequest *request)
142 XenBlockDataPlane *dataplane = request->dataplane;
146 switch (request->req.operation) {
149 case BLKIF_OP_FLUSH_DISKCACHE:
/* Ask xen_block_do_aio() to issue a flush before any data is written. */
150 request->presync = 1;
151 if (!request->req.nr_segments) {
157 case BLKIF_OP_DISCARD:
160 error_report("error: unknown operation (%d)", request->req.operation);
164 if (request->req.operation != BLKIF_OP_READ &&
165 blk_is_read_only(dataplane->blk)) {
166 error_report("error: write req for ro device");
/* Sector number from the ring converted to a byte offset. */
170 request->start = request->req.sector_number * XEN_BLKIF_SECTOR_SIZE;
171 for (i = 0; i < request->req.nr_segments; i++) {
/* nr_segments is guest-supplied; stop before indexing past seg[]. */
172 if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
173 error_report("error: nr_segments too big");
176 if (request->req.seg[i].first_sect > request->req.seg[i].last_sect) {
177 error_report("error: first > last sector");
180 if (request->req.seg[i].last_sect * XEN_BLKIF_SECTOR_SIZE >=
182 error_report("error: page crossing");
/* Sector range is inclusive, hence the + 1. */
186 len = (request->req.seg[i].last_sect -
187 request->req.seg[i].first_sect + 1) * XEN_BLKIF_SECTOR_SIZE;
188 request->size += len;
190 if (request->start + request->size > blk_getlength(dataplane->blk)) {
191 error_report("error: access beyond end of file");
197 request->status = BLKIF_RSP_ERROR;
/*
 * Copy the request's data between the local bounce buffer (request->buf)
 * and the guest pages named by the grant references in request->req.seg[].
 *
 * For BLKIF_OP_READ the data goes to the guest (to_domain is true); for any
 * other operation it is fetched from the guest. One grant-copy segment is
 * built per request segment and all of them are submitted in a single
 * xen_device_copy_grant_refs() call.
 *
 * On a copy error the error is reported and request->aio_errors is
 * incremented.
 * NOTE(review): the return statements are not visible in this excerpt;
 * xen_block_do_aio() treats a non-zero return as failure.
 */
201 static int xen_block_copy_request(XenBlockRequest *request)
203 XenBlockDataPlane *dataplane = request->dataplane;
204 XenDevice *xendev = dataplane->xendev;
205 XenDeviceGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
207 bool to_domain = (request->req.operation == BLKIF_OP_READ);
208 void *virt = request->buf;
209 Error *local_err = NULL;
/* Nothing to copy for a request without data segments. */
211 if (request->req.nr_segments == 0) {
215 count = request->req.nr_segments;
217 for (i = 0; i < count; i++) {
/*
 * Two mirrored set-ups follow: guest page as destination with the local
 * buffer as source, then the reverse. The condition choosing between them
 * is on a line not shown - presumably to_domain.
 */
219 segs[i].dest.foreign.ref = request->req.seg[i].gref;
220 segs[i].dest.foreign.offset = request->req.seg[i].first_sect *
221 XEN_BLKIF_SECTOR_SIZE;
222 segs[i].source.virt = virt;
224 segs[i].source.foreign.ref = request->req.seg[i].gref;
225 segs[i].source.foreign.offset = request->req.seg[i].first_sect *
226 XEN_BLKIF_SECTOR_SIZE;
227 segs[i].dest.virt = virt;
/* Inclusive sector range converted to a byte length. */
229 segs[i].len = (request->req.seg[i].last_sect -
230 request->req.seg[i].first_sect + 1) *
231 XEN_BLKIF_SECTOR_SIZE;
235 xen_device_copy_grant_refs(xendev, to_domain, segs, count, &local_err);
238 error_reportf_err(local_err, "failed to copy data: ");
/* Recorded so the completion path reports BLKIF_RSP_ERROR. */
240 request->aio_errors++;
/*
 * Forward declarations: xen_block_complete_aio() below calls both of these
 * before their definitions appear.
 */
247 static int xen_block_do_aio(XenBlockRequest *request);
248 static int xen_block_send_response(XenBlockRequest *request);
/*
 * AIO completion callback for every block-layer operation issued on behalf
 * of a request (opaque is the XenBlockRequest).
 *
 * Visible flow, all under the dataplane's AioContext lock:
 *  - an I/O error is reported and counted in request->aio_errors;
 *  - aio_inflight is decremented; if this completion was the pre-flush
 *    (presync), the flag is cleared and xen_block_do_aio() is called again
 *    to issue the actual I/O;
 *  - while other AIOs of this request are outstanding nothing more is done;
 *  - otherwise data is copied back to the guest where needed, the final
 *    status is derived from aio_errors, block accounting is completed, a
 *    response is pushed to the ring (notifying the frontend if required),
 *    the request is recycled and the bottom half is scheduled.
 *
 * NOTE(review): the request is passed to xen_block_finish_request() and
 * later to xen_block_release_request(). Both unlink it and both decrement
 * requests_inflight, which looks like a double removal / double decrement
 * unless lines not shown here prevent it - verify.
 */
250 static void xen_block_complete_aio(void *opaque, int ret)
252 XenBlockRequest *request = opaque;
253 XenBlockDataPlane *dataplane = request->dataplane;
255 aio_context_acquire(dataplane->ctx);
258 error_report("%s I/O error",
259 request->req.operation == BLKIF_OP_READ ?
261 request->aio_errors++;
264 request->aio_inflight--;
/* The flush that had to precede the write is done; now issue the write. */
265 if (request->presync) {
266 request->presync = 0;
267 xen_block_do_aio(request);
/* Wait for the remaining AIOs of this request before responding. */
270 if (request->aio_inflight > 0) {
274 switch (request->req.operation) {
276 /* in case of failure request->aio_errors is increased */
278 xen_block_copy_request(request);
282 case BLKIF_OP_FLUSH_DISKCACHE:
287 request->status = request->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
288 xen_block_finish_request(request);
/* Close the accounting cookie opened with block_acct_start() in do_aio. */
290 switch (request->req.operation) {
292 case BLKIF_OP_FLUSH_DISKCACHE:
293 if (!request->req.nr_segments) {
298 if (request->status == BLKIF_RSP_OKAY) {
299 block_acct_done(blk_get_stats(dataplane->blk), &request->acct);
301 block_acct_failed(blk_get_stats(dataplane->blk), &request->acct);
304 case BLKIF_OP_DISCARD:
/* Non-zero means the frontend must be notified of the new response. */
308 if (xen_block_send_response(request)) {
309 Error *local_err = NULL;
311 xen_device_notify_event_channel(dataplane->xendev,
312 dataplane->event_channel,
315 error_report_err(local_err);
318 xen_block_release_request(request);
/* Let the bottom half look at the ring again. */
320 qemu_bh_schedule(dataplane->bh);
323 aio_context_release(dataplane->ctx);
/*
 * Issue a discard for nr_sectors sectors starting at sector_number, split
 * into chunks of at most BDRV_REQUEST_MAX_BYTES.
 *
 * Every chunk increments request->aio_inflight and completes through
 * xen_block_complete_aio(). Ranges that wrap around in sector arithmetic or
 * whose end exceeds INT64_MAX bytes are rejected before any I/O is issued.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * xen_block_do_aio() treats a false result as failure.
 */
326 static bool xen_block_split_discard(XenBlockRequest *request,
327 blkif_sector_t sector_number,
330 XenBlockDataPlane *dataplane = request->dataplane;
333 uint64_t byte_remaining;
334 uint64_t sec_start = sector_number;
335 uint64_t sec_count = nr_sectors;
337 /* Wrap around, or overflowing byte limit? */
338 if (sec_start + sec_count < sec_count ||
339 sec_start + sec_count > INT64_MAX / XEN_BLKIF_SECTOR_SIZE) {
343 byte_offset = sec_start * XEN_BLKIF_SECTOR_SIZE;
344 byte_remaining = sec_count * XEN_BLKIF_SECTOR_SIZE;
/* Clamp each chunk to the block layer's per-request maximum. */
347 byte_chunk = byte_remaining > BDRV_REQUEST_MAX_BYTES ?
348 BDRV_REQUEST_MAX_BYTES : byte_remaining;
349 request->aio_inflight++;
350 blk_aio_pdiscard(dataplane->blk, byte_offset, byte_chunk,
351 xen_block_complete_aio, request);
352 byte_remaining -= byte_chunk;
353 byte_offset += byte_chunk;
354 } while (byte_remaining > 0);
/*
 * Submit the block-layer I/O for a parsed request.
 *
 * Visible flow:
 *  - for writes and flushes carrying data, the guest's data is first copied
 *    into the bounce buffer; a copy failure aborts the request;
 *  - aio_inflight is raised once up front, and the matching decrement
 *    happens through the direct xen_block_complete_aio() call near the end;
 *  - if presync is set only a flush is issued here; its completion calls
 *    this function again with presync cleared;
 *  - otherwise a read, write or (split) discard is issued, each with its own
 *    aio_inflight increment and xen_block_complete_aio() as callback.
 *
 * NOTE(review): in the visible failure tail the request is passed to
 * xen_block_finish_request() and marked BLKIF_RSP_ERROR, but no response or
 * release is visible; confirm against the full function that the request is
 * not leaked. Return statements are not visible in this excerpt.
 */
359 static int xen_block_do_aio(XenBlockRequest *request)
361 XenBlockDataPlane *dataplane = request->dataplane;
363 if (request->req.nr_segments &&
364 (request->req.operation == BLKIF_OP_WRITE ||
365 request->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
366 xen_block_copy_request(request)) {
370 request->aio_inflight++;
371 if (request->presync) {
372 blk_aio_flush(request->dataplane->blk, xen_block_complete_aio,
377 switch (request->req.operation) {
/* The whole bounce buffer region is submitted as a single iovec element. */
379 qemu_iovec_add(&request->v, request->buf, request->size);
380 block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
381 request->v.size, BLOCK_ACCT_READ);
382 request->aio_inflight++;
383 blk_aio_preadv(dataplane->blk, request->start, &request->v, 0,
384 xen_block_complete_aio, request);
387 case BLKIF_OP_FLUSH_DISKCACHE:
388 if (!request->req.nr_segments) {
392 qemu_iovec_add(&request->v, request->buf, request->size);
393 block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
395 request->req.operation == BLKIF_OP_WRITE ?
396 BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
397 request->aio_inflight++;
398 blk_aio_pwritev(dataplane->blk, request->start, &request->v, 0,
399 xen_block_complete_aio, request);
401 case BLKIF_OP_DISCARD:
/* Discard requests share the ring slot layout; reinterpret the copy. */
403 struct blkif_request_discard *req = (void *)&request->req;
404 if (!xen_block_split_discard(request, req->sector_number,
411 /* unknown operation (shouldn't happen -- parse catches this) */
/* Drops the aio_inflight reference taken at the top of this function. */
415 xen_block_complete_aio(request, 0);
420 xen_block_finish_request(request);
421 request->status = BLKIF_RSP_ERROR;
/*
 * Write the response for a request into the next response slot of the
 * shared ring and publish it to the frontend.
 *
 * The slot is looked up through the ring view matching the negotiated
 * protocol, then filled with the request's id, operation and status. After
 * pushing, the ring is checked for further requests; when some are pending
 * dataplane->more_work is incremented.
 *
 * NOTE(review): the return statement is not visible in this excerpt. Callers
 * notify the event channel when the result is non-zero, so presumably it is
 * the notify flag produced by RING_PUSH_RESPONSES_AND_CHECK_NOTIFY - confirm.
 */
425 static int xen_block_send_response(XenBlockRequest *request)
427 XenBlockDataPlane *dataplane = request->dataplane;
429 int have_requests = 0;
430 blkif_response_t *resp;
432 /* Place on the response ring for the relevant domain. */
433 switch (dataplane->protocol) {
434 case BLKIF_PROTOCOL_NATIVE:
435 resp = (blkif_response_t *)RING_GET_RESPONSE(
436 &dataplane->rings.native,
437 dataplane->rings.native.rsp_prod_pvt);
439 case BLKIF_PROTOCOL_X86_32:
440 resp = (blkif_response_t *)RING_GET_RESPONSE(
441 &dataplane->rings.x86_32_part,
442 dataplane->rings.x86_32_part.rsp_prod_pvt);
444 case BLKIF_PROTOCOL_X86_64:
445 resp = (blkif_response_t *)RING_GET_RESPONSE(
446 &dataplane->rings.x86_64_part,
447 dataplane->rings.x86_64_part.rsp_prod_pvt);
453 resp->id = request->req.id;
454 resp->operation = request->req.operation;
455 resp->status = request->status;
/* Advance the private producer index; the push below makes it visible. */
457 dataplane->rings.common.rsp_prod_pvt++;
459 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&dataplane->rings.common,
461 if (dataplane->rings.common.rsp_prod_pvt ==
462 dataplane->rings.common.req_cons) {
464 * Tail check for pending requests. Allows frontend to avoid
465 * notifications if requests are already in flight (lower
466 * overheads and promotes batching).
468 RING_FINAL_CHECK_FOR_REQUESTS(&dataplane->rings.common,
470 } else if (RING_HAS_UNCONSUMED_REQUESTS(&dataplane->rings.common)) {
475 dataplane->more_work++;
/*
 * Copy the ring request at index rc into request->req.
 *
 * The native layout is copied with memcpy(); the x86_32 and x86_64 layouts
 * are converted with blkif_get_x86_32_req() / blkif_get_x86_64_req(). All
 * later processing works on this private copy, not on the shared ring.
 *
 * NOTE(review): the barrier mentioned by the trailing comment and the return
 * statement are on lines not visible in this excerpt.
 */
480 static int xen_block_get_request(XenBlockDataPlane *dataplane,
481 XenBlockRequest *request, RING_IDX rc)
483 switch (dataplane->protocol) {
484 case BLKIF_PROTOCOL_NATIVE: {
485 blkif_request_t *req =
486 RING_GET_REQUEST(&dataplane->rings.native, rc);
488 memcpy(&request->req, req, sizeof(request->req));
491 case BLKIF_PROTOCOL_X86_32: {
492 blkif_x86_32_request_t *req =
493 RING_GET_REQUEST(&dataplane->rings.x86_32_part, rc);
495 blkif_get_x86_32_req(&request->req, req);
498 case BLKIF_PROTOCOL_X86_64: {
499 blkif_x86_64_request_t *req =
500 RING_GET_REQUEST(&dataplane->rings.x86_64_part, rc);
502 blkif_get_x86_64_req(&request->req, req);
506 /* Prevent the compiler from accessing the on-ring fields instead. */
512 * Threshold of in-flight requests above which we will start using
513 * blk_io_plug()/blk_io_unplug() to batch requests.
/*
 * Compared with a strict '>' against the in-flight count sampled on entry to
 * xen_block_handle_requests(), so batching starts at two or more in flight.
 */
515 #define IO_PLUG_THRESHOLD 1
/*
 * Drain the request ring: take each pending request, validate it and submit
 * its I/O.
 *
 * Visible flow:
 *  - the consumer index and the frontend's producer index are sampled, with
 *    a read barrier after reading the producer;
 *  - when more than IO_PLUG_THRESHOLD requests were in flight on entry,
 *    submissions are batched with blk_io_plug()/blk_io_unplug();
 *  - for each ring entry a request structure is obtained; if none is
 *    available more_work is raised so the ring is revisited later;
 *  - a request that fails parsing is accounted as invalid, answered
 *    immediately through xen_block_send_response() and released;
 *  - a valid request is handed to xen_block_do_aio();
 *  - finally the bottom half is rescheduled if work remains and a request
 *    slot is available.
 *
 * NOTE(review): the loop header, the declarations of rc/rp/batched and the
 * batch counter updates are on lines not visible in this excerpt.
 */
517 static void xen_block_handle_requests(XenBlockDataPlane *dataplane)
520 XenBlockRequest *request;
/* Snapshot taken before this pass adds requests of its own. */
521 int inflight_atstart = dataplane->requests_inflight;
524 dataplane->more_work = 0;
526 rc = dataplane->rings.common.req_cons;
527 rp = dataplane->rings.common.sring->req_prod;
528 xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
531 * If there was more than IO_PLUG_THRESHOLD requests in flight
532 * when we got here, this is an indication that there the bottleneck
533 * is below us, so it's worth beginning to batch up I/O requests
534 * rather than submitting them immediately. The maximum number
535 * of requests we're willing to batch is the number already in
536 * flight, so it can grow up to max_requests when the bottleneck
539 if (inflight_atstart > IO_PLUG_THRESHOLD) {
540 blk_io_plug(dataplane->blk);
543 /* pull request from ring */
544 if (RING_REQUEST_CONS_OVERFLOW(&dataplane->rings.common, rc)) {
547 request = xen_block_start_request(dataplane);
/* No request structure available: remember to come back to the ring. */
548 if (request == NULL) {
549 dataplane->more_work++;
552 xen_block_get_request(dataplane, request, rc);
/* The entry has been copied out, so the slot can be marked consumed. */
553 dataplane->rings.common.req_cons = ++rc;
556 if (xen_block_parse_request(request) != 0) {
557 switch (request->req.operation) {
559 block_acct_invalid(blk_get_stats(dataplane->blk),
563 block_acct_invalid(blk_get_stats(dataplane->blk),
566 case BLKIF_OP_FLUSH_DISKCACHE:
567 block_acct_invalid(blk_get_stats(dataplane->blk),
/* Answer the rejected request right away; status was set by the parser. */
573 if (xen_block_send_response(request)) {
574 Error *local_err = NULL;
576 xen_device_notify_event_channel(dataplane->xendev,
577 dataplane->event_channel,
580 error_report_err(local_err);
583 xen_block_release_request(request);
/* Batch is full: flush what has been queued before submitting more. */
587 if (inflight_atstart > IO_PLUG_THRESHOLD &&
588 batched >= inflight_atstart) {
589 blk_io_unplug(dataplane->blk);
591 xen_block_do_aio(request);
592 if (inflight_atstart > IO_PLUG_THRESHOLD) {
593 if (batched >= inflight_atstart) {
594 blk_io_plug(dataplane->blk);
/* Balance the plug that is still open when the loop ends. */
601 if (inflight_atstart > IO_PLUG_THRESHOLD) {
602 blk_io_unplug(dataplane->blk);
605 if (dataplane->more_work &&
606 dataplane->requests_inflight < dataplane->max_requests) {
607 qemu_bh_schedule(dataplane->bh);
/*
 * Bottom-half handler (registered in xen_block_dataplane_create()): process
 * the request ring while holding the dataplane's AioContext lock.
 * opaque is the XenBlockDataPlane.
 */
611 static void xen_block_dataplane_bh(void *opaque)
613 XenBlockDataPlane *dataplane = opaque;
615 aio_context_acquire(dataplane->ctx);
616 xen_block_handle_requests(dataplane);
617 aio_context_release(dataplane->ctx);
/*
 * Event-channel callback (bound in xen_block_dataplane_start()): defers the
 * ring processing to the dataplane's bottom half instead of doing it here.
 * opaque is the XenBlockDataPlane.
 */
620 static void xen_block_dataplane_event(void *opaque)
622 XenBlockDataPlane *dataplane = opaque;
624 qemu_bh_schedule(dataplane->bh);
/*
 * Allocate and initialise a dataplane for the given Xen device.
 *
 * The dataplane records the device and the BlockBackend from conf, starts
 * with empty inflight/free lists, and creates a bottom half running
 * xen_block_dataplane_bh() in the chosen AioContext: the iothread's context
 * (taking a reference on the iothread) or the main loop's context.
 *
 * NOTE(review): the remaining parameters, the condition selecting between
 * the two contexts and the return statement are on lines not visible here.
 */
627 XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
631 XenBlockDataPlane *dataplane = g_new0(XenBlockDataPlane, 1);
633 dataplane->xendev = xendev;
634 dataplane->blk = conf->blk;
636 QLIST_INIT(&dataplane->inflight);
637 QLIST_INIT(&dataplane->freelist);
640 dataplane->iothread = iothread;
/* Reference dropped again in xen_block_dataplane_destroy(). */
641 object_ref(OBJECT(dataplane->iothread));
642 dataplane->ctx = iothread_get_aio_context(dataplane->iothread);
644 dataplane->ctx = qemu_get_aio_context();
646 dataplane->bh = aio_bh_new(dataplane->ctx, xen_block_dataplane_bh,
/*
 * Tear down a dataplane created by xen_block_dataplane_create().
 *
 * Every request on the freelist has its I/O vector destroyed and its data
 * buffer freed; then the bottom half is deleted and the iothread reference,
 * if any, is dropped.
 *
 * NOTE(review): only the freelist is visibly drained; requests still on the
 * inflight list are not handled in the visible lines. The freeing of the
 * request structures and of the dataplane itself is on lines not shown.
 */
652 void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane)
654 XenBlockRequest *request;
660 while (!QLIST_EMPTY(&dataplane->freelist)) {
661 request = QLIST_FIRST(&dataplane->freelist);
662 QLIST_REMOVE(request, list);
663 qemu_iovec_destroy(&request->v);
664 qemu_vfree(request->buf);
668 qemu_bh_delete(dataplane->bh);
669 if (dataplane->iothread) {
670 object_unref(OBJECT(dataplane->iothread));
/*
 * Disconnect the dataplane from the frontend; the counterpart of
 * xen_block_dataplane_start().
 *
 * Moves the BlockBackend back to the main loop's AioContext, unbinds the
 * event channel, unmaps the shared ring and frees the private copy of the
 * grant references. Each resource is checked or reset to NULL, so this is
 * also used as the error path of xen_block_dataplane_start() when only some
 * of them have been set up. Errors from unbinding/unmapping are reported,
 * not propagated.
 *
 * NOTE(review): the opening lines of this function are not visible in this
 * excerpt.
 */
676 void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
684 aio_context_acquire(dataplane->ctx);
685 /* Xen doesn't have multiple users for nodes, so this can't fail */
686 blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), &error_abort);
687 aio_context_release(dataplane->ctx);
689 xendev = dataplane->xendev;
691 if (dataplane->event_channel) {
692 Error *local_err = NULL;
694 xen_device_unbind_event_channel(xendev, dataplane->event_channel,
696 dataplane->event_channel = NULL;
699 error_report_err(local_err);
703 if (dataplane->sring) {
704 Error *local_err = NULL;
706 xen_device_unmap_grant_refs(xendev, dataplane->sring,
707 dataplane->nr_ring_ref, &local_err);
708 dataplane->sring = NULL;
711 error_report_err(local_err);
/* g_free(NULL) is a no-op, so this is safe if start never allocated it. */
715 g_free(dataplane->ring_ref);
716 dataplane->ring_ref = NULL;
/*
 * Connect the dataplane to the frontend's shared ring and event channel.
 *
 * @ring_ref:      grant references of the ring pages (copied, not retained)
 * @nr_ring_ref:   number of entries in @ring_ref
 * @event_channel: event channel port to bind
 * @protocol:      ring ABI (BLKIF_PROTOCOL_NATIVE / _X86_32 / _X86_64)
 *
 * Steps: copy the grant references, derive max_requests from the ring size
 * for the selected protocol, map the ring pages read/write, initialise the
 * matching back ring, bind the event channel to
 * xen_block_dataplane_event(), and finally move the BlockBackend into the
 * dataplane's AioContext. Failures are propagated through errp.
 *
 * NOTE(review): the trailing parameter(s), the error checks and the jump
 * targets are on lines not visible here. The final
 * xen_block_dataplane_stop() call is presumably the shared error path. How
 * the "unknown protocol" case exits is also not visible - check that the
 * ring_ref copy allocated above is released on that path.
 */
719 void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
720 const unsigned int ring_ref[],
721 unsigned int nr_ring_ref,
722 unsigned int event_channel,
723 unsigned int protocol,
726 XenDevice *xendev = dataplane->xendev;
727 Error *local_err = NULL;
728 unsigned int ring_size;
731 dataplane->nr_ring_ref = nr_ring_ref;
/* Freed in xen_block_dataplane_stop(). */
732 dataplane->ring_ref = g_new(unsigned int, nr_ring_ref);
734 for (i = 0; i < nr_ring_ref; i++) {
735 dataplane->ring_ref[i] = ring_ref[i];
738 dataplane->protocol = protocol;
/* Ring capacity depends on the per-protocol request slot size. */
740 ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
741 switch (dataplane->protocol) {
742 case BLKIF_PROTOCOL_NATIVE:
744 dataplane->max_requests = __CONST_RING_SIZE(blkif, ring_size);
747 case BLKIF_PROTOCOL_X86_32:
749 dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
752 case BLKIF_PROTOCOL_X86_64:
754 dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
758 error_setg(errp, "unknown protocol %u", dataplane->protocol);
762 xen_device_set_max_grant_refs(xendev, dataplane->nr_ring_ref,
765 error_propagate(errp, local_err);
769 dataplane->sring = xen_device_map_grant_refs(xendev,
771 dataplane->nr_ring_ref,
772 PROT_READ | PROT_WRITE,
775 error_propagate(errp, local_err);
/* Initialise the back-ring view that matches the frontend's ABI. */
779 switch (dataplane->protocol) {
780 case BLKIF_PROTOCOL_NATIVE:
782 blkif_sring_t *sring_native = dataplane->sring;
784 BACK_RING_INIT(&dataplane->rings.native, sring_native, ring_size);
787 case BLKIF_PROTOCOL_X86_32:
789 blkif_x86_32_sring_t *sring_x86_32 = dataplane->sring;
791 BACK_RING_INIT(&dataplane->rings.x86_32_part, sring_x86_32,
795 case BLKIF_PROTOCOL_X86_64:
797 blkif_x86_64_sring_t *sring_x86_64 = dataplane->sring;
799 BACK_RING_INIT(&dataplane->rings.x86_64_part, sring_x86_64,
805 dataplane->event_channel =
806 xen_device_bind_event_channel(xendev, event_channel,
807 xen_block_dataplane_event, dataplane,
810 error_propagate(errp, local_err);
814 aio_context_acquire(dataplane->ctx);
815 /* If other users keep the BlockBackend in the iothread, that's ok */
816 blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
817 aio_context_release(dataplane->ctx);
821 xen_block_dataplane_stop(dataplane);