2 * Copyright (c) 2018 Citrix Systems Inc.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; under version 2 of the License.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, see <http://www.gnu.org/licenses/>.
17 * Contributions after 2012-01-13 are licensed under the terms of the
18 * GNU GPL, version 2 or (at your option) any later version.
21 #include "qemu/osdep.h"
22 #include "qemu/error-report.h"
23 #include "qapi/error.h"
25 #include "hw/xen/xen_common.h"
26 #include "hw/block/xen_blkif.h"
27 #include "sysemu/block-backend.h"
28 #include "sysemu/iothread.h"
29 #include "xen-block.h"
/*
 * State tracked for a single block request. Only part of the definition
 * is visible in this view; the members shown are the back-pointer to the
 * owning dataplane and the linkage used to chain the request onto a list.
 */
31 typedef struct XenBlockRequest {
/* Owning dataplane; cleared by reset_request(), restored on release. */
41 XenBlockDataPlane *dataplane;
/* Links the request into the dataplane's inflight list or freelist. */
42 QLIST_ENTRY(XenBlockRequest) list;
/*
 * Per-device dataplane state: ring and grant bookkeeping, the event
 * channel, and the lists and counters used to track requests. Only part
 * of the definition is visible in this view.
 */
46 struct XenBlockDataPlane {
/* Bound in xen_block_dataplane_start(); unbound and NULLed in _stop(). */
48 XenEventChannel *event_channel;
/* Private copy of the ring grant references, and how many there are. */
49 unsigned int *ring_ref;
50 unsigned int nr_ring_ref;
/* Back rings; the member matching the selected protocol is initialised. */
55 blkif_back_rings_t rings;
/* Requests currently being processed / idle requests kept for re-use. */
57 QLIST_HEAD(inflight_head, XenBlockRequest) inflight;
58 QLIST_HEAD(freelist_head, XenBlockRequest) freelist;
/* Number of requests currently accounted as in flight. */
60 int requests_inflight;
/* Cap on allocated requests; set from the ring size in _start(). */
61 unsigned int max_requests;
/*
 * Clear the per-use state of a request so that it can be recycled.
 *
 * This also clears request->dataplane and the list linkage, so the caller
 * has to restore the dataplane pointer and re-link the request itself
 * (xen_block_release_request() does both). The iovec is only reset here;
 * it is destroyed in xen_block_dataplane_destroy().
 */
68 static void reset_request(XenBlockRequest *request)
70 memset(&request->req, 0, sizeof(request->req));
76 request->aio_inflight = 0;
77 request->aio_errors = 0;
79 request->dataplane = NULL;
80 memset(&request->list, 0, sizeof(request->list));
81 memset(&request->acct, 0, sizeof(request->acct));
83 qemu_iovec_reset(&request->v);
/*
 * Obtain a request object and link it onto the dataplane's inflight list.
 *
 * A request is taken from the freelist when one is available. Otherwise a
 * new one is allocated, together with its page-aligned data buffer and
 * its iovec, as long as requests_total has not reached max_requests.
 *
 * The return statement is outside the visible lines; the caller in
 * xen_block_handle_requests() treats a NULL result as "no request
 * available right now".
 */
86 static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane)
88 XenBlockRequest *request = NULL;
/* Nothing idle to recycle: allocate, unless the cap has been reached. */
90 if (QLIST_EMPTY(&dataplane->freelist)) {
91 if (dataplane->requests_total >= dataplane->max_requests) {
94 /* allocate new struct */
95 request = g_malloc0(sizeof(*request));
96 request->dataplane = dataplane;
98 * We cannot need more pages per requests than this, and since we
99 * re-use requests, allocate the memory once here. It will be freed
100 * xen_block_dataplane_destroy() when the request list is freed.
102 request->buf = qemu_memalign(XC_PAGE_SIZE,
103 BLKIF_MAX_SEGMENTS_PER_REQUEST *
105 dataplane->requests_total++;
106 qemu_iovec_init(&request->v, 1);
108 /* get one from freelist */
109 request = QLIST_FIRST(&dataplane->freelist);
110 QLIST_REMOVE(request, list);
/* However it was obtained, the request is now accounted as in flight. */
112 QLIST_INSERT_HEAD(&dataplane->inflight, request, list);
113 dataplane->requests_inflight++;
/*
 * Unlink a request from the list it is currently on and drop the
 * in-flight count. Unlike xen_block_release_request(), this does not put
 * the request on the freelist.
 *
 * NOTE(review): xen_block_complete_aio() calls this and afterwards calls
 * xen_block_release_request() on the same request. Both perform
 * QLIST_REMOVE() and decrement requests_inflight, which looks like a
 * double unlink and a double decrement -- confirm against the full
 * source.
 */
119 static void xen_block_finish_request(XenBlockRequest *request)
121 XenBlockDataPlane *dataplane = request->dataplane;
123 QLIST_REMOVE(request, list);
124 dataplane->requests_inflight--;
/*
 * Return a request to the dataplane's freelist for later re-use.
 *
 * reset_request() clears request->dataplane, so the pointer is saved
 * first and restored afterwards.
 *
 * NOTE(review): the unlink and the requests_inflight decrement happen
 * unconditionally, including when the caller has already gone through
 * xen_block_finish_request() (as xen_block_complete_aio() does) --
 * confirm that the request is not unlinked and counted down twice.
 */
127 static void xen_block_release_request(XenBlockRequest *request)
129 XenBlockDataPlane *dataplane = request->dataplane;
131 QLIST_REMOVE(request, list);
132 reset_request(request);
/* reset_request() wiped the back-pointer; put it back. */
133 request->dataplane = dataplane;
134 QLIST_INSERT_HEAD(&dataplane->freelist, request, list);
135 dataplane->requests_inflight--;
139 * translate request into iovec + start offset
140 * do sanity checks along the way
142 static int xen_block_parse_request(XenBlockRequest *request)
/*
 * Checks the operation type, rejects anything but a read on a read-only
 * backend, then walks the segment list accumulating request->size and
 * finally checks the resulting byte range against the backing file size.
 * Each rejected case is reported with error_report().
 *
 * The return statements are outside the visible lines; the caller in
 * xen_block_handle_requests() treats a non-zero result as failure.
 * request->status is set to BLKIF_RSP_ERROR near the end of the function,
 * presumably on the shared failure path -- confirm.
 */
144 XenBlockDataPlane *dataplane = request->dataplane;
148 switch (request->req.operation) {
151 case BLKIF_OP_FLUSH_DISKCACHE:
/* Ask for a flush to be issued before the request's own I/O. */
152 request->presync = 1;
153 if (!request->req.nr_segments) {
159 case BLKIF_OP_DISCARD:
162 error_report("error: unknown operation (%d)", request->req.operation);
166 if (request->req.operation != BLKIF_OP_READ &&
167 blk_is_read_only(dataplane->blk)) {
168 error_report("error: write req for ro device");
/* Starting byte offset: sector number scaled by the backend block size. */
172 request->start = request->req.sector_number * dataplane->file_blk;
173 for (i = 0; i < request->req.nr_segments; i++) {
174 if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
175 error_report("error: nr_segments too big");
178 if (request->req.seg[i].first_sect > request->req.seg[i].last_sect) {
179 error_report("error: first > last sector");
182 if (request->req.seg[i].last_sect * dataplane->file_blk >=
184 error_report("error: page crossing");
/* Segment length in bytes; the sector range is inclusive at both ends. */
188 len = (request->req.seg[i].last_sect -
189 request->req.seg[i].first_sect + 1) * dataplane->file_blk;
190 request->size += len;
/* The whole request has to lie within the backing file. */
192 if (request->start + request->size > dataplane->file_size) {
193 error_report("error: access beyond end of file");
199 request->status = BLKIF_RSP_ERROR;
/*
 * Copy a request's data between request->buf and the guest pages named by
 * the request's grant references, using one grant-copy segment per
 * request segment.
 *
 * to_domain is true for BLKIF_OP_READ. Two sets of segment assignments
 * are visible: one with the grant reference as destination and the buffer
 * as source, one the other way round. The branch choosing between them is
 * outside the visible lines, presumably keyed on to_domain, which is also
 * passed to xen_device_copy_grant_refs().
 *
 * A copy failure is reported and counted in request->aio_errors; the
 * caller in xen_block_do_aio() treats a non-zero result as failure.
 */
203 static int xen_block_copy_request(XenBlockRequest *request)
205 XenBlockDataPlane *dataplane = request->dataplane;
206 XenDevice *xendev = dataplane->xendev;
207 XenDeviceGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
209 int64_t file_blk = dataplane->file_blk;
210 bool to_domain = (request->req.operation == BLKIF_OP_READ);
211 void *virt = request->buf;
212 Error *local_err = NULL;
214 if (request->req.nr_segments == 0) {
218 count = request->req.nr_segments;
220 for (i = 0; i < count; i++) {
222 segs[i].dest.foreign.ref = request->req.seg[i].gref;
223 segs[i].dest.foreign.offset = request->req.seg[i].first_sect *
225 segs[i].source.virt = virt;
227 segs[i].source.foreign.ref = request->req.seg[i].gref;
228 segs[i].source.foreign.offset = request->req.seg[i].first_sect *
230 segs[i].dest.virt = virt;
/* Segment length in bytes; the sector range is inclusive at both ends. */
232 segs[i].len = (request->req.seg[i].last_sect -
233 request->req.seg[i].first_sect + 1) * file_blk;
/* All segments are handed over in a single call. */
237 xen_device_copy_grant_refs(xendev, to_domain, segs, count, &local_err);
240 error_reportf_err(local_err, "failed to copy data: ");
242 request->aio_errors++;
/*
 * Forward declarations: xen_block_complete_aio() below calls both of
 * these before their definitions appear.
 */
249 static int xen_block_do_aio(XenBlockRequest *request);
250 static int xen_block_send_response(XenBlockRequest *request);
/*
 * AIO completion callback; it is passed to the blk_aio_*() calls with the
 * request as the opaque pointer, and is also called directly from
 * xen_block_do_aio().
 *
 * With the dataplane's AioContext acquired it:
 *  - records an I/O error in request->aio_errors,
 *  - decrements request->aio_inflight,
 *  - if a pre-sync flush was pending, clears the flag and re-enters
 *    xen_block_do_aio() to submit the request's own I/O,
 *  - otherwise derives the response status from aio_errors, updates the
 *    block accounting, sends the response (notifying the event channel
 *    when xen_block_send_response() returns non-zero), recycles the
 *    request and schedules the bottom half.
 *
 * The bodies of the presync and "aio_inflight > 0" branches are partly
 * outside the visible lines; presumably both skip the completion steps --
 * confirm.
 *
 * NOTE(review): the request goes through xen_block_finish_request() and,
 * further down, xen_block_release_request(). Both unlink it with
 * QLIST_REMOVE() and decrement requests_inflight, which looks like a
 * double unlink and a double decrement -- confirm against the full
 * source.
 */
252 static void xen_block_complete_aio(void *opaque, int ret)
254 XenBlockRequest *request = opaque;
255 XenBlockDataPlane *dataplane = request->dataplane;
257 aio_context_acquire(dataplane->ctx);
260 error_report("%s I/O error",
261 request->req.operation == BLKIF_OP_READ ?
263 request->aio_errors++;
/* One fewer AIO outstanding for this request. */
266 request->aio_inflight--;
/* The pre-sync flush is done: clear the flag and submit the real I/O. */
267 if (request->presync) {
268 request->presync = 0;
269 xen_block_do_aio(request);
272 if (request->aio_inflight > 0) {
276 switch (request->req.operation) {
278 /* in case of failure request->aio_errors is increased */
280 xen_block_copy_request(request);
284 case BLKIF_OP_FLUSH_DISKCACHE:
285 if (!request->req.nr_segments) {
/* Any error recorded along the way fails the whole request. */
293 request->status = request->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
294 xen_block_finish_request(request);
296 switch (request->req.operation) {
298 case BLKIF_OP_FLUSH_DISKCACHE:
299 if (!request->req.nr_segments) {
303 if (request->status == BLKIF_RSP_OKAY) {
304 block_acct_done(blk_get_stats(dataplane->blk), &request->acct);
306 block_acct_failed(blk_get_stats(dataplane->blk), &request->acct);
309 case BLKIF_OP_DISCARD:
/* A non-zero result means the frontend has to be notified. */
313 if (xen_block_send_response(request)) {
314 Error *local_err = NULL;
316 xen_device_notify_event_channel(dataplane->xendev,
317 dataplane->event_channel,
320 error_report_err(local_err);
323 xen_block_release_request(request);
/* Let the bottom half look at the ring again. */
325 qemu_bh_schedule(dataplane->bh);
328 aio_context_release(dataplane->ctx);
/*
 * Submit a discard as one or more blk_aio_pdiscard() calls, each covering
 * at most BDRV_REQUEST_MAX_SECTORS * file_blk bytes.
 *
 * Every chunk increments request->aio_inflight and completes through
 * xen_block_complete_aio(). Sector ranges that wrap around, or whose end
 * exceeds INT64_MAX / file_blk, are rejected up front.
 *
 * The return statements are outside the visible lines; the caller in
 * xen_block_do_aio() treats a false result as failure.
 */
331 static bool xen_block_split_discard(XenBlockRequest *request,
332 blkif_sector_t sector_number,
335 XenBlockDataPlane *dataplane = request->dataplane;
338 uint64_t byte_remaining, limit;
339 uint64_t sec_start = sector_number;
340 uint64_t sec_count = nr_sectors;
342 /* Wrap around, or overflowing byte limit? */
343 if (sec_start + sec_count < sec_count ||
344 sec_start + sec_count > INT64_MAX / dataplane->file_blk) {
/* Largest chunk, in bytes, handed to a single discard call. */
348 limit = BDRV_REQUEST_MAX_SECTORS * dataplane->file_blk;
349 byte_offset = sec_start * dataplane->file_blk;
350 byte_remaining = sec_count * dataplane->file_blk;
353 byte_chunk = byte_remaining > limit ? limit : byte_remaining;
354 request->aio_inflight++;
355 blk_aio_pdiscard(dataplane->blk, byte_offset, byte_chunk,
356 xen_block_complete_aio, request);
357 byte_remaining -= byte_chunk;
358 byte_offset += byte_chunk;
/* The condition is tested after the body, so one discard is always sent. */
359 } while (byte_remaining > 0);
/*
 * Submit the block-layer I/O for a request.
 *
 * For a WRITE or FLUSH_DISKCACHE with segments, the guest data is first
 * copied in with xen_block_copy_request(). If request->presync is set,
 * only a flush is submitted here; xen_block_complete_aio() clears the
 * flag and calls this function again for the request's own I/O.
 * Otherwise a read, write or (split) discard is submitted according to
 * the operation, each with xen_block_complete_aio() as its completion
 * callback.
 *
 * request->aio_inflight is incremented once before anything is
 * submitted, and xen_block_complete_aio() is called directly at the end
 * of the normal path, which decrements it again.
 *
 * NOTE(review): the last two visible lines unlink the request with
 * xen_block_finish_request() and set BLKIF_RSP_ERROR; they appear to be
 * the failure path. No response is sent and the request is not put back
 * on the freelist in any visible line, and the visible callers ignore
 * this function's return value. That looks like a leaked request and a
 * guest request that never gets answered -- confirm against the full
 * source.
 */
364 static int xen_block_do_aio(XenBlockRequest *request)
366 XenBlockDataPlane *dataplane = request->dataplane;
368 if (request->req.nr_segments &&
369 (request->req.operation == BLKIF_OP_WRITE ||
370 request->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
371 xen_block_copy_request(request)) {
/* Count taken up front; balanced by the direct completion call below. */
375 request->aio_inflight++;
376 if (request->presync) {
377 blk_aio_flush(request->dataplane->blk, xen_block_complete_aio,
382 switch (request->req.operation) {
384 qemu_iovec_add(&request->v, request->buf, request->size);
385 block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
386 request->v.size, BLOCK_ACCT_READ);
387 request->aio_inflight++;
388 blk_aio_preadv(dataplane->blk, request->start, &request->v, 0,
389 xen_block_complete_aio, request);
392 case BLKIF_OP_FLUSH_DISKCACHE:
393 if (!request->req.nr_segments) {
397 qemu_iovec_add(&request->v, request->buf, request->size);
398 block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
400 request->req.operation == BLKIF_OP_WRITE ?
401 BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
402 request->aio_inflight++;
403 blk_aio_pwritev(dataplane->blk, request->start, &request->v, 0,
404 xen_block_complete_aio, request);
406 case BLKIF_OP_DISCARD:
408 struct blkif_request_discard *req = (void *)&request->req;
409 if (!xen_block_split_discard(request, req->sector_number,
416 /* unknown operation (shouldn't happen -- parse catches this) */
/* Balance the aio_inflight count taken before submission. */
420 xen_block_complete_aio(request, 0);
425 xen_block_finish_request(request);
426 request->status = BLKIF_RSP_ERROR;
/*
 * Write the response for a request into the next free slot of the
 * response ring for the selected protocol, push it, and check whether
 * further requests are waiting on the ring (bumping
 * dataplane->more_work if so).
 *
 * The return statement is outside the visible lines; both visible
 * callers notify the event channel when the result is non-zero.
 */
430 static int xen_block_send_response(XenBlockRequest *request)
432 XenBlockDataPlane *dataplane = request->dataplane;
434 int have_requests = 0;
435 blkif_response_t *resp;
437 /* Place on the response ring for the relevant domain. */
438 switch (dataplane->protocol) {
439 case BLKIF_PROTOCOL_NATIVE:
440 resp = (blkif_response_t *)RING_GET_RESPONSE(
441 &dataplane->rings.native,
442 dataplane->rings.native.rsp_prod_pvt);
444 case BLKIF_PROTOCOL_X86_32:
445 resp = (blkif_response_t *)RING_GET_RESPONSE(
446 &dataplane->rings.x86_32_part,
447 dataplane->rings.x86_32_part.rsp_prod_pvt);
449 case BLKIF_PROTOCOL_X86_64:
450 resp = (blkif_response_t *)RING_GET_RESPONSE(
451 &dataplane->rings.x86_64_part,
452 dataplane->rings.x86_64_part.rsp_prod_pvt);
/* Fill in the response from the request it answers. */
458 resp->id = request->req.id;
459 resp->operation = request->req.operation;
460 resp->status = request->status;
/* Advance the private producer index before pushing. */
462 dataplane->rings.common.rsp_prod_pvt++;
464 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&dataplane->rings.common,
466 if (dataplane->rings.common.rsp_prod_pvt ==
467 dataplane->rings.common.req_cons) {
469 * Tail check for pending requests. Allows frontend to avoid
470 * notifications if requests are already in flight (lower
471 * overheads and promotes batching).
473 RING_FINAL_CHECK_FOR_REQUESTS(&dataplane->rings.common,
475 } else if (RING_HAS_UNCONSUMED_REQUESTS(&dataplane->rings.common)) {
480 dataplane->more_work++;
/*
 * Copy the request at ring index rc off the shared ring into
 * request->req.
 *
 * The native layout is copied with memcpy(); the x86_32 and x86_64
 * layouts go through blkif_get_x86_32_req() / blkif_get_x86_64_req().
 * The return statement is outside the visible lines, and the visible
 * caller ignores the result.
 */
485 static int xen_block_get_request(XenBlockDataPlane *dataplane,
486 XenBlockRequest *request, RING_IDX rc)
488 switch (dataplane->protocol) {
489 case BLKIF_PROTOCOL_NATIVE: {
490 blkif_request_t *req =
491 RING_GET_REQUEST(&dataplane->rings.native, rc);
493 memcpy(&request->req, req, sizeof(request->req));
496 case BLKIF_PROTOCOL_X86_32: {
497 blkif_x86_32_request_t *req =
498 RING_GET_REQUEST(&dataplane->rings.x86_32_part, rc);
500 blkif_get_x86_32_req(&request->req, req);
503 case BLKIF_PROTOCOL_X86_64: {
504 blkif_x86_64_request_t *req =
505 RING_GET_REQUEST(&dataplane->rings.x86_64_part, rc);
507 blkif_get_x86_64_req(&request->req, req);
511 /* Prevent the compiler from accessing the on-ring fields instead. */
517 * Threshold of in-flight requests above which we will start using
518 * blk_io_plug()/blk_io_unplug() to batch requests.
520 #define IO_PLUG_THRESHOLD 1
/*
 * Pull requests off the shared ring and start them.
 *
 * For each request: obtain a request object, copy the ring entry into it,
 * advance req_cons, then parse it. A request that fails parsing is
 * accounted as invalid, answered immediately and released; otherwise it
 * is submitted with xen_block_do_aio().
 *
 * When more than IO_PLUG_THRESHOLD requests were already in flight on
 * entry, submissions are bracketed with blk_io_plug()/blk_io_unplug().
 * The loop statement and the updates of 'batched' are outside the
 * visible lines.
 *
 * If more work was flagged and the in-flight count is below max_requests,
 * the bottom half is rescheduled at the end.
 */
522 static void xen_block_handle_requests(XenBlockDataPlane *dataplane)
525 XenBlockRequest *request;
526 int inflight_atstart = dataplane->requests_inflight;
529 dataplane->more_work = 0;
531 rc = dataplane->rings.common.req_cons;
532 rp = dataplane->rings.common.sring->req_prod;
533 xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
536 * If there was more than IO_PLUG_THRESHOLD requests in flight
537 * when we got here, this is an indication that there the bottleneck
538 * is below us, so it's worth beginning to batch up I/O requests
539 * rather than submitting them immediately. The maximum number
540 * of requests we're willing to batch is the number already in
541 * flight, so it can grow up to max_requests when the bottleneck
544 if (inflight_atstart > IO_PLUG_THRESHOLD) {
545 blk_io_plug(dataplane->blk);
548 /* pull request from ring */
549 if (RING_REQUEST_CONS_OVERFLOW(&dataplane->rings.common, rc)) {
/* NULL means no request object is available; flag it and retry later. */
552 request = xen_block_start_request(dataplane);
553 if (request == NULL) {
554 dataplane->more_work++;
557 xen_block_get_request(dataplane, request, rc);
/* The ring slot has been copied out, so it can be marked consumed. */
558 dataplane->rings.common.req_cons = ++rc;
561 if (xen_block_parse_request(request) != 0) {
562 switch (request->req.operation) {
564 block_acct_invalid(blk_get_stats(dataplane->blk),
568 block_acct_invalid(blk_get_stats(dataplane->blk),
571 case BLKIF_OP_FLUSH_DISKCACHE:
572 block_acct_invalid(blk_get_stats(dataplane->blk),
578 if (xen_block_send_response(request)) {
579 Error *local_err = NULL;
581 xen_device_notify_event_channel(dataplane->xendev,
582 dataplane->event_channel,
585 error_report_err(local_err);
588 xen_block_release_request(request);
592 if (inflight_atstart > IO_PLUG_THRESHOLD &&
593 batched >= inflight_atstart) {
594 blk_io_unplug(dataplane->blk);
/* The result of xen_block_do_aio() is not checked here. */
596 xen_block_do_aio(request);
597 if (inflight_atstart > IO_PLUG_THRESHOLD) {
598 if (batched >= inflight_atstart) {
599 blk_io_plug(dataplane->blk);
606 if (inflight_atstart > IO_PLUG_THRESHOLD) {
607 blk_io_unplug(dataplane->blk);
610 if (dataplane->more_work &&
611 dataplane->requests_inflight < dataplane->max_requests) {
612 qemu_bh_schedule(dataplane->bh);
/*
 * Bottom-half handler (registered with aio_bh_new() in
 * xen_block_dataplane_create()): processes the ring while holding the
 * dataplane's AioContext.
 */
616 static void xen_block_dataplane_bh(void *opaque)
618 XenBlockDataPlane *dataplane = opaque;
620 aio_context_acquire(dataplane->ctx);
621 xen_block_handle_requests(dataplane);
622 aio_context_release(dataplane->ctx);
/*
 * Event channel callback (passed to xen_device_bind_event_channel() in
 * xen_block_dataplane_start()). It does no ring processing itself; it
 * only schedules the bottom half.
 */
625 static void xen_block_dataplane_event(void *opaque)
627 XenBlockDataPlane *dataplane = opaque;
629 qemu_bh_schedule(dataplane->bh);
/*
 * Allocate and initialise a dataplane for a Xen block device.
 *
 * Records the device, the backend and its logical block size and length,
 * initialises the inflight list and the freelist, selects the AioContext
 * and creates the bottom half that runs xen_block_dataplane_bh().
 *
 * Two AioContext assignments are visible: the iothread's context (taking
 * a reference on the iothread) and the one from qemu_get_aio_context().
 * The condition choosing between them is outside the visible lines,
 * presumably whether an iothread was supplied.
 *
 * NOTE(review): the result of blk_getlength() is stored unchecked and is
 * later used as the upper bound in xen_block_parse_request(); confirm
 * whether a failure value needs handling here.
 */
632 XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
636 XenBlockDataPlane *dataplane = g_new0(XenBlockDataPlane, 1);
638 dataplane->xendev = xendev;
639 dataplane->file_blk = conf->logical_block_size;
640 dataplane->blk = conf->blk;
641 dataplane->file_size = blk_getlength(dataplane->blk);
643 QLIST_INIT(&dataplane->inflight);
644 QLIST_INIT(&dataplane->freelist);
647 dataplane->iothread = iothread;
648 object_ref(OBJECT(dataplane->iothread));
649 dataplane->ctx = iothread_get_aio_context(dataplane->iothread);
651 dataplane->ctx = qemu_get_aio_context();
653 dataplane->bh = aio_bh_new(dataplane->ctx, xen_block_dataplane_bh,
/*
 * Tear down a dataplane.
 *
 * Drains the freelist, destroying each request's iovec and freeing its
 * data buffer, deletes the bottom half and drops the iothread reference
 * if one is held. Only the freelist is walked in the visible lines;
 * requests still on the inflight list are not visited here.
 */
659 void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane)
661 XenBlockRequest *request;
667 while (!QLIST_EMPTY(&dataplane->freelist)) {
668 request = QLIST_FIRST(&dataplane->freelist);
669 QLIST_REMOVE(request, list);
670 qemu_iovec_destroy(&request->v);
671 qemu_vfree(request->buf);
675 qemu_bh_delete(dataplane->bh);
676 if (dataplane->iothread) {
677 object_unref(OBJECT(dataplane->iothread));
/*
 * Stop a running dataplane and undo what xen_block_dataplane_start() set
 * up.
 *
 * Moves the backend to the context returned by qemu_get_aio_context(),
 * unbinds the event channel and unmaps the shared ring if they are set,
 * and frees the private copy of the grant references. Each pointer is
 * cleared after it is torn down. Unbind and unmap failures are only
 * reported, not returned to the caller.
 */
683 void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
691 aio_context_acquire(dataplane->ctx);
692 blk_set_aio_context(dataplane->blk, qemu_get_aio_context());
693 aio_context_release(dataplane->ctx);
695 xendev = dataplane->xendev;
697 if (dataplane->event_channel) {
698 Error *local_err = NULL;
700 xen_device_unbind_event_channel(xendev, dataplane->event_channel,
702 dataplane->event_channel = NULL;
705 error_report_err(local_err);
709 if (dataplane->sring) {
710 Error *local_err = NULL;
712 xen_device_unmap_grant_refs(xendev, dataplane->sring,
713 dataplane->nr_ring_ref, &local_err);
714 dataplane->sring = NULL;
717 error_report_err(local_err);
721 g_free(dataplane->ring_ref);
722 dataplane->ring_ref = NULL;
/*
 * Connect the dataplane to the frontend's shared ring and start
 * servicing it.
 *
 * Steps, in order: take a private copy of the ring grant references,
 * derive max_requests from the ring size for the given protocol, set the
 * grant reference limit, map the ring, initialise the back ring for the
 * protocol, bind the event channel to xen_block_dataplane_event(), and
 * move the backend to the dataplane's AioContext.
 *
 * Errors are reported through errp (that parameter is declared on a line
 * outside this view). The final xen_block_dataplane_stop() call appears
 * to be the shared cleanup path for failures after the protocol check;
 * the jumps to it are outside the visible lines.
 */
725 void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
726 const unsigned int ring_ref[],
727 unsigned int nr_ring_ref,
728 unsigned int event_channel,
729 unsigned int protocol,
732 XenDevice *xendev = dataplane->xendev;
733 Error *local_err = NULL;
734 unsigned int ring_size;
/* Keep a private copy of the grant references; freed again in _stop(). */
737 dataplane->nr_ring_ref = nr_ring_ref;
738 dataplane->ring_ref = g_new(unsigned int, nr_ring_ref);
740 for (i = 0; i < nr_ring_ref; i++) {
741 dataplane->ring_ref[i] = ring_ref[i];
744 dataplane->protocol = protocol;
/* One page per grant reference. */
746 ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
747 switch (dataplane->protocol) {
748 case BLKIF_PROTOCOL_NATIVE:
750 dataplane->max_requests = __CONST_RING_SIZE(blkif, ring_size);
753 case BLKIF_PROTOCOL_X86_32:
755 dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
758 case BLKIF_PROTOCOL_X86_64:
760 dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
764 error_setg(errp, "unknown protocol %u", dataplane->protocol);
768 xen_device_set_max_grant_refs(xendev, dataplane->nr_ring_ref,
771 error_propagate(errp, local_err);
775 dataplane->sring = xen_device_map_grant_refs(xendev,
777 dataplane->nr_ring_ref,
778 PROT_READ | PROT_WRITE,
781 error_propagate(errp, local_err);
/* Initialise the back ring that matches the frontend's protocol. */
785 switch (dataplane->protocol) {
786 case BLKIF_PROTOCOL_NATIVE:
788 blkif_sring_t *sring_native = dataplane->sring;
790 BACK_RING_INIT(&dataplane->rings.native, sring_native, ring_size);
793 case BLKIF_PROTOCOL_X86_32:
795 blkif_x86_32_sring_t *sring_x86_32 = dataplane->sring;
797 BACK_RING_INIT(&dataplane->rings.x86_32_part, sring_x86_32,
801 case BLKIF_PROTOCOL_X86_64:
803 blkif_x86_64_sring_t *sring_x86_64 = dataplane->sring;
805 BACK_RING_INIT(&dataplane->rings.x86_64_part, sring_x86_64,
811 dataplane->event_channel =
812 xen_device_bind_event_channel(xendev, event_channel,
813 xen_block_dataplane_event, dataplane,
816 error_propagate(errp, local_err);
/* From here on the backend's I/O runs in the dataplane's context. */
820 aio_context_acquire(dataplane->ctx);
821 blk_set_aio_context(dataplane->blk, dataplane->ctx);
822 aio_context_release(dataplane->ctx);
826 xen_block_dataplane_stop(dataplane);