2 * Copyright (c) 2018 Citrix Systems Inc.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; under version 2 of the License.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, see <http://www.gnu.org/licenses/>.
17 * Contributions after 2012-01-13 are licensed under the terms of the
18 * GNU GPL, version 2 or (at your option) any later version.
21 #include "qemu/osdep.h"
22 #include "qemu/error-report.h"
23 #include "qemu/main-loop.h"
24 #include "qapi/error.h"
25 #include "hw/xen/xen_common.h"
26 #include "hw/block/xen_blkif.h"
27 #include "sysemu/block-backend.h"
28 #include "sysemu/iothread.h"
29 #include "xen-block.h"
/*
 * State for a single block request copied off the shared ring.
 *
 * Only the back-pointer and the list linkage are visible in this excerpt;
 * the rest of this file also uses members named req, status, start, size,
 * presync, aio_inflight, aio_errors, v, buf and acct.
 */
31 typedef struct XenBlockRequest {
/* Owning dataplane; re-set by xen_block_release_request() after a reset. */
41 XenBlockDataPlane *dataplane;
/* Linkage used for the dataplane's inflight and freelist lists. */
42 QLIST_ENTRY(XenBlockRequest) list;
/*
 * Per-device dataplane state: ring bookkeeping, request lists and counters.
 *
 * Members referenced elsewhere in this file but not visible in this excerpt
 * include xendev, sring, protocol, more_work, requests_total, blk, bh,
 * iothread and ctx.
 */
46 struct XenBlockDataPlane {
48 XenEventChannel *event_channel;
/* Private copy of the grant references given to xen_block_dataplane_start(). */
49 unsigned int *ring_ref;
50 unsigned int nr_ring_ref;
/* Back-ring views of the shared ring (native, x86_32 and x86_64 layouts). */
53 blkif_back_rings_t rings;
/* Requests currently being processed, and idle requests kept for re-use. */
55 QLIST_HEAD(inflight_head, XenBlockRequest) inflight;
56 QLIST_HEAD(freelist_head, XenBlockRequest) freelist;
58 int requests_inflight;
/* Cap on allocated requests; derived from the ring size at start. */
59 unsigned int max_requests;
/* Bytes per sector; used to convert sector numbers to byte offsets. */
61 unsigned int sector_size;
/*
 * Clear the per-use state of @request so the structure can be recycled.
 *
 * The dataplane back-pointer is cleared as well, so a caller that keeps the
 * request must set it again. The I/O vector is reset rather than destroyed.
 */
67 static void reset_request(XenBlockRequest *request)
69 memset(&request->req, 0, sizeof(request->req));
75 request->aio_inflight = 0;
76 request->aio_errors = 0;
78 request->dataplane = NULL;
79 memset(&request->list, 0, sizeof(request->list));
80 memset(&request->acct, 0, sizeof(request->acct));
/* Empties the vector so the next use can add its buffer again. */
82 qemu_iovec_reset(&request->v);
/*
 * Obtain a request structure and put it on the dataplane's inflight list.
 *
 * A request is taken from the freelist when one is available. Otherwise a
 * new one is allocated together with a page-aligned data buffer, provided
 * requests_total is still below max_requests. requests_inflight is
 * incremented for the request that is handed out.
 *
 * NOTE(review): the body of the limit-reached branch is not visible in this
 * excerpt; 'request' starts out NULL and the caller checks for NULL, so
 * presumably NULL is returned in that case -- confirm.
 */
85 static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane)
87 XenBlockRequest *request = NULL;
89 if (QLIST_EMPTY(&dataplane->freelist)) {
90 if (dataplane->requests_total >= dataplane->max_requests) {
93 /* allocate new struct */
94 request = g_malloc0(sizeof(*request));
95 request->dataplane = dataplane;
97 * We cannot need more pages per requests than this, and since we
98 * re-use requests, allocate the memory once here. It will be freed
99 * xen_block_dataplane_destroy() when the request list is freed.
101 request->buf = qemu_memalign(XC_PAGE_SIZE,
102 BLKIF_MAX_SEGMENTS_PER_REQUEST *
104 dataplane->requests_total++;
105 qemu_iovec_init(&request->v, 1);
107 /* get one from freelist */
108 request = QLIST_FIRST(&dataplane->freelist);
109 QLIST_REMOVE(request, list);
/* Both paths end here: the request is now tracked as in flight. */
111 QLIST_INSERT_HEAD(&dataplane->inflight, request, list);
112 dataplane->requests_inflight++;
/*
 * Unlink @request from the list it is on and drop the in-flight count.
 *
 * The request is not placed on the freelist here. Callers must not pair
 * this with xen_block_release_request() on the same request, because that
 * function repeats both the unlink and the decrement.
 */
118 static void xen_block_finish_request(XenBlockRequest *request)
120 XenBlockDataPlane *dataplane = request->dataplane;
122 QLIST_REMOVE(request, list);
123 dataplane->requests_inflight--;
/*
 * Retire @request: unlink it, wipe its state and park it on the freelist.
 *
 * The dataplane pointer is saved first because reset_request() clears it.
 * requests_inflight is decremented, so the request is expected to be
 * counted as in flight when this is called.
 */
126 static void xen_block_release_request(XenBlockRequest *request)
128 XenBlockDataPlane *dataplane = request->dataplane;
130 QLIST_REMOVE(request, list);
131 reset_request(request);
/* reset_request() set this to NULL; restore it for the next user. */
132 request->dataplane = dataplane;
133 QLIST_INSERT_HEAD(&dataplane->freelist, request, list);
134 dataplane->requests_inflight--;
138 * translate request into iovec + start offset
139 * do sanity checks along the way
/*
 * Validate the ring request stored in @request and derive the byte offset
 * (request->start) and total length (request->size) of the transfer.
 *
 * Checks visible here: known operation, no non-read operation on a
 * read-only backend, segment count within BLKIF_MAX_SEGMENTS_PER_REQUEST,
 * first_sect <= last_sect, a per-segment page-crossing check, and the
 * transfer ending within blk_getlength().
 *
 * request->status is set to BLKIF_RSP_ERROR on the failure path.
 * NOTE(review): the return statements are not visible in this excerpt; the
 * caller treats a non-zero result as failure.
 */
141 static int xen_block_parse_request(XenBlockRequest *request)
143 XenBlockDataPlane *dataplane = request->dataplane;
147 switch (request->req.operation) {
150 case BLKIF_OP_FLUSH_DISKCACHE:
/* Ask for a flush to be issued before any data is written. */
151 request->presync = 1;
152 if (!request->req.nr_segments) {
158 case BLKIF_OP_DISCARD:
161 error_report("error: unknown operation (%d)", request->req.operation);
165 if (request->req.operation != BLKIF_OP_READ &&
166 blk_is_read_only(dataplane->blk)) {
167 error_report("error: write req for ro device");
/* Sector number -> byte offset on the backend. */
171 request->start = request->req.sector_number * dataplane->sector_size;
172 for (i = 0; i < request->req.nr_segments; i++) {
173 if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
174 error_report("error: nr_segments too big");
177 if (request->req.seg[i].first_sect > request->req.seg[i].last_sect) {
178 error_report("error: first > last sector");
181 if (request->req.seg[i].last_sect * dataplane->sector_size >=
183 error_report("error: page crossing");
/* Segment length in bytes; the sector range is inclusive, hence the +1. */
187 len = (request->req.seg[i].last_sect -
188 request->req.seg[i].first_sect + 1) * dataplane->sector_size;
189 request->size += len;
191 if (request->start + request->size > blk_getlength(dataplane->blk)) {
192 error_report("error: access beyond end of file");
198 request->status = BLKIF_RSP_ERROR;
/*
 * Grant-copy the request's data between the guest's granted pages and the
 * local bounce buffer (request->buf).
 *
 * For BLKIF_OP_READ the copy goes to the guest (to_domain is true); for
 * other operations it comes from the guest. One copy segment is built per
 * ring segment, using the segment's gref, first_sect offset and inclusive
 * sector range.
 *
 * A failed copy is reported and counted in request->aio_errors.
 * NOTE(review): the return statements are not visible in this excerpt;
 * xen_block_do_aio() treats a non-zero result as failure.
 */
202 static int xen_block_copy_request(XenBlockRequest *request)
204 XenBlockDataPlane *dataplane = request->dataplane;
205 XenDevice *xendev = dataplane->xendev;
206 XenDeviceGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
208 bool to_domain = (request->req.operation == BLKIF_OP_READ);
209 void *virt = request->buf;
210 Error *local_err = NULL;
/* Nothing to copy for a request without data segments. */
212 if (request->req.nr_segments == 0) {
216 count = request->req.nr_segments;
218 for (i = 0; i < count; i++) {
/* Towards the guest: local buffer is the source, grant is the target. */
220 segs[i].dest.foreign.ref = request->req.seg[i].gref;
221 segs[i].dest.foreign.offset = request->req.seg[i].first_sect *
222 dataplane->sector_size;
223 segs[i].source.virt = virt;
/* From the guest: grant is the source, local buffer is the target. */
225 segs[i].source.foreign.ref = request->req.seg[i].gref;
226 segs[i].source.foreign.offset = request->req.seg[i].first_sect *
227 dataplane->sector_size;
228 segs[i].dest.virt = virt;
230 segs[i].len = (request->req.seg[i].last_sect -
231 request->req.seg[i].first_sect + 1) *
232 dataplane->sector_size;
236 xen_device_copy_grant_refs(xendev, to_domain, segs, count, &local_err);
239 error_reportf_err(local_err, "failed to copy data: ");
241 request->aio_errors++;
/*
 * Forward declarations: xen_block_complete_aio() below calls both of these
 * before their definitions appear, and xen_block_do_aio() in turn calls
 * xen_block_complete_aio().
 */
248 static int xen_block_do_aio(XenBlockRequest *request);
249 static int xen_block_send_response(XenBlockRequest *request);
251 static void xen_block_complete_aio(void *opaque, int ret)
253 XenBlockRequest *request = opaque;
254 XenBlockDataPlane *dataplane = request->dataplane;
256 aio_context_acquire(dataplane->ctx);
259 error_report("%s I/O error",
260 request->req.operation == BLKIF_OP_READ ?
262 request->aio_errors++;
265 request->aio_inflight--;
266 if (request->presync) {
267 request->presync = 0;
268 xen_block_do_aio(request);
271 if (request->aio_inflight > 0) {
275 switch (request->req.operation) {
277 /* in case of failure request->aio_errors is increased */
279 xen_block_copy_request(request);
283 case BLKIF_OP_FLUSH_DISKCACHE:
288 request->status = request->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
289 xen_block_finish_request(request);
291 switch (request->req.operation) {
293 case BLKIF_OP_FLUSH_DISKCACHE:
294 if (!request->req.nr_segments) {
299 if (request->status == BLKIF_RSP_OKAY) {
300 block_acct_done(blk_get_stats(dataplane->blk), &request->acct);
302 block_acct_failed(blk_get_stats(dataplane->blk), &request->acct);
305 case BLKIF_OP_DISCARD:
309 if (xen_block_send_response(request)) {
310 Error *local_err = NULL;
312 xen_device_notify_event_channel(dataplane->xendev,
313 dataplane->event_channel,
316 error_report_err(local_err);
319 xen_block_release_request(request);
321 if (dataplane->more_work) {
322 qemu_bh_schedule(dataplane->bh);
326 aio_context_release(dataplane->ctx);
/*
 * Issue a discard for the given sector range, split into chunks of at most
 * BDRV_REQUEST_MAX_BYTES.
 *
 * The range is first checked for wrap-around of sector_number + nr_sectors
 * and for exceeding INT64_MAX bytes. request->aio_inflight is incremented
 * once per chunk submitted, and each chunk completes through
 * xen_block_complete_aio().
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * xen_block_do_aio() treats a false result as failure.
 */
329 static bool xen_block_split_discard(XenBlockRequest *request,
330 blkif_sector_t sector_number,
333 XenBlockDataPlane *dataplane = request->dataplane;
336 uint64_t byte_remaining;
337 uint64_t sec_start = sector_number;
338 uint64_t sec_count = nr_sectors;
340 /* Wrap around, or overflowing byte limit? */
341 if (sec_start + sec_count < sec_count ||
342 sec_start + sec_count > INT64_MAX / dataplane->sector_size) {
346 byte_offset = sec_start * dataplane->sector_size;
347 byte_remaining = sec_count * dataplane->sector_size;
/* Clamp each submission to the largest size the block layer accepts. */
350 byte_chunk = byte_remaining > BDRV_REQUEST_MAX_BYTES ?
351 BDRV_REQUEST_MAX_BYTES : byte_remaining;
352 request->aio_inflight++;
353 blk_aio_pdiscard(dataplane->blk, byte_offset, byte_chunk,
354 xen_block_complete_aio, request);
355 byte_remaining -= byte_chunk;
356 byte_offset += byte_chunk;
357 } while (byte_remaining > 0);
362 static int xen_block_do_aio(XenBlockRequest *request)
364 XenBlockDataPlane *dataplane = request->dataplane;
366 if (request->req.nr_segments &&
367 (request->req.operation == BLKIF_OP_WRITE ||
368 request->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
369 xen_block_copy_request(request)) {
373 request->aio_inflight++;
374 if (request->presync) {
375 blk_aio_flush(request->dataplane->blk, xen_block_complete_aio,
380 switch (request->req.operation) {
382 qemu_iovec_add(&request->v, request->buf, request->size);
383 block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
384 request->v.size, BLOCK_ACCT_READ);
385 request->aio_inflight++;
386 blk_aio_preadv(dataplane->blk, request->start, &request->v, 0,
387 xen_block_complete_aio, request);
390 case BLKIF_OP_FLUSH_DISKCACHE:
391 if (!request->req.nr_segments) {
395 qemu_iovec_add(&request->v, request->buf, request->size);
396 block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
398 request->req.operation == BLKIF_OP_WRITE ?
399 BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
400 request->aio_inflight++;
401 blk_aio_pwritev(dataplane->blk, request->start, &request->v, 0,
402 xen_block_complete_aio, request);
404 case BLKIF_OP_DISCARD:
406 struct blkif_request_discard *req = (void *)&request->req;
407 if (!xen_block_split_discard(request, req->sector_number,
414 /* unknown operation (shouldn't happen -- parse catches this) */
418 xen_block_complete_aio(request, 0);
423 xen_block_finish_request(request);
424 request->status = BLKIF_RSP_ERROR;
/*
 * Write the response for @request into the next free response slot of the
 * ring (selected according to the negotiated protocol) and publish it.
 *
 * The response carries the request's id, operation and status. After
 * pushing, the ring is checked for further requests and
 * dataplane->more_work is incremented.
 *
 * NOTE(review): the condition guarding the more_work increment and the
 * return statement are not visible in this excerpt; callers notify the
 * event channel when the result is non-zero.
 */
428 static int xen_block_send_response(XenBlockRequest *request)
430 XenBlockDataPlane *dataplane = request->dataplane;
432 int have_requests = 0;
433 blkif_response_t *resp;
435 /* Place on the response ring for the relevant domain. */
436 switch (dataplane->protocol) {
437 case BLKIF_PROTOCOL_NATIVE:
438 resp = (blkif_response_t *)RING_GET_RESPONSE(
439 &dataplane->rings.native,
440 dataplane->rings.native.rsp_prod_pvt);
442 case BLKIF_PROTOCOL_X86_32:
443 resp = (blkif_response_t *)RING_GET_RESPONSE(
444 &dataplane->rings.x86_32_part,
445 dataplane->rings.x86_32_part.rsp_prod_pvt);
447 case BLKIF_PROTOCOL_X86_64:
448 resp = (blkif_response_t *)RING_GET_RESPONSE(
449 &dataplane->rings.x86_64_part,
450 dataplane->rings.x86_64_part.rsp_prod_pvt);
456 resp->id = request->req.id;
457 resp->operation = request->req.operation;
458 resp->status = request->status;
/* Advance the private producer index past the slot just filled. */
460 dataplane->rings.common.rsp_prod_pvt++;
462 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&dataplane->rings.common,
464 if (dataplane->rings.common.rsp_prod_pvt ==
465 dataplane->rings.common.req_cons) {
467 * Tail check for pending requests. Allows frontend to avoid
468 * notifications if requests are already in flight (lower
469 * overheads and promotes batching).
471 RING_FINAL_CHECK_FOR_REQUESTS(&dataplane->rings.common,
473 } else if (RING_HAS_UNCONSUMED_REQUESTS(&dataplane->rings.common)) {
478 dataplane->more_work++;
/*
 * Copy the ring entry at index @rc into request->req.
 *
 * The native layout is copied verbatim with memcpy(); the x86_32 and x86_64
 * layouts go through their blkif_get_*_req() conversion helpers. Working on
 * a private copy means later code does not read the shared ring again.
 */
483 static int xen_block_get_request(XenBlockDataPlane *dataplane,
484 XenBlockRequest *request, RING_IDX rc)
486 switch (dataplane->protocol) {
487 case BLKIF_PROTOCOL_NATIVE: {
488 blkif_request_t *req =
489 RING_GET_REQUEST(&dataplane->rings.native, rc);
491 memcpy(&request->req, req, sizeof(request->req));
494 case BLKIF_PROTOCOL_X86_32: {
495 blkif_x86_32_request_t *req =
496 RING_GET_REQUEST(&dataplane->rings.x86_32_part, rc);
498 blkif_get_x86_32_req(&request->req, req);
501 case BLKIF_PROTOCOL_X86_64: {
502 blkif_x86_64_request_t *req =
503 RING_GET_REQUEST(&dataplane->rings.x86_64_part, rc);
505 blkif_get_x86_64_req(&request->req, req);
509 /* Prevent the compiler from accessing the on-ring fields instead. */
515 * Threshold of in-flight requests above which we will start using
516 * blk_io_plug()/blk_io_unplug() to batch requests.
/*
 * The comparison in xen_block_handle_requests() is a strict '>', so with a
 * value of 1 batching starts when two or more requests were in flight.
 */
518 #define IO_PLUG_THRESHOLD 1
/*
 * Consume the requests the frontend has queued on the ring and start
 * processing them.
 *
 * For each ring entry a request structure is obtained, filled from the
 * ring and parsed. Requests that fail parsing are accounted as invalid,
 * answered immediately and released; valid requests are handed to
 * xen_block_do_aio(). If no request structure is available, more_work is
 * raised so processing can be resumed later.
 *
 * When more than IO_PLUG_THRESHOLD requests were already in flight on
 * entry, submissions are bracketed with blk_io_plug()/blk_io_unplug().
 *
 * Returns true if at least one request was taken off the ring.
 */
520 static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
523 XenBlockRequest *request;
/* Snapshot taken before any new request is started in this call. */
524 int inflight_atstart = dataplane->requests_inflight;
526 bool done_something = false;
528 dataplane->more_work = 0;
530 rc = dataplane->rings.common.req_cons;
531 rp = dataplane->rings.common.sring->req_prod;
532 xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
535 * If there was more than IO_PLUG_THRESHOLD requests in flight
536 * when we got here, this is an indication that there the bottleneck
537 * is below us, so it's worth beginning to batch up I/O requests
538 * rather than submitting them immediately. The maximum number
539 * of requests we're willing to batch is the number already in
540 * flight, so it can grow up to max_requests when the bottleneck
543 if (inflight_atstart > IO_PLUG_THRESHOLD) {
544 blk_io_plug(dataplane->blk);
547 /* pull request from ring */
548 if (RING_REQUEST_CONS_OVERFLOW(&dataplane->rings.common, rc)) {
551 request = xen_block_start_request(dataplane);
552 if (request == NULL) {
553 dataplane->more_work++;
556 xen_block_get_request(dataplane, request, rc);
/* Publish the new consumer index once the entry has been copied out. */
557 dataplane->rings.common.req_cons = ++rc;
558 done_something = true;
561 if (xen_block_parse_request(request) != 0) {
562 switch (request->req.operation) {
564 block_acct_invalid(blk_get_stats(dataplane->blk),
568 block_acct_invalid(blk_get_stats(dataplane->blk),
571 case BLKIF_OP_FLUSH_DISKCACHE:
572 block_acct_invalid(blk_get_stats(dataplane->blk),
578 if (xen_block_send_response(request)) {
579 Error *local_err = NULL;
581 xen_device_notify_event_channel(dataplane->xendev,
582 dataplane->event_channel,
585 error_report_err(local_err);
588 xen_block_release_request(request);
/* Batch is full: unplug so the queued I/O gets submitted. */
592 if (inflight_atstart > IO_PLUG_THRESHOLD &&
593 batched >= inflight_atstart) {
594 blk_io_unplug(dataplane->blk);
596 xen_block_do_aio(request);
597 if (inflight_atstart > IO_PLUG_THRESHOLD) {
598 if (batched >= inflight_atstart) {
599 blk_io_plug(dataplane->blk);
/* Matches the blk_io_plug() done before the loop or after the last batch. */
606 if (inflight_atstart > IO_PLUG_THRESHOLD) {
607 blk_io_unplug(dataplane->blk);
610 return done_something;
/*
 * Bottom-half handler: process pending ring requests while holding the
 * dataplane's AioContext. @opaque is the XenBlockDataPlane. The bottom half
 * is scheduled by xen_block_complete_aio() when more_work is set.
 */
613 static void xen_block_dataplane_bh(void *opaque)
615 XenBlockDataPlane *dataplane = opaque;
617 aio_context_acquire(dataplane->ctx);
618 xen_block_handle_requests(dataplane);
619 aio_context_release(dataplane->ctx);
/*
 * Event-channel handler registered in xen_block_dataplane_start().
 * @opaque is the XenBlockDataPlane. Returns whether any request was taken
 * off the ring, as reported by xen_block_handle_requests().
 */
622 static bool xen_block_dataplane_event(void *opaque)
624 XenBlockDataPlane *dataplane = opaque;
626 return xen_block_handle_requests(dataplane);
/*
 * Allocate and initialise a dataplane for @xendev.
 *
 * The zero-initialised structure records the device, block backend and
 * sector size, starts with empty inflight and free lists, and gets a bottom
 * half running xen_block_dataplane_bh() in the chosen AioContext.
 *
 * The AioContext is either that of the supplied iothread (on which a
 * reference is taken) or the main-loop context.
 * NOTE(review): the condition choosing between the two is not visible in
 * this excerpt; presumably it tests whether an iothread was supplied.
 */
629 XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
631 unsigned int sector_size,
634 XenBlockDataPlane *dataplane = g_new0(XenBlockDataPlane, 1);
636 dataplane->xendev = xendev;
637 dataplane->blk = blk;
638 dataplane->sector_size = sector_size;
640 QLIST_INIT(&dataplane->inflight);
641 QLIST_INIT(&dataplane->freelist);
644 dataplane->iothread = iothread;
/* Hold a reference; dropped again in xen_block_dataplane_destroy(). */
645 object_ref(OBJECT(dataplane->iothread));
646 dataplane->ctx = iothread_get_aio_context(dataplane->iothread);
648 dataplane->ctx = qemu_get_aio_context();
650 dataplane->bh = aio_bh_new(dataplane->ctx, xen_block_dataplane_bh,
/*
 * Tear down a dataplane created by xen_block_dataplane_create().
 *
 * Every request on the freelist has its I/O vector destroyed and its data
 * buffer freed; the bottom half is deleted and the iothread reference, if
 * any, is dropped. Only the freelist is walked in the visible code, so
 * requests still on the inflight list are not touched here.
 */
656 void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane)
658 XenBlockRequest *request;
664 while (!QLIST_EMPTY(&dataplane->freelist)) {
665 request = QLIST_FIRST(&dataplane->freelist);
666 QLIST_REMOVE(request, list);
667 qemu_iovec_destroy(&request->v);
/* Buffer came from qemu_memalign() in xen_block_start_request(). */
668 qemu_vfree(request->buf);
672 qemu_bh_delete(dataplane->bh);
673 if (dataplane->iothread) {
674 object_unref(OBJECT(dataplane->iothread));
/*
 * Disconnect the dataplane from the frontend.
 *
 * The block backend is moved back to the main-loop AioContext, the event
 * channel is unbound, the shared ring is unmapped and the private copy of
 * the ring references is freed. Each released resource's pointer is
 * cleared, and the event channel and ring are only released when set.
 * Errors from unbinding or unmapping are reported, not propagated.
 */
680 void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
688 aio_context_acquire(dataplane->ctx);
689 /* Xen doesn't have multiple users for nodes, so this can't fail */
690 blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), &error_abort);
691 aio_context_release(dataplane->ctx);
693 xendev = dataplane->xendev;
695 if (dataplane->event_channel) {
696 Error *local_err = NULL;
698 xen_device_unbind_event_channel(xendev, dataplane->event_channel,
700 dataplane->event_channel = NULL;
703 error_report_err(local_err);
707 if (dataplane->sring) {
708 Error *local_err = NULL;
710 xen_device_unmap_grant_refs(xendev, dataplane->sring,
711 dataplane->nr_ring_ref, &local_err);
712 dataplane->sring = NULL;
715 error_report_err(local_err);
719 g_free(dataplane->ring_ref);
720 dataplane->ring_ref = NULL;
/*
 * Connect the dataplane to the frontend's shared ring and event channel.
 *
 * @ring_ref/@nr_ring_ref: grant references of the ring pages (copied)
 * @event_channel:         event channel port to bind
 * @protocol:              ring layout (native, x86_32 or x86_64)
 *
 * Steps, in order: copy the ring references, derive max_requests from the
 * ring size for the chosen protocol, set the grant-reference limit, map
 * the ring, initialise the matching back ring, bind the event channel to
 * xen_block_dataplane_event() and move the block backend into the
 * dataplane's AioContext.
 *
 * Failures are reported through errp.
 * NOTE(review): the jump targets are not visible in this excerpt; the
 * trailing xen_block_dataplane_stop() call appears to be the shared error
 * path that undoes a partial start -- confirm.
 */
723 void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
724 const unsigned int ring_ref[],
725 unsigned int nr_ring_ref,
726 unsigned int event_channel,
727 unsigned int protocol,
730 XenDevice *xendev = dataplane->xendev;
731 Error *local_err = NULL;
732 unsigned int ring_size;
735 dataplane->nr_ring_ref = nr_ring_ref;
/* Keep a private copy; freed in xen_block_dataplane_stop(). */
736 dataplane->ring_ref = g_new(unsigned int, nr_ring_ref);
738 for (i = 0; i < nr_ring_ref; i++) {
739 dataplane->ring_ref[i] = ring_ref[i];
742 dataplane->protocol = protocol;
/* Total ring size in bytes: one page per grant reference. */
744 ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
745 switch (dataplane->protocol) {
746 case BLKIF_PROTOCOL_NATIVE:
748 dataplane->max_requests = __CONST_RING_SIZE(blkif, ring_size);
751 case BLKIF_PROTOCOL_X86_32:
753 dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
756 case BLKIF_PROTOCOL_X86_64:
758 dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
762 error_setg(errp, "unknown protocol %u", dataplane->protocol);
766 xen_device_set_max_grant_refs(xendev, dataplane->nr_ring_ref,
769 error_propagate(errp, local_err);
773 dataplane->sring = xen_device_map_grant_refs(xendev,
775 dataplane->nr_ring_ref,
776 PROT_READ | PROT_WRITE,
779 error_propagate(errp, local_err);
783 switch (dataplane->protocol) {
784 case BLKIF_PROTOCOL_NATIVE:
786 blkif_sring_t *sring_native = dataplane->sring;
788 BACK_RING_INIT(&dataplane->rings.native, sring_native, ring_size);
791 case BLKIF_PROTOCOL_X86_32:
793 blkif_x86_32_sring_t *sring_x86_32 = dataplane->sring;
795 BACK_RING_INIT(&dataplane->rings.x86_32_part, sring_x86_32,
799 case BLKIF_PROTOCOL_X86_64:
801 blkif_x86_64_sring_t *sring_x86_64 = dataplane->sring;
803 BACK_RING_INIT(&dataplane->rings.x86_64_part, sring_x86_64,
809 dataplane->event_channel =
810 xen_device_bind_event_channel(xendev, dataplane->ctx, event_channel,
811 xen_block_dataplane_event, dataplane,
814 error_propagate(errp, local_err);
818 aio_context_acquire(dataplane->ctx);
819 /* If other users keep the BlockBackend in the iothread, that's ok */
820 blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
821 aio_context_release(dataplane->ctx);
825 xen_block_dataplane_stop(dataplane);