X-Git-Url: https://repo.jachan.dev/qemu.git/blobdiff_plain/7794b34e63fd42803e959c4989e5358f2412d325..0b9fd3f467dc5ac041fa014cd28c949b25b87d25:/block/block-backend.c diff --git a/block/block-backend.c b/block/block-backend.c index 45d9101be3..f2f75a977d 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -17,8 +17,10 @@ #include "block/throttle-groups.h" #include "sysemu/blockdev.h" #include "sysemu/sysemu.h" -#include "qapi-event.h" +#include "qapi/error.h" +#include "qapi/qapi-events-block.h" #include "qemu/id.h" +#include "qemu/option.h" #include "trace.h" #include "migration/misc.h" @@ -29,6 +31,13 @@ static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb); +typedef struct BlockBackendAioNotifier { + void (*attached_aio_context)(AioContext *new_context, void *opaque); + void (*detach_aio_context)(void *opaque); + void *opaque; + QLIST_ENTRY(BlockBackendAioNotifier) list; +} BlockBackendAioNotifier; + struct BlockBackend { char *name; int refcnt; @@ -67,10 +76,19 @@ struct BlockBackend { bool allow_write_beyond_eof; NotifierList remove_bs_notifiers, insert_bs_notifiers; + QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers; int quiesce_counter; VMChangeStateEntry *vmsh; bool force_allow_inactivate; + + /* Number of in-flight aio requests. BlockDriverState also counts + * in-flight requests but aio requests can exist even when blk->root is + * NULL, so we cannot rely on its counter for that case. + * Accessed with atomic ops. + */ + unsigned int in_flight; + AioWait wait; }; typedef struct BlockBackendAIOCB { @@ -237,6 +255,36 @@ static int blk_root_inactivate(BdrvChild *child) return 0; } +static void blk_root_attach(BdrvChild *child) +{ + BlockBackend *blk = child->opaque; + BlockBackendAioNotifier *notifier; + + trace_blk_root_attach(child, blk, child->bs); + + QLIST_FOREACH(notifier, &blk->aio_notifiers, list) { + bdrv_add_aio_context_notifier(child->bs, + notifier->attached_aio_context, + notifier->detach_aio_context, + notifier->opaque); + } +} + +static void blk_root_detach(BdrvChild *child) +{ + BlockBackend *blk = child->opaque; + BlockBackendAioNotifier *notifier; + + trace_blk_root_detach(child, blk, child->bs); + + QLIST_FOREACH(notifier, &blk->aio_notifiers, list) { + bdrv_remove_aio_context_notifier(child->bs, + notifier->attached_aio_context, + notifier->detach_aio_context, + notifier->opaque); + } +} + static const BdrvChildRole child_root = { .inherit_options = blk_root_inherit_options, @@ -250,6 +298,9 @@ static const BdrvChildRole child_root = { .activate = blk_root_activate, .inactivate = blk_root_inactivate, + + .attach = blk_root_attach, + .detach = blk_root_detach, }; /* @@ -277,6 +328,7 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm) notifier_list_init(&blk->remove_bs_notifiers); notifier_list_init(&blk->insert_bs_notifiers); + QLIST_INIT(&blk->aio_notifiers); QTAILQ_INSERT_TAIL(&block_backends, blk, link); return blk; @@ -299,7 +351,7 @@ BlockBackend *blk_new_open(const char *filename, const char *reference, { BlockBackend *blk; BlockDriverState *bs; - uint64_t perm; + uint64_t perm = 0; /* blk_new_open() is mainly used in .bdrv_create implementations and the * tools where sharing isn't a concern because the BDS stays private, so we @@ -309,9 +361,11 @@ BlockBackend *blk_new_open(const char *filename, const char *reference, * caller of blk_new_open() doesn't make use of the permissions, but they * shouldn't hurt either. We can still share everything here because the * guest devices will add their own blockers if they can't share. */ - perm = BLK_PERM_CONSISTENT_READ; - if (flags & BDRV_O_RDWR) { - perm |= BLK_PERM_WRITE; + if ((flags & BDRV_O_NO_IO) == 0) { + perm |= BLK_PERM_CONSISTENT_READ; + if (flags & BDRV_O_RDWR) { + perm |= BLK_PERM_WRITE; + } } if (flags & BDRV_O_RESIZE) { perm |= BLK_PERM_RESIZE; @@ -352,6 +406,7 @@ static void blk_delete(BlockBackend *blk) } assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers)); assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers)); + assert(QLIST_EMPTY(&blk->aio_notifiers)); QTAILQ_REMOVE(&block_backends, blk, link); drive_info_del(blk->legacy_dinfo); block_acct_cleanup(&blk->stats); @@ -442,21 +497,37 @@ BlockBackend *blk_next(BlockBackend *blk) * the monitor or attached to a BlockBackend */ BlockDriverState *bdrv_next(BdrvNextIterator *it) { - BlockDriverState *bs; + BlockDriverState *bs, *old_bs; + + /* Must be called from the main loop */ + assert(qemu_get_current_aio_context() == qemu_get_aio_context()); /* First, return all root nodes of BlockBackends. In order to avoid * returning a BDS twice when multiple BBs refer to it, we only return it * if the BB is the first one in the parent list of the BDS. */ if (it->phase == BDRV_NEXT_BACKEND_ROOTS) { + BlockBackend *old_blk = it->blk; + + old_bs = old_blk ? blk_bs(old_blk) : NULL; + do { it->blk = blk_all_next(it->blk); bs = it->blk ? blk_bs(it->blk) : NULL; } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk)); + if (it->blk) { + blk_ref(it->blk); + } + blk_unref(old_blk); + if (bs) { + bdrv_ref(bs); + bdrv_unref(old_bs); return bs; } it->phase = BDRV_NEXT_MONITOR_OWNED; + } else { + old_bs = it->bs; } /* Then return the monitor-owned BDSes without a BB attached. Ignore all @@ -467,18 +538,46 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it) bs = it->bs; } while (bs && bdrv_has_blk(bs)); + if (bs) { + bdrv_ref(bs); + } + bdrv_unref(old_bs); + return bs; } -BlockDriverState *bdrv_first(BdrvNextIterator *it) +static void bdrv_next_reset(BdrvNextIterator *it) { *it = (BdrvNextIterator) { .phase = BDRV_NEXT_BACKEND_ROOTS, }; +} +BlockDriverState *bdrv_first(BdrvNextIterator *it) +{ + bdrv_next_reset(it); return bdrv_next(it); } +/* Must be called when aborting a bdrv_next() iteration before + * bdrv_next() returns NULL */ +void bdrv_next_cleanup(BdrvNextIterator *it) +{ + /* Must be called from the main loop */ + assert(qemu_get_current_aio_context() == qemu_get_aio_context()); + + if (it->phase == BDRV_NEXT_BACKEND_ROOTS) { + if (it->blk) { + bdrv_unref(blk_bs(it->blk)); + blk_unref(it->blk); + } + } else { + bdrv_unref(it->bs); + } + + bdrv_next_reset(it); +} + /* * Add a BlockBackend into the list of backends referenced by the monitor, with * the given @name acting as the handle for the monitor. @@ -655,16 +754,25 @@ BlockBackend *blk_by_public(BlockBackendPublic *public) */ void blk_remove_bs(BlockBackend *blk) { - ThrottleTimers *tt; + ThrottleGroupMember *tgm = &blk->public.throttle_group_member; + BlockDriverState *bs; notifier_list_notify(&blk->remove_bs_notifiers, blk); - if (blk->public.throttle_group_member.throttle_state) { - tt = &blk->public.throttle_group_member.throttle_timers; - throttle_timers_detach_aio_context(tt); + if (tgm->throttle_state) { + bs = blk_bs(blk); + bdrv_drained_begin(bs); + throttle_group_detach_aio_context(tgm); + throttle_group_attach_aio_context(tgm, qemu_get_aio_context()); + bdrv_drained_end(bs); } blk_update_root_state(blk); + /* bdrv_root_unref_child() will cause blk->root to become stale and may + * switch to a completion coroutine later on. Let's drain all I/O here + * to avoid that and a potential QEMU crash. + */ + blk_drain(blk); bdrv_root_unref_child(blk->root); blk->root = NULL; } @@ -674,6 +782,7 @@ void blk_remove_bs(BlockBackend *blk) */ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp) { + ThrottleGroupMember *tgm = &blk->public.throttle_group_member; blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->perm, blk->shared_perm, blk, errp); if (blk->root == NULL) { @@ -682,10 +791,9 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp) bdrv_ref(bs); notifier_list_notify(&blk->insert_bs_notifiers, blk); - if (blk->public.throttle_group_member.throttle_state) { - throttle_timers_attach_aio_context( - &blk->public.throttle_group_member.throttle_timers, - bdrv_get_aio_context(bs)); + if (tgm->throttle_state) { + throttle_group_detach_aio_context(tgm); + throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs)); } return 0; @@ -1090,7 +1198,7 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, typedef struct BlkRwCo { BlockBackend *blk; int64_t offset; - QEMUIOVector *qiov; + void *iobuf; int ret; BdrvRequestFlags flags; } BlkRwCo; @@ -1098,17 +1206,19 @@ typedef struct BlkRwCo { static void blk_read_entry(void *opaque) { BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; - rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, rwco->qiov->size, - rwco->qiov, rwco->flags); + rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size, + qiov, rwco->flags); } static void blk_write_entry(void *opaque) { BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; - rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, rwco->qiov->size, - rwco->qiov, rwco->flags); + rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size, + qiov, rwco->flags); } static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, @@ -1128,7 +1238,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, rwco = (BlkRwCo) { .blk = blk, .offset = offset, - .qiov = &qiov, + .iobuf = &qiov, .flags = flags, .ret = NOT_DONE, }; @@ -1173,11 +1283,22 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) return bdrv_make_zero(blk->root, flags); } +static void blk_inc_in_flight(BlockBackend *blk) +{ + atomic_inc(&blk->in_flight); +} + +static void blk_dec_in_flight(BlockBackend *blk) +{ + atomic_dec(&blk->in_flight); + aio_wait_kick(&blk->wait); +} + static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; - bdrv_dec_in_flight(acb->common.bs); + blk_dec_in_flight(acb->blk); acb->common.cb(acb->common.opaque, acb->ret); qemu_aio_unref(acb); } @@ -1188,7 +1309,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, { struct BlockBackendAIOCB *acb; - bdrv_inc_in_flight(blk_bs(blk)); + blk_inc_in_flight(blk); acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque); acb->blk = blk; acb->ret = ret; @@ -1211,7 +1332,7 @@ static const AIOCBInfo blk_aio_em_aiocb_info = { static void blk_aio_complete(BlkAioEmAIOCB *acb) { if (acb->has_returned) { - bdrv_dec_in_flight(acb->common.bs); + blk_dec_in_flight(acb->rwco.blk); acb->common.cb(acb->common.opaque, acb->rwco.ret); qemu_aio_unref(acb); } @@ -1225,19 +1346,19 @@ static void blk_aio_complete_bh(void *opaque) } static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes, - QEMUIOVector *qiov, CoroutineEntry co_entry, + void *iobuf, CoroutineEntry co_entry, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque) { BlkAioEmAIOCB *acb; Coroutine *co; - bdrv_inc_in_flight(blk_bs(blk)); + blk_inc_in_flight(blk); acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); acb->rwco = (BlkRwCo) { .blk = blk, .offset = offset, - .qiov = qiov, + .iobuf = iobuf, .flags = flags, .ret = NOT_DONE, }; @@ -1260,10 +1381,11 @@ static void blk_aio_read_entry(void *opaque) { BlkAioEmAIOCB *acb = opaque; BlkRwCo *rwco = &acb->rwco; + QEMUIOVector *qiov = rwco->iobuf; - assert(rwco->qiov->size == acb->bytes); + assert(qiov->size == acb->bytes); rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes, - rwco->qiov, rwco->flags); + qiov, rwco->flags); blk_aio_complete(acb); } @@ -1271,10 +1393,11 @@ static void blk_aio_write_entry(void *opaque) { BlkAioEmAIOCB *acb = opaque; BlkRwCo *rwco = &acb->rwco; + QEMUIOVector *qiov = rwco->iobuf; - assert(!rwco->qiov || rwco->qiov->size == acb->bytes); + assert(!qiov || qiov->size == acb->bytes); rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes, - rwco->qiov, rwco->flags); + qiov, rwco->flags); blk_aio_complete(acb); } @@ -1403,8 +1526,10 @@ int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf) static void blk_ioctl_entry(void *opaque) { BlkRwCo *rwco = opaque; + QEMUIOVector *qiov = rwco->iobuf; + rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, - rwco->qiov->iov[0].iov_base); + qiov->iov[0].iov_base); } int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf) @@ -1417,24 +1542,15 @@ static void blk_aio_ioctl_entry(void *opaque) BlkAioEmAIOCB *acb = opaque; BlkRwCo *rwco = &acb->rwco; - rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, - rwco->qiov->iov[0].iov_base); + rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, rwco->iobuf); + blk_aio_complete(acb); } BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque) { - QEMUIOVector qiov; - struct iovec iov; - - iov = (struct iovec) { - .iov_base = buf, - .iov_len = 0, - }; - qemu_iovec_init_external(&qiov, &iov, 1); - - return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque); + return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque); } int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) @@ -1444,7 +1560,7 @@ int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) return ret; } - return bdrv_co_pdiscard(blk_bs(blk), offset, bytes); + return bdrv_co_pdiscard(blk->root, offset, bytes); } int blk_co_flush(BlockBackend *blk) @@ -1469,14 +1585,41 @@ int blk_flush(BlockBackend *blk) void blk_drain(BlockBackend *blk) { - if (blk_bs(blk)) { - bdrv_drain(blk_bs(blk)); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_drained_begin(bs); + } + + /* We may have -ENOMEDIUM completions in flight */ + AIO_WAIT_WHILE(&blk->wait, + blk_get_aio_context(blk), + atomic_mb_read(&blk->in_flight) > 0); + + if (bs) { + bdrv_drained_end(bs); } } void blk_drain_all(void) { - bdrv_drain_all(); + BlockBackend *blk = NULL; + + bdrv_drain_all_begin(); + + while ((blk = blk_all_next(blk)) != NULL) { + AioContext *ctx = blk_get_aio_context(blk); + + aio_context_acquire(ctx); + + /* We may have -ENOMEDIUM completions in flight */ + AIO_WAIT_WHILE(&blk->wait, ctx, + atomic_mb_read(&blk->in_flight) > 0); + + aio_context_release(ctx); + } + + bdrv_drain_all_end(); } void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, @@ -1517,10 +1660,11 @@ static void send_qmp_error_event(BlockBackend *blk, bool is_read, int error) { IoOperationType optype; + BlockDriverState *bs = blk_bs(blk); optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; - qapi_event_send_block_io_error(blk_name(blk), - bdrv_get_node_name(blk_bs(blk)), optype, + qapi_event_send_block_io_error(blk_name(blk), !!bs, + bs ? bdrv_get_node_name(bs) : NULL, optype, action, blk_iostatus_is_enabled(blk), error == ENOSPC, strerror(error), &error_abort); @@ -1726,13 +1870,7 @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason) AioContext *blk_get_aio_context(BlockBackend *blk) { - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - return bdrv_get_aio_context(bs); - } else { - return qemu_get_aio_context(); - } + return bdrv_get_aio_context(blk_bs(blk)); } static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb) @@ -1748,8 +1886,10 @@ void blk_set_aio_context(BlockBackend *blk, AioContext *new_context) if (bs) { if (tgm->throttle_state) { + bdrv_drained_begin(bs); throttle_group_detach_aio_context(tgm); throttle_group_attach_aio_context(tgm, new_context); + bdrv_drained_end(bs); } bdrv_set_aio_context(bs, new_context); } @@ -1759,8 +1899,15 @@ void blk_add_aio_context_notifier(BlockBackend *blk, void (*attached_aio_context)(AioContext *new_context, void *opaque), void (*detach_aio_context)(void *opaque), void *opaque) { + BlockBackendAioNotifier *notifier; BlockDriverState *bs = blk_bs(blk); + notifier = g_new(BlockBackendAioNotifier, 1); + notifier->attached_aio_context = attached_aio_context; + notifier->detach_aio_context = detach_aio_context; + notifier->opaque = opaque; + QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list); + if (bs) { bdrv_add_aio_context_notifier(bs, attached_aio_context, detach_aio_context, opaque); @@ -1773,12 +1920,25 @@ void blk_remove_aio_context_notifier(BlockBackend *blk, void (*detach_aio_context)(void *), void *opaque) { + BlockBackendAioNotifier *notifier; BlockDriverState *bs = blk_bs(blk); if (bs) { bdrv_remove_aio_context_notifier(bs, attached_aio_context, detach_aio_context, opaque); } + + QLIST_FOREACH(notifier, &blk->aio_notifiers, list) { + if (notifier->attached_aio_context == attached_aio_context && + notifier->detach_aio_context == detach_aio_context && + notifier->opaque == opaque) { + QLIST_REMOVE(notifier, list); + g_free(notifier); + return; + } + } + + abort(); } void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify) @@ -1848,7 +2008,9 @@ int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc, static void blk_pdiscard_entry(void *opaque) { BlkRwCo *rwco = opaque; - rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, rwco->qiov->size); + QEMUIOVector *qiov = rwco->iobuf; + + rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size); } int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes) @@ -1974,10 +2136,16 @@ void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg) void blk_io_limits_disable(BlockBackend *blk) { - assert(blk->public.throttle_group_member.throttle_state); - bdrv_drained_begin(blk_bs(blk)); - throttle_group_unregister_tgm(&blk->public.throttle_group_member); - bdrv_drained_end(blk_bs(blk)); + BlockDriverState *bs = blk_bs(blk); + ThrottleGroupMember *tgm = &blk->public.throttle_group_member; + assert(tgm->throttle_state); + if (bs) { + bdrv_drained_begin(bs); + } + throttle_group_unregister_tgm(tgm); + if (bs) { + bdrv_drained_end(bs); + } } /* should be called before blk_set_io_limits if a limit is set */ @@ -2038,3 +2206,32 @@ static void blk_root_drained_end(BdrvChild *child) } } } + +void blk_register_buf(BlockBackend *blk, void *host, size_t size) +{ + bdrv_register_buf(blk_bs(blk), host, size); +} + +void blk_unregister_buf(BlockBackend *blk, void *host) +{ + bdrv_unregister_buf(blk_bs(blk), host); +} + +int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, + BlockBackend *blk_out, int64_t off_out, + int bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) +{ + int r; + r = blk_check_byte_request(blk_in, off_in, bytes); + if (r) { + return r; + } + r = blk_check_byte_request(blk_out, off_out, bytes); + if (r) { + return r; + } + return bdrv_co_copy_range(blk_in->root, off_in, + blk_out->root, off_out, + bytes, read_flags, write_flags); +}