#include "block/throttle-groups.h"
#include "sysemu/blockdev.h"
#include "sysemu/sysemu.h"
-#include "qapi-event.h"
+#include "qapi/error.h"
+#include "qapi/qapi-events-block.h"
#include "qemu/id.h"
+#include "qemu/option.h"
#include "trace.h"
#include "migration/misc.h"
static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
+typedef struct BlockBackendAioNotifier {
+ void (*attached_aio_context)(AioContext *new_context, void *opaque);
+ void (*detach_aio_context)(void *opaque);
+ void *opaque;
+ QLIST_ENTRY(BlockBackendAioNotifier) list;
+} BlockBackendAioNotifier;
+
struct BlockBackend {
char *name;
int refcnt;
bool allow_write_beyond_eof;
NotifierList remove_bs_notifiers, insert_bs_notifiers;
+ QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
int quiesce_counter;
VMChangeStateEntry *vmsh;
bool force_allow_inactivate;
+
+ /* Number of in-flight aio requests. BlockDriverState also counts
+ * in-flight requests but aio requests can exist even when blk->root is
+ * NULL, so we cannot rely on its counter for that case.
+ * Accessed with atomic ops.
+ */
+ unsigned int in_flight;
+ AioWait wait;
};
typedef struct BlockBackendAIOCB {
return 0;
}
+static void blk_root_attach(BdrvChild *child)
+{
+ BlockBackend *blk = child->opaque;
+ BlockBackendAioNotifier *notifier;
+
+ trace_blk_root_attach(child, blk, child->bs);
+
+ QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
+ bdrv_add_aio_context_notifier(child->bs,
+ notifier->attached_aio_context,
+ notifier->detach_aio_context,
+ notifier->opaque);
+ }
+}
+
+static void blk_root_detach(BdrvChild *child)
+{
+ BlockBackend *blk = child->opaque;
+ BlockBackendAioNotifier *notifier;
+
+ trace_blk_root_detach(child, blk, child->bs);
+
+ QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
+ bdrv_remove_aio_context_notifier(child->bs,
+ notifier->attached_aio_context,
+ notifier->detach_aio_context,
+ notifier->opaque);
+ }
+}
+
static const BdrvChildRole child_root = {
.inherit_options = blk_root_inherit_options,
.activate = blk_root_activate,
.inactivate = blk_root_inactivate,
+
+ .attach = blk_root_attach,
+ .detach = blk_root_detach,
};
/*
notifier_list_init(&blk->remove_bs_notifiers);
notifier_list_init(&blk->insert_bs_notifiers);
+ QLIST_INIT(&blk->aio_notifiers);
QTAILQ_INSERT_TAIL(&block_backends, blk, link);
return blk;
{
BlockBackend *blk;
BlockDriverState *bs;
- uint64_t perm;
+ uint64_t perm = 0;
/* blk_new_open() is mainly used in .bdrv_create implementations and the
* tools where sharing isn't a concern because the BDS stays private, so we
* just request permission according to the flags.
*
* The exceptions are xen_disk and blockdev_init(); in these cases, the
* caller of blk_new_open() doesn't make use of the permissions, but they
* shouldn't hurt either. We can still share everything here because the
* guest devices will add their own blockers if they can't share. */
- perm = BLK_PERM_CONSISTENT_READ;
- if (flags & BDRV_O_RDWR) {
- perm |= BLK_PERM_WRITE;
+ if ((flags & BDRV_O_NO_IO) == 0) {
+ perm |= BLK_PERM_CONSISTENT_READ;
+ if (flags & BDRV_O_RDWR) {
+ perm |= BLK_PERM_WRITE;
+ }
}
if (flags & BDRV_O_RESIZE) {
perm |= BLK_PERM_RESIZE;
}
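For reference, here is how the rewritten logic maps common flag combinations to the requested permissions (a worked sketch derived from the code above):

    /*
     *   flags                           requested perm
     *   0 (read-only)                   BLK_PERM_CONSISTENT_READ
     *   BDRV_O_RDWR                     BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE
     *   BDRV_O_NO_IO                    0
     *   BDRV_O_NO_IO | BDRV_O_RESIZE    BLK_PERM_RESIZE
     */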
assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
+ assert(QLIST_EMPTY(&blk->aio_notifiers));
QTAILQ_REMOVE(&block_backends, blk, link);
drive_info_del(blk->legacy_dinfo);
block_acct_cleanup(&blk->stats);
* the monitor or attached to a BlockBackend */
BlockDriverState *bdrv_next(BdrvNextIterator *it)
{
- BlockDriverState *bs;
+ BlockDriverState *bs, *old_bs;
+
+ /* Must be called from the main loop */
+ assert(qemu_get_current_aio_context() == qemu_get_aio_context());
/* First, return all root nodes of BlockBackends. In order to avoid
* returning a BDS twice when multiple BBs refer to it, we only return it
* if the BB is the first one in the parent list of the BDS. */
if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
+ BlockBackend *old_blk = it->blk;
+
+ old_bs = old_blk ? blk_bs(old_blk) : NULL;
+
do {
it->blk = blk_all_next(it->blk);
bs = it->blk ? blk_bs(it->blk) : NULL;
} while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
+ if (it->blk) {
+ blk_ref(it->blk);
+ }
+ blk_unref(old_blk);
+
if (bs) {
+ bdrv_ref(bs);
+ bdrv_unref(old_bs);
return bs;
}
it->phase = BDRV_NEXT_MONITOR_OWNED;
+ } else {
+ old_bs = it->bs;
}
/* Then return the monitor-owned BDSes without a BB attached. Ignore all
bs = it->bs;
} while (bs && bdrv_has_blk(bs));
+ if (bs) {
+ bdrv_ref(bs);
+ }
+ bdrv_unref(old_bs);
+
return bs;
}
-BlockDriverState *bdrv_first(BdrvNextIterator *it)
+static void bdrv_next_reset(BdrvNextIterator *it)
{
*it = (BdrvNextIterator) {
.phase = BDRV_NEXT_BACKEND_ROOTS,
};
+}
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it)
+{
+ bdrv_next_reset(it);
return bdrv_next(it);
}
+/* Must be called when aborting a bdrv_next() iteration before
+ * bdrv_next() returns NULL */
+void bdrv_next_cleanup(BdrvNextIterator *it)
+{
+ /* Must be called from the main loop */
+ assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+
+ if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
+ if (it->blk) {
+ bdrv_unref(blk_bs(it->blk));
+ blk_unref(it->blk);
+ }
+ } else {
+ bdrv_unref(it->bs);
+ }
+
+ bdrv_next_reset(it);
+}
+
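Since bdrv_next() now takes strong references on what it returns, a caller that stops iterating early must release them through bdrv_next_cleanup(). A minimal usage sketch, assuming the API above (wants_bs() is a hypothetical predicate):

    BdrvNextIterator it;
    BlockDriverState *bs;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        if (wants_bs(bs)) {
            bdrv_next_cleanup(&it);   /* drop the iterator's references */
            break;
        }
    }

Note that bdrv_next_cleanup() also drops the reference on the last returned bs; a caller that wants to keep using it afterwards must take its own bdrv_ref() first.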
/*
* Add a BlockBackend into the list of backends referenced by the monitor, with
* the given @name acting as the handle for the monitor.
*/
void blk_remove_bs(BlockBackend *blk)
{
- ThrottleTimers *tt;
+ ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
+ BlockDriverState *bs;

notifier_list_notify(&blk->remove_bs_notifiers, blk);
- if (blk->public.throttle_group_member.throttle_state) {
- tt = &blk->public.throttle_group_member.throttle_timers;
- throttle_timers_detach_aio_context(tt);
+ if (tgm->throttle_state) {
+ bs = blk_bs(blk);
+ bdrv_drained_begin(bs);
+ throttle_group_detach_aio_context(tgm);
+ throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
+ bdrv_drained_end(bs);
}
blk_update_root_state(blk);
+ /* bdrv_root_unref_child() will cause blk->root to become stale and may
+ * switch to a completion coroutine later on. Let's drain all I/O here
+ * to avoid that and a potential QEMU crash.
+ */
+ blk_drain(blk);
bdrv_root_unref_child(blk->root);
blk->root = NULL;
}
*/
int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
{
+ ThrottleGroupMember *tgm = &blk->public.throttle_group_member;

blk->root = bdrv_root_attach_child(bs, "root", &child_root,
blk->perm, blk->shared_perm, blk, errp);
if (blk->root == NULL) {
bdrv_ref(bs);
notifier_list_notify(&blk->insert_bs_notifiers, blk);
- if (blk->public.throttle_group_member.throttle_state) {
- throttle_timers_attach_aio_context(
- &blk->public.throttle_group_member.throttle_timers,
- bdrv_get_aio_context(bs));
+ if (tgm->throttle_state) {
+ throttle_group_detach_aio_context(tgm);
+ throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
}
return 0;
typedef struct BlkRwCo {
BlockBackend *blk;
int64_t offset;
- QEMUIOVector *qiov;
+ void *iobuf;
int ret;
BdrvRequestFlags flags;
} BlkRwCo;
static void blk_read_entry(void *opaque)
{
BlkRwCo *rwco = opaque;
+ QEMUIOVector *qiov = rwco->iobuf;

- rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, rwco->qiov->size,
- rwco->qiov, rwco->flags);
+ rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
+ qiov, rwco->flags);
}
static void blk_write_entry(void *opaque)
{
BlkRwCo *rwco = opaque;
+ QEMUIOVector *qiov = rwco->iobuf;

- rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, rwco->qiov->size,
- rwco->qiov, rwco->flags);
+ rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
+ qiov, rwco->flags);
}
static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
rwco = (BlkRwCo) {
.blk = blk,
.offset = offset,
- .qiov = &qiov,
+ .iobuf = &qiov,
.flags = flags,
.ret = NOT_DONE,
};
return bdrv_make_zero(blk->root, flags);
}
+static void blk_inc_in_flight(BlockBackend *blk)
+{
+ atomic_inc(&blk->in_flight);
+}
+
+static void blk_dec_in_flight(BlockBackend *blk)
+{
+ atomic_dec(&blk->in_flight);
+ aio_wait_kick(&blk->wait);
+}
+
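These helpers encode the contract behind the new counter: every asynchronous request increments it before submission and decrements it on completion, so the AIO_WAIT_WHILE() in blk_drain() below can make progress even while blk->root is NULL. A condensed sketch of the pattern, mirroring what blk_aio_prwv() and blk_aio_complete() do further down:

    blk_inc_in_flight(blk);    /* before the request is set in motion */
    /* ... request runs asynchronously ... */
    blk_dec_in_flight(blk);    /* on completion; aio_wait_kick() wakes any
                                * drainer blocked in AIO_WAIT_WHILE() */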
static void error_callback_bh(void *opaque)
{
struct BlockBackendAIOCB *acb = opaque;

- bdrv_dec_in_flight(acb->common.bs);
+ blk_dec_in_flight(acb->blk);
acb->common.cb(acb->common.opaque, acb->ret);
qemu_aio_unref(acb);
}
{
struct BlockBackendAIOCB *acb;

- bdrv_inc_in_flight(blk_bs(blk));
+ blk_inc_in_flight(blk);
acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
acb->blk = blk;
acb->ret = ret;
static void blk_aio_complete(BlkAioEmAIOCB *acb)
{
if (acb->has_returned) {
- bdrv_dec_in_flight(acb->common.bs);
+ blk_dec_in_flight(acb->rwco.blk);
acb->common.cb(acb->common.opaque, acb->rwco.ret);
qemu_aio_unref(acb);
}
}
static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
- QEMUIOVector *qiov, CoroutineEntry co_entry,
+ void *iobuf, CoroutineEntry co_entry,
BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque)
{
BlkAioEmAIOCB *acb;
Coroutine *co;

- bdrv_inc_in_flight(blk_bs(blk));
+ blk_inc_in_flight(blk);
acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
acb->rwco = (BlkRwCo) {
.blk = blk,
.offset = offset,
- .qiov = qiov,
+ .iobuf = iobuf,
.flags = flags,
.ret = NOT_DONE,
};
{
BlkAioEmAIOCB *acb = opaque;
BlkRwCo *rwco = &acb->rwco;
+ QEMUIOVector *qiov = rwco->iobuf;

- assert(rwco->qiov->size == acb->bytes);
+ assert(qiov->size == acb->bytes);
rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes,
- rwco->qiov, rwco->flags);
+ qiov, rwco->flags);
blk_aio_complete(acb);
}
{
BlkAioEmAIOCB *acb = opaque;
BlkRwCo *rwco = &acb->rwco;
+ QEMUIOVector *qiov = rwco->iobuf;

- assert(!rwco->qiov || rwco->qiov->size == acb->bytes);
+ assert(!qiov || qiov->size == acb->bytes);
rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes,
- rwco->qiov, rwco->flags);
+ qiov, rwco->flags);
blk_aio_complete(acb);
}
static void blk_ioctl_entry(void *opaque)
{
BlkRwCo *rwco = opaque;
+ QEMUIOVector *qiov = rwco->iobuf;
+
rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
- rwco->qiov->iov[0].iov_base);
+ qiov->iov[0].iov_base);
}
int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
BlkAioEmAIOCB *acb = opaque;
BlkRwCo *rwco = &acb->rwco;

- rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
- rwco->qiov->iov[0].iov_base);
+ rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
+
blk_aio_complete(acb);
}
BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
BlockCompletionFunc *cb, void *opaque)
{
- QEMUIOVector qiov;
- struct iovec iov;
-
- iov = (struct iovec) {
- .iov_base = buf,
- .iov_len = 0,
- };
- qemu_iovec_init_external(&qiov, &iov, 1);
-
- return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque);
+ return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
}
int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
return ret;
}
- return bdrv_co_pdiscard(blk_bs(blk), offset, bytes);
+ return bdrv_co_pdiscard(blk->root, offset, bytes);
}
int blk_co_flush(BlockBackend *blk)
void blk_drain(BlockBackend *blk)
{
- if (blk_bs(blk)) {
- bdrv_drain(blk_bs(blk));
+ BlockDriverState *bs = blk_bs(blk);
+
+ if (bs) {
+ bdrv_drained_begin(bs);
+ }
+
+ /* We may have -ENOMEDIUM completions in flight */
+ AIO_WAIT_WHILE(&blk->wait,
+ blk_get_aio_context(blk),
+ atomic_mb_read(&blk->in_flight) > 0);
+
+ if (bs) {
+ bdrv_drained_end(bs);
}
}
void blk_drain_all(void)
{
- bdrv_drain_all();
+ BlockBackend *blk = NULL;
+
+ bdrv_drain_all_begin();
+
+ while ((blk = blk_all_next(blk)) != NULL) {
+ AioContext *ctx = blk_get_aio_context(blk);
+
+ aio_context_acquire(ctx);
+
+ /* We may have -ENOMEDIUM completions in flight */
+ AIO_WAIT_WHILE(&blk->wait, ctx,
+ atomic_mb_read(&blk->in_flight) > 0);
+
+ aio_context_release(ctx);
+ }
+
+ bdrv_drain_all_end();
}
void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
bool is_read, int error)
{
IoOperationType optype;
+ BlockDriverState *bs = blk_bs(blk);

optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
- qapi_event_send_block_io_error(blk_name(blk),
- bdrv_get_node_name(blk_bs(blk)), optype,
+ qapi_event_send_block_io_error(blk_name(blk), !!bs,
+ bs ? bdrv_get_node_name(bs) : NULL, optype,
action, blk_iostatus_is_enabled(blk),
error == ENOSPC, strerror(error),
&error_abort);
AioContext *blk_get_aio_context(BlockBackend *blk)
{
- BlockDriverState *bs = blk_bs(blk);
-
- if (bs) {
- return bdrv_get_aio_context(bs);
- } else {
- return qemu_get_aio_context();
- }
+ return bdrv_get_aio_context(blk_bs(blk));
}
static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
if (bs) {
if (tgm->throttle_state) {
+ bdrv_drained_begin(bs);
throttle_group_detach_aio_context(tgm);
throttle_group_attach_aio_context(tgm, new_context);
+ bdrv_drained_end(bs);
}
bdrv_set_aio_context(bs, new_context);
}
void (*attached_aio_context)(AioContext *new_context, void *opaque),
void (*detach_aio_context)(void *opaque), void *opaque)
{
+ BlockBackendAioNotifier *notifier;
BlockDriverState *bs = blk_bs(blk);

+ notifier = g_new(BlockBackendAioNotifier, 1);
+ notifier->attached_aio_context = attached_aio_context;
+ notifier->detach_aio_context = detach_aio_context;
+ notifier->opaque = opaque;
+ QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
+
if (bs) {
bdrv_add_aio_context_notifier(bs, attached_aio_context,
detach_aio_context, opaque);
void (*detach_aio_context)(void *),
void *opaque)
{
+ BlockBackendAioNotifier *notifier;
BlockDriverState *bs = blk_bs(blk);

if (bs) {
bdrv_remove_aio_context_notifier(bs, attached_aio_context,
detach_aio_context, opaque);
}
+
+ QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
+ if (notifier->attached_aio_context == attached_aio_context &&
+ notifier->detach_aio_context == detach_aio_context &&
+ notifier->opaque == opaque) {
+ QLIST_REMOVE(notifier, list);
+ g_free(notifier);
+ return;
+ }
+ }
+
+ /* Removing a notifier that was never added is a caller bug */
+ abort();
}
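Because the BlockBackend now keeps its own notifier list, a notifier pair can be registered even while no root node is attached; blk_root_attach()/blk_root_detach() replay the list whenever a medium comes and goes. A usage sketch with hypothetical callback names:

    static void dev_attached_aio_context(AioContext *new_context, void *opaque)
    {
        /* e.g. recreate timers and bottom halves in new_context */
    }

    static void dev_detach_aio_context(void *opaque)
    {
        /* quiesce anything still bound to the old context */
    }

    blk_add_aio_context_notifier(blk, dev_attached_aio_context,
                                 dev_detach_aio_context, dev);
    /* ... */
    blk_remove_aio_context_notifier(blk, dev_attached_aio_context,
                                    dev_detach_aio_context, dev);

Removal must pass exactly the same function/opaque triple that was registered, otherwise the lookup loop falls through to abort().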
void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
static void blk_pdiscard_entry(void *opaque)
{
BlkRwCo *rwco = opaque;
- rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, rwco->qiov->size);
+ QEMUIOVector *qiov = rwco->iobuf;
+
+ rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size);
}
int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
void blk_io_limits_disable(BlockBackend *blk)
{
- assert(blk->public.throttle_group_member.throttle_state);
- bdrv_drained_begin(blk_bs(blk));
- throttle_group_unregister_tgm(&blk->public.throttle_group_member);
- bdrv_drained_end(blk_bs(blk));
+ BlockDriverState *bs = blk_bs(blk);
+ ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
+ assert(tgm->throttle_state);
+ if (bs) {
+ bdrv_drained_begin(bs);
+ }
+ throttle_group_unregister_tgm(tgm);
+ if (bs) {
+ bdrv_drained_end(bs);
+ }
}
/* should be called before blk_set_io_limits if a limit is set */
}
}
}
+
+void blk_register_buf(BlockBackend *blk, void *host, size_t size)
+{
+ bdrv_register_buf(blk_bs(blk), host, size);
+}
+
+void blk_unregister_buf(BlockBackend *blk, void *host)
+{
+ bdrv_unregister_buf(blk_bs(blk), host);
+}
+
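A usage sketch for the new wrappers, assuming a caller that wants a bounce buffer pinned for repeated zero-copy I/O (buffer size and alignment are illustrative):

    void *buf = qemu_memalign(4096, len);

    blk_register_buf(blk, buf, len);
    /* ... repeated reads/writes through blk using buf ... */
    blk_unregister_buf(blk, buf);
    qemu_vfree(buf);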
+int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
+ BlockBackend *blk_out, int64_t off_out,
+ int bytes, BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags)
+{
+ int r;
+ r = blk_check_byte_request(blk_in, off_in, bytes);
+ if (r) {
+ return r;
+ }
+ r = blk_check_byte_request(blk_out, off_out, bytes);
+ if (r) {
+ return r;
+ }
+ return bdrv_co_copy_range(blk_in->root, off_in,
+ blk_out->root, off_out,
+ bytes, read_flags, write_flags);
+}
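Finally, a caller sketch for blk_co_copy_range(); copy_chunk() is hypothetical, and it must run in coroutine context since the copy is a coroutine_fn:

    static int coroutine_fn copy_chunk(BlockBackend *src, BlockBackend *dst,
                                       int64_t offset, int bytes)
    {
        /* 0/0: no special request flags on the read or write side */
        return blk_co_copy_range(src, offset, dst, offset, bytes, 0, 0);
    }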