*/
#include "trace.h"
-#include "sysemu/qtest.h"
+#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
+#include "block/throttle-groups.h"
+#include "qemu/error-report.h"
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
{
int i;
- throttle_config(&bs->throttle_state, cfg);
+ throttle_group_config(bs, cfg);
for (i = 0; i < 2; i++) {
qemu_co_enter_next(&bs->throttled_reqs[i]);
void bdrv_io_limits_disable(BlockDriverState *bs)
{
bs->io_limits_enabled = false;
-
bdrv_start_throttled_reqs(bs);
-
- throttle_destroy(&bs->throttle_state);
-}
-
-static void bdrv_throttle_read_timer_cb(void *opaque)
-{
- BlockDriverState *bs = opaque;
- qemu_co_enter_next(&bs->throttled_reqs[0]);
-}
-
-static void bdrv_throttle_write_timer_cb(void *opaque)
-{
- BlockDriverState *bs = opaque;
- qemu_co_enter_next(&bs->throttled_reqs[1]);
+ throttle_group_unregister_bs(bs);
}
/* should be called before bdrv_set_io_limits if a limit is set */
-void bdrv_io_limits_enable(BlockDriverState *bs)
+void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
- int clock_type = QEMU_CLOCK_REALTIME;
-
- if (qtest_enabled()) {
- /* For testing block IO throttling only */
- clock_type = QEMU_CLOCK_VIRTUAL;
- }
assert(!bs->io_limits_enabled);
- throttle_init(&bs->throttle_state,
- bdrv_get_aio_context(bs),
- clock_type,
- bdrv_throttle_read_timer_cb,
- bdrv_throttle_write_timer_cb,
- bs);
+ throttle_group_register_bs(bs, group);
bs->io_limits_enabled = true;
}
-/* This function makes an IO wait if needed
- *
- * @nb_sectors: the number of sectors of the IO
- * @is_write: is the IO a write
- */
-static void bdrv_io_limits_intercept(BlockDriverState *bs,
- unsigned int bytes,
- bool is_write)
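+/* Move bs to the throttling group named by @group. This is a no-op if bs has
+ * no throttling state or is already a member of that group. */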
+void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
- /* does this io must wait */
- bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
-
- /* if must wait or any request of this type throttled queue the IO */
- if (must_wait ||
- !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
- qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+ /* this bs is not part of any group */
+ if (!bs->throttle_state) {
+ return;
}
- /* the IO will be executed, do the accounting */
- throttle_account(&bs->throttle_state, is_write, bytes);
-
-
- /* if the next request must wait -> do nothing */
- if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+ /* this bs is already a member of the group we want */
+ if (!g_strcmp0(throttle_group_get_name(bs), group)) {
return;
}
- /* else queue next request for execution */
- qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+ /* need to change the group this bs belongs to */
+ bdrv_io_limits_disable(bs);
+ bdrv_io_limits_enable(bs, group);
}
void bdrv_setup_io_funcs(BlockDriver *bdrv)
/* Take some limits from the children as a default */
if (bs->file) {
- bdrv_refresh_limits(bs->file, &local_err);
+ bdrv_refresh_limits(bs->file->bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
- bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
- bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
- bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
+ bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
+ bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
+ bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
+ bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
+ bs->bl.max_iov = bs->file->bs->bl.max_iov;
} else {
- bs->bl.opt_mem_alignment = 512;
+ bs->bl.min_mem_alignment = 512;
+ bs->bl.opt_mem_alignment = getpagesize();
+
+ /* Safe default since most protocols use readv()/writev()/etc */
+ bs->bl.max_iov = IOV_MAX;
}
- if (bs->backing_hd) {
- bdrv_refresh_limits(bs->backing_hd, &local_err);
+ if (bs->backing) {
+ bdrv_refresh_limits(bs->backing->bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
bs->bl.opt_transfer_length =
MAX(bs->bl.opt_transfer_length,
- bs->backing_hd->bl.opt_transfer_length);
+ bs->backing->bs->bl.opt_transfer_length);
bs->bl.max_transfer_length =
MIN_NON_ZERO(bs->bl.max_transfer_length,
- bs->backing_hd->bl.max_transfer_length);
+ bs->backing->bs->bl.max_transfer_length);
bs->bl.opt_mem_alignment =
MAX(bs->bl.opt_mem_alignment,
- bs->backing_hd->bl.opt_mem_alignment);
+ bs->backing->bs->bl.opt_mem_alignment);
+ bs->bl.min_mem_alignment =
+ MAX(bs->bl.min_mem_alignment,
+ bs->backing->bs->bl.min_mem_alignment);
+ bs->bl.max_iov =
+ MIN(bs->bl.max_iov,
+ bs->backing->bs->bl.max_iov);
}
/* Then let the driver override it */
}
/* Check if any requests are in-flight (including throttled requests) */
-static bool bdrv_requests_pending(BlockDriverState *bs)
+bool bdrv_requests_pending(BlockDriverState *bs)
{
+ BdrvChild *child;
+
if (!QLIST_EMPTY(&bs->tracked_requests)) {
return true;
}
if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
return true;
}
- if (bs->file && bdrv_requests_pending(bs->file)) {
- return true;
- }
- if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
- return true;
+
+ QLIST_FOREACH(child, &bs->children, next) {
+ if (bdrv_requests_pending(child->bs)) {
+ return true;
+ }
}
+
return false;
}
-static bool bdrv_drain_one(BlockDriverState *bs)
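+/* Invoke the driver's .bdrv_drain callback, if any, on bs and recursively on
+ * all of its children so drivers can flush their internal request queues. */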
+static void bdrv_drain_recurse(BlockDriverState *bs)
{
- bool bs_busy;
+ BdrvChild *child;
- bdrv_flush_io_queue(bs);
- bdrv_start_throttled_reqs(bs);
- bs_busy = bdrv_requests_pending(bs);
- bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
- return bs_busy;
+ if (bs->drv && bs->drv->bdrv_drain) {
+ bs->drv->bdrv_drain(bs);
+ }
+ QLIST_FOREACH(child, &bs->children, next) {
+ bdrv_drain_recurse(child->bs);
+ }
}
/*
- * Wait for pending requests to complete on a single BlockDriverState subtree
- *
- * See the warning in bdrv_drain_all(). This function can only be called if
- * you are sure nothing can generate I/O because you have op blockers
- * installed.
+ * Wait for pending requests to complete on a single BlockDriverState subtree,
+ * and suspend the block driver's internal I/O until the next request arrives.
*
* Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
* AioContext.
+ *
+ * Only this BlockDriverState's AioContext is run, so in-flight requests must
+ * not depend on events in other AioContexts. In that case, use
+ * bdrv_drain_all() instead.
*/
void bdrv_drain(BlockDriverState *bs)
{
- while (bdrv_drain_one(bs)) {
+ bool busy = true;
+
+ bdrv_drain_recurse(bs);
+ while (busy) {
/* Keep iterating */
+ bdrv_flush_io_queue(bs);
+ busy = bdrv_requests_pending(bs);
+ busy |= aio_poll(bdrv_get_aio_context(bs), busy);
}
}
*
* This function does not flush data to disk, use bdrv_flush_all() for that
* after calling this function.
- *
- * Note that completion of an asynchronous I/O operation can trigger any
- * number of other I/O operations on other devices---for example a coroutine
- * can be arbitrarily complex and a constant flow of I/O can come until the
- * coroutine is complete. Because of this, it is not possible to have a
- * function to drain a single device's I/O queue.
*/
void bdrv_drain_all(void)
{
/* Always run first iteration so any pending completion BHs run */
bool busy = true;
BlockDriverState *bs = NULL;
+ GSList *aio_ctxs = NULL, *ctx;
while ((bs = bdrv_next(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs);
block_job_pause(bs->job);
}
aio_context_release(aio_context);
+
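+ /* Collect each AioContext only once; the drain loop below iterates over
+ * this list rather than over every BlockDriverState */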
+ if (!g_slist_find(aio_ctxs, aio_context)) {
+ aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
+ }
}
+ /* Note that completion of an asynchronous I/O operation can trigger any
+ * number of other I/O operations on other devices---for example a
+ * coroutine can submit an I/O request to another device in response to
+ * request completion. Therefore we must keep looping until there was no
+ * more activity rather than simply draining each device independently.
+ */
while (busy) {
busy = false;
- bs = NULL;
- while ((bs = bdrv_next(bs))) {
- AioContext *aio_context = bdrv_get_aio_context(bs);
+ for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
+ AioContext *aio_context = ctx->data;
+ bs = NULL;
aio_context_acquire(aio_context);
- busy |= bdrv_drain_one(bs);
+ while ((bs = bdrv_next(bs))) {
+ if (aio_context == bdrv_get_aio_context(bs)) {
+ bdrv_flush_io_queue(bs);
+ if (bdrv_requests_pending(bs)) {
+ busy = true;
+ aio_poll(aio_context, busy);
+ }
+ }
+ }
+ busy |= aio_poll(aio_context, false);
aio_context_release(aio_context);
}
}
}
aio_context_release(aio_context);
}
+ g_slist_free(aio_ctxs);
}
/**
static void tracked_request_begin(BdrvTrackedRequest *req,
BlockDriverState *bs,
int64_t offset,
- unsigned int bytes, bool is_write)
+ unsigned int bytes,
+ enum BdrvTrackedRequestType type)
{
*req = (BdrvTrackedRequest){
.bs = bs,
.offset = offset,
.bytes = bytes,
- .is_write = is_write,
+ .type = type,
.co = qemu_coroutine_self(),
.serialising = false,
.overlap_offset = offset,
mark_request_serialising(req, bdrv_get_cluster_size(bs));
}
- wait_serialising_requests(req);
+ if (!(flags & BDRV_REQ_NO_SERIALISING)) {
+ wait_serialising_requests(req);
+ }
if (flags & BDRV_REQ_COPY_ON_READ) {
int pnum;
return ret;
}
-static inline uint64_t bdrv_get_align(BlockDriverState *bs)
-{
- /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
- return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
-}
-
-static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
- int64_t offset, size_t bytes)
-{
- int64_t align = bdrv_get_align(bs);
- return !(offset & (align - 1) || (bytes & (align - 1)));
-}
-
/*
* Handle a read request in coroutine context
*/
BlockDriver *drv = bs->drv;
BdrvTrackedRequest req;
- uint64_t align = bdrv_get_align(bs);
+ /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+ uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
uint8_t *head_buf = NULL;
uint8_t *tail_buf = NULL;
QEMUIOVector local_qiov;
return ret;
}
- if (bs->copy_on_read) {
+ /* Don't do copy-on-read if we read data before a write operation */
+ if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
flags |= BDRV_REQ_COPY_ON_READ;
}
/* throttling disk I/O */
if (bs->io_limits_enabled) {
- bdrv_io_limits_intercept(bs, bytes, false);
+ throttle_group_co_io_limits_intercept(bs, bytes, false);
}
/* Align read if necessary by padding qiov */
bytes = ROUND_UP(bytes, align);
}
- tracked_request_begin(&req, bs, offset, bytes, false);
+ tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
use_local_qiov ? &local_qiov : qiov,
flags);
return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}
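+/* Like bdrv_co_readv(), but the read does not serialise against overlapping
+ * in-flight requests and skips copy-on-read (BDRV_REQ_NO_SERIALISING). */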
+int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+ BDRV_REQ_NO_SERIALISING);
+}
+
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
if (ret < 0) {
/* Do nothing, write notifier decided to fail this request */
} else if (flags & BDRV_REQ_ZERO_WRITE) {
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
} else {
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV);
ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
}
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
if (ret == 0 && !bs->enable_write_cache) {
ret = bdrv_co_flush(bs);
bdrv_set_dirty(bs, sector_num, nb_sectors);
- block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+ if (bs->wr_highest_offset < offset + bytes) {
+ bs->wr_highest_offset = offset + bytes;
+ }
if (ret >= 0) {
bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
return ret;
}
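+/* Zero-write slow path: read-modify-write any unaligned head and tail through
+ * a bounce buffer and submit the aligned middle as a real zero write. */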
+static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
+ int64_t offset,
+ unsigned int bytes,
+ BdrvRequestFlags flags,
+ BdrvTrackedRequest *req)
+{
+ uint8_t *buf = NULL;
+ QEMUIOVector local_qiov;
+ struct iovec iov;
+ uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+ unsigned int head_padding_bytes, tail_padding_bytes;
+ int ret = 0;
+
+ head_padding_bytes = offset & (align - 1);
+ tail_padding_bytes = align - ((offset + bytes) & (align - 1));
+
+ assert(flags & BDRV_REQ_ZERO_WRITE);
+ if (head_padding_bytes || tail_padding_bytes) {
+ buf = qemu_blockalign(bs, align);
+ iov = (struct iovec) {
+ .iov_base = buf,
+ .iov_len = align,
+ };
+ qemu_iovec_init_external(&local_qiov, &iov, 1);
+ }
+ if (head_padding_bytes) {
+ uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
+
+ /* RMW the unaligned part before head. */
+ mark_request_serialising(req, align);
+ wait_serialising_requests(req);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
+ ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
+ align, &local_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+
+ memset(buf + head_padding_bytes, 0, zero_bytes);
+ ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
+ &local_qiov,
+ flags & ~BDRV_REQ_ZERO_WRITE);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += zero_bytes;
+ bytes -= zero_bytes;
+ }
+
+ assert(!bytes || (offset & (align - 1)) == 0);
+ if (bytes >= align) {
+ /* Write the aligned part in the middle. */
+ uint64_t aligned_bytes = bytes & ~(align - 1);
+ ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
+ NULL, flags);
+ if (ret < 0) {
+ goto fail;
+ }
+ bytes -= aligned_bytes;
+ offset += aligned_bytes;
+ }
+
+ assert(!bytes || (offset & (align - 1)) == 0);
+ if (bytes) {
+ assert(align == tail_padding_bytes + bytes);
+ /* RMW the unaligned part after tail. */
+ mark_request_serialising(req, align);
+ wait_serialising_requests(req);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
+ ret = bdrv_aligned_preadv(bs, req, offset, align,
+ align, &local_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+
+ memset(buf, 0, bytes);
+ ret = bdrv_aligned_pwritev(bs, req, offset, align,
+ &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
+ }
+fail:
+ qemu_vfree(buf);
+ return ret;
+}
+
/*
* Handle a write request in coroutine context
*/
BdrvRequestFlags flags)
{
BdrvTrackedRequest req;
- uint64_t align = bdrv_get_align(bs);
+ /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+ uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
uint8_t *head_buf = NULL;
uint8_t *tail_buf = NULL;
QEMUIOVector local_qiov;
/* throttling disk I/O */
if (bs->io_limits_enabled) {
- bdrv_io_limits_intercept(bs, bytes, true);
+ throttle_group_co_io_limits_intercept(bs, bytes, true);
}
/*
* Pad qiov with the read parts and be sure to have a tracked request not
* only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
*/
- tracked_request_begin(&req, bs, offset, bytes, true);
+ tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
+
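+ /* A NULL qiov means this is a zero write; let the dedicated helper deal
+ * with alignment */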
+ if (!qiov) {
+ ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
+ goto out;
+ }
if (offset & (align - 1)) {
QEMUIOVector head_qiov;
};
qemu_iovec_init_external(&head_qiov, &head_iov, 1);
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
align, &head_qiov, 0);
if (ret < 0) {
goto fail;
}
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
qemu_iovec_init(&local_qiov, qiov->niov + 2);
qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
};
qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
align, &tail_qiov, 0);
if (ret < 0) {
goto fail;
}
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
if (!use_local_qiov) {
qemu_iovec_init(&local_qiov, qiov->niov + 1);
bytes = ROUND_UP(bytes, align);
}
- if (use_local_qiov) {
- /* Local buffer may have non-zero data. */
- flags &= ~BDRV_REQ_ZERO_WRITE;
- }
ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
use_local_qiov ? &local_qiov : qiov,
flags);
fail:
- tracked_request_end(&req);
if (use_local_qiov) {
qemu_iovec_destroy(&local_qiov);
}
qemu_vfree(head_buf);
qemu_vfree(tail_buf);
-
+out:
+ tracked_request_end(&req);
return ret;
}
int64_t sector_num, int nb_sectors,
BdrvRequestFlags flags)
{
- int ret;
-
trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
if (!(bs->open_flags & BDRV_O_UNMAP)) {
flags &= ~BDRV_REQ_MAY_UNMAP;
}
- if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
- nb_sectors << BDRV_SECTOR_BITS)) {
- ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
- BDRV_REQ_ZERO_WRITE | flags);
- } else {
- uint8_t *buf;
- QEMUIOVector local_qiov;
- size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
- buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
- memset(buf, 0, bytes);
- qemu_iovec_init(&local_qiov, 1);
- qemu_iovec_add(&local_qiov, buf, bytes);
-
- ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
- BDRV_REQ_ZERO_WRITE | flags);
- qemu_vfree(buf);
- }
- return ret;
+ return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
+ BDRV_REQ_ZERO_WRITE | flags);
}
int bdrv_flush_all(void)
if (ret & BDRV_BLOCK_RAW) {
assert(ret & BDRV_BLOCK_OFFSET_VALID);
- return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
*pnum, pnum);
}
if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
ret |= BDRV_BLOCK_ALLOCATED;
- }
-
- if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
+ } else {
if (bdrv_unallocated_blocks_are_zero(bs)) {
ret |= BDRV_BLOCK_ZERO;
- } else if (bs->backing_hd) {
- BlockDriverState *bs2 = bs->backing_hd;
+ } else if (bs->backing) {
+ BlockDriverState *bs2 = bs->backing->bs;
int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
ret |= BDRV_BLOCK_ZERO;
(ret & BDRV_BLOCK_OFFSET_VALID)) {
int file_pnum;
- ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
*pnum, &file_pnum);
if (ret2 >= 0) {
/* Ignore errors. This is just providing extra information, it
return ret;
}
-/* Coroutine wrapper for bdrv_get_block_status() */
-static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
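+/* Walk the backing chain from bs down to (but not including) base and return
+ * the status of the first layer where the queried sectors are allocated; the
+ * query range is narrowed to *pnum at each unallocated layer. */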
+static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors,
+ int *pnum)
+{
+ BlockDriverState *p;
+ int64_t ret = 0;
+
+ assert(bs != base);
+ for (p = bs; p != base; p = backing_bs(p)) {
+ ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
+ if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
+ break;
+ }
+ /* [sector_num, pnum] unallocated on this layer, which could be only
+ * the first part of [sector_num, nb_sectors]. */
+ nb_sectors = MIN(nb_sectors, *pnum);
+ }
+ return ret;
+}
+
+/* Coroutine wrapper for bdrv_get_block_status_above() */
+static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
{
BdrvCoGetBlockStatusData *data = opaque;
- BlockDriverState *bs = data->bs;
- data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
- data->pnum);
+ data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
+ data->sector_num,
+ data->nb_sectors,
+ data->pnum);
data->done = true;
}
/*
- * Synchronous wrapper around bdrv_co_get_block_status().
+ * Synchronous wrapper around bdrv_co_get_block_status_above().
*
- * See bdrv_co_get_block_status() for details.
+ * See bdrv_co_get_block_status_above() for details.
*/
-int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, int *pnum)
+int64_t bdrv_get_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
{
Coroutine *co;
BdrvCoGetBlockStatusData data = {
.bs = bs,
+ .base = base,
.sector_num = sector_num,
.nb_sectors = nb_sectors,
.pnum = pnum,
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
- bdrv_get_block_status_co_entry(&data);
+ bdrv_get_block_status_above_co_entry(&data);
} else {
AioContext *aio_context = bdrv_get_aio_context(bs);
- co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
+ co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
qemu_coroutine_enter(co, &data);
while (!data.done) {
aio_poll(aio_context, true);
return data.ret;
}
+int64_t bdrv_get_block_status(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ return bdrv_get_block_status_above(bs, backing_bs(bs),
+ sector_num, nb_sectors, pnum);
+}
+
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, int *pnum)
{
n = pnum_inter;
}
- intermediate = intermediate->backing_hd;
+ intermediate = backing_bs(intermediate);
}
*pnum = n;
} else if (drv->bdrv_save_vmstate) {
return drv->bdrv_save_vmstate(bs, qiov, pos);
} else if (bs->file) {
- return bdrv_writev_vmstate(bs->file, qiov, pos);
+ return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
}
return -ENOTSUP;
if (drv->bdrv_load_vmstate)
return drv->bdrv_load_vmstate(bs, buf, pos, size);
if (bs->file)
- return bdrv_load_vmstate(bs->file, buf, pos, size);
+ return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
return -ENOTSUP;
}
merge = 1;
}
- if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
+ if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
+ bs->bl.max_iov) {
merge = 0;
}
}
}
- block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+ if (bs->blk) {
+ block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
+ num_reqs - outidx - 1);
+ }
return outidx + 1;
}
{
BlockAIOCB *acb;
- acb = g_slice_alloc(aiocb_info->aiocb_size);
+ acb = g_malloc(aiocb_info->aiocb_size);
acb->aiocb_info = aiocb_info;
acb->bs = bs;
acb->cb = cb;
BlockAIOCB *acb = p;
assert(acb->refcnt > 0);
if (--acb->refcnt == 0) {
- g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+ g_free(acb);
}
}
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
int ret;
+ BdrvTrackedRequest req;
- if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+ if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
+ bdrv_is_sg(bs)) {
return 0;
}
+ tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
/* Write back cached data to the OS even with cache=unsafe */
BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
if (bs->drv->bdrv_co_flush_to_os) {
ret = bs->drv->bdrv_co_flush_to_os(bs);
if (ret < 0) {
- return ret;
+ goto out;
}
}
ret = 0;
}
if (ret < 0) {
- return ret;
+ goto out;
}
/* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
* in the case of cache=unsafe, so there are no useless flushes.
*/
flush_parent:
- return bdrv_co_flush(bs->file);
+ ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+out:
+ tracked_request_end(&req);
+ return ret;
}
int bdrv_flush(BlockDriverState *bs)
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
int nb_sectors)
{
+ BdrvTrackedRequest req;
int max_discard, ret;
if (!bs->drv) {
return -EPERM;
}
- bdrv_reset_dirty(bs, sector_num, nb_sectors);
-
/* Do nothing if disabled. */
if (!(bs->open_flags & BDRV_O_UNMAP)) {
return 0;
return 0;
}
+ tracked_request_begin(&req, bs, sector_num, nb_sectors,
+ BDRV_TRACKED_DISCARD);
+ bdrv_set_dirty(bs, sector_num, nb_sectors);
+
max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
while (nb_sectors > 0) {
int ret;
acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
bdrv_co_io_em_complete, &co);
if (acb == NULL) {
- return -EIO;
+ ret = -EIO;
+ goto out;
} else {
qemu_coroutine_yield();
ret = co.ret;
}
}
if (ret && ret != -ENOTSUP) {
- return ret;
+ goto out;
}
sector_num += num;
nb_sectors -= num;
}
- return 0;
+ ret = 0;
+out:
+ tracked_request_end(&req);
+ return ret;
}
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
return rwco.ret;
}
-/* needed for generic scsi interface */
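+/* Completes the ioctl coroutine with -ENOTSUP from a bottom half when the
+ * driver's .bdrv_aio_ioctl returns NULL. */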
+typedef struct {
+ CoroutineIOCompletion *co;
+ QEMUBH *bh;
+} BdrvIoctlCompletionData;
-int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+static void bdrv_ioctl_bh_cb(void *opaque)
+{
+ BdrvIoctlCompletionData *data = opaque;
+
+ bdrv_co_io_em_complete(data->co, -ENOTSUP);
+ qemu_bh_delete(data->bh);
+}
+
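+/* Coroutine ioctl implementation: tracked as a BDRV_TRACKED_IOCTL request and
+ * dispatched through the driver's .bdrv_aio_ioctl callback. */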
+static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
{
BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest tracked_req;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ BlockAIOCB *acb;
- if (drv && drv->bdrv_ioctl)
- return drv->bdrv_ioctl(bs, req, buf);
- return -ENOTSUP;
+ tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
+ if (!drv || !drv->bdrv_aio_ioctl) {
+ co.ret = -ENOTSUP;
+ goto out;
+ }
+
+ acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
+ if (!acb) {
+ BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
+ data->bh = aio_bh_new(bdrv_get_aio_context(bs),
+ bdrv_ioctl_bh_cb, data);
+ data->co = &co;
+ qemu_bh_schedule(data->bh);
+ }
+ qemu_coroutine_yield();
+out:
+ tracked_request_end(&tracked_req);
+ return co.ret;
+}
+
+typedef struct {
+ BlockDriverState *bs;
+ int req;
+ void *buf;
+ int ret;
+} BdrvIoctlCoData;
+
+static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
+{
+ BdrvIoctlCoData *data = opaque;
+ data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
+}
+
+/* needed for generic scsi interface */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ BdrvIoctlCoData data = {
+ .bs = bs,
+ .req = req,
+ .buf = buf,
+ .ret = -EINPROGRESS,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_co_ioctl_entry(&data);
+ } else {
+ Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
+
+ qemu_coroutine_enter(co, &data);
+ while (data.ret == -EINPROGRESS) {
+ aio_poll(bdrv_get_aio_context(bs), true);
+ }
+ }
+ return data.ret;
+}
+
+static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+ acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
+ acb->req.req, acb->req.buf);
+ bdrv_co_complete(acb);
}
BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
unsigned long int req, void *buf,
BlockCompletionFunc *cb, void *opaque)
{
- BlockDriver *drv = bs->drv;
+ BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
+ bs, cb, opaque);
+ Coroutine *co;
+
+ acb->need_bh = true;
+ acb->req.error = -EINPROGRESS;
+ acb->req.req = req;
+ acb->req.buf = buf;
+ co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
+ qemu_coroutine_enter(co, acb);
- if (drv && drv->bdrv_aio_ioctl)
- return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
- return NULL;
+ bdrv_co_maybe_schedule_bh(acb);
+ return &acb->common;
}
void *qemu_blockalign(BlockDriverState *bs, size_t size)
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
int i;
- size_t alignment = bdrv_opt_mem_align(bs);
+ size_t alignment = bdrv_min_mem_align(bs);
for (i = 0; i < qiov->niov; i++) {
if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
if (drv && drv->bdrv_io_plug) {
drv->bdrv_io_plug(bs);
} else if (bs->file) {
- bdrv_io_plug(bs->file);
+ bdrv_io_plug(bs->file->bs);
}
}
if (drv && drv->bdrv_io_unplug) {
drv->bdrv_io_unplug(bs);
} else if (bs->file) {
- bdrv_io_unplug(bs->file);
+ bdrv_io_unplug(bs->file->bs);
}
}
if (drv && drv->bdrv_flush_io_queue) {
drv->bdrv_flush_io_queue(bs);
} else if (bs->file) {
- bdrv_flush_io_queue(bs->file);
+ bdrv_flush_io_queue(bs->file->bs);
+ }
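+ /* also restart any requests that were queued by I/O throttling */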
+ bdrv_start_throttled_reqs(bs);
+}
+
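+/* Begin a drained section: disable external event sources for this
+ * BlockDriverState's AioContext and drain its pending requests. Sections
+ * nest via bs->quiesce_counter and must be paired with bdrv_drained_end(). */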
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+ if (!bs->quiesce_counter++) {
+ aio_disable_external(bdrv_get_aio_context(bs));
+ }
+ bdrv_drain(bs);
+}
+
+void bdrv_drained_end(BlockDriverState *bs)
+{
+ assert(bs->quiesce_counter > 0);
+ if (--bs->quiesce_counter > 0) {
+ return;
}
+ aio_enable_external(bdrv_get_aio_context(bs));
}