*/
#include "trace.h"
-#include "sysemu/qtest.h"
#include "block/blockjob.h"
#include "block/block_int.h"
+#include "block/throttle-groups.h"
+#include "qemu/error-report.h"
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
{
int i;
- throttle_config(&bs->throttle_state, cfg);
+ throttle_group_config(bs, cfg);
for (i = 0; i < 2; i++) {
qemu_co_enter_next(&bs->throttled_reqs[i]);
void bdrv_io_limits_disable(BlockDriverState *bs)
{
bs->io_limits_enabled = false;
-
bdrv_start_throttled_reqs(bs);
-
- throttle_destroy(&bs->throttle_state);
-}
-
-static void bdrv_throttle_read_timer_cb(void *opaque)
-{
- BlockDriverState *bs = opaque;
- qemu_co_enter_next(&bs->throttled_reqs[0]);
-}
-
-static void bdrv_throttle_write_timer_cb(void *opaque)
-{
- BlockDriverState *bs = opaque;
- qemu_co_enter_next(&bs->throttled_reqs[1]);
+ throttle_group_unregister_bs(bs);
}
/* should be called before bdrv_set_io_limits if a limit is set */
-void bdrv_io_limits_enable(BlockDriverState *bs)
+void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
{
- int clock_type = QEMU_CLOCK_REALTIME;
-
- if (qtest_enabled()) {
- /* For testing block IO throttling only */
- clock_type = QEMU_CLOCK_VIRTUAL;
- }
assert(!bs->io_limits_enabled);
- throttle_init(&bs->throttle_state,
- bdrv_get_aio_context(bs),
- clock_type,
- bdrv_throttle_read_timer_cb,
- bdrv_throttle_write_timer_cb,
- bs);
+ throttle_group_register_bs(bs, group);
bs->io_limits_enabled = true;
}
-/* This function makes an IO wait if needed
- *
- * @nb_sectors: the number of sectors of the IO
- * @is_write: is the IO a write
- */
-static void bdrv_io_limits_intercept(BlockDriverState *bs,
- unsigned int bytes,
- bool is_write)
+void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
{
- /* does this io must wait */
- bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
-
- /* if must wait or any request of this type throttled queue the IO */
- if (must_wait ||
- !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
- qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+ /* this bs is not part of any group */
+ if (!bs->throttle_state) {
+ return;
}
- /* the IO will be executed, do the accounting */
- throttle_account(&bs->throttle_state, is_write, bytes);
-
-
- /* if the next request must wait -> do nothing */
- if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+    /* this bs is already part of the group we want */
+ if (!g_strcmp0(throttle_group_get_name(bs), group)) {
return;
}
- /* else queue next request for execution */
- qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+    /* need to change the group this bs belongs to */
+ bdrv_io_limits_disable(bs);
+ bdrv_io_limits_enable(bs, group);
}
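For context, a rough caller-side sketch of the new group-based throttling API used above (the group names and the bytes-per-second figure are made-up examples, not part of this patch):

    ThrottleConfig cfg = { 0 };

    cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024;   /* hypothetical 1 MB/s cap */

    bdrv_io_limits_enable(bs, "limits0");   /* join (or create) the throttle group */
    bdrv_set_io_limits(bs, &cfg);           /* now forwarded to throttle_group_config() */

    /* later: move this drive into a different group */
    bdrv_io_limits_update_group(bs, "limits1");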
void bdrv_setup_io_funcs(BlockDriver *bdrv)
return false;
}
-static bool bdrv_drain_one(BlockDriverState *bs)
-{
- bool bs_busy;
-
- bdrv_flush_io_queue(bs);
- bdrv_start_throttled_reqs(bs);
- bs_busy = bdrv_requests_pending(bs);
- bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
- return bs_busy;
-}
-
/*
* Wait for pending requests to complete on a single BlockDriverState subtree
*
- * See the warning in bdrv_drain_all(). This function can only be called if
- * you are sure nothing can generate I/O because you have op blockers
- * installed.
- *
* Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
* AioContext.
+ *
+ * Only this BlockDriverState's AioContext is run, so in-flight requests must
+ * not depend on events in other AioContexts. In that case, use
+ * bdrv_drain_all() instead.
*/
void bdrv_drain(BlockDriverState *bs)
{
- while (bdrv_drain_one(bs)) {
+ bool busy = true;
+
+ while (busy) {
/* Keep iterating */
+ bdrv_flush_io_queue(bs);
+ busy = bdrv_requests_pending(bs);
+ busy |= aio_poll(bdrv_get_aio_context(bs), busy);
}
}
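As the updated comment notes, only this BlockDriverState's AioContext is run here and the caller must hold it. A minimal usage sketch, assuming the caller already guarantees that no new I/O can be submitted:

    AioContext *ctx = bdrv_get_aio_context(bs);

    aio_context_acquire(ctx);
    bdrv_drain(bs);             /* iterate until no request is pending on bs */
    aio_context_release(ctx);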
*
* This function does not flush data to disk, use bdrv_flush_all() for that
* after calling this function.
- *
- * Note that completion of an asynchronous I/O operation can trigger any
- * number of other I/O operations on other devices---for example a coroutine
- * can be arbitrarily complex and a constant flow of I/O can come until the
- * coroutine is complete. Because of this, it is not possible to have a
- * function to drain a single device's I/O queue.
*/
void bdrv_drain_all(void)
{
/* Always run first iteration so any pending completion BHs run */
bool busy = true;
BlockDriverState *bs = NULL;
+ GSList *aio_ctxs = NULL, *ctx;
while ((bs = bdrv_next(bs))) {
AioContext *aio_context = bdrv_get_aio_context(bs);
block_job_pause(bs->job);
}
aio_context_release(aio_context);
+
+ if (!g_slist_find(aio_ctxs, aio_context)) {
+ aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
+ }
}
+ /* Note that completion of an asynchronous I/O operation can trigger any
+ * number of other I/O operations on other devices---for example a
+ * coroutine can submit an I/O request to another device in response to
+     * request completion. Therefore we must keep looping until there is no
+ * more activity rather than simply draining each device independently.
+ */
while (busy) {
busy = false;
- bs = NULL;
- while ((bs = bdrv_next(bs))) {
- AioContext *aio_context = bdrv_get_aio_context(bs);
+ for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
+ AioContext *aio_context = ctx->data;
+ bs = NULL;
aio_context_acquire(aio_context);
- busy |= bdrv_drain_one(bs);
+ while ((bs = bdrv_next(bs))) {
+ if (aio_context == bdrv_get_aio_context(bs)) {
+ bdrv_flush_io_queue(bs);
+ if (bdrv_requests_pending(bs)) {
+ busy = true;
+ aio_poll(aio_context, busy);
+ }
+ }
+ }
+ busy |= aio_poll(aio_context, false);
aio_context_release(aio_context);
}
}
}
aio_context_release(aio_context);
}
+ g_slist_free(aio_ctxs);
}
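Per the function comment, bdrv_drain_all() only waits for in-flight requests and does not flush data to disk; a sketch of the usual quiesce-then-flush sequence:

    int ret;

    bdrv_drain_all();           /* wait for requests on every device/AioContext */
    ret = bdrv_flush_all();     /* then write out dirty data; 0 or -errno */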
/**
return ret;
}
- if (bs->copy_on_read) {
+    /* Don't do copy-on-read if the caller disabled it, e.g. when reading data
+     * that is about to be overwritten by a write */
+ if (bs->copy_on_read && !(flags & BDRV_REQ_NO_COPY_ON_READ)) {
flags |= BDRV_REQ_COPY_ON_READ;
}
/* throttling disk I/O */
if (bs->io_limits_enabled) {
- bdrv_io_limits_intercept(bs, bytes, false);
+ throttle_group_co_io_limits_intercept(bs, bytes, false);
}
/* Align read if necessary by padding qiov */
return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}
+int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+ BDRV_REQ_NO_COPY_ON_READ);
+}
+
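A sketch of calling the new no-copy-on-read variant from coroutine context, for a read whose data will shortly be overwritten (buf, sector_num and nb_sectors are placeholders):

    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
    };
    QEMUIOVector qiov;
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    /* read without populating the top image via copy-on-read */
    ret = bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors, &qiov);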
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
/* throttling disk I/O */
if (bs->io_limits_enabled) {
- bdrv_io_limits_intercept(bs, bytes, true);
+ throttle_group_co_io_limits_intercept(bs, bytes, true);
}
/*
return ret;
}
-/* Coroutine wrapper for bdrv_get_block_status() */
-static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
+static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors,
+ int *pnum)
+{
+ BlockDriverState *p;
+ int64_t ret = 0;
+
+ assert(bs != base);
+ for (p = bs; p != base; p = p->backing_hd) {
+ ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
+ if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
+ break;
+ }
+ /* [sector_num, pnum] unallocated on this layer, which could be only
+ * the first part of [sector_num, nb_sectors]. */
+ nb_sectors = MIN(nb_sectors, *pnum);
+ }
+ return ret;
+}
+
+/* Coroutine wrapper for bdrv_get_block_status_above() */
+static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
{
BdrvCoGetBlockStatusData *data = opaque;
- BlockDriverState *bs = data->bs;
- data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
- data->pnum);
+ data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
+ data->sector_num,
+ data->nb_sectors,
+ data->pnum);
data->done = true;
}
/*
- * Synchronous wrapper around bdrv_co_get_block_status().
+ * Synchronous wrapper around bdrv_co_get_block_status_above().
*
- * See bdrv_co_get_block_status() for details.
+ * See bdrv_co_get_block_status_above() for details.
*/
-int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, int *pnum)
+int64_t bdrv_get_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
{
Coroutine *co;
BdrvCoGetBlockStatusData data = {
.bs = bs,
+ .base = base,
.sector_num = sector_num,
.nb_sectors = nb_sectors,
.pnum = pnum,
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
- bdrv_get_block_status_co_entry(&data);
+ bdrv_get_block_status_above_co_entry(&data);
} else {
AioContext *aio_context = bdrv_get_aio_context(bs);
- co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
+ co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
qemu_coroutine_enter(co, &data);
while (!data.done) {
aio_poll(aio_context, true);
return data.ret;
}
+int64_t bdrv_get_block_status(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ return bdrv_get_block_status_above(bs, bs->backing_hd,
+ sector_num, nb_sectors, pnum);
+}
+
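A sketch of querying allocation over part of a backing chain with the new *_above variant (bs, base and the sector range are placeholders; base itself is excluded from the search):

    int pnum;
    int64_t ret;

    ret = bdrv_get_block_status_above(bs, base, sector_num, nb_sectors, &pnum);
    if (ret >= 0 && (ret & BDRV_BLOCK_ALLOCATED)) {
        /* the first pnum sectors are allocated somewhere in [bs, base) */
    }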
int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, int *pnum)
{
{
BlockAIOCB *acb;
- acb = g_slice_alloc(aiocb_info->aiocb_size);
+ acb = g_malloc(aiocb_info->aiocb_size);
acb->aiocb_info = aiocb_info;
acb->bs = bs;
acb->cb = cb;
BlockAIOCB *acb = p;
assert(acb->refcnt > 0);
if (--acb->refcnt == 0) {
- g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+ g_free(acb);
}
}
{
int ret;
- if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+ if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
+ bdrv_is_sg(bs)) {
return 0;
}
return -EPERM;
}
- bdrv_reset_dirty(bs, sector_num, nb_sectors);
-
/* Do nothing if disabled. */
if (!(bs->open_flags & BDRV_O_UNMAP)) {
return 0;
return 0;
}
+ bdrv_set_dirty(bs, sector_num, nb_sectors);
+
max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
while (nb_sectors > 0) {
int ret;
} else if (bs->file) {
bdrv_flush_io_queue(bs->file);
}
+ bdrv_start_throttled_reqs(bs);
}