* THE SOFTWARE.
*/
+#include "qemu/osdep.h"
#include "trace.h"
+#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "block/throttle-groups.h"
/* Take some limits from the children as a default */
if (bs->file) {
- bdrv_refresh_limits(bs->file, &local_err);
+ bdrv_refresh_limits(bs->file->bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
- bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
- bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
- bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment;
- bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
+ bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
+ bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
+ bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
+ bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
+ bs->bl.max_iov = bs->file->bs->bl.max_iov;
} else {
bs->bl.min_mem_alignment = 512;
bs->bl.opt_mem_alignment = getpagesize();
+
+ /* Safe default since most protocols use readv()/writev()/etc */
+ bs->bl.max_iov = IOV_MAX;
}
- if (bs->backing_hd) {
- bdrv_refresh_limits(bs->backing_hd, &local_err);
+ if (bs->backing) {
+ bdrv_refresh_limits(bs->backing->bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
bs->bl.opt_transfer_length =
MAX(bs->bl.opt_transfer_length,
- bs->backing_hd->bl.opt_transfer_length);
+ bs->backing->bs->bl.opt_transfer_length);
bs->bl.max_transfer_length =
MIN_NON_ZERO(bs->bl.max_transfer_length,
- bs->backing_hd->bl.max_transfer_length);
+ bs->backing->bs->bl.max_transfer_length);
bs->bl.opt_mem_alignment =
MAX(bs->bl.opt_mem_alignment,
- bs->backing_hd->bl.opt_mem_alignment);
+ bs->backing->bs->bl.opt_mem_alignment);
bs->bl.min_mem_alignment =
MAX(bs->bl.min_mem_alignment,
- bs->backing_hd->bl.min_mem_alignment);
+ bs->backing->bs->bl.min_mem_alignment);
+ bs->bl.max_iov =
+ MIN(bs->bl.max_iov,
+ bs->backing->bs->bl.max_iov);
}
/* Then let the driver override it */
}
/* Check if any requests are in-flight (including throttled requests) */
-static bool bdrv_requests_pending(BlockDriverState *bs)
+bool bdrv_requests_pending(BlockDriverState *bs)
{
+ BdrvChild *child;
+
if (!QLIST_EMPTY(&bs->tracked_requests)) {
return true;
}
if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
return true;
}
- if (bs->file && bdrv_requests_pending(bs->file)) {
- return true;
- }
- if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
- return true;
+
+ QLIST_FOREACH(child, &bs->children, next) {
+ if (bdrv_requests_pending(child->bs)) {
+ return true;
+ }
}
+
return false;
}
+static void bdrv_drain_recurse(BlockDriverState *bs)
+{
+ BdrvChild *child;
+
+ if (bs->drv && bs->drv->bdrv_drain) {
+ bs->drv->bdrv_drain(bs);
+ }
+ QLIST_FOREACH(child, &bs->children, next) {
+ bdrv_drain_recurse(child->bs);
+ }
+}
+
/*
- * Wait for pending requests to complete on a single BlockDriverState subtree
- *
- * See the warning in bdrv_drain_all(). This function can only be called if
- * you are sure nothing can generate I/O because you have op blockers
- * installed.
+ * Wait for pending requests to complete on a single BlockDriverState subtree,
+ * and suspend block driver's internal I/O until next request arrives.
*
* Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
* AioContext.
+ *
+ * Only this BlockDriverState's AioContext is run, so in-flight requests must
+ * not depend on events in other AioContexts. In that case, use
+ * bdrv_drain_all() instead.
*/
void bdrv_drain(BlockDriverState *bs)
{
bool busy = true;
+ bdrv_drain_recurse(bs);
while (busy) {
/* Keep iterating */
bdrv_flush_io_queue(bs);
*
* This function does not flush data to disk, use bdrv_flush_all() for that
* after calling this function.
- *
- * Note that completion of an asynchronous I/O operation can trigger any
- * number of other I/O operations on other devices---for example a coroutine
- * can be arbitrarily complex and a constant flow of I/O can come until the
- * coroutine is complete. Because of this, it is not possible to have a
- * function to drain a single device's I/O queue.
*/
void bdrv_drain_all(void)
{
}
}
+ /* Note that completion of an asynchronous I/O operation can trigger any
+ * number of other I/O operations on other devices---for example a
+ * coroutine can submit an I/O request to another device in response to
+ * request completion. Therefore we must keep looping until there was no
+ * more activity rather than simply draining each device independently.
+ */
while (busy) {
busy = false;
static void tracked_request_begin(BdrvTrackedRequest *req,
BlockDriverState *bs,
int64_t offset,
- unsigned int bytes, bool is_write)
+ unsigned int bytes,
+ enum BdrvTrackedRequestType type)
{
*req = (BdrvTrackedRequest){
.bs = bs,
.offset = offset,
.bytes = bytes,
- .is_write = is_write,
+ .type = type,
.co = qemu_coroutine_self(),
.serialising = false,
.overlap_offset = offset,
mark_request_serialising(req, bdrv_get_cluster_size(bs));
}
- wait_serialising_requests(req);
+ if (!(flags & BDRV_REQ_NO_SERIALISING)) {
+ wait_serialising_requests(req);
+ }
if (flags & BDRV_REQ_COPY_ON_READ) {
int pnum;
return ret;
}
- if (bs->copy_on_read) {
+ /* Don't do copy-on-read if we read data before write operation */
+ if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
flags |= BDRV_REQ_COPY_ON_READ;
}
bytes = ROUND_UP(bytes, align);
}
- tracked_request_begin(&req, bs, offset, bytes, false);
+ tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
use_local_qiov ? &local_qiov : qiov,
flags);
return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}
+int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+ BDRV_REQ_NO_SERIALISING);
+}
+
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
if (ret < 0) {
/* Do nothing, write notifier decided to fail this request */
} else if (flags & BDRV_REQ_ZERO_WRITE) {
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
} else {
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV);
ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
}
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
if (ret == 0 && !bs->enable_write_cache) {
ret = bdrv_co_flush(bs);
bdrv_set_dirty(bs, sector_num, nb_sectors);
- block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+ if (bs->wr_highest_offset < offset + bytes) {
+ bs->wr_highest_offset = offset + bytes;
+ }
if (ret >= 0) {
bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
/* RMW the unaligned part before head. */
mark_request_serialising(req, align);
wait_serialising_requests(req);
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
align, &local_qiov, 0);
if (ret < 0) {
goto fail;
}
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
memset(buf + head_padding_bytes, 0, zero_bytes);
ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
/* RMW the unaligned part after tail. */
mark_request_serialising(req, align);
wait_serialising_requests(req);
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
ret = bdrv_aligned_preadv(bs, req, offset, align,
align, &local_qiov, 0);
if (ret < 0) {
goto fail;
}
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
memset(buf, 0, bytes);
ret = bdrv_aligned_pwritev(bs, req, offset, align,
if (bs->read_only) {
return -EPERM;
}
+ assert(!(bs->open_flags & BDRV_O_INACTIVE));
ret = bdrv_check_byte_request(bs, offset, bytes);
if (ret < 0) {
* Pad qiov with the read parts and be sure to have a tracked request not
* only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
*/
- tracked_request_begin(&req, bs, offset, bytes, true);
+ tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
if (!qiov) {
ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
};
qemu_iovec_init_external(&head_qiov, &head_iov, 1);
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
align, &head_qiov, 0);
if (ret < 0) {
goto fail;
}
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
qemu_iovec_init(&local_qiov, qiov->niov + 2);
qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
};
qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
align, &tail_qiov, 0);
if (ret < 0) {
goto fail;
}
- BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+ bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
if (!use_local_qiov) {
qemu_iovec_init(&local_qiov, qiov->niov + 1);
if (ret & BDRV_BLOCK_RAW) {
assert(ret & BDRV_BLOCK_OFFSET_VALID);
- return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
*pnum, pnum);
}
} else {
if (bdrv_unallocated_blocks_are_zero(bs)) {
ret |= BDRV_BLOCK_ZERO;
- } else if (bs->backing_hd) {
- BlockDriverState *bs2 = bs->backing_hd;
+ } else if (bs->backing) {
+ BlockDriverState *bs2 = bs->backing->bs;
int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
ret |= BDRV_BLOCK_ZERO;
(ret & BDRV_BLOCK_OFFSET_VALID)) {
int file_pnum;
- ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
*pnum, &file_pnum);
if (ret2 >= 0) {
/* Ignore errors. This is just providing extra information, it
int64_t ret = 0;
assert(bs != base);
- for (p = bs; p != base; p = p->backing_hd) {
+ for (p = bs; p != base; p = backing_bs(p)) {
ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
break;
int64_t sector_num,
int nb_sectors, int *pnum)
{
- return bdrv_get_block_status_above(bs, bs->backing_hd,
+ return bdrv_get_block_status_above(bs, backing_bs(bs),
sector_num, nb_sectors, pnum);
}
n = pnum_inter;
}
- intermediate = intermediate->backing_hd;
+ intermediate = backing_bs(intermediate);
}
*pnum = n;
} else if (drv->bdrv_save_vmstate) {
return drv->bdrv_save_vmstate(bs, qiov, pos);
} else if (bs->file) {
- return bdrv_writev_vmstate(bs->file, qiov, pos);
+ return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
}
return -ENOTSUP;
if (drv->bdrv_load_vmstate)
return drv->bdrv_load_vmstate(bs, buf, pos, size);
if (bs->file)
- return bdrv_load_vmstate(bs->file, buf, pos, size);
+ return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
return -ENOTSUP;
}
merge = 1;
}
- if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
+ if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
+ bs->bl.max_iov) {
merge = 0;
}
}
}
- block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+ if (bs->blk) {
+ block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
+ num_reqs - outidx - 1);
+ }
return outidx + 1;
}
{
BlockAIOCB *acb;
- acb = g_slice_alloc(aiocb_info->aiocb_size);
+ acb = g_malloc(aiocb_info->aiocb_size);
acb->aiocb_info = aiocb_info;
acb->bs = bs;
acb->cb = cb;
BlockAIOCB *acb = p;
assert(acb->refcnt > 0);
if (--acb->refcnt == 0) {
- g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+ g_free(acb);
}
}
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
int ret;
+ BdrvTrackedRequest req;
if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
bdrv_is_sg(bs)) {
return 0;
}
+ tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
/* Write back cached data to the OS even with cache=unsafe */
BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
if (bs->drv->bdrv_co_flush_to_os) {
ret = bs->drv->bdrv_co_flush_to_os(bs);
if (ret < 0) {
- return ret;
+ goto out;
}
}
ret = 0;
}
if (ret < 0) {
- return ret;
+ goto out;
}
/* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
* in the case of cache=unsafe, so there are no useless flushes.
*/
flush_parent:
- return bdrv_co_flush(bs->file);
+ ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+out:
+ tracked_request_end(&req);
+ return ret;
}
int bdrv_flush(BlockDriverState *bs)
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
int nb_sectors)
{
+ BdrvTrackedRequest req;
int max_discard, ret;
if (!bs->drv) {
} else if (bs->read_only) {
return -EPERM;
}
+ assert(!(bs->open_flags & BDRV_O_INACTIVE));
/* Do nothing if disabled. */
if (!(bs->open_flags & BDRV_O_UNMAP)) {
return 0;
}
+ tracked_request_begin(&req, bs, sector_num, nb_sectors,
+ BDRV_TRACKED_DISCARD);
bdrv_set_dirty(bs, sector_num, nb_sectors);
max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
bdrv_co_io_em_complete, &co);
if (acb == NULL) {
- return -EIO;
+ ret = -EIO;
+ goto out;
} else {
qemu_coroutine_yield();
ret = co.ret;
}
}
if (ret && ret != -ENOTSUP) {
- return ret;
+ goto out;
}
sector_num += num;
nb_sectors -= num;
}
- return 0;
+ ret = 0;
+out:
+ tracked_request_end(&req);
+ return ret;
}
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
return rwco.ret;
}
-/* needed for generic scsi interface */
+typedef struct {
+ CoroutineIOCompletion *co;
+ QEMUBH *bh;
+} BdrvIoctlCompletionData;
-int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+static void bdrv_ioctl_bh_cb(void *opaque)
+{
+ BdrvIoctlCompletionData *data = opaque;
+
+ bdrv_co_io_em_complete(data->co, -ENOTSUP);
+ qemu_bh_delete(data->bh);
+}
+
+static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
{
BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest tracked_req;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ BlockAIOCB *acb;
- if (drv && drv->bdrv_ioctl)
- return drv->bdrv_ioctl(bs, req, buf);
- return -ENOTSUP;
+ tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
+ if (!drv || !drv->bdrv_aio_ioctl) {
+ co.ret = -ENOTSUP;
+ goto out;
+ }
+
+ acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
+ if (!acb) {
+ BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
+ data->bh = aio_bh_new(bdrv_get_aio_context(bs),
+ bdrv_ioctl_bh_cb, data);
+ data->co = &co;
+ qemu_bh_schedule(data->bh);
+ }
+ qemu_coroutine_yield();
+out:
+ tracked_request_end(&tracked_req);
+ return co.ret;
+}
+
+typedef struct {
+ BlockDriverState *bs;
+ int req;
+ void *buf;
+ int ret;
+} BdrvIoctlCoData;
+
+static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
+{
+ BdrvIoctlCoData *data = opaque;
+ data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
+}
+
+/* needed for generic scsi interface */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ BdrvIoctlCoData data = {
+ .bs = bs,
+ .req = req,
+ .buf = buf,
+ .ret = -EINPROGRESS,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_co_ioctl_entry(&data);
+ } else {
+ Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
+
+ qemu_coroutine_enter(co, &data);
+ while (data.ret == -EINPROGRESS) {
+ aio_poll(bdrv_get_aio_context(bs), true);
+ }
+ }
+ return data.ret;
+}
+
+static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+ acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
+ acb->req.req, acb->req.buf);
+ bdrv_co_complete(acb);
}
BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
unsigned long int req, void *buf,
BlockCompletionFunc *cb, void *opaque)
{
- BlockDriver *drv = bs->drv;
+ BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
+ bs, cb, opaque);
+ Coroutine *co;
+
+ acb->need_bh = true;
+ acb->req.error = -EINPROGRESS;
+ acb->req.req = req;
+ acb->req.buf = buf;
+ co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
+ qemu_coroutine_enter(co, acb);
- if (drv && drv->bdrv_aio_ioctl)
- return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
- return NULL;
+ bdrv_co_maybe_schedule_bh(acb);
+ return &acb->common;
}
void *qemu_blockalign(BlockDriverState *bs, size_t size)
if (drv && drv->bdrv_io_plug) {
drv->bdrv_io_plug(bs);
} else if (bs->file) {
- bdrv_io_plug(bs->file);
+ bdrv_io_plug(bs->file->bs);
}
}
if (drv && drv->bdrv_io_unplug) {
drv->bdrv_io_unplug(bs);
} else if (bs->file) {
- bdrv_io_unplug(bs->file);
+ bdrv_io_unplug(bs->file->bs);
}
}
if (drv && drv->bdrv_flush_io_queue) {
drv->bdrv_flush_io_queue(bs);
} else if (bs->file) {
- bdrv_flush_io_queue(bs->file);
+ bdrv_flush_io_queue(bs->file->bs);
}
bdrv_start_throttled_reqs(bs);
}
+
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+ if (!bs->quiesce_counter++) {
+ aio_disable_external(bdrv_get_aio_context(bs));
+ }
+ bdrv_drain(bs);
+}
+
+void bdrv_drained_end(BlockDriverState *bs)
+{
+ assert(bs->quiesce_counter > 0);
+ if (--bs->quiesce_counter > 0) {
+ return;
+ }
+ aio_enable_external(bdrv_get_aio_context(bs));
+}