X-Git-Url: https://repo.jachan.dev/qemu.git/blobdiff_plain/39ba59c21bdc3b1dda09219919b87fe001e7d5d9..fc63dcff46a2979ad998ff67c982f69d49691b7d:/block.c diff --git a/block.c b/block.c index e3fe97f275..ae297bb847 100644 --- a/block.c +++ b/block.c @@ -27,8 +27,10 @@ #include "monitor.h" #include "block_int.h" #include "module.h" -#include "qemu-objects.h" +#include "qjson.h" #include "qemu-coroutine.h" +#include "qmp-commands.h" +#include "qemu-timer.h" #ifdef CONFIG_BSD #include @@ -44,6 +46,13 @@ #include #endif +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +typedef enum { + BDRV_REQ_COPY_ON_READ = 0x1, + BDRV_REQ_ZERO_WRITE = 0x2, +} BdrvRequestFlags; + static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, @@ -51,27 +60,33 @@ static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque); -static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors); -static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors); -static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque); static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); -static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs); +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); + +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, int64_t *wait); static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -105,6 +120,79 @@ int is_windows_drive(const char *filename) } #endif +/* throttling disk I/O limits */ +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + + while (qemu_co_queue_next(&bs->throttled_reqs)); + + if (bs->block_timer) { + qemu_del_timer(bs->block_timer); + 
qemu_free_timer(bs->block_timer); + bs->block_timer = NULL; + } + + bs->slice_start = 0; + bs->slice_end = 0; + bs->slice_time = 0; + memset(&bs->io_base, 0, sizeof(bs->io_base)); +} + +static void bdrv_block_timer(void *opaque) +{ + BlockDriverState *bs = opaque; + + qemu_co_queue_next(&bs->throttled_reqs); +} + +void bdrv_io_limits_enable(BlockDriverState *bs) +{ + qemu_co_queue_init(&bs->throttled_reqs); + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); + bs->slice_time = 5 * BLOCK_IO_SLICE_TIME; + bs->slice_start = qemu_get_clock_ns(vm_clock); + bs->slice_end = bs->slice_start + bs->slice_time; + memset(&bs->io_base, 0, sizeof(bs->io_base)); + bs->io_limits_enabled = true; +} + +bool bdrv_io_limits_enabled(BlockDriverState *bs) +{ + BlockIOLimit *io_limits = &bs->io_limits; + return io_limits->bps[BLOCK_IO_LIMIT_READ] + || io_limits->bps[BLOCK_IO_LIMIT_WRITE] + || io_limits->bps[BLOCK_IO_LIMIT_TOTAL] + || io_limits->iops[BLOCK_IO_LIMIT_READ] + || io_limits->iops[BLOCK_IO_LIMIT_WRITE] + || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]; +} + +static void bdrv_io_limits_intercept(BlockDriverState *bs, + bool is_write, int nb_sectors) +{ + int64_t wait_time = -1; + + if (!qemu_co_queue_empty(&bs->throttled_reqs)) { + qemu_co_queue_wait(&bs->throttled_reqs); + } + + /* In fact, we hope to keep each request's timing, in FIFO mode. The next + * throttled requests will not be dequeued until the current request is + * allowed to be serviced. So if the current request still exceeds the + * limits, it will be inserted to the head. All requests followed it will + * be still in throttled_reqs queue. + */ + + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) { + qemu_mod_timer(bs->block_timer, + wait_time + qemu_get_clock_ns(vm_clock)); + qemu_co_queue_wait_insert_head(&bs->throttled_reqs); + } + + qemu_co_queue_next(&bs->throttled_reqs); +} + /* check if the path starts with ":" */ static int path_has_protocol(const char *path) { @@ -184,30 +272,21 @@ void path_combine(char *dest, int dest_size, void bdrv_register(BlockDriver *bdrv) { - if (bdrv->bdrv_co_readv) { - /* Emulate AIO by coroutines, and sync by AIO */ - bdrv->bdrv_aio_readv = bdrv_co_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_co_aio_writev_em; - bdrv->bdrv_read = bdrv_read_em; - bdrv->bdrv_write = bdrv_write_em; - } else { + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { bdrv->bdrv_co_readv = bdrv_co_readv_em; bdrv->bdrv_co_writev = bdrv_co_writev_em; + /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. + */ if (!bdrv->bdrv_aio_readv) { /* add AIO emulation layer */ bdrv->bdrv_aio_readv = bdrv_aio_readv_em; bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } else if (!bdrv->bdrv_read) { - /* add synchronous IO emulation layer */ - bdrv->bdrv_read = bdrv_read_em; - bdrv->bdrv_write = bdrv_write_em; } } - if (!bdrv->bdrv_aio_flush) - bdrv->bdrv_aio_flush = bdrv_aio_flush_em; - QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list); } @@ -221,6 +300,7 @@ BlockDriverState *bdrv_new(const char *device_name) if (device_name[0] != '\0') { QTAILQ_INSERT_TAIL(&bdrv_states, bs, list); } + bdrv_iostatus_disable(bs); return bs; } @@ -465,6 +545,22 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) return 0; } +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. 
+ * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + /* * Common part for opening disk images and files */ @@ -475,14 +571,24 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename, assert(drv != NULL); + trace_bdrv_open_common(bs, filename, flags, drv->format_name); + bs->file = NULL; bs->total_sectors = 0; bs->encrypted = 0; bs->valid_key = 0; + bs->sg = 0; bs->open_flags = flags; + bs->growable = 0; bs->buffer_alignment = 512; + assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ + if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) { + bdrv_enable_copy_on_read(bs); + } + pstrcpy(bs->filename, sizeof(bs->filename), filename); + bs->backing_file[0] = '\0'; if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) { return -ENOTSUP; @@ -491,8 +597,7 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename, bs->drv = drv; bs->opaque = g_malloc0(drv->instance_size); - if (flags & BDRV_O_CACHE_WB) - bs->enable_write_cache = 1; + bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB); /* * Clear flags that are internal to the block layer before opening the @@ -507,6 +612,8 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename, open_flags |= BDRV_O_RDWR; } + bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR); + /* Open the image, either directly or using a protocol */ if (drv->bdrv_file_open) { ret = drv->bdrv_file_open(bs, filename, open_flags); @@ -521,8 +628,6 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename, goto free_and_fail; } - bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR); - ret = refresh_total_sectors(bs, bs->total_sectors); if (ret < 0) { goto free_and_fail; @@ -578,6 +683,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, BlockDriver *drv) { int ret; + char tmp_filename[PATH_MAX]; if (flags & BDRV_O_SNAPSHOT) { BlockDriverState *bs1; @@ -585,7 +691,6 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, int is_protocol = 0; BlockDriver *bdrv_qcow2; QEMUOptionParameter *options; - char tmp_filename[PATH_MAX]; char backing_filename[PATH_MAX]; /* if snapshot, we create a temporary backing file and open it @@ -691,6 +796,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, bdrv_dev_change_media_cb(bs, true); } + /* throttling disk I/O limits */ + if (bs->io_limits_enabled) { + bdrv_io_limits_enable(bs); + } + return 0; unlink_and_fail: @@ -719,6 +829,7 @@ void bdrv_close(BlockDriverState *bs) #endif bs->opaque = NULL; bs->drv = NULL; + bs->copy_on_read = 0; if (bs->file != NULL) { bdrv_close(bs->file); @@ -726,6 +837,11 @@ void bdrv_close(BlockDriverState *bs) bdrv_dev_change_media_cb(bs, false); } + + /*throttling disk I/O limits*/ + if (bs->io_limits_enabled) { + bdrv_io_limits_disable(bs); + } } void bdrv_close_all(void) @@ -737,6 +853,25 @@ void bdrv_close_all(void) } } +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. 
+ */ +void bdrv_drain_all(void) +{ + BlockDriverState *bs; + + qemu_aio_flush(); + + /* If requests are still pending there is a bug somewhere */ + QTAILQ_FOREACH(bs, &bdrv_states, list) { + assert(QLIST_EMPTY(&bs->tracked_requests)); + assert(qemu_co_queue_empty(&bs->throttled_reqs)); + } +} + /* make a BlockDriverState anonymous by removing from bdrv_state list. Also, NULL terminate the device_name to prevent double remove */ void bdrv_make_anon(BlockDriverState *bs) @@ -770,6 +905,7 @@ int bdrv_attach_dev(BlockDriverState *bs, void *dev) return -EBUSY; } bs->dev = dev; + bdrv_iostatus_reset(bs); return 0; } @@ -819,6 +955,13 @@ bool bdrv_dev_has_removable_media(BlockDriverState *bs) return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb); } +void bdrv_dev_eject_request(BlockDriverState *bs, bool force) +{ + if (bs->dev_ops && bs->dev_ops->eject_request_cb) { + bs->dev_ops->eject_request_cb(bs->dev_opaque, force); + } +} + bool bdrv_dev_is_tray_open(BlockDriverState *bs) { if (bs->dev_ops && bs->dev_ops->is_tray_open) { @@ -884,6 +1027,10 @@ int bdrv_commit(BlockDriverState *bs) return -EACCES; } + if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) { + return -EBUSY; + } + backing_drv = bs->backing_hd->drv; ro = bs->backing_hd->read_only; strncpy(filename, bs->backing_hd->filename, sizeof(filename)); @@ -918,7 +1065,7 @@ int bdrv_commit(BlockDriverState *bs) buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); for (sector = 0; sector < total_sectors; sector += n) { - if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { + if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { if (bdrv_read(bs, sector, buf, n) != 0) { ret = -EIO; @@ -976,6 +1123,118 @@ void bdrv_commit_all(void) } } +struct BdrvTrackedRequest { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + bool is_write; + QLIST_ENTRY(BdrvTrackedRequest) list; + Coroutine *co; /* owner, used for deadlock detection */ + CoQueue wait_queue; /* coroutines blocked on this request */ +}; + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. 
+ */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .is_write = is_write, + .co = qemu_coroutine_self(), + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +/** + * Round a region to cluster boundaries + */ +static void round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t sector_num, int nb_sectors) { + /* aaaa bbbb */ + if (sector_num >= req->sector_num + req->nb_sectors) { + return false; + } + /* bbbb aaaa */ + if (req->sector_num >= sector_num + nb_sectors) { + return false; + } + return true; +} + +static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + BdrvTrackedRequest *req; + int64_t cluster_sector_num; + int cluster_nb_sectors; + bool retry; + + /* If we touch the same cluster it counts as an overlap. This guarantees + * that allocating writes will be serialized and not race with each other + * for the same cluster. For example, in copy-on-read it ensures that the + * CoR read and write operations are atomic and guest writes cannot + * interleave between them. + */ + round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (tracked_request_overlaps(req, cluster_sector_num, + cluster_nb_sectors)) { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. 
+ */ + assert(qemu_coroutine_self() != req->co); + + qemu_co_queue_wait(&req->wait_queue); + retry = true; + break; + } + } + } while (retry); +} + /* * Return values: * 0 - success @@ -1025,41 +1284,69 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, nb_sectors * BDRV_SECTOR_SIZE); } -static inline bool bdrv_has_async_rw(BlockDriver *drv) +typedef struct RwCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + QEMUIOVector *qiov; + bool is_write; + int ret; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) { - return drv->bdrv_co_readv != bdrv_co_readv_em - || drv->bdrv_aio_readv != bdrv_aio_readv_em; + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, + rwco->nb_sectors, rwco->qiov, 0); + } else { + rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, + rwco->nb_sectors, rwco->qiov, 0); + } } -static inline bool bdrv_has_async_flush(BlockDriver *drv) +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write) { - return drv->bdrv_aio_flush != bdrv_aio_flush_em; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + Coroutine *co; + RwCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .qiov = &qiov, + .is_write = is_write, + .ret = NOT_DONE, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + qemu_aio_wait(); + } + } + return rwco.ret; } /* return < 0 if error. 
See bdrv_write() for the return codes */ int bdrv_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { - BlockDriver *drv = bs->drv; - - if (!drv) - return -ENOMEDIUM; - - if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov); - } - - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return -EIO; - - return drv->bdrv_read(bs, sector_num, buf, nb_sectors); + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false); } static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num, @@ -1099,36 +1386,7 @@ static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num, int bdrv_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { - BlockDriver *drv = bs->drv; - - if (!bs->drv) - return -ENOMEDIUM; - - if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); - } - - if (bs->read_only) - return -EACCES; - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return -EIO; - - if (bs->dirty_bitmap) { - set_dirty_bitmap(bs, sector_num, nb_sectors, 1); - } - - if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { - bs->wr_highest_sector = sector_num + nb_sectors - 1; - } - - return drv->bdrv_write(bs, sector_num, buf, nb_sectors); + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true); } int bdrv_pread(BlockDriverState *bs, int64_t offset, @@ -1249,79 +1507,269 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; - trace_bdrv_co_readv(bs, sector_num, nb_sectors); + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. 
+ */ + round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); - if (!drv) { - return -ENOMEDIUM; + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors); + } else { + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); } - return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; } -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + int ret; - trace_bdrv_co_writev(bs, sector_num, nb_sectors); - - if (!bs->drv) { + if (!drv) { return -ENOMEDIUM; } - if (bs->read_only) { - return -EACCES; - } if (bdrv_check_request(bs, sector_num, nb_sectors)) { return -EIO; } - if (bs->dirty_bitmap) { - set_dirty_bitmap(bs, sector_num, nb_sectors, 1); + /* throttling disk read I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, false, nb_sectors); } - if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { - bs->wr_highest_sector = sector_num + nb_sectors - 1; + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + if (flags & BDRV_REQ_COPY_ON_READ) { + bs->copy_on_read_in_flight++; } - return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); -} + if (bs->copy_on_read_in_flight) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } -/** - * Truncate file to 'offset' bytes (needed only for file protocols) - */ -int bdrv_truncate(BlockDriverState *bs, int64_t offset) -{ - BlockDriver *drv = bs->drv; - int ret; - if (!drv) - return -ENOMEDIUM; - if (!drv->bdrv_truncate) - return -ENOTSUP; - if (bs->read_only) - return -EACCES; - if (bdrv_in_use(bs)) - return -EBUSY; - ret = drv->bdrv_truncate(bs, offset); - if (ret == 0) { - ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); - bdrv_dev_resize_cb(bs); + tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + +out: + tracked_request_end(&req); + + if (flags & 
BDRV_REQ_COPY_ON_READ) { + bs->copy_on_read_in_flight--; } + return ret; } -/** - * Length of a allocated file in bytes. Sparse files are counted by actual - * allocated space. Return < 0 if error or unknown. - */ -int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov; + int ret; + + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); + } + + /* Fall back to bounce buffer if write zeroes is unsupported */ + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_blockalign(bs, iov.iov_len); + memset(iov.iov_base, 0, iov.iov_len); + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); + + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } + + /* throttling disk write I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, true, nb_sectors); + } + + if (bs->copy_on_read_in_flight) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + + tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + + if (flags & BDRV_REQ_ZERO_WRITE) { + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); + } else { + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + + if (bs->dirty_bitmap) { + set_dirty_bitmap(bs, sector_num, nb_sectors, 1); + } + + if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { + bs->wr_highest_sector = sector_num + nb_sectors - 1; + } + + tracked_request_end(&req); + + return ret; +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE); +} + +/** + * Truncate file to 'offset' bytes (needed only for file protocols) + */ +int bdrv_truncate(BlockDriverState *bs, int64_t offset) +{ + BlockDriver *drv = bs->drv; + int ret; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_truncate) + return -ENOTSUP; + if (bs->read_only) + return -EACCES; + if (bdrv_in_use(bs)) + return -EBUSY; + ret = drv->bdrv_truncate(bs, offset); + if (ret == 
0) { + ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); + bdrv_dev_resize_cb(bs); + } + return ret; +} + +/** + * Length of a allocated file in bytes. Sparse files are counted by actual + * allocated space. Return < 0 if error or unknown. + */ +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) { BlockDriver *drv = bs->drv; if (!drv) { @@ -1502,6 +1950,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, *psecs = bs->secs; } +/* throttling disk io limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits) +{ + bs->io_limits = *io_limits; + bs->io_limits_enabled = bdrv_io_limits_enabled(bs); +} + /* Recognize floppy formats */ typedef struct FDFormat { FDriveType drive; @@ -1732,33 +2188,6 @@ const char *bdrv_get_device_name(BlockDriverState *bs) return bs->device_name; } -int bdrv_flush(BlockDriverState *bs) -{ - if (bs->open_flags & BDRV_O_NO_FLUSH) { - return 0; - } - - if (bs->drv && bdrv_has_async_flush(bs->drv) && qemu_in_coroutine()) { - return bdrv_co_flush_em(bs); - } - - if (bs->drv && bs->drv->bdrv_flush) { - return bs->drv->bdrv_flush(bs); - } - - /* - * Some block drivers always operate in either writethrough or unsafe mode - * and don't support bdrv_flush therefore. Usually qemu doesn't know how - * the server works (because the behaviour is hardcoded or depends on - * server-side configuration), so we can't ensure that everything is safe - * on disk. Returning an error doesn't work because that would break guests - * even if the server operates in writethrough mode. - * - * Let's hope the user knows what he's doing. - */ - return 0; -} - void bdrv_flush_all(void) { BlockDriverState *bs; @@ -1781,42 +2210,87 @@ int bdrv_has_zero_init(BlockDriverState *bs) return 1; } -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) -{ - if (!bs->drv) { - return -ENOMEDIUM; - } - if (!bs->drv->bdrv_discard) { - return 0; - } - return bs->drv->bdrv_discard(bs, sector_num, nb_sectors); -} +typedef struct BdrvCoIsAllocatedData { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int *pnum; + int ret; + bool done; +} BdrvCoIsAllocatedData; /* * Returns true iff the specified sector is present in the disk image. Drivers * not implementing the functionality are assumed to not support backing files, * hence all their sectors are reported as allocated. * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * * 'pnum' is set to the number of sectors (including and immediately following * the specified sector) that are known to be in the same * allocated/unallocated state. * - * 'nb_sectors' is the max value 'pnum' should be set to. + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. */ -int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum) +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) { int64_t n; - if (!bs->drv->bdrv_is_allocated) { - if (sector_num >= bs->total_sectors) { - *pnum = 0; - return 0; - } - n = bs->total_sectors - sector_num; - *pnum = (n < nb_sectors) ? 
(n) : (nb_sectors); + + if (sector_num >= bs->total_sectors) { + *pnum = 0; + return 0; + } + + n = bs->total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_is_allocated) { + *pnum = nb_sectors; return 1; } - return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum); + + return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum); +} + +/* Coroutine wrapper for bdrv_is_allocated() */ +static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) +{ + BdrvCoIsAllocatedData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_is_allocated(). + * + * See bdrv_co_is_allocated() for details. + */ +int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) +{ + Coroutine *co; + BdrvCoIsAllocatedData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + co = qemu_coroutine_create(bdrv_is_allocated_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + qemu_aio_wait(); + } + return data.ret; } void bdrv_mon_event(const BlockDriverState *bdrv, @@ -1848,178 +2322,120 @@ void bdrv_mon_event(const BlockDriverState *bdrv, qobject_decref(data); } -static void bdrv_print_dict(QObject *obj, void *opaque) +BlockInfoList *qmp_query_block(Error **errp) { - QDict *bs_dict; - Monitor *mon = opaque; - - bs_dict = qobject_to_qdict(obj); - - monitor_printf(mon, "%s: removable=%d", - qdict_get_str(bs_dict, "device"), - qdict_get_bool(bs_dict, "removable")); - - if (qdict_get_bool(bs_dict, "removable")) { - monitor_printf(mon, " locked=%d", qdict_get_bool(bs_dict, "locked")); - monitor_printf(mon, " tray-open=%d", - qdict_get_bool(bs_dict, "tray-open")); - } - if (qdict_haskey(bs_dict, "inserted")) { - QDict *qdict = qobject_to_qdict(qdict_get(bs_dict, "inserted")); - - monitor_printf(mon, " file="); - monitor_print_filename(mon, qdict_get_str(qdict, "file")); - if (qdict_haskey(qdict, "backing_file")) { - monitor_printf(mon, " backing_file="); - monitor_print_filename(mon, qdict_get_str(qdict, "backing_file")); - } - monitor_printf(mon, " ro=%d drv=%s encrypted=%d", - qdict_get_bool(qdict, "ro"), - qdict_get_str(qdict, "drv"), - qdict_get_bool(qdict, "encrypted")); - } else { - monitor_printf(mon, " [not inserted]"); - } - - monitor_printf(mon, "\n"); -} - -void bdrv_info_print(Monitor *mon, const QObject *data) -{ - qlist_iter(qobject_to_qlist(data), bdrv_print_dict, mon); -} - -void bdrv_info(Monitor *mon, QObject **ret_data) -{ - QList *bs_list; + BlockInfoList *head = NULL, *cur_item = NULL; BlockDriverState *bs; - bs_list = qlist_new(); - QTAILQ_FOREACH(bs, &bdrv_states, list) { - QObject *bs_obj; - QDict *bs_dict; + BlockInfoList *info = g_malloc0(sizeof(*info)); - bs_obj = qobject_from_jsonf("{ 'device': %s, 'type': 'unknown', " - "'removable': %i, 'locked': %i }", - bs->device_name, - bdrv_dev_has_removable_media(bs), - bdrv_dev_is_medium_locked(bs)); - bs_dict = qobject_to_qdict(bs_obj); + info->value = g_malloc0(sizeof(*info->value)); + info->value->device = g_strdup(bs->device_name); + info->value->type = g_strdup("unknown"); + info->value->locked = bdrv_dev_is_medium_locked(bs); + info->value->removable = bdrv_dev_has_removable_media(bs); if (bdrv_dev_has_removable_media(bs)) { - qdict_put(bs_dict, "tray-open", - qbool_from_int(bdrv_dev_is_tray_open(bs))); + 
info->value->has_tray_open = true; + info->value->tray_open = bdrv_dev_is_tray_open(bs); } - if (bs->drv) { - QObject *obj; - - obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, " - "'encrypted': %i }", - bs->filename, bs->read_only, - bs->drv->format_name, - bdrv_is_encrypted(bs)); - if (bs->backing_file[0] != '\0') { - QDict *qdict = qobject_to_qdict(obj); - qdict_put(qdict, "backing_file", - qstring_from_str(bs->backing_file)); - } - qdict_put_obj(bs_dict, "inserted", obj); + if (bdrv_iostatus_is_enabled(bs)) { + info->value->has_io_status = true; + info->value->io_status = bs->iostatus; } - qlist_append_obj(bs_list, bs_obj); - } - - *ret_data = QOBJECT(bs_list); -} -static void bdrv_stats_iter(QObject *data, void *opaque) -{ - QDict *qdict; - Monitor *mon = opaque; + if (bs->drv) { + info->value->has_inserted = true; + info->value->inserted = g_malloc0(sizeof(*info->value->inserted)); + info->value->inserted->file = g_strdup(bs->filename); + info->value->inserted->ro = bs->read_only; + info->value->inserted->drv = g_strdup(bs->drv->format_name); + info->value->inserted->encrypted = bs->encrypted; + if (bs->backing_file[0]) { + info->value->inserted->has_backing_file = true; + info->value->inserted->backing_file = g_strdup(bs->backing_file); + } - qdict = qobject_to_qdict(data); - monitor_printf(mon, "%s:", qdict_get_str(qdict, "device")); + if (bs->io_limits_enabled) { + info->value->inserted->bps = + bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + info->value->inserted->bps_rd = + bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; + info->value->inserted->bps_wr = + bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; + info->value->inserted->iops = + bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + info->value->inserted->iops_rd = + bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; + info->value->inserted->iops_wr = + bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; + } + } - qdict = qobject_to_qdict(qdict_get(qdict, "stats")); - monitor_printf(mon, " rd_bytes=%" PRId64 - " wr_bytes=%" PRId64 - " rd_operations=%" PRId64 - " wr_operations=%" PRId64 - " flush_operations=%" PRId64 - " wr_total_time_ns=%" PRId64 - " rd_total_time_ns=%" PRId64 - " flush_total_time_ns=%" PRId64 - "\n", - qdict_get_int(qdict, "rd_bytes"), - qdict_get_int(qdict, "wr_bytes"), - qdict_get_int(qdict, "rd_operations"), - qdict_get_int(qdict, "wr_operations"), - qdict_get_int(qdict, "flush_operations"), - qdict_get_int(qdict, "wr_total_time_ns"), - qdict_get_int(qdict, "rd_total_time_ns"), - qdict_get_int(qdict, "flush_total_time_ns")); -} + /* XXX: waiting for the qapi to support GSList */ + if (!cur_item) { + head = cur_item = info; + } else { + cur_item->next = info; + cur_item = info; + } + } -void bdrv_stats_print(Monitor *mon, const QObject *data) -{ - qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon); + return head; } -static QObject* bdrv_info_stats_bs(BlockDriverState *bs) +/* Consider exposing this as a full fledged QMP command */ +static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp) { - QObject *res; - QDict *dict; + BlockStats *s; - res = qobject_from_jsonf("{ 'stats': {" - "'rd_bytes': %" PRId64 "," - "'wr_bytes': %" PRId64 "," - "'rd_operations': %" PRId64 "," - "'wr_operations': %" PRId64 "," - "'wr_highest_offset': %" PRId64 "," - "'flush_operations': %" PRId64 "," - "'wr_total_time_ns': %" PRId64 "," - "'rd_total_time_ns': %" PRId64 "," - "'flush_total_time_ns': %" PRId64 - "} }", - bs->nr_bytes[BDRV_ACCT_READ], - bs->nr_bytes[BDRV_ACCT_WRITE], - bs->nr_ops[BDRV_ACCT_READ], - 
bs->nr_ops[BDRV_ACCT_WRITE], - bs->wr_highest_sector * - (uint64_t)BDRV_SECTOR_SIZE, - bs->nr_ops[BDRV_ACCT_FLUSH], - bs->total_time_ns[BDRV_ACCT_WRITE], - bs->total_time_ns[BDRV_ACCT_READ], - bs->total_time_ns[BDRV_ACCT_FLUSH]); - dict = qobject_to_qdict(res); + s = g_malloc0(sizeof(*s)); - if (*bs->device_name) { - qdict_put(dict, "device", qstring_from_str(bs->device_name)); + if (bs->device_name[0]) { + s->has_device = true; + s->device = g_strdup(bs->device_name); } + s->stats = g_malloc0(sizeof(*s->stats)); + s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ]; + s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE]; + s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ]; + s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE]; + s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE; + s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH]; + s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE]; + s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ]; + s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH]; + if (bs->file) { - QObject *parent = bdrv_info_stats_bs(bs->file); - qdict_put_obj(dict, "parent", parent); + s->has_parent = true; + s->parent = qmp_query_blockstat(bs->file, NULL); } - return res; + return s; } -void bdrv_info_stats(Monitor *mon, QObject **ret_data) +BlockStatsList *qmp_query_blockstats(Error **errp) { - QObject *obj; - QList *devices; + BlockStatsList *head = NULL, *cur_item = NULL; BlockDriverState *bs; - devices = qlist_new(); - QTAILQ_FOREACH(bs, &bdrv_states, list) { - obj = bdrv_info_stats_bs(bs); - qlist_append_obj(devices, obj); + BlockStatsList *info = g_malloc0(sizeof(*info)); + info->value = qmp_query_blockstat(bs, NULL); + + /* XXX: waiting for the qapi to support GSList */ + if (!cur_item) { + head = cur_item = info; + } else { + cur_item->next = info; + cur_item = info; + } } - *ret_data = QOBJECT(devices); + return head; } const char *bdrv_get_encrypted_filename(BlockDriverState *bs) @@ -2035,11 +2451,7 @@ const char *bdrv_get_encrypted_filename(BlockDriverState *bs) void bdrv_get_backing_filename(BlockDriverState *bs, char *filename, int filename_size) { - if (!bs->backing_file) { - pstrcpy(filename, filename_size, ""); - } else { - pstrcpy(filename, filename_size, bs->backing_file); - } + pstrcpy(filename, filename_size, bs->backing_file); } int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, @@ -2232,6 +2644,24 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, return -ENOTSUP; } +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file) +{ + if (!bs->drv) { + return NULL; + } + + if (bs->backing_hd) { + if (strcmp(bs->backing_file, backing_file) == 0) { + return bs->backing_hd; + } else { + return bdrv_find_backing_image(bs->backing_hd, backing_file); + } + } + + return NULL; +} + #define NB_SUFFIXES 4 char *get_human_readable_size(char *buf, int buf_size, int64_t size) @@ -2312,89 +2742,20 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - BlockDriver *drv = bs->drv; - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - if (!drv) - return NULL; - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return NULL; - - return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, - cb, opaque); -} - -typedef struct BlockCompleteData { - BlockDriverCompletionFunc *cb; - void *opaque; - BlockDriverState *bs; - int64_t sector_num; - 
int nb_sectors; -} BlockCompleteData; - -static void block_complete_cb(void *opaque, int ret) -{ - BlockCompleteData *b = opaque; - - if (b->bs->dirty_bitmap) { - set_dirty_bitmap(b->bs, b->sector_num, b->nb_sectors, 1); - } - b->cb(b->opaque, ret); - g_free(b); -} - -static BlockCompleteData *blk_dirty_cb_alloc(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - BlockCompleteData *blkdata = g_malloc0(sizeof(BlockCompleteData)); - - blkdata->bs = bs; - blkdata->cb = cb; - blkdata->opaque = opaque; - blkdata->sector_num = sector_num; - blkdata->nb_sectors = nb_sectors; - - return blkdata; + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, + cb, opaque, false); } BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - BlockDriver *drv = bs->drv; - BlockDriverAIOCB *ret; - BlockCompleteData *blk_cb_data; - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - if (!drv) - return NULL; - if (bs->read_only) - return NULL; - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return NULL; - - if (bs->dirty_bitmap) { - blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, - opaque); - cb = &block_complete_cb; - opaque = blk_cb_data; - } - - ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, - cb, opaque); - - if (ret) { - if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { - bs->wr_highest_sector = sector_num + nb_sectors - 1; - } - } - - return ret; + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, + cb, opaque, true); } @@ -2549,7 +2910,6 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, */ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) { - BlockDriverAIOCB *acb; MultiwriteCB *mcb; int i; @@ -2580,82 +2940,185 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); - /* - * Run the aio requests. As soon as one request can't be submitted - * successfully, fail all requests that are not yet submitted (we must - * return failure for all requests anyway) - * - * num_requests cannot be set to the right value immediately: If - * bdrv_aio_writev fails for some request, num_requests would be too high - * and therefore multiwrite_cb() would never recognize the multiwrite - * request as completed. We also cannot use the loop variable i to set it - * when the first request fails because the callback may already have been - * called for previously submitted requests. Thus, num_requests must be - * incremented for each request that is submitted. - * - * The problem that callbacks may be called early also means that we need - * to take care that num_requests doesn't become 0 before all requests are - * submitted - multiwrite_cb() would consider the multiwrite request - * completed. A dummy request that is "completed" by a manual call to - * multiwrite_cb() takes care of this. - */ - mcb->num_requests = 1; - - // Run the aio requests + /* Run the aio requests. */ + mcb->num_requests = num_reqs; for (i = 0; i < num_reqs; i++) { - mcb->num_requests++; - acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, + bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, reqs[i].nb_sectors, multiwrite_cb, mcb); + } - if (acb == NULL) { - // We can only fail the whole thing if no request has been - // submitted yet. 
Otherwise we'll wait for the submitted AIOs to - // complete and report the error in the callback. - if (i == 0) { - trace_bdrv_aio_multiwrite_earlyfail(mcb); - goto fail; - } else { - trace_bdrv_aio_multiwrite_latefail(mcb, i); - multiwrite_cb(mcb, -EIO); - break; - } + return 0; +} + +void bdrv_aio_cancel(BlockDriverAIOCB *acb) +{ + acb->pool->cancel(acb); +} + +/* block I/O throttling */ +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait) +{ + uint64_t bps_limit = 0; + double bytes_limit, bytes_base, bytes_res; + double slice_time, wait_time; + + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.bps[is_write]) { + bps_limit = bs->io_limits.bps[is_write]; + } else { + if (wait) { + *wait = 0; } + + return false; } - /* Complete the dummy request */ - multiwrite_cb(mcb, 0); + slice_time = bs->slice_end - bs->slice_start; + slice_time /= (NANOSECONDS_PER_SECOND); + bytes_limit = bps_limit * slice_time; + bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write]; + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write]; + } - return 0; + /* bytes_base: the bytes of data which have been read/written; and + * it is obtained from the history statistic info. + * bytes_res: the remaining bytes of data which need to be read/written. + * (bytes_base + bytes_res) / bps_limit: used to calcuate + * the total time for completing reading/writting all data. + */ + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; -fail: - for (i = 0; i < mcb->num_callbacks; i++) { - reqs[i].error = -EIO; + if (bytes_base + bytes_res <= bytes_limit) { + if (wait) { + *wait = 0; + } + + return false; } - g_free(mcb); - return -1; + + /* Calc approx time to dispatch */ + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time; + + /* When the I/O rate at runtime exceeds the limits, + * bs->slice_end need to be extended in order that the current statistic + * info can be kept until the timer fire, so it is increased and tuned + * based on the result of experiment. 
+ */ + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10; + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME; + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; } -BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait) { - BlockDriver *drv = bs->drv; + uint64_t iops_limit = 0; + double ios_limit, ios_base; + double slice_time, wait_time; - trace_bdrv_aio_flush(bs, opaque); + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.iops[is_write]) { + iops_limit = bs->io_limits.iops[is_write]; + } else { + if (wait) { + *wait = 0; + } - if (bs->open_flags & BDRV_O_NO_FLUSH) { - return bdrv_aio_noop_em(bs, cb, opaque); + return false; } - if (!drv) - return NULL; - return drv->bdrv_aio_flush(bs, cb, opaque); + slice_time = bs->slice_end - bs->slice_start; + slice_time /= (NANOSECONDS_PER_SECOND); + ios_limit = iops_limit * slice_time; + ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write]; + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write]; + } + + if (ios_base + 1 <= ios_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (ios_base + 1) / iops_limit; + if (wait_time > elapsed_time) { + wait_time = wait_time - elapsed_time; + } else { + wait_time = 0; + } + + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10; + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME; + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; } -void bdrv_aio_cancel(BlockDriverAIOCB *acb) +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, int64_t *wait) { - acb->pool->cancel(acb); -} + int64_t now, max_wait; + uint64_t bps_wait = 0, iops_wait = 0; + double elapsed_time; + int bps_ret, iops_ret; + + now = qemu_get_clock_ns(vm_clock); + if ((bs->slice_start < now) + && (bs->slice_end > now)) { + bs->slice_end = now + bs->slice_time; + } else { + bs->slice_time = 5 * BLOCK_IO_SLICE_TIME; + bs->slice_start = now; + bs->slice_end = now + bs->slice_time; + + bs->io_base.bytes[is_write] = bs->nr_bytes[is_write]; + bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write]; + + bs->io_base.ios[is_write] = bs->nr_ops[is_write]; + bs->io_base.ios[!is_write] = bs->nr_ops[!is_write]; + } + + elapsed_time = now - bs->slice_start; + elapsed_time /= (NANOSECONDS_PER_SECOND); + + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, + is_write, elapsed_time, &bps_wait); + iops_ret = bdrv_exceed_iops_limits(bs, is_write, + elapsed_time, &iops_wait); + if (bps_ret || iops_ret) { + max_wait = bps_wait > iops_wait ? 
bps_wait : iops_wait; + if (wait) { + *wait = max_wait; + } + + now = qemu_get_clock_ns(vm_clock); + if (bs->slice_end < now + max_wait) { + bs->slice_end = now + max_wait; + } + + return true; + } + + if (wait) { + *wait = 0; + } + return false; +} /**************************************************************/ /* async block device emulation */ @@ -2712,15 +3175,13 @@ static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, acb->is_write = is_write; acb->qiov = qiov; acb->bounce = qemu_blockalign(bs, qiov->size); - - if (!acb->bh) - acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); + acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); if (is_write) { qemu_iovec_to_buffer(acb->qiov, acb->bounce); - acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); } else { - acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); } qemu_bh_schedule(acb->bh); @@ -2760,7 +3221,7 @@ static AIOPool bdrv_em_co_aio_pool = { .cancel = bdrv_aio_co_cancel_em, }; -static void bdrv_co_rw_bh(void *opaque) +static void bdrv_co_em_bh(void *opaque) { BlockDriverAIOCBCoroutine *acb = opaque; @@ -2769,20 +3230,21 @@ static void bdrv_co_rw_bh(void *opaque) qemu_aio_release(acb); } -static void coroutine_fn bdrv_co_rw(void *opaque) +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) { BlockDriverAIOCBCoroutine *acb = opaque; BlockDriverState *bs = acb->common.bs; if (!acb->is_write) { - acb->req.error = bs->drv->bdrv_co_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov); + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, 0); } else { - acb->req.error = bs->drv->bdrv_co_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov); + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, 0); } - acb->bh = qemu_bh_new(bdrv_co_rw_bh, acb); + acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); qemu_bh_schedule(acb->bh); } @@ -2803,128 +3265,63 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, acb->req.qiov = qiov; acb->is_write = is_write; - co = qemu_coroutine_create(bdrv_co_rw); + co = qemu_coroutine_create(bdrv_co_do_rw); qemu_coroutine_enter(co, acb); return &acb->common; } -static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) { - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, - false); -} + BlockDriverAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; -static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, - true); + acb->req.error = bdrv_co_flush(bs); + acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); } -static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs, +BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { - BlockDriverAIOCBSync *acb; + trace_bdrv_aio_flush(bs, opaque); - acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque); - acb->is_write = 1; /* don't bounce in the completion hadler */ 
- acb->qiov = NULL; - acb->bounce = NULL; - acb->ret = 0; + Coroutine *co; + BlockDriverAIOCBCoroutine *acb; - if (!acb->bh) - acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); + acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque); + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); - bdrv_flush(bs); - qemu_bh_schedule(acb->bh); return &acb->common; } -static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) { - BlockDriverAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque); - acb->is_write = 1; /* don't bounce in the completion handler */ - acb->qiov = NULL; - acb->bounce = NULL; - acb->ret = 0; - - if (!acb->bh) { - acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); - } + BlockDriverAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); qemu_bh_schedule(acb->bh); - return &acb->common; -} - -/**************************************************************/ -/* sync block device emulation */ - -static void bdrv_rw_em_cb(void *opaque, int ret) -{ - *(int *)opaque = ret; } -#define NOT_DONE 0x7fffffff - -static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) { - int async_ret; - BlockDriverAIOCB *acb; - struct iovec iov; - QEMUIOVector qiov; - - async_ret = NOT_DONE; - iov.iov_base = (void *)buf; - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; - qemu_iovec_init_external(&qiov, &iov, 1); - acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors, - bdrv_rw_em_cb, &async_ret); - if (acb == NULL) { - async_ret = -1; - goto fail; - } - - while (async_ret == NOT_DONE) { - qemu_aio_wait(); - } - + Coroutine *co; + BlockDriverAIOCBCoroutine *acb; -fail: - return async_ret; -} + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); -static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int async_ret; - BlockDriverAIOCB *acb; - struct iovec iov; - QEMUIOVector qiov; - - async_ret = NOT_DONE; - iov.iov_base = (void *)buf; - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; - qemu_iovec_init_external(&qiov, &iov, 1); - acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors, - bdrv_rw_em_cb, &async_ret); - if (acb == NULL) { - async_ret = -1; - goto fail; - } - while (async_ret == NOT_DONE) { - qemu_aio_wait(); - } + acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque); + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); -fail: - return async_ret; + return &acb->common; } void bdrv_init(void) @@ -2990,14 +3387,14 @@ static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, BlockDriverAIOCB *acb; if (is_write) { - acb = bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); } else { - acb = bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); } - trace_bdrv_co_io(is_write, acb); + 
trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); if (!acb) { return -EIO; } @@ -3020,19 +3417,162 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); } -static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs) +static void coroutine_fn bdrv_flush_co_entry(void *opaque) { - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs->drv) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + return 0; + } + + if (bs->drv->bdrv_co_flush_to_disk) { + return bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockDriverAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + return co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. 
+ */ + return 0; + } +} + +void bdrv_invalidate_cache(BlockDriverState *bs) +{ + if (bs->drv && bs->drv->bdrv_invalidate_cache) { + bs->drv->bdrv_invalidate_cache(bs); + } +} + +void bdrv_invalidate_cache_all(void) +{ + BlockDriverState *bs; + + QTAILQ_FOREACH(bs, &bdrv_states, list) { + bdrv_invalidate_cache(bs); + } +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, }; - BlockDriverAIOCB *acb; - acb = bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); - if (!acb) { + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + qemu_aio_wait(); + } + } + + return rwco.ret; +} + +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (!bs->drv) { + return -ENOMEDIUM; + } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { return -EIO; + } else if (bs->read_only) { + return -EROFS; + } else if (bs->drv->bdrv_co_discard) { + return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors); + } else if (bs->drv->bdrv_aio_discard) { + BlockDriverAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + return co.ret; + } + } else { + return 0; } - qemu_coroutine_yield(); - return co.ret; +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_discard_co_entry(&rwco); + } else { + co = qemu_coroutine_create(bdrv_discard_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + qemu_aio_wait(); + } + } + + return rwco.ret; } /**************************************************************/ @@ -3181,6 +3721,47 @@ int bdrv_in_use(BlockDriverState *bs) return bs->in_use; } +void bdrv_iostatus_enable(BlockDriverState *bs) +{ + bs->iostatus_enabled = true; + bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; +} + +/* The I/O status is only enabled if the drive explicitly + * enables it _and_ the VM is configured to stop on errors */ +bool bdrv_iostatus_is_enabled(const BlockDriverState *bs) +{ + return (bs->iostatus_enabled && + (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC || + bs->on_write_error == BLOCK_ERR_STOP_ANY || + bs->on_read_error == BLOCK_ERR_STOP_ANY)); +} + +void bdrv_iostatus_disable(BlockDriverState *bs) +{ + bs->iostatus_enabled = false; +} + +void bdrv_iostatus_reset(BlockDriverState *bs) +{ + if (bdrv_iostatus_is_enabled(bs)) { + bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; + } +} + +/* XXX: Today this is set by device models because it makes the implementation + quite simple. 
However, the block layer knows about the error, so it's + possible to implement this without device models being involved */ +void bdrv_iostatus_set_err(BlockDriverState *bs, int error) +{ + if (bdrv_iostatus_is_enabled(bs) && + bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { + assert(error >= 0); + bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE : + BLOCK_DEVICE_IO_STATUS_FAILED; + } +} + void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes, enum BlockAcctType type) @@ -3344,3 +3925,51 @@ out: return ret; } + +void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BlockJob *job; + + if (bs->job || bdrv_in_use(bs)) { + return NULL; + } + bdrv_set_in_use(bs, 1); + + job = g_malloc0(job_type->instance_size); + job->job_type = job_type; + job->bs = bs; + job->cb = cb; + job->opaque = opaque; + bs->job = job; + return job; +} + +void block_job_complete(BlockJob *job, int ret) +{ + BlockDriverState *bs = job->bs; + + assert(bs->job == job); + job->cb(job->opaque, ret); + bs->job = NULL; + g_free(job); + bdrv_set_in_use(bs, 0); +} + +int block_job_set_speed(BlockJob *job, int64_t value) +{ + if (!job->job_type->set_speed) { + return -ENOTSUP; + } + return job->job_type->set_speed(job, value); +} + +void block_job_cancel(BlockJob *job) +{ + job->cancelled = true; +} + +bool block_job_is_cancelled(BlockJob *job) +{ + return job->cancelled; +}
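The hunks above add per-cluster request tracking: round_to_clusters() widens a sector range out to cluster boundaries, and tracked_request_overlaps() serializes requests that touch the same cluster, which is what keeps a copy-on-read's read and write atomic with respect to guest writes. Below is a minimal standalone sketch of that arithmetic, not QEMU code: the example_* names and the 64-sector cluster size are assumptions made purely for illustration.

#include <assert.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_CLUSTER_SECTORS 64   /* stand-in for bdi.cluster_size / BDRV_SECTOR_SIZE */

/* Widen [sector_num, sector_num + nb_sectors) outward to cluster boundaries,
 * mirroring what round_to_clusters() does with QEMU_ALIGN_DOWN/UP. */
static void example_round_to_clusters(int64_t sector_num, int nb_sectors,
                                      int64_t *cluster_sector_num,
                                      int *cluster_nb_sectors)
{
    int64_t c = EXAMPLE_CLUSTER_SECTORS;
    int64_t end = sector_num + nb_sectors;

    *cluster_sector_num = (sector_num / c) * c;                              /* round down */
    *cluster_nb_sectors = (int)(((end + c - 1) / c) * c - *cluster_sector_num); /* round up */
}

/* Two half-open sector ranges overlap unless one ends before the other
 * starts; this is the same test tracked_request_overlaps() performs. */
static bool example_ranges_overlap(int64_t a_start, int a_len,
                                   int64_t b_start, int b_len)
{
    if (a_start >= b_start + b_len) {
        return false;
    }
    if (b_start >= a_start + a_len) {
        return false;
    }
    return true;
}

int main(void)
{
    int64_t cluster_start;
    int cluster_len;

    /* A 10-sector read at sector 100 is widened to the whole 64..128 span. */
    example_round_to_clusters(100, 10, &cluster_start, &cluster_len);
    printf("rounded: start=%" PRId64 " len=%d\n", cluster_start, cluster_len);

    /* A write at sectors 120..130 touches that cluster span, so under the
     * patch's rule it would have to wait for the copy-on-read to complete. */
    assert(example_ranges_overlap(cluster_start, cluster_len, 120, 10));
    return 0;
}

Run on its own, this prints "rounded: start=64 len=64", illustrating why a request that covers only part of a cluster still serializes against any other request in that cluster.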
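The throttling code added above (bdrv_exceed_bps_limits() and friends) decides whether a request exceeds the configured bytes-per-second budget for the current time slice and, if so, estimates how long to delay it as (bytes_base + bytes_res) / bps_limit - elapsed_time. The following is a self-contained sketch of just that decision, with illustrative numbers and the hypothetical example_exceeds_bps() name; it deliberately omits the slice bookkeeping and timer handling the patch performs around it.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Return true if a request of bytes_res bytes would exceed the budget for the
 * current slice; *wait_s then estimates the delay, using the same formula as
 * the patch: (bytes_base + bytes_res) / bps_limit - elapsed_time. */
static bool example_exceeds_bps(double bps_limit, double slice_time_s,
                                double elapsed_time_s, uint64_t bytes_base,
                                uint64_t bytes_res, double *wait_s)
{
    double bytes_limit = bps_limit * slice_time_s;   /* budget for this slice */

    if ((double)bytes_base + (double)bytes_res <= bytes_limit) {
        *wait_s = 0.0;
        return false;                                /* request may proceed */
    }

    /* Time at which the accumulated bytes fit under the limit, minus the
     * time that has already elapsed in this slice. */
    *wait_s = ((double)bytes_base + (double)bytes_res) / bps_limit
              - elapsed_time_s;
    return true;
}

int main(void)
{
    double wait;

    /* 1 MB/s limit, 0.5 s slice, 0.2 s elapsed, 400 kB already written and a
     * 256 kB request: the 500 kB slice budget is exceeded, so it must wait. */
    bool throttled = example_exceeds_bps(1e6, 0.5, 0.2, 400000, 256000, &wait);

    printf("throttled=%d wait=%.3f s\n", throttled, wait);
    return 0;
}

With these numbers the sketch reports a wait of roughly 0.456 s, which corresponds to the wait_time the patch scales and feeds to qemu_mod_timer() before re-queuing the throttled request at the head of throttled_reqs.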