X-Git-Url: https://repo.jachan.dev/qemu.git/blobdiff_plain/b04b6b6ec3a1e0ba90c2f58617286d9fc35fa372..664c6733d72c589cd9f6ccee305e7b7ce36ea06d:/block.c diff --git a/block.c b/block.c index 38078f7cd5..cb21a5fa61 100644 --- a/block.c +++ b/block.c @@ -32,6 +32,7 @@ #include "sysemu/sysemu.h" #include "qemu/notify.h" #include "block/coroutine.h" +#include "block/qapi.h" #include "qmp-commands.h" #include "qemu/timer.h" @@ -49,12 +50,12 @@ #include #endif -#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ +struct BdrvDirtyBitmap { + HBitmap *bitmap; + QLIST_ENTRY(BdrvDirtyBitmap) list; +}; -typedef enum { - BDRV_REQ_COPY_ON_READ = 0x1, - BDRV_REQ_ZERO_WRITE = 0x2, -} BdrvRequestFlags; +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, @@ -69,26 +70,30 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BdrvRequestFlags flags, BlockDriverCompletionFunc *cb, void *opaque, bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors); + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); +static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = + QTAILQ_HEAD_INITIALIZER(graph_bdrv_states); + static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); @@ -187,7 +192,7 @@ void bdrv_io_limits_enable(BlockDriverState *bs) * @is_write: is the IO a write */ static void bdrv_io_limits_intercept(BlockDriverState *bs, - int nb_sectors, + unsigned int bytes, bool is_write) { /* does this io must wait */ @@ -200,9 +205,8 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs, } /* the IO will be executed, do the accounting */ - throttle_account(&bs->throttle_state, - is_write, - nb_sectors * BDRV_SECTOR_SIZE); + throttle_account(&bs->throttle_state, is_write, bytes); + /* if the next request must wait -> do nothing */ if (throttle_schedule_timer(&bs->throttle_state, is_write)) { @@ -213,6 +217,16 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs, qemu_co_queue_next(&bs->throttled_reqs[is_write]); } +size_t bdrv_opt_mem_align(BlockDriverState *bs) +{ + if (!bs || !bs->drv) { + /* 4k should be on the safe side */ + return 4096; + } + + return bs->bl.opt_mem_alignment; +} + /* check if the path starts with ":" */ static int path_has_protocol(const char *path) { @@ -323,9 +337,10 @@ BlockDriverState *bdrv_new(const char *device_name) BlockDriverState *bs; bs = g_malloc0(sizeof(BlockDriverState)); + QLIST_INIT(&bs->dirty_bitmaps); pstrcpy(bs->device_name, sizeof(bs->device_name), device_name); if (device_name[0] != '\0') { - QTAILQ_INSERT_TAIL(&bdrv_states, bs, list); + QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list); } bdrv_iostatus_disable(bs); notifier_list_init(&bs->close_notifiers); @@ -477,6 +492,43 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options, return ret; } +int bdrv_refresh_limits(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return 0; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file); + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.opt_mem_alignment = 512; + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd); + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + return drv->bdrv_refresh_limits(bs); + } + + return 0; +} + /* * Create a uniquely-named empty temporary file. * Return 0 upon success, otherwise a negative errno value. @@ -730,6 +782,33 @@ static int bdrv_open_flags(BlockDriverState *bs, int flags) return open_flags; } +static int bdrv_assign_node_name(BlockDriverState *bs, + const char *node_name, + Error **errp) +{ + if (!node_name) { + return 0; + } + + /* empty string node name is invalid */ + if (node_name[0] == '\0') { + error_setg(errp, "Empty node name"); + return -EINVAL; + } + + /* takes care of avoiding duplicates node names */ + if (bdrv_find_node(node_name)) { + error_setg(errp, "Duplicate node name"); + return -EINVAL; + } + + /* copy node name into the bs and insert it into the graph list */ + pstrcpy(bs->node_name, sizeof(bs->node_name), node_name); + QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list); + + return 0; +} + /* * Common part for opening disk images and files * @@ -740,6 +819,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, { int ret, open_flags; const char *filename; + const char *node_name = NULL; Error *local_err = NULL; assert(drv != NULL); @@ -754,6 +834,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name); + node_name = qdict_get_try_str(options, "node-name"); + ret = bdrv_assign_node_name(bs, node_name, errp); + if (ret < 0) { + return ret; + } + qdict_del(options, "node-name"); + /* bdrv_open() with directly using a protocol as drv. This layer is already * opened, so assign it to bs (while file becomes a closed BlockDriverState) * and return immediately. */ @@ -763,7 +850,8 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } bs->open_flags = flags; - bs->buffer_alignment = 512; + bs->guest_block_size = 512; + bs->request_alignment = 512; bs->zero_beyond_eof = true; open_flags = bdrv_open_flags(bs, flags); bs->read_only = !(open_flags & BDRV_O_RDWR); @@ -831,6 +919,10 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, goto free_and_fail; } + bdrv_refresh_limits(bs); + assert(bdrv_opt_mem_align(bs) != 0); + assert(bs->request_alignment != 0); + #ifndef _WIN32 if (bs->is_temporary) { assert(bs->filename[0] != '\0'); @@ -856,9 +948,10 @@ free_and_fail: * dictionary, it needs to use QINCREF() before calling bdrv_file_open. */ int bdrv_file_open(BlockDriverState **pbs, const char *filename, - QDict *options, int flags, Error **errp) + const char *reference, QDict *options, int flags, + Error **errp) { - BlockDriverState *bs; + BlockDriverState *bs = NULL; BlockDriver *drv; const char *drvname; bool allow_protocol_prefix = false; @@ -870,6 +963,24 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, options = qdict_new(); } + if (reference) { + if (filename || qdict_size(options)) { + error_setg(errp, "Cannot reference an existing block device with " + "additional options or a new filename"); + return -EINVAL; + } + QDECREF(options); + + bs = bdrv_find(reference); + if (!bs) { + error_setg(errp, "Cannot find block device '%s'", reference); + return -ENODEV; + } + bdrv_ref(bs); + *pbs = bs; + return 0; + } + bs = bdrv_new(""); bs->options = options; options = qdict_clone_shallow(options); @@ -927,14 +1038,19 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, goto fail; } - ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err); + if (!drv->bdrv_file_open) { + ret = bdrv_open(bs, filename, options, flags, drv, &local_err); + options = NULL; + } else { + ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err); + } if (ret < 0) { error_propagate(errp, local_err); goto fail; } /* Check if any unknown options were used */ - if (qdict_size(options) != 0) { + if (options && (qdict_size(options) != 0)) { const QDictEntry *entry = qdict_first(options); error_setg(errp, "Block protocol '%s' doesn't support the option '%s'", drv->format_name, entry->key); @@ -1014,11 +1130,91 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp) error_free(local_err); return ret; } - pstrcpy(bs->backing_file, sizeof(bs->backing_file), - bs->backing_hd->file->filename); + + if (bs->backing_hd->file) { + pstrcpy(bs->backing_file, sizeof(bs->backing_file), + bs->backing_hd->file->filename); + } + + /* Recalculate the BlockLimits with the backing file */ + bdrv_refresh_limits(bs); + return 0; } +/* + * Opens a disk image whose options are given as BlockdevRef in another block + * device's options. + * + * If force_raw is true, bdrv_file_open() will be used, thereby preventing any + * image format auto-detection. If it is false and a filename is given, + * bdrv_open() will be used for auto-detection. + * + * If allow_none is true, no image will be opened if filename is false and no + * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned. + * + * bdrev_key specifies the key for the image's BlockdevRef in the options QDict. + * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict + * itself, all options starting with "${bdref_key}." are considered part of the + * BlockdevRef. + * + * The BlockdevRef will be removed from the options QDict. + */ +int bdrv_open_image(BlockDriverState **pbs, const char *filename, + QDict *options, const char *bdref_key, int flags, + bool force_raw, bool allow_none, Error **errp) +{ + QDict *image_options; + int ret; + char *bdref_key_dot; + const char *reference; + + bdref_key_dot = g_strdup_printf("%s.", bdref_key); + qdict_extract_subqdict(options, &image_options, bdref_key_dot); + g_free(bdref_key_dot); + + reference = qdict_get_try_str(options, bdref_key); + if (!filename && !reference && !qdict_size(image_options)) { + if (allow_none) { + ret = 0; + } else { + error_setg(errp, "A block device must be specified for \"%s\"", + bdref_key); + ret = -EINVAL; + } + goto done; + } + + if (filename && !force_raw) { + /* If a filename is given and the block driver should be detected + automatically (instead of using none), use bdrv_open() in order to do + that auto-detection. */ + BlockDriverState *bs; + + if (reference) { + error_setg(errp, "Cannot reference an existing block device while " + "giving a filename"); + ret = -EINVAL; + goto done; + } + + bs = bdrv_new(""); + ret = bdrv_open(bs, filename, image_options, flags, NULL, errp); + if (ret < 0) { + bdrv_unref(bs); + } else { + *pbs = bs; + } + } else { + ret = bdrv_file_open(pbs, filename, reference, image_options, flags, + errp); + } + +done: + qdict_del(options, bdref_key); + return ret; +} + /* * Opens a disk image (raw, qcow2, vmdk, ...) * @@ -1034,7 +1230,6 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ char tmp_filename[PATH_MAX + 1]; BlockDriverState *file = NULL; - QDict *file_options = NULL; const char *drvname; Error *local_err = NULL; @@ -1052,21 +1247,16 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, int64_t total_size; BlockDriver *bdrv_qcow2; QEMUOptionParameter *create_options; - char backing_filename[PATH_MAX]; - - if (qdict_size(options) != 0) { - error_setg(errp, "Can't use snapshot=on with driver-specific options"); - ret = -EINVAL; - goto fail; - } - assert(filename != NULL); + QDict *snapshot_options; /* if snapshot, we create a temporary backing file and open it instead of opening 'filename' directly */ - /* if there is a backing file, use it */ + /* Get the required size from the image */ bs1 = bdrv_new(""); - ret = bdrv_open(bs1, filename, NULL, 0, drv, &local_err); + QINCREF(options); + ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING, + drv, &local_err); if (ret < 0) { bdrv_unref(bs1); goto fail; @@ -1075,33 +1265,18 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, bdrv_unref(bs1); + /* Create the temporary image */ ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not get temporary filename"); goto fail; } - /* Real path is meaningless for protocols */ - if (path_has_protocol(filename)) { - snprintf(backing_filename, sizeof(backing_filename), - "%s", filename); - } else if (!realpath(filename, backing_filename)) { - ret = -errno; - error_setg_errno(errp, errno, "Could not resolve path '%s'", filename); - goto fail; - } - bdrv_qcow2 = bdrv_find_format("qcow2"); create_options = parse_option_parameters("", bdrv_qcow2->create_options, NULL); set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size); - set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE, - backing_filename); - if (drv) { - set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT, - drv->format_name); - } ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err); free_option_parameters(create_options); @@ -1114,6 +1289,22 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, goto fail; } + /* Prepare a new options QDict for the temporary file, where user + * options refer to the backing file */ + if (filename) { + qdict_put(options, "file.filename", qstring_from_str(filename)); + } + if (drv) { + qdict_put(options, "driver", qstring_from_str(drv->format_name)); + } + + snapshot_options = qdict_new(); + qdict_put(snapshot_options, "backing", options); + qdict_flatten(snapshot_options); + + bs->options = snapshot_options; + options = qdict_clone_shallow(bs->options); + filename = tmp_filename; drv = bdrv_qcow2; bs->is_temporary = 1; @@ -1124,10 +1315,9 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, flags |= BDRV_O_ALLOW_RDWR; } - qdict_extract_subqdict(options, &file_options, "file."); - - ret = bdrv_file_open(&file, filename, file_options, - bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err); + ret = bdrv_open_image(&file, filename, options, "file", + bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true, + &local_err); if (ret < 0) { goto fail; } @@ -1137,10 +1327,21 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, if (drvname) { drv = bdrv_find_format(drvname); qdict_del(options, "driver"); + if (!drv) { + error_setg(errp, "Invalid driver: '%s'", drvname); + ret = -EINVAL; + goto unlink_and_fail; + } } if (!drv) { - ret = find_image_format(file, filename, &drv, &local_err); + if (file) { + ret = find_image_format(file, filename, &drv, &local_err); + } else { + error_setg(errp, "Must specify either driver or file"); + ret = -EINVAL; + goto unlink_and_fail; + } } if (!drv) { @@ -1153,7 +1354,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, goto unlink_and_fail; } - if (bs->file != file) { + if (file && (bs->file != file)) { bdrv_unref(file); file = NULL; } @@ -1424,6 +1625,8 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state) reopen_state->bs->enable_write_cache = !!(reopen_state->flags & BDRV_O_CACHE_WB); reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR); + + bdrv_refresh_limits(reopen_state->bs); } /* @@ -1498,7 +1701,7 @@ void bdrv_close_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_close(bs); } } @@ -1527,7 +1730,7 @@ static bool bdrv_requests_pending(BlockDriverState *bs) static bool bdrv_requests_pending_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (bdrv_requests_pending(bs)) { return true; } @@ -1554,13 +1757,8 @@ void bdrv_drain_all(void) BlockDriverState *bs; while (busy) { - /* FIXME: We do not have timer support here, so this is effectively - * a busy wait. - */ - QTAILQ_FOREACH(bs, &bdrv_states, list) { - if (bdrv_start_throttled_reqs(bs)) { - busy = true; - } + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + bdrv_start_throttled_reqs(bs); } busy = bdrv_requests_pending_all(); @@ -1568,14 +1766,19 @@ void bdrv_drain_all(void) } } -/* make a BlockDriverState anonymous by removing from bdrv_state list. +/* make a BlockDriverState anonymous by removing from bdrv_state and + * graph_bdrv_state list. Also, NULL terminate the device_name to prevent double remove */ void bdrv_make_anon(BlockDriverState *bs) { if (bs->device_name[0] != '\0') { - QTAILQ_REMOVE(&bdrv_states, bs, list); + QTAILQ_REMOVE(&bdrv_states, bs, device_list); } bs->device_name[0] = '\0'; + if (bs->node_name[0] != '\0') { + QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list); + } + bs->node_name[0] = '\0'; } static void bdrv_rebind(BlockDriverState *bs) @@ -1595,7 +1798,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, bs_dest->dev_ops = bs_src->dev_ops; bs_dest->dev_opaque = bs_src->dev_opaque; bs_dest->dev = bs_src->dev; - bs_dest->buffer_alignment = bs_src->buffer_alignment; + bs_dest->guest_block_size = bs_src->guest_block_size; bs_dest->copy_on_read = bs_src->copy_on_read; bs_dest->enable_write_cache = bs_src->enable_write_cache; @@ -1617,7 +1820,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, bs_dest->iostatus = bs_src->iostatus; /* dirty bitmap */ - bs_dest->dirty_bitmap = bs_src->dirty_bitmap; + bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps; /* reference count */ bs_dest->refcnt = bs_src->refcnt; @@ -1629,7 +1832,12 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, /* keep the same entry in bdrv_states */ pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name), bs_src->device_name); - bs_dest->list = bs_src->list; + bs_dest->device_list = bs_src->device_list; + + /* keep the same entry in graph_bdrv_states + * We do want to swap name but don't want to swap linked list entries + */ + bs_dest->node_list = bs_src->node_list; } /* @@ -1650,7 +1858,7 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old) /* bs_new must be anonymous and shouldn't have anything fancy enabled */ assert(bs_new->device_name[0] == '\0'); - assert(bs_new->dirty_bitmap == NULL); + assert(QLIST_EMPTY(&bs_new->dirty_bitmaps)); assert(bs_new->job == NULL); assert(bs_new->dev == NULL); assert(bs_new->in_use == 0); @@ -1711,6 +1919,7 @@ static void bdrv_delete(BlockDriverState *bs) assert(!bs->job); assert(!bs->in_use); assert(!bs->refcnt); + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); bdrv_close(bs); @@ -1746,7 +1955,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev) bs->dev = NULL; bs->dev_ops = NULL; bs->dev_opaque = NULL; - bs->buffer_alignment = 512; + bs->guest_block_size = 512; } /* TODO change to return DeviceState * when all users are qdevified */ @@ -1877,10 +2086,10 @@ int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) int bdrv_commit(BlockDriverState *bs) { BlockDriver *drv = bs->drv; - int64_t sector, total_sectors; + int64_t sector, total_sectors, length, backing_length; int n, ro, open_flags; int ret = 0; - uint8_t *buf; + uint8_t *buf = NULL; char filename[PATH_MAX]; if (!drv) @@ -1905,7 +2114,29 @@ int bdrv_commit(BlockDriverState *bs) } } - total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + length = bdrv_getlength(bs); + if (length < 0) { + ret = length; + goto ro_cleanup; + } + + backing_length = bdrv_getlength(bs->backing_hd); + if (backing_length < 0) { + ret = backing_length; + goto ro_cleanup; + } + + /* If our top snapshot is larger than the backing file image, + * grow the backing file image if possible. If not possible, + * we must return an error */ + if (length > backing_length) { + ret = bdrv_truncate(bs->backing_hd, length); + if (ret < 0) { + goto ro_cleanup; + } + } + + total_sectors = length >> BDRV_SECTOR_BITS; buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); for (sector = 0; sector < total_sectors; sector += n) { @@ -1914,13 +2145,13 @@ int bdrv_commit(BlockDriverState *bs) goto ro_cleanup; } if (ret) { - if (bdrv_read(bs, sector, buf, n) != 0) { - ret = -EIO; + ret = bdrv_read(bs, sector, buf, n); + if (ret < 0) { goto ro_cleanup; } - if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) { - ret = -EIO; + ret = bdrv_write(bs->backing_hd, sector, buf, n); + if (ret < 0) { goto ro_cleanup; } } @@ -1928,6 +2159,9 @@ int bdrv_commit(BlockDriverState *bs) if (drv->bdrv_make_empty) { ret = drv->bdrv_make_empty(bs); + if (ret < 0) { + goto ro_cleanup; + } bdrv_flush(bs); } @@ -1935,9 +2169,11 @@ int bdrv_commit(BlockDriverState *bs) * Make sure all data we wrote to the backing device is actually * stable on disk. */ - if (bs->backing_hd) + if (bs->backing_hd) { bdrv_flush(bs->backing_hd); + } + ret = 0; ro_cleanup: g_free(buf); @@ -1953,7 +2189,7 @@ int bdrv_commit_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (bs->drv && bs->backing_hd) { int ret = bdrv_commit(bs); if (ret < 0) { @@ -1971,6 +2207,10 @@ int bdrv_commit_all(void) */ static void tracked_request_end(BdrvTrackedRequest *req) { + if (req->serialising) { + req->bs->serialising_in_flight--; + } + QLIST_REMOVE(req, list); qemu_co_queue_restart_all(&req->wait_queue); } @@ -1980,15 +2220,18 @@ static void tracked_request_end(BdrvTrackedRequest *req) */ static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, bool is_write) + int64_t offset, + unsigned int bytes, bool is_write) { *req = (BdrvTrackedRequest){ .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .is_write = is_write, - .co = qemu_coroutine_self(), + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, }; qemu_co_queue_init(&req->wait_queue); @@ -1996,6 +2239,21 @@ static void tracked_request_begin(BdrvTrackedRequest *req, QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); } +static void mark_request_serialising(BdrvTrackedRequest *req, size_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + /** * Round a region to cluster boundaries */ @@ -2017,53 +2275,75 @@ void bdrv_round_to_clusters(BlockDriverState *bs, } } +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t sector_num, int nb_sectors) { + int64_t offset, unsigned int bytes) +{ /* aaaa bbbb */ - if (sector_num >= req->sector_num + req->nb_sectors) { + if (offset >= req->overlap_offset + req->overlap_bytes) { return false; } /* bbbb aaaa */ - if (req->sector_num >= sector_num + nb_sectors) { + if (req->overlap_offset >= offset + bytes) { return false; } return true; } -static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) { + BlockDriverState *bs = self->bs; BdrvTrackedRequest *req; - int64_t cluster_sector_num; - int cluster_nb_sectors; bool retry; + bool waited = false; - /* If we touch the same cluster it counts as an overlap. This guarantees - * that allocating writes will be serialized and not race with each other - * for the same cluster. For example, in copy-on-read it ensures that the - * CoR read and write operations are atomic and guest writes cannot - * interleave between them. - */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + if (!bs->serialising_in_flight) { + return false; + } do { retry = false; QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (tracked_request_overlaps(req, cluster_sector_num, - cluster_nb_sectors)) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { /* Hitting this means there was a reentrant request, for * example, a block driver issuing nested requests. This must * never happen since it means deadlock. */ assert(qemu_coroutine_self() != req->co); - qemu_co_queue_wait(&req->wait_queue); - retry = true; - break; + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). */ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } } } } while (retry); + + return waited; } /* @@ -2225,6 +2505,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, } new_top_bs->backing_hd = base_bs; + bdrv_refresh_limits(new_top_bs); QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { /* so that bdrv_close() does not recursively close the chain */ @@ -2272,8 +2553,7 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, typedef struct RwCo { BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; + int64_t offset; QEMUIOVector *qiov; bool is_write; int ret; @@ -2285,34 +2565,32 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) RwCo *rwco = opaque; if (!rwco->is_write) { - rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } } /* * Process a vectored synchronous request using coroutines */ -static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) { Coroutine *co; RwCo rwco = { .bs = bs, - .sector_num = sector_num, - .nb_sectors = qiov->size >> BDRV_SECTOR_BITS, + .offset = offset, .qiov = qiov, .is_write = is_write, .ret = NOT_DONE, .flags = flags, }; - assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0); /** * In sync call context, when the vcpu is blocked, this throttling timer @@ -2351,7 +2629,8 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, }; qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); } /* return < 0 if error. See bdrv_write() for the return codes */ @@ -2387,128 +2666,102 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } -int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) { - return bdrv_rwv_co(bs, sector_num, qiov, true, 0); + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). + */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) { - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE); + int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE; + int64_t ret, nb_sectors, sector_num = 0; + int n; + + for (;;) { + nb_sectors = target_size - sector_num; + if (nb_sectors <= 0) { + return 0; + } + if (nb_sectors > INT_MAX) { + nb_sectors = INT_MAX; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, - void *buf, int count1) +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) { - uint8_t tmp_buf[BDRV_SECTOR_SIZE]; - int len, nb_sectors, count; - int64_t sector_num; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; int ret; - count = count1; - /* first read to align to sector start */ - len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); - if (len > count) - len = count; - sector_num = offset >> BDRV_SECTOR_BITS; - if (len > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len); - count -= len; - if (count == 0) - return count1; - sector_num++; - buf += len; - } - - /* read the sectors "in place" */ - nb_sectors = count >> BDRV_SECTOR_BITS; - if (nb_sectors > 0) { - if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0) - return ret; - sector_num += nb_sectors; - len = nb_sectors << BDRV_SECTOR_BITS; - buf += len; - count -= len; + if (bytes < 0) { + return -EINVAL; } - /* add data from the last sector */ - if (count > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - memcpy(buf, tmp_buf, count); + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; } - return count1; + + return bytes; } int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) { - uint8_t tmp_buf[BDRV_SECTOR_SIZE]; - int len, nb_sectors, count; - int64_t sector_num; int ret; - count = qiov->size; - - /* first write to align to sector start */ - len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); - if (len > count) - len = count; - sector_num = offset >> BDRV_SECTOR_BITS; - if (len > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), - len); - if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - count -= len; - if (count == 0) - return qiov->size; - sector_num++; - } - - /* write the sectors "in place" */ - nb_sectors = count >> BDRV_SECTOR_BITS; - if (nb_sectors > 0) { - QEMUIOVector qiov_inplace; - - qemu_iovec_init(&qiov_inplace, qiov->niov); - qemu_iovec_concat(&qiov_inplace, qiov, len, - nb_sectors << BDRV_SECTOR_BITS); - ret = bdrv_writev(bs, sector_num, &qiov_inplace); - qemu_iovec_destroy(&qiov_inplace); - if (ret < 0) { - return ret; - } - - sector_num += nb_sectors; - len = nb_sectors << BDRV_SECTOR_BITS; - count -= len; + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; } - /* add data from the last sector */ - if (count > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count); - if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - } return qiov->size; } int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int count1) + const void *buf, int bytes) { QEMUIOVector qiov; struct iovec iov = { .iov_base = (void *) buf, - .iov_len = count1, + .iov_len = bytes, }; + if (bytes < 0) { + return -EINVAL; + } + qemu_iovec_init_external(&qiov, &iov, 1); return bdrv_pwritev(bs, offset, &qiov); } @@ -2577,7 +2830,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, if (drv->bdrv_co_write_zeroes && buffer_is_zero(bounce_buffer, iov.iov_len)) { ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors); + cluster_nb_sectors, 0); } else { /* This does not change the data on the disk, it is not necessary * to flush even in cache=writethrough mode. @@ -2604,40 +2857,34 @@ err: } /* - * Handle a read request in coroutine context + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. */ -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; int ret; - if (!drv) { - return -ENOMEDIUM; - } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; - } - - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } - if (flags & BDRV_REQ_COPY_ON_READ) { - bs->copy_on_read_in_flight++; - } + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); - } + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, nb_sectors, false); + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. */ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); } - tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + wait_serialising_requests(req); if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; @@ -2653,6 +2900,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } } + /* Forward the request to the BlockDriver */ if (!(bs->zero_beyond_eof && bs->growable)) { ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); } else { @@ -2666,7 +2914,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); - max_nb_sectors = MAX(0, total_sectors - sector_num); + max_nb_sectors = MAX(0, ROUND_UP(total_sectors - sector_num, + align >> BDRV_SECTOR_BITS)); if (max_nb_sectors > 0) { ret = drv->bdrv_co_readv(bs, sector_num, MIN(nb_sectors, max_nb_sectors), qiov); @@ -2684,18 +2933,98 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } out: - tracked_request_end(&req); - - if (flags & BDRV_REQ_COPY_ON_READ) { - bs->copy_on_read_in_flight--; - } - return ret; } -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (bdrv_check_byte_request(bs, offset, bytes)) { + return -EIO; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? &local_qiov : qiov, + flags); + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ trace_bdrv_co_readv(bs, sector_num, nb_sectors); return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); @@ -2710,86 +3039,119 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, BDRV_REQ_COPY_ON_READ); } +/* if no limit is specified in the BlockLimits use a default + * of 32768 512-byte sectors (16 MiB) per request. + */ +#define MAX_WRITE_ZEROES_DEFAULT 32768 + static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; - struct iovec iov; - int ret; + struct iovec iov = {0}; + int ret = 0; - /* TODO Emulate only part of misaligned requests instead of letting block - * drivers return -ENOTSUP and emulate everything */ + int max_write_zeroes = bs->bl.max_write_zeroes ? + bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; - /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); - if (ret != -ENOTSUP) { - return ret; + while (nb_sectors > 0 && !ret) { + int num = nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } } - } - /* Fall back to bounce buffer if write zeroes is unsupported */ - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = qemu_blockalign(bs, iov.iov_len); - memset(iov.iov_base, 0, iov.iov_len); - qemu_iovec_init_external(&qiov, &iov, 1); + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } + + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE); + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for all + * all future requests. + */ + if (num < max_write_zeroes) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } qemu_vfree(iov.iov_base); return ret; } /* - * Handle a write request in coroutine context + * Forwards an already correctly aligned write request to the BlockDriver. */ -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; + bool waited; int ret; - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EACCES; - } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; - } - - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); - } + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, nb_sectors, true); - } + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (ret < 0) { /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); if (ret == 0 && !bs->enable_write_cache) { ret = bdrv_co_flush(bs); } - if (bs->dirty_bitmap) { - bdrv_set_dirty(bs, sector_num, nb_sectors); - } + bdrv_set_dirty(bs, sector_num, nb_sectors); if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { bs->wr_highest_sector = sector_num + nb_sectors - 1; @@ -2798,11 +3160,143 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); } + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + if (bdrv_check_byte_request(bs, offset, bytes)) { + return -EIO; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. + */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? &local_qiov : qiov, + flags); + +fail: tracked_request_end(&req); + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + return ret; } +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -2812,12 +3306,17 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, } int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) { - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE); + BDRV_REQ_ZERO_WRITE | flags); } /** @@ -3021,11 +3520,12 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name), } } +/* This function is to find block backend bs */ BlockDriverState *bdrv_find(const char *name) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (!strcmp(name, bs->device_name)) { return bs; } @@ -3033,19 +3533,83 @@ BlockDriverState *bdrv_find(const char *name) return NULL; } +/* This function is to find a node in the bs graph */ +BlockDriverState *bdrv_find_node(const char *node_name) +{ + BlockDriverState *bs; + + assert(node_name); + + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + if (!strcmp(node_name, bs->node_name)) { + return bs; + } + } + return NULL; +} + +/* Put this QMP function here so it can access the static graph_bdrv_states. */ +BlockDeviceInfoList *bdrv_named_nodes_list(void) +{ + BlockDeviceInfoList *list, *entry; + BlockDriverState *bs; + + list = NULL; + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + entry = g_malloc0(sizeof(*entry)); + entry->value = bdrv_block_device_info(bs); + entry->next = list; + list = entry; + } + + return list; +} + +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp) +{ + BlockDriverState *bs = NULL; + + if ((!device && !node_name) || (device && node_name)) { + error_setg(errp, "Use either device or node-name but not both"); + return NULL; + } + + if (device) { + bs = bdrv_find(device); + + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, device); + return NULL; + } + + return bs; + } + + bs = bdrv_find_node(node_name); + + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, node_name); + return NULL; + } + + return bs; +} + BlockDriverState *bdrv_next(BlockDriverState *bs) { if (!bs) { return QTAILQ_FIRST(&bdrv_states); } - return QTAILQ_NEXT(bs, list); + return QTAILQ_NEXT(bs, device_list); } void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { it(opaque, bs); } } @@ -3065,7 +3629,7 @@ int bdrv_flush_all(void) BlockDriverState *bs; int result = 0; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { int ret = bdrv_flush(bs); if (ret < 0 && !result) { result = ret; @@ -3097,6 +3661,36 @@ int bdrv_has_zero_init(BlockDriverState *bs) return 0; } +bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + + if (bs->backing_hd) { + return false; + } + + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.unallocated_blocks_are_zero; + } + + return false; +} + +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + + if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { + return false; + } + + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.can_write_zeroes_with_unmap; + } + + return false; +} + typedef struct BdrvCoGetBlockStatusData { BlockDriverState *bs; BlockDriverState *base; @@ -3166,8 +3760,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, *pnum, pnum); } - if (!(ret & BDRV_BLOCK_DATA)) { - if (bdrv_has_zero_init(bs)) { + if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { + if (bdrv_unallocated_blocks_are_zero(bs)) { ret |= BDRV_BLOCK_ZERO; } else if (bs->backing_hd) { BlockDriverState *bs2 = bs->backing_hd; @@ -3325,7 +3919,7 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return -EIO; - assert(!bs->dirty_bitmap); + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); } @@ -3414,6 +4008,19 @@ int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, return -ENOTSUP; } +int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) +{ + while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { + bs = bs->file; + } + + if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { + return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); + } + + return -ENOTSUP; +} + int bdrv_debug_resume(BlockDriverState *bs, const char *tag) { while (bs && bs->drv && !bs->drv->bdrv_debug_resume) { @@ -3549,7 +4156,7 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, { trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, cb, opaque, false); } @@ -3559,7 +4166,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, { trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockDriverCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, cb, opaque, true); } @@ -3731,8 +4349,10 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) /* Run the aio requests. */ mcb->num_requests = num_reqs; for (i = 0; i < num_reqs; i++) { - bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, multiwrite_cb, mcb); + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); } return 0; @@ -3874,10 +4494,10 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque) if (!acb->is_write) { acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, 0); + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); } else { acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, 0); + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); } acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); @@ -3888,6 +4508,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BdrvRequestFlags flags, BlockDriverCompletionFunc *cb, void *opaque, bool is_write) @@ -3899,6 +4520,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, acb->req.sector = sector_num; acb->req.nb_sectors = nb_sectors; acb->req.qiov = qiov; + acb->req.flags = flags; acb->is_write = is_write; acb->done = NULL; @@ -4131,7 +4753,7 @@ void bdrv_invalidate_cache_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_invalidate_cache(bs); } } @@ -4140,7 +4762,7 @@ void bdrv_clear_incoming_migration_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING); } } @@ -4167,16 +4789,29 @@ int bdrv_flush(BlockDriverState *bs) return rwco.ret; } +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; static void coroutine_fn bdrv_discard_co_entry(void *opaque) { - RwCo *rwco = opaque; + DiscardCo *rwco = opaque; rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); } +/* if no limit is specified in the BlockLimits use a default + * of 32768 512-byte sectors (16 MiB) per request. + */ +#define MAX_DISCARD_DEFAULT 32768 + int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { + int max_discard; + if (!bs->drv) { return -ENOMEDIUM; } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { @@ -4185,40 +4820,68 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return -EROFS; } - if (bs->dirty_bitmap) { - bdrv_reset_dirty(bs, sector_num, nb_sectors); - } + bdrv_reset_dirty(bs, sector_num, nb_sectors); /* Do nothing if disabled. */ if (!(bs->open_flags & BDRV_O_UNMAP)) { return 0; } - if (bs->drv->bdrv_co_discard) { - return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors); - } else if (bs->drv->bdrv_aio_discard) { - BlockDriverAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + return 0; + } - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - return -EIO; + max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT; + while (nb_sectors > 0) { + int ret; + int num = nb_sectors; + + /* align request */ + if (bs->bl.discard_alignment && + num >= bs->bl.discard_alignment && + sector_num % bs->bl.discard_alignment) { + if (num > bs->bl.discard_alignment) { + num = bs->bl.discard_alignment; + } + num -= sector_num % bs->bl.discard_alignment; + } + + /* limit request size */ + if (num > max_discard) { + num = max_discard; + } + + if (bs->drv->bdrv_co_discard) { + ret = bs->drv->bdrv_co_discard(bs, sector_num, num); } else { - qemu_coroutine_yield(); - return co.ret; + BlockDriverAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } } - } else { - return 0; + if (ret && ret != -ENOTSUP) { + return ret; + } + + sector_num += num; + nb_sectors -= num; } + return 0; } int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { Coroutine *co; - RwCo rwco = { + DiscardCo rwco = { .bs = bs, .sector_num = sector_num, .nb_sectors = nb_sectors, @@ -4323,14 +4986,14 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, return NULL; } -void bdrv_set_buffer_alignment(BlockDriverState *bs, int align) +void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { - bs->buffer_alignment = align; + bs->guest_block_size = align; } void *qemu_blockalign(BlockDriverState *bs, size_t size) { - return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size); + return qemu_memalign(bdrv_opt_mem_align(bs), size); } /* @@ -4339,9 +5002,13 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size) bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) { int i; + size_t alignment = bdrv_opt_mem_align(bs); for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { return false; } } @@ -4349,58 +5016,90 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return true; } -void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity) +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity) { int64_t bitmap_size; + BdrvDirtyBitmap *bitmap; assert((granularity & (granularity - 1)) == 0); - if (granularity) { - granularity >>= BDRV_SECTOR_BITS; - assert(!bs->dirty_bitmap); - bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); - bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); - } else { - if (bs->dirty_bitmap) { - hbitmap_free(bs->dirty_bitmap); - bs->dirty_bitmap = NULL; + granularity >>= BDRV_SECTOR_BITS; + assert(granularity); + bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); + bitmap = g_malloc0(sizeof(BdrvDirtyBitmap)); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); + QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + return bitmap; +} + +void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +{ + BdrvDirtyBitmap *bm, *next; + QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { + if (bm == bitmap) { + QLIST_REMOVE(bitmap, list); + hbitmap_free(bitmap->bitmap); + g_free(bitmap); + return; } } } -int bdrv_get_dirty(BlockDriverState *bs, int64_t sector) +BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) { - if (bs->dirty_bitmap) { - return hbitmap_get(bs->dirty_bitmap, sector); + BdrvDirtyBitmap *bm; + BlockDirtyInfoList *list = NULL; + BlockDirtyInfoList **plist = &list; + + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo)); + BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList)); + info->count = bdrv_get_dirty_count(bs, bm); + info->granularity = + ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); + entry->value = info; + *plist = entry; + plist = &entry->next; + } + + return list; +} + +int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector) +{ + if (bitmap) { + return hbitmap_get(bitmap->bitmap, sector); } else { return 0; } } -void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi) +void bdrv_dirty_iter_init(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) { - hbitmap_iter_init(hbi, bs->dirty_bitmap, 0); + hbitmap_iter_init(hbi, bitmap->bitmap, 0); } void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { - hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors); + BdrvDirtyBitmap *bitmap; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); + } } -void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { - hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors); + BdrvDirtyBitmap *bitmap; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); + } } -int64_t bdrv_get_dirty_count(BlockDriverState *bs) +int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { - if (bs->dirty_bitmap) { - return hbitmap_count(bs->dirty_bitmap); - } else { - return 0; - } + return hbitmap_count(bitmap->bitmap); } /* Get a reference to bs */ @@ -4499,7 +5198,6 @@ void bdrv_img_create(const char *filename, const char *fmt, { QEMUOptionParameter *param = NULL, *create_options = NULL; QEMUOptionParameter *backing_fmt, *backing_file, *size; - BlockDriverState *bs = NULL; BlockDriver *drv, *proto_drv; BlockDriver *backing_drv = NULL; Error *local_err = NULL; @@ -4578,6 +5276,7 @@ void bdrv_img_create(const char *filename, const char *fmt, size = get_option_parameter(param, BLOCK_OPT_SIZE); if (size && size->value.n == -1) { if (backing_file && backing_file->value.s) { + BlockDriverState *bs; uint64_t size; char buf[32]; int back_flags; @@ -4596,6 +5295,7 @@ void bdrv_img_create(const char *filename, const char *fmt, error_get_pretty(local_err)); error_free(local_err); local_err = NULL; + bdrv_unref(bs); goto out; } bdrv_get_geometry(bs, &size); @@ -4603,6 +5303,8 @@ void bdrv_img_create(const char *filename, const char *fmt, snprintf(buf, sizeof(buf), "%" PRId64, size); set_option_parameter(param, BLOCK_OPT_SIZE, buf); + + bdrv_unref(bs); } else { error_setg(errp, "Image creation needs a size parameter"); goto out; @@ -4633,9 +5335,6 @@ out: free_option_parameters(create_options); free_option_parameters(param); - if (bs) { - bdrv_unref(bs); - } if (error_is_set(&local_err)) { error_propagate(errp, local_err); } @@ -4661,21 +5360,68 @@ int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options) return bs->drv->bdrv_amend_options(bs, options); } -ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs) +/* Used to recurse on single child block filters. + * Single child block filter will store their child in bs->file. + */ +bool bdrv_generic_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) { - if (bs->drv->bdrv_check_ext_snapshot) { - return bs->drv->bdrv_check_ext_snapshot(bs); + if (!bs->drv) { + return false; + } + + if (!bs->drv->authorizations[BS_IS_A_FILTER]) { + if (bs == candidate) { + return true; + } else { + return false; + } } - if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) { - return bs->file->drv->bdrv_check_ext_snapshot(bs); + if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) { + return false; } - /* external snapshots are allowed by default */ - return EXT_SNAPSHOT_ALLOWED; + if (!bs->file) { + return false; + } + + return bdrv_recurse_is_first_non_filter(bs->file, candidate); +} + +bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) { + return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate); + } + + return bdrv_generic_is_first_non_filter(bs, candidate); } -ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs) +/* This function checks if the candidate is the first non filter bs down it's + * bs chain. Since we don't have pointers to parents it explore all bs chains + * from the top. Some filters can choose not to pass down the recursion. + */ +bool bdrv_is_first_non_filter(BlockDriverState *candidate) { - return EXT_SNAPSHOT_FORBIDDEN; + BlockDriverState *bs; + + /* walk down the bs forest recursively */ + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + bool perm; + + if (!bs->file) { + continue; + } + + perm = bdrv_recurse_is_first_non_filter(bs->file, candidate); + + /* candidate is the first non filter */ + if (perm) { + return true; + } + } + + return false; }