]> Git Repo - qemu.git/blobdiff - block.c
block: add accounting for merged requests
[qemu.git] / block.c
diff --git a/block.c b/block.c
index f80e2b2b58bbcec7207c47f794883f568d8b975b..8272ef901e75bae70aec6cf6c59a173c2fa12c62 100644 (file)
--- a/block.c
+++ b/block.c
@@ -28,6 +28,7 @@
 #include "block/blockjob.h"
 #include "qemu/module.h"
 #include "qapi/qmp/qjson.h"
+#include "sysemu/block-backend.h"
 #include "sysemu/sysemu.h"
 #include "qemu/notify.h"
 #include "block/coroutine.h"
@@ -57,13 +58,12 @@ struct BdrvDirtyBitmap {
 
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
 
-static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
-static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque);
-static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+        BlockCompletionFunc *cb, void *opaque);
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque);
+        BlockCompletionFunc *cb, void *opaque);
 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
@@ -76,14 +76,14 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);
-static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
-                                               int64_t sector_num,
-                                               QEMUIOVector *qiov,
-                                               int nb_sectors,
-                                               BdrvRequestFlags flags,
-                                               BlockDriverCompletionFunc *cb,
-                                               void *opaque,
-                                               bool is_write);
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+                                         int64_t sector_num,
+                                         QEMUIOVector *qiov,
+                                         int nb_sectors,
+                                         BdrvRequestFlags flags,
+                                         BlockCompletionFunc *cb,
+                                         void *opaque,
+                                         bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
@@ -97,6 +97,10 @@ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
     QLIST_HEAD_INITIALIZER(bdrv_drivers);
 
+static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
+                           int nr_sectors);
+static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
+                             int nr_sectors);
 /* If non-zero, use only whitelisted block drivers */
 static int use_bdrv_whitelist;
 
@@ -229,7 +233,7 @@ size_t bdrv_opt_mem_align(BlockDriverState *bs)
 }
 
 /* check if the path starts with "<protocol>:" */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
 {
     const char *p;
 
@@ -303,15 +307,32 @@ void path_combine(char *dest, int dest_size,
     }
 }
 
-void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
+void bdrv_get_full_backing_filename_from_filename(const char *backed,
+                                                  const char *backing,
+                                                  char *dest, size_t sz,
+                                                  Error **errp)
 {
-    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
-        pstrcpy(dest, sz, bs->backing_file);
+    if (backing[0] == '\0' || path_has_protocol(backing) ||
+        path_is_absolute(backing))
+    {
+        pstrcpy(dest, sz, backing);
+    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
+        error_setg(errp, "Cannot use relative backing file names for '%s'",
+                   backed);
     } else {
-        path_combine(dest, sz, bs->filename, bs->backing_file);
+        path_combine(dest, sz, backed, backing);
     }
 }
 
+void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
+                                    Error **errp)
+{
+    char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;
+
+    bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
+                                                 dest, sz, errp);
+}
+
 void bdrv_register(BlockDriver *bdrv)
 {
     /* Block drivers without coroutine functions need emulation */
@@ -332,29 +353,21 @@ void bdrv_register(BlockDriver *bdrv)
     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 }
 
-/* create a new block device (by default it is empty) */
-BlockDriverState *bdrv_new(const char *device_name, Error **errp)
+BlockDriverState *bdrv_new_root(void)
+{
+    BlockDriverState *bs = bdrv_new();
+
+    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
+    return bs;
+}
+
+BlockDriverState *bdrv_new(void)
 {
     BlockDriverState *bs;
     int i;
 
-    if (bdrv_find(device_name)) {
-        error_setg(errp, "Device with id '%s' already exists",
-                   device_name);
-        return NULL;
-    }
-    if (bdrv_find_node(device_name)) {
-        error_setg(errp, "Device with node-name '%s' already exists",
-                   device_name);
-        return NULL;
-    }
-
-    bs = g_malloc0(sizeof(BlockDriverState));
+    bs = g_new0(BlockDriverState, 1);
     QLIST_INIT(&bs->dirty_bitmaps);
-    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
-    if (device_name[0] != '\0') {
-        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
-    }
     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
         QLIST_INIT(&bs->op_blockers[i]);
     }
@@ -471,7 +484,7 @@ int bdrv_create(BlockDriver *drv, const char* filename,
         co = qemu_coroutine_create(bdrv_create_co_entry);
         qemu_coroutine_enter(co, &cco);
         while (cco.ret == NOT_DONE) {
-            qemu_aio_wait();
+            aio_poll(qemu_get_aio_context(), true);
         }
     }
 
@@ -508,30 +521,43 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
     return ret;
 }
 
-int bdrv_refresh_limits(BlockDriverState *bs)
+void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BlockDriver *drv = bs->drv;
+    Error *local_err = NULL;
 
     memset(&bs->bl, 0, sizeof(bs->bl));
 
     if (!drv) {
-        return 0;
+        return;
     }
 
     /* Take some limits from the children as a default */
     if (bs->file) {
-        bdrv_refresh_limits(bs->file);
+        bdrv_refresh_limits(bs->file, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
+        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
     } else {
         bs->bl.opt_mem_alignment = 512;
     }
 
     if (bs->backing_hd) {
-        bdrv_refresh_limits(bs->backing_hd);
+        bdrv_refresh_limits(bs->backing_hd, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
         bs->bl.opt_transfer_length =
             MAX(bs->bl.opt_transfer_length,
                 bs->backing_hd->bl.opt_transfer_length);
+        bs->bl.max_transfer_length =
+            MIN_NON_ZERO(bs->bl.max_transfer_length,
+                         bs->backing_hd->bl.max_transfer_length);
         bs->bl.opt_mem_alignment =
             MAX(bs->bl.opt_mem_alignment,
                 bs->backing_hd->bl.opt_mem_alignment);
@@ -539,10 +565,8 @@ int bdrv_refresh_limits(BlockDriverState *bs)
 
     /* Then let the driver override it */
     if (drv->bdrv_refresh_limits) {
-        return drv->bdrv_refresh_limits(bs);
+        drv->bdrv_refresh_limits(bs, errp);
     }
-
-    return 0;
 }
 
 /*
@@ -626,7 +650,7 @@ BlockDriver *bdrv_find_protocol(const char *filename,
     }
 
     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
-        return bdrv_find_format("file");
+        return &bdrv_file;
     }
 
     p = strchr(filename, ':');
@@ -645,22 +669,49 @@ BlockDriver *bdrv_find_protocol(const char *filename,
     return NULL;
 }
 
+/*
+ * Guess image format by probing its contents.
+ * This is not a good idea when your image is raw (CVE-2008-2004), but
+ * we do it anyway for backward compatibility.
+ *
+ * @buf         contains the image's first @buf_size bytes.
+ * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
+ *              but can be smaller if the image file is smaller)
+ * @filename    is its filename.
+ *
+ * For all block drivers, call the bdrv_probe() method to get its
+ * probing score.
+ * Return the first block driver with the highest probing score.
+ */
+BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
+                            const char *filename)
+{
+    int score_max = 0, score;
+    BlockDriver *drv = NULL, *d;
+
+    QLIST_FOREACH(d, &bdrv_drivers, list) {
+        if (d->bdrv_probe) {
+            score = d->bdrv_probe(buf, buf_size, filename);
+            if (score > score_max) {
+                score_max = score;
+                drv = d;
+            }
+        }
+    }
+
+    return drv;
+}
+
 static int find_image_format(BlockDriverState *bs, const char *filename,
                              BlockDriver **pdrv, Error **errp)
 {
-    int score, score_max;
-    BlockDriver *drv1, *drv;
-    uint8_t buf[2048];
+    BlockDriver *drv;
+    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
     int ret = 0;
 
     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
-        drv = bdrv_find_format("raw");
-        if (!drv) {
-            error_setg(errp, "Could not find raw image format");
-            ret = -ENOENT;
-        }
-        *pdrv = drv;
+        *pdrv = &bdrv_raw;
         return ret;
     }
 
@@ -672,17 +723,7 @@ static int find_image_format(BlockDriverState *bs, const char *filename,
         return ret;
     }
 
-    score_max = 0;
-    drv = NULL;
-    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
-        if (drv1->bdrv_probe) {
-            score = drv1->bdrv_probe(buf, ret, filename);
-            if (score > score_max) {
-                score_max = score;
-                drv = drv1;
-            }
-        }
-    }
+    drv = bdrv_probe_all(buf, ret, filename);
     if (!drv) {
         error_setg(errp, "Could not determine image format: No compatible "
                    "driver found");
@@ -694,6 +735,7 @@ static int find_image_format(BlockDriverState *bs, const char *filename,
 
 /**
  * Set the current 'total_sectors' value
+ * Return 0 on success, -errno on error.
  */
 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
 {
@@ -851,14 +893,14 @@ static void bdrv_assign_node_name(BlockDriverState *bs,
         return;
     }
 
-    /* empty string node name is invalid */
-    if (node_name[0] == '\0') {
-        error_setg(errp, "Empty node name");
+    /* Check for empty string or invalid characters */
+    if (!id_wellformed(node_name)) {
+        error_setg(errp, "Invalid node name");
         return;
     }
 
     /* takes care of avoiding namespaces collisions */
-    if (bdrv_find(node_name)) {
+    if (blk_by_name(node_name)) {
         error_setg(errp, "node-name=%s is conflicting with a device id",
                    node_name);
         return;
@@ -954,6 +996,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
     } else {
         bs->filename[0] = '\0';
     }
+    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
 
     bs->drv = drv;
     bs->opaque = g_malloc0(drv->instance_size);
@@ -993,7 +1036,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
         goto free_and_fail;
     }
 
-    bdrv_refresh_limits(bs);
+    bdrv_refresh_limits(bs, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto free_and_fail;
+    }
+
     assert(bdrv_opt_mem_align(bs) != 0);
     assert((bs->request_alignment != 0) || bs->sg);
     return 0;
@@ -1135,7 +1184,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
     } else if (backing_hd) {
         error_setg(&bs->backing_blocker,
                    "device is used as backing hd of '%s'",
-                   bs->device_name);
+                   bdrv_get_device_name(bs));
     }
 
     bs->backing_hd = backing_hd;
@@ -1151,10 +1200,10 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
 
     bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
     /* Otherwise we won't be able to commit due to check in bdrv_commit */
-    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
+    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                     bs->backing_blocker);
 out:
-    bdrv_refresh_limits(bs);
+    bdrv_refresh_limits(bs, NULL);
 }
 
 /*
@@ -1169,7 +1218,6 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
 {
     char *backing_filename = g_malloc0(PATH_MAX);
     int ret = 0;
-    BlockDriver *back_drv = NULL;
     BlockDriverState *backing_hd;
     Error *local_err = NULL;
 
@@ -1190,7 +1238,14 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
         QDECREF(options);
         goto free_exit;
     } else {
-        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
+        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
+                                       &local_err);
+        if (local_err) {
+            ret = -EINVAL;
+            error_propagate(errp, local_err);
+            QDECREF(options);
+            goto free_exit;
+        }
     }
 
     if (!bs->drv || !bs->drv->supports_backing) {
@@ -1200,16 +1255,16 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
         goto free_exit;
     }
 
-    backing_hd = bdrv_new("", errp);
+    backing_hd = bdrv_new();
 
-    if (bs->backing_format[0] != '\0') {
-        back_drv = bdrv_find_format(bs->backing_format);
+    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
+        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
     }
 
     assert(bs->backing_hd == NULL);
     ret = bdrv_open(&backing_hd,
                     *backing_filename ? backing_filename : NULL, NULL, options,
-                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
+                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
     if (ret < 0) {
         bdrv_unref(backing_hd);
         backing_hd = NULL;
@@ -1283,7 +1338,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
     char *tmp_filename = g_malloc0(PATH_MAX + 1);
     int64_t total_size;
-    BlockDriver *bdrv_qcow2;
     QemuOpts *opts = NULL;
     QDict *snapshot_options;
     BlockDriverState *bs_snapshot;
@@ -1300,7 +1354,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
         error_setg_errno(errp, -total_size, "Could not get image size");
         goto out;
     }
-    total_size &= BDRV_SECTOR_MASK;
 
     /* Create the temporary image */
     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
@@ -1309,11 +1362,10 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
         goto out;
     }
 
-    bdrv_qcow2 = bdrv_find_format("qcow2");
-    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
+    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                             &error_abort);
     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
-    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
+    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
     qemu_opts_del(opts);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "Could not create temporary overlay "
@@ -1330,10 +1382,10 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
     qdict_put(snapshot_options, "file.filename",
               qstring_from_str(tmp_filename));
 
-    bs_snapshot = bdrv_new("", &error_abort);
+    bs_snapshot = bdrv_new();
 
     ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
-                    flags, bdrv_qcow2, &local_err);
+                    flags, &bdrv_qcow2, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto out;
@@ -1401,7 +1453,7 @@ int bdrv_open(BlockDriverState **pbs, const char *filename,
     if (*pbs) {
         bs = *pbs;
     } else {
-        bs = bdrv_new("", &error_abort);
+        bs = bdrv_new();
     }
 
     /* NULL means an empty set of options */
@@ -1457,6 +1509,7 @@ int bdrv_open(BlockDriverState **pbs, const char *filename,
     }
 
     /* Image format probing */
+    bs->probed = !drv;
     if (!drv && file) {
         ret = find_image_format(file, filename, &drv, &local_err);
         if (ret < 0) {
@@ -1490,6 +1543,8 @@ int bdrv_open(BlockDriverState **pbs, const char *filename,
         }
     }
 
+    bdrv_refresh_filename(bs);
+
     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
      * temporary snapshot afterwards. */
     if (snapshot_flags) {
@@ -1508,7 +1563,7 @@ int bdrv_open(BlockDriverState **pbs, const char *filename,
         } else {
             error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                        "support the option '%s'", drv->format_name,
-                       bs->device_name, entry->key);
+                       bdrv_get_device_name(bs), entry->key);
         }
 
         ret = -EINVAL;
@@ -1516,7 +1571,9 @@ int bdrv_open(BlockDriverState **pbs, const char *filename,
     }
 
     if (!bdrv_key_required(bs)) {
-        bdrv_dev_change_media_cb(bs, true);
+        if (bs->blk) {
+            blk_dev_change_media_cb(bs->blk, true);
+        }
     } else if (!runstate_check(RUN_STATE_PRELAUNCH)
                && !runstate_check(RUN_STATE_INMIGRATE)
                && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
@@ -1715,7 +1772,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
         reopen_state->flags & BDRV_O_RDWR) {
         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
-                  reopen_state->bs->device_name);
+                  bdrv_get_device_name(reopen_state->bs));
         goto error;
     }
 
@@ -1742,7 +1799,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
         /* It is currently mandatory to have a bdrv_reopen_prepare()
          * handler for each supported drv. */
         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-                  drv->format_name, reopen_state->bs->device_name,
+                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
         ret = -1;
         goto error;
@@ -1778,7 +1835,7 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state)
                                               BDRV_O_CACHE_WB);
     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
 
-    bdrv_refresh_limits(reopen_state->bs);
+    bdrv_refresh_limits(reopen_state->bs, NULL);
 }
 
 /*
@@ -1801,6 +1858,8 @@ void bdrv_reopen_abort(BDRVReopenState *reopen_state)
 
 void bdrv_close(BlockDriverState *bs)
 {
+    BdrvAioNotifier *ban, *ban_next;
+
     if (bs->job) {
         block_job_cancel_sync(bs->job);
     }
@@ -1830,6 +1889,8 @@ void bdrv_close(BlockDriverState *bs)
         bs->zero_beyond_eof = false;
         QDECREF(bs->options);
         bs->options = NULL;
+        QDECREF(bs->full_open_options);
+        bs->full_open_options = NULL;
 
         if (bs->file != NULL) {
             bdrv_unref(bs->file);
@@ -1837,12 +1898,19 @@ void bdrv_close(BlockDriverState *bs)
         }
     }
 
-    bdrv_dev_change_media_cb(bs, false);
+    if (bs->blk) {
+        blk_dev_change_media_cb(bs->blk, false);
+    }
 
     /*throttling disk I/O limits*/
     if (bs->io_limits_enabled) {
         bdrv_io_limits_disable(bs);
     }
+
+    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
+        g_free(ban);
+    }
+    QLIST_INIT(&bs->aio_notifiers);
 }
 
 void bdrv_close_all(void)
@@ -1879,6 +1947,34 @@ static bool bdrv_requests_pending(BlockDriverState *bs)
     return false;
 }
 
+static bool bdrv_drain_one(BlockDriverState *bs)
+{
+    bool bs_busy;
+
+    bdrv_flush_io_queue(bs);
+    bdrv_start_throttled_reqs(bs);
+    bs_busy = bdrv_requests_pending(bs);
+    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
+    return bs_busy;
+}
+
+/*
+ * Wait for pending requests to complete on a single BlockDriverState subtree
+ *
+ * See the warning in bdrv_drain_all().  This function can only be called if
+ * you are sure nothing can generate I/O because you have op blockers
+ * installed.
+ *
+ * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
+ * AioContext.
+ */
+void bdrv_drain(BlockDriverState *bs)
+{
+    while (bdrv_drain_one(bs)) {
+        /* Keep iterating */
+    }
+}
+
 /*
  * Wait for pending requests to complete across all BlockDriverStates
  *
@@ -1902,15 +1998,10 @@ void bdrv_drain_all(void)
 
         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
             AioContext *aio_context = bdrv_get_aio_context(bs);
-            bool bs_busy;
 
             aio_context_acquire(aio_context);
-            bdrv_start_throttled_reqs(bs);
-            bs_busy = bdrv_requests_pending(bs);
-            bs_busy |= aio_poll(aio_context, bs_busy);
+            busy |= bdrv_drain_one(bs);
             aio_context_release(aio_context);
-
-            busy |= bs_busy;
         }
     }
 }
@@ -1920,10 +2011,17 @@ void bdrv_drain_all(void)
    Also, NULL terminate the device_name to prevent double remove */
 void bdrv_make_anon(BlockDriverState *bs)
 {
-    if (bs->device_name[0] != '\0') {
+    /*
+     * Take care to remove bs from bdrv_states only when it's actually
+     * in it.  Note that bs->device_list.tqe_prev is initially null,
+     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
+     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
+     * resetting it to null on remove.
+     */
+    if (bs->device_list.tqe_prev) {
         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
+        bs->device_list.tqe_prev = NULL;
     }
-    bs->device_name[0] = '\0';
     if (bs->node_name[0] != '\0') {
         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
     }
@@ -1943,9 +2041,6 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     /* move some fields that need to stay attached to the device */
 
     /* dev info */
-    bs_dest->dev_ops            = bs_src->dev_ops;
-    bs_dest->dev_opaque         = bs_src->dev_opaque;
-    bs_dest->dev                = bs_src->dev;
     bs_dest->guest_block_size   = bs_src->guest_block_size;
     bs_dest->copy_on_read       = bs_src->copy_on_read;
 
@@ -1977,9 +2072,9 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     bs_dest->job                = bs_src->job;
 
     /* keep the same entry in bdrv_states */
-    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
-            bs_src->device_name);
     bs_dest->device_list = bs_src->device_list;
+    bs_dest->blk = bs_src->blk;
+
     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
            sizeof(bs_dest->op_blockers));
 }
@@ -1992,7 +2087,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
  * This will modify the BlockDriverState fields, and swap contents
  * between bs_new and bs_old. Both bs_new and bs_old are modified.
  *
- * bs_new is required to be anonymous.
+ * bs_new must not be attached to a BlockBackend.
  *
  * This function does not create any image files.
  */
@@ -2011,11 +2106,10 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
     }
 
-    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
-    assert(bs_new->device_name[0] == '\0');
+    /* bs_new must be unattached and shouldn't have anything fancy enabled */
+    assert(!bs_new->blk);
     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
     assert(bs_new->job == NULL);
-    assert(bs_new->dev == NULL);
     assert(bs_new->io_limits_enabled == false);
     assert(!throttle_have_timer(&bs_new->throttle_state));
 
@@ -2028,11 +2122,10 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
     bdrv_move_feature_fields(bs_old, bs_new);
     bdrv_move_feature_fields(bs_new, &tmp);
 
-    /* bs_new shouldn't be in bdrv_states even after the swap!  */
-    assert(bs_new->device_name[0] == '\0');
+    /* bs_new must remain unattached */
+    assert(!bs_new->blk);
 
     /* Check a few fields that should remain attached to the device */
-    assert(bs_new->dev == NULL);
     assert(bs_new->job == NULL);
     assert(bs_new->io_limits_enabled == false);
     assert(!throttle_have_timer(&bs_new->throttle_state));
@@ -2056,7 +2149,7 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
  * This will modify the BlockDriverState fields, and swap contents
  * between bs_new and bs_top. Both bs_new and bs_top are modified.
  *
- * bs_new is required to be anonymous.
+ * bs_new must not be attached to a BlockBackend.
  *
  * This function does not create any image files.
  */
@@ -2071,7 +2164,6 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
 
 static void bdrv_delete(BlockDriverState *bs)
 {
-    assert(!bs->dev);
     assert(!bs->job);
     assert(bdrv_op_blocker_is_empty(bs));
     assert(!bs->refcnt);
@@ -2085,101 +2177,6 @@ static void bdrv_delete(BlockDriverState *bs)
     g_free(bs);
 }
 
-int bdrv_attach_dev(BlockDriverState *bs, void *dev)
-/* TODO change to DeviceState *dev when all users are qdevified */
-{
-    if (bs->dev) {
-        return -EBUSY;
-    }
-    bs->dev = dev;
-    bdrv_iostatus_reset(bs);
-    return 0;
-}
-
-/* TODO qdevified devices don't use this, remove when devices are qdevified */
-void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
-{
-    if (bdrv_attach_dev(bs, dev) < 0) {
-        abort();
-    }
-}
-
-void bdrv_detach_dev(BlockDriverState *bs, void *dev)
-/* TODO change to DeviceState *dev when all users are qdevified */
-{
-    assert(bs->dev == dev);
-    bs->dev = NULL;
-    bs->dev_ops = NULL;
-    bs->dev_opaque = NULL;
-    bs->guest_block_size = 512;
-}
-
-/* TODO change to return DeviceState * when all users are qdevified */
-void *bdrv_get_attached_dev(BlockDriverState *bs)
-{
-    return bs->dev;
-}
-
-void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
-                      void *opaque)
-{
-    bs->dev_ops = ops;
-    bs->dev_opaque = opaque;
-}
-
-static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
-{
-    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
-        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
-        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
-        if (tray_was_closed) {
-            /* tray open */
-            qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
-                                              true, &error_abort);
-        }
-        if (load) {
-            /* tray close */
-            qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
-                                              false, &error_abort);
-        }
-    }
-}
-
-bool bdrv_dev_has_removable_media(BlockDriverState *bs)
-{
-    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
-}
-
-void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
-{
-    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
-        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
-    }
-}
-
-bool bdrv_dev_is_tray_open(BlockDriverState *bs)
-{
-    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
-        return bs->dev_ops->is_tray_open(bs->dev_opaque);
-    }
-    return false;
-}
-
-static void bdrv_dev_resize_cb(BlockDriverState *bs)
-{
-    if (bs->dev_ops && bs->dev_ops->resize_cb) {
-        bs->dev_ops->resize_cb(bs->dev_opaque);
-    }
-}
-
-bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
-{
-    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
-        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
-    }
-    return false;
-}
-
 /*
  * Run consistency checks on an image
  *
@@ -2189,6 +2186,9 @@ bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
  */
 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
 {
+    if (bs->drv == NULL) {
+        return -ENOMEDIUM;
+    }
     if (bs->drv->bdrv_check == NULL) {
         return -ENOTSUP;
     }
@@ -2207,23 +2207,20 @@ int bdrv_commit(BlockDriverState *bs)
     int n, ro, open_flags;
     int ret = 0;
     uint8_t *buf = NULL;
-    char filename[PATH_MAX];
 
     if (!drv)
         return -ENOMEDIUM;
-    
+
     if (!bs->backing_hd) {
         return -ENOTSUP;
     }
 
-    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
-        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
+    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
+        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
         return -EBUSY;
     }
 
     ro = bs->backing_hd->read_only;
-    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
-    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
     open_flags =  bs->backing_hd->open_flags;
 
     if (ro) {
@@ -2255,7 +2252,14 @@ int bdrv_commit(BlockDriverState *bs)
     }
 
     total_sectors = length >> BDRV_SECTOR_BITS;
-    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
+
+    /* qemu_try_blockalign() for bs will choose an alignment that works for
+     * bs->backing_hd as well, so no need to compare the alignment manually. */
+    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
+    if (buf == NULL) {
+        ret = -ENOMEM;
+        goto ro_cleanup;
+    }
 
     for (sector = 0; sector < total_sectors; sector += n) {
         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
@@ -2293,7 +2297,7 @@ int bdrv_commit(BlockDriverState *bs)
 
     ret = 0;
 ro_cleanup:
-    g_free(buf);
+    qemu_vfree(buf);
 
     if (ro) {
         /* ignoring error return here */
@@ -2598,7 +2602,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
      * into our deletion queue, until we hit the 'base'
      */
     while (intermediate) {
-        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
+        intermediate_state = g_new0(BlkIntermediateStates, 1);
         intermediate_state->bs = intermediate;
         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
 
@@ -2813,23 +2817,21 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
  */
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
 {
-    int64_t target_size;
-    int64_t ret, nb_sectors, sector_num = 0;
+    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
     int n;
 
-    target_size = bdrv_getlength(bs);
-    if (target_size < 0) {
-        return target_size;
+    target_sectors = bdrv_nb_sectors(bs);
+    if (target_sectors < 0) {
+        return target_sectors;
     }
-    target_size /= BDRV_SECTOR_SIZE;
 
     for (;;) {
-        nb_sectors = target_size - sector_num;
+        nb_sectors = target_sectors - sector_num;
         if (nb_sectors <= 0) {
             return 0;
         }
-        if (nb_sectors > INT_MAX) {
-            nb_sectors = INT_MAX;
+        if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+            nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
         }
         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
         if (ret < 0) {
@@ -2954,7 +2956,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
                                    cluster_sector_num, cluster_nb_sectors);
 
     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
-    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
+    if (bounce_buffer == NULL) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
@@ -3009,6 +3016,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 
     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(!qiov || bytes == qiov->size);
 
     /* Handle Copy on Read and associated serialisation */
     if (flags & BDRV_REQ_COPY_ON_READ) {
@@ -3041,20 +3049,29 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
     } else {
         /* Read zeros after EOF of growable BDSes */
-        int64_t len, total_sectors, max_nb_sectors;
+        int64_t total_sectors, max_nb_sectors;
 
-        len = bdrv_getlength(bs);
-        if (len < 0) {
-            ret = len;
+        total_sectors = bdrv_nb_sectors(bs);
+        if (total_sectors < 0) {
+            ret = total_sectors;
             goto out;
         }
 
-        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                   align >> BDRV_SECTOR_BITS);
-        if (max_nb_sectors > 0) {
-            ret = drv->bdrv_co_readv(bs, sector_num,
-                                     MIN(nb_sectors, max_nb_sectors), qiov);
+        if (nb_sectors < max_nb_sectors) {
+            ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+        } else if (max_nb_sectors > 0) {
+            QEMUIOVector local_qiov;
+
+            qemu_iovec_init(&local_qiov, qiov->niov);
+            qemu_iovec_concat(&local_qiov, qiov, 0,
+                              max_nb_sectors * BDRV_SECTOR_SIZE);
+
+            ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
+                                     &local_qiov);
+
+            qemu_iovec_destroy(&local_qiov);
         } else {
             ret = 0;
         }
@@ -3175,10 +3192,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
                             BDRV_REQ_COPY_ON_READ);
 }
 
-/* if no limit is specified in the BlockLimits use a default
- * of 32768 512-byte sectors (16 MiB) per request.
- */
-#define MAX_WRITE_ZEROES_DEFAULT 32768
+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
 
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
@@ -3189,7 +3203,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int ret = 0;
 
     int max_write_zeroes = bs->bl.max_write_zeroes ?
-                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
+                           bs->bl.max_write_zeroes : INT_MAX;
 
     while (nb_sectors > 0 && !ret) {
         int num = nb_sectors;
@@ -3224,9 +3238,16 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
 
         if (ret == -ENOTSUP) {
             /* Fall back to bounce buffer if write zeroes is unsupported */
+            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
+                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
+            num = MIN(num, max_xfer_len);
             iov.iov_len = num * BDRV_SECTOR_SIZE;
             if (iov.iov_base == NULL) {
-                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
+                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
+                if (iov.iov_base == NULL) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
             }
             qemu_iovec_init_external(&qiov, &iov, 1);
@@ -3236,7 +3257,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
             /* Keep bounce buffer around if it is big enough for all
              * all future requests.
              */
-            if (num < max_write_zeroes) {
+            if (num < max_xfer_len) {
                 qemu_vfree(iov.iov_base);
                 iov.iov_base = NULL;
             }
@@ -3246,6 +3267,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
         nb_sectors -= num;
     }
 
+fail:
     qemu_vfree(iov.iov_base);
     return ret;
 }
@@ -3266,6 +3288,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 
     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(!qiov || bytes == qiov->size);
 
     waited = wait_serialising_requests(req);
     assert(!waited || !req->serialising);
@@ -3300,9 +3323,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 
     bdrv_set_dirty(bs, sector_num, nb_sectors);
 
-    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
-        bs->wr_highest_sector = sector_num + nb_sectors - 1;
-    }
+    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+
     if (bs->growable && ret >= 0) {
         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
     }
@@ -3483,7 +3505,9 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset)
     ret = drv->bdrv_truncate(bs, offset);
     if (ret == 0) {
         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
-        bdrv_dev_resize_cb(bs);
+        if (bs->blk) {
+            blk_dev_resize_cb(bs->blk);
+        }
     }
     return ret;
 }
@@ -3508,11 +3532,12 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
 }
 
 /**
- * Length of a file in bytes. Return < 0 if error or unknown.
+ * Return number of sectors on success, -errno on error.
  */
-int64_t bdrv_getlength(BlockDriverState *bs)
+int64_t bdrv_nb_sectors(BlockDriverState *bs)
 {
     BlockDriver *drv = bs->drv;
+
     if (!drv)
         return -ENOMEDIUM;
 
@@ -3522,19 +3547,26 @@ int64_t bdrv_getlength(BlockDriverState *bs)
             return ret;
         }
     }
-    return bs->total_sectors * BDRV_SECTOR_SIZE;
+    return bs->total_sectors;
+}
+
+/**
+ * Return length in bytes on success, -errno on error.
+ * The length is always a multiple of BDRV_SECTOR_SIZE.
+ */
+int64_t bdrv_getlength(BlockDriverState *bs)
+{
+    int64_t ret = bdrv_nb_sectors(bs);
+
+    return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
 }
 
 /* return 0 as number of sectors if no device present or error */
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
 {
-    int64_t length;
-    length = bdrv_getlength(bs);
-    if (length < 0)
-        length = 0;
-    else
-        length = length >> BDRV_SECTOR_BITS;
-    *nb_sectors_ptr = length;
+    int64_t nb_sectors = bdrv_nb_sectors(bs);
+
+    *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
 }
 
 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
@@ -3568,6 +3600,19 @@ BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int e
     }
 }
 
+static void send_qmp_error_event(BlockDriverState *bs,
+                                 BlockErrorAction action,
+                                 bool is_read, int error)
+{
+    IoOperationType optype;
+
+    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
+    qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
+                                   bdrv_iostatus_is_enabled(bs),
+                                   error == ENOSPC, strerror(error),
+                                   &error_abort);
+}
+
 /* This is done by device models because, while the block layer knows
  * about the error, it does not know whether an operation comes from
  * the device or the block layer (from a job, for example).
@@ -3593,16 +3638,10 @@ void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
          * also ensures that the STOP/RESUME pair of events is emitted.
          */
         qemu_system_vmstop_request_prepare();
-        qapi_event_send_block_io_error(bdrv_get_device_name(bs),
-                                       is_read ? IO_OPERATION_TYPE_READ :
-                                       IO_OPERATION_TYPE_WRITE,
-                                       action, &error_abort);
+        send_qmp_error_event(bs, action, is_read, error);
         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
     } else {
-        qapi_event_send_block_io_error(bdrv_get_device_name(bs),
-                                       is_read ? IO_OPERATION_TYPE_READ :
-                                       IO_OPERATION_TYPE_WRITE,
-                                       action, &error_abort);
+        send_qmp_error_event(bs, action, is_read, error);
     }
 }
 
@@ -3669,8 +3708,10 @@ int bdrv_set_key(BlockDriverState *bs, const char *key)
         bs->valid_key = 0;
     } else if (!bs->valid_key) {
         bs->valid_key = 1;
-        /* call the change callback now, we skipped it on open */
-        bdrv_dev_change_media_cb(bs, true);
+        if (bs->blk) {
+            /* call the change callback now, we skipped it on open */
+            blk_dev_change_media_cb(bs->blk, true);
+        }
     }
     return ret;
 }
@@ -3680,11 +3721,17 @@ const char *bdrv_get_format_name(BlockDriverState *bs)
     return bs->drv ? bs->drv->format_name : NULL;
 }
 
+static int qsort_strcmp(const void *a, const void *b)
+{
+    return strcmp(a, b);
+}
+
 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                          void *opaque)
 {
     BlockDriver *drv;
     int count = 0;
+    int i;
     const char **formats = NULL;
 
     QLIST_FOREACH(drv, &bdrv_drivers, list) {
@@ -3696,26 +3743,28 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
             }
 
             if (!found) {
-                formats = g_realloc(formats, (count + 1) * sizeof(char *));
+                formats = g_renew(const char *, formats, count + 1);
                 formats[count++] = drv->format_name;
-                it(opaque, drv->format_name);
             }
         }
     }
+
+    qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
+
+    for (i = 0; i < count; i++) {
+        it(opaque, formats[i]);
+    }
+
     g_free(formats);
 }
 
 /* This function is to find block backend bs */
+/* TODO convert callers to blk_by_name(), then remove */
 BlockDriverState *bdrv_find(const char *name)
 {
-    BlockDriverState *bs;
+    BlockBackend *blk = blk_by_name(name);
 
-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        if (!strcmp(name, bs->device_name)) {
-            return bs;
-        }
-    }
-    return NULL;
+    return blk ? blk_bs(blk) : NULL;
 }
 
 /* This function is to find a node in the bs graph */
@@ -3754,13 +3803,14 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
                                  const char *node_name,
                                  Error **errp)
 {
-    BlockDriverState *bs = NULL;
+    BlockBackend *blk;
+    BlockDriverState *bs;
 
     if (device) {
-        bs = bdrv_find(device);
+        blk = blk_by_name(device);
 
-        if (bs) {
-            return bs;
+        if (blk) {
+            return blk_bs(blk);
         }
     }
 
@@ -3789,6 +3839,14 @@ bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
     return top != NULL;
 }
 
+BlockDriverState *bdrv_next_node(BlockDriverState *bs)
+{
+    if (!bs) {
+        return QTAILQ_FIRST(&graph_bdrv_states);
+    }
+    return QTAILQ_NEXT(bs, node_list);
+}
+
 BlockDriverState *bdrv_next(BlockDriverState *bs)
 {
     if (!bs) {
@@ -3797,18 +3855,15 @@ BlockDriverState *bdrv_next(BlockDriverState *bs)
     return QTAILQ_NEXT(bs, device_list);
 }
 
-void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
+const char *bdrv_get_node_name(const BlockDriverState *bs)
 {
-    BlockDriverState *bs;
-
-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        it(opaque, bs);
-    }
+    return bs->node_name;
 }
 
-const char *bdrv_get_device_name(BlockDriverState *bs)
+/* TODO check what callers really want: bs->node_name or blk_name() */
+const char *bdrv_get_device_name(const BlockDriverState *bs)
 {
-    return bs->device_name;
+    return bs->blk ? blk_name(bs->blk) : "";
 }
 
 int bdrv_get_flags(BlockDriverState *bs)
@@ -3899,9 +3954,9 @@ typedef struct BdrvCoGetBlockStatusData {
 } BdrvCoGetBlockStatusData;
 
 /*
- * Returns true iff the specified sector is present in the disk image. Drivers
- * not implementing the functionality are assumed to not support backing files,
- * hence all their sectors are reported as allocated.
+ * Returns the allocation status of the specified sectors.
+ * Drivers not implementing the functionality are assumed to not support
+ * backing files, hence all their sectors are reported as allocated.
  *
  * If 'sector_num' is beyond the end of the disk image the return value is 0
  * and 'pnum' is set to 0.
@@ -3917,21 +3972,21 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                      int64_t sector_num,
                                                      int nb_sectors, int *pnum)
 {
-    int64_t length;
+    int64_t total_sectors;
     int64_t n;
     int64_t ret, ret2;
 
-    length = bdrv_getlength(bs);
-    if (length < 0) {
-        return length;
+    total_sectors = bdrv_nb_sectors(bs);
+    if (total_sectors < 0) {
+        return total_sectors;
     }
 
-    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
+    if (sector_num >= total_sectors) {
         *pnum = 0;
         return 0;
     }
 
-    n = bs->total_sectors - sector_num;
+    n = total_sectors - sector_num;
     if (n < nb_sectors) {
         nb_sectors = n;
     }
@@ -3966,8 +4021,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
             ret |= BDRV_BLOCK_ZERO;
         } else if (bs->backing_hd) {
             BlockDriverState *bs2 = bs->backing_hd;
-            int64_t length2 = bdrv_getlength(bs2);
-            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
+            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
+            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                 ret |= BDRV_BLOCK_ZERO;
             }
         }
@@ -3976,13 +4031,24 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
     if (bs->file &&
         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
         (ret & BDRV_BLOCK_OFFSET_VALID)) {
+        int file_pnum;
+
         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
-                                        *pnum, pnum);
+                                        *pnum, &file_pnum);
         if (ret2 >= 0) {
             /* Ignore errors.  This is just providing extra information, it
              * is useful but not necessary.
              */
-            ret |= (ret2 & BDRV_BLOCK_ZERO);
+            if (!file_pnum) {
+                /* !file_pnum indicates an offset at or beyond the EOF; it is
+                 * perfectly valid for the format block driver to point to such
+                 * offsets, so catch it and mark everything as zero */
+                ret |= BDRV_BLOCK_ZERO;
+            } else {
+                /* Limit request to the range reported by the protocol driver */
+                *pnum = file_pnum;
+                ret |= (ret2 & BDRV_BLOCK_ZERO);
+            }
         }
     }
 
@@ -4039,7 +4105,7 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
     if (ret < 0) {
         return ret;
     }
-    return (ret & BDRV_BLOCK_ALLOCATED);
+    return !!(ret & BDRV_BLOCK_ALLOCATED);
 }
 
 /*
@@ -4335,9 +4401,9 @@ int bdrv_get_backing_file_depth(BlockDriverState *bs)
 /**************************************************************/
 /* async I/Os */
 
-BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
-                                 QEMUIOVector *qiov, int nb_sectors,
-                                 BlockDriverCompletionFunc *cb, void *opaque)
+BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+                           QEMUIOVector *qiov, int nb_sectors,
+                           BlockCompletionFunc *cb, void *opaque)
 {
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
@@ -4345,9 +4411,9 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                  cb, opaque, false);
 }
 
-BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
-                                  QEMUIOVector *qiov, int nb_sectors,
-                                  BlockDriverCompletionFunc *cb, void *opaque)
+BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+                            QEMUIOVector *qiov, int nb_sectors,
+                            BlockCompletionFunc *cb, void *opaque)
 {
     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
 
@@ -4355,9 +4421,9 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  cb, opaque, true);
 }
 
-BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
+BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
 
@@ -4372,7 +4438,7 @@ typedef struct MultiwriteCB {
     int num_requests;
     int num_callbacks;
     struct {
-        BlockDriverCompletionFunc *cb;
+        BlockCompletionFunc *cb;
         void *opaque;
         QEMUIOVector *free_qiov;
     } callbacks[];
@@ -4453,6 +4519,11 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
             merge = 0;
         }
 
+        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
+            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
+            merge = 0;
+        }
+
         if (merge) {
             size_t size;
             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
@@ -4470,6 +4541,12 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
             // Add the second request
             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
 
+            // Add tail of first request, if necessary
+            if (qiov->size < reqs[outidx].qiov->size) {
+                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
+                                  reqs[outidx].qiov->size - qiov->size);
+            }
+
             reqs[outidx].nb_sectors = qiov->size >> 9;
             reqs[outidx].qiov = qiov;
 
@@ -4482,6 +4559,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
         }
     }
 
+    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+
     return outidx + 1;
 }
 
@@ -4543,69 +4622,83 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
     return 0;
 }
 
-void bdrv_aio_cancel(BlockDriverAIOCB *acb)
+void bdrv_aio_cancel(BlockAIOCB *acb)
 {
-    acb->aiocb_info->cancel(acb);
+    qemu_aio_ref(acb);
+    bdrv_aio_cancel_async(acb);
+    while (acb->refcnt > 1) {
+        if (acb->aiocb_info->get_aio_context) {
+            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
+        } else if (acb->bs) {
+            aio_poll(bdrv_get_aio_context(acb->bs), true);
+        } else {
+            abort();
+        }
+    }
+    qemu_aio_unref(acb);
+}
+
+/* Async version of aio cancel. The caller is not blocked if the acb implements
+ * cancel_async, otherwise we do nothing and let the request normally complete.
+ * In either case the completion callback must be called. */
+void bdrv_aio_cancel_async(BlockAIOCB *acb)
+{
+    if (acb->aiocb_info->cancel_async) {
+        acb->aiocb_info->cancel_async(acb);
+    }
 }
 
 /**************************************************************/
 /* async block device emulation */
 
-typedef struct BlockDriverAIOCBSync {
-    BlockDriverAIOCB common;
+typedef struct BlockAIOCBSync {
+    BlockAIOCB common;
     QEMUBH *bh;
     int ret;
     /* vector translation state */
     QEMUIOVector *qiov;
     uint8_t *bounce;
     int is_write;
-} BlockDriverAIOCBSync;
-
-static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
-{
-    BlockDriverAIOCBSync *acb =
-        container_of(blockacb, BlockDriverAIOCBSync, common);
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-    qemu_aio_release(acb);
-}
+} BlockAIOCBSync;
 
 static const AIOCBInfo bdrv_em_aiocb_info = {
-    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
-    .cancel             = bdrv_aio_cancel_em,
+    .aiocb_size         = sizeof(BlockAIOCBSync),
 };
 
 static void bdrv_aio_bh_cb(void *opaque)
 {
-    BlockDriverAIOCBSync *acb = opaque;
+    BlockAIOCBSync *acb = opaque;
 
-    if (!acb->is_write)
+    if (!acb->is_write && acb->ret >= 0) {
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+    }
     qemu_vfree(acb->bounce);
     acb->common.cb(acb->common.opaque, acb->ret);
     qemu_bh_delete(acb->bh);
     acb->bh = NULL;
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }
 
-static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
-                                            int64_t sector_num,
-                                            QEMUIOVector *qiov,
-                                            int nb_sectors,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque,
-                                            int is_write)
+static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
+                                      int64_t sector_num,
+                                      QEMUIOVector *qiov,
+                                      int nb_sectors,
+                                      BlockCompletionFunc *cb,
+                                      void *opaque,
+                                      int is_write)
 
 {
-    BlockDriverAIOCBSync *acb;
+    BlockAIOCBSync *acb;
 
     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
     acb->is_write = is_write;
     acb->qiov = qiov;
-    acb->bounce = qemu_blockalign(bs, qiov->size);
+    acb->bounce = qemu_try_blockalign(bs, qiov->size);
     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
 
-    if (is_write) {
+    if (acb->bounce == NULL) {
+        acb->ret = -ENOMEM;
+    } else if (is_write) {
         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
     } else {
@@ -4617,65 +4710,47 @@ static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
     return &acb->common;
 }
 
-static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
 }
 
-static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
 }
 
 
-typedef struct BlockDriverAIOCBCoroutine {
-    BlockDriverAIOCB common;
+typedef struct BlockAIOCBCoroutine {
+    BlockAIOCB common;
     BlockRequest req;
     bool is_write;
     bool *done;
     QEMUBH* bh;
-} BlockDriverAIOCBCoroutine;
-
-static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
-{
-    AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
-    BlockDriverAIOCBCoroutine *acb =
-        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
-    bool done = false;
-
-    acb->done = &done;
-    while (!done) {
-        aio_poll(aio_context, true);
-    }
-}
+} BlockAIOCBCoroutine;
 
 static const AIOCBInfo bdrv_em_co_aiocb_info = {
-    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
-    .cancel             = bdrv_aio_co_cancel_em,
+    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
 };
 
 static void bdrv_co_em_bh(void *opaque)
 {
-    BlockDriverAIOCBCoroutine *acb = opaque;
+    BlockAIOCBCoroutine *acb = opaque;
 
     acb->common.cb(acb->common.opaque, acb->req.error);
 
-    if (acb->done) {
-        *acb->done = true;
-    }
-
     qemu_bh_delete(acb->bh);
-    qemu_aio_release(acb);
+    qemu_aio_unref(acb);
 }
 
 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
 static void coroutine_fn bdrv_co_do_rw(void *opaque)
 {
-    BlockDriverAIOCBCoroutine *acb = opaque;
+    BlockAIOCBCoroutine *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
 
     if (!acb->is_write) {
@@ -4690,17 +4765,17 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque)
     qemu_bh_schedule(acb->bh);
 }
 
-static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
-                                               int64_t sector_num,
-                                               QEMUIOVector *qiov,
-                                               int nb_sectors,
-                                               BdrvRequestFlags flags,
-                                               BlockDriverCompletionFunc *cb,
-                                               void *opaque,
-                                               bool is_write)
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+                                         int64_t sector_num,
+                                         QEMUIOVector *qiov,
+                                         int nb_sectors,
+                                         BdrvRequestFlags flags,
+                                         BlockCompletionFunc *cb,
+                                         void *opaque,
+                                         bool is_write)
 {
     Coroutine *co;
-    BlockDriverAIOCBCoroutine *acb;
+    BlockAIOCBCoroutine *acb;
 
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
     acb->req.sector = sector_num;
@@ -4708,7 +4783,6 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
     acb->req.qiov = qiov;
     acb->req.flags = flags;
     acb->is_write = is_write;
-    acb->done = NULL;
 
     co = qemu_coroutine_create(bdrv_co_do_rw);
     qemu_coroutine_enter(co, acb);
@@ -4718,7 +4792,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
 
 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
 {
-    BlockDriverAIOCBCoroutine *acb = opaque;
+    BlockAIOCBCoroutine *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
 
     acb->req.error = bdrv_co_flush(bs);
@@ -4726,16 +4800,15 @@ static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
     qemu_bh_schedule(acb->bh);
 }
 
-BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
-        BlockDriverCompletionFunc *cb, void *opaque)
+BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+        BlockCompletionFunc *cb, void *opaque)
 {
     trace_bdrv_aio_flush(bs, opaque);
 
     Coroutine *co;
-    BlockDriverAIOCBCoroutine *acb;
+    BlockAIOCBCoroutine *acb;
 
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
-    acb->done = NULL;
 
     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
     qemu_coroutine_enter(co, acb);
@@ -4745,7 +4818,7 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
 
 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
 {
-    BlockDriverAIOCBCoroutine *acb = opaque;
+    BlockAIOCBCoroutine *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
 
     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
@@ -4753,19 +4826,18 @@ static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
     qemu_bh_schedule(acb->bh);
 }
 
-BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
+BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
     Coroutine *co;
-    BlockDriverAIOCBCoroutine *acb;
+    BlockAIOCBCoroutine *acb;
 
     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
 
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
     acb->req.sector = sector_num;
     acb->req.nb_sectors = nb_sectors;
-    acb->done = NULL;
     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
     qemu_coroutine_enter(co, acb);
 
@@ -4784,22 +4856,32 @@ void bdrv_init_with_whitelist(void)
 }
 
 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
-                   BlockDriverCompletionFunc *cb, void *opaque)
+                   BlockCompletionFunc *cb, void *opaque)
 {
-    BlockDriverAIOCB *acb;
+    BlockAIOCB *acb;
 
     acb = g_slice_alloc(aiocb_info->aiocb_size);
     acb->aiocb_info = aiocb_info;
     acb->bs = bs;
     acb->cb = cb;
     acb->opaque = opaque;
+    acb->refcnt = 1;
     return acb;
 }
 
-void qemu_aio_release(void *p)
+void qemu_aio_ref(void *p)
+{
+    BlockAIOCB *acb = p;
+    acb->refcnt++;
+}
+
+void qemu_aio_unref(void *p)
 {
-    BlockDriverAIOCB *acb = p;
-    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+    BlockAIOCB *acb = p;
+    assert(acb->refcnt > 0);
+    if (--acb->refcnt == 0) {
+        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+    }
 }
 
 /**************************************************************/
@@ -4825,7 +4907,7 @@ static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
     CoroutineIOCompletion co = {
         .coroutine = qemu_coroutine_self(),
     };
-    BlockDriverAIOCB *acb;
+    BlockAIOCB *acb;
 
     if (is_write) {
         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
@@ -4891,7 +4973,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
     if (bs->drv->bdrv_co_flush_to_disk) {
         ret = bs->drv->bdrv_co_flush_to_disk(bs);
     } else if (bs->drv->bdrv_aio_flush) {
-        BlockDriverAIOCB *acb;
+        BlockAIOCB *acb;
         CoroutineIOCompletion co = {
             .coroutine = qemu_coroutine_self(),
         };
@@ -4937,6 +5019,11 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
         return;
     }
 
+    if (!(bs->open_flags & BDRV_O_INCOMING)) {
+        return;
+    }
+    bs->open_flags &= ~BDRV_O_INCOMING;
+
     if (bs->drv->bdrv_invalidate_cache) {
         bs->drv->bdrv_invalidate_cache(bs, &local_err);
     } else if (bs->file) {
@@ -4972,19 +5059,6 @@ void bdrv_invalidate_cache_all(Error **errp)
     }
 }
 
-void bdrv_clear_incoming_migration_all(void)
-{
-    BlockDriverState *bs;
-
-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
-        aio_context_acquire(aio_context);
-        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
-        aio_context_release(aio_context);
-    }
-}
-
 int bdrv_flush(BlockDriverState *bs)
 {
     Coroutine *co;
@@ -5022,11 +5096,6 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
 }
 
-/* if no limit is specified in the BlockLimits use a default
- * of 32768 512-byte sectors (16 MiB) per request.
- */
-#define MAX_DISCARD_DEFAULT 32768
-
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                  int nb_sectors)
 {
@@ -5051,7 +5120,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
-    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
+    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : INT_MAX;
     while (nb_sectors > 0) {
         int ret;
         int num = nb_sectors;
@@ -5074,7 +5143,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         if (bs->drv->bdrv_co_discard) {
             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
         } else {
-            BlockDriverAIOCB *acb;
+            BlockAIOCB *acb;
             CoroutineIOCompletion co = {
                 .coroutine = qemu_coroutine_self(),
             };
@@ -5161,13 +5230,15 @@ int bdrv_media_changed(BlockDriverState *bs)
 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
 {
     BlockDriver *drv = bs->drv;
+    const char *device_name;
 
     if (drv && drv->bdrv_eject) {
         drv->bdrv_eject(bs, eject_flag);
     }
 
-    if (bs->device_name[0] != '\0') {
-        qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
+    device_name = bdrv_get_device_name(bs);
+    if (device_name[0] != '\0') {
+        qapi_event_send_device_tray_moved(device_name,
                                           eject_flag, &error_abort);
     }
 }
@@ -5198,9 +5269,9 @@ int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
     return -ENOTSUP;
 }
 
-BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
+BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
         unsigned long int req, void *buf,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
 
@@ -5219,6 +5290,35 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size)
     return qemu_memalign(bdrv_opt_mem_align(bs), size);
 }
 
+void *qemu_blockalign0(BlockDriverState *bs, size_t size)
+{
+    return memset(qemu_blockalign(bs, size), 0, size);
+}
+
+void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
+{
+    size_t align = bdrv_opt_mem_align(bs);
+
+    /* Ensure that NULL is never returned on success */
+    assert(align > 0);
+    if (size == 0) {
+        size = align;
+    }
+
+    return qemu_try_memalign(align, size);
+}
+
+void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
+{
+    void *mem = qemu_try_blockalign(bs, size);
+
+    if (mem) {
+        memset(mem, 0, size);
+    }
+
+    return mem;
+}
+
 /*
  * Check if all memory in this vector is sector aligned.
  */
@@ -5249,14 +5349,13 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
 
     granularity >>= BDRV_SECTOR_BITS;
     assert(granularity);
-    bitmap_size = bdrv_getlength(bs);
+    bitmap_size = bdrv_nb_sectors(bs);
     if (bitmap_size < 0) {
         error_setg_errno(errp, -bitmap_size, "could not get length of device");
         errno = -bitmap_size;
         return NULL;
     }
-    bitmap_size >>= BDRV_SECTOR_BITS;
-    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
+    bitmap = g_new0(BdrvDirtyBitmap, 1);
     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
     return bitmap;
@@ -5282,8 +5381,8 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
     BlockDirtyInfoList **plist = &list;
 
     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
-        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
-        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
+        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
+        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
         info->count = bdrv_get_dirty_count(bs, bm);
         info->granularity =
             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
@@ -5310,8 +5409,20 @@ void bdrv_dirty_iter_init(BlockDriverState *bs,
     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
 }
 
-void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
-                    int nr_sectors)
+void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+                           int64_t cur_sector, int nr_sectors)
+{
+    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
+}
+
+void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+                             int64_t cur_sector, int nr_sectors)
+{
+    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
+}
+
+static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
+                           int nr_sectors)
 {
     BdrvDirtyBitmap *bitmap;
     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
@@ -5319,7 +5430,8 @@ void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
     }
 }
 
-void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
+static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
+                             int nr_sectors)
 {
     BdrvDirtyBitmap *bitmap;
     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
@@ -5343,6 +5455,9 @@ void bdrv_ref(BlockDriverState *bs)
  * deleted. */
 void bdrv_unref(BlockDriverState *bs)
 {
+    if (!bs) {
+        return;
+    }
     assert(bs->refcnt > 0);
     if (--bs->refcnt == 0) {
         bdrv_delete(bs);
@@ -5362,7 +5477,8 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
         blocker = QLIST_FIRST(&bs->op_blockers[op]);
         if (errp) {
             error_setg(errp, "Device '%s' is busy: %s",
-                       bs->device_name, error_get_pretty(blocker->reason));
+                       bdrv_get_device_name(bs),
+                       error_get_pretty(blocker->reason));
         }
         return true;
     }
@@ -5374,7 +5490,7 @@ void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
     BdrvOpBlocker *blocker;
     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
 
-    blocker = g_malloc0(sizeof(BdrvOpBlocker));
+    blocker = g_new0(BdrvOpBlocker, 1);
     blocker->reason = reason;
     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
 }
@@ -5459,27 +5575,6 @@ void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
     }
 }
 
-void
-bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
-        enum BlockAcctType type)
-{
-    assert(type < BDRV_MAX_IOTYPE);
-
-    cookie->bytes = bytes;
-    cookie->start_time_ns = get_clock();
-    cookie->type = type;
-}
-
-void
-bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
-{
-    assert(cookie->type < BDRV_MAX_IOTYPE);
-
-    bs->nr_bytes[cookie->type] += cookie->bytes;
-    bs->nr_ops[cookie->type]++;
-    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
-}
-
 void bdrv_img_create(const char *filename, const char *fmt,
                      const char *base_filename, const char *base_fmt,
                      char *options, uint64_t img_size, int flags,
@@ -5507,6 +5602,18 @@ void bdrv_img_create(const char *filename, const char *fmt,
         return;
     }
 
+    if (!drv->create_opts) {
+        error_setg(errp, "Format driver '%s' does not support image creation",
+                   drv->format_name);
+        return;
+    }
+
+    if (!proto_drv->create_opts) {
+        error_setg(errp, "Protocol driver '%s' does not support image creation",
+                   proto_drv->format_name);
+        return;
+    }
+
     create_opts = qemu_opts_append(create_opts, drv->create_opts);
     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
 
@@ -5563,26 +5670,36 @@ void bdrv_img_create(const char *filename, const char *fmt,
     if (size == -1) {
         if (backing_file) {
             BlockDriverState *bs;
-            uint64_t size;
+            char *full_backing = g_new0(char, PATH_MAX);
+            int64_t size;
             int back_flags;
 
+            bdrv_get_full_backing_filename_from_filename(filename, backing_file,
+                                                         full_backing, PATH_MAX,
+                                                         &local_err);
+            if (local_err) {
+                g_free(full_backing);
+                goto out;
+            }
+
             /* backing files always opened read-only */
             back_flags =
                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
 
             bs = NULL;
-            ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
+            ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
                             backing_drv, &local_err);
+            g_free(full_backing);
             if (ret < 0) {
-                error_setg_errno(errp, -ret, "Could not open '%s': %s",
-                                 backing_file,
-                                 error_get_pretty(local_err));
-                error_free(local_err);
-                local_err = NULL;
                 goto out;
             }
-            bdrv_get_geometry(bs, &size);
-            size *= 512;
+            size = bdrv_getlength(bs);
+            if (size < 0) {
+                error_setg_errno(errp, -size, "Could not get size of '%s'",
+                                 backing_file);
+                bdrv_unref(bs);
+                goto out;
+            }
 
             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
 
@@ -5594,8 +5711,8 @@ void bdrv_img_create(const char *filename, const char *fmt,
     }
 
     if (!quiet) {
-        printf("Formatting '%s', fmt=%s ", filename, fmt);
-        qemu_opts_print(opts);
+        printf("Formatting '%s', fmt=%s", filename, fmt);
+        qemu_opts_print(opts, " ");
         puts("");
     }
 
@@ -5630,10 +5747,16 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs)
 
 void bdrv_detach_aio_context(BlockDriverState *bs)
 {
+    BdrvAioNotifier *baf;
+
     if (!bs->drv) {
         return;
     }
 
+    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
+        baf->detach_aio_context(baf->opaque);
+    }
+
     if (bs->io_limits_enabled) {
         throttle_detach_aio_context(&bs->throttle_state);
     }
@@ -5653,6 +5776,8 @@ void bdrv_detach_aio_context(BlockDriverState *bs)
 void bdrv_attach_aio_context(BlockDriverState *bs,
                              AioContext *new_context)
 {
+    BdrvAioNotifier *ban;
+
     if (!bs->drv) {
         return;
     }
@@ -5671,6 +5796,10 @@ void bdrv_attach_aio_context(BlockDriverState *bs,
     if (bs->io_limits_enabled) {
         throttle_attach_aio_context(&bs->throttle_state, new_context);
     }
+
+    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
+        ban->attached_aio_context(new_context, ban->opaque);
+    }
 }
 
 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
@@ -5687,18 +5816,56 @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     aio_context_release(new_context);
 }
 
+void bdrv_add_aio_context_notifier(BlockDriverState *bs,
+        void (*attached_aio_context)(AioContext *new_context, void *opaque),
+        void (*detach_aio_context)(void *opaque), void *opaque)
+{
+    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
+    *ban = (BdrvAioNotifier){
+        .attached_aio_context = attached_aio_context,
+        .detach_aio_context   = detach_aio_context,
+        .opaque               = opaque
+    };
+
+    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
+}
+
+void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
+                                      void (*attached_aio_context)(AioContext *,
+                                                                   void *),
+                                      void (*detach_aio_context)(void *),
+                                      void *opaque)
+{
+    BdrvAioNotifier *ban, *ban_next;
+
+    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
+        if (ban->attached_aio_context == attached_aio_context &&
+            ban->detach_aio_context   == detach_aio_context   &&
+            ban->opaque               == opaque)
+        {
+            QLIST_REMOVE(ban, list);
+            g_free(ban);
+
+            return;
+        }
+    }
+
+    abort();
+}
+
 void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                     NotifierWithReturn *notifier)
 {
     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
 }
 
-int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
+int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
+                       BlockDriverAmendStatusCB *status_cb)
 {
     if (!bs->drv->bdrv_amend_options) {
         return -ENOTSUP;
     }
-    return bs->drv->bdrv_amend_options(bs, opts);
+    return bs->drv->bdrv_amend_options(bs, opts, status_cb);
 }
 
 /* This function will be called by the bdrv_recurse_is_first_non_filter method
@@ -5761,13 +5928,19 @@ bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
 {
     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
+    AioContext *aio_context;
+
     if (!to_replace_bs) {
         error_setg(errp, "Node name '%s' not found", node_name);
         return NULL;
     }
 
+    aio_context = bdrv_get_aio_context(to_replace_bs);
+    aio_context_acquire(aio_context);
+
     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
-        return NULL;
+        to_replace_bs = NULL;
+        goto out;
     }
 
     /* We don't want arbitrary node of the BDS chain to be replaced only the top
@@ -5777,8 +5950,182 @@ BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
      */
     if (!bdrv_is_first_non_filter(to_replace_bs)) {
         error_setg(errp, "Only top most non filter can be replaced");
-        return NULL;
+        to_replace_bs = NULL;
+        goto out;
     }
 
+out:
+    aio_context_release(aio_context);
     return to_replace_bs;
 }
+
+void bdrv_io_plug(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+    if (drv && drv->bdrv_io_plug) {
+        drv->bdrv_io_plug(bs);
+    } else if (bs->file) {
+        bdrv_io_plug(bs->file);
+    }
+}
+
+void bdrv_io_unplug(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+    if (drv && drv->bdrv_io_unplug) {
+        drv->bdrv_io_unplug(bs);
+    } else if (bs->file) {
+        bdrv_io_unplug(bs->file);
+    }
+}
+
+void bdrv_flush_io_queue(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+    if (drv && drv->bdrv_flush_io_queue) {
+        drv->bdrv_flush_io_queue(bs);
+    } else if (bs->file) {
+        bdrv_flush_io_queue(bs->file);
+    }
+}
+
+static bool append_open_options(QDict *d, BlockDriverState *bs)
+{
+    const QDictEntry *entry;
+    bool found_any = false;
+
+    for (entry = qdict_first(bs->options); entry;
+         entry = qdict_next(bs->options, entry))
+    {
+        /* Only take options for this level and exclude all non-driver-specific
+         * options */
+        if (!strchr(qdict_entry_key(entry), '.') &&
+            strcmp(qdict_entry_key(entry), "node-name"))
+        {
+            qobject_incref(qdict_entry_value(entry));
+            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
+            found_any = true;
+        }
+    }
+
+    return found_any;
+}
+
+/* Updates the following BDS fields:
+ *  - exact_filename: A filename which may be used for opening a block device
+ *                    which (mostly) equals the given BDS (even without any
+ *                    other options; so reading and writing must return the same
+ *                    results, but caching etc. may be different)
+ *  - full_open_options: Options which, when given when opening a block device
+ *                       (without a filename), result in a BDS (mostly)
+ *                       equalling the given one
+ *  - filename: If exact_filename is set, it is copied here. Otherwise,
+ *              full_open_options is converted to a JSON object, prefixed with
+ *              "json:" (for use through the JSON pseudo protocol) and put here.
+ */
+void bdrv_refresh_filename(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+    QDict *opts;
+
+    if (!drv) {
+        return;
+    }
+
+    /* This BDS's file name will most probably depend on its file's name, so
+     * refresh that first */
+    if (bs->file) {
+        bdrv_refresh_filename(bs->file);
+    }
+
+    if (drv->bdrv_refresh_filename) {
+        /* Obsolete information is of no use here, so drop the old file name
+         * information before refreshing it */
+        bs->exact_filename[0] = '\0';
+        if (bs->full_open_options) {
+            QDECREF(bs->full_open_options);
+            bs->full_open_options = NULL;
+        }
+
+        drv->bdrv_refresh_filename(bs);
+    } else if (bs->file) {
+        /* Try to reconstruct valid information from the underlying file */
+        bool has_open_options;
+
+        bs->exact_filename[0] = '\0';
+        if (bs->full_open_options) {
+            QDECREF(bs->full_open_options);
+            bs->full_open_options = NULL;
+        }
+
+        opts = qdict_new();
+        has_open_options = append_open_options(opts, bs);
+
+        /* If no specific options have been given for this BDS, the filename of
+         * the underlying file should suffice for this one as well */
+        if (bs->file->exact_filename[0] && !has_open_options) {
+            strcpy(bs->exact_filename, bs->file->exact_filename);
+        }
+        /* Reconstructing the full options QDict is simple for most format block
+         * drivers, as long as the full options are known for the underlying
+         * file BDS. The full options QDict of that file BDS should somehow
+         * contain a representation of the filename, therefore the following
+         * suffices without querying the (exact_)filename of this BDS. */
+        if (bs->file->full_open_options) {
+            qdict_put_obj(opts, "driver",
+                          QOBJECT(qstring_from_str(drv->format_name)));
+            QINCREF(bs->file->full_open_options);
+            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
+
+            bs->full_open_options = opts;
+        } else {
+            QDECREF(opts);
+        }
+    } else if (!bs->full_open_options && qdict_size(bs->options)) {
+        /* There is no underlying file BDS (at least referenced by BDS.file),
+         * so the full options QDict should be equal to the options given
+         * specifically for this block device when it was opened (plus the
+         * driver specification).
+         * Because those options don't change, there is no need to update
+         * full_open_options when it's already set. */
+
+        opts = qdict_new();
+        append_open_options(opts, bs);
+        qdict_put_obj(opts, "driver",
+                      QOBJECT(qstring_from_str(drv->format_name)));
+
+        if (bs->exact_filename[0]) {
+            /* This may not work for all block protocol drivers (some may
+             * require this filename to be parsed), but we have to find some
+             * default solution here, so just include it. If some block driver
+             * does not support pure options without any filename at all or
+             * needs some special format of the options QDict, it needs to
+             * implement the driver-specific bdrv_refresh_filename() function.
+             */
+            qdict_put_obj(opts, "filename",
+                          QOBJECT(qstring_from_str(bs->exact_filename)));
+        }
+
+        bs->full_open_options = opts;
+    }
+
+    if (bs->exact_filename[0]) {
+        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
+    } else if (bs->full_open_options) {
+        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
+        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
+                 qstring_get_str(json));
+        QDECREF(json);
+    }
+}
+
+/* This accessor function purpose is to allow the device models to access the
+ * BlockAcctStats structure embedded inside a BlockDriverState without being
+ * aware of the BlockDriverState structure layout.
+ * It will go away when the BlockAcctStats structure will be moved inside
+ * the device models.
+ */
+BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
+{
+    return &bs->stats;
+}
This page took 0.106245 seconds and 4 git commands to generate.