]> Git Repo - qemu.git/blobdiff - block.c
target-ppc: Refactor PPC970
[qemu.git] / block.c
diff --git a/block.c b/block.c
index d9c472b2f85c48feaa5519edce7157612a167c61..17f763db792f621eb1a3e1eec4b50b9a11762412 100644 (file)
--- a/block.c
+++ b/block.c
@@ -70,11 +70,11 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
-static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);
 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                                int64_t sector_num,
@@ -179,6 +179,7 @@ void bdrv_io_limits_enable(BlockDriverState *bs)
 {
     assert(!bs->io_limits_enabled);
     throttle_init(&bs->throttle_state,
+                  bdrv_get_aio_context(bs),
                   QEMU_CLOCK_VIRTUAL,
                   bdrv_throttle_read_timer_cb,
                   bdrv_throttle_write_timer_cb,
@@ -192,7 +193,7 @@ void bdrv_io_limits_enable(BlockDriverState *bs)
  * @is_write:   is the IO a write
  */
 static void bdrv_io_limits_intercept(BlockDriverState *bs,
-                                     int nb_sectors,
+                                     unsigned int bytes,
                                      bool is_write)
 {
     /* does this io must wait */
@@ -205,9 +206,8 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
     }
 
     /* the IO will be executed, do the accounting */
-    throttle_account(&bs->throttle_state,
-                     is_write,
-                     nb_sectors * BDRV_SECTOR_SIZE);
+    throttle_account(&bs->throttle_state, is_write, bytes);
+
 
     /* if the next request must wait -> do nothing */
     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
@@ -333,9 +333,21 @@ void bdrv_register(BlockDriver *bdrv)
 }
 
 /* create a new block device (by default it is empty) */
-BlockDriverState *bdrv_new(const char *device_name)
+BlockDriverState *bdrv_new(const char *device_name, Error **errp)
 {
     BlockDriverState *bs;
+    int i;
+
+    if (bdrv_find(device_name)) {
+        error_setg(errp, "Device with id '%s' already exists",
+                   device_name);
+        return NULL;
+    }
+    if (bdrv_find_node(device_name)) {
+        error_setg(errp, "Device with node-name '%s' already exists",
+                   device_name);
+        return NULL;
+    }
 
     bs = g_malloc0(sizeof(BlockDriverState));
     QLIST_INIT(&bs->dirty_bitmaps);
@@ -343,12 +355,16 @@ BlockDriverState *bdrv_new(const char *device_name)
     if (device_name[0] != '\0') {
         QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
     }
+    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
+        QLIST_INIT(&bs->op_blockers[i]);
+    }
     bdrv_iostatus_disable(bs);
     notifier_list_init(&bs->close_notifiers);
     notifier_with_return_list_init(&bs->before_write_notifiers);
     qemu_co_queue_init(&bs->throttled_reqs[0]);
     qemu_co_queue_init(&bs->throttled_reqs[1]);
     bs->refcnt = 1;
+    bs->aio_context = qemu_get_aio_context();
 
     return bs;
 }
@@ -422,7 +438,7 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque)
     assert(cco->drv);
 
     ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
-    if (error_is_set(&local_err)) {
+    if (local_err) {
         error_propagate(&cco->err, local_err);
     }
     cco->ret = ret;
@@ -461,7 +477,7 @@ int bdrv_create(BlockDriver *drv, const char* filename,
 
     ret = cco.ret;
     if (ret < 0) {
-        if (error_is_set(&cco.err)) {
+        if (cco.err) {
             error_propagate(errp, cco.err);
         } else {
             error_setg_errno(errp, -ret, "Could not create image");
@@ -487,7 +503,7 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
     }
 
     ret = bdrv_create(drv, filename, options, &local_err);
-    if (error_is_set(&local_err)) {
+    if (local_err) {
         error_propagate(errp, local_err);
     }
     return ret;
@@ -548,8 +564,9 @@ int get_tmp_filename(char *filename, int size)
     int fd;
     const char *tmpdir;
     tmpdir = getenv("TMPDIR");
-    if (!tmpdir)
-        tmpdir = "/tmp";
+    if (!tmpdir) {
+        tmpdir = "/var/tmp";
+    }
     if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
         return -EOVERFLOW;
     }
@@ -763,6 +780,50 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
     bs->copy_on_read--;
 }
 
+/*
+ * Returns the flags that a temporary snapshot should get, based on the
+ * originally requested flags (the originally requested image will have flags
+ * like a backing file)
+ */
+static int bdrv_temp_snapshot_flags(int flags)
+{
+    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
+}
+
+/*
+ * Returns the flags that bs->file should get, based on the given flags for
+ * the parent BDS
+ */
+static int bdrv_inherited_flags(int flags)
+{
+    /* Enable protocol handling, disable format probing for bs->file */
+    flags |= BDRV_O_PROTOCOL;
+
+    /* Our block drivers take care to send flushes and respect unmap policy,
+     * so we can enable both unconditionally on lower layers. */
+    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
+
+    /* Clear flags that only apply to the top layer */
+    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
+
+    return flags;
+}
+
+/*
+ * Returns the flags that bs->backing_hd should get, based on the given flags
+ * for the parent BDS
+ */
+static int bdrv_backing_flags(int flags)
+{
+    /* backing files always opened read-only */
+    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
+
+    /* snapshot=on is handled on the top layer */
+    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
+
+    return flags;
+}
+
 static int bdrv_open_flags(BlockDriverState *bs, int flags)
 {
     int open_flags = flags | BDRV_O_CACHE_WB;
@@ -776,38 +837,43 @@ static int bdrv_open_flags(BlockDriverState *bs, int flags)
     /*
      * Snapshots should be writable.
      */
-    if (bs->is_temporary) {
+    if (flags & BDRV_O_TEMPORARY) {
         open_flags |= BDRV_O_RDWR;
     }
 
     return open_flags;
 }
 
-static int bdrv_assign_node_name(BlockDriverState *bs,
-                                 const char *node_name,
-                                 Error **errp)
+static void bdrv_assign_node_name(BlockDriverState *bs,
+                                  const char *node_name,
+                                  Error **errp)
 {
     if (!node_name) {
-        return 0;
+        return;
     }
 
     /* empty string node name is invalid */
     if (node_name[0] == '\0') {
         error_setg(errp, "Empty node name");
-        return -EINVAL;
+        return;
+    }
+
+    /* takes care of avoiding namespaces collisions */
+    if (bdrv_find(node_name)) {
+        error_setg(errp, "node-name=%s is conflicting with a device id",
+                   node_name);
+        return;
     }
 
     /* takes care of avoiding duplicates node names */
     if (bdrv_find_node(node_name)) {
         error_setg(errp, "Duplicate node name");
-        return -EINVAL;
+        return;
     }
 
     /* copy node name into the bs and insert it into the graph list */
     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
-
-    return 0;
 }
 
 /*
@@ -833,12 +899,19 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
         filename = qdict_get_try_str(options, "filename");
     }
 
+    if (drv->bdrv_needs_filename && !filename) {
+        error_setg(errp, "The '%s' block driver requires a file name",
+                   drv->format_name);
+        return -EINVAL;
+    }
+
     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
 
     node_name = qdict_get_try_str(options, "node-name");
-    ret = bdrv_assign_node_name(bs, node_name, errp);
-    if (ret < 0) {
-        return ret;
+    bdrv_assign_node_name(bs, node_name, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return -EINVAL;
     }
     qdict_del(options, "node-name");
 
@@ -904,7 +977,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
     }
 
     if (ret < 0) {
-        if (error_is_set(&local_err)) {
+        if (local_err) {
             error_propagate(errp, local_err);
         } else if (bs->filename[0]) {
             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
@@ -922,14 +995,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 
     bdrv_refresh_limits(bs);
     assert(bdrv_opt_mem_align(bs) != 0);
-    assert(bs->request_alignment != 0);
-
-#ifndef _WIN32
-    if (bs->is_temporary) {
-        assert(bs->filename[0] != '\0');
-        unlink(bs->filename);
-    }
-#endif
+    assert((bs->request_alignment != 0) || bs->sg);
     return 0;
 
 free_and_fail:
@@ -943,55 +1009,28 @@ free_and_fail:
 /*
  * Opens a file using a protocol (file, host_device, nbd, ...)
  *
- * options is a QDict of options to pass to the block drivers, or NULL for an
- * empty set of options. The reference to the QDict belongs to the block layer
- * after the call (even on failure), so if the caller intends to reuse the
- * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
+ * options is an indirect pointer to a QDict of options to pass to the block
+ * drivers, or pointer to NULL for an empty set of options. If this function
+ * takes ownership of the QDict reference, it will set *options to NULL;
+ * otherwise, it will contain unused/unrecognized options after this function
+ * returns. Then, the caller is responsible for freeing it. If it intends to
+ * reuse the QDict, QINCREF() should be called beforehand.
  */
-int bdrv_file_open(BlockDriverState **pbs, const char *filename,
-                   const char *reference, QDict *options, int flags,
-                   Error **errp)
+static int bdrv_file_open(BlockDriverState *bs, const char *filename,
+                          QDict **options, int flags, Error **errp)
 {
-    BlockDriverState *bs = NULL;
     BlockDriver *drv;
     const char *drvname;
-    bool allow_protocol_prefix = false;
+    bool parse_filename = false;
     Error *local_err = NULL;
     int ret;
 
-    /* NULL means an empty set of options */
-    if (options == NULL) {
-        options = qdict_new();
-    }
-
-    if (reference) {
-        if (filename || qdict_size(options)) {
-            error_setg(errp, "Cannot reference an existing block device with "
-                       "additional options or a new filename");
-            return -EINVAL;
-        }
-        QDECREF(options);
-
-        bs = bdrv_find(reference);
-        if (!bs) {
-            error_setg(errp, "Cannot find block device '%s'", reference);
-            return -ENODEV;
-        }
-        bdrv_ref(bs);
-        *pbs = bs;
-        return 0;
-    }
-
-    bs = bdrv_new("");
-    bs->options = options;
-    options = qdict_clone_shallow(options);
-
     /* Fetch the file name from the options QDict if necessary */
     if (!filename) {
-        filename = qdict_get_try_str(options, "filename");
-    } else if (filename && !qdict_haskey(options, "filename")) {
-        qdict_put(options, "filename", qstring_from_str(filename));
-        allow_protocol_prefix = true;
+        filename = qdict_get_try_str(*options, "filename");
+    } else if (filename && !qdict_haskey(*options, "filename")) {
+        qdict_put(*options, "filename", qstring_from_str(filename));
+        parse_filename = true;
     } else {
         error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                    "same time");
@@ -1000,15 +1039,15 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename,
     }
 
     /* Find the right block driver */
-    drvname = qdict_get_try_str(options, "driver");
+    drvname = qdict_get_try_str(*options, "driver");
     if (drvname) {
         drv = bdrv_find_format(drvname);
         if (!drv) {
             error_setg(errp, "Unknown driver '%s'", drvname);
         }
-        qdict_del(options, "driver");
+        qdict_del(*options, "driver");
     } else if (filename) {
-        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
+        drv = bdrv_find_protocol(filename, parse_filename);
         if (!drv) {
             error_setg(errp, "Unknown protocol");
         }
@@ -1024,55 +1063,70 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename,
     }
 
     /* Parse the filename and open it */
-    if (drv->bdrv_parse_filename && filename) {
-        drv->bdrv_parse_filename(filename, options, &local_err);
-        if (error_is_set(&local_err)) {
+    if (drv->bdrv_parse_filename && parse_filename) {
+        drv->bdrv_parse_filename(filename, *options, &local_err);
+        if (local_err) {
             error_propagate(errp, local_err);
             ret = -EINVAL;
             goto fail;
         }
-        qdict_del(options, "filename");
-    } else if (drv->bdrv_needs_filename && !filename) {
-        error_setg(errp, "The '%s' block driver requires a file name",
-                   drv->format_name);
-        ret = -EINVAL;
-        goto fail;
+
+        if (!drv->bdrv_needs_filename) {
+            qdict_del(*options, "filename");
+        } else {
+            filename = qdict_get_str(*options, "filename");
+        }
     }
 
     if (!drv->bdrv_file_open) {
-        ret = bdrv_open(bs, filename, options, flags, drv, &local_err);
-        options = NULL;
+        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
+        *options = NULL;
     } else {
-        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
+        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
     }
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto fail;
     }
 
-    /* Check if any unknown options were used */
-    if (options && (qdict_size(options) != 0)) {
-        const QDictEntry *entry = qdict_first(options);
-        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
-                   drv->format_name, entry->key);
-        ret = -EINVAL;
-        goto fail;
-    }
-    QDECREF(options);
-
     bs->growable = 1;
-    *pbs = bs;
     return 0;
 
 fail:
-    QDECREF(options);
-    if (!bs->drv) {
-        QDECREF(bs->options);
-    }
-    bdrv_unref(bs);
     return ret;
 }
 
+void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
+{
+
+    if (bs->backing_hd) {
+        assert(bs->backing_blocker);
+        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
+    } else if (backing_hd) {
+        error_setg(&bs->backing_blocker,
+                   "device is used as backing hd of '%s'",
+                   bs->device_name);
+    }
+
+    bs->backing_hd = backing_hd;
+    if (!backing_hd) {
+        error_free(bs->backing_blocker);
+        bs->backing_blocker = NULL;
+        goto out;
+    }
+    bs->open_flags &= ~BDRV_O_NO_BACKING;
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
+    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
+            backing_hd->drv ? backing_hd->drv->format_name : "");
+
+    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
+    /* Otherwise we won't be able to commit due to check in bdrv_commit */
+    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
+                    bs->backing_blocker);
+out:
+    bdrv_refresh_limits(bs);
+}
+
 /*
  * Opens the backing file for a BlockDriverState if not yet open
  *
@@ -1083,14 +1137,15 @@ fail:
  */
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
 {
-    char backing_filename[PATH_MAX];
-    int back_flags, ret;
+    char *backing_filename = g_malloc0(PATH_MAX);
+    int ret = 0;
     BlockDriver *back_drv = NULL;
+    BlockDriverState *backing_hd;
     Error *local_err = NULL;
 
     if (bs->backing_hd != NULL) {
         QDECREF(options);
-        return 0;
+        goto free_exit;
     }
 
     /* NULL means an empty set of options */
@@ -1103,54 +1158,41 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
         backing_filename[0] = '\0';
     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
         QDECREF(options);
-        return 0;
+        goto free_exit;
     } else {
-        bdrv_get_full_backing_filename(bs, backing_filename,
-                                       sizeof(backing_filename));
+        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
     }
 
-    bs->backing_hd = bdrv_new("");
+    backing_hd = bdrv_new("", errp);
 
     if (bs->backing_format[0] != '\0') {
         back_drv = bdrv_find_format(bs->backing_format);
     }
 
-    /* backing files always opened read-only */
-    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
-                                    BDRV_O_COPY_ON_READ);
-
-    ret = bdrv_open(bs->backing_hd,
-                    *backing_filename ? backing_filename : NULL, options,
-                    back_flags, back_drv, &local_err);
+    assert(bs->backing_hd == NULL);
+    ret = bdrv_open(&backing_hd,
+                    *backing_filename ? backing_filename : NULL, NULL, options,
+                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
     if (ret < 0) {
-        bdrv_unref(bs->backing_hd);
-        bs->backing_hd = NULL;
+        bdrv_unref(backing_hd);
+        backing_hd = NULL;
         bs->open_flags |= BDRV_O_NO_BACKING;
         error_setg(errp, "Could not open backing file: %s",
                    error_get_pretty(local_err));
         error_free(local_err);
-        return ret;
+        goto free_exit;
     }
+    bdrv_set_backing_hd(bs, backing_hd);
 
-    if (bs->backing_hd->file) {
-        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
-                bs->backing_hd->file->filename);
-    }
-
-    /* Recalculate the BlockLimits with the backing file */
-    bdrv_refresh_limits(bs);
-
-    return 0;
+free_exit:
+    g_free(backing_filename);
+    return ret;
 }
 
 /*
  * Opens a disk image whose options are given as BlockdevRef in another block
  * device's options.
  *
- * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
- * image format auto-detection. If it is false and a filename is given,
- * bdrv_open() will be used for auto-detection.
- *
  * If allow_none is true, no image will be opened if filename is false and no
  * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
  *
@@ -1160,16 +1202,21 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
  * BlockdevRef.
  *
  * The BlockdevRef will be removed from the options QDict.
+ *
+ * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
  */
 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                     QDict *options, const char *bdref_key, int flags,
-                    bool force_raw, bool allow_none, Error **errp)
+                    bool allow_none, Error **errp)
 {
     QDict *image_options;
     int ret;
     char *bdref_key_dot;
     const char *reference;
 
+    assert(pbs);
+    assert(*pbs == NULL);
+
     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
     g_free(bdref_key_dot);
@@ -1183,39 +1230,112 @@ int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                        bdref_key);
             ret = -EINVAL;
         }
+        QDECREF(image_options);
         goto done;
     }
 
-    if (filename && !force_raw) {
-        /* If a filename is given and the block driver should be detected
-           automatically (instead of using none), use bdrv_open() in order to do
-           that auto-detection. */
-        BlockDriverState *bs;
-
-        if (reference) {
-            error_setg(errp, "Cannot reference an existing block device while "
-                       "giving a filename");
-            ret = -EINVAL;
-            goto done;
-        }
-
-        bs = bdrv_new("");
-        ret = bdrv_open(bs, filename, image_options, flags, NULL, errp);
-        if (ret < 0) {
-            bdrv_unref(bs);
-        } else {
-            *pbs = bs;
-        }
-    } else {
-        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
-                             errp);
-    }
+    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
 
 done:
     qdict_del(options, bdref_key);
     return ret;
 }
 
+void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
+{
+    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
+    char *tmp_filename = g_malloc0(PATH_MAX + 1);
+    int64_t total_size;
+    BlockDriver *bdrv_qcow2;
+    QEMUOptionParameter *create_options;
+    QDict *snapshot_options;
+    BlockDriverState *bs_snapshot;
+    Error *local_err;
+    int ret;
+
+    /* if snapshot, we create a temporary backing file and open it
+       instead of opening 'filename' directly */
+
+    /* Get the required size from the image */
+    total_size = bdrv_getlength(bs);
+    if (total_size < 0) {
+        error_setg_errno(errp, -total_size, "Could not get image size");
+        goto out;
+    }
+    total_size &= BDRV_SECTOR_MASK;
+
+    /* Create the temporary image */
+    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Could not get temporary filename");
+        goto out;
+    }
+
+    bdrv_qcow2 = bdrv_find_format("qcow2");
+    create_options = parse_option_parameters("", bdrv_qcow2->create_options,
+                                             NULL);
+
+    set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
+
+    ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
+    free_option_parameters(create_options);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Could not create temporary overlay "
+                         "'%s': %s", tmp_filename,
+                         error_get_pretty(local_err));
+        error_free(local_err);
+        goto out;
+    }
+
+    /* Prepare a new options QDict for the temporary file */
+    snapshot_options = qdict_new();
+    qdict_put(snapshot_options, "file.driver",
+              qstring_from_str("file"));
+    qdict_put(snapshot_options, "file.filename",
+              qstring_from_str(tmp_filename));
+
+    bs_snapshot = bdrv_new("", &error_abort);
+
+    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
+                    flags, bdrv_qcow2, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto out;
+    }
+
+    bdrv_append(bs_snapshot, bs);
+
+out:
+    g_free(tmp_filename);
+}
+
+static QDict *parse_json_filename(const char *filename, Error **errp)
+{
+    QObject *options_obj;
+    QDict *options;
+    int ret;
+
+    ret = strstart(filename, "json:", &filename);
+    assert(ret);
+
+    options_obj = qobject_from_json(filename);
+    if (!options_obj) {
+        error_setg(errp, "Could not parse the JSON options");
+        return NULL;
+    }
+
+    if (qobject_type(options_obj) != QTYPE_QDICT) {
+        qobject_decref(options_obj);
+        error_setg(errp, "Invalid JSON object given");
+        return NULL;
+    }
+
+    options = qobject_to_qdict(options_obj);
+    qdict_flatten(options);
+
+    return options;
+}
+
 /*
  * Opens a disk image (raw, qcow2, vmdk, ...)
  *
@@ -1223,102 +1343,106 @@ done:
  * empty set of options. The reference to the QDict belongs to the block layer
  * after the call (even on failure), so if the caller intends to reuse the
  * dictionary, it needs to use QINCREF() before calling bdrv_open.
+ *
+ * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
+ * If it is not NULL, the referenced BDS will be reused.
+ *
+ * The reference parameter may be used to specify an existing block device which
+ * should be opened. If specified, neither options nor a filename may be given,
+ * nor can an existing BDS be reused (that is, *pbs has to be NULL).
  */
-int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
-              int flags, BlockDriver *drv, Error **errp)
+int bdrv_open(BlockDriverState **pbs, const char *filename,
+              const char *reference, QDict *options, int flags,
+              BlockDriver *drv, Error **errp)
 {
     int ret;
-    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
-    char tmp_filename[PATH_MAX + 1];
-    BlockDriverState *file = NULL;
+    BlockDriverState *file = NULL, *bs;
     const char *drvname;
     Error *local_err = NULL;
+    int snapshot_flags = 0;
 
-    /* NULL means an empty set of options */
-    if (options == NULL) {
-        options = qdict_new();
-    }
+    assert(pbs);
 
-    bs->options = options;
-    options = qdict_clone_shallow(options);
+    if (reference) {
+        bool options_non_empty = options ? qdict_size(options) : false;
+        QDECREF(options);
 
-    /* For snapshot=on, create a temporary qcow2 overlay */
-    if (flags & BDRV_O_SNAPSHOT) {
-        BlockDriverState *bs1;
-        int64_t total_size;
-        BlockDriver *bdrv_qcow2;
-        QEMUOptionParameter *create_options;
-        QDict *snapshot_options;
-
-        /* if snapshot, we create a temporary backing file and open it
-           instead of opening 'filename' directly */
-
-        /* Get the required size from the image */
-        bs1 = bdrv_new("");
-        QINCREF(options);
-        ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
-                        drv, &local_err);
-        if (ret < 0) {
-            bdrv_unref(bs1);
-            goto fail;
+        if (*pbs) {
+            error_setg(errp, "Cannot reuse an existing BDS when referencing "
+                       "another block device");
+            return -EINVAL;
         }
-        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
 
-        bdrv_unref(bs1);
+        if (filename || options_non_empty) {
+            error_setg(errp, "Cannot reference an existing block device with "
+                       "additional options or a new filename");
+            return -EINVAL;
+        }
 
-        /* Create the temporary image */
-        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
-        if (ret < 0) {
-            error_setg_errno(errp, -ret, "Could not get temporary filename");
-            goto fail;
+        bs = bdrv_lookup_bs(reference, reference, errp);
+        if (!bs) {
+            return -ENODEV;
         }
+        bdrv_ref(bs);
+        *pbs = bs;
+        return 0;
+    }
 
-        bdrv_qcow2 = bdrv_find_format("qcow2");
-        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
-                                                 NULL);
+    if (*pbs) {
+        bs = *pbs;
+    } else {
+        bs = bdrv_new("", &error_abort);
+    }
 
-        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
+    /* NULL means an empty set of options */
+    if (options == NULL) {
+        options = qdict_new();
+    }
 
-        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
-        free_option_parameters(create_options);
-        if (ret < 0) {
-            error_setg_errno(errp, -ret, "Could not create temporary overlay "
-                             "'%s': %s", tmp_filename,
-                             error_get_pretty(local_err));
-            error_free(local_err);
-            local_err = NULL;
+    if (filename && g_str_has_prefix(filename, "json:")) {
+        QDict *json_options = parse_json_filename(filename, &local_err);
+        if (local_err) {
+            ret = -EINVAL;
             goto fail;
         }
 
-        /* Prepare a new options QDict for the temporary file, where user
-         * options refer to the backing file */
-        if (filename) {
-            qdict_put(options, "file.filename", qstring_from_str(filename));
-        }
-        if (drv) {
-            qdict_put(options, "driver", qstring_from_str(drv->format_name));
-        }
-
-        snapshot_options = qdict_new();
-        qdict_put(snapshot_options, "backing", options);
-        qdict_flatten(snapshot_options);
+        /* Options given in the filename have lower priority than options
+         * specified directly */
+        qdict_join(options, json_options, false);
+        QDECREF(json_options);
+        filename = NULL;
+    }
 
-        bs->options = snapshot_options;
-        options = qdict_clone_shallow(bs->options);
+    bs->options = options;
+    options = qdict_clone_shallow(options);
 
-        filename = tmp_filename;
-        drv = bdrv_qcow2;
-        bs->is_temporary = 1;
+    if (flags & BDRV_O_PROTOCOL) {
+        assert(!drv);
+        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
+                             &local_err);
+        if (!ret) {
+            drv = bs->drv;
+            goto done;
+        } else if (bs->drv) {
+            goto close_and_fail;
+        } else {
+            goto fail;
+        }
     }
 
     /* Open image file without format layer */
     if (flags & BDRV_O_RDWR) {
         flags |= BDRV_O_ALLOW_RDWR;
     }
+    if (flags & BDRV_O_SNAPSHOT) {
+        snapshot_flags = bdrv_temp_snapshot_flags(flags);
+        flags = bdrv_backing_flags(flags);
+    }
 
+    assert(file == NULL);
     ret = bdrv_open_image(&file, filename, options, "file",
-                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
-                          &local_err);
+                          bdrv_inherited_flags(flags),
+                          true, &local_err);
     if (ret < 0) {
         goto fail;
     }
@@ -1331,7 +1455,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
         if (!drv) {
             error_setg(errp, "Invalid driver: '%s'", drvname);
             ret = -EINVAL;
-            goto unlink_and_fail;
+            goto fail;
         }
     }
 
@@ -1341,18 +1465,18 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
         } else {
             error_setg(errp, "Must specify either driver or file");
             ret = -EINVAL;
-            goto unlink_and_fail;
+            goto fail;
         }
     }
 
     if (!drv) {
-        goto unlink_and_fail;
+        goto fail;
     }
 
     /* Open the image */
     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
     if (ret < 0) {
-        goto unlink_and_fail;
+        goto fail;
     }
 
     if (file && (bs->file != file)) {
@@ -1371,44 +1495,76 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
         }
     }
 
+    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
+     * temporary snapshot afterwards. */
+    if (snapshot_flags) {
+        bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            goto close_and_fail;
+        }
+    }
+
+
+done:
     /* Check if any unknown options were used */
-    if (qdict_size(options) != 0) {
+    if (options && (qdict_size(options) != 0)) {
         const QDictEntry *entry = qdict_first(options);
-        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
-                   "support the option '%s'", drv->format_name, bs->device_name,
-                   entry->key);
+        if (flags & BDRV_O_PROTOCOL) {
+            error_setg(errp, "Block protocol '%s' doesn't support the option "
+                       "'%s'", drv->format_name, entry->key);
+        } else {
+            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
+                       "support the option '%s'", drv->format_name,
+                       bs->device_name, entry->key);
+        }
 
         ret = -EINVAL;
         goto close_and_fail;
     }
-    QDECREF(options);
 
     if (!bdrv_key_required(bs)) {
         bdrv_dev_change_media_cb(bs, true);
+    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
+               && !runstate_check(RUN_STATE_INMIGRATE)
+               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
+        error_setg(errp,
+                   "Guest must be stopped for opening of encrypted image");
+        ret = -EBUSY;
+        goto close_and_fail;
     }
 
+    QDECREF(options);
+    *pbs = bs;
     return 0;
 
-unlink_and_fail:
+fail:
     if (file != NULL) {
         bdrv_unref(file);
     }
-    if (bs->is_temporary) {
-        unlink(filename);
-    }
-fail:
     QDECREF(bs->options);
     QDECREF(options);
     bs->options = NULL;
-    if (error_is_set(&local_err)) {
+    if (!*pbs) {
+        /* If *pbs is NULL, a new BDS has been created in this function and
+           needs to be freed now. Otherwise, it does not need to be closed,
+           since it has not really been opened yet. */
+        bdrv_unref(bs);
+    }
+    if (local_err) {
         error_propagate(errp, local_err);
     }
     return ret;
 
 close_and_fail:
-    bdrv_close(bs);
+    /* See fail path, but now the BDS has to be always closed */
+    if (*pbs) {
+        bdrv_close(bs);
+    } else {
+        bdrv_unref(bs);
+    }
     QDECREF(options);
-    if (error_is_set(&local_err)) {
+    if (local_err) {
         error_propagate(errp, local_err);
     }
     return ret;
@@ -1449,8 +1605,11 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
         QSIMPLEQ_INIT(bs_queue);
     }
 
+    /* bdrv_open() masks this flag out */
+    flags &= ~BDRV_O_PROTOCOL;
+
     if (bs->file) {
-        bdrv_reopen_queue(bs_queue, bs->file, flags);
+        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
     }
 
     bs_entry = g_new0(BlockReopenQueueEntry, 1);
@@ -1660,16 +1819,12 @@ void bdrv_close(BlockDriverState *bs)
 
     if (bs->drv) {
         if (bs->backing_hd) {
-            bdrv_unref(bs->backing_hd);
-            bs->backing_hd = NULL;
+            BlockDriverState *backing_hd = bs->backing_hd;
+            bdrv_set_backing_hd(bs, NULL);
+            bdrv_unref(backing_hd);
         }
         bs->drv->bdrv_close(bs);
         g_free(bs->opaque);
-#ifdef _WIN32
-        if (bs->is_temporary) {
-            unlink(bs->filename);
-        }
-#endif
         bs->opaque = NULL;
         bs->drv = NULL;
         bs->copy_on_read = 0;
@@ -1703,7 +1858,11 @@ void bdrv_close_all(void)
     BlockDriverState *bs;
 
     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(aio_context);
         bdrv_close(bs);
+        aio_context_release(aio_context);
     }
 }
 
@@ -1728,17 +1887,6 @@ static bool bdrv_requests_pending(BlockDriverState *bs)
     return false;
 }
 
-static bool bdrv_requests_pending_all(void)
-{
-    BlockDriverState *bs;
-    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        if (bdrv_requests_pending(bs)) {
-            return true;
-        }
-    }
-    return false;
-}
-
 /*
  * Wait for pending requests to complete across all BlockDriverStates
  *
@@ -1758,12 +1906,20 @@ void bdrv_drain_all(void)
     BlockDriverState *bs;
 
     while (busy) {
+        busy = false;
+
         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+            AioContext *aio_context = bdrv_get_aio_context(bs);
+            bool bs_busy;
+
+            aio_context_acquire(aio_context);
             bdrv_start_throttled_reqs(bs);
-        }
+            bs_busy = bdrv_requests_pending(bs);
+            bs_busy |= aio_poll(aio_context, bs_busy);
+            aio_context_release(aio_context);
 
-        busy = bdrv_requests_pending_all();
-        busy |= aio_poll(qemu_get_aio_context(), busy);
+            busy |= bs_busy;
+        }
     }
 }
 
@@ -1793,7 +1949,6 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                      BlockDriverState *bs_src)
 {
     /* move some fields that need to stay attached to the device */
-    bs_dest->open_flags         = bs_src->open_flags;
 
     /* dev info */
     bs_dest->dev_ops            = bs_src->dev_ops;
@@ -1827,18 +1982,14 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     bs_dest->refcnt             = bs_src->refcnt;
 
     /* job */
-    bs_dest->in_use             = bs_src->in_use;
     bs_dest->job                = bs_src->job;
 
     /* keep the same entry in bdrv_states */
     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
             bs_src->device_name);
     bs_dest->device_list = bs_src->device_list;
-
-    /* keep the same entry in graph_bdrv_states
-     * We do want to swap name but don't want to swap linked list entries
-     */
-    bs_dest->node_list   = bs_src->node_list;
+    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
+           sizeof(bs_dest->op_blockers));
 }
 
 /*
@@ -1857,12 +2008,22 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
 {
     BlockDriverState tmp;
 
+    /* The code needs to swap the node_name but simply swapping node_list won't
+     * work so first remove the nodes from the graph list, do the swap then
+     * insert them back if needed.
+     */
+    if (bs_new->node_name[0] != '\0') {
+        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
+    }
+    if (bs_old->node_name[0] != '\0') {
+        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
+    }
+
     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
     assert(bs_new->device_name[0] == '\0');
     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
     assert(bs_new->job == NULL);
     assert(bs_new->dev == NULL);
-    assert(bs_new->in_use == 0);
     assert(bs_new->io_limits_enabled == false);
     assert(!throttle_have_timer(&bs_new->throttle_state));
 
@@ -1881,10 +2042,17 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
     /* Check a few fields that should remain attached to the device */
     assert(bs_new->dev == NULL);
     assert(bs_new->job == NULL);
-    assert(bs_new->in_use == 0);
     assert(bs_new->io_limits_enabled == false);
     assert(!throttle_have_timer(&bs_new->throttle_state));
 
+    /* insert the nodes back into the graph node list if needed */
+    if (bs_new->node_name[0] != '\0') {
+        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
+    }
+    if (bs_old->node_name[0] != '\0') {
+        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
+    }
+
     bdrv_rebind(bs_new);
     bdrv_rebind(bs_old);
 }
@@ -1906,19 +2074,14 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
 
     /* The contents of 'tmp' will become bs_top, as we are
      * swapping bs_new and bs_top contents. */
-    bs_top->backing_hd = bs_new;
-    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
-    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
-            bs_new->filename);
-    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
-            bs_new->drv ? bs_new->drv->format_name : "");
+    bdrv_set_backing_hd(bs_top, bs_new);
 }
 
 static void bdrv_delete(BlockDriverState *bs)
 {
     assert(!bs->dev);
     assert(!bs->job);
-    assert(!bs->in_use);
+    assert(bdrv_op_blocker_is_empty(bs));
     assert(!bs->refcnt);
     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
 
@@ -2100,7 +2263,8 @@ int bdrv_commit(BlockDriverState *bs)
         return -ENOTSUP;
     }
 
-    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
+    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
+        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
         return -EBUSY;
     }
 
@@ -2191,12 +2355,17 @@ int bdrv_commit_all(void)
     BlockDriverState *bs;
 
     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(aio_context);
         if (bs->drv && bs->backing_hd) {
             int ret = bdrv_commit(bs);
             if (ret < 0) {
+                aio_context_release(aio_context);
                 return ret;
             }
         }
+        aio_context_release(aio_context);
     }
     return 0;
 }
@@ -2208,6 +2377,10 @@ int bdrv_commit_all(void)
  */
 static void tracked_request_end(BdrvTrackedRequest *req)
 {
+    if (req->serialising) {
+        req->bs->serialising_in_flight--;
+    }
+
     QLIST_REMOVE(req, list);
     qemu_co_queue_restart_all(&req->wait_queue);
 }
@@ -2217,15 +2390,18 @@ static void tracked_request_end(BdrvTrackedRequest *req)
  */
 static void tracked_request_begin(BdrvTrackedRequest *req,
                                   BlockDriverState *bs,
-                                  int64_t sector_num,
-                                  int nb_sectors, bool is_write)
+                                  int64_t offset,
+                                  unsigned int bytes, bool is_write)
 {
     *req = (BdrvTrackedRequest){
         .bs = bs,
-        .sector_num = sector_num,
-        .nb_sectors = nb_sectors,
-        .is_write = is_write,
-        .co = qemu_coroutine_self(),
+        .offset         = offset,
+        .bytes          = bytes,
+        .is_write       = is_write,
+        .co             = qemu_coroutine_self(),
+        .serialising    = false,
+        .overlap_offset = offset,
+        .overlap_bytes  = bytes,
     };
 
     qemu_co_queue_init(&req->wait_queue);
@@ -2233,6 +2409,21 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 }
 
+static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
+{
+    int64_t overlap_offset = req->offset & ~(align - 1);
+    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
+                               - overlap_offset;
+
+    if (!req->serialising) {
+        req->bs->serialising_in_flight++;
+        req->serialising = true;
+    }
+
+    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
+    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
+}
+
 /**
  * Round a region to cluster boundaries
  */
@@ -2254,53 +2445,75 @@ void bdrv_round_to_clusters(BlockDriverState *bs,
     }
 }
 
+static int bdrv_get_cluster_size(BlockDriverState *bs)
+{
+    BlockDriverInfo bdi;
+    int ret;
+
+    ret = bdrv_get_info(bs, &bdi);
+    if (ret < 0 || bdi.cluster_size == 0) {
+        return bs->request_alignment;
+    } else {
+        return bdi.cluster_size;
+    }
+}
+
 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
-                                     int64_t sector_num, int nb_sectors) {
+                                     int64_t offset, unsigned int bytes)
+{
     /*        aaaa   bbbb */
-    if (sector_num >= req->sector_num + req->nb_sectors) {
+    if (offset >= req->overlap_offset + req->overlap_bytes) {
         return false;
     }
     /* bbbb   aaaa        */
-    if (req->sector_num >= sector_num + nb_sectors) {
+    if (req->overlap_offset >= offset + bytes) {
         return false;
     }
     return true;
 }
 
-static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors)
+static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
+    BlockDriverState *bs = self->bs;
     BdrvTrackedRequest *req;
-    int64_t cluster_sector_num;
-    int cluster_nb_sectors;
     bool retry;
+    bool waited = false;
 
-    /* If we touch the same cluster it counts as an overlap.  This guarantees
-     * that allocating writes will be serialized and not race with each other
-     * for the same cluster.  For example, in copy-on-read it ensures that the
-     * CoR read and write operations are atomic and guest writes cannot
-     * interleave between them.
-     */
-    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
-                           &cluster_sector_num, &cluster_nb_sectors);
+    if (!bs->serialising_in_flight) {
+        return false;
+    }
 
     do {
         retry = false;
         QLIST_FOREACH(req, &bs->tracked_requests, list) {
-            if (tracked_request_overlaps(req, cluster_sector_num,
-                                         cluster_nb_sectors)) {
+            if (req == self || (!req->serialising && !self->serialising)) {
+                continue;
+            }
+            if (tracked_request_overlaps(req, self->overlap_offset,
+                                         self->overlap_bytes))
+            {
                 /* Hitting this means there was a reentrant request, for
                  * example, a block driver issuing nested requests.  This must
                  * never happen since it means deadlock.
                  */
                 assert(qemu_coroutine_self() != req->co);
 
-                qemu_co_queue_wait(&req->wait_queue);
-                retry = true;
-                break;
+                /* If the request is already (indirectly) waiting for us, or
+                 * will wait for us as soon as it wakes up, then just go on
+                 * (instead of producing a deadlock in the former case). */
+                if (!req->waiting_for) {
+                    self->waiting_for = req;
+                    qemu_co_queue_wait(&req->wait_queue);
+                    self->waiting_for = NULL;
+                    retry = true;
+                    waited = true;
+                    break;
+                }
             }
         }
     } while (retry);
+
+    return waited;
 }
 
 /*
@@ -2460,13 +2673,11 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
     if (ret) {
         goto exit;
     }
-    new_top_bs->backing_hd = base_bs;
-
-    bdrv_refresh_limits(new_top_bs);
+    bdrv_set_backing_hd(new_top_bs, base_bs);
 
     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
         /* so that bdrv_close() does not recursively close the chain */
-        intermediate_state->bs->backing_hd = NULL;
+        bdrv_set_backing_hd(intermediate_state->bs, NULL);
         bdrv_unref(intermediate_state->bs);
     }
     ret = 0;
@@ -2484,6 +2695,10 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 {
     int64_t len;
 
+    if (size > INT_MAX) {
+        return -EIO;
+    }
+
     if (!bdrv_is_inserted(bs))
         return -ENOMEDIUM;
 
@@ -2504,14 +2719,17 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                               int nb_sectors)
 {
+    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+        return -EIO;
+    }
+
     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                    nb_sectors * BDRV_SECTOR_SIZE);
 }
 
 typedef struct RwCo {
     BlockDriverState *bs;
-    int64_t sector_num;
-    int nb_sectors;
+    int64_t offset;
     QEMUIOVector *qiov;
     bool is_write;
     int ret;
@@ -2523,34 +2741,32 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque)
     RwCo *rwco = opaque;
 
     if (!rwco->is_write) {
-        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
-                                     rwco->nb_sectors, rwco->qiov,
-                                     rwco->flags);
-    } else {
-        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
-                                      rwco->nb_sectors, rwco->qiov,
+        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
+                                      rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
+    } else {
+        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
+                                       rwco->qiov->size, rwco->qiov,
+                                       rwco->flags);
     }
 }
 
 /*
  * Process a vectored synchronous request using coroutines
  */
-static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
-                       QEMUIOVector *qiov, bool is_write,
-                       BdrvRequestFlags flags)
+static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
+                        QEMUIOVector *qiov, bool is_write,
+                        BdrvRequestFlags flags)
 {
     Coroutine *co;
     RwCo rwco = {
         .bs = bs,
-        .sector_num = sector_num,
-        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
+        .offset = offset,
         .qiov = qiov,
         .is_write = is_write,
         .ret = NOT_DONE,
         .flags = flags,
     };
-    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
 
     /**
      * In sync call context, when the vcpu is blocked, this throttling timer
@@ -2567,10 +2783,12 @@ static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
         /* Fast-path if already in coroutine context */
         bdrv_rw_co_entry(&rwco);
     } else {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
         co = qemu_coroutine_create(bdrv_rw_co_entry);
         qemu_coroutine_enter(co, &rwco);
         while (rwco.ret == NOT_DONE) {
-            qemu_aio_wait();
+            aio_poll(aio_context, true);
         }
     }
     return rwco.ret;
@@ -2588,8 +2806,13 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
     };
 
+    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+        return -EINVAL;
+    }
+
     qemu_iovec_init_external(&qiov, &iov, 1);
-    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
+    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
+                        &qiov, is_write, flags);
 }
 
 /* return < 0 if error. See bdrv_write() for the return codes */
@@ -2625,11 +2848,6 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 }
 
-int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
-{
-    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
-}
-
 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                       int nb_sectors, BdrvRequestFlags flags)
 {
@@ -2647,10 +2865,16 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
  */
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
 {
-    int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
+    int64_t target_size;
     int64_t ret, nb_sectors, sector_num = 0;
     int n;
 
+    target_size = bdrv_getlength(bs);
+    if (target_size < 0) {
+        return target_size;
+    }
+    target_size /= BDRV_SECTOR_SIZE;
+
     for (;;) {
         nb_sectors = target_size - sector_num;
         if (nb_sectors <= 0) {
@@ -2679,117 +2903,53 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
     }
 }
 
-int bdrv_pread(BlockDriverState *bs, int64_t offset,
-               void *buf, int count1)
+int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
 {
-    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
-    int len, nb_sectors, count;
-    int64_t sector_num;
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = (void *)buf,
+        .iov_len = bytes,
+    };
     int ret;
 
-    count = count1;
-    /* first read to align to sector start */
-    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
-    if (len > count)
-        len = count;
-    sector_num = offset >> BDRV_SECTOR_BITS;
-    if (len > 0) {
-        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
-        count -= len;
-        if (count == 0)
-            return count1;
-        sector_num++;
-        buf += len;
-    }
-
-    /* read the sectors "in place" */
-    nb_sectors = count >> BDRV_SECTOR_BITS;
-    if (nb_sectors > 0) {
-        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
-            return ret;
-        sector_num += nb_sectors;
-        len = nb_sectors << BDRV_SECTOR_BITS;
-        buf += len;
-        count -= len;
+    if (bytes < 0) {
+        return -EINVAL;
+    }
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
+    if (ret < 0) {
+        return ret;
     }
 
-    /* add data from the last sector */
-    if (count > 0) {
-        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        memcpy(buf, tmp_buf, count);
-    }
-    return count1;
+    return bytes;
 }
 
 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
 {
-    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
-    int len, nb_sectors, count;
-    int64_t sector_num;
     int ret;
 
-    count = qiov->size;
-
-    /* first write to align to sector start */
-    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
-    if (len > count)
-        len = count;
-    sector_num = offset >> BDRV_SECTOR_BITS;
-    if (len > 0) {
-        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
-                          len);
-        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        count -= len;
-        if (count == 0)
-            return qiov->size;
-        sector_num++;
-    }
-
-    /* write the sectors "in place" */
-    nb_sectors = count >> BDRV_SECTOR_BITS;
-    if (nb_sectors > 0) {
-        QEMUIOVector qiov_inplace;
-
-        qemu_iovec_init(&qiov_inplace, qiov->niov);
-        qemu_iovec_concat(&qiov_inplace, qiov, len,
-                          nb_sectors << BDRV_SECTOR_BITS);
-        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
-        qemu_iovec_destroy(&qiov_inplace);
-        if (ret < 0) {
-            return ret;
-        }
-
-        sector_num += nb_sectors;
-        len = nb_sectors << BDRV_SECTOR_BITS;
-        count -= len;
+    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
+    if (ret < 0) {
+        return ret;
     }
 
-    /* add data from the last sector */
-    if (count > 0) {
-        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
-        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-    }
     return qiov->size;
 }
 
 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
-                const void *buf, int count1)
+                const void *buf, int bytes)
 {
     QEMUIOVector qiov;
     struct iovec iov = {
         .iov_base   = (void *) buf,
-        .iov_len    = count1,
+        .iov_len    = bytes,
     };
 
+    if (bytes < 0) {
+        return -EINVAL;
+    }
+
     qemu_iovec_init_external(&qiov, &iov, 1);
     return bdrv_pwritev(bs, offset, &qiov);
 }
@@ -2890,10 +3050,10 @@ err:
  * implemented by the caller.
  */
 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
-    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
+    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+    int64_t align, QEMUIOVector *qiov, int flags)
 {
     BlockDriver *drv = bs->drv;
-    BdrvTrackedRequest req;
     int ret;
 
     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
@@ -2904,14 +3064,15 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 
     /* Handle Copy on Read and associated serialisation */
     if (flags & BDRV_REQ_COPY_ON_READ) {
-        bs->copy_on_read_in_flight++;
-    }
-
-    if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+        /* If we touch the same cluster it counts as an overlap.  This
+         * guarantees that allocating writes will be serialized and not race
+         * with each other for the same cluster.  For example, in copy-on-read
+         * it ensures that the CoR read and write operations are atomic and
+         * guest writes cannot interleave between them. */
+        mark_request_serialising(req, bdrv_get_cluster_size(bs));
     }
 
-    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+    wait_serialising_requests(req);
 
     if (flags & BDRV_REQ_COPY_ON_READ) {
         int pnum;
@@ -2941,7 +3102,8 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         }
 
         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
-        max_nb_sectors = MAX(0, total_sectors - sector_num);
+        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
+                                  align >> BDRV_SECTOR_BITS);
         if (max_nb_sectors > 0) {
             ret = drv->bdrv_co_readv(bs, sector_num,
                                      MIN(nb_sectors, max_nb_sectors), qiov);
@@ -2959,12 +3121,6 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }
 
 out:
-    tracked_request_end(&req);
-
-    if (flags & BDRV_REQ_COPY_ON_READ) {
-        bs->copy_on_read_in_flight--;
-    }
-
     return ret;
 }
 
@@ -2976,6 +3132,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
     BdrvRequestFlags flags)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest req;
+
     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
     uint8_t *head_buf = NULL;
@@ -2997,8 +3155,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        /* TODO Switch to byte granularity */
-        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, false);
+        bdrv_io_limits_intercept(bs, bytes, false);
     }
 
     /* Align read if necessary by padding qiov */
@@ -3026,9 +3183,11 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
         bytes = ROUND_UP(bytes, align);
     }
 
-    ret = bdrv_aligned_preadv(bs, offset, bytes,
+    tracked_request_begin(&req, bs, offset, bytes, false);
+    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);
+    tracked_request_end(&req);
 
     if (use_local_qiov) {
         qemu_iovec_destroy(&local_qiov);
@@ -3147,10 +3306,11 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
  * Forwards an already correctly aligned write request to the BlockDriver.
  */
 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
-    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
+    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+    QEMUIOVector *qiov, int flags)
 {
     BlockDriver *drv = bs->drv;
-    BdrvTrackedRequest req;
+    bool waited;
     int ret;
 
     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
@@ -3159,17 +3319,32 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
+    waited = wait_serialising_requests(req);
+    assert(!waited || !req->serialising);
+    assert(req->overlap_offset <= offset);
+    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
 
-    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
+    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
+
+    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
+        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
+        qemu_iovec_is_zero(qiov)) {
+        flags |= BDRV_REQ_ZERO_WRITE;
+        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
+            flags |= BDRV_REQ_MAY_UNMAP;
+        }
+    }
 
     if (ret < 0) {
         /* Do nothing, write notifier decided to fail this request */
     } else if (flags & BDRV_REQ_ZERO_WRITE) {
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
     } else {
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
     }
+    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
 
     if (ret == 0 && !bs->enable_write_cache) {
         ret = bdrv_co_flush(bs);
@@ -3184,18 +3359,23 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
     }
 
-    tracked_request_end(&req);
-
     return ret;
 }
 
 /*
  * Handle a write request in coroutine context
  */
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags)
 {
+    BdrvTrackedRequest req;
+    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+    uint8_t *head_buf = NULL;
+    uint8_t *tail_buf = NULL;
+    QEMUIOVector local_qiov;
+    bool use_local_qiov = false;
     int ret;
 
     if (!bs->drv) {
@@ -3204,25 +3384,118 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     if (bs->read_only) {
         return -EACCES;
     }
-    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+    if (bdrv_check_byte_request(bs, offset, bytes)) {
         return -EIO;
     }
 
-    if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
-    }
-
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, nb_sectors, true);
+        bdrv_io_limits_intercept(bs, bytes, true);
+    }
+
+    /*
+     * Align write if necessary by performing a read-modify-write cycle.
+     * Pad qiov with the read parts and be sure to have a tracked request not
+     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
+     */
+    tracked_request_begin(&req, bs, offset, bytes, true);
+
+    if (offset & (align - 1)) {
+        QEMUIOVector head_qiov;
+        struct iovec head_iov;
+
+        mark_request_serialising(&req, align);
+        wait_serialising_requests(&req);
+
+        head_buf = qemu_blockalign(bs, align);
+        head_iov = (struct iovec) {
+            .iov_base   = head_buf,
+            .iov_len    = align,
+        };
+        qemu_iovec_init_external(&head_qiov, &head_iov, 1);
+
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
+                                  align, &head_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+
+        qemu_iovec_init(&local_qiov, qiov->niov + 2);
+        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
+        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+        use_local_qiov = true;
+
+        bytes += offset & (align - 1);
+        offset = offset & ~(align - 1);
+    }
+
+    if ((offset + bytes) & (align - 1)) {
+        QEMUIOVector tail_qiov;
+        struct iovec tail_iov;
+        size_t tail_bytes;
+        bool waited;
+
+        mark_request_serialising(&req, align);
+        waited = wait_serialising_requests(&req);
+        assert(!waited || !use_local_qiov);
+
+        tail_buf = qemu_blockalign(bs, align);
+        tail_iov = (struct iovec) {
+            .iov_base   = tail_buf,
+            .iov_len    = align,
+        };
+        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
+
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
+                                  align, &tail_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+
+        if (!use_local_qiov) {
+            qemu_iovec_init(&local_qiov, qiov->niov + 1);
+            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+            use_local_qiov = true;
+        }
+
+        tail_bytes = (offset + bytes) & (align - 1);
+        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
+
+        bytes = ROUND_UP(bytes, align);
     }
 
-    ret = bdrv_aligned_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
-                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
+                               use_local_qiov ? &local_qiov : qiov,
+                               flags);
+
+fail:
+    tracked_request_end(&req);
+
+    if (use_local_qiov) {
+        qemu_iovec_destroy(&local_qiov);
+    }
+    qemu_vfree(head_buf);
+    qemu_vfree(tail_buf);
 
     return ret;
 }
 
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
+        return -EINVAL;
+    }
+
+    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
+                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov)
 {
@@ -3258,8 +3531,9 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset)
         return -ENOTSUP;
     if (bs->read_only)
         return -EACCES;
-    if (bdrv_in_use(bs))
+    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) {
         return -EBUSY;
+    }
     ret = drv->bdrv_truncate(bs, offset);
     if (ret == 0) {
         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
@@ -3440,10 +3714,25 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                          void *opaque)
 {
     BlockDriver *drv;
+    int count = 0;
+    const char **formats = NULL;
 
     QLIST_FOREACH(drv, &bdrv_drivers, list) {
-        it(opaque, drv->format_name);
+        if (drv->format_name) {
+            bool found = false;
+            int i = count;
+            while (formats && i && !found) {
+                found = !strcmp(formats[--i], drv->format_name);
+            }
+
+            if (!found) {
+                formats = g_realloc(formats, (count + 1) * sizeof(char *));
+                formats[count++] = drv->format_name;
+                it(opaque, drv->format_name);
+            }
+        }
     }
+    g_free(formats);
 }
 
 /* This function is to find block backend bs */
@@ -3497,30 +3786,26 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
 {
     BlockDriverState *bs = NULL;
 
-    if ((!device && !node_name) || (device && node_name)) {
-        error_setg(errp, "Use either device or node-name but not both");
-        return NULL;
-    }
-
     if (device) {
         bs = bdrv_find(device);
 
-        if (!bs) {
-            error_set(errp, QERR_DEVICE_NOT_FOUND, device);
-            return NULL;
+        if (bs) {
+            return bs;
         }
-
-        return bs;
     }
 
-    bs = bdrv_find_node(node_name);
+    if (node_name) {
+        bs = bdrv_find_node(node_name);
 
-    if (!bs) {
-        error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
-        return NULL;
+        if (bs) {
+            return bs;
+        }
     }
 
-    return bs;
+    error_setg(errp, "Cannot find device=%s nor node_name=%s",
+                     device ? device : "",
+                     node_name ? node_name : "");
+    return NULL;
 }
 
 BlockDriverState *bdrv_next(BlockDriverState *bs)
@@ -3556,10 +3841,15 @@ int bdrv_flush_all(void)
     int result = 0;
 
     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        int ret = bdrv_flush(bs);
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        int ret;
+
+        aio_context_acquire(aio_context);
+        ret = bdrv_flush(bs);
         if (ret < 0 && !result) {
             result = ret;
         }
+        aio_context_release(aio_context);
     }
 
     return result;
@@ -3667,7 +3957,7 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
 
     if (!bs->drv->bdrv_co_get_block_status) {
         *pnum = nb_sectors;
-        ret = BDRV_BLOCK_DATA;
+        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
         if (bs->drv->protocol_name) {
             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
         }
@@ -3686,6 +3976,10 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                      *pnum, pnum);
     }
 
+    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
+        ret |= BDRV_BLOCK_ALLOCATED;
+    }
+
     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
         if (bdrv_unallocated_blocks_are_zero(bs)) {
             ret |= BDRV_BLOCK_ZERO;
@@ -3746,10 +4040,12 @@ int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
         /* Fast-path if already in coroutine context */
         bdrv_get_block_status_co_entry(&data);
     } else {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
         qemu_coroutine_enter(co, &data);
         while (!data.done) {
-            qemu_aio_wait();
+            aio_poll(aio_context, true);
         }
     }
     return data.ret;
@@ -3762,9 +4058,7 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
     if (ret < 0) {
         return ret;
     }
-    return
-        (ret & BDRV_BLOCK_DATA) ||
-        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
+    return (ret & BDRV_BLOCK_ALLOCATED);
 }
 
 /*
@@ -3949,7 +4243,7 @@ int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
 
 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
 {
-    while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
+    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
         bs = bs->file;
     }
 
@@ -4344,7 +4638,7 @@ static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
     acb->is_write = is_write;
     acb->qiov = qiov;
     acb->bounce = qemu_blockalign(bs, qiov->size);
-    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
 
     if (is_write) {
         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
@@ -4383,13 +4677,14 @@ typedef struct BlockDriverAIOCBCoroutine {
 
 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
 {
+    AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
     BlockDriverAIOCBCoroutine *acb =
         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
     bool done = false;
 
     acb->done = &done;
     while (!done) {
-        qemu_aio_wait();
+        aio_poll(aio_context, true);
     }
 }
 
@@ -4426,7 +4721,7 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque)
             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
     }
 
-    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
     qemu_bh_schedule(acb->bh);
 }
 
@@ -4462,7 +4757,7 @@ static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
     BlockDriverState *bs = acb->common.bs;
 
     acb->req.error = bdrv_co_flush(bs);
-    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
     qemu_bh_schedule(acb->bh);
 }
 
@@ -4489,7 +4784,7 @@ static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
     BlockDriverState *bs = acb->common.bs;
 
     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
-    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
     qemu_bh_schedule(acb->bh);
 }
 
@@ -4668,19 +4963,47 @@ flush_parent:
     return bdrv_co_flush(bs->file);
 }
 
-void bdrv_invalidate_cache(BlockDriverState *bs)
+void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
 {
-    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
-        bs->drv->bdrv_invalidate_cache(bs);
+    Error *local_err = NULL;
+    int ret;
+
+    if (!bs->drv)  {
+        return;
+    }
+
+    if (bs->drv->bdrv_invalidate_cache) {
+        bs->drv->bdrv_invalidate_cache(bs, &local_err);
+    } else if (bs->file) {
+        bdrv_invalidate_cache(bs->file, &local_err);
+    }
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    ret = refresh_total_sectors(bs, bs->total_sectors);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Could not refresh total sector count");
+        return;
     }
 }
 
-void bdrv_invalidate_cache_all(void)
+void bdrv_invalidate_cache_all(Error **errp)
 {
     BlockDriverState *bs;
+    Error *local_err = NULL;
 
     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
-        bdrv_invalidate_cache(bs);
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(aio_context);
+        bdrv_invalidate_cache(bs, &local_err);
+        aio_context_release(aio_context);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
     }
 }
 
@@ -4689,7 +5012,11 @@ void bdrv_clear_incoming_migration_all(void)
     BlockDriverState *bs;
 
     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
+        aio_context_acquire(aio_context);
         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
+        aio_context_release(aio_context);
     }
 }
 
@@ -4705,19 +5032,27 @@ int bdrv_flush(BlockDriverState *bs)
         /* Fast-path if already in coroutine context */
         bdrv_flush_co_entry(&rwco);
     } else {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
         co = qemu_coroutine_create(bdrv_flush_co_entry);
         qemu_coroutine_enter(co, &rwco);
         while (rwco.ret == NOT_DONE) {
-            qemu_aio_wait();
+            aio_poll(aio_context, true);
         }
     }
 
     return rwco.ret;
 }
 
+typedef struct DiscardCo {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    int nb_sectors;
+    int ret;
+} DiscardCo;
 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
 {
-    RwCo *rwco = opaque;
+    DiscardCo *rwco = opaque;
 
     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
 }
@@ -4801,7 +5136,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
 {
     Coroutine *co;
-    RwCo rwco = {
+    DiscardCo rwco = {
         .bs = bs,
         .sector_num = sector_num,
         .nb_sectors = nb_sectors,
@@ -4812,10 +5147,12 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
         /* Fast-path if already in coroutine context */
         bdrv_discard_co_entry(&rwco);
     } else {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
         co = qemu_coroutine_create(bdrv_discard_co_entry);
         qemu_coroutine_enter(co, &rwco);
         while (rwco.ret == NOT_DONE) {
-            qemu_aio_wait();
+            aio_poll(aio_context, true);
         }
     }
 
@@ -4936,7 +5273,8 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
     return true;
 }
 
-BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
+                                          Error **errp)
 {
     int64_t bitmap_size;
     BdrvDirtyBitmap *bitmap;
@@ -4945,7 +5283,13 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
 
     granularity >>= BDRV_SECTOR_BITS;
     assert(granularity);
-    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
+    bitmap_size = bdrv_getlength(bs);
+    if (bitmap_size < 0) {
+        error_setg_errno(errp, -bitmap_size, "could not get length of device");
+        errno = -bitmap_size;
+        return NULL;
+    }
+    bitmap_size >>= BDRV_SECTOR_BITS;
     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
@@ -5039,15 +5383,74 @@ void bdrv_unref(BlockDriverState *bs)
     }
 }
 
-void bdrv_set_in_use(BlockDriverState *bs, int in_use)
+struct BdrvOpBlocker {
+    Error *reason;
+    QLIST_ENTRY(BdrvOpBlocker) list;
+};
+
+bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
+{
+    BdrvOpBlocker *blocker;
+    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
+    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
+        blocker = QLIST_FIRST(&bs->op_blockers[op]);
+        if (errp) {
+            error_setg(errp, "Device '%s' is busy: %s",
+                       bs->device_name, error_get_pretty(blocker->reason));
+        }
+        return true;
+    }
+    return false;
+}
+
+void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
+{
+    BdrvOpBlocker *blocker;
+    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
+
+    blocker = g_malloc0(sizeof(BdrvOpBlocker));
+    blocker->reason = reason;
+    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
+}
+
+void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
+{
+    BdrvOpBlocker *blocker, *next;
+    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
+    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
+        if (blocker->reason == reason) {
+            QLIST_REMOVE(blocker, list);
+            g_free(blocker);
+        }
+    }
+}
+
+void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
+{
+    int i;
+    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
+        bdrv_op_block(bs, i, reason);
+    }
+}
+
+void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
 {
-    assert(bs->in_use != in_use);
-    bs->in_use = in_use;
+    int i;
+    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
+        bdrv_op_unblock(bs, i, reason);
+    }
 }
 
-int bdrv_in_use(BlockDriverState *bs)
+bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
 {
-    return bs->in_use;
+    int i;
+
+    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
+        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
+            return false;
+        }
+    }
+    return true;
 }
 
 void bdrv_iostatus_enable(BlockDriverState *bs)
@@ -5205,9 +5608,8 @@ void bdrv_img_create(const char *filename, const char *fmt,
             back_flags =
                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
 
-            bs = bdrv_new("");
-
-            ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
+            bs = NULL;
+            ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
                             backing_drv, &local_err);
             if (ret < 0) {
                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
@@ -5215,7 +5617,6 @@ void bdrv_img_create(const char *filename, const char *fmt,
                                  error_get_pretty(local_err));
                 error_free(local_err);
                 local_err = NULL;
-                bdrv_unref(bs);
                 goto out;
             }
             bdrv_get_geometry(bs, &size);
@@ -5255,15 +5656,73 @@ out:
     free_option_parameters(create_options);
     free_option_parameters(param);
 
-    if (error_is_set(&local_err)) {
+    if (local_err) {
         error_propagate(errp, local_err);
     }
 }
 
 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
 {
-    /* Currently BlockDriverState always uses the main loop AioContext */
-    return qemu_get_aio_context();
+    return bs->aio_context;
+}
+
+void bdrv_detach_aio_context(BlockDriverState *bs)
+{
+    if (!bs->drv) {
+        return;
+    }
+
+    if (bs->io_limits_enabled) {
+        throttle_detach_aio_context(&bs->throttle_state);
+    }
+    if (bs->drv->bdrv_detach_aio_context) {
+        bs->drv->bdrv_detach_aio_context(bs);
+    }
+    if (bs->file) {
+        bdrv_detach_aio_context(bs->file);
+    }
+    if (bs->backing_hd) {
+        bdrv_detach_aio_context(bs->backing_hd);
+    }
+
+    bs->aio_context = NULL;
+}
+
+void bdrv_attach_aio_context(BlockDriverState *bs,
+                             AioContext *new_context)
+{
+    if (!bs->drv) {
+        return;
+    }
+
+    bs->aio_context = new_context;
+
+    if (bs->backing_hd) {
+        bdrv_attach_aio_context(bs->backing_hd, new_context);
+    }
+    if (bs->file) {
+        bdrv_attach_aio_context(bs->file, new_context);
+    }
+    if (bs->drv->bdrv_attach_aio_context) {
+        bs->drv->bdrv_attach_aio_context(bs, new_context);
+    }
+    if (bs->io_limits_enabled) {
+        throttle_attach_aio_context(&bs->throttle_state, new_context);
+    }
+}
+
+void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
+{
+    bdrv_drain_all(); /* ensure there are no in-flight requests */
+
+    bdrv_detach_aio_context(bs);
+
+    /* This function executes in the old AioContext so acquire the new one in
+     * case it runs in a different thread.
+     */
+    aio_context_acquire(new_context);
+    bdrv_attach_aio_context(bs, new_context);
+    aio_context_release(new_context);
 }
 
 void bdrv_add_before_write_notifier(BlockDriverState *bs,
@@ -5280,43 +5739,37 @@ int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
     return bs->drv->bdrv_amend_options(bs, options);
 }
 
-/* Used to recurse on single child block filters.
- * Single child block filter will store their child in bs->file.
+/* This function will be called by the bdrv_recurse_is_first_non_filter method
+ * of block filter and by bdrv_is_first_non_filter.
+ * It is used to test if the given bs is the candidate or recurse more in the
+ * node graph.
  */
-bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
+bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                       BlockDriverState *candidate)
 {
-    if (!bs->drv) {
-        return false;
-    }
-
-    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
-        if (bs == candidate) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
+    /* return false if basic checks fails */
+    if (!bs || !bs->drv) {
         return false;
     }
 
-    if (!bs->file) {
-        return false;
+    /* the code reached a non block filter driver -> check if the bs is
+     * the same as the candidate. It's the recursion termination condition.
+     */
+    if (!bs->drv->is_filter) {
+        return bs == candidate;
     }
+    /* Down this path the driver is a block filter driver */
 
-    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
-}
-
-bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
-                                      BlockDriverState *candidate)
-{
-    if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
+    /* If the block filter recursion method is defined use it to recurse down
+     * the node graph.
+     */
+    if (bs->drv->bdrv_recurse_is_first_non_filter) {
         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
     }
 
-    return bdrv_generic_is_first_non_filter(bs, candidate);
+    /* the driver is a block filter but don't allow to recurse -> return false
+     */
+    return false;
 }
 
 /* This function checks if the candidate is the first non filter bs down it's
@@ -5331,11 +5784,8 @@ bool bdrv_is_first_non_filter(BlockDriverState *candidate)
     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         bool perm;
 
-        if (!bs->file) {
-            continue;
-        }
-
-        perm = bdrv_recurse_is_first_non_filter(bs->file, candidate);
+        /* try to recurse in this top level bs */
+        perm = bdrv_recurse_is_first_non_filter(bs, candidate);
 
         /* candidate is the first non filter */
         if (perm) {
This page took 0.100427 seconds and 4 git commands to generate.