/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "sysemu/qtest.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <dirent.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif
/**
 * A BdrvDirtyBitmap can be in three possible states:
 * (1) successor is NULL and disabled is false: full r/w mode
 * (2) successor is NULL and disabled is true: read only mode ("disabled")
 * (3) successor is set: frozen mode.
 *     A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set,
 *     or enabled. A frozen bitmap can only abdicate() or reclaim().
 */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
    char *name;                 /* Optional non-empty unique ID */
    int64_t size;               /* Size of the bitmap (Number of sectors) */
    bool disabled;              /* Bitmap is read-only */
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};
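
/*
 * Quick reference (editor's sketch, derived from the comment above): how the
 * fields encode the three states.
 *
 *     successor == NULL && !disabled  ->  (1) full r/w mode
 *     successor == NULL &&  disabled  ->  (2) read only mode
 *     successor != NULL               ->  (3) frozen; only abdicate()/reclaim()
 */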

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors);
static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    int clock_type = QEMU_CLOCK_REALTIME;

    if (qtest_enabled()) {
        /* For testing block IO throttling only */
        clock_type = QEMU_CLOCK_VIRTUAL;
    }
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  clock_type,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
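
/*
 * Usage sketch (editor's example, not a call sequence taken from this file):
 * per the comment above, throttling is enabled first and the limits applied
 * afterwards. The ThrottleConfig values below are hypothetical.
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;   // hypothetical limit
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */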

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* must this I/O wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
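
/*
 * Caller pattern (sketch): the read/write paths call this before submitting
 * a request, roughly:
 *
 *     if (bs->io_limits_enabled) {
 *         bdrv_io_limits_intercept(bs, bytes, is_write);
 *     }
 */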

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
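
/*
 * Example (sketch): I/O buffers are typically allocated with this alignment,
 * which is essentially what qemu_blockalign() boils down to:
 *
 *     void *buf = qemu_memalign(bdrv_opt_mem_align(bs), size);
 */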

/* check if the path starts with "<protocol>:" */
int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
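
/*
 * Worked example (editor's sketch): with base_path "/img/base.qcow2" and
 * filename "backing.qcow2", dest becomes "/img/backing.qcow2"; an absolute
 * filename such as "/tmp/backing.qcow2" is copied through unchanged.
 */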

void bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                  const char *backing,
                                                  char *dest, size_t sz,
                                                  Error **errp)
{
    if (backing[0] == '\0' || path_has_protocol(backing) ||
        path_is_absolute(backing))
    {
        pstrcpy(dest, sz, backing);
    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
        error_setg(errp, "Cannot use relative backing file names for '%s'",
                   backed);
    } else {
        path_combine(dest, sz, backed, backing);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
                                    Error **errp)
{
    char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;

    bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
                                                 dest, sz, errp);
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true, errp);
    if (drv == NULL) {
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
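
/*
 * Usage sketch (editor's example; the file name and size are hypothetical).
 * This mirrors what bdrv_append_temp_snapshot() does further down:
 *
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024, &error_abort);
 *     ret = bdrv_create(drv, "/tmp/test.img", opts, &local_err);
 *     qemu_opts_del(opts);
 */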

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * Try to get @bs's logical and physical block size.
 * On success, store them in @bsz and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_blocksizes) {
        return drv->bdrv_probe_blocksizes(bs, bsz);
    }

    return -ENOTSUP;
}

/**
 * Try to get @bs's geometry (cyls, heads, sectors).
 * On success, store them in @geo and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_geometry) {
        return drv->bdrv_probe_geometry(bs, geo);
    }

    return -ENOTSUP;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
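
/*
 * Usage sketch: callers pass a PATH_MAX-sized buffer, as
 * bdrv_append_temp_snapshot() does below:
 *
 *     char *tmp = g_malloc0(PATH_MAX + 1);
 *     int ret = get_tmp_filename(tmp, PATH_MAX + 1);
 *     if (ret < 0) {
 *         ... handle the negative errno ...
 *     }
 */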

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix,
                                Error **errp)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }

    error_setg(errp, "Unknown protocol '%s'", protocol);
    return NULL;
}
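
/*
 * Example (sketch): "nbd://localhost:10809/export" resolves to the driver
 * whose protocol_name is "nbd"; "c:\disk.img" is caught by the Windows drive
 * check in path_has_protocol() and falls back to the 'file' driver.
 */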

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf         contains the image's first @buf_size bytes.
 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 *              but can be smaller if the image file is smaller)
 * @filename    is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOTSUP;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
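
/*
 * Summary of the mapping implemented above (derived from the code, for
 * reference only):
 *
 *     mode           NOCACHE  CACHE_WB  NO_FLUSH
 *     off/none          x        x
 *     directsync        x
 *     writeback                  x
 *     unsafe                     x         x
 *     writethrough   (no flags; the guest-visible default)
 */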

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}
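
/*
 * Worked example (editor's sketch): a top layer opened with BDRV_O_RDWR |
 * BDRV_O_SNAPSHOT hands its bs->file BDRV_O_RDWR | BDRV_O_PROTOCOL |
 * BDRV_O_CACHE_WB | BDRV_O_UNMAP, while its backing file loses both
 * BDRV_O_RDWR and BDRV_O_SNAPSHOT and is therefore opened read-only.
 */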

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    if (bs->encrypted) {
        error_report("Encrypted images are deprecated");
        error_printf("Support for them will be removed in a future release.\n"
                     "You can use 'qemu-img convert' to convert your image"
                     " to an unencrypted one.\n");
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
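
/*
 * Example (sketch): a pseudo-protocol filename such as
 *
 *     json:{"driver": "qcow2",
 *           "file": {"driver": "file", "filename": "disk.qcow2"}}
 *
 * is parsed and flattened into the option keys "driver", "file.driver" and
 * "file.filename".
 */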

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename, errp);
                if (!drv) {
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "node is used as backing hd of '%s'",
                   bdrv_get_device_or_node_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
                                       &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            QDECREF(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }

    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
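
/*
 * Example (sketch): with bdref_key "file" and options containing
 * {"file.driver": "file", "file.filename": "a.img"}, the extracted
 * image_options become {"driver": "file", "filename": "a.img"} and are
 * passed on to bdrv_open().
 */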

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which case
 * a new BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_setg(errp, "Node '%s' is read only",
                   bdrv_get_device_or_node_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_setg(errp, "Block format '%s' used by node '%s' "
                   "does not support reopening files", drv->format_name,
                   bdrv_get_device_or_node_name(reopen_state->bs));
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_pause(bs->job);
        }
        aio_context_release(aio_context);
    }

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->job) {
            block_job_resume(bs->job);
        }
        aio_context_release(aio_context);
    }
}

/* make a BlockDriverState anonymous by removing from bdrv_state and
 * graph_bdrv_state list.
 * Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}

static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
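
/*
 * Worked example (editor's sketch): with a 64 KiB cluster size (128 sectors),
 * a request covering sectors [100, 200) is rounded out to [0, 256): 100 is
 * aligned down to 0, and the 200-sector span from there is aligned up to 256.
 */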

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2558 * -EINVAL - backing format specified, but no file
2559 -ENOSPC - can't update the backing file because no space is left in the image file
2561 * -ENOTSUP - format driver doesn't support changing the backing file
2563 int bdrv_change_backing_file(BlockDriverState *bs,
2564 const char *backing_file, const char *backing_fmt)
2566 BlockDriver *drv = bs->drv;
2569 /* Backing file format doesn't make sense without a backing file */
2570 if (backing_fmt && !backing_file) {
2574 if (drv->bdrv_change_backing_file != NULL) {
2575 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2581 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2582 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2588 * Finds the image layer in the chain that has 'bs' as its backing file.
2590 * active is the current topmost image.
2592 * Returns NULL if bs is not found in active's image chain,
2593 * or if active == bs.
2595 * Returns the bottommost base image if bs == NULL.
2597 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2598 BlockDriverState *bs)
2600 while (active && bs != active->backing_hd) {
2601 active = active->backing_hd;
2607 /* Given a BDS, searches for the base layer. */
2608 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2610 return bdrv_find_overlay(bs, NULL);
2613 typedef struct BlkIntermediateStates {
2614 BlockDriverState *bs;
2615 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2616 } BlkIntermediateStates;
2620 * Drops images above 'base' up to and including 'top', and sets the image
2621 * above 'top' to have base as its backing file.
2623 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2624 * information in 'bs' can be properly updated.
2626 * E.g., this will convert the following chain:
2627 * bottom <- base <- intermediate <- top <- active
2631 * bottom <- base <- active
2633 * It is allowed for bottom==base, in which case it converts:
2635 * base <- intermediate <- top <- active
2641 * If backing_file_str is non-NULL, it will be used when modifying top's
2642 * overlay image metadata.
2645 * if active == top, that is considered an error
2648 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2649 BlockDriverState *base, const char *backing_file_str)
2651 BlockDriverState *intermediate;
2652 BlockDriverState *base_bs = NULL;
2653 BlockDriverState *new_top_bs = NULL;
2654 BlkIntermediateStates *intermediate_state, *next;
2657 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2658 QSIMPLEQ_INIT(&states_to_delete);
2660 if (!top->drv || !base->drv) {
2664 new_top_bs = bdrv_find_overlay(active, top);
2666 if (new_top_bs == NULL) {
2667 /* we could not find the image above 'top'; this is an error */
2671 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2672 * to do, no intermediate images */
2673 if (new_top_bs->backing_hd == base) {
2680 /* now we will go down through the list, and add each BDS we find
2681 * into our deletion queue, until we hit the 'base'
2683 while (intermediate) {
2684 intermediate_state = g_new0(BlkIntermediateStates, 1);
2685 intermediate_state->bs = intermediate;
2686 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2688 if (intermediate->backing_hd == base) {
2689 base_bs = intermediate->backing_hd;
2692 intermediate = intermediate->backing_hd;
2694 if (base_bs == NULL) {
2695 /* something went wrong; we did not end at the base. Safely
2696 * unravel everything, and exit with an error */
2700 /* success - we can delete the intermediate states, and link top->base */
2701 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2702 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2703 base_bs->drv ? base_bs->drv->format_name : "");
2707 bdrv_set_backing_hd(new_top_bs, base_bs);
2709 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2710 /* so that bdrv_close() does not recursively close the chain */
2711 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2712 bdrv_unref(intermediate_state->bs);
2717 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2718 g_free(intermediate_state);
2724 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2727 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
2731 if (!bdrv_is_inserted(bs)) {
2742 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2745 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2749 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2750 nb_sectors * BDRV_SECTOR_SIZE);
2753 typedef struct RwCo {
2754 BlockDriverState *bs;
2759 BdrvRequestFlags flags;
2762 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2764 RwCo *rwco = opaque;
2766 if (!rwco->is_write) {
2767 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2768 rwco->qiov->size, rwco->qiov,
2771 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2772 rwco->qiov->size, rwco->qiov,
2778 * Process a vectored synchronous request using coroutines
2780 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2781 QEMUIOVector *qiov, bool is_write,
2782 BdrvRequestFlags flags)
2789 .is_write = is_write,
2795 * In a synchronous call context the vcpu is blocked, so this throttling
2796 * timer will not fire; the I/O throttling function therefore has to be
2797 * disabled here if it has been enabled.
2799 if (bs->io_limits_enabled) {
2800 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2801 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2802 bdrv_io_limits_disable(bs);
2805 if (qemu_in_coroutine()) {
2806 /* Fast-path if already in coroutine context */
2807 bdrv_rw_co_entry(&rwco);
2809 AioContext *aio_context = bdrv_get_aio_context(bs);
2811 co = qemu_coroutine_create(bdrv_rw_co_entry);
2812 qemu_coroutine_enter(co, &rwco);
2813 while (rwco.ret == NOT_DONE) {
2814 aio_poll(aio_context, true);
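/* The shape above -- call bdrv_rw_co_entry() directly when already in
 * coroutine context, otherwise spawn a coroutine and aio_poll() until
 * rwco.ret changes from NOT_DONE -- is the standard pattern the block layer
 * uses to expose coroutine-only code paths to synchronous callers. */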
2821 * Process a synchronous request using coroutines
2823 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2824 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2827 struct iovec iov = {
2828 .iov_base = (void *)buf,
2829 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2832 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2836 qemu_iovec_init_external(&qiov, &iov, 1);
2837 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2838 &qiov, is_write, flags);
2841 /* Return < 0 on error. See bdrv_write() for the return codes */
2842 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2843 uint8_t *buf, int nb_sectors)
2845 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2848 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2849 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2850 uint8_t *buf, int nb_sectors)
2855 enabled = bs->io_limits_enabled;
2856 bs->io_limits_enabled = false;
2857 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2858 bs->io_limits_enabled = enabled;
2862 /* Return < 0 if error. Important errors are:
2863 -EIO generic I/O error (may happen for all errors)
2864 -ENOMEDIUM No media inserted.
2865 -EINVAL Invalid sector number or nb_sectors
2866 -EACCES Trying to write a read-only device
2868 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2869 const uint8_t *buf, int nb_sectors)
2871 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2874 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2875 int nb_sectors, BdrvRequestFlags flags)
2877 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2878 BDRV_REQ_ZERO_WRITE | flags);
2882 * Completely zero out a block device with the help of bdrv_write_zeroes.
2883 * The operation is sped up by checking the block status and only writing
2884 * zeroes to regions that do not already read back as zeroes. Optional
2885 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2887 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2889 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2891 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2894 target_sectors = bdrv_nb_sectors(bs);
2895 if (target_sectors < 0) {
2896 return target_sectors;
2900 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
2901 if (nb_sectors <= 0) {
2904 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2906 error_report("error getting block status at sector %" PRId64 ": %s",
2907 sector_num, strerror(-ret));
2910 if (ret & BDRV_BLOCK_ZERO) {
2914 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2916 error_report("error writing zeroes at sector %" PRId64 ": %s",
2917 sector_num, strerror(-ret));
2924 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2927 struct iovec iov = {
2928 .iov_base = (void *)buf,
2937 qemu_iovec_init_external(&qiov, &iov, 1);
2938 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2946 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2950 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2958 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2959 const void *buf, int bytes)
2962 struct iovec iov = {
2963 .iov_base = (void *) buf,
2971 qemu_iovec_init_external(&qiov, &iov, 1);
2972 return bdrv_pwritev(bs, offset, &qiov);
2976 * Writes to the file and ensures that no writes are reordered across this
2977 * request (acts as a barrier)
2979 * Returns 0 on success, -errno in error cases.
2981 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2982 const void *buf, int count)
2986 ret = bdrv_pwrite(bs, offset, buf, count);
2991 /* No flush needed for cache modes that already do it */
2992 if (bs->enable_write_cache) {
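/* Usage sketch (illustrative; 'header_offset' and 'val' are hypothetical):
 * format drivers use bdrv_pwrite_sync() when on-disk metadata must be
 * stable before data that depends on it is written, e.g.:
 *
 *     uint64_t new_value = cpu_to_be64(val);
 *     ret = bdrv_pwrite_sync(bs->file, header_offset,
 *                            &new_value, sizeof(new_value));
 *     if (ret < 0) {
 *         return ret;
 *     }
 */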
2999 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
3000 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3002 /* Perform I/O through a temporary buffer so that users who scribble over
3003 * their read buffer while the operation is in progress do not end up
3004 * modifying the image file. This is critical for zero-copy guest I/O
3005 * where anything might happen inside guest memory.
3007 void *bounce_buffer;
3009 BlockDriver *drv = bs->drv;
3011 QEMUIOVector bounce_qiov;
3012 int64_t cluster_sector_num;
3013 int cluster_nb_sectors;
3017 /* Cover entire cluster so no additional backing file I/O is required when
3018 * allocating the cluster in the image file.
3020 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3021 &cluster_sector_num, &cluster_nb_sectors);
3023 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3024 cluster_sector_num, cluster_nb_sectors);
3026 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
3027 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
3028 if (bounce_buffer == NULL) {
3033 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3035 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3041 if (drv->bdrv_co_write_zeroes &&
3042 buffer_is_zero(bounce_buffer, iov.iov_len)) {
3043 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3044 cluster_nb_sectors, 0);
3046 /* This does not change the data on the disk, it is not necessary
3047 * to flush even in cache=writethrough mode.
3049 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3054 /* It might be okay to ignore write errors for guest requests. If this
3055 * is a deliberate copy-on-read then we don't want to ignore the error.
3056 * Simply report it in all cases.
3061 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3062 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3063 nb_sectors * BDRV_SECTOR_SIZE);
3066 qemu_vfree(bounce_buffer);
3071 * Forwards an already correctly aligned request to the BlockDriver. This
3072 * handles copy on read and zeroing after EOF; any other features must be
3073 * implemented by the caller.
3075 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3076 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3077 int64_t align, QEMUIOVector *qiov, int flags)
3079 BlockDriver *drv = bs->drv;
3082 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3083 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3085 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3086 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3087 assert(!qiov || bytes == qiov->size);
3089 /* Handle Copy on Read and associated serialisation */
3090 if (flags & BDRV_REQ_COPY_ON_READ) {
3091 /* If we touch the same cluster it counts as an overlap. This
3092 * guarantees that allocating writes will be serialized and not race
3093 * with each other for the same cluster. For example, in copy-on-read
3094 * it ensures that the CoR read and write operations are atomic and
3095 * guest writes cannot interleave between them. */
3096 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3099 wait_serialising_requests(req);
3101 if (flags & BDRV_REQ_COPY_ON_READ) {
3104 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3109 if (!ret || pnum != nb_sectors) {
3110 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3115 /* Forward the request to the BlockDriver */
3116 if (!bs->zero_beyond_eof) {
3117 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3119 /* Read zeroes after EOF */
3120 int64_t total_sectors, max_nb_sectors;
3122 total_sectors = bdrv_nb_sectors(bs);
3123 if (total_sectors < 0) {
3124 ret = total_sectors;
3128 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3129 align >> BDRV_SECTOR_BITS);
3130 if (nb_sectors < max_nb_sectors) {
3131 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3132 } else if (max_nb_sectors > 0) {
3133 QEMUIOVector local_qiov;
3135 qemu_iovec_init(&local_qiov, qiov->niov);
3136 qemu_iovec_concat(&local_qiov, qiov, 0,
3137 max_nb_sectors * BDRV_SECTOR_SIZE);
3139 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
3142 qemu_iovec_destroy(&local_qiov);
3147 /* Reading beyond end of file is supposed to produce zeroes */
3148 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3149 uint64_t offset = MAX(0, total_sectors - sector_num);
3150 uint64_t bytes = (sector_num + nb_sectors - offset) * BDRV_SECTOR_SIZE;
3152 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3160 static inline uint64_t bdrv_get_align(BlockDriverState *bs)
3162 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3163 return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3166 static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
3167 int64_t offset, size_t bytes)
3169 int64_t align = bdrv_get_align(bs);
3170 return !(offset & (align - 1) || (bytes & (align - 1)));
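/* Since align is a power of two, (x & (align - 1)) equals x % align. E.g.
 * with align = 512: offset = 4096, bytes = 1024 is aligned, while
 * offset = 4100 is not, because 4100 & 511 == 4. */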
3174 * Handle a read request in coroutine context
3176 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3177 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3178 BdrvRequestFlags flags)
3180 BlockDriver *drv = bs->drv;
3181 BdrvTrackedRequest req;
3183 uint64_t align = bdrv_get_align(bs);
3184 uint8_t *head_buf = NULL;
3185 uint8_t *tail_buf = NULL;
3186 QEMUIOVector local_qiov;
3187 bool use_local_qiov = false;
3194 ret = bdrv_check_byte_request(bs, offset, bytes);
3199 if (bs->copy_on_read) {
3200 flags |= BDRV_REQ_COPY_ON_READ;
3203 /* throttling disk I/O */
3204 if (bs->io_limits_enabled) {
3205 bdrv_io_limits_intercept(bs, bytes, false);
3208 /* Align read if necessary by padding qiov */
3209 if (offset & (align - 1)) {
3210 head_buf = qemu_blockalign(bs, align);
3211 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3212 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3213 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3214 use_local_qiov = true;
3216 bytes += offset & (align - 1);
3217 offset = offset & ~(align - 1);
3220 if ((offset + bytes) & (align - 1)) {
3221 if (!use_local_qiov) {
3222 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3223 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3224 use_local_qiov = true;
3226 tail_buf = qemu_blockalign(bs, align);
3227 qemu_iovec_add(&local_qiov, tail_buf,
3228 align - ((offset + bytes) & (align - 1)));
3230 bytes = ROUND_UP(bytes, align);
3233 tracked_request_begin(&req, bs, offset, bytes, false);
3234 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3235 use_local_qiov ? &local_qiov : qiov,
3237 tracked_request_end(&req);
3239 if (use_local_qiov) {
3240 qemu_iovec_destroy(&local_qiov);
3241 qemu_vfree(head_buf);
3242 qemu_vfree(tail_buf);
3248 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3249 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3250 BdrvRequestFlags flags)
3252 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3256 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3257 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3260 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3261 int nb_sectors, QEMUIOVector *qiov)
3263 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3265 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3268 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3269 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3271 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3273 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3274 BDRV_REQ_COPY_ON_READ);
3277 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
3279 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3280 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3282 BlockDriver *drv = bs->drv;
3284 struct iovec iov = {0};
3287 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
3288 BDRV_REQUEST_MAX_SECTORS);
3290 while (nb_sectors > 0 && !ret) {
3291 int num = nb_sectors;
3293 /* Align request. Block drivers can expect the "bulk" of the request to be aligned. */
3296 if (bs->bl.write_zeroes_alignment
3297 && num > bs->bl.write_zeroes_alignment) {
3298 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3299 /* Make a small request up to the first aligned sector. */
3300 num = bs->bl.write_zeroes_alignment;
3301 num -= sector_num % bs->bl.write_zeroes_alignment;
3302 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3303 /* Shorten the request to the last aligned sector. num cannot
3304 * underflow because num > bs->bl.write_zeroes_alignment.
3306 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3310 /* limit request size */
3311 if (num > max_write_zeroes) {
3312 num = max_write_zeroes;
3316 /* First try the efficient write zeroes operation */
3317 if (drv->bdrv_co_write_zeroes) {
3318 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3321 if (ret == -ENOTSUP) {
3322 /* Fall back to bounce buffer if write zeroes is unsupported */
3323 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
3324 MAX_WRITE_ZEROES_BOUNCE_BUFFER);
3325 num = MIN(num, max_xfer_len);
3326 iov.iov_len = num * BDRV_SECTOR_SIZE;
3327 if (iov.iov_base == NULL) {
3328 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3329 if (iov.iov_base == NULL) {
3333 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3335 qemu_iovec_init_external(&qiov, &iov, 1);
3337 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3339 /* Keep the bounce buffer around if it is big enough for all
3340 * future requests.
3342 if (num < max_xfer_len) {
3343 qemu_vfree(iov.iov_base);
3344 iov.iov_base = NULL;
3353 qemu_vfree(iov.iov_base);
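/* Worked example (illustrative): with bs->bl.write_zeroes_alignment = 8, a
 * request for sectors [5, 25) is split into a 3-sector head [5, 8), an
 * aligned bulk [8, 24) and a 1-sector tail [24, 25), so the driver sees the
 * bulk of the request fully aligned. */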
3358 * Forwards an already correctly aligned write request to the BlockDriver.
3360 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3361 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3362 QEMUIOVector *qiov, int flags)
3364 BlockDriver *drv = bs->drv;
3368 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3369 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3371 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3372 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3373 assert(!qiov || bytes == qiov->size);
3375 waited = wait_serialising_requests(req);
3376 assert(!waited || !req->serialising);
3377 assert(req->overlap_offset <= offset);
3378 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3380 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3382 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3383 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3384 qemu_iovec_is_zero(qiov)) {
3385 flags |= BDRV_REQ_ZERO_WRITE;
3386 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3387 flags |= BDRV_REQ_MAY_UNMAP;
3392 /* Do nothing; the write notifier decided to fail this request */
3393 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3394 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3395 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3397 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3398 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3400 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3402 if (ret == 0 && !bs->enable_write_cache) {
3403 ret = bdrv_co_flush(bs);
3406 bdrv_set_dirty(bs, sector_num, nb_sectors);
3408 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3411 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3418 * Handle a write request in coroutine context
3420 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3421 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3422 BdrvRequestFlags flags)
3424 BdrvTrackedRequest req;
3425 uint64_t align = bdrv_get_align(bs);
3426 uint8_t *head_buf = NULL;
3427 uint8_t *tail_buf = NULL;
3428 QEMUIOVector local_qiov;
3429 bool use_local_qiov = false;
3435 if (bs->read_only) {
3439 ret = bdrv_check_byte_request(bs, offset, bytes);
3444 /* throttling disk I/O */
3445 if (bs->io_limits_enabled) {
3446 bdrv_io_limits_intercept(bs, bytes, true);
3450 * Align write if necessary by performing a read-modify-write cycle.
3451 * Pad qiov with the read parts and be sure to have a tracked request not
3452 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3454 tracked_request_begin(&req, bs, offset, bytes, true);
3456 if (offset & (align - 1)) {
3457 QEMUIOVector head_qiov;
3458 struct iovec head_iov;
3460 mark_request_serialising(&req, align);
3461 wait_serialising_requests(&req);
3463 head_buf = qemu_blockalign(bs, align);
3464 head_iov = (struct iovec) {
3465 .iov_base = head_buf,
3468 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3470 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3471 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3472 align, &head_qiov, 0);
3476 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3478 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3479 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3480 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3481 use_local_qiov = true;
3483 bytes += offset & (align - 1);
3484 offset = offset & ~(align - 1);
3487 if ((offset + bytes) & (align - 1)) {
3488 QEMUIOVector tail_qiov;
3489 struct iovec tail_iov;
3493 mark_request_serialising(&req, align);
3494 waited = wait_serialising_requests(&req);
3495 assert(!waited || !use_local_qiov);
3497 tail_buf = qemu_blockalign(bs, align);
3498 tail_iov = (struct iovec) {
3499 .iov_base = tail_buf,
3502 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3504 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3505 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3506 align, &tail_qiov, 0);
3510 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3512 if (!use_local_qiov) {
3513 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3514 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3515 use_local_qiov = true;
3518 tail_bytes = (offset + bytes) & (align - 1);
3519 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3521 bytes = ROUND_UP(bytes, align);
3524 if (use_local_qiov) {
3525 /* Local buffer may have non-zero data. */
3526 flags &= ~BDRV_REQ_ZERO_WRITE;
3528 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3529 use_local_qiov ? &local_qiov : qiov,
3533 tracked_request_end(&req);
3535 if (use_local_qiov) {
3536 qemu_iovec_destroy(&local_qiov);
3538 qemu_vfree(head_buf);
3539 qemu_vfree(tail_buf);
3544 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3545 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3546 BdrvRequestFlags flags)
3548 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3552 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3553 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3556 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3557 int nb_sectors, QEMUIOVector *qiov)
3559 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3561 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3564 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3565 int64_t sector_num, int nb_sectors,
3566 BdrvRequestFlags flags)
3570 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3572 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3573 flags &= ~BDRV_REQ_MAY_UNMAP;
3575 if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
3576 nb_sectors << BDRV_SECTOR_BITS)) {
3577 ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3578 BDRV_REQ_ZERO_WRITE | flags);
3581 QEMUIOVector local_qiov;
3582 size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
3584 buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
3585 memset(buf, 0, bytes);
3586 qemu_iovec_init(&local_qiov, 1);
3587 qemu_iovec_add(&local_qiov, buf, bytes);
3589 ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
3590 BDRV_REQ_ZERO_WRITE | flags);
3597 * Truncate file to 'offset' bytes (needed only for file protocols)
3599 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3601 BlockDriver *drv = bs->drv;
3605 if (!drv->bdrv_truncate)
3610 ret = drv->bdrv_truncate(bs, offset);
3612 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3614 blk_dev_resize_cb(bs->blk);
3621 * Length of an allocated file in bytes. Sparse files are counted by their
3622 * actual allocated space. Returns < 0 on error or if unknown.
3624 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3626 BlockDriver *drv = bs->drv;
3630 if (drv->bdrv_get_allocated_file_size) {
3631 return drv->bdrv_get_allocated_file_size(bs);
3634 return bdrv_get_allocated_file_size(bs->file);
3640 * Return number of sectors on success, -errno on error.
3642 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3644 BlockDriver *drv = bs->drv;
3649 if (drv->has_variable_length) {
3650 int ret = refresh_total_sectors(bs, bs->total_sectors);
3655 return bs->total_sectors;
3659 * Return length in bytes on success, -errno on error.
3660 * The length is always a multiple of BDRV_SECTOR_SIZE.
3662 int64_t bdrv_getlength(BlockDriverState *bs)
3664 int64_t ret = bdrv_nb_sectors(bs);
3666 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3669 /* Return 0 as the number of sectors if no device is present or on error */
3670 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3672 int64_t nb_sectors = bdrv_nb_sectors(bs);
3674 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3677 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3678 BlockdevOnError on_write_error)
3680 bs->on_read_error = on_read_error;
3681 bs->on_write_error = on_write_error;
3684 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3686 return is_read ? bs->on_read_error : bs->on_write_error;
3689 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3691 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3694 case BLOCKDEV_ON_ERROR_ENOSPC:
3695 return (error == ENOSPC) ?
3696 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3697 case BLOCKDEV_ON_ERROR_STOP:
3698 return BLOCK_ERROR_ACTION_STOP;
3699 case BLOCKDEV_ON_ERROR_REPORT:
3700 return BLOCK_ERROR_ACTION_REPORT;
3701 case BLOCKDEV_ON_ERROR_IGNORE:
3702 return BLOCK_ERROR_ACTION_IGNORE;
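/* E.g. with the "enospc" policy, an ENOSPC error stops the VM so that the
 * operation can be retried once space has been freed, while any other
 * error, say EIO, is reported to the guest. */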
3708 static void send_qmp_error_event(BlockDriverState *bs,
3709 BlockErrorAction action,
3710 bool is_read, int error)
3712 IoOperationType optype;
3714 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3715 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3716 bdrv_iostatus_is_enabled(bs),
3717 error == ENOSPC, strerror(error),
3721 /* This is done by device models because, while the block layer knows
3722 * about the error, it does not know whether an operation comes from
3723 * the device or the block layer (from a job, for example).
3725 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3726 bool is_read, int error)
3730 if (action == BLOCK_ERROR_ACTION_STOP) {
3731 /* First set the iostatus, so that "info block" returns an iostatus
3732 * that matches the events raised so far (an additional error iostatus
3733 * is fine, but not a lost one).
3735 bdrv_iostatus_set_err(bs, error);
3737 /* Then raise the request to stop the VM and the event.
3738 * qemu_system_vmstop_request_prepare has two effects. First,
3739 * it ensures that the STOP event always comes after the
3740 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3741 * can observe the STOP event and do a "cont" before the STOP
3742 * event is issued, the VM will not stop. In this case, vm_start()
3743 * also ensures that the STOP/RESUME pair of events is emitted.
3745 qemu_system_vmstop_request_prepare();
3746 send_qmp_error_event(bs, action, is_read, error);
3747 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3749 send_qmp_error_event(bs, action, is_read, error);
3753 int bdrv_is_read_only(BlockDriverState *bs)
3755 return bs->read_only;
3758 int bdrv_is_sg(BlockDriverState *bs)
3763 int bdrv_enable_write_cache(BlockDriverState *bs)
3765 return bs->enable_write_cache;
3768 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3770 bs->enable_write_cache = wce;
3772 /* so a reopen() will preserve wce */
3774 bs->open_flags |= BDRV_O_CACHE_WB;
3776 bs->open_flags &= ~BDRV_O_CACHE_WB;
3780 int bdrv_is_encrypted(BlockDriverState *bs)
3782 if (bs->backing_hd && bs->backing_hd->encrypted)
3784 return bs->encrypted;
3787 int bdrv_key_required(BlockDriverState *bs)
3789 BlockDriverState *backing_hd = bs->backing_hd;
3791 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3793 return (bs->encrypted && !bs->valid_key);
3796 int bdrv_set_key(BlockDriverState *bs, const char *key)
3799 if (bs->backing_hd && bs->backing_hd->encrypted) {
3800 ret = bdrv_set_key(bs->backing_hd, key);
3806 if (!bs->encrypted) {
3808 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3811 ret = bs->drv->bdrv_set_key(bs, key);
3814 } else if (!bs->valid_key) {
3817 /* call the change callback now, we skipped it on open */
3818 blk_dev_change_media_cb(bs->blk, true);
3825 * Provide an encryption key for @bs.
3826 * If @key is non-null:
3827 * If @bs is not encrypted, fail.
3828 * Else if the key is invalid, fail.
3829 * Else set @bs's key to @key, replacing the existing key, if any.
3831 * If @bs is encrypted and still lacks a key, fail.
3833 * On failure, store an error object through @errp if non-null.
3835 void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
3838 if (!bdrv_is_encrypted(bs)) {
3839 error_setg(errp, "Node '%s' is not encrypted",
3840 bdrv_get_device_or_node_name(bs));
3841 } else if (bdrv_set_key(bs, key) < 0) {
3842 error_set(errp, QERR_INVALID_PASSWORD);
3845 if (bdrv_key_required(bs)) {
3846 error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
3847 "'%s' (%s) is encrypted",
3848 bdrv_get_device_or_node_name(bs),
3849 bdrv_get_encrypted_filename(bs));
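/* Usage sketch (illustrative; 'password' is hypothetical, and
 * error_report_err() is assumed to be the usual reporting helper):
 *
 *     Error *local_err = NULL;
 *
 *     bdrv_add_key(bs, password, &local_err);
 *     if (local_err) {
 *         error_report_err(local_err);
 *         return;
 *     }
 */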
3854 const char *bdrv_get_format_name(BlockDriverState *bs)
3856 return bs->drv ? bs->drv->format_name : NULL;
3859 static int qsort_strcmp(const void *a, const void *b)
3861 return strcmp(a, b);
3864 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3870 const char **formats = NULL;
3872 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3873 if (drv->format_name) {
3876 while (formats && i && !found) {
3877 found = !strcmp(formats[--i], drv->format_name);
3881 formats = g_renew(const char *, formats, count + 1);
3882 formats[count++] = drv->format_name;
3887 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3889 for (i = 0; i < count; i++) {
3890 it(opaque, formats[i]);
3896 /* Find a node in the graph of BlockDriverStates by its node name */
3897 BlockDriverState *bdrv_find_node(const char *node_name)
3899 BlockDriverState *bs;
3903 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3904 if (!strcmp(node_name, bs->node_name)) {
3911 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3912 BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
3914 BlockDeviceInfoList *list, *entry;
3915 BlockDriverState *bs;
3918 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3919 BlockDeviceInfo *info = bdrv_block_device_info(bs, errp);
3921 qapi_free_BlockDeviceInfoList(list);
3924 entry = g_malloc0(sizeof(*entry));
3925 entry->value = info;
3933 BlockDriverState *bdrv_lookup_bs(const char *device,
3934 const char *node_name,
3938 BlockDriverState *bs;
3941 blk = blk_by_name(device);
3949 bs = bdrv_find_node(node_name);
3956 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3957 device ? device : "",
3958 node_name ? node_name : "");
3962 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3963 * return false. If either argument is NULL, return false. */
3964 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3966 while (top && top != base) {
3967 top = top->backing_hd;
3973 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3976 return QTAILQ_FIRST(&graph_bdrv_states);
3978 return QTAILQ_NEXT(bs, node_list);
3981 BlockDriverState *bdrv_next(BlockDriverState *bs)
3984 return QTAILQ_FIRST(&bdrv_states);
3986 return QTAILQ_NEXT(bs, device_list);
3989 const char *bdrv_get_node_name(const BlockDriverState *bs)
3991 return bs->node_name;
3994 /* TODO check what callers really want: bs->node_name or blk_name() */
3995 const char *bdrv_get_device_name(const BlockDriverState *bs)
3997 return bs->blk ? blk_name(bs->blk) : "";
4000 /* This can be used to identify nodes that might not have a device
4001 * name associated. Since node and device names live in the same
4002 * namespace, the result is unambiguous. The exception is if both are
4003 * absent, then this returns an empty (non-null) string. */
4004 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
4006 return bs->blk ? blk_name(bs->blk) : bs->node_name;
4009 int bdrv_get_flags(BlockDriverState *bs)
4011 return bs->open_flags;
4014 int bdrv_flush_all(void)
4016 BlockDriverState *bs;
4019 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4020 AioContext *aio_context = bdrv_get_aio_context(bs);
4023 aio_context_acquire(aio_context);
4024 ret = bdrv_flush(bs);
4025 if (ret < 0 && !result) {
4028 aio_context_release(aio_context);
4034 int bdrv_has_zero_init_1(BlockDriverState *bs)
4039 int bdrv_has_zero_init(BlockDriverState *bs)
4043 /* If BS is a copy on write image, it is initialized to
4044 the contents of the base image, which may not be zeroes. */
4045 if (bs->backing_hd) {
4048 if (bs->drv->bdrv_has_zero_init) {
4049 return bs->drv->bdrv_has_zero_init(bs);
4056 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
4058 BlockDriverInfo bdi;
4060 if (bs->backing_hd) {
4064 if (bdrv_get_info(bs, &bdi) == 0) {
4065 return bdi.unallocated_blocks_are_zero;
4071 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
4073 BlockDriverInfo bdi;
4075 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
4079 if (bdrv_get_info(bs, &bdi) == 0) {
4080 return bdi.can_write_zeroes_with_unmap;
4086 typedef struct BdrvCoGetBlockStatusData {
4087 BlockDriverState *bs;
4088 BlockDriverState *base;
4094 } BdrvCoGetBlockStatusData;
4097 * Returns the allocation status of the specified sectors.
4098 * Drivers not implementing the functionality are assumed to not support
4099 * backing files, hence all their sectors are reported as allocated.
4101 * If 'sector_num' is beyond the end of the disk image the return value is 0
4102 * and 'pnum' is set to 0.
4104 * 'pnum' is set to the number of sectors (including and immediately following
4105 * the specified sector) that are known to be in the same
4106 * allocated/unallocated state.
4108 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
4109 * beyond the end of the disk image it will be clamped.
4111 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4113 int nb_sectors, int *pnum)
4115 int64_t total_sectors;
4119 total_sectors = bdrv_nb_sectors(bs);
4120 if (total_sectors < 0) {
4121 return total_sectors;
4124 if (sector_num >= total_sectors) {
4129 n = total_sectors - sector_num;
4130 if (n < nb_sectors) {
4134 if (!bs->drv->bdrv_co_get_block_status) {
4136 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
4137 if (bs->drv->protocol_name) {
4138 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4143 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4149 if (ret & BDRV_BLOCK_RAW) {
4150 assert(ret & BDRV_BLOCK_OFFSET_VALID);
4151 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4155 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4156 ret |= BDRV_BLOCK_ALLOCATED;
4159 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4160 if (bdrv_unallocated_blocks_are_zero(bs)) {
4161 ret |= BDRV_BLOCK_ZERO;
4162 } else if (bs->backing_hd) {
4163 BlockDriverState *bs2 = bs->backing_hd;
4164 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4165 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4166 ret |= BDRV_BLOCK_ZERO;
4172 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4173 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4176 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4179 /* Ignore errors. This is just providing extra information; it
4180 * is useful but not necessary.
4183 /* !file_pnum indicates an offset at or beyond the EOF; it is
4184 * perfectly valid for the format block driver to point to such
4185 * offsets, so catch it and mark everything as zero */
4186 ret |= BDRV_BLOCK_ZERO;
4188 /* Limit request to the range reported by the protocol driver */
4190 ret |= (ret2 & BDRV_BLOCK_ZERO);
4198 /* Coroutine wrapper for bdrv_get_block_status() */
4199 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4201 BdrvCoGetBlockStatusData *data = opaque;
4202 BlockDriverState *bs = data->bs;
4204 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4210 * Synchronous wrapper around bdrv_co_get_block_status().
4212 * See bdrv_co_get_block_status() for details.
4214 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4215 int nb_sectors, int *pnum)
4218 BdrvCoGetBlockStatusData data = {
4220 .sector_num = sector_num,
4221 .nb_sectors = nb_sectors,
4226 if (qemu_in_coroutine()) {
4227 /* Fast-path if already in coroutine context */
4228 bdrv_get_block_status_co_entry(&data);
4230 AioContext *aio_context = bdrv_get_aio_context(bs);
4232 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4233 qemu_coroutine_enter(co, &data);
4234 while (!data.done) {
4235 aio_poll(aio_context, true);
4241 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4242 int nb_sectors, int *pnum)
4244 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4248 return !!(ret & BDRV_BLOCK_ALLOCATED);
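/* Usage sketch (illustrative): callers typically walk an image extent by
 * extent, advancing by the returned *pnum, much like bdrv_make_zero():
 *
 *     int64_t sector = 0, total = bdrv_nb_sectors(bs);
 *     int pnum;
 *
 *     while (sector < total) {
 *         int64_t nb = MIN(total - sector, BDRV_REQUEST_MAX_SECTORS);
 *         int64_t ret = bdrv_get_block_status(bs, sector, nb, &pnum);
 *         if (ret < 0) {
 *             break;
 *         }
 *         if (ret & BDRV_BLOCK_ZERO) {
 *             ... this extent reads as zeroes ...
 *         }
 *         sector += pnum;
 *     }
 */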
4252 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4254 * Return true if the given sector is allocated in any image between
4255 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4256 * sector is allocated in any image of the chain. Return false otherwise.
4258 * 'pnum' is set to the number of sectors (including and immediately following
4259 * the specified sector) that are known to be in the same
4260 * allocated/unallocated state.
4263 int bdrv_is_allocated_above(BlockDriverState *top,
4264 BlockDriverState *base,
4266 int nb_sectors, int *pnum)
4268 BlockDriverState *intermediate;
4269 int ret, n = nb_sectors;
4272 while (intermediate && intermediate != base) {
4274 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4284 * [sector_num, nb_sectors] is unallocated on top, but an intermediate
4287 * image might have [sector_num+x, nb_sectors-x] allocated.
4289 if (n > pnum_inter &&
4290 (intermediate == top ||
4291 sector_num + pnum_inter < intermediate->total_sectors)) {
4295 intermediate = intermediate->backing_hd;
4302 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4304 if (bs->backing_hd && bs->backing_hd->encrypted)
4305 return bs->backing_file;
4306 else if (bs->encrypted)
4307 return bs->filename;
4312 void bdrv_get_backing_filename(BlockDriverState *bs,
4313 char *filename, int filename_size)
4315 pstrcpy(filename, filename_size, bs->backing_file);
4318 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4319 const uint8_t *buf, int nb_sectors)
4321 BlockDriver *drv = bs->drv;
4327 if (!drv->bdrv_write_compressed) {
4330 ret = bdrv_check_request(bs, sector_num, nb_sectors);
4335 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4337 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4340 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4342 BlockDriver *drv = bs->drv;
4345 if (!drv->bdrv_get_info)
4347 memset(bdi, 0, sizeof(*bdi));
4348 return drv->bdrv_get_info(bs, bdi);
4351 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4353 BlockDriver *drv = bs->drv;
4354 if (drv && drv->bdrv_get_specific_info) {
4355 return drv->bdrv_get_specific_info(bs);
4360 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4361 int64_t pos, int size)
4364 struct iovec iov = {
4365 .iov_base = (void *) buf,
4369 qemu_iovec_init_external(&qiov, &iov, 1);
4370 return bdrv_writev_vmstate(bs, &qiov, pos);
4373 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4375 BlockDriver *drv = bs->drv;
4379 } else if (drv->bdrv_save_vmstate) {
4380 return drv->bdrv_save_vmstate(bs, qiov, pos);
4381 } else if (bs->file) {
4382 return bdrv_writev_vmstate(bs->file, qiov, pos);
4388 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4389 int64_t pos, int size)
4391 BlockDriver *drv = bs->drv;
4394 if (drv->bdrv_load_vmstate)
4395 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4397 return bdrv_load_vmstate(bs->file, buf, pos, size);
4401 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4403 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4407 bs->drv->bdrv_debug_event(bs, event);
4410 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4413 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4417 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4418 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4424 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4426 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4430 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4431 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4437 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4439 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4443 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4444 return bs->drv->bdrv_debug_resume(bs, tag);
4450 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4452 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4456 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4457 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4463 int bdrv_is_snapshot(BlockDriverState *bs)
4465 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4468 /* backing_file can be relative, absolute, or a protocol. If it is
4469 * relative, it must be relative to the chain. So, passing in bs->filename
4470 * from a BDS as backing_file should not be done, as that may be relative to
4471 * the CWD rather than the chain. */
4472 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4473 const char *backing_file)
4475 char *filename_full = NULL;
4476 char *backing_file_full = NULL;
4477 char *filename_tmp = NULL;
4478 int is_protocol = 0;
4479 BlockDriverState *curr_bs = NULL;
4480 BlockDriverState *retval = NULL;
4482 if (!bs || !bs->drv || !backing_file) {
4486 filename_full = g_malloc(PATH_MAX);
4487 backing_file_full = g_malloc(PATH_MAX);
4488 filename_tmp = g_malloc(PATH_MAX);
4490 is_protocol = path_has_protocol(backing_file);
4492 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4494 /* If either of the filename paths is actually a protocol, then
4495 * compare unmodified paths; otherwise make paths relative */
4496 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4497 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4498 retval = curr_bs->backing_hd;
4502 /* If not an absolute filename path, make it relative to the current
4503 * image's filename path */
4504 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4507 /* We are going to compare absolute pathnames */
4508 if (!realpath(filename_tmp, filename_full)) {
4512 /* We need to make sure the backing filename we are comparing against
4513 * is relative to the current image filename (or absolute) */
4514 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4515 curr_bs->backing_file);
4517 if (!realpath(filename_tmp, backing_file_full)) {
4521 if (strcmp(backing_file_full, filename_full) == 0) {
4522 retval = curr_bs->backing_hd;
4528 g_free(filename_full);
4529 g_free(backing_file_full);
4530 g_free(filename_tmp);
4534 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4540 if (!bs->backing_hd) {
4544 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4547 /**************************************************************/
4550 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4551 QEMUIOVector *qiov, int nb_sectors,
4552 BlockCompletionFunc *cb, void *opaque)
4554 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4556 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4560 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4561 QEMUIOVector *qiov, int nb_sectors,
4562 BlockCompletionFunc *cb, void *opaque)
4564 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4566 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4570 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4571 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4572 BlockCompletionFunc *cb, void *opaque)
4574 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4576 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4577 BDRV_REQ_ZERO_WRITE | flags,
4582 typedef struct MultiwriteCB {
4587 BlockCompletionFunc *cb;
4589 QEMUIOVector *free_qiov;
4593 static void multiwrite_user_cb(MultiwriteCB *mcb)
4597 for (i = 0; i < mcb->num_callbacks; i++) {
4598 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4599 if (mcb->callbacks[i].free_qiov) {
4600 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4602 g_free(mcb->callbacks[i].free_qiov);
4606 static void multiwrite_cb(void *opaque, int ret)
4608 MultiwriteCB *mcb = opaque;
4610 trace_multiwrite_cb(mcb, ret);
4612 if (ret < 0 && !mcb->error) {
4616 mcb->num_requests--;
4617 if (mcb->num_requests == 0) {
4618 multiwrite_user_cb(mcb);
4623 static int multiwrite_req_compare(const void *a, const void *b)
4625 const BlockRequest *req1 = a, *req2 = b;
4628 * Note that we can't simply subtract req2->sector from req1->sector
4629 * here as that could overflow the return value.
4631 if (req1->sector > req2->sector) {
4633 } else if (req1->sector < req2->sector) {
4641 * Takes a bunch of requests and tries to merge them. Returns the number of
4642 * requests that remain after merging.
4644 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4645 int num_reqs, MultiwriteCB *mcb)
4649 // Sort requests by start sector
4650 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4652 // Check if adjacent requests touch the same clusters. If so, combine them,
4653 // filling up gaps with zero sectors.
4655 for (i = 1; i < num_reqs; i++) {
4657 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4659 // Handle exactly sequential writes and overlapping writes.
4660 if (reqs[i].sector <= oldreq_last) {
4664 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4668 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4669 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4675 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4676 qemu_iovec_init(qiov,
4677 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4679 // Add the first request to the merged one. If the requests are
4680 // overlapping, drop the last sectors of the first request.
4681 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4682 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4684 // We shouldn't need to add any zeros between the two requests
4685 assert (reqs[i].sector <= oldreq_last);
4687 // Add the second request
4688 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4690 // Add tail of first request, if necessary
4691 if (qiov->size < reqs[outidx].qiov->size) {
4692 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4693 reqs[outidx].qiov->size - qiov->size);
4696 reqs[outidx].nb_sectors = qiov->size >> 9;
4697 reqs[outidx].qiov = qiov;
4699 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4702 reqs[outidx].sector = reqs[i].sector;
4703 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4704 reqs[outidx].qiov = reqs[i].qiov;
4708 block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
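/* Worked example (illustrative): given request A = sectors [0, 8) and
 * request B = sectors [4, 12), B starts before A ends, so they merge:
 * the first (4 - 0) * 512 bytes of A's qiov, then all of B's qiov,
 * yielding one request covering sectors [0, 12). A tail of A would only
 * be appended if A extended past the end of B. */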
4714 * Submit multiple AIO write requests at once.
4716 * On success, the function returns 0 and all requests in the reqs array have
4717 been submitted. In the error case, this function returns -1, and any of the
4718 requests may or may not be submitted yet. In particular, this means that the
4719 callback will be called for some of the requests but not for others. The
4720 * caller must check the error field of the BlockRequest to wait for the right
4721 * callbacks (if error != 0, no callback will be called).
4723 * The implementation may modify the contents of the reqs array, e.g. to merge
4724 * requests. However, the fields opaque and error are left unmodified as they
4725 * are used to signal failure for a single request to the caller.
4727 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4732 /* don't submit writes if we don't have a medium */
4733 if (bs->drv == NULL) {
4734 for (i = 0; i < num_reqs; i++) {
4735 reqs[i].error = -ENOMEDIUM;
4740 if (num_reqs == 0) {
4744 // Create MultiwriteCB structure
4745 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4746 mcb->num_requests = 0;
4747 mcb->num_callbacks = num_reqs;
4749 for (i = 0; i < num_reqs; i++) {
4750 mcb->callbacks[i].cb = reqs[i].cb;
4751 mcb->callbacks[i].opaque = reqs[i].opaque;
4754 // Check for mergeable requests
4755 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4757 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4759 /* Run the aio requests. */
4760 mcb->num_requests = num_reqs;
4761 for (i = 0; i < num_reqs; i++) {
4762 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4763 reqs[i].nb_sectors, reqs[i].flags,
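/* Usage sketch (illustrative; 'my_write_cb', 'qiov0' and 'qiov1' are
 * hypothetical):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_write_cb, .opaque = NULL },
 *         { .sector = 16, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_write_cb, .opaque = NULL },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         check each reqs[i].error; the callback only runs where it is 0
 *     }
 */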
4771 void bdrv_aio_cancel(BlockAIOCB *acb)
4774 bdrv_aio_cancel_async(acb);
4775 while (acb->refcnt > 1) {
4776 if (acb->aiocb_info->get_aio_context) {
4777 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4778 } else if (acb->bs) {
4779 aio_poll(bdrv_get_aio_context(acb->bs), true);
4784 qemu_aio_unref(acb);
4787 /* Async version of aio cancel. The caller is not blocked if the acb implements
4788 * cancel_async; otherwise we do nothing and let the request complete normally.
4789 * In either case the completion callback must be called. */
4790 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4792 if (acb->aiocb_info->cancel_async) {
4793 acb->aiocb_info->cancel_async(acb);
4797 /**************************************************************/
4798 /* async block device emulation */
4800 typedef struct BlockAIOCBSync {
4804 /* vector translation state */
4810 static const AIOCBInfo bdrv_em_aiocb_info = {
4811 .aiocb_size = sizeof(BlockAIOCBSync),
4814 static void bdrv_aio_bh_cb(void *opaque)
4816 BlockAIOCBSync *acb = opaque;
4818 if (!acb->is_write && acb->ret >= 0) {
4819 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4821 qemu_vfree(acb->bounce);
4822 acb->common.cb(acb->common.opaque, acb->ret);
4823 qemu_bh_delete(acb->bh);
4825 qemu_aio_unref(acb);
4828 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4832 BlockCompletionFunc *cb,
4837 BlockAIOCBSync *acb;
4839 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4840 acb->is_write = is_write;
4842 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4843 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4845 if (acb->bounce == NULL) {
4847 } else if (is_write) {
4848 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4849 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4851 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4854 qemu_bh_schedule(acb->bh);
4856 return &acb->common;
4859 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4860 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4861 BlockCompletionFunc *cb, void *opaque)
4863 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4866 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4867 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4868 BlockCompletionFunc *cb, void *opaque)
4870 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;
    BlockRequest req;
    bool is_write;
    bool need_bh;
    bool *done;
    QEMUBH *bh;
} BlockAIOCBCoroutine;

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
};

static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
{
    if (!acb->need_bh) {
        acb->common.cb(acb->common.opaque, acb->req.error);
        qemu_aio_unref(acb);
    }
}

static void bdrv_co_em_bh(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;

    assert(!acb->need_bh);
    qemu_bh_delete(acb->bh);
    bdrv_co_complete(acb);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
{
    acb->need_bh = false;
    if (acb->req.error != -EINPROGRESS) {
        BlockDriverState *bs = acb->common.bs;

        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
        qemu_bh_schedule(acb->bh);
    }
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    bdrv_co_complete(acb);
}

static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    bdrv_co_complete(acb);
}

BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->need_bh = true;
    acb->req.error = -EINPROGRESS;
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    bdrv_co_maybe_schedule_bh(acb);
    return &acb->common;
}
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque)
{
    BlockAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    acb->refcnt = 1;
    return acb;
}

void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}

void qemu_aio_unref(void *p)
{
    BlockAIOCB *acb = p;
    assert(acb->refcnt > 0);
    if (--acb->refcnt == 0) {
        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
    }
}
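
/*
 * Example (illustrative sketch, not part of the original file): how an
 * emulated AIOCB stays alive across its completion callback.  An extra
 * qemu_aio_ref()/qemu_aio_unref() pair lets a caller poll until the
 * completion path has dropped the AIOCB's own reference; this mirrors the
 * pattern used by bdrv_aio_cancel().  "my_*" names are hypothetical.
 */
#if 0
static void my_cb(void *opaque, int ret)
{
    /* runs once; the acb still holds its own reference at this point */
}

static void my_wait_for_acb(BlockDriverState *bs)
{
    BlockAIOCB *acb = bdrv_aio_flush(bs, my_cb, NULL);

    qemu_aio_ref(acb);                /* keep acb valid while we poll */
    while (acb->refcnt > 1) {         /* completion unref leaves only ours */
        aio_poll(bdrv_get_aio_context(bs), true);
    }
    qemu_aio_unref(acb);              /* dropping the last reference frees it */
}
#endif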
/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
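
/*
 * Example (illustrative sketch, not part of the original file): the two
 * driver-level flush stages that bdrv_co_flush() drives.  A format driver
 * that buffers metadata in memory would implement .bdrv_co_flush_to_os to
 * push that state down to bs->file; a protocol driver talking to the kernel
 * would implement .bdrv_co_flush_to_disk (e.g. an fdatasync).  The "my_*"
 * names and the metadata helper are hypothetical.
 */
#if 0
static int coroutine_fn my_format_flush_to_os(BlockDriverState *bs)
{
    /* Write dirty in-memory metadata to the layer below; the generic code
     * then recurses into bs->file for the actual flush to disk. */
    return my_format_write_back_metadata(bs);
}

static BlockDriver bdrv_my_format = {
    .format_name         = "myfmt",
    .bdrv_co_flush_to_os = my_format_flush_to_os,
    /* no .bdrv_co_flush_to_disk: the protocol layer below handles that */
};
#endif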
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    if (!(bs->open_flags & BDRV_O_INCOMING)) {
        return;
    }
    bs->open_flags &= ~BDRV_O_INCOMING;

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}
void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_invalidate_cache(bs, &local_err);
        aio_context_release(aio_context);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard, ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request(bs, sector_num, nb_sectors);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* Align the request: if it starts off a discard_alignment boundary,
         * shorten it so that it ends on the next boundary.  E.g. with an
         * alignment of 8 and sector_num == 5, a 16-sector request is trimmed
         * to num == 3 first, so the next iteration starts aligned at 8. */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass the aligned chunk (num), not the whole remaining request,
             * so the AIO path discards the same range as the coroutine path */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
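
/*
 * Example (illustrative sketch, not part of the original file): the
 * synchronous-wrapper pattern shared by bdrv_flush() and bdrv_discard()
 * above.  A coroutine entry point stores its result in a context struct
 * initialised to NOT_DONE; outside coroutine context the caller spawns the
 * coroutine and polls the AioContext until the result appears.  "MyCo" and
 * "my_*" names are hypothetical.
 */
#if 0
typedef struct MyCo {
    BlockDriverState *bs;
    int ret;
} MyCo;

static void coroutine_fn my_co_entry(void *opaque)
{
    MyCo *mc = opaque;
    mc->ret = bdrv_co_flush(mc->bs);
}

static int my_sync_wrapper(BlockDriverState *bs)
{
    MyCo mc = { .bs = bs, .ret = NOT_DONE };

    if (qemu_in_coroutine()) {
        my_co_entry(&mc);               /* fast path: already in a coroutine */
    } else {
        Coroutine *co = qemu_coroutine_create(my_co_entry);
        qemu_coroutine_enter(co, &mc);
        while (mc.ret == NOT_DONE) {
            aio_poll(bdrv_get_aio_context(bs), true);
        }
    }
    return mc.ret;
}
#endif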
/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}
/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;
    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    device_name = bdrv_get_device_name(bs);
    if (device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(device_name,
                                          eject_flag, &error_abort);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}
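
/*
 * Example (illustrative sketch, not part of the original file): deciding
 * whether a guest-supplied vector can be used for direct I/O.  If any
 * element violates the backend's memory alignment requirement (O_DIRECT
 * etc.), the request is bounced through an aligned buffer instead.  "my_*"
 * names are hypothetical.
 */
#if 0
static int my_submit(BlockDriverState *bs, QEMUIOVector *qiov)
{
    if (!bdrv_qiov_is_aligned(bs, qiov)) {
        void *bounce = qemu_try_blockalign(bs, qiov->size);
        if (bounce == NULL) {
            return -ENOMEM;
        }
        qemu_iovec_to_buf(qiov, 0, bounce, qiov->size);
        /* ... submit the bounce buffer, then qemu_vfree(bounce) ... */
        return 0;
    }
    /* ... submit qiov directly ... */
    return 0;
}
#endif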
BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
{
    BdrvDirtyBitmap *bm;

    assert(name);
    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        if (bm->name && !strcmp(name, bm->name)) {
            return bm;
        }
    }
    return NULL;
}

void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    assert(!bdrv_dirty_bitmap_frozen(bitmap));
    g_free(bitmap->name);
    bitmap->name = NULL;
}
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
                                          uint32_t granularity,
                                          const char *name,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;
    uint32_t sector_granularity;

    assert((granularity & (granularity - 1)) == 0);

    if (name && bdrv_find_dirty_bitmap(bs, name)) {
        error_setg(errp, "Bitmap already exists: %s", name);
        return NULL;
    }
    sector_granularity = granularity >> BDRV_SECTOR_BITS;
    assert(sector_granularity);
    bitmap_size = bdrv_nb_sectors(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap = g_new0(BdrvDirtyBitmap, 1);
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
    bitmap->size = bitmap_size;
    bitmap->name = g_strdup(name);
    bitmap->disabled = false;
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
{
    return bitmap->successor;
}

bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
{
    return !(bitmap->disabled || bitmap->successor);
}
/**
 * Create a successor bitmap destined to replace this bitmap after an
 * operation. Requires that the bitmap is not frozen and has no successor.
 */
int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
                                       BdrvDirtyBitmap *bitmap, Error **errp)
{
    uint64_t granularity;
    BdrvDirtyBitmap *child;

    if (bdrv_dirty_bitmap_frozen(bitmap)) {
        error_setg(errp, "Cannot create a successor for a bitmap that is "
                   "currently frozen");
        return -1;
    }
    assert(!bitmap->successor);

    /* Create an anonymous successor */
    granularity = bdrv_dirty_bitmap_granularity(bitmap);
    child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!child) {
        return -1;
    }

    /* Successor will be on or off based on our current state. */
    child->disabled = bitmap->disabled;

    /* Install the successor and freeze the parent */
    bitmap->successor = child;
    return 0;
}

/**
 * For a bitmap with a successor, yield our name to the successor,
 * delete the old bitmap, and return a handle to the new bitmap.
 */
BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
                                            BdrvDirtyBitmap *bitmap,
                                            Error **errp)
{
    char *name;
    BdrvDirtyBitmap *successor = bitmap->successor;

    if (successor == NULL) {
        error_setg(errp, "Cannot relinquish control if "
                   "there's no successor present");
        return NULL;
    }

    name = bitmap->name;
    bitmap->name = NULL;
    successor->name = name;
    bitmap->successor = NULL;
    bdrv_release_dirty_bitmap(bs, bitmap);

    return successor;
}

/**
 * In cases of failure where we can no longer safely delete the parent,
 * we may wish to re-join the parent and child/successor.
 * The merged parent will be un-frozen, but not explicitly re-enabled.
 */
BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
                                           BdrvDirtyBitmap *parent,
                                           Error **errp)
{
    BdrvDirtyBitmap *successor = parent->successor;

    if (!successor) {
        error_setg(errp, "Cannot reclaim a successor when none is present");
        return NULL;
    }

    if (!hbitmap_merge(parent->bitmap, successor->bitmap)) {
        error_setg(errp, "Merging of parent and successor bitmap failed");
        return NULL;
    }
    bdrv_release_dirty_bitmap(bs, successor);
    parent->successor = NULL;

    return parent;
}
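
/*
 * Example (illustrative sketch, not part of the original file): the
 * freeze/abdicate/reclaim life cycle used by incremental backup.  While a
 * backup job runs, new writes are recorded in the anonymous successor; on
 * success the frozen parent is discarded, and on failure its contents are
 * merged back so no dirty sectors are lost.  "my_*" names are hypothetical.
 */
#if 0
static void my_backup_done(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                           bool success)
{
    if (success) {
        /* parent is deleted; the successor inherits its name */
        bdrv_dirty_bitmap_abdicate(bs, bitmap, &error_abort);
    } else {
        /* successor is merged back into the (now unfrozen) parent */
        bdrv_reclaim_dirty_bitmap(bs, bitmap, &error_abort);
    }
}
#endif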
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            assert(!bdrv_dirty_bitmap_frozen(bm));
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap->name);
            g_free(bitmap);
            return;
        }
    }
}

void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
{
    assert(!bdrv_dirty_bitmap_frozen(bitmap));
    bitmap->disabled = true;
}

void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
{
    assert(!bdrv_dirty_bitmap_frozen(bitmap));
    bitmap->disabled = false;
}
BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity = bdrv_dirty_bitmap_granularity(bm);
        info->has_name = !!bm->name;
        info->name = g_strdup(bm->name);
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                   int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}
/**
 * Chooses a default granularity based on the existing cluster size,
 * but clamped between [4K, 64K]. Defaults to 64K in the case that there
 * is no cluster size information available.
 */
uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    uint32_t granularity;

    if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
        granularity = MAX(4096, bdi.cluster_size);
        granularity = MIN(65536, granularity);
    } else {
        granularity = 65536;
    }

    return granularity;
}

uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
{
    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
}
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                           int64_t cur_sector, int nr_sectors)
{
    assert(bdrv_dirty_bitmap_enabled(bitmap));
    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
}

void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                             int64_t cur_sector, int nr_sectors)
{
    assert(bdrv_dirty_bitmap_enabled(bitmap));
    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
}

void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap)
{
    assert(bdrv_dirty_bitmap_enabled(bitmap));
    hbitmap_reset(bitmap->bitmap, 0, bitmap->size);
}
static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        if (!bdrv_dirty_bitmap_enabled(bitmap)) {
            continue;
        }
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        if (!bdrv_dirty_bitmap_enabled(bitmap)) {
            continue;
        }
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

/**
 * Advance an HBitmapIter to an arbitrary offset.
 */
void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
{
    assert(hbi->hb);
    hbitmap_iter_init(hbi, hbi->hb, offset);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
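
/*
 * Example (illustrative sketch, not part of the original file): walking all
 * dirty sectors of a bitmap with an HBitmapIter, the pattern used by the
 * mirror and backup jobs.  hbitmap_iter_next() returns -1 once the bitmap
 * is exhausted.  "my_*" names are hypothetical.
 */
#if 0
static void my_walk_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    HBitmapIter hbi;
    int64_t sector;

    bdrv_dirty_iter_init(bs, bitmap, &hbi);
    while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
        /* ... copy out the granularity-sized chunk containing 'sector' ... */
    }
}
#endif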
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    if (!bs) {
        return;
    }
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}
struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};

bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
        blocker = QLIST_FIRST(&bs->op_blockers[op]);
        if (errp) {
            error_setg(errp, "Node '%s' is busy: %s",
                       bdrv_get_device_or_node_name(bs),
                       error_get_pretty(blocker->reason));
        }
        return true;
    }
    return false;
}

void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);

    blocker = g_new0(BdrvOpBlocker, 1);
    blocker->reason = reason;
    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
}

void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker, *next;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
        if (blocker->reason == reason) {
            QLIST_REMOVE(blocker, list);
            g_free(blocker);
        }
    }
}
void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_block(bs, i, reason);
    }
}

void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_unblock(bs, i, reason);
    }
}

bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
{
    int i;

    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
            return false;
        }
    }
    return true;
}
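
/*
 * Example (illustrative sketch, not part of the original file): how a block
 * job might protect its node with op blockers.  The same Error object that
 * was passed to block must later be passed to unblock.  "my_*" names are
 * hypothetical.
 */
#if 0
static void my_job_start(BlockDriverState *bs, Error **blocker)
{
    error_setg(blocker, "Node is in use by my job");
    bdrv_op_block_all(bs, *blocker);
    /* selectively re-allow what the job can tolerate */
    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, *blocker);
}

static void my_job_end(BlockDriverState *bs, Error *blocker)
{
    bdrv_op_unblock_all(bs, blocker);
    error_free(blocker);
}
#endif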
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = (error == ENOSPC) ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                           BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    const char *backing_fmt, *backing_file;
    int64_t size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true, errp);
    if (!proto_drv) {
        return;
    }

    if (!drv->create_opts) {
        error_setg(errp, "Format driver '%s' does not support image creation",
                   drv->format_name);
        return;
    }

    if (!proto_drv->create_opts) {
        error_setg(errp, "Protocol driver '%s' does not support image creation",
                   proto_drv->format_name);
        return;
    }

    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);

    /* Create parameter list with default values */
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);

    /* Parse -o options */
    if (options) {
        qemu_opts_do_parse(opts, options, NULL, &local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
        if (local_err) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
        if (local_err) {
            error_setg(errp, "Backing file format not supported for file "
                       "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
    if (backing_file) {
        if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Trying to create an image with the "
                       "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt) {
        backing_drv = bdrv_find_format(backing_fmt);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt);
            goto out;
        }
    }
    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
    if (size == -1) {
        if (backing_file) {
            BlockDriverState *bs;
            char *full_backing = g_new0(char, PATH_MAX);
            int back_flags;

            bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                         full_backing, PATH_MAX,
                                                         &local_err);
            if (local_err) {
                g_free(full_backing);
                goto out;
            }

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            g_free(full_backing);
            if (ret < 0) {
                goto out;
            }
            size = bdrv_getlength(bs);
            if (size < 0) {
                error_setg_errno(errp, -size, "Could not get size of '%s'",
                                 backing_file);
                bdrv_unref(bs);
                goto out;
            }

            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s", filename, fmt);
        qemu_opts_print(opts, " ");
        puts("");
    }

    ret = bdrv_create(drv, filename, opts, &local_err);

    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}
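
/*
 * Example (illustrative sketch, not part of the original file): creating a
 * 1 GiB qcow2 image the way qemu-img does, with driver-specific options
 * passed as a "-o"-style string.  "my_*" names are hypothetical.
 */
#if 0
static void my_create_image(Error **errp)
{
    char opt_buf[] = "cluster_size=65536";

    bdrv_img_create("/tmp/test.qcow2", "qcow2",
                    NULL, NULL,              /* no backing file */
                    opt_buf,
                    (uint64_t)1 << 30,       /* 1 GiB */
                    0,                       /* flags */
                    errp, true);             /* quiet */
}
#endif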
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}

void bdrv_detach_aio_context(BlockDriverState *bs)
{
    BdrvAioNotifier *baf;

    if (!bs->drv) {
        return;
    }

    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
        baf->detach_aio_context(baf->opaque);
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}
void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    BdrvAioNotifier *ban;

    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }

    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
        ban->attached_aio_context(new_context, ban->opaque);
    }
}

void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
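
/*
 * Example (illustrative sketch, not part of the original file): handing a
 * drive over to an IOThread.  bdrv_set_aio_context() drains in-flight
 * requests and re-attaches the whole BDS chain; afterwards all I/O for bs
 * must be submitted with its new AioContext held.  "my_*" names are
 * hypothetical.
 */
#if 0
static void my_assign_iothread(BlockDriverState *bs, IOThread *iothread)
{
    AioContext *new_context = iothread_get_aio_context(iothread);

    bdrv_set_aio_context(bs, new_context);

    aio_context_acquire(new_context);
    /* ... submit requests against bs ... */
    aio_context_release(new_context);
}
#endif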
void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
{
    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
    *ban = (BdrvAioNotifier){
        .attached_aio_context = attached_aio_context,
        .detach_aio_context   = detach_aio_context,
        .opaque               = opaque
    };

    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
}

void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*attached_aio_context)(AioContext *,
                                                                   void *),
                                      void (*detach_aio_context)(void *),
                                      void *opaque)
{
    BdrvAioNotifier *ban, *ban_next;

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        if (ban->attached_aio_context == attached_aio_context &&
            ban->detach_aio_context   == detach_aio_context   &&
            ban->opaque               == opaque)
        {
            QLIST_REMOVE(ban, list);
            g_free(ban);
            return;
        }
    }

    abort();
}
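
/*
 * Example (illustrative sketch, not part of the original file): a device
 * that keeps per-AioContext state (e.g. virtio-blk dataplane) registers a
 * notifier pair so it can tear down and rebuild that state when the BDS
 * migrates between AioContexts.  "my_*" names are hypothetical.
 */
#if 0
static void my_attached(AioContext *new_context, void *opaque)
{
    /* recreate event notifiers/timers in new_context */
}

static void my_detached(void *opaque)
{
    /* drop references to the old AioContext */
}

static void my_device_realize(BlockDriverState *bs, void *dev)
{
    bdrv_add_aio_context_notifier(bs, my_attached, my_detached, dev);
}
#endif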
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
                       BlockDriverAmendStatusCB *status_cb)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts, status_cb);
}
/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filter drivers and by bdrv_is_first_non_filter.
 * It is used to test whether the given bs is the candidate or to recurse
 * further down the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non-filter driver -> check if the bs is
     * the same as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined, use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but doesn't allow recursion -> return
     * false
     */
    return false;
}

/* This function checks whether the candidate is the first non-filter bs down
 * its bs chain. Since we don't have pointers to parents it explores all bs
 * chains from the top. Some filters can choose not to pass down the
 * recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}
BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    AioContext *aio_context;

    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    aio_context = bdrv_get_aio_context(to_replace_bs);
    aio_context_acquire(aio_context);

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        to_replace_bs = NULL;
        goto out;
    }

    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the top-most non-filter, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files, which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        to_replace_bs = NULL;
        goto out;
    }

out:
    aio_context_release(aio_context);
    return to_replace_bs;
}
void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}
static bool append_open_options(QDict *d, BlockDriverState *bs)
{
    const QDictEntry *entry;
    bool found_any = false;

    for (entry = qdict_first(bs->options); entry;
         entry = qdict_next(bs->options, entry))
    {
        /* Only take options for this level and exclude all
         * non-driver-specific options */
        if (!strchr(qdict_entry_key(entry), '.') &&
            strcmp(qdict_entry_key(entry), "node-name"))
        {
            qobject_incref(qdict_entry_value(entry));
            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
            found_any = true;
        }
    }

    return found_any;
}
/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the
 *                    same results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here. Otherwise,
 *              full_open_options is converted to a JSON object, prefixed with
 *              "json:" (for use through the JSON pseudo protocol) and put
 *              here.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
         * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }
        /* Reconstructing the full options QDict is simple for most format
         * block drivers, as long as the full options are known for the
         * underlying file BDS. The full options QDict of that file BDS should
         * somehow contain a representation of the filename, therefore the
         * following suffices without querying the (exact_)filename of this
         * BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            bs->full_open_options = opts;
        } else {
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}
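
/*
 * Example (illustrative, not part of the original file): for a qcow2 image
 * opened with non-default options, bdrv_refresh_filename() cannot produce a
 * plain path, so bs->filename ends up as a JSON pseudo-protocol string
 * along the lines of:
 *
 *   json:{"driver": "qcow2", "lazy-refcounts": "on",
 *         "file": {"driver": "file", "filename": "/tmp/test.qcow2"}}
 *
 * which can be passed back to bdrv_open() to reopen an equivalent BDS.
 */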
/* The purpose of this accessor function is to allow device models to access
 * the BlockAcctStats structure embedded inside a BlockDriverState without
 * being aware of the BlockDriverState structure layout.
 * It will go away when the BlockAcctStats structure is moved inside the
 * device models.
 */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
{
    return &bs->stats;
}