/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#endif

#ifdef _WIN32
#include <windows.h>
#endif
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}
static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* This must be called before bdrv_set_io_limits() if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
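
/*
 * Illustrative sketch (not part of this file): the expected call order when
 * enabling throttling on a device. The ThrottleConfig values below are
 * hypothetical.
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024;  // 1 MiB/s average
 *     bdrv_io_limits_enable(bs);     // initialize throttle state and timers
 *     bdrv_set_io_limits(bs, &cfg);  // then apply the configured limits
 */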
/* This function makes an I/O request wait if needed
 *
 * @bytes:    the byte count of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O need to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
/* check if the path starts with "<protocol>:" */
int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}
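
/*
 * Illustrative examples (hypothetical inputs, not from this file):
 *
 *     path_has_protocol("nbd:localhost:10809")  -> 1  ("nbd" prefix)
 *     path_has_protocol("/var/lib/image.qcow2") -> 0  ('/' comes before ':')
 *     path_has_protocol("c:\\images\\a.vhd")    -> 0  (drive prefix, on a
 *                                                      Windows build)
 *
 *     path_is_absolute("/var/lib/image.qcow2")  -> 1
 *     path_is_absolute("image.qcow2")           -> 0
 */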
/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
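
/*
 * Illustrative example (hypothetical paths): combining an image path with
 * the relative name of its backing file:
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest),
 *                  "/images/base/active.qcow2", "backing.qcow2");
 *     // dest == "/images/base/backing.qcow2"
 *
 *     path_combine(dest, sizeof(dest),
 *                  "/images/active.qcow2", "/abs/backing.qcow2");
 *     // dest == "/abs/backing.qcow2" (absolute filenames copied verbatim)
 */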
void bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                  const char *backing,
                                                  char *dest, size_t sz)
{
    if (backing[0] == '\0' || path_has_protocol(backing)) {
        pstrcpy(dest, sz, backing);
    } else {
        path_combine(dest, sz, backed, backing);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    bdrv_get_full_backing_filename_from_filename(bs->filename, bs->backing_file,
                                                 dest, sz);
}
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}
BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}
void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}
BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;

    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}
static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}
BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}
int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
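
/*
 * Illustrative usage (POSIX case; the resulting name is hypothetical):
 *
 *     char tmp[PATH_MAX];
 *     if (get_tmp_filename(tmp, sizeof(tmp)) == 0) {
 *         // tmp now names an empty file, e.g. "/var/tmp/vl.Ab12Cd",
 *         // created via mkstemp() and closed again; the caller is
 *         // responsible for unlinking it when done.
 *     }
 */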
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
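
/*
 * Illustrative examples (hypothetical filenames): how the protocol prefix
 * is resolved when allow_protocol_prefix is true:
 *
 *     "nbd:localhost:10809" -> prefix "nbd" -> the NBD driver, if registered
 *     "image.qcow2"         -> no prefix    -> &bdrv_file (local file)
 *     "/dev/cdrom"          -> host device  -> matched by find_hdev_driver()
 */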
/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf         contains the image's first @buf_size bytes.
 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 *              but can be smaller if the image file is smaller)
 * @filename    is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOTSUP;
    }
    *pdrv = drv;
    return ret;
}
/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}
/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
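
/*
 * The resulting flag combinations, for reference (derived from the cases
 * above):
 *
 *     mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *     off/none           set              set              -
 *     directsync         set               -               -
 *     writeback           -               set              -
 *     unsafe              -               set             set
 *     writethrough        -                -               -     (default)
 */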
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}
/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}
/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}
static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}
/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                   ? "Driver '%s' can only be used for read-only devices"
                   : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
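
/*
 * Illustrative example (hypothetical filename): a "json:" pseudo-protocol
 * filename and the flattened QDict it produces:
 *
 *     json:{"driver": "qcow2",
 *           "file": {"driver": "file", "filename": "/tmp/test.qcow2"}}
 *
 * becomes, after qdict_flatten():
 *
 *     driver        -> "qcow2"
 *     file.driver   -> "file"
 *     file.filename -> "/tmp/test.qcow2"
 */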
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }

    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}
/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is false and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
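
/*
 * Illustrative example (hypothetical options): with bdref_key "file", a
 * flattened options QDict such as
 *
 *     file.driver   -> "file"
 *     file.filename -> "/tmp/test.img"
 *
 * is extracted into a sub-QDict { "driver": "file", "filename":
 * "/tmp/test.img" } that bdrv_open() then receives for the child image.
 */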
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLEQ_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
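
/*
 * Illustrative usage (sketch): collecting several devices into one queue so
 * that they are reopened transactionally. bs0/bs1 and the flags are
 * hypothetical.
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs0, flags0);
 *     queue = bdrv_reopen_queue(queue, bs1, flags1);
 *     ret = bdrv_reopen_multiple(queue, &local_err); // consumes the queue
 */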
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  bdrv_get_device_name(reopen_state->bs));
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs, NULL);
}
/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;

    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            BlockDriverState *backing_hd = bs->backing_hd;
            bdrv_set_backing_hd(bs, NULL);
            bdrv_unref(backing_hd);
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;
        QDECREF(bs->full_open_options);
        bs->full_open_options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    if (bs->blk) {
        blk_dev_change_media_cb(bs->blk, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
}
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_close(bs);
        aio_context_release(aio_context);
    }
}
/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}
static bool bdrv_drain_one(BlockDriverState *bs)
{
    bool bs_busy;

    bdrv_flush_io_queue(bs);
    bdrv_start_throttled_reqs(bs);
    bs_busy = bdrv_requests_pending(bs);
    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
    return bs_busy;
}
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree
 *
 * See the warning in bdrv_drain_all(). This function can only be called if
 * you are sure nothing can generate I/O because you have op blockers
 * installed.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void bdrv_drain(BlockDriverState *bs)
{
    while (bdrv_drain_one(bs)) {
        /* Keep iterating */
    }
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        busy = false;

        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            AioContext *aio_context = bdrv_get_aio_context(bs);

            aio_context_acquire(aio_context);
            busy |= bdrv_drain_one(bs);
            aio_context_release(aio_context);
        }
    }
}
/* Make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists. Also, NUL-terminate the node_name to prevent a
 * double remove. */
void bdrv_make_anon(BlockDriverState *bs)
{
    /*
     * Take care to remove bs from bdrv_states only when it's actually
     * in it.  Note that bs->device_list.tqe_prev is initially null,
     * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
     * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
     * resetting it to null on remove.
     */
    if (bs->device_list.tqe_prev) {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
        bs->device_list.tqe_prev = NULL;
    }
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */

    /* dev info */
    bs_dest->guest_block_size   = bs_src->guest_block_size;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt             = bs_src->refcnt;

    /* job */
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    bs_dest->device_list = bs_src->device_list;
    bs_dest->blk = bs_src->blk;

    memcpy(bs_dest->op_blockers, bs_src->op_blockers,
           sizeof(bs_dest->op_blockers));
}
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
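
/*
 * Illustrative sketch of the effect (cf. bdrv_append_temp_snapshot() above):
 * given a chain "backing <- top", bdrv_append(overlay, top) swaps the two
 * states and wires the old top contents in as the overlay's backing file,
 * so the device keeps its BlockDriverState pointer but now sees
 * "backing <- old top <- overlay".
 */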
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        if (bs->drv && bs->backing_hd) {
            int ret = bdrv_commit(bs);
            if (ret < 0) {
                aio_context_release(aio_context);
                return ret;
            }
        }
        aio_context_release(aio_context);
    }
    return 0;
}
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        req->bs->serialising_in_flight--;
    }

    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .is_write       = is_write,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        req->bs->serialising_in_flight++;
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t sector_num, int nb_sectors,
                            int64_t *cluster_sector_num,
                            int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
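
/*
 * Worked example (hypothetical values): with a 64 KiB cluster size
 * (c = 65536 / 512 = 128 sectors), a request covering sectors [100, 300)
 * (sector_num = 100, nb_sectors = 200) is widened to the cluster-aligned
 * range [0, 384): cluster_sector_num = 0, cluster_nb_sectors = 384.
 */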
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->request_alignment;
    } else {
        return bdi.cluster_size;
    }
}
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
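
/*
 * Illustrative cases (byte ranges given as [offset, offset + bytes)):
 *
 *     tracked [0, 512), probed [512, 1024) -> no overlap (merely adjacent)
 *     tracked [0, 512), probed [256, 768)  -> overlap
 */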
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format),
                backing_fmt ?: "");
    }
    return ret;
}
/*
 * Finds the image layer in the chain that has 'bs' as its backing file.
 *
 * active is the current topmost image.
 *
 * Returns NULL if bs is not found in active's image chain,
 * or if active == bs.
 *
 * Returns the bottommost base image if bs == NULL.
 */
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs)
{
    while (active && bs != active->backing_hd) {
        active = active->backing_hd;
    }

    return active;
}

/* Given a BDS, searches for the base layer. */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    return bdrv_find_overlay(bs, NULL);
}
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
} BlkIntermediateStates;
2524 * Drops images above 'base' up to and including 'top', and sets the image
2525 * above 'top' to have base as its backing file.
2527 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2528 * information in that overlay can be properly updated.
2530 * E.g., this will convert the following chain:
2531 * bottom <- base <- intermediate <- top <- active
2535 * bottom <- base <- active
2537 * It is allowed for bottom==base, in which case it converts:
2539 * base <- intermediate <- top <- active
*
* to
*
* base <- active
2545 * If backing_file_str is non-NULL, it will be used when modifying top's
2546 * overlay image metadata.
2549 * if active == top, that is considered an error
2552 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2553 BlockDriverState *base, const char *backing_file_str)
2555 BlockDriverState *intermediate;
2556 BlockDriverState *base_bs = NULL;
2557 BlockDriverState *new_top_bs = NULL;
2558 BlkIntermediateStates *intermediate_state, *next;
2561 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2562 QSIMPLEQ_INIT(&states_to_delete);
2564 if (!top->drv || !base->drv) {
2568 new_top_bs = bdrv_find_overlay(active, top);
2570 if (new_top_bs == NULL) {
2571 /* we could not find the image above 'top', this is an error */
2575 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2576 * to do, no intermediate images */
2577 if (new_top_bs->backing_hd == base) {
2584 /* now we will go down through the list, and add each BDS we find
2585 * into our deletion queue, until we hit the 'base'
2587 while (intermediate) {
2588 intermediate_state = g_new0(BlkIntermediateStates, 1);
2589 intermediate_state->bs = intermediate;
2590 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2592 if (intermediate->backing_hd == base) {
2593 base_bs = intermediate->backing_hd;
2596 intermediate = intermediate->backing_hd;
2598 if (base_bs == NULL) {
2599 /* Something went wrong: we did not end at the base. Safely
2600 * unravel everything, and exit with an error */
2604 /* success - we can delete the intermediate states, and link top->base */
2605 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2606 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2607 base_bs->drv ? base_bs->drv->format_name : "");
2611 bdrv_set_backing_hd(new_top_bs, base_bs);
2613 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2614 /* so that bdrv_close() does not recursively close the chain */
2615 bdrv_set_backing_hd(intermediate_state->bs, NULL);
2616 bdrv_unref(intermediate_state->bs);
2621 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2622 g_free(intermediate_state);
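/*
 * Hedged usage sketch (hypothetical variable names): collapsing everything
 * between the active layer and its base after a block-commit, leaving the
 * chain "base <- active":
 *
 *     ret = bdrv_drop_intermediate(active_bs, top_bs, base_bs, NULL);
 *
 * Passing NULL for backing_file_str falls back to base_bs->filename when
 * the overlay's metadata is rewritten, as the code above shows.
 */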
2628 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2633 if (size > INT_MAX) {
2637 if (!bdrv_is_inserted(bs))
2643 len = bdrv_getlength(bs);
2648 if ((offset > len) || (len - offset < size))
2654 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2657 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2661 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2662 nb_sectors * BDRV_SECTOR_SIZE);
2665 typedef struct RwCo {
2666 BlockDriverState *bs;
2671 BdrvRequestFlags flags;
2674 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2676 RwCo *rwco = opaque;
2678 if (!rwco->is_write) {
2679 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2680 rwco->qiov->size, rwco->qiov,
2683 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2684 rwco->qiov->size, rwco->qiov,
2690 * Process a vectored synchronous request using coroutines
2692 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2693 QEMUIOVector *qiov, bool is_write,
2694 BdrvRequestFlags flags)
2701 .is_write = is_write,
2707 * In a synchronous call context the vcpu is blocked, so the throttling
2708 * timer cannot fire; the I/O throttling function therefore has to be
2709 * disabled here if it has been enabled.
2711 if (bs->io_limits_enabled) {
2712 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2713 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2714 bdrv_io_limits_disable(bs);
2717 if (qemu_in_coroutine()) {
2718 /* Fast-path if already in coroutine context */
2719 bdrv_rw_co_entry(&rwco);
2721 AioContext *aio_context = bdrv_get_aio_context(bs);
2723 co = qemu_coroutine_create(bdrv_rw_co_entry);
2724 qemu_coroutine_enter(co, &rwco);
2725 while (rwco.ret == NOT_DONE) {
2726 aio_poll(aio_context, true);
2733 * Process a synchronous request using coroutines
2735 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2736 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2739 struct iovec iov = {
2740 .iov_base = (void *)buf,
2741 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2744 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2748 qemu_iovec_init_external(&qiov, &iov, 1);
2749 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2750 &qiov, is_write, flags);
2753 /* return < 0 if error. See bdrv_write() for the return codes */
2754 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2755 uint8_t *buf, int nb_sectors)
2757 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2760 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2761 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2762 uint8_t *buf, int nb_sectors)
2767 enabled = bs->io_limits_enabled;
2768 bs->io_limits_enabled = false;
2769 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2770 bs->io_limits_enabled = enabled;
2774 /* Return < 0 if error. Important errors are:
2775 -EIO generic I/O error (may happen for all errors)
2776 -ENOMEDIUM No media inserted.
2777 -EINVAL Invalid sector number or nb_sectors
2778 -EACCES Trying to write a read-only device
2780 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2781 const uint8_t *buf, int nb_sectors)
2783 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2786 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2787 int nb_sectors, BdrvRequestFlags flags)
2789 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2790 BDRV_REQ_ZERO_WRITE | flags);
2794 * Completely zero out a block device with the help of bdrv_write_zeroes.
2795 * The operation is sped up by checking the block status and only writing
2796 * zeroes to the device if they currently do not return zeroes. Optional
2797 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2799 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2801 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2803 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2806 target_sectors = bdrv_nb_sectors(bs);
2807 if (target_sectors < 0) {
2808 return target_sectors;
2812 nb_sectors = target_sectors - sector_num;
2813 if (nb_sectors <= 0) {
2816 if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2817 nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2819 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2821 error_report("error getting block status at sector %" PRId64 ": %s",
2822 sector_num, strerror(-ret));
2825 if (ret & BDRV_BLOCK_ZERO) {
2829 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2831 error_report("error writing zeroes at sector %" PRId64 ": %s",
2832 sector_num, strerror(-ret));
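/*
 * Hedged usage sketch: zero a whole image, allowing the driver to punch
 * holes where it can:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *
 * The block-status check above skips regions that already read as zeroes,
 * so only the remaining regions are rewritten.
 */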
2839 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2842 struct iovec iov = {
2843 .iov_base = (void *)buf,
2852 qemu_iovec_init_external(&qiov, &iov, 1);
2853 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2861 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2865 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2873 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2874 const void *buf, int bytes)
2877 struct iovec iov = {
2878 .iov_base = (void *) buf,
2886 qemu_iovec_init_external(&qiov, &iov, 1);
2887 return bdrv_pwritev(bs, offset, &qiov);
2891 * Writes to the file and ensures that no writes are reordered across this
2892 * request (acts as a barrier)
2894 * Returns 0 on success, -errno in error cases.
2896 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2897 const void *buf, int count)
2901 ret = bdrv_pwrite(bs, offset, buf, count);
2906 /* No flush needed for cache modes that already do it */
2907 if (bs->enable_write_cache) {
2914 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2915 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2917 /* Perform I/O through a temporary buffer so that users who scribble over
2918 * their read buffer while the operation is in progress do not end up
2919 * modifying the image file. This is critical for zero-copy guest I/O
2920 * where anything might happen inside guest memory.
2922 void *bounce_buffer;
2924 BlockDriver *drv = bs->drv;
2926 QEMUIOVector bounce_qiov;
2927 int64_t cluster_sector_num;
2928 int cluster_nb_sectors;
2932 /* Cover entire cluster so no additional backing file I/O is required when
2933 * allocating cluster in the image file.
2935 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2936 &cluster_sector_num, &cluster_nb_sectors);
2938 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2939 cluster_sector_num, cluster_nb_sectors);
2941 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2942 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2943 if (bounce_buffer == NULL) {
2948 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2950 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2956 if (drv->bdrv_co_write_zeroes &&
2957 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2958 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2959 cluster_nb_sectors, 0);
2961 /* This does not change the data on the disk, so it is not necessary
2962 * to flush even in cache=writethrough mode.
2964 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2969 /* It might be okay to ignore write errors for guest requests. If this
2970 * is a deliberate copy-on-read then we don't want to ignore the error.
2971 * Simply report it in all cases.
2976 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2977 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2978 nb_sectors * BDRV_SECTOR_SIZE);
2981 qemu_vfree(bounce_buffer);
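/*
 * Worked example for the copy-on-read path above (illustrative numbers):
 * with 128-sector clusters, a guest read of sectors [130, 140) is widened
 * to the cluster range [128, 256). The bounce buffer holds the whole range,
 * skip_bytes = (130 - 128) * BDRV_SECTOR_SIZE = 1024, and only the 10
 * requested sectors are copied back into the guest's qiov.
 */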
2986 * Forwards an already correctly aligned request to the BlockDriver. This
2987 * handles copy on read and zeroing after EOF; any other features must be
2988 * implemented by the caller.
2990 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2991 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2992 int64_t align, QEMUIOVector *qiov, int flags)
2994 BlockDriver *drv = bs->drv;
2997 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2998 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3000 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3001 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3002 assert(!qiov || bytes == qiov->size);
3004 /* Handle Copy on Read and associated serialisation */
3005 if (flags & BDRV_REQ_COPY_ON_READ) {
3006 /* If we touch the same cluster it counts as an overlap. This
3007 * guarantees that allocating writes will be serialized and not race
3008 * with each other for the same cluster. For example, in copy-on-read
3009 * it ensures that the CoR read and write operations are atomic and
3010 * guest writes cannot interleave between them. */
3011 mark_request_serialising(req, bdrv_get_cluster_size(bs));
3014 wait_serialising_requests(req);
3016 if (flags & BDRV_REQ_COPY_ON_READ) {
3019 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3024 if (!ret || pnum != nb_sectors) {
3025 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3030 /* Forward the request to the BlockDriver */
3031 if (!(bs->zero_beyond_eof && bs->growable)) {
3032 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3034 /* Read zeroes after EOF of growable BDSes */
3035 int64_t total_sectors, max_nb_sectors;
3037 total_sectors = bdrv_nb_sectors(bs);
3038 if (total_sectors < 0) {
3039 ret = total_sectors;
3043 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3044 align >> BDRV_SECTOR_BITS);
3045 if (nb_sectors < max_nb_sectors) {
3046 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3047 } else if (max_nb_sectors > 0) {
3048 QEMUIOVector local_qiov;
3050 qemu_iovec_init(&local_qiov, qiov->niov);
3051 qemu_iovec_concat(&local_qiov, qiov, 0,
3052 max_nb_sectors * BDRV_SECTOR_SIZE);
3054 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
3057 qemu_iovec_destroy(&local_qiov);
3062 /* Reading beyond end of file is supposed to produce zeroes */
3063 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3064 uint64_t offset = MAX(0, total_sectors - sector_num);
3065 uint64_t bytes = (sector_num + nb_sectors - offset) *
BDRV_SECTOR_SIZE;
3067 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3076 * Handle a read request in coroutine context
3078 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3079 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3080 BdrvRequestFlags flags)
3082 BlockDriver *drv = bs->drv;
3083 BdrvTrackedRequest req;
3085 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3086 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3087 uint8_t *head_buf = NULL;
3088 uint8_t *tail_buf = NULL;
3089 QEMUIOVector local_qiov;
3090 bool use_local_qiov = false;
3096 if (bdrv_check_byte_request(bs, offset, bytes)) {
3100 if (bs->copy_on_read) {
3101 flags |= BDRV_REQ_COPY_ON_READ;
3104 /* throttling disk I/O */
3105 if (bs->io_limits_enabled) {
3106 bdrv_io_limits_intercept(bs, bytes, false);
3109 /* Align read if necessary by padding qiov */
3110 if (offset & (align - 1)) {
3111 head_buf = qemu_blockalign(bs, align);
3112 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3113 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3114 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3115 use_local_qiov = true;
3117 bytes += offset & (align - 1);
3118 offset = offset & ~(align - 1);
3121 if ((offset + bytes) & (align - 1)) {
3122 if (!use_local_qiov) {
3123 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3124 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3125 use_local_qiov = true;
3127 tail_buf = qemu_blockalign(bs, align);
3128 qemu_iovec_add(&local_qiov, tail_buf,
3129 align - ((offset + bytes) & (align - 1)));
3131 bytes = ROUND_UP(bytes, align);
3134 tracked_request_begin(&req, bs, offset, bytes, false);
3135 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3136 use_local_qiov ? &local_qiov : qiov,
3138 tracked_request_end(&req);
3140 if (use_local_qiov) {
3141 qemu_iovec_destroy(&local_qiov);
3142 qemu_vfree(head_buf);
3143 qemu_vfree(tail_buf);
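/*
 * Worked example for the padding above (illustrative numbers): with a
 * 512-byte alignment, a read of 1000 bytes at offset 300 grows a 300-byte
 * head (offset becomes 0, bytes 1300) and then a 236-byte tail, so the
 * driver sees one aligned request of 1536 bytes while the caller's qiov
 * only receives the 1000 bytes it asked for.
 */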
3149 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3150 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3151 BdrvRequestFlags flags)
3153 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3157 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3158 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3161 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3162 int nb_sectors, QEMUIOVector *qiov)
3164 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3166 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3169 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3170 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3172 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3174 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3175 BDRV_REQ_COPY_ON_READ);
3178 /* If no limit is specified in the BlockLimits, use a default
3179 * of 32768 512-byte sectors (16 MiB) per request.
3181 #define MAX_WRITE_ZEROES_DEFAULT 32768
3183 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3184 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3186 BlockDriver *drv = bs->drv;
3188 struct iovec iov = {0};
3191 int max_write_zeroes = bs->bl.max_write_zeroes ?
3192 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3194 while (nb_sectors > 0 && !ret) {
3195 int num = nb_sectors;
3197 /* Align request. Block drivers can expect the "bulk" of the request
* to be aligned.
*/
3200 if (bs->bl.write_zeroes_alignment
3201 && num > bs->bl.write_zeroes_alignment) {
3202 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3203 /* Make a small request up to the first aligned sector. */
3204 num = bs->bl.write_zeroes_alignment;
3205 num -= sector_num % bs->bl.write_zeroes_alignment;
3206 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3207 /* Shorten the request to the last aligned sector. num cannot
3208 * underflow because num > bs->bl.write_zeroes_alignment.
3210 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3214 /* limit request size */
3215 if (num > max_write_zeroes) {
3216 num = max_write_zeroes;
3220 /* First try the efficient write zeroes operation */
3221 if (drv->bdrv_co_write_zeroes) {
3222 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3225 if (ret == -ENOTSUP) {
3226 /* Fall back to bounce buffer if write zeroes is unsupported */
3227 iov.iov_len = num * BDRV_SECTOR_SIZE;
3228 if (iov.iov_base == NULL) {
3229 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3230 if (iov.iov_base == NULL) {
3234 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3236 qemu_iovec_init_external(&qiov, &iov, 1);
3238 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3240 /* Keep bounce buffer around if it is big enough for all
3241 * future requests.
3243 if (num < max_write_zeroes) {
3244 qemu_vfree(iov.iov_base);
3245 iov.iov_base = NULL;
3254 qemu_vfree(iov.iov_base);
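/*
 * Worked example for the alignment logic above (illustrative numbers):
 * with write_zeroes_alignment = 128 sectors, zeroing [100, 500) is split
 * into a 28-sector head up to sector 128, a 256-sector aligned middle,
 * and a 116-sector tail, so the driver sees the bulk of the request on
 * its preferred boundaries.
 */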
3259 * Forwards an already correctly aligned write request to the BlockDriver.
3261 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3262 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3263 QEMUIOVector *qiov, int flags)
3265 BlockDriver *drv = bs->drv;
3269 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3270 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3272 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3273 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3274 assert(!qiov || bytes == qiov->size);
3276 waited = wait_serialising_requests(req);
3277 assert(!waited || !req->serialising);
3278 assert(req->overlap_offset <= offset);
3279 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3281 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3283 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3284 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3285 qemu_iovec_is_zero(qiov)) {
3286 flags |= BDRV_REQ_ZERO_WRITE;
3287 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3288 flags |= BDRV_REQ_MAY_UNMAP;
3293 /* Do nothing, write notifier decided to fail this request */
3294 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3295 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3296 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3298 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3299 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3301 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3303 if (ret == 0 && !bs->enable_write_cache) {
3304 ret = bdrv_co_flush(bs);
3307 bdrv_set_dirty(bs, sector_num, nb_sectors);
3309 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3311 if (bs->growable && ret >= 0) {
3312 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3319 * Handle a write request in coroutine context
3321 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3322 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3323 BdrvRequestFlags flags)
3325 BdrvTrackedRequest req;
3326 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3327 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3328 uint8_t *head_buf = NULL;
3329 uint8_t *tail_buf = NULL;
3330 QEMUIOVector local_qiov;
3331 bool use_local_qiov = false;
3337 if (bs->read_only) {
3340 if (bdrv_check_byte_request(bs, offset, bytes)) {
3344 /* throttling disk I/O */
3345 if (bs->io_limits_enabled) {
3346 bdrv_io_limits_intercept(bs, bytes, true);
3350 * Align write if necessary by performing a read-modify-write cycle.
3351 * Pad qiov with the read parts and be sure to have a tracked request not
3352 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3354 tracked_request_begin(&req, bs, offset, bytes, true);
3356 if (offset & (align - 1)) {
3357 QEMUIOVector head_qiov;
3358 struct iovec head_iov;
3360 mark_request_serialising(&req, align);
3361 wait_serialising_requests(&req);
3363 head_buf = qemu_blockalign(bs, align);
3364 head_iov = (struct iovec) {
3365 .iov_base = head_buf,
3368 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3370 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3371 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3372 align, &head_qiov, 0);
3376 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3378 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3379 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3380 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3381 use_local_qiov = true;
3383 bytes += offset & (align - 1);
3384 offset = offset & ~(align - 1);
3387 if ((offset + bytes) & (align - 1)) {
3388 QEMUIOVector tail_qiov;
3389 struct iovec tail_iov;
3393 mark_request_serialising(&req, align);
3394 waited = wait_serialising_requests(&req);
3395 assert(!waited || !use_local_qiov);
3397 tail_buf = qemu_blockalign(bs, align);
3398 tail_iov = (struct iovec) {
3399 .iov_base = tail_buf,
3402 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3404 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3405 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3406 align, &tail_qiov, 0);
3410 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3412 if (!use_local_qiov) {
3413 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3414 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3415 use_local_qiov = true;
3418 tail_bytes = (offset + bytes) & (align - 1);
3419 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3421 bytes = ROUND_UP(bytes, align);
3424 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3425 use_local_qiov ? &local_qiov : qiov,
3429 tracked_request_end(&req);
3431 if (use_local_qiov) {
3432 qemu_iovec_destroy(&local_qiov);
3434 qemu_vfree(head_buf);
3435 qemu_vfree(tail_buf);
3440 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3441 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3442 BdrvRequestFlags flags)
3444 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3448 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3449 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3452 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3453 int nb_sectors, QEMUIOVector *qiov)
3455 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3457 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3460 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3461 int64_t sector_num, int nb_sectors,
3462 BdrvRequestFlags flags)
3464 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3466 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3467 flags &= ~BDRV_REQ_MAY_UNMAP;
3470 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3471 BDRV_REQ_ZERO_WRITE | flags);
3475 * Truncate file to 'offset' bytes (needed only for file protocols)
3477 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3479 BlockDriver *drv = bs->drv;
3483 if (!drv->bdrv_truncate)
3488 ret = drv->bdrv_truncate(bs, offset);
3490 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3492 blk_dev_resize_cb(bs->blk);
3499 * Length of an allocated file in bytes. Sparse files are counted by their actual
3500 * allocated space. Return < 0 if error or unknown.
3502 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3504 BlockDriver *drv = bs->drv;
3508 if (drv->bdrv_get_allocated_file_size) {
3509 return drv->bdrv_get_allocated_file_size(bs);
3512 return bdrv_get_allocated_file_size(bs->file);
3518 * Return number of sectors on success, -errno on error.
3520 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3522 BlockDriver *drv = bs->drv;
3527 if (drv->has_variable_length) {
3528 int ret = refresh_total_sectors(bs, bs->total_sectors);
3533 return bs->total_sectors;
3537 * Return length in bytes on success, -errno on error.
3538 * The length is always a multiple of BDRV_SECTOR_SIZE.
3540 int64_t bdrv_getlength(BlockDriverState *bs)
3542 int64_t ret = bdrv_nb_sectors(bs);
3544 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3547 /* Return 0 as the number of sectors if no device is present or on error */
3548 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3550 int64_t nb_sectors = bdrv_nb_sectors(bs);
3552 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3555 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3556 BlockdevOnError on_write_error)
3558 bs->on_read_error = on_read_error;
3559 bs->on_write_error = on_write_error;
3562 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3564 return is_read ? bs->on_read_error : bs->on_write_error;
3567 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3569 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3572 case BLOCKDEV_ON_ERROR_ENOSPC:
3573 return (error == ENOSPC) ?
3574 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3575 case BLOCKDEV_ON_ERROR_STOP:
3576 return BLOCK_ERROR_ACTION_STOP;
3577 case BLOCKDEV_ON_ERROR_REPORT:
3578 return BLOCK_ERROR_ACTION_REPORT;
3579 case BLOCKDEV_ON_ERROR_IGNORE:
3580 return BLOCK_ERROR_ACTION_IGNORE;
3586 static void send_qmp_error_event(BlockDriverState *bs,
3587 BlockErrorAction action,
3588 bool is_read, int error)
3590 IoOperationType optype;
3592 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3593 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3594 bdrv_iostatus_is_enabled(bs),
3595 error == ENOSPC, strerror(error),
3599 /* This is done by device models because, while the block layer knows
3600 * about the error, it does not know whether an operation comes from
3601 * the device or the block layer (from a job, for example).
3603 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3604 bool is_read, int error)
3608 if (action == BLOCK_ERROR_ACTION_STOP) {
3609 /* First set the iostatus, so that "info block" returns an iostatus
3610 * that matches the events raised so far (an additional error iostatus
3611 * is fine, but not a lost one).
3613 bdrv_iostatus_set_err(bs, error);
3615 /* Then raise the request to stop the VM and the event.
3616 * qemu_system_vmstop_request_prepare has two effects. First,
3617 * it ensures that the STOP event always comes after the
3618 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3619 * can observe the STOP event and do a "cont" before the STOP
3620 * event is issued, the VM will not stop. In this case, vm_start()
3621 * also ensures that the STOP/RESUME pair of events is emitted.
3623 qemu_system_vmstop_request_prepare();
3624 send_qmp_error_event(bs, action, is_read, error);
3625 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3627 send_qmp_error_event(bs, action, is_read, error);
3631 int bdrv_is_read_only(BlockDriverState *bs)
3633 return bs->read_only;
3636 int bdrv_is_sg(BlockDriverState *bs)
3641 int bdrv_enable_write_cache(BlockDriverState *bs)
3643 return bs->enable_write_cache;
3646 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3648 bs->enable_write_cache = wce;
3650 /* so a reopen() will preserve wce */
3652 bs->open_flags |= BDRV_O_CACHE_WB;
3654 bs->open_flags &= ~BDRV_O_CACHE_WB;
3658 int bdrv_is_encrypted(BlockDriverState *bs)
3660 if (bs->backing_hd && bs->backing_hd->encrypted)
3662 return bs->encrypted;
3665 int bdrv_key_required(BlockDriverState *bs)
3667 BlockDriverState *backing_hd = bs->backing_hd;
3669 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3671 return (bs->encrypted && !bs->valid_key);
3674 int bdrv_set_key(BlockDriverState *bs, const char *key)
3677 if (bs->backing_hd && bs->backing_hd->encrypted) {
3678 ret = bdrv_set_key(bs->backing_hd, key);
3684 if (!bs->encrypted) {
3686 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3689 ret = bs->drv->bdrv_set_key(bs, key);
3692 } else if (!bs->valid_key) {
3695 /* call the change callback now, we skipped it on open */
3696 blk_dev_change_media_cb(bs->blk, true);
3702 const char *bdrv_get_format_name(BlockDriverState *bs)
3704 return bs->drv ? bs->drv->format_name : NULL;
3707 static int qsort_strcmp(const void *a, const void *b)
3709 return strcmp(a, b);
3712 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3718 const char **formats = NULL;
3720 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3721 if (drv->format_name) {
3724 while (formats && i && !found) {
3725 found = !strcmp(formats[--i], drv->format_name);
3729 formats = g_renew(const char *, formats, count + 1);
3730 formats[count++] = drv->format_name;
3735 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3737 for (i = 0; i < count; i++) {
3738 it(opaque, formats[i]);
3744 /* This function finds the BlockDriverState of a block backend by device name */
3745 /* TODO convert callers to blk_by_name(), then remove */
3746 BlockDriverState *bdrv_find(const char *name)
3748 BlockBackend *blk = blk_by_name(name);
3750 return blk ? blk_bs(blk) : NULL;
3753 /* This function finds a node in the BDS graph by its node name */
3754 BlockDriverState *bdrv_find_node(const char *node_name)
3756 BlockDriverState *bs;
3760 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3761 if (!strcmp(node_name, bs->node_name)) {
3768 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3769 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3771 BlockDeviceInfoList *list, *entry;
3772 BlockDriverState *bs;
3775 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3776 entry = g_malloc0(sizeof(*entry));
3777 entry->value = bdrv_block_device_info(bs);
3785 BlockDriverState *bdrv_lookup_bs(const char *device,
3786 const char *node_name,
3790 BlockDriverState *bs;
3793 blk = blk_by_name(device);
3801 bs = bdrv_find_node(node_name);
3808 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3809 device ? device : "",
3810 node_name ? node_name : "");
3814 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3815 * return false. If either argument is NULL, return false. */
3816 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3818 while (top && top != base) {
3819 top = top->backing_hd;
3825 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3828 return QTAILQ_FIRST(&graph_bdrv_states);
3830 return QTAILQ_NEXT(bs, node_list);
3833 BlockDriverState *bdrv_next(BlockDriverState *bs)
3836 return QTAILQ_FIRST(&bdrv_states);
3838 return QTAILQ_NEXT(bs, device_list);
3841 const char *bdrv_get_node_name(const BlockDriverState *bs)
3843 return bs->node_name;
3846 /* TODO check what callers really want: bs->node_name or blk_name() */
3847 const char *bdrv_get_device_name(const BlockDriverState *bs)
3849 return bs->blk ? blk_name(bs->blk) : "";
3852 int bdrv_get_flags(BlockDriverState *bs)
3854 return bs->open_flags;
3857 int bdrv_flush_all(void)
3859 BlockDriverState *bs;
3862 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3863 AioContext *aio_context = bdrv_get_aio_context(bs);
3866 aio_context_acquire(aio_context);
3867 ret = bdrv_flush(bs);
3868 if (ret < 0 && !result) {
3871 aio_context_release(aio_context);
3877 int bdrv_has_zero_init_1(BlockDriverState *bs)
3882 int bdrv_has_zero_init(BlockDriverState *bs)
3886 /* If BS is a copy on write image, it is initialized to
3887 the contents of the base image, which may not be zeroes. */
3888 if (bs->backing_hd) {
3891 if (bs->drv->bdrv_has_zero_init) {
3892 return bs->drv->bdrv_has_zero_init(bs);
3899 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3901 BlockDriverInfo bdi;
3903 if (bs->backing_hd) {
3907 if (bdrv_get_info(bs, &bdi) == 0) {
3908 return bdi.unallocated_blocks_are_zero;
3914 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3916 BlockDriverInfo bdi;
3918 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3922 if (bdrv_get_info(bs, &bdi) == 0) {
3923 return bdi.can_write_zeroes_with_unmap;
3929 typedef struct BdrvCoGetBlockStatusData {
3930 BlockDriverState *bs;
3931 BlockDriverState *base;
3937 } BdrvCoGetBlockStatusData;
3940 * Returns the allocation status of the specified sectors.
3941 * Drivers not implementing the functionality are assumed to not support
3942 * backing files, hence all their sectors are reported as allocated.
3944 * If 'sector_num' is beyond the end of the disk image the return value is 0
3945 * and 'pnum' is set to 0.
3947 * 'pnum' is set to the number of sectors (including and immediately following
3948 * the specified sector) that are known to be in the same
3949 * allocated/unallocated state.
3951 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3952 * beyond the end of the disk image it will be clamped.
3954 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3956 int nb_sectors, int *pnum)
3958 int64_t total_sectors;
3962 total_sectors = bdrv_nb_sectors(bs);
3963 if (total_sectors < 0) {
3964 return total_sectors;
3967 if (sector_num >= total_sectors) {
3972 n = total_sectors - sector_num;
3973 if (n < nb_sectors) {
3977 if (!bs->drv->bdrv_co_get_block_status) {
3979 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3980 if (bs->drv->protocol_name) {
3981 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3986 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3992 if (ret & BDRV_BLOCK_RAW) {
3993 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3994 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3998 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3999 ret |= BDRV_BLOCK_ALLOCATED;
4002 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4003 if (bdrv_unallocated_blocks_are_zero(bs)) {
4004 ret |= BDRV_BLOCK_ZERO;
4005 } else if (bs->backing_hd) {
4006 BlockDriverState *bs2 = bs->backing_hd;
4007 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4008 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4009 ret |= BDRV_BLOCK_ZERO;
4015 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4016 (ret & BDRV_BLOCK_OFFSET_VALID)) {
4019 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4022 /* Ignore errors. This is just providing extra information; it
4023 * is useful but not necessary.
4026 /* !file_pnum indicates an offset at or beyond the EOF; it is
4027 * perfectly valid for the format block driver to point to such
4028 * offsets, so catch it and mark everything as zero */
4029 ret |= BDRV_BLOCK_ZERO;
4031 /* Limit request to the range reported by the protocol driver */
4033 ret |= (ret2 & BDRV_BLOCK_ZERO);
4041 /* Coroutine wrapper for bdrv_get_block_status() */
4042 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4044 BdrvCoGetBlockStatusData *data = opaque;
4045 BlockDriverState *bs = data->bs;
4047 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4053 * Synchronous wrapper around bdrv_co_get_block_status().
4055 * See bdrv_co_get_block_status() for details.
4057 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4058 int nb_sectors, int *pnum)
4061 BdrvCoGetBlockStatusData data = {
4063 .sector_num = sector_num,
4064 .nb_sectors = nb_sectors,
4069 if (qemu_in_coroutine()) {
4070 /* Fast-path if already in coroutine context */
4071 bdrv_get_block_status_co_entry(&data);
4073 AioContext *aio_context = bdrv_get_aio_context(bs);
4075 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4076 qemu_coroutine_enter(co, &data);
4077 while (!data.done) {
4078 aio_poll(aio_context, true);
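/*
 * Hedged usage sketch: querying one range and testing the flag bits that
 * bdrv_get_block_status() ORs into its (non-negative) result:
 *
 *     int pnum;
 *     int64_t st = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (st >= 0 && (st & BDRV_BLOCK_ZERO)) {
 *         ... the first pnum sectors read as zeroes ...
 *     }
 */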
4084 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4085 int nb_sectors, int *pnum)
4087 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4091 return !!(ret & BDRV_BLOCK_ALLOCATED);
4095 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4097 * Return true if the given sector is allocated in any image between
4098 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4099 * sector is allocated in any image of the chain. Return false otherwise.
4101 * 'pnum' is set to the number of sectors (including and immediately following
4102 * the specified sector) that are known to be in the same
4103 * allocated/unallocated state.
4106 int bdrv_is_allocated_above(BlockDriverState *top,
4107 BlockDriverState *base,
4109 int nb_sectors, int *pnum)
4111 BlockDriverState *intermediate;
4112 int ret, n = nb_sectors;
4115 while (intermediate && intermediate != base) {
4117 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4127 * [sector_num, nb_sectors] is unallocated on top but intermediate
* might have
4130 * [sector_num+x, nr_sectors] allocated.
4132 if (n > pnum_inter &&
4133 (intermediate == top ||
4134 sector_num + pnum_inter < intermediate->total_sectors)) {
4138 intermediate = intermediate->backing_hd;
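/*
 * Worked example (illustrative): for the chain base <- mid <- top, a query
 * for [0, 64) walks top first and returns true as soon as top or mid
 * reports the first sector allocated; *pnum is narrowed at every layer so
 * the result describes a range whose status is uniform across the layers
 * that were actually consulted.
 */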
4145 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4147 if (bs->backing_hd && bs->backing_hd->encrypted)
4148 return bs->backing_file;
4149 else if (bs->encrypted)
4150 return bs->filename;
4155 void bdrv_get_backing_filename(BlockDriverState *bs,
4156 char *filename, int filename_size)
4158 pstrcpy(filename, filename_size, bs->backing_file);
4161 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4162 const uint8_t *buf, int nb_sectors)
4164 BlockDriver *drv = bs->drv;
4167 if (!drv->bdrv_write_compressed)
4169 if (bdrv_check_request(bs, sector_num, nb_sectors))
4172 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4174 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4177 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4179 BlockDriver *drv = bs->drv;
4182 if (!drv->bdrv_get_info)
4184 memset(bdi, 0, sizeof(*bdi));
4185 return drv->bdrv_get_info(bs, bdi);
4188 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4190 BlockDriver *drv = bs->drv;
4191 if (drv && drv->bdrv_get_specific_info) {
4192 return drv->bdrv_get_specific_info(bs);
4197 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4198 int64_t pos, int size)
4201 struct iovec iov = {
4202 .iov_base = (void *) buf,
4206 qemu_iovec_init_external(&qiov, &iov, 1);
4207 return bdrv_writev_vmstate(bs, &qiov, pos);
4210 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4212 BlockDriver *drv = bs->drv;
4216 } else if (drv->bdrv_save_vmstate) {
4217 return drv->bdrv_save_vmstate(bs, qiov, pos);
4218 } else if (bs->file) {
4219 return bdrv_writev_vmstate(bs->file, qiov, pos);
4225 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4226 int64_t pos, int size)
4228 BlockDriver *drv = bs->drv;
4231 if (drv->bdrv_load_vmstate)
4232 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4234 return bdrv_load_vmstate(bs->file, buf, pos, size);
4238 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4240 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4244 bs->drv->bdrv_debug_event(bs, event);
4247 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4250 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4254 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4255 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4261 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4263 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4267 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4268 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4274 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4276 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4280 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4281 return bs->drv->bdrv_debug_resume(bs, tag);
4287 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4289 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4293 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4294 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4300 int bdrv_is_snapshot(BlockDriverState *bs)
4302 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4305 /* backing_file can either be relative, or absolute, or a protocol. If it is
4306 * relative, it must be relative to the chain. So, passing in bs->filename
4307 * from a BDS as backing_file should not be done, as that may be relative to
4308 * the CWD rather than the chain. */
4309 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4310 const char *backing_file)
4312 char *filename_full = NULL;
4313 char *backing_file_full = NULL;
4314 char *filename_tmp = NULL;
4315 int is_protocol = 0;
4316 BlockDriverState *curr_bs = NULL;
4317 BlockDriverState *retval = NULL;
4319 if (!bs || !bs->drv || !backing_file) {
4323 filename_full = g_malloc(PATH_MAX);
4324 backing_file_full = g_malloc(PATH_MAX);
4325 filename_tmp = g_malloc(PATH_MAX);
4327 is_protocol = path_has_protocol(backing_file);
4329 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4331 /* If either of the filename paths is actually a protocol, then
4332 * compare unmodified paths; otherwise make paths relative */
4333 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4334 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4335 retval = curr_bs->backing_hd;
4339 /* If not an absolute filename path, make it relative to the current
4340 * image's filename path */
4341 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4344 /* We are going to compare absolute pathnames */
4345 if (!realpath(filename_tmp, filename_full)) {
4349 /* We need to make sure the backing filename we are comparing against
4350 * is relative to the current image filename (or absolute) */
4351 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4352 curr_bs->backing_file);
4354 if (!realpath(filename_tmp, backing_file_full)) {
4358 if (strcmp(backing_file_full, filename_full) == 0) {
4359 retval = curr_bs->backing_hd;
4365 g_free(filename_full);
4366 g_free(backing_file_full);
4367 g_free(filename_tmp);
4371 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4377 if (!bs->backing_hd) {
4381 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4384 /**************************************************************/
4387 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4388 QEMUIOVector *qiov, int nb_sectors,
4389 BlockCompletionFunc *cb, void *opaque)
4391 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4393 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4397 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4398 QEMUIOVector *qiov, int nb_sectors,
4399 BlockCompletionFunc *cb, void *opaque)
4401 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4403 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4407 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4408 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4409 BlockCompletionFunc *cb, void *opaque)
4411 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4413 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4414 BDRV_REQ_ZERO_WRITE | flags,
4419 typedef struct MultiwriteCB {
4424 BlockCompletionFunc *cb;
4426 QEMUIOVector *free_qiov;
4430 static void multiwrite_user_cb(MultiwriteCB *mcb)
4434 for (i = 0; i < mcb->num_callbacks; i++) {
4435 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4436 if (mcb->callbacks[i].free_qiov) {
4437 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4439 g_free(mcb->callbacks[i].free_qiov);
4443 static void multiwrite_cb(void *opaque, int ret)
4445 MultiwriteCB *mcb = opaque;
4447 trace_multiwrite_cb(mcb, ret);
4449 if (ret < 0 && !mcb->error) {
4453 mcb->num_requests--;
4454 if (mcb->num_requests == 0) {
4455 multiwrite_user_cb(mcb);
4460 static int multiwrite_req_compare(const void *a, const void *b)
4462 const BlockRequest *req1 = a, *req2 = b;
4465 * Note that we can't simply subtract req2->sector from req1->sector
4466 * here as that could overflow the return value.
4468 if (req1->sector > req2->sector) {
4470 } else if (req1->sector < req2->sector) {
4478 * Takes a bunch of requests and tries to merge them. Returns the number of
4479 * requests that remain after merging.
4481 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4482 int num_reqs, MultiwriteCB *mcb)
4486 // Sort requests by start sector
4487 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4489 // Check if adjacent requests touch the same clusters. If so, combine them,
4490 // filling up gaps with zero sectors.
4492 for (i = 1; i < num_reqs; i++) {
4494 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4496 // Handle exactly sequential writes and overlapping writes.
4497 if (reqs[i].sector <= oldreq_last) {
4501 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4505 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4506 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4512 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4513 qemu_iovec_init(qiov,
4514 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4516 // Add the first request to the merged one. If the requests are
4517 // overlapping, drop the last sectors of the first request.
4518 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4519 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4521 // We shouldn't need to add any zeros between the two requests
4522 assert (reqs[i].sector <= oldreq_last);
4524 // Add the second request
4525 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4527 // Add tail of first request, if necessary
4528 if (qiov->size < reqs[outidx].qiov->size) {
4529 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4530 reqs[outidx].qiov->size - qiov->size);
4533 reqs[outidx].nb_sectors = qiov->size >> 9;
4534 reqs[outidx].qiov = qiov;
4536 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4539 reqs[outidx].sector = reqs[i].sector;
4540 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4541 reqs[outidx].qiov = reqs[i].qiov;
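/*
 * Worked example for the merge above (illustrative numbers): two writes for
 * sectors [0, 8) and [4, 12) are sorted, found to touch (4 <= 8) and merged
 * into one request for [0, 12): the first 4 sectors (4 << 9 bytes) come from
 * request 0, all of request 1 follows, and no tail is appended because the
 * merged qiov already covers request 0 entirely.
 */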
4549 * Submit multiple AIO write requests at once.
4551 * On success, the function returns 0 and all requests in the reqs array have
4553 * been submitted. In the error case this function returns -1, and any of the
4554 * requests may or may not have been submitted yet. In particular, this means
4555 * that the callback will be called for some of the requests, for others it
4556 * won't. The caller must check the error field of the BlockRequest to wait for
4557 * the right callbacks (if error != 0, no callback will be called).
4558 * The implementation may modify the contents of the reqs array, e.g. to merge
4559 * requests. However, the fields opaque and error are left unmodified as they
4560 * are used to signal failure for a single request to the caller.
4562 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4567 /* don't submit writes if we don't have a medium */
4568 if (bs->drv == NULL) {
4569 for (i = 0; i < num_reqs; i++) {
4570 reqs[i].error = -ENOMEDIUM;
4575 if (num_reqs == 0) {
4579 // Create MultiwriteCB structure
4580 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4581 mcb->num_requests = 0;
4582 mcb->num_callbacks = num_reqs;
4584 for (i = 0; i < num_reqs; i++) {
4585 mcb->callbacks[i].cb = reqs[i].cb;
4586 mcb->callbacks[i].opaque = reqs[i].opaque;
4589 // Check for mergeable requests
4590 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4592 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4594 /* Run the aio requests. */
4595 mcb->num_requests = num_reqs;
4596 for (i = 0; i < num_reqs; i++) {
4597 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4598 reqs[i].nb_sectors, reqs[i].flags,
4606 void bdrv_aio_cancel(BlockAIOCB *acb)
4609 bdrv_aio_cancel_async(acb);
4610 while (acb->refcnt > 1) {
4611 if (acb->aiocb_info->get_aio_context) {
4612 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4613 } else if (acb->bs) {
4614 aio_poll(bdrv_get_aio_context(acb->bs), true);
4619 qemu_aio_unref(acb);
4622 /* Async version of aio cancel. The caller is not blocked if the acb implements
4623 * cancel_async; otherwise we do nothing and let the request complete normally.
4624 * In either case the completion callback must be called. */
4625 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4627 if (acb->aiocb_info->cancel_async) {
4628 acb->aiocb_info->cancel_async(acb);
4632 /**************************************************************/
4633 /* async block device emulation */
4635 typedef struct BlockAIOCBSync {
4639 /* vector translation state */
4645 static const AIOCBInfo bdrv_em_aiocb_info = {
4646 .aiocb_size = sizeof(BlockAIOCBSync),
4649 static void bdrv_aio_bh_cb(void *opaque)
4651 BlockAIOCBSync *acb = opaque;
4653 if (!acb->is_write && acb->ret >= 0) {
4654 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4656 qemu_vfree(acb->bounce);
4657 acb->common.cb(acb->common.opaque, acb->ret);
4658 qemu_bh_delete(acb->bh);
4660 qemu_aio_unref(acb);
4663 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4667 BlockCompletionFunc *cb,
4672 BlockAIOCBSync *acb;
4674 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4675 acb->is_write = is_write;
4677 acb->bounce = qemu_try_blockalign(bs, qiov->size);
4678 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4680 if (acb->bounce == NULL) {
4682 } else if (is_write) {
4683 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4684 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4686 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4689 qemu_bh_schedule(acb->bh);
4691 return &acb->common;
4694 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4695 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4696 BlockCompletionFunc *cb, void *opaque)
4698 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4701 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4702 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4703 BlockCompletionFunc *cb, void *opaque)
4705 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4709 typedef struct BlockAIOCBCoroutine {
4715 } BlockAIOCBCoroutine;
4717 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4718 .aiocb_size = sizeof(BlockAIOCBCoroutine),
4721 static void bdrv_co_em_bh(void *opaque)
4723 BlockAIOCBCoroutine *acb = opaque;
4725 acb->common.cb(acb->common.opaque, acb->req.error);
4727 qemu_bh_delete(acb->bh);
4728 qemu_aio_unref(acb);
4731 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4732 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4734 BlockAIOCBCoroutine *acb = opaque;
4735 BlockDriverState *bs = acb->common.bs;
4737 if (!acb->is_write) {
4738 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4739 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4741 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4742 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4745 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4746 qemu_bh_schedule(acb->bh);
4749 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4753 BdrvRequestFlags flags,
4754 BlockCompletionFunc *cb,
4759 BlockAIOCBCoroutine *acb;
4761 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4762 acb->req.sector = sector_num;
4763 acb->req.nb_sectors = nb_sectors;
4764 acb->req.qiov = qiov;
4765 acb->req.flags = flags;
4766 acb->is_write = is_write;
4768 co = qemu_coroutine_create(bdrv_co_do_rw);
4769 qemu_coroutine_enter(co, acb);
4771 return &acb->common;
4774 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4776 BlockAIOCBCoroutine *acb = opaque;
4777 BlockDriverState *bs = acb->common.bs;
4779 acb->req.error = bdrv_co_flush(bs);
4780 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4781 qemu_bh_schedule(acb->bh);
4784 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4785 BlockCompletionFunc *cb, void *opaque)
4787 trace_bdrv_aio_flush(bs, opaque);
4790 BlockAIOCBCoroutine *acb;
4792 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4794 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4795 qemu_coroutine_enter(co, acb);
4797 return &acb->common;
4800 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4802 BlockAIOCBCoroutine *acb = opaque;
4803 BlockDriverState *bs = acb->common.bs;
4805 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4806 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4807 qemu_bh_schedule(acb->bh);
4810 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4811 int64_t sector_num, int nb_sectors,
4812 BlockCompletionFunc *cb, void *opaque)
4815 BlockAIOCBCoroutine *acb;
4817 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4819 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4820 acb->req.sector = sector_num;
4821 acb->req.nb_sectors = nb_sectors;
4822 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4823 qemu_coroutine_enter(co, acb);
4825 return &acb->common;
4828 void bdrv_init(void)
4830 module_call_init(MODULE_INIT_BLOCK);
4833 void bdrv_init_with_whitelist(void)
4835 use_bdrv_whitelist = 1;
4839 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4840 BlockCompletionFunc *cb, void *opaque)
4844 acb = g_slice_alloc(aiocb_info->aiocb_size);
4845 acb->aiocb_info = aiocb_info;
4848 acb->opaque = opaque;
4853 void qemu_aio_ref(void *p)
4855 BlockAIOCB *acb = p;
4859 void qemu_aio_unref(void *p)
4861 BlockAIOCB *acb = p;
4862 assert(acb->refcnt > 0);
4863 if (--acb->refcnt == 0) {
4864 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4868 /**************************************************************/
4869 /* Coroutine block device emulation */
4871 typedef struct CoroutineIOCompletion {
4872 Coroutine *coroutine;
4874 } CoroutineIOCompletion;
4876 static void bdrv_co_io_em_complete(void *opaque, int ret)
4878 CoroutineIOCompletion *co = opaque;
4881 qemu_coroutine_enter(co->coroutine, NULL);
4884 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4885 int nb_sectors, QEMUIOVector *iov,
4888 CoroutineIOCompletion co = {
4889 .coroutine = qemu_coroutine_self(),
4894 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4895 bdrv_co_io_em_complete, &co);
4897 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4898 bdrv_co_io_em_complete, &co);
4901 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4905 qemu_coroutine_yield();
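/*
 * A short restatement of the bridge pattern above (illustrative only): the
 * coroutine stores itself in a CoroutineIOCompletion, submits the AIO
 * request with bdrv_co_io_em_complete() as the callback, and yields; the
 * callback records the return value in co.ret and re-enters the coroutine,
 * which resumes after the yield and picks the result up there.
 */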
4910 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4911 int64_t sector_num, int nb_sectors,
4914 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4917 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4918 int64_t sector_num, int nb_sectors,
4921 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4924 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4926 RwCo *rwco = opaque;
4928 rwco->ret = bdrv_co_flush(rwco->bs);
4931 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4935 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4939 /* Write back cached data to the OS even with cache=unsafe */
4940 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4941 if (bs->drv->bdrv_co_flush_to_os) {
4942 ret = bs->drv->bdrv_co_flush_to_os(bs);
4948 /* But don't actually force it to the disk with cache=unsafe */
4949 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4953 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4954 if (bs->drv->bdrv_co_flush_to_disk) {
4955 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4956 } else if (bs->drv->bdrv_aio_flush) {
4958 CoroutineIOCompletion co = {
4959 .coroutine = qemu_coroutine_self(),
4962 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4966 qemu_coroutine_yield();
4971 * Some block drivers always operate in either writethrough or unsafe
4972 * mode, and therefore don't support bdrv_flush. Usually qemu doesn't
4973 * know how the server works (because the behaviour is hardcoded or
4974 * depends on server-side configuration), so we can't ensure that
4975 * everything is safe on disk. Returning an error doesn't work because
4976 * that would break guests even if the server operates in writethrough
4979 * Let's hope the user knows what he's doing.
4987 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4988 * in the case of cache=unsafe, so there are no useless flushes.
4991 return bdrv_co_flush(bs->file);
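/* Flushing is a two-stage affair: bdrv_co_flush_to_os() pushes data out of
 * the format driver's own caches into the host OS (this always happens, even
 * with cache=unsafe), while bdrv_co_flush_to_disk() additionally forces it
 * onto stable storage (skipped when BDRV_O_NO_FLUSH is set).  For a typical
 * qcow2-over-file chain, the recursion at the end means one guest flush
 * becomes, roughly:
 *
 *     bdrv_co_flush(qcow2_bs)   -> qcow2 metadata flushed to the OS
 *       bdrv_co_flush(file_bs)  -> fdatasync() (or similar) on the image file
 */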
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    if (!(bs->open_flags & BDRV_O_INCOMING)) {
        return;
    }
    bs->open_flags &= ~BDRV_O_INCOMING;

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}
void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_invalidate_cache(bs, &local_err);
        aio_context_release(aio_context);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
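/* Synchronous wrappers like bdrv_flush() reuse the coroutine implementation:
 * when called outside coroutine context they spawn a coroutine and then spin
 * in aio_poll() until the coroutine overwrites the NOT_DONE sentinel with
 * the real return value.  This keeps a single code path for the actual I/O
 * logic.
 */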
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    int max_discard;

    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    }

    bdrv_reset_dirty(bs, sector_num, nb_sectors);

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }
    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
        return 0;
    }

    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
    while (nb_sectors > 0) {
        int ret;
        int num = nb_sectors;

        /* align request */
        if (bs->bl.discard_alignment &&
            num >= bs->bl.discard_alignment &&
            sector_num % bs->bl.discard_alignment) {
            if (num > bs->bl.discard_alignment) {
                num = bs->bl.discard_alignment;
            }
            num -= sector_num % bs->bl.discard_alignment;
        }

        /* limit request size */
        if (num > max_discard) {
            num = max_discard;
        }

        if (bs->drv->bdrv_co_discard) {
            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            /* Pass the clamped chunk size 'num', not the full remaining
             * 'nb_sectors', so the alignment and size limits above hold. */
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
                                            bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                return -EIO;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            return ret;
        }

        sector_num += num;
        nb_sectors -= num;
    }
    return 0;
}
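/* Worked example of the splitting above, assuming discard_alignment = 8 and
 * a large max_discard: a discard of sectors [5, 21) first issues a 3-sector
 * request (5..7) to reach the alignment boundary, then the remaining 13
 * sectors (8..20) in one request.  Each chunk is also clamped to max_discard
 * before being handed to the driver.
 */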
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            aio_poll(aio_context, true);
        }
    }

    return rwco.ret;
}
/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}
/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;
    const char *device_name;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    device_name = bdrv_get_device_name(bs);
    if (device_name[0] != '\0') {
        qapi_event_send_device_tray_moved(device_name,
                                          eject_flag, &error_abort);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}
void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}
/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}
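/* Example: with a 512-byte memory alignment requirement, an iovec whose base
 * is 0x1000 and whose length is 4096 passes this check, while a base of
 * 0x1004 (or a length of 4100) fails it; protocol drivers that use O_DIRECT
 * would then typically bounce the request through an aligned buffer.
 */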
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_nb_sectors(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap = g_new0(BdrvDirtyBitmap, 1);
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
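/* Example of the granularity math above: a caller asking for a 64 KiB
 * granularity passes granularity = 65536; shifting by BDRV_SECTOR_BITS (9)
 * yields 128 sectors, and ffs(128) - 1 = 7, so the HBitmap tracks dirtiness
 * with one bit per 2^7 = 128 sectors (64 KiB).
 */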
void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
        BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}
int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
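/* Typical dirty-bitmap lifecycle, e.g. as used by the mirror block job
 * (sketch; error handling omitted):
 *
 *     BdrvDirtyBitmap *bm = bdrv_create_dirty_bitmap(bs, 65536, errp);
 *     ...                                  // guest writes mark sectors dirty
 *     HBitmapIter hbi;
 *     bdrv_dirty_iter_init(bs, bm, &hbi);  // then walk with hbitmap_iter_next()
 *     ...
 *     bdrv_release_dirty_bitmap(bs, bm);
 */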
/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    if (!bs) {
        return;
    }
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}

struct BdrvOpBlocker {
    Error *reason;
    QLIST_ENTRY(BdrvOpBlocker) list;
};
bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
        blocker = QLIST_FIRST(&bs->op_blockers[op]);
        if (errp) {
            error_setg(errp, "Device '%s' is busy: %s",
                       bdrv_get_device_name(bs),
                       error_get_pretty(blocker->reason));
        }
        return true;
    }
    return false;
}

void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);

    blocker = g_new0(BdrvOpBlocker, 1);
    blocker->reason = reason;
    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
}
void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
{
    BdrvOpBlocker *blocker, *next;
    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
        if (blocker->reason == reason) {
            QLIST_REMOVE(blocker, list);
            g_free(blocker);
        }
    }
}

void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_block(bs, i, reason);
    }
}

void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
{
    int i;
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        bdrv_op_unblock(bs, i, reason);
    }
}
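/* Op blockers let a long-running user of a BDS veto conflicting operations.
 * A block job, for instance, might do (sketch):
 *
 *     Error *blocker = NULL;
 *     error_setg(&blocker, "block device is in use by block job");
 *     bdrv_op_block_all(bs, blocker);
 *     ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 *
 * Note that the Error object itself is the key: blocking and unblocking must
 * use the identical pointer.
 */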
bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
{
    int i;

    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
            return false;
        }
    }
    return true;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
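/* The iostatus machinery only distinguishes "out of space" from every other
 * failure: an ENOSPC error surfaces as BLOCK_DEVICE_IO_STATUS_NOSPACE (so
 * management software can grow the underlying storage and resume the guest),
 * while any other errno becomes BLOCK_DEVICE_IO_STATUS_FAILED.
 */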
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    const char *backing_fmt, *backing_file;
    int64_t size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    if (!drv->create_opts) {
        error_setg(errp, "Format driver '%s' does not support image creation",
                   drv->format_name);
        return;
    }

    if (!proto_drv->create_opts) {
        error_setg(errp, "Protocol driver '%s' does not support image creation",
                   proto_drv->format_name);
        return;
    }

    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);

    /* Create parameter list with default values */
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        if (qemu_opts_do_parse(opts, options, NULL) != 0) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
    if (backing_file) {
        if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt) {
        backing_drv = bdrv_find_format(backing_fmt);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there */
    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
    if (size == -1) {
        if (backing_file) {
            BlockDriverState *bs;
            int64_t size;
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                goto out;
            }
            size = bdrv_getlength(bs);
            if (size < 0) {
                error_setg_errno(errp, -size, "Could not get size of '%s'",
                                 backing_file);
                bdrv_unref(bs);
                goto out;
            }

            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s", filename, fmt);
        qemu_opts_print(opts, " ");
        puts("");
    }

    ret = bdrv_create(drv, filename, opts, &local_err);

    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}
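/* bdrv_img_create() is the engine behind "qemu-img create" and the monitor
 * image-creation paths.  A direct call roughly equivalent to
 *
 *     qemu-img create -f qcow2 -o cluster_size=65536 test.qcow2 1G
 *
 * would look like (sketch; error handling elided):
 *
 *     Error *local_err = NULL;
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL,
 *                     (char *)"cluster_size=65536", 1073741824, 0,
 *                     &local_err, false);
 */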
AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    return bs->aio_context;
}

void bdrv_detach_aio_context(BlockDriverState *bs)
{
    BdrvAioNotifier *baf;

    if (!bs->drv) {
        return;
    }

    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
        baf->detach_aio_context(baf->opaque);
    }

    if (bs->io_limits_enabled) {
        throttle_detach_aio_context(&bs->throttle_state);
    }
    if (bs->drv->bdrv_detach_aio_context) {
        bs->drv->bdrv_detach_aio_context(bs);
    }
    if (bs->file) {
        bdrv_detach_aio_context(bs->file);
    }
    if (bs->backing_hd) {
        bdrv_detach_aio_context(bs->backing_hd);
    }

    bs->aio_context = NULL;
}
void bdrv_attach_aio_context(BlockDriverState *bs,
                             AioContext *new_context)
{
    BdrvAioNotifier *ban;

    if (!bs->drv) {
        return;
    }

    bs->aio_context = new_context;

    if (bs->backing_hd) {
        bdrv_attach_aio_context(bs->backing_hd, new_context);
    }
    if (bs->file) {
        bdrv_attach_aio_context(bs->file, new_context);
    }
    if (bs->drv->bdrv_attach_aio_context) {
        bs->drv->bdrv_attach_aio_context(bs, new_context);
    }
    if (bs->io_limits_enabled) {
        throttle_attach_aio_context(&bs->throttle_state, new_context);
    }

    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
        ban->attached_aio_context(new_context, ban->opaque);
    }
}
void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
{
    bdrv_drain_all(); /* ensure there are no in-flight requests */

    bdrv_detach_aio_context(bs);

    /* This function executes in the old AioContext so acquire the new one in
     * case it runs in a different thread.
     */
    aio_context_acquire(new_context);
    bdrv_attach_aio_context(bs, new_context);
    aio_context_release(new_context);
}
void bdrv_add_aio_context_notifier(BlockDriverState *bs,
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
        void (*detach_aio_context)(void *opaque), void *opaque)
{
    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
    *ban = (BdrvAioNotifier){
        .attached_aio_context = attached_aio_context,
        .detach_aio_context   = detach_aio_context,
        .opaque               = opaque
    };

    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
}

void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                      void (*attached_aio_context)(AioContext *,
                                                                   void *),
                                      void (*detach_aio_context)(void *),
                                      void *opaque)
{
    BdrvAioNotifier *ban, *ban_next;

    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        if (ban->attached_aio_context == attached_aio_context &&
            ban->detach_aio_context   == detach_aio_context   &&
            ban->opaque               == opaque)
        {
            QLIST_REMOVE(ban, list);
            g_free(ban);
            return;
        }
    }

    abort();
}
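/* AioContext notifiers exist for users that cache per-context resources.
 * virtio-blk dataplane, for example, needs to know when a BDS moves to a
 * different AioContext so it can re-register its event handlers there.  The
 * attach/detach callbacks together with the opaque pointer form the identity
 * of a registration, which is why removal must pass the exact same triple.
 */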
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
                       BlockDriverAmendStatusCB *status_cb)
{
    if (!bs->drv->bdrv_amend_options) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, opts, status_cb);
}
/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test if the given bs is the candidate or to recurse further
 * down the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non block filter driver -> check if the bs is
     * the same as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* The driver is a block filter but does not allow recursion -> return
     * false.
     */
    return false;
}
/* This function checks if the candidate is the first non-filter bs down its
 * bs chain. Since we don't have pointers to parents it explores all bs chains
 * from the top. Some filters can choose not to pass down the recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}
BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
{
    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
    AioContext *aio_context;

    if (!to_replace_bs) {
        error_setg(errp, "Node name '%s' not found", node_name);
        return NULL;
    }

    aio_context = bdrv_get_aio_context(to_replace_bs);
    aio_context_acquire(aio_context);

    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
        to_replace_bs = NULL;
        goto out;
    }

    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the topmost non-filter node, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files, which are
     * blocked by the backing blockers.
     */
    if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only top most non filter can be replaced");
        to_replace_bs = NULL;
        goto out;
    }

out:
    aio_context_release(aio_context);
    return to_replace_bs;
}
void bdrv_io_plug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_plug) {
        drv->bdrv_io_plug(bs);
    } else if (bs->file) {
        bdrv_io_plug(bs->file);
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_io_unplug) {
        drv->bdrv_io_unplug(bs);
    } else if (bs->file) {
        bdrv_io_unplug(bs->file);
    }
}

void bdrv_flush_io_queue(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_flush_io_queue) {
        drv->bdrv_flush_io_queue(bs);
    } else if (bs->file) {
        bdrv_flush_io_queue(bs->file);
    }
}
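/* Plugging batches request submission: between bdrv_io_plug() and
 * bdrv_io_unplug(), a driver such as linux-aio may queue submissions locally
 * and issue them with a single io_submit() on unplug, cutting syscall
 * overhead when a device submits many requests at once (sketch):
 *
 *     bdrv_io_plug(bs);
 *     for (i = 0; i < n; i++) {
 *         bdrv_aio_writev(bs, sector[i], qiov[i], len[i], cb, opaque);
 *     }
 *     bdrv_io_unplug(bs);   // everything is submitted here
 */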
static bool append_open_options(QDict *d, BlockDriverState *bs)
{
    const QDictEntry *entry;
    bool found_any = false;

    for (entry = qdict_first(bs->options); entry;
         entry = qdict_next(bs->options, entry))
    {
        /* Only take options for this level and exclude all non-driver-specific
         * options */
        if (!strchr(qdict_entry_key(entry), '.') &&
            strcmp(qdict_entry_key(entry), "node-name"))
        {
            qobject_incref(qdict_entry_value(entry));
            qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
            found_any = true;
        }
    }

    return found_any;
}
/* Updates the following BDS fields:
 *  - exact_filename: A filename which may be used for opening a block device
 *                    which (mostly) equals the given BDS (even without any
 *                    other options; so reading and writing must return the
 *                    same results, but caching etc. may be different)
 *  - full_open_options: Options which, when given when opening a block device
 *                       (without a filename), result in a BDS (mostly)
 *                       equalling the given one
 *  - filename: If exact_filename is set, it is copied here. Otherwise,
 *              full_open_options is converted to a JSON object, prefixed with
 *              "json:" (for use through the JSON pseudo protocol) and put
 *              here.
 */
void bdrv_refresh_filename(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    QDict *opts;

    if (!drv) {
        return;
    }

    /* This BDS's file name will most probably depend on its file's name, so
     * refresh that first */
    if (bs->file) {
        bdrv_refresh_filename(bs->file);
    }

    if (drv->bdrv_refresh_filename) {
        /* Obsolete information is of no use here, so drop the old file name
         * information before refreshing it */
        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        drv->bdrv_refresh_filename(bs);
    } else if (bs->file) {
        /* Try to reconstruct valid information from the underlying file */
        bool has_open_options;

        bs->exact_filename[0] = '\0';
        if (bs->full_open_options) {
            QDECREF(bs->full_open_options);
            bs->full_open_options = NULL;
        }

        opts = qdict_new();
        has_open_options = append_open_options(opts, bs);

        /* If no specific options have been given for this BDS, the filename of
         * the underlying file should suffice for this one as well */
        if (bs->file->exact_filename[0] && !has_open_options) {
            strcpy(bs->exact_filename, bs->file->exact_filename);
        }

        /* Reconstructing the full options QDict is simple for most format block
         * drivers, as long as the full options are known for the underlying
         * file BDS. The full options QDict of that file BDS should somehow
         * contain a representation of the filename, therefore the following
         * suffices without querying the (exact_)filename of this BDS. */
        if (bs->file->full_open_options) {
            qdict_put_obj(opts, "driver",
                          QOBJECT(qstring_from_str(drv->format_name)));
            QINCREF(bs->file->full_open_options);
            qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));

            bs->full_open_options = opts;
        } else {
            QDECREF(opts);
        }
    } else if (!bs->full_open_options && qdict_size(bs->options)) {
        /* There is no underlying file BDS (at least referenced by BDS.file),
         * so the full options QDict should be equal to the options given
         * specifically for this block device when it was opened (plus the
         * driver specification).
         * Because those options don't change, there is no need to update
         * full_open_options when it's already set. */

        opts = qdict_new();
        append_open_options(opts, bs);
        qdict_put_obj(opts, "driver",
                      QOBJECT(qstring_from_str(drv->format_name)));

        if (bs->exact_filename[0]) {
            /* This may not work for all block protocol drivers (some may
             * require this filename to be parsed), but we have to find some
             * default solution here, so just include it. If some block driver
             * does not support pure options without any filename at all or
             * needs some special format of the options QDict, it needs to
             * implement the driver-specific bdrv_refresh_filename() function.
             */
            qdict_put_obj(opts, "filename",
                          QOBJECT(qstring_from_str(bs->exact_filename)));
        }

        bs->full_open_options = opts;
    }

    if (bs->exact_filename[0]) {
        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
    } else if (bs->full_open_options) {
        QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
        snprintf(bs->filename, sizeof(bs->filename), "json:%s",
                 qstring_get_str(json));
        QDECREF(json);
    }
}
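/* For a qcow2 image opened over the "file" protocol driver, the fallback
 * above would produce a full_open_options QDict whose JSON rendering (and
 * hence bs->filename, when no plain filename can express the configuration)
 * looks roughly like:
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "test.qcow2"}}
 */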
/* The purpose of this accessor function is to allow the device models to
 * access the BlockAcctStats structure embedded inside a BlockDriverState
 * without being aware of the BlockDriverState structure layout.
 * It will go away once the BlockAcctStats structure is moved inside the
 * device models.
 */
BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
{
    return &bs->stats;
}