/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
106 static int is_windows_drive_prefix(const char *filename)
108 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
113 int is_windows_drive(const char *filename)
115 if (is_windows_drive_prefix(filename) &&
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}
static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}
bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests must keep their original ordering, i.e. FIFO mode. Throttled
     * requests are not dequeued until the current request has been allowed to
     * proceed, so if the current request still exceeds the limits it is
     * re-inserted at the head of the queue and everything behind it stays in
     * throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}
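
/* Illustrative behaviour of the test above (example names, not real call
 * sites): "/images/disk.qcow2" is absolute and "disk.qcow2" is relative;
 * "file:/images/disk.qcow2" counts as absolute because the character after
 * the ':' is a path separator, whereas "nbd:localhost:10809" does not. */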
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
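
/* A worked example (hypothetical paths), as a sketch:
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "backing.raw");
 *     // dest is now "/images/backing.raw"
 *
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs/b.raw");
 *     // absolute filename wins: dest is "/abs/b.raw"
 */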
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
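
/* A minimal sketch of how a driver registers itself (hypothetical driver;
 * real drivers live in block/ and are wired up via block_init()):
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name   = "mydrv",
 *         .instance_size = sizeof(BDRVMyDrvState),
 *         .bdrv_open     = mydrv_open,
 *         .bdrv_co_readv = mydrv_co_readv,  // else the emulation above kicks in
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *     block_init(bdrv_mydrv_init);
 */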
/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}
BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        return -ENOTSUP;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    g_free(cco.filename);

    return ret;
}
int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}
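
/* Typical use, as a sketch (the filename is hypothetical; the option names
 * come from the chosen driver's create_options):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts;
 *
 *     opts = parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *     bdrv_create(drv, "/tmp/test.qcow2", opts);
 *     free_option_parameters(opts);
 */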
#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
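
/* Resolution examples (illustrative names): "nbd:localhost:10809" selects the
 * driver whose protocol_name is "nbd"; "/var/lib/images/disk.raw" has no
 * protocol prefix and falls back to the "file" driver; on Windows hosts a
 * name like "d:\disk.raw" is excluded by the drive-prefix check in
 * path_has_protocol() instead of being parsed as a "d" protocol. */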
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}
/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
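
/* For example, a sketch of how a "cache=..." option would be applied:
 *
 *     int flags = bs->open_flags;
 *     if (bdrv_parse_cache_flags("none", &flags) == 0) {
 *         // flags now has BDRV_O_NOCACHE | BDRV_O_CACHE_WB set
 *     }
 */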
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
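
/* Because the flag is a reference count, nested users compose safely.
 * Sketch:
 *
 *     bdrv_enable_copy_on_read(bs);   // user A
 *     bdrv_enable_copy_on_read(bs);   // user B
 *     bdrv_disable_copy_on_read(bs);  // A done; feature stays enabled
 *     bdrv_disable_copy_on_read(bs);  // B done; now disabled
 */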
/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();

    if (bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
    if (bs->backing_hd) {
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
    }

    if (bs->drv) {
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete. Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}
/* make a BlockDriverState anonymous by removing from bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */
    tmp.open_flags = bs_top->open_flags;

    /* dev info */
    tmp.dev_ops = bs_top->dev_ops;
    tmp.dev_opaque = bs_top->dev_opaque;
    tmp.dev = bs_top->dev;
    tmp.buffer_alignment = bs_top->buffer_alignment;
    tmp.copy_on_read = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time = bs_top->slice_time;
    tmp.slice_start = bs_top->slice_start;
    tmp.slice_end = bs_top->slice_end;
    tmp.io_limits = bs_top->io_limits;
    tmp.io_base = bs_top->io_base;
    tmp.throttled_reqs = bs_top->throttled_reqs;
    tmp.block_timer = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls = bs_top->cyls;
    tmp.heads = bs_top->heads;
    tmp.secs = bs_top->secs;
    tmp.translation = bs_top->translation;

    /* r/w error */
    tmp.on_read_error = bs_top->on_read_error;
    tmp.on_write_error = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled = bs_top->iostatus_enabled;
    tmp.iostatus = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top. bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer = NULL;
    bs_new->slice_time = 0;
    bs_new->slice_start = 0;
    bs_new->slice_end = 0;

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_top);
}
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}
int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}
struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
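
/* Worked example: with a 64 KB cluster size, c = 65536 / 512 = 128 sectors.
 * A request for sectors [130, 135) is widened to the cluster-aligned range
 * [128, 256): cluster_sector_num = 128 and cluster_nb_sectors = 128. */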
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap. This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster. For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}
/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file, yet
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}
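
/* Sketch of a synchronous caller (the buffer must hold
 * nb_sectors * BDRV_SECTOR_SIZE bytes):
 *
 *     uint8_t sector[BDRV_SECTOR_SIZE];
 *     if (bdrv_read(bs, 0, sector, 1) < 0) {
 *         // handle I/O error
 *     }
 */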
#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
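
/* Worked example (assuming the usual 2048-sector dirty chunk and 64-bit
 * longs): sector 1000000 lands in chunk 488 (1000000 / 2048), which is
 * bit 40 (488 % 64) of dirty_bitmap word 7 (488 / 64). */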
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}
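
/* Worked example: bdrv_pwrite(bs, 100, buf, 1000) with 512-byte sectors does
 * a read-modify-write of sector 0 covering bytes 100..511 (len = 412), then
 * writes sector 1 in place (the next 512 bytes), and finally
 * read-modify-writes sector 2 for the remaining 76 bytes. */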
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that use O_DSYNC */
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
        bdrv_flush(bs);
    }

    return 0;
}
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests. If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}
/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}
/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}
/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
/* try to guess the disk logical geometry from the MSDOS partition table.
   Return 0 if OK, -1 if could not guess */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;
    bool enabled;

    bdrv_get_geometry(bs, &nb_sectors);

    /**
     * The function will be invoked during startup not only in sync I/O mode,
     * but also in async I/O mode. So the I/O throttling function has to
     * be disabled temporarily here, not permanently.
     */
    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, 0, buf, 1);
    bs->io_limits_enabled = enabled;
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for(i = 0; i < 4; i++) {
        /* ignore errors in CHS values */
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}
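
/* Worked example: a partition entry with end_head = 15 and end_sector = 63
 * yields heads = 16 and sectors = 63; on a 1032192-sector disk that gives
 * cylinders = 1032192 / (16 * 63) = 1024, i.e. LCHS = 1024/16/63. */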
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}
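
/* Sketch: cap a drive at 10 MB/s total bandwidth and 100 total IOPS
 * (the numbers are hypothetical):
 *
 *     BlockIOLimit limits = { 0 };
 *     limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 10 * 1024 * 1024;
 *     limits.iops[BLOCK_IO_LIMIT_TOTAL] = 100;
 *     bdrv_set_io_limits(bs, &limits);
 *     // io_limits_enabled is now true; see bdrv_io_limits_intercept()
 */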
/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
    FDriveRate rate;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
};
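
/* Matching example: a raw image of 2880 sectors (1.44 MB) matches the first
 * table entry, since 18 sectors * 80 tracks * 2 heads = 2880, so the drive
 * is reported as FDRIVE_DRV_144 at FDRIVE_RATE_500K; see
 * bdrv_get_floppy_geometry_hint() below. */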
void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive,
                                   FDriveRate *rate)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
        *rate = FDRIVE_RATE_500K;
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
        *rate = parse->rate;
    }
}
int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
{
    if (!bs->drv) {
        buf[0] = '\0';
    } else {
        pstrcpy(buf, buf_size, bs->drv->format_name);
    }
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}
BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}
void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}
/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
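
/* Sketch of the typical scan loop (mirrors bdrv_commit() above;
 * chunk_sectors is a caller-chosen batch size):
 *
 *     int64_t sector = 0;
 *     int n;
 *     while (sector < total_sectors) {
 *         if (bdrv_is_allocated(bs, sector, chunk_sectors, &n)) {
 *             // sectors [sector, sector + n) are present in this layer
 *         }
 *         sector += n;
 *     }
 */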
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
2628 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2632 s = g_malloc0(sizeof(*s));
2634 if (bs->device_name[0]) {
2635 s->has_device = true;
2636 s->device = g_strdup(bs->device_name);
2639 s->stats = g_malloc0(sizeof(*s->stats));
2640 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2641 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2642 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2643 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2644 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2645 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2646 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2647 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2648 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2651 s->has_parent = true;
2652 s->parent = qmp_query_blockstat(bs->file, NULL);
2658 BlockStatsList *qmp_query_blockstats(Error **errp)
2660 BlockStatsList *head = NULL, *cur_item = NULL;
2661 BlockDriverState *bs;
2663 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2664 BlockStatsList *info = g_malloc0(sizeof(*info));
2665 info->value = qmp_query_blockstat(bs, NULL);
2667 /* XXX: waiting for the qapi to support GSList */
2669 head = cur_item = info;
2671 cur_item->next = info;
2679 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2681 if (bs->backing_hd && bs->backing_hd->encrypted)
2682 return bs->backing_file;
2683 else if (bs->encrypted)
2684 return bs->filename;
2689 void bdrv_get_backing_filename(BlockDriverState *bs,
2690 char *filename, int filename_size)
2692 pstrcpy(filename, filename_size, bs->backing_file);
2695 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2696 const uint8_t *buf, int nb_sectors)
2698 BlockDriver *drv = bs->drv;
2701 if (!drv->bdrv_write_compressed)
2703 if (bdrv_check_request(bs, sector_num, nb_sectors))
2706 if (bs->dirty_bitmap) {
2707 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2710 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2713 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2715 BlockDriver *drv = bs->drv;
2718 if (!drv->bdrv_get_info)
2720 memset(bdi, 0, sizeof(*bdi));
2721 return drv->bdrv_get_info(bs, bdi);
2724 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2725 int64_t pos, int size)
2727 BlockDriver *drv = bs->drv;
2730 if (drv->bdrv_save_vmstate)
2731 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2733 return bdrv_save_vmstate(bs->file, buf, pos, size);
2737 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2738 int64_t pos, int size)
2740 BlockDriver *drv = bs->drv;
2743 if (drv->bdrv_load_vmstate)
2744 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2746 return bdrv_load_vmstate(bs->file, buf, pos, size);
2750 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2752 BlockDriver *drv = bs->drv;
2754 if (!drv || !drv->bdrv_debug_event) {
2758 return drv->bdrv_debug_event(bs, event);
2762 /**************************************************************/
2763 /* handling of snapshots */
2765 int bdrv_can_snapshot(BlockDriverState *bs)
2767 BlockDriver *drv = bs->drv;
2768 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2772 if (!drv->bdrv_snapshot_create) {
2773 if (bs->file != NULL) {
2774 return bdrv_can_snapshot(bs->file);
2782 int bdrv_is_snapshot(BlockDriverState *bs)
2784 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2787 BlockDriverState *bdrv_snapshots(void)
2789 BlockDriverState *bs;
2792 return bs_snapshots;
2796 while ((bs = bdrv_next(bs))) {
2797 if (bdrv_can_snapshot(bs)) {
2805 int bdrv_snapshot_create(BlockDriverState *bs,
2806 QEMUSnapshotInfo *sn_info)
2808 BlockDriver *drv = bs->drv;
2811 if (drv->bdrv_snapshot_create)
2812 return drv->bdrv_snapshot_create(bs, sn_info);
2814 return bdrv_snapshot_create(bs->file, sn_info);
2818 int bdrv_snapshot_goto(BlockDriverState *bs,
2819 const char *snapshot_id)
2821 BlockDriver *drv = bs->drv;
2826 if (drv->bdrv_snapshot_goto)
2827 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2830 drv->bdrv_close(bs);
2831 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2832 open_ret = drv->bdrv_open(bs, bs->open_flags);
2834 bdrv_delete(bs->file);
2844 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2846 BlockDriver *drv = bs->drv;
2849 if (drv->bdrv_snapshot_delete)
2850 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2852 return bdrv_snapshot_delete(bs->file, snapshot_id);
2856 int bdrv_snapshot_list(BlockDriverState *bs,
2857 QEMUSnapshotInfo **psn_info)
2859 BlockDriver *drv = bs->drv;
2862 if (drv->bdrv_snapshot_list)
2863 return drv->bdrv_snapshot_list(bs, psn_info);
2865 return bdrv_snapshot_list(bs->file, psn_info);
2869 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2870 const char *snapshot_name)
2872 BlockDriver *drv = bs->drv;
2876 if (!bs->read_only) {
2879 if (drv->bdrv_snapshot_load_tmp) {
2880 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2885 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2886 const char *backing_file)
2892 if (bs->backing_hd) {
2893 if (strcmp(bs->backing_file, backing_file) == 0) {
2894 return bs->backing_hd;
2896 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2903 #define NB_SUFFIXES 4
2905 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2907 static const char suffixes[NB_SUFFIXES] = "KMGT";
2912 snprintf(buf, buf_size, "%" PRId64, size);
2915 for (i = 0; i < NB_SUFFIXES; i++) {
2916 if (size < (10 * base)) {
2917 snprintf(buf, buf_size, "%0.1f%c",
2918 (double)size / base,
2921 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2922 snprintf(buf, buf_size, "%" PRId64 "%c",
2923 ((size + (base >> 1)) / base),
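/* Sample outputs (assuming the usual base of 1024 and that sizes below
 * 1000 are printed as plain integers):
 *
 *     get_human_readable_size(buf, sizeof(buf), 999)     -> "999"
 *     get_human_readable_size(buf, sizeof(buf), 1536)    -> "1.5K"
 *     get_human_readable_size(buf, sizeof(buf), 500000)  -> "488K"
 *     get_human_readable_size(buf, sizeof(buf), 1 << 20) -> "1.0M"
 */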
2933 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2935 char buf1[128], date_buf[128], clock_buf[128];
2945 snprintf(buf, buf_size,
2946 "%-10s%-20s%7s%20s%15s",
2947 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2951 ptm = localtime(&ti);
2952 strftime(date_buf, sizeof(date_buf),
2953 "%Y-%m-%d %H:%M:%S", ptm);
2955 localtime_r(&ti, &tm);
2956 strftime(date_buf, sizeof(date_buf),
2957 "%Y-%m-%d %H:%M:%S", &tm);
2959 secs = sn->vm_clock_nsec / 1000000000;
2960 snprintf(clock_buf, sizeof(clock_buf),
2961 "%02d:%02d:%02d.%03d",
2963 (int)((secs / 60) % 60),
2965 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2966 snprintf(buf, buf_size,
2967 "%-10s%-20s%7s%20s%15s",
2968 sn->id_str, sn->name,
2969 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2976 /**************************************************************/
2979 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2980 QEMUIOVector *qiov, int nb_sectors,
2981 BlockDriverCompletionFunc *cb, void *opaque)
2983 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2985 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2989 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2990 QEMUIOVector *qiov, int nb_sectors,
2991 BlockDriverCompletionFunc *cb, void *opaque)
2993 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2995 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
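/* Minimal usage sketch for the AIO path above. The ExampleRead structure
 * and function names are hypothetical; the key point is that the
 * QEMUIOVector must stay valid until the completion callback runs, so it
 * lives on the heap alongside its iovec. */
typedef struct ExampleRead {
    QEMUIOVector qiov;
    struct iovec iov;
} ExampleRead;

static void example_read_done(void *opaque, int ret)
{
    ExampleRead *er = opaque;

    /* ret is 0 on success or a negative errno value */
    g_free(er);
}

static void example_submit_read(BlockDriverState *bs, uint8_t *buf,
                                int nb_sectors)
{
    ExampleRead *er = g_malloc0(sizeof(*er));

    er->iov.iov_base = buf;
    er->iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&er->qiov, &er->iov, 1);

    bdrv_aio_readv(bs, 0, &er->qiov, nb_sectors, example_read_done, er);
}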
3000 typedef struct MultiwriteCB {
3005 BlockDriverCompletionFunc *cb;
3007 QEMUIOVector *free_qiov;
3011 static void multiwrite_user_cb(MultiwriteCB *mcb)
3015 for (i = 0; i < mcb->num_callbacks; i++) {
3016 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3017 if (mcb->callbacks[i].free_qiov) {
3018 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3020 g_free(mcb->callbacks[i].free_qiov);
3024 static void multiwrite_cb(void *opaque, int ret)
3026 MultiwriteCB *mcb = opaque;
3028 trace_multiwrite_cb(mcb, ret);
3030 if (ret < 0 && !mcb->error) {
3034 mcb->num_requests--;
3035 if (mcb->num_requests == 0) {
3036 multiwrite_user_cb(mcb);
3041 static int multiwrite_req_compare(const void *a, const void *b)
3043 const BlockRequest *req1 = a, *req2 = b;
3046 * Note that we can't simply subtract req2->sector from req1->sector
3047 * here as that could overflow the return value.
3049 if (req1->sector > req2->sector) {
3051 } else if (req1->sector < req2->sector) {
3059 * Takes a bunch of requests and tries to merge them. Returns the number of
3060 * requests that remain after merging.
3062 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3063 int num_reqs, MultiwriteCB *mcb)
3067 // Sort requests by start sector
3068 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3070 // Check if adjacent requests touch the same clusters. If so, combine them,
3071 // filling up gaps with zero sectors.
3073 for (i = 1; i < num_reqs; i++) {
3075 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3077 // Handle exactly sequential writes and overlapping writes.
3078 if (reqs[i].sector <= oldreq_last) {
3082 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3088 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3089 qemu_iovec_init(qiov,
3090 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3092 // Add the first request to the merged one. If the requests are
3093 // overlapping, drop the last sectors of the first request.
3094 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3095 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3097 // We shouldn't need to add any zeros between the two requests
3098 assert(reqs[i].sector <= oldreq_last);
3100 // Add the second request
3101 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3103 reqs[outidx].nb_sectors = qiov->size >> 9;
3104 reqs[outidx].qiov = qiov;
3106 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3109 reqs[outidx].sector = reqs[i].sector;
3110 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3111 reqs[outidx].qiov = reqs[i].qiov;
3119 * Submit multiple AIO write requests at once.
3121 * On success, the function returns 0 and all requests in the reqs array have
3122 * been submitted. On error it returns -1, and any number of the requests may
3123 * or may not have been submitted yet. In particular, this means that the
3124 * callback will be invoked for some of the requests but not for others. The
3125 * caller must check the error field of each BlockRequest to know which
3126 * callbacks to wait for (if error != 0, no callback will be invoked).
3128 * The implementation may modify the contents of the reqs array, e.g. to merge
3129 * requests. However, the fields opaque and error are left unmodified as they
3130 * are used to signal failure for a single request to the caller.
3132 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3137 /* don't submit writes if we don't have a medium */
3138 if (bs->drv == NULL) {
3139 for (i = 0; i < num_reqs; i++) {
3140 reqs[i].error = -ENOMEDIUM;
3145 if (num_reqs == 0) {
3149 // Create MultiwriteCB structure
3150 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3151 mcb->num_requests = 0;
3152 mcb->num_callbacks = num_reqs;
3154 for (i = 0; i < num_reqs; i++) {
3155 mcb->callbacks[i].cb = reqs[i].cb;
3156 mcb->callbacks[i].opaque = reqs[i].opaque;
3159 // Check for mergeable requests
3160 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3162 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3164 /* Run the aio requests. */
3165 mcb->num_requests = num_reqs;
3166 for (i = 0; i < num_reqs; i++) {
3167 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3168 reqs[i].nb_sectors, multiwrite_cb, mcb);
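/* Usage sketch (hypothetical function name): submitting two writes as one
 * batch. Each request carries its own callback; merging is transparent to
 * the caller because the opaque and error fields are never touched by the
 * merge step. */
static void example_submit_batch(BlockDriverState *bs,
                                 QEMUIOVector *qiov1, QEMUIOVector *qiov2,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[] = {
        { .sector = 0,  .nb_sectors = 8, .qiov = qiov1,
          .cb = cb, .opaque = opaque },
        { .sector = 16, .nb_sectors = 8, .qiov = qiov2,
          .cb = cb, .opaque = opaque },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* Check reqs[i].error: requests with error != 0 get no callback */
    }
}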
3174 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3176 acb->pool->cancel(acb);
3179 /* block I/O throttling */
3180 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3181 bool is_write, double elapsed_time, uint64_t *wait)
3183 uint64_t bps_limit = 0;
3184 double bytes_limit, bytes_base, bytes_res;
3185 double slice_time, wait_time;
3187 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3188 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3189 } else if (bs->io_limits.bps[is_write]) {
3190 bps_limit = bs->io_limits.bps[is_write];
3199 slice_time = bs->slice_end - bs->slice_start;
3200 slice_time /= (NANOSECONDS_PER_SECOND);
3201 bytes_limit = bps_limit * slice_time;
3202 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3203 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3204 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3207 /* bytes_base: the number of bytes already read/written in this slice,
3208 * obtained from the accounting statistics.
3209 * bytes_res: the remaining bytes of data which need to be read/written.
3210 * (bytes_base + bytes_res) / bps_limit: used to calculate
3211 * the total time for completing the reading/writing of all data.
3213 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3215 if (bytes_base + bytes_res <= bytes_limit) {
3223 /* Calc approx time to dispatch */
3224 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3226 /* When the I/O rate at runtime exceeds the limit,
3227 * bs->slice_end needs to be extended so that the current statistics
3228 * can be kept until the timer fires; the extension factor was tuned
3229 * experimentally.
3231 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3232 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3234 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
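/* Worked example (hypothetical numbers): with bps_limit = 1 MB/s and a
 * 100 ms slice, bytes_limit is about 100 KB. If 90 KB were already
 * transferred in this slice and a 64 KB request arrives, bytes_base +
 * bytes_res = 154 KB > 100 KB, so the request waits roughly
 * 154 KB / 1 MB/s minus the time already elapsed in the slice before it
 * may be dispatched. */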
3240 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3241 double elapsed_time, uint64_t *wait)
3243 uint64_t iops_limit = 0;
3244 double ios_limit, ios_base;
3245 double slice_time, wait_time;
3247 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3248 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3249 } else if (bs->io_limits.iops[is_write]) {
3250 iops_limit = bs->io_limits.iops[is_write];
3259 slice_time = bs->slice_end - bs->slice_start;
3260 slice_time /= (NANOSECONDS_PER_SECOND);
3261 ios_limit = iops_limit * slice_time;
3262 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3263 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3264 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3267 if (ios_base + 1 <= ios_limit) {
3275 /* Calc approx time to dispatch */
3276 wait_time = (ios_base + 1) / iops_limit;
3277 if (wait_time > elapsed_time) {
3278 wait_time = wait_time - elapsed_time;
3283 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3284 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3286 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
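/* Worked example (hypothetical numbers): with iops_limit = 100 and a
 * 100 ms slice, ios_limit is 10. Once 10 requests have been issued in the
 * slice, the next one waits about (10 + 1) / 100 s = 110 ms minus the
 * elapsed slice time before it may be dispatched. */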
3292 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3293 bool is_write, int64_t *wait)
3295 int64_t now, max_wait;
3296 uint64_t bps_wait = 0, iops_wait = 0;
3297 double elapsed_time;
3298 int bps_ret, iops_ret;
3300 now = qemu_get_clock_ns(vm_clock);
3301 if ((bs->slice_start < now)
3302 && (bs->slice_end > now)) {
3303 bs->slice_end = now + bs->slice_time;
3305 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3306 bs->slice_start = now;
3307 bs->slice_end = now + bs->slice_time;
3309 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3310 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3312 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3313 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3316 elapsed_time = now - bs->slice_start;
3317 elapsed_time /= (NANOSECONDS_PER_SECOND);
3319 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3320 is_write, elapsed_time, &bps_wait);
3321 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3322 elapsed_time, &iops_wait);
3323 if (bps_ret || iops_ret) {
3324 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3329 now = qemu_get_clock_ns(vm_clock);
3330 if (bs->slice_end < now + max_wait) {
3331 bs->slice_end = now + max_wait;
3344 /**************************************************************/
3345 /* async block device emulation */
3347 typedef struct BlockDriverAIOCBSync {
3348 BlockDriverAIOCB common;
3351 /* vector translation state */
3355 } BlockDriverAIOCBSync;
3357 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3359 BlockDriverAIOCBSync *acb =
3360 container_of(blockacb, BlockDriverAIOCBSync, common);
3361 qemu_bh_delete(acb->bh);
3363 qemu_aio_release(acb);
3366 static AIOPool bdrv_em_aio_pool = {
3367 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3368 .cancel = bdrv_aio_cancel_em,
3371 static void bdrv_aio_bh_cb(void *opaque)
3373 BlockDriverAIOCBSync *acb = opaque;
3376 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3377 qemu_vfree(acb->bounce);
3378 acb->common.cb(acb->common.opaque, acb->ret);
3379 qemu_bh_delete(acb->bh);
3381 qemu_aio_release(acb);
3384 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3388 BlockDriverCompletionFunc *cb,
3393 BlockDriverAIOCBSync *acb;
3395 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3396 acb->is_write = is_write;
3398 acb->bounce = qemu_blockalign(bs, qiov->size);
3399 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3402 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3403 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3405 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3408 qemu_bh_schedule(acb->bh);
3410 return &acb->common;
3413 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3414 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3415 BlockDriverCompletionFunc *cb, void *opaque)
3417 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3420 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3421 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3422 BlockDriverCompletionFunc *cb, void *opaque)
3424 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3428 typedef struct BlockDriverAIOCBCoroutine {
3429 BlockDriverAIOCB common;
3433 } BlockDriverAIOCBCoroutine;
3435 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3440 static AIOPool bdrv_em_co_aio_pool = {
3441 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3442 .cancel = bdrv_aio_co_cancel_em,
3445 static void bdrv_co_em_bh(void *opaque)
3447 BlockDriverAIOCBCoroutine *acb = opaque;
3449 acb->common.cb(acb->common.opaque, acb->req.error);
3450 qemu_bh_delete(acb->bh);
3451 qemu_aio_release(acb);
3454 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3455 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3457 BlockDriverAIOCBCoroutine *acb = opaque;
3458 BlockDriverState *bs = acb->common.bs;
3460 if (!acb->is_write) {
3461 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3462 acb->req.nb_sectors, acb->req.qiov, 0);
3464 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3465 acb->req.nb_sectors, acb->req.qiov, 0);
3468 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3469 qemu_bh_schedule(acb->bh);
3472 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3476 BlockDriverCompletionFunc *cb,
3481 BlockDriverAIOCBCoroutine *acb;
3483 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3484 acb->req.sector = sector_num;
3485 acb->req.nb_sectors = nb_sectors;
3486 acb->req.qiov = qiov;
3487 acb->is_write = is_write;
3489 co = qemu_coroutine_create(bdrv_co_do_rw);
3490 qemu_coroutine_enter(co, acb);
3492 return &acb->common;
3495 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3497 BlockDriverAIOCBCoroutine *acb = opaque;
3498 BlockDriverState *bs = acb->common.bs;
3500 acb->req.error = bdrv_co_flush(bs);
3501 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3502 qemu_bh_schedule(acb->bh);
3505 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3506 BlockDriverCompletionFunc *cb, void *opaque)
3508 trace_bdrv_aio_flush(bs, opaque);
3511 BlockDriverAIOCBCoroutine *acb;
3513 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3514 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3515 qemu_coroutine_enter(co, acb);
3517 return &acb->common;
3520 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3522 BlockDriverAIOCBCoroutine *acb = opaque;
3523 BlockDriverState *bs = acb->common.bs;
3525 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3526 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3527 qemu_bh_schedule(acb->bh);
3530 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3531 int64_t sector_num, int nb_sectors,
3532 BlockDriverCompletionFunc *cb, void *opaque)
3535 BlockDriverAIOCBCoroutine *acb;
3537 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3539 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3540 acb->req.sector = sector_num;
3541 acb->req.nb_sectors = nb_sectors;
3542 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3543 qemu_coroutine_enter(co, acb);
3545 return &acb->common;
3548 void bdrv_init(void)
3550 module_call_init(MODULE_INIT_BLOCK);
3553 void bdrv_init_with_whitelist(void)
3555 use_bdrv_whitelist = 1;
3559 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3560 BlockDriverCompletionFunc *cb, void *opaque)
3562 BlockDriverAIOCB *acb;
3564 if (pool->free_aiocb) {
3565 acb = pool->free_aiocb;
3566 pool->free_aiocb = acb->next;
3568 acb = g_malloc0(pool->aiocb_size);
3573 acb->opaque = opaque;
3577 void qemu_aio_release(void *p)
3579 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3580 AIOPool *pool = acb->pool;
3581 acb->next = pool->free_aiocb;
3582 pool->free_aiocb = acb;
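/* The pool recycles AIOCBs through a singly linked free list: releasing
 * pushes onto pool->free_aiocb, allocating pops from it, and g_malloc0 is
 * only needed when the list is empty. A typical pairing (sketch, with a
 * hypothetical pool and AIOCB type) looks like:
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aio_pool, bs, cb, opaque);
 *     ...fill in and submit the request...
 *     qemu_aio_release(acb);    // from the completion path
 */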
3585 /**************************************************************/
3586 /* Coroutine block device emulation */
3588 typedef struct CoroutineIOCompletion {
3589 Coroutine *coroutine;
3591 } CoroutineIOCompletion;
3593 static void bdrv_co_io_em_complete(void *opaque, int ret)
3595 CoroutineIOCompletion *co = opaque;
3598 qemu_coroutine_enter(co->coroutine, NULL);
3601 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3602 int nb_sectors, QEMUIOVector *iov,
3605 CoroutineIOCompletion co = {
3606 .coroutine = qemu_coroutine_self(),
3608 BlockDriverAIOCB *acb;
3611 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3612 bdrv_co_io_em_complete, &co);
3614 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3615 bdrv_co_io_em_complete, &co);
3618 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3622 qemu_coroutine_yield();
3627 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3628 int64_t sector_num, int nb_sectors,
3631 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3634 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3635 int64_t sector_num, int nb_sectors,
3638 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3641 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3643 RwCo *rwco = opaque;
3645 rwco->ret = bdrv_co_flush(rwco->bs);
3648 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3652 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3656 /* Write back cached data to the OS even with cache=unsafe */
3657 if (bs->drv->bdrv_co_flush_to_os) {
3658 ret = bs->drv->bdrv_co_flush_to_os(bs);
3664 /* But don't actually force it to the disk with cache=unsafe */
3665 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3669 if (bs->drv->bdrv_co_flush_to_disk) {
3670 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3671 } else if (bs->drv->bdrv_aio_flush) {
3672 BlockDriverAIOCB *acb;
3673 CoroutineIOCompletion co = {
3674 .coroutine = qemu_coroutine_self(),
3677 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3681 qemu_coroutine_yield();
3686 * Some block drivers always operate in either writethrough or unsafe
3687 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3688 * know how the server works (because the behaviour is hardcoded or
3689 * depends on server-side configuration), so we can't ensure that
3690 * everything is safe on disk. Returning an error doesn't work because
3691 * that would break guests even if the server operates in writethrough
3692 * mode.
3694 * Let's hope the user knows what he's doing.
3702 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3703 * in the case of cache=unsafe, so there are no useless flushes.
3705 return bdrv_co_flush(bs->file);
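/* A format driver hooks into the cascade above by providing one of the
 * flush callbacks; flushing bs->file afterwards is handled here
 * generically. Sketch (hypothetical driver, shown for illustration only):
 *
 *     static int coroutine_fn example_co_flush_to_disk(BlockDriverState *bs)
 *     {
 *         ... write out any format metadata cached in memory ...
 *         return 0;
 *     }
 *
 *     static BlockDriver bdrv_example = {
 *         .format_name           = "example",
 *         .bdrv_co_flush_to_disk = example_co_flush_to_disk,
 *     };
 */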
3708 void bdrv_invalidate_cache(BlockDriverState *bs)
3710 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3711 bs->drv->bdrv_invalidate_cache(bs);
3715 void bdrv_invalidate_cache_all(void)
3717 BlockDriverState *bs;
3719 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3720 bdrv_invalidate_cache(bs);
3724 void bdrv_clear_incoming_migration_all(void)
3726 BlockDriverState *bs;
3728 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3729 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3733 int bdrv_flush(BlockDriverState *bs)
3741 if (qemu_in_coroutine()) {
3742 /* Fast-path if already in coroutine context */
3743 bdrv_flush_co_entry(&rwco);
3745 co = qemu_coroutine_create(bdrv_flush_co_entry);
3746 qemu_coroutine_enter(co, &rwco);
3747 while (rwco.ret == NOT_DONE) {
3755 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3757 RwCo *rwco = opaque;
3759 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3762 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3767 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3769 } else if (bs->read_only) {
3771 } else if (bs->drv->bdrv_co_discard) {
3772 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3773 } else if (bs->drv->bdrv_aio_discard) {
3774 BlockDriverAIOCB *acb;
3775 CoroutineIOCompletion co = {
3776 .coroutine = qemu_coroutine_self(),
3779 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3780 bdrv_co_io_em_complete, &co);
3784 qemu_coroutine_yield();
3792 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3797 .sector_num = sector_num,
3798 .nb_sectors = nb_sectors,
3802 if (qemu_in_coroutine()) {
3803 /* Fast-path if already in coroutine context */
3804 bdrv_discard_co_entry(&rwco);
3806 co = qemu_coroutine_create(bdrv_discard_co_entry);
3807 qemu_coroutine_enter(co, &rwco);
3808 while (rwco.ret == NOT_DONE) {
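/* bdrv_flush() and bdrv_discard() above share the same synchronous-wrapper
 * pattern: run the coroutine entry function directly when already in
 * coroutine context, otherwise create a coroutine and poll (typically via
 * qemu_aio_wait()) until rwco.ret changes from NOT_DONE. */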
3816 /**************************************************************/
3817 /* removable device support */
3820 * Return TRUE if the media is present
3822 int bdrv_is_inserted(BlockDriverState *bs)
3824 BlockDriver *drv = bs->drv;
3828 if (!drv->bdrv_is_inserted)
3830 return drv->bdrv_is_inserted(bs);
3834 * Return whether the media changed since the last call to this
3835 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3837 int bdrv_media_changed(BlockDriverState *bs)
3839 BlockDriver *drv = bs->drv;
3841 if (drv && drv->bdrv_media_changed) {
3842 return drv->bdrv_media_changed(bs);
3848 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
3850 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3852 BlockDriver *drv = bs->drv;
3854 if (drv && drv->bdrv_eject) {
3855 drv->bdrv_eject(bs, eject_flag);
3858 if (bs->device_name[0] != '\0') {
3859 bdrv_emit_qmp_eject_event(bs, eject_flag);
3864 * Lock or unlock the media (if it is locked, the user won't be able
3865 * to eject it manually).
3867 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3869 BlockDriver *drv = bs->drv;
3871 trace_bdrv_lock_medium(bs, locked);
3873 if (drv && drv->bdrv_lock_medium) {
3874 drv->bdrv_lock_medium(bs, locked);
3878 /* needed for generic scsi interface */
3880 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3882 BlockDriver *drv = bs->drv;
3884 if (drv && drv->bdrv_ioctl)
3885 return drv->bdrv_ioctl(bs, req, buf);
3889 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3890 unsigned long int req, void *buf,
3891 BlockDriverCompletionFunc *cb, void *opaque)
3893 BlockDriver *drv = bs->drv;
3895 if (drv && drv->bdrv_aio_ioctl)
3896 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3900 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3902 bs->buffer_alignment = align;
3905 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3907 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3910 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3912 int64_t bitmap_size;
3914 bs->dirty_count = 0;
3916 if (!bs->dirty_bitmap) {
3917 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3918 BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
3919 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
3921 bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
3924 if (bs->dirty_bitmap) {
3925 g_free(bs->dirty_bitmap);
3926 bs->dirty_bitmap = NULL;
3931 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3933 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3935 if (bs->dirty_bitmap &&
3936 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3937 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3938 (1UL << (chunk % (sizeof(unsigned long) * 8))));
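/* Worked example, assuming BDRV_SECTORS_PER_DIRTY_CHUNK is 2048 (1 MB
 * chunks of 512-byte sectors) and 64-bit longs: sector 5000000 lies in
 * chunk 5000000 / 2048 = 2441, which is bit 2441 % 64 = 9 of word
 * 2441 / 64 = 38 in the bitmap. */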
3944 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3947 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3950 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3952 return bs->dirty_count;
3955 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3957 assert(bs->in_use != in_use);
3958 bs->in_use = in_use;
3961 int bdrv_in_use(BlockDriverState *bs)
3966 void bdrv_iostatus_enable(BlockDriverState *bs)
3968 bs->iostatus_enabled = true;
3969 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3972 /* The I/O status is only enabled if the drive explicitly
3973 * enables it _and_ the VM is configured to stop on errors */
3974 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3976 return (bs->iostatus_enabled &&
3977 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3978 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3979 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3982 void bdrv_iostatus_disable(BlockDriverState *bs)
3984 bs->iostatus_enabled = false;
3987 void bdrv_iostatus_reset(BlockDriverState *bs)
3989 if (bdrv_iostatus_is_enabled(bs)) {
3990 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3994 /* XXX: Today this is set by device models because it makes the implementation
3995 quite simple. However, the block layer knows about the error, so it's
3996 possible to implement this without device models being involved */
3997 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3999 if (bdrv_iostatus_is_enabled(bs) &&
4000 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4002 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4003 BLOCK_DEVICE_IO_STATUS_FAILED;
4008 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4009 enum BlockAcctType type)
4011 assert(type < BDRV_MAX_IOTYPE);
4013 cookie->bytes = bytes;
4014 cookie->start_time_ns = get_clock();
4015 cookie->type = type;
4019 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4021 assert(cookie->type < BDRV_MAX_IOTYPE);
4023 bs->nr_bytes[cookie->type] += cookie->bytes;
4024 bs->nr_ops[cookie->type]++;
4025 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
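/* Usage sketch (hypothetical helper): a device model brackets each request
 * with the two accounting calls above so that query-blockstats reflects
 * bytes transferred, operation counts and total latency. */
static int example_accounted_read(BlockDriverState *bs, int64_t sector_num,
                                  uint8_t *buf, int nb_sectors)
{
    BlockAcctCookie cookie;
    int ret;

    bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
                    BDRV_ACCT_READ);
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bdrv_acct_done(bs, &cookie);
    return ret;
}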
4028 int bdrv_img_create(const char *filename, const char *fmt,
4029 const char *base_filename, const char *base_fmt,
4030 char *options, uint64_t img_size, int flags)
4032 QEMUOptionParameter *param = NULL, *create_options = NULL;
4033 QEMUOptionParameter *backing_fmt, *backing_file, *size;
4034 BlockDriverState *bs = NULL;
4035 BlockDriver *drv, *proto_drv;
4036 BlockDriver *backing_drv = NULL;
4039 /* Find driver and parse its options */
4040 drv = bdrv_find_format(fmt);
4042 error_report("Unknown file format '%s'", fmt);
4047 proto_drv = bdrv_find_protocol(filename);
4049 error_report("Unknown protocol '%s'", filename);
4054 create_options = append_option_parameters(create_options,
4055 drv->create_options);
4056 create_options = append_option_parameters(create_options,
4057 proto_drv->create_options);
4059 /* Create parameter list with default values */
4060 param = parse_option_parameters("", create_options, param);
4062 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4064 /* Parse -o options */
4066 param = parse_option_parameters(options, create_options, param);
4067 if (param == NULL) {
4068 error_report("Invalid options for file format '%s'.", fmt);
4074 if (base_filename) {
4075 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4077 error_report("Backing file not supported for file format '%s'",
4085 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4086 error_report("Backing file format not supported for file "
4087 "format '%s'", fmt);
4093 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4094 if (backing_file && backing_file->value.s) {
4095 if (!strcmp(filename, backing_file->value.s)) {
4096 error_report("Error: Trying to create an image with the "
4097 "same filename as the backing file");
4103 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4104 if (backing_fmt && backing_fmt->value.s) {
4105 backing_drv = bdrv_find_format(backing_fmt->value.s);
4107 error_report("Unknown backing file format '%s'",
4108 backing_fmt->value.s);
4114 // The size for the image must always be specified, with one exception:
4115 // If we are using a backing file, we can obtain the size from there
4116 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4117 if (size && size->value.n == -1) {
4118 if (backing_file && backing_file->value.s) {
4123 /* backing files always opened read-only */
4125 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4129 ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4131 error_report("Could not open '%s'", backing_file->value.s);
4134 bdrv_get_geometry(bs, &size);
4137 snprintf(buf, sizeof(buf), "%" PRId64, size);
4138 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4140 error_report("Image creation needs a size parameter");
4146 printf("Formatting '%s', fmt=%s ", filename, fmt);
4147 print_option_parameters(param);
4150 ret = bdrv_create(drv, filename, param);
4153 if (ret == -ENOTSUP) {
4154 error_report("Formatting or formatting option not supported for "
4155 "file format '%s'", fmt);
4156 } else if (ret == -EFBIG) {
4157 error_report("The image size is too large for file format '%s'",
4160 error_report("%s: error while creating %s: %s", filename, fmt,
4166 free_option_parameters(create_options);
4167 free_option_parameters(param);
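/* Usage sketch (illustrative values): create a 10 GiB qcow2 overlay on top
 * of a raw backing file, with default creation options:
 *
 *     ret = bdrv_img_create("overlay.qcow2", "qcow2", "base.raw", "raw",
 *                           NULL, (uint64_t)10 << 30, 0);
 */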
4176 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4177 int64_t speed, BlockDriverCompletionFunc *cb,
4178 void *opaque, Error **errp)
4182 if (bs->job || bdrv_in_use(bs)) {
4183 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4186 bdrv_set_in_use(bs, 1);
4188 job = g_malloc0(job_type->instance_size);
4189 job->job_type = job_type;
4192 job->opaque = opaque;
4195 /* Only set speed when necessary to avoid NotSupported error */
4197 Error *local_err = NULL;
4199 block_job_set_speed(job, speed, &local_err);
4200 if (error_is_set(&local_err)) {
4203 bdrv_set_in_use(bs, 0);
4204 error_propagate(errp, local_err);
4211 void block_job_complete(BlockJob *job, int ret)
4213 BlockDriverState *bs = job->bs;
4215 assert(bs->job == job);
4216 job->cb(job->opaque, ret);
4219 bdrv_set_in_use(bs, 0);
4222 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4224 Error *local_err = NULL;
4226 if (!job->job_type->set_speed) {
4227 error_set(errp, QERR_NOT_SUPPORTED);
4230 job->job_type->set_speed(job, speed, &local_err);
4231 if (error_is_set(&local_err)) {
4232 error_propagate(errp, local_err);
4239 void block_job_cancel(BlockJob *job)
4241 job->cancelled = true;
4244 bool block_job_is_cancelled(BlockJob *job)
4246 return job->cancelled;
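/* Lifecycle sketch for the job API (hypothetical job type; concrete jobs
 * are assumed here to embed a BlockJob as a 'common' field):
 *
 *     ExampleJob *s = block_job_create(&example_job_type, bs, speed,
 *                                      cb, opaque, errp);
 *     if (!s) {
 *         return;    // device already in use, errp has been set
 *     }
 *     ...the job coroutine periodically checks
 *     block_job_is_cancelled(&s->common) and eventually calls
 *     block_job_complete(&s->common, ret), which releases the device.
 */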
4249 void block_job_cancel_sync(BlockJob *job)
4251 BlockDriverState *bs = job->bs;
4253 assert(bs->job == job);
4254 block_job_cancel(job);
4255 while (bs->job != NULL && bs->job->busy) {