]> Git Repo - qemu.git/blob - block.c
block: Keep track of devices' I/O status
[qemu.git] / block.c
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qemu-objects.h"
31 #include "qemu-coroutine.h"
32
33 #ifdef CONFIG_BSD
34 #include <sys/types.h>
35 #include <sys/stat.h>
36 #include <sys/ioctl.h>
37 #include <sys/queue.h>
38 #ifndef __DragonFly__
39 #include <sys/disk.h>
40 #endif
41 #endif
42
43 #ifdef _WIN32
44 #include <windows.h>
45 #endif
46
47 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
48 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
49         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
50         BlockDriverCompletionFunc *cb, void *opaque);
51 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
52         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
53         BlockDriverCompletionFunc *cb, void *opaque);
54 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
55         BlockDriverCompletionFunc *cb, void *opaque);
56 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
57         BlockDriverCompletionFunc *cb, void *opaque);
58 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
59                         uint8_t *buf, int nb_sectors);
60 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
61                          const uint8_t *buf, int nb_sectors);
62 static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
63         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
64         BlockDriverCompletionFunc *cb, void *opaque);
65 static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
66         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
67         BlockDriverCompletionFunc *cb, void *opaque);
68 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
69                                          int64_t sector_num, int nb_sectors,
70                                          QEMUIOVector *iov);
71 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
72                                          int64_t sector_num, int nb_sectors,
73                                          QEMUIOVector *iov);
74 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
75
76 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
77     QTAILQ_HEAD_INITIALIZER(bdrv_states);
78
79 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
80     QLIST_HEAD_INITIALIZER(bdrv_drivers);
81
82 /* The device to use for VM snapshots */
83 static BlockDriverState *bs_snapshots;
84
85 /* If non-zero, use only whitelisted block drivers */
86 static int use_bdrv_whitelist;
87
88 #ifdef _WIN32
/*
 * Report whether filename begins with a DOS-style drive specifier, i.e. an
 * ASCII letter immediately followed by ':'.
 *
 * Returns 1 when it does, 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char drive = filename[0];
    const int is_letter = (drive >= 'a' && drive <= 'z') ||
                          (drive >= 'A' && drive <= 'Z');

    /* Only look at the second character once the first one qualifies. */
    if (!is_letter) {
        return 0;
    }
    return filename[1] == ':';
}
95
/*
 * Returns 1 when filename is either a bare drive specifier such as "d:" or
 * starts with one of the prefixes "\\.\" or "//./"; returns 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    return strstart(filename, "\\\\.\\", NULL) ||
           strstart(filename, "//./", NULL);
}
106 #endif
107
/*
 * Tell whether path starts with a "<protocol>:" prefix.
 *
 * Returns non-zero when a ':' is present in path.  On Windows a path that is
 * a drive name, or that merely begins with a drive letter, is never treated
 * as carrying a protocol.
 */
static int path_has_protocol(const char *path)
{
    const char *colon;

#ifdef _WIN32
    /* The colon after a drive letter is not a protocol separator. */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    colon = strchr(path, ':');
    return colon != NULL;
}
120
/*
 * Decide whether path is absolute.
 *
 * Everything up to and including the first ':' (if any) is skipped, and the
 * path counts as absolute when the next character is '/'.  On Windows a
 * backslash is accepted as well, and a path whose very first character is a
 * slash or backslash (e.g. "\\.\d:") is absolute regardless of any colon.
 *
 * Returns 1 for absolute paths, 0 otherwise.
 */
int path_is_absolute(const char *path)
{
    const char *start;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif

    start = strchr(path, ':');
    start = start ? start + 1 : path;

#ifdef _WIN32
    if (*start == '\\') {
        return 1;
    }
#endif
    return *start == '/';
}
140
/*
 * Build in dest (at most dest_size bytes, always NUL-terminated when
 * dest_size > 0) the location of filename interpreted relative to base_path.
 *
 * An absolute filename is copied verbatim.  Otherwise the directory part of
 * base_path -- everything up to and including the last '/' (or '\\' on
 * Windows), or up to and including the first ':' when that comes later --
 * is copied and filename is appended to it.  This makes URL-like base paths
 * work too.  Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_proto, *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* First candidate: just past the protocol separator, if there is one. */
    after_proto = strchr(base_path, ':');
    after_proto = after_proto ? after_proto + 1 : base_path;

    /* Second candidate: just past the last directory separator. */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* Keep whichever prefix is longer, clamped to the destination size. */
    prefix_len = (after_sep > after_proto ? after_sep : after_proto)
                 - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }

    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
184
185 void bdrv_register(BlockDriver *bdrv)
186 {
187     if (bdrv->bdrv_co_readv) {
188         /* Emulate AIO by coroutines, and sync by AIO */
189         bdrv->bdrv_aio_readv = bdrv_co_aio_readv_em;
190         bdrv->bdrv_aio_writev = bdrv_co_aio_writev_em;
191         bdrv->bdrv_read = bdrv_read_em;
192         bdrv->bdrv_write = bdrv_write_em;
193      } else {
194         bdrv->bdrv_co_readv = bdrv_co_readv_em;
195         bdrv->bdrv_co_writev = bdrv_co_writev_em;
196
197         if (!bdrv->bdrv_aio_readv) {
198             /* add AIO emulation layer */
199             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
200             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
201         } else if (!bdrv->bdrv_read) {
202             /* add synchronous IO emulation layer */
203             bdrv->bdrv_read = bdrv_read_em;
204             bdrv->bdrv_write = bdrv_write_em;
205         }
206     }
207
208     if (!bdrv->bdrv_aio_flush)
209         bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
210
211     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
212 }
213
214 /* create a new block device (by default it is empty) */
215 BlockDriverState *bdrv_new(const char *device_name)
216 {
217     BlockDriverState *bs;
218
219     bs = g_malloc0(sizeof(BlockDriverState));
220     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
221     if (device_name[0] != '\0') {
222         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
223     }
224     bdrv_iostatus_disable(bs);
225     return bs;
226 }
227
228 BlockDriver *bdrv_find_format(const char *format_name)
229 {
230     BlockDriver *drv1;
231     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
232         if (!strcmp(drv1->format_name, format_name)) {
233             return drv1;
234         }
235     }
236     return NULL;
237 }
238
239 static int bdrv_is_whitelisted(BlockDriver *drv)
240 {
241     static const char *whitelist[] = {
242         CONFIG_BDRV_WHITELIST
243     };
244     const char **p;
245
246     if (!whitelist[0])
247         return 1;               /* no whitelist, anything goes */
248
249     for (p = whitelist; *p; p++) {
250         if (!strcmp(drv->format_name, *p)) {
251             return 1;
252         }
253     }
254     return 0;
255 }
256
257 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
258 {
259     BlockDriver *drv = bdrv_find_format(format_name);
260     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
261 }
262
263 int bdrv_create(BlockDriver *drv, const char* filename,
264     QEMUOptionParameter *options)
265 {
266     if (!drv->bdrv_create)
267         return -ENOTSUP;
268
269     return drv->bdrv_create(filename, options);
270 }
271
272 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
273 {
274     BlockDriver *drv;
275
276     drv = bdrv_find_protocol(filename);
277     if (drv == NULL) {
278         return -ENOENT;
279     }
280
281     return bdrv_create(drv, filename, options);
282 }
283
284 #ifdef _WIN32
285 void get_tmp_filename(char *filename, int size)
286 {
287     char temp_dir[MAX_PATH];
288
289     GetTempPath(MAX_PATH, temp_dir);
290     GetTempFileName(temp_dir, "qem", 0, filename);
291 }
292 #else
/*
 * POSIX variant: store in filename (at most size bytes) the name of a fresh
 * temporary file "vl.XXXXXX" located in $TMPDIR, or in /tmp when TMPDIR is
 * unset.  The file is created by mkstemp() and its descriptor is closed
 * again right away; only the name is handed back.
 */
void get_tmp_filename(char *filename, int size)
{
    const char *dir = getenv("TMPDIR");
    int fd;

    /* XXX: race condition possible */
    snprintf(filename, size, "%s/vl.XXXXXX", dir ? dir : "/tmp");
    fd = mkstemp(filename);
    close(fd);
}
305 #endif
306
307 /*
308  * Detect host devices. By convention, /dev/cdrom[N] is always
309  * recognized as a host CDROM.
310  */
311 static BlockDriver *find_hdev_driver(const char *filename)
312 {
313     int score_max = 0, score;
314     BlockDriver *drv = NULL, *d;
315
316     QLIST_FOREACH(d, &bdrv_drivers, list) {
317         if (d->bdrv_probe_device) {
318             score = d->bdrv_probe_device(filename);
319             if (score > score_max) {
320                 score_max = score;
321                 drv = d;
322             }
323         }
324     }
325
326     return drv;
327 }
328
329 BlockDriver *bdrv_find_protocol(const char *filename)
330 {
331     BlockDriver *drv1;
332     char protocol[128];
333     int len;
334     const char *p;
335
336     /* TODO Drivers without bdrv_file_open must be specified explicitly */
337
338     /*
339      * XXX(hch): we really should not let host device detection
340      * override an explicit protocol specification, but moving this
341      * later breaks access to device names with colons in them.
342      * Thanks to the brain-dead persistent naming schemes on udev-
343      * based Linux systems those actually are quite common.
344      */
345     drv1 = find_hdev_driver(filename);
346     if (drv1) {
347         return drv1;
348     }
349
350     if (!path_has_protocol(filename)) {
351         return bdrv_find_format("file");
352     }
353     p = strchr(filename, ':');
354     assert(p != NULL);
355     len = p - filename;
356     if (len > sizeof(protocol) - 1)
357         len = sizeof(protocol) - 1;
358     memcpy(protocol, filename, len);
359     protocol[len] = '\0';
360     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
361         if (drv1->protocol_name &&
362             !strcmp(drv1->protocol_name, protocol)) {
363             return drv1;
364         }
365     }
366     return NULL;
367 }
368
369 static int find_image_format(const char *filename, BlockDriver **pdrv)
370 {
371     int ret, score, score_max;
372     BlockDriver *drv1, *drv;
373     uint8_t buf[2048];
374     BlockDriverState *bs;
375
376     ret = bdrv_file_open(&bs, filename, 0);
377     if (ret < 0) {
378         *pdrv = NULL;
379         return ret;
380     }
381
382     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
383     if (bs->sg || !bdrv_is_inserted(bs)) {
384         bdrv_delete(bs);
385         drv = bdrv_find_format("raw");
386         if (!drv) {
387             ret = -ENOENT;
388         }
389         *pdrv = drv;
390         return ret;
391     }
392
393     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
394     bdrv_delete(bs);
395     if (ret < 0) {
396         *pdrv = NULL;
397         return ret;
398     }
399
400     score_max = 0;
401     drv = NULL;
402     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
403         if (drv1->bdrv_probe) {
404             score = drv1->bdrv_probe(buf, ret, filename);
405             if (score > score_max) {
406                 score_max = score;
407                 drv = drv1;
408             }
409         }
410     }
411     if (!drv) {
412         ret = -ENOENT;
413     }
414     *pdrv = drv;
415     return ret;
416 }
417
418 /**
419  * Set the current 'total_sectors' value
420  */
421 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
422 {
423     BlockDriver *drv = bs->drv;
424
425     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
426     if (bs->sg)
427         return 0;
428
429     /* query actual device if possible, otherwise just trust the hint */
430     if (drv->bdrv_getlength) {
431         int64_t length = drv->bdrv_getlength(bs);
432         if (length < 0) {
433             return length;
434         }
435         hint = length >> BDRV_SECTOR_BITS;
436     }
437
438     bs->total_sectors = hint;
439     return 0;
440 }
441
442 /**
443  * Set open flags for a given cache mode
444  *
445  * Return 0 on success, -1 if the cache mode was invalid.
446  */
447 int bdrv_parse_cache_flags(const char *mode, int *flags)
448 {
449     *flags &= ~BDRV_O_CACHE_MASK;
450
451     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
452         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
453     } else if (!strcmp(mode, "directsync")) {
454         *flags |= BDRV_O_NOCACHE;
455     } else if (!strcmp(mode, "writeback")) {
456         *flags |= BDRV_O_CACHE_WB;
457     } else if (!strcmp(mode, "unsafe")) {
458         *flags |= BDRV_O_CACHE_WB;
459         *flags |= BDRV_O_NO_FLUSH;
460     } else if (!strcmp(mode, "writethrough")) {
461         /* this is the default */
462     } else {
463         return -1;
464     }
465
466     return 0;
467 }
468
469 /*
470  * Common part for opening disk images and files
471  */
472 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
473     int flags, BlockDriver *drv)
474 {
475     int ret, open_flags;
476
477     assert(drv != NULL);
478
479     trace_bdrv_open_common(bs, filename, flags, drv->format_name);
480
481     bs->file = NULL;
482     bs->total_sectors = 0;
483     bs->encrypted = 0;
484     bs->valid_key = 0;
485     bs->open_flags = flags;
486     bs->buffer_alignment = 512;
487
488     pstrcpy(bs->filename, sizeof(bs->filename), filename);
489
490     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
491         return -ENOTSUP;
492     }
493
494     bs->drv = drv;
495     bs->opaque = g_malloc0(drv->instance_size);
496
497     if (flags & BDRV_O_CACHE_WB)
498         bs->enable_write_cache = 1;
499
500     /*
501      * Clear flags that are internal to the block layer before opening the
502      * image.
503      */
504     open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
505
506     /*
507      * Snapshots should be writable.
508      */
509     if (bs->is_temporary) {
510         open_flags |= BDRV_O_RDWR;
511     }
512
513     /* Open the image, either directly or using a protocol */
514     if (drv->bdrv_file_open) {
515         ret = drv->bdrv_file_open(bs, filename, open_flags);
516     } else {
517         ret = bdrv_file_open(&bs->file, filename, open_flags);
518         if (ret >= 0) {
519             ret = drv->bdrv_open(bs, open_flags);
520         }
521     }
522
523     if (ret < 0) {
524         goto free_and_fail;
525     }
526
527     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
528
529     ret = refresh_total_sectors(bs, bs->total_sectors);
530     if (ret < 0) {
531         goto free_and_fail;
532     }
533
534 #ifndef _WIN32
535     if (bs->is_temporary) {
536         unlink(filename);
537     }
538 #endif
539     return 0;
540
541 free_and_fail:
542     if (bs->file) {
543         bdrv_delete(bs->file);
544         bs->file = NULL;
545     }
546     g_free(bs->opaque);
547     bs->opaque = NULL;
548     bs->drv = NULL;
549     return ret;
550 }
551
552 /*
553  * Opens a file using a protocol (file, host_device, nbd, ...)
554  */
555 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
556 {
557     BlockDriverState *bs;
558     BlockDriver *drv;
559     int ret;
560
561     drv = bdrv_find_protocol(filename);
562     if (!drv) {
563         return -ENOENT;
564     }
565
566     bs = bdrv_new("");
567     ret = bdrv_open_common(bs, filename, flags, drv);
568     if (ret < 0) {
569         bdrv_delete(bs);
570         return ret;
571     }
572     bs->growable = 1;
573     *pbs = bs;
574     return 0;
575 }
576
577 /*
578  * Opens a disk image (raw, qcow2, vmdk, ...)
579  */
580 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
581               BlockDriver *drv)
582 {
583     int ret;
584
585     if (flags & BDRV_O_SNAPSHOT) {
586         BlockDriverState *bs1;
587         int64_t total_size;
588         int is_protocol = 0;
589         BlockDriver *bdrv_qcow2;
590         QEMUOptionParameter *options;
591         char tmp_filename[PATH_MAX];
592         char backing_filename[PATH_MAX];
593
594         /* if snapshot, we create a temporary backing file and open it
595            instead of opening 'filename' directly */
596
597         /* if there is a backing file, use it */
598         bs1 = bdrv_new("");
599         ret = bdrv_open(bs1, filename, 0, drv);
600         if (ret < 0) {
601             bdrv_delete(bs1);
602             return ret;
603         }
604         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
605
606         if (bs1->drv && bs1->drv->protocol_name)
607             is_protocol = 1;
608
609         bdrv_delete(bs1);
610
611         get_tmp_filename(tmp_filename, sizeof(tmp_filename));
612
613         /* Real path is meaningless for protocols */
614         if (is_protocol)
615             snprintf(backing_filename, sizeof(backing_filename),
616                      "%s", filename);
617         else if (!realpath(filename, backing_filename))
618             return -errno;
619
620         bdrv_qcow2 = bdrv_find_format("qcow2");
621         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
622
623         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
624         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
625         if (drv) {
626             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
627                 drv->format_name);
628         }
629
630         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
631         free_option_parameters(options);
632         if (ret < 0) {
633             return ret;
634         }
635
636         filename = tmp_filename;
637         drv = bdrv_qcow2;
638         bs->is_temporary = 1;
639     }
640
641     /* Find the right image format driver */
642     if (!drv) {
643         ret = find_image_format(filename, &drv);
644     }
645
646     if (!drv) {
647         goto unlink_and_fail;
648     }
649
650     /* Open the image */
651     ret = bdrv_open_common(bs, filename, flags, drv);
652     if (ret < 0) {
653         goto unlink_and_fail;
654     }
655
656     /* If there is a backing file, use it */
657     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
658         char backing_filename[PATH_MAX];
659         int back_flags;
660         BlockDriver *back_drv = NULL;
661
662         bs->backing_hd = bdrv_new("");
663
664         if (path_has_protocol(bs->backing_file)) {
665             pstrcpy(backing_filename, sizeof(backing_filename),
666                     bs->backing_file);
667         } else {
668             path_combine(backing_filename, sizeof(backing_filename),
669                          filename, bs->backing_file);
670         }
671
672         if (bs->backing_format[0] != '\0') {
673             back_drv = bdrv_find_format(bs->backing_format);
674         }
675
676         /* backing files always opened read-only */
677         back_flags =
678             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
679
680         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
681         if (ret < 0) {
682             bdrv_close(bs);
683             return ret;
684         }
685         if (bs->is_temporary) {
686             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
687         } else {
688             /* base image inherits from "parent" */
689             bs->backing_hd->keep_read_only = bs->keep_read_only;
690         }
691     }
692
693     if (!bdrv_key_required(bs)) {
694         bdrv_dev_change_media_cb(bs, true);
695     }
696
697     return 0;
698
699 unlink_and_fail:
700     if (bs->is_temporary) {
701         unlink(filename);
702     }
703     return ret;
704 }
705
706 void bdrv_close(BlockDriverState *bs)
707 {
708     if (bs->drv) {
709         if (bs == bs_snapshots) {
710             bs_snapshots = NULL;
711         }
712         if (bs->backing_hd) {
713             bdrv_delete(bs->backing_hd);
714             bs->backing_hd = NULL;
715         }
716         bs->drv->bdrv_close(bs);
717         g_free(bs->opaque);
718 #ifdef _WIN32
719         if (bs->is_temporary) {
720             unlink(bs->filename);
721         }
722 #endif
723         bs->opaque = NULL;
724         bs->drv = NULL;
725
726         if (bs->file != NULL) {
727             bdrv_close(bs->file);
728         }
729
730         bdrv_dev_change_media_cb(bs, false);
731     }
732 }
733
734 void bdrv_close_all(void)
735 {
736     BlockDriverState *bs;
737
738     QTAILQ_FOREACH(bs, &bdrv_states, list) {
739         bdrv_close(bs);
740     }
741 }
742
743 /* make a BlockDriverState anonymous by removing from bdrv_state list.
744    Also, NULL terminate the device_name to prevent double remove */
745 void bdrv_make_anon(BlockDriverState *bs)
746 {
747     if (bs->device_name[0] != '\0') {
748         QTAILQ_REMOVE(&bdrv_states, bs, list);
749     }
750     bs->device_name[0] = '\0';
751 }
752
753 void bdrv_delete(BlockDriverState *bs)
754 {
755     assert(!bs->dev);
756
757     /* remove from list, if necessary */
758     bdrv_make_anon(bs);
759
760     bdrv_close(bs);
761     if (bs->file != NULL) {
762         bdrv_delete(bs->file);
763     }
764
765     assert(bs != bs_snapshots);
766     g_free(bs);
767 }
768
769 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
770 /* TODO change to DeviceState *dev when all users are qdevified */
771 {
772     if (bs->dev) {
773         return -EBUSY;
774     }
775     bs->dev = dev;
776     bdrv_iostatus_reset(bs);
777     return 0;
778 }
779
780 /* TODO qdevified devices don't use this, remove when devices are qdevified */
781 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
782 {
783     if (bdrv_attach_dev(bs, dev) < 0) {
784         abort();
785     }
786 }
787
788 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
789 /* TODO change to DeviceState *dev when all users are qdevified */
790 {
791     assert(bs->dev == dev);
792     bs->dev = NULL;
793     bs->dev_ops = NULL;
794     bs->dev_opaque = NULL;
795     bs->buffer_alignment = 512;
796 }
797
798 /* TODO change to return DeviceState * when all users are qdevified */
799 void *bdrv_get_attached_dev(BlockDriverState *bs)
800 {
801     return bs->dev;
802 }
803
804 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
805                       void *opaque)
806 {
807     bs->dev_ops = ops;
808     bs->dev_opaque = opaque;
809     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
810         bs_snapshots = NULL;
811     }
812 }
813
814 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
815 {
816     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
817         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
818     }
819 }
820
821 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
822 {
823     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
824 }
825
826 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
827 {
828     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
829         return bs->dev_ops->is_tray_open(bs->dev_opaque);
830     }
831     return false;
832 }
833
834 static void bdrv_dev_resize_cb(BlockDriverState *bs)
835 {
836     if (bs->dev_ops && bs->dev_ops->resize_cb) {
837         bs->dev_ops->resize_cb(bs->dev_opaque);
838     }
839 }
840
841 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
842 {
843     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
844         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
845     }
846     return false;
847 }
848
849 /*
850  * Run consistency checks on an image
851  *
852  * Returns 0 if the check could be completed (it doesn't mean that the image is
853  * free of errors) or -errno when an internal error occurred. The results of the
854  * check are stored in res.
855  */
856 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
857 {
858     if (bs->drv->bdrv_check == NULL) {
859         return -ENOTSUP;
860     }
861
862     memset(res, 0, sizeof(*res));
863     return bs->drv->bdrv_check(bs, res);
864 }
865
866 #define COMMIT_BUF_SECTORS 2048
867
868 /* commit COW file into the raw image */
869 int bdrv_commit(BlockDriverState *bs)
870 {
871     BlockDriver *drv = bs->drv;
872     BlockDriver *backing_drv;
873     int64_t sector, total_sectors;
874     int n, ro, open_flags;
875     int ret = 0, rw_ret = 0;
876     uint8_t *buf;
877     char filename[1024];
878     BlockDriverState *bs_rw, *bs_ro;
879
880     if (!drv)
881         return -ENOMEDIUM;
882     
883     if (!bs->backing_hd) {
884         return -ENOTSUP;
885     }
886
887     if (bs->backing_hd->keep_read_only) {
888         return -EACCES;
889     }
890
891     backing_drv = bs->backing_hd->drv;
892     ro = bs->backing_hd->read_only;
893     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
894     open_flags =  bs->backing_hd->open_flags;
895
896     if (ro) {
897         /* re-open as RW */
898         bdrv_delete(bs->backing_hd);
899         bs->backing_hd = NULL;
900         bs_rw = bdrv_new("");
901         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
902             backing_drv);
903         if (rw_ret < 0) {
904             bdrv_delete(bs_rw);
905             /* try to re-open read-only */
906             bs_ro = bdrv_new("");
907             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
908                 backing_drv);
909             if (ret < 0) {
910                 bdrv_delete(bs_ro);
911                 /* drive not functional anymore */
912                 bs->drv = NULL;
913                 return ret;
914             }
915             bs->backing_hd = bs_ro;
916             return rw_ret;
917         }
918         bs->backing_hd = bs_rw;
919     }
920
921     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
922     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
923
924     for (sector = 0; sector < total_sectors; sector += n) {
925         if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
926
927             if (bdrv_read(bs, sector, buf, n) != 0) {
928                 ret = -EIO;
929                 goto ro_cleanup;
930             }
931
932             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
933                 ret = -EIO;
934                 goto ro_cleanup;
935             }
936         }
937     }
938
939     if (drv->bdrv_make_empty) {
940         ret = drv->bdrv_make_empty(bs);
941         bdrv_flush(bs);
942     }
943
944     /*
945      * Make sure all data we wrote to the backing device is actually
946      * stable on disk.
947      */
948     if (bs->backing_hd)
949         bdrv_flush(bs->backing_hd);
950
951 ro_cleanup:
952     g_free(buf);
953
954     if (ro) {
955         /* re-open as RO */
956         bdrv_delete(bs->backing_hd);
957         bs->backing_hd = NULL;
958         bs_ro = bdrv_new("");
959         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
960             backing_drv);
961         if (ret < 0) {
962             bdrv_delete(bs_ro);
963             /* drive not functional anymore */
964             bs->drv = NULL;
965             return ret;
966         }
967         bs->backing_hd = bs_ro;
968         bs->backing_hd->keep_read_only = 0;
969     }
970
971     return ret;
972 }
973
974 void bdrv_commit_all(void)
975 {
976     BlockDriverState *bs;
977
978     QTAILQ_FOREACH(bs, &bdrv_states, list) {
979         bdrv_commit(bs);
980     }
981 }
982
983 /*
984  * Return values:
985  * 0        - success
986  * -EINVAL  - backing format specified, but no file
987  * -ENOSPC  - can't update the backing file because no space is left in the
988  *            image file header
989  * -ENOTSUP - format driver doesn't support changing the backing file
990  */
991 int bdrv_change_backing_file(BlockDriverState *bs,
992     const char *backing_file, const char *backing_fmt)
993 {
994     BlockDriver *drv = bs->drv;
995
996     if (drv->bdrv_change_backing_file != NULL) {
997         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
998     } else {
999         return -ENOTSUP;
1000     }
1001 }
1002
1003 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1004                                    size_t size)
1005 {
1006     int64_t len;
1007
1008     if (!bdrv_is_inserted(bs))
1009         return -ENOMEDIUM;
1010
1011     if (bs->growable)
1012         return 0;
1013
1014     len = bdrv_getlength(bs);
1015
1016     if (offset < 0)
1017         return -EIO;
1018
1019     if ((offset > len) || (len - offset < size))
1020         return -EIO;
1021
1022     return 0;
1023 }
1024
1025 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1026                               int nb_sectors)
1027 {
1028     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1029                                    nb_sectors * BDRV_SECTOR_SIZE);
1030 }
1031
1032 static inline bool bdrv_has_async_rw(BlockDriver *drv)
1033 {
1034     return drv->bdrv_co_readv != bdrv_co_readv_em
1035         || drv->bdrv_aio_readv != bdrv_aio_readv_em;
1036 }
1037
1038 static inline bool bdrv_has_async_flush(BlockDriver *drv)
1039 {
1040     return drv->bdrv_aio_flush != bdrv_aio_flush_em;
1041 }
1042
1043 /* return < 0 if error. See bdrv_write() for the return codes */
1044 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1045               uint8_t *buf, int nb_sectors)
1046 {
1047     BlockDriver *drv = bs->drv;
1048
1049     if (!drv)
1050         return -ENOMEDIUM;
1051
1052     if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) {
1053         QEMUIOVector qiov;
1054         struct iovec iov = {
1055             .iov_base = (void *)buf,
1056             .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1057         };
1058
1059         qemu_iovec_init_external(&qiov, &iov, 1);
1060         return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);
1061     }
1062
1063     if (bdrv_check_request(bs, sector_num, nb_sectors))
1064         return -EIO;
1065
1066     return drv->bdrv_read(bs, sector_num, buf, nb_sectors);
1067 }
1068
1069 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1070                              int nb_sectors, int dirty)
1071 {
1072     int64_t start, end;
1073     unsigned long val, idx, bit;
1074
1075     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1076     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1077
1078     for (; start <= end; start++) {
1079         idx = start / (sizeof(unsigned long) * 8);
1080         bit = start % (sizeof(unsigned long) * 8);
1081         val = bs->dirty_bitmap[idx];
1082         if (dirty) {
1083             if (!(val & (1UL << bit))) {
1084                 bs->dirty_count++;
1085                 val |= 1UL << bit;
1086             }
1087         } else {
1088             if (val & (1UL << bit)) {
1089                 bs->dirty_count--;
1090                 val &= ~(1UL << bit);
1091             }
1092         }
1093         bs->dirty_bitmap[idx] = val;
1094     }
1095 }
1096
1097 /* Return < 0 if error. Important errors are:
1098   -EIO         generic I/O error (may happen for all errors)
1099   -ENOMEDIUM   No media inserted.
1100   -EINVAL      Invalid sector number or nb_sectors
1101   -EACCES      Trying to write a read-only device
1102 */
1103 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1104                const uint8_t *buf, int nb_sectors)
1105 {
1106     BlockDriver *drv = bs->drv;
1107
1108     if (!bs->drv)
1109         return -ENOMEDIUM;
1110
1111     if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) {
1112         QEMUIOVector qiov;
1113         struct iovec iov = {
1114             .iov_base = (void *)buf,
1115             .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1116         };
1117
1118         qemu_iovec_init_external(&qiov, &iov, 1);
1119         return bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1120     }
1121
1122     if (bs->read_only)
1123         return -EACCES;
1124     if (bdrv_check_request(bs, sector_num, nb_sectors))
1125         return -EIO;
1126
1127     if (bs->dirty_bitmap) {
1128         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1129     }
1130
1131     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1132         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1133     }
1134
1135     return drv->bdrv_write(bs, sector_num, buf, nb_sectors);
1136 }
1137
1138 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1139                void *buf, int count1)
1140 {
1141     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1142     int len, nb_sectors, count;
1143     int64_t sector_num;
1144     int ret;
1145
1146     count = count1;
1147     /* first read to align to sector start */
1148     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1149     if (len > count)
1150         len = count;
1151     sector_num = offset >> BDRV_SECTOR_BITS;
1152     if (len > 0) {
1153         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1154             return ret;
1155         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1156         count -= len;
1157         if (count == 0)
1158             return count1;
1159         sector_num++;
1160         buf += len;
1161     }
1162
1163     /* read the sectors "in place" */
1164     nb_sectors = count >> BDRV_SECTOR_BITS;
1165     if (nb_sectors > 0) {
1166         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1167             return ret;
1168         sector_num += nb_sectors;
1169         len = nb_sectors << BDRV_SECTOR_BITS;
1170         buf += len;
1171         count -= len;
1172     }
1173
1174     /* add data from the last sector */
1175     if (count > 0) {
1176         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1177             return ret;
1178         memcpy(buf, tmp_buf, count);
1179     }
1180     return count1;
1181 }
1182
1183 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1184                 const void *buf, int count1)
1185 {
1186     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1187     int len, nb_sectors, count;
1188     int64_t sector_num;
1189     int ret;
1190
1191     count = count1;
1192     /* first write to align to sector start */
1193     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1194     if (len > count)
1195         len = count;
1196     sector_num = offset >> BDRV_SECTOR_BITS;
1197     if (len > 0) {
1198         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1199             return ret;
1200         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1201         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1202             return ret;
1203         count -= len;
1204         if (count == 0)
1205             return count1;
1206         sector_num++;
1207         buf += len;
1208     }
1209
1210     /* write the sectors "in place" */
1211     nb_sectors = count >> BDRV_SECTOR_BITS;
1212     if (nb_sectors > 0) {
1213         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1214             return ret;
1215         sector_num += nb_sectors;
1216         len = nb_sectors << BDRV_SECTOR_BITS;
1217         buf += len;
1218         count -= len;
1219     }
1220
1221     /* add data from the last sector */
1222     if (count > 0) {
1223         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1224             return ret;
1225         memcpy(tmp_buf, buf, count);
1226         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1227             return ret;
1228     }
1229     return count1;
1230 }
1231
1232 /*
1233  * Writes to the file and ensures that no writes are reordered across this
1234  * request (acts as a barrier)
1235  *
1236  * Returns 0 on success, -errno in error cases.
1237  */
1238 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1239     const void *buf, int count)
1240 {
1241     int ret;
1242
1243     ret = bdrv_pwrite(bs, offset, buf, count);
1244     if (ret < 0) {
1245         return ret;
1246     }
1247
1248     /* No flush needed for cache modes that use O_DSYNC */
1249     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1250         bdrv_flush(bs);
1251     }
1252
1253     return 0;
1254 }
1255
1256 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1257     int nb_sectors, QEMUIOVector *qiov)
1258 {
1259     BlockDriver *drv = bs->drv;
1260
1261     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1262
1263     if (!drv) {
1264         return -ENOMEDIUM;
1265     }
1266     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1267         return -EIO;
1268     }
1269
1270     return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1271 }
1272
1273 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1274     int nb_sectors, QEMUIOVector *qiov)
1275 {
1276     BlockDriver *drv = bs->drv;
1277
1278     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1279
1280     if (!bs->drv) {
1281         return -ENOMEDIUM;
1282     }
1283     if (bs->read_only) {
1284         return -EACCES;
1285     }
1286     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1287         return -EIO;
1288     }
1289
1290     if (bs->dirty_bitmap) {
1291         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1292     }
1293
1294     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1295         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1296     }
1297
1298     return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1299 }
1300
1301 /**
1302  * Truncate file to 'offset' bytes (needed only for file protocols)
1303  */
1304 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1305 {
1306     BlockDriver *drv = bs->drv;
1307     int ret;
1308     if (!drv)
1309         return -ENOMEDIUM;
1310     if (!drv->bdrv_truncate)
1311         return -ENOTSUP;
1312     if (bs->read_only)
1313         return -EACCES;
1314     if (bdrv_in_use(bs))
1315         return -EBUSY;
1316     ret = drv->bdrv_truncate(bs, offset);
1317     if (ret == 0) {
1318         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1319         bdrv_dev_resize_cb(bs);
1320     }
1321     return ret;
1322 }
1323
1324 /**
1325  * Length of a allocated file in bytes. Sparse files are counted by actual
1326  * allocated space. Return < 0 if error or unknown.
1327  */
1328 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1329 {
1330     BlockDriver *drv = bs->drv;
1331     if (!drv) {
1332         return -ENOMEDIUM;
1333     }
1334     if (drv->bdrv_get_allocated_file_size) {
1335         return drv->bdrv_get_allocated_file_size(bs);
1336     }
1337     if (bs->file) {
1338         return bdrv_get_allocated_file_size(bs->file);
1339     }
1340     return -ENOTSUP;
1341 }
1342
1343 /**
1344  * Length of a file in bytes. Return < 0 if error or unknown.
1345  */
1346 int64_t bdrv_getlength(BlockDriverState *bs)
1347 {
1348     BlockDriver *drv = bs->drv;
1349     if (!drv)
1350         return -ENOMEDIUM;
1351
1352     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1353         if (drv->bdrv_getlength) {
1354             return drv->bdrv_getlength(bs);
1355         }
1356     }
1357     return bs->total_sectors * BDRV_SECTOR_SIZE;
1358 }
1359
1360 /* return 0 as number of sectors if no device present or error */
1361 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1362 {
1363     int64_t length;
1364     length = bdrv_getlength(bs);
1365     if (length < 0)
1366         length = 0;
1367     else
1368         length = length >> BDRV_SECTOR_BITS;
1369     *nb_sectors_ptr = length;
1370 }
1371
/*
 * One entry of the MSDOS partition table, as overlaid on the raw boot
 * sector by guess_disk_lchs(). The layout is packed and must not change.
 */
struct partition {
    uint8_t boot_ind;       /* 0x80 - active */
    uint8_t head;           /* starting head */
    uint8_t sector;         /* starting sector */
    uint8_t cyl;            /* starting cylinder */
    uint8_t sys_ind;        /* what partition type */
    uint8_t end_head;       /* end head */
    uint8_t end_sector;     /* end sector */
    uint8_t end_cyl;        /* end cylinder */
    uint32_t start_sect;    /* starting sector counting from 0 */
    uint32_t nr_sects;      /* number of sectors in partition */
} QEMU_PACKED;
1384
1385 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1386 static int guess_disk_lchs(BlockDriverState *bs,
1387                            int *pcylinders, int *pheads, int *psectors)
1388 {
1389     uint8_t buf[BDRV_SECTOR_SIZE];
1390     int ret, i, heads, sectors, cylinders;
1391     struct partition *p;
1392     uint32_t nr_sects;
1393     uint64_t nb_sectors;
1394
1395     bdrv_get_geometry(bs, &nb_sectors);
1396
1397     ret = bdrv_read(bs, 0, buf, 1);
1398     if (ret < 0)
1399         return -1;
1400     /* test msdos magic */
1401     if (buf[510] != 0x55 || buf[511] != 0xaa)
1402         return -1;
1403     for(i = 0; i < 4; i++) {
1404         p = ((struct partition *)(buf + 0x1be)) + i;
1405         nr_sects = le32_to_cpu(p->nr_sects);
1406         if (nr_sects && p->end_head) {
1407             /* We make the assumption that the partition terminates on
1408                a cylinder boundary */
1409             heads = p->end_head + 1;
1410             sectors = p->end_sector & 63;
1411             if (sectors == 0)
1412                 continue;
1413             cylinders = nb_sectors / (heads * sectors);
1414             if (cylinders < 1 || cylinders > 16383)
1415                 continue;
1416             *pheads = heads;
1417             *psectors = sectors;
1418             *pcylinders = cylinders;
1419 #if 0
1420             printf("guessed geometry: LCHS=%d %d %d\n",
1421                    cylinders, heads, sectors);
1422 #endif
1423             return 0;
1424         }
1425     }
1426     return -1;
1427 }
1428
1429 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1430 {
1431     int translation, lba_detected = 0;
1432     int cylinders, heads, secs;
1433     uint64_t nb_sectors;
1434
1435     /* if a geometry hint is available, use it */
1436     bdrv_get_geometry(bs, &nb_sectors);
1437     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1438     translation = bdrv_get_translation_hint(bs);
1439     if (cylinders != 0) {
1440         *pcyls = cylinders;
1441         *pheads = heads;
1442         *psecs = secs;
1443     } else {
1444         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1445             if (heads > 16) {
1446                 /* if heads > 16, it means that a BIOS LBA
1447                    translation was active, so the default
1448                    hardware geometry is OK */
1449                 lba_detected = 1;
1450                 goto default_geometry;
1451             } else {
1452                 *pcyls = cylinders;
1453                 *pheads = heads;
1454                 *psecs = secs;
1455                 /* disable any translation to be in sync with
1456                    the logical geometry */
1457                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1458                     bdrv_set_translation_hint(bs,
1459                                               BIOS_ATA_TRANSLATION_NONE);
1460                 }
1461             }
1462         } else {
1463         default_geometry:
1464             /* if no geometry, use a standard physical disk geometry */
1465             cylinders = nb_sectors / (16 * 63);
1466
1467             if (cylinders > 16383)
1468                 cylinders = 16383;
1469             else if (cylinders < 2)
1470                 cylinders = 2;
1471             *pcyls = cylinders;
1472             *pheads = 16;
1473             *psecs = 63;
1474             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1475                 if ((*pcyls * *pheads) <= 131072) {
1476                     bdrv_set_translation_hint(bs,
1477                                               BIOS_ATA_TRANSLATION_LARGE);
1478                 } else {
1479                     bdrv_set_translation_hint(bs,
1480                                               BIOS_ATA_TRANSLATION_LBA);
1481                 }
1482             }
1483         }
1484         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1485     }
1486 }
1487
1488 void bdrv_set_geometry_hint(BlockDriverState *bs,
1489                             int cyls, int heads, int secs)
1490 {
1491     bs->cyls = cyls;
1492     bs->heads = heads;
1493     bs->secs = secs;
1494 }
1495
1496 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1497 {
1498     bs->translation = translation;
1499 }
1500
1501 void bdrv_get_geometry_hint(BlockDriverState *bs,
1502                             int *pcyls, int *pheads, int *psecs)
1503 {
1504     *pcyls = bs->cyls;
1505     *pheads = bs->heads;
1506     *psecs = bs->secs;
1507 }
1508
1509 /* Recognize floppy formats */
1510 typedef struct FDFormat {
1511     FDriveType drive;
1512     uint8_t last_sect;
1513     uint8_t max_track;
1514     uint8_t max_head;
1515 } FDFormat;
1516
1517 static const FDFormat fd_formats[] = {
1518     /* First entry is default format */
1519     /* 1.44 MB 3"1/2 floppy disks */
1520     { FDRIVE_DRV_144, 18, 80, 1, },
1521     { FDRIVE_DRV_144, 20, 80, 1, },
1522     { FDRIVE_DRV_144, 21, 80, 1, },
1523     { FDRIVE_DRV_144, 21, 82, 1, },
1524     { FDRIVE_DRV_144, 21, 83, 1, },
1525     { FDRIVE_DRV_144, 22, 80, 1, },
1526     { FDRIVE_DRV_144, 23, 80, 1, },
1527     { FDRIVE_DRV_144, 24, 80, 1, },
1528     /* 2.88 MB 3"1/2 floppy disks */
1529     { FDRIVE_DRV_288, 36, 80, 1, },
1530     { FDRIVE_DRV_288, 39, 80, 1, },
1531     { FDRIVE_DRV_288, 40, 80, 1, },
1532     { FDRIVE_DRV_288, 44, 80, 1, },
1533     { FDRIVE_DRV_288, 48, 80, 1, },
1534     /* 720 kB 3"1/2 floppy disks */
1535     { FDRIVE_DRV_144,  9, 80, 1, },
1536     { FDRIVE_DRV_144, 10, 80, 1, },
1537     { FDRIVE_DRV_144, 10, 82, 1, },
1538     { FDRIVE_DRV_144, 10, 83, 1, },
1539     { FDRIVE_DRV_144, 13, 80, 1, },
1540     { FDRIVE_DRV_144, 14, 80, 1, },
1541     /* 1.2 MB 5"1/4 floppy disks */
1542     { FDRIVE_DRV_120, 15, 80, 1, },
1543     { FDRIVE_DRV_120, 18, 80, 1, },
1544     { FDRIVE_DRV_120, 18, 82, 1, },
1545     { FDRIVE_DRV_120, 18, 83, 1, },
1546     { FDRIVE_DRV_120, 20, 80, 1, },
1547     /* 720 kB 5"1/4 floppy disks */
1548     { FDRIVE_DRV_120,  9, 80, 1, },
1549     { FDRIVE_DRV_120, 11, 80, 1, },
1550     /* 360 kB 5"1/4 floppy disks */
1551     { FDRIVE_DRV_120,  9, 40, 1, },
1552     { FDRIVE_DRV_120,  9, 40, 0, },
1553     { FDRIVE_DRV_120, 10, 41, 1, },
1554     { FDRIVE_DRV_120, 10, 42, 1, },
1555     /* 320 kB 5"1/4 floppy disks */
1556     { FDRIVE_DRV_120,  8, 40, 1, },
1557     { FDRIVE_DRV_120,  8, 40, 0, },
1558     /* 360 kB must match 5"1/4 better than 3"1/2... */
1559     { FDRIVE_DRV_144,  9, 80, 0, },
1560     /* end */
1561     { FDRIVE_DRV_NONE, -1, -1, 0, },
1562 };
1563
1564 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1565                                    int *max_track, int *last_sect,
1566                                    FDriveType drive_in, FDriveType *drive)
1567 {
1568     const FDFormat *parse;
1569     uint64_t nb_sectors, size;
1570     int i, first_match, match;
1571
1572     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1573     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1574         /* User defined disk */
1575     } else {
1576         bdrv_get_geometry(bs, &nb_sectors);
1577         match = -1;
1578         first_match = -1;
1579         for (i = 0; ; i++) {
1580             parse = &fd_formats[i];
1581             if (parse->drive == FDRIVE_DRV_NONE) {
1582                 break;
1583             }
1584             if (drive_in == parse->drive ||
1585                 drive_in == FDRIVE_DRV_NONE) {
1586                 size = (parse->max_head + 1) * parse->max_track *
1587                     parse->last_sect;
1588                 if (nb_sectors == size) {
1589                     match = i;
1590                     break;
1591                 }
1592                 if (first_match == -1) {
1593                     first_match = i;
1594                 }
1595             }
1596         }
1597         if (match == -1) {
1598             if (first_match == -1) {
1599                 match = 1;
1600             } else {
1601                 match = first_match;
1602             }
1603             parse = &fd_formats[match];
1604         }
1605         *nb_heads = parse->max_head + 1;
1606         *max_track = parse->max_track;
1607         *last_sect = parse->last_sect;
1608         *drive = parse->drive;
1609     }
1610 }
1611
1612 int bdrv_get_translation_hint(BlockDriverState *bs)
1613 {
1614     return bs->translation;
1615 }
1616
1617 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1618                        BlockErrorAction on_write_error)
1619 {
1620     bs->on_read_error = on_read_error;
1621     bs->on_write_error = on_write_error;
1622 }
1623
1624 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1625 {
1626     return is_read ? bs->on_read_error : bs->on_write_error;
1627 }
1628
1629 int bdrv_is_read_only(BlockDriverState *bs)
1630 {
1631     return bs->read_only;
1632 }
1633
1634 int bdrv_is_sg(BlockDriverState *bs)
1635 {
1636     return bs->sg;
1637 }
1638
1639 int bdrv_enable_write_cache(BlockDriverState *bs)
1640 {
1641     return bs->enable_write_cache;
1642 }
1643
1644 int bdrv_is_encrypted(BlockDriverState *bs)
1645 {
1646     if (bs->backing_hd && bs->backing_hd->encrypted)
1647         return 1;
1648     return bs->encrypted;
1649 }
1650
1651 int bdrv_key_required(BlockDriverState *bs)
1652 {
1653     BlockDriverState *backing_hd = bs->backing_hd;
1654
1655     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1656         return 1;
1657     return (bs->encrypted && !bs->valid_key);
1658 }
1659
1660 int bdrv_set_key(BlockDriverState *bs, const char *key)
1661 {
1662     int ret;
1663     if (bs->backing_hd && bs->backing_hd->encrypted) {
1664         ret = bdrv_set_key(bs->backing_hd, key);
1665         if (ret < 0)
1666             return ret;
1667         if (!bs->encrypted)
1668             return 0;
1669     }
1670     if (!bs->encrypted) {
1671         return -EINVAL;
1672     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1673         return -ENOMEDIUM;
1674     }
1675     ret = bs->drv->bdrv_set_key(bs, key);
1676     if (ret < 0) {
1677         bs->valid_key = 0;
1678     } else if (!bs->valid_key) {
1679         bs->valid_key = 1;
1680         /* call the change callback now, we skipped it on open */
1681         bdrv_dev_change_media_cb(bs, true);
1682     }
1683     return ret;
1684 }
1685
1686 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1687 {
1688     if (!bs->drv) {
1689         buf[0] = '\0';
1690     } else {
1691         pstrcpy(buf, buf_size, bs->drv->format_name);
1692     }
1693 }
1694
1695 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1696                          void *opaque)
1697 {
1698     BlockDriver *drv;
1699
1700     QLIST_FOREACH(drv, &bdrv_drivers, list) {
1701         it(opaque, drv->format_name);
1702     }
1703 }
1704
1705 BlockDriverState *bdrv_find(const char *name)
1706 {
1707     BlockDriverState *bs;
1708
1709     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1710         if (!strcmp(name, bs->device_name)) {
1711             return bs;
1712         }
1713     }
1714     return NULL;
1715 }
1716
1717 BlockDriverState *bdrv_next(BlockDriverState *bs)
1718 {
1719     if (!bs) {
1720         return QTAILQ_FIRST(&bdrv_states);
1721     }
1722     return QTAILQ_NEXT(bs, list);
1723 }
1724
1725 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1726 {
1727     BlockDriverState *bs;
1728
1729     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1730         it(opaque, bs);
1731     }
1732 }
1733
1734 const char *bdrv_get_device_name(BlockDriverState *bs)
1735 {
1736     return bs->device_name;
1737 }
1738
1739 int bdrv_flush(BlockDriverState *bs)
1740 {
1741     if (bs->open_flags & BDRV_O_NO_FLUSH) {
1742         return 0;
1743     }
1744
1745     if (bs->drv && bdrv_has_async_flush(bs->drv) && qemu_in_coroutine()) {
1746         return bdrv_co_flush_em(bs);
1747     }
1748
1749     if (bs->drv && bs->drv->bdrv_flush) {
1750         return bs->drv->bdrv_flush(bs);
1751     }
1752
1753     /*
1754      * Some block drivers always operate in either writethrough or unsafe mode
1755      * and don't support bdrv_flush therefore. Usually qemu doesn't know how
1756      * the server works (because the behaviour is hardcoded or depends on
1757      * server-side configuration), so we can't ensure that everything is safe
1758      * on disk. Returning an error doesn't work because that would break guests
1759      * even if the server operates in writethrough mode.
1760      *
1761      * Let's hope the user knows what he's doing.
1762      */
1763     return 0;
1764 }
1765
1766 void bdrv_flush_all(void)
1767 {
1768     BlockDriverState *bs;
1769
1770     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1771         if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
1772             bdrv_flush(bs);
1773         }
1774     }
1775 }
1776
1777 int bdrv_has_zero_init(BlockDriverState *bs)
1778 {
1779     assert(bs->drv);
1780
1781     if (bs->drv->bdrv_has_zero_init) {
1782         return bs->drv->bdrv_has_zero_init(bs);
1783     }
1784
1785     return 1;
1786 }
1787
1788 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
1789 {
1790     if (!bs->drv) {
1791         return -ENOMEDIUM;
1792     }
1793     if (!bs->drv->bdrv_discard) {
1794         return 0;
1795     }
1796     return bs->drv->bdrv_discard(bs, sector_num, nb_sectors);
1797 }
1798
1799 /*
1800  * Returns true iff the specified sector is present in the disk image. Drivers
1801  * not implementing the functionality are assumed to not support backing files,
1802  * hence all their sectors are reported as allocated.
1803  *
1804  * 'pnum' is set to the number of sectors (including and immediately following
1805  * the specified sector) that are known to be in the same
1806  * allocated/unallocated state.
1807  *
1808  * 'nb_sectors' is the max value 'pnum' should be set to.
1809  */
1810 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1811         int *pnum)
1812 {
1813     int64_t n;
1814     if (!bs->drv->bdrv_is_allocated) {
1815         if (sector_num >= bs->total_sectors) {
1816             *pnum = 0;
1817             return 0;
1818         }
1819         n = bs->total_sectors - sector_num;
1820         *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1821         return 1;
1822     }
1823     return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1824 }
1825
1826 void bdrv_mon_event(const BlockDriverState *bdrv,
1827                     BlockMonEventAction action, int is_read)
1828 {
1829     QObject *data;
1830     const char *action_str;
1831
1832     switch (action) {
1833     case BDRV_ACTION_REPORT:
1834         action_str = "report";
1835         break;
1836     case BDRV_ACTION_IGNORE:
1837         action_str = "ignore";
1838         break;
1839     case BDRV_ACTION_STOP:
1840         action_str = "stop";
1841         break;
1842     default:
1843         abort();
1844     }
1845
1846     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1847                               bdrv->device_name,
1848                               action_str,
1849                               is_read ? "read" : "write");
1850     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1851
1852     qobject_decref(data);
1853 }
1854
1855 static void bdrv_print_dict(QObject *obj, void *opaque)
1856 {
1857     QDict *bs_dict;
1858     Monitor *mon = opaque;
1859
1860     bs_dict = qobject_to_qdict(obj);
1861
1862     monitor_printf(mon, "%s: removable=%d",
1863                         qdict_get_str(bs_dict, "device"),
1864                         qdict_get_bool(bs_dict, "removable"));
1865
1866     if (qdict_get_bool(bs_dict, "removable")) {
1867         monitor_printf(mon, " locked=%d", qdict_get_bool(bs_dict, "locked"));
1868         monitor_printf(mon, " tray-open=%d",
1869                        qdict_get_bool(bs_dict, "tray-open"));
1870     }
1871     if (qdict_haskey(bs_dict, "inserted")) {
1872         QDict *qdict = qobject_to_qdict(qdict_get(bs_dict, "inserted"));
1873
1874         monitor_printf(mon, " file=");
1875         monitor_print_filename(mon, qdict_get_str(qdict, "file"));
1876         if (qdict_haskey(qdict, "backing_file")) {
1877             monitor_printf(mon, " backing_file=");
1878             monitor_print_filename(mon, qdict_get_str(qdict, "backing_file"));
1879         }
1880         monitor_printf(mon, " ro=%d drv=%s encrypted=%d",
1881                             qdict_get_bool(qdict, "ro"),
1882                             qdict_get_str(qdict, "drv"),
1883                             qdict_get_bool(qdict, "encrypted"));
1884     } else {
1885         monitor_printf(mon, " [not inserted]");
1886     }
1887
1888     monitor_printf(mon, "\n");
1889 }
1890
1891 void bdrv_info_print(Monitor *mon, const QObject *data)
1892 {
1893     qlist_iter(qobject_to_qlist(data), bdrv_print_dict, mon);
1894 }
1895
1896 void bdrv_info(Monitor *mon, QObject **ret_data)
1897 {
1898     QList *bs_list;
1899     BlockDriverState *bs;
1900
1901     bs_list = qlist_new();
1902
1903     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1904         QObject *bs_obj;
1905         QDict *bs_dict;
1906
1907         bs_obj = qobject_from_jsonf("{ 'device': %s, 'type': 'unknown', "
1908                                     "'removable': %i, 'locked': %i }",
1909                                     bs->device_name,
1910                                     bdrv_dev_has_removable_media(bs),
1911                                     bdrv_dev_is_medium_locked(bs));
1912         bs_dict = qobject_to_qdict(bs_obj);
1913
1914         if (bdrv_dev_has_removable_media(bs)) {
1915             qdict_put(bs_dict, "tray-open",
1916                       qbool_from_int(bdrv_dev_is_tray_open(bs)));
1917         }
1918         if (bs->drv) {
1919             QObject *obj;
1920
1921             obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
1922                                      "'encrypted': %i }",
1923                                      bs->filename, bs->read_only,
1924                                      bs->drv->format_name,
1925                                      bdrv_is_encrypted(bs));
1926             if (bs->backing_file[0] != '\0') {
1927                 QDict *qdict = qobject_to_qdict(obj);
1928                 qdict_put(qdict, "backing_file",
1929                           qstring_from_str(bs->backing_file));
1930             }
1931
1932             qdict_put_obj(bs_dict, "inserted", obj);
1933         }
1934         qlist_append_obj(bs_list, bs_obj);
1935     }
1936
1937     *ret_data = QOBJECT(bs_list);
1938 }
1939
1940 static void bdrv_stats_iter(QObject *data, void *opaque)
1941 {
1942     QDict *qdict;
1943     Monitor *mon = opaque;
1944
1945     qdict = qobject_to_qdict(data);
1946     monitor_printf(mon, "%s:", qdict_get_str(qdict, "device"));
1947
1948     qdict = qobject_to_qdict(qdict_get(qdict, "stats"));
1949     monitor_printf(mon, " rd_bytes=%" PRId64
1950                         " wr_bytes=%" PRId64
1951                         " rd_operations=%" PRId64
1952                         " wr_operations=%" PRId64
1953                         " flush_operations=%" PRId64
1954                         " wr_total_time_ns=%" PRId64
1955                         " rd_total_time_ns=%" PRId64
1956                         " flush_total_time_ns=%" PRId64
1957                         "\n",
1958                         qdict_get_int(qdict, "rd_bytes"),
1959                         qdict_get_int(qdict, "wr_bytes"),
1960                         qdict_get_int(qdict, "rd_operations"),
1961                         qdict_get_int(qdict, "wr_operations"),
1962                         qdict_get_int(qdict, "flush_operations"),
1963                         qdict_get_int(qdict, "wr_total_time_ns"),
1964                         qdict_get_int(qdict, "rd_total_time_ns"),
1965                         qdict_get_int(qdict, "flush_total_time_ns"));
1966 }
1967
1968 void bdrv_stats_print(Monitor *mon, const QObject *data)
1969 {
1970     qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon);
1971 }
1972
1973 static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
1974 {
1975     QObject *res;
1976     QDict *dict;
1977
1978     res = qobject_from_jsonf("{ 'stats': {"
1979                              "'rd_bytes': %" PRId64 ","
1980                              "'wr_bytes': %" PRId64 ","
1981                              "'rd_operations': %" PRId64 ","
1982                              "'wr_operations': %" PRId64 ","
1983                              "'wr_highest_offset': %" PRId64 ","
1984                              "'flush_operations': %" PRId64 ","
1985                              "'wr_total_time_ns': %" PRId64 ","
1986                              "'rd_total_time_ns': %" PRId64 ","
1987                              "'flush_total_time_ns': %" PRId64
1988                              "} }",
1989                              bs->nr_bytes[BDRV_ACCT_READ],
1990                              bs->nr_bytes[BDRV_ACCT_WRITE],
1991                              bs->nr_ops[BDRV_ACCT_READ],
1992                              bs->nr_ops[BDRV_ACCT_WRITE],
1993                              bs->wr_highest_sector *
1994                              (uint64_t)BDRV_SECTOR_SIZE,
1995                              bs->nr_ops[BDRV_ACCT_FLUSH],
1996                              bs->total_time_ns[BDRV_ACCT_WRITE],
1997                              bs->total_time_ns[BDRV_ACCT_READ],
1998                              bs->total_time_ns[BDRV_ACCT_FLUSH]);
1999     dict  = qobject_to_qdict(res);
2000
2001     if (*bs->device_name) {
2002         qdict_put(dict, "device", qstring_from_str(bs->device_name));
2003     }
2004
2005     if (bs->file) {
2006         QObject *parent = bdrv_info_stats_bs(bs->file);
2007         qdict_put_obj(dict, "parent", parent);
2008     }
2009
2010     return res;
2011 }
2012
2013 void bdrv_info_stats(Monitor *mon, QObject **ret_data)
2014 {
2015     QObject *obj;
2016     QList *devices;
2017     BlockDriverState *bs;
2018
2019     devices = qlist_new();
2020
2021     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2022         obj = bdrv_info_stats_bs(bs);
2023         qlist_append_obj(devices, obj);
2024     }
2025
2026     *ret_data = QOBJECT(devices);
2027 }
2028
2029 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2030 {
2031     if (bs->backing_hd && bs->backing_hd->encrypted)
2032         return bs->backing_file;
2033     else if (bs->encrypted)
2034         return bs->filename;
2035     else
2036         return NULL;
2037 }
2038
2039 void bdrv_get_backing_filename(BlockDriverState *bs,
2040                                char *filename, int filename_size)
2041 {
2042     if (!bs->backing_file) {
2043         pstrcpy(filename, filename_size, "");
2044     } else {
2045         pstrcpy(filename, filename_size, bs->backing_file);
2046     }
2047 }
2048
2049 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2050                           const uint8_t *buf, int nb_sectors)
2051 {
2052     BlockDriver *drv = bs->drv;
2053     if (!drv)
2054         return -ENOMEDIUM;
2055     if (!drv->bdrv_write_compressed)
2056         return -ENOTSUP;
2057     if (bdrv_check_request(bs, sector_num, nb_sectors))
2058         return -EIO;
2059
2060     if (bs->dirty_bitmap) {
2061         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2062     }
2063
2064     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2065 }
2066
2067 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2068 {
2069     BlockDriver *drv = bs->drv;
2070     if (!drv)
2071         return -ENOMEDIUM;
2072     if (!drv->bdrv_get_info)
2073         return -ENOTSUP;
2074     memset(bdi, 0, sizeof(*bdi));
2075     return drv->bdrv_get_info(bs, bdi);
2076 }
2077
2078 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2079                       int64_t pos, int size)
2080 {
2081     BlockDriver *drv = bs->drv;
2082     if (!drv)
2083         return -ENOMEDIUM;
2084     if (drv->bdrv_save_vmstate)
2085         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2086     if (bs->file)
2087         return bdrv_save_vmstate(bs->file, buf, pos, size);
2088     return -ENOTSUP;
2089 }
2090
2091 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2092                       int64_t pos, int size)
2093 {
2094     BlockDriver *drv = bs->drv;
2095     if (!drv)
2096         return -ENOMEDIUM;
2097     if (drv->bdrv_load_vmstate)
2098         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2099     if (bs->file)
2100         return bdrv_load_vmstate(bs->file, buf, pos, size);
2101     return -ENOTSUP;
2102 }
2103
2104 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2105 {
2106     BlockDriver *drv = bs->drv;
2107
2108     if (!drv || !drv->bdrv_debug_event) {
2109         return;
2110     }
2111
2112     return drv->bdrv_debug_event(bs, event);
2113
2114 }
2115
2116 /**************************************************************/
2117 /* handling of snapshots */
2118
2119 int bdrv_can_snapshot(BlockDriverState *bs)
2120 {
2121     BlockDriver *drv = bs->drv;
2122     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2123         return 0;
2124     }
2125
2126     if (!drv->bdrv_snapshot_create) {
2127         if (bs->file != NULL) {
2128             return bdrv_can_snapshot(bs->file);
2129         }
2130         return 0;
2131     }
2132
2133     return 1;
2134 }
2135
2136 int bdrv_is_snapshot(BlockDriverState *bs)
2137 {
2138     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2139 }
2140
2141 BlockDriverState *bdrv_snapshots(void)
2142 {
2143     BlockDriverState *bs;
2144
2145     if (bs_snapshots) {
2146         return bs_snapshots;
2147     }
2148
2149     bs = NULL;
2150     while ((bs = bdrv_next(bs))) {
2151         if (bdrv_can_snapshot(bs)) {
2152             bs_snapshots = bs;
2153             return bs;
2154         }
2155     }
2156     return NULL;
2157 }
2158
2159 int bdrv_snapshot_create(BlockDriverState *bs,
2160                          QEMUSnapshotInfo *sn_info)
2161 {
2162     BlockDriver *drv = bs->drv;
2163     if (!drv)
2164         return -ENOMEDIUM;
2165     if (drv->bdrv_snapshot_create)
2166         return drv->bdrv_snapshot_create(bs, sn_info);
2167     if (bs->file)
2168         return bdrv_snapshot_create(bs->file, sn_info);
2169     return -ENOTSUP;
2170 }
2171
2172 int bdrv_snapshot_goto(BlockDriverState *bs,
2173                        const char *snapshot_id)
2174 {
2175     BlockDriver *drv = bs->drv;
2176     int ret, open_ret;
2177
2178     if (!drv)
2179         return -ENOMEDIUM;
2180     if (drv->bdrv_snapshot_goto)
2181         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2182
2183     if (bs->file) {
2184         drv->bdrv_close(bs);
2185         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2186         open_ret = drv->bdrv_open(bs, bs->open_flags);
2187         if (open_ret < 0) {
2188             bdrv_delete(bs->file);
2189             bs->drv = NULL;
2190             return open_ret;
2191         }
2192         return ret;
2193     }
2194
2195     return -ENOTSUP;
2196 }
2197
2198 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2199 {
2200     BlockDriver *drv = bs->drv;
2201     if (!drv)
2202         return -ENOMEDIUM;
2203     if (drv->bdrv_snapshot_delete)
2204         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2205     if (bs->file)
2206         return bdrv_snapshot_delete(bs->file, snapshot_id);
2207     return -ENOTSUP;
2208 }
2209
2210 int bdrv_snapshot_list(BlockDriverState *bs,
2211                        QEMUSnapshotInfo **psn_info)
2212 {
2213     BlockDriver *drv = bs->drv;
2214     if (!drv)
2215         return -ENOMEDIUM;
2216     if (drv->bdrv_snapshot_list)
2217         return drv->bdrv_snapshot_list(bs, psn_info);
2218     if (bs->file)
2219         return bdrv_snapshot_list(bs->file, psn_info);
2220     return -ENOTSUP;
2221 }
2222
2223 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2224         const char *snapshot_name)
2225 {
2226     BlockDriver *drv = bs->drv;
2227     if (!drv) {
2228         return -ENOMEDIUM;
2229     }
2230     if (!bs->read_only) {
2231         return -EINVAL;
2232     }
2233     if (drv->bdrv_snapshot_load_tmp) {
2234         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2235     }
2236     return -ENOTSUP;
2237 }
2238
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (of @buf_size bytes) in a compact, human readable
 * form and return @buf.
 *
 * Values up to 999 are printed as plain integers.  Larger values are scaled
 * by powers of 1024 and suffixed with K, M, G or T: one decimal is shown
 * while the scaled value is below 10, a rounded integer otherwise.  Values
 * too large for the last suffix are still printed with that suffix.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    /* Every pass through the last suffix returns, so unit never overflows. */
    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < 10 * unit) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            return buf;
        }
        if (size < 1000 * unit || idx == NB_SUFFIXES - 1) {
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit, suffixes[idx]);
            return buf;
        }
    }

    return buf;
}
2268
2269 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2270 {
2271     char buf1[128], date_buf[128], clock_buf[128];
2272 #ifdef _WIN32
2273     struct tm *ptm;
2274 #else
2275     struct tm tm;
2276 #endif
2277     time_t ti;
2278     int64_t secs;
2279
2280     if (!sn) {
2281         snprintf(buf, buf_size,
2282                  "%-10s%-20s%7s%20s%15s",
2283                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2284     } else {
2285         ti = sn->date_sec;
2286 #ifdef _WIN32
2287         ptm = localtime(&ti);
2288         strftime(date_buf, sizeof(date_buf),
2289                  "%Y-%m-%d %H:%M:%S", ptm);
2290 #else
2291         localtime_r(&ti, &tm);
2292         strftime(date_buf, sizeof(date_buf),
2293                  "%Y-%m-%d %H:%M:%S", &tm);
2294 #endif
2295         secs = sn->vm_clock_nsec / 1000000000;
2296         snprintf(clock_buf, sizeof(clock_buf),
2297                  "%02d:%02d:%02d.%03d",
2298                  (int)(secs / 3600),
2299                  (int)((secs / 60) % 60),
2300                  (int)(secs % 60),
2301                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
2302         snprintf(buf, buf_size,
2303                  "%-10s%-20s%7s%20s%15s",
2304                  sn->id_str, sn->name,
2305                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2306                  date_buf,
2307                  clock_buf);
2308     }
2309     return buf;
2310 }
2311
2312 /**************************************************************/
2313 /* async I/Os */
2314
2315 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2316                                  QEMUIOVector *qiov, int nb_sectors,
2317                                  BlockDriverCompletionFunc *cb, void *opaque)
2318 {
2319     BlockDriver *drv = bs->drv;
2320
2321     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2322
2323     if (!drv)
2324         return NULL;
2325     if (bdrv_check_request(bs, sector_num, nb_sectors))
2326         return NULL;
2327
2328     return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
2329                                cb, opaque);
2330 }
2331
2332 typedef struct BlockCompleteData {
2333     BlockDriverCompletionFunc *cb;
2334     void *opaque;
2335     BlockDriverState *bs;
2336     int64_t sector_num;
2337     int nb_sectors;
2338 } BlockCompleteData;
2339
2340 static void block_complete_cb(void *opaque, int ret)
2341 {
2342     BlockCompleteData *b = opaque;
2343
2344     if (b->bs->dirty_bitmap) {
2345         set_dirty_bitmap(b->bs, b->sector_num, b->nb_sectors, 1);
2346     }
2347     b->cb(b->opaque, ret);
2348     g_free(b);
2349 }
2350
2351 static BlockCompleteData *blk_dirty_cb_alloc(BlockDriverState *bs,
2352                                              int64_t sector_num,
2353                                              int nb_sectors,
2354                                              BlockDriverCompletionFunc *cb,
2355                                              void *opaque)
2356 {
2357     BlockCompleteData *blkdata = g_malloc0(sizeof(BlockCompleteData));
2358
2359     blkdata->bs = bs;
2360     blkdata->cb = cb;
2361     blkdata->opaque = opaque;
2362     blkdata->sector_num = sector_num;
2363     blkdata->nb_sectors = nb_sectors;
2364
2365     return blkdata;
2366 }
2367
2368 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2369                                   QEMUIOVector *qiov, int nb_sectors,
2370                                   BlockDriverCompletionFunc *cb, void *opaque)
2371 {
2372     BlockDriver *drv = bs->drv;
2373     BlockDriverAIOCB *ret;
2374     BlockCompleteData *blk_cb_data;
2375
2376     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2377
2378     if (!drv)
2379         return NULL;
2380     if (bs->read_only)
2381         return NULL;
2382     if (bdrv_check_request(bs, sector_num, nb_sectors))
2383         return NULL;
2384
2385     if (bs->dirty_bitmap) {
2386         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
2387                                          opaque);
2388         cb = &block_complete_cb;
2389         opaque = blk_cb_data;
2390     }
2391
2392     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
2393                                cb, opaque);
2394
2395     if (ret) {
2396         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2397             bs->wr_highest_sector = sector_num + nb_sectors - 1;
2398         }
2399     }
2400
2401     return ret;
2402 }
2403
2404
2405 typedef struct MultiwriteCB {
2406     int error;
2407     int num_requests;
2408     int num_callbacks;
2409     struct {
2410         BlockDriverCompletionFunc *cb;
2411         void *opaque;
2412         QEMUIOVector *free_qiov;
2413         void *free_buf;
2414     } callbacks[];
2415 } MultiwriteCB;
2416
2417 static void multiwrite_user_cb(MultiwriteCB *mcb)
2418 {
2419     int i;
2420
2421     for (i = 0; i < mcb->num_callbacks; i++) {
2422         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2423         if (mcb->callbacks[i].free_qiov) {
2424             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2425         }
2426         g_free(mcb->callbacks[i].free_qiov);
2427         qemu_vfree(mcb->callbacks[i].free_buf);
2428     }
2429 }
2430
2431 static void multiwrite_cb(void *opaque, int ret)
2432 {
2433     MultiwriteCB *mcb = opaque;
2434
2435     trace_multiwrite_cb(mcb, ret);
2436
2437     if (ret < 0 && !mcb->error) {
2438         mcb->error = ret;
2439     }
2440
2441     mcb->num_requests--;
2442     if (mcb->num_requests == 0) {
2443         multiwrite_user_cb(mcb);
2444         g_free(mcb);
2445     }
2446 }
2447
2448 static int multiwrite_req_compare(const void *a, const void *b)
2449 {
2450     const BlockRequest *req1 = a, *req2 = b;
2451
2452     /*
2453      * Note that we can't simply subtract req2->sector from req1->sector
2454      * here as that could overflow the return value.
2455      */
2456     if (req1->sector > req2->sector) {
2457         return 1;
2458     } else if (req1->sector < req2->sector) {
2459         return -1;
2460     } else {
2461         return 0;
2462     }
2463 }
2464
2465 /*
2466  * Takes a bunch of requests and tries to merge them. Returns the number of
2467  * requests that remain after merging.
2468  */
2469 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2470     int num_reqs, MultiwriteCB *mcb)
2471 {
2472     int i, outidx;
2473
2474     // Sort requests by start sector
2475     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2476
2477     // Check if adjacent requests touch the same clusters. If so, combine them,
2478     // filling up gaps with zero sectors.
2479     outidx = 0;
2480     for (i = 1; i < num_reqs; i++) {
2481         int merge = 0;
2482         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2483
2484         // This handles the cases that are valid for all block drivers, namely
2485         // exactly sequential writes and overlapping writes.
2486         if (reqs[i].sector <= oldreq_last) {
2487             merge = 1;
2488         }
2489
2490         // The block driver may decide that it makes sense to combine requests
2491         // even if there is a gap of some sectors between them. In this case,
2492         // the gap is filled with zeros (therefore only applicable for yet
2493         // unused space in format like qcow2).
2494         if (!merge && bs->drv->bdrv_merge_requests) {
2495             merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2496         }
2497
2498         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2499             merge = 0;
2500         }
2501
2502         if (merge) {
2503             size_t size;
2504             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2505             qemu_iovec_init(qiov,
2506                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2507
2508             // Add the first request to the merged one. If the requests are
2509             // overlapping, drop the last sectors of the first request.
2510             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2511             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2512
2513             // We might need to add some zeros between the two requests
2514             if (reqs[i].sector > oldreq_last) {
2515                 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2516                 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2517                 memset(buf, 0, zero_bytes);
2518                 qemu_iovec_add(qiov, buf, zero_bytes);
2519                 mcb->callbacks[i].free_buf = buf;
2520             }
2521
2522             // Add the second request
2523             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2524
2525             reqs[outidx].nb_sectors = qiov->size >> 9;
2526             reqs[outidx].qiov = qiov;
2527
2528             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2529         } else {
2530             outidx++;
2531             reqs[outidx].sector     = reqs[i].sector;
2532             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2533             reqs[outidx].qiov       = reqs[i].qiov;
2534         }
2535     }
2536
2537     return outidx + 1;
2538 }
2539
2540 /*
2541  * Submit multiple AIO write requests at once.
2542  *
2543  * On success, the function returns 0 and all requests in the reqs array have
2544  * been submitted. In error case this function returns -1, and any of the
2545  * requests may or may not be submitted yet. In particular, this means that the
2546  * callback will be called for some of the requests, for others it won't. The
2547  * caller must check the error field of the BlockRequest to wait for the right
2548  * callbacks (if error != 0, no callback will be called).
2549  *
2550  * The implementation may modify the contents of the reqs array, e.g. to merge
2551  * requests. However, the fields opaque and error are left unmodified as they
2552  * are used to signal failure for a single request to the caller.
2553  */
2554 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2555 {
2556     BlockDriverAIOCB *acb;
2557     MultiwriteCB *mcb;
2558     int i;
2559
2560     /* don't submit writes if we don't have a medium */
2561     if (bs->drv == NULL) {
2562         for (i = 0; i < num_reqs; i++) {
2563             reqs[i].error = -ENOMEDIUM;
2564         }
2565         return -1;
2566     }
2567
2568     if (num_reqs == 0) {
2569         return 0;
2570     }
2571
2572     // Create MultiwriteCB structure
2573     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2574     mcb->num_requests = 0;
2575     mcb->num_callbacks = num_reqs;
2576
2577     for (i = 0; i < num_reqs; i++) {
2578         mcb->callbacks[i].cb = reqs[i].cb;
2579         mcb->callbacks[i].opaque = reqs[i].opaque;
2580     }
2581
2582     // Check for mergable requests
2583     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2584
2585     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2586
2587     /*
2588      * Run the aio requests. As soon as one request can't be submitted
2589      * successfully, fail all requests that are not yet submitted (we must
2590      * return failure for all requests anyway)
2591      *
2592      * num_requests cannot be set to the right value immediately: If
2593      * bdrv_aio_writev fails for some request, num_requests would be too high
2594      * and therefore multiwrite_cb() would never recognize the multiwrite
2595      * request as completed. We also cannot use the loop variable i to set it
2596      * when the first request fails because the callback may already have been
2597      * called for previously submitted requests. Thus, num_requests must be
2598      * incremented for each request that is submitted.
2599      *
2600      * The problem that callbacks may be called early also means that we need
2601      * to take care that num_requests doesn't become 0 before all requests are
2602      * submitted - multiwrite_cb() would consider the multiwrite request
2603      * completed. A dummy request that is "completed" by a manual call to
2604      * multiwrite_cb() takes care of this.
2605      */
2606     mcb->num_requests = 1;
2607
2608     // Run the aio requests
2609     for (i = 0; i < num_reqs; i++) {
2610         mcb->num_requests++;
2611         acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2612             reqs[i].nb_sectors, multiwrite_cb, mcb);
2613
2614         if (acb == NULL) {
2615             // We can only fail the whole thing if no request has been
2616             // submitted yet. Otherwise we'll wait for the submitted AIOs to
2617             // complete and report the error in the callback.
2618             if (i == 0) {
2619                 trace_bdrv_aio_multiwrite_earlyfail(mcb);
2620                 goto fail;
2621             } else {
2622                 trace_bdrv_aio_multiwrite_latefail(mcb, i);
2623                 multiwrite_cb(mcb, -EIO);
2624                 break;
2625             }
2626         }
2627     }
2628
2629     /* Complete the dummy request */
2630     multiwrite_cb(mcb, 0);
2631
2632     return 0;
2633
2634 fail:
2635     for (i = 0; i < mcb->num_callbacks; i++) {
2636         reqs[i].error = -EIO;
2637     }
2638     g_free(mcb);
2639     return -1;
2640 }
2641
2642 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2643         BlockDriverCompletionFunc *cb, void *opaque)
2644 {
2645     BlockDriver *drv = bs->drv;
2646
2647     trace_bdrv_aio_flush(bs, opaque);
2648
2649     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2650         return bdrv_aio_noop_em(bs, cb, opaque);
2651     }
2652
2653     if (!drv)
2654         return NULL;
2655     return drv->bdrv_aio_flush(bs, cb, opaque);
2656 }
2657
2658 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2659 {
2660     acb->pool->cancel(acb);
2661 }
2662
2663
2664 /**************************************************************/
2665 /* async block device emulation */
2666
2667 typedef struct BlockDriverAIOCBSync {
2668     BlockDriverAIOCB common;
2669     QEMUBH *bh;
2670     int ret;
2671     /* vector translation state */
2672     QEMUIOVector *qiov;
2673     uint8_t *bounce;
2674     int is_write;
2675 } BlockDriverAIOCBSync;
2676
2677 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2678 {
2679     BlockDriverAIOCBSync *acb =
2680         container_of(blockacb, BlockDriverAIOCBSync, common);
2681     qemu_bh_delete(acb->bh);
2682     acb->bh = NULL;
2683     qemu_aio_release(acb);
2684 }
2685
2686 static AIOPool bdrv_em_aio_pool = {
2687     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
2688     .cancel             = bdrv_aio_cancel_em,
2689 };
2690
2691 static void bdrv_aio_bh_cb(void *opaque)
2692 {
2693     BlockDriverAIOCBSync *acb = opaque;
2694
2695     if (!acb->is_write)
2696         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2697     qemu_vfree(acb->bounce);
2698     acb->common.cb(acb->common.opaque, acb->ret);
2699     qemu_bh_delete(acb->bh);
2700     acb->bh = NULL;
2701     qemu_aio_release(acb);
2702 }
2703
2704 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2705                                             int64_t sector_num,
2706                                             QEMUIOVector *qiov,
2707                                             int nb_sectors,
2708                                             BlockDriverCompletionFunc *cb,
2709                                             void *opaque,
2710                                             int is_write)
2711
2712 {
2713     BlockDriverAIOCBSync *acb;
2714
2715     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2716     acb->is_write = is_write;
2717     acb->qiov = qiov;
2718     acb->bounce = qemu_blockalign(bs, qiov->size);
2719
2720     if (!acb->bh)
2721         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2722
2723     if (is_write) {
2724         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2725         acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2726     } else {
2727         acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2728     }
2729
2730     qemu_bh_schedule(acb->bh);
2731
2732     return &acb->common;
2733 }
2734
2735 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2736         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2737         BlockDriverCompletionFunc *cb, void *opaque)
2738 {
2739     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2740 }
2741
2742 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2743         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2744         BlockDriverCompletionFunc *cb, void *opaque)
2745 {
2746     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2747 }
2748
2749
2750 typedef struct BlockDriverAIOCBCoroutine {
2751     BlockDriverAIOCB common;
2752     BlockRequest req;
2753     bool is_write;
2754     QEMUBH* bh;
2755 } BlockDriverAIOCBCoroutine;
2756
2757 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
2758 {
2759     qemu_aio_flush();
2760 }
2761
2762 static AIOPool bdrv_em_co_aio_pool = {
2763     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
2764     .cancel             = bdrv_aio_co_cancel_em,
2765 };
2766
2767 static void bdrv_co_rw_bh(void *opaque)
2768 {
2769     BlockDriverAIOCBCoroutine *acb = opaque;
2770
2771     acb->common.cb(acb->common.opaque, acb->req.error);
2772     qemu_bh_delete(acb->bh);
2773     qemu_aio_release(acb);
2774 }
2775
2776 static void coroutine_fn bdrv_co_rw(void *opaque)
2777 {
2778     BlockDriverAIOCBCoroutine *acb = opaque;
2779     BlockDriverState *bs = acb->common.bs;
2780
2781     if (!acb->is_write) {
2782         acb->req.error = bs->drv->bdrv_co_readv(bs, acb->req.sector,
2783             acb->req.nb_sectors, acb->req.qiov);
2784     } else {
2785         acb->req.error = bs->drv->bdrv_co_writev(bs, acb->req.sector,
2786             acb->req.nb_sectors, acb->req.qiov);
2787     }
2788
2789     acb->bh = qemu_bh_new(bdrv_co_rw_bh, acb);
2790     qemu_bh_schedule(acb->bh);
2791 }
2792
2793 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2794                                                int64_t sector_num,
2795                                                QEMUIOVector *qiov,
2796                                                int nb_sectors,
2797                                                BlockDriverCompletionFunc *cb,
2798                                                void *opaque,
2799                                                bool is_write)
2800 {
2801     Coroutine *co;
2802     BlockDriverAIOCBCoroutine *acb;
2803
2804     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2805     acb->req.sector = sector_num;
2806     acb->req.nb_sectors = nb_sectors;
2807     acb->req.qiov = qiov;
2808     acb->is_write = is_write;
2809
2810     co = qemu_coroutine_create(bdrv_co_rw);
2811     qemu_coroutine_enter(co, acb);
2812
2813     return &acb->common;
2814 }
2815
2816 static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
2817         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2818         BlockDriverCompletionFunc *cb, void *opaque)
2819 {
2820     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
2821                                  false);
2822 }
2823
2824 static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
2825         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2826         BlockDriverCompletionFunc *cb, void *opaque)
2827 {
2828     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
2829                                  true);
2830 }
2831
2832 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
2833         BlockDriverCompletionFunc *cb, void *opaque)
2834 {
2835     BlockDriverAIOCBSync *acb;
2836
2837     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2838     acb->is_write = 1; /* don't bounce in the completion hadler */
2839     acb->qiov = NULL;
2840     acb->bounce = NULL;
2841     acb->ret = 0;
2842
2843     if (!acb->bh)
2844         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2845
2846     bdrv_flush(bs);
2847     qemu_bh_schedule(acb->bh);
2848     return &acb->common;
2849 }
2850
2851 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
2852         BlockDriverCompletionFunc *cb, void *opaque)
2853 {
2854     BlockDriverAIOCBSync *acb;
2855
2856     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2857     acb->is_write = 1; /* don't bounce in the completion handler */
2858     acb->qiov = NULL;
2859     acb->bounce = NULL;
2860     acb->ret = 0;
2861
2862     if (!acb->bh) {
2863         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2864     }
2865
2866     qemu_bh_schedule(acb->bh);
2867     return &acb->common;
2868 }
2869
2870 /**************************************************************/
2871 /* sync block device emulation */
2872
/*
 * Completion callback for the synchronous emulation below: @opaque points to
 * the int that receives the result @ret.
 */
static void bdrv_rw_em_cb(void *opaque, int ret)
{
    int *result = opaque;

    *result = ret;
}

/* Sentinel stored in the result variable while a request is in flight. */
#define NOT_DONE 0x7fffffff
2879
2880 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
2881                         uint8_t *buf, int nb_sectors)
2882 {
2883     int async_ret;
2884     BlockDriverAIOCB *acb;
2885     struct iovec iov;
2886     QEMUIOVector qiov;
2887
2888     async_ret = NOT_DONE;
2889     iov.iov_base = (void *)buf;
2890     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2891     qemu_iovec_init_external(&qiov, &iov, 1);
2892     acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
2893         bdrv_rw_em_cb, &async_ret);
2894     if (acb == NULL) {
2895         async_ret = -1;
2896         goto fail;
2897     }
2898
2899     while (async_ret == NOT_DONE) {
2900         qemu_aio_wait();
2901     }
2902
2903
2904 fail:
2905     return async_ret;
2906 }
2907
2908 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
2909                          const uint8_t *buf, int nb_sectors)
2910 {
2911     int async_ret;
2912     BlockDriverAIOCB *acb;
2913     struct iovec iov;
2914     QEMUIOVector qiov;
2915
2916     async_ret = NOT_DONE;
2917     iov.iov_base = (void *)buf;
2918     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2919     qemu_iovec_init_external(&qiov, &iov, 1);
2920     acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
2921         bdrv_rw_em_cb, &async_ret);
2922     if (acb == NULL) {
2923         async_ret = -1;
2924         goto fail;
2925     }
2926     while (async_ret == NOT_DONE) {
2927         qemu_aio_wait();
2928     }
2929
2930 fail:
2931     return async_ret;
2932 }
2933
2934 void bdrv_init(void)
2935 {
2936     module_call_init(MODULE_INIT_BLOCK);
2937 }
2938
2939 void bdrv_init_with_whitelist(void)
2940 {
2941     use_bdrv_whitelist = 1;
2942     bdrv_init();
2943 }
2944
2945 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2946                    BlockDriverCompletionFunc *cb, void *opaque)
2947 {
2948     BlockDriverAIOCB *acb;
2949
2950     if (pool->free_aiocb) {
2951         acb = pool->free_aiocb;
2952         pool->free_aiocb = acb->next;
2953     } else {
2954         acb = g_malloc0(pool->aiocb_size);
2955         acb->pool = pool;
2956     }
2957     acb->bs = bs;
2958     acb->cb = cb;
2959     acb->opaque = opaque;
2960     return acb;
2961 }
2962
2963 void qemu_aio_release(void *p)
2964 {
2965     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2966     AIOPool *pool = acb->pool;
2967     acb->next = pool->free_aiocb;
2968     pool->free_aiocb = acb;
2969 }
2970
2971 /**************************************************************/
2972 /* Coroutine block device emulation */
2973
2974 typedef struct CoroutineIOCompletion {
2975     Coroutine *coroutine;
2976     int ret;
2977 } CoroutineIOCompletion;
2978
2979 static void bdrv_co_io_em_complete(void *opaque, int ret)
2980 {
2981     CoroutineIOCompletion *co = opaque;
2982
2983     co->ret = ret;
2984     qemu_coroutine_enter(co->coroutine, NULL);
2985 }
2986
2987 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2988                                       int nb_sectors, QEMUIOVector *iov,
2989                                       bool is_write)
2990 {
2991     CoroutineIOCompletion co = {
2992         .coroutine = qemu_coroutine_self(),
2993     };
2994     BlockDriverAIOCB *acb;
2995
2996     if (is_write) {
2997         acb = bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2998                               bdrv_co_io_em_complete, &co);
2999     } else {
3000         acb = bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3001                              bdrv_co_io_em_complete, &co);
3002     }
3003
3004     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3005     if (!acb) {
3006         return -EIO;
3007     }
3008     qemu_coroutine_yield();
3009
3010     return co.ret;
3011 }
3012
3013 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3014                                          int64_t sector_num, int nb_sectors,
3015                                          QEMUIOVector *iov)
3016 {
3017     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3018 }
3019
3020 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3021                                          int64_t sector_num, int nb_sectors,
3022                                          QEMUIOVector *iov)
3023 {
3024     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3025 }
3026
3027 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs)
3028 {
3029     CoroutineIOCompletion co = {
3030         .coroutine = qemu_coroutine_self(),
3031     };
3032     BlockDriverAIOCB *acb;
3033
3034     acb = bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3035     if (!acb) {
3036         return -EIO;
3037     }
3038     qemu_coroutine_yield();
3039     return co.ret;
3040 }
3041
3042 /**************************************************************/
3043 /* removable device support */
3044
3045 /**
3046  * Return TRUE if the media is present
3047  */
3048 int bdrv_is_inserted(BlockDriverState *bs)
3049 {
3050     BlockDriver *drv = bs->drv;
3051
3052     if (!drv)
3053         return 0;
3054     if (!drv->bdrv_is_inserted)
3055         return 1;
3056     return drv->bdrv_is_inserted(bs);
3057 }
3058
3059 /**
3060  * Return whether the media changed since the last call to this
3061  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3062  */
3063 int bdrv_media_changed(BlockDriverState *bs)
3064 {
3065     BlockDriver *drv = bs->drv;
3066
3067     if (drv && drv->bdrv_media_changed) {
3068         return drv->bdrv_media_changed(bs);
3069     }
3070     return -ENOTSUP;
3071 }
3072
3073 /**
3074  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3075  */
3076 void bdrv_eject(BlockDriverState *bs, int eject_flag)
3077 {
3078     BlockDriver *drv = bs->drv;
3079
3080     if (drv && drv->bdrv_eject) {
3081         drv->bdrv_eject(bs, eject_flag);
3082     }
3083 }
3084
3085 /**
3086  * Lock or unlock the media (if it is locked, the user won't be able
3087  * to eject it manually).
3088  */
3089 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3090 {
3091     BlockDriver *drv = bs->drv;
3092
3093     trace_bdrv_lock_medium(bs, locked);
3094
3095     if (drv && drv->bdrv_lock_medium) {
3096         drv->bdrv_lock_medium(bs, locked);
3097     }
3098 }
3099
3100 /* needed for generic scsi interface */
3101
3102 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3103 {
3104     BlockDriver *drv = bs->drv;
3105
3106     if (drv && drv->bdrv_ioctl)
3107         return drv->bdrv_ioctl(bs, req, buf);
3108     return -ENOTSUP;
3109 }
3110
3111 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3112         unsigned long int req, void *buf,
3113         BlockDriverCompletionFunc *cb, void *opaque)
3114 {
3115     BlockDriver *drv = bs->drv;
3116
3117     if (drv && drv->bdrv_aio_ioctl)
3118         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3119     return NULL;
3120 }
3121
3122 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3123 {
3124     bs->buffer_alignment = align;
3125 }
3126
3127 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3128 {
3129     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3130 }
3131
3132 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3133 {
3134     int64_t bitmap_size;
3135
3136     bs->dirty_count = 0;
3137     if (enable) {
3138         if (!bs->dirty_bitmap) {
3139             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3140                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3141             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3142
3143             bs->dirty_bitmap = g_malloc0(bitmap_size);
3144         }
3145     } else {
3146         if (bs->dirty_bitmap) {
3147             g_free(bs->dirty_bitmap);
3148             bs->dirty_bitmap = NULL;
3149         }
3150     }
3151 }
3152
3153 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3154 {
3155     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3156
3157     if (bs->dirty_bitmap &&
3158         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3159         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3160             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3161     } else {
3162         return 0;
3163     }
3164 }
3165
3166 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3167                       int nr_sectors)
3168 {
3169     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3170 }
3171
3172 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3173 {
3174     return bs->dirty_count;
3175 }
3176
3177 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3178 {
3179     assert(bs->in_use != in_use);
3180     bs->in_use = in_use;
3181 }
3182
3183 int bdrv_in_use(BlockDriverState *bs)
3184 {
3185     return bs->in_use;
3186 }
3187
3188 void bdrv_iostatus_enable(BlockDriverState *bs)
3189 {
3190     bs->iostatus = BDRV_IOS_OK;
3191 }
3192
3193 /* The I/O status is only enabled if the drive explicitly
3194  * enables it _and_ the VM is configured to stop on errors */
3195 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3196 {
3197     return (bs->iostatus != BDRV_IOS_INVAL &&
3198            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3199             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3200             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3201 }
3202
3203 void bdrv_iostatus_disable(BlockDriverState *bs)
3204 {
3205     bs->iostatus = BDRV_IOS_INVAL;
3206 }
3207
3208 void bdrv_iostatus_reset(BlockDriverState *bs)
3209 {
3210     if (bdrv_iostatus_is_enabled(bs)) {
3211         bs->iostatus = BDRV_IOS_OK;
3212     }
3213 }
3214
3215 /* XXX: Today this is set by device models because it makes the implementation
3216    quite simple. However, the block layer knows about the error, so it's
3217    possible to implement this without device models being involved */
3218 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3219 {
3220     if (bdrv_iostatus_is_enabled(bs) && bs->iostatus == BDRV_IOS_OK) {
3221         assert(error >= 0);
3222         bs->iostatus = error == ENOSPC ? BDRV_IOS_ENOSPC : BDRV_IOS_FAILED;
3223     }
3224 }
3225
3226 void
3227 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3228         enum BlockAcctType type)
3229 {
3230     assert(type < BDRV_MAX_IOTYPE);
3231
3232     cookie->bytes = bytes;
3233     cookie->start_time_ns = get_clock();
3234     cookie->type = type;
3235 }
3236
3237 void
3238 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3239 {
3240     assert(cookie->type < BDRV_MAX_IOTYPE);
3241
3242     bs->nr_bytes[cookie->type] += cookie->bytes;
3243     bs->nr_ops[cookie->type]++;
3244     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3245 }
3246
3247 int bdrv_img_create(const char *filename, const char *fmt,
3248                     const char *base_filename, const char *base_fmt,
3249                     char *options, uint64_t img_size, int flags)
3250 {
3251     QEMUOptionParameter *param = NULL, *create_options = NULL;
3252     QEMUOptionParameter *backing_fmt, *backing_file, *size;
3253     BlockDriverState *bs = NULL;
3254     BlockDriver *drv, *proto_drv;
3255     BlockDriver *backing_drv = NULL;
3256     int ret = 0;
3257
3258     /* Find driver and parse its options */
3259     drv = bdrv_find_format(fmt);
3260     if (!drv) {
3261         error_report("Unknown file format '%s'", fmt);
3262         ret = -EINVAL;
3263         goto out;
3264     }
3265
3266     proto_drv = bdrv_find_protocol(filename);
3267     if (!proto_drv) {
3268         error_report("Unknown protocol '%s'", filename);
3269         ret = -EINVAL;
3270         goto out;
3271     }
3272
3273     create_options = append_option_parameters(create_options,
3274                                               drv->create_options);
3275     create_options = append_option_parameters(create_options,
3276                                               proto_drv->create_options);
3277
3278     /* Create parameter list with default values */
3279     param = parse_option_parameters("", create_options, param);
3280
3281     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3282
3283     /* Parse -o options */
3284     if (options) {
3285         param = parse_option_parameters(options, create_options, param);
3286         if (param == NULL) {
3287             error_report("Invalid options for file format '%s'.", fmt);
3288             ret = -EINVAL;
3289             goto out;
3290         }
3291     }
3292
3293     if (base_filename) {
3294         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3295                                  base_filename)) {
3296             error_report("Backing file not supported for file format '%s'",
3297                          fmt);
3298             ret = -EINVAL;
3299             goto out;
3300         }
3301     }
3302
3303     if (base_fmt) {
3304         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3305             error_report("Backing file format not supported for file "
3306                          "format '%s'", fmt);
3307             ret = -EINVAL;
3308             goto out;
3309         }
3310     }
3311
3312     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3313     if (backing_file && backing_file->value.s) {
3314         if (!strcmp(filename, backing_file->value.s)) {
3315             error_report("Error: Trying to create an image with the "
3316                          "same filename as the backing file");
3317             ret = -EINVAL;
3318             goto out;
3319         }
3320     }
3321
3322     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3323     if (backing_fmt && backing_fmt->value.s) {
3324         backing_drv = bdrv_find_format(backing_fmt->value.s);
3325         if (!backing_drv) {
3326             error_report("Unknown backing file format '%s'",
3327                          backing_fmt->value.s);
3328             ret = -EINVAL;
3329             goto out;
3330         }
3331     }
3332
3333     // The size for the image must always be specified, with one exception:
3334     // If we are using a backing file, we can obtain the size from there
3335     size = get_option_parameter(param, BLOCK_OPT_SIZE);
3336     if (size && size->value.n == -1) {
3337         if (backing_file && backing_file->value.s) {
3338             uint64_t size;
3339             char buf[32];
3340
3341             bs = bdrv_new("");
3342
3343             ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3344             if (ret < 0) {
3345                 error_report("Could not open '%s'", backing_file->value.s);
3346                 goto out;
3347             }
3348             bdrv_get_geometry(bs, &size);
3349             size *= 512;
3350
3351             snprintf(buf, sizeof(buf), "%" PRId64, size);
3352             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3353         } else {
3354             error_report("Image creation needs a size parameter");
3355             ret = -EINVAL;
3356             goto out;
3357         }
3358     }
3359
3360     printf("Formatting '%s', fmt=%s ", filename, fmt);
3361     print_option_parameters(param);
3362     puts("");
3363
3364     ret = bdrv_create(drv, filename, param);
3365
3366     if (ret < 0) {
3367         if (ret == -ENOTSUP) {
3368             error_report("Formatting or formatting option not supported for "
3369                          "file format '%s'", fmt);
3370         } else if (ret == -EFBIG) {
3371             error_report("The image size is too large for file format '%s'",
3372                          fmt);
3373         } else {
3374             error_report("%s: error while creating %s: %s", filename, fmt,
3375                          strerror(-ret));
3376         }
3377     }
3378
3379 out:
3380     free_option_parameters(create_options);
3381     free_option_parameters(param);
3382
3383     if (bs) {
3384         bdrv_delete(bs);
3385     }
3386
3387     return ret;
3388 }
This page took 0.201572 seconds and 4 git commands to generate.