]> Git Repo - qemu.git/blob - block.c
block: directly invoke .bdrv_aio_*() in bdrv_co_io_em()
[qemu.git] / block.c
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qemu-objects.h"
31 #include "qemu-coroutine.h"
32
33 #ifdef CONFIG_BSD
34 #include <sys/types.h>
35 #include <sys/stat.h>
36 #include <sys/ioctl.h>
37 #include <sys/queue.h>
38 #ifndef __DragonFly__
39 #include <sys/disk.h>
40 #endif
41 #endif
42
43 #ifdef _WIN32
44 #include <windows.h>
45 #endif
46
47 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
48 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
49         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
50         BlockDriverCompletionFunc *cb, void *opaque);
51 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
52         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
53         BlockDriverCompletionFunc *cb, void *opaque);
54 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
55         BlockDriverCompletionFunc *cb, void *opaque);
56 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
57         BlockDriverCompletionFunc *cb, void *opaque);
58 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
59                         uint8_t *buf, int nb_sectors);
60 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
61                          const uint8_t *buf, int nb_sectors);
62 static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
63         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
64         BlockDriverCompletionFunc *cb, void *opaque);
65 static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
66         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
67         BlockDriverCompletionFunc *cb, void *opaque);
68 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
69                                          int64_t sector_num, int nb_sectors,
70                                          QEMUIOVector *iov);
71 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
72                                          int64_t sector_num, int nb_sectors,
73                                          QEMUIOVector *iov);
74 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
75
76 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
77     QTAILQ_HEAD_INITIALIZER(bdrv_states);
78
79 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
80     QLIST_HEAD_INITIALIZER(bdrv_drivers);
81
82 /* The device to use for VM snapshots */
83 static BlockDriverState *bs_snapshots;
84
85 /* If non-zero, use only whitelisted block drivers */
86 static int use_bdrv_whitelist;
87
88 #ifdef _WIN32
/*
 * Return 1 when filename begins with an ASCII letter immediately followed
 * by ':' (e.g. "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    int is_lower = letter >= 'a' && letter <= 'z';
    int is_upper = letter >= 'A' && letter <= 'Z';

    /* Bail out first so filename[1] is only read after a letter was seen */
    if (!is_lower && !is_upper) {
        return 0;
    }
    return filename[1] == ':';
}
95
96 int is_windows_drive(const char *filename)
97 {
98     if (is_windows_drive_prefix(filename) &&
99         filename[2] == '\0')
100         return 1;
101     if (strstart(filename, "\\\\.\\", NULL) ||
102         strstart(filename, "//./", NULL))
103         return 1;
104     return 0;
105 }
106 #endif
107
/*
 * Check whether path looks like "<protocol>:...".
 *
 * The test is simply whether path contains a ':' anywhere. On Windows,
 * drive specifications are excluded first so that "c:..." is not mistaken
 * for a protocol.
 */
static int path_has_protocol(const char *path)
{
    const char *colon;

#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    colon = strchr(path, ':');
    return colon != NULL;
}
120
/*
 * Return non-zero when path is absolute.
 *
 * Anything up to and including the first ':' is skipped; the path is
 * absolute when the character that follows is '/' (or, on Windows, also
 * '\'). On Windows a path that itself starts with '/' or '\' is absolute as
 * well, which covers names like "\\.\d:".
 */
int path_is_absolute(const char *path)
{
    const char *colon;
    const char *start;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif
    colon = strchr(path, ':');
    start = colon ? colon + 1 : path;
#ifdef _WIN32
    if (*start == '\\') {
        return 1;
    }
#endif
    return *start == '/';
}
140
/*
 * Build the location of filename in dest (at most dest_size bytes including
 * the terminating NUL).
 *
 * An absolute filename is copied as is. Otherwise filename is appended to
 * the "directory part" of base_path, i.e. everything up to and including
 * the last path separator or the first ':' (whichever ends later), so URLs
 * are supported. Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_proto, *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* first character after the first ':' of base_path, if there is one */
    after_proto = strchr(base_path, ':');
    after_proto = after_proto ? after_proto + 1 : base_path;

    /* first character after the last directory separator, if there is one */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash;
        backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever of the two prefixes reaches further */
    if (after_sep > after_proto) {
        after_proto = after_sep;
    }

    prefix_len = after_proto - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
184
185 void bdrv_register(BlockDriver *bdrv)
186 {
187     if (bdrv->bdrv_co_readv) {
188         /* Emulate AIO by coroutines, and sync by AIO */
189         bdrv->bdrv_aio_readv = bdrv_co_aio_readv_em;
190         bdrv->bdrv_aio_writev = bdrv_co_aio_writev_em;
191         bdrv->bdrv_read = bdrv_read_em;
192         bdrv->bdrv_write = bdrv_write_em;
193      } else {
194         bdrv->bdrv_co_readv = bdrv_co_readv_em;
195         bdrv->bdrv_co_writev = bdrv_co_writev_em;
196
197         if (!bdrv->bdrv_aio_readv) {
198             /* add AIO emulation layer */
199             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
200             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
201         } else if (!bdrv->bdrv_read) {
202             /* add synchronous IO emulation layer */
203             bdrv->bdrv_read = bdrv_read_em;
204             bdrv->bdrv_write = bdrv_write_em;
205         }
206     }
207
208     if (!bdrv->bdrv_aio_flush)
209         bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
210
211     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
212 }
213
214 /* create a new block device (by default it is empty) */
215 BlockDriverState *bdrv_new(const char *device_name)
216 {
217     BlockDriverState *bs;
218
219     bs = g_malloc0(sizeof(BlockDriverState));
220     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
221     if (device_name[0] != '\0') {
222         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
223     }
224     bdrv_iostatus_disable(bs);
225     return bs;
226 }
227
228 BlockDriver *bdrv_find_format(const char *format_name)
229 {
230     BlockDriver *drv1;
231     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
232         if (!strcmp(drv1->format_name, format_name)) {
233             return drv1;
234         }
235     }
236     return NULL;
237 }
238
239 static int bdrv_is_whitelisted(BlockDriver *drv)
240 {
241     static const char *whitelist[] = {
242         CONFIG_BDRV_WHITELIST
243     };
244     const char **p;
245
246     if (!whitelist[0])
247         return 1;               /* no whitelist, anything goes */
248
249     for (p = whitelist; *p; p++) {
250         if (!strcmp(drv->format_name, *p)) {
251             return 1;
252         }
253     }
254     return 0;
255 }
256
257 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
258 {
259     BlockDriver *drv = bdrv_find_format(format_name);
260     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
261 }
262
263 int bdrv_create(BlockDriver *drv, const char* filename,
264     QEMUOptionParameter *options)
265 {
266     if (!drv->bdrv_create)
267         return -ENOTSUP;
268
269     return drv->bdrv_create(filename, options);
270 }
271
272 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
273 {
274     BlockDriver *drv;
275
276     drv = bdrv_find_protocol(filename);
277     if (drv == NULL) {
278         return -ENOENT;
279     }
280
281     return bdrv_create(drv, filename, options);
282 }
283
284 #ifdef _WIN32
285 void get_tmp_filename(char *filename, int size)
286 {
287     char temp_dir[MAX_PATH];
288
289     GetTempPath(MAX_PATH, temp_dir);
290     GetTempFileName(temp_dir, "qem", 0, filename);
291 }
292 #else
/*
 * Create a uniquely named temporary file below $TMPDIR (or /tmp when
 * TMPDIR is unset) and store its name in filename (a buffer of size bytes).
 *
 * The file is created by mkstemp() and closed again right away. On failure
 * (template truncated, or mkstemp() failed) filename is set to the empty
 * string so that callers cannot end up using a name that was never
 * created. Nothing is written when size <= 0.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    int len;
    const char *tmpdir;

    if (size <= 0) {
        return;
    }

    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        /* template was cut off; mkstemp() could not use it */
        filename[0] = '\0';
        return;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        filename[0] = '\0';
        return;
    }
    close(fd);
}
305 #endif
306
307 /*
308  * Detect host devices. By convention, /dev/cdrom[N] is always
309  * recognized as a host CDROM.
310  */
311 static BlockDriver *find_hdev_driver(const char *filename)
312 {
313     int score_max = 0, score;
314     BlockDriver *drv = NULL, *d;
315
316     QLIST_FOREACH(d, &bdrv_drivers, list) {
317         if (d->bdrv_probe_device) {
318             score = d->bdrv_probe_device(filename);
319             if (score > score_max) {
320                 score_max = score;
321                 drv = d;
322             }
323         }
324     }
325
326     return drv;
327 }
328
329 BlockDriver *bdrv_find_protocol(const char *filename)
330 {
331     BlockDriver *drv1;
332     char protocol[128];
333     int len;
334     const char *p;
335
336     /* TODO Drivers without bdrv_file_open must be specified explicitly */
337
338     /*
339      * XXX(hch): we really should not let host device detection
340      * override an explicit protocol specification, but moving this
341      * later breaks access to device names with colons in them.
342      * Thanks to the brain-dead persistent naming schemes on udev-
343      * based Linux systems those actually are quite common.
344      */
345     drv1 = find_hdev_driver(filename);
346     if (drv1) {
347         return drv1;
348     }
349
350     if (!path_has_protocol(filename)) {
351         return bdrv_find_format("file");
352     }
353     p = strchr(filename, ':');
354     assert(p != NULL);
355     len = p - filename;
356     if (len > sizeof(protocol) - 1)
357         len = sizeof(protocol) - 1;
358     memcpy(protocol, filename, len);
359     protocol[len] = '\0';
360     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
361         if (drv1->protocol_name &&
362             !strcmp(drv1->protocol_name, protocol)) {
363             return drv1;
364         }
365     }
366     return NULL;
367 }
368
369 static int find_image_format(const char *filename, BlockDriver **pdrv)
370 {
371     int ret, score, score_max;
372     BlockDriver *drv1, *drv;
373     uint8_t buf[2048];
374     BlockDriverState *bs;
375
376     ret = bdrv_file_open(&bs, filename, 0);
377     if (ret < 0) {
378         *pdrv = NULL;
379         return ret;
380     }
381
382     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
383     if (bs->sg || !bdrv_is_inserted(bs)) {
384         bdrv_delete(bs);
385         drv = bdrv_find_format("raw");
386         if (!drv) {
387             ret = -ENOENT;
388         }
389         *pdrv = drv;
390         return ret;
391     }
392
393     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
394     bdrv_delete(bs);
395     if (ret < 0) {
396         *pdrv = NULL;
397         return ret;
398     }
399
400     score_max = 0;
401     drv = NULL;
402     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
403         if (drv1->bdrv_probe) {
404             score = drv1->bdrv_probe(buf, ret, filename);
405             if (score > score_max) {
406                 score_max = score;
407                 drv = drv1;
408             }
409         }
410     }
411     if (!drv) {
412         ret = -ENOENT;
413     }
414     *pdrv = drv;
415     return ret;
416 }
417
418 /**
419  * Set the current 'total_sectors' value
420  */
421 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
422 {
423     BlockDriver *drv = bs->drv;
424
425     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
426     if (bs->sg)
427         return 0;
428
429     /* query actual device if possible, otherwise just trust the hint */
430     if (drv->bdrv_getlength) {
431         int64_t length = drv->bdrv_getlength(bs);
432         if (length < 0) {
433             return length;
434         }
435         hint = length >> BDRV_SECTOR_BITS;
436     }
437
438     bs->total_sectors = hint;
439     return 0;
440 }
441
442 /**
443  * Set open flags for a given cache mode
444  *
445  * Return 0 on success, -1 if the cache mode was invalid.
446  */
447 int bdrv_parse_cache_flags(const char *mode, int *flags)
448 {
449     *flags &= ~BDRV_O_CACHE_MASK;
450
451     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
452         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
453     } else if (!strcmp(mode, "directsync")) {
454         *flags |= BDRV_O_NOCACHE;
455     } else if (!strcmp(mode, "writeback")) {
456         *flags |= BDRV_O_CACHE_WB;
457     } else if (!strcmp(mode, "unsafe")) {
458         *flags |= BDRV_O_CACHE_WB;
459         *flags |= BDRV_O_NO_FLUSH;
460     } else if (!strcmp(mode, "writethrough")) {
461         /* this is the default */
462     } else {
463         return -1;
464     }
465
466     return 0;
467 }
468
469 /*
470  * Common part for opening disk images and files
471  */
472 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
473     int flags, BlockDriver *drv)
474 {
475     int ret, open_flags;
476
477     assert(drv != NULL);
478
479     trace_bdrv_open_common(bs, filename, flags, drv->format_name);
480
481     bs->file = NULL;
482     bs->total_sectors = 0;
483     bs->encrypted = 0;
484     bs->valid_key = 0;
485     bs->open_flags = flags;
486     bs->buffer_alignment = 512;
487
488     pstrcpy(bs->filename, sizeof(bs->filename), filename);
489
490     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
491         return -ENOTSUP;
492     }
493
494     bs->drv = drv;
495     bs->opaque = g_malloc0(drv->instance_size);
496
497     if (flags & BDRV_O_CACHE_WB)
498         bs->enable_write_cache = 1;
499
500     /*
501      * Clear flags that are internal to the block layer before opening the
502      * image.
503      */
504     open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
505
506     /*
507      * Snapshots should be writable.
508      */
509     if (bs->is_temporary) {
510         open_flags |= BDRV_O_RDWR;
511     }
512
513     /* Open the image, either directly or using a protocol */
514     if (drv->bdrv_file_open) {
515         ret = drv->bdrv_file_open(bs, filename, open_flags);
516     } else {
517         ret = bdrv_file_open(&bs->file, filename, open_flags);
518         if (ret >= 0) {
519             ret = drv->bdrv_open(bs, open_flags);
520         }
521     }
522
523     if (ret < 0) {
524         goto free_and_fail;
525     }
526
527     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
528
529     ret = refresh_total_sectors(bs, bs->total_sectors);
530     if (ret < 0) {
531         goto free_and_fail;
532     }
533
534 #ifndef _WIN32
535     if (bs->is_temporary) {
536         unlink(filename);
537     }
538 #endif
539     return 0;
540
541 free_and_fail:
542     if (bs->file) {
543         bdrv_delete(bs->file);
544         bs->file = NULL;
545     }
546     g_free(bs->opaque);
547     bs->opaque = NULL;
548     bs->drv = NULL;
549     return ret;
550 }
551
552 /*
553  * Opens a file using a protocol (file, host_device, nbd, ...)
554  */
555 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
556 {
557     BlockDriverState *bs;
558     BlockDriver *drv;
559     int ret;
560
561     drv = bdrv_find_protocol(filename);
562     if (!drv) {
563         return -ENOENT;
564     }
565
566     bs = bdrv_new("");
567     ret = bdrv_open_common(bs, filename, flags, drv);
568     if (ret < 0) {
569         bdrv_delete(bs);
570         return ret;
571     }
572     bs->growable = 1;
573     *pbs = bs;
574     return 0;
575 }
576
577 /*
578  * Opens a disk image (raw, qcow2, vmdk, ...)
579  */
580 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
581               BlockDriver *drv)
582 {
583     int ret;
584
585     if (flags & BDRV_O_SNAPSHOT) {
586         BlockDriverState *bs1;
587         int64_t total_size;
588         int is_protocol = 0;
589         BlockDriver *bdrv_qcow2;
590         QEMUOptionParameter *options;
591         char tmp_filename[PATH_MAX];
592         char backing_filename[PATH_MAX];
593
594         /* if snapshot, we create a temporary backing file and open it
595            instead of opening 'filename' directly */
596
597         /* if there is a backing file, use it */
598         bs1 = bdrv_new("");
599         ret = bdrv_open(bs1, filename, 0, drv);
600         if (ret < 0) {
601             bdrv_delete(bs1);
602             return ret;
603         }
604         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
605
606         if (bs1->drv && bs1->drv->protocol_name)
607             is_protocol = 1;
608
609         bdrv_delete(bs1);
610
611         get_tmp_filename(tmp_filename, sizeof(tmp_filename));
612
613         /* Real path is meaningless for protocols */
614         if (is_protocol)
615             snprintf(backing_filename, sizeof(backing_filename),
616                      "%s", filename);
617         else if (!realpath(filename, backing_filename))
618             return -errno;
619
620         bdrv_qcow2 = bdrv_find_format("qcow2");
621         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
622
623         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
624         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
625         if (drv) {
626             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
627                 drv->format_name);
628         }
629
630         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
631         free_option_parameters(options);
632         if (ret < 0) {
633             return ret;
634         }
635
636         filename = tmp_filename;
637         drv = bdrv_qcow2;
638         bs->is_temporary = 1;
639     }
640
641     /* Find the right image format driver */
642     if (!drv) {
643         ret = find_image_format(filename, &drv);
644     }
645
646     if (!drv) {
647         goto unlink_and_fail;
648     }
649
650     /* Open the image */
651     ret = bdrv_open_common(bs, filename, flags, drv);
652     if (ret < 0) {
653         goto unlink_and_fail;
654     }
655
656     /* If there is a backing file, use it */
657     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
658         char backing_filename[PATH_MAX];
659         int back_flags;
660         BlockDriver *back_drv = NULL;
661
662         bs->backing_hd = bdrv_new("");
663
664         if (path_has_protocol(bs->backing_file)) {
665             pstrcpy(backing_filename, sizeof(backing_filename),
666                     bs->backing_file);
667         } else {
668             path_combine(backing_filename, sizeof(backing_filename),
669                          filename, bs->backing_file);
670         }
671
672         if (bs->backing_format[0] != '\0') {
673             back_drv = bdrv_find_format(bs->backing_format);
674         }
675
676         /* backing files always opened read-only */
677         back_flags =
678             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
679
680         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
681         if (ret < 0) {
682             bdrv_close(bs);
683             return ret;
684         }
685         if (bs->is_temporary) {
686             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
687         } else {
688             /* base image inherits from "parent" */
689             bs->backing_hd->keep_read_only = bs->keep_read_only;
690         }
691     }
692
693     if (!bdrv_key_required(bs)) {
694         bdrv_dev_change_media_cb(bs, true);
695     }
696
697     return 0;
698
699 unlink_and_fail:
700     if (bs->is_temporary) {
701         unlink(filename);
702     }
703     return ret;
704 }
705
706 void bdrv_close(BlockDriverState *bs)
707 {
708     if (bs->drv) {
709         if (bs == bs_snapshots) {
710             bs_snapshots = NULL;
711         }
712         if (bs->backing_hd) {
713             bdrv_delete(bs->backing_hd);
714             bs->backing_hd = NULL;
715         }
716         bs->drv->bdrv_close(bs);
717         g_free(bs->opaque);
718 #ifdef _WIN32
719         if (bs->is_temporary) {
720             unlink(bs->filename);
721         }
722 #endif
723         bs->opaque = NULL;
724         bs->drv = NULL;
725
726         if (bs->file != NULL) {
727             bdrv_close(bs->file);
728         }
729
730         bdrv_dev_change_media_cb(bs, false);
731     }
732 }
733
734 void bdrv_close_all(void)
735 {
736     BlockDriverState *bs;
737
738     QTAILQ_FOREACH(bs, &bdrv_states, list) {
739         bdrv_close(bs);
740     }
741 }
742
743 /* make a BlockDriverState anonymous by removing from bdrv_state list.
744    Also, NULL terminate the device_name to prevent double remove */
745 void bdrv_make_anon(BlockDriverState *bs)
746 {
747     if (bs->device_name[0] != '\0') {
748         QTAILQ_REMOVE(&bdrv_states, bs, list);
749     }
750     bs->device_name[0] = '\0';
751 }
752
753 void bdrv_delete(BlockDriverState *bs)
754 {
755     assert(!bs->dev);
756
757     /* remove from list, if necessary */
758     bdrv_make_anon(bs);
759
760     bdrv_close(bs);
761     if (bs->file != NULL) {
762         bdrv_delete(bs->file);
763     }
764
765     assert(bs != bs_snapshots);
766     g_free(bs);
767 }
768
769 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
770 /* TODO change to DeviceState *dev when all users are qdevified */
771 {
772     if (bs->dev) {
773         return -EBUSY;
774     }
775     bs->dev = dev;
776     bdrv_iostatus_reset(bs);
777     return 0;
778 }
779
780 /* TODO qdevified devices don't use this, remove when devices are qdevified */
781 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
782 {
783     if (bdrv_attach_dev(bs, dev) < 0) {
784         abort();
785     }
786 }
787
788 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
789 /* TODO change to DeviceState *dev when all users are qdevified */
790 {
791     assert(bs->dev == dev);
792     bs->dev = NULL;
793     bs->dev_ops = NULL;
794     bs->dev_opaque = NULL;
795     bs->buffer_alignment = 512;
796 }
797
798 /* TODO change to return DeviceState * when all users are qdevified */
799 void *bdrv_get_attached_dev(BlockDriverState *bs)
800 {
801     return bs->dev;
802 }
803
804 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
805                       void *opaque)
806 {
807     bs->dev_ops = ops;
808     bs->dev_opaque = opaque;
809     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
810         bs_snapshots = NULL;
811     }
812 }
813
814 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
815 {
816     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
817         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
818     }
819 }
820
821 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
822 {
823     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
824 }
825
826 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
827 {
828     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
829         return bs->dev_ops->is_tray_open(bs->dev_opaque);
830     }
831     return false;
832 }
833
834 static void bdrv_dev_resize_cb(BlockDriverState *bs)
835 {
836     if (bs->dev_ops && bs->dev_ops->resize_cb) {
837         bs->dev_ops->resize_cb(bs->dev_opaque);
838     }
839 }
840
841 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
842 {
843     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
844         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
845     }
846     return false;
847 }
848
849 /*
850  * Run consistency checks on an image
851  *
852  * Returns 0 if the check could be completed (it doesn't mean that the image is
853  * free of errors) or -errno when an internal error occurred. The results of the
854  * check are stored in res.
855  */
856 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
857 {
858     if (bs->drv->bdrv_check == NULL) {
859         return -ENOTSUP;
860     }
861
862     memset(res, 0, sizeof(*res));
863     return bs->drv->bdrv_check(bs, res);
864 }
865
866 #define COMMIT_BUF_SECTORS 2048
867
868 /* commit COW file into the raw image */
869 int bdrv_commit(BlockDriverState *bs)
870 {
871     BlockDriver *drv = bs->drv;
872     BlockDriver *backing_drv;
873     int64_t sector, total_sectors;
874     int n, ro, open_flags;
875     int ret = 0, rw_ret = 0;
876     uint8_t *buf;
877     char filename[1024];
878     BlockDriverState *bs_rw, *bs_ro;
879
880     if (!drv)
881         return -ENOMEDIUM;
882     
883     if (!bs->backing_hd) {
884         return -ENOTSUP;
885     }
886
887     if (bs->backing_hd->keep_read_only) {
888         return -EACCES;
889     }
890
891     backing_drv = bs->backing_hd->drv;
892     ro = bs->backing_hd->read_only;
893     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
894     open_flags =  bs->backing_hd->open_flags;
895
896     if (ro) {
897         /* re-open as RW */
898         bdrv_delete(bs->backing_hd);
899         bs->backing_hd = NULL;
900         bs_rw = bdrv_new("");
901         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
902             backing_drv);
903         if (rw_ret < 0) {
904             bdrv_delete(bs_rw);
905             /* try to re-open read-only */
906             bs_ro = bdrv_new("");
907             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
908                 backing_drv);
909             if (ret < 0) {
910                 bdrv_delete(bs_ro);
911                 /* drive not functional anymore */
912                 bs->drv = NULL;
913                 return ret;
914             }
915             bs->backing_hd = bs_ro;
916             return rw_ret;
917         }
918         bs->backing_hd = bs_rw;
919     }
920
921     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
922     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
923
924     for (sector = 0; sector < total_sectors; sector += n) {
925         if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
926
927             if (bdrv_read(bs, sector, buf, n) != 0) {
928                 ret = -EIO;
929                 goto ro_cleanup;
930             }
931
932             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
933                 ret = -EIO;
934                 goto ro_cleanup;
935             }
936         }
937     }
938
939     if (drv->bdrv_make_empty) {
940         ret = drv->bdrv_make_empty(bs);
941         bdrv_flush(bs);
942     }
943
944     /*
945      * Make sure all data we wrote to the backing device is actually
946      * stable on disk.
947      */
948     if (bs->backing_hd)
949         bdrv_flush(bs->backing_hd);
950
951 ro_cleanup:
952     g_free(buf);
953
954     if (ro) {
955         /* re-open as RO */
956         bdrv_delete(bs->backing_hd);
957         bs->backing_hd = NULL;
958         bs_ro = bdrv_new("");
959         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
960             backing_drv);
961         if (ret < 0) {
962             bdrv_delete(bs_ro);
963             /* drive not functional anymore */
964             bs->drv = NULL;
965             return ret;
966         }
967         bs->backing_hd = bs_ro;
968         bs->backing_hd->keep_read_only = 0;
969     }
970
971     return ret;
972 }
973
974 void bdrv_commit_all(void)
975 {
976     BlockDriverState *bs;
977
978     QTAILQ_FOREACH(bs, &bdrv_states, list) {
979         bdrv_commit(bs);
980     }
981 }
982
983 /*
984  * Return values:
985  * 0        - success
986  * -EINVAL  - backing format specified, but no file
987  * -ENOSPC  - can't update the backing file because no space is left in the
988  *            image file header
989  * -ENOTSUP - format driver doesn't support changing the backing file
990  */
991 int bdrv_change_backing_file(BlockDriverState *bs,
992     const char *backing_file, const char *backing_fmt)
993 {
994     BlockDriver *drv = bs->drv;
995
996     if (drv->bdrv_change_backing_file != NULL) {
997         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
998     } else {
999         return -ENOTSUP;
1000     }
1001 }
1002
1003 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1004                                    size_t size)
1005 {
1006     int64_t len;
1007
1008     if (!bdrv_is_inserted(bs))
1009         return -ENOMEDIUM;
1010
1011     if (bs->growable)
1012         return 0;
1013
1014     len = bdrv_getlength(bs);
1015
1016     if (offset < 0)
1017         return -EIO;
1018
1019     if ((offset > len) || (len - offset < size))
1020         return -EIO;
1021
1022     return 0;
1023 }
1024
1025 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1026                               int nb_sectors)
1027 {
1028     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1029                                    nb_sectors * BDRV_SECTOR_SIZE);
1030 }
1031
1032 static inline bool bdrv_has_async_rw(BlockDriver *drv)
1033 {
1034     return drv->bdrv_co_readv != bdrv_co_readv_em
1035         || drv->bdrv_aio_readv != bdrv_aio_readv_em;
1036 }
1037
1038 static inline bool bdrv_has_async_flush(BlockDriver *drv)
1039 {
1040     return drv->bdrv_aio_flush != bdrv_aio_flush_em;
1041 }
1042
1043 /* return < 0 if error. See bdrv_write() for the return codes */
1044 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1045               uint8_t *buf, int nb_sectors)
1046 {
1047     BlockDriver *drv = bs->drv;
1048
1049     if (!drv)
1050         return -ENOMEDIUM;
1051
1052     if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) {
1053         QEMUIOVector qiov;
1054         struct iovec iov = {
1055             .iov_base = (void *)buf,
1056             .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1057         };
1058
1059         qemu_iovec_init_external(&qiov, &iov, 1);
1060         return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);
1061     }
1062
1063     if (bdrv_check_request(bs, sector_num, nb_sectors))
1064         return -EIO;
1065
1066     return drv->bdrv_read(bs, sector_num, buf, nb_sectors);
1067 }
1068
1069 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1070                              int nb_sectors, int dirty)
1071 {
1072     int64_t start, end;
1073     unsigned long val, idx, bit;
1074
1075     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1076     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1077
1078     for (; start <= end; start++) {
1079         idx = start / (sizeof(unsigned long) * 8);
1080         bit = start % (sizeof(unsigned long) * 8);
1081         val = bs->dirty_bitmap[idx];
1082         if (dirty) {
1083             if (!(val & (1UL << bit))) {
1084                 bs->dirty_count++;
1085                 val |= 1UL << bit;
1086             }
1087         } else {
1088             if (val & (1UL << bit)) {
1089                 bs->dirty_count--;
1090                 val &= ~(1UL << bit);
1091             }
1092         }
1093         bs->dirty_bitmap[idx] = val;
1094     }
1095 }
1096
1097 /* Return < 0 if error. Important errors are:
1098   -EIO         generic I/O error (may happen for all errors)
1099   -ENOMEDIUM   No media inserted.
1100   -EINVAL      Invalid sector number or nb_sectors
1101   -EACCES      Trying to write a read-only device
1102 */
1103 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1104                const uint8_t *buf, int nb_sectors)
1105 {
1106     BlockDriver *drv = bs->drv;
1107
1108     if (!bs->drv)
1109         return -ENOMEDIUM;
1110
1111     if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) {
1112         QEMUIOVector qiov;
1113         struct iovec iov = {
1114             .iov_base = (void *)buf,
1115             .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1116         };
1117
1118         qemu_iovec_init_external(&qiov, &iov, 1);
1119         return bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1120     }
1121
1122     if (bs->read_only)
1123         return -EACCES;
1124     if (bdrv_check_request(bs, sector_num, nb_sectors))
1125         return -EIO;
1126
1127     if (bs->dirty_bitmap) {
1128         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1129     }
1130
1131     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1132         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1133     }
1134
1135     return drv->bdrv_write(bs, sector_num, buf, nb_sectors);
1136 }
1137
1138 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1139                void *buf, int count1)
1140 {
1141     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1142     int len, nb_sectors, count;
1143     int64_t sector_num;
1144     int ret;
1145
1146     count = count1;
1147     /* first read to align to sector start */
1148     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1149     if (len > count)
1150         len = count;
1151     sector_num = offset >> BDRV_SECTOR_BITS;
1152     if (len > 0) {
1153         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1154             return ret;
1155         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1156         count -= len;
1157         if (count == 0)
1158             return count1;
1159         sector_num++;
1160         buf += len;
1161     }
1162
1163     /* read the sectors "in place" */
1164     nb_sectors = count >> BDRV_SECTOR_BITS;
1165     if (nb_sectors > 0) {
1166         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1167             return ret;
1168         sector_num += nb_sectors;
1169         len = nb_sectors << BDRV_SECTOR_BITS;
1170         buf += len;
1171         count -= len;
1172     }
1173
1174     /* add data from the last sector */
1175     if (count > 0) {
1176         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1177             return ret;
1178         memcpy(buf, tmp_buf, count);
1179     }
1180     return count1;
1181 }
1182
1183 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1184                 const void *buf, int count1)
1185 {
1186     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1187     int len, nb_sectors, count;
1188     int64_t sector_num;
1189     int ret;
1190
1191     count = count1;
1192     /* first write to align to sector start */
1193     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1194     if (len > count)
1195         len = count;
1196     sector_num = offset >> BDRV_SECTOR_BITS;
1197     if (len > 0) {
1198         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1199             return ret;
1200         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1201         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1202             return ret;
1203         count -= len;
1204         if (count == 0)
1205             return count1;
1206         sector_num++;
1207         buf += len;
1208     }
1209
1210     /* write the sectors "in place" */
1211     nb_sectors = count >> BDRV_SECTOR_BITS;
1212     if (nb_sectors > 0) {
1213         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1214             return ret;
1215         sector_num += nb_sectors;
1216         len = nb_sectors << BDRV_SECTOR_BITS;
1217         buf += len;
1218         count -= len;
1219     }
1220
1221     /* add data from the last sector */
1222     if (count > 0) {
1223         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1224             return ret;
1225         memcpy(tmp_buf, buf, count);
1226         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1227             return ret;
1228     }
1229     return count1;
1230 }
1231
1232 /*
1233  * Writes to the file and ensures that no writes are reordered across this
1234  * request (acts as a barrier)
1235  *
1236  * Returns 0 on success, -errno in error cases.
1237  */
1238 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1239     const void *buf, int count)
1240 {
1241     int ret;
1242
1243     ret = bdrv_pwrite(bs, offset, buf, count);
1244     if (ret < 0) {
1245         return ret;
1246     }
1247
1248     /* No flush needed for cache modes that use O_DSYNC */
1249     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1250         bdrv_flush(bs);
1251     }
1252
1253     return 0;
1254 }
1255
1256 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1257     int nb_sectors, QEMUIOVector *qiov)
1258 {
1259     BlockDriver *drv = bs->drv;
1260
1261     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1262
1263     if (!drv) {
1264         return -ENOMEDIUM;
1265     }
1266     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1267         return -EIO;
1268     }
1269
1270     return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1271 }
1272
1273 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1274     int nb_sectors, QEMUIOVector *qiov)
1275 {
1276     BlockDriver *drv = bs->drv;
1277
1278     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1279
1280     if (!bs->drv) {
1281         return -ENOMEDIUM;
1282     }
1283     if (bs->read_only) {
1284         return -EACCES;
1285     }
1286     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1287         return -EIO;
1288     }
1289
1290     if (bs->dirty_bitmap) {
1291         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1292     }
1293
1294     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1295         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1296     }
1297
1298     return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1299 }
1300
1301 /**
1302  * Truncate file to 'offset' bytes (needed only for file protocols)
1303  */
1304 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1305 {
1306     BlockDriver *drv = bs->drv;
1307     int ret;
1308     if (!drv)
1309         return -ENOMEDIUM;
1310     if (!drv->bdrv_truncate)
1311         return -ENOTSUP;
1312     if (bs->read_only)
1313         return -EACCES;
1314     if (bdrv_in_use(bs))
1315         return -EBUSY;
1316     ret = drv->bdrv_truncate(bs, offset);
1317     if (ret == 0) {
1318         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1319         bdrv_dev_resize_cb(bs);
1320     }
1321     return ret;
1322 }
1323
1324 /**
1325  * Length of a allocated file in bytes. Sparse files are counted by actual
1326  * allocated space. Return < 0 if error or unknown.
1327  */
1328 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1329 {
1330     BlockDriver *drv = bs->drv;
1331     if (!drv) {
1332         return -ENOMEDIUM;
1333     }
1334     if (drv->bdrv_get_allocated_file_size) {
1335         return drv->bdrv_get_allocated_file_size(bs);
1336     }
1337     if (bs->file) {
1338         return bdrv_get_allocated_file_size(bs->file);
1339     }
1340     return -ENOTSUP;
1341 }
1342
1343 /**
1344  * Length of a file in bytes. Return < 0 if error or unknown.
1345  */
1346 int64_t bdrv_getlength(BlockDriverState *bs)
1347 {
1348     BlockDriver *drv = bs->drv;
1349     if (!drv)
1350         return -ENOMEDIUM;
1351
1352     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1353         if (drv->bdrv_getlength) {
1354             return drv->bdrv_getlength(bs);
1355         }
1356     }
1357     return bs->total_sectors * BDRV_SECTOR_SIZE;
1358 }
1359
1360 /* return 0 as number of sectors if no device present or error */
1361 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1362 {
1363     int64_t length;
1364     length = bdrv_getlength(bs);
1365     if (length < 0)
1366         length = 0;
1367     else
1368         length = length >> BDRV_SECTOR_BITS;
1369     *nb_sectors_ptr = length;
1370 }
1371
/*
 * One entry of the MSDOS partition table, as read by guess_disk_lchs().
 * Packed so the fields can be read at their on-disk positions.
 */
struct partition {
    uint8_t boot_ind;       /* 0x80 - active */
    uint8_t head;           /* starting head */
    uint8_t sector;         /* starting sector */
    uint8_t cyl;            /* starting cylinder */
    uint8_t sys_ind;        /* What partition type */
    uint8_t end_head;       /* end head */
    uint8_t end_sector;     /* end sector */
    uint8_t end_cyl;        /* end cylinder */
    uint32_t start_sect;    /* starting sector counting from 0 */
    uint32_t nr_sects;      /* nr of sectors in partition */
} QEMU_PACKED;
1384
1385 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1386 static int guess_disk_lchs(BlockDriverState *bs,
1387                            int *pcylinders, int *pheads, int *psectors)
1388 {
1389     uint8_t buf[BDRV_SECTOR_SIZE];
1390     int ret, i, heads, sectors, cylinders;
1391     struct partition *p;
1392     uint32_t nr_sects;
1393     uint64_t nb_sectors;
1394
1395     bdrv_get_geometry(bs, &nb_sectors);
1396
1397     ret = bdrv_read(bs, 0, buf, 1);
1398     if (ret < 0)
1399         return -1;
1400     /* test msdos magic */
1401     if (buf[510] != 0x55 || buf[511] != 0xaa)
1402         return -1;
1403     for(i = 0; i < 4; i++) {
1404         p = ((struct partition *)(buf + 0x1be)) + i;
1405         nr_sects = le32_to_cpu(p->nr_sects);
1406         if (nr_sects && p->end_head) {
1407             /* We make the assumption that the partition terminates on
1408                a cylinder boundary */
1409             heads = p->end_head + 1;
1410             sectors = p->end_sector & 63;
1411             if (sectors == 0)
1412                 continue;
1413             cylinders = nb_sectors / (heads * sectors);
1414             if (cylinders < 1 || cylinders > 16383)
1415                 continue;
1416             *pheads = heads;
1417             *psectors = sectors;
1418             *pcylinders = cylinders;
1419 #if 0
1420             printf("guessed geometry: LCHS=%d %d %d\n",
1421                    cylinders, heads, sectors);
1422 #endif
1423             return 0;
1424         }
1425     }
1426     return -1;
1427 }
1428
1429 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1430 {
1431     int translation, lba_detected = 0;
1432     int cylinders, heads, secs;
1433     uint64_t nb_sectors;
1434
1435     /* if a geometry hint is available, use it */
1436     bdrv_get_geometry(bs, &nb_sectors);
1437     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1438     translation = bdrv_get_translation_hint(bs);
1439     if (cylinders != 0) {
1440         *pcyls = cylinders;
1441         *pheads = heads;
1442         *psecs = secs;
1443     } else {
1444         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1445             if (heads > 16) {
1446                 /* if heads > 16, it means that a BIOS LBA
1447                    translation was active, so the default
1448                    hardware geometry is OK */
1449                 lba_detected = 1;
1450                 goto default_geometry;
1451             } else {
1452                 *pcyls = cylinders;
1453                 *pheads = heads;
1454                 *psecs = secs;
1455                 /* disable any translation to be in sync with
1456                    the logical geometry */
1457                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1458                     bdrv_set_translation_hint(bs,
1459                                               BIOS_ATA_TRANSLATION_NONE);
1460                 }
1461             }
1462         } else {
1463         default_geometry:
1464             /* if no geometry, use a standard physical disk geometry */
1465             cylinders = nb_sectors / (16 * 63);
1466
1467             if (cylinders > 16383)
1468                 cylinders = 16383;
1469             else if (cylinders < 2)
1470                 cylinders = 2;
1471             *pcyls = cylinders;
1472             *pheads = 16;
1473             *psecs = 63;
1474             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1475                 if ((*pcyls * *pheads) <= 131072) {
1476                     bdrv_set_translation_hint(bs,
1477                                               BIOS_ATA_TRANSLATION_LARGE);
1478                 } else {
1479                     bdrv_set_translation_hint(bs,
1480                                               BIOS_ATA_TRANSLATION_LBA);
1481                 }
1482             }
1483         }
1484         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1485     }
1486 }
1487
1488 void bdrv_set_geometry_hint(BlockDriverState *bs,
1489                             int cyls, int heads, int secs)
1490 {
1491     bs->cyls = cyls;
1492     bs->heads = heads;
1493     bs->secs = secs;
1494 }
1495
1496 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1497 {
1498     bs->translation = translation;
1499 }
1500
1501 void bdrv_get_geometry_hint(BlockDriverState *bs,
1502                             int *pcyls, int *pheads, int *psecs)
1503 {
1504     *pcyls = bs->cyls;
1505     *pheads = bs->heads;
1506     *psecs = bs->secs;
1507 }
1508
1509 /* Recognize floppy formats */
1510 typedef struct FDFormat {
1511     FDriveType drive;
1512     uint8_t last_sect;
1513     uint8_t max_track;
1514     uint8_t max_head;
1515 } FDFormat;
1516
1517 static const FDFormat fd_formats[] = {
1518     /* First entry is default format */
1519     /* 1.44 MB 3"1/2 floppy disks */
1520     { FDRIVE_DRV_144, 18, 80, 1, },
1521     { FDRIVE_DRV_144, 20, 80, 1, },
1522     { FDRIVE_DRV_144, 21, 80, 1, },
1523     { FDRIVE_DRV_144, 21, 82, 1, },
1524     { FDRIVE_DRV_144, 21, 83, 1, },
1525     { FDRIVE_DRV_144, 22, 80, 1, },
1526     { FDRIVE_DRV_144, 23, 80, 1, },
1527     { FDRIVE_DRV_144, 24, 80, 1, },
1528     /* 2.88 MB 3"1/2 floppy disks */
1529     { FDRIVE_DRV_288, 36, 80, 1, },
1530     { FDRIVE_DRV_288, 39, 80, 1, },
1531     { FDRIVE_DRV_288, 40, 80, 1, },
1532     { FDRIVE_DRV_288, 44, 80, 1, },
1533     { FDRIVE_DRV_288, 48, 80, 1, },
1534     /* 720 kB 3"1/2 floppy disks */
1535     { FDRIVE_DRV_144,  9, 80, 1, },
1536     { FDRIVE_DRV_144, 10, 80, 1, },
1537     { FDRIVE_DRV_144, 10, 82, 1, },
1538     { FDRIVE_DRV_144, 10, 83, 1, },
1539     { FDRIVE_DRV_144, 13, 80, 1, },
1540     { FDRIVE_DRV_144, 14, 80, 1, },
1541     /* 1.2 MB 5"1/4 floppy disks */
1542     { FDRIVE_DRV_120, 15, 80, 1, },
1543     { FDRIVE_DRV_120, 18, 80, 1, },
1544     { FDRIVE_DRV_120, 18, 82, 1, },
1545     { FDRIVE_DRV_120, 18, 83, 1, },
1546     { FDRIVE_DRV_120, 20, 80, 1, },
1547     /* 720 kB 5"1/4 floppy disks */
1548     { FDRIVE_DRV_120,  9, 80, 1, },
1549     { FDRIVE_DRV_120, 11, 80, 1, },
1550     /* 360 kB 5"1/4 floppy disks */
1551     { FDRIVE_DRV_120,  9, 40, 1, },
1552     { FDRIVE_DRV_120,  9, 40, 0, },
1553     { FDRIVE_DRV_120, 10, 41, 1, },
1554     { FDRIVE_DRV_120, 10, 42, 1, },
1555     /* 320 kB 5"1/4 floppy disks */
1556     { FDRIVE_DRV_120,  8, 40, 1, },
1557     { FDRIVE_DRV_120,  8, 40, 0, },
1558     /* 360 kB must match 5"1/4 better than 3"1/2... */
1559     { FDRIVE_DRV_144,  9, 80, 0, },
1560     /* end */
1561     { FDRIVE_DRV_NONE, -1, -1, 0, },
1562 };
1563
1564 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1565                                    int *max_track, int *last_sect,
1566                                    FDriveType drive_in, FDriveType *drive)
1567 {
1568     const FDFormat *parse;
1569     uint64_t nb_sectors, size;
1570     int i, first_match, match;
1571
1572     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1573     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1574         /* User defined disk */
1575     } else {
1576         bdrv_get_geometry(bs, &nb_sectors);
1577         match = -1;
1578         first_match = -1;
1579         for (i = 0; ; i++) {
1580             parse = &fd_formats[i];
1581             if (parse->drive == FDRIVE_DRV_NONE) {
1582                 break;
1583             }
1584             if (drive_in == parse->drive ||
1585                 drive_in == FDRIVE_DRV_NONE) {
1586                 size = (parse->max_head + 1) * parse->max_track *
1587                     parse->last_sect;
1588                 if (nb_sectors == size) {
1589                     match = i;
1590                     break;
1591                 }
1592                 if (first_match == -1) {
1593                     first_match = i;
1594                 }
1595             }
1596         }
1597         if (match == -1) {
1598             if (first_match == -1) {
1599                 match = 1;
1600             } else {
1601                 match = first_match;
1602             }
1603             parse = &fd_formats[match];
1604         }
1605         *nb_heads = parse->max_head + 1;
1606         *max_track = parse->max_track;
1607         *last_sect = parse->last_sect;
1608         *drive = parse->drive;
1609     }
1610 }
1611
1612 int bdrv_get_translation_hint(BlockDriverState *bs)
1613 {
1614     return bs->translation;
1615 }
1616
1617 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1618                        BlockErrorAction on_write_error)
1619 {
1620     bs->on_read_error = on_read_error;
1621     bs->on_write_error = on_write_error;
1622 }
1623
1624 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1625 {
1626     return is_read ? bs->on_read_error : bs->on_write_error;
1627 }
1628
1629 int bdrv_is_read_only(BlockDriverState *bs)
1630 {
1631     return bs->read_only;
1632 }
1633
1634 int bdrv_is_sg(BlockDriverState *bs)
1635 {
1636     return bs->sg;
1637 }
1638
1639 int bdrv_enable_write_cache(BlockDriverState *bs)
1640 {
1641     return bs->enable_write_cache;
1642 }
1643
1644 int bdrv_is_encrypted(BlockDriverState *bs)
1645 {
1646     if (bs->backing_hd && bs->backing_hd->encrypted)
1647         return 1;
1648     return bs->encrypted;
1649 }
1650
1651 int bdrv_key_required(BlockDriverState *bs)
1652 {
1653     BlockDriverState *backing_hd = bs->backing_hd;
1654
1655     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1656         return 1;
1657     return (bs->encrypted && !bs->valid_key);
1658 }
1659
1660 int bdrv_set_key(BlockDriverState *bs, const char *key)
1661 {
1662     int ret;
1663     if (bs->backing_hd && bs->backing_hd->encrypted) {
1664         ret = bdrv_set_key(bs->backing_hd, key);
1665         if (ret < 0)
1666             return ret;
1667         if (!bs->encrypted)
1668             return 0;
1669     }
1670     if (!bs->encrypted) {
1671         return -EINVAL;
1672     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1673         return -ENOMEDIUM;
1674     }
1675     ret = bs->drv->bdrv_set_key(bs, key);
1676     if (ret < 0) {
1677         bs->valid_key = 0;
1678     } else if (!bs->valid_key) {
1679         bs->valid_key = 1;
1680         /* call the change callback now, we skipped it on open */
1681         bdrv_dev_change_media_cb(bs, true);
1682     }
1683     return ret;
1684 }
1685
1686 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1687 {
1688     if (!bs->drv) {
1689         buf[0] = '\0';
1690     } else {
1691         pstrcpy(buf, buf_size, bs->drv->format_name);
1692     }
1693 }
1694
1695 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1696                          void *opaque)
1697 {
1698     BlockDriver *drv;
1699
1700     QLIST_FOREACH(drv, &bdrv_drivers, list) {
1701         it(opaque, drv->format_name);
1702     }
1703 }
1704
1705 BlockDriverState *bdrv_find(const char *name)
1706 {
1707     BlockDriverState *bs;
1708
1709     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1710         if (!strcmp(name, bs->device_name)) {
1711             return bs;
1712         }
1713     }
1714     return NULL;
1715 }
1716
1717 BlockDriverState *bdrv_next(BlockDriverState *bs)
1718 {
1719     if (!bs) {
1720         return QTAILQ_FIRST(&bdrv_states);
1721     }
1722     return QTAILQ_NEXT(bs, list);
1723 }
1724
1725 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1726 {
1727     BlockDriverState *bs;
1728
1729     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1730         it(opaque, bs);
1731     }
1732 }
1733
1734 const char *bdrv_get_device_name(BlockDriverState *bs)
1735 {
1736     return bs->device_name;
1737 }
1738
1739 int bdrv_flush(BlockDriverState *bs)
1740 {
1741     if (bs->open_flags & BDRV_O_NO_FLUSH) {
1742         return 0;
1743     }
1744
1745     if (bs->drv && bdrv_has_async_flush(bs->drv) && qemu_in_coroutine()) {
1746         return bdrv_co_flush_em(bs);
1747     }
1748
1749     if (bs->drv && bs->drv->bdrv_flush) {
1750         return bs->drv->bdrv_flush(bs);
1751     }
1752
1753     /*
1754      * Some block drivers always operate in either writethrough or unsafe mode
1755      * and don't support bdrv_flush therefore. Usually qemu doesn't know how
1756      * the server works (because the behaviour is hardcoded or depends on
1757      * server-side configuration), so we can't ensure that everything is safe
1758      * on disk. Returning an error doesn't work because that would break guests
1759      * even if the server operates in writethrough mode.
1760      *
1761      * Let's hope the user knows what he's doing.
1762      */
1763     return 0;
1764 }
1765
1766 void bdrv_flush_all(void)
1767 {
1768     BlockDriverState *bs;
1769
1770     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1771         if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
1772             bdrv_flush(bs);
1773         }
1774     }
1775 }
1776
1777 int bdrv_has_zero_init(BlockDriverState *bs)
1778 {
1779     assert(bs->drv);
1780
1781     if (bs->drv->bdrv_has_zero_init) {
1782         return bs->drv->bdrv_has_zero_init(bs);
1783     }
1784
1785     return 1;
1786 }
1787
1788 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
1789 {
1790     if (!bs->drv) {
1791         return -ENOMEDIUM;
1792     }
1793     if (!bs->drv->bdrv_discard) {
1794         return 0;
1795     }
1796     return bs->drv->bdrv_discard(bs, sector_num, nb_sectors);
1797 }
1798
1799 /*
1800  * Returns true iff the specified sector is present in the disk image. Drivers
1801  * not implementing the functionality are assumed to not support backing files,
1802  * hence all their sectors are reported as allocated.
1803  *
1804  * 'pnum' is set to the number of sectors (including and immediately following
1805  * the specified sector) that are known to be in the same
1806  * allocated/unallocated state.
1807  *
1808  * 'nb_sectors' is the max value 'pnum' should be set to.
1809  */
1810 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1811         int *pnum)
1812 {
1813     int64_t n;
1814     if (!bs->drv->bdrv_is_allocated) {
1815         if (sector_num >= bs->total_sectors) {
1816             *pnum = 0;
1817             return 0;
1818         }
1819         n = bs->total_sectors - sector_num;
1820         *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1821         return 1;
1822     }
1823     return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1824 }
1825
1826 void bdrv_mon_event(const BlockDriverState *bdrv,
1827                     BlockMonEventAction action, int is_read)
1828 {
1829     QObject *data;
1830     const char *action_str;
1831
1832     switch (action) {
1833     case BDRV_ACTION_REPORT:
1834         action_str = "report";
1835         break;
1836     case BDRV_ACTION_IGNORE:
1837         action_str = "ignore";
1838         break;
1839     case BDRV_ACTION_STOP:
1840         action_str = "stop";
1841         break;
1842     default:
1843         abort();
1844     }
1845
1846     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1847                               bdrv->device_name,
1848                               action_str,
1849                               is_read ? "read" : "write");
1850     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1851
1852     qobject_decref(data);
1853 }
1854
1855 static void bdrv_print_dict(QObject *obj, void *opaque)
1856 {
1857     QDict *bs_dict;
1858     Monitor *mon = opaque;
1859
1860     bs_dict = qobject_to_qdict(obj);
1861
1862     monitor_printf(mon, "%s: removable=%d",
1863                         qdict_get_str(bs_dict, "device"),
1864                         qdict_get_bool(bs_dict, "removable"));
1865
1866     if (qdict_get_bool(bs_dict, "removable")) {
1867         monitor_printf(mon, " locked=%d", qdict_get_bool(bs_dict, "locked"));
1868         monitor_printf(mon, " tray-open=%d",
1869                        qdict_get_bool(bs_dict, "tray-open"));
1870     }
1871
1872     if (qdict_haskey(bs_dict, "io-status")) {
1873         monitor_printf(mon, " io-status=%s", qdict_get_str(bs_dict, "io-status"));
1874     }
1875
1876     if (qdict_haskey(bs_dict, "inserted")) {
1877         QDict *qdict = qobject_to_qdict(qdict_get(bs_dict, "inserted"));
1878
1879         monitor_printf(mon, " file=");
1880         monitor_print_filename(mon, qdict_get_str(qdict, "file"));
1881         if (qdict_haskey(qdict, "backing_file")) {
1882             monitor_printf(mon, " backing_file=");
1883             monitor_print_filename(mon, qdict_get_str(qdict, "backing_file"));
1884         }
1885         monitor_printf(mon, " ro=%d drv=%s encrypted=%d",
1886                             qdict_get_bool(qdict, "ro"),
1887                             qdict_get_str(qdict, "drv"),
1888                             qdict_get_bool(qdict, "encrypted"));
1889     } else {
1890         monitor_printf(mon, " [not inserted]");
1891     }
1892
1893     monitor_printf(mon, "\n");
1894 }
1895
1896 void bdrv_info_print(Monitor *mon, const QObject *data)
1897 {
1898     qlist_iter(qobject_to_qlist(data), bdrv_print_dict, mon);
1899 }
1900
1901 static const char *const io_status_name[BDRV_IOS_MAX] = {
1902     [BDRV_IOS_OK] = "ok",
1903     [BDRV_IOS_FAILED] = "failed",
1904     [BDRV_IOS_ENOSPC] = "nospace",
1905 };
1906
1907 void bdrv_info(Monitor *mon, QObject **ret_data)
1908 {
1909     QList *bs_list;
1910     BlockDriverState *bs;
1911
1912     bs_list = qlist_new();
1913
1914     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1915         QObject *bs_obj;
1916         QDict *bs_dict;
1917
1918         bs_obj = qobject_from_jsonf("{ 'device': %s, 'type': 'unknown', "
1919                                     "'removable': %i, 'locked': %i }",
1920                                     bs->device_name,
1921                                     bdrv_dev_has_removable_media(bs),
1922                                     bdrv_dev_is_medium_locked(bs));
1923         bs_dict = qobject_to_qdict(bs_obj);
1924
1925         if (bdrv_dev_has_removable_media(bs)) {
1926             qdict_put(bs_dict, "tray-open",
1927                       qbool_from_int(bdrv_dev_is_tray_open(bs)));
1928         }
1929
1930         if (bdrv_iostatus_is_enabled(bs)) {
1931             qdict_put(bs_dict, "io-status",
1932                       qstring_from_str(io_status_name[bs->iostatus]));
1933         }
1934
1935         if (bs->drv) {
1936             QObject *obj;
1937
1938             obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
1939                                      "'encrypted': %i }",
1940                                      bs->filename, bs->read_only,
1941                                      bs->drv->format_name,
1942                                      bdrv_is_encrypted(bs));
1943             if (bs->backing_file[0] != '\0') {
1944                 QDict *qdict = qobject_to_qdict(obj);
1945                 qdict_put(qdict, "backing_file",
1946                           qstring_from_str(bs->backing_file));
1947             }
1948
1949             qdict_put_obj(bs_dict, "inserted", obj);
1950         }
1951         qlist_append_obj(bs_list, bs_obj);
1952     }
1953
1954     *ret_data = QOBJECT(bs_list);
1955 }
1956
1957 static void bdrv_stats_iter(QObject *data, void *opaque)
1958 {
1959     QDict *qdict;
1960     Monitor *mon = opaque;
1961
1962     qdict = qobject_to_qdict(data);
1963     monitor_printf(mon, "%s:", qdict_get_str(qdict, "device"));
1964
1965     qdict = qobject_to_qdict(qdict_get(qdict, "stats"));
1966     monitor_printf(mon, " rd_bytes=%" PRId64
1967                         " wr_bytes=%" PRId64
1968                         " rd_operations=%" PRId64
1969                         " wr_operations=%" PRId64
1970                         " flush_operations=%" PRId64
1971                         " wr_total_time_ns=%" PRId64
1972                         " rd_total_time_ns=%" PRId64
1973                         " flush_total_time_ns=%" PRId64
1974                         "\n",
1975                         qdict_get_int(qdict, "rd_bytes"),
1976                         qdict_get_int(qdict, "wr_bytes"),
1977                         qdict_get_int(qdict, "rd_operations"),
1978                         qdict_get_int(qdict, "wr_operations"),
1979                         qdict_get_int(qdict, "flush_operations"),
1980                         qdict_get_int(qdict, "wr_total_time_ns"),
1981                         qdict_get_int(qdict, "rd_total_time_ns"),
1982                         qdict_get_int(qdict, "flush_total_time_ns"));
1983 }
1984
1985 void bdrv_stats_print(Monitor *mon, const QObject *data)
1986 {
1987     qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon);
1988 }
1989
1990 static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
1991 {
1992     QObject *res;
1993     QDict *dict;
1994
1995     res = qobject_from_jsonf("{ 'stats': {"
1996                              "'rd_bytes': %" PRId64 ","
1997                              "'wr_bytes': %" PRId64 ","
1998                              "'rd_operations': %" PRId64 ","
1999                              "'wr_operations': %" PRId64 ","
2000                              "'wr_highest_offset': %" PRId64 ","
2001                              "'flush_operations': %" PRId64 ","
2002                              "'wr_total_time_ns': %" PRId64 ","
2003                              "'rd_total_time_ns': %" PRId64 ","
2004                              "'flush_total_time_ns': %" PRId64
2005                              "} }",
2006                              bs->nr_bytes[BDRV_ACCT_READ],
2007                              bs->nr_bytes[BDRV_ACCT_WRITE],
2008                              bs->nr_ops[BDRV_ACCT_READ],
2009                              bs->nr_ops[BDRV_ACCT_WRITE],
2010                              bs->wr_highest_sector *
2011                              (uint64_t)BDRV_SECTOR_SIZE,
2012                              bs->nr_ops[BDRV_ACCT_FLUSH],
2013                              bs->total_time_ns[BDRV_ACCT_WRITE],
2014                              bs->total_time_ns[BDRV_ACCT_READ],
2015                              bs->total_time_ns[BDRV_ACCT_FLUSH]);
2016     dict  = qobject_to_qdict(res);
2017
2018     if (*bs->device_name) {
2019         qdict_put(dict, "device", qstring_from_str(bs->device_name));
2020     }
2021
2022     if (bs->file) {
2023         QObject *parent = bdrv_info_stats_bs(bs->file);
2024         qdict_put_obj(dict, "parent", parent);
2025     }
2026
2027     return res;
2028 }
2029
2030 void bdrv_info_stats(Monitor *mon, QObject **ret_data)
2031 {
2032     QObject *obj;
2033     QList *devices;
2034     BlockDriverState *bs;
2035
2036     devices = qlist_new();
2037
2038     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2039         obj = bdrv_info_stats_bs(bs);
2040         qlist_append_obj(devices, obj);
2041     }
2042
2043     *ret_data = QOBJECT(devices);
2044 }
2045
2046 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2047 {
2048     if (bs->backing_hd && bs->backing_hd->encrypted)
2049         return bs->backing_file;
2050     else if (bs->encrypted)
2051         return bs->filename;
2052     else
2053         return NULL;
2054 }
2055
2056 void bdrv_get_backing_filename(BlockDriverState *bs,
2057                                char *filename, int filename_size)
2058 {
2059     if (!bs->backing_file) {
2060         pstrcpy(filename, filename_size, "");
2061     } else {
2062         pstrcpy(filename, filename_size, bs->backing_file);
2063     }
2064 }
2065
2066 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2067                           const uint8_t *buf, int nb_sectors)
2068 {
2069     BlockDriver *drv = bs->drv;
2070     if (!drv)
2071         return -ENOMEDIUM;
2072     if (!drv->bdrv_write_compressed)
2073         return -ENOTSUP;
2074     if (bdrv_check_request(bs, sector_num, nb_sectors))
2075         return -EIO;
2076
2077     if (bs->dirty_bitmap) {
2078         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2079     }
2080
2081     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2082 }
2083
2084 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2085 {
2086     BlockDriver *drv = bs->drv;
2087     if (!drv)
2088         return -ENOMEDIUM;
2089     if (!drv->bdrv_get_info)
2090         return -ENOTSUP;
2091     memset(bdi, 0, sizeof(*bdi));
2092     return drv->bdrv_get_info(bs, bdi);
2093 }
2094
2095 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2096                       int64_t pos, int size)
2097 {
2098     BlockDriver *drv = bs->drv;
2099     if (!drv)
2100         return -ENOMEDIUM;
2101     if (drv->bdrv_save_vmstate)
2102         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2103     if (bs->file)
2104         return bdrv_save_vmstate(bs->file, buf, pos, size);
2105     return -ENOTSUP;
2106 }
2107
2108 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2109                       int64_t pos, int size)
2110 {
2111     BlockDriver *drv = bs->drv;
2112     if (!drv)
2113         return -ENOMEDIUM;
2114     if (drv->bdrv_load_vmstate)
2115         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2116     if (bs->file)
2117         return bdrv_load_vmstate(bs->file, buf, pos, size);
2118     return -ENOTSUP;
2119 }
2120
2121 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2122 {
2123     BlockDriver *drv = bs->drv;
2124
2125     if (!drv || !drv->bdrv_debug_event) {
2126         return;
2127     }
2128
2129     return drv->bdrv_debug_event(bs, event);
2130
2131 }
2132
2133 /**************************************************************/
2134 /* handling of snapshots */
2135
2136 int bdrv_can_snapshot(BlockDriverState *bs)
2137 {
2138     BlockDriver *drv = bs->drv;
2139     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2140         return 0;
2141     }
2142
2143     if (!drv->bdrv_snapshot_create) {
2144         if (bs->file != NULL) {
2145             return bdrv_can_snapshot(bs->file);
2146         }
2147         return 0;
2148     }
2149
2150     return 1;
2151 }
2152
2153 int bdrv_is_snapshot(BlockDriverState *bs)
2154 {
2155     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2156 }
2157
2158 BlockDriverState *bdrv_snapshots(void)
2159 {
2160     BlockDriverState *bs;
2161
2162     if (bs_snapshots) {
2163         return bs_snapshots;
2164     }
2165
2166     bs = NULL;
2167     while ((bs = bdrv_next(bs))) {
2168         if (bdrv_can_snapshot(bs)) {
2169             bs_snapshots = bs;
2170             return bs;
2171         }
2172     }
2173     return NULL;
2174 }
2175
2176 int bdrv_snapshot_create(BlockDriverState *bs,
2177                          QEMUSnapshotInfo *sn_info)
2178 {
2179     BlockDriver *drv = bs->drv;
2180     if (!drv)
2181         return -ENOMEDIUM;
2182     if (drv->bdrv_snapshot_create)
2183         return drv->bdrv_snapshot_create(bs, sn_info);
2184     if (bs->file)
2185         return bdrv_snapshot_create(bs->file, sn_info);
2186     return -ENOTSUP;
2187 }
2188
2189 int bdrv_snapshot_goto(BlockDriverState *bs,
2190                        const char *snapshot_id)
2191 {
2192     BlockDriver *drv = bs->drv;
2193     int ret, open_ret;
2194
2195     if (!drv)
2196         return -ENOMEDIUM;
2197     if (drv->bdrv_snapshot_goto)
2198         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2199
2200     if (bs->file) {
2201         drv->bdrv_close(bs);
2202         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2203         open_ret = drv->bdrv_open(bs, bs->open_flags);
2204         if (open_ret < 0) {
2205             bdrv_delete(bs->file);
2206             bs->drv = NULL;
2207             return open_ret;
2208         }
2209         return ret;
2210     }
2211
2212     return -ENOTSUP;
2213 }
2214
2215 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2216 {
2217     BlockDriver *drv = bs->drv;
2218     if (!drv)
2219         return -ENOMEDIUM;
2220     if (drv->bdrv_snapshot_delete)
2221         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2222     if (bs->file)
2223         return bdrv_snapshot_delete(bs->file, snapshot_id);
2224     return -ENOTSUP;
2225 }
2226
2227 int bdrv_snapshot_list(BlockDriverState *bs,
2228                        QEMUSnapshotInfo **psn_info)
2229 {
2230     BlockDriver *drv = bs->drv;
2231     if (!drv)
2232         return -ENOMEDIUM;
2233     if (drv->bdrv_snapshot_list)
2234         return drv->bdrv_snapshot_list(bs, psn_info);
2235     if (bs->file)
2236         return bdrv_snapshot_list(bs->file, psn_info);
2237     return -ENOTSUP;
2238 }
2239
2240 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2241         const char *snapshot_name)
2242 {
2243     BlockDriver *drv = bs->drv;
2244     if (!drv) {
2245         return -ENOMEDIUM;
2246     }
2247     if (!bs->read_only) {
2248         return -EINVAL;
2249     }
2250     if (drv->bdrv_snapshot_load_tmp) {
2251         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2252     }
2253     return -ENOTSUP;
2254 }
2255
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (at most @buf_size bytes) in a short
 * human-readable form and return @buf.
 *
 * Values up to 999 are printed as plain integers.  Larger values are scaled
 * by powers of 1024 and tagged with K, M, G or T: one decimal place while
 * the scaled value is below 10, a rounded integer otherwise.  Values too
 * large for the scheme are printed as a rounded integer number of T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < 10 * unit) {
            /* Small multiple of the unit: keep one decimal place. */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            return buf;
        }
        if (size < 1000 * unit || idx == NB_SUFFIXES - 1) {
            /* Round to the nearest whole unit; the last suffix is a
             * catch-all. */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit, suffixes[idx]);
            return buf;
        }
    }
    return buf;
}
2285
2286 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2287 {
2288     char buf1[128], date_buf[128], clock_buf[128];
2289 #ifdef _WIN32
2290     struct tm *ptm;
2291 #else
2292     struct tm tm;
2293 #endif
2294     time_t ti;
2295     int64_t secs;
2296
2297     if (!sn) {
2298         snprintf(buf, buf_size,
2299                  "%-10s%-20s%7s%20s%15s",
2300                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2301     } else {
2302         ti = sn->date_sec;
2303 #ifdef _WIN32
2304         ptm = localtime(&ti);
2305         strftime(date_buf, sizeof(date_buf),
2306                  "%Y-%m-%d %H:%M:%S", ptm);
2307 #else
2308         localtime_r(&ti, &tm);
2309         strftime(date_buf, sizeof(date_buf),
2310                  "%Y-%m-%d %H:%M:%S", &tm);
2311 #endif
2312         secs = sn->vm_clock_nsec / 1000000000;
2313         snprintf(clock_buf, sizeof(clock_buf),
2314                  "%02d:%02d:%02d.%03d",
2315                  (int)(secs / 3600),
2316                  (int)((secs / 60) % 60),
2317                  (int)(secs % 60),
2318                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
2319         snprintf(buf, buf_size,
2320                  "%-10s%-20s%7s%20s%15s",
2321                  sn->id_str, sn->name,
2322                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2323                  date_buf,
2324                  clock_buf);
2325     }
2326     return buf;
2327 }
2328
2329 /**************************************************************/
2330 /* async I/Os */
2331
2332 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2333                                  QEMUIOVector *qiov, int nb_sectors,
2334                                  BlockDriverCompletionFunc *cb, void *opaque)
2335 {
2336     BlockDriver *drv = bs->drv;
2337
2338     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2339
2340     if (!drv)
2341         return NULL;
2342     if (bdrv_check_request(bs, sector_num, nb_sectors))
2343         return NULL;
2344
2345     return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
2346                                cb, opaque);
2347 }
2348
/*
 * Completion context used by bdrv_aio_writev() when a dirty bitmap is
 * active: block_complete_cb() marks the written range dirty and then
 * forwards the result to the caller's original callback.
 */
typedef struct BlockCompleteData {
    BlockDriverCompletionFunc *cb;  /* original completion callback */
    void *opaque;                   /* argument for the original callback */
    BlockDriverState *bs;           /* device whose dirty bitmap is updated */
    int64_t sector_num;             /* first sector of the write */
    int nb_sectors;                 /* number of sectors written */
} BlockCompleteData;
2356
2357 static void block_complete_cb(void *opaque, int ret)
2358 {
2359     BlockCompleteData *b = opaque;
2360
2361     if (b->bs->dirty_bitmap) {
2362         set_dirty_bitmap(b->bs, b->sector_num, b->nb_sectors, 1);
2363     }
2364     b->cb(b->opaque, ret);
2365     g_free(b);
2366 }
2367
2368 static BlockCompleteData *blk_dirty_cb_alloc(BlockDriverState *bs,
2369                                              int64_t sector_num,
2370                                              int nb_sectors,
2371                                              BlockDriverCompletionFunc *cb,
2372                                              void *opaque)
2373 {
2374     BlockCompleteData *blkdata = g_malloc0(sizeof(BlockCompleteData));
2375
2376     blkdata->bs = bs;
2377     blkdata->cb = cb;
2378     blkdata->opaque = opaque;
2379     blkdata->sector_num = sector_num;
2380     blkdata->nb_sectors = nb_sectors;
2381
2382     return blkdata;
2383 }
2384
2385 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2386                                   QEMUIOVector *qiov, int nb_sectors,
2387                                   BlockDriverCompletionFunc *cb, void *opaque)
2388 {
2389     BlockDriver *drv = bs->drv;
2390     BlockDriverAIOCB *ret;
2391     BlockCompleteData *blk_cb_data;
2392
2393     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2394
2395     if (!drv)
2396         return NULL;
2397     if (bs->read_only)
2398         return NULL;
2399     if (bdrv_check_request(bs, sector_num, nb_sectors))
2400         return NULL;
2401
2402     if (bs->dirty_bitmap) {
2403         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
2404                                          opaque);
2405         cb = &block_complete_cb;
2406         opaque = blk_cb_data;
2407     }
2408
2409     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
2410                                cb, opaque);
2411
2412     if (ret) {
2413         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2414             bs->wr_highest_sector = sector_num + nb_sectors - 1;
2415         }
2416     }
2417
2418     return ret;
2419 }
2420
2421
/*
 * Shared state of one bdrv_aio_multiwrite() call.  It is passed as opaque
 * to multiwrite_cb() for every submitted (possibly merged) write.
 */
typedef struct MultiwriteCB {
    int error;          /* first negative result seen, 0 if none */
    int num_requests;   /* completions still expected by multiwrite_cb() */
    int num_callbacks;  /* number of entries in callbacks[] */
    struct {
        BlockDriverCompletionFunc *cb;  /* caller's completion callback */
        void *opaque;                   /* argument for cb */
        QEMUIOVector *free_qiov;  /* merged qiov to destroy and free, or NULL */
        void *free_buf;           /* zero-fill buffer to free, or NULL */
    } callbacks[];
} MultiwriteCB;
2433
2434 static void multiwrite_user_cb(MultiwriteCB *mcb)
2435 {
2436     int i;
2437
2438     for (i = 0; i < mcb->num_callbacks; i++) {
2439         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2440         if (mcb->callbacks[i].free_qiov) {
2441             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2442         }
2443         g_free(mcb->callbacks[i].free_qiov);
2444         qemu_vfree(mcb->callbacks[i].free_buf);
2445     }
2446 }
2447
2448 static void multiwrite_cb(void *opaque, int ret)
2449 {
2450     MultiwriteCB *mcb = opaque;
2451
2452     trace_multiwrite_cb(mcb, ret);
2453
2454     if (ret < 0 && !mcb->error) {
2455         mcb->error = ret;
2456     }
2457
2458     mcb->num_requests--;
2459     if (mcb->num_requests == 0) {
2460         multiwrite_user_cb(mcb);
2461         g_free(mcb);
2462     }
2463 }
2464
2465 static int multiwrite_req_compare(const void *a, const void *b)
2466 {
2467     const BlockRequest *req1 = a, *req2 = b;
2468
2469     /*
2470      * Note that we can't simply subtract req2->sector from req1->sector
2471      * here as that could overflow the return value.
2472      */
2473     if (req1->sector > req2->sector) {
2474         return 1;
2475     } else if (req1->sector < req2->sector) {
2476         return -1;
2477     } else {
2478         return 0;
2479     }
2480 }
2481
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 *
 * @reqs is sorted by start sector in place and then compacted: the merged
 * requests occupy reqs[0 .. return value - 1]. For every merge a new
 * QEMUIOVector (and, when a gap is zero-filled, a buffer) is allocated and
 * recorded in mcb->callbacks[i] so that multiwrite_user_cb() can release it
 * once the multiwrite has completed.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    // outidx is the request currently being extended; i is the candidate
    // that is either merged into it or becomes the next output request.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        // First sector after the end of the current output request
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // This handles the cases that are valid for all block drivers, namely
        // exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // The block driver may decide that it makes sense to combine requests
        // even if there is a gap of some sectors between them. In this case,
        // the gap is filled with zeros (therefore only applicable for yet
        // unused space in format like qcow2).
        if (!merge && bs->drv->bdrv_merge_requests) {
            merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
        }

        // Never build a vector with more than IOV_MAX elements; the + 1
        // accounts for a possible zero-fill element between the requests.
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            // (<< 9 converts a sector count into bytes, i.e. times 512.)
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);

            // We might need to add some zeros between the two requests
            if (reqs[i].sector > oldreq_last) {
                size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
                uint8_t *buf = qemu_blockalign(bs, zero_bytes);
                memset(buf, 0, zero_bytes);
                qemu_iovec_add(qiov, buf, zero_bytes);
                // Remember the buffer so it is freed on completion
                mcb->callbacks[i].free_buf = buf;
            }

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Remember the merged qiov so it is destroyed on completion
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            // No merge: request i becomes the next output request
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
2556
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    BlockDriverAIOCB *acb;
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    // Record every caller callback before merging shrinks num_reqs
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /*
     * Run the aio requests. As soon as one request can't be submitted
     * successfully, fail all requests that are not yet submitted (we must
     * return failure for all requests anyway)
     *
     * num_requests cannot be set to the right value immediately: If
     * bdrv_aio_writev fails for some request, num_requests would be too high
     * and therefore multiwrite_cb() would never recognize the multiwrite
     * request as completed. We also cannot use the loop variable i to set it
     * when the first request fails because the callback may already have been
     * called for previously submitted requests. Thus, num_requests must be
     * incremented for each request that is submitted.
     *
     * The problem that callbacks may be called early also means that we need
     * to take care that num_requests doesn't become 0 before all requests are
     * submitted - multiwrite_cb() would consider the multiwrite request
     * completed. A dummy request that is "completed" by a manual call to
     * multiwrite_cb() takes care of this.
     */
    mcb->num_requests = 1;

    // Run the aio requests
    for (i = 0; i < num_reqs; i++) {
        mcb->num_requests++;
        acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);

        if (acb == NULL) {
            // We can only fail the whole thing if no request has been
            // submitted yet. Otherwise we'll wait for the submitted AIOs to
            // complete and report the error in the callback.
            if (i == 0) {
                trace_bdrv_aio_multiwrite_earlyfail(mcb);
                goto fail;
            } else {
                trace_bdrv_aio_multiwrite_latefail(mcb, i);
                // Account for the request that could not be submitted
                multiwrite_cb(mcb, -EIO);
                break;
            }
        }
    }

    /* Complete the dummy request */
    multiwrite_cb(mcb, 0);

    return 0;

fail:
    /*
     * Nothing was submitted: flag every original request (num_callbacks is
     * the request count from before merging) and drop the bookkeeping.
     *
     * NOTE(review): merged qiovs and zero-fill buffers recorded in
     * mcb->callbacks[] by multiwrite_merge() do not appear to be released
     * on this path - confirm whether they leak.
     */
    for (i = 0; i < mcb->num_callbacks; i++) {
        reqs[i].error = -EIO;
    }
    g_free(mcb);
    return -1;
}
2658
2659 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2660         BlockDriverCompletionFunc *cb, void *opaque)
2661 {
2662     BlockDriver *drv = bs->drv;
2663
2664     trace_bdrv_aio_flush(bs, opaque);
2665
2666     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2667         return bdrv_aio_noop_em(bs, cb, opaque);
2668     }
2669
2670     if (!drv)
2671         return NULL;
2672     return drv->bdrv_aio_flush(bs, cb, opaque);
2673 }
2674
2675 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2676 {
2677     acb->pool->cancel(acb);
2678 }
2679
2680
2681 /**************************************************************/
2682 /* async block device emulation */
2683
/*
 * AIOCB used to emulate asynchronous requests on top of synchronous
 * operations: the work is done at submission time and the completion
 * callback is delivered later from a bottom half (bdrv_aio_bh_cb()).
 */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;         /* bottom half that delivers the completion */
    int ret;            /* result passed to the completion callback */
    /* vector translation state */
    QEMUIOVector *qiov; /* caller's vector, or NULL when no data is moved */
    uint8_t *bounce;    /* linear bounce buffer, or NULL */
    int is_write;       /* non-zero: nothing is copied back on completion */
} BlockDriverAIOCBSync;
2693
2694 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2695 {
2696     BlockDriverAIOCBSync *acb =
2697         container_of(blockacb, BlockDriverAIOCBSync, common);
2698     qemu_bh_delete(acb->bh);
2699     acb->bh = NULL;
2700     qemu_aio_release(acb);
2701 }
2702
/* Pool of BlockDriverAIOCBSync objects used by the emulated AIO paths. */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
2707
2708 static void bdrv_aio_bh_cb(void *opaque)
2709 {
2710     BlockDriverAIOCBSync *acb = opaque;
2711
2712     if (!acb->is_write)
2713         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2714     qemu_vfree(acb->bounce);
2715     acb->common.cb(acb->common.opaque, acb->ret);
2716     qemu_bh_delete(acb->bh);
2717     acb->bh = NULL;
2718     qemu_aio_release(acb);
2719 }
2720
2721 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2722                                             int64_t sector_num,
2723                                             QEMUIOVector *qiov,
2724                                             int nb_sectors,
2725                                             BlockDriverCompletionFunc *cb,
2726                                             void *opaque,
2727                                             int is_write)
2728
2729 {
2730     BlockDriverAIOCBSync *acb;
2731
2732     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2733     acb->is_write = is_write;
2734     acb->qiov = qiov;
2735     acb->bounce = qemu_blockalign(bs, qiov->size);
2736
2737     if (!acb->bh)
2738         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2739
2740     if (is_write) {
2741         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2742         acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2743     } else {
2744         acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2745     }
2746
2747     qemu_bh_schedule(acb->bh);
2748
2749     return &acb->common;
2750 }
2751
2752 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2753         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2754         BlockDriverCompletionFunc *cb, void *opaque)
2755 {
2756     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2757 }
2758
2759 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2760         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2761         BlockDriverCompletionFunc *cb, void *opaque)
2762 {
2763     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2764 }
2765
2766
/*
 * AIOCB used to emulate the AIO interface on top of the coroutine
 * read/write hooks (see bdrv_co_aio_rw_vector()).
 */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;   /* sector, nb_sectors, qiov; error holds the result */
    bool is_write;      /* true: bdrv_co_writev, false: bdrv_co_readv */
    QEMUBH* bh;         /* bottom half that delivers the completion */
} BlockDriverAIOCBCoroutine;
2773
2774 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
2775 {
2776     qemu_aio_flush();
2777 }
2778
/* Pool of BlockDriverAIOCBCoroutine objects used by bdrv_co_aio_rw_vector(). */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
2783
2784 static void bdrv_co_rw_bh(void *opaque)
2785 {
2786     BlockDriverAIOCBCoroutine *acb = opaque;
2787
2788     acb->common.cb(acb->common.opaque, acb->req.error);
2789     qemu_bh_delete(acb->bh);
2790     qemu_aio_release(acb);
2791 }
2792
2793 static void coroutine_fn bdrv_co_rw(void *opaque)
2794 {
2795     BlockDriverAIOCBCoroutine *acb = opaque;
2796     BlockDriverState *bs = acb->common.bs;
2797
2798     if (!acb->is_write) {
2799         acb->req.error = bs->drv->bdrv_co_readv(bs, acb->req.sector,
2800             acb->req.nb_sectors, acb->req.qiov);
2801     } else {
2802         acb->req.error = bs->drv->bdrv_co_writev(bs, acb->req.sector,
2803             acb->req.nb_sectors, acb->req.qiov);
2804     }
2805
2806     acb->bh = qemu_bh_new(bdrv_co_rw_bh, acb);
2807     qemu_bh_schedule(acb->bh);
2808 }
2809
2810 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2811                                                int64_t sector_num,
2812                                                QEMUIOVector *qiov,
2813                                                int nb_sectors,
2814                                                BlockDriverCompletionFunc *cb,
2815                                                void *opaque,
2816                                                bool is_write)
2817 {
2818     Coroutine *co;
2819     BlockDriverAIOCBCoroutine *acb;
2820
2821     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2822     acb->req.sector = sector_num;
2823     acb->req.nb_sectors = nb_sectors;
2824     acb->req.qiov = qiov;
2825     acb->is_write = is_write;
2826
2827     co = qemu_coroutine_create(bdrv_co_rw);
2828     qemu_coroutine_enter(co, acb);
2829
2830     return &acb->common;
2831 }
2832
2833 static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
2834         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2835         BlockDriverCompletionFunc *cb, void *opaque)
2836 {
2837     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
2838                                  false);
2839 }
2840
2841 static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
2842         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2843         BlockDriverCompletionFunc *cb, void *opaque)
2844 {
2845     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
2846                                  true);
2847 }
2848
2849 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
2850         BlockDriverCompletionFunc *cb, void *opaque)
2851 {
2852     BlockDriverAIOCBSync *acb;
2853
2854     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2855     acb->is_write = 1; /* don't bounce in the completion hadler */
2856     acb->qiov = NULL;
2857     acb->bounce = NULL;
2858     acb->ret = 0;
2859
2860     if (!acb->bh)
2861         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2862
2863     bdrv_flush(bs);
2864     qemu_bh_schedule(acb->bh);
2865     return &acb->common;
2866 }
2867
2868 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
2869         BlockDriverCompletionFunc *cb, void *opaque)
2870 {
2871     BlockDriverAIOCBSync *acb;
2872
2873     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2874     acb->is_write = 1; /* don't bounce in the completion handler */
2875     acb->qiov = NULL;
2876     acb->bounce = NULL;
2877     acb->ret = 0;
2878
2879     if (!acb->bh) {
2880         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2881     }
2882
2883     qemu_bh_schedule(acb->bh);
2884     return &acb->common;
2885 }
2886
2887 /**************************************************************/
2888 /* sync block device emulation */
2889
/*
 * Completion callback of the synchronous emulation: store @ret in the int
 * that @opaque points to.
 */
static void bdrv_rw_em_cb(void *opaque, int ret)
{
    int *result = opaque;

    *result = ret;
}
2894
2895 #define NOT_DONE 0x7fffffff
2896
2897 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
2898                         uint8_t *buf, int nb_sectors)
2899 {
2900     int async_ret;
2901     BlockDriverAIOCB *acb;
2902     struct iovec iov;
2903     QEMUIOVector qiov;
2904
2905     async_ret = NOT_DONE;
2906     iov.iov_base = (void *)buf;
2907     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2908     qemu_iovec_init_external(&qiov, &iov, 1);
2909     acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
2910         bdrv_rw_em_cb, &async_ret);
2911     if (acb == NULL) {
2912         async_ret = -1;
2913         goto fail;
2914     }
2915
2916     while (async_ret == NOT_DONE) {
2917         qemu_aio_wait();
2918     }
2919
2920
2921 fail:
2922     return async_ret;
2923 }
2924
2925 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
2926                          const uint8_t *buf, int nb_sectors)
2927 {
2928     int async_ret;
2929     BlockDriverAIOCB *acb;
2930     struct iovec iov;
2931     QEMUIOVector qiov;
2932
2933     async_ret = NOT_DONE;
2934     iov.iov_base = (void *)buf;
2935     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2936     qemu_iovec_init_external(&qiov, &iov, 1);
2937     acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
2938         bdrv_rw_em_cb, &async_ret);
2939     if (acb == NULL) {
2940         async_ret = -1;
2941         goto fail;
2942     }
2943     while (async_ret == NOT_DONE) {
2944         qemu_aio_wait();
2945     }
2946
2947 fail:
2948     return async_ret;
2949 }
2950
2951 void bdrv_init(void)
2952 {
2953     module_call_init(MODULE_INIT_BLOCK);
2954 }
2955
2956 void bdrv_init_with_whitelist(void)
2957 {
2958     use_bdrv_whitelist = 1;
2959     bdrv_init();
2960 }
2961
2962 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2963                    BlockDriverCompletionFunc *cb, void *opaque)
2964 {
2965     BlockDriverAIOCB *acb;
2966
2967     if (pool->free_aiocb) {
2968         acb = pool->free_aiocb;
2969         pool->free_aiocb = acb->next;
2970     } else {
2971         acb = g_malloc0(pool->aiocb_size);
2972         acb->pool = pool;
2973     }
2974     acb->bs = bs;
2975     acb->cb = cb;
2976     acb->opaque = opaque;
2977     return acb;
2978 }
2979
2980 void qemu_aio_release(void *p)
2981 {
2982     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2983     AIOPool *pool = acb->pool;
2984     acb->next = pool->free_aiocb;
2985     pool->free_aiocb = acb;
2986 }
2987
2988 /**************************************************************/
2989 /* Coroutine block device emulation */
2990
2991 typedef struct CoroutineIOCompletion {
2992     Coroutine *coroutine;
2993     int ret;
2994 } CoroutineIOCompletion;
2995
2996 static void bdrv_co_io_em_complete(void *opaque, int ret)
2997 {
2998     CoroutineIOCompletion *co = opaque;
2999
3000     co->ret = ret;
3001     qemu_coroutine_enter(co->coroutine, NULL);
3002 }
3003
3004 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3005                                       int nb_sectors, QEMUIOVector *iov,
3006                                       bool is_write)
3007 {
3008     CoroutineIOCompletion co = {
3009         .coroutine = qemu_coroutine_self(),
3010     };
3011     BlockDriverAIOCB *acb;
3012
3013     if (is_write) {
3014         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3015                                        bdrv_co_io_em_complete, &co);
3016     } else {
3017         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3018                                       bdrv_co_io_em_complete, &co);
3019     }
3020
3021     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3022     if (!acb) {
3023         return -EIO;
3024     }
3025     qemu_coroutine_yield();
3026
3027     return co.ret;
3028 }
3029
3030 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3031                                          int64_t sector_num, int nb_sectors,
3032                                          QEMUIOVector *iov)
3033 {
3034     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3035 }
3036
3037 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3038                                          int64_t sector_num, int nb_sectors,
3039                                          QEMUIOVector *iov)
3040 {
3041     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3042 }
3043
3044 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs)
3045 {
3046     CoroutineIOCompletion co = {
3047         .coroutine = qemu_coroutine_self(),
3048     };
3049     BlockDriverAIOCB *acb;
3050
3051     acb = bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3052     if (!acb) {
3053         return -EIO;
3054     }
3055     qemu_coroutine_yield();
3056     return co.ret;
3057 }
3058
3059 /**************************************************************/
3060 /* removable device support */
3061
3062 /**
3063  * Return TRUE if the media is present
3064  */
3065 int bdrv_is_inserted(BlockDriverState *bs)
3066 {
3067     BlockDriver *drv = bs->drv;
3068
3069     if (!drv)
3070         return 0;
3071     if (!drv->bdrv_is_inserted)
3072         return 1;
3073     return drv->bdrv_is_inserted(bs);
3074 }
3075
3076 /**
3077  * Return whether the media changed since the last call to this
3078  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3079  */
3080 int bdrv_media_changed(BlockDriverState *bs)
3081 {
3082     BlockDriver *drv = bs->drv;
3083
3084     if (drv && drv->bdrv_media_changed) {
3085         return drv->bdrv_media_changed(bs);
3086     }
3087     return -ENOTSUP;
3088 }
3089
3090 /**
3091  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3092  */
3093 void bdrv_eject(BlockDriverState *bs, int eject_flag)
3094 {
3095     BlockDriver *drv = bs->drv;
3096
3097     if (drv && drv->bdrv_eject) {
3098         drv->bdrv_eject(bs, eject_flag);
3099     }
3100 }
3101
3102 /**
3103  * Lock or unlock the media (if it is locked, the user won't be able
3104  * to eject it manually).
3105  */
3106 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3107 {
3108     BlockDriver *drv = bs->drv;
3109
3110     trace_bdrv_lock_medium(bs, locked);
3111
3112     if (drv && drv->bdrv_lock_medium) {
3113         drv->bdrv_lock_medium(bs, locked);
3114     }
3115 }
3116
3117 /* needed for generic scsi interface */
3118
3119 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3120 {
3121     BlockDriver *drv = bs->drv;
3122
3123     if (drv && drv->bdrv_ioctl)
3124         return drv->bdrv_ioctl(bs, req, buf);
3125     return -ENOTSUP;
3126 }
3127
3128 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3129         unsigned long int req, void *buf,
3130         BlockDriverCompletionFunc *cb, void *opaque)
3131 {
3132     BlockDriver *drv = bs->drv;
3133
3134     if (drv && drv->bdrv_aio_ioctl)
3135         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3136     return NULL;
3137 }
3138
3139 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3140 {
3141     bs->buffer_alignment = align;
3142 }
3143
3144 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3145 {
3146     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3147 }
3148
3149 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3150 {
3151     int64_t bitmap_size;
3152
3153     bs->dirty_count = 0;
3154     if (enable) {
3155         if (!bs->dirty_bitmap) {
3156             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3157                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3158             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3159
3160             bs->dirty_bitmap = g_malloc0(bitmap_size);
3161         }
3162     } else {
3163         if (bs->dirty_bitmap) {
3164             g_free(bs->dirty_bitmap);
3165             bs->dirty_bitmap = NULL;
3166         }
3167     }
3168 }
3169
3170 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3171 {
3172     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3173
3174     if (bs->dirty_bitmap &&
3175         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3176         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3177             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3178     } else {
3179         return 0;
3180     }
3181 }
3182
3183 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3184                       int nr_sectors)
3185 {
3186     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3187 }
3188
3189 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3190 {
3191     return bs->dirty_count;
3192 }
3193
3194 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3195 {
3196     assert(bs->in_use != in_use);
3197     bs->in_use = in_use;
3198 }
3199
3200 int bdrv_in_use(BlockDriverState *bs)
3201 {
3202     return bs->in_use;
3203 }
3204
3205 void bdrv_iostatus_enable(BlockDriverState *bs)
3206 {
3207     bs->iostatus = BDRV_IOS_OK;
3208 }
3209
3210 /* The I/O status is only enabled if the drive explicitly
3211  * enables it _and_ the VM is configured to stop on errors */
3212 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3213 {
3214     return (bs->iostatus != BDRV_IOS_INVAL &&
3215            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3216             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3217             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3218 }
3219
3220 void bdrv_iostatus_disable(BlockDriverState *bs)
3221 {
3222     bs->iostatus = BDRV_IOS_INVAL;
3223 }
3224
3225 void bdrv_iostatus_reset(BlockDriverState *bs)
3226 {
3227     if (bdrv_iostatus_is_enabled(bs)) {
3228         bs->iostatus = BDRV_IOS_OK;
3229     }
3230 }
3231
3232 /* XXX: Today this is set by device models because it makes the implementation
3233    quite simple. However, the block layer knows about the error, so it's
3234    possible to implement this without device models being involved */
3235 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3236 {
3237     if (bdrv_iostatus_is_enabled(bs) && bs->iostatus == BDRV_IOS_OK) {
3238         assert(error >= 0);
3239         bs->iostatus = error == ENOSPC ? BDRV_IOS_ENOSPC : BDRV_IOS_FAILED;
3240     }
3241 }
3242
3243 void
3244 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3245         enum BlockAcctType type)
3246 {
3247     assert(type < BDRV_MAX_IOTYPE);
3248
3249     cookie->bytes = bytes;
3250     cookie->start_time_ns = get_clock();
3251     cookie->type = type;
3252 }
3253
3254 void
3255 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3256 {
3257     assert(cookie->type < BDRV_MAX_IOTYPE);
3258
3259     bs->nr_bytes[cookie->type] += cookie->bytes;
3260     bs->nr_ops[cookie->type]++;
3261     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3262 }
3263
3264 int bdrv_img_create(const char *filename, const char *fmt,
3265                     const char *base_filename, const char *base_fmt,
3266                     char *options, uint64_t img_size, int flags)
3267 {
3268     QEMUOptionParameter *param = NULL, *create_options = NULL;
3269     QEMUOptionParameter *backing_fmt, *backing_file, *size;
3270     BlockDriverState *bs = NULL;
3271     BlockDriver *drv, *proto_drv;
3272     BlockDriver *backing_drv = NULL;
3273     int ret = 0;
3274
3275     /* Find driver and parse its options */
3276     drv = bdrv_find_format(fmt);
3277     if (!drv) {
3278         error_report("Unknown file format '%s'", fmt);
3279         ret = -EINVAL;
3280         goto out;
3281     }
3282
3283     proto_drv = bdrv_find_protocol(filename);
3284     if (!proto_drv) {
3285         error_report("Unknown protocol '%s'", filename);
3286         ret = -EINVAL;
3287         goto out;
3288     }
3289
3290     create_options = append_option_parameters(create_options,
3291                                               drv->create_options);
3292     create_options = append_option_parameters(create_options,
3293                                               proto_drv->create_options);
3294
3295     /* Create parameter list with default values */
3296     param = parse_option_parameters("", create_options, param);
3297
3298     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3299
3300     /* Parse -o options */
3301     if (options) {
3302         param = parse_option_parameters(options, create_options, param);
3303         if (param == NULL) {
3304             error_report("Invalid options for file format '%s'.", fmt);
3305             ret = -EINVAL;
3306             goto out;
3307         }
3308     }
3309
3310     if (base_filename) {
3311         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3312                                  base_filename)) {
3313             error_report("Backing file not supported for file format '%s'",
3314                          fmt);
3315             ret = -EINVAL;
3316             goto out;
3317         }
3318     }
3319
3320     if (base_fmt) {
3321         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3322             error_report("Backing file format not supported for file "
3323                          "format '%s'", fmt);
3324             ret = -EINVAL;
3325             goto out;
3326         }
3327     }
3328
3329     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3330     if (backing_file && backing_file->value.s) {
3331         if (!strcmp(filename, backing_file->value.s)) {
3332             error_report("Error: Trying to create an image with the "
3333                          "same filename as the backing file");
3334             ret = -EINVAL;
3335             goto out;
3336         }
3337     }
3338
3339     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3340     if (backing_fmt && backing_fmt->value.s) {
3341         backing_drv = bdrv_find_format(backing_fmt->value.s);
3342         if (!backing_drv) {
3343             error_report("Unknown backing file format '%s'",
3344                          backing_fmt->value.s);
3345             ret = -EINVAL;
3346             goto out;
3347         }
3348     }
3349
3350     // The size for the image must always be specified, with one exception:
3351     // If we are using a backing file, we can obtain the size from there
3352     size = get_option_parameter(param, BLOCK_OPT_SIZE);
3353     if (size && size->value.n == -1) {
3354         if (backing_file && backing_file->value.s) {
3355             uint64_t size;
3356             char buf[32];
3357
3358             bs = bdrv_new("");
3359
3360             ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3361             if (ret < 0) {
3362                 error_report("Could not open '%s'", backing_file->value.s);
3363                 goto out;
3364             }
3365             bdrv_get_geometry(bs, &size);
3366             size *= 512;
3367
3368             snprintf(buf, sizeof(buf), "%" PRId64, size);
3369             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3370         } else {
3371             error_report("Image creation needs a size parameter");
3372             ret = -EINVAL;
3373             goto out;
3374         }
3375     }
3376
3377     printf("Formatting '%s', fmt=%s ", filename, fmt);
3378     print_option_parameters(param);
3379     puts("");
3380
3381     ret = bdrv_create(drv, filename, param);
3382
3383     if (ret < 0) {
3384         if (ret == -ENOTSUP) {
3385             error_report("Formatting or formatting option not supported for "
3386                          "file format '%s'", fmt);
3387         } else if (ret == -EFBIG) {
3388             error_report("The image size is too large for file format '%s'",
3389                          fmt);
3390         } else {
3391             error_report("%s: error while creating %s: %s", filename, fmt,
3392                          strerror(-ret));
3393         }
3394     }
3395
3396 out:
3397     free_option_parameters(create_options);
3398     free_option_parameters(param);
3399
3400     if (bs) {
3401         bdrv_delete(bs);
3402     }
3403
3404     return ret;
3405 }
This page took 0.206682 seconds and 4 git commands to generate.