]> Git Repo - qemu.git/blob - block.c
scsi-disk: fix the block descriptor returned by the MODE SENSE command
[qemu.git] / block.c
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "monitor.h"
27 #include "block_int.h"
28 #include "module.h"
29 #include "qemu-objects.h"
30
31 #ifdef CONFIG_BSD
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <sys/ioctl.h>
35 #include <sys/queue.h>
36 #ifndef __DragonFly__
37 #include <sys/disk.h>
38 #endif
39 #endif
40
41 #ifdef _WIN32
42 #include <windows.h>
43 #endif
44
/*
 * Forward declarations of the emulation helpers.  bdrv_register() installs
 * the readv/writev, flush and read/write ones on drivers that leave the
 * matching callback unset; the definitions are not part of this section of
 * the file.
 */
45 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
46         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
47         BlockDriverCompletionFunc *cb, void *opaque);
48 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
49         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
50         BlockDriverCompletionFunc *cb, void *opaque);
51 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
52         BlockDriverCompletionFunc *cb, void *opaque);
53 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
54         BlockDriverCompletionFunc *cb, void *opaque);
55 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
56                         uint8_t *buf, int nb_sectors);
57 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
58                          const uint8_t *buf, int nb_sectors);
59
/*
 * Named block devices: bdrv_new() appends a state here when it is given a
 * non-empty device name and bdrv_delete() removes it again.
 */
60 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
61     QTAILQ_HEAD_INITIALIZER(bdrv_states);
62
/* Registered block drivers; bdrv_register() inserts at the head. */
63 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
64     QLIST_HEAD_INITIALIZER(bdrv_drivers);
65
66 /* The device to use for VM snapshots; reset to NULL by bdrv_close() when that device is closed */
67 static BlockDriverState *bs_snapshots;
68
69 /* If non-zero, use only whitelisted block drivers (checked in bdrv_open_common()) */
70 static int use_bdrv_whitelist;
71
/*
 * Tell whether 'path' names an absolute location.
 *
 * Everything up to and including the first ':' is skipped, so a
 * "proto:/dir/file" style name is judged by what follows the colon.  The
 * path is absolute when the character found there is a directory
 * separator.  On Windows a backslash is a separator too, and a name that
 * itself starts with a separator (device names such as "\\.\d:") is always
 * absolute.
 *
 * Returns 1 for an absolute path, 0 otherwise.
 */
int path_is_absolute(const char *path)
{
    const char *colon = strchr(path, ':');
    const char *start = colon ? colon + 1 : path;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (path[0] == '/' || path[0] == '\\') {
        return 1;
    }
    return start[0] == '/' || start[0] == '\\';
#else
    return start[0] == '/';
#endif
}
91
/*
 * Write into 'dest' (a buffer of 'dest_size' bytes) the location of
 * 'filename' as seen from 'base_path'.
 *
 * An absolute 'filename' is copied as is.  Otherwise the leading part of
 * 'base_path' -- up to and including whichever comes later, its first ':'
 * or its last path separator -- is used as a prefix and 'filename' is
 * appended to it.  This also handles URL-like base paths.
 *
 * Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' (or the start if there is none) */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last separator (or the start if none) */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');
        if (!after_sep || bslash > after_sep) {
            after_sep = bslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;     /* leave room for the terminator */
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
135
136 void bdrv_register(BlockDriver *bdrv)
137 {
138     if (!bdrv->bdrv_aio_readv) {
139         /* add AIO emulation layer */
140         bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
141         bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
142     } else if (!bdrv->bdrv_read) {
143         /* add synchronous IO emulation layer */
144         bdrv->bdrv_read = bdrv_read_em;
145         bdrv->bdrv_write = bdrv_write_em;
146     }
147
148     if (!bdrv->bdrv_aio_flush)
149         bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
150
151     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
152 }
153
154 /* create a new block device (by default it is empty) */
155 BlockDriverState *bdrv_new(const char *device_name)
156 {
157     BlockDriverState *bs;
158
159     bs = qemu_mallocz(sizeof(BlockDriverState));
160     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
161     if (device_name[0] != '\0') {
162         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
163     }
164     return bs;
165 }
166
167 BlockDriver *bdrv_find_format(const char *format_name)
168 {
169     BlockDriver *drv1;
170     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
171         if (!strcmp(drv1->format_name, format_name)) {
172             return drv1;
173         }
174     }
175     return NULL;
176 }
177
178 static int bdrv_is_whitelisted(BlockDriver *drv)
179 {
180     static const char *whitelist[] = {
181         CONFIG_BDRV_WHITELIST
182     };
183     const char **p;
184
185     if (!whitelist[0])
186         return 1;               /* no whitelist, anything goes */
187
188     for (p = whitelist; *p; p++) {
189         if (!strcmp(drv->format_name, *p)) {
190             return 1;
191         }
192     }
193     return 0;
194 }
195
196 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
197 {
198     BlockDriver *drv = bdrv_find_format(format_name);
199     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
200 }
201
202 int bdrv_create(BlockDriver *drv, const char* filename,
203     QEMUOptionParameter *options)
204 {
205     if (!drv->bdrv_create)
206         return -ENOTSUP;
207
208     return drv->bdrv_create(filename, options);
209 }
210
211 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
212 {
213     BlockDriver *drv;
214
215     drv = bdrv_find_protocol(filename);
216     if (drv == NULL) {
217         drv = bdrv_find_format("file");
218     }
219
220     return bdrv_create(drv, filename, options);
221 }
222
#ifdef _WIN32
/*
 * Create an empty temporary file and store its name in 'filename', a
 * buffer of 'size' bytes.
 *
 * On failure (no usable temporary directory, file creation failed, or the
 * resulting name does not fit in 'size' bytes) 'filename' is set to the
 * empty string.  Nothing is written when size <= 0.
 */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];
    char temp_file[MAX_PATH];
    DWORD dir_len;

    if (size <= 0) {
        return;
    }
    filename[0] = '\0';

    dir_len = GetTempPath(MAX_PATH, temp_dir);
    if (dir_len == 0 || dir_len > MAX_PATH) {
        return;
    }

    /*
     * GetTempFileName() needs a MAX_PATH sized output buffer no matter how
     * large the caller's buffer is, so build the name locally first.
     */
    if (GetTempFileName(temp_dir, "qem", 0, temp_file) == 0) {
        return;
    }
    if (strlen(temp_file) >= (size_t)size) {
        /* the name would be truncated; do not leave the file behind */
        DeleteFile(temp_file);
        return;
    }
    pstrcpy(filename, size, temp_file);
}
#else
/*
 * Create an empty temporary file named "vl.XXXXXX" (suffix filled in by
 * mkstemp()) under $TMPDIR, or under /tmp when TMPDIR is unset, and store
 * its name in 'filename', a buffer of 'size' bytes.
 *
 * On failure 'filename' is set to the empty string so that callers do not
 * go on to use the unexpanded template.  Nothing is written when
 * size <= 0.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;

    if (size <= 0) {
        return;
    }

    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    if (fd < 0) {
        /* also reached when the template was truncated by snprintf() */
        filename[0] = '\0';
        return;
    }
    close(fd);
}
#endif
245
246 #ifdef _WIN32
/*
 * Return non-zero when 'filename' begins with an ASCII letter followed by
 * a colon, i.e. looks like a drive specification such as "c:".
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_letter = (letter >= 'a' && letter <= 'z') ||
                          (letter >= 'A' && letter <= 'Z');

    /* the second character is only inspected after a letter matched */
    return is_letter && filename[1] == ':';
}
253
/*
 * Return 1 when 'filename' names a Windows drive: either exactly a drive
 * letter plus colon ("d:"), or anything starting with the device prefix
 * "\\.\" or "//./".  Return 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* bare drive specification, nothing after the colon */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    return strstart(filename, "\\\\.\\", NULL) ||
           strstart(filename, "//./", NULL);
}
264 #endif
265
266 /*
267  * Detect host devices. By convention, /dev/cdrom[N] is always
268  * recognized as a host CDROM.
269  */
270 static BlockDriver *find_hdev_driver(const char *filename)
271 {
272     int score_max = 0, score;
273     BlockDriver *drv = NULL, *d;
274
275     QLIST_FOREACH(d, &bdrv_drivers, list) {
276         if (d->bdrv_probe_device) {
277             score = d->bdrv_probe_device(filename);
278             if (score > score_max) {
279                 score_max = score;
280                 drv = d;
281             }
282         }
283     }
284
285     return drv;
286 }
287
288 BlockDriver *bdrv_find_protocol(const char *filename)
289 {
290     BlockDriver *drv1;
291     char protocol[128];
292     int len;
293     const char *p;
294
295     /* TODO Drivers without bdrv_file_open must be specified explicitly */
296
297     /*
298      * XXX(hch): we really should not let host device detection
299      * override an explicit protocol specification, but moving this
300      * later breaks access to device names with colons in them.
301      * Thanks to the brain-dead persistent naming schemes on udev-
302      * based Linux systems those actually are quite common.
303      */
304     drv1 = find_hdev_driver(filename);
305     if (drv1) {
306         return drv1;
307     }
308
309 #ifdef _WIN32
310      if (is_windows_drive(filename) ||
311          is_windows_drive_prefix(filename))
312          return bdrv_find_format("file");
313 #endif
314
315     p = strchr(filename, ':');
316     if (!p) {
317         return bdrv_find_format("file");
318     }
319     len = p - filename;
320     if (len > sizeof(protocol) - 1)
321         len = sizeof(protocol) - 1;
322     memcpy(protocol, filename, len);
323     protocol[len] = '\0';
324     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
325         if (drv1->protocol_name &&
326             !strcmp(drv1->protocol_name, protocol)) {
327             return drv1;
328         }
329     }
330     return NULL;
331 }
332
333 static int find_image_format(const char *filename, BlockDriver **pdrv)
334 {
335     int ret, score, score_max;
336     BlockDriver *drv1, *drv;
337     uint8_t buf[2048];
338     BlockDriverState *bs;
339
340     ret = bdrv_file_open(&bs, filename, 0);
341     if (ret < 0) {
342         *pdrv = NULL;
343         return ret;
344     }
345
346     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
347     if (bs->sg || !bdrv_is_inserted(bs)) {
348         bdrv_delete(bs);
349         drv = bdrv_find_format("raw");
350         if (!drv) {
351             ret = -ENOENT;
352         }
353         *pdrv = drv;
354         return ret;
355     }
356
357     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
358     bdrv_delete(bs);
359     if (ret < 0) {
360         *pdrv = NULL;
361         return ret;
362     }
363
364     score_max = 0;
365     drv = NULL;
366     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
367         if (drv1->bdrv_probe) {
368             score = drv1->bdrv_probe(buf, ret, filename);
369             if (score > score_max) {
370                 score_max = score;
371                 drv = drv1;
372             }
373         }
374     }
375     if (!drv) {
376         ret = -ENOENT;
377     }
378     *pdrv = drv;
379     return ret;
380 }
381
382 /**
383  * Set the current 'total_sectors' value
384  */
385 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
386 {
387     BlockDriver *drv = bs->drv;
388
389     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
390     if (bs->sg)
391         return 0;
392
393     /* query actual device if possible, otherwise just trust the hint */
394     if (drv->bdrv_getlength) {
395         int64_t length = drv->bdrv_getlength(bs);
396         if (length < 0) {
397             return length;
398         }
399         hint = length >> BDRV_SECTOR_BITS;
400     }
401
402     bs->total_sectors = hint;
403     return 0;
404 }
405
406 /*
407  * Common part for opening disk images and files
408  */
409 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
410     int flags, BlockDriver *drv)
411 {
412     int ret, open_flags;
413
414     assert(drv != NULL);
415
416     bs->file = NULL;
417     bs->total_sectors = 0;
418     bs->encrypted = 0;
419     bs->valid_key = 0;
420     bs->open_flags = flags;
421     /* buffer_alignment defaulted to 512, drivers can change this value */
422     bs->buffer_alignment = 512;
423
424     pstrcpy(bs->filename, sizeof(bs->filename), filename);
425
426     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
427         return -ENOTSUP;
428     }
429
430     bs->drv = drv;
431     bs->opaque = qemu_mallocz(drv->instance_size);
432
433     /*
434      * Yes, BDRV_O_NOCACHE aka O_DIRECT means we have to present a
435      * write cache to the guest.  We do need the fdatasync to flush
436      * out transactions for block allocations, and we maybe have a
437      * volatile write cache in our backing device to deal with.
438      */
439     if (flags & (BDRV_O_CACHE_WB|BDRV_O_NOCACHE))
440         bs->enable_write_cache = 1;
441
442     /*
443      * Clear flags that are internal to the block layer before opening the
444      * image.
445      */
446     open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
447
448     /*
449      * Snapshots should be writeable.
450      */
451     if (bs->is_temporary) {
452         open_flags |= BDRV_O_RDWR;
453     }
454
455     /* Open the image, either directly or using a protocol */
456     if (drv->bdrv_file_open) {
457         ret = drv->bdrv_file_open(bs, filename, open_flags);
458     } else {
459         ret = bdrv_file_open(&bs->file, filename, open_flags);
460         if (ret >= 0) {
461             ret = drv->bdrv_open(bs, open_flags);
462         }
463     }
464
465     if (ret < 0) {
466         goto free_and_fail;
467     }
468
469     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
470
471     ret = refresh_total_sectors(bs, bs->total_sectors);
472     if (ret < 0) {
473         goto free_and_fail;
474     }
475
476 #ifndef _WIN32
477     if (bs->is_temporary) {
478         unlink(filename);
479     }
480 #endif
481     return 0;
482
483 free_and_fail:
484     if (bs->file) {
485         bdrv_delete(bs->file);
486         bs->file = NULL;
487     }
488     qemu_free(bs->opaque);
489     bs->opaque = NULL;
490     bs->drv = NULL;
491     return ret;
492 }
493
494 /*
495  * Opens a file using a protocol (file, host_device, nbd, ...)
496  */
497 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
498 {
499     BlockDriverState *bs;
500     BlockDriver *drv;
501     int ret;
502
503     drv = bdrv_find_protocol(filename);
504     if (!drv) {
505         return -ENOENT;
506     }
507
508     bs = bdrv_new("");
509     ret = bdrv_open_common(bs, filename, flags, drv);
510     if (ret < 0) {
511         bdrv_delete(bs);
512         return ret;
513     }
514     bs->growable = 1;
515     *pbs = bs;
516     return 0;
517 }
518
519 /*
520  * Opens a disk image (raw, qcow2, vmdk, ...)
521  */
522 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
523               BlockDriver *drv)
524 {
525     int ret;
526     int probed = 0;
527
528     if (flags & BDRV_O_SNAPSHOT) {
529         BlockDriverState *bs1;
530         int64_t total_size;
531         int is_protocol = 0;
532         BlockDriver *bdrv_qcow2;
533         QEMUOptionParameter *options;
534         char tmp_filename[PATH_MAX];
535         char backing_filename[PATH_MAX];
536
537         /* if snapshot, we create a temporary backing file and open it
538            instead of opening 'filename' directly */
539
540         /* if there is a backing file, use it */
541         bs1 = bdrv_new("");
542         ret = bdrv_open(bs1, filename, 0, drv);
543         if (ret < 0) {
544             bdrv_delete(bs1);
545             return ret;
546         }
547         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
548
549         if (bs1->drv && bs1->drv->protocol_name)
550             is_protocol = 1;
551
552         bdrv_delete(bs1);
553
554         get_tmp_filename(tmp_filename, sizeof(tmp_filename));
555
556         /* Real path is meaningless for protocols */
557         if (is_protocol)
558             snprintf(backing_filename, sizeof(backing_filename),
559                      "%s", filename);
560         else if (!realpath(filename, backing_filename))
561             return -errno;
562
563         bdrv_qcow2 = bdrv_find_format("qcow2");
564         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
565
566         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
567         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
568         if (drv) {
569             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
570                 drv->format_name);
571         }
572
573         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
574         free_option_parameters(options);
575         if (ret < 0) {
576             return ret;
577         }
578
579         filename = tmp_filename;
580         drv = bdrv_qcow2;
581         bs->is_temporary = 1;
582     }
583
584     /* Find the right image format driver */
585     if (!drv) {
586         ret = find_image_format(filename, &drv);
587         probed = 1;
588     }
589
590     if (!drv) {
591         goto unlink_and_fail;
592     }
593
594     /* Open the image */
595     ret = bdrv_open_common(bs, filename, flags, drv);
596     if (ret < 0) {
597         goto unlink_and_fail;
598     }
599
600     bs->probed = probed;
601
602     /* If there is a backing file, use it */
603     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
604         char backing_filename[PATH_MAX];
605         int back_flags;
606         BlockDriver *back_drv = NULL;
607
608         bs->backing_hd = bdrv_new("");
609         path_combine(backing_filename, sizeof(backing_filename),
610                      filename, bs->backing_file);
611         if (bs->backing_format[0] != '\0')
612             back_drv = bdrv_find_format(bs->backing_format);
613
614         /* backing files always opened read-only */
615         back_flags =
616             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
617
618         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
619         if (ret < 0) {
620             bdrv_close(bs);
621             return ret;
622         }
623         if (bs->is_temporary) {
624             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
625         } else {
626             /* base image inherits from "parent" */
627             bs->backing_hd->keep_read_only = bs->keep_read_only;
628         }
629     }
630
631     if (!bdrv_key_required(bs)) {
632         /* call the change callback */
633         bs->media_changed = 1;
634         if (bs->change_cb)
635             bs->change_cb(bs->change_opaque);
636     }
637
638     return 0;
639
640 unlink_and_fail:
641     if (bs->is_temporary) {
642         unlink(filename);
643     }
644     return ret;
645 }
646
647 void bdrv_close(BlockDriverState *bs)
648 {
649     if (bs->drv) {
650         if (bs == bs_snapshots) {
651             bs_snapshots = NULL;
652         }
653         if (bs->backing_hd) {
654             bdrv_delete(bs->backing_hd);
655             bs->backing_hd = NULL;
656         }
657         bs->drv->bdrv_close(bs);
658         qemu_free(bs->opaque);
659 #ifdef _WIN32
660         if (bs->is_temporary) {
661             unlink(bs->filename);
662         }
663 #endif
664         bs->opaque = NULL;
665         bs->drv = NULL;
666
667         if (bs->file != NULL) {
668             bdrv_close(bs->file);
669         }
670
671         /* call the change callback */
672         bs->media_changed = 1;
673         if (bs->change_cb)
674             bs->change_cb(bs->change_opaque);
675     }
676 }
677
678 void bdrv_close_all(void)
679 {
680     BlockDriverState *bs;
681
682     QTAILQ_FOREACH(bs, &bdrv_states, list) {
683         bdrv_close(bs);
684     }
685 }
686
687 void bdrv_delete(BlockDriverState *bs)
688 {
689     assert(!bs->peer);
690
691     /* remove from list, if necessary */
692     if (bs->device_name[0] != '\0') {
693         QTAILQ_REMOVE(&bdrv_states, bs, list);
694     }
695
696     bdrv_close(bs);
697     if (bs->file != NULL) {
698         bdrv_delete(bs->file);
699     }
700
701     assert(bs != bs_snapshots);
702     qemu_free(bs);
703 }
704
705 int bdrv_attach(BlockDriverState *bs, DeviceState *qdev)
706 {
707     if (bs->peer) {
708         return -EBUSY;
709     }
710     bs->peer = qdev;
711     return 0;
712 }
713
714 void bdrv_detach(BlockDriverState *bs, DeviceState *qdev)
715 {
716     assert(bs->peer == qdev);
717     bs->peer = NULL;
718 }
719
720 DeviceState *bdrv_get_attached(BlockDriverState *bs)
721 {
722     return bs->peer;
723 }
724
725 /*
726  * Run consistency checks on an image
727  *
728  * Returns 0 if the check could be completed (it doesn't mean that the image is
729  * free of errors) or -errno when an internal error occured. The results of the
730  * check are stored in res.
731  */
732 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
733 {
734     if (bs->drv->bdrv_check == NULL) {
735         return -ENOTSUP;
736     }
737
738     memset(res, 0, sizeof(*res));
739     return bs->drv->bdrv_check(bs, res);
740 }
741
742 #define COMMIT_BUF_SECTORS 2048
743
744 /* commit COW file into the raw image */
745 int bdrv_commit(BlockDriverState *bs)
746 {
747     BlockDriver *drv = bs->drv;
748     BlockDriver *backing_drv;
749     int64_t sector, total_sectors;
750     int n, ro, open_flags;
751     int ret = 0, rw_ret = 0;
752     uint8_t *buf;
753     char filename[1024];
754     BlockDriverState *bs_rw, *bs_ro;
755
756     if (!drv)
757         return -ENOMEDIUM;
758     
759     if (!bs->backing_hd) {
760         return -ENOTSUP;
761     }
762
763     if (bs->backing_hd->keep_read_only) {
764         return -EACCES;
765     }
766
767     backing_drv = bs->backing_hd->drv;
768     ro = bs->backing_hd->read_only;
769     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
770     open_flags =  bs->backing_hd->open_flags;
771
772     if (ro) {
773         /* re-open as RW */
774         bdrv_delete(bs->backing_hd);
775         bs->backing_hd = NULL;
776         bs_rw = bdrv_new("");
777         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
778             backing_drv);
779         if (rw_ret < 0) {
780             bdrv_delete(bs_rw);
781             /* try to re-open read-only */
782             bs_ro = bdrv_new("");
783             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
784                 backing_drv);
785             if (ret < 0) {
786                 bdrv_delete(bs_ro);
787                 /* drive not functional anymore */
788                 bs->drv = NULL;
789                 return ret;
790             }
791             bs->backing_hd = bs_ro;
792             return rw_ret;
793         }
794         bs->backing_hd = bs_rw;
795     }
796
797     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
798     buf = qemu_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
799
800     for (sector = 0; sector < total_sectors; sector += n) {
801         if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
802
803             if (bdrv_read(bs, sector, buf, n) != 0) {
804                 ret = -EIO;
805                 goto ro_cleanup;
806             }
807
808             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
809                 ret = -EIO;
810                 goto ro_cleanup;
811             }
812         }
813     }
814
815     if (drv->bdrv_make_empty) {
816         ret = drv->bdrv_make_empty(bs);
817         bdrv_flush(bs);
818     }
819
820     /*
821      * Make sure all data we wrote to the backing device is actually
822      * stable on disk.
823      */
824     if (bs->backing_hd)
825         bdrv_flush(bs->backing_hd);
826
827 ro_cleanup:
828     qemu_free(buf);
829
830     if (ro) {
831         /* re-open as RO */
832         bdrv_delete(bs->backing_hd);
833         bs->backing_hd = NULL;
834         bs_ro = bdrv_new("");
835         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
836             backing_drv);
837         if (ret < 0) {
838             bdrv_delete(bs_ro);
839             /* drive not functional anymore */
840             bs->drv = NULL;
841             return ret;
842         }
843         bs->backing_hd = bs_ro;
844         bs->backing_hd->keep_read_only = 0;
845     }
846
847     return ret;
848 }
849
850 void bdrv_commit_all(void)
851 {
852     BlockDriverState *bs;
853
854     QTAILQ_FOREACH(bs, &bdrv_states, list) {
855         bdrv_commit(bs);
856     }
857 }
858
859 /*
860  * Return values:
861  * 0        - success
862  * -EINVAL  - backing format specified, but no file
863  * -ENOSPC  - can't update the backing file because no space is left in the
864  *            image file header
865  * -ENOTSUP - format driver doesn't support changing the backing file
866  */
867 int bdrv_change_backing_file(BlockDriverState *bs,
868     const char *backing_file, const char *backing_fmt)
869 {
870     BlockDriver *drv = bs->drv;
871
872     if (drv->bdrv_change_backing_file != NULL) {
873         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
874     } else {
875         return -ENOTSUP;
876     }
877 }
878
879 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
880                                    size_t size)
881 {
882     int64_t len;
883
884     if (!bdrv_is_inserted(bs))
885         return -ENOMEDIUM;
886
887     if (bs->growable)
888         return 0;
889
890     len = bdrv_getlength(bs);
891
892     if (offset < 0)
893         return -EIO;
894
895     if ((offset > len) || (len - offset < size))
896         return -EIO;
897
898     return 0;
899 }
900
901 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
902                               int nb_sectors)
903 {
904     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
905                                    nb_sectors * BDRV_SECTOR_SIZE);
906 }
907
908 /* return < 0 if error. See bdrv_write() for the return codes */
909 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
910               uint8_t *buf, int nb_sectors)
911 {
912     BlockDriver *drv = bs->drv;
913
914     if (!drv)
915         return -ENOMEDIUM;
916     if (bdrv_check_request(bs, sector_num, nb_sectors))
917         return -EIO;
918
919     return drv->bdrv_read(bs, sector_num, buf, nb_sectors);
920 }
921
922 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
923                              int nb_sectors, int dirty)
924 {
925     int64_t start, end;
926     unsigned long val, idx, bit;
927
928     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
929     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
930
931     for (; start <= end; start++) {
932         idx = start / (sizeof(unsigned long) * 8);
933         bit = start % (sizeof(unsigned long) * 8);
934         val = bs->dirty_bitmap[idx];
935         if (dirty) {
936             if (!(val & (1 << bit))) {
937                 bs->dirty_count++;
938                 val |= 1 << bit;
939             }
940         } else {
941             if (val & (1 << bit)) {
942                 bs->dirty_count--;
943                 val &= ~(1 << bit);
944             }
945         }
946         bs->dirty_bitmap[idx] = val;
947     }
948 }
949
950 /* Return < 0 if error. Important errors are:
951   -EIO         generic I/O error (may happen for all errors)
952   -ENOMEDIUM   No media inserted.
953   -EINVAL      Invalid sector number or nb_sectors
954   -EACCES      Trying to write a read-only device
955 */
956 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
957                const uint8_t *buf, int nb_sectors)
958 {
959     BlockDriver *drv = bs->drv;
960     if (!bs->drv)
961         return -ENOMEDIUM;
962     if (bs->read_only)
963         return -EACCES;
964     if (bdrv_check_request(bs, sector_num, nb_sectors))
965         return -EIO;
966
967     if (bs->dirty_bitmap) {
968         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
969     }
970
971     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
972         bs->wr_highest_sector = sector_num + nb_sectors - 1;
973     }
974
975     return drv->bdrv_write(bs, sector_num, buf, nb_sectors);
976 }
977
978 int bdrv_pread(BlockDriverState *bs, int64_t offset,
979                void *buf, int count1)
980 {
981     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
982     int len, nb_sectors, count;
983     int64_t sector_num;
984     int ret;
985
986     count = count1;
987     /* first read to align to sector start */
988     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
989     if (len > count)
990         len = count;
991     sector_num = offset >> BDRV_SECTOR_BITS;
992     if (len > 0) {
993         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
994             return ret;
995         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
996         count -= len;
997         if (count == 0)
998             return count1;
999         sector_num++;
1000         buf += len;
1001     }
1002
1003     /* read the sectors "in place" */
1004     nb_sectors = count >> BDRV_SECTOR_BITS;
1005     if (nb_sectors > 0) {
1006         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1007             return ret;
1008         sector_num += nb_sectors;
1009         len = nb_sectors << BDRV_SECTOR_BITS;
1010         buf += len;
1011         count -= len;
1012     }
1013
1014     /* add data from the last sector */
1015     if (count > 0) {
1016         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1017             return ret;
1018         memcpy(buf, tmp_buf, count);
1019     }
1020     return count1;
1021 }
1022
1023 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1024                 const void *buf, int count1)
1025 {
1026     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1027     int len, nb_sectors, count;
1028     int64_t sector_num;
1029     int ret;
1030
1031     count = count1;
1032     /* first write to align to sector start */
1033     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1034     if (len > count)
1035         len = count;
1036     sector_num = offset >> BDRV_SECTOR_BITS;
1037     if (len > 0) {
1038         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1039             return ret;
1040         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1041         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1042             return ret;
1043         count -= len;
1044         if (count == 0)
1045             return count1;
1046         sector_num++;
1047         buf += len;
1048     }
1049
1050     /* write the sectors "in place" */
1051     nb_sectors = count >> BDRV_SECTOR_BITS;
1052     if (nb_sectors > 0) {
1053         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1054             return ret;
1055         sector_num += nb_sectors;
1056         len = nb_sectors << BDRV_SECTOR_BITS;
1057         buf += len;
1058         count -= len;
1059     }
1060
1061     /* add data from the last sector */
1062     if (count > 0) {
1063         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1064             return ret;
1065         memcpy(tmp_buf, buf, count);
1066         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1067             return ret;
1068     }
1069     return count1;
1070 }
1071
1072 /*
1073  * Writes to the file and ensures that no writes are reordered across this
1074  * request (acts as a barrier)
1075  *
1076  * Returns 0 on success, -errno in error cases.
1077  */
1078 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1079     const void *buf, int count)
1080 {
1081     int ret;
1082
1083     ret = bdrv_pwrite(bs, offset, buf, count);
1084     if (ret < 0) {
1085         return ret;
1086     }
1087
1088     /* No flush needed for cache=writethrough, it uses O_DSYNC */
1089     if ((bs->open_flags & BDRV_O_CACHE_MASK) != 0) {
1090         bdrv_flush(bs);
1091     }
1092
1093     return 0;
1094 }
1095
1096 /*
1097  * Writes to the file and ensures that no writes are reordered across this
1098  * request (acts as a barrier)
1099  *
1100  * Returns 0 on success, -errno in error cases.
1101  */
1102 int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
1103     const uint8_t *buf, int nb_sectors)
1104 {
1105     return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num,
1106         buf, BDRV_SECTOR_SIZE * nb_sectors);
1107 }
1108
1109 /**
1110  * Truncate file to 'offset' bytes (needed only for file protocols)
1111  */
1112 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1113 {
1114     BlockDriver *drv = bs->drv;
1115     int ret;
1116     if (!drv)
1117         return -ENOMEDIUM;
1118     if (!drv->bdrv_truncate)
1119         return -ENOTSUP;
1120     if (bs->read_only)
1121         return -EACCES;
1122     ret = drv->bdrv_truncate(bs, offset);
1123     if (ret == 0) {
1124         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1125     }
1126     return ret;
1127 }
1128
1129 /**
1130  * Length of a file in bytes. Return < 0 if error or unknown.
1131  */
1132 int64_t bdrv_getlength(BlockDriverState *bs)
1133 {
1134     BlockDriver *drv = bs->drv;
1135     if (!drv)
1136         return -ENOMEDIUM;
1137
1138     /* Fixed size devices use the total_sectors value for speed instead of
1139        issuing a length query (like lseek) on each call.  Also, legacy block
1140        drivers don't provide a bdrv_getlength function and must use
1141        total_sectors. */
1142     if (!bs->growable || !drv->bdrv_getlength) {
1143         return bs->total_sectors * BDRV_SECTOR_SIZE;
1144     }
1145     return drv->bdrv_getlength(bs);
1146 }
1147
1148 /* return 0 as number of sectors if no device present or error */
1149 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1150 {
1151     int64_t length;
1152     length = bdrv_getlength(bs);
1153     if (length < 0)
1154         length = 0;
1155     else
1156         length = length >> BDRV_SECTOR_BITS;
1157     *nb_sectors_ptr = length;
1158 }
1159
/*
 * One entry of the MSDOS partition table as it is laid out on disk
 * (guess_disk_lchs() reads four of them starting at byte 0x1be of sector 0).
 * Packed so that the struct can be overlaid on the raw sector buffer.
 */
struct partition {
    uint8_t boot_ind;       /* 0x80 - active */
    uint8_t head;           /* starting head */
    uint8_t sector;         /* starting sector */
    uint8_t cyl;            /* starting cylinder */
    uint8_t sys_ind;        /* what partition type */
    uint8_t end_head;       /* end head */
    uint8_t end_sector;     /* end sector */
    uint8_t end_cyl;        /* end cylinder */
    uint32_t start_sect;    /* starting sector counting from 0 */
    uint32_t nr_sects;      /* nr of sectors in partition */
} __attribute__((packed));
1172
1173 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1174 static int guess_disk_lchs(BlockDriverState *bs,
1175                            int *pcylinders, int *pheads, int *psectors)
1176 {
1177     uint8_t buf[BDRV_SECTOR_SIZE];
1178     int ret, i, heads, sectors, cylinders;
1179     struct partition *p;
1180     uint32_t nr_sects;
1181     uint64_t nb_sectors;
1182
1183     bdrv_get_geometry(bs, &nb_sectors);
1184
1185     ret = bdrv_read(bs, 0, buf, 1);
1186     if (ret < 0)
1187         return -1;
1188     /* test msdos magic */
1189     if (buf[510] != 0x55 || buf[511] != 0xaa)
1190         return -1;
1191     for(i = 0; i < 4; i++) {
1192         p = ((struct partition *)(buf + 0x1be)) + i;
1193         nr_sects = le32_to_cpu(p->nr_sects);
1194         if (nr_sects && p->end_head) {
1195             /* We make the assumption that the partition terminates on
1196                a cylinder boundary */
1197             heads = p->end_head + 1;
1198             sectors = p->end_sector & 63;
1199             if (sectors == 0)
1200                 continue;
1201             cylinders = nb_sectors / (heads * sectors);
1202             if (cylinders < 1 || cylinders > 16383)
1203                 continue;
1204             *pheads = heads;
1205             *psectors = sectors;
1206             *pcylinders = cylinders;
1207 #if 0
1208             printf("guessed geometry: LCHS=%d %d %d\n",
1209                    cylinders, heads, sectors);
1210 #endif
1211             return 0;
1212         }
1213     }
1214     return -1;
1215 }
1216
1217 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1218 {
1219     int translation, lba_detected = 0;
1220     int cylinders, heads, secs;
1221     uint64_t nb_sectors;
1222
1223     /* if a geometry hint is available, use it */
1224     bdrv_get_geometry(bs, &nb_sectors);
1225     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1226     translation = bdrv_get_translation_hint(bs);
1227     if (cylinders != 0) {
1228         *pcyls = cylinders;
1229         *pheads = heads;
1230         *psecs = secs;
1231     } else {
1232         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1233             if (heads > 16) {
1234                 /* if heads > 16, it means that a BIOS LBA
1235                    translation was active, so the default
1236                    hardware geometry is OK */
1237                 lba_detected = 1;
1238                 goto default_geometry;
1239             } else {
1240                 *pcyls = cylinders;
1241                 *pheads = heads;
1242                 *psecs = secs;
1243                 /* disable any translation to be in sync with
1244                    the logical geometry */
1245                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1246                     bdrv_set_translation_hint(bs,
1247                                               BIOS_ATA_TRANSLATION_NONE);
1248                 }
1249             }
1250         } else {
1251         default_geometry:
1252             /* if no geometry, use a standard physical disk geometry */
1253             cylinders = nb_sectors / (16 * 63);
1254
1255             if (cylinders > 16383)
1256                 cylinders = 16383;
1257             else if (cylinders < 2)
1258                 cylinders = 2;
1259             *pcyls = cylinders;
1260             *pheads = 16;
1261             *psecs = 63;
1262             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1263                 if ((*pcyls * *pheads) <= 131072) {
1264                     bdrv_set_translation_hint(bs,
1265                                               BIOS_ATA_TRANSLATION_LARGE);
1266                 } else {
1267                     bdrv_set_translation_hint(bs,
1268                                               BIOS_ATA_TRANSLATION_LBA);
1269                 }
1270             }
1271         }
1272         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1273     }
1274 }
1275
1276 void bdrv_set_geometry_hint(BlockDriverState *bs,
1277                             int cyls, int heads, int secs)
1278 {
1279     bs->cyls = cyls;
1280     bs->heads = heads;
1281     bs->secs = secs;
1282 }
1283
1284 void bdrv_set_type_hint(BlockDriverState *bs, int type)
1285 {
1286     bs->type = type;
1287     bs->removable = ((type == BDRV_TYPE_CDROM ||
1288                       type == BDRV_TYPE_FLOPPY));
1289 }
1290
1291 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1292 {
1293     bs->translation = translation;
1294 }
1295
1296 void bdrv_get_geometry_hint(BlockDriverState *bs,
1297                             int *pcyls, int *pheads, int *psecs)
1298 {
1299     *pcyls = bs->cyls;
1300     *pheads = bs->heads;
1301     *psecs = bs->secs;
1302 }
1303
1304 int bdrv_get_type_hint(BlockDriverState *bs)
1305 {
1306     return bs->type;
1307 }
1308
1309 int bdrv_get_translation_hint(BlockDriverState *bs)
1310 {
1311     return bs->translation;
1312 }
1313
1314 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1315                        BlockErrorAction on_write_error)
1316 {
1317     bs->on_read_error = on_read_error;
1318     bs->on_write_error = on_write_error;
1319 }
1320
1321 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1322 {
1323     return is_read ? bs->on_read_error : bs->on_write_error;
1324 }
1325
1326 void bdrv_set_removable(BlockDriverState *bs, int removable)
1327 {
1328     bs->removable = removable;
1329     if (removable && bs == bs_snapshots) {
1330         bs_snapshots = NULL;
1331     }
1332 }
1333
1334 int bdrv_is_removable(BlockDriverState *bs)
1335 {
1336     return bs->removable;
1337 }
1338
1339 int bdrv_is_read_only(BlockDriverState *bs)
1340 {
1341     return bs->read_only;
1342 }
1343
1344 int bdrv_is_sg(BlockDriverState *bs)
1345 {
1346     return bs->sg;
1347 }
1348
1349 int bdrv_enable_write_cache(BlockDriverState *bs)
1350 {
1351     return bs->enable_write_cache;
1352 }
1353
1354 /* XXX: no longer used */
1355 void bdrv_set_change_cb(BlockDriverState *bs,
1356                         void (*change_cb)(void *opaque), void *opaque)
1357 {
1358     bs->change_cb = change_cb;
1359     bs->change_opaque = opaque;
1360 }
1361
1362 int bdrv_is_encrypted(BlockDriverState *bs)
1363 {
1364     if (bs->backing_hd && bs->backing_hd->encrypted)
1365         return 1;
1366     return bs->encrypted;
1367 }
1368
1369 int bdrv_key_required(BlockDriverState *bs)
1370 {
1371     BlockDriverState *backing_hd = bs->backing_hd;
1372
1373     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1374         return 1;
1375     return (bs->encrypted && !bs->valid_key);
1376 }
1377
1378 int bdrv_set_key(BlockDriverState *bs, const char *key)
1379 {
1380     int ret;
1381     if (bs->backing_hd && bs->backing_hd->encrypted) {
1382         ret = bdrv_set_key(bs->backing_hd, key);
1383         if (ret < 0)
1384             return ret;
1385         if (!bs->encrypted)
1386             return 0;
1387     }
1388     if (!bs->encrypted) {
1389         return -EINVAL;
1390     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1391         return -ENOMEDIUM;
1392     }
1393     ret = bs->drv->bdrv_set_key(bs, key);
1394     if (ret < 0) {
1395         bs->valid_key = 0;
1396     } else if (!bs->valid_key) {
1397         bs->valid_key = 1;
1398         /* call the change callback now, we skipped it on open */
1399         bs->media_changed = 1;
1400         if (bs->change_cb)
1401             bs->change_cb(bs->change_opaque);
1402     }
1403     return ret;
1404 }
1405
1406 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1407 {
1408     if (!bs->drv) {
1409         buf[0] = '\0';
1410     } else {
1411         pstrcpy(buf, buf_size, bs->drv->format_name);
1412     }
1413 }
1414
1415 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1416                          void *opaque)
1417 {
1418     BlockDriver *drv;
1419
1420     QLIST_FOREACH(drv, &bdrv_drivers, list) {
1421         it(opaque, drv->format_name);
1422     }
1423 }
1424
1425 BlockDriverState *bdrv_find(const char *name)
1426 {
1427     BlockDriverState *bs;
1428
1429     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1430         if (!strcmp(name, bs->device_name)) {
1431             return bs;
1432         }
1433     }
1434     return NULL;
1435 }
1436
1437 BlockDriverState *bdrv_next(BlockDriverState *bs)
1438 {
1439     if (!bs) {
1440         return QTAILQ_FIRST(&bdrv_states);
1441     }
1442     return QTAILQ_NEXT(bs, list);
1443 }
1444
1445 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1446 {
1447     BlockDriverState *bs;
1448
1449     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1450         it(opaque, bs);
1451     }
1452 }
1453
1454 const char *bdrv_get_device_name(BlockDriverState *bs)
1455 {
1456     return bs->device_name;
1457 }
1458
1459 void bdrv_flush(BlockDriverState *bs)
1460 {
1461     if (bs->open_flags & BDRV_O_NO_FLUSH) {
1462         return;
1463     }
1464
1465     if (bs->drv && bs->drv->bdrv_flush)
1466         bs->drv->bdrv_flush(bs);
1467 }
1468
1469 void bdrv_flush_all(void)
1470 {
1471     BlockDriverState *bs;
1472
1473     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1474         if (bs->drv && !bdrv_is_read_only(bs) &&
1475             (!bdrv_is_removable(bs) || bdrv_is_inserted(bs))) {
1476             bdrv_flush(bs);
1477         }
1478     }
1479 }
1480
1481 int bdrv_has_zero_init(BlockDriverState *bs)
1482 {
1483     assert(bs->drv);
1484
1485     if (bs->drv->bdrv_has_zero_init) {
1486         return bs->drv->bdrv_has_zero_init(bs);
1487     }
1488
1489     return 1;
1490 }
1491
1492 /*
1493  * Returns true iff the specified sector is present in the disk image. Drivers
1494  * not implementing the functionality are assumed to not support backing files,
1495  * hence all their sectors are reported as allocated.
1496  *
1497  * 'pnum' is set to the number of sectors (including and immediately following
1498  * the specified sector) that are known to be in the same
1499  * allocated/unallocated state.
1500  *
1501  * 'nb_sectors' is the max value 'pnum' should be set to.
1502  */
1503 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1504         int *pnum)
1505 {
1506     int64_t n;
1507     if (!bs->drv->bdrv_is_allocated) {
1508         if (sector_num >= bs->total_sectors) {
1509             *pnum = 0;
1510             return 0;
1511         }
1512         n = bs->total_sectors - sector_num;
1513         *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1514         return 1;
1515     }
1516     return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1517 }
1518
1519 void bdrv_mon_event(const BlockDriverState *bdrv,
1520                     BlockMonEventAction action, int is_read)
1521 {
1522     QObject *data;
1523     const char *action_str;
1524
1525     switch (action) {
1526     case BDRV_ACTION_REPORT:
1527         action_str = "report";
1528         break;
1529     case BDRV_ACTION_IGNORE:
1530         action_str = "ignore";
1531         break;
1532     case BDRV_ACTION_STOP:
1533         action_str = "stop";
1534         break;
1535     default:
1536         abort();
1537     }
1538
1539     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1540                               bdrv->device_name,
1541                               action_str,
1542                               is_read ? "read" : "write");
1543     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1544
1545     qobject_decref(data);
1546 }
1547
1548 static void bdrv_print_dict(QObject *obj, void *opaque)
1549 {
1550     QDict *bs_dict;
1551     Monitor *mon = opaque;
1552
1553     bs_dict = qobject_to_qdict(obj);
1554
1555     monitor_printf(mon, "%s: type=%s removable=%d",
1556                         qdict_get_str(bs_dict, "device"),
1557                         qdict_get_str(bs_dict, "type"),
1558                         qdict_get_bool(bs_dict, "removable"));
1559
1560     if (qdict_get_bool(bs_dict, "removable")) {
1561         monitor_printf(mon, " locked=%d", qdict_get_bool(bs_dict, "locked"));
1562     }
1563
1564     if (qdict_haskey(bs_dict, "inserted")) {
1565         QDict *qdict = qobject_to_qdict(qdict_get(bs_dict, "inserted"));
1566
1567         monitor_printf(mon, " file=");
1568         monitor_print_filename(mon, qdict_get_str(qdict, "file"));
1569         if (qdict_haskey(qdict, "backing_file")) {
1570             monitor_printf(mon, " backing_file=");
1571             monitor_print_filename(mon, qdict_get_str(qdict, "backing_file"));
1572         }
1573         monitor_printf(mon, " ro=%d drv=%s encrypted=%d",
1574                             qdict_get_bool(qdict, "ro"),
1575                             qdict_get_str(qdict, "drv"),
1576                             qdict_get_bool(qdict, "encrypted"));
1577     } else {
1578         monitor_printf(mon, " [not inserted]");
1579     }
1580
1581     monitor_printf(mon, "\n");
1582 }
1583
1584 void bdrv_info_print(Monitor *mon, const QObject *data)
1585 {
1586     qlist_iter(qobject_to_qlist(data), bdrv_print_dict, mon);
1587 }
1588
1589 void bdrv_info(Monitor *mon, QObject **ret_data)
1590 {
1591     QList *bs_list;
1592     BlockDriverState *bs;
1593
1594     bs_list = qlist_new();
1595
1596     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1597         QObject *bs_obj;
1598         const char *type = "unknown";
1599
1600         switch(bs->type) {
1601         case BDRV_TYPE_HD:
1602             type = "hd";
1603             break;
1604         case BDRV_TYPE_CDROM:
1605             type = "cdrom";
1606             break;
1607         case BDRV_TYPE_FLOPPY:
1608             type = "floppy";
1609             break;
1610         }
1611
1612         bs_obj = qobject_from_jsonf("{ 'device': %s, 'type': %s, "
1613                                     "'removable': %i, 'locked': %i }",
1614                                     bs->device_name, type, bs->removable,
1615                                     bs->locked);
1616
1617         if (bs->drv) {
1618             QObject *obj;
1619             QDict *bs_dict = qobject_to_qdict(bs_obj);
1620
1621             obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
1622                                      "'encrypted': %i }",
1623                                      bs->filename, bs->read_only,
1624                                      bs->drv->format_name,
1625                                      bdrv_is_encrypted(bs));
1626             if (bs->backing_file[0] != '\0') {
1627                 QDict *qdict = qobject_to_qdict(obj);
1628                 qdict_put(qdict, "backing_file",
1629                           qstring_from_str(bs->backing_file));
1630             }
1631
1632             qdict_put_obj(bs_dict, "inserted", obj);
1633         }
1634         qlist_append_obj(bs_list, bs_obj);
1635     }
1636
1637     *ret_data = QOBJECT(bs_list);
1638 }
1639
1640 static void bdrv_stats_iter(QObject *data, void *opaque)
1641 {
1642     QDict *qdict;
1643     Monitor *mon = opaque;
1644
1645     qdict = qobject_to_qdict(data);
1646     monitor_printf(mon, "%s:", qdict_get_str(qdict, "device"));
1647
1648     qdict = qobject_to_qdict(qdict_get(qdict, "stats"));
1649     monitor_printf(mon, " rd_bytes=%" PRId64
1650                         " wr_bytes=%" PRId64
1651                         " rd_operations=%" PRId64
1652                         " wr_operations=%" PRId64
1653                         "\n",
1654                         qdict_get_int(qdict, "rd_bytes"),
1655                         qdict_get_int(qdict, "wr_bytes"),
1656                         qdict_get_int(qdict, "rd_operations"),
1657                         qdict_get_int(qdict, "wr_operations"));
1658 }
1659
1660 void bdrv_stats_print(Monitor *mon, const QObject *data)
1661 {
1662     qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon);
1663 }
1664
1665 static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
1666 {
1667     QObject *res;
1668     QDict *dict;
1669
1670     res = qobject_from_jsonf("{ 'stats': {"
1671                              "'rd_bytes': %" PRId64 ","
1672                              "'wr_bytes': %" PRId64 ","
1673                              "'rd_operations': %" PRId64 ","
1674                              "'wr_operations': %" PRId64 ","
1675                              "'wr_highest_offset': %" PRId64
1676                              "} }",
1677                              bs->rd_bytes, bs->wr_bytes,
1678                              bs->rd_ops, bs->wr_ops,
1679                              bs->wr_highest_sector *
1680                              (uint64_t)BDRV_SECTOR_SIZE);
1681     dict  = qobject_to_qdict(res);
1682
1683     if (*bs->device_name) {
1684         qdict_put(dict, "device", qstring_from_str(bs->device_name));
1685     }
1686
1687     if (bs->file) {
1688         QObject *parent = bdrv_info_stats_bs(bs->file);
1689         qdict_put_obj(dict, "parent", parent);
1690     }
1691
1692     return res;
1693 }
1694
1695 void bdrv_info_stats(Monitor *mon, QObject **ret_data)
1696 {
1697     QObject *obj;
1698     QList *devices;
1699     BlockDriverState *bs;
1700
1701     devices = qlist_new();
1702
1703     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1704         obj = bdrv_info_stats_bs(bs);
1705         qlist_append_obj(devices, obj);
1706     }
1707
1708     *ret_data = QOBJECT(devices);
1709 }
1710
1711 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
1712 {
1713     if (bs->backing_hd && bs->backing_hd->encrypted)
1714         return bs->backing_file;
1715     else if (bs->encrypted)
1716         return bs->filename;
1717     else
1718         return NULL;
1719 }
1720
1721 void bdrv_get_backing_filename(BlockDriverState *bs,
1722                                char *filename, int filename_size)
1723 {
1724     if (!bs->backing_file) {
1725         pstrcpy(filename, filename_size, "");
1726     } else {
1727         pstrcpy(filename, filename_size, bs->backing_file);
1728     }
1729 }
1730
1731 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1732                           const uint8_t *buf, int nb_sectors)
1733 {
1734     BlockDriver *drv = bs->drv;
1735     if (!drv)
1736         return -ENOMEDIUM;
1737     if (!drv->bdrv_write_compressed)
1738         return -ENOTSUP;
1739     if (bdrv_check_request(bs, sector_num, nb_sectors))
1740         return -EIO;
1741
1742     if (bs->dirty_bitmap) {
1743         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1744     }
1745
1746     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1747 }
1748
1749 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1750 {
1751     BlockDriver *drv = bs->drv;
1752     if (!drv)
1753         return -ENOMEDIUM;
1754     if (!drv->bdrv_get_info)
1755         return -ENOTSUP;
1756     memset(bdi, 0, sizeof(*bdi));
1757     return drv->bdrv_get_info(bs, bdi);
1758 }
1759
1760 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1761                       int64_t pos, int size)
1762 {
1763     BlockDriver *drv = bs->drv;
1764     if (!drv)
1765         return -ENOMEDIUM;
1766     if (drv->bdrv_save_vmstate)
1767         return drv->bdrv_save_vmstate(bs, buf, pos, size);
1768     if (bs->file)
1769         return bdrv_save_vmstate(bs->file, buf, pos, size);
1770     return -ENOTSUP;
1771 }
1772
1773 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1774                       int64_t pos, int size)
1775 {
1776     BlockDriver *drv = bs->drv;
1777     if (!drv)
1778         return -ENOMEDIUM;
1779     if (drv->bdrv_load_vmstate)
1780         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1781     if (bs->file)
1782         return bdrv_load_vmstate(bs->file, buf, pos, size);
1783     return -ENOTSUP;
1784 }
1785
1786 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
1787 {
1788     BlockDriver *drv = bs->drv;
1789
1790     if (!drv || !drv->bdrv_debug_event) {
1791         return;
1792     }
1793
1794     return drv->bdrv_debug_event(bs, event);
1795
1796 }
1797
1798 /**************************************************************/
1799 /* handling of snapshots */
1800
1801 int bdrv_can_snapshot(BlockDriverState *bs)
1802 {
1803     BlockDriver *drv = bs->drv;
1804     if (!drv || bdrv_is_removable(bs) || bdrv_is_read_only(bs)) {
1805         return 0;
1806     }
1807
1808     if (!drv->bdrv_snapshot_create) {
1809         if (bs->file != NULL) {
1810             return bdrv_can_snapshot(bs->file);
1811         }
1812         return 0;
1813     }
1814
1815     return 1;
1816 }
1817
1818 int bdrv_is_snapshot(BlockDriverState *bs)
1819 {
1820     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
1821 }
1822
1823 BlockDriverState *bdrv_snapshots(void)
1824 {
1825     BlockDriverState *bs;
1826
1827     if (bs_snapshots) {
1828         return bs_snapshots;
1829     }
1830
1831     bs = NULL;
1832     while ((bs = bdrv_next(bs))) {
1833         if (bdrv_can_snapshot(bs)) {
1834             bs_snapshots = bs;
1835             return bs;
1836         }
1837     }
1838     return NULL;
1839 }
1840
1841 int bdrv_snapshot_create(BlockDriverState *bs,
1842                          QEMUSnapshotInfo *sn_info)
1843 {
1844     BlockDriver *drv = bs->drv;
1845     if (!drv)
1846         return -ENOMEDIUM;
1847     if (drv->bdrv_snapshot_create)
1848         return drv->bdrv_snapshot_create(bs, sn_info);
1849     if (bs->file)
1850         return bdrv_snapshot_create(bs->file, sn_info);
1851     return -ENOTSUP;
1852 }
1853
1854 int bdrv_snapshot_goto(BlockDriverState *bs,
1855                        const char *snapshot_id)
1856 {
1857     BlockDriver *drv = bs->drv;
1858     int ret, open_ret;
1859
1860     if (!drv)
1861         return -ENOMEDIUM;
1862     if (drv->bdrv_snapshot_goto)
1863         return drv->bdrv_snapshot_goto(bs, snapshot_id);
1864
1865     if (bs->file) {
1866         drv->bdrv_close(bs);
1867         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
1868         open_ret = drv->bdrv_open(bs, bs->open_flags);
1869         if (open_ret < 0) {
1870             bdrv_delete(bs->file);
1871             bs->drv = NULL;
1872             return open_ret;
1873         }
1874         return ret;
1875     }
1876
1877     return -ENOTSUP;
1878 }
1879
1880 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1881 {
1882     BlockDriver *drv = bs->drv;
1883     if (!drv)
1884         return -ENOMEDIUM;
1885     if (drv->bdrv_snapshot_delete)
1886         return drv->bdrv_snapshot_delete(bs, snapshot_id);
1887     if (bs->file)
1888         return bdrv_snapshot_delete(bs->file, snapshot_id);
1889     return -ENOTSUP;
1890 }
1891
1892 int bdrv_snapshot_list(BlockDriverState *bs,
1893                        QEMUSnapshotInfo **psn_info)
1894 {
1895     BlockDriver *drv = bs->drv;
1896     if (!drv)
1897         return -ENOMEDIUM;
1898     if (drv->bdrv_snapshot_list)
1899         return drv->bdrv_snapshot_list(bs, psn_info);
1900     if (bs->file)
1901         return bdrv_snapshot_list(bs->file, psn_info);
1902     return -ENOTSUP;
1903 }
1904
#define NB_SUFFIXES 4

/*
 * Format 'size' into 'buf' ('buf_size' bytes) in a compact form and return
 * 'buf'.
 *
 * Values up to 999 are printed as a plain number.  Larger values are scaled
 * by powers of 1024 and get a K, M, G or T suffix: one decimal is shown
 * while the scaled value is below 10, a rounded integer otherwise.  Values
 * too large for the T range are still printed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < (10 * unit)) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit,
                     suffixes[idx]);
            return buf;
        }
        /* the last suffix takes everything that is left */
        if (size < (1000 * unit) || idx == (NB_SUFFIXES - 1)) {
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (unit >> 1)) / unit),
                     suffixes[idx]);
            return buf;
        }
    }
    return buf;
}
1934
1935 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
1936 {
1937     char buf1[128], date_buf[128], clock_buf[128];
1938 #ifdef _WIN32
1939     struct tm *ptm;
1940 #else
1941     struct tm tm;
1942 #endif
1943     time_t ti;
1944     int64_t secs;
1945
1946     if (!sn) {
1947         snprintf(buf, buf_size,
1948                  "%-10s%-20s%7s%20s%15s",
1949                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
1950     } else {
1951         ti = sn->date_sec;
1952 #ifdef _WIN32
1953         ptm = localtime(&ti);
1954         strftime(date_buf, sizeof(date_buf),
1955                  "%Y-%m-%d %H:%M:%S", ptm);
1956 #else
1957         localtime_r(&ti, &tm);
1958         strftime(date_buf, sizeof(date_buf),
1959                  "%Y-%m-%d %H:%M:%S", &tm);
1960 #endif
1961         secs = sn->vm_clock_nsec / 1000000000;
1962         snprintf(clock_buf, sizeof(clock_buf),
1963                  "%02d:%02d:%02d.%03d",
1964                  (int)(secs / 3600),
1965                  (int)((secs / 60) % 60),
1966                  (int)(secs % 60),
1967                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
1968         snprintf(buf, buf_size,
1969                  "%-10s%-20s%7s%20s%15s",
1970                  sn->id_str, sn->name,
1971                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
1972                  date_buf,
1973                  clock_buf);
1974     }
1975     return buf;
1976 }
1977
1978
1979 /**************************************************************/
1980 /* async I/Os */
1981
1982 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1983                                  QEMUIOVector *qiov, int nb_sectors,
1984                                  BlockDriverCompletionFunc *cb, void *opaque)
1985 {
1986     BlockDriver *drv = bs->drv;
1987     BlockDriverAIOCB *ret;
1988
1989     if (!drv)
1990         return NULL;
1991     if (bdrv_check_request(bs, sector_num, nb_sectors))
1992         return NULL;
1993
1994     ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
1995                               cb, opaque);
1996
1997     if (ret) {
1998         /* Update stats even though technically transfer has not happened. */
1999         bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2000         bs->rd_ops ++;
2001     }
2002
2003     return ret;
2004 }
2005
2006 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2007                                   QEMUIOVector *qiov, int nb_sectors,
2008                                   BlockDriverCompletionFunc *cb, void *opaque)
2009 {
2010     BlockDriver *drv = bs->drv;
2011     BlockDriverAIOCB *ret;
2012
2013     if (!drv)
2014         return NULL;
2015     if (bs->read_only)
2016         return NULL;
2017     if (bdrv_check_request(bs, sector_num, nb_sectors))
2018         return NULL;
2019
2020     if (bs->dirty_bitmap) {
2021         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2022     }
2023
2024     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
2025                                cb, opaque);
2026
2027     if (ret) {
2028         /* Update stats even though technically transfer has not happened. */
2029         bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2030         bs->wr_ops ++;
2031         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2032             bs->wr_highest_sector = sector_num + nb_sectors - 1;
2033         }
2034     }
2035
2036     return ret;
2037 }
2038
2039
2040 typedef struct MultiwriteCB {
2041     int error;
2042     int num_requests;
2043     int num_callbacks;
2044     struct {
2045         BlockDriverCompletionFunc *cb;
2046         void *opaque;
2047         QEMUIOVector *free_qiov;
2048         void *free_buf;
2049     } callbacks[];
2050 } MultiwriteCB;
2051
2052 static void multiwrite_user_cb(MultiwriteCB *mcb)
2053 {
2054     int i;
2055
2056     for (i = 0; i < mcb->num_callbacks; i++) {
2057         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2058         if (mcb->callbacks[i].free_qiov) {
2059             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2060         }
2061         qemu_free(mcb->callbacks[i].free_qiov);
2062         qemu_vfree(mcb->callbacks[i].free_buf);
2063     }
2064 }
2065
2066 static void multiwrite_cb(void *opaque, int ret)
2067 {
2068     MultiwriteCB *mcb = opaque;
2069
2070     if (ret < 0 && !mcb->error) {
2071         mcb->error = ret;
2072     }
2073
2074     mcb->num_requests--;
2075     if (mcb->num_requests == 0) {
2076         multiwrite_user_cb(mcb);
2077         qemu_free(mcb);
2078     }
2079 }
2080
2081 static int multiwrite_req_compare(const void *a, const void *b)
2082 {
2083     const BlockRequest *req1 = a, *req2 = b;
2084
2085     /*
2086      * Note that we can't simply subtract req2->sector from req1->sector
2087      * here as that could overflow the return value.
2088      */
2089     if (req1->sector > req2->sector) {
2090         return 1;
2091     } else if (req1->sector < req2->sector) {
2092         return -1;
2093     } else {
2094         return 0;
2095     }
2096 }
2097
2098 /*
2099  * Takes a bunch of requests and tries to merge them. Returns the number of
2100  * requests that remain after merging.
2101  */
2102 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2103     int num_reqs, MultiwriteCB *mcb)
2104 {
2105     int i, outidx;
2106
2107     // Sort requests by start sector
2108     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2109
2110     // Check if adjacent requests touch the same clusters. If so, combine them,
2111     // filling up gaps with zero sectors.
2112     outidx = 0;
2113     for (i = 1; i < num_reqs; i++) {
2114         int merge = 0;
2115         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2116
2117         // This handles the cases that are valid for all block drivers, namely
2118         // exactly sequential writes and overlapping writes.
2119         if (reqs[i].sector <= oldreq_last) {
2120             merge = 1;
2121         }
2122
2123         // The block driver may decide that it makes sense to combine requests
2124         // even if there is a gap of some sectors between them. In this case,
2125         // the gap is filled with zeros (therefore only applicable for yet
2126         // unused space in format like qcow2).
2127         if (!merge && bs->drv->bdrv_merge_requests) {
2128             merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2129         }
2130
2131         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2132             merge = 0;
2133         }
2134
2135         if (merge) {
2136             size_t size;
2137             QEMUIOVector *qiov = qemu_mallocz(sizeof(*qiov));
2138             qemu_iovec_init(qiov,
2139                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2140
2141             // Add the first request to the merged one. If the requests are
2142             // overlapping, drop the last sectors of the first request.
2143             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2144             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2145
2146             // We might need to add some zeros between the two requests
2147             if (reqs[i].sector > oldreq_last) {
2148                 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2149                 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2150                 memset(buf, 0, zero_bytes);
2151                 qemu_iovec_add(qiov, buf, zero_bytes);
2152                 mcb->callbacks[i].free_buf = buf;
2153             }
2154
2155             // Add the second request
2156             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2157
2158             reqs[outidx].nb_sectors = qiov->size >> 9;
2159             reqs[outidx].qiov = qiov;
2160
2161             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2162         } else {
2163             outidx++;
2164             reqs[outidx].sector     = reqs[i].sector;
2165             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2166             reqs[outidx].qiov       = reqs[i].qiov;
2167         }
2168     }
2169
2170     return outidx + 1;
2171 }
2172
2173 /*
2174  * Submit multiple AIO write requests at once.
2175  *
2176  * On success, the function returns 0 and all requests in the reqs array have
2177  * been submitted. In error case this function returns -1, and any of the
2178  * requests may or may not be submitted yet. In particular, this means that the
2179  * callback will be called for some of the requests, for others it won't. The
2180  * caller must check the error field of the BlockRequest to wait for the right
2181  * callbacks (if error != 0, no callback will be called).
2182  *
2183  * The implementation may modify the contents of the reqs array, e.g. to merge
2184  * requests. However, the fields opaque and error are left unmodified as they
2185  * are used to signal failure for a single request to the caller.
2186  */
2187 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2188 {
2189     BlockDriverAIOCB *acb;
2190     MultiwriteCB *mcb;
2191     int i;
2192
2193     if (num_reqs == 0) {
2194         return 0;
2195     }
2196
2197     // Create MultiwriteCB structure
2198     mcb = qemu_mallocz(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2199     mcb->num_requests = 0;
2200     mcb->num_callbacks = num_reqs;
2201
2202     for (i = 0; i < num_reqs; i++) {
2203         mcb->callbacks[i].cb = reqs[i].cb;
2204         mcb->callbacks[i].opaque = reqs[i].opaque;
2205     }
2206
2207     // Check for mergable requests
2208     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2209
2210     /*
2211      * Run the aio requests. As soon as one request can't be submitted
2212      * successfully, fail all requests that are not yet submitted (we must
2213      * return failure for all requests anyway)
2214      *
2215      * num_requests cannot be set to the right value immediately: If
2216      * bdrv_aio_writev fails for some request, num_requests would be too high
2217      * and therefore multiwrite_cb() would never recognize the multiwrite
2218      * request as completed. We also cannot use the loop variable i to set it
2219      * when the first request fails because the callback may already have been
2220      * called for previously submitted requests. Thus, num_requests must be
2221      * incremented for each request that is submitted.
2222      *
2223      * The problem that callbacks may be called early also means that we need
2224      * to take care that num_requests doesn't become 0 before all requests are
2225      * submitted - multiwrite_cb() would consider the multiwrite request
2226      * completed. A dummy request that is "completed" by a manual call to
2227      * multiwrite_cb() takes care of this.
2228      */
2229     mcb->num_requests = 1;
2230
2231     for (i = 0; i < num_reqs; i++) {
2232         mcb->num_requests++;
2233         acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2234             reqs[i].nb_sectors, multiwrite_cb, mcb);
2235
2236         if (acb == NULL) {
2237             // We can only fail the whole thing if no request has been
2238             // submitted yet. Otherwise we'll wait for the submitted AIOs to
2239             // complete and report the error in the callback.
2240             if (i == 0) {
2241                 goto fail;
2242             } else {
2243                 multiwrite_cb(mcb, -EIO);
2244                 break;
2245             }
2246         }
2247     }
2248
2249     /* Complete the dummy request */
2250     multiwrite_cb(mcb, 0);
2251
2252     return 0;
2253
2254 fail:
2255     for (i = 0; i < mcb->num_callbacks; i++) {
2256         reqs[i].error = -EIO;
2257     }
2258     qemu_free(mcb);
2259     return -1;
2260 }
2261
2262 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2263         BlockDriverCompletionFunc *cb, void *opaque)
2264 {
2265     BlockDriver *drv = bs->drv;
2266
2267     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2268         return bdrv_aio_noop_em(bs, cb, opaque);
2269     }
2270
2271     if (!drv)
2272         return NULL;
2273     return drv->bdrv_aio_flush(bs, cb, opaque);
2274 }
2275
2276 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2277 {
2278     acb->pool->cancel(acb);
2279 }
2280
2281
2282 /**************************************************************/
2283 /* async block device emulation */
2284
/*
 * AIOCB used by the emulation helpers below, which perform the operation
 * synchronously and deliver the completion from a bottom half.
 */
2285 typedef struct BlockDriverAIOCBSync {
2286     BlockDriverAIOCB common;
2287     QEMUBH *bh;         /* bottom half that runs bdrv_aio_bh_cb() */
2288     int ret;            /* status handed to the completion callback */
2289     /* vector translation state */
2290     QEMUIOVector *qiov; /* caller's vector; NULL for flush and no-op requests */
2291     uint8_t *bounce;    /* linear buffer the synchronous call works on */
2292     int is_write;       /* non-zero: nothing is copied back on completion */
2293 } BlockDriverAIOCBSync;
2294
2295 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2296 {
2297     BlockDriverAIOCBSync *acb =
2298         container_of(blockacb, BlockDriverAIOCBSync, common);
2299     qemu_bh_delete(acb->bh);
2300     acb->bh = NULL;
2301     qemu_aio_release(acb);
2302 }
2303
/*
 * Pool from which the emulated AIO helpers in this file obtain their
 * BlockDriverAIOCBSync objects; cancellation goes through
 * bdrv_aio_cancel_em().
 */
2304 static AIOPool bdrv_em_aio_pool = {
2305     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
2306     .cancel             = bdrv_aio_cancel_em,
2307 };
2308
2309 static void bdrv_aio_bh_cb(void *opaque)
2310 {
2311     BlockDriverAIOCBSync *acb = opaque;
2312
2313     if (!acb->is_write)
2314         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2315     qemu_vfree(acb->bounce);
2316     acb->common.cb(acb->common.opaque, acb->ret);
2317     qemu_bh_delete(acb->bh);
2318     acb->bh = NULL;
2319     qemu_aio_release(acb);
2320 }
2321
2322 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2323                                             int64_t sector_num,
2324                                             QEMUIOVector *qiov,
2325                                             int nb_sectors,
2326                                             BlockDriverCompletionFunc *cb,
2327                                             void *opaque,
2328                                             int is_write)
2329
2330 {
2331     BlockDriverAIOCBSync *acb;
2332
2333     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2334     acb->is_write = is_write;
2335     acb->qiov = qiov;
2336     acb->bounce = qemu_blockalign(bs, qiov->size);
2337
2338     if (!acb->bh)
2339         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2340
2341     if (is_write) {
2342         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2343         acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2344     } else {
2345         acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2346     }
2347
2348     qemu_bh_schedule(acb->bh);
2349
2350     return &acb->common;
2351 }
2352
2353 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2354         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2355         BlockDriverCompletionFunc *cb, void *opaque)
2356 {
2357     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2358 }
2359
2360 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2361         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2362         BlockDriverCompletionFunc *cb, void *opaque)
2363 {
2364     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2365 }
2366
2367 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
2368         BlockDriverCompletionFunc *cb, void *opaque)
2369 {
2370     BlockDriverAIOCBSync *acb;
2371
2372     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2373     acb->is_write = 1; /* don't bounce in the completion hadler */
2374     acb->qiov = NULL;
2375     acb->bounce = NULL;
2376     acb->ret = 0;
2377
2378     if (!acb->bh)
2379         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2380
2381     bdrv_flush(bs);
2382     qemu_bh_schedule(acb->bh);
2383     return &acb->common;
2384 }
2385
2386 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
2387         BlockDriverCompletionFunc *cb, void *opaque)
2388 {
2389     BlockDriverAIOCBSync *acb;
2390
2391     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2392     acb->is_write = 1; /* don't bounce in the completion handler */
2393     acb->qiov = NULL;
2394     acb->bounce = NULL;
2395     acb->ret = 0;
2396
2397     if (!acb->bh) {
2398         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2399     }
2400
2401     qemu_bh_schedule(acb->bh);
2402     return &acb->common;
2403 }
2404
2405 /**************************************************************/
2406 /* sync block device emulation */
2407
/*
 * Completion callback used by the synchronous emulation: opaque points to
 * an int that receives the request's status.
 */
static void bdrv_rw_em_cb(void *opaque, int ret)
{
    int *status = opaque;

    *status = ret;
}
2412
/*
 * Placeholder stored in async_ret before a request is submitted;
 * bdrv_rw_em_cb() overwrites it with the real status on completion.
 */
2413 #define NOT_DONE 0x7fffffff
2414
2415 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
2416                         uint8_t *buf, int nb_sectors)
2417 {
2418     int async_ret;
2419     BlockDriverAIOCB *acb;
2420     struct iovec iov;
2421     QEMUIOVector qiov;
2422
2423     async_context_push();
2424
2425     async_ret = NOT_DONE;
2426     iov.iov_base = (void *)buf;
2427     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2428     qemu_iovec_init_external(&qiov, &iov, 1);
2429     acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
2430         bdrv_rw_em_cb, &async_ret);
2431     if (acb == NULL) {
2432         async_ret = -1;
2433         goto fail;
2434     }
2435
2436     while (async_ret == NOT_DONE) {
2437         qemu_aio_wait();
2438     }
2439
2440
2441 fail:
2442     async_context_pop();
2443     return async_ret;
2444 }
2445
2446 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
2447                          const uint8_t *buf, int nb_sectors)
2448 {
2449     int async_ret;
2450     BlockDriverAIOCB *acb;
2451     struct iovec iov;
2452     QEMUIOVector qiov;
2453
2454     async_context_push();
2455
2456     async_ret = NOT_DONE;
2457     iov.iov_base = (void *)buf;
2458     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2459     qemu_iovec_init_external(&qiov, &iov, 1);
2460     acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
2461         bdrv_rw_em_cb, &async_ret);
2462     if (acb == NULL) {
2463         async_ret = -1;
2464         goto fail;
2465     }
2466     while (async_ret == NOT_DONE) {
2467         qemu_aio_wait();
2468     }
2469
2470 fail:
2471     async_context_pop();
2472     return async_ret;
2473 }
2474
/* Invoke module_call_init() for the MODULE_INIT_BLOCK class. */
2475 void bdrv_init(void)
2476 {
2477     module_call_init(MODULE_INIT_BLOCK);
2478 }
2479
/* Same as bdrv_init(), but sets use_bdrv_whitelist to 1 beforehand. */
2480 void bdrv_init_with_whitelist(void)
2481 {
2482     use_bdrv_whitelist = 1;
2483     bdrv_init();
2484 }
2485
2486 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2487                    BlockDriverCompletionFunc *cb, void *opaque)
2488 {
2489     BlockDriverAIOCB *acb;
2490
2491     if (pool->free_aiocb) {
2492         acb = pool->free_aiocb;
2493         pool->free_aiocb = acb->next;
2494     } else {
2495         acb = qemu_mallocz(pool->aiocb_size);
2496         acb->pool = pool;
2497     }
2498     acb->bs = bs;
2499     acb->cb = cb;
2500     acb->opaque = opaque;
2501     return acb;
2502 }
2503
2504 void qemu_aio_release(void *p)
2505 {
2506     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2507     AIOPool *pool = acb->pool;
2508     acb->next = pool->free_aiocb;
2509     pool->free_aiocb = acb;
2510 }
2511
2512 /**************************************************************/
2513 /* removable device support */
2514
2515 /**
2516  * Return TRUE if the media is present
2517  */
2518 int bdrv_is_inserted(BlockDriverState *bs)
2519 {
2520     BlockDriver *drv = bs->drv;
2521     int ret;
2522     if (!drv)
2523         return 0;
2524     if (!drv->bdrv_is_inserted)
2525         return !bs->tray_open;
2526     ret = drv->bdrv_is_inserted(bs);
2527     return ret;
2528 }
2529
2530 /**
2531  * Return TRUE if the media changed since the last call to this
2532  * function. It is currently only used for floppy disks
2533  */
2534 int bdrv_media_changed(BlockDriverState *bs)
2535 {
2536     BlockDriver *drv = bs->drv;
2537     int ret;
2538
2539     if (!drv || !drv->bdrv_media_changed)
2540         ret = -ENOTSUP;
2541     else
2542         ret = drv->bdrv_media_changed(bs);
2543     if (ret == -ENOTSUP)
2544         ret = bs->media_changed;
2545     bs->media_changed = 0;
2546     return ret;
2547 }
2548
2549 /**
2550  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
2551  */
2552 int bdrv_eject(BlockDriverState *bs, int eject_flag)
2553 {
2554     BlockDriver *drv = bs->drv;
2555     int ret;
2556
2557     if (bs->locked) {
2558         return -EBUSY;
2559     }
2560
2561     if (!drv || !drv->bdrv_eject) {
2562         ret = -ENOTSUP;
2563     } else {
2564         ret = drv->bdrv_eject(bs, eject_flag);
2565     }
2566     if (ret == -ENOTSUP) {
2567         ret = 0;
2568     }
2569     if (ret >= 0) {
2570         bs->tray_open = eject_flag;
2571     }
2572
2573     return ret;
2574 }
2575
/* Return the locked flag stored in bs (see bdrv_set_locked()). */
2576 int bdrv_is_locked(BlockDriverState *bs)
2577 {
2578     return bs->locked;
2579 }
2580
2581 /**
2582  * Lock or unlock the media (if it is locked, the user won't be able
2583  * to eject it manually).
2584  */
2585 void bdrv_set_locked(BlockDriverState *bs, int locked)
2586 {
2587     BlockDriver *drv = bs->drv;
2588
2589     bs->locked = locked;
2590     if (drv && drv->bdrv_set_locked) {
2591         drv->bdrv_set_locked(bs, locked);
2592     }
2593 }
2594
2595 /* needed for generic scsi interface */
2596
2597 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2598 {
2599     BlockDriver *drv = bs->drv;
2600
2601     if (drv && drv->bdrv_ioctl)
2602         return drv->bdrv_ioctl(bs, req, buf);
2603     return -ENOTSUP;
2604 }
2605
2606 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2607         unsigned long int req, void *buf,
2608         BlockDriverCompletionFunc *cb, void *opaque)
2609 {
2610     BlockDriver *drv = bs->drv;
2611
2612     if (drv && drv->bdrv_aio_ioctl)
2613         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
2614     return NULL;
2615 }
2616
2617
2618
2619 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2620 {
2621     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
2622 }
2623
2624 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
2625 {
2626     int64_t bitmap_size;
2627
2628     bs->dirty_count = 0;
2629     if (enable) {
2630         if (!bs->dirty_bitmap) {
2631             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
2632                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
2633             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
2634
2635             bs->dirty_bitmap = qemu_mallocz(bitmap_size);
2636         }
2637     } else {
2638         if (bs->dirty_bitmap) {
2639             qemu_free(bs->dirty_bitmap);
2640             bs->dirty_bitmap = NULL;
2641         }
2642     }
2643 }
2644
2645 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
2646 {
2647     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
2648
2649     if (bs->dirty_bitmap &&
2650         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
2651         return bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
2652             (1 << (chunk % (sizeof(unsigned long) * 8)));
2653     } else {
2654         return 0;
2655     }
2656 }
2657
/*
 * Call set_dirty_bitmap() with a dirty value of 0 for the nr_sectors
 * sectors starting at cur_sector.
 */
2658 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
2659                       int nr_sectors)
2660 {
2661     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
2662 }
2663
/* Return the dirty_count value stored in bs. */
2664 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
2665 {
2666     return bs->dirty_count;
2667 }
This page took 0.160927 seconds and 4 git commands to generate.