]> Git Repo - qemu.git/blame - block.c
block/iscsi: store DPOFUA bit from the modesense command
[qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
737e150e
PB
27#include "block/block_int.h"
28#include "block/blockjob.h"
1de7afc9 29#include "qemu/module.h"
7b1b5d19 30#include "qapi/qmp/qjson.h"
bfb197e0 31#include "sysemu/block-backend.h"
9c17d615 32#include "sysemu/sysemu.h"
de50a20a 33#include "sysemu/qtest.h"
1de7afc9 34#include "qemu/notify.h"
737e150e 35#include "block/coroutine.h"
c13163fb 36#include "block/qapi.h"
b2023818 37#include "qmp-commands.h"
1de7afc9 38#include "qemu/timer.h"
a5ee7bd4 39#include "qapi-event.h"
fc01f7e7 40
71e72a19 41#ifdef CONFIG_BSD
7674e7bf
FB
42#include <sys/types.h>
43#include <sys/stat.h>
44#include <sys/ioctl.h>
72cf2d4f 45#include <sys/queue.h>
c5e97233 46#ifndef __DragonFly__
7674e7bf
FB
47#include <sys/disk.h>
48#endif
c5e97233 49#endif
7674e7bf 50
49dc768d
AL
51#ifdef _WIN32
52#include <windows.h>
53#endif
54
e4654d2d
FZ
55struct BdrvDirtyBitmap {
56 HBitmap *bitmap;
57 QLIST_ENTRY(BdrvDirtyBitmap) list;
58};
59
1c9805a3
SH
60#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
61
7c84b1b8 62static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
f141eafe 63 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 64 BlockCompletionFunc *cb, void *opaque);
7c84b1b8 65static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
f141eafe 66 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 67 BlockCompletionFunc *cb, void *opaque);
f9f05dc5
KW
68static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
69 int64_t sector_num, int nb_sectors,
70 QEMUIOVector *iov);
71static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
72 int64_t sector_num, int nb_sectors,
73 QEMUIOVector *iov);
775aa8b6
KW
74static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
75 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 76 BdrvRequestFlags flags);
775aa8b6
KW
77static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
78 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 79 BdrvRequestFlags flags);
7c84b1b8
MA
80static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
81 int64_t sector_num,
82 QEMUIOVector *qiov,
83 int nb_sectors,
84 BdrvRequestFlags flags,
097310b5 85 BlockCompletionFunc *cb,
7c84b1b8
MA
86 void *opaque,
87 bool is_write);
b2a61371 88static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 89static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 90 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 91
1b7bdbc1
SH
92static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
93 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 94
dc364f4c
BC
95static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
96 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
97
8a22f02a
SH
98static QLIST_HEAD(, BlockDriver) bdrv_drivers =
99 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 100
c4237dfa
VSO
101static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
102 int nr_sectors);
103static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
104 int nr_sectors);
eb852011
MA
105/* If non-zero, use only whitelisted block drivers */
106static int use_bdrv_whitelist;
107
9e0b22f4
SH
108#ifdef _WIN32
/*
 * Return 1 if @filename starts with an ASCII letter immediately followed
 * by a colon (for example "c:" or "D:/dir"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_lower = (letter >= 'a' && letter <= 'z');
    const int is_upper = (letter >= 'A' && letter <= 'Z');

    if (!is_lower && !is_upper) {
        return 0;
    }
    return filename[1] == ':';
}
115
116int is_windows_drive(const char *filename)
117{
118 if (is_windows_drive_prefix(filename) &&
119 filename[2] == '\0')
120 return 1;
121 if (strstart(filename, "\\\\.\\", NULL) ||
122 strstart(filename, "//./", NULL))
123 return 1;
124 return 0;
125}
126#endif
127
0563e191 128/* throttling disk I/O limits */
cc0681c4
BC
129void bdrv_set_io_limits(BlockDriverState *bs,
130 ThrottleConfig *cfg)
98f90dba 131{
cc0681c4 132 int i;
98f90dba 133
cc0681c4 134 throttle_config(&bs->throttle_state, cfg);
98f90dba 135
cc0681c4
BC
136 for (i = 0; i < 2; i++) {
137 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 138 }
cc0681c4
BC
139}
140
141/* this function drain all the throttled IOs */
142static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
143{
144 bool drained = false;
145 bool enabled = bs->io_limits_enabled;
146 int i;
147
148 bs->io_limits_enabled = false;
149
150 for (i = 0; i < 2; i++) {
151 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
152 drained = true;
153 }
154 }
155
156 bs->io_limits_enabled = enabled;
98f90dba 157
cc0681c4 158 return drained;
98f90dba
ZYW
159}
160
cc0681c4 161void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 162{
cc0681c4 163 bs->io_limits_enabled = false;
0563e191 164
cc0681c4
BC
165 bdrv_start_throttled_reqs(bs);
166
167 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
168}
169
cc0681c4 170static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 171{
cc0681c4
BC
172 BlockDriverState *bs = opaque;
173 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
174}
175
cc0681c4 176static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 177{
cc0681c4
BC
178 BlockDriverState *bs = opaque;
179 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
180}
181
cc0681c4
BC
182/* should be called before bdrv_set_io_limits if a limit is set */
183void bdrv_io_limits_enable(BlockDriverState *bs)
184{
de50a20a
FZ
185 int clock_type = QEMU_CLOCK_REALTIME;
186
187 if (qtest_enabled()) {
188 /* For testing block IO throttling only */
189 clock_type = QEMU_CLOCK_VIRTUAL;
190 }
cc0681c4
BC
191 assert(!bs->io_limits_enabled);
192 throttle_init(&bs->throttle_state,
13af91eb 193 bdrv_get_aio_context(bs),
de50a20a 194 clock_type,
cc0681c4
BC
195 bdrv_throttle_read_timer_cb,
196 bdrv_throttle_write_timer_cb,
197 bs);
198 bs->io_limits_enabled = true;
199}
200
201/* This function makes an IO wait if needed
202 *
203 * @nb_sectors: the number of sectors of the IO
204 * @is_write: is the IO a write
205 */
98f90dba 206static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 207 unsigned int bytes,
cc0681c4 208 bool is_write)
98f90dba 209{
cc0681c4
BC
210 /* does this io must wait */
211 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 212
cc0681c4
BC
213 /* if must wait or any request of this type throttled queue the IO */
214 if (must_wait ||
215 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
216 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
217 }
218
cc0681c4 219 /* the IO will be executed, do the accounting */
d5103588
KW
220 throttle_account(&bs->throttle_state, is_write, bytes);
221
98f90dba 222
cc0681c4
BC
223 /* if the next request must wait -> do nothing */
224 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
225 return;
98f90dba
ZYW
226 }
227
cc0681c4
BC
228 /* else queue next request for execution */
229 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
230}
231
339064d5
KW
232size_t bdrv_opt_mem_align(BlockDriverState *bs)
233{
234 if (!bs || !bs->drv) {
235 /* 4k should be on the safe side */
236 return 4096;
237 }
238
239 return bs->bl.opt_mem_alignment;
240}
241
/*
 * Check if the path starts with "<protocol>:", i.e. whether the first
 * colon comes before any path separator.  On Windows, drive
 * specifications are never treated as a protocol.
 *
 * Returns 1 if a protocol prefix is present, 0 otherwise.
 */
int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* look at the first colon or separator, or the terminating NUL */
    return path[strcspn(path, stop_chars)] == ':';
}
259
/*
 * Return 1 if @path is absolute, 0 otherwise.  A path is absolute when it
 * starts with '/'; on Windows a leading backslash and any drive
 * specification count as absolute too.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* drive specifications, with or without the device prefix */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (first == '/' || first == '\\');
#else
    return (first == '/');
#endif
}
272
83f64091
FB
/*
 * Build a path for @filename in @dest (capacity @dest_size bytes).
 *
 * If @filename is absolute it is copied unchanged.  Otherwise it is taken
 * relative to @base_path: everything of @base_path up to and including
 * its last directory separator (or, if that comes later, its first ':')
 * is kept and @filename is appended.  URLs are supported.
 *
 * Nothing is written when @dest_size is not positive; the result is
 * truncated to fit @dest.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' of the base, if there is one */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last directory separator of the base */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever prefix is longer, clamped to the destination */
    prefix_len = (after_sep > after_colon ? after_sep : after_colon)
                 - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
316
0a82855a
HR
317void bdrv_get_full_backing_filename_from_filename(const char *backed,
318 const char *backing,
9f07429e
HR
319 char *dest, size_t sz,
320 Error **errp)
dc5a1371 321{
9f07429e
HR
322 if (backing[0] == '\0' || path_has_protocol(backing) ||
323 path_is_absolute(backing))
324 {
0a82855a 325 pstrcpy(dest, sz, backing);
9f07429e
HR
326 } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
327 error_setg(errp, "Cannot use relative backing file names for '%s'",
328 backed);
dc5a1371 329 } else {
0a82855a 330 path_combine(dest, sz, backed, backing);
dc5a1371
PB
331 }
332}
333
9f07429e
HR
334void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
335 Error **errp)
0a82855a 336{
9f07429e
HR
337 char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;
338
339 bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
340 dest, sz, errp);
0a82855a
HR
341}
342
5efa9d5a 343void bdrv_register(BlockDriver *bdrv)
ea2384d3 344{
8c5873d6
SH
345 /* Block drivers without coroutine functions need emulation */
346 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
347 bdrv->bdrv_co_readv = bdrv_co_readv_em;
348 bdrv->bdrv_co_writev = bdrv_co_writev_em;
349
f8c35c1d
SH
350 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
351 * the block driver lacks aio we need to emulate that too.
352 */
f9f05dc5
KW
353 if (!bdrv->bdrv_aio_readv) {
354 /* add AIO emulation layer */
355 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
356 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 357 }
83f64091 358 }
b2e12bc6 359
8a22f02a 360 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 361}
b338082b 362
7f06d47e 363BlockDriverState *bdrv_new_root(void)
b338082b 364{
7f06d47e 365 BlockDriverState *bs = bdrv_new();
e4e9986b 366
e4e9986b 367 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
e4e9986b
MA
368 return bs;
369}
370
371BlockDriverState *bdrv_new(void)
372{
373 BlockDriverState *bs;
374 int i;
375
5839e53b 376 bs = g_new0(BlockDriverState, 1);
e4654d2d 377 QLIST_INIT(&bs->dirty_bitmaps);
fbe40ff7
FZ
378 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
379 QLIST_INIT(&bs->op_blockers[i]);
380 }
28a7282a 381 bdrv_iostatus_disable(bs);
d7d512f6 382 notifier_list_init(&bs->close_notifiers);
d616b224 383 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
384 qemu_co_queue_init(&bs->throttled_reqs[0]);
385 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 386 bs->refcnt = 1;
dcd04228 387 bs->aio_context = qemu_get_aio_context();
d7d512f6 388
b338082b
FB
389 return bs;
390}
391
d7d512f6
PB
392void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
393{
394 notifier_list_add(&bs->close_notifiers, notify);
395}
396
ea2384d3
FB
397BlockDriver *bdrv_find_format(const char *format_name)
398{
399 BlockDriver *drv1;
8a22f02a
SH
400 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
401 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 402 return drv1;
8a22f02a 403 }
ea2384d3
FB
404 }
405 return NULL;
406}
407
b64ec4e4 408static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 409{
b64ec4e4
FZ
410 static const char *whitelist_rw[] = {
411 CONFIG_BDRV_RW_WHITELIST
412 };
413 static const char *whitelist_ro[] = {
414 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
415 };
416 const char **p;
417
b64ec4e4 418 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 419 return 1; /* no whitelist, anything goes */
b64ec4e4 420 }
eb852011 421
b64ec4e4 422 for (p = whitelist_rw; *p; p++) {
eb852011
MA
423 if (!strcmp(drv->format_name, *p)) {
424 return 1;
425 }
426 }
b64ec4e4
FZ
427 if (read_only) {
428 for (p = whitelist_ro; *p; p++) {
429 if (!strcmp(drv->format_name, *p)) {
430 return 1;
431 }
432 }
433 }
eb852011
MA
434 return 0;
435}
436
b64ec4e4
FZ
437BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
438 bool read_only)
eb852011
MA
439{
440 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 441 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
442}
443
5b7e1542
ZYW
444typedef struct CreateCo {
445 BlockDriver *drv;
446 char *filename;
83d0521a 447 QemuOpts *opts;
5b7e1542 448 int ret;
cc84d90f 449 Error *err;
5b7e1542
ZYW
450} CreateCo;
451
452static void coroutine_fn bdrv_create_co_entry(void *opaque)
453{
cc84d90f
HR
454 Error *local_err = NULL;
455 int ret;
456
5b7e1542
ZYW
457 CreateCo *cco = opaque;
458 assert(cco->drv);
459
c282e1fd 460 ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
84d18f06 461 if (local_err) {
cc84d90f
HR
462 error_propagate(&cco->err, local_err);
463 }
464 cco->ret = ret;
5b7e1542
ZYW
465}
466
0e7e1989 467int bdrv_create(BlockDriver *drv, const char* filename,
83d0521a 468 QemuOpts *opts, Error **errp)
ea2384d3 469{
5b7e1542
ZYW
470 int ret;
471
472 Coroutine *co;
473 CreateCo cco = {
474 .drv = drv,
475 .filename = g_strdup(filename),
83d0521a 476 .opts = opts,
5b7e1542 477 .ret = NOT_DONE,
cc84d90f 478 .err = NULL,
5b7e1542
ZYW
479 };
480
c282e1fd 481 if (!drv->bdrv_create) {
cc84d90f 482 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
483 ret = -ENOTSUP;
484 goto out;
5b7e1542
ZYW
485 }
486
487 if (qemu_in_coroutine()) {
488 /* Fast-path if already in coroutine context */
489 bdrv_create_co_entry(&cco);
490 } else {
491 co = qemu_coroutine_create(bdrv_create_co_entry);
492 qemu_coroutine_enter(co, &cco);
493 while (cco.ret == NOT_DONE) {
b47ec2c4 494 aio_poll(qemu_get_aio_context(), true);
5b7e1542
ZYW
495 }
496 }
497
498 ret = cco.ret;
cc84d90f 499 if (ret < 0) {
84d18f06 500 if (cco.err) {
cc84d90f
HR
501 error_propagate(errp, cco.err);
502 } else {
503 error_setg_errno(errp, -ret, "Could not create image");
504 }
505 }
0e7e1989 506
80168bff
LC
507out:
508 g_free(cco.filename);
5b7e1542 509 return ret;
ea2384d3
FB
510}
511
c282e1fd 512int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
84a12e66
CH
513{
514 BlockDriver *drv;
cc84d90f
HR
515 Error *local_err = NULL;
516 int ret;
84a12e66 517
b65a5e12 518 drv = bdrv_find_protocol(filename, true, errp);
84a12e66 519 if (drv == NULL) {
16905d71 520 return -ENOENT;
84a12e66
CH
521 }
522
c282e1fd 523 ret = bdrv_create(drv, filename, opts, &local_err);
84d18f06 524 if (local_err) {
cc84d90f
HR
525 error_propagate(errp, local_err);
526 }
527 return ret;
84a12e66
CH
528}
529
3baca891 530void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
d34682cd
KW
531{
532 BlockDriver *drv = bs->drv;
3baca891 533 Error *local_err = NULL;
d34682cd
KW
534
535 memset(&bs->bl, 0, sizeof(bs->bl));
536
466ad822 537 if (!drv) {
3baca891 538 return;
466ad822
KW
539 }
540
541 /* Take some limits from the children as a default */
542 if (bs->file) {
3baca891
KW
543 bdrv_refresh_limits(bs->file, &local_err);
544 if (local_err) {
545 error_propagate(errp, local_err);
546 return;
547 }
466ad822 548 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
2647fab5 549 bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
339064d5
KW
550 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
551 } else {
552 bs->bl.opt_mem_alignment = 512;
466ad822
KW
553 }
554
555 if (bs->backing_hd) {
3baca891
KW
556 bdrv_refresh_limits(bs->backing_hd, &local_err);
557 if (local_err) {
558 error_propagate(errp, local_err);
559 return;
560 }
466ad822
KW
561 bs->bl.opt_transfer_length =
562 MAX(bs->bl.opt_transfer_length,
563 bs->backing_hd->bl.opt_transfer_length);
2647fab5
PL
564 bs->bl.max_transfer_length =
565 MIN_NON_ZERO(bs->bl.max_transfer_length,
566 bs->backing_hd->bl.max_transfer_length);
339064d5
KW
567 bs->bl.opt_mem_alignment =
568 MAX(bs->bl.opt_mem_alignment,
569 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
570 }
571
572 /* Then let the driver override it */
573 if (drv->bdrv_refresh_limits) {
3baca891 574 drv->bdrv_refresh_limits(bs, errp);
d34682cd 575 }
d34682cd
KW
576}
577
892b7de8
ET
578/**
579 * Try to get @bs's logical and physical block size.
580 * On success, store them in @bsz struct and return 0.
581 * On failure return -errno.
582 * @bs must not be empty.
583 */
584int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
585{
586 BlockDriver *drv = bs->drv;
587
588 if (drv && drv->bdrv_probe_blocksizes) {
589 return drv->bdrv_probe_blocksizes(bs, bsz);
590 }
591
592 return -ENOTSUP;
593}
594
595/**
596 * Try to get @bs's geometry (cyls, heads, sectors).
597 * On success, store them in @geo struct and return 0.
598 * On failure return -errno.
599 * @bs must not be empty.
600 */
601int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
602{
603 BlockDriver *drv = bs->drv;
604
605 if (drv && drv->bdrv_probe_geometry) {
606 return drv->bdrv_probe_geometry(bs, geo);
607 }
608
609 return -ENOTSUP;
610}
611
eba25057
JM
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer that receives the NUL-terminated name of the file
 * @size:     capacity of @filename in bytes (must be at least MAX_PATH on
 *            Windows)
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /var/tmp if
 * TMPDIR is not set.
 *
 * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW if
 * the name does not fit into @filename).
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so remember why close() failed */
        int close_errno = errno;

        unlink(filename);
        return -close_errno;
    }
    return 0;
#endif
}
fc01f7e7 647
84a12e66
CH
648/*
649 * Detect host devices. By convention, /dev/cdrom[N] is always
650 * recognized as a host CDROM.
651 */
652static BlockDriver *find_hdev_driver(const char *filename)
653{
654 int score_max = 0, score;
655 BlockDriver *drv = NULL, *d;
656
657 QLIST_FOREACH(d, &bdrv_drivers, list) {
658 if (d->bdrv_probe_device) {
659 score = d->bdrv_probe_device(filename);
660 if (score > score_max) {
661 score_max = score;
662 drv = d;
663 }
664 }
665 }
666
667 return drv;
668}
669
98289620 670BlockDriver *bdrv_find_protocol(const char *filename,
b65a5e12
HR
671 bool allow_protocol_prefix,
672 Error **errp)
83f64091
FB
673{
674 BlockDriver *drv1;
675 char protocol[128];
1cec71e3 676 int len;
83f64091 677 const char *p;
19cb3738 678
66f82cee
KW
679 /* TODO Drivers without bdrv_file_open must be specified explicitly */
680
39508e7a
CH
681 /*
682 * XXX(hch): we really should not let host device detection
683 * override an explicit protocol specification, but moving this
684 * later breaks access to device names with colons in them.
685 * Thanks to the brain-dead persistent naming schemes on udev-
686 * based Linux systems those actually are quite common.
687 */
688 drv1 = find_hdev_driver(filename);
689 if (drv1) {
690 return drv1;
691 }
692
98289620 693 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
ef810437 694 return &bdrv_file;
84a12e66 695 }
98289620 696
9e0b22f4
SH
697 p = strchr(filename, ':');
698 assert(p != NULL);
1cec71e3
AL
699 len = p - filename;
700 if (len > sizeof(protocol) - 1)
701 len = sizeof(protocol) - 1;
702 memcpy(protocol, filename, len);
703 protocol[len] = '\0';
8a22f02a 704 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 705 if (drv1->protocol_name &&
8a22f02a 706 !strcmp(drv1->protocol_name, protocol)) {
83f64091 707 return drv1;
8a22f02a 708 }
83f64091 709 }
b65a5e12
HR
710
711 error_setg(errp, "Unknown protocol '%s'", protocol);
83f64091
FB
712 return NULL;
713}
714
c6684249
MA
715/*
716 * Guess image format by probing its contents.
717 * This is not a good idea when your image is raw (CVE-2008-2004), but
718 * we do it anyway for backward compatibility.
719 *
720 * @buf contains the image's first @buf_size bytes.
7cddd372
KW
721 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
722 * but can be smaller if the image file is smaller)
c6684249
MA
723 * @filename is its filename.
724 *
725 * For all block drivers, call the bdrv_probe() method to get its
726 * probing score.
727 * Return the first block driver with the highest probing score.
728 */
38f3ef57
KW
729BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
730 const char *filename)
c6684249
MA
731{
732 int score_max = 0, score;
733 BlockDriver *drv = NULL, *d;
734
735 QLIST_FOREACH(d, &bdrv_drivers, list) {
736 if (d->bdrv_probe) {
737 score = d->bdrv_probe(buf, buf_size, filename);
738 if (score > score_max) {
739 score_max = score;
740 drv = d;
741 }
742 }
743 }
744
745 return drv;
746}
747
f500a6d3 748static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 749 BlockDriver **pdrv, Error **errp)
f3a5d3f8 750{
c6684249 751 BlockDriver *drv;
7cddd372 752 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
f500a6d3 753 int ret = 0;
f8ea0b00 754
08a00559 755 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 756 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
ef810437 757 *pdrv = &bdrv_raw;
c98ac35d 758 return ret;
1a396859 759 }
f8ea0b00 760
83f64091 761 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 762 if (ret < 0) {
34b5d2c6
HR
763 error_setg_errno(errp, -ret, "Could not read image for determining its "
764 "format");
c98ac35d
SW
765 *pdrv = NULL;
766 return ret;
83f64091
FB
767 }
768
c6684249 769 drv = bdrv_probe_all(buf, ret, filename);
c98ac35d 770 if (!drv) {
34b5d2c6
HR
771 error_setg(errp, "Could not determine image format: No compatible "
772 "driver found");
c98ac35d
SW
773 ret = -ENOENT;
774 }
775 *pdrv = drv;
776 return ret;
ea2384d3
FB
777}
778
51762288
SH
779/**
780 * Set the current 'total_sectors' value
65a9bb25 781 * Return 0 on success, -errno on error.
51762288
SH
782 */
783static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
784{
785 BlockDriver *drv = bs->drv;
786
396759ad
NB
787 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
788 if (bs->sg)
789 return 0;
790
51762288
SH
791 /* query actual device if possible, otherwise just trust the hint */
792 if (drv->bdrv_getlength) {
793 int64_t length = drv->bdrv_getlength(bs);
794 if (length < 0) {
795 return length;
796 }
7e382003 797 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
798 }
799
800 bs->total_sectors = hint;
801 return 0;
802}
803
9e8f1835
PB
804/**
805 * Set open flags for a given discard mode
806 *
807 * Return 0 on success, -1 if the discard mode was invalid.
808 */
809int bdrv_parse_discard_flags(const char *mode, int *flags)
810{
811 *flags &= ~BDRV_O_UNMAP;
812
813 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
814 /* do nothing */
815 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
816 *flags |= BDRV_O_UNMAP;
817 } else {
818 return -1;
819 }
820
821 return 0;
822}
823
c3993cdc
SH
824/**
825 * Set open flags for a given cache mode
826 *
827 * Return 0 on success, -1 if the cache mode was invalid.
828 */
829int bdrv_parse_cache_flags(const char *mode, int *flags)
830{
831 *flags &= ~BDRV_O_CACHE_MASK;
832
833 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
834 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
835 } else if (!strcmp(mode, "directsync")) {
836 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
837 } else if (!strcmp(mode, "writeback")) {
838 *flags |= BDRV_O_CACHE_WB;
839 } else if (!strcmp(mode, "unsafe")) {
840 *flags |= BDRV_O_CACHE_WB;
841 *flags |= BDRV_O_NO_FLUSH;
842 } else if (!strcmp(mode, "writethrough")) {
843 /* this is the default */
844 } else {
845 return -1;
846 }
847
848 return 0;
849}
850
53fec9d3
SH
851/**
852 * The copy-on-read flag is actually a reference count so multiple users may
853 * use the feature without worrying about clobbering its previous state.
854 * Copy-on-read stays enabled until all users have called to disable it.
855 */
856void bdrv_enable_copy_on_read(BlockDriverState *bs)
857{
858 bs->copy_on_read++;
859}
860
861void bdrv_disable_copy_on_read(BlockDriverState *bs)
862{
863 assert(bs->copy_on_read > 0);
864 bs->copy_on_read--;
865}
866
b1e6fc08
KW
867/*
868 * Returns the flags that a temporary snapshot should get, based on the
869 * originally requested flags (the originally requested image will have flags
870 * like a backing file)
871 */
872static int bdrv_temp_snapshot_flags(int flags)
873{
874 return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
875}
876
0b50cc88
KW
877/*
878 * Returns the flags that bs->file should get, based on the given flags for
879 * the parent BDS
880 */
881static int bdrv_inherited_flags(int flags)
882{
883 /* Enable protocol handling, disable format probing for bs->file */
884 flags |= BDRV_O_PROTOCOL;
885
886 /* Our block drivers take care to send flushes and respect unmap policy,
887 * so we can enable both unconditionally on lower layers. */
888 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
889
0b50cc88 890 /* Clear flags that only apply to the top layer */
5669b44d 891 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
0b50cc88
KW
892
893 return flags;
894}
895
317fc44e
KW
896/*
897 * Returns the flags that bs->backing_hd should get, based on the given flags
898 * for the parent BDS
899 */
900static int bdrv_backing_flags(int flags)
901{
902 /* backing files always opened read-only */
903 flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
904
905 /* snapshot=on is handled on the top layer */
8bfea15d 906 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
317fc44e
KW
907
908 return flags;
909}
910
7b272452
KW
911static int bdrv_open_flags(BlockDriverState *bs, int flags)
912{
913 int open_flags = flags | BDRV_O_CACHE_WB;
914
915 /*
916 * Clear flags that are internal to the block layer before opening the
917 * image.
918 */
20cca275 919 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
7b272452
KW
920
921 /*
922 * Snapshots should be writable.
923 */
8bfea15d 924 if (flags & BDRV_O_TEMPORARY) {
7b272452
KW
925 open_flags |= BDRV_O_RDWR;
926 }
927
928 return open_flags;
929}
930
636ea370
KW
931static void bdrv_assign_node_name(BlockDriverState *bs,
932 const char *node_name,
933 Error **errp)
6913c0c2
BC
934{
935 if (!node_name) {
636ea370 936 return;
6913c0c2
BC
937 }
938
9aebf3b8 939 /* Check for empty string or invalid characters */
f5bebbbb 940 if (!id_wellformed(node_name)) {
9aebf3b8 941 error_setg(errp, "Invalid node name");
636ea370 942 return;
6913c0c2
BC
943 }
944
0c5e94ee 945 /* takes care of avoiding namespaces collisions */
7f06d47e 946 if (blk_by_name(node_name)) {
0c5e94ee
BC
947 error_setg(errp, "node-name=%s is conflicting with a device id",
948 node_name);
636ea370 949 return;
0c5e94ee
BC
950 }
951
6913c0c2
BC
952 /* takes care of avoiding duplicates node names */
953 if (bdrv_find_node(node_name)) {
954 error_setg(errp, "Duplicate node name");
636ea370 955 return;
6913c0c2
BC
956 }
957
958 /* copy node name into the bs and insert it into the graph list */
959 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
960 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
961}
962
57915332
KW
963/*
964 * Common part for opening disk images and files
b6ad491a
KW
965 *
966 * Removes all processed options from *options.
57915332 967 */
f500a6d3 968static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
34b5d2c6 969 QDict *options, int flags, BlockDriver *drv, Error **errp)
57915332
KW
970{
971 int ret, open_flags;
035fccdf 972 const char *filename;
6913c0c2 973 const char *node_name = NULL;
34b5d2c6 974 Error *local_err = NULL;
57915332
KW
975
976 assert(drv != NULL);
6405875c 977 assert(bs->file == NULL);
707ff828 978 assert(options != NULL && bs->options != options);
57915332 979
45673671
KW
980 if (file != NULL) {
981 filename = file->filename;
982 } else {
983 filename = qdict_get_try_str(options, "filename");
984 }
985
765003db
KW
986 if (drv->bdrv_needs_filename && !filename) {
987 error_setg(errp, "The '%s' block driver requires a file name",
988 drv->format_name);
989 return -EINVAL;
990 }
991
45673671 992 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
28dcee10 993
6913c0c2 994 node_name = qdict_get_try_str(options, "node-name");
636ea370 995 bdrv_assign_node_name(bs, node_name, &local_err);
0fb6395c 996 if (local_err) {
636ea370
KW
997 error_propagate(errp, local_err);
998 return -EINVAL;
6913c0c2
BC
999 }
1000 qdict_del(options, "node-name");
1001
5d186eb0
KW
1002 /* bdrv_open() with directly using a protocol as drv. This layer is already
1003 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
1004 * and return immediately. */
1005 if (file != NULL && drv->bdrv_file_open) {
1006 bdrv_swap(file, bs);
1007 return 0;
1008 }
1009
57915332 1010 bs->open_flags = flags;
1b7fd729 1011 bs->guest_block_size = 512;
c25f53b0 1012 bs->request_alignment = 512;
0d51b4de 1013 bs->zero_beyond_eof = true;
b64ec4e4
FZ
1014 open_flags = bdrv_open_flags(bs, flags);
1015 bs->read_only = !(open_flags & BDRV_O_RDWR);
1016
1017 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
8f94a6e4
KW
1018 error_setg(errp,
1019 !bs->read_only && bdrv_is_whitelisted(drv, true)
1020 ? "Driver '%s' can only be used for read-only devices"
1021 : "Driver '%s' is not whitelisted",
1022 drv->format_name);
b64ec4e4
FZ
1023 return -ENOTSUP;
1024 }
57915332 1025
53fec9d3 1026 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
0ebd24e0
KW
1027 if (flags & BDRV_O_COPY_ON_READ) {
1028 if (!bs->read_only) {
1029 bdrv_enable_copy_on_read(bs);
1030 } else {
1031 error_setg(errp, "Can't use copy-on-read on read-only device");
1032 return -EINVAL;
1033 }
53fec9d3
SH
1034 }
1035
c2ad1b0c
KW
1036 if (filename != NULL) {
1037 pstrcpy(bs->filename, sizeof(bs->filename), filename);
1038 } else {
1039 bs->filename[0] = '\0';
1040 }
91af7014 1041 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
57915332 1042
57915332 1043 bs->drv = drv;
7267c094 1044 bs->opaque = g_malloc0(drv->instance_size);
57915332 1045
03f541bd 1046 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e7c63796 1047
66f82cee
KW
1048 /* Open the image, either directly or using a protocol */
1049 if (drv->bdrv_file_open) {
5d186eb0 1050 assert(file == NULL);
030be321 1051 assert(!drv->bdrv_needs_filename || filename != NULL);
34b5d2c6 1052 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
f500a6d3 1053 } else {
2af5ef70 1054 if (file == NULL) {
34b5d2c6
HR
1055 error_setg(errp, "Can't use '%s' as a block driver for the "
1056 "protocol level", drv->format_name);
2af5ef70
KW
1057 ret = -EINVAL;
1058 goto free_and_fail;
1059 }
f500a6d3 1060 bs->file = file;
34b5d2c6 1061 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
66f82cee
KW
1062 }
1063
57915332 1064 if (ret < 0) {
84d18f06 1065 if (local_err) {
34b5d2c6 1066 error_propagate(errp, local_err);
2fa9aa59
DH
1067 } else if (bs->filename[0]) {
1068 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
34b5d2c6
HR
1069 } else {
1070 error_setg_errno(errp, -ret, "Could not open image");
1071 }
57915332
KW
1072 goto free_and_fail;
1073 }
1074
a1f688f4
MA
1075 if (bs->encrypted) {
1076 error_report("Encrypted images are deprecated");
1077 error_printf("Support for them will be removed in a future release.\n"
1078 "You can use 'qemu-img convert' to convert your image"
1079 " to an unencrypted one.\n");
1080 }
1081
51762288
SH
1082 ret = refresh_total_sectors(bs, bs->total_sectors);
1083 if (ret < 0) {
34b5d2c6 1084 error_setg_errno(errp, -ret, "Could not refresh total sector count");
51762288 1085 goto free_and_fail;
57915332 1086 }
51762288 1087
3baca891
KW
1088 bdrv_refresh_limits(bs, &local_err);
1089 if (local_err) {
1090 error_propagate(errp, local_err);
1091 ret = -EINVAL;
1092 goto free_and_fail;
1093 }
1094
c25f53b0 1095 assert(bdrv_opt_mem_align(bs) != 0);
47ea2de2 1096 assert((bs->request_alignment != 0) || bs->sg);
57915332
KW
1097 return 0;
1098
1099free_and_fail:
f500a6d3 1100 bs->file = NULL;
7267c094 1101 g_free(bs->opaque);
57915332
KW
1102 bs->opaque = NULL;
1103 bs->drv = NULL;
1104 return ret;
1105}
1106
5e5c4f63
KW
1107static QDict *parse_json_filename(const char *filename, Error **errp)
1108{
1109 QObject *options_obj;
1110 QDict *options;
1111 int ret;
1112
1113 ret = strstart(filename, "json:", &filename);
1114 assert(ret);
1115
1116 options_obj = qobject_from_json(filename);
1117 if (!options_obj) {
1118 error_setg(errp, "Could not parse the JSON options");
1119 return NULL;
1120 }
1121
1122 if (qobject_type(options_obj) != QTYPE_QDICT) {
1123 qobject_decref(options_obj);
1124 error_setg(errp, "Invalid JSON object given");
1125 return NULL;
1126 }
1127
1128 options = qobject_to_qdict(options_obj);
1129 qdict_flatten(options);
1130
1131 return options;
1132}
1133
b6ce07aa 1134/*
f54120ff
KW
1135 * Fills in default options for opening images and converts the legacy
1136 * filename/flags pair to option QDict entries.
b6ce07aa 1137 */
5e5c4f63 1138static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
17b005f1 1139 BlockDriver *drv, Error **errp)
ea2384d3 1140{
5e5c4f63 1141 const char *filename = *pfilename;
c2ad1b0c 1142 const char *drvname;
462f5bcf 1143 bool protocol = flags & BDRV_O_PROTOCOL;
e3fa4bfa 1144 bool parse_filename = false;
34b5d2c6 1145 Error *local_err = NULL;
83f64091 1146
5e5c4f63
KW
1147 /* Parse json: pseudo-protocol */
1148 if (filename && g_str_has_prefix(filename, "json:")) {
1149 QDict *json_options = parse_json_filename(filename, &local_err);
1150 if (local_err) {
1151 error_propagate(errp, local_err);
1152 return -EINVAL;
1153 }
1154
1155 /* Options given in the filename have lower priority than options
1156 * specified directly */
1157 qdict_join(*options, json_options, false);
1158 QDECREF(json_options);
1159 *pfilename = filename = NULL;
1160 }
1161
035fccdf 1162 /* Fetch the file name from the options QDict if necessary */
17b005f1 1163 if (protocol && filename) {
f54120ff
KW
1164 if (!qdict_haskey(*options, "filename")) {
1165 qdict_put(*options, "filename", qstring_from_str(filename));
1166 parse_filename = true;
1167 } else {
1168 error_setg(errp, "Can't specify 'file' and 'filename' options at "
1169 "the same time");
1170 return -EINVAL;
1171 }
035fccdf
KW
1172 }
1173
c2ad1b0c 1174 /* Find the right block driver */
f54120ff 1175 filename = qdict_get_try_str(*options, "filename");
5acd9d81 1176 drvname = qdict_get_try_str(*options, "driver");
f54120ff 1177
17b005f1
KW
1178 if (drv) {
1179 if (drvname) {
1180 error_setg(errp, "Driver specified twice");
1181 return -EINVAL;
1182 }
1183 drvname = drv->format_name;
1184 qdict_put(*options, "driver", qstring_from_str(drvname));
1185 } else {
1186 if (!drvname && protocol) {
1187 if (filename) {
b65a5e12 1188 drv = bdrv_find_protocol(filename, parse_filename, errp);
17b005f1 1189 if (!drv) {
17b005f1
KW
1190 return -EINVAL;
1191 }
1192
1193 drvname = drv->format_name;
1194 qdict_put(*options, "driver", qstring_from_str(drvname));
1195 } else {
1196 error_setg(errp, "Must specify either driver or file");
f54120ff
KW
1197 return -EINVAL;
1198 }
17b005f1
KW
1199 } else if (drvname) {
1200 drv = bdrv_find_format(drvname);
1201 if (!drv) {
1202 error_setg(errp, "Unknown driver '%s'", drvname);
1203 return -ENOENT;
1204 }
98289620 1205 }
c2ad1b0c
KW
1206 }
1207
17b005f1 1208 assert(drv || !protocol);
c2ad1b0c 1209
f54120ff 1210 /* Driver-specific filename parsing */
17b005f1 1211 if (drv && drv->bdrv_parse_filename && parse_filename) {
5acd9d81 1212 drv->bdrv_parse_filename(filename, *options, &local_err);
84d18f06 1213 if (local_err) {
34b5d2c6 1214 error_propagate(errp, local_err);
f54120ff 1215 return -EINVAL;
6963a30d 1216 }
cd5d031e
HR
1217
1218 if (!drv->bdrv_needs_filename) {
1219 qdict_del(*options, "filename");
cd5d031e 1220 }
6963a30d
KW
1221 }
1222
f54120ff
KW
1223 return 0;
1224}
1225
8d24cce1
FZ
1226void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1227{
1228
826b6ca0
FZ
1229 if (bs->backing_hd) {
1230 assert(bs->backing_blocker);
1231 bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1232 } else if (backing_hd) {
1233 error_setg(&bs->backing_blocker,
81e5f78a
AG
1234 "node is used as backing hd of '%s'",
1235 bdrv_get_device_or_node_name(bs));
826b6ca0
FZ
1236 }
1237
8d24cce1
FZ
1238 bs->backing_hd = backing_hd;
1239 if (!backing_hd) {
826b6ca0
FZ
1240 error_free(bs->backing_blocker);
1241 bs->backing_blocker = NULL;
8d24cce1
FZ
1242 goto out;
1243 }
1244 bs->open_flags &= ~BDRV_O_NO_BACKING;
1245 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1246 pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1247 backing_hd->drv ? backing_hd->drv->format_name : "");
826b6ca0
FZ
1248
1249 bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1250 /* Otherwise we won't be able to commit due to check in bdrv_commit */
bb00021d 1251 bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
826b6ca0 1252 bs->backing_blocker);
8d24cce1 1253out:
3baca891 1254 bdrv_refresh_limits(bs, NULL);
8d24cce1
FZ
1255}
1256
31ca6d07
KW
1257/*
1258 * Opens the backing file for a BlockDriverState if not yet open
1259 *
1260 * options is a QDict of options to pass to the block drivers, or NULL for an
1261 * empty set of options. The reference to the QDict is transferred to this
1262 * function (even on failure), so if the caller intends to reuse the dictionary,
1263 * it needs to use QINCREF() before calling bdrv_file_open.
1264 */
34b5d2c6 1265int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
9156df12 1266{
1ba4b6a5 1267 char *backing_filename = g_malloc0(PATH_MAX);
317fc44e 1268 int ret = 0;
8d24cce1 1269 BlockDriverState *backing_hd;
34b5d2c6 1270 Error *local_err = NULL;
9156df12
PB
1271
1272 if (bs->backing_hd != NULL) {
31ca6d07 1273 QDECREF(options);
1ba4b6a5 1274 goto free_exit;
9156df12
PB
1275 }
1276
31ca6d07
KW
1277 /* NULL means an empty set of options */
1278 if (options == NULL) {
1279 options = qdict_new();
1280 }
1281
9156df12 1282 bs->open_flags &= ~BDRV_O_NO_BACKING;
1cb6f506
KW
1283 if (qdict_haskey(options, "file.filename")) {
1284 backing_filename[0] = '\0';
1285 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
31ca6d07 1286 QDECREF(options);
1ba4b6a5 1287 goto free_exit;
dbecebdd 1288 } else {
9f07429e
HR
1289 bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
1290 &local_err);
1291 if (local_err) {
1292 ret = -EINVAL;
1293 error_propagate(errp, local_err);
1294 QDECREF(options);
1295 goto free_exit;
1296 }
9156df12
PB
1297 }
1298
8ee79e70
KW
1299 if (!bs->drv || !bs->drv->supports_backing) {
1300 ret = -EINVAL;
1301 error_setg(errp, "Driver doesn't support backing files");
1302 QDECREF(options);
1303 goto free_exit;
1304 }
1305
e4e9986b 1306 backing_hd = bdrv_new();
8d24cce1 1307
c5f6e493
KW
1308 if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
1309 qdict_put(options, "driver", qstring_from_str(bs->backing_format));
9156df12
PB
1310 }
1311
f67503e5 1312 assert(bs->backing_hd == NULL);
8d24cce1 1313 ret = bdrv_open(&backing_hd,
ddf5636d 1314 *backing_filename ? backing_filename : NULL, NULL, options,
c5f6e493 1315 bdrv_backing_flags(bs->open_flags), NULL, &local_err);
9156df12 1316 if (ret < 0) {
8d24cce1
FZ
1317 bdrv_unref(backing_hd);
1318 backing_hd = NULL;
9156df12 1319 bs->open_flags |= BDRV_O_NO_BACKING;
b04b6b6e
FZ
1320 error_setg(errp, "Could not open backing file: %s",
1321 error_get_pretty(local_err));
1322 error_free(local_err);
1ba4b6a5 1323 goto free_exit;
9156df12 1324 }
8d24cce1 1325 bdrv_set_backing_hd(bs, backing_hd);
d80ac658 1326
1ba4b6a5
BC
1327free_exit:
1328 g_free(backing_filename);
1329 return ret;
9156df12
PB
1330}
1331
da557aac
HR
1332/*
1333 * Opens a disk image whose options are given as BlockdevRef in another block
1334 * device's options.
1335 *
da557aac
HR
1336 * If allow_none is true, no image will be opened if filename is false and no
1337 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1338 *
1339 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1340 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1341 * itself, all options starting with "${bdref_key}." are considered part of the
1342 * BlockdevRef.
1343 *
1344 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1345 *
1346 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1347 */
1348int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1349 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1350 bool allow_none, Error **errp)
da557aac
HR
1351{
1352 QDict *image_options;
1353 int ret;
1354 char *bdref_key_dot;
1355 const char *reference;
1356
f67503e5
HR
1357 assert(pbs);
1358 assert(*pbs == NULL);
1359
da557aac
HR
1360 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1361 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1362 g_free(bdref_key_dot);
1363
1364 reference = qdict_get_try_str(options, bdref_key);
1365 if (!filename && !reference && !qdict_size(image_options)) {
1366 if (allow_none) {
1367 ret = 0;
1368 } else {
1369 error_setg(errp, "A block device must be specified for \"%s\"",
1370 bdref_key);
1371 ret = -EINVAL;
1372 }
b20e61e0 1373 QDECREF(image_options);
da557aac
HR
1374 goto done;
1375 }
1376
f7d9fd8c 1377 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1378
1379done:
1380 qdict_del(options, bdref_key);
1381 return ret;
1382}
1383
6b8aeca5 1384int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
b998875d
KW
1385{
1386 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1387 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d 1388 int64_t total_size;
83d0521a 1389 QemuOpts *opts = NULL;
b998875d
KW
1390 QDict *snapshot_options;
1391 BlockDriverState *bs_snapshot;
1392 Error *local_err;
1393 int ret;
1394
1395 /* if snapshot, we create a temporary backing file and open it
1396 instead of opening 'filename' directly */
1397
1398 /* Get the required size from the image */
f187743a
KW
1399 total_size = bdrv_getlength(bs);
1400 if (total_size < 0) {
6b8aeca5 1401 ret = total_size;
f187743a 1402 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1403 goto out;
f187743a 1404 }
b998875d
KW
1405
1406 /* Create the temporary image */
1ba4b6a5 1407 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1408 if (ret < 0) {
1409 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1410 goto out;
b998875d
KW
1411 }
1412
ef810437 1413 opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
c282e1fd 1414 &error_abort);
39101f25 1415 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
ef810437 1416 ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
83d0521a 1417 qemu_opts_del(opts);
b998875d
KW
1418 if (ret < 0) {
1419 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1420 "'%s': %s", tmp_filename,
1421 error_get_pretty(local_err));
1422 error_free(local_err);
1ba4b6a5 1423 goto out;
b998875d
KW
1424 }
1425
1426 /* Prepare a new options QDict for the temporary file */
1427 snapshot_options = qdict_new();
1428 qdict_put(snapshot_options, "file.driver",
1429 qstring_from_str("file"));
1430 qdict_put(snapshot_options, "file.filename",
1431 qstring_from_str(tmp_filename));
1432
e4e9986b 1433 bs_snapshot = bdrv_new();
b998875d
KW
1434
1435 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
ef810437 1436 flags, &bdrv_qcow2, &local_err);
b998875d
KW
1437 if (ret < 0) {
1438 error_propagate(errp, local_err);
1ba4b6a5 1439 goto out;
b998875d
KW
1440 }
1441
1442 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1443
1444out:
1445 g_free(tmp_filename);
6b8aeca5 1446 return ret;
b998875d
KW
1447}
1448
b6ce07aa
KW
1449/*
1450 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1451 *
1452 * options is a QDict of options to pass to the block drivers, or NULL for an
1453 * empty set of options. The reference to the QDict belongs to the block layer
1454 * after the call (even on failure), so if the caller intends to reuse the
1455 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1456 *
1457 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1458 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1459 *
1460 * The reference parameter may be used to specify an existing block device which
1461 * should be opened. If specified, neither options nor a filename may be given,
1462 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1463 */
ddf5636d
HR
1464int bdrv_open(BlockDriverState **pbs, const char *filename,
1465 const char *reference, QDict *options, int flags,
1466 BlockDriver *drv, Error **errp)
ea2384d3 1467{
b6ce07aa 1468 int ret;
f67503e5 1469 BlockDriverState *file = NULL, *bs;
74fe54f2 1470 const char *drvname;
34b5d2c6 1471 Error *local_err = NULL;
b1e6fc08 1472 int snapshot_flags = 0;
712e7874 1473
f67503e5
HR
1474 assert(pbs);
1475
ddf5636d
HR
1476 if (reference) {
1477 bool options_non_empty = options ? qdict_size(options) : false;
1478 QDECREF(options);
1479
1480 if (*pbs) {
1481 error_setg(errp, "Cannot reuse an existing BDS when referencing "
1482 "another block device");
1483 return -EINVAL;
1484 }
1485
1486 if (filename || options_non_empty) {
1487 error_setg(errp, "Cannot reference an existing block device with "
1488 "additional options or a new filename");
1489 return -EINVAL;
1490 }
1491
1492 bs = bdrv_lookup_bs(reference, reference, errp);
1493 if (!bs) {
1494 return -ENODEV;
1495 }
1496 bdrv_ref(bs);
1497 *pbs = bs;
1498 return 0;
1499 }
1500
f67503e5
HR
1501 if (*pbs) {
1502 bs = *pbs;
1503 } else {
e4e9986b 1504 bs = bdrv_new();
f67503e5
HR
1505 }
1506
de9c0cec
KW
1507 /* NULL means an empty set of options */
1508 if (options == NULL) {
1509 options = qdict_new();
1510 }
1511
17b005f1 1512 ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
462f5bcf
KW
1513 if (local_err) {
1514 goto fail;
1515 }
1516
76c591b0
KW
1517 /* Find the right image format driver */
1518 drv = NULL;
1519 drvname = qdict_get_try_str(options, "driver");
1520 if (drvname) {
1521 drv = bdrv_find_format(drvname);
1522 qdict_del(options, "driver");
1523 if (!drv) {
1524 error_setg(errp, "Unknown driver: '%s'", drvname);
1525 ret = -EINVAL;
1526 goto fail;
1527 }
1528 }
1529
1530 assert(drvname || !(flags & BDRV_O_PROTOCOL));
1531 if (drv && !drv->bdrv_file_open) {
1532 /* If the user explicitly wants a format driver here, we'll need to add
1533 * another layer for the protocol in bs->file */
1534 flags &= ~BDRV_O_PROTOCOL;
1535 }
1536
de9c0cec 1537 bs->options = options;
b6ad491a 1538 options = qdict_clone_shallow(options);
de9c0cec 1539
f500a6d3 1540 /* Open image file without format layer */
f4788adc
KW
1541 if ((flags & BDRV_O_PROTOCOL) == 0) {
1542 if (flags & BDRV_O_RDWR) {
1543 flags |= BDRV_O_ALLOW_RDWR;
1544 }
1545 if (flags & BDRV_O_SNAPSHOT) {
1546 snapshot_flags = bdrv_temp_snapshot_flags(flags);
1547 flags = bdrv_backing_flags(flags);
1548 }
f500a6d3 1549
f4788adc
KW
1550 assert(file == NULL);
1551 ret = bdrv_open_image(&file, filename, options, "file",
1552 bdrv_inherited_flags(flags),
1553 true, &local_err);
1554 if (ret < 0) {
1555 goto fail;
1556 }
f500a6d3
KW
1557 }
1558
76c591b0 1559 /* Image format probing */
38f3ef57 1560 bs->probed = !drv;
76c591b0 1561 if (!drv && file) {
17b005f1
KW
1562 ret = find_image_format(file, filename, &drv, &local_err);
1563 if (ret < 0) {
8bfea15d 1564 goto fail;
2a05cbe4 1565 }
76c591b0 1566 } else if (!drv) {
17b005f1
KW
1567 error_setg(errp, "Must specify either driver or file");
1568 ret = -EINVAL;
8bfea15d 1569 goto fail;
ea2384d3 1570 }
b6ce07aa
KW
1571
1572 /* Open the image */
34b5d2c6 1573 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
b6ce07aa 1574 if (ret < 0) {
8bfea15d 1575 goto fail;
6987307c
CH
1576 }
1577
2a05cbe4 1578 if (file && (bs->file != file)) {
4f6fd349 1579 bdrv_unref(file);
f500a6d3
KW
1580 file = NULL;
1581 }
1582
b6ce07aa 1583 /* If there is a backing file, use it */
9156df12 1584 if ((flags & BDRV_O_NO_BACKING) == 0) {
31ca6d07
KW
1585 QDict *backing_options;
1586
5726d872 1587 qdict_extract_subqdict(options, &backing_options, "backing.");
34b5d2c6 1588 ret = bdrv_open_backing_file(bs, backing_options, &local_err);
b6ce07aa 1589 if (ret < 0) {
b6ad491a 1590 goto close_and_fail;
b6ce07aa 1591 }
b6ce07aa
KW
1592 }
1593
91af7014
HR
1594 bdrv_refresh_filename(bs);
1595
b998875d
KW
1596 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1597 * temporary snapshot afterwards. */
b1e6fc08 1598 if (snapshot_flags) {
6b8aeca5 1599 ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
b998875d 1600 if (local_err) {
b998875d
KW
1601 goto close_and_fail;
1602 }
1603 }
1604
b6ad491a 1605 /* Check if any unknown options were used */
5acd9d81 1606 if (options && (qdict_size(options) != 0)) {
b6ad491a 1607 const QDictEntry *entry = qdict_first(options);
5acd9d81
HR
1608 if (flags & BDRV_O_PROTOCOL) {
1609 error_setg(errp, "Block protocol '%s' doesn't support the option "
1610 "'%s'", drv->format_name, entry->key);
1611 } else {
1612 error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1613 "support the option '%s'", drv->format_name,
bfb197e0 1614 bdrv_get_device_name(bs), entry->key);
5acd9d81 1615 }
b6ad491a
KW
1616
1617 ret = -EINVAL;
1618 goto close_and_fail;
1619 }
b6ad491a 1620
b6ce07aa 1621 if (!bdrv_key_required(bs)) {
a7f53e26
MA
1622 if (bs->blk) {
1623 blk_dev_change_media_cb(bs->blk, true);
1624 }
c3adb58f
MA
1625 } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1626 && !runstate_check(RUN_STATE_INMIGRATE)
1627 && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1628 error_setg(errp,
1629 "Guest must be stopped for opening of encrypted image");
1630 ret = -EBUSY;
1631 goto close_and_fail;
b6ce07aa
KW
1632 }
1633
c3adb58f 1634 QDECREF(options);
f67503e5 1635 *pbs = bs;
b6ce07aa
KW
1636 return 0;
1637
8bfea15d 1638fail:
f500a6d3 1639 if (file != NULL) {
4f6fd349 1640 bdrv_unref(file);
f500a6d3 1641 }
de9c0cec 1642 QDECREF(bs->options);
b6ad491a 1643 QDECREF(options);
de9c0cec 1644 bs->options = NULL;
f67503e5
HR
1645 if (!*pbs) {
1646 /* If *pbs is NULL, a new BDS has been created in this function and
1647 needs to be freed now. Otherwise, it does not need to be closed,
1648 since it has not really been opened yet. */
1649 bdrv_unref(bs);
1650 }
84d18f06 1651 if (local_err) {
34b5d2c6
HR
1652 error_propagate(errp, local_err);
1653 }
b6ad491a 1654 return ret;
de9c0cec 1655
b6ad491a 1656close_and_fail:
f67503e5
HR
1657 /* See fail path, but now the BDS has to be always closed */
1658 if (*pbs) {
1659 bdrv_close(bs);
1660 } else {
1661 bdrv_unref(bs);
1662 }
b6ad491a 1663 QDECREF(options);
84d18f06 1664 if (local_err) {
34b5d2c6
HR
1665 error_propagate(errp, local_err);
1666 }
b6ce07aa
KW
1667 return ret;
1668}
1669
e971aa12
JC
1670typedef struct BlockReopenQueueEntry {
1671 bool prepared;
1672 BDRVReopenState state;
1673 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1674} BlockReopenQueueEntry;
1675
1676/*
1677 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1678 * reopen of multiple devices.
1679 *
1680 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1681 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1682 * be created and initialized. This newly created BlockReopenQueue should be
1683 * passed back in for subsequent calls that are intended to be of the same
1684 * atomic 'set'.
1685 *
1686 * bs is the BlockDriverState to add to the reopen queue.
1687 *
1688 * flags contains the open flags for the associated bs
1689 *
1690 * returns a pointer to bs_queue, which is either the newly allocated
1691 * bs_queue, or the existing bs_queue being used.
1692 *
1693 */
1694BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1695 BlockDriverState *bs, int flags)
1696{
1697 assert(bs != NULL);
1698
1699 BlockReopenQueueEntry *bs_entry;
1700 if (bs_queue == NULL) {
1701 bs_queue = g_new0(BlockReopenQueue, 1);
1702 QSIMPLEQ_INIT(bs_queue);
1703 }
1704
f1f25a2e
KW
1705 /* bdrv_open() masks this flag out */
1706 flags &= ~BDRV_O_PROTOCOL;
1707
e971aa12 1708 if (bs->file) {
f1f25a2e 1709 bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
e971aa12
JC
1710 }
1711
1712 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1713 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1714
1715 bs_entry->state.bs = bs;
1716 bs_entry->state.flags = flags;
1717
1718 return bs_queue;
1719}
1720
1721/*
1722 * Reopen multiple BlockDriverStates atomically & transactionally.
1723 *
1724 * The queue passed in (bs_queue) must have been built up previous
1725 * via bdrv_reopen_queue().
1726 *
1727 * Reopens all BDS specified in the queue, with the appropriate
1728 * flags. All devices are prepared for reopen, and failure of any
1729 * device will cause all device changes to be abandonded, and intermediate
1730 * data cleaned up.
1731 *
1732 * If all devices prepare successfully, then the changes are committed
1733 * to all devices.
1734 *
1735 */
1736int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1737{
1738 int ret = -1;
1739 BlockReopenQueueEntry *bs_entry, *next;
1740 Error *local_err = NULL;
1741
1742 assert(bs_queue != NULL);
1743
1744 bdrv_drain_all();
1745
1746 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1747 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1748 error_propagate(errp, local_err);
1749 goto cleanup;
1750 }
1751 bs_entry->prepared = true;
1752 }
1753
1754 /* If we reach this point, we have success and just need to apply the
1755 * changes
1756 */
1757 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1758 bdrv_reopen_commit(&bs_entry->state);
1759 }
1760
1761 ret = 0;
1762
1763cleanup:
1764 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1765 if (ret && bs_entry->prepared) {
1766 bdrv_reopen_abort(&bs_entry->state);
1767 }
1768 g_free(bs_entry);
1769 }
1770 g_free(bs_queue);
1771 return ret;
1772}
1773
1774
1775/* Reopen a single BlockDriverState with the specified flags. */
1776int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1777{
1778 int ret = -1;
1779 Error *local_err = NULL;
1780 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1781
1782 ret = bdrv_reopen_multiple(queue, &local_err);
1783 if (local_err != NULL) {
1784 error_propagate(errp, local_err);
1785 }
1786 return ret;
1787}
1788
1789
1790/*
1791 * Prepares a BlockDriverState for reopen. All changes are staged in the
1792 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1793 * the block driver layer .bdrv_reopen_prepare()
1794 *
1795 * bs is the BlockDriverState to reopen
1796 * flags are the new open flags
1797 * queue is the reopen queue
1798 *
1799 * Returns 0 on success, non-zero on error. On error errp will be set
1800 * as well.
1801 *
1802 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1803 * It is the responsibility of the caller to then call the abort() or
1804 * commit() for any other BDS that have been left in a prepare() state
1805 *
1806 */
1807int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1808 Error **errp)
1809{
1810 int ret = -1;
1811 Error *local_err = NULL;
1812 BlockDriver *drv;
1813
1814 assert(reopen_state != NULL);
1815 assert(reopen_state->bs->drv != NULL);
1816 drv = reopen_state->bs->drv;
1817
1818 /* if we are to stay read-only, do not allow permission change
1819 * to r/w */
1820 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1821 reopen_state->flags & BDRV_O_RDWR) {
81e5f78a
AG
1822 error_setg(errp, "Node '%s' is read only",
1823 bdrv_get_device_or_node_name(reopen_state->bs));
e971aa12
JC
1824 goto error;
1825 }
1826
1827
1828 ret = bdrv_flush(reopen_state->bs);
1829 if (ret) {
1830 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1831 strerror(-ret));
1832 goto error;
1833 }
1834
1835 if (drv->bdrv_reopen_prepare) {
1836 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1837 if (ret) {
1838 if (local_err != NULL) {
1839 error_propagate(errp, local_err);
1840 } else {
d8b6895f
LC
1841 error_setg(errp, "failed while preparing to reopen image '%s'",
1842 reopen_state->bs->filename);
e971aa12
JC
1843 }
1844 goto error;
1845 }
1846 } else {
1847 /* It is currently mandatory to have a bdrv_reopen_prepare()
1848 * handler for each supported drv. */
81e5f78a
AG
1849 error_setg(errp, "Block format '%s' used by node '%s' "
1850 "does not support reopening files", drv->format_name,
1851 bdrv_get_device_or_node_name(reopen_state->bs));
e971aa12
JC
1852 ret = -1;
1853 goto error;
1854 }
1855
1856 ret = 0;
1857
1858error:
1859 return ret;
1860}
1861
1862/*
1863 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1864 * makes them final by swapping the staging BlockDriverState contents into
1865 * the active BlockDriverState contents.
1866 */
1867void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1868{
1869 BlockDriver *drv;
1870
1871 assert(reopen_state != NULL);
1872 drv = reopen_state->bs->drv;
1873 assert(drv != NULL);
1874
1875 /* If there are any driver level actions to take */
1876 if (drv->bdrv_reopen_commit) {
1877 drv->bdrv_reopen_commit(reopen_state);
1878 }
1879
1880 /* set BDS specific flags now */
1881 reopen_state->bs->open_flags = reopen_state->flags;
1882 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1883 BDRV_O_CACHE_WB);
1884 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac 1885
3baca891 1886 bdrv_refresh_limits(reopen_state->bs, NULL);
e971aa12
JC
1887}
1888
1889/*
1890 * Abort the reopen, and delete and free the staged changes in
1891 * reopen_state
1892 */
1893void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1894{
1895 BlockDriver *drv;
1896
1897 assert(reopen_state != NULL);
1898 drv = reopen_state->bs->drv;
1899 assert(drv != NULL);
1900
1901 if (drv->bdrv_reopen_abort) {
1902 drv->bdrv_reopen_abort(reopen_state);
1903 }
1904}
1905
1906
fc01f7e7
FB
1907void bdrv_close(BlockDriverState *bs)
1908{
33384421
HR
1909 BdrvAioNotifier *ban, *ban_next;
1910
3cbc002c
PB
1911 if (bs->job) {
1912 block_job_cancel_sync(bs->job);
1913 }
58fda173
SH
1914 bdrv_drain_all(); /* complete I/O */
1915 bdrv_flush(bs);
1916 bdrv_drain_all(); /* in case flush left pending I/O */
d7d512f6 1917 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1918
3cbc002c 1919 if (bs->drv) {
557df6ac 1920 if (bs->backing_hd) {
826b6ca0
FZ
1921 BlockDriverState *backing_hd = bs->backing_hd;
1922 bdrv_set_backing_hd(bs, NULL);
1923 bdrv_unref(backing_hd);
557df6ac 1924 }
ea2384d3 1925 bs->drv->bdrv_close(bs);
7267c094 1926 g_free(bs->opaque);
ea2384d3
FB
1927 bs->opaque = NULL;
1928 bs->drv = NULL;
53fec9d3 1929 bs->copy_on_read = 0;
a275fa42
PB
1930 bs->backing_file[0] = '\0';
1931 bs->backing_format[0] = '\0';
6405875c
PB
1932 bs->total_sectors = 0;
1933 bs->encrypted = 0;
1934 bs->valid_key = 0;
1935 bs->sg = 0;
0d51b4de 1936 bs->zero_beyond_eof = false;
de9c0cec
KW
1937 QDECREF(bs->options);
1938 bs->options = NULL;
91af7014
HR
1939 QDECREF(bs->full_open_options);
1940 bs->full_open_options = NULL;
b338082b 1941
66f82cee 1942 if (bs->file != NULL) {
4f6fd349 1943 bdrv_unref(bs->file);
0ac9377d 1944 bs->file = NULL;
66f82cee 1945 }
b338082b 1946 }
98f90dba 1947
a7f53e26
MA
1948 if (bs->blk) {
1949 blk_dev_change_media_cb(bs->blk, false);
1950 }
9ca11154 1951
98f90dba
ZYW
1952 /*throttling disk I/O limits*/
1953 if (bs->io_limits_enabled) {
1954 bdrv_io_limits_disable(bs);
1955 }
33384421
HR
1956
1957 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1958 g_free(ban);
1959 }
1960 QLIST_INIT(&bs->aio_notifiers);
b338082b
FB
1961}
1962
2bc93fed
MK
1963void bdrv_close_all(void)
1964{
1965 BlockDriverState *bs;
1966
dc364f4c 1967 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
1968 AioContext *aio_context = bdrv_get_aio_context(bs);
1969
1970 aio_context_acquire(aio_context);
2bc93fed 1971 bdrv_close(bs);
ed78cda3 1972 aio_context_release(aio_context);
2bc93fed
MK
1973 }
1974}
1975
88266f5a
SH
1976/* Check if any requests are in-flight (including throttled requests) */
1977static bool bdrv_requests_pending(BlockDriverState *bs)
1978{
1979 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1980 return true;
1981 }
cc0681c4
BC
1982 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1983 return true;
1984 }
1985 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1986 return true;
1987 }
1988 if (bs->file && bdrv_requests_pending(bs->file)) {
1989 return true;
1990 }
1991 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1992 return true;
1993 }
1994 return false;
1995}
1996
5b98db0a
SH
1997static bool bdrv_drain_one(BlockDriverState *bs)
1998{
1999 bool bs_busy;
2000
2001 bdrv_flush_io_queue(bs);
2002 bdrv_start_throttled_reqs(bs);
2003 bs_busy = bdrv_requests_pending(bs);
2004 bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
2005 return bs_busy;
2006}
2007
2008/*
2009 * Wait for pending requests to complete on a single BlockDriverState subtree
2010 *
2011 * See the warning in bdrv_drain_all(). This function can only be called if
2012 * you are sure nothing can generate I/O because you have op blockers
2013 * installed.
2014 *
2015 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
2016 * AioContext.
2017 */
2018void bdrv_drain(BlockDriverState *bs)
2019{
2020 while (bdrv_drain_one(bs)) {
2021 /* Keep iterating */
2022 }
2023}
2024
922453bc
SH
2025/*
2026 * Wait for pending requests to complete across all BlockDriverStates
2027 *
2028 * This function does not flush data to disk, use bdrv_flush_all() for that
2029 * after calling this function.
4c355d53
ZYW
2030 *
2031 * Note that completion of an asynchronous I/O operation can trigger any
2032 * number of other I/O operations on other devices---for example a coroutine
2033 * can be arbitrarily complex and a constant flow of I/O can come until the
2034 * coroutine is complete. Because of this, it is not possible to have a
2035 * function to drain a single device's I/O queue.
922453bc
SH
2036 */
2037void bdrv_drain_all(void)
2038{
88266f5a
SH
2039 /* Always run first iteration so any pending completion BHs run */
2040 bool busy = true;
922453bc
SH
2041 BlockDriverState *bs;
2042
69da3b0b
FZ
2043 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2044 AioContext *aio_context = bdrv_get_aio_context(bs);
2045
2046 aio_context_acquire(aio_context);
2047 if (bs->job) {
2048 block_job_pause(bs->job);
2049 }
2050 aio_context_release(aio_context);
2051 }
2052
88266f5a 2053 while (busy) {
9b536adc
SH
2054 busy = false;
2055
dc364f4c 2056 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
9b536adc 2057 AioContext *aio_context = bdrv_get_aio_context(bs);
9b536adc
SH
2058
2059 aio_context_acquire(aio_context);
5b98db0a 2060 busy |= bdrv_drain_one(bs);
9b536adc 2061 aio_context_release(aio_context);
9b536adc 2062 }
922453bc 2063 }
69da3b0b
FZ
2064
2065 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2066 AioContext *aio_context = bdrv_get_aio_context(bs);
2067
2068 aio_context_acquire(aio_context);
2069 if (bs->job) {
2070 block_job_resume(bs->job);
2071 }
2072 aio_context_release(aio_context);
2073 }
922453bc
SH
2074}
2075
dc364f4c
BC
2076/* make a BlockDriverState anonymous by removing from bdrv_state and
2077 * graph_bdrv_state list.
d22b2f41
RH
2078 Also, NULL terminate the device_name to prevent double remove */
2079void bdrv_make_anon(BlockDriverState *bs)
2080{
bfb197e0
MA
2081 /*
2082 * Take care to remove bs from bdrv_states only when it's actually
2083 * in it. Note that bs->device_list.tqe_prev is initially null,
2084 * and gets set to non-null by QTAILQ_INSERT_TAIL(). Establish
2085 * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
2086 * resetting it to null on remove.
2087 */
2088 if (bs->device_list.tqe_prev) {
dc364f4c 2089 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
bfb197e0 2090 bs->device_list.tqe_prev = NULL;
d22b2f41 2091 }
dc364f4c
BC
2092 if (bs->node_name[0] != '\0') {
2093 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2094 }
2095 bs->node_name[0] = '\0';
d22b2f41
RH
2096}
2097
e023b2e2
PB
2098static void bdrv_rebind(BlockDriverState *bs)
2099{
2100 if (bs->drv && bs->drv->bdrv_rebind) {
2101 bs->drv->bdrv_rebind(bs);
2102 }
2103}
2104
4ddc07ca
PB
2105static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2106 BlockDriverState *bs_src)
8802d1fd 2107{
4ddc07ca 2108 /* move some fields that need to stay attached to the device */
8802d1fd
JC
2109
2110 /* dev info */
1b7fd729 2111 bs_dest->guest_block_size = bs_src->guest_block_size;
4ddc07ca 2112 bs_dest->copy_on_read = bs_src->copy_on_read;
8802d1fd 2113
4ddc07ca 2114 bs_dest->enable_write_cache = bs_src->enable_write_cache;
c4a248a1 2115
cc0681c4
BC
2116 /* i/o throttled req */
2117 memcpy(&bs_dest->throttle_state,
2118 &bs_src->throttle_state,
2119 sizeof(ThrottleState));
2120 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
2121 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
4ddc07ca 2122 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
8802d1fd 2123
8802d1fd 2124 /* r/w error */
4ddc07ca
PB
2125 bs_dest->on_read_error = bs_src->on_read_error;
2126 bs_dest->on_write_error = bs_src->on_write_error;
8802d1fd
JC
2127
2128 /* i/o status */
4ddc07ca
PB
2129 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
2130 bs_dest->iostatus = bs_src->iostatus;
8802d1fd 2131
a9fc4408 2132 /* dirty bitmap */
e4654d2d 2133 bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;
a9fc4408 2134
9fcb0251
FZ
2135 /* reference count */
2136 bs_dest->refcnt = bs_src->refcnt;
2137
a9fc4408 2138 /* job */
4ddc07ca 2139 bs_dest->job = bs_src->job;
a9fc4408 2140
8802d1fd 2141 /* keep the same entry in bdrv_states */
dc364f4c 2142 bs_dest->device_list = bs_src->device_list;
7e7d56d9
MA
2143 bs_dest->blk = bs_src->blk;
2144
fbe40ff7
FZ
2145 memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2146 sizeof(bs_dest->op_blockers));
4ddc07ca 2147}
8802d1fd 2148
4ddc07ca
PB
2149/*
2150 * Swap bs contents for two image chains while they are live,
2151 * while keeping required fields on the BlockDriverState that is
2152 * actually attached to a device.
2153 *
2154 * This will modify the BlockDriverState fields, and swap contents
2155 * between bs_new and bs_old. Both bs_new and bs_old are modified.
2156 *
bfb197e0 2157 * bs_new must not be attached to a BlockBackend.
4ddc07ca
PB
2158 *
2159 * This function does not create any image files.
2160 */
2161void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2162{
2163 BlockDriverState tmp;
f6801b83 2164
90ce8a06
BC
2165 /* The code needs to swap the node_name but simply swapping node_list won't
2166 * work so first remove the nodes from the graph list, do the swap then
2167 * insert them back if needed.
2168 */
2169 if (bs_new->node_name[0] != '\0') {
2170 QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2171 }
2172 if (bs_old->node_name[0] != '\0') {
2173 QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2174 }
2175
bfb197e0 2176 /* bs_new must be unattached and shouldn't have anything fancy enabled */
7e7d56d9 2177 assert(!bs_new->blk);
e4654d2d 2178 assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
4ddc07ca 2179 assert(bs_new->job == NULL);
4ddc07ca 2180 assert(bs_new->io_limits_enabled == false);
cc0681c4 2181 assert(!throttle_have_timer(&bs_new->throttle_state));
8802d1fd 2182
4ddc07ca
PB
2183 tmp = *bs_new;
2184 *bs_new = *bs_old;
2185 *bs_old = tmp;
a9fc4408 2186
4ddc07ca
PB
2187 /* there are some fields that should not be swapped, move them back */
2188 bdrv_move_feature_fields(&tmp, bs_old);
2189 bdrv_move_feature_fields(bs_old, bs_new);
2190 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 2191
bfb197e0 2192 /* bs_new must remain unattached */
7e7d56d9 2193 assert(!bs_new->blk);
4ddc07ca
PB
2194
2195 /* Check a few fields that should remain attached to the device */
4ddc07ca 2196 assert(bs_new->job == NULL);
4ddc07ca 2197 assert(bs_new->io_limits_enabled == false);
cc0681c4 2198 assert(!throttle_have_timer(&bs_new->throttle_state));
e023b2e2 2199
90ce8a06
BC
2200 /* insert the nodes back into the graph node list if needed */
2201 if (bs_new->node_name[0] != '\0') {
2202 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2203 }
2204 if (bs_old->node_name[0] != '\0') {
2205 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2206 }
2207
e023b2e2 2208 bdrv_rebind(bs_new);
4ddc07ca
PB
2209 bdrv_rebind(bs_old);
2210}
2211
2212/*
2213 * Add new bs contents at the top of an image chain while the chain is
2214 * live, while keeping required fields on the top layer.
2215 *
2216 * This will modify the BlockDriverState fields, and swap contents
2217 * between bs_new and bs_top. Both bs_new and bs_top are modified.
2218 *
bfb197e0 2219 * bs_new must not be attached to a BlockBackend.
4ddc07ca
PB
2220 *
2221 * This function does not create any image files.
2222 */
2223void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2224{
2225 bdrv_swap(bs_new, bs_top);
2226
2227 /* The contents of 'tmp' will become bs_top, as we are
2228 * swapping bs_new and bs_top contents. */
8d24cce1 2229 bdrv_set_backing_hd(bs_top, bs_new);
8802d1fd
JC
2230}
2231
4f6fd349 2232static void bdrv_delete(BlockDriverState *bs)
b338082b 2233{
3e914655 2234 assert(!bs->job);
3718d8ab 2235 assert(bdrv_op_blocker_is_empty(bs));
4f6fd349 2236 assert(!bs->refcnt);
e4654d2d 2237 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 2238
e1b5c52e
SH
2239 bdrv_close(bs);
2240
1b7bdbc1 2241 /* remove from list, if necessary */
d22b2f41 2242 bdrv_make_anon(bs);
34c6f050 2243
7267c094 2244 g_free(bs);
fc01f7e7
FB
2245}
2246
e97fc193
AL
2247/*
2248 * Run consistency checks on an image
2249 *
e076f338 2250 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2251 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2252 * check are stored in res.
e97fc193 2253 */
4534ff54 2254int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193 2255{
908bcd54
HR
2256 if (bs->drv == NULL) {
2257 return -ENOMEDIUM;
2258 }
e97fc193
AL
2259 if (bs->drv->bdrv_check == NULL) {
2260 return -ENOTSUP;
2261 }
2262
e076f338 2263 memset(res, 0, sizeof(*res));
4534ff54 2264 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2265}
2266
8a426614
KW
2267#define COMMIT_BUF_SECTORS 2048
2268
33e3963e
FB
2269/* commit COW file into the raw image */
2270int bdrv_commit(BlockDriverState *bs)
2271{
19cb3738 2272 BlockDriver *drv = bs->drv;
72706ea4 2273 int64_t sector, total_sectors, length, backing_length;
8a426614 2274 int n, ro, open_flags;
0bce597d 2275 int ret = 0;
72706ea4 2276 uint8_t *buf = NULL;
33e3963e 2277
19cb3738
FB
2278 if (!drv)
2279 return -ENOMEDIUM;
6bb45158 2280
4dca4b63
NS
2281 if (!bs->backing_hd) {
2282 return -ENOTSUP;
33e3963e
FB
2283 }
2284
bb00021d
FZ
2285 if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
2286 bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
2d3735d3
SH
2287 return -EBUSY;
2288 }
2289
4dca4b63 2290 ro = bs->backing_hd->read_only;
4dca4b63
NS
2291 open_flags = bs->backing_hd->open_flags;
2292
2293 if (ro) {
0bce597d
JC
2294 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2295 return -EACCES;
4dca4b63 2296 }
ea2384d3 2297 }
33e3963e 2298
72706ea4
JC
2299 length = bdrv_getlength(bs);
2300 if (length < 0) {
2301 ret = length;
2302 goto ro_cleanup;
2303 }
2304
2305 backing_length = bdrv_getlength(bs->backing_hd);
2306 if (backing_length < 0) {
2307 ret = backing_length;
2308 goto ro_cleanup;
2309 }
2310
2311 /* If our top snapshot is larger than the backing file image,
2312 * grow the backing file image if possible. If not possible,
2313 * we must return an error */
2314 if (length > backing_length) {
2315 ret = bdrv_truncate(bs->backing_hd, length);
2316 if (ret < 0) {
2317 goto ro_cleanup;
2318 }
2319 }
2320
2321 total_sectors = length >> BDRV_SECTOR_BITS;
857d4f46
KW
2322
2323 /* qemu_try_blockalign() for bs will choose an alignment that works for
2324 * bs->backing_hd as well, so no need to compare the alignment manually. */
2325 buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2326 if (buf == NULL) {
2327 ret = -ENOMEM;
2328 goto ro_cleanup;
2329 }
8a426614
KW
2330
2331 for (sector = 0; sector < total_sectors; sector += n) {
d663640c
PB
2332 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2333 if (ret < 0) {
2334 goto ro_cleanup;
2335 }
2336 if (ret) {
dabfa6cc
KW
2337 ret = bdrv_read(bs, sector, buf, n);
2338 if (ret < 0) {
8a426614
KW
2339 goto ro_cleanup;
2340 }
2341
dabfa6cc
KW
2342 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2343 if (ret < 0) {
8a426614
KW
2344 goto ro_cleanup;
2345 }
ea2384d3 2346 }
33e3963e 2347 }
95389c86 2348
1d44952f
CH
2349 if (drv->bdrv_make_empty) {
2350 ret = drv->bdrv_make_empty(bs);
dabfa6cc
KW
2351 if (ret < 0) {
2352 goto ro_cleanup;
2353 }
1d44952f
CH
2354 bdrv_flush(bs);
2355 }
95389c86 2356
3f5075ae
CH
2357 /*
2358 * Make sure all data we wrote to the backing device is actually
2359 * stable on disk.
2360 */
dabfa6cc 2361 if (bs->backing_hd) {
3f5075ae 2362 bdrv_flush(bs->backing_hd);
dabfa6cc 2363 }
4dca4b63 2364
dabfa6cc 2365 ret = 0;
4dca4b63 2366ro_cleanup:
857d4f46 2367 qemu_vfree(buf);
4dca4b63
NS
2368
2369 if (ro) {
0bce597d
JC
2370 /* ignoring error return here */
2371 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
2372 }
2373
1d44952f 2374 return ret;
33e3963e
FB
2375}
2376
e8877497 2377int bdrv_commit_all(void)
6ab4b5ab
MA
2378{
2379 BlockDriverState *bs;
2380
dc364f4c 2381 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
2382 AioContext *aio_context = bdrv_get_aio_context(bs);
2383
2384 aio_context_acquire(aio_context);
272d2d8e
JC
2385 if (bs->drv && bs->backing_hd) {
2386 int ret = bdrv_commit(bs);
2387 if (ret < 0) {
ed78cda3 2388 aio_context_release(aio_context);
272d2d8e
JC
2389 return ret;
2390 }
e8877497 2391 }
ed78cda3 2392 aio_context_release(aio_context);
6ab4b5ab 2393 }
e8877497 2394 return 0;
6ab4b5ab
MA
2395}
2396
dbffbdcf
SH
2397/**
2398 * Remove an active request from the tracked requests list
2399 *
2400 * This function should be called when a tracked request is completing.
2401 */
2402static void tracked_request_end(BdrvTrackedRequest *req)
2403{
2dbafdc0
KW
2404 if (req->serialising) {
2405 req->bs->serialising_in_flight--;
2406 }
2407
dbffbdcf 2408 QLIST_REMOVE(req, list);
f4658285 2409 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
2410}
2411
2412/**
2413 * Add an active request to the tracked requests list
2414 */
2415static void tracked_request_begin(BdrvTrackedRequest *req,
2416 BlockDriverState *bs,
793ed47a
KW
2417 int64_t offset,
2418 unsigned int bytes, bool is_write)
dbffbdcf
SH
2419{
2420 *req = (BdrvTrackedRequest){
2421 .bs = bs,
2dbafdc0
KW
2422 .offset = offset,
2423 .bytes = bytes,
2424 .is_write = is_write,
2425 .co = qemu_coroutine_self(),
2426 .serialising = false,
7327145f
KW
2427 .overlap_offset = offset,
2428 .overlap_bytes = bytes,
dbffbdcf
SH
2429 };
2430
f4658285
SH
2431 qemu_co_queue_init(&req->wait_queue);
2432
dbffbdcf
SH
2433 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2434}
2435
e96126ff 2436static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2dbafdc0 2437{
7327145f 2438 int64_t overlap_offset = req->offset & ~(align - 1);
e96126ff
KW
2439 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2440 - overlap_offset;
7327145f 2441
2dbafdc0
KW
2442 if (!req->serialising) {
2443 req->bs->serialising_in_flight++;
2444 req->serialising = true;
2445 }
7327145f
KW
2446
2447 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2448 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2dbafdc0
KW
2449}
2450
d83947ac
SH
2451/**
2452 * Round a region to cluster boundaries
2453 */
343bded4
PB
2454void bdrv_round_to_clusters(BlockDriverState *bs,
2455 int64_t sector_num, int nb_sectors,
2456 int64_t *cluster_sector_num,
2457 int *cluster_nb_sectors)
d83947ac
SH
2458{
2459 BlockDriverInfo bdi;
2460
2461 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2462 *cluster_sector_num = sector_num;
2463 *cluster_nb_sectors = nb_sectors;
2464 } else {
2465 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2466 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2467 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2468 nb_sectors, c);
2469 }
2470}
2471
7327145f 2472static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2473{
2474 BlockDriverInfo bdi;
7327145f 2475 int ret;
793ed47a 2476
7327145f
KW
2477 ret = bdrv_get_info(bs, &bdi);
2478 if (ret < 0 || bdi.cluster_size == 0) {
2479 return bs->request_alignment;
793ed47a 2480 } else {
7327145f 2481 return bdi.cluster_size;
793ed47a
KW
2482 }
2483}
2484
f4658285 2485static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2486 int64_t offset, unsigned int bytes)
2487{
d83947ac 2488 /* aaaa bbbb */
7327145f 2489 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2490 return false;
2491 }
2492 /* bbbb aaaa */
7327145f 2493 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2494 return false;
2495 }
2496 return true;
f4658285
SH
2497}
2498
28de2dcd 2499static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
f4658285 2500{
2dbafdc0 2501 BlockDriverState *bs = self->bs;
f4658285
SH
2502 BdrvTrackedRequest *req;
2503 bool retry;
28de2dcd 2504 bool waited = false;
f4658285 2505
2dbafdc0 2506 if (!bs->serialising_in_flight) {
28de2dcd 2507 return false;
2dbafdc0
KW
2508 }
2509
f4658285
SH
2510 do {
2511 retry = false;
2512 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2dbafdc0 2513 if (req == self || (!req->serialising && !self->serialising)) {
65afd211
KW
2514 continue;
2515 }
7327145f
KW
2516 if (tracked_request_overlaps(req, self->overlap_offset,
2517 self->overlap_bytes))
2518 {
5f8b6491
SH
2519 /* Hitting this means there was a reentrant request, for
2520 * example, a block driver issuing nested requests. This must
2521 * never happen since it means deadlock.
2522 */
2523 assert(qemu_coroutine_self() != req->co);
2524
6460440f
KW
2525 /* If the request is already (indirectly) waiting for us, or
2526 * will wait for us as soon as it wakes up, then just go on
2527 * (instead of producing a deadlock in the former case). */
2528 if (!req->waiting_for) {
2529 self->waiting_for = req;
2530 qemu_co_queue_wait(&req->wait_queue);
2531 self->waiting_for = NULL;
2532 retry = true;
28de2dcd 2533 waited = true;
6460440f
KW
2534 break;
2535 }
f4658285
SH
2536 }
2537 }
2538 } while (retry);
28de2dcd
KW
2539
2540 return waited;
f4658285
SH
2541}
2542
756e6736
KW
2543/*
2544 * Return values:
2545 * 0 - success
2546 * -EINVAL - backing format specified, but no file
2547 * -ENOSPC - can't update the backing file because no space is left in the
2548 * image file header
2549 * -ENOTSUP - format driver doesn't support changing the backing file
2550 */
2551int bdrv_change_backing_file(BlockDriverState *bs,
2552 const char *backing_file, const char *backing_fmt)
2553{
2554 BlockDriver *drv = bs->drv;
469ef350 2555 int ret;
756e6736 2556
5f377794
PB
2557 /* Backing file format doesn't make sense without a backing file */
2558 if (backing_fmt && !backing_file) {
2559 return -EINVAL;
2560 }
2561
756e6736 2562 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2563 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2564 } else {
469ef350 2565 ret = -ENOTSUP;
756e6736 2566 }
469ef350
PB
2567
2568 if (ret == 0) {
2569 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2570 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2571 }
2572 return ret;
756e6736
KW
2573}
2574
6ebdcee2
JC
2575/*
2576 * Finds the image layer in the chain that has 'bs' as its backing file.
2577 *
2578 * active is the current topmost image.
2579 *
2580 * Returns NULL if bs is not found in active's image chain,
2581 * or if active == bs.
4caf0fcd
JC
2582 *
2583 * Returns the bottommost base image if bs == NULL.
6ebdcee2
JC
2584 */
2585BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2586 BlockDriverState *bs)
2587{
4caf0fcd
JC
2588 while (active && bs != active->backing_hd) {
2589 active = active->backing_hd;
6ebdcee2
JC
2590 }
2591
4caf0fcd
JC
2592 return active;
2593}
6ebdcee2 2594
4caf0fcd
JC
2595/* Given a BDS, searches for the base layer. */
2596BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2597{
2598 return bdrv_find_overlay(bs, NULL);
6ebdcee2
JC
2599}
2600
2601typedef struct BlkIntermediateStates {
2602 BlockDriverState *bs;
2603 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2604} BlkIntermediateStates;
2605
2606
2607/*
2608 * Drops images above 'base' up to and including 'top', and sets the image
2609 * above 'top' to have base as its backing file.
2610 *
2611 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2612 * information in 'bs' can be properly updated.
2613 *
2614 * E.g., this will convert the following chain:
2615 * bottom <- base <- intermediate <- top <- active
2616 *
2617 * to
2618 *
2619 * bottom <- base <- active
2620 *
2621 * It is allowed for bottom==base, in which case it converts:
2622 *
2623 * base <- intermediate <- top <- active
2624 *
2625 * to
2626 *
2627 * base <- active
2628 *
54e26900
JC
2629 * If backing_file_str is non-NULL, it will be used when modifying top's
2630 * overlay image metadata.
2631 *
6ebdcee2
JC
2632 * Error conditions:
2633 * if active == top, that is considered an error
2634 *
2635 */
2636int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
54e26900 2637 BlockDriverState *base, const char *backing_file_str)
6ebdcee2
JC
2638{
2639 BlockDriverState *intermediate;
2640 BlockDriverState *base_bs = NULL;
2641 BlockDriverState *new_top_bs = NULL;
2642 BlkIntermediateStates *intermediate_state, *next;
2643 int ret = -EIO;
2644
2645 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2646 QSIMPLEQ_INIT(&states_to_delete);
2647
2648 if (!top->drv || !base->drv) {
2649 goto exit;
2650 }
2651
2652 new_top_bs = bdrv_find_overlay(active, top);
2653
2654 if (new_top_bs == NULL) {
2655 /* we could not find the image above 'top', this is an error */
2656 goto exit;
2657 }
2658
2659 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2660 * to do, no intermediate images */
2661 if (new_top_bs->backing_hd == base) {
2662 ret = 0;
2663 goto exit;
2664 }
2665
2666 intermediate = top;
2667
2668 /* now we will go down through the list, and add each BDS we find
2669 * into our deletion queue, until we hit the 'base'
2670 */
2671 while (intermediate) {
5839e53b 2672 intermediate_state = g_new0(BlkIntermediateStates, 1);
6ebdcee2
JC
2673 intermediate_state->bs = intermediate;
2674 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2675
2676 if (intermediate->backing_hd == base) {
2677 base_bs = intermediate->backing_hd;
2678 break;
2679 }
2680 intermediate = intermediate->backing_hd;
2681 }
2682 if (base_bs == NULL) {
2683 /* something went wrong, we did not end at the base. safely
2684 * unravel everything, and exit with error */
2685 goto exit;
2686 }
2687
2688 /* success - we can delete the intermediate states, and link top->base */
54e26900
JC
2689 backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2690 ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
6ebdcee2
JC
2691 base_bs->drv ? base_bs->drv->format_name : "");
2692 if (ret) {
2693 goto exit;
2694 }
920beae1 2695 bdrv_set_backing_hd(new_top_bs, base_bs);
6ebdcee2
JC
2696
2697 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2698 /* so that bdrv_close() does not recursively close the chain */
920beae1 2699 bdrv_set_backing_hd(intermediate_state->bs, NULL);
4f6fd349 2700 bdrv_unref(intermediate_state->bs);
6ebdcee2
JC
2701 }
2702 ret = 0;
2703
2704exit:
2705 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2706 g_free(intermediate_state);
2707 }
2708 return ret;
2709}
2710
2711
71d0770c
AL
2712static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2713 size_t size)
2714{
75af1f34 2715 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
1dd3a447
KW
2716 return -EIO;
2717 }
2718
c0191e76 2719 if (!bdrv_is_inserted(bs)) {
71d0770c 2720 return -ENOMEDIUM;
c0191e76 2721 }
71d0770c 2722
c0191e76 2723 if (offset < 0) {
71d0770c 2724 return -EIO;
c0191e76 2725 }
71d0770c
AL
2726
2727 return 0;
2728}
2729
2730static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2731 int nb_sectors)
2732{
75af1f34 2733 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
8f4754ed
KW
2734 return -EIO;
2735 }
2736
eb5a3165
JS
2737 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2738 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
2739}
2740
1c9805a3
SH
2741typedef struct RwCo {
2742 BlockDriverState *bs;
775aa8b6 2743 int64_t offset;
1c9805a3
SH
2744 QEMUIOVector *qiov;
2745 bool is_write;
2746 int ret;
4105eaaa 2747 BdrvRequestFlags flags;
1c9805a3
SH
2748} RwCo;
2749
2750static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2751{
1c9805a3 2752 RwCo *rwco = opaque;
ea2384d3 2753
1c9805a3 2754 if (!rwco->is_write) {
775aa8b6
KW
2755 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2756 rwco->qiov->size, rwco->qiov,
4105eaaa 2757 rwco->flags);
775aa8b6
KW
2758 } else {
2759 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2760 rwco->qiov->size, rwco->qiov,
2761 rwco->flags);
1c9805a3
SH
2762 }
2763}
e7a8a783 2764
1c9805a3 2765/*
8d3b1a2d 2766 * Process a vectored synchronous request using coroutines
1c9805a3 2767 */
775aa8b6
KW
2768static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2769 QEMUIOVector *qiov, bool is_write,
2770 BdrvRequestFlags flags)
1c9805a3 2771{
1c9805a3
SH
2772 Coroutine *co;
2773 RwCo rwco = {
2774 .bs = bs,
775aa8b6 2775 .offset = offset,
8d3b1a2d 2776 .qiov = qiov,
1c9805a3
SH
2777 .is_write = is_write,
2778 .ret = NOT_DONE,
4105eaaa 2779 .flags = flags,
1c9805a3 2780 };
e7a8a783 2781
498e386c
ZYW
2782 /**
2783 * In sync call context, when the vcpu is blocked, this throttling timer
2784 * will not fire; so the I/O throttling function has to be disabled here
2785 * if it has been enabled.
2786 */
2787 if (bs->io_limits_enabled) {
2788 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2789 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2790 bdrv_io_limits_disable(bs);
2791 }
2792
1c9805a3
SH
2793 if (qemu_in_coroutine()) {
2794 /* Fast-path if already in coroutine context */
2795 bdrv_rw_co_entry(&rwco);
2796 } else {
2572b37a
SH
2797 AioContext *aio_context = bdrv_get_aio_context(bs);
2798
1c9805a3
SH
2799 co = qemu_coroutine_create(bdrv_rw_co_entry);
2800 qemu_coroutine_enter(co, &rwco);
2801 while (rwco.ret == NOT_DONE) {
2572b37a 2802 aio_poll(aio_context, true);
1c9805a3
SH
2803 }
2804 }
2805 return rwco.ret;
2806}
b338082b 2807
8d3b1a2d
KW
2808/*
2809 * Process a synchronous request using coroutines
2810 */
2811static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2812 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2813{
2814 QEMUIOVector qiov;
2815 struct iovec iov = {
2816 .iov_base = (void *)buf,
2817 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2818 };
2819
75af1f34 2820 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
da15ee51
KW
2821 return -EINVAL;
2822 }
2823
8d3b1a2d 2824 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2825 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2826 &qiov, is_write, flags);
8d3b1a2d
KW
2827}
2828
1c9805a3
SH
2829/* return < 0 if error. See bdrv_write() for the return codes */
2830int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2831 uint8_t *buf, int nb_sectors)
2832{
4105eaaa 2833 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
fc01f7e7
FB
2834}
2835
07d27a44
MA
2836/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2837int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2838 uint8_t *buf, int nb_sectors)
2839{
2840 bool enabled;
2841 int ret;
2842
2843 enabled = bs->io_limits_enabled;
2844 bs->io_limits_enabled = false;
4e7395e8 2845 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2846 bs->io_limits_enabled = enabled;
2847 return ret;
2848}
2849
5fafdf24 2850/* Return < 0 if error. Important errors are:
19cb3738
FB
2851 -EIO generic I/O error (may happen for all errors)
2852 -ENOMEDIUM No media inserted.
2853 -EINVAL Invalid sector number or nb_sectors
2854 -EACCES Trying to write a read-only device
2855*/
5fafdf24 2856int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2857 const uint8_t *buf, int nb_sectors)
2858{
4105eaaa 2859 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
83f64091
FB
2860}
2861
aa7bfbff
PL
2862int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2863 int nb_sectors, BdrvRequestFlags flags)
4105eaaa
PL
2864{
2865 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
aa7bfbff 2866 BDRV_REQ_ZERO_WRITE | flags);
8d3b1a2d
KW
2867}
2868
d75cbb5e
PL
2869/*
2870 * Completely zero out a block device with the help of bdrv_write_zeroes.
2871 * The operation is sped up by checking the block status and only writing
2872 * zeroes to the device if they currently do not return zeroes. Optional
2873 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2874 *
2875 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2876 */
2877int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2878{
d32f7c10 2879 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
d75cbb5e
PL
2880 int n;
2881
d32f7c10
MA
2882 target_sectors = bdrv_nb_sectors(bs);
2883 if (target_sectors < 0) {
2884 return target_sectors;
9ce10c0b 2885 }
9ce10c0b 2886
d75cbb5e 2887 for (;;) {
75af1f34 2888 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
d75cbb5e
PL
2889 if (nb_sectors <= 0) {
2890 return 0;
2891 }
d75cbb5e 2892 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
3d94ce60
PL
2893 if (ret < 0) {
2894 error_report("error getting block status at sector %" PRId64 ": %s",
2895 sector_num, strerror(-ret));
2896 return ret;
2897 }
d75cbb5e
PL
2898 if (ret & BDRV_BLOCK_ZERO) {
2899 sector_num += n;
2900 continue;
2901 }
2902 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2903 if (ret < 0) {
2904 error_report("error writing zeroes at sector %" PRId64 ": %s",
2905 sector_num, strerror(-ret));
2906 return ret;
2907 }
2908 sector_num += n;
2909 }
2910}
2911
a3ef6571 2912int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2913{
a3ef6571
KW
2914 QEMUIOVector qiov;
2915 struct iovec iov = {
2916 .iov_base = (void *)buf,
2917 .iov_len = bytes,
2918 };
9a8c4cce 2919 int ret;
83f64091 2920
a3ef6571
KW
2921 if (bytes < 0) {
2922 return -EINVAL;
83f64091
FB
2923 }
2924
a3ef6571
KW
2925 qemu_iovec_init_external(&qiov, &iov, 1);
2926 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2927 if (ret < 0) {
2928 return ret;
83f64091 2929 }
a3ef6571
KW
2930
2931 return bytes;
83f64091
FB
2932}
2933
8d3b1a2d 2934int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2935{
9a8c4cce 2936 int ret;
83f64091 2937
8407d5d7
KW
2938 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2939 if (ret < 0) {
2940 return ret;
83f64091
FB
2941 }
2942
8d3b1a2d
KW
2943 return qiov->size;
2944}
2945
2946int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2947 const void *buf, int bytes)
8d3b1a2d
KW
2948{
2949 QEMUIOVector qiov;
2950 struct iovec iov = {
2951 .iov_base = (void *) buf,
8407d5d7 2952 .iov_len = bytes,
8d3b1a2d
KW
2953 };
2954
8407d5d7
KW
2955 if (bytes < 0) {
2956 return -EINVAL;
2957 }
2958
8d3b1a2d
KW
2959 qemu_iovec_init_external(&qiov, &iov, 1);
2960 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2961}
83f64091 2962
f08145fe
KW
2963/*
2964 * Writes to the file and ensures that no writes are reordered across this
2965 * request (acts as a barrier)
2966 *
2967 * Returns 0 on success, -errno in error cases.
2968 */
2969int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2970 const void *buf, int count)
2971{
2972 int ret;
2973
2974 ret = bdrv_pwrite(bs, offset, buf, count);
2975 if (ret < 0) {
2976 return ret;
2977 }
2978
f05fa4ad
PB
2979 /* No flush needed for cache modes that already do it */
2980 if (bs->enable_write_cache) {
f08145fe
KW
2981 bdrv_flush(bs);
2982 }
2983
2984 return 0;
2985}
2986
470c0504 2987static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
2988 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2989{
2990 /* Perform I/O through a temporary buffer so that users who scribble over
2991 * their read buffer while the operation is in progress do not end up
2992 * modifying the image file. This is critical for zero-copy guest I/O
2993 * where anything might happen inside guest memory.
2994 */
2995 void *bounce_buffer;
2996
79c053bd 2997 BlockDriver *drv = bs->drv;
ab185921
SH
2998 struct iovec iov;
2999 QEMUIOVector bounce_qiov;
3000 int64_t cluster_sector_num;
3001 int cluster_nb_sectors;
3002 size_t skip_bytes;
3003 int ret;
3004
3005 /* Cover entire cluster so no additional backing file I/O is required when
3006 * allocating cluster in the image file.
3007 */
343bded4
PB
3008 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3009 &cluster_sector_num, &cluster_nb_sectors);
ab185921 3010
470c0504
SH
3011 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3012 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
3013
3014 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
857d4f46
KW
3015 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
3016 if (bounce_buffer == NULL) {
3017 ret = -ENOMEM;
3018 goto err;
3019 }
3020
ab185921
SH
3021 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3022
79c053bd
SH
3023 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3024 &bounce_qiov);
ab185921
SH
3025 if (ret < 0) {
3026 goto err;
3027 }
3028
79c053bd
SH
3029 if (drv->bdrv_co_write_zeroes &&
3030 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589 3031 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
aa7bfbff 3032 cluster_nb_sectors, 0);
79c053bd 3033 } else {
f05fa4ad
PB
3034 /* This does not change the data on the disk, it is not necessary
3035 * to flush even in cache=writethrough mode.
3036 */
79c053bd 3037 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 3038 &bounce_qiov);
79c053bd
SH
3039 }
3040
ab185921
SH
3041 if (ret < 0) {
3042 /* It might be okay to ignore write errors for guest requests. If this
3043 * is a deliberate copy-on-read then we don't want to ignore the error.
3044 * Simply report it in all cases.
3045 */
3046 goto err;
3047 }
3048
3049 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
03396148
MT
3050 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3051 nb_sectors * BDRV_SECTOR_SIZE);
ab185921
SH
3052
3053err:
3054 qemu_vfree(bounce_buffer);
3055 return ret;
3056}
3057
c5fbe571 3058/*
d0c7f642
KW
3059 * Forwards an already correctly aligned request to the BlockDriver. This
3060 * handles copy on read and zeroing after EOF; any other features must be
3061 * implemented by the caller.
c5fbe571 3062 */
d0c7f642 3063static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
65afd211 3064 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
ec746e10 3065 int64_t align, QEMUIOVector *qiov, int flags)
da1fa91d
KW
3066{
3067 BlockDriver *drv = bs->drv;
dbffbdcf 3068 int ret;
da1fa91d 3069
d0c7f642
KW
3070 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3071 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
da1fa91d 3072
d0c7f642
KW
3073 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3074 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3075 assert(!qiov || bytes == qiov->size);
d0c7f642
KW
3076
3077 /* Handle Copy on Read and associated serialisation */
470c0504 3078 if (flags & BDRV_REQ_COPY_ON_READ) {
7327145f
KW
3079 /* If we touch the same cluster it counts as an overlap. This
3080 * guarantees that allocating writes will be serialized and not race
3081 * with each other for the same cluster. For example, in copy-on-read
3082 * it ensures that the CoR read and write operations are atomic and
3083 * guest writes cannot interleave between them. */
3084 mark_request_serialising(req, bdrv_get_cluster_size(bs));
470c0504
SH
3085 }
3086
2dbafdc0 3087 wait_serialising_requests(req);
f4658285 3088
470c0504 3089 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
3090 int pnum;
3091
bdad13b9 3092 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
ab185921
SH
3093 if (ret < 0) {
3094 goto out;
3095 }
3096
3097 if (!ret || pnum != nb_sectors) {
470c0504 3098 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
3099 goto out;
3100 }
3101 }
3102
d0c7f642 3103 /* Forward the request to the BlockDriver */
c0191e76 3104 if (!bs->zero_beyond_eof) {
893a8f62
MK
3105 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3106 } else {
c0191e76 3107 /* Read zeros after EOF */
4049082c 3108 int64_t total_sectors, max_nb_sectors;
893a8f62 3109
4049082c
MA
3110 total_sectors = bdrv_nb_sectors(bs);
3111 if (total_sectors < 0) {
3112 ret = total_sectors;
893a8f62
MK
3113 goto out;
3114 }
3115
5f5bcd80
KW
3116 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3117 align >> BDRV_SECTOR_BITS);
e012b78c
PB
3118 if (nb_sectors < max_nb_sectors) {
3119 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3120 } else if (max_nb_sectors > 0) {
33f461e0 3121 QEMUIOVector local_qiov;
33f461e0
KW
3122
3123 qemu_iovec_init(&local_qiov, qiov->niov);
3124 qemu_iovec_concat(&local_qiov, qiov, 0,
e012b78c 3125 max_nb_sectors * BDRV_SECTOR_SIZE);
33f461e0 3126
e012b78c 3127 ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
33f461e0
KW
3128 &local_qiov);
3129
3130 qemu_iovec_destroy(&local_qiov);
893a8f62
MK
3131 } else {
3132 ret = 0;
3133 }
3134
3135 /* Reading beyond end of file is supposed to produce zeroes */
3136 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3137 uint64_t offset = MAX(0, total_sectors - sector_num);
3138 uint64_t bytes = (sector_num + nb_sectors - offset) *
3139 BDRV_SECTOR_SIZE;
3140 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3141 }
3142 }
ab185921
SH
3143
3144out:
dbffbdcf 3145 return ret;
da1fa91d
KW
3146}
3147
fc3959e4
FZ
3148static inline uint64_t bdrv_get_align(BlockDriverState *bs)
3149{
3150 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3151 return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3152}
3153
3154static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
3155 int64_t offset, size_t bytes)
3156{
3157 int64_t align = bdrv_get_align(bs);
3158 return !(offset & (align - 1) || (bytes & (align - 1)));
3159}
3160
d0c7f642
KW
3161/*
3162 * Handle a read request in coroutine context
3163 */
1b0288ae
KW
3164static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3165 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
d0c7f642
KW
3166 BdrvRequestFlags flags)
3167{
3168 BlockDriver *drv = bs->drv;
65afd211
KW
3169 BdrvTrackedRequest req;
3170
fc3959e4 3171 uint64_t align = bdrv_get_align(bs);
1b0288ae
KW
3172 uint8_t *head_buf = NULL;
3173 uint8_t *tail_buf = NULL;
3174 QEMUIOVector local_qiov;
3175 bool use_local_qiov = false;
d0c7f642
KW
3176 int ret;
3177
3178 if (!drv) {
3179 return -ENOMEDIUM;
3180 }
b9c64947
HR
3181
3182 ret = bdrv_check_byte_request(bs, offset, bytes);
3183 if (ret < 0) {
3184 return ret;
d0c7f642
KW
3185 }
3186
3187 if (bs->copy_on_read) {
3188 flags |= BDRV_REQ_COPY_ON_READ;
3189 }
3190
3191 /* throttling disk I/O */
3192 if (bs->io_limits_enabled) {
d5103588 3193 bdrv_io_limits_intercept(bs, bytes, false);
1b0288ae
KW
3194 }
3195
3196 /* Align read if necessary by padding qiov */
3197 if (offset & (align - 1)) {
3198 head_buf = qemu_blockalign(bs, align);
3199 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3200 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3201 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3202 use_local_qiov = true;
3203
3204 bytes += offset & (align - 1);
3205 offset = offset & ~(align - 1);
3206 }
3207
3208 if ((offset + bytes) & (align - 1)) {
3209 if (!use_local_qiov) {
3210 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3211 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3212 use_local_qiov = true;
3213 }
3214 tail_buf = qemu_blockalign(bs, align);
3215 qemu_iovec_add(&local_qiov, tail_buf,
3216 align - ((offset + bytes) & (align - 1)));
3217
3218 bytes = ROUND_UP(bytes, align);
3219 }
3220
65afd211 3221 tracked_request_begin(&req, bs, offset, bytes, false);
ec746e10 3222 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1b0288ae
KW
3223 use_local_qiov ? &local_qiov : qiov,
3224 flags);
65afd211 3225 tracked_request_end(&req);
1b0288ae
KW
3226
3227 if (use_local_qiov) {
3228 qemu_iovec_destroy(&local_qiov);
3229 qemu_vfree(head_buf);
3230 qemu_vfree(tail_buf);
d0c7f642
KW
3231 }
3232
d0c7f642
KW
3233 return ret;
3234}
3235
1b0288ae
KW
3236static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3237 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3238 BdrvRequestFlags flags)
3239{
75af1f34 3240 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1b0288ae
KW
3241 return -EINVAL;
3242 }
3243
3244 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3245 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3246}
3247
c5fbe571 3248int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3249 int nb_sectors, QEMUIOVector *qiov)
3250{
c5fbe571 3251 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3252
470c0504
SH
3253 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3254}
3255
3256int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3257 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3258{
3259 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3260
3261 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3262 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3263}
3264
98764152 3265#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
c31cb707 3266
f08f2dda 3267static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 3268 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
f08f2dda
SH
3269{
3270 BlockDriver *drv = bs->drv;
3271 QEMUIOVector qiov;
c31cb707
PL
3272 struct iovec iov = {0};
3273 int ret = 0;
f08f2dda 3274
75af1f34
PL
3275 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
3276 BDRV_REQUEST_MAX_SECTORS);
621f0589 3277
c31cb707
PL
3278 while (nb_sectors > 0 && !ret) {
3279 int num = nb_sectors;
3280
b8d71c09
PB
3281 /* Align request. Block drivers can expect the "bulk" of the request
3282 * to be aligned.
3283 */
3284 if (bs->bl.write_zeroes_alignment
3285 && num > bs->bl.write_zeroes_alignment) {
3286 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3287 /* Make a small request up to the first aligned sector. */
c31cb707 3288 num = bs->bl.write_zeroes_alignment;
b8d71c09
PB
3289 num -= sector_num % bs->bl.write_zeroes_alignment;
3290 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3291 /* Shorten the request to the last aligned sector. num cannot
3292 * underflow because num > bs->bl.write_zeroes_alignment.
3293 */
3294 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
c31cb707 3295 }
621f0589 3296 }
f08f2dda 3297
c31cb707
PL
3298 /* limit request size */
3299 if (num > max_write_zeroes) {
3300 num = max_write_zeroes;
3301 }
3302
3303 ret = -ENOTSUP;
3304 /* First try the efficient write zeroes operation */
3305 if (drv->bdrv_co_write_zeroes) {
3306 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3307 }
3308
3309 if (ret == -ENOTSUP) {
3310 /* Fall back to bounce buffer if write zeroes is unsupported */
095e4fa4 3311 int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
98764152 3312 MAX_WRITE_ZEROES_BOUNCE_BUFFER);
095e4fa4 3313 num = MIN(num, max_xfer_len);
c31cb707
PL
3314 iov.iov_len = num * BDRV_SECTOR_SIZE;
3315 if (iov.iov_base == NULL) {
857d4f46
KW
3316 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3317 if (iov.iov_base == NULL) {
3318 ret = -ENOMEM;
3319 goto fail;
3320 }
b8d71c09 3321 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
c31cb707
PL
3322 }
3323 qemu_iovec_init_external(&qiov, &iov, 1);
f08f2dda 3324
c31cb707 3325 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
b8d71c09
PB
3326
3327 /* Keep bounce buffer around if it is big enough for all
3328 * all future requests.
3329 */
095e4fa4 3330 if (num < max_xfer_len) {
b8d71c09
PB
3331 qemu_vfree(iov.iov_base);
3332 iov.iov_base = NULL;
3333 }
c31cb707
PL
3334 }
3335
3336 sector_num += num;
3337 nb_sectors -= num;
3338 }
f08f2dda 3339
857d4f46 3340fail:
f08f2dda
SH
3341 qemu_vfree(iov.iov_base);
3342 return ret;
3343}
3344
c5fbe571 3345/*
b404f720 3346 * Forwards an already correctly aligned write request to the BlockDriver.
c5fbe571 3347 */
b404f720 3348static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
65afd211
KW
3349 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3350 QEMUIOVector *qiov, int flags)
c5fbe571
SH
3351{
3352 BlockDriver *drv = bs->drv;
28de2dcd 3353 bool waited;
6b7cb247 3354 int ret;
da1fa91d 3355
b404f720
KW
3356 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3357 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
f4658285 3358
b404f720
KW
3359 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3360 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
8eb029c2 3361 assert(!qiov || bytes == qiov->size);
cc0681c4 3362
28de2dcd
KW
3363 waited = wait_serialising_requests(req);
3364 assert(!waited || !req->serialising);
af91f9a7
KW
3365 assert(req->overlap_offset <= offset);
3366 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
244eadef 3367
65afd211 3368 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
d616b224 3369
465bee1d
PL
3370 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3371 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3372 qemu_iovec_is_zero(qiov)) {
3373 flags |= BDRV_REQ_ZERO_WRITE;
3374 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3375 flags |= BDRV_REQ_MAY_UNMAP;
3376 }
3377 }
3378
d616b224
SH
3379 if (ret < 0) {
3380 /* Do nothing, write notifier decided to fail this request */
3381 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9e1cb96d 3382 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
aa7bfbff 3383 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3384 } else {
9e1cb96d 3385 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
f08f2dda
SH
3386 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3387 }
9e1cb96d 3388 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
6b7cb247 3389
f05fa4ad
PB
3390 if (ret == 0 && !bs->enable_write_cache) {
3391 ret = bdrv_co_flush(bs);
3392 }
3393
e4654d2d 3394 bdrv_set_dirty(bs, sector_num, nb_sectors);
da1fa91d 3395
5366d0c8 3396 block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
5e5a94b6 3397
c0191e76 3398 if (ret >= 0) {
df2a6f29
PB
3399 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3400 }
da1fa91d 3401
6b7cb247 3402 return ret;
da1fa91d
KW
3403}
3404
b404f720
KW
3405/*
3406 * Handle a write request in coroutine context
3407 */
6601553e
KW
3408static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3409 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
b404f720
KW
3410 BdrvRequestFlags flags)
3411{
65afd211 3412 BdrvTrackedRequest req;
fc3959e4 3413 uint64_t align = bdrv_get_align(bs);
3b8242e0
KW
3414 uint8_t *head_buf = NULL;
3415 uint8_t *tail_buf = NULL;
3416 QEMUIOVector local_qiov;
3417 bool use_local_qiov = false;
b404f720
KW
3418 int ret;
3419
3420 if (!bs->drv) {
3421 return -ENOMEDIUM;
3422 }
3423 if (bs->read_only) {
3424 return -EACCES;
3425 }
b9c64947
HR
3426
3427 ret = bdrv_check_byte_request(bs, offset, bytes);
3428 if (ret < 0) {
3429 return ret;
b404f720
KW
3430 }
3431
b404f720
KW
3432 /* throttling disk I/O */
3433 if (bs->io_limits_enabled) {
d5103588 3434 bdrv_io_limits_intercept(bs, bytes, true);
b404f720
KW
3435 }
3436
3b8242e0
KW
3437 /*
3438 * Align write if necessary by performing a read-modify-write cycle.
3439 * Pad qiov with the read parts and be sure to have a tracked request not
3440 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3441 */
65afd211 3442 tracked_request_begin(&req, bs, offset, bytes, true);
3b8242e0
KW
3443
3444 if (offset & (align - 1)) {
3445 QEMUIOVector head_qiov;
3446 struct iovec head_iov;
3447
3448 mark_request_serialising(&req, align);
3449 wait_serialising_requests(&req);
3450
3451 head_buf = qemu_blockalign(bs, align);
3452 head_iov = (struct iovec) {
3453 .iov_base = head_buf,
3454 .iov_len = align,
3455 };
3456 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3457
9e1cb96d 3458 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3b8242e0
KW
3459 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3460 align, &head_qiov, 0);
3461 if (ret < 0) {
3462 goto fail;
3463 }
9e1cb96d 3464 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3b8242e0
KW
3465
3466 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3467 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3468 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3469 use_local_qiov = true;
3470
3471 bytes += offset & (align - 1);
3472 offset = offset & ~(align - 1);
3473 }
3474
3475 if ((offset + bytes) & (align - 1)) {
3476 QEMUIOVector tail_qiov;
3477 struct iovec tail_iov;
3478 size_t tail_bytes;
28de2dcd 3479 bool waited;
3b8242e0
KW
3480
3481 mark_request_serialising(&req, align);
28de2dcd
KW
3482 waited = wait_serialising_requests(&req);
3483 assert(!waited || !use_local_qiov);
3b8242e0
KW
3484
3485 tail_buf = qemu_blockalign(bs, align);
3486 tail_iov = (struct iovec) {
3487 .iov_base = tail_buf,
3488 .iov_len = align,
3489 };
3490 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3491
9e1cb96d 3492 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3b8242e0
KW
3493 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3494 align, &tail_qiov, 0);
3495 if (ret < 0) {
3496 goto fail;
3497 }
9e1cb96d 3498 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3b8242e0
KW
3499
3500 if (!use_local_qiov) {
3501 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3502 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3503 use_local_qiov = true;
3504 }
3505
3506 tail_bytes = (offset + bytes) & (align - 1);
3507 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3508
3509 bytes = ROUND_UP(bytes, align);
3510 }
3511
fc3959e4
FZ
3512 if (use_local_qiov) {
3513 /* Local buffer may have non-zero data. */
3514 flags &= ~BDRV_REQ_ZERO_WRITE;
3515 }
3b8242e0
KW
3516 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3517 use_local_qiov ? &local_qiov : qiov,
3518 flags);
3519
3520fail:
65afd211 3521 tracked_request_end(&req);
b404f720 3522
3b8242e0
KW
3523 if (use_local_qiov) {
3524 qemu_iovec_destroy(&local_qiov);
3b8242e0 3525 }
99c4a85c
KW
3526 qemu_vfree(head_buf);
3527 qemu_vfree(tail_buf);
3b8242e0 3528
b404f720
KW
3529 return ret;
3530}
3531
6601553e
KW
3532static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3533 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3534 BdrvRequestFlags flags)
3535{
75af1f34 3536 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
6601553e
KW
3537 return -EINVAL;
3538 }
3539
3540 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3541 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3542}
3543
c5fbe571
SH
3544int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3545 int nb_sectors, QEMUIOVector *qiov)
3546{
3547 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3548
f08f2dda
SH
3549 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3550}
3551
3552int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3553 int64_t sector_num, int nb_sectors,
3554 BdrvRequestFlags flags)
f08f2dda 3555{
fc3959e4
FZ
3556 int ret;
3557
94d6ff21 3558 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3559
d32f35cb
PL
3560 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3561 flags &= ~BDRV_REQ_MAY_UNMAP;
3562 }
fc3959e4
FZ
3563 if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
3564 nb_sectors << BDRV_SECTOR_BITS)) {
3565 ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3566 BDRV_REQ_ZERO_WRITE | flags);
3567 } else {
3568 uint8_t *buf;
3569 QEMUIOVector local_qiov;
3570 size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
3571
3572 buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
3573 memset(buf, 0, bytes);
3574 qemu_iovec_init(&local_qiov, 1);
3575 qemu_iovec_add(&local_qiov, buf, bytes);
d32f35cb 3576
fc3959e4
FZ
3577 ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
3578 BDRV_REQ_ZERO_WRITE | flags);
3579 qemu_vfree(buf);
3580 }
3581 return ret;
c5fbe571
SH
3582}
3583
83f64091
FB
3584/**
3585 * Truncate file to 'offset' bytes (needed only for file protocols)
3586 */
3587int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3588{
3589 BlockDriver *drv = bs->drv;
51762288 3590 int ret;
83f64091 3591 if (!drv)
19cb3738 3592 return -ENOMEDIUM;
83f64091
FB
3593 if (!drv->bdrv_truncate)
3594 return -ENOTSUP;
59f2689d
NS
3595 if (bs->read_only)
3596 return -EACCES;
9c75e168 3597
51762288
SH
3598 ret = drv->bdrv_truncate(bs, offset);
3599 if (ret == 0) {
3600 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
a7f53e26
MA
3601 if (bs->blk) {
3602 blk_dev_resize_cb(bs->blk);
3603 }
51762288
SH
3604 }
3605 return ret;
83f64091
FB
3606}
3607
4a1d5e1f
FZ
3608/**
3609 * Length of a allocated file in bytes. Sparse files are counted by actual
3610 * allocated space. Return < 0 if error or unknown.
3611 */
3612int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3613{
3614 BlockDriver *drv = bs->drv;
3615 if (!drv) {
3616 return -ENOMEDIUM;
3617 }
3618 if (drv->bdrv_get_allocated_file_size) {
3619 return drv->bdrv_get_allocated_file_size(bs);
3620 }
3621 if (bs->file) {
3622 return bdrv_get_allocated_file_size(bs->file);
3623 }
3624 return -ENOTSUP;
3625}
3626
83f64091 3627/**
65a9bb25 3628 * Return number of sectors on success, -errno on error.
83f64091 3629 */
65a9bb25 3630int64_t bdrv_nb_sectors(BlockDriverState *bs)
83f64091
FB
3631{
3632 BlockDriver *drv = bs->drv;
65a9bb25 3633
83f64091 3634 if (!drv)
19cb3738 3635 return -ENOMEDIUM;
51762288 3636
b94a2610
KW
3637 if (drv->has_variable_length) {
3638 int ret = refresh_total_sectors(bs, bs->total_sectors);
3639 if (ret < 0) {
3640 return ret;
46a4e4e6 3641 }
83f64091 3642 }
65a9bb25
MA
3643 return bs->total_sectors;
3644}
3645
3646/**
3647 * Return length in bytes on success, -errno on error.
3648 * The length is always a multiple of BDRV_SECTOR_SIZE.
3649 */
3650int64_t bdrv_getlength(BlockDriverState *bs)
3651{
3652 int64_t ret = bdrv_nb_sectors(bs);
3653
3654 return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3655}
3656
19cb3738 3657/* return 0 as number of sectors if no device present or error */
96b8f136 3658void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3659{
65a9bb25
MA
3660 int64_t nb_sectors = bdrv_nb_sectors(bs);
3661
3662 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
fc01f7e7 3663}
cf98951b 3664
ff06f5f3
PB
3665void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3666 BlockdevOnError on_write_error)
abd7f68d
MA
3667{
3668 bs->on_read_error = on_read_error;
3669 bs->on_write_error = on_write_error;
3670}
3671
1ceee0d5 3672BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3673{
3674 return is_read ? bs->on_read_error : bs->on_write_error;
3675}
3676
3e1caa5f
PB
3677BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3678{
3679 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3680
3681 switch (on_err) {
3682 case BLOCKDEV_ON_ERROR_ENOSPC:
a589569f
WX
3683 return (error == ENOSPC) ?
3684 BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3685 case BLOCKDEV_ON_ERROR_STOP:
a589569f 3686 return BLOCK_ERROR_ACTION_STOP;
3e1caa5f 3687 case BLOCKDEV_ON_ERROR_REPORT:
a589569f 3688 return BLOCK_ERROR_ACTION_REPORT;
3e1caa5f 3689 case BLOCKDEV_ON_ERROR_IGNORE:
a589569f 3690 return BLOCK_ERROR_ACTION_IGNORE;
3e1caa5f
PB
3691 default:
3692 abort();
3693 }
3694}
3695
c7c2ff0c
LC
3696static void send_qmp_error_event(BlockDriverState *bs,
3697 BlockErrorAction action,
3698 bool is_read, int error)
3699{
573742a5 3700 IoOperationType optype;
c7c2ff0c 3701
573742a5
PM
3702 optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3703 qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
c7c2ff0c 3704 bdrv_iostatus_is_enabled(bs),
624ff573
LC
3705 error == ENOSPC, strerror(error),
3706 &error_abort);
c7c2ff0c
LC
3707}
3708
3e1caa5f
PB
3709/* This is done by device models because, while the block layer knows
3710 * about the error, it does not know whether an operation comes from
3711 * the device or the block layer (from a job, for example).
3712 */
3713void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3714 bool is_read, int error)
3715{
3716 assert(error >= 0);
2bd3bce8 3717
a589569f 3718 if (action == BLOCK_ERROR_ACTION_STOP) {
2bd3bce8
PB
3719 /* First set the iostatus, so that "info block" returns an iostatus
3720 * that matches the events raised so far (an additional error iostatus
3721 * is fine, but not a lost one).
3722 */
3e1caa5f 3723 bdrv_iostatus_set_err(bs, error);
2bd3bce8
PB
3724
3725 /* Then raise the request to stop the VM and the event.
3726 * qemu_system_vmstop_request_prepare has two effects. First,
3727 * it ensures that the STOP event always comes after the
3728 * BLOCK_IO_ERROR event. Second, it ensures that even if management
3729 * can observe the STOP event and do a "cont" before the STOP
3730 * event is issued, the VM will not stop. In this case, vm_start()
3731 * also ensures that the STOP/RESUME pair of events is emitted.
3732 */
3733 qemu_system_vmstop_request_prepare();
c7c2ff0c 3734 send_qmp_error_event(bs, action, is_read, error);
2bd3bce8
PB
3735 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3736 } else {
c7c2ff0c 3737 send_qmp_error_event(bs, action, is_read, error);
3e1caa5f
PB
3738 }
3739}
3740
b338082b
FB
3741int bdrv_is_read_only(BlockDriverState *bs)
3742{
3743 return bs->read_only;
3744}
3745
985a03b0
TS
3746int bdrv_is_sg(BlockDriverState *bs)
3747{
3748 return bs->sg;
3749}
3750
e900a7b7
CH
3751int bdrv_enable_write_cache(BlockDriverState *bs)
3752{
3753 return bs->enable_write_cache;
3754}
3755
425b0148
PB
3756void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3757{
3758 bs->enable_write_cache = wce;
55b110f2
JC
3759
3760 /* so a reopen() will preserve wce */
3761 if (wce) {
3762 bs->open_flags |= BDRV_O_CACHE_WB;
3763 } else {
3764 bs->open_flags &= ~BDRV_O_CACHE_WB;
3765 }
425b0148
PB
3766}
3767
ea2384d3
FB
3768int bdrv_is_encrypted(BlockDriverState *bs)
3769{
3770 if (bs->backing_hd && bs->backing_hd->encrypted)
3771 return 1;
3772 return bs->encrypted;
3773}
3774
c0f4ce77
AL
3775int bdrv_key_required(BlockDriverState *bs)
3776{
3777 BlockDriverState *backing_hd = bs->backing_hd;
3778
3779 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3780 return 1;
3781 return (bs->encrypted && !bs->valid_key);
3782}
3783
ea2384d3
FB
3784int bdrv_set_key(BlockDriverState *bs, const char *key)
3785{
3786 int ret;
3787 if (bs->backing_hd && bs->backing_hd->encrypted) {
3788 ret = bdrv_set_key(bs->backing_hd, key);
3789 if (ret < 0)
3790 return ret;
3791 if (!bs->encrypted)
3792 return 0;
3793 }
fd04a2ae
SH
3794 if (!bs->encrypted) {
3795 return -EINVAL;
3796 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3797 return -ENOMEDIUM;
3798 }
c0f4ce77 3799 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3800 if (ret < 0) {
3801 bs->valid_key = 0;
3802 } else if (!bs->valid_key) {
3803 bs->valid_key = 1;
a7f53e26
MA
3804 if (bs->blk) {
3805 /* call the change callback now, we skipped it on open */
3806 blk_dev_change_media_cb(bs->blk, true);
3807 }
bb5fc20f 3808 }
c0f4ce77 3809 return ret;
ea2384d3
FB
3810}
3811
4d2855a3
MA
3812/*
3813 * Provide an encryption key for @bs.
3814 * If @key is non-null:
3815 * If @bs is not encrypted, fail.
3816 * Else if the key is invalid, fail.
3817 * Else set @bs's key to @key, replacing the existing key, if any.
3818 * If @key is null:
3819 * If @bs is encrypted and still lacks a key, fail.
3820 * Else do nothing.
3821 * On failure, store an error object through @errp if non-null.
3822 */
3823void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
3824{
3825 if (key) {
3826 if (!bdrv_is_encrypted(bs)) {
81e5f78a
AG
3827 error_setg(errp, "Node '%s' is not encrypted",
3828 bdrv_get_device_or_node_name(bs));
4d2855a3
MA
3829 } else if (bdrv_set_key(bs, key) < 0) {
3830 error_set(errp, QERR_INVALID_PASSWORD);
3831 }
3832 } else {
3833 if (bdrv_key_required(bs)) {
b1ca6391
MA
3834 error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
3835 "'%s' (%s) is encrypted",
81e5f78a 3836 bdrv_get_device_or_node_name(bs),
4d2855a3
MA
3837 bdrv_get_encrypted_filename(bs));
3838 }
3839 }
3840}
3841
f8d6bba1 3842const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3843{
f8d6bba1 3844 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3845}
3846
ada42401
SH
/*
 * qsort() comparator for an array of C strings.
 *
 * qsort() passes pointers to the array elements, so @a and @b are pointers
 * to 'const char *' and must be dereferenced before the strings they refer
 * to can be compared.
 *
 * Returns the strcmp() ordering of the two strings.
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *const *lhs = a;
    const char *const *rhs = b;

    return strcmp(*lhs, *rhs);
}
3851
5fafdf24 3852void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3853 void *opaque)
3854{
3855 BlockDriver *drv;
e855e4fb 3856 int count = 0;
ada42401 3857 int i;
e855e4fb 3858 const char **formats = NULL;
ea2384d3 3859
8a22f02a 3860 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3861 if (drv->format_name) {
3862 bool found = false;
3863 int i = count;
3864 while (formats && i && !found) {
3865 found = !strcmp(formats[--i], drv->format_name);
3866 }
3867
3868 if (!found) {
5839e53b 3869 formats = g_renew(const char *, formats, count + 1);
e855e4fb 3870 formats[count++] = drv->format_name;
e855e4fb
JC
3871 }
3872 }
ea2384d3 3873 }
ada42401
SH
3874
3875 qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3876
3877 for (i = 0; i < count; i++) {
3878 it(opaque, formats[i]);
3879 }
3880
e855e4fb 3881 g_free(formats);
ea2384d3
FB
3882}
3883
dc364f4c
BC
3884/* This function is to find a node in the bs graph */
3885BlockDriverState *bdrv_find_node(const char *node_name)
3886{
3887 BlockDriverState *bs;
3888
3889 assert(node_name);
3890
3891 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3892 if (!strcmp(node_name, bs->node_name)) {
3893 return bs;
3894 }
3895 }
3896 return NULL;
3897}
3898
c13163fb 3899/* Put this QMP function here so it can access the static graph_bdrv_states. */
d5a8ee60 3900BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
c13163fb
BC
3901{
3902 BlockDeviceInfoList *list, *entry;
3903 BlockDriverState *bs;
3904
3905 list = NULL;
3906 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
d5a8ee60
AG
3907 BlockDeviceInfo *info = bdrv_block_device_info(bs, errp);
3908 if (!info) {
3909 qapi_free_BlockDeviceInfoList(list);
3910 return NULL;
3911 }
c13163fb 3912 entry = g_malloc0(sizeof(*entry));
d5a8ee60 3913 entry->value = info;
c13163fb
BC
3914 entry->next = list;
3915 list = entry;
3916 }
3917
3918 return list;
3919}
3920
12d3ba82
BC
3921BlockDriverState *bdrv_lookup_bs(const char *device,
3922 const char *node_name,
3923 Error **errp)
3924{
7f06d47e
MA
3925 BlockBackend *blk;
3926 BlockDriverState *bs;
12d3ba82 3927
12d3ba82 3928 if (device) {
7f06d47e 3929 blk = blk_by_name(device);
12d3ba82 3930
7f06d47e
MA
3931 if (blk) {
3932 return blk_bs(blk);
12d3ba82 3933 }
12d3ba82
BC
3934 }
3935
dd67fa50
BC
3936 if (node_name) {
3937 bs = bdrv_find_node(node_name);
12d3ba82 3938
dd67fa50
BC
3939 if (bs) {
3940 return bs;
3941 }
12d3ba82
BC
3942 }
3943
dd67fa50
BC
3944 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3945 device ? device : "",
3946 node_name ? node_name : "");
3947 return NULL;
12d3ba82
BC
3948}
3949
5a6684d2
JC
3950/* If 'base' is in the same chain as 'top', return true. Otherwise,
3951 * return false. If either argument is NULL, return false. */
3952bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3953{
3954 while (top && top != base) {
3955 top = top->backing_hd;
3956 }
3957
3958 return top != NULL;
3959}
3960
04df765a
FZ
3961BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3962{
3963 if (!bs) {
3964 return QTAILQ_FIRST(&graph_bdrv_states);
3965 }
3966 return QTAILQ_NEXT(bs, node_list);
3967}
3968
2f399b0a
MA
3969BlockDriverState *bdrv_next(BlockDriverState *bs)
3970{
3971 if (!bs) {
3972 return QTAILQ_FIRST(&bdrv_states);
3973 }
dc364f4c 3974 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3975}
3976
20a9e77d
FZ
3977const char *bdrv_get_node_name(const BlockDriverState *bs)
3978{
3979 return bs->node_name;
3980}
3981
7f06d47e 3982/* TODO check what callers really want: bs->node_name or blk_name() */
bfb197e0 3983const char *bdrv_get_device_name(const BlockDriverState *bs)
ea2384d3 3984{
bfb197e0 3985 return bs->blk ? blk_name(bs->blk) : "";
ea2384d3
FB
3986}
3987
9b2aa84f
AG
3988/* This can be used to identify nodes that might not have a device
3989 * name associated. Since node and device names live in the same
3990 * namespace, the result is unambiguous. The exception is if both are
3991 * absent, then this returns an empty (non-null) string. */
3992const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
3993{
3994 return bs->blk ? blk_name(bs->blk) : bs->node_name;
3995}
3996
c8433287
MA
3997int bdrv_get_flags(BlockDriverState *bs)
3998{
3999 return bs->open_flags;
4000}
4001
f0f0fdfe 4002int bdrv_flush_all(void)
c6ca28d6
AL
4003{
4004 BlockDriverState *bs;
f0f0fdfe 4005 int result = 0;
c6ca28d6 4006
dc364f4c 4007 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
4008 AioContext *aio_context = bdrv_get_aio_context(bs);
4009 int ret;
4010
4011 aio_context_acquire(aio_context);
4012 ret = bdrv_flush(bs);
f0f0fdfe
KW
4013 if (ret < 0 && !result) {
4014 result = ret;
4015 }
ed78cda3 4016 aio_context_release(aio_context);
1b7bdbc1 4017 }
f0f0fdfe
KW
4018
4019 return result;
c6ca28d6
AL
4020}
4021
3ac21627
PL
4022int bdrv_has_zero_init_1(BlockDriverState *bs)
4023{
4024 return 1;
4025}
4026
f2feebbd
KW
4027int bdrv_has_zero_init(BlockDriverState *bs)
4028{
4029 assert(bs->drv);
4030
11212d8f
PB
4031 /* If BS is a copy on write image, it is initialized to
4032 the contents of the base image, which may not be zeroes. */
4033 if (bs->backing_hd) {
4034 return 0;
4035 }
336c1c12
KW
4036 if (bs->drv->bdrv_has_zero_init) {
4037 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
4038 }
4039
3ac21627
PL
4040 /* safe default */
4041 return 0;
f2feebbd
KW
4042}
4043
4ce78691
PL
4044bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
4045{
4046 BlockDriverInfo bdi;
4047
4048 if (bs->backing_hd) {
4049 return false;
4050 }
4051
4052 if (bdrv_get_info(bs, &bdi) == 0) {
4053 return bdi.unallocated_blocks_are_zero;
4054 }
4055
4056 return false;
4057}
4058
4059bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
4060{
4061 BlockDriverInfo bdi;
4062
4063 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
4064 return false;
4065 }
4066
4067 if (bdrv_get_info(bs, &bdi) == 0) {
4068 return bdi.can_write_zeroes_with_unmap;
4069 }
4070
4071 return false;
4072}
4073
b6b8a333 4074typedef struct BdrvCoGetBlockStatusData {
376ae3f1 4075 BlockDriverState *bs;
b35b2bba 4076 BlockDriverState *base;
376ae3f1
SH
4077 int64_t sector_num;
4078 int nb_sectors;
4079 int *pnum;
b6b8a333 4080 int64_t ret;
376ae3f1 4081 bool done;
b6b8a333 4082} BdrvCoGetBlockStatusData;
376ae3f1 4083
f58c7b35 4084/*
705be728
FZ
4085 * Returns the allocation status of the specified sectors.
4086 * Drivers not implementing the functionality are assumed to not support
4087 * backing files, hence all their sectors are reported as allocated.
f58c7b35 4088 *
bd9533e3
SH
4089 * If 'sector_num' is beyond the end of the disk image the return value is 0
4090 * and 'pnum' is set to 0.
4091 *
f58c7b35
TS
4092 * 'pnum' is set to the number of sectors (including and immediately following
4093 * the specified sector) that are known to be in the same
4094 * allocated/unallocated state.
4095 *
bd9533e3
SH
4096 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
4097 * beyond the end of the disk image it will be clamped.
f58c7b35 4098 */
b6b8a333
PB
4099static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4100 int64_t sector_num,
4101 int nb_sectors, int *pnum)
f58c7b35 4102{
30a7f2fc 4103 int64_t total_sectors;
bd9533e3 4104 int64_t n;
5daa74a6 4105 int64_t ret, ret2;
bd9533e3 4106
30a7f2fc
MA
4107 total_sectors = bdrv_nb_sectors(bs);
4108 if (total_sectors < 0) {
4109 return total_sectors;
617ccb46
PB
4110 }
4111
30a7f2fc 4112 if (sector_num >= total_sectors) {
bd9533e3
SH
4113 *pnum = 0;
4114 return 0;
4115 }
4116
30a7f2fc 4117 n = total_sectors - sector_num;
bd9533e3
SH
4118 if (n < nb_sectors) {
4119 nb_sectors = n;
4120 }
4121
b6b8a333 4122 if (!bs->drv->bdrv_co_get_block_status) {
bd9533e3 4123 *pnum = nb_sectors;
e88ae226 4124 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
918e92d7
PB
4125 if (bs->drv->protocol_name) {
4126 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4127 }
4128 return ret;
f58c7b35 4129 }
6aebab14 4130
415b5b01
PB
4131 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4132 if (ret < 0) {
3e0a233d 4133 *pnum = 0;
415b5b01
PB
4134 return ret;
4135 }
4136
92bc50a5
PL
4137 if (ret & BDRV_BLOCK_RAW) {
4138 assert(ret & BDRV_BLOCK_OFFSET_VALID);
4139 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4140 *pnum, pnum);
4141 }
4142
e88ae226
KW
4143 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4144 ret |= BDRV_BLOCK_ALLOCATED;
4145 }
4146
c3d86884
PL
4147 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4148 if (bdrv_unallocated_blocks_are_zero(bs)) {
f0ad5712 4149 ret |= BDRV_BLOCK_ZERO;
1f9db224 4150 } else if (bs->backing_hd) {
f0ad5712 4151 BlockDriverState *bs2 = bs->backing_hd;
30a7f2fc
MA
4152 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4153 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
f0ad5712
PB
4154 ret |= BDRV_BLOCK_ZERO;
4155 }
4156 }
415b5b01 4157 }
5daa74a6
PB
4158
4159 if (bs->file &&
4160 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4161 (ret & BDRV_BLOCK_OFFSET_VALID)) {
59c9a95f
HR
4162 int file_pnum;
4163
5daa74a6 4164 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
59c9a95f 4165 *pnum, &file_pnum);
5daa74a6
PB
4166 if (ret2 >= 0) {
4167 /* Ignore errors. This is just providing extra information, it
4168 * is useful but not necessary.
4169 */
59c9a95f
HR
4170 if (!file_pnum) {
4171 /* !file_pnum indicates an offset at or beyond the EOF; it is
4172 * perfectly valid for the format block driver to point to such
4173 * offsets, so catch it and mark everything as zero */
4174 ret |= BDRV_BLOCK_ZERO;
4175 } else {
4176 /* Limit request to the range reported by the protocol driver */
4177 *pnum = file_pnum;
4178 ret |= (ret2 & BDRV_BLOCK_ZERO);
4179 }
5daa74a6
PB
4180 }
4181 }
4182
415b5b01 4183 return ret;
060f51c9
SH
4184}
4185
b6b8a333
PB
4186/* Coroutine wrapper for bdrv_get_block_status() */
4187static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 4188{
b6b8a333 4189 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
4190 BlockDriverState *bs = data->bs;
4191
b6b8a333
PB
4192 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4193 data->pnum);
060f51c9
SH
4194 data->done = true;
4195}
4196
4197/*
b6b8a333 4198 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 4199 *
b6b8a333 4200 * See bdrv_co_get_block_status() for details.
060f51c9 4201 */
b6b8a333
PB
4202int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4203 int nb_sectors, int *pnum)
060f51c9 4204{
6aebab14 4205 Coroutine *co;
b6b8a333 4206 BdrvCoGetBlockStatusData data = {
6aebab14
SH
4207 .bs = bs,
4208 .sector_num = sector_num,
4209 .nb_sectors = nb_sectors,
4210 .pnum = pnum,
4211 .done = false,
4212 };
4213
bdad13b9
PB
4214 if (qemu_in_coroutine()) {
4215 /* Fast-path if already in coroutine context */
b6b8a333 4216 bdrv_get_block_status_co_entry(&data);
bdad13b9 4217 } else {
2572b37a
SH
4218 AioContext *aio_context = bdrv_get_aio_context(bs);
4219
b6b8a333 4220 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
4221 qemu_coroutine_enter(co, &data);
4222 while (!data.done) {
2572b37a 4223 aio_poll(aio_context, true);
bdad13b9 4224 }
6aebab14
SH
4225 }
4226 return data.ret;
f58c7b35
TS
4227}
4228
b6b8a333
PB
4229int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4230 int nb_sectors, int *pnum)
4231{
4333bb71
PB
4232 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4233 if (ret < 0) {
4234 return ret;
4235 }
01fb2705 4236 return !!(ret & BDRV_BLOCK_ALLOCATED);
b6b8a333
PB
4237}
4238
188a7bbf
PB
4239/*
4240 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4241 *
4242 * Return true if the given sector is allocated in any image between
4243 * BASE and TOP (inclusive). BASE can be NULL to check if the given
4244 * sector is allocated in any image of the chain. Return false otherwise.
4245 *
4246 * 'pnum' is set to the number of sectors (including and immediately following
4247 * the specified sector) that are known to be in the same
4248 * allocated/unallocated state.
4249 *
4250 */
4f578637
PB
4251int bdrv_is_allocated_above(BlockDriverState *top,
4252 BlockDriverState *base,
4253 int64_t sector_num,
4254 int nb_sectors, int *pnum)
188a7bbf
PB
4255{
4256 BlockDriverState *intermediate;
4257 int ret, n = nb_sectors;
4258
4259 intermediate = top;
4260 while (intermediate && intermediate != base) {
4261 int pnum_inter;
bdad13b9
PB
4262 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4263 &pnum_inter);
188a7bbf
PB
4264 if (ret < 0) {
4265 return ret;
4266 } else if (ret) {
4267 *pnum = pnum_inter;
4268 return 1;
4269 }
4270
4271 /*
4272 * [sector_num, nb_sectors] is unallocated on top but intermediate
4273 * might have
4274 *
4275 * [sector_num+x, nr_sectors] allocated.
4276 */
63ba17d3
VI
4277 if (n > pnum_inter &&
4278 (intermediate == top ||
4279 sector_num + pnum_inter < intermediate->total_sectors)) {
188a7bbf
PB
4280 n = pnum_inter;
4281 }
4282
4283 intermediate = intermediate->backing_hd;
4284 }
4285
4286 *pnum = n;
4287 return 0;
4288}
4289
045df330
AL
4290const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4291{
4292 if (bs->backing_hd && bs->backing_hd->encrypted)
4293 return bs->backing_file;
4294 else if (bs->encrypted)
4295 return bs->filename;
4296 else
4297 return NULL;
4298}
4299
5fafdf24 4300void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
4301 char *filename, int filename_size)
4302{
3574c608 4303 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
4304}
4305
5fafdf24 4306int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
4307 const uint8_t *buf, int nb_sectors)
4308{
4309 BlockDriver *drv = bs->drv;
b9c64947
HR
4310 int ret;
4311
4312 if (!drv) {
19cb3738 4313 return -ENOMEDIUM;
b9c64947
HR
4314 }
4315 if (!drv->bdrv_write_compressed) {
faea38e7 4316 return -ENOTSUP;
b9c64947
HR
4317 }
4318 ret = bdrv_check_request(bs, sector_num, nb_sectors);
4319 if (ret < 0) {
4320 return ret;
4321 }
a55eb92c 4322
e4654d2d 4323 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
a55eb92c 4324
faea38e7
FB
4325 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4326}
3b46e624 4327
faea38e7
FB
4328int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4329{
4330 BlockDriver *drv = bs->drv;
4331 if (!drv)
19cb3738 4332 return -ENOMEDIUM;
faea38e7
FB
4333 if (!drv->bdrv_get_info)
4334 return -ENOTSUP;
4335 memset(bdi, 0, sizeof(*bdi));
4336 return drv->bdrv_get_info(bs, bdi);
4337}
4338
eae041fe
HR
4339ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4340{
4341 BlockDriver *drv = bs->drv;
4342 if (drv && drv->bdrv_get_specific_info) {
4343 return drv->bdrv_get_specific_info(bs);
4344 }
4345 return NULL;
4346}
4347
45566e9c
CH
4348int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4349 int64_t pos, int size)
cf8074b3
KW
4350{
4351 QEMUIOVector qiov;
4352 struct iovec iov = {
4353 .iov_base = (void *) buf,
4354 .iov_len = size,
4355 };
4356
4357 qemu_iovec_init_external(&qiov, &iov, 1);
4358 return bdrv_writev_vmstate(bs, &qiov, pos);
4359}
4360
4361int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
178e08a5
AL
4362{
4363 BlockDriver *drv = bs->drv;
cf8074b3
KW
4364
4365 if (!drv) {
178e08a5 4366 return -ENOMEDIUM;
cf8074b3
KW
4367 } else if (drv->bdrv_save_vmstate) {
4368 return drv->bdrv_save_vmstate(bs, qiov, pos);
4369 } else if (bs->file) {
4370 return bdrv_writev_vmstate(bs->file, qiov, pos);
4371 }
4372
7cdb1f6d 4373 return -ENOTSUP;
178e08a5
AL
4374}
4375
45566e9c
CH
4376int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4377 int64_t pos, int size)
178e08a5
AL
4378{
4379 BlockDriver *drv = bs->drv;
4380 if (!drv)
4381 return -ENOMEDIUM;
7cdb1f6d
MK
4382 if (drv->bdrv_load_vmstate)
4383 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4384 if (bs->file)
4385 return bdrv_load_vmstate(bs->file, buf, pos, size);
4386 return -ENOTSUP;
178e08a5
AL
4387}
4388
8b9b0cc2
KW
4389void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4390{
bf736fe3 4391 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
8b9b0cc2
KW
4392 return;
4393 }
4394
bf736fe3 4395 bs->drv->bdrv_debug_event(bs, event);
41c695c7
KW
4396}
4397
4398int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4399 const char *tag)
4400{
4401 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4402 bs = bs->file;
4403 }
4404
4405 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4406 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4407 }
4408
4409 return -ENOTSUP;
4410}
4411
4cc70e93
FZ
4412int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4413{
4414 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4415 bs = bs->file;
4416 }
4417
4418 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4419 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4420 }
4421
4422 return -ENOTSUP;
4423}
4424
41c695c7
KW
4425int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4426{
938789ea 4427 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
41c695c7
KW
4428 bs = bs->file;
4429 }
8b9b0cc2 4430
41c695c7
KW
4431 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4432 return bs->drv->bdrv_debug_resume(bs, tag);
4433 }
4434
4435 return -ENOTSUP;
4436}
4437
4438bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4439{
4440 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4441 bs = bs->file;
4442 }
4443
4444 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4445 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4446 }
4447
4448 return false;
8b9b0cc2
KW
4449}
4450
199630b6
BS
4451int bdrv_is_snapshot(BlockDriverState *bs)
4452{
4453 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4454}
4455
b1b1d783
JC
4456/* backing_file can either be relative, or absolute, or a protocol. If it is
4457 * relative, it must be relative to the chain. So, passing in bs->filename
4458 * from a BDS as backing_file should not be done, as that may be relative to
4459 * the CWD rather than the chain. */
e8a6bb9c
MT
4460BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4461 const char *backing_file)
4462{
b1b1d783
JC
4463 char *filename_full = NULL;
4464 char *backing_file_full = NULL;
4465 char *filename_tmp = NULL;
4466 int is_protocol = 0;
4467 BlockDriverState *curr_bs = NULL;
4468 BlockDriverState *retval = NULL;
4469
4470 if (!bs || !bs->drv || !backing_file) {
e8a6bb9c
MT
4471 return NULL;
4472 }
4473
b1b1d783
JC
4474 filename_full = g_malloc(PATH_MAX);
4475 backing_file_full = g_malloc(PATH_MAX);
4476 filename_tmp = g_malloc(PATH_MAX);
4477
4478 is_protocol = path_has_protocol(backing_file);
4479
4480 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4481
4482 /* If either of the filename paths is actually a protocol, then
4483 * compare unmodified paths; otherwise make paths relative */
4484 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4485 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4486 retval = curr_bs->backing_hd;
4487 break;
4488 }
e8a6bb9c 4489 } else {
b1b1d783
JC
4490 /* If not an absolute filename path, make it relative to the current
4491 * image's filename path */
4492 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4493 backing_file);
4494
4495 /* We are going to compare absolute pathnames */
4496 if (!realpath(filename_tmp, filename_full)) {
4497 continue;
4498 }
4499
4500 /* We need to make sure the backing filename we are comparing against
4501 * is relative to the current image filename (or absolute) */
4502 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4503 curr_bs->backing_file);
4504
4505 if (!realpath(filename_tmp, backing_file_full)) {
4506 continue;
4507 }
4508
4509 if (strcmp(backing_file_full, filename_full) == 0) {
4510 retval = curr_bs->backing_hd;
4511 break;
4512 }
e8a6bb9c
MT
4513 }
4514 }
4515
b1b1d783
JC
4516 g_free(filename_full);
4517 g_free(backing_file_full);
4518 g_free(filename_tmp);
4519 return retval;
e8a6bb9c
MT
4520}
4521
f198fd1c
BC
4522int bdrv_get_backing_file_depth(BlockDriverState *bs)
4523{
4524 if (!bs->drv) {
4525 return 0;
4526 }
4527
4528 if (!bs->backing_hd) {
4529 return 0;
4530 }
4531
4532 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4533}
4534
ea2384d3 4535/**************************************************************/
83f64091 4536/* async I/Os */
ea2384d3 4537
7c84b1b8
MA
4538BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4539 QEMUIOVector *qiov, int nb_sectors,
097310b5 4540 BlockCompletionFunc *cb, void *opaque)
83f64091 4541{
bbf0a440
SH
4542 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4543
d20d9b7c 4544 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4545 cb, opaque, false);
ea2384d3
FB
4546}
4547
7c84b1b8
MA
4548BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4549 QEMUIOVector *qiov, int nb_sectors,
097310b5 4550 BlockCompletionFunc *cb, void *opaque)
ea2384d3 4551{
bbf0a440
SH
4552 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4553
d20d9b7c 4554 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
8c5873d6 4555 cb, opaque, true);
83f64091
FB
4556}
4557
7c84b1b8 4558BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
d5ef94d4 4559 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
097310b5 4560 BlockCompletionFunc *cb, void *opaque)
d5ef94d4
PB
4561{
4562 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4563
4564 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4565 BDRV_REQ_ZERO_WRITE | flags,
4566 cb, opaque, true);
4567}
4568
40b4f539
KW
4569
4570typedef struct MultiwriteCB {
4571 int error;
4572 int num_requests;
4573 int num_callbacks;
4574 struct {
097310b5 4575 BlockCompletionFunc *cb;
40b4f539
KW
4576 void *opaque;
4577 QEMUIOVector *free_qiov;
40b4f539
KW
4578 } callbacks[];
4579} MultiwriteCB;
4580
4581static void multiwrite_user_cb(MultiwriteCB *mcb)
4582{
4583 int i;
4584
4585 for (i = 0; i < mcb->num_callbacks; i++) {
4586 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
4587 if (mcb->callbacks[i].free_qiov) {
4588 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4589 }
7267c094 4590 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
4591 }
4592}
4593
4594static void multiwrite_cb(void *opaque, int ret)
4595{
4596 MultiwriteCB *mcb = opaque;
4597
6d519a5f
SH
4598 trace_multiwrite_cb(mcb, ret);
4599
cb6d3ca0 4600 if (ret < 0 && !mcb->error) {
40b4f539 4601 mcb->error = ret;
40b4f539
KW
4602 }
4603
4604 mcb->num_requests--;
4605 if (mcb->num_requests == 0) {
de189a1b 4606 multiwrite_user_cb(mcb);
7267c094 4607 g_free(mcb);
40b4f539
KW
4608 }
4609}
4610
4611static int multiwrite_req_compare(const void *a, const void *b)
4612{
77be4366
CH
4613 const BlockRequest *req1 = a, *req2 = b;
4614
4615 /*
4616 * Note that we can't simply subtract req2->sector from req1->sector
4617 * here as that could overflow the return value.
4618 */
4619 if (req1->sector > req2->sector) {
4620 return 1;
4621 } else if (req1->sector < req2->sector) {
4622 return -1;
4623 } else {
4624 return 0;
4625 }
40b4f539
KW
4626}
4627
4628/*
4629 * Takes a bunch of requests and tries to merge them. Returns the number of
4630 * requests that remain after merging.
4631 */
4632static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4633 int num_reqs, MultiwriteCB *mcb)
4634{
4635 int i, outidx;
4636
4637 // Sort requests by start sector
4638 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4639
4640 // Check if adjacent requests touch the same clusters. If so, combine them,
4641 // filling up gaps with zero sectors.
4642 outidx = 0;
4643 for (i = 1; i < num_reqs; i++) {
4644 int merge = 0;
4645 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4646
b6a127a1 4647 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
4648 if (reqs[i].sector <= oldreq_last) {
4649 merge = 1;
4650 }
4651
e2a305fb
CH
4652 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4653 merge = 0;
4654 }
4655
6c5a42ac
PL
4656 if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4657 reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4658 merge = 0;
4659 }
4660
40b4f539
KW
4661 if (merge) {
4662 size_t size;
7267c094 4663 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
4664 qemu_iovec_init(qiov,
4665 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4666
4667 // Add the first request to the merged one. If the requests are
4668 // overlapping, drop the last sectors of the first request.
4669 size = (reqs[i].sector - reqs[outidx].sector) << 9;
1b093c48 4670 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
40b4f539 4671
b6a127a1
PB
4672 // We should need to add any zeros between the two requests
4673 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
4674
4675 // Add the second request
1b093c48 4676 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
40b4f539 4677
391827eb
SH
4678 // Add tail of first request, if necessary
4679 if (qiov->size < reqs[outidx].qiov->size) {
4680 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4681 reqs[outidx].qiov->size - qiov->size);
4682 }
4683
cbf1dff2 4684 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
4685 reqs[outidx].qiov = qiov;
4686
4687 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4688 } else {
4689 outidx++;
4690 reqs[outidx].sector = reqs[i].sector;
4691 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4692 reqs[outidx].qiov = reqs[i].qiov;
4693 }
4694 }
4695
f4564d53
PL
4696 block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
4697
40b4f539
KW
4698 return outidx + 1;
4699}
4700
4701/*
4702 * Submit multiple AIO write requests at once.
4703 *
4704 * On success, the function returns 0 and all requests in the reqs array have
4705 * been submitted. In error case this function returns -1, and any of the
4706 * requests may or may not be submitted yet. In particular, this means that the
4707 * callback will be called for some of the requests, for others it won't. The
4708 * caller must check the error field of the BlockRequest to wait for the right
4709 * callbacks (if error != 0, no callback will be called).
4710 *
4711 * The implementation may modify the contents of the reqs array, e.g. to merge
4712 * requests. However, the fields opaque and error are left unmodified as they
4713 * are used to signal failure for a single request to the caller.
4714 */
4715int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4716{
40b4f539
KW
4717 MultiwriteCB *mcb;
4718 int i;
4719
301db7c2
RH
4720 /* don't submit writes if we don't have a medium */
4721 if (bs->drv == NULL) {
4722 for (i = 0; i < num_reqs; i++) {
4723 reqs[i].error = -ENOMEDIUM;
4724 }
4725 return -1;
4726 }
4727
40b4f539
KW
4728 if (num_reqs == 0) {
4729 return 0;
4730 }
4731
4732 // Create MultiwriteCB structure
7267c094 4733 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
4734 mcb->num_requests = 0;
4735 mcb->num_callbacks = num_reqs;
4736
4737 for (i = 0; i < num_reqs; i++) {
4738 mcb->callbacks[i].cb = reqs[i].cb;
4739 mcb->callbacks[i].opaque = reqs[i].opaque;
4740 }
4741
4742 // Check for mergable requests
4743 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4744
6d519a5f
SH
4745 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4746
df9309fb
PB
4747 /* Run the aio requests. */
4748 mcb->num_requests = num_reqs;
40b4f539 4749 for (i = 0; i < num_reqs; i++) {
d20d9b7c
PB
4750 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4751 reqs[i].nb_sectors, reqs[i].flags,
4752 multiwrite_cb, mcb,
4753 true);
40b4f539
KW
4754 }
4755
4756 return 0;
40b4f539
KW
4757}
4758
7c84b1b8 4759void bdrv_aio_cancel(BlockAIOCB *acb)
83f64091 4760{
ca5fd113
FZ
4761 qemu_aio_ref(acb);
4762 bdrv_aio_cancel_async(acb);
4763 while (acb->refcnt > 1) {
4764 if (acb->aiocb_info->get_aio_context) {
4765 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4766 } else if (acb->bs) {
4767 aio_poll(bdrv_get_aio_context(acb->bs), true);
4768 } else {
4769 abort();
02c50efe 4770 }
02c50efe 4771 }
8007429a 4772 qemu_aio_unref(acb);
02c50efe
FZ
4773}
4774
4775/* Async version of aio cancel. The caller is not blocked if the acb implements
4776 * cancel_async, otherwise we do nothing and let the request normally complete.
4777 * In either case the completion callback must be called. */
7c84b1b8 4778void bdrv_aio_cancel_async(BlockAIOCB *acb)
02c50efe
FZ
4779{
4780 if (acb->aiocb_info->cancel_async) {
4781 acb->aiocb_info->cancel_async(acb);
4782 }
83f64091
FB
4783}
4784
4785/**************************************************************/
4786/* async block device emulation */
4787
7c84b1b8
MA
4788typedef struct BlockAIOCBSync {
4789 BlockAIOCB common;
c16b5a2c
CH
4790 QEMUBH *bh;
4791 int ret;
4792 /* vector translation state */
4793 QEMUIOVector *qiov;
4794 uint8_t *bounce;
4795 int is_write;
7c84b1b8 4796} BlockAIOCBSync;
c16b5a2c 4797
d7331bed 4798static const AIOCBInfo bdrv_em_aiocb_info = {
7c84b1b8 4799 .aiocb_size = sizeof(BlockAIOCBSync),
c16b5a2c
CH
4800};
4801
ce1a14dc 4802static void bdrv_aio_bh_cb(void *opaque)
83f64091 4803{
7c84b1b8 4804 BlockAIOCBSync *acb = opaque;
f141eafe 4805
857d4f46 4806 if (!acb->is_write && acb->ret >= 0) {
03396148 4807 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
857d4f46 4808 }
ceb42de8 4809 qemu_vfree(acb->bounce);
ce1a14dc 4810 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 4811 qemu_bh_delete(acb->bh);
36afc451 4812 acb->bh = NULL;
8007429a 4813 qemu_aio_unref(acb);
83f64091 4814}
beac80cd 4815
7c84b1b8
MA
4816static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4817 int64_t sector_num,
4818 QEMUIOVector *qiov,
4819 int nb_sectors,
097310b5 4820 BlockCompletionFunc *cb,
7c84b1b8
MA
4821 void *opaque,
4822 int is_write)
f141eafe 4823
83f64091 4824{
7c84b1b8 4825 BlockAIOCBSync *acb;
ce1a14dc 4826
d7331bed 4827 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
f141eafe
AL
4828 acb->is_write = is_write;
4829 acb->qiov = qiov;
857d4f46 4830 acb->bounce = qemu_try_blockalign(bs, qiov->size);
2572b37a 4831 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
f141eafe 4832
857d4f46
KW
4833 if (acb->bounce == NULL) {
4834 acb->ret = -ENOMEM;
4835 } else if (is_write) {
d5e6b161 4836 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
1ed20acf 4837 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 4838 } else {
1ed20acf 4839 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
4840 }
4841
ce1a14dc 4842 qemu_bh_schedule(acb->bh);
f141eafe 4843
ce1a14dc 4844 return &acb->common;
beac80cd
FB
4845}
4846
7c84b1b8 4847static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
f141eafe 4848 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 4849 BlockCompletionFunc *cb, void *opaque)
beac80cd 4850{
f141eafe
AL
4851 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4852}
83f64091 4853
7c84b1b8 4854static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
f141eafe 4855 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
097310b5 4856 BlockCompletionFunc *cb, void *opaque)
f141eafe
AL
4857{
4858 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 4859}
beac80cd 4860
68485420 4861
7c84b1b8
MA
4862typedef struct BlockAIOCBCoroutine {
4863 BlockAIOCB common;
68485420
KW
4864 BlockRequest req;
4865 bool is_write;
0b5a2445 4866 bool need_bh;
d318aea9 4867 bool *done;
68485420 4868 QEMUBH* bh;
7c84b1b8 4869} BlockAIOCBCoroutine;
68485420 4870
d7331bed 4871static const AIOCBInfo bdrv_em_co_aiocb_info = {
7c84b1b8 4872 .aiocb_size = sizeof(BlockAIOCBCoroutine),
68485420
KW
4873};
4874
0b5a2445
PB
4875static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
4876{
4877 if (!acb->need_bh) {
4878 acb->common.cb(acb->common.opaque, acb->req.error);
4879 qemu_aio_unref(acb);
4880 }
4881}
4882
35246a68 4883static void bdrv_co_em_bh(void *opaque)
68485420 4884{
7c84b1b8 4885 BlockAIOCBCoroutine *acb = opaque;
68485420 4886
0b5a2445 4887 assert(!acb->need_bh);
68485420 4888 qemu_bh_delete(acb->bh);
0b5a2445
PB
4889 bdrv_co_complete(acb);
4890}
4891
4892static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
4893{
4894 acb->need_bh = false;
4895 if (acb->req.error != -EINPROGRESS) {
4896 BlockDriverState *bs = acb->common.bs;
4897
4898 acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4899 qemu_bh_schedule(acb->bh);
4900 }
68485420
KW
4901}
4902
b2a61371
SH
4903/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4904static void coroutine_fn bdrv_co_do_rw(void *opaque)
4905{
7c84b1b8 4906 BlockAIOCBCoroutine *acb = opaque;
b2a61371
SH
4907 BlockDriverState *bs = acb->common.bs;
4908
4909 if (!acb->is_write) {
4910 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
d20d9b7c 4911 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4912 } else {
4913 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
d20d9b7c 4914 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
b2a61371
SH
4915 }
4916
0b5a2445 4917 bdrv_co_complete(acb);
b2a61371
SH
4918}
4919
7c84b1b8
MA
4920static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4921 int64_t sector_num,
4922 QEMUIOVector *qiov,
4923 int nb_sectors,
4924 BdrvRequestFlags flags,
097310b5 4925 BlockCompletionFunc *cb,
7c84b1b8
MA
4926 void *opaque,
4927 bool is_write)
68485420
KW
4928{
4929 Coroutine *co;
7c84b1b8 4930 BlockAIOCBCoroutine *acb;
68485420 4931
d7331bed 4932 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
0b5a2445
PB
4933 acb->need_bh = true;
4934 acb->req.error = -EINPROGRESS;
68485420
KW
4935 acb->req.sector = sector_num;
4936 acb->req.nb_sectors = nb_sectors;
4937 acb->req.qiov = qiov;
d20d9b7c 4938 acb->req.flags = flags;
68485420
KW
4939 acb->is_write = is_write;
4940
8c5873d6 4941 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
4942 qemu_coroutine_enter(co, acb);
4943
0b5a2445 4944 bdrv_co_maybe_schedule_bh(acb);
68485420
KW
4945 return &acb->common;
4946}
4947
07f07615 4948static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 4949{
7c84b1b8 4950 BlockAIOCBCoroutine *acb = opaque;
07f07615 4951 BlockDriverState *bs = acb->common.bs;
b2e12bc6 4952
07f07615 4953 acb->req.error = bdrv_co_flush(bs);
0b5a2445 4954 bdrv_co_complete(acb);
b2e12bc6
CH
4955}
4956
7c84b1b8 4957BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
097310b5 4958 BlockCompletionFunc *cb, void *opaque)
016f5cf6 4959{
07f07615 4960 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 4961
07f07615 4962 Coroutine *co;
7c84b1b8 4963 BlockAIOCBCoroutine *acb;
016f5cf6 4964
d7331bed 4965 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
0b5a2445
PB
4966 acb->need_bh = true;
4967 acb->req.error = -EINPROGRESS;
d318aea9 4968
07f07615
PB
4969 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4970 qemu_coroutine_enter(co, acb);
016f5cf6 4971
0b5a2445 4972 bdrv_co_maybe_schedule_bh(acb);
016f5cf6
AG
4973 return &acb->common;
4974}
4975
4265d620
PB
4976static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4977{
7c84b1b8 4978 BlockAIOCBCoroutine *acb = opaque;
4265d620
PB
4979 BlockDriverState *bs = acb->common.bs;
4980
4981 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
0b5a2445 4982 bdrv_co_complete(acb);
4265d620
PB
4983}
4984
7c84b1b8 4985BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4265d620 4986 int64_t sector_num, int nb_sectors,
097310b5 4987 BlockCompletionFunc *cb, void *opaque)
4265d620
PB
4988{
4989 Coroutine *co;
7c84b1b8 4990 BlockAIOCBCoroutine *acb;
4265d620
PB
4991
4992 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4993
d7331bed 4994 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
0b5a2445
PB
4995 acb->need_bh = true;
4996 acb->req.error = -EINPROGRESS;
4265d620
PB
4997 acb->req.sector = sector_num;
4998 acb->req.nb_sectors = nb_sectors;
4999 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
5000 qemu_coroutine_enter(co, acb);
5001
0b5a2445 5002 bdrv_co_maybe_schedule_bh(acb);
4265d620
PB
5003 return &acb->common;
5004}
5005
ea2384d3
FB
5006void bdrv_init(void)
5007{
5efa9d5a 5008 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 5009}
ce1a14dc 5010
eb852011
MA
5011void bdrv_init_with_whitelist(void)
5012{
5013 use_bdrv_whitelist = 1;
5014 bdrv_init();
5015}
5016
d7331bed 5017void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
097310b5 5018 BlockCompletionFunc *cb, void *opaque)
ce1a14dc 5019{
7c84b1b8 5020 BlockAIOCB *acb;
ce1a14dc 5021
d7331bed
SH
5022 acb = g_slice_alloc(aiocb_info->aiocb_size);
5023 acb->aiocb_info = aiocb_info;
ce1a14dc
PB
5024 acb->bs = bs;
5025 acb->cb = cb;
5026 acb->opaque = opaque;
f197fe2b 5027 acb->refcnt = 1;
ce1a14dc
PB
5028 return acb;
5029}
5030
f197fe2b
FZ
5031void qemu_aio_ref(void *p)
5032{
7c84b1b8 5033 BlockAIOCB *acb = p;
f197fe2b
FZ
5034 acb->refcnt++;
5035}
5036
8007429a 5037void qemu_aio_unref(void *p)
ce1a14dc 5038{
7c84b1b8 5039 BlockAIOCB *acb = p;
f197fe2b
FZ
5040 assert(acb->refcnt > 0);
5041 if (--acb->refcnt == 0) {
5042 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
5043 }
ce1a14dc 5044}
19cb3738 5045
f9f05dc5
KW
5046/**************************************************************/
5047/* Coroutine block device emulation */
5048
5049typedef struct CoroutineIOCompletion {
5050 Coroutine *coroutine;
5051 int ret;
5052} CoroutineIOCompletion;
5053
5054static void bdrv_co_io_em_complete(void *opaque, int ret)
5055{
5056 CoroutineIOCompletion *co = opaque;
5057
5058 co->ret = ret;
5059 qemu_coroutine_enter(co->coroutine, NULL);
5060}
5061
5062static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
5063 int nb_sectors, QEMUIOVector *iov,
5064 bool is_write)
5065{
5066 CoroutineIOCompletion co = {
5067 .coroutine = qemu_coroutine_self(),
5068 };
7c84b1b8 5069 BlockAIOCB *acb;
f9f05dc5
KW
5070
5071 if (is_write) {
a652d160
SH
5072 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
5073 bdrv_co_io_em_complete, &co);
f9f05dc5 5074 } else {
a652d160
SH
5075 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
5076 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
5077 }
5078
59370aaa 5079 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
5080 if (!acb) {
5081 return -EIO;
5082 }
5083 qemu_coroutine_yield();
5084
5085 return co.ret;
5086}
5087
5088static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
5089 int64_t sector_num, int nb_sectors,
5090 QEMUIOVector *iov)
5091{
5092 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
5093}
5094
5095static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
5096 int64_t sector_num, int nb_sectors,
5097 QEMUIOVector *iov)
5098{
5099 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
5100}
5101
07f07615 5102static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 5103{
07f07615
PB
5104 RwCo *rwco = opaque;
5105
5106 rwco->ret = bdrv_co_flush(rwco->bs);
5107}
5108
5109int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
5110{
eb489bb1
KW
5111 int ret;
5112
29cdb251 5113 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 5114 return 0;
eb489bb1
KW
5115 }
5116
ca716364 5117 /* Write back cached data to the OS even with cache=unsafe */
bf736fe3 5118 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
eb489bb1
KW
5119 if (bs->drv->bdrv_co_flush_to_os) {
5120 ret = bs->drv->bdrv_co_flush_to_os(bs);
5121 if (ret < 0) {
5122 return ret;
5123 }
5124 }
5125
ca716364
KW
5126 /* But don't actually force it to the disk with cache=unsafe */
5127 if (bs->open_flags & BDRV_O_NO_FLUSH) {
d4c82329 5128 goto flush_parent;
ca716364
KW
5129 }
5130
bf736fe3 5131 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
eb489bb1 5132 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 5133 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615 5134 } else if (bs->drv->bdrv_aio_flush) {
7c84b1b8 5135 BlockAIOCB *acb;
07f07615
PB
5136 CoroutineIOCompletion co = {
5137 .coroutine = qemu_coroutine_self(),
5138 };
5139
5140 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
5141 if (acb == NULL) {
29cdb251 5142 ret = -EIO;
07f07615
PB
5143 } else {
5144 qemu_coroutine_yield();
29cdb251 5145 ret = co.ret;
07f07615 5146 }
07f07615
PB
5147 } else {
5148 /*
5149 * Some block drivers always operate in either writethrough or unsafe
5150 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
5151 * know how the server works (because the behaviour is hardcoded or
5152 * depends on server-side configuration), so we can't ensure that
5153 * everything is safe on disk. Returning an error doesn't work because
5154 * that would break guests even if the server operates in writethrough
5155 * mode.
5156 *
5157 * Let's hope the user knows what he's doing.
5158 */
29cdb251 5159 ret = 0;
07f07615 5160 }
29cdb251
PB
5161 if (ret < 0) {
5162 return ret;
5163 }
5164
5165 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
5166 * in the case of cache=unsafe, so there are no useless flushes.
5167 */
d4c82329 5168flush_parent:
29cdb251 5169 return bdrv_co_flush(bs->file);
07f07615
PB
5170}
5171
5a8a30db 5172void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
0f15423c 5173{
5a8a30db
KW
5174 Error *local_err = NULL;
5175 int ret;
5176
3456a8d1
KW
5177 if (!bs->drv) {
5178 return;
5179 }
5180
7ea2d269
AK
5181 if (!(bs->open_flags & BDRV_O_INCOMING)) {
5182 return;
5183 }
5184 bs->open_flags &= ~BDRV_O_INCOMING;
5185
3456a8d1 5186 if (bs->drv->bdrv_invalidate_cache) {
5a8a30db 5187 bs->drv->bdrv_invalidate_cache(bs, &local_err);
3456a8d1 5188 } else if (bs->file) {
5a8a30db
KW
5189 bdrv_invalidate_cache(bs->file, &local_err);
5190 }
5191 if (local_err) {
5192 error_propagate(errp, local_err);
5193 return;
0f15423c 5194 }
3456a8d1 5195
5a8a30db
KW
5196 ret = refresh_total_sectors(bs, bs->total_sectors);
5197 if (ret < 0) {
5198 error_setg_errno(errp, -ret, "Could not refresh total sector count");
5199 return;
5200 }
0f15423c
AL
5201}
5202
5a8a30db 5203void bdrv_invalidate_cache_all(Error **errp)
0f15423c
AL
5204{
5205 BlockDriverState *bs;
5a8a30db 5206 Error *local_err = NULL;
0f15423c 5207
dc364f4c 5208 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
ed78cda3
SH
5209 AioContext *aio_context = bdrv_get_aio_context(bs);
5210
5211 aio_context_acquire(aio_context);
5a8a30db 5212 bdrv_invalidate_cache(bs, &local_err);
ed78cda3 5213 aio_context_release(aio_context);
5a8a30db
KW
5214 if (local_err) {
5215 error_propagate(errp, local_err);
5216 return;
5217 }
0f15423c
AL
5218 }
5219}
5220
07f07615
PB
5221int bdrv_flush(BlockDriverState *bs)
5222{
5223 Coroutine *co;
5224 RwCo rwco = {
5225 .bs = bs,
5226 .ret = NOT_DONE,
e7a8a783 5227 };
e7a8a783 5228
07f07615
PB
5229 if (qemu_in_coroutine()) {
5230 /* Fast-path if already in coroutine context */
5231 bdrv_flush_co_entry(&rwco);
5232 } else {
2572b37a
SH
5233 AioContext *aio_context = bdrv_get_aio_context(bs);
5234
07f07615
PB
5235 co = qemu_coroutine_create(bdrv_flush_co_entry);
5236 qemu_coroutine_enter(co, &rwco);
5237 while (rwco.ret == NOT_DONE) {
2572b37a 5238 aio_poll(aio_context, true);
07f07615 5239 }
e7a8a783 5240 }
07f07615
PB
5241
5242 return rwco.ret;
e7a8a783
KW
5243}
5244
775aa8b6
KW
5245typedef struct DiscardCo {
5246 BlockDriverState *bs;
5247 int64_t sector_num;
5248 int nb_sectors;
5249 int ret;
5250} DiscardCo;
4265d620
PB
5251static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5252{
775aa8b6 5253 DiscardCo *rwco = opaque;
4265d620
PB
5254
5255 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5256}
5257
5258int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5259 int nb_sectors)
5260{
b9c64947 5261 int max_discard, ret;
d51e9fe5 5262
4265d620
PB
5263 if (!bs->drv) {
5264 return -ENOMEDIUM;
b9c64947
HR
5265 }
5266
5267 ret = bdrv_check_request(bs, sector_num, nb_sectors);
5268 if (ret < 0) {
5269 return ret;
4265d620
PB
5270 } else if (bs->read_only) {
5271 return -EROFS;
df702c9b
PB
5272 }
5273
e4654d2d 5274 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 5275
9e8f1835
PB
5276 /* Do nothing if disabled. */
5277 if (!(bs->open_flags & BDRV_O_UNMAP)) {
5278 return 0;
5279 }
5280
d51e9fe5
PB
5281 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5282 return 0;
5283 }
6f14da52 5284
75af1f34 5285 max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
d51e9fe5
PB
5286 while (nb_sectors > 0) {
5287 int ret;
5288 int num = nb_sectors;
6f14da52 5289
d51e9fe5
PB
5290 /* align request */
5291 if (bs->bl.discard_alignment &&
5292 num >= bs->bl.discard_alignment &&
5293 sector_num % bs->bl.discard_alignment) {
5294 if (num > bs->bl.discard_alignment) {
5295 num = bs->bl.discard_alignment;
6f14da52 5296 }
d51e9fe5
PB
5297 num -= sector_num % bs->bl.discard_alignment;
5298 }
6f14da52 5299
d51e9fe5
PB
5300 /* limit request size */
5301 if (num > max_discard) {
5302 num = max_discard;
5303 }
6f14da52 5304
d51e9fe5 5305 if (bs->drv->bdrv_co_discard) {
6f14da52 5306 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5 5307 } else {
7c84b1b8 5308 BlockAIOCB *acb;
d51e9fe5
PB
5309 CoroutineIOCompletion co = {
5310 .coroutine = qemu_coroutine_self(),
5311 };
5312
5313 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5314 bdrv_co_io_em_complete, &co);
5315 if (acb == NULL) {
5316 return -EIO;
5317 } else {
5318 qemu_coroutine_yield();
5319 ret = co.ret;
6f14da52 5320 }
6f14da52 5321 }
7ce21016 5322 if (ret && ret != -ENOTSUP) {
d51e9fe5 5323 return ret;
4265d620 5324 }
d51e9fe5
PB
5325
5326 sector_num += num;
5327 nb_sectors -= num;
4265d620 5328 }
d51e9fe5 5329 return 0;
4265d620
PB
5330}
5331
5332int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5333{
5334 Coroutine *co;
775aa8b6 5335 DiscardCo rwco = {
4265d620
PB
5336 .bs = bs,
5337 .sector_num = sector_num,
5338 .nb_sectors = nb_sectors,
5339 .ret = NOT_DONE,
5340 };
5341
5342 if (qemu_in_coroutine()) {
5343 /* Fast-path if already in coroutine context */
5344 bdrv_discard_co_entry(&rwco);
5345 } else {
2572b37a
SH
5346 AioContext *aio_context = bdrv_get_aio_context(bs);
5347
4265d620
PB
5348 co = qemu_coroutine_create(bdrv_discard_co_entry);
5349 qemu_coroutine_enter(co, &rwco);
5350 while (rwco.ret == NOT_DONE) {
2572b37a 5351 aio_poll(aio_context, true);
4265d620
PB
5352 }
5353 }
5354
5355 return rwco.ret;
5356}
5357
19cb3738
FB
5358/**************************************************************/
5359/* removable device support */
5360
5361/**
5362 * Return TRUE if the media is present
5363 */
5364int bdrv_is_inserted(BlockDriverState *bs)
5365{
5366 BlockDriver *drv = bs->drv;
a1aff5bf 5367
19cb3738
FB
5368 if (!drv)
5369 return 0;
5370 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5371 return 1;
5372 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5373}
5374
5375/**
8e49ca46
MA
5376 * Return whether the media changed since the last call to this
5377 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
5378 */
5379int bdrv_media_changed(BlockDriverState *bs)
5380{
5381 BlockDriver *drv = bs->drv;
19cb3738 5382
8e49ca46
MA
5383 if (drv && drv->bdrv_media_changed) {
5384 return drv->bdrv_media_changed(bs);
5385 }
5386 return -ENOTSUP;
19cb3738
FB
5387}
5388
5389/**
5390 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5391 */
f36f3949 5392void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
5393{
5394 BlockDriver *drv = bs->drv;
bfb197e0 5395 const char *device_name;
19cb3738 5396
822e1cd1
MA
5397 if (drv && drv->bdrv_eject) {
5398 drv->bdrv_eject(bs, eject_flag);
19cb3738 5399 }
6f382ed2 5400
bfb197e0
MA
5401 device_name = bdrv_get_device_name(bs);
5402 if (device_name[0] != '\0') {
5403 qapi_event_send_device_tray_moved(device_name,
a5ee7bd4 5404 eject_flag, &error_abort);
6f382ed2 5405 }
19cb3738
FB
5406}
5407
19cb3738
FB
5408/**
5409 * Lock or unlock the media (if it is locked, the user won't be able
5410 * to eject it manually).
5411 */
025e849a 5412void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
5413{
5414 BlockDriver *drv = bs->drv;
5415
025e849a 5416 trace_bdrv_lock_medium(bs, locked);
b8c6d095 5417
025e849a
MA
5418 if (drv && drv->bdrv_lock_medium) {
5419 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
5420 }
5421}
985a03b0
TS
5422
5423/* needed for generic scsi interface */
5424
5425int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5426{
5427 BlockDriver *drv = bs->drv;
5428
5429 if (drv && drv->bdrv_ioctl)
5430 return drv->bdrv_ioctl(bs, req, buf);
5431 return -ENOTSUP;
5432}
7d780669 5433
7c84b1b8 5434BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
221f715d 5435 unsigned long int req, void *buf,
097310b5 5436 BlockCompletionFunc *cb, void *opaque)
7d780669 5437{
221f715d 5438 BlockDriver *drv = bs->drv;
7d780669 5439
221f715d
AL
5440 if (drv && drv->bdrv_aio_ioctl)
5441 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5442 return NULL;
7d780669 5443}
e268ca52 5444
1b7fd729 5445void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
7b6f9300 5446{
1b7fd729 5447 bs->guest_block_size = align;
7b6f9300 5448}
7cd1e32a 5449
e268ca52
AL
5450void *qemu_blockalign(BlockDriverState *bs, size_t size)
5451{
339064d5 5452 return qemu_memalign(bdrv_opt_mem_align(bs), size);
e268ca52 5453}
7cd1e32a 5454
9ebd8448
HR
5455void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5456{
5457 return memset(qemu_blockalign(bs, size), 0, size);
5458}
5459
7d2a35cc
KW
5460void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5461{
5462 size_t align = bdrv_opt_mem_align(bs);
5463
5464 /* Ensure that NULL is never returned on success */
5465 assert(align > 0);
5466 if (size == 0) {
5467 size = align;
5468 }
5469
5470 return qemu_try_memalign(align, size);
5471}
5472
9ebd8448
HR
5473void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5474{
5475 void *mem = qemu_try_blockalign(bs, size);
5476
5477 if (mem) {
5478 memset(mem, 0, size);
5479 }
5480
5481 return mem;
5482}
5483
c53b1c51
SH
5484/*
5485 * Check if all memory in this vector is sector aligned.
5486 */
5487bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5488{
5489 int i;
339064d5 5490 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5491
5492 for (i = 0; i < qiov->niov; i++) {
339064d5 5493 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5494 return false;
1ff735bd 5495 }
339064d5 5496 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5497 return false;
c53b1c51
SH
5498 }
5499 }
5500
5501 return true;
5502}
5503
b8afb520
FZ
5504BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5505 Error **errp)
7cd1e32a
LS
5506{
5507 int64_t bitmap_size;
e4654d2d 5508 BdrvDirtyBitmap *bitmap;
a55eb92c 5509
50717e94
PB
5510 assert((granularity & (granularity - 1)) == 0);
5511
e4654d2d
FZ
5512 granularity >>= BDRV_SECTOR_BITS;
5513 assert(granularity);
57322b78 5514 bitmap_size = bdrv_nb_sectors(bs);
b8afb520
FZ
5515 if (bitmap_size < 0) {
5516 error_setg_errno(errp, -bitmap_size, "could not get length of device");
5517 errno = -bitmap_size;
5518 return NULL;
5519 }
5839e53b 5520 bitmap = g_new0(BdrvDirtyBitmap, 1);
786a4ea8 5521 bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(granularity));
e4654d2d
FZ
5522 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5523 return bitmap;
5524}
5525
5526void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5527{
5528 BdrvDirtyBitmap *bm, *next;
5529 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5530 if (bm == bitmap) {
5531 QLIST_REMOVE(bitmap, list);
5532 hbitmap_free(bitmap->bitmap);
5533 g_free(bitmap);
5534 return;
a55eb92c 5535 }
7cd1e32a
LS
5536 }
5537}
5538
21b56835
FZ
5539BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5540{
5541 BdrvDirtyBitmap *bm;
5542 BlockDirtyInfoList *list = NULL;
5543 BlockDirtyInfoList **plist = &list;
5544
5545 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5839e53b
MA
5546 BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5547 BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
21b56835
FZ
5548 info->count = bdrv_get_dirty_count(bs, bm);
5549 info->granularity =
5550 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5551 entry->value = info;
5552 *plist = entry;
5553 plist = &entry->next;
5554 }
5555
5556 return list;
5557}
5558
e4654d2d 5559int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5560{
e4654d2d
FZ
5561 if (bitmap) {
5562 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5563 } else {
5564 return 0;
5565 }
5566}
5567
e4654d2d
FZ
5568void bdrv_dirty_iter_init(BlockDriverState *bs,
5569 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
1755da16 5570{
e4654d2d 5571 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
1755da16
PB
5572}
5573
c4237dfa
VSO
5574void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
5575 int64_t cur_sector, int nr_sectors)
5576{
5577 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5578}
5579
5580void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
5581 int64_t cur_sector, int nr_sectors)
5582{
5583 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5584}
5585
5586static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5587 int nr_sectors)
1755da16 5588{
e4654d2d
FZ
5589 BdrvDirtyBitmap *bitmap;
5590 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5591 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5592 }
1755da16
PB
5593}
5594
c4237dfa
VSO
5595static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
5596 int nr_sectors)
7cd1e32a 5597{
e4654d2d
FZ
5598 BdrvDirtyBitmap *bitmap;
5599 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5600 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5601 }
7cd1e32a 5602}
aaa0eb75 5603
e4654d2d 5604int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
aaa0eb75 5605{
e4654d2d 5606 return hbitmap_count(bitmap->bitmap);
aaa0eb75 5607}
f88e1a42 5608
9fcb0251
FZ
5609/* Get a reference to bs */
5610void bdrv_ref(BlockDriverState *bs)
5611{
5612 bs->refcnt++;
5613}
5614
5615/* Release a previously grabbed reference to bs.
5616 * If after releasing, reference count is zero, the BlockDriverState is
5617 * deleted. */
5618void bdrv_unref(BlockDriverState *bs)
5619{
9a4d5ca6
JC
5620 if (!bs) {
5621 return;
5622 }
9fcb0251
FZ
5623 assert(bs->refcnt > 0);
5624 if (--bs->refcnt == 0) {
5625 bdrv_delete(bs);
5626 }
5627}
5628
fbe40ff7
FZ
5629struct BdrvOpBlocker {
5630 Error *reason;
5631 QLIST_ENTRY(BdrvOpBlocker) list;
5632};
5633
5634bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5635{
5636 BdrvOpBlocker *blocker;
5637 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5638 if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5639 blocker = QLIST_FIRST(&bs->op_blockers[op]);
5640 if (errp) {
81e5f78a
AG
5641 error_setg(errp, "Node '%s' is busy: %s",
5642 bdrv_get_device_or_node_name(bs),
bfb197e0 5643 error_get_pretty(blocker->reason));
fbe40ff7
FZ
5644 }
5645 return true;
5646 }
5647 return false;
5648}
5649
5650void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5651{
5652 BdrvOpBlocker *blocker;
5653 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5654
5839e53b 5655 blocker = g_new0(BdrvOpBlocker, 1);
fbe40ff7
FZ
5656 blocker->reason = reason;
5657 QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5658}
5659
5660void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5661{
5662 BdrvOpBlocker *blocker, *next;
5663 assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5664 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5665 if (blocker->reason == reason) {
5666 QLIST_REMOVE(blocker, list);
5667 g_free(blocker);
5668 }
5669 }
5670}
5671
5672void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5673{
5674 int i;
5675 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5676 bdrv_op_block(bs, i, reason);
5677 }
5678}
5679
5680void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5681{
5682 int i;
5683 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5684 bdrv_op_unblock(bs, i, reason);
5685 }
5686}
5687
5688bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5689{
5690 int i;
5691
5692 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5693 if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5694 return false;
5695 }
5696 }
5697 return true;
5698}
5699
28a7282a
LC
5700void bdrv_iostatus_enable(BlockDriverState *bs)
5701{
d6bf279e 5702 bs->iostatus_enabled = true;
58e21ef5 5703 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
5704}
5705
5706/* The I/O status is only enabled if the drive explicitly
5707 * enables it _and_ the VM is configured to stop on errors */
5708bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5709{
d6bf279e 5710 return (bs->iostatus_enabled &&
92aa5c6d
PB
5711 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5712 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5713 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5714}
5715
5716void bdrv_iostatus_disable(BlockDriverState *bs)
5717{
d6bf279e 5718 bs->iostatus_enabled = false;
28a7282a
LC
5719}
5720
5721void bdrv_iostatus_reset(BlockDriverState *bs)
5722{
5723 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5724 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5725 if (bs->job) {
5726 block_job_iostatus_reset(bs->job);
5727 }
28a7282a
LC
5728 }
5729}
5730
28a7282a
LC
5731void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5732{
3e1caa5f
PB
5733 assert(bdrv_iostatus_is_enabled(bs));
5734 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5735 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5736 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5737 }
5738}
5739
d92ada22
LC
5740void bdrv_img_create(const char *filename, const char *fmt,
5741 const char *base_filename, const char *base_fmt,
f382d43a
MR
5742 char *options, uint64_t img_size, int flags,
5743 Error **errp, bool quiet)
f88e1a42 5744{
83d0521a
CL
5745 QemuOptsList *create_opts = NULL;
5746 QemuOpts *opts = NULL;
5747 const char *backing_fmt, *backing_file;
5748 int64_t size;
f88e1a42 5749 BlockDriver *drv, *proto_drv;
96df67d1 5750 BlockDriver *backing_drv = NULL;
cc84d90f 5751 Error *local_err = NULL;
f88e1a42
JS
5752 int ret = 0;
5753
5754 /* Find driver and parse its options */
5755 drv = bdrv_find_format(fmt);
5756 if (!drv) {
71c79813 5757 error_setg(errp, "Unknown file format '%s'", fmt);
d92ada22 5758 return;
f88e1a42
JS
5759 }
5760
b65a5e12 5761 proto_drv = bdrv_find_protocol(filename, true, errp);
f88e1a42 5762 if (!proto_drv) {
d92ada22 5763 return;
f88e1a42
JS
5764 }
5765
c6149724
HR
5766 if (!drv->create_opts) {
5767 error_setg(errp, "Format driver '%s' does not support image creation",
5768 drv->format_name);
5769 return;
5770 }
5771
5772 if (!proto_drv->create_opts) {
5773 error_setg(errp, "Protocol driver '%s' does not support image creation",
5774 proto_drv->format_name);
5775 return;
5776 }
5777
c282e1fd
CL
5778 create_opts = qemu_opts_append(create_opts, drv->create_opts);
5779 create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
f88e1a42
JS
5780
5781 /* Create parameter list with default values */
83d0521a 5782 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
39101f25 5783 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
f88e1a42
JS
5784
5785 /* Parse -o options */
5786 if (options) {
dc523cd3
MA
5787 qemu_opts_do_parse(opts, options, NULL, &local_err);
5788 if (local_err) {
5789 error_report_err(local_err);
5790 local_err = NULL;
83d0521a 5791 error_setg(errp, "Invalid options for file format '%s'", fmt);
f88e1a42
JS
5792 goto out;
5793 }
5794 }
5795
5796 if (base_filename) {
f43e47db 5797 qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
6be4194b 5798 if (local_err) {
71c79813
LC
5799 error_setg(errp, "Backing file not supported for file format '%s'",
5800 fmt);
f88e1a42
JS
5801 goto out;
5802 }
5803 }
5804
5805 if (base_fmt) {
f43e47db 5806 qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
6be4194b 5807 if (local_err) {
71c79813
LC
5808 error_setg(errp, "Backing file format not supported for file "
5809 "format '%s'", fmt);
f88e1a42
JS
5810 goto out;
5811 }
5812 }
5813
83d0521a
CL
5814 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5815 if (backing_file) {
5816 if (!strcmp(filename, backing_file)) {
71c79813
LC
5817 error_setg(errp, "Error: Trying to create an image with the "
5818 "same filename as the backing file");
792da93a
JS
5819 goto out;
5820 }
5821 }
5822
83d0521a
CL
5823 backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5824 if (backing_fmt) {
5825 backing_drv = bdrv_find_format(backing_fmt);
96df67d1 5826 if (!backing_drv) {
71c79813 5827 error_setg(errp, "Unknown backing file format '%s'",
83d0521a 5828 backing_fmt);
f88e1a42
JS
5829 goto out;
5830 }
5831 }
5832
5833 // The size for the image must always be specified, with one exception:
5834 // If we are using a backing file, we can obtain the size from there
83d0521a
CL
5835 size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5836 if (size == -1) {
5837 if (backing_file) {
66f6b814 5838 BlockDriverState *bs;
29168018 5839 char *full_backing = g_new0(char, PATH_MAX);
52bf1e72 5840 int64_t size;
63090dac
PB
5841 int back_flags;
5842
29168018
HR
5843 bdrv_get_full_backing_filename_from_filename(filename, backing_file,
5844 full_backing, PATH_MAX,
5845 &local_err);
5846 if (local_err) {
5847 g_free(full_backing);
5848 goto out;
5849 }
5850
63090dac
PB
5851 /* backing files always opened read-only */
5852 back_flags =
5853 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 5854
f67503e5 5855 bs = NULL;
29168018 5856 ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
cc84d90f 5857 backing_drv, &local_err);
29168018 5858 g_free(full_backing);
f88e1a42 5859 if (ret < 0) {
f88e1a42
JS
5860 goto out;
5861 }
52bf1e72
MA
5862 size = bdrv_getlength(bs);
5863 if (size < 0) {
5864 error_setg_errno(errp, -size, "Could not get size of '%s'",
5865 backing_file);
5866 bdrv_unref(bs);
5867 goto out;
5868 }
f88e1a42 5869
39101f25 5870 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
66f6b814
HR
5871
5872 bdrv_unref(bs);
f88e1a42 5873 } else {
71c79813 5874 error_setg(errp, "Image creation needs a size parameter");
f88e1a42
JS
5875 goto out;
5876 }
5877 }
5878
f382d43a 5879 if (!quiet) {
43c5d8f8
FZ
5880 printf("Formatting '%s', fmt=%s", filename, fmt);
5881 qemu_opts_print(opts, " ");
f382d43a
MR
5882 puts("");
5883 }
83d0521a 5884
c282e1fd 5885 ret = bdrv_create(drv, filename, opts, &local_err);
83d0521a 5886
cc84d90f
HR
5887 if (ret == -EFBIG) {
5888 /* This is generally a better message than whatever the driver would
5889 * deliver (especially because of the cluster_size_hint), since that
5890 * is most probably not much different from "image too large". */
5891 const char *cluster_size_hint = "";
83d0521a 5892 if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
cc84d90f 5893 cluster_size_hint = " (try using a larger cluster size)";
f88e1a42 5894 }
cc84d90f
HR
5895 error_setg(errp, "The image size is too large for file format '%s'"
5896 "%s", fmt, cluster_size_hint);
5897 error_free(local_err);
5898 local_err = NULL;
f88e1a42
JS
5899 }
5900
5901out:
83d0521a
CL
5902 qemu_opts_del(opts);
5903 qemu_opts_free(create_opts);
84d18f06 5904 if (local_err) {
cc84d90f
HR
5905 error_propagate(errp, local_err);
5906 }
f88e1a42 5907}
85d126f3
SH
5908
5909AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5910{
dcd04228
SH
5911 return bs->aio_context;
5912}
5913
5914void bdrv_detach_aio_context(BlockDriverState *bs)
5915{
33384421
HR
5916 BdrvAioNotifier *baf;
5917
dcd04228
SH
5918 if (!bs->drv) {
5919 return;
5920 }
5921
33384421
HR
5922 QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5923 baf->detach_aio_context(baf->opaque);
5924 }
5925
13af91eb
SH
5926 if (bs->io_limits_enabled) {
5927 throttle_detach_aio_context(&bs->throttle_state);
5928 }
dcd04228
SH
5929 if (bs->drv->bdrv_detach_aio_context) {
5930 bs->drv->bdrv_detach_aio_context(bs);
5931 }
5932 if (bs->file) {
5933 bdrv_detach_aio_context(bs->file);
5934 }
5935 if (bs->backing_hd) {
5936 bdrv_detach_aio_context(bs->backing_hd);
5937 }
5938
5939 bs->aio_context = NULL;
5940}
5941
5942void bdrv_attach_aio_context(BlockDriverState *bs,
5943 AioContext *new_context)
5944{
33384421
HR
5945 BdrvAioNotifier *ban;
5946
dcd04228
SH
5947 if (!bs->drv) {
5948 return;
5949 }
5950
5951 bs->aio_context = new_context;
5952
5953 if (bs->backing_hd) {
5954 bdrv_attach_aio_context(bs->backing_hd, new_context);
5955 }
5956 if (bs->file) {
5957 bdrv_attach_aio_context(bs->file, new_context);
5958 }
5959 if (bs->drv->bdrv_attach_aio_context) {
5960 bs->drv->bdrv_attach_aio_context(bs, new_context);
5961 }
13af91eb
SH
5962 if (bs->io_limits_enabled) {
5963 throttle_attach_aio_context(&bs->throttle_state, new_context);
5964 }
33384421
HR
5965
5966 QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5967 ban->attached_aio_context(new_context, ban->opaque);
5968 }
dcd04228
SH
5969}
5970
5971void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5972{
5973 bdrv_drain_all(); /* ensure there are no in-flight requests */
5974
5975 bdrv_detach_aio_context(bs);
5976
5977 /* This function executes in the old AioContext so acquire the new one in
5978 * case it runs in a different thread.
5979 */
5980 aio_context_acquire(new_context);
5981 bdrv_attach_aio_context(bs, new_context);
5982 aio_context_release(new_context);
85d126f3 5983}
d616b224 5984
33384421
HR
5985void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5986 void (*attached_aio_context)(AioContext *new_context, void *opaque),
5987 void (*detach_aio_context)(void *opaque), void *opaque)
5988{
5989 BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5990 *ban = (BdrvAioNotifier){
5991 .attached_aio_context = attached_aio_context,
5992 .detach_aio_context = detach_aio_context,
5993 .opaque = opaque
5994 };
5995
5996 QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5997}
5998
5999void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
6000 void (*attached_aio_context)(AioContext *,
6001 void *),
6002 void (*detach_aio_context)(void *),
6003 void *opaque)
6004{
6005 BdrvAioNotifier *ban, *ban_next;
6006
6007 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
6008 if (ban->attached_aio_context == attached_aio_context &&
6009 ban->detach_aio_context == detach_aio_context &&
6010 ban->opaque == opaque)
6011 {
6012 QLIST_REMOVE(ban, list);
6013 g_free(ban);
6014
6015 return;
6016 }
6017 }
6018
6019 abort();
6020}
6021
d616b224
SH
6022void bdrv_add_before_write_notifier(BlockDriverState *bs,
6023 NotifierWithReturn *notifier)
6024{
6025 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
6026}
6f176b48 6027
77485434
HR
6028int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
6029 BlockDriverAmendStatusCB *status_cb)
6f176b48 6030{
c282e1fd 6031 if (!bs->drv->bdrv_amend_options) {
6f176b48
HR
6032 return -ENOTSUP;
6033 }
77485434 6034 return bs->drv->bdrv_amend_options(bs, opts, status_cb);
6f176b48 6035}
f6186f49 6036
b5042a36
BC
6037/* This function will be called by the bdrv_recurse_is_first_non_filter method
6038 * of block filter and by bdrv_is_first_non_filter.
6039 * It is used to test if the given bs is the candidate or recurse more in the
6040 * node graph.
212a5a8f 6041 */
b5042a36 6042bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 6043 BlockDriverState *candidate)
f6186f49 6044{
b5042a36
BC
6045 /* return false if basic checks fails */
6046 if (!bs || !bs->drv) {
212a5a8f 6047 return false;
f6186f49
BC
6048 }
6049
b5042a36
BC
6050 /* the code reached a non block filter driver -> check if the bs is
6051 * the same as the candidate. It's the recursion termination condition.
6052 */
6053 if (!bs->drv->is_filter) {
6054 return bs == candidate;
212a5a8f 6055 }
b5042a36 6056 /* Down this path the driver is a block filter driver */
212a5a8f 6057
b5042a36
BC
6058 /* If the block filter recursion method is defined use it to recurse down
6059 * the node graph.
6060 */
6061 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 6062 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
6063 }
6064
b5042a36
BC
6065 /* the driver is a block filter but don't allow to recurse -> return false
6066 */
6067 return false;
f6186f49
BC
6068}
6069
212a5a8f
BC
6070/* This function checks if the candidate is the first non filter bs down it's
6071 * bs chain. Since we don't have pointers to parents it explore all bs chains
6072 * from the top. Some filters can choose not to pass down the recursion.
6073 */
6074bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 6075{
212a5a8f
BC
6076 BlockDriverState *bs;
6077
6078 /* walk down the bs forest recursively */
6079 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
6080 bool perm;
6081
b5042a36 6082 /* try to recurse in this top level bs */
e6dc8a1f 6083 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
6084
6085 /* candidate is the first non filter */
6086 if (perm) {
6087 return true;
6088 }
6089 }
6090
6091 return false;
f6186f49 6092}
09158f00
BC
6093
6094BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
6095{
6096 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5a7e7a0b
SH
6097 AioContext *aio_context;
6098
09158f00
BC
6099 if (!to_replace_bs) {
6100 error_setg(errp, "Node name '%s' not found", node_name);
6101 return NULL;
6102 }
6103
5a7e7a0b
SH
6104 aio_context = bdrv_get_aio_context(to_replace_bs);
6105 aio_context_acquire(aio_context);
6106
09158f00 6107 if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5a7e7a0b
SH
6108 to_replace_bs = NULL;
6109 goto out;
09158f00
BC
6110 }
6111
6112 /* We don't want arbitrary node of the BDS chain to be replaced only the top
6113 * most non filter in order to prevent data corruption.
6114 * Another benefit is that this tests exclude backing files which are
6115 * blocked by the backing blockers.
6116 */
6117 if (!bdrv_is_first_non_filter(to_replace_bs)) {
6118 error_setg(errp, "Only top most non filter can be replaced");
5a7e7a0b
SH
6119 to_replace_bs = NULL;
6120 goto out;
09158f00
BC
6121 }
6122
5a7e7a0b
SH
6123out:
6124 aio_context_release(aio_context);
09158f00
BC
6125 return to_replace_bs;
6126}
448ad91d
ML
6127
6128void bdrv_io_plug(BlockDriverState *bs)
6129{
6130 BlockDriver *drv = bs->drv;
6131 if (drv && drv->bdrv_io_plug) {
6132 drv->bdrv_io_plug(bs);
6133 } else if (bs->file) {
6134 bdrv_io_plug(bs->file);
6135 }
6136}
6137
6138void bdrv_io_unplug(BlockDriverState *bs)
6139{
6140 BlockDriver *drv = bs->drv;
6141 if (drv && drv->bdrv_io_unplug) {
6142 drv->bdrv_io_unplug(bs);
6143 } else if (bs->file) {
6144 bdrv_io_unplug(bs->file);
6145 }
6146}
6147
6148void bdrv_flush_io_queue(BlockDriverState *bs)
6149{
6150 BlockDriver *drv = bs->drv;
6151 if (drv && drv->bdrv_flush_io_queue) {
6152 drv->bdrv_flush_io_queue(bs);
6153 } else if (bs->file) {
6154 bdrv_flush_io_queue(bs->file);
6155 }
6156}
91af7014
HR
6157
6158static bool append_open_options(QDict *d, BlockDriverState *bs)
6159{
6160 const QDictEntry *entry;
6161 bool found_any = false;
6162
6163 for (entry = qdict_first(bs->options); entry;
6164 entry = qdict_next(bs->options, entry))
6165 {
6166 /* Only take options for this level and exclude all non-driver-specific
6167 * options */
6168 if (!strchr(qdict_entry_key(entry), '.') &&
6169 strcmp(qdict_entry_key(entry), "node-name"))
6170 {
6171 qobject_incref(qdict_entry_value(entry));
6172 qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
6173 found_any = true;
6174 }
6175 }
6176
6177 return found_any;
6178}
6179
6180/* Updates the following BDS fields:
6181 * - exact_filename: A filename which may be used for opening a block device
6182 * which (mostly) equals the given BDS (even without any
6183 * other options; so reading and writing must return the same
6184 * results, but caching etc. may be different)
6185 * - full_open_options: Options which, when given when opening a block device
6186 * (without a filename), result in a BDS (mostly)
6187 * equalling the given one
6188 * - filename: If exact_filename is set, it is copied here. Otherwise,
6189 * full_open_options is converted to a JSON object, prefixed with
6190 * "json:" (for use through the JSON pseudo protocol) and put here.
6191 */
6192void bdrv_refresh_filename(BlockDriverState *bs)
6193{
6194 BlockDriver *drv = bs->drv;
6195 QDict *opts;
6196
6197 if (!drv) {
6198 return;
6199 }
6200
6201 /* This BDS's file name will most probably depend on its file's name, so
6202 * refresh that first */
6203 if (bs->file) {
6204 bdrv_refresh_filename(bs->file);
6205 }
6206
6207 if (drv->bdrv_refresh_filename) {
6208 /* Obsolete information is of no use here, so drop the old file name
6209 * information before refreshing it */
6210 bs->exact_filename[0] = '\0';
6211 if (bs->full_open_options) {
6212 QDECREF(bs->full_open_options);
6213 bs->full_open_options = NULL;
6214 }
6215
6216 drv->bdrv_refresh_filename(bs);
6217 } else if (bs->file) {
6218 /* Try to reconstruct valid information from the underlying file */
6219 bool has_open_options;
6220
6221 bs->exact_filename[0] = '\0';
6222 if (bs->full_open_options) {
6223 QDECREF(bs->full_open_options);
6224 bs->full_open_options = NULL;
6225 }
6226
6227 opts = qdict_new();
6228 has_open_options = append_open_options(opts, bs);
6229
6230 /* If no specific options have been given for this BDS, the filename of
6231 * the underlying file should suffice for this one as well */
6232 if (bs->file->exact_filename[0] && !has_open_options) {
6233 strcpy(bs->exact_filename, bs->file->exact_filename);
6234 }
6235 /* Reconstructing the full options QDict is simple for most format block
6236 * drivers, as long as the full options are known for the underlying
6237 * file BDS. The full options QDict of that file BDS should somehow
6238 * contain a representation of the filename, therefore the following
6239 * suffices without querying the (exact_)filename of this BDS. */
6240 if (bs->file->full_open_options) {
6241 qdict_put_obj(opts, "driver",
6242 QOBJECT(qstring_from_str(drv->format_name)));
6243 QINCREF(bs->file->full_open_options);
6244 qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6245
6246 bs->full_open_options = opts;
6247 } else {
6248 QDECREF(opts);
6249 }
6250 } else if (!bs->full_open_options && qdict_size(bs->options)) {
6251 /* There is no underlying file BDS (at least referenced by BDS.file),
6252 * so the full options QDict should be equal to the options given
6253 * specifically for this block device when it was opened (plus the
6254 * driver specification).
6255 * Because those options don't change, there is no need to update
6256 * full_open_options when it's already set. */
6257
6258 opts = qdict_new();
6259 append_open_options(opts, bs);
6260 qdict_put_obj(opts, "driver",
6261 QOBJECT(qstring_from_str(drv->format_name)));
6262
6263 if (bs->exact_filename[0]) {
6264 /* This may not work for all block protocol drivers (some may
6265 * require this filename to be parsed), but we have to find some
6266 * default solution here, so just include it. If some block driver
6267 * does not support pure options without any filename at all or
6268 * needs some special format of the options QDict, it needs to
6269 * implement the driver-specific bdrv_refresh_filename() function.
6270 */
6271 qdict_put_obj(opts, "filename",
6272 QOBJECT(qstring_from_str(bs->exact_filename)));
6273 }
6274
6275 bs->full_open_options = opts;
6276 }
6277
6278 if (bs->exact_filename[0]) {
6279 pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6280 } else if (bs->full_open_options) {
6281 QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6282 snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6283 qstring_get_str(json));
6284 QDECREF(json);
6285 }
6286}
5366d0c8
BC
6287
6288/* This accessor function purpose is to allow the device models to access the
6289 * BlockAcctStats structure embedded inside a BlockDriverState without being
6290 * aware of the BlockDriverState structure layout.
6291 * It will go away when the BlockAcctStats structure will be moved inside
6292 * the device models.
6293 */
6294BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6295{
6296 return &bs->stats;
6297}
This page took 1.864875 seconds and 4 git commands to generate.