]> Git Repo - qemu.git/blame - block/io.c
block: access wakeup with atomic ops
[qemu.git] / block / io.c
CommitLineData
61007b31
SH
1/*
2 * Block layer I/O functions
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
80c71a24 25#include "qemu/osdep.h"
61007b31 26#include "trace.h"
7f0e9da6 27#include "sysemu/block-backend.h"
61007b31 28#include "block/blockjob.h"
f321dcb5 29#include "block/blockjob_int.h"
61007b31 30#include "block/block_int.h"
f348b6d1 31#include "qemu/cutils.h"
da34e65c 32#include "qapi/error.h"
d49b6836 33#include "qemu/error-report.h"
61007b31
SH
34
35#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
36
b15404e0
EB
37static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
38 int64_t offset,
39 QEMUIOVector *qiov,
40 BdrvRequestFlags flags,
41 BlockCompletionFunc *cb,
42 void *opaque,
43 bool is_write);
61007b31 44static void coroutine_fn bdrv_co_do_rw(void *opaque);
d05aa8bb
EB
45static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
46 int64_t offset, int count, BdrvRequestFlags flags);
61007b31 47
14e9559f 48void bdrv_parent_drained_begin(BlockDriverState *bs)
61007b31 49{
c2066af0 50 BdrvChild *c;
27ccdd52 51
c2066af0
KW
52 QLIST_FOREACH(c, &bs->parents, next_parent) {
53 if (c->role->drained_begin) {
54 c->role->drained_begin(c);
55 }
ce0f1412
PB
56 }
57}
61007b31 58
14e9559f 59void bdrv_parent_drained_end(BlockDriverState *bs)
ce0f1412 60{
c2066af0 61 BdrvChild *c;
27ccdd52 62
c2066af0
KW
63 QLIST_FOREACH(c, &bs->parents, next_parent) {
64 if (c->role->drained_end) {
65 c->role->drained_end(c);
66 }
27ccdd52 67 }
61007b31
SH
68}
69
d9e0dfa2
EB
70static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
71{
72 dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
73 dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
74 dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
75 src->opt_mem_alignment);
76 dst->min_mem_alignment = MAX(dst->min_mem_alignment,
77 src->min_mem_alignment);
78 dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
79}
80
61007b31
SH
81void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
82{
83 BlockDriver *drv = bs->drv;
84 Error *local_err = NULL;
85
86 memset(&bs->bl, 0, sizeof(bs->bl));
87
88 if (!drv) {
89 return;
90 }
91
79ba8c98 92 /* Default alignment based on whether driver has byte interface */
a5b8dd2c 93 bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
79ba8c98 94
61007b31
SH
95 /* Take some limits from the children as a default */
96 if (bs->file) {
9a4f4c31 97 bdrv_refresh_limits(bs->file->bs, &local_err);
61007b31
SH
98 if (local_err) {
99 error_propagate(errp, local_err);
100 return;
101 }
d9e0dfa2 102 bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
61007b31 103 } else {
4196d2f0 104 bs->bl.min_mem_alignment = 512;
459b4e66 105 bs->bl.opt_mem_alignment = getpagesize();
bd44feb7
SH
106
107 /* Safe default since most protocols use readv()/writev()/etc */
108 bs->bl.max_iov = IOV_MAX;
61007b31
SH
109 }
110
760e0063
KW
111 if (bs->backing) {
112 bdrv_refresh_limits(bs->backing->bs, &local_err);
61007b31
SH
113 if (local_err) {
114 error_propagate(errp, local_err);
115 return;
116 }
d9e0dfa2 117 bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
61007b31
SH
118 }
119
120 /* Then let the driver override it */
121 if (drv->bdrv_refresh_limits) {
122 drv->bdrv_refresh_limits(bs, errp);
123 }
124}
125
126/**
127 * The copy-on-read flag is actually a reference count so multiple users may
128 * use the feature without worrying about clobbering its previous state.
129 * Copy-on-read stays enabled until all users have called to disable it.
130 */
131void bdrv_enable_copy_on_read(BlockDriverState *bs)
132{
d3faa13e 133 atomic_inc(&bs->copy_on_read);
61007b31
SH
134}
135
136void bdrv_disable_copy_on_read(BlockDriverState *bs)
137{
d3faa13e
PB
138 int old = atomic_fetch_dec(&bs->copy_on_read);
139 assert(old >= 1);
61007b31
SH
140}
141
142/* Check if any requests are in-flight (including throttled requests) */
439db28c 143bool bdrv_requests_pending(BlockDriverState *bs)
61007b31 144{
37a639a7
KW
145 BdrvChild *child;
146
99723548 147 if (atomic_read(&bs->in_flight)) {
61007b31
SH
148 return true;
149 }
37a639a7
KW
150
151 QLIST_FOREACH(child, &bs->children, next) {
152 if (bdrv_requests_pending(child->bs)) {
153 return true;
154 }
61007b31 155 }
37a639a7 156
61007b31
SH
157 return false;
158}
159
d42cf288 160static bool bdrv_drain_recurse(BlockDriverState *bs)
67da1dc5 161{
178bd438 162 BdrvChild *child, *tmp;
d42cf288
PB
163 bool waited;
164
88b062c2 165 waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
67da1dc5
FZ
166
167 if (bs->drv && bs->drv->bdrv_drain) {
168 bs->drv->bdrv_drain(bs);
169 }
d42cf288 170
178bd438
FZ
171 QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
172 BlockDriverState *bs = child->bs;
173 bool in_main_loop =
174 qemu_get_current_aio_context() == qemu_get_aio_context();
175 assert(bs->refcnt > 0);
176 if (in_main_loop) {
177 /* In case the recursive bdrv_drain_recurse processes a
178 * block_job_defer_to_main_loop BH and modifies the graph,
179 * let's hold a reference to bs until we are done.
180 *
181 * IOThread doesn't have such a BH, and it is not safe to call
182 * bdrv_unref without BQL, so skip doing it there.
183 */
184 bdrv_ref(bs);
185 }
186 waited |= bdrv_drain_recurse(bs);
187 if (in_main_loop) {
188 bdrv_unref(bs);
189 }
67da1dc5 190 }
d42cf288
PB
191
192 return waited;
67da1dc5
FZ
193}
194
a77fd4bb
FZ
195typedef struct {
196 Coroutine *co;
197 BlockDriverState *bs;
a77fd4bb
FZ
198 bool done;
199} BdrvCoDrainData;
200
201static void bdrv_co_drain_bh_cb(void *opaque)
202{
203 BdrvCoDrainData *data = opaque;
204 Coroutine *co = data->co;
99723548 205 BlockDriverState *bs = data->bs;
a77fd4bb 206
99723548 207 bdrv_dec_in_flight(bs);
d42cf288 208 bdrv_drained_begin(bs);
a77fd4bb 209 data->done = true;
1919631e 210 aio_co_wake(co);
a77fd4bb
FZ
211}
212
b6e84c97 213static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
a77fd4bb
FZ
214{
215 BdrvCoDrainData data;
216
217 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
218 * other coroutines run if they were queued from
219 * qemu_co_queue_run_restart(). */
220
221 assert(qemu_in_coroutine());
222 data = (BdrvCoDrainData) {
223 .co = qemu_coroutine_self(),
224 .bs = bs,
225 .done = false,
a77fd4bb 226 };
99723548 227 bdrv_inc_in_flight(bs);
fffb6e12
PB
228 aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
229 bdrv_co_drain_bh_cb, &data);
a77fd4bb
FZ
230
231 qemu_coroutine_yield();
232 /* If we are resumed from some other event (such as an aio completion or a
233 * timer callback), it is a bug in the caller that should be fixed. */
234 assert(data.done);
235}
236
6820643f
KW
237void bdrv_drained_begin(BlockDriverState *bs)
238{
d42cf288
PB
239 if (qemu_in_coroutine()) {
240 bdrv_co_yield_to_drain(bs);
241 return;
242 }
243
414c2ec3 244 if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
6820643f
KW
245 aio_disable_external(bdrv_get_aio_context(bs));
246 bdrv_parent_drained_begin(bs);
247 }
248
6820643f 249 bdrv_drain_recurse(bs);
6820643f
KW
250}
251
252void bdrv_drained_end(BlockDriverState *bs)
253{
254 assert(bs->quiesce_counter > 0);
414c2ec3 255 if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
6820643f
KW
256 return;
257 }
258
259 bdrv_parent_drained_end(bs);
260 aio_enable_external(bdrv_get_aio_context(bs));
261}
262
61007b31 263/*
67da1dc5
FZ
264 * Wait for pending requests to complete on a single BlockDriverState subtree,
265 * and suspend block driver's internal I/O until next request arrives.
61007b31 266 *
61007b31
SH
267 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
268 * AioContext.
7a63f3cd
SH
269 *
270 * Only this BlockDriverState's AioContext is run, so in-flight requests must
271 * not depend on events in other AioContexts. In that case, use
272 * bdrv_drain_all() instead.
61007b31 273 */
b6e84c97 274void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
61007b31 275{
6820643f
KW
276 assert(qemu_in_coroutine());
277 bdrv_drained_begin(bs);
278 bdrv_drained_end(bs);
b6e84c97 279}
f406c03c 280
b6e84c97
PB
281void bdrv_drain(BlockDriverState *bs)
282{
6820643f
KW
283 bdrv_drained_begin(bs);
284 bdrv_drained_end(bs);
61007b31
SH
285}
286
287/*
288 * Wait for pending requests to complete across all BlockDriverStates
289 *
290 * This function does not flush data to disk, use bdrv_flush_all() for that
291 * after calling this function.
c0778f66
AG
292 *
293 * This pauses all block jobs and disables external clients. It must
294 * be paired with bdrv_drain_all_end().
295 *
296 * NOTE: no new block jobs or BlockDriverStates can be created between
297 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
61007b31 298 */
c0778f66 299void bdrv_drain_all_begin(void)
61007b31
SH
300{
301 /* Always run first iteration so any pending completion BHs run */
99723548 302 bool waited = true;
7c8eece4 303 BlockDriverState *bs;
88be7b4b 304 BdrvNextIterator it;
f406c03c 305 GSList *aio_ctxs = NULL, *ctx;
61007b31 306
f321dcb5 307 block_job_pause_all();
eb1364ce 308
88be7b4b 309 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
61007b31
SH
310 AioContext *aio_context = bdrv_get_aio_context(bs);
311
312 aio_context_acquire(aio_context);
c2066af0 313 bdrv_parent_drained_begin(bs);
c0778f66 314 aio_disable_external(aio_context);
61007b31 315 aio_context_release(aio_context);
f406c03c 316
764ba3ae 317 if (!g_slist_find(aio_ctxs, aio_context)) {
f406c03c
AY
318 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
319 }
61007b31
SH
320 }
321
7a63f3cd
SH
322 /* Note that completion of an asynchronous I/O operation can trigger any
323 * number of other I/O operations on other devices---for example a
324 * coroutine can submit an I/O request to another device in response to
325 * request completion. Therefore we must keep looping until there was no
326 * more activity rather than simply draining each device independently.
327 */
99723548
PB
328 while (waited) {
329 waited = false;
61007b31 330
f406c03c
AY
331 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
332 AioContext *aio_context = ctx->data;
61007b31
SH
333
334 aio_context_acquire(aio_context);
88be7b4b 335 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
f406c03c 336 if (aio_context == bdrv_get_aio_context(bs)) {
d42cf288 337 waited |= bdrv_drain_recurse(bs);
f406c03c
AY
338 }
339 }
61007b31
SH
340 aio_context_release(aio_context);
341 }
342 }
343
c0778f66
AG
344 g_slist_free(aio_ctxs);
345}
346
347void bdrv_drain_all_end(void)
348{
349 BlockDriverState *bs;
350 BdrvNextIterator it;
c0778f66 351
88be7b4b 352 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
61007b31
SH
353 AioContext *aio_context = bdrv_get_aio_context(bs);
354
355 aio_context_acquire(aio_context);
c0778f66 356 aio_enable_external(aio_context);
c2066af0 357 bdrv_parent_drained_end(bs);
61007b31
SH
358 aio_context_release(aio_context);
359 }
eb1364ce 360
f321dcb5 361 block_job_resume_all();
61007b31
SH
362}
363
c0778f66
AG
/*
 * Drain every BlockDriverState: a bdrv_drain_all_begin() immediately
 * followed by the matching bdrv_drain_all_end().
 */
void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
369
61007b31
SH
370/**
371 * Remove an active request from the tracked requests list
372 *
373 * This function should be called when a tracked request is completing.
374 */
375static void tracked_request_end(BdrvTrackedRequest *req)
376{
377 if (req->serialising) {
20fc71b2 378 atomic_dec(&req->bs->serialising_in_flight);
61007b31
SH
379 }
380
381 QLIST_REMOVE(req, list);
382 qemu_co_queue_restart_all(&req->wait_queue);
383}
384
385/**
386 * Add an active request to the tracked requests list
387 */
388static void tracked_request_begin(BdrvTrackedRequest *req,
389 BlockDriverState *bs,
390 int64_t offset,
ebde595c
FZ
391 unsigned int bytes,
392 enum BdrvTrackedRequestType type)
61007b31
SH
393{
394 *req = (BdrvTrackedRequest){
395 .bs = bs,
396 .offset = offset,
397 .bytes = bytes,
ebde595c 398 .type = type,
61007b31
SH
399 .co = qemu_coroutine_self(),
400 .serialising = false,
401 .overlap_offset = offset,
402 .overlap_bytes = bytes,
403 };
404
405 qemu_co_queue_init(&req->wait_queue);
406
407 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
408}
409
410static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
411{
412 int64_t overlap_offset = req->offset & ~(align - 1);
413 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
414 - overlap_offset;
415
416 if (!req->serialising) {
20fc71b2 417 atomic_inc(&req->bs->serialising_in_flight);
61007b31
SH
418 req->serialising = true;
419 }
420
421 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
422 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
423}
424
425/**
244483e6 426 * Round a region to cluster boundaries (sector-based)
61007b31 427 */
244483e6
KW
428void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
429 int64_t sector_num, int nb_sectors,
430 int64_t *cluster_sector_num,
431 int *cluster_nb_sectors)
61007b31
SH
432{
433 BlockDriverInfo bdi;
434
435 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
436 *cluster_sector_num = sector_num;
437 *cluster_nb_sectors = nb_sectors;
438 } else {
439 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
440 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
441 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
442 nb_sectors, c);
443 }
444}
445
244483e6
KW
446/**
447 * Round a region to cluster boundaries
448 */
449void bdrv_round_to_clusters(BlockDriverState *bs,
450 int64_t offset, unsigned int bytes,
451 int64_t *cluster_offset,
452 unsigned int *cluster_bytes)
453{
454 BlockDriverInfo bdi;
455
456 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
457 *cluster_offset = offset;
458 *cluster_bytes = bytes;
459 } else {
460 int64_t c = bdi.cluster_size;
461 *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
462 *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
463 }
464}
465
61007b31
SH
466static int bdrv_get_cluster_size(BlockDriverState *bs)
467{
468 BlockDriverInfo bdi;
469 int ret;
470
471 ret = bdrv_get_info(bs, &bdi);
472 if (ret < 0 || bdi.cluster_size == 0) {
a5b8dd2c 473 return bs->bl.request_alignment;
61007b31
SH
474 } else {
475 return bdi.cluster_size;
476 }
477}
478
479static bool tracked_request_overlaps(BdrvTrackedRequest *req,
480 int64_t offset, unsigned int bytes)
481{
482 /* aaaa bbbb */
483 if (offset >= req->overlap_offset + req->overlap_bytes) {
484 return false;
485 }
486 /* bbbb aaaa */
487 if (req->overlap_offset >= offset + bytes) {
488 return false;
489 }
490 return true;
491}
492
99723548
PB
493void bdrv_inc_in_flight(BlockDriverState *bs)
494{
495 atomic_inc(&bs->in_flight);
496}
497
c9d1a561
PB
/*
 * Deliberately empty bottom half; bdrv_wakeup() schedules it and ignores
 * @opaque (it passes NULL).
 */
static void dummy_bh_cb(void *opaque)
{
}
501
502void bdrv_wakeup(BlockDriverState *bs)
503{
e2a6ae7f
PB
504 /* The barrier (or an atomic op) is in the caller. */
505 if (atomic_read(&bs->wakeup)) {
c9d1a561
PB
506 aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
507 }
508}
509
99723548
PB
510void bdrv_dec_in_flight(BlockDriverState *bs)
511{
512 atomic_dec(&bs->in_flight);
c9d1a561 513 bdrv_wakeup(bs);
99723548
PB
514}
515
61007b31
SH
516static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
517{
518 BlockDriverState *bs = self->bs;
519 BdrvTrackedRequest *req;
520 bool retry;
521 bool waited = false;
522
20fc71b2 523 if (!atomic_read(&bs->serialising_in_flight)) {
61007b31
SH
524 return false;
525 }
526
527 do {
528 retry = false;
529 QLIST_FOREACH(req, &bs->tracked_requests, list) {
530 if (req == self || (!req->serialising && !self->serialising)) {
531 continue;
532 }
533 if (tracked_request_overlaps(req, self->overlap_offset,
534 self->overlap_bytes))
535 {
536 /* Hitting this means there was a reentrant request, for
537 * example, a block driver issuing nested requests. This must
538 * never happen since it means deadlock.
539 */
540 assert(qemu_coroutine_self() != req->co);
541
542 /* If the request is already (indirectly) waiting for us, or
543 * will wait for us as soon as it wakes up, then just go on
544 * (instead of producing a deadlock in the former case). */
545 if (!req->waiting_for) {
546 self->waiting_for = req;
1ace7cea 547 qemu_co_queue_wait(&req->wait_queue, NULL);
61007b31
SH
548 self->waiting_for = NULL;
549 retry = true;
550 waited = true;
551 break;
552 }
553 }
554 }
555 } while (retry);
556
557 return waited;
558}
559
560static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
561 size_t size)
562{
563 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
564 return -EIO;
565 }
566
567 if (!bdrv_is_inserted(bs)) {
568 return -ENOMEDIUM;
569 }
570
571 if (offset < 0) {
572 return -EIO;
573 }
574
575 return 0;
576}
577
61007b31 578typedef struct RwCo {
e293b7a3 579 BdrvChild *child;
61007b31
SH
580 int64_t offset;
581 QEMUIOVector *qiov;
582 bool is_write;
583 int ret;
584 BdrvRequestFlags flags;
585} RwCo;
586
587static void coroutine_fn bdrv_rw_co_entry(void *opaque)
588{
589 RwCo *rwco = opaque;
590
591 if (!rwco->is_write) {
a03ef88f 592 rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
cab3a356
KW
593 rwco->qiov->size, rwco->qiov,
594 rwco->flags);
61007b31 595 } else {
a03ef88f 596 rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
cab3a356
KW
597 rwco->qiov->size, rwco->qiov,
598 rwco->flags);
61007b31
SH
599 }
600}
601
602/*
603 * Process a vectored synchronous request using coroutines
604 */
e293b7a3 605static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
61007b31
SH
606 QEMUIOVector *qiov, bool is_write,
607 BdrvRequestFlags flags)
608{
609 Coroutine *co;
610 RwCo rwco = {
e293b7a3 611 .child = child,
61007b31
SH
612 .offset = offset,
613 .qiov = qiov,
614 .is_write = is_write,
615 .ret = NOT_DONE,
616 .flags = flags,
617 };
618
61007b31
SH
619 if (qemu_in_coroutine()) {
620 /* Fast-path if already in coroutine context */
621 bdrv_rw_co_entry(&rwco);
622 } else {
0b8b8753 623 co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
e92f0e19 624 bdrv_coroutine_enter(child->bs, co);
88b062c2 625 BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
61007b31
SH
626 }
627 return rwco.ret;
628}
629
630/*
631 * Process a synchronous request using coroutines
632 */
e293b7a3 633static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
61007b31
SH
634 int nb_sectors, bool is_write, BdrvRequestFlags flags)
635{
636 QEMUIOVector qiov;
637 struct iovec iov = {
638 .iov_base = (void *)buf,
639 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
640 };
641
642 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
643 return -EINVAL;
644 }
645
646 qemu_iovec_init_external(&qiov, &iov, 1);
e293b7a3 647 return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
61007b31
SH
648 &qiov, is_write, flags);
649}
650
651/* return < 0 if error. See bdrv_write() for the return codes */
fbcbbf4e 652int bdrv_read(BdrvChild *child, int64_t sector_num,
61007b31
SH
653 uint8_t *buf, int nb_sectors)
654{
e293b7a3 655 return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
61007b31
SH
656}
657
61007b31
SH
658/* Return < 0 if error. Important errors are:
659 -EIO generic I/O error (may happen for all errors)
660 -ENOMEDIUM No media inserted.
661 -EINVAL Invalid sector number or nb_sectors
662 -EACCES Trying to write a read-only device
663*/
18d51c4b 664int bdrv_write(BdrvChild *child, int64_t sector_num,
61007b31
SH
665 const uint8_t *buf, int nb_sectors)
666{
e293b7a3 667 return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
61007b31
SH
668}
669
720ff280 670int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
74021bc4 671 int count, BdrvRequestFlags flags)
61007b31 672{
74021bc4
EB
673 QEMUIOVector qiov;
674 struct iovec iov = {
675 .iov_base = NULL,
676 .iov_len = count,
677 };
678
679 qemu_iovec_init_external(&qiov, &iov, 1);
e293b7a3 680 return bdrv_prwv_co(child, offset, &qiov, true,
74021bc4 681 BDRV_REQ_ZERO_WRITE | flags);
61007b31
SH
682}
683
684/*
74021bc4 685 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
61007b31
SH
686 * The operation is sped up by checking the block status and only writing
687 * zeroes to the device if they currently do not return zeroes. Optional
74021bc4 688 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
465fe887 689 * BDRV_REQ_FUA).
61007b31
SH
690 *
691 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
692 */
720ff280 693int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
61007b31
SH
694{
695 int64_t target_sectors, ret, nb_sectors, sector_num = 0;
720ff280 696 BlockDriverState *bs = child->bs;
67a0fd2a 697 BlockDriverState *file;
61007b31
SH
698 int n;
699
700 target_sectors = bdrv_nb_sectors(bs);
701 if (target_sectors < 0) {
702 return target_sectors;
703 }
704
705 for (;;) {
706 nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
707 if (nb_sectors <= 0) {
708 return 0;
709 }
67a0fd2a 710 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
61007b31
SH
711 if (ret < 0) {
712 error_report("error getting block status at sector %" PRId64 ": %s",
713 sector_num, strerror(-ret));
714 return ret;
715 }
716 if (ret & BDRV_BLOCK_ZERO) {
717 sector_num += n;
718 continue;
719 }
720ff280 720 ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS,
74021bc4 721 n << BDRV_SECTOR_BITS, flags);
61007b31
SH
722 if (ret < 0) {
723 error_report("error writing zeroes at sector %" PRId64 ": %s",
724 sector_num, strerror(-ret));
725 return ret;
726 }
727 sector_num += n;
728 }
729}
730
cf2ab8fc 731int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
f1e84741
KW
732{
733 int ret;
734
e293b7a3 735 ret = bdrv_prwv_co(child, offset, qiov, false, 0);
f1e84741
KW
736 if (ret < 0) {
737 return ret;
738 }
739
740 return qiov->size;
741}
742
cf2ab8fc 743int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
61007b31
SH
744{
745 QEMUIOVector qiov;
746 struct iovec iov = {
747 .iov_base = (void *)buf,
748 .iov_len = bytes,
749 };
61007b31
SH
750
751 if (bytes < 0) {
752 return -EINVAL;
753 }
754
755 qemu_iovec_init_external(&qiov, &iov, 1);
cf2ab8fc 756 return bdrv_preadv(child, offset, &qiov);
61007b31
SH
757}
758
d9ca2ea2 759int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
61007b31
SH
760{
761 int ret;
762
e293b7a3 763 ret = bdrv_prwv_co(child, offset, qiov, true, 0);
61007b31
SH
764 if (ret < 0) {
765 return ret;
766 }
767
768 return qiov->size;
769}
770
d9ca2ea2 771int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
61007b31
SH
772{
773 QEMUIOVector qiov;
774 struct iovec iov = {
775 .iov_base = (void *) buf,
776 .iov_len = bytes,
777 };
778
779 if (bytes < 0) {
780 return -EINVAL;
781 }
782
783 qemu_iovec_init_external(&qiov, &iov, 1);
d9ca2ea2 784 return bdrv_pwritev(child, offset, &qiov);
61007b31
SH
785}
786
787/*
788 * Writes to the file and ensures that no writes are reordered across this
789 * request (acts as a barrier)
790 *
791 * Returns 0 on success, -errno in error cases.
792 */
d9ca2ea2
KW
793int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
794 const void *buf, int count)
61007b31
SH
795{
796 int ret;
797
d9ca2ea2 798 ret = bdrv_pwrite(child, offset, buf, count);
61007b31
SH
799 if (ret < 0) {
800 return ret;
801 }
802
d9ca2ea2 803 ret = bdrv_flush(child->bs);
855a6a93
KW
804 if (ret < 0) {
805 return ret;
61007b31
SH
806 }
807
808 return 0;
809}
810
08844473
KW
811typedef struct CoroutineIOCompletion {
812 Coroutine *coroutine;
813 int ret;
814} CoroutineIOCompletion;
815
816static void bdrv_co_io_em_complete(void *opaque, int ret)
817{
818 CoroutineIOCompletion *co = opaque;
819
820 co->ret = ret;
b9e413dd 821 aio_co_wake(co->coroutine);
08844473
KW
822}
823
166fe960
KW
824static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
825 uint64_t offset, uint64_t bytes,
826 QEMUIOVector *qiov, int flags)
827{
828 BlockDriver *drv = bs->drv;
3fb06697
KW
829 int64_t sector_num;
830 unsigned int nb_sectors;
831
fa166538
EB
832 assert(!(flags & ~BDRV_REQ_MASK));
833
3fb06697
KW
834 if (drv->bdrv_co_preadv) {
835 return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
836 }
837
838 sector_num = offset >> BDRV_SECTOR_BITS;
839 nb_sectors = bytes >> BDRV_SECTOR_BITS;
166fe960
KW
840
841 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
842 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
843 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
844
08844473
KW
845 if (drv->bdrv_co_readv) {
846 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
847 } else {
848 BlockAIOCB *acb;
849 CoroutineIOCompletion co = {
850 .coroutine = qemu_coroutine_self(),
851 };
852
853 acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
854 bdrv_co_io_em_complete, &co);
855 if (acb == NULL) {
856 return -EIO;
857 } else {
858 qemu_coroutine_yield();
859 return co.ret;
860 }
861 }
166fe960
KW
862}
863
78a07294
KW
864static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
865 uint64_t offset, uint64_t bytes,
866 QEMUIOVector *qiov, int flags)
867{
868 BlockDriver *drv = bs->drv;
3fb06697
KW
869 int64_t sector_num;
870 unsigned int nb_sectors;
78a07294
KW
871 int ret;
872
fa166538
EB
873 assert(!(flags & ~BDRV_REQ_MASK));
874
3fb06697 875 if (drv->bdrv_co_pwritev) {
515c2f43
KW
876 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
877 flags & bs->supported_write_flags);
878 flags &= ~bs->supported_write_flags;
3fb06697
KW
879 goto emulate_flags;
880 }
881
882 sector_num = offset >> BDRV_SECTOR_BITS;
883 nb_sectors = bytes >> BDRV_SECTOR_BITS;
884
78a07294
KW
885 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
886 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
887 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
888
889 if (drv->bdrv_co_writev_flags) {
890 ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
4df863f3
EB
891 flags & bs->supported_write_flags);
892 flags &= ~bs->supported_write_flags;
08844473 893 } else if (drv->bdrv_co_writev) {
4df863f3 894 assert(!bs->supported_write_flags);
78a07294 895 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
08844473
KW
896 } else {
897 BlockAIOCB *acb;
898 CoroutineIOCompletion co = {
899 .coroutine = qemu_coroutine_self(),
900 };
901
902 acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
903 bdrv_co_io_em_complete, &co);
904 if (acb == NULL) {
3fb06697 905 ret = -EIO;
08844473
KW
906 } else {
907 qemu_coroutine_yield();
3fb06697 908 ret = co.ret;
08844473 909 }
78a07294
KW
910 }
911
3fb06697 912emulate_flags:
4df863f3 913 if (ret == 0 && (flags & BDRV_REQ_FUA)) {
78a07294
KW
914 ret = bdrv_co_flush(bs);
915 }
916
917 return ret;
918}
919
29a298af
PB
920static int coroutine_fn
921bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
922 uint64_t bytes, QEMUIOVector *qiov)
923{
924 BlockDriver *drv = bs->drv;
925
926 if (!drv->bdrv_co_pwritev_compressed) {
927 return -ENOTSUP;
928 }
929
29a298af
PB
930 return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
931}
932
85c97ca7 933static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
244483e6 934 int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
61007b31 935{
85c97ca7
KW
936 BlockDriverState *bs = child->bs;
937
61007b31
SH
938 /* Perform I/O through a temporary buffer so that users who scribble over
939 * their read buffer while the operation is in progress do not end up
940 * modifying the image file. This is critical for zero-copy guest I/O
941 * where anything might happen inside guest memory.
942 */
943 void *bounce_buffer;
944
945 BlockDriver *drv = bs->drv;
946 struct iovec iov;
947 QEMUIOVector bounce_qiov;
244483e6
KW
948 int64_t cluster_offset;
949 unsigned int cluster_bytes;
61007b31
SH
950 size_t skip_bytes;
951 int ret;
952
1bf03e66
KW
953 /* FIXME We cannot require callers to have write permissions when all they
954 * are doing is a read request. If we did things right, write permissions
955 * would be obtained anyway, but internally by the copy-on-read code. As
956 * long as it is implemented here rather than in a separat filter driver,
957 * the copy-on-read code doesn't have its own BdrvChild, however, for which
958 * it could request permissions. Therefore we have to bypass the permission
959 * system for the moment. */
960 // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
afa4b293 961
61007b31
SH
962 /* Cover entire cluster so no additional backing file I/O is required when
963 * allocating cluster in the image file.
964 */
244483e6 965 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
61007b31 966
244483e6
KW
967 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
968 cluster_offset, cluster_bytes);
61007b31 969
244483e6 970 iov.iov_len = cluster_bytes;
61007b31
SH
971 iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
972 if (bounce_buffer == NULL) {
973 ret = -ENOMEM;
974 goto err;
975 }
976
977 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
978
244483e6 979 ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
166fe960 980 &bounce_qiov, 0);
61007b31
SH
981 if (ret < 0) {
982 goto err;
983 }
984
c1499a5e 985 if (drv->bdrv_co_pwrite_zeroes &&
61007b31 986 buffer_is_zero(bounce_buffer, iov.iov_len)) {
a604fa2b
EB
987 /* FIXME: Should we (perhaps conditionally) be setting
988 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
989 * that still correctly reads as zero? */
244483e6 990 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
61007b31
SH
991 } else {
992 /* This does not change the data on the disk, it is not necessary
993 * to flush even in cache=writethrough mode.
994 */
244483e6 995 ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
78a07294 996 &bounce_qiov, 0);
61007b31
SH
997 }
998
999 if (ret < 0) {
1000 /* It might be okay to ignore write errors for guest requests. If this
1001 * is a deliberate copy-on-read then we don't want to ignore the error.
1002 * Simply report it in all cases.
1003 */
1004 goto err;
1005 }
1006
244483e6
KW
1007 skip_bytes = offset - cluster_offset;
1008 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
61007b31
SH
1009
1010err:
1011 qemu_vfree(bounce_buffer);
1012 return ret;
1013}
1014
1015/*
1016 * Forwards an already correctly aligned request to the BlockDriver. This
1a62d0ac
EB
1017 * handles copy on read, zeroing after EOF, and fragmentation of large
1018 * reads; any other features must be implemented by the caller.
61007b31 1019 */
85c97ca7 1020static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
61007b31
SH
1021 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1022 int64_t align, QEMUIOVector *qiov, int flags)
1023{
85c97ca7 1024 BlockDriverState *bs = child->bs;
c9d20029 1025 int64_t total_bytes, max_bytes;
1a62d0ac
EB
1026 int ret = 0;
1027 uint64_t bytes_remaining = bytes;
1028 int max_transfer;
61007b31 1029
49c07526
KW
1030 assert(is_power_of_2(align));
1031 assert((offset & (align - 1)) == 0);
1032 assert((bytes & (align - 1)) == 0);
61007b31 1033 assert(!qiov || bytes == qiov->size);
abb06c5a 1034 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1a62d0ac
EB
1035 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1036 align);
a604fa2b
EB
1037
1038 /* TODO: We would need a per-BDS .supported_read_flags and
1039 * potential fallback support, if we ever implement any read flags
1040 * to pass through to drivers. For now, there aren't any
1041 * passthrough flags. */
1042 assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
61007b31
SH
1043
1044 /* Handle Copy on Read and associated serialisation */
1045 if (flags & BDRV_REQ_COPY_ON_READ) {
1046 /* If we touch the same cluster it counts as an overlap. This
1047 * guarantees that allocating writes will be serialized and not race
1048 * with each other for the same cluster. For example, in copy-on-read
1049 * it ensures that the CoR read and write operations are atomic and
1050 * guest writes cannot interleave between them. */
1051 mark_request_serialising(req, bdrv_get_cluster_size(bs));
1052 }
1053
61408b25
FZ
1054 if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1055 wait_serialising_requests(req);
1056 }
61007b31
SH
1057
1058 if (flags & BDRV_REQ_COPY_ON_READ) {
49c07526
KW
1059 int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1060 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1061 unsigned int nb_sectors = end_sector - start_sector;
61007b31
SH
1062 int pnum;
1063
49c07526 1064 ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
61007b31
SH
1065 if (ret < 0) {
1066 goto out;
1067 }
1068
1069 if (!ret || pnum != nb_sectors) {
85c97ca7 1070 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
61007b31
SH
1071 goto out;
1072 }
1073 }
1074
1a62d0ac 1075 /* Forward the request to the BlockDriver, possibly fragmenting it */
c9d20029
KW
1076 total_bytes = bdrv_getlength(bs);
1077 if (total_bytes < 0) {
1078 ret = total_bytes;
1079 goto out;
1080 }
61007b31 1081
c9d20029 1082 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1a62d0ac 1083 if (bytes <= max_bytes && bytes <= max_transfer) {
c9d20029 1084 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1a62d0ac
EB
1085 goto out;
1086 }
61007b31 1087
1a62d0ac
EB
1088 while (bytes_remaining) {
1089 int num;
61007b31 1090
1a62d0ac
EB
1091 if (max_bytes) {
1092 QEMUIOVector local_qiov;
61007b31 1093
1a62d0ac
EB
1094 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1095 assert(num);
1096 qemu_iovec_init(&local_qiov, qiov->niov);
1097 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
61007b31 1098
1a62d0ac
EB
1099 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1100 num, &local_qiov, 0);
1101 max_bytes -= num;
1102 qemu_iovec_destroy(&local_qiov);
1103 } else {
1104 num = bytes_remaining;
1105 ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1106 bytes_remaining);
1107 }
1108 if (ret < 0) {
1109 goto out;
1110 }
1111 bytes_remaining -= num;
61007b31
SH
1112 }
1113
1114out:
1a62d0ac 1115 return ret < 0 ? ret : 0;
61007b31
SH
1116}
1117
61007b31
SH
1118/*
1119 * Handle a read request in coroutine context
1120 */
a03ef88f 1121int coroutine_fn bdrv_co_preadv(BdrvChild *child,
61007b31
SH
1122 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1123 BdrvRequestFlags flags)
1124{
a03ef88f 1125 BlockDriverState *bs = child->bs;
61007b31
SH
1126 BlockDriver *drv = bs->drv;
1127 BdrvTrackedRequest req;
1128
a5b8dd2c 1129 uint64_t align = bs->bl.request_alignment;
61007b31
SH
1130 uint8_t *head_buf = NULL;
1131 uint8_t *tail_buf = NULL;
1132 QEMUIOVector local_qiov;
1133 bool use_local_qiov = false;
1134 int ret;
1135
1136 if (!drv) {
1137 return -ENOMEDIUM;
1138 }
1139
1140 ret = bdrv_check_byte_request(bs, offset, bytes);
1141 if (ret < 0) {
1142 return ret;
1143 }
1144
99723548
PB
1145 bdrv_inc_in_flight(bs);
1146
9568b511 1147 /* Don't do copy-on-read if we read data before write operation */
d3faa13e 1148 if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
61007b31
SH
1149 flags |= BDRV_REQ_COPY_ON_READ;
1150 }
1151
61007b31
SH
1152 /* Align read if necessary by padding qiov */
1153 if (offset & (align - 1)) {
1154 head_buf = qemu_blockalign(bs, align);
1155 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1156 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1157 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1158 use_local_qiov = true;
1159
1160 bytes += offset & (align - 1);
1161 offset = offset & ~(align - 1);
1162 }
1163
1164 if ((offset + bytes) & (align - 1)) {
1165 if (!use_local_qiov) {
1166 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1167 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1168 use_local_qiov = true;
1169 }
1170 tail_buf = qemu_blockalign(bs, align);
1171 qemu_iovec_add(&local_qiov, tail_buf,
1172 align - ((offset + bytes) & (align - 1)));
1173
1174 bytes = ROUND_UP(bytes, align);
1175 }
1176
ebde595c 1177 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
85c97ca7 1178 ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
61007b31
SH
1179 use_local_qiov ? &local_qiov : qiov,
1180 flags);
1181 tracked_request_end(&req);
99723548 1182 bdrv_dec_in_flight(bs);
61007b31
SH
1183
1184 if (use_local_qiov) {
1185 qemu_iovec_destroy(&local_qiov);
1186 qemu_vfree(head_buf);
1187 qemu_vfree(tail_buf);
1188 }
1189
1190 return ret;
1191}
1192
adad6496 1193static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
61007b31
SH
1194 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1195 BdrvRequestFlags flags)
1196{
1197 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1198 return -EINVAL;
1199 }
1200
a03ef88f 1201 return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
cab3a356 1202 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
61007b31
SH
1203}
1204
28b04a8f
KW
1205int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
1206 int nb_sectors, QEMUIOVector *qiov)
61007b31 1207{
28b04a8f 1208 trace_bdrv_co_readv(child->bs, sector_num, nb_sectors);
61007b31 1209
adad6496 1210 return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
61007b31
SH
1211}
1212
5def6b80
EB
1213/* Maximum buffer for write zeroes fallback, in bytes */
1214#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
61007b31 1215
d05aa8bb
EB
1216static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1217 int64_t offset, int count, BdrvRequestFlags flags)
61007b31
SH
1218{
1219 BlockDriver *drv = bs->drv;
1220 QEMUIOVector qiov;
1221 struct iovec iov = {0};
1222 int ret = 0;
465fe887 1223 bool need_flush = false;
443668ca
DL
1224 int head = 0;
1225 int tail = 0;
61007b31 1226
cf081fca 1227 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
a5b8dd2c
EB
1228 int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1229 bs->bl.request_alignment);
b2f95fee
EB
1230 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1231 MAX_WRITE_ZEROES_BOUNCE_BUFFER);
d05aa8bb 1232
b8d0a980
EB
1233 assert(alignment % bs->bl.request_alignment == 0);
1234 head = offset % alignment;
1235 tail = (offset + count) % alignment;
1236 max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1237 assert(max_write_zeroes >= bs->bl.request_alignment);
61007b31 1238
d05aa8bb
EB
1239 while (count > 0 && !ret) {
1240 int num = count;
61007b31
SH
1241
1242 /* Align request. Block drivers can expect the "bulk" of the request
443668ca
DL
1243 * to be aligned, and that unaligned requests do not cross cluster
1244 * boundaries.
61007b31 1245 */
443668ca 1246 if (head) {
b2f95fee
EB
1247 /* Make a small request up to the first aligned sector. For
1248 * convenience, limit this request to max_transfer even if
1249 * we don't need to fall back to writes. */
1250 num = MIN(MIN(count, max_transfer), alignment - head);
1251 head = (head + num) % alignment;
1252 assert(num < max_write_zeroes);
d05aa8bb 1253 } else if (tail && num > alignment) {
443668ca
DL
1254 /* Shorten the request to the last aligned sector. */
1255 num -= tail;
61007b31
SH
1256 }
1257
1258 /* limit request size */
1259 if (num > max_write_zeroes) {
1260 num = max_write_zeroes;
1261 }
1262
1263 ret = -ENOTSUP;
1264 /* First try the efficient write zeroes operation */
d05aa8bb
EB
1265 if (drv->bdrv_co_pwrite_zeroes) {
1266 ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1267 flags & bs->supported_zero_flags);
1268 if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1269 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1270 need_flush = true;
1271 }
465fe887
EB
1272 } else {
1273 assert(!bs->supported_zero_flags);
61007b31
SH
1274 }
1275
1276 if (ret == -ENOTSUP) {
1277 /* Fall back to bounce buffer if write zeroes is unsupported */
465fe887
EB
1278 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1279
1280 if ((flags & BDRV_REQ_FUA) &&
1281 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1282 /* No need for bdrv_driver_pwrite() to do a fallback
1283 * flush on each chunk; use just one at the end */
1284 write_flags &= ~BDRV_REQ_FUA;
1285 need_flush = true;
1286 }
5def6b80 1287 num = MIN(num, max_transfer);
d05aa8bb 1288 iov.iov_len = num;
61007b31 1289 if (iov.iov_base == NULL) {
d05aa8bb 1290 iov.iov_base = qemu_try_blockalign(bs, num);
61007b31
SH
1291 if (iov.iov_base == NULL) {
1292 ret = -ENOMEM;
1293 goto fail;
1294 }
d05aa8bb 1295 memset(iov.iov_base, 0, num);
61007b31
SH
1296 }
1297 qemu_iovec_init_external(&qiov, &iov, 1);
1298
d05aa8bb 1299 ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
61007b31
SH
1300
1301 /* Keep bounce buffer around if it is big enough for all
1302 * all future requests.
1303 */
5def6b80 1304 if (num < max_transfer) {
61007b31
SH
1305 qemu_vfree(iov.iov_base);
1306 iov.iov_base = NULL;
1307 }
1308 }
1309
d05aa8bb
EB
1310 offset += num;
1311 count -= num;
61007b31
SH
1312 }
1313
1314fail:
465fe887
EB
1315 if (ret == 0 && need_flush) {
1316 ret = bdrv_co_flush(bs);
1317 }
61007b31
SH
1318 qemu_vfree(iov.iov_base);
1319 return ret;
1320}
1321
1322/*
04ed95f4
EB
1323 * Forwards an already correctly aligned write request to the BlockDriver,
1324 * after possibly fragmenting it.
61007b31 1325 */
85c97ca7 1326static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
61007b31 1327 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
cff86b38 1328 int64_t align, QEMUIOVector *qiov, int flags)
61007b31 1329{
85c97ca7 1330 BlockDriverState *bs = child->bs;
61007b31
SH
1331 BlockDriver *drv = bs->drv;
1332 bool waited;
1333 int ret;
1334
9896c876
KW
1335 int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1336 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
04ed95f4
EB
1337 uint64_t bytes_remaining = bytes;
1338 int max_transfer;
61007b31 1339
cff86b38
EB
1340 assert(is_power_of_2(align));
1341 assert((offset & (align - 1)) == 0);
1342 assert((bytes & (align - 1)) == 0);
61007b31 1343 assert(!qiov || bytes == qiov->size);
abb06c5a 1344 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
fa166538 1345 assert(!(flags & ~BDRV_REQ_MASK));
04ed95f4
EB
1346 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1347 align);
61007b31
SH
1348
1349 waited = wait_serialising_requests(req);
1350 assert(!waited || !req->serialising);
1351 assert(req->overlap_offset <= offset);
1352 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
362b3786
HR
1353 assert(child->perm & BLK_PERM_WRITE);
1354 assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
61007b31
SH
1355
1356 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1357
1358 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
c1499a5e 1359 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
61007b31
SH
1360 qemu_iovec_is_zero(qiov)) {
1361 flags |= BDRV_REQ_ZERO_WRITE;
1362 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1363 flags |= BDRV_REQ_MAY_UNMAP;
1364 }
1365 }
1366
1367 if (ret < 0) {
1368 /* Do nothing, write notifier decided to fail this request */
1369 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9a4f4c31 1370 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
9896c876 1371 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
3ea1a091
PB
1372 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1373 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
04ed95f4 1374 } else if (bytes <= max_transfer) {
9a4f4c31 1375 bdrv_debug_event(bs, BLKDBG_PWRITEV);
78a07294 1376 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
04ed95f4
EB
1377 } else {
1378 bdrv_debug_event(bs, BLKDBG_PWRITEV);
1379 while (bytes_remaining) {
1380 int num = MIN(bytes_remaining, max_transfer);
1381 QEMUIOVector local_qiov;
1382 int local_flags = flags;
1383
1384 assert(num);
1385 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1386 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1387 /* If FUA is going to be emulated by flush, we only
1388 * need to flush on the last iteration */
1389 local_flags &= ~BDRV_REQ_FUA;
1390 }
1391 qemu_iovec_init(&local_qiov, qiov->niov);
1392 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1393
1394 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1395 num, &local_qiov, local_flags);
1396 qemu_iovec_destroy(&local_qiov);
1397 if (ret < 0) {
1398 break;
1399 }
1400 bytes_remaining -= num;
1401 }
61007b31 1402 }
9a4f4c31 1403 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
61007b31 1404
3ff2f67a 1405 ++bs->write_gen;
9896c876 1406 bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
61007b31 1407
53d8f9d8
HR
1408 if (bs->wr_highest_offset < offset + bytes) {
1409 bs->wr_highest_offset = offset + bytes;
1410 }
61007b31
SH
1411
1412 if (ret >= 0) {
9896c876 1413 bs->total_sectors = MAX(bs->total_sectors, end_sector);
04ed95f4 1414 ret = 0;
61007b31
SH
1415 }
1416
1417 return ret;
1418}
1419
85c97ca7 1420static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
9eeb6dd1
FZ
1421 int64_t offset,
1422 unsigned int bytes,
1423 BdrvRequestFlags flags,
1424 BdrvTrackedRequest *req)
1425{
85c97ca7 1426 BlockDriverState *bs = child->bs;
9eeb6dd1
FZ
1427 uint8_t *buf = NULL;
1428 QEMUIOVector local_qiov;
1429 struct iovec iov;
a5b8dd2c 1430 uint64_t align = bs->bl.request_alignment;
9eeb6dd1
FZ
1431 unsigned int head_padding_bytes, tail_padding_bytes;
1432 int ret = 0;
1433
1434 head_padding_bytes = offset & (align - 1);
f13ce1be 1435 tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
9eeb6dd1
FZ
1436
1437
1438 assert(flags & BDRV_REQ_ZERO_WRITE);
1439 if (head_padding_bytes || tail_padding_bytes) {
1440 buf = qemu_blockalign(bs, align);
1441 iov = (struct iovec) {
1442 .iov_base = buf,
1443 .iov_len = align,
1444 };
1445 qemu_iovec_init_external(&local_qiov, &iov, 1);
1446 }
1447 if (head_padding_bytes) {
1448 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1449
1450 /* RMW the unaligned part before head. */
1451 mark_request_serialising(req, align);
1452 wait_serialising_requests(req);
9a4f4c31 1453 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
85c97ca7 1454 ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
9eeb6dd1
FZ
1455 align, &local_qiov, 0);
1456 if (ret < 0) {
1457 goto fail;
1458 }
9a4f4c31 1459 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
9eeb6dd1
FZ
1460
1461 memset(buf + head_padding_bytes, 0, zero_bytes);
85c97ca7 1462 ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
cff86b38 1463 align, &local_qiov,
9eeb6dd1
FZ
1464 flags & ~BDRV_REQ_ZERO_WRITE);
1465 if (ret < 0) {
1466 goto fail;
1467 }
1468 offset += zero_bytes;
1469 bytes -= zero_bytes;
1470 }
1471
1472 assert(!bytes || (offset & (align - 1)) == 0);
1473 if (bytes >= align) {
1474 /* Write the aligned part in the middle. */
1475 uint64_t aligned_bytes = bytes & ~(align - 1);
85c97ca7 1476 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
9eeb6dd1
FZ
1477 NULL, flags);
1478 if (ret < 0) {
1479 goto fail;
1480 }
1481 bytes -= aligned_bytes;
1482 offset += aligned_bytes;
1483 }
1484
1485 assert(!bytes || (offset & (align - 1)) == 0);
1486 if (bytes) {
1487 assert(align == tail_padding_bytes + bytes);
1488 /* RMW the unaligned part after tail. */
1489 mark_request_serialising(req, align);
1490 wait_serialising_requests(req);
9a4f4c31 1491 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
85c97ca7 1492 ret = bdrv_aligned_preadv(child, req, offset, align,
9eeb6dd1
FZ
1493 align, &local_qiov, 0);
1494 if (ret < 0) {
1495 goto fail;
1496 }
9a4f4c31 1497 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
9eeb6dd1
FZ
1498
1499 memset(buf, 0, bytes);
85c97ca7 1500 ret = bdrv_aligned_pwritev(child, req, offset, align, align,
9eeb6dd1
FZ
1501 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1502 }
1503fail:
1504 qemu_vfree(buf);
1505 return ret;
1506
1507}
1508
61007b31
SH
1509/*
1510 * Handle a write request in coroutine context
1511 */
a03ef88f 1512int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
61007b31
SH
1513 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1514 BdrvRequestFlags flags)
1515{
a03ef88f 1516 BlockDriverState *bs = child->bs;
61007b31 1517 BdrvTrackedRequest req;
a5b8dd2c 1518 uint64_t align = bs->bl.request_alignment;
61007b31
SH
1519 uint8_t *head_buf = NULL;
1520 uint8_t *tail_buf = NULL;
1521 QEMUIOVector local_qiov;
1522 bool use_local_qiov = false;
1523 int ret;
1524
1525 if (!bs->drv) {
1526 return -ENOMEDIUM;
1527 }
1528 if (bs->read_only) {
eaf5fe2d 1529 return -EPERM;
61007b31 1530 }
04c01a5c 1531 assert(!(bs->open_flags & BDRV_O_INACTIVE));
61007b31
SH
1532
1533 ret = bdrv_check_byte_request(bs, offset, bytes);
1534 if (ret < 0) {
1535 return ret;
1536 }
1537
99723548 1538 bdrv_inc_in_flight(bs);
61007b31
SH
1539 /*
1540 * Align write if necessary by performing a read-modify-write cycle.
1541 * Pad qiov with the read parts and be sure to have a tracked request not
1542 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1543 */
ebde595c 1544 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
61007b31 1545
9eeb6dd1 1546 if (!qiov) {
85c97ca7 1547 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
9eeb6dd1
FZ
1548 goto out;
1549 }
1550
61007b31
SH
1551 if (offset & (align - 1)) {
1552 QEMUIOVector head_qiov;
1553 struct iovec head_iov;
1554
1555 mark_request_serialising(&req, align);
1556 wait_serialising_requests(&req);
1557
1558 head_buf = qemu_blockalign(bs, align);
1559 head_iov = (struct iovec) {
1560 .iov_base = head_buf,
1561 .iov_len = align,
1562 };
1563 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1564
9a4f4c31 1565 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
85c97ca7 1566 ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
61007b31
SH
1567 align, &head_qiov, 0);
1568 if (ret < 0) {
1569 goto fail;
1570 }
9a4f4c31 1571 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
61007b31
SH
1572
1573 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1574 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1575 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1576 use_local_qiov = true;
1577
1578 bytes += offset & (align - 1);
1579 offset = offset & ~(align - 1);
117bc3fa
PL
1580
1581 /* We have read the tail already if the request is smaller
1582 * than one aligned block.
1583 */
1584 if (bytes < align) {
1585 qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1586 bytes = align;
1587 }
61007b31
SH
1588 }
1589
1590 if ((offset + bytes) & (align - 1)) {
1591 QEMUIOVector tail_qiov;
1592 struct iovec tail_iov;
1593 size_t tail_bytes;
1594 bool waited;
1595
1596 mark_request_serialising(&req, align);
1597 waited = wait_serialising_requests(&req);
1598 assert(!waited || !use_local_qiov);
1599
1600 tail_buf = qemu_blockalign(bs, align);
1601 tail_iov = (struct iovec) {
1602 .iov_base = tail_buf,
1603 .iov_len = align,
1604 };
1605 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1606
9a4f4c31 1607 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
85c97ca7
KW
1608 ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
1609 align, align, &tail_qiov, 0);
61007b31
SH
1610 if (ret < 0) {
1611 goto fail;
1612 }
9a4f4c31 1613 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
61007b31
SH
1614
1615 if (!use_local_qiov) {
1616 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1617 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1618 use_local_qiov = true;
1619 }
1620
1621 tail_bytes = (offset + bytes) & (align - 1);
1622 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1623
1624 bytes = ROUND_UP(bytes, align);
1625 }
1626
85c97ca7 1627 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
3ea1a091
PB
1628 use_local_qiov ? &local_qiov : qiov,
1629 flags);
61007b31
SH
1630
1631fail:
61007b31
SH
1632
1633 if (use_local_qiov) {
1634 qemu_iovec_destroy(&local_qiov);
1635 }
1636 qemu_vfree(head_buf);
1637 qemu_vfree(tail_buf);
9eeb6dd1
FZ
1638out:
1639 tracked_request_end(&req);
99723548 1640 bdrv_dec_in_flight(bs);
61007b31
SH
1641 return ret;
1642}
1643
adad6496 1644static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
61007b31
SH
1645 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1646 BdrvRequestFlags flags)
1647{
1648 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1649 return -EINVAL;
1650 }
1651
a03ef88f 1652 return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
cab3a356 1653 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
61007b31
SH
1654}
1655
25ec177d 1656int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
61007b31
SH
1657 int nb_sectors, QEMUIOVector *qiov)
1658{
25ec177d 1659 trace_bdrv_co_writev(child->bs, sector_num, nb_sectors);
61007b31 1660
adad6496 1661 return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
61007b31
SH
1662}
1663
a03ef88f
KW
1664int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1665 int count, BdrvRequestFlags flags)
61007b31 1666{
a03ef88f 1667 trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
61007b31 1668
a03ef88f 1669 if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
61007b31
SH
1670 flags &= ~BDRV_REQ_MAY_UNMAP;
1671 }
61007b31 1672
a03ef88f 1673 return bdrv_co_pwritev(child, offset, count, NULL,
74021bc4 1674 BDRV_REQ_ZERO_WRITE | flags);
61007b31
SH
1675}
1676
4085f5c7
JS
1677/*
1678 * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
1679 */
1680int bdrv_flush_all(void)
1681{
1682 BdrvNextIterator it;
1683 BlockDriverState *bs = NULL;
1684 int result = 0;
1685
1686 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1687 AioContext *aio_context = bdrv_get_aio_context(bs);
1688 int ret;
1689
1690 aio_context_acquire(aio_context);
1691 ret = bdrv_flush(bs);
1692 if (ret < 0 && !result) {
1693 result = ret;
1694 }
1695 aio_context_release(aio_context);
1696 }
1697
1698 return result;
1699}
1700
1701
61007b31
SH
1702typedef struct BdrvCoGetBlockStatusData {
1703 BlockDriverState *bs;
1704 BlockDriverState *base;
67a0fd2a 1705 BlockDriverState **file;
61007b31
SH
1706 int64_t sector_num;
1707 int nb_sectors;
1708 int *pnum;
1709 int64_t ret;
1710 bool done;
1711} BdrvCoGetBlockStatusData;
1712
1713/*
1714 * Returns the allocation status of the specified sectors.
1715 * Drivers not implementing the functionality are assumed to not support
1716 * backing files, hence all their sectors are reported as allocated.
1717 *
1718 * If 'sector_num' is beyond the end of the disk image the return value is 0
1719 * and 'pnum' is set to 0.
1720 *
1721 * 'pnum' is set to the number of sectors (including and immediately following
1722 * the specified sector) that are known to be in the same
1723 * allocated/unallocated state.
1724 *
1725 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
1726 * beyond the end of the disk image it will be clamped.
67a0fd2a
FZ
1727 *
1728 * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
1729 * points to the BDS which the sector range is allocated in.
61007b31
SH
1730 */
1731static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1732 int64_t sector_num,
67a0fd2a
FZ
1733 int nb_sectors, int *pnum,
1734 BlockDriverState **file)
61007b31
SH
1735{
1736 int64_t total_sectors;
1737 int64_t n;
1738 int64_t ret, ret2;
1739
1740 total_sectors = bdrv_nb_sectors(bs);
1741 if (total_sectors < 0) {
1742 return total_sectors;
1743 }
1744
1745 if (sector_num >= total_sectors) {
1746 *pnum = 0;
1747 return 0;
1748 }
1749
1750 n = total_sectors - sector_num;
1751 if (n < nb_sectors) {
1752 nb_sectors = n;
1753 }
1754
1755 if (!bs->drv->bdrv_co_get_block_status) {
1756 *pnum = nb_sectors;
1757 ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1758 if (bs->drv->protocol_name) {
1759 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1760 }
1761 return ret;
1762 }
1763
67a0fd2a 1764 *file = NULL;
99723548 1765 bdrv_inc_in_flight(bs);
67a0fd2a
FZ
1766 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1767 file);
61007b31
SH
1768 if (ret < 0) {
1769 *pnum = 0;
99723548 1770 goto out;
61007b31
SH
1771 }
1772
1773 if (ret & BDRV_BLOCK_RAW) {
1774 assert(ret & BDRV_BLOCK_OFFSET_VALID);
ee29d6ad
EB
1775 ret = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1776 *pnum, pnum, file);
99723548 1777 goto out;
61007b31
SH
1778 }
1779
1780 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1781 ret |= BDRV_BLOCK_ALLOCATED;
a53f1a95 1782 } else {
61007b31
SH
1783 if (bdrv_unallocated_blocks_are_zero(bs)) {
1784 ret |= BDRV_BLOCK_ZERO;
760e0063
KW
1785 } else if (bs->backing) {
1786 BlockDriverState *bs2 = bs->backing->bs;
61007b31
SH
1787 int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1788 if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1789 ret |= BDRV_BLOCK_ZERO;
1790 }
1791 }
1792 }
1793
ac987b30 1794 if (*file && *file != bs &&
61007b31
SH
1795 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1796 (ret & BDRV_BLOCK_OFFSET_VALID)) {
67a0fd2a 1797 BlockDriverState *file2;
61007b31
SH
1798 int file_pnum;
1799
ac987b30 1800 ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
67a0fd2a 1801 *pnum, &file_pnum, &file2);
61007b31
SH
1802 if (ret2 >= 0) {
1803 /* Ignore errors. This is just providing extra information, it
1804 * is useful but not necessary.
1805 */
1806 if (!file_pnum) {
1807 /* !file_pnum indicates an offset at or beyond the EOF; it is
1808 * perfectly valid for the format block driver to point to such
1809 * offsets, so catch it and mark everything as zero */
1810 ret |= BDRV_BLOCK_ZERO;
1811 } else {
1812 /* Limit request to the range reported by the protocol driver */
1813 *pnum = file_pnum;
1814 ret |= (ret2 & BDRV_BLOCK_ZERO);
1815 }
1816 }
1817 }
1818
99723548
PB
1819out:
1820 bdrv_dec_in_flight(bs);
61007b31
SH
1821 return ret;
1822}
1823
ba3f0e25
FZ
1824static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1825 BlockDriverState *base,
1826 int64_t sector_num,
1827 int nb_sectors,
67a0fd2a
FZ
1828 int *pnum,
1829 BlockDriverState **file)
ba3f0e25
FZ
1830{
1831 BlockDriverState *p;
1832 int64_t ret = 0;
1833
1834 assert(bs != base);
760e0063 1835 for (p = bs; p != base; p = backing_bs(p)) {
67a0fd2a 1836 ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
ba3f0e25
FZ
1837 if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1838 break;
1839 }
1840 /* [sector_num, pnum] unallocated on this layer, which could be only
1841 * the first part of [sector_num, nb_sectors]. */
1842 nb_sectors = MIN(nb_sectors, *pnum);
1843 }
1844 return ret;
1845}
1846
1847/* Coroutine wrapper for bdrv_get_block_status_above() */
1848static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
61007b31
SH
1849{
1850 BdrvCoGetBlockStatusData *data = opaque;
61007b31 1851
ba3f0e25
FZ
1852 data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1853 data->sector_num,
1854 data->nb_sectors,
67a0fd2a
FZ
1855 data->pnum,
1856 data->file);
61007b31
SH
1857 data->done = true;
1858}
1859
1860/*
ba3f0e25 1861 * Synchronous wrapper around bdrv_co_get_block_status_above().
61007b31 1862 *
ba3f0e25 1863 * See bdrv_co_get_block_status_above() for details.
61007b31 1864 */
ba3f0e25
FZ
1865int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1866 BlockDriverState *base,
1867 int64_t sector_num,
67a0fd2a
FZ
1868 int nb_sectors, int *pnum,
1869 BlockDriverState **file)
61007b31
SH
1870{
1871 Coroutine *co;
1872 BdrvCoGetBlockStatusData data = {
1873 .bs = bs,
ba3f0e25 1874 .base = base,
67a0fd2a 1875 .file = file,
61007b31
SH
1876 .sector_num = sector_num,
1877 .nb_sectors = nb_sectors,
1878 .pnum = pnum,
1879 .done = false,
1880 };
1881
1882 if (qemu_in_coroutine()) {
1883 /* Fast-path if already in coroutine context */
ba3f0e25 1884 bdrv_get_block_status_above_co_entry(&data);
61007b31 1885 } else {
0b8b8753
PB
1886 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
1887 &data);
e92f0e19 1888 bdrv_coroutine_enter(bs, co);
88b062c2 1889 BDRV_POLL_WHILE(bs, !data.done);
61007b31
SH
1890 }
1891 return data.ret;
1892}
1893
ba3f0e25
FZ
1894int64_t bdrv_get_block_status(BlockDriverState *bs,
1895 int64_t sector_num,
67a0fd2a
FZ
1896 int nb_sectors, int *pnum,
1897 BlockDriverState **file)
ba3f0e25 1898{
760e0063 1899 return bdrv_get_block_status_above(bs, backing_bs(bs),
67a0fd2a 1900 sector_num, nb_sectors, pnum, file);
ba3f0e25
FZ
1901}
1902
61007b31
SH
1903int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1904 int nb_sectors, int *pnum)
1905{
67a0fd2a
FZ
1906 BlockDriverState *file;
1907 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1908 &file);
61007b31
SH
1909 if (ret < 0) {
1910 return ret;
1911 }
1912 return !!(ret & BDRV_BLOCK_ALLOCATED);
1913}
1914
1915/*
1916 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1917 *
1918 * Return true if the given sector is allocated in any image between
1919 * BASE and TOP (inclusive). BASE can be NULL to check if the given
1920 * sector is allocated in any image of the chain. Return false otherwise.
1921 *
1922 * 'pnum' is set to the number of sectors (including and immediately following
1923 * the specified sector) that are known to be in the same
1924 * allocated/unallocated state.
1925 *
1926 */
1927int bdrv_is_allocated_above(BlockDriverState *top,
1928 BlockDriverState *base,
1929 int64_t sector_num,
1930 int nb_sectors, int *pnum)
1931{
1932 BlockDriverState *intermediate;
1933 int ret, n = nb_sectors;
1934
1935 intermediate = top;
1936 while (intermediate && intermediate != base) {
1937 int pnum_inter;
1938 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1939 &pnum_inter);
1940 if (ret < 0) {
1941 return ret;
1942 } else if (ret) {
1943 *pnum = pnum_inter;
1944 return 1;
1945 }
1946
1947 /*
1948 * [sector_num, nb_sectors] is unallocated on top but intermediate
1949 * might have
1950 *
1951 * [sector_num+x, nr_sectors] allocated.
1952 */
1953 if (n > pnum_inter &&
1954 (intermediate == top ||
1955 sector_num + pnum_inter < intermediate->total_sectors)) {
1956 n = pnum_inter;
1957 }
1958
760e0063 1959 intermediate = backing_bs(intermediate);
61007b31
SH
1960 }
1961
1962 *pnum = n;
1963 return 0;
1964}
1965
1a8ae822
KW
1966typedef struct BdrvVmstateCo {
1967 BlockDriverState *bs;
1968 QEMUIOVector *qiov;
1969 int64_t pos;
1970 bool is_read;
1971 int ret;
1972} BdrvVmstateCo;
1973
1974static int coroutine_fn
1975bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1976 bool is_read)
1977{
1978 BlockDriver *drv = bs->drv;
1979
1980 if (!drv) {
1981 return -ENOMEDIUM;
1982 } else if (drv->bdrv_load_vmstate) {
1983 return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
1984 : drv->bdrv_save_vmstate(bs, qiov, pos);
1985 } else if (bs->file) {
1986 return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
1987 }
1988
1989 return -ENOTSUP;
1990}
1991
1992static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
1993{
1994 BdrvVmstateCo *co = opaque;
1995 co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
1996}
1997
1998static inline int
1999bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2000 bool is_read)
2001{
2002 if (qemu_in_coroutine()) {
2003 return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
2004 } else {
2005 BdrvVmstateCo data = {
2006 .bs = bs,
2007 .qiov = qiov,
2008 .pos = pos,
2009 .is_read = is_read,
2010 .ret = -EINPROGRESS,
2011 };
0b8b8753 2012 Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
1a8ae822 2013
e92f0e19 2014 bdrv_coroutine_enter(bs, co);
1a8ae822
KW
2015 while (data.ret == -EINPROGRESS) {
2016 aio_poll(bdrv_get_aio_context(bs), true);
2017 }
2018 return data.ret;
2019 }
2020}
2021
61007b31
SH
2022int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2023 int64_t pos, int size)
2024{
2025 QEMUIOVector qiov;
2026 struct iovec iov = {
2027 .iov_base = (void *) buf,
2028 .iov_len = size,
2029 };
b433d942 2030 int ret;
61007b31
SH
2031
2032 qemu_iovec_init_external(&qiov, &iov, 1);
b433d942
KW
2033
2034 ret = bdrv_writev_vmstate(bs, &qiov, pos);
2035 if (ret < 0) {
2036 return ret;
2037 }
2038
2039 return size;
61007b31
SH
2040}
2041
2042int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2043{
1a8ae822 2044 return bdrv_rw_vmstate(bs, qiov, pos, false);
61007b31
SH
2045}
2046
2047int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2048 int64_t pos, int size)
5ddda0b8
KW
2049{
2050 QEMUIOVector qiov;
2051 struct iovec iov = {
2052 .iov_base = buf,
2053 .iov_len = size,
2054 };
b433d942 2055 int ret;
5ddda0b8
KW
2056
2057 qemu_iovec_init_external(&qiov, &iov, 1);
b433d942
KW
2058 ret = bdrv_readv_vmstate(bs, &qiov, pos);
2059 if (ret < 0) {
2060 return ret;
2061 }
2062
2063 return size;
5ddda0b8
KW
2064}
2065
2066int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
61007b31 2067{
1a8ae822 2068 return bdrv_rw_vmstate(bs, qiov, pos, true);
61007b31
SH
2069}
2070
2071/**************************************************************/
2072/* async I/Os */
2073
ebb7af21 2074BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
61007b31
SH
2075 QEMUIOVector *qiov, int nb_sectors,
2076 BlockCompletionFunc *cb, void *opaque)
2077{
ebb7af21 2078 trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
61007b31 2079
b15404e0
EB
2080 assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
2081 return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
2082 0, cb, opaque, false);
61007b31
SH
2083}
2084
0d1049c7 2085BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
61007b31
SH
2086 QEMUIOVector *qiov, int nb_sectors,
2087 BlockCompletionFunc *cb, void *opaque)
2088{
0d1049c7 2089 trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
61007b31 2090
b15404e0
EB
2091 assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
2092 return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
2093 0, cb, opaque, true);
61007b31
SH
2094}
2095
61007b31
SH
2096void bdrv_aio_cancel(BlockAIOCB *acb)
2097{
2098 qemu_aio_ref(acb);
2099 bdrv_aio_cancel_async(acb);
2100 while (acb->refcnt > 1) {
2101 if (acb->aiocb_info->get_aio_context) {
2102 aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2103 } else if (acb->bs) {
2f47da5f
PB
2104 /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2105 * assert that we're not using an I/O thread. Thread-safe
2106 * code should use bdrv_aio_cancel_async exclusively.
2107 */
2108 assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
61007b31
SH
2109 aio_poll(bdrv_get_aio_context(acb->bs), true);
2110 } else {
2111 abort();
2112 }
2113 }
2114 qemu_aio_unref(acb);
2115}
2116
2117/* Async version of aio cancel. The caller is not blocked if the acb implements
2118 * cancel_async, otherwise we do nothing and let the request normally complete.
2119 * In either case the completion callback must be called. */
2120void bdrv_aio_cancel_async(BlockAIOCB *acb)
2121{
2122 if (acb->aiocb_info->cancel_async) {
2123 acb->aiocb_info->cancel_async(acb);
2124 }
2125}
2126
2127/**************************************************************/
2128/* async block device emulation */
2129
41574268
EB
2130typedef struct BlockRequest {
2131 union {
2132 /* Used during read, write, trim */
2133 struct {
b15404e0
EB
2134 int64_t offset;
2135 int bytes;
41574268
EB
2136 int flags;
2137 QEMUIOVector *qiov;
2138 };
2139 /* Used during ioctl */
2140 struct {
2141 int req;
2142 void *buf;
2143 };
2144 };
2145 BlockCompletionFunc *cb;
2146 void *opaque;
2147
2148 int error;
2149} BlockRequest;
2150
61007b31
SH
2151typedef struct BlockAIOCBCoroutine {
2152 BlockAIOCB common;
adad6496 2153 BdrvChild *child;
61007b31
SH
2154 BlockRequest req;
2155 bool is_write;
2156 bool need_bh;
2157 bool *done;
61007b31
SH
2158} BlockAIOCBCoroutine;
2159
2160static const AIOCBInfo bdrv_em_co_aiocb_info = {
2161 .aiocb_size = sizeof(BlockAIOCBCoroutine),
2162};
2163
2164static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2165{
2166 if (!acb->need_bh) {
99723548 2167 bdrv_dec_in_flight(acb->common.bs);
61007b31
SH
2168 acb->common.cb(acb->common.opaque, acb->req.error);
2169 qemu_aio_unref(acb);
2170 }
2171}
2172
2173static void bdrv_co_em_bh(void *opaque)
2174{
2175 BlockAIOCBCoroutine *acb = opaque;
2176
2177 assert(!acb->need_bh);
61007b31
SH
2178 bdrv_co_complete(acb);
2179}
2180
2181static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2182{
2183 acb->need_bh = false;
2184 if (acb->req.error != -EINPROGRESS) {
2185 BlockDriverState *bs = acb->common.bs;
2186
fffb6e12 2187 aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
61007b31
SH
2188 }
2189}
2190
2191/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2192static void coroutine_fn bdrv_co_do_rw(void *opaque)
2193{
2194 BlockAIOCBCoroutine *acb = opaque;
61007b31
SH
2195
2196 if (!acb->is_write) {
b15404e0
EB
2197 acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
2198 acb->req.qiov->size, acb->req.qiov, acb->req.flags);
61007b31 2199 } else {
b15404e0
EB
2200 acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
2201 acb->req.qiov->size, acb->req.qiov, acb->req.flags);
61007b31
SH
2202 }
2203
2204 bdrv_co_complete(acb);
2205}
2206
b15404e0
EB
2207static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
2208 int64_t offset,
2209 QEMUIOVector *qiov,
2210 BdrvRequestFlags flags,
2211 BlockCompletionFunc *cb,
2212 void *opaque,
2213 bool is_write)
61007b31
SH
2214{
2215 Coroutine *co;
2216 BlockAIOCBCoroutine *acb;
2217
99723548
PB
2218 /* Matched by bdrv_co_complete's bdrv_dec_in_flight. */
2219 bdrv_inc_in_flight(child->bs);
2220
adad6496
KW
2221 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
2222 acb->child = child;
61007b31
SH
2223 acb->need_bh = true;
2224 acb->req.error = -EINPROGRESS;
b15404e0 2225 acb->req.offset = offset;
61007b31
SH
2226 acb->req.qiov = qiov;
2227 acb->req.flags = flags;
2228 acb->is_write = is_write;
2229
0b8b8753 2230 co = qemu_coroutine_create(bdrv_co_do_rw, acb);
e92f0e19 2231 bdrv_coroutine_enter(child->bs, co);
61007b31
SH
2232
2233 bdrv_co_maybe_schedule_bh(acb);
2234 return &acb->common;
2235}
2236
2237static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2238{
2239 BlockAIOCBCoroutine *acb = opaque;
2240 BlockDriverState *bs = acb->common.bs;
2241
2242 acb->req.error = bdrv_co_flush(bs);
2243 bdrv_co_complete(acb);
2244}
2245
2246BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2247 BlockCompletionFunc *cb, void *opaque)
2248{
2249 trace_bdrv_aio_flush(bs, opaque);
2250
2251 Coroutine *co;
2252 BlockAIOCBCoroutine *acb;
2253
99723548
PB
2254 /* Matched by bdrv_co_complete's bdrv_dec_in_flight. */
2255 bdrv_inc_in_flight(bs);
2256
61007b31
SH
2257 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2258 acb->need_bh = true;
2259 acb->req.error = -EINPROGRESS;
2260
0b8b8753 2261 co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb);
e92f0e19 2262 bdrv_coroutine_enter(bs, co);
61007b31
SH
2263
2264 bdrv_co_maybe_schedule_bh(acb);
2265 return &acb->common;
2266}
2267
61007b31
SH
2268/**************************************************************/
2269/* Coroutine block device emulation */
2270
e293b7a3
KW
2271typedef struct FlushCo {
2272 BlockDriverState *bs;
2273 int ret;
2274} FlushCo;
2275
2276
61007b31
SH
2277static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2278{
e293b7a3 2279 FlushCo *rwco = opaque;
61007b31
SH
2280
2281 rwco->ret = bdrv_co_flush(rwco->bs);
2282}
2283
2284int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2285{
49ca6259
FZ
2286 int current_gen;
2287 int ret = 0;
2288
2289 bdrv_inc_in_flight(bs);
61007b31 2290
e914404e 2291 if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
1b6bc94d 2292 bdrv_is_sg(bs)) {
49ca6259 2293 goto early_exit;
61007b31
SH
2294 }
2295
49ca6259 2296 current_gen = bs->write_gen;
3ff2f67a
EY
2297
2298 /* Wait until any previous flushes are completed */
99723548 2299 while (bs->active_flush_req) {
1ace7cea 2300 qemu_co_queue_wait(&bs->flush_queue, NULL);
3ff2f67a
EY
2301 }
2302
99723548 2303 bs->active_flush_req = true;
3ff2f67a 2304
c32b82af
PD
2305 /* Write back all layers by calling one driver function */
2306 if (bs->drv->bdrv_co_flush) {
2307 ret = bs->drv->bdrv_co_flush(bs);
2308 goto out;
2309 }
2310
61007b31
SH
2311 /* Write back cached data to the OS even with cache=unsafe */
2312 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2313 if (bs->drv->bdrv_co_flush_to_os) {
2314 ret = bs->drv->bdrv_co_flush_to_os(bs);
2315 if (ret < 0) {
cdb5e315 2316 goto out;
61007b31
SH
2317 }
2318 }
2319
2320 /* But don't actually force it to the disk with cache=unsafe */
2321 if (bs->open_flags & BDRV_O_NO_FLUSH) {
2322 goto flush_parent;
2323 }
2324
3ff2f67a
EY
2325 /* Check if we really need to flush anything */
2326 if (bs->flushed_gen == current_gen) {
2327 goto flush_parent;
2328 }
2329
61007b31
SH
2330 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2331 if (bs->drv->bdrv_co_flush_to_disk) {
2332 ret = bs->drv->bdrv_co_flush_to_disk(bs);
2333 } else if (bs->drv->bdrv_aio_flush) {
2334 BlockAIOCB *acb;
2335 CoroutineIOCompletion co = {
2336 .coroutine = qemu_coroutine_self(),
2337 };
2338
2339 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2340 if (acb == NULL) {
2341 ret = -EIO;
2342 } else {
2343 qemu_coroutine_yield();
2344 ret = co.ret;
2345 }
2346 } else {
2347 /*
2348 * Some block drivers always operate in either writethrough or unsafe
2349 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
2350 * know how the server works (because the behaviour is hardcoded or
2351 * depends on server-side configuration), so we can't ensure that
2352 * everything is safe on disk. Returning an error doesn't work because
2353 * that would break guests even if the server operates in writethrough
2354 * mode.
2355 *
2356 * Let's hope the user knows what he's doing.
2357 */
2358 ret = 0;
2359 }
3ff2f67a 2360
61007b31 2361 if (ret < 0) {
cdb5e315 2362 goto out;
61007b31
SH
2363 }
2364
2365 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
2366 * in the case of cache=unsafe, so there are no useless flushes.
2367 */
2368flush_parent:
cdb5e315
FZ
2369 ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2370out:
3ff2f67a 2371 /* Notify any pending flushes that we have completed */
e6af1e08
KW
2372 if (ret == 0) {
2373 bs->flushed_gen = current_gen;
2374 }
99723548 2375 bs->active_flush_req = false;
156af3ac
DL
2376 /* Return value is ignored - it's ok if wait queue is empty */
2377 qemu_co_queue_next(&bs->flush_queue);
3ff2f67a 2378
49ca6259 2379early_exit:
99723548 2380 bdrv_dec_in_flight(bs);
cdb5e315 2381 return ret;
61007b31
SH
2382}
2383
2384int bdrv_flush(BlockDriverState *bs)
2385{
2386 Coroutine *co;
e293b7a3 2387 FlushCo flush_co = {
61007b31
SH
2388 .bs = bs,
2389 .ret = NOT_DONE,
2390 };
2391
2392 if (qemu_in_coroutine()) {
2393 /* Fast-path if already in coroutine context */
e293b7a3 2394 bdrv_flush_co_entry(&flush_co);
61007b31 2395 } else {
0b8b8753 2396 co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
e92f0e19 2397 bdrv_coroutine_enter(bs, co);
88b062c2 2398 BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
61007b31
SH
2399 }
2400
e293b7a3 2401 return flush_co.ret;
61007b31
SH
2402}
2403
2404typedef struct DiscardCo {
2405 BlockDriverState *bs;
0c51a893
EB
2406 int64_t offset;
2407 int count;
61007b31
SH
2408 int ret;
2409} DiscardCo;
0c51a893 2410static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
61007b31
SH
2411{
2412 DiscardCo *rwco = opaque;
2413
0c51a893 2414 rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
61007b31
SH
2415}
2416
9f1963b3
EB
2417int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
2418 int count)
61007b31 2419{
b1066c87 2420 BdrvTrackedRequest req;
9f1963b3 2421 int max_pdiscard, ret;
3482b9bc 2422 int head, tail, align;
61007b31
SH
2423
2424 if (!bs->drv) {
2425 return -ENOMEDIUM;
2426 }
2427
9f1963b3 2428 ret = bdrv_check_byte_request(bs, offset, count);
61007b31
SH
2429 if (ret < 0) {
2430 return ret;
2431 } else if (bs->read_only) {
eaf5fe2d 2432 return -EPERM;
61007b31 2433 }
04c01a5c 2434 assert(!(bs->open_flags & BDRV_O_INACTIVE));
61007b31 2435
61007b31
SH
2436 /* Do nothing if disabled. */
2437 if (!(bs->open_flags & BDRV_O_UNMAP)) {
2438 return 0;
2439 }
2440
02aefe43 2441 if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
61007b31
SH
2442 return 0;
2443 }
2444
3482b9bc
EB
2445 /* Discard is advisory, but some devices track and coalesce
2446 * unaligned requests, so we must pass everything down rather than
2447 * round here. Still, most devices will just silently ignore
2448 * unaligned requests (by returning -ENOTSUP), so we must fragment
2449 * the request accordingly. */
02aefe43 2450 align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
b8d0a980
EB
2451 assert(align % bs->bl.request_alignment == 0);
2452 head = offset % align;
3482b9bc 2453 tail = (offset + count) % align;
9f1963b3 2454
99723548 2455 bdrv_inc_in_flight(bs);
9f1963b3 2456 tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
50824995 2457
ec050f77
DL
2458 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2459 if (ret < 0) {
2460 goto out;
2461 }
2462
9f1963b3
EB
2463 max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2464 align);
3482b9bc 2465 assert(max_pdiscard >= bs->bl.request_alignment);
61007b31 2466
9f1963b3
EB
2467 while (count > 0) {
2468 int ret;
3482b9bc
EB
2469 int num = count;
2470
2471 if (head) {
2472 /* Make small requests to get to alignment boundaries. */
2473 num = MIN(count, align - head);
2474 if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2475 num %= bs->bl.request_alignment;
2476 }
2477 head = (head + num) % align;
2478 assert(num < max_pdiscard);
2479 } else if (tail) {
2480 if (num > align) {
2481 /* Shorten the request to the last aligned cluster. */
2482 num -= tail;
2483 } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2484 tail > bs->bl.request_alignment) {
2485 tail %= bs->bl.request_alignment;
2486 num -= tail;
2487 }
2488 }
2489 /* limit request size */
2490 if (num > max_pdiscard) {
2491 num = max_pdiscard;
2492 }
61007b31 2493
47a5486d
EB
2494 if (bs->drv->bdrv_co_pdiscard) {
2495 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
61007b31
SH
2496 } else {
2497 BlockAIOCB *acb;
2498 CoroutineIOCompletion co = {
2499 .coroutine = qemu_coroutine_self(),
2500 };
2501
4da444a0
EB
2502 acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2503 bdrv_co_io_em_complete, &co);
61007b31 2504 if (acb == NULL) {
b1066c87
FZ
2505 ret = -EIO;
2506 goto out;
61007b31
SH
2507 } else {
2508 qemu_coroutine_yield();
2509 ret = co.ret;
2510 }
2511 }
2512 if (ret && ret != -ENOTSUP) {
b1066c87 2513 goto out;
61007b31
SH
2514 }
2515
9f1963b3
EB
2516 offset += num;
2517 count -= num;
61007b31 2518 }
b1066c87
FZ
2519 ret = 0;
2520out:
3ff2f67a 2521 ++bs->write_gen;
968d8b06
DL
2522 bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
2523 req.bytes >> BDRV_SECTOR_BITS);
b1066c87 2524 tracked_request_end(&req);
99723548 2525 bdrv_dec_in_flight(bs);
b1066c87 2526 return ret;
61007b31
SH
2527}
2528
0c51a893 2529int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
61007b31
SH
2530{
2531 Coroutine *co;
2532 DiscardCo rwco = {
2533 .bs = bs,
0c51a893
EB
2534 .offset = offset,
2535 .count = count,
61007b31
SH
2536 .ret = NOT_DONE,
2537 };
2538
2539 if (qemu_in_coroutine()) {
2540 /* Fast-path if already in coroutine context */
0c51a893 2541 bdrv_pdiscard_co_entry(&rwco);
61007b31 2542 } else {
0c51a893 2543 co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
e92f0e19 2544 bdrv_coroutine_enter(bs, co);
88b062c2 2545 BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
61007b31
SH
2546 }
2547
2548 return rwco.ret;
2549}
2550
48af776a 2551int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
61007b31
SH
2552{
2553 BlockDriver *drv = bs->drv;
5c5ae76a
FZ
2554 CoroutineIOCompletion co = {
2555 .coroutine = qemu_coroutine_self(),
2556 };
2557 BlockAIOCB *acb;
61007b31 2558
99723548 2559 bdrv_inc_in_flight(bs);
16a389dc 2560 if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
5c5ae76a
FZ
2561 co.ret = -ENOTSUP;
2562 goto out;
2563 }
2564
16a389dc
KW
2565 if (drv->bdrv_co_ioctl) {
2566 co.ret = drv->bdrv_co_ioctl(bs, req, buf);
2567 } else {
2568 acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2569 if (!acb) {
2570 co.ret = -ENOTSUP;
2571 goto out;
2572 }
2573 qemu_coroutine_yield();
5c5ae76a 2574 }
5c5ae76a 2575out:
99723548 2576 bdrv_dec_in_flight(bs);
5c5ae76a
FZ
2577 return co.ret;
2578}
2579
61007b31
SH
2580void *qemu_blockalign(BlockDriverState *bs, size_t size)
2581{
2582 return qemu_memalign(bdrv_opt_mem_align(bs), size);
2583}
2584
2585void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2586{
2587 return memset(qemu_blockalign(bs, size), 0, size);
2588}
2589
2590void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2591{
2592 size_t align = bdrv_opt_mem_align(bs);
2593
2594 /* Ensure that NULL is never returned on success */
2595 assert(align > 0);
2596 if (size == 0) {
2597 size = align;
2598 }
2599
2600 return qemu_try_memalign(align, size);
2601}
2602
2603void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2604{
2605 void *mem = qemu_try_blockalign(bs, size);
2606
2607 if (mem) {
2608 memset(mem, 0, size);
2609 }
2610
2611 return mem;
2612}
2613
2614/*
2615 * Check if all memory in this vector is sector aligned.
2616 */
2617bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2618{
2619 int i;
4196d2f0 2620 size_t alignment = bdrv_min_mem_align(bs);
61007b31
SH
2621
2622 for (i = 0; i < qiov->niov; i++) {
2623 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2624 return false;
2625 }
2626 if (qiov->iov[i].iov_len % alignment) {
2627 return false;
2628 }
2629 }
2630
2631 return true;
2632}
2633
2634void bdrv_add_before_write_notifier(BlockDriverState *bs,
2635 NotifierWithReturn *notifier)
2636{
2637 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2638}
2639
2640void bdrv_io_plug(BlockDriverState *bs)
2641{
6b98bd64
PB
2642 BdrvChild *child;
2643
2644 QLIST_FOREACH(child, &bs->children, next) {
2645 bdrv_io_plug(child->bs);
2646 }
2647
8f90b5e9 2648 if (bs->io_plugged++ == 0) {
6b98bd64
PB
2649 BlockDriver *drv = bs->drv;
2650 if (drv && drv->bdrv_io_plug) {
2651 drv->bdrv_io_plug(bs);
2652 }
61007b31
SH
2653 }
2654}
2655
2656void bdrv_io_unplug(BlockDriverState *bs)
2657{
6b98bd64
PB
2658 BdrvChild *child;
2659
2660 assert(bs->io_plugged);
8f90b5e9 2661 if (--bs->io_plugged == 0) {
6b98bd64
PB
2662 BlockDriver *drv = bs->drv;
2663 if (drv && drv->bdrv_io_unplug) {
2664 drv->bdrv_io_unplug(bs);
2665 }
2666 }
2667
2668 QLIST_FOREACH(child, &bs->children, next) {
2669 bdrv_io_unplug(child->bs);
61007b31
SH
2670 }
2671}
This page took 0.5213 seconds and 4 git commands to generate.