Git Repo - qemu.git/blame - block/io.c
block: Remove unused bdrv_requests_pending
61007b31
SH
1/*
2 * Block layer I/O functions
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
80c71a24 25#include "qemu/osdep.h"
61007b31 26#include "trace.h"
7f0e9da6 27#include "sysemu/block-backend.h"
61007b31 28#include "block/blockjob.h"
f321dcb5 29#include "block/blockjob_int.h"
61007b31 30#include "block/block_int.h"
f348b6d1 31#include "qemu/cutils.h"
da34e65c 32#include "qapi/error.h"
d49b6836 33#include "qemu/error-report.h"
61007b31
SH
34
35#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
36
cb2e2878
EB
37/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
38#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
39
d05aa8bb 40static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
f5a5ca79 41 int64_t offset, int bytes, BdrvRequestFlags flags);
61007b31 42
14e9559f 43void bdrv_parent_drained_begin(BlockDriverState *bs)
61007b31 44{
02d21300 45 BdrvChild *c, *next;
27ccdd52 46
02d21300 47 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
c2066af0
KW
48 if (c->role->drained_begin) {
49 c->role->drained_begin(c);
50 }
ce0f1412
PB
51 }
52}
61007b31 53
14e9559f 54void bdrv_parent_drained_end(BlockDriverState *bs)
ce0f1412 55{
02d21300 56 BdrvChild *c, *next;
27ccdd52 57
02d21300 58 QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
c2066af0
KW
59 if (c->role->drained_end) {
60 c->role->drained_end(c);
61 }
27ccdd52 62 }
61007b31
SH
63}
64
d9e0dfa2
EB
65static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
66{
67 dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
68 dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
69 dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
70 src->opt_mem_alignment);
71 dst->min_mem_alignment = MAX(dst->min_mem_alignment,
72 src->min_mem_alignment);
73 dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
74}
75
61007b31
SH
76void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
77{
78 BlockDriver *drv = bs->drv;
79 Error *local_err = NULL;
80
81 memset(&bs->bl, 0, sizeof(bs->bl));
82
83 if (!drv) {
84 return;
85 }
86
79ba8c98 87 /* Default alignment based on whether driver has byte interface */
a5b8dd2c 88 bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
79ba8c98 89
61007b31
SH
90 /* Take some limits from the children as a default */
91 if (bs->file) {
9a4f4c31 92 bdrv_refresh_limits(bs->file->bs, &local_err);
61007b31
SH
93 if (local_err) {
94 error_propagate(errp, local_err);
95 return;
96 }
d9e0dfa2 97 bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
61007b31 98 } else {
4196d2f0 99 bs->bl.min_mem_alignment = 512;
459b4e66 100 bs->bl.opt_mem_alignment = getpagesize();
bd44feb7
SH
101
102 /* Safe default since most protocols use readv()/writev()/etc */
103 bs->bl.max_iov = IOV_MAX;
61007b31
SH
104 }
105
760e0063
KW
106 if (bs->backing) {
107 bdrv_refresh_limits(bs->backing->bs, &local_err);
61007b31
SH
108 if (local_err) {
109 error_propagate(errp, local_err);
110 return;
111 }
d9e0dfa2 112 bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
61007b31
SH
113 }
114
115 /* Then let the driver override it */
116 if (drv->bdrv_refresh_limits) {
117 drv->bdrv_refresh_limits(bs, errp);
118 }
119}
120
121/**
122 * The copy-on-read flag is actually a reference count so multiple users may
123 * use the feature without worrying about clobbering its previous state.
124 * Copy-on-read stays enabled until all users have called to disable it.
125 */
126void bdrv_enable_copy_on_read(BlockDriverState *bs)
127{
d3faa13e 128 atomic_inc(&bs->copy_on_read);
61007b31
SH
129}
130
131void bdrv_disable_copy_on_read(BlockDriverState *bs)
132{
d3faa13e
PB
133 int old = atomic_fetch_dec(&bs->copy_on_read);
134 assert(old >= 1);
61007b31
SH
135}
136
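/* Editor's illustration (not part of block/io.c): a minimal sketch of how the
 * reference-counted copy-on-read flag above is meant to be used. Each enable
 * must be balanced by exactly one disable; COR stays active while the count
 * is non-zero. The wrapper name below is hypothetical.
 *
 *     static void with_copy_on_read(BlockDriverState *bs)
 *     {
 *         bdrv_enable_copy_on_read(bs);    // count 0 -> 1: COR turned on
 *         // ... reads issued here may populate the top image via COR ...
 *         bdrv_disable_copy_on_read(bs);   // count 1 -> 0: COR turned off
 *     }
 */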
61124f03
PB
137typedef struct {
138 Coroutine *co;
139 BlockDriverState *bs;
140 bool done;
481cad48 141 bool begin;
61124f03
PB
142} BdrvCoDrainData;
143
144static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
145{
146 BdrvCoDrainData *data = opaque;
147 BlockDriverState *bs = data->bs;
148
481cad48 149 if (data->begin) {
f8ea8dac 150 bs->drv->bdrv_co_drain_begin(bs);
481cad48
MP
151 } else {
152 bs->drv->bdrv_co_drain_end(bs);
153 }
61124f03
PB
154
155 /* Set data->done before reading bs->wakeup. */
156 atomic_mb_set(&data->done, true);
157 bdrv_wakeup(bs);
158}
159
db0289b9 160/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
481cad48 161static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
61124f03 162{
db0289b9 163 BdrvChild *child, *tmp;
481cad48 164 BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
61124f03 165
f8ea8dac 166 if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
481cad48 167 (!begin && !bs->drv->bdrv_co_drain_end)) {
61124f03
PB
168 return;
169 }
170
171 data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
172 bdrv_coroutine_enter(bs, data.co);
173 BDRV_POLL_WHILE(bs, !data.done);
db0289b9
KW
174
175 QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
176 bdrv_drain_invoke(child->bs, begin);
177 }
61124f03
PB
178}
179
99c05de9 180static bool bdrv_drain_recurse(BlockDriverState *bs)
67da1dc5 181{
178bd438 182 BdrvChild *child, *tmp;
d42cf288
PB
183 bool waited;
184
481cad48
MP
185 /* Wait for drained requests to finish */
186 waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
d42cf288 187
178bd438
FZ
188 QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
189 BlockDriverState *bs = child->bs;
190 bool in_main_loop =
191 qemu_get_current_aio_context() == qemu_get_aio_context();
192 assert(bs->refcnt > 0);
193 if (in_main_loop) {
194 /* In case the recursive bdrv_drain_recurse processes a
195 * block_job_defer_to_main_loop BH and modifies the graph,
196 * let's hold a reference to bs until we are done.
197 *
198 * IOThread doesn't have such a BH, and it is not safe to call
199 * bdrv_unref without BQL, so skip doing it there.
200 */
201 bdrv_ref(bs);
202 }
99c05de9 203 waited |= bdrv_drain_recurse(bs);
178bd438
FZ
204 if (in_main_loop) {
205 bdrv_unref(bs);
206 }
67da1dc5 207 }
d42cf288
PB
208
209 return waited;
67da1dc5
FZ
210}
211
a77fd4bb
FZ
212static void bdrv_co_drain_bh_cb(void *opaque)
213{
214 BdrvCoDrainData *data = opaque;
215 Coroutine *co = data->co;
99723548 216 BlockDriverState *bs = data->bs;
a77fd4bb 217
99723548 218 bdrv_dec_in_flight(bs);
481cad48
MP
219 if (data->begin) {
220 bdrv_drained_begin(bs);
221 } else {
222 bdrv_drained_end(bs);
223 }
224
a77fd4bb 225 data->done = true;
1919631e 226 aio_co_wake(co);
a77fd4bb
FZ
227}
228
481cad48
MP
229static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
230 bool begin)
a77fd4bb
FZ
231{
232 BdrvCoDrainData data;
233
234 /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
235 * other coroutines run if they were queued from
236 * qemu_co_queue_run_restart(). */
237
238 assert(qemu_in_coroutine());
239 data = (BdrvCoDrainData) {
240 .co = qemu_coroutine_self(),
241 .bs = bs,
242 .done = false,
481cad48 243 .begin = begin,
a77fd4bb 244 };
99723548 245 bdrv_inc_in_flight(bs);
fffb6e12
PB
246 aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
247 bdrv_co_drain_bh_cb, &data);
a77fd4bb
FZ
248
249 qemu_coroutine_yield();
250 /* If we are resumed from some other event (such as an aio completion or a
251 * timer callback), it is a bug in the caller that should be fixed. */
252 assert(data.done);
253}
254
6820643f
KW
255void bdrv_drained_begin(BlockDriverState *bs)
256{
d42cf288 257 if (qemu_in_coroutine()) {
481cad48 258 bdrv_co_yield_to_drain(bs, true);
d42cf288
PB
259 return;
260 }
261
60369b86 262 /* Stop things in parent-to-child order */
414c2ec3 263 if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
6820643f
KW
264 aio_disable_external(bdrv_get_aio_context(bs));
265 bdrv_parent_drained_begin(bs);
266 }
267
db0289b9 268 bdrv_drain_invoke(bs, true);
99c05de9 269 bdrv_drain_recurse(bs);
6820643f
KW
270}
271
272void bdrv_drained_end(BlockDriverState *bs)
273{
481cad48
MP
274 if (qemu_in_coroutine()) {
275 bdrv_co_yield_to_drain(bs, false);
276 return;
277 }
6820643f 278 assert(bs->quiesce_counter > 0);
414c2ec3 279 if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
6820643f
KW
280 return;
281 }
282
60369b86 283 /* Re-enable things in child-to-parent order */
db0289b9 284 bdrv_drain_invoke(bs, false);
60369b86 285 bdrv_parent_drained_end(bs);
6820643f
KW
286 aio_enable_external(bdrv_get_aio_context(bs));
287}
288
61007b31 289/*
67da1dc5
FZ
290 * Wait for pending requests to complete on a single BlockDriverState subtree,
291 * and suspend block driver's internal I/O until next request arrives.
61007b31 292 *
61007b31
SH
293 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
294 * AioContext.
7a63f3cd
SH
295 *
296 * Only this BlockDriverState's AioContext is run, so in-flight requests must
297 * not depend on events in other AioContexts. In that case, use
298 * bdrv_drain_all() instead.
61007b31 299 */
b6e84c97 300void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
61007b31 301{
6820643f
KW
302 assert(qemu_in_coroutine());
303 bdrv_drained_begin(bs);
304 bdrv_drained_end(bs);
b6e84c97 305}
f406c03c 306
b6e84c97
PB
307void bdrv_drain(BlockDriverState *bs)
308{
6820643f
KW
309 bdrv_drained_begin(bs);
310 bdrv_drained_end(bs);
61007b31
SH
311}
312
313/*
314 * Wait for pending requests to complete across all BlockDriverStates
315 *
316 * This function does not flush data to disk, use bdrv_flush_all() for that
317 * after calling this function.
c0778f66
AG
318 *
319 * This pauses all block jobs and disables external clients. It must
320 * be paired with bdrv_drain_all_end().
321 *
322 * NOTE: no new block jobs or BlockDriverStates can be created between
323 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
61007b31 324 */
c0778f66 325void bdrv_drain_all_begin(void)
61007b31
SH
326{
327 /* Always run first iteration so any pending completion BHs run */
99723548 328 bool waited = true;
7c8eece4 329 BlockDriverState *bs;
88be7b4b 330 BdrvNextIterator it;
f406c03c 331 GSList *aio_ctxs = NULL, *ctx;
61007b31 332
f321dcb5 333 block_job_pause_all();
eb1364ce 334
88be7b4b 335 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
61007b31
SH
336 AioContext *aio_context = bdrv_get_aio_context(bs);
337
60369b86 338 /* Stop things in parent-to-child order */
61007b31 339 aio_context_acquire(aio_context);
c0778f66 340 aio_disable_external(aio_context);
60369b86 341 bdrv_parent_drained_begin(bs);
2da9b7d4 342 bdrv_drain_invoke(bs, true);
61007b31 343 aio_context_release(aio_context);
f406c03c 344
764ba3ae 345 if (!g_slist_find(aio_ctxs, aio_context)) {
f406c03c
AY
346 aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
347 }
61007b31
SH
348 }
349
7a63f3cd
SH
350 /* Note that completion of an asynchronous I/O operation can trigger any
351 * number of other I/O operations on other devices---for example a
352 * coroutine can submit an I/O request to another device in response to
353 * request completion. Therefore we must keep looping until there was no
354 * more activity rather than simply draining each device independently.
355 */
99723548
PB
356 while (waited) {
357 waited = false;
61007b31 358
f406c03c
AY
359 for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
360 AioContext *aio_context = ctx->data;
61007b31
SH
361
362 aio_context_acquire(aio_context);
88be7b4b 363 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
f406c03c 364 if (aio_context == bdrv_get_aio_context(bs)) {
99c05de9 365 waited |= bdrv_drain_recurse(bs);
f406c03c
AY
366 }
367 }
61007b31
SH
368 aio_context_release(aio_context);
369 }
370 }
371
c0778f66
AG
372 g_slist_free(aio_ctxs);
373}
374
375void bdrv_drain_all_end(void)
376{
377 BlockDriverState *bs;
378 BdrvNextIterator it;
c0778f66 379
88be7b4b 380 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
61007b31
SH
381 AioContext *aio_context = bdrv_get_aio_context(bs);
382
60369b86 383 /* Re-enable things in child-to-parent order */
61007b31 384 aio_context_acquire(aio_context);
db0289b9 385 bdrv_drain_invoke(bs, false);
60369b86
KW
386 bdrv_parent_drained_end(bs);
387 aio_enable_external(aio_context);
61007b31
SH
388 aio_context_release(aio_context);
389 }
eb1364ce 390
f321dcb5 391 block_job_resume_all();
61007b31
SH
392}
393
c0778f66
AG
394void bdrv_drain_all(void)
395{
396 bdrv_drain_all_begin();
397 bdrv_drain_all_end();
398}
399
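/* Editor's illustration (not part of block/io.c): the begin/end pair above is
 * intended to bracket a section in which no request activity is wanted, for
 * example while the block graph is being reconfigured. A sketch of the
 * expected pairing:
 *
 *     bdrv_drain_all_begin();
 *     // all in-flight requests have completed; block jobs are paused and
 *     // external clients are disabled -- do not create new jobs/BDSes here
 *     bdrv_drain_all_end();
 *
 * For a single node, bdrv_drained_begin(bs)/bdrv_drained_end(bs) (or
 * bdrv_drain(bs) for an empty drained section) provide the same bracket.
 */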
61007b31
SH
400/**
401 * Remove an active request from the tracked requests list
402 *
403 * This function should be called when a tracked request is completing.
404 */
405static void tracked_request_end(BdrvTrackedRequest *req)
406{
407 if (req->serialising) {
20fc71b2 408 atomic_dec(&req->bs->serialising_in_flight);
61007b31
SH
409 }
410
3783fa3d 411 qemu_co_mutex_lock(&req->bs->reqs_lock);
61007b31
SH
412 QLIST_REMOVE(req, list);
413 qemu_co_queue_restart_all(&req->wait_queue);
3783fa3d 414 qemu_co_mutex_unlock(&req->bs->reqs_lock);
61007b31
SH
415}
416
417/**
418 * Add an active request to the tracked requests list
419 */
420static void tracked_request_begin(BdrvTrackedRequest *req,
421 BlockDriverState *bs,
422 int64_t offset,
ebde595c
FZ
423 unsigned int bytes,
424 enum BdrvTrackedRequestType type)
61007b31
SH
425{
426 *req = (BdrvTrackedRequest){
427 .bs = bs,
428 .offset = offset,
429 .bytes = bytes,
ebde595c 430 .type = type,
61007b31
SH
431 .co = qemu_coroutine_self(),
432 .serialising = false,
433 .overlap_offset = offset,
434 .overlap_bytes = bytes,
435 };
436
437 qemu_co_queue_init(&req->wait_queue);
438
3783fa3d 439 qemu_co_mutex_lock(&bs->reqs_lock);
61007b31 440 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
3783fa3d 441 qemu_co_mutex_unlock(&bs->reqs_lock);
61007b31
SH
442}
443
444static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
445{
446 int64_t overlap_offset = req->offset & ~(align - 1);
447 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
448 - overlap_offset;
449
450 if (!req->serialising) {
20fc71b2 451 atomic_inc(&req->bs->serialising_in_flight);
61007b31
SH
452 req->serialising = true;
453 }
454
455 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
456 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
457}
458
244483e6
KW
459/**
460 * Round a region to cluster boundaries
461 */
462void bdrv_round_to_clusters(BlockDriverState *bs,
7cfd5275 463 int64_t offset, int64_t bytes,
244483e6 464 int64_t *cluster_offset,
7cfd5275 465 int64_t *cluster_bytes)
244483e6
KW
466{
467 BlockDriverInfo bdi;
468
469 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
470 *cluster_offset = offset;
471 *cluster_bytes = bytes;
472 } else {
473 int64_t c = bdi.cluster_size;
474 *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
475 *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
476 }
477}
478
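/* Editor's illustration (not part of block/io.c): a worked example of the
 * rounding above, assuming bdrv_get_info() reports a 64 KiB cluster size.
 *
 *     offset = 70000, bytes = 1000
 *     cluster_offset = QEMU_ALIGN_DOWN(70000, 65536) = 65536
 *     cluster_bytes  = QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536) = 65536
 *
 * i.e. the request is widened to the single cluster that contains it; when no
 * cluster size is available, offset and bytes are passed through unchanged.
 */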
61007b31
SH
479static int bdrv_get_cluster_size(BlockDriverState *bs)
480{
481 BlockDriverInfo bdi;
482 int ret;
483
484 ret = bdrv_get_info(bs, &bdi);
485 if (ret < 0 || bdi.cluster_size == 0) {
a5b8dd2c 486 return bs->bl.request_alignment;
61007b31
SH
487 } else {
488 return bdi.cluster_size;
489 }
490}
491
492static bool tracked_request_overlaps(BdrvTrackedRequest *req,
493 int64_t offset, unsigned int bytes)
494{
495 /* aaaa bbbb */
496 if (offset >= req->overlap_offset + req->overlap_bytes) {
497 return false;
498 }
499 /* bbbb aaaa */
500 if (req->overlap_offset >= offset + bytes) {
501 return false;
502 }
503 return true;
504}
505
99723548
PB
506void bdrv_inc_in_flight(BlockDriverState *bs)
507{
508 atomic_inc(&bs->in_flight);
509}
510
c9d1a561
PB
511static void dummy_bh_cb(void *opaque)
512{
513}
514
515void bdrv_wakeup(BlockDriverState *bs)
516{
e2a6ae7f
PB
517 /* The barrier (or an atomic op) is in the caller. */
518 if (atomic_read(&bs->wakeup)) {
c9d1a561
PB
519 aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
520 }
521}
522
99723548
PB
523void bdrv_dec_in_flight(BlockDriverState *bs)
524{
525 atomic_dec(&bs->in_flight);
c9d1a561 526 bdrv_wakeup(bs);
99723548
PB
527}
528
61007b31
SH
529static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
530{
531 BlockDriverState *bs = self->bs;
532 BdrvTrackedRequest *req;
533 bool retry;
534 bool waited = false;
535
20fc71b2 536 if (!atomic_read(&bs->serialising_in_flight)) {
61007b31
SH
537 return false;
538 }
539
540 do {
541 retry = false;
3783fa3d 542 qemu_co_mutex_lock(&bs->reqs_lock);
61007b31
SH
543 QLIST_FOREACH(req, &bs->tracked_requests, list) {
544 if (req == self || (!req->serialising && !self->serialising)) {
545 continue;
546 }
547 if (tracked_request_overlaps(req, self->overlap_offset,
548 self->overlap_bytes))
549 {
550 /* Hitting this means there was a reentrant request, for
551 * example, a block driver issuing nested requests. This must
552 * never happen since it means deadlock.
553 */
554 assert(qemu_coroutine_self() != req->co);
555
556 /* If the request is already (indirectly) waiting for us, or
557 * will wait for us as soon as it wakes up, then just go on
558 * (instead of producing a deadlock in the former case). */
559 if (!req->waiting_for) {
560 self->waiting_for = req;
3783fa3d 561 qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
61007b31
SH
562 self->waiting_for = NULL;
563 retry = true;
564 waited = true;
565 break;
566 }
567 }
568 }
3783fa3d 569 qemu_co_mutex_unlock(&bs->reqs_lock);
61007b31
SH
570 } while (retry);
571
572 return waited;
573}
574
575static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
576 size_t size)
577{
578 if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
579 return -EIO;
580 }
581
582 if (!bdrv_is_inserted(bs)) {
583 return -ENOMEDIUM;
584 }
585
586 if (offset < 0) {
587 return -EIO;
588 }
589
590 return 0;
591}
592
61007b31 593typedef struct RwCo {
e293b7a3 594 BdrvChild *child;
61007b31
SH
595 int64_t offset;
596 QEMUIOVector *qiov;
597 bool is_write;
598 int ret;
599 BdrvRequestFlags flags;
600} RwCo;
601
602static void coroutine_fn bdrv_rw_co_entry(void *opaque)
603{
604 RwCo *rwco = opaque;
605
606 if (!rwco->is_write) {
a03ef88f 607 rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
cab3a356
KW
608 rwco->qiov->size, rwco->qiov,
609 rwco->flags);
61007b31 610 } else {
a03ef88f 611 rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
cab3a356
KW
612 rwco->qiov->size, rwco->qiov,
613 rwco->flags);
61007b31
SH
614 }
615}
616
617/*
618 * Process a vectored synchronous request using coroutines
619 */
e293b7a3 620static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
61007b31
SH
621 QEMUIOVector *qiov, bool is_write,
622 BdrvRequestFlags flags)
623{
624 Coroutine *co;
625 RwCo rwco = {
e293b7a3 626 .child = child,
61007b31
SH
627 .offset = offset,
628 .qiov = qiov,
629 .is_write = is_write,
630 .ret = NOT_DONE,
631 .flags = flags,
632 };
633
61007b31
SH
634 if (qemu_in_coroutine()) {
635 /* Fast-path if already in coroutine context */
636 bdrv_rw_co_entry(&rwco);
637 } else {
0b8b8753 638 co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
e92f0e19 639 bdrv_coroutine_enter(child->bs, co);
88b062c2 640 BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
61007b31
SH
641 }
642 return rwco.ret;
643}
644
645/*
646 * Process a synchronous request using coroutines
647 */
e293b7a3 648static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
61007b31
SH
649 int nb_sectors, bool is_write, BdrvRequestFlags flags)
650{
651 QEMUIOVector qiov;
652 struct iovec iov = {
653 .iov_base = (void *)buf,
654 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
655 };
656
657 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
658 return -EINVAL;
659 }
660
661 qemu_iovec_init_external(&qiov, &iov, 1);
e293b7a3 662 return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
61007b31
SH
663 &qiov, is_write, flags);
664}
665
666/* return < 0 if error. See bdrv_write() for the return codes */
fbcbbf4e 667int bdrv_read(BdrvChild *child, int64_t sector_num,
61007b31
SH
668 uint8_t *buf, int nb_sectors)
669{
e293b7a3 670 return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
61007b31
SH
671}
672
61007b31
SH
673/* Return < 0 if error. Important errors are:
674 -EIO generic I/O error (may happen for all errors)
675 -ENOMEDIUM No media inserted.
676 -EINVAL Invalid sector number or nb_sectors
677 -EACCES Trying to write a read-only device
678*/
18d51c4b 679int bdrv_write(BdrvChild *child, int64_t sector_num,
61007b31
SH
680 const uint8_t *buf, int nb_sectors)
681{
e293b7a3 682 return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
61007b31
SH
683}
684
720ff280 685int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
f5a5ca79 686 int bytes, BdrvRequestFlags flags)
61007b31 687{
74021bc4
EB
688 QEMUIOVector qiov;
689 struct iovec iov = {
690 .iov_base = NULL,
f5a5ca79 691 .iov_len = bytes,
74021bc4
EB
692 };
693
694 qemu_iovec_init_external(&qiov, &iov, 1);
e293b7a3 695 return bdrv_prwv_co(child, offset, &qiov, true,
74021bc4 696 BDRV_REQ_ZERO_WRITE | flags);
61007b31
SH
697}
698
699/*
74021bc4 700 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
61007b31
SH
701 * The operation is sped up by checking the block status and only writing
702 * zeroes to the device if they currently do not return zeroes. Optional
74021bc4 703 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
465fe887 704 * BDRV_REQ_FUA).
61007b31
SH
705 *
706 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
707 */
720ff280 708int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
61007b31 709{
237d78f8
EB
710 int ret;
711 int64_t target_size, bytes, offset = 0;
720ff280 712 BlockDriverState *bs = child->bs;
61007b31 713
7286d610
EB
714 target_size = bdrv_getlength(bs);
715 if (target_size < 0) {
716 return target_size;
61007b31
SH
717 }
718
719 for (;;) {
7286d610
EB
720 bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
721 if (bytes <= 0) {
61007b31
SH
722 return 0;
723 }
237d78f8 724 ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
61007b31 725 if (ret < 0) {
7286d610
EB
726 error_report("error getting block status at offset %" PRId64 ": %s",
727 offset, strerror(-ret));
61007b31
SH
728 return ret;
729 }
730 if (ret & BDRV_BLOCK_ZERO) {
237d78f8 731 offset += bytes;
61007b31
SH
732 continue;
733 }
237d78f8 734 ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
61007b31 735 if (ret < 0) {
7286d610
EB
736 error_report("error writing zeroes at offset %" PRId64 ": %s",
737 offset, strerror(-ret));
61007b31
SH
738 return ret;
739 }
237d78f8 740 offset += bytes;
61007b31
SH
741 }
742}
743
cf2ab8fc 744int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
f1e84741
KW
745{
746 int ret;
747
e293b7a3 748 ret = bdrv_prwv_co(child, offset, qiov, false, 0);
f1e84741
KW
749 if (ret < 0) {
750 return ret;
751 }
752
753 return qiov->size;
754}
755
cf2ab8fc 756int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
61007b31
SH
757{
758 QEMUIOVector qiov;
759 struct iovec iov = {
760 .iov_base = (void *)buf,
761 .iov_len = bytes,
762 };
61007b31
SH
763
764 if (bytes < 0) {
765 return -EINVAL;
766 }
767
768 qemu_iovec_init_external(&qiov, &iov, 1);
cf2ab8fc 769 return bdrv_preadv(child, offset, &qiov);
61007b31
SH
770}
771
d9ca2ea2 772int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
61007b31
SH
773{
774 int ret;
775
e293b7a3 776 ret = bdrv_prwv_co(child, offset, qiov, true, 0);
61007b31
SH
777 if (ret < 0) {
778 return ret;
779 }
780
781 return qiov->size;
782}
783
d9ca2ea2 784int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
61007b31
SH
785{
786 QEMUIOVector qiov;
787 struct iovec iov = {
788 .iov_base = (void *) buf,
789 .iov_len = bytes,
790 };
791
792 if (bytes < 0) {
793 return -EINVAL;
794 }
795
796 qemu_iovec_init_external(&qiov, &iov, 1);
d9ca2ea2 797 return bdrv_pwritev(child, offset, &qiov);
61007b31
SH
798}
799
800/*
801 * Writes to the file and ensures that no writes are reordered across this
802 * request (acts as a barrier)
803 *
804 * Returns 0 on success, -errno in error cases.
805 */
d9ca2ea2
KW
806int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
807 const void *buf, int count)
61007b31
SH
808{
809 int ret;
810
d9ca2ea2 811 ret = bdrv_pwrite(child, offset, buf, count);
61007b31
SH
812 if (ret < 0) {
813 return ret;
814 }
815
d9ca2ea2 816 ret = bdrv_flush(child->bs);
855a6a93
KW
817 if (ret < 0) {
818 return ret;
61007b31
SH
819 }
820
821 return 0;
822}
823
08844473
KW
824typedef struct CoroutineIOCompletion {
825 Coroutine *coroutine;
826 int ret;
827} CoroutineIOCompletion;
828
829static void bdrv_co_io_em_complete(void *opaque, int ret)
830{
831 CoroutineIOCompletion *co = opaque;
832
833 co->ret = ret;
b9e413dd 834 aio_co_wake(co->coroutine);
08844473
KW
835}
836
166fe960
KW
837static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
838 uint64_t offset, uint64_t bytes,
839 QEMUIOVector *qiov, int flags)
840{
841 BlockDriver *drv = bs->drv;
3fb06697
KW
842 int64_t sector_num;
843 unsigned int nb_sectors;
844
fa166538
EB
845 assert(!(flags & ~BDRV_REQ_MASK));
846
d470ad42
HR
847 if (!drv) {
848 return -ENOMEDIUM;
849 }
850
3fb06697
KW
851 if (drv->bdrv_co_preadv) {
852 return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
853 }
854
855 sector_num = offset >> BDRV_SECTOR_BITS;
856 nb_sectors = bytes >> BDRV_SECTOR_BITS;
166fe960
KW
857
858 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
859 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
860 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
861
08844473
KW
862 if (drv->bdrv_co_readv) {
863 return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
864 } else {
865 BlockAIOCB *acb;
866 CoroutineIOCompletion co = {
867 .coroutine = qemu_coroutine_self(),
868 };
869
870 acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
871 bdrv_co_io_em_complete, &co);
872 if (acb == NULL) {
873 return -EIO;
874 } else {
875 qemu_coroutine_yield();
876 return co.ret;
877 }
878 }
166fe960
KW
879}
880
78a07294
KW
881static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
882 uint64_t offset, uint64_t bytes,
883 QEMUIOVector *qiov, int flags)
884{
885 BlockDriver *drv = bs->drv;
3fb06697
KW
886 int64_t sector_num;
887 unsigned int nb_sectors;
78a07294
KW
888 int ret;
889
fa166538
EB
890 assert(!(flags & ~BDRV_REQ_MASK));
891
d470ad42
HR
892 if (!drv) {
893 return -ENOMEDIUM;
894 }
895
3fb06697 896 if (drv->bdrv_co_pwritev) {
515c2f43
KW
897 ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
898 flags & bs->supported_write_flags);
899 flags &= ~bs->supported_write_flags;
3fb06697
KW
900 goto emulate_flags;
901 }
902
903 sector_num = offset >> BDRV_SECTOR_BITS;
904 nb_sectors = bytes >> BDRV_SECTOR_BITS;
905
78a07294
KW
906 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
907 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
908 assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
909
910 if (drv->bdrv_co_writev_flags) {
911 ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
4df863f3
EB
912 flags & bs->supported_write_flags);
913 flags &= ~bs->supported_write_flags;
08844473 914 } else if (drv->bdrv_co_writev) {
4df863f3 915 assert(!bs->supported_write_flags);
78a07294 916 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
08844473
KW
917 } else {
918 BlockAIOCB *acb;
919 CoroutineIOCompletion co = {
920 .coroutine = qemu_coroutine_self(),
921 };
922
923 acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
924 bdrv_co_io_em_complete, &co);
925 if (acb == NULL) {
3fb06697 926 ret = -EIO;
08844473
KW
927 } else {
928 qemu_coroutine_yield();
3fb06697 929 ret = co.ret;
08844473 930 }
78a07294
KW
931 }
932
3fb06697 933emulate_flags:
4df863f3 934 if (ret == 0 && (flags & BDRV_REQ_FUA)) {
78a07294
KW
935 ret = bdrv_co_flush(bs);
936 }
937
938 return ret;
939}
940
29a298af
PB
941static int coroutine_fn
942bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
943 uint64_t bytes, QEMUIOVector *qiov)
944{
945 BlockDriver *drv = bs->drv;
946
d470ad42
HR
947 if (!drv) {
948 return -ENOMEDIUM;
949 }
950
29a298af
PB
951 if (!drv->bdrv_co_pwritev_compressed) {
952 return -ENOTSUP;
953 }
954
29a298af
PB
955 return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
956}
957
85c97ca7 958static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
244483e6 959 int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
61007b31 960{
85c97ca7
KW
961 BlockDriverState *bs = child->bs;
962
61007b31
SH
963 /* Perform I/O through a temporary buffer so that users who scribble over
964 * their read buffer while the operation is in progress do not end up
965 * modifying the image file. This is critical for zero-copy guest I/O
966 * where anything might happen inside guest memory.
967 */
968 void *bounce_buffer;
969
970 BlockDriver *drv = bs->drv;
971 struct iovec iov;
cb2e2878 972 QEMUIOVector local_qiov;
244483e6 973 int64_t cluster_offset;
7cfd5275 974 int64_t cluster_bytes;
61007b31
SH
975 size_t skip_bytes;
976 int ret;
cb2e2878
EB
977 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
978 BDRV_REQUEST_MAX_BYTES);
979 unsigned int progress = 0;
61007b31 980
d470ad42
HR
981 if (!drv) {
982 return -ENOMEDIUM;
983 }
984
1bf03e66
KW
985 /* FIXME We cannot require callers to have write permissions when all they
986 * are doing is a read request. If we did things right, write permissions
987 * would be obtained anyway, but internally by the copy-on-read code. As
765d9df9 988 * long as it is implemented here rather than in a separate filter driver,
1bf03e66
KW
989 * the copy-on-read code doesn't have its own BdrvChild, however, for which
990 * it could request permissions. Therefore we have to bypass the permission
991 * system for the moment. */
992 // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
afa4b293 993
61007b31 994 /* Cover entire cluster so no additional backing file I/O is required when
cb2e2878
EB
995 * allocating cluster in the image file. Note that this value may exceed
996 * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
997 * is one reason we loop rather than doing it all at once.
61007b31 998 */
244483e6 999 bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
cb2e2878 1000 skip_bytes = offset - cluster_offset;
61007b31 1001
244483e6
KW
1002 trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1003 cluster_offset, cluster_bytes);
61007b31 1004
cb2e2878
EB
1005 bounce_buffer = qemu_try_blockalign(bs,
1006 MIN(MIN(max_transfer, cluster_bytes),
1007 MAX_BOUNCE_BUFFER));
61007b31
SH
1008 if (bounce_buffer == NULL) {
1009 ret = -ENOMEM;
1010 goto err;
1011 }
1012
cb2e2878
EB
1013 while (cluster_bytes) {
1014 int64_t pnum;
61007b31 1015
cb2e2878
EB
1016 ret = bdrv_is_allocated(bs, cluster_offset,
1017 MIN(cluster_bytes, max_transfer), &pnum);
1018 if (ret < 0) {
1019 /* Safe to treat errors in querying allocation as if
1020 * unallocated; we'll probably fail again soon on the
1021 * read, but at least that will set a decent errno.
1022 */
1023 pnum = MIN(cluster_bytes, max_transfer);
1024 }
61007b31 1025
cb2e2878 1026 assert(skip_bytes < pnum);
61007b31 1027
cb2e2878
EB
1028 if (ret <= 0) {
1029 /* Must copy-on-read; use the bounce buffer */
1030 iov.iov_base = bounce_buffer;
1031 iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1032 qemu_iovec_init_external(&local_qiov, &iov, 1);
61007b31 1033
cb2e2878
EB
1034 ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1035 &local_qiov, 0);
1036 if (ret < 0) {
1037 goto err;
1038 }
1039
1040 bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1041 if (drv->bdrv_co_pwrite_zeroes &&
1042 buffer_is_zero(bounce_buffer, pnum)) {
1043 /* FIXME: Should we (perhaps conditionally) be setting
1044 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1045 * that still correctly reads as zero? */
1046 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
1047 } else {
1048 /* This does not change the data on the disk, it is not
1049 * necessary to flush even in cache=writethrough mode.
1050 */
1051 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1052 &local_qiov, 0);
1053 }
1054
1055 if (ret < 0) {
1056 /* It might be okay to ignore write errors for guest
1057 * requests. If this is a deliberate copy-on-read
1058 * then we don't want to ignore the error. Simply
1059 * report it in all cases.
1060 */
1061 goto err;
1062 }
1063
1064 qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
1065 pnum - skip_bytes);
1066 } else {
1067 /* Read directly into the destination */
1068 qemu_iovec_init(&local_qiov, qiov->niov);
1069 qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
1070 ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
1071 &local_qiov, 0);
1072 qemu_iovec_destroy(&local_qiov);
1073 if (ret < 0) {
1074 goto err;
1075 }
1076 }
1077
1078 cluster_offset += pnum;
1079 cluster_bytes -= pnum;
1080 progress += pnum - skip_bytes;
1081 skip_bytes = 0;
1082 }
1083 ret = 0;
61007b31
SH
1084
1085err:
1086 qemu_vfree(bounce_buffer);
1087 return ret;
1088}
1089
1090/*
1091 * Forwards an already correctly aligned request to the BlockDriver. This
1a62d0ac
EB
1092 * handles copy on read, zeroing after EOF, and fragmentation of large
1093 * reads; any other features must be implemented by the caller.
61007b31 1094 */
85c97ca7 1095static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
61007b31
SH
1096 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1097 int64_t align, QEMUIOVector *qiov, int flags)
1098{
85c97ca7 1099 BlockDriverState *bs = child->bs;
c9d20029 1100 int64_t total_bytes, max_bytes;
1a62d0ac
EB
1101 int ret = 0;
1102 uint64_t bytes_remaining = bytes;
1103 int max_transfer;
61007b31 1104
49c07526
KW
1105 assert(is_power_of_2(align));
1106 assert((offset & (align - 1)) == 0);
1107 assert((bytes & (align - 1)) == 0);
61007b31 1108 assert(!qiov || bytes == qiov->size);
abb06c5a 1109 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1a62d0ac
EB
1110 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1111 align);
a604fa2b
EB
1112
1113 /* TODO: We would need a per-BDS .supported_read_flags and
1114 * potential fallback support, if we ever implement any read flags
1115 * to pass through to drivers. For now, there aren't any
1116 * passthrough flags. */
1117 assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
61007b31
SH
1118
1119 /* Handle Copy on Read and associated serialisation */
1120 if (flags & BDRV_REQ_COPY_ON_READ) {
1121 /* If we touch the same cluster it counts as an overlap. This
1122 * guarantees that allocating writes will be serialized and not race
1123 * with each other for the same cluster. For example, in copy-on-read
1124 * it ensures that the CoR read and write operations are atomic and
1125 * guest writes cannot interleave between them. */
1126 mark_request_serialising(req, bdrv_get_cluster_size(bs));
1127 }
1128
61408b25
FZ
1129 if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1130 wait_serialising_requests(req);
1131 }
61007b31
SH
1132
1133 if (flags & BDRV_REQ_COPY_ON_READ) {
d6a644bb 1134 int64_t pnum;
61007b31 1135
88e63df2 1136 ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
61007b31
SH
1137 if (ret < 0) {
1138 goto out;
1139 }
1140
88e63df2 1141 if (!ret || pnum != bytes) {
85c97ca7 1142 ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
61007b31
SH
1143 goto out;
1144 }
1145 }
1146
1a62d0ac 1147 /* Forward the request to the BlockDriver, possibly fragmenting it */
c9d20029
KW
1148 total_bytes = bdrv_getlength(bs);
1149 if (total_bytes < 0) {
1150 ret = total_bytes;
1151 goto out;
1152 }
61007b31 1153
c9d20029 1154 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1a62d0ac 1155 if (bytes <= max_bytes && bytes <= max_transfer) {
c9d20029 1156 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1a62d0ac
EB
1157 goto out;
1158 }
61007b31 1159
1a62d0ac
EB
1160 while (bytes_remaining) {
1161 int num;
61007b31 1162
1a62d0ac
EB
1163 if (max_bytes) {
1164 QEMUIOVector local_qiov;
61007b31 1165
1a62d0ac
EB
1166 num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1167 assert(num);
1168 qemu_iovec_init(&local_qiov, qiov->niov);
1169 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
61007b31 1170
1a62d0ac
EB
1171 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1172 num, &local_qiov, 0);
1173 max_bytes -= num;
1174 qemu_iovec_destroy(&local_qiov);
1175 } else {
1176 num = bytes_remaining;
1177 ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1178 bytes_remaining);
1179 }
1180 if (ret < 0) {
1181 goto out;
1182 }
1183 bytes_remaining -= num;
61007b31
SH
1184 }
1185
1186out:
1a62d0ac 1187 return ret < 0 ? ret : 0;
61007b31
SH
1188}
1189
61007b31
SH
1190/*
1191 * Handle a read request in coroutine context
1192 */
a03ef88f 1193int coroutine_fn bdrv_co_preadv(BdrvChild *child,
61007b31
SH
1194 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1195 BdrvRequestFlags flags)
1196{
a03ef88f 1197 BlockDriverState *bs = child->bs;
61007b31
SH
1198 BlockDriver *drv = bs->drv;
1199 BdrvTrackedRequest req;
1200
a5b8dd2c 1201 uint64_t align = bs->bl.request_alignment;
61007b31
SH
1202 uint8_t *head_buf = NULL;
1203 uint8_t *tail_buf = NULL;
1204 QEMUIOVector local_qiov;
1205 bool use_local_qiov = false;
1206 int ret;
1207
f42cf447
DB
1208 trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1209
61007b31
SH
1210 if (!drv) {
1211 return -ENOMEDIUM;
1212 }
1213
1214 ret = bdrv_check_byte_request(bs, offset, bytes);
1215 if (ret < 0) {
1216 return ret;
1217 }
1218
99723548
PB
1219 bdrv_inc_in_flight(bs);
1220
9568b511 1221 /* Don't do copy-on-read if we read data before write operation */
d3faa13e 1222 if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
61007b31
SH
1223 flags |= BDRV_REQ_COPY_ON_READ;
1224 }
1225
61007b31
SH
1226 /* Align read if necessary by padding qiov */
1227 if (offset & (align - 1)) {
1228 head_buf = qemu_blockalign(bs, align);
1229 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1230 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1231 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1232 use_local_qiov = true;
1233
1234 bytes += offset & (align - 1);
1235 offset = offset & ~(align - 1);
1236 }
1237
1238 if ((offset + bytes) & (align - 1)) {
1239 if (!use_local_qiov) {
1240 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1241 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1242 use_local_qiov = true;
1243 }
1244 tail_buf = qemu_blockalign(bs, align);
1245 qemu_iovec_add(&local_qiov, tail_buf,
1246 align - ((offset + bytes) & (align - 1)));
1247
1248 bytes = ROUND_UP(bytes, align);
1249 }
1250
ebde595c 1251 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
85c97ca7 1252 ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
61007b31
SH
1253 use_local_qiov ? &local_qiov : qiov,
1254 flags);
1255 tracked_request_end(&req);
99723548 1256 bdrv_dec_in_flight(bs);
61007b31
SH
1257
1258 if (use_local_qiov) {
1259 qemu_iovec_destroy(&local_qiov);
1260 qemu_vfree(head_buf);
1261 qemu_vfree(tail_buf);
1262 }
1263
1264 return ret;
1265}
1266
adad6496 1267static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
61007b31
SH
1268 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1269 BdrvRequestFlags flags)
1270{
1271 if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1272 return -EINVAL;
1273 }
1274
a03ef88f 1275 return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
cab3a356 1276 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
61007b31
SH
1277}
1278
28b04a8f
KW
1279int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
1280 int nb_sectors, QEMUIOVector *qiov)
61007b31 1281{
adad6496 1282 return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
61007b31
SH
1283}
1284
d05aa8bb 1285static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
f5a5ca79 1286 int64_t offset, int bytes, BdrvRequestFlags flags)
61007b31
SH
1287{
1288 BlockDriver *drv = bs->drv;
1289 QEMUIOVector qiov;
1290 struct iovec iov = {0};
1291 int ret = 0;
465fe887 1292 bool need_flush = false;
443668ca
DL
1293 int head = 0;
1294 int tail = 0;
61007b31 1295
cf081fca 1296 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
a5b8dd2c
EB
1297 int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1298 bs->bl.request_alignment);
cb2e2878 1299 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
d05aa8bb 1300
d470ad42
HR
1301 if (!drv) {
1302 return -ENOMEDIUM;
1303 }
1304
b8d0a980
EB
1305 assert(alignment % bs->bl.request_alignment == 0);
1306 head = offset % alignment;
f5a5ca79 1307 tail = (offset + bytes) % alignment;
b8d0a980
EB
1308 max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1309 assert(max_write_zeroes >= bs->bl.request_alignment);
61007b31 1310
f5a5ca79
MP
1311 while (bytes > 0 && !ret) {
1312 int num = bytes;
61007b31
SH
1313
1314 /* Align request. Block drivers can expect the "bulk" of the request
443668ca
DL
1315 * to be aligned, and that unaligned requests do not cross cluster
1316 * boundaries.
61007b31 1317 */
443668ca 1318 if (head) {
b2f95fee
EB
1319 /* Make a small request up to the first aligned sector. For
1320 * convenience, limit this request to max_transfer even if
1321 * we don't need to fall back to writes. */
f5a5ca79 1322 num = MIN(MIN(bytes, max_transfer), alignment - head);
b2f95fee
EB
1323 head = (head + num) % alignment;
1324 assert(num < max_write_zeroes);
d05aa8bb 1325 } else if (tail && num > alignment) {
443668ca
DL
1326 /* Shorten the request to the last aligned sector. */
1327 num -= tail;
61007b31
SH
1328 }
1329
1330 /* limit request size */
1331 if (num > max_write_zeroes) {
1332 num = max_write_zeroes;
1333 }
1334
1335 ret = -ENOTSUP;
1336 /* First try the efficient write zeroes operation */
d05aa8bb
EB
1337 if (drv->bdrv_co_pwrite_zeroes) {
1338 ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1339 flags & bs->supported_zero_flags);
1340 if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1341 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1342 need_flush = true;
1343 }
465fe887
EB
1344 } else {
1345 assert(!bs->supported_zero_flags);
61007b31
SH
1346 }
1347
1348 if (ret == -ENOTSUP) {
1349 /* Fall back to bounce buffer if write zeroes is unsupported */
465fe887
EB
1350 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1351
1352 if ((flags & BDRV_REQ_FUA) &&
1353 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
 1354 /* No need for bdrv_driver_pwritev() to do a fallback
1355 * flush on each chunk; use just one at the end */
1356 write_flags &= ~BDRV_REQ_FUA;
1357 need_flush = true;
1358 }
5def6b80 1359 num = MIN(num, max_transfer);
d05aa8bb 1360 iov.iov_len = num;
61007b31 1361 if (iov.iov_base == NULL) {
d05aa8bb 1362 iov.iov_base = qemu_try_blockalign(bs, num);
61007b31
SH
1363 if (iov.iov_base == NULL) {
1364 ret = -ENOMEM;
1365 goto fail;
1366 }
d05aa8bb 1367 memset(iov.iov_base, 0, num);
61007b31
SH
1368 }
1369 qemu_iovec_init_external(&qiov, &iov, 1);
1370
d05aa8bb 1371 ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
61007b31
SH
1372
 1373 /* Keep bounce buffer around if it is big enough for all
 1374 * future requests.
1375 */
5def6b80 1376 if (num < max_transfer) {
61007b31
SH
1377 qemu_vfree(iov.iov_base);
1378 iov.iov_base = NULL;
1379 }
1380 }
1381
d05aa8bb 1382 offset += num;
f5a5ca79 1383 bytes -= num;
61007b31
SH
1384 }
1385
1386fail:
465fe887
EB
1387 if (ret == 0 && need_flush) {
1388 ret = bdrv_co_flush(bs);
1389 }
61007b31
SH
1390 qemu_vfree(iov.iov_base);
1391 return ret;
1392}
1393
1394/*
04ed95f4
EB
1395 * Forwards an already correctly aligned write request to the BlockDriver,
1396 * after possibly fragmenting it.
61007b31 1397 */
85c97ca7 1398static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
61007b31 1399 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
cff86b38 1400 int64_t align, QEMUIOVector *qiov, int flags)
61007b31 1401{
85c97ca7 1402 BlockDriverState *bs = child->bs;
61007b31
SH
1403 BlockDriver *drv = bs->drv;
1404 bool waited;
1405 int ret;
1406
9896c876 1407 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
04ed95f4
EB
1408 uint64_t bytes_remaining = bytes;
1409 int max_transfer;
61007b31 1410
d470ad42
HR
1411 if (!drv) {
1412 return -ENOMEDIUM;
1413 }
1414
d6883bc9
VSO
1415 if (bdrv_has_readonly_bitmaps(bs)) {
1416 return -EPERM;
1417 }
1418
cff86b38
EB
1419 assert(is_power_of_2(align));
1420 assert((offset & (align - 1)) == 0);
1421 assert((bytes & (align - 1)) == 0);
61007b31 1422 assert(!qiov || bytes == qiov->size);
abb06c5a 1423 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
fa166538 1424 assert(!(flags & ~BDRV_REQ_MASK));
04ed95f4
EB
1425 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1426 align);
61007b31
SH
1427
1428 waited = wait_serialising_requests(req);
1429 assert(!waited || !req->serialising);
1430 assert(req->overlap_offset <= offset);
1431 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
362b3786
HR
1432 assert(child->perm & BLK_PERM_WRITE);
1433 assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
61007b31
SH
1434
1435 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1436
1437 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
c1499a5e 1438 !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
61007b31
SH
1439 qemu_iovec_is_zero(qiov)) {
1440 flags |= BDRV_REQ_ZERO_WRITE;
1441 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1442 flags |= BDRV_REQ_MAY_UNMAP;
1443 }
1444 }
1445
1446 if (ret < 0) {
1447 /* Do nothing, write notifier decided to fail this request */
1448 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9a4f4c31 1449 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
9896c876 1450 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
3ea1a091
PB
1451 } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1452 ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
04ed95f4 1453 } else if (bytes <= max_transfer) {
9a4f4c31 1454 bdrv_debug_event(bs, BLKDBG_PWRITEV);
78a07294 1455 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
04ed95f4
EB
1456 } else {
1457 bdrv_debug_event(bs, BLKDBG_PWRITEV);
1458 while (bytes_remaining) {
1459 int num = MIN(bytes_remaining, max_transfer);
1460 QEMUIOVector local_qiov;
1461 int local_flags = flags;
1462
1463 assert(num);
1464 if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1465 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1466 /* If FUA is going to be emulated by flush, we only
1467 * need to flush on the last iteration */
1468 local_flags &= ~BDRV_REQ_FUA;
1469 }
1470 qemu_iovec_init(&local_qiov, qiov->niov);
1471 qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1472
1473 ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1474 num, &local_qiov, local_flags);
1475 qemu_iovec_destroy(&local_qiov);
1476 if (ret < 0) {
1477 break;
1478 }
1479 bytes_remaining -= num;
1480 }
61007b31 1481 }
9a4f4c31 1482 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
61007b31 1483
47fec599 1484 atomic_inc(&bs->write_gen);
0fdf1a4f 1485 bdrv_set_dirty(bs, offset, bytes);
61007b31 1486
f7946da2 1487 stat64_max(&bs->wr_highest_offset, offset + bytes);
61007b31
SH
1488
1489 if (ret >= 0) {
9896c876 1490 bs->total_sectors = MAX(bs->total_sectors, end_sector);
04ed95f4 1491 ret = 0;
61007b31
SH
1492 }
1493
1494 return ret;
1495}
1496
85c97ca7 1497static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
9eeb6dd1
FZ
1498 int64_t offset,
1499 unsigned int bytes,
1500 BdrvRequestFlags flags,
1501 BdrvTrackedRequest *req)
1502{
85c97ca7 1503 BlockDriverState *bs = child->bs;
9eeb6dd1
FZ
1504 uint8_t *buf = NULL;
1505 QEMUIOVector local_qiov;
1506 struct iovec iov;
a5b8dd2c 1507 uint64_t align = bs->bl.request_alignment;
9eeb6dd1
FZ
1508 unsigned int head_padding_bytes, tail_padding_bytes;
1509 int ret = 0;
1510
1511 head_padding_bytes = offset & (align - 1);
f13ce1be 1512 tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
9eeb6dd1
FZ
1513
1514
1515 assert(flags & BDRV_REQ_ZERO_WRITE);
1516 if (head_padding_bytes || tail_padding_bytes) {
1517 buf = qemu_blockalign(bs, align);
1518 iov = (struct iovec) {
1519 .iov_base = buf,
1520 .iov_len = align,
1521 };
1522 qemu_iovec_init_external(&local_qiov, &iov, 1);
1523 }
1524 if (head_padding_bytes) {
1525 uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1526
1527 /* RMW the unaligned part before head. */
1528 mark_request_serialising(req, align);
1529 wait_serialising_requests(req);
9a4f4c31 1530 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
85c97ca7 1531 ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
9eeb6dd1
FZ
1532 align, &local_qiov, 0);
1533 if (ret < 0) {
1534 goto fail;
1535 }
9a4f4c31 1536 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
9eeb6dd1
FZ
1537
1538 memset(buf + head_padding_bytes, 0, zero_bytes);
85c97ca7 1539 ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
cff86b38 1540 align, &local_qiov,
9eeb6dd1
FZ
1541 flags & ~BDRV_REQ_ZERO_WRITE);
1542 if (ret < 0) {
1543 goto fail;
1544 }
1545 offset += zero_bytes;
1546 bytes -= zero_bytes;
1547 }
1548
1549 assert(!bytes || (offset & (align - 1)) == 0);
1550 if (bytes >= align) {
1551 /* Write the aligned part in the middle. */
1552 uint64_t aligned_bytes = bytes & ~(align - 1);
85c97ca7 1553 ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
9eeb6dd1
FZ
1554 NULL, flags);
1555 if (ret < 0) {
1556 goto fail;
1557 }
1558 bytes -= aligned_bytes;
1559 offset += aligned_bytes;
1560 }
1561
1562 assert(!bytes || (offset & (align - 1)) == 0);
1563 if (bytes) {
1564 assert(align == tail_padding_bytes + bytes);
1565 /* RMW the unaligned part after tail. */
1566 mark_request_serialising(req, align);
1567 wait_serialising_requests(req);
9a4f4c31 1568 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
85c97ca7 1569 ret = bdrv_aligned_preadv(child, req, offset, align,
9eeb6dd1
FZ
1570 align, &local_qiov, 0);
1571 if (ret < 0) {
1572 goto fail;
1573 }
9a4f4c31 1574 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
9eeb6dd1
FZ
1575
1576 memset(buf, 0, bytes);
85c97ca7 1577 ret = bdrv_aligned_pwritev(child, req, offset, align, align,
9eeb6dd1
FZ
1578 &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1579 }
1580fail:
1581 qemu_vfree(buf);
1582 return ret;
1583
1584}
1585
61007b31
SH
1586/*
1587 * Handle a write request in coroutine context
1588 */
a03ef88f 1589int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
61007b31
SH
1590 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1591 BdrvRequestFlags flags)
1592{
a03ef88f 1593 BlockDriverState *bs = child->bs;
61007b31 1594 BdrvTrackedRequest req;
a5b8dd2c 1595 uint64_t align = bs->bl.request_alignment;
61007b31
SH
1596 uint8_t *head_buf = NULL;
1597 uint8_t *tail_buf = NULL;
1598 QEMUIOVector local_qiov;
1599 bool use_local_qiov = false;
1600 int ret;
1601
f42cf447
DB
1602 trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
1603
61007b31
SH
1604 if (!bs->drv) {
1605 return -ENOMEDIUM;
1606 }
1607 if (bs->read_only) {
eaf5fe2d 1608 return -EPERM;
61007b31 1609 }
04c01a5c 1610 assert(!(bs->open_flags & BDRV_O_INACTIVE));
61007b31
SH
1611
1612 ret = bdrv_check_byte_request(bs, offset, bytes);
1613 if (ret < 0) {
1614 return ret;
1615 }
1616
99723548 1617 bdrv_inc_in_flight(bs);
61007b31
SH
1618 /*
1619 * Align write if necessary by performing a read-modify-write cycle.
1620 * Pad qiov with the read parts and be sure to have a tracked request not
1621 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1622 */
ebde595c 1623 tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
61007b31 1624
9eeb6dd1 1625 if (!qiov) {
85c97ca7 1626 ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
9eeb6dd1
FZ
1627 goto out;
1628 }
1629
61007b31
SH
1630 if (offset & (align - 1)) {
1631 QEMUIOVector head_qiov;
1632 struct iovec head_iov;
1633
1634 mark_request_serialising(&req, align);
1635 wait_serialising_requests(&req);
1636
1637 head_buf = qemu_blockalign(bs, align);
1638 head_iov = (struct iovec) {
1639 .iov_base = head_buf,
1640 .iov_len = align,
1641 };
1642 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1643
9a4f4c31 1644 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
85c97ca7 1645 ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
61007b31
SH
1646 align, &head_qiov, 0);
1647 if (ret < 0) {
1648 goto fail;
1649 }
9a4f4c31 1650 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
61007b31
SH
1651
1652 qemu_iovec_init(&local_qiov, qiov->niov + 2);
1653 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1654 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1655 use_local_qiov = true;
1656
1657 bytes += offset & (align - 1);
1658 offset = offset & ~(align - 1);
117bc3fa
PL
1659
1660 /* We have read the tail already if the request is smaller
1661 * than one aligned block.
1662 */
1663 if (bytes < align) {
1664 qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1665 bytes = align;
1666 }
61007b31
SH
1667 }
1668
1669 if ((offset + bytes) & (align - 1)) {
1670 QEMUIOVector tail_qiov;
1671 struct iovec tail_iov;
1672 size_t tail_bytes;
1673 bool waited;
1674
1675 mark_request_serialising(&req, align);
1676 waited = wait_serialising_requests(&req);
1677 assert(!waited || !use_local_qiov);
1678
1679 tail_buf = qemu_blockalign(bs, align);
1680 tail_iov = (struct iovec) {
1681 .iov_base = tail_buf,
1682 .iov_len = align,
1683 };
1684 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1685
9a4f4c31 1686 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
85c97ca7
KW
1687 ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
1688 align, align, &tail_qiov, 0);
61007b31
SH
1689 if (ret < 0) {
1690 goto fail;
1691 }
9a4f4c31 1692 bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
61007b31
SH
1693
1694 if (!use_local_qiov) {
1695 qemu_iovec_init(&local_qiov, qiov->niov + 1);
1696 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1697 use_local_qiov = true;
1698 }
1699
1700 tail_bytes = (offset + bytes) & (align - 1);
1701 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1702
1703 bytes = ROUND_UP(bytes, align);
1704 }
1705
85c97ca7 1706 ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
3ea1a091
PB
1707 use_local_qiov ? &local_qiov : qiov,
1708 flags);
61007b31
SH
1709
1710fail:
61007b31
SH
1711
1712 if (use_local_qiov) {
1713 qemu_iovec_destroy(&local_qiov);
1714 }
1715 qemu_vfree(head_buf);
1716 qemu_vfree(tail_buf);
9eeb6dd1
FZ
1717out:
1718 tracked_request_end(&req);
99723548 1719 bdrv_dec_in_flight(bs);
61007b31
SH
1720 return ret;
1721}

static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}
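
/*
 * Illustrative sketch, not part of the original block/io.c: how a caller
 * already running in coroutine context could zero an arbitrarily large byte
 * range through a BdrvChild.  bdrv_co_pwrite_zeroes() takes the length as a
 * plain int, so the range is split into INT_MAX-sized pieces here; the helper
 * name example_co_zero_range() is hypothetical.
 */
#if 0
static int coroutine_fn example_co_zero_range(BdrvChild *child,
                                              int64_t offset, int64_t bytes)
{
    while (bytes > 0) {
        int num = MIN(bytes, INT_MAX);
        int ret = bdrv_co_pwrite_zeroes(child, offset, num,
                                        BDRV_REQ_MAY_UNMAP);

        if (ret < 0) {
            return ret;
        }
        offset += num;
        bytes -= num;
    }
    return 0;
}
#endif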

/*
 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend
 * or not.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}


typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    bool want_zero;
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
    BlockDriverState **file;
    int ret;
    bool done;
} BdrvCoBlockStatusData;

int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
                                                        int64_t sector_num,
                                                        int nb_sectors,
                                                        int *pnum,
                                                        BlockDriverState **file)
{
    assert(bs->file && bs->file->bs);
    *pnum = nb_sectors;
    *file = bs->file->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
           (sector_num << BDRV_SECTOR_BITS);
}

int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
                                                           int64_t sector_num,
                                                           int nb_sectors,
                                                           int *pnum,
                                                           BlockDriverState **file)
{
    assert(bs->backing && bs->backing->bs);
    *pnum = nb_sectors;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
           (sector_num << BDRV_SECTOR_BITS);
}

/*
 * Returns the allocation status of the specified range of bytes.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their bytes are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping purposes,
 * and the result should include BDRV_BLOCK_OFFSET_VALID and
 * BDRV_BLOCK_ZERO where possible; otherwise, the result may omit those
 * bits particularly if it allows for a larger value in 'pnum'.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_get_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    /* TODO: until we have a byte-based driver callback, we also have to
     * round out to sectors, even if that is bigger than request_alignment */
    align = MAX(bs->bl.request_alignment, BDRV_SECTOR_SIZE);
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    {
        int count; /* sectors */
        int64_t longret;

        assert(QEMU_IS_ALIGNED(aligned_offset | aligned_bytes,
                               BDRV_SECTOR_SIZE));
        /*
         * The contract allows us to return pnum smaller than bytes, even
         * if the next query would see the same status; we truncate the
         * request to avoid overflowing the driver's 32-bit interface.
         */
        longret = bs->drv->bdrv_co_get_block_status(
            bs, aligned_offset >> BDRV_SECTOR_BITS,
            MIN(INT_MAX, aligned_bytes) >> BDRV_SECTOR_BITS, &count,
            &local_file);
        if (longret < 0) {
            assert(INT_MIN <= longret);
            ret = longret;
            goto out;
        }
        if (longret & BDRV_BLOCK_OFFSET_VALID) {
            local_map = longret & BDRV_BLOCK_OFFSET_MASK;
        }
        ret = longret & ~BDRV_BLOCK_OFFSET_MASK;
        *pnum = count * BDRV_SECTOR_SIZE;
    }

    /*
     * The driver's result must be a multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
     */
    assert(QEMU_IS_ALIGNED(*pnum, align) && align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        local_map += offset - aligned_offset;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else if (want_zero) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t size2 = bdrv_getlength(bs2);

            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (want_zero && local_file && local_file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int64_t file_pnum;
        int ret2;

        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information;
             * it is useful but not necessary.
             */
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

out:
    bdrv_dec_in_flight(bs);
    if (ret >= 0 && offset + *pnum == total_size) {
        ret |= BDRV_BLOCK_EOF;
    }
early_out:
    if (file) {
        *file = local_file;
    }
    if (map) {
        *map = local_map;
    }
    return ret;
}

static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
                                                   BlockDriverState *base,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    BlockDriverState *p;
    int ret = 0;
    bool first = true;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                   file);
        if (ret < 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
            *pnum = bytes;
        }
        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
            break;
        }
        /* [offset, pnum] unallocated on this layer, which could be only
         * the first part of [offset, bytes].  */
        bytes = MIN(bytes, *pnum);
        first = false;
    }
    return ret;
}

/* Coroutine wrapper for bdrv_block_status_above() */
static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
{
    BdrvCoBlockStatusData *data = opaque;

    data->ret = bdrv_co_block_status_above(data->bs, data->base,
                                           data->want_zero,
                                           data->offset, data->bytes,
                                           data->pnum, data->map, data->file);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_block_status_above().
 *
 * See bdrv_co_block_status_above() for details.
 */
static int bdrv_common_block_status_above(BlockDriverState *bs,
                                          BlockDriverState *base,
                                          bool want_zero, int64_t offset,
                                          int64_t bytes, int64_t *pnum,
                                          int64_t *map,
                                          BlockDriverState **file)
{
    Coroutine *co;
    BdrvCoBlockStatusData data = {
        .bs = bs,
        .base = base,
        .want_zero = want_zero,
        .offset = offset,
        .bytes = bytes,
        .pnum = pnum,
        .map = map,
        .file = file,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_block_status_above_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
}

int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file)
{
    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
                                          pnum, map, file);
}

int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    return bdrv_block_status_above(bs, backing_bs(bs),
                                   offset, bytes, pnum, map, file);
}
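
/*
 * Illustrative sketch, not part of the original block/io.c: walking a whole
 * image with bdrv_block_status() and visiting one extent per iteration, as a
 * mapping tool might.  The helper name example_map_image() is hypothetical;
 * the usage follows the contract documented above bdrv_co_block_status().
 */
#if 0
static int example_map_image(BlockDriverState *bs)
{
    int64_t total = bdrv_getlength(bs);
    int64_t offset = 0;

    if (total < 0) {
        return total;
    }
    while (offset < total) {
        BlockDriverState *file;
        int64_t pnum, map;
        int ret = bdrv_block_status(bs, offset, total - offset,
                                    &pnum, &map, &file);

        if (ret < 0) {
            return ret;
        }
        /* ret carries BDRV_BLOCK_* flags; pnum is the length of the extent */
        offset += pnum;
    }
    return 0;
}
#endif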

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int ret;
    int64_t dummy;

    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
                                         bytes, pnum ? pnum : &dummy, NULL,
                                         NULL);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * offset is allocated in any image of the chain.  Return false otherwise,
 * or negative errno on failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum)
{
    BlockDriverState *intermediate;
    int ret;
    int64_t n = bytes;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int64_t pnum_inter;
        int64_t size_inter;

        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        size_inter = bdrv_getlength(intermediate);
        if (size_inter < 0) {
            return size_inter;
        }
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}
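
/*
 * Illustrative sketch, not part of the original block/io.c: using
 * bdrv_is_allocated_above() to decide whether a range of TOP still needs to
 * be copied before the chain between TOP and BASE is dropped.  The helper
 * name example_range_needs_copy() is hypothetical.
 */
#if 0
static bool example_range_needs_copy(BlockDriverState *top,
                                     BlockDriverState *base,
                                     int64_t offset, int64_t bytes)
{
    int64_t pnum;
    int ret = bdrv_is_allocated_above(top, base, offset, bytes, &pnum);

    /* On error, be conservative and assume the data must be copied */
    return ret != 0;
}
#endif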

typedef struct BdrvVmstateCo {
    BlockDriverState *bs;
    QEMUIOVector *qiov;
    int64_t pos;
    bool is_read;
    int ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
    int ret = -ENOTSUP;

    bdrv_inc_in_flight(bs);

    if (!drv) {
        ret = -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
    } else if (bs->file) {
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    bdrv_dec_in_flight(bs);
    return ret;
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs = bs,
            .qiov = qiov,
            .pos = pos,
            .is_read = is_read,
            .ret = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
        return data.ret;
    }
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}
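
/*
 * Illustrative sketch, not part of the original block/io.c: round-tripping a
 * small buffer through the image's vmstate area, the way migration code saves
 * and restores device state.  Buffer size and contents are hypothetical.
 */
#if 0
static int example_vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[512] = { 0xaa };
    uint8_t in[512];
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
    if (ret < 0) {
        return ret;
    }
    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    return ret < 0 ? ret : 0;
}
#endif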

/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel.  The caller is not blocked if the acb
 * implements cancel_async; otherwise we do nothing and let the request
 * complete normally.  In either case the completion callback must be
 * called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;


static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush.  Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk.  Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int bytes;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
}

int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, req.offset, req.bytes);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}
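
/*
 * Illustrative sketch, not part of the original block/io.c: discarding a whole
 * device in INT_MAX-sized pieces, since bdrv_pdiscard() takes the length as a
 * plain int.  Alignment and per-driver fragmentation are already handled
 * inside bdrv_co_pdiscard(); the helper name example_discard_all() is
 * hypothetical.
 */
#if 0
static int example_discard_all(BlockDriverState *bs)
{
    int64_t len = bdrv_getlength(bs);
    int64_t offset = 0;

    if (len < 0) {
        return len;
    }
    while (offset < len) {
        int num = MIN(len - offset, INT_MAX);
        int ret = bdrv_pdiscard(bs, offset, num);

        if (ret < 0) {
            return ret;
        }
        offset += num;
    }
    return 0;
}
#endif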

int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}
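
/*
 * Illustrative sketch, not part of the original block/io.c: building a
 * QEMUIOVector that satisfies bdrv_qiov_is_aligned() by allocating the buffer
 * with qemu_blockalign() and padding its length to the minimum memory
 * alignment.  The helper name example_build_aligned_qiov() is hypothetical;
 * the caller would later free the buffer with qemu_vfree().
 */
#if 0
static void example_build_aligned_qiov(BlockDriverState *bs,
                                       QEMUIOVector *qiov, size_t size)
{
    size_t len = ROUND_UP(size, bdrv_min_mem_align(bs));
    void *buf = qemu_blockalign(bs, len);

    qemu_iovec_init(qiov, 1);
    qemu_iovec_add(qiov, buf, len);
    assert(bdrv_qiov_is_aligned(bs, qiov));
}
#endif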

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}
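
/*
 * Illustrative sketch, not part of the original block/io.c: bracketing a batch
 * of submissions with bdrv_io_plug()/bdrv_io_unplug() so drivers that support
 * it can coalesce the requests into fewer syscalls.  submit_one_request() is a
 * hypothetical caller-side helper.
 */
#if 0
static void example_submit_batch(BlockDriverState *bs, int n)
{
    int i;

    bdrv_io_plug(bs);
    for (i = 0; i < n; i++) {
        submit_one_request(bs, i); /* hypothetical submission helper */
    }
    bdrv_io_unplug(bs);
}
#endif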