 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
#include "qemu/osdep.h"

#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob_int.h"
#include "block/block_backup.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/cutils.h"
#include "sysemu/block-backend.h"
#include "qemu/bitmap.h"
#include "qemu/error-report.h"
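
/* Default cluster size for copy-on-write and backup copying: 64 KiB */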
#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)

typedef struct CowRequest {
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */

typedef void (*ProgressBytesCallbackFunc)(int64_t bytes, void *opaque);
typedef void (*ProgressResetCallbackFunc)(void *opaque);

typedef struct BlockCopyState {
    BdrvDirtyBitmap *copy_bitmap;
    int64_t copy_range_size;
    BdrvRequestFlags write_flags;
    /*
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process the bitmap
     * is thus not fully initialized: it may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() queries the source's allocation status, skips
     * unallocated regions, and clears them in the copy_bitmap by calling
     * block_copy_reset_unallocated() for each region it skips.
     */
    bool skip_unallocated;

    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    /*
     * progress_reset_callback: called when some bytes are reset in copy_bitmap
     * (see @skip_unallocated above). The callee is expected to recalculate how
     * many bytes remain based on the dirty bit count of copy_bitmap.
     */
    ProgressResetCallbackFunc progress_reset_callback;
    void *progress_opaque;

typedef struct BackupBlockJob {
    BlockDriverState *source_bs;
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
    BitmapSyncMode bitmap_mode;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;
static const BlockJobDriver backup_job_driver;

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,

    QLIST_FOREACH(req, &job->inflight_reqs, list) {
        if (end > req->start_byte && start < req->end_byte) {
            qemu_co_queue_wait(&req->wait_queue, NULL);

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
    req->start_byte = start;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
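
/* Release the block backends and copy bitmap owned by a BlockCopyState. */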
static void block_copy_state_free(BlockCopyState *s)
    bdrv_release_dirty_bitmap(blk_bs(s->source), s->copy_bitmap);
    blk_unref(s->source);
    blk_unref(s->target);

static BlockCopyState *block_copy_state_new(
        BlockDriverState *source, BlockDriverState *target,
        int64_t cluster_size, BdrvRequestFlags write_flags,
        ProgressBytesCallbackFunc progress_bytes_callback,
        ProgressResetCallbackFunc progress_reset_callback,
        void *progress_opaque, Error **errp)
    uint64_t no_resize = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
                         BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source, cluster_size, NULL, errp);
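
    /*
     * Bits in the copy bitmap are set and cleared explicitly by the copy code;
     * disable it so that guest writes do not dirty it automatically.
     */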
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = blk_new(bdrv_get_aio_context(source),
                          BLK_PERM_CONSISTENT_READ, no_resize),
        .target = blk_new(bdrv_get_aio_context(target),
                          BLK_PERM_WRITE, no_resize),
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .progress_bytes_callback = progress_bytes_callback,
        .progress_reset_callback = progress_reset_callback,
        .progress_opaque = progress_opaque,
    s->copy_range_size = QEMU_ALIGN_DOWN(MIN(blk_get_max_transfer(s->source),
                                             blk_get_max_transfer(s->target)),
                                         s->cluster_size);
    /*
     * Set use_copy_range; consider the following:
     * 1. Compression is not supported for copy_range.
     * 2. copy_range does not respect max_transfer (it's a TODO), so we factor
     *    that in here. If max_transfer is smaller than the cluster size, we do
     *    not use copy_range (in that case it is zero after the align-down
     *    above).
     */
    s->use_copy_range =
        !(write_flags & BDRV_REQ_WRITE_COMPRESSED) && s->copy_range_size > 0;
    /*
     * We merely allow AIO context changes on our block backends; the
     * block_copy() user (currently only backup) is responsible for keeping
     * source and target in the same AIO context.
     */
    blk_set_disable_request_queuing(s->source, true);
    blk_set_allow_aio_context_change(s->source, true);
    blk_set_disable_request_queuing(s->target, true);
    blk_set_allow_aio_context_change(s->target, true);

    ret = blk_insert_bs(s->source, source, errp);

    ret = blk_insert_bs(s->target, target, errp);

    block_copy_state_free(s);
/*
 * Copy range to target with a bounce buffer and return the number of bytes
 * copied. If an error occurred, return a negative error number.
 */
static int coroutine_fn block_copy_with_bounce_buffer(BlockCopyState *s,
                                                      bool is_write_notifier,
                                                      void **bounce_buffer)
    int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    bdrv_reset_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
    nbytes = MIN(s->cluster_size, s->len - start);
    if (!*bounce_buffer) {
        *bounce_buffer = blk_blockalign(s->source, s->cluster_size);

    ret = blk_co_pread(s->source, start, nbytes, *bounce_buffer, read_flags);
        trace_block_copy_with_bounce_buffer_read_fail(s, start, ret);
        *error_is_read = true;

    ret = blk_co_pwrite(s->target, start, nbytes, *bounce_buffer,
        trace_block_copy_with_bounce_buffer_write_fail(s, start, ret);
        *error_is_read = false;

    bdrv_set_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
/*
 * Copy range to target and return the number of bytes copied. If an error
 * occurred, return a negative error number.
 */
static int coroutine_fn block_copy_with_offload(BlockCopyState *s,
                                                bool is_write_notifier)
    int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;

    assert(QEMU_IS_ALIGNED(s->copy_range_size, s->cluster_size));
    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    nbytes = MIN(s->copy_range_size, MIN(end, s->len) - start);
    nr_clusters = DIV_ROUND_UP(nbytes, s->cluster_size);
    bdrv_reset_dirty_bitmap(s->copy_bitmap, start,
                            s->cluster_size * nr_clusters);
    ret = blk_co_copy_range(s->source, start, s->target, start, nbytes,
                            read_flags, s->write_flags);
        trace_block_copy_with_offload_fail(s, start, ret);
        bdrv_set_dirty_bitmap(s->copy_bitmap, start,
                              s->cluster_size * nr_clusters);
/*
 * Check if the cluster starting at @offset is allocated or not.
 * Return via @pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
    BlockDriverState *bs = blk_bs(s->source);
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

        ret = bdrv_is_allocated(bs, offset, bytes, &count);

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
/*
 * Reset bits in copy_bitmap starting at @offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and a negative errno value on error.
 */
static int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                            int64_t offset, int64_t *count)
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);

    bytes = clusters * s->cluster_size;

        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        s->progress_reset_callback(s->progress_opaque);

static int coroutine_fn block_copy(BlockCopyState *s,
                                   int64_t start, uint64_t bytes,
                                   bool is_write_notifier)
    int64_t end = bytes + start; /* bytes */
    void *bounce_buffer = NULL;
    int64_t status_bytes;
    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same AIO context.
     */
    assert(blk_get_aio_context(s->source) == blk_get_aio_context(s->target));
    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));

    while (start < end) {

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
            trace_block_copy_skip(s, start);
            start += s->cluster_size;
            continue; /* already copied */

        dirty_end = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,

        if (s->skip_unallocated) {
            ret = block_copy_reset_unallocated(s, start, &status_bytes);
                trace_block_copy_skip_range(s, start, status_bytes);
                start += status_bytes;
            /* Clamp to known allocated region */
            dirty_end = MIN(dirty_end, start + status_bytes);

        trace_block_copy_process(s, start);

        if (s->use_copy_range) {
            ret = block_copy_with_offload(s, start, dirty_end,
                s->use_copy_range = false;

        if (!s->use_copy_range) {
            ret = block_copy_with_bounce_buffer(s, start, dirty_end,
                                                error_is_read, &bounce_buffer);

        s->progress_bytes_callback(ret, s->progress_opaque);

    qemu_vfree(bounce_buffer);
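
/*
 * Called by the copy code for each successfully copied chunk: feed the rate
 * limiter (via bytes_read) and the job's progress counter.
 */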
static void backup_progress_bytes_callback(int64_t bytes, void *opaque)
    BackupBlockJob *s = opaque;

    s->bytes_read += bytes;
    job_progress_update(&s->common.job, bytes);
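
/*
 * Called when bits are cleared from copy_bitmap without being copied (see
 * skip_unallocated): recompute the remaining-work estimate for the job.
 */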
static void backup_progress_reset_callback(void *opaque)
    BackupBlockJob *s = opaque;
    uint64_t estimate = bdrv_get_dirty_count(s->bcs->copy_bitmap);

    job_progress_set_remaining(&s->common.job, estimate);
static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                      int64_t offset, uint64_t bytes,
                                      bool is_write_notifier)
    CowRequest cow_request;
    int64_t start, end; /* bytes */

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = QEMU_ALIGN_DOWN(offset, job->cluster_size);
    end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size);

    trace_backup_do_cow_enter(job, start, offset, bytes);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    ret = block_copy(job->bcs, start, end - start, error_is_read,

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, offset, bytes, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);
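
/*
 * before_write notifier: copy the original contents of the clusters touched
 * by a guest write to the target before the write modifies them.
 */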
static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
    BdrvTrackedRequest *req = opaque;

    assert(req->bs == job->source_bs);
    assert(QEMU_IS_ALIGNED(req->offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(req->bytes, BDRV_SECTOR_SIZE));

    return backup_do_cow(job, req->offset, req->bytes, NULL, true);
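
/*
 * Decide how to finalise the sync bitmap at job end: on success (or with
 * bitmap-mode "always") install its successor, otherwise merge the successor
 * back into the parent bitmap.
 */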
static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
    bool sync = (((ret == 0) || (job->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS)) \
                 && (job->bitmap_mode != BITMAP_SYNC_MODE_NEVER));

        /*
         * We succeeded, or we always intended to sync the bitmap.
         * Delete this bitmap and install the child.
         */
        bm = bdrv_dirty_bitmap_abdicate(job->source_bs, job->sync_bitmap, NULL);

        /*
         * We failed, or we never intended to sync the bitmap anyway.
         * Merge the successor back into the parent, keeping all data.
         */
        bm = bdrv_reclaim_dirty_bitmap(job->source_bs, job->sync_bitmap, NULL);

    if (ret < 0 && job->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS) {
        /* If we failed and synced, merge in the bits we didn't copy: */
        bdrv_dirty_bitmap_merge_internal(bm, job->bcs->copy_bitmap,
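
/*
 * Job lifecycle hooks: commit finalises the sync bitmap as successful, abort
 * finalises it as failed (see backup_cleanup_sync_bitmap() above).
 */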
static void backup_commit(Job *job)
    BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, 0);

static void backup_abort(Job *job)
    BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, -1);

static void backup_clean(Job *job)
    BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);

    block_copy_state_free(s->bcs);
void backup_do_checkpoint(BlockJob *job, Error **errp)
    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);

    assert(block_job_driver(job) == &backup_job_driver);

    if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
        error_setg(errp, "The backup job only supports block checkpoint in"

    bdrv_set_dirty_bitmap(backup_job->bcs->copy_bitmap, 0, backup_job->len);
static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
        return block_job_error_action(&job->common, job->on_source_error,
        return block_job_error_action(&job->common, job->on_target_error,
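
/*
 * Sleep according to the job's rate limit and check for cancellation;
 * returns true if the job has been cancelled.
 */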
static bool coroutine_fn yield_and_check(BackupBlockJob *job)

    if (job_is_cancelled(&job->common.job)) {

    /*
     * We need to yield even for delay_ns = 0 so that bdrv_drain_all() can
     * return. Without a yield, the VM would not reboot.
     */
    delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read);

    job_sleep_ns(&job->common.job, delay_ns);

    if (job_is_cancelled(&job->common.job)) {
static int coroutine_fn backup_loop(BackupBlockJob *job)
    BdrvDirtyBitmapIter *bdbi;

    bdbi = bdrv_dirty_iter_new(job->bcs->copy_bitmap);
    while ((offset = bdrv_dirty_iter_next(bdbi)) != -1) {
        if (yield_and_check(job)) {

        ret = backup_do_cow(job, offset,
                            job->cluster_size, &error_is_read, false);
        if (ret < 0 && backup_error_action(job, error_is_read, -ret) ==
            BLOCK_ERROR_ACTION_REPORT)

    bdrv_dirty_iter_free(bdbi);
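
/* Populate copy_bitmap according to the sync mode before copying starts. */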
static void backup_init_copy_bitmap(BackupBlockJob *job)

    if (job->sync_mode == MIRROR_SYNC_MODE_BITMAP) {
        ret = bdrv_dirty_bitmap_merge_internal(job->bcs->copy_bitmap,

        if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
            /*
             * We can't hog the coroutine to initialize this thoroughly.
             * Set a flag and resume work when we are able to yield safely.
             */
            job->bcs->skip_unallocated = true;
        bdrv_set_dirty_bitmap(job->bcs->copy_bitmap, 0, job->len);

    estimate = bdrv_get_dirty_count(job->bcs->copy_bitmap);
    job_progress_set_remaining(&job->common.job, estimate);
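
/* The main coroutine of the backup job. */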
static int coroutine_fn backup_run(Job *job, Error **errp)
    BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);

    QLIST_INIT(&s->inflight_reqs);
    qemu_co_rwlock_init(&s->flush_rwlock);

    backup_init_copy_bitmap(s);

    s->before_write.notify = backup_before_write_notify;
    bdrv_add_before_write_notifier(s->source_bs, &s->before_write);

    if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {

        for (offset = 0; offset < s->len; ) {
            if (yield_and_check(s)) {

            ret = block_copy_reset_unallocated(s->bcs, offset, &count);

        s->bcs->skip_unallocated = false;

    if (s->sync_mode == MIRROR_SYNC_MODE_NONE) {
        /*
         * All bits are set in copy_bitmap to allow any cluster to be copied.
         * This does not actually require them to be copied.
         */
        while (!job_is_cancelled(job)) {
            /*
             * Yield until the job is cancelled. We just let our before_write
             * notify callback service CoW requests.
             */

        ret = backup_loop(s);

    notifier_with_return_remove(&s->before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&s->flush_rwlock);
    qemu_co_rwlock_unlock(&s->flush_rwlock);
static const BlockJobDriver backup_job_driver = {
    .instance_size = sizeof(BackupBlockJob),
    .job_type      = JOB_TYPE_BACKUP,
    .free          = block_job_free,
    .user_resume   = block_job_user_resume,
    .commit        = backup_commit,
    .abort         = backup_abort,
    .clean         = backup_clean,
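
/*
 * Choose the backup cluster size: at least BACKUP_CLUSTER_SIZE_DEFAULT, or
 * the target's cluster size when that is known and larger.
 */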
static int64_t backup_calculate_cluster_size(BlockDriverState *target,
    /*
     * If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible.
     */
    ret = bdrv_get_info(target, &bdi);
    if (ret == -ENOTSUP && !target->backing) {
        /* Cluster size is not defined */
        warn_report("The target block device doesn't provide "
                    "information about the block size and it doesn't have a "
                    "backing file. The default block size of %u bytes is "
                    "used. If the actual block size of the target exceeds "
                    "this default, the backup may be unusable",
                    BACKUP_CLUSTER_SIZE_DEFAULT);
        return BACKUP_CLUSTER_SIZE_DEFAULT;
    } else if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
            "which has no backing file");
        error_append_hint(errp,
            "Aborting, since this may create an unusable destination image\n");
    } else if (ret < 0 && target->backing) {
        /* Not fatal; just trudge on ahead. */
        return BACKUP_CLUSTER_SIZE_DEFAULT;

    return MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
                            BlockDriverState *target, int64_t speed,
                            MirrorSyncMode sync_mode,
                            BdrvDirtyBitmap *sync_bitmap,
                            BitmapSyncMode bitmap_mode,
                            BlockdevOnError on_source_error,
                            BlockdevOnError on_target_error,
                            BlockCompletionFunc *cb, void *opaque,
                            JobTxn *txn, Error **errp)
    BackupBlockJob *job = NULL;
    int64_t cluster_size;
    BdrvRequestFlags write_flags;

    /* QMP interface protects us from these cases */
    assert(sync_mode != MIRROR_SYNC_MODE_INCREMENTAL);
    assert(sync_bitmap || sync_mode != MIRROR_SYNC_MODE_BITMAP);
        error_setg(errp, "Source and target cannot be the same");

    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));

    if (compress && !block_driver_can_compress(target->drv)) {
        error_setg(errp, "Compression is not supported for this drive %s",
                   bdrv_get_device_name(target));

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {

        /* If we need to write to this bitmap, check that we can: */
        if (bitmap_mode != BITMAP_SYNC_MODE_NEVER &&
            bdrv_dirty_bitmap_check(sync_bitmap, BDRV_BITMAP_DEFAULT, errp)) {

        /* Create a new bitmap, and freeze/disable this one. */
        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {

    len = bdrv_getlength(bs);
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));

    cluster_size = backup_calculate_cluster_size(target, errp);
    if (cluster_size < 0) {

    /* job->len is fixed, so we can't allow resize */
    job = block_job_create(job_id, &backup_job_driver, txn, bs, 0, BLK_PERM_ALL,
                           speed, creation_flags, cb, opaque, errp);

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_bitmap;
    job->bitmap_mode = bitmap_mode;
    /*
     * If source is in the backing chain of target, assume that target is going
     * to be used for "image fleecing", i.e. it should represent a kind of
     * snapshot of the source at the point in time the backup started, and that
     * target is going to be read by somebody (for example, used as an NBD
     * export) while the backup job runs.
     *
     * In this case, we need to add the BDRV_REQ_SERIALISING write flag to
     * avoid the intersection of backup writes and third-party reads from
     * target; otherwise a read from target may occasionally return data that
     * the guest has already updated.
     *
     * For more information see commit f8d59dfb40bb and test
     * tests/qemu-iotests/222.
     */
    write_flags = (bdrv_chain_contains(target, bs) ? BDRV_REQ_SERIALISING : 0) |
                  (compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
    job->bcs = block_copy_state_new(bs, target, cluster_size, write_flags,
                                    backup_progress_bytes_callback,
                                    backup_progress_reset_callback, job, errp);

    job->cluster_size = cluster_size;

    /* Required permissions are already taken by block-copy-state target */
    block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,

        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);

        backup_clean(&job->common.job);
        job_early_fail(&job->common.job);