/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer ([email protected])
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/ratelimit.h"

#define BACKUP_CLUSTER_BITS 16
#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)

#define SLICE_TIME 100000000ULL /* ns */

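/* With BACKUP_CLUSTER_BITS == 16 the copy granularity works out to 64 KiB
 * per cluster, i.e. 128 sectors of BDRV_SECTOR_SIZE (512) bytes each. */
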
typedef struct CowRequest {
    int64_t start;
    int64_t end;
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;

typedef struct BackupBlockJob {
    BlockJob common;
    BlockDriverState *target;
    MirrorSyncMode sync_mode;
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
    HBitmap *bitmap;
    QLIST_HEAD(, CowRequest) inflight_reqs;
} BackupBlockJob;

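/* Concurrency scheme: every backup_do_cow() call holds flush_rwlock for
 * reading while it copies, and backup_run() takes it for writing once at
 * the end, which waits for all in-flight copies to finish.  'bitmap' has
 * one bit per cluster and records clusters already copied to the target;
 * 'inflight_reqs' serializes overlapping copies of the same clusters. */
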
/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
{
    req->start = start;
    req->end = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

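/* Copy the clusters covering [sector_num, sector_num + nb_sectors) from
 * 'bs' to the backup target, skipping clusters whose bitmap bit is already
 * set.  The range is widened to cluster boundaries (start rounded down,
 * end rounded up), so slightly more data than requested may be copied. */
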
static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read)
{
    BackupBlockJob *job = (BackupBlockJob *)bs->job;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
        if (hbitmap_get(job->bitmap, start)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
                job->common.len / BDRV_SECTOR_SIZE -
                start * BACKUP_SECTORS_PER_CLUSTER);

        if (!bounce_buffer) {
            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
                            &bounce_qiov);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            goto out;
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
                                       start * BACKUP_SECTORS_PER_CLUSTER,
                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = bdrv_co_writev(job->target,
                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
                                 &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            goto out;
        }

        hbitmap_set(job->bitmap, start, 1);

        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
        job->sectors_read += n;
        job->common.offset += n * BDRV_SECTOR_SIZE;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

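/* Runs in the write path before data on 'bs' is overwritten, so that the
 * old contents reach the target first; this is what makes the backup a
 * point-in-time copy of the disk. */
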
static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
}

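/* 'speed' is given in bytes per second; the rate limit itself is accounted
 * in sectors, matching the sectors_read counter used by backup_run(). */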
static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (speed < 0) {
        error_set(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void backup_iostatus_reset(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    bdrv_iostatus_reset(s->target);
}

static const BlockJobDriver backup_job_driver = {
    .instance_size  = sizeof(BackupBlockJob),
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
};

static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->common.bs,
                                      job->on_source_error, true, error);
    } else {
        return block_job_error_action(&job->common, job->target,
                                      job->on_target_error, false, error);
    }
}

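/* Main job coroutine.  After installing the before-write notifier it either
 * only services guest-triggered CoW requests (MIRROR_SYNC_MODE_NONE) or
 * additionally walks the device cluster by cluster (TOP/FULL), and finally
 * drains in-flight copies before completing the job. */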
static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
    NotifierWithReturn before_write = {
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
    end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
                       BACKUP_SECTORS_PER_CLUSTER);

    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
    bdrv_set_on_error(target, on_target_error, on_target_error);
    bdrv_iostatus_enable(target);

    bdrv_add_before_write_notifier(bs, &before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            job->common.busy = false;
            qemu_coroutine_yield();
            job->common.busy = true;
        }
    } else {
        /* Both FULL and TOP sync modes require copying. */
        for (; start < end; start++) {
            bool error_is_read;

            if (block_job_is_cancelled(&job->common)) {
                break;
            }

            /* We need to yield so that qemu_aio_flush() returns;
             * without the yield, the VM does not reboot.
             */
            if (job->common.speed) {
                uint64_t delay_ns = ratelimit_calculate_delay(
                        &job->limit, job->sectors_read);
                job->sectors_read = 0;
                block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
            } else {
                block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
            }

            if (block_job_is_cancelled(&job->common)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
                                          start * BACKUP_SECTORS_PER_CLUSTER + i,
                                          BACKUP_SECTORS_PER_CLUSTER - i, &n);
                    i += n;

                    if (alloced == 1) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* In FULL sync mode we copy the whole drive. */
            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
                                BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BDRV_ACTION_REPORT) {
                    break;
                } else {
                    start--;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);

    hbitmap_free(job->bitmap);

    bdrv_iostatus_disable(target);
    bdrv_unref(target);

    block_job_completed(&job->common, ret);
}

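/* Validate the arguments, create the backup job and immediately enter its
 * coroutine.  On success the job owns 'target' and backup_run() unrefs it
 * on completion; on failure an error is reported through 'errp'. */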
void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockDriverCompletionFunc *cb, void *opaque,
                  Error **errp)
{
    int64_t len;

    assert(bs);
    assert(target);
    assert(cb);

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        return;
    }

    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
                                           cb, opaque, errp);
    if (!job) {
        return;
    }

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    qemu_coroutine_enter(job->common.co, job);
}
392 | } |