/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer ([email protected])
 *  Vladimir Sementsov-Ogievskiy <[email protected]>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
                                                       int64_t start,
                                                       int64_t end)
{
    BlockCopyInFlightReq *req;
    bool waited;

    do {
        waited = false;
        QLIST_FOREACH(req, &s->inflight_reqs, list) {
            if (end > req->start_byte && start < req->end_byte) {
                qemu_co_queue_wait(&req->wait_queue, NULL);
                waited = true;
                break;
            }
        }
    } while (waited);
}

static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t start, int64_t end)
{
    req->start_byte = start;
    req->end_byte = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

static void coroutine_fn block_copy_inflight_req_end(BlockCopyInFlightReq *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
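
/*
 * Usage sketch (it mirrors what block_copy() below does): a caller that
 * needs exclusive access to the byte range [start, end) first waits for
 * intersecting requests, then publishes its own request, and finally drops
 * it again, waking any waiters:
 *
 *     BlockCopyInFlightReq req;
 *
 *     block_copy_wait_inflight_reqs(s, start, end);
 *     block_copy_inflight_req_begin(s, &req, start, end);
 *     ... copy the range ...
 *     block_copy_inflight_req_end(&req);
 */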

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;
    uint32_t max_transfer =
            MIN_NON_ZERO(INT_MAX,
                         MIN_NON_ZERO(source->bs->bl.max_transfer,
                                      target->bs->bl.max_transfer));

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (max_transfer < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * copy_range does not respect max_transfer (it's a TODO), so we factor
         * that in here.
         */
        s->use_copy_range = true;
        s->copy_size = MIN(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                           QEMU_ALIGN_DOWN(max_transfer, cluster_size));
    }
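
    /*
     * Worked example with illustrative numbers (not taken from any particular
     * backend): for cluster_size = 64 KiB and both children reporting
     * max_transfer = 1 MiB, the copy_range branch above picks
     *     copy_size = MIN(MAX(64 KiB, 16 MiB), ALIGN_DOWN(1 MiB, 64 KiB))
     *               = MIN(16 MiB, 1 MiB) = 1 MiB.
     * If max_transfer were smaller than cluster_size (say 32 KiB), the first
     * branch would disable copy_range and use copy_size = cluster_size.
     */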

    QLIST_INIT(&s->inflight_reqs);

    return s;
}

void block_copy_set_callbacks(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        ProgressResetCallbackFunc progress_reset_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_reset_callback = progress_reset_callback;
    s->progress_opaque = progress_opaque;
}
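
/*
 * Setup sketch. The caller-side names (job_bytes_cb, job_reset_cb, job) are
 * hypothetical; in QEMU the user of this API is the backup job, but any
 * caller would follow the same pattern:
 *
 *     BlockCopyState *bcs;
 *
 *     bcs = block_copy_state_new(source, target, cluster_size,
 *                                write_flags, errp);
 *     if (!bcs) {
 *         return NULL;
 *     }
 *     block_copy_set_callbacks(bcs, job_bytes_cb, job_reset_cb, job);
 *     ...
 *     block_copy_state_free(bcs);
 */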

/*
 * block_copy_do_copy
 *
 * Do a copy of a cluster-aligned chunk. @end is allowed to exceed s->len only
 * to cover the last cluster when s->len is not aligned to clusters.
 *
 * No synchronization here: neither the bitmap nor intersecting requests are
 * handled; only the copy is done.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t start, int64_t end,
                                           bool *error_is_read)
{
    int ret;
    int nbytes = MIN(end, s->len) - start;
    void *bounce_buffer = NULL;

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));
    assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, start, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            goto out;
        }
    }

    /*
     * In case of a failed copy_range request above, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so this is not a concern.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via @pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}
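
/*
 * Example of the rounding above (illustrative numbers): with a 64 KiB
 * cluster size, 96 KiB of allocated data at @offset makes the function
 * return 1 with *pnum = DIV_ROUND_UP(96 KiB, 64 KiB) = 2, i.e. a partially
 * allocated cluster counts as allocated.  Conversely, 96 KiB of unallocated
 * data makes it return 0 with *pnum = 96 KiB / 64 KiB = 1, leaving the
 * partially unallocated second cluster for a later call.
 */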

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        s->progress_reset_callback(s->progress_opaque);
    }

    *count = bytes;
    return ret;
}
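
/*
 * Sketch of a caller that only wants to copy allocated data (for example a
 * sync=top backup) pre-trimming the copy bitmap before copying starts.  The
 * loop structure and image_length are illustrative, not taken from real
 * caller code:
 *
 *     int64_t offset, count;
 *
 *     for (offset = 0; offset < image_length; offset += count) {
 *         int64_t ret = block_copy_reset_unallocated(s, offset, &count);
 *         if (ret < 0) {
 *             return ret;
 *         }
 *     }
 */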

int coroutine_fn block_copy(BlockCopyState *s,
                            int64_t start, uint64_t bytes,
                            bool *error_is_read)
{
    int ret = 0;
    int64_t end = bytes + start; /* bytes */
    int64_t status_bytes;
    BlockCopyInFlightReq req;

    /*
     * The block_copy() caller is responsible for keeping source and target
     * in the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));

    block_copy_wait_inflight_reqs(s, start, end);
    block_copy_inflight_req_begin(s, &req, start, end);

    while (start < end) {
        int64_t next_zero, chunk_end;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
            trace_block_copy_skip(s, start);
            start += s->cluster_size;
            continue; /* already copied */
        }

        chunk_end = MIN(end, start + s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
                                                chunk_end - start);
        if (next_zero >= 0) {
            assert(next_zero > start); /* start is dirty */
            assert(next_zero < chunk_end); /* no need to do MIN() */
            chunk_end = next_zero;
        }

        if (s->skip_unallocated) {
            ret = block_copy_reset_unallocated(s, start, &status_bytes);
            if (ret == 0) {
                trace_block_copy_skip_range(s, start, status_bytes);
                start += status_bytes;
                continue;
            }
            /* Clamp to known allocated region */
            chunk_end = MIN(chunk_end, start + status_bytes);
        }

        trace_block_copy_process(s, start);

        bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);

        co_get_from_shres(s->mem, chunk_end - start);
        ret = block_copy_do_copy(s, start, chunk_end, error_is_read);
        co_put_to_shres(s->mem, chunk_end - start);
        if (ret < 0) {
            bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
            break;
        }

        s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
        start = chunk_end;
        ret = 0;
    }

    block_copy_inflight_req_end(&req);

    return ret;
}
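
/*
 * End-to-end sketch (hypothetical caller code, not part of this file): once
 * a BlockCopyState is set up and its callbacks are registered, copying a
 * whole image is a matter of walking it in cluster-aligned chunks from
 * coroutine context.  copy_all(), len and chunk are illustrative names:
 *
 *     static int coroutine_fn copy_all(BlockCopyState *s, int64_t len,
 *                                      int64_t chunk, bool *error_is_read)
 *     {
 *         int64_t offset;
 *         int ret;
 *
 *         assert(QEMU_IS_ALIGNED(len, s->cluster_size));
 *         assert(QEMU_IS_ALIGNED(chunk, s->cluster_size));
 *
 *         for (offset = 0; offset < len; offset += chunk) {
 *             ret = block_copy(s, offset, MIN(chunk, len - offset),
 *                              error_is_read);
 *             if (ret < 0) {
 *                 return ret;
 *             }
 *         }
 *         return 0;
 *     }
 */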