]>
Commit | Line | Data |
---|---|---|
7df7868b | 1 | /* |
d003e0ae | 2 | * copy-before-write filter driver |
7df7868b VSO |
3 | * |
4 | * The driver performs Copy-Before-Write (CBW) operation: it is injected above | |
5 | * some node, and before each write it copies _old_ data to the target node. | |
6 | * | |
d003e0ae | 7 | * Copyright (c) 2018-2021 Virtuozzo International GmbH. |
7df7868b VSO |
8 | * |
9 | * Author: | |
10 | * Sementsov-Ogievskiy Vladimir <[email protected]> | |
11 | * | |
12 | * This program is free software; you can redistribute it and/or modify | |
13 | * it under the terms of the GNU General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or | |
15 | * (at your option) any later version. | |
16 | * | |
17 | * This program is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | * GNU General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU General Public License | |
23 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
24 | */ | |
25 | ||
26 | #include "qemu/osdep.h" | |
27 | ||
28 | #include "sysemu/block-backend.h" | |
29 | #include "qemu/cutils.h" | |
30 | #include "qapi/error.h" | |
31 | #include "block/block_int.h" | |
32 | #include "block/qdict.h" | |
33 | #include "block/block-copy.h" | |
34 | ||
d003e0ae | 35 | #include "block/copy-before-write.h" |
af5bcd77 | 36 | #include "block/reqlist.h" |
7df7868b | 37 | |
5f3a3cd7 VSO |
38 | #include "qapi/qapi-visit-block-core.h" |
39 | ||
d003e0ae | 40 | typedef struct BDRVCopyBeforeWriteState { |
7df7868b VSO |
41 | BlockCopyState *bcs; |
42 | BdrvChild *target; | |
af5bcd77 VSO |
43 | |
44 | /* | |
45 | * @lock: protects access to @access_bitmap, @done_bitmap and | |
46 | * @frozen_read_reqs | |
47 | */ | |
48 | CoMutex lock; | |
49 | ||
50 | /* | |
51 | * @access_bitmap: represents areas allowed for reading by fleecing user. | |
52 | * Reading from non-dirty areas leads to -EACCES. | |
53 | */ | |
54 | BdrvDirtyBitmap *access_bitmap; | |
55 | ||
56 | /* | |
57 | * @done_bitmap: represents areas that was successfully copied to @target by | |
58 | * copy-before-write operations. | |
59 | */ | |
60 | BdrvDirtyBitmap *done_bitmap; | |
61 | ||
62 | /* | |
63 | * @frozen_read_reqs: current read requests for fleecing user in bs->file | |
64 | * node. These areas must not be rewritten by guest. | |
65 | */ | |
66 | BlockReqList frozen_read_reqs; | |
d003e0ae | 67 | } BDRVCopyBeforeWriteState; |
7df7868b | 68 | |
d003e0ae | 69 | static coroutine_fn int cbw_co_preadv( |
f7ef38dd VSO |
70 | BlockDriverState *bs, int64_t offset, int64_t bytes, |
71 | QEMUIOVector *qiov, BdrvRequestFlags flags) | |
7df7868b | 72 | { |
3c1e6327 | 73 | return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); |
7df7868b VSO |
74 | } |
75 | ||
af5bcd77 VSO |
76 | /* |
77 | * Do copy-before-write operation. | |
78 | * | |
79 | * On failure guest request must be failed too. | |
80 | * | |
81 | * On success, we also wait for all in-flight fleecing read requests in source | |
82 | * node, and it's guaranteed that after cbw_do_copy_before_write() successful | |
83 | * return there are no such requests and they will never appear. | |
84 | */ | |
d003e0ae VSO |
85 | static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs, |
86 | uint64_t offset, uint64_t bytes, BdrvRequestFlags flags) | |
7df7868b | 87 | { |
d003e0ae | 88 | BDRVCopyBeforeWriteState *s = bs->opaque; |
af5bcd77 | 89 | int ret; |
4bc267a7 | 90 | uint64_t off, end; |
b518e9e9 | 91 | int64_t cluster_size = block_copy_cluster_size(s->bcs); |
4bc267a7 VSO |
92 | |
93 | if (flags & BDRV_REQ_WRITE_UNCHANGED) { | |
94 | return 0; | |
95 | } | |
96 | ||
b518e9e9 VSO |
97 | off = QEMU_ALIGN_DOWN(offset, cluster_size); |
98 | end = QEMU_ALIGN_UP(offset + bytes, cluster_size); | |
7df7868b | 99 | |
af5bcd77 VSO |
100 | ret = block_copy(s->bcs, off, end - off, true); |
101 | if (ret < 0) { | |
102 | return ret; | |
103 | } | |
104 | ||
105 | WITH_QEMU_LOCK_GUARD(&s->lock) { | |
106 | bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off); | |
107 | reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock); | |
108 | } | |
109 | ||
110 | return 0; | |
7df7868b VSO |
111 | } |
112 | ||
d003e0ae | 113 | static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs, |
0c802287 | 114 | int64_t offset, int64_t bytes) |
7df7868b | 115 | { |
d003e0ae | 116 | int ret = cbw_do_copy_before_write(bs, offset, bytes, 0); |
7df7868b VSO |
117 | if (ret < 0) { |
118 | return ret; | |
119 | } | |
120 | ||
3c1e6327 | 121 | return bdrv_co_pdiscard(bs->file, offset, bytes); |
7df7868b VSO |
122 | } |
123 | ||
d003e0ae | 124 | static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs, |
f34b2bcf | 125 | int64_t offset, int64_t bytes, BdrvRequestFlags flags) |
7df7868b | 126 | { |
d003e0ae | 127 | int ret = cbw_do_copy_before_write(bs, offset, bytes, flags); |
7df7868b VSO |
128 | if (ret < 0) { |
129 | return ret; | |
130 | } | |
131 | ||
3c1e6327 | 132 | return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); |
7df7868b VSO |
133 | } |
134 | ||
d003e0ae | 135 | static coroutine_fn int cbw_co_pwritev(BlockDriverState *bs, |
e75abeda VSO |
136 | int64_t offset, |
137 | int64_t bytes, | |
138 | QEMUIOVector *qiov, | |
139 | BdrvRequestFlags flags) | |
7df7868b | 140 | { |
d003e0ae | 141 | int ret = cbw_do_copy_before_write(bs, offset, bytes, flags); |
4bc267a7 VSO |
142 | if (ret < 0) { |
143 | return ret; | |
7df7868b VSO |
144 | } |
145 | ||
3c1e6327 | 146 | return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); |
7df7868b VSO |
147 | } |
148 | ||
d003e0ae | 149 | static int coroutine_fn cbw_co_flush(BlockDriverState *bs) |
7df7868b | 150 | { |
3c1e6327 | 151 | if (!bs->file) { |
7df7868b VSO |
152 | return 0; |
153 | } | |
154 | ||
3c1e6327 | 155 | return bdrv_co_flush(bs->file->bs); |
7df7868b VSO |
156 | } |
157 | ||
af5bcd77 VSO |
158 | /* |
159 | * If @offset not accessible - return NULL. | |
160 | * | |
161 | * Otherwise, set @pnum to some bytes that accessible from @file (@file is set | |
162 | * to bs->file or to s->target). Return newly allocated BlockReq object that | |
163 | * should be than passed to cbw_snapshot_read_unlock(). | |
164 | * | |
165 | * It's guaranteed that guest writes will not interact in the region until | |
166 | * cbw_snapshot_read_unlock() called. | |
167 | */ | |
168 | static BlockReq *cbw_snapshot_read_lock(BlockDriverState *bs, | |
169 | int64_t offset, int64_t bytes, | |
170 | int64_t *pnum, BdrvChild **file) | |
171 | { | |
172 | BDRVCopyBeforeWriteState *s = bs->opaque; | |
173 | BlockReq *req = g_new(BlockReq, 1); | |
174 | bool done; | |
175 | ||
176 | QEMU_LOCK_GUARD(&s->lock); | |
177 | ||
178 | if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) { | |
179 | g_free(req); | |
180 | return NULL; | |
181 | } | |
182 | ||
183 | done = bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, pnum); | |
184 | if (done) { | |
185 | /* | |
186 | * Special invalid BlockReq, that is handled in | |
187 | * cbw_snapshot_read_unlock(). We don't need to lock something to read | |
188 | * from s->target. | |
189 | */ | |
190 | *req = (BlockReq) {.offset = -1, .bytes = -1}; | |
191 | *file = s->target; | |
192 | } else { | |
193 | reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes); | |
194 | *file = bs->file; | |
195 | } | |
196 | ||
197 | return req; | |
198 | } | |
199 | ||
200 | static void cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req) | |
201 | { | |
202 | BDRVCopyBeforeWriteState *s = bs->opaque; | |
203 | ||
204 | if (req->offset == -1 && req->bytes == -1) { | |
205 | g_free(req); | |
206 | return; | |
207 | } | |
208 | ||
209 | QEMU_LOCK_GUARD(&s->lock); | |
210 | ||
211 | reqlist_remove_req(req); | |
212 | g_free(req); | |
213 | } | |
214 | ||
215 | static coroutine_fn int | |
216 | cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes, | |
217 | QEMUIOVector *qiov, size_t qiov_offset) | |
218 | { | |
219 | BlockReq *req; | |
220 | BdrvChild *file; | |
221 | int ret; | |
222 | ||
223 | /* TODO: upgrade to async loop using AioTask */ | |
224 | while (bytes) { | |
225 | int64_t cur_bytes; | |
226 | ||
227 | req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &file); | |
228 | if (!req) { | |
229 | return -EACCES; | |
230 | } | |
231 | ||
232 | ret = bdrv_co_preadv_part(file, offset, cur_bytes, | |
233 | qiov, qiov_offset, 0); | |
234 | cbw_snapshot_read_unlock(bs, req); | |
235 | if (ret < 0) { | |
236 | return ret; | |
237 | } | |
238 | ||
239 | bytes -= cur_bytes; | |
240 | offset += cur_bytes; | |
241 | qiov_offset += cur_bytes; | |
242 | } | |
243 | ||
244 | return 0; | |
245 | } | |
246 | ||
247 | static int coroutine_fn | |
248 | cbw_co_snapshot_block_status(BlockDriverState *bs, | |
249 | bool want_zero, int64_t offset, int64_t bytes, | |
250 | int64_t *pnum, int64_t *map, | |
251 | BlockDriverState **file) | |
252 | { | |
253 | BDRVCopyBeforeWriteState *s = bs->opaque; | |
254 | BlockReq *req; | |
255 | int ret; | |
256 | int64_t cur_bytes; | |
257 | BdrvChild *child; | |
258 | ||
259 | req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &child); | |
260 | if (!req) { | |
261 | return -EACCES; | |
262 | } | |
263 | ||
264 | ret = bdrv_block_status(child->bs, offset, cur_bytes, pnum, map, file); | |
265 | if (child == s->target) { | |
266 | /* | |
267 | * We refer to s->target only for areas that we've written to it. | |
268 | * And we can not report unallocated blocks in s->target: this will | |
269 | * break generic block-status-above logic, that will go to | |
270 | * copy-before-write filtered child in this case. | |
271 | */ | |
272 | assert(ret & BDRV_BLOCK_ALLOCATED); | |
273 | } | |
274 | ||
275 | cbw_snapshot_read_unlock(bs, req); | |
276 | ||
277 | return ret; | |
278 | } | |
279 | ||
280 | static int coroutine_fn cbw_co_pdiscard_snapshot(BlockDriverState *bs, | |
281 | int64_t offset, int64_t bytes) | |
282 | { | |
283 | BDRVCopyBeforeWriteState *s = bs->opaque; | |
284 | ||
285 | WITH_QEMU_LOCK_GUARD(&s->lock) { | |
286 | bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes); | |
287 | } | |
288 | ||
289 | block_copy_reset(s->bcs, offset, bytes); | |
290 | ||
291 | return bdrv_co_pdiscard(s->target, offset, bytes); | |
292 | } | |
293 | ||
d003e0ae | 294 | static void cbw_refresh_filename(BlockDriverState *bs) |
7df7868b | 295 | { |
7df7868b | 296 | pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), |
3c1e6327 | 297 | bs->file->bs->filename); |
7df7868b VSO |
298 | } |
299 | ||
d003e0ae VSO |
300 | static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c, |
301 | BdrvChildRole role, | |
302 | BlockReopenQueue *reopen_queue, | |
303 | uint64_t perm, uint64_t shared, | |
304 | uint64_t *nperm, uint64_t *nshared) | |
7df7868b | 305 | { |
25191e5f | 306 | if (!(role & BDRV_CHILD_FILTERED)) { |
7df7868b VSO |
307 | /* |
308 | * Target child | |
309 | * | |
310 | * Share write to target (child_file), to not interfere | |
311 | * with guest writes to its disk which may be in target backing chain. | |
958a04bd KW |
312 | * Can't resize during a backup block job because we check the size |
313 | * only upfront. | |
7df7868b | 314 | */ |
958a04bd | 315 | *nshared = BLK_PERM_ALL & ~BLK_PERM_RESIZE; |
7df7868b VSO |
316 | *nperm = BLK_PERM_WRITE; |
317 | } else { | |
318 | /* Source child */ | |
e5d8a406 | 319 | bdrv_default_perms(bs, c, role, reopen_queue, |
69dca43d | 320 | perm, shared, nperm, nshared); |
7df7868b | 321 | |
3860c020 VSO |
322 | if (!QLIST_EMPTY(&bs->parents)) { |
323 | if (perm & BLK_PERM_WRITE) { | |
324 | *nperm = *nperm | BLK_PERM_CONSISTENT_READ; | |
325 | } | |
326 | *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); | |
7df7868b | 327 | } |
7df7868b VSO |
328 | } |
329 | } | |
330 | ||
5f3a3cd7 VSO |
331 | static bool cbw_parse_bitmap_option(QDict *options, BdrvDirtyBitmap **bitmap, |
332 | Error **errp) | |
333 | { | |
334 | QDict *bitmap_qdict = NULL; | |
335 | BlockDirtyBitmap *bmp_param = NULL; | |
336 | Visitor *v = NULL; | |
337 | bool ret = false; | |
338 | ||
339 | *bitmap = NULL; | |
340 | ||
341 | qdict_extract_subqdict(options, &bitmap_qdict, "bitmap."); | |
342 | if (!qdict_size(bitmap_qdict)) { | |
343 | ret = true; | |
344 | goto out; | |
345 | } | |
346 | ||
347 | v = qobject_input_visitor_new_flat_confused(bitmap_qdict, errp); | |
348 | if (!v) { | |
349 | goto out; | |
350 | } | |
351 | ||
352 | visit_type_BlockDirtyBitmap(v, NULL, &bmp_param, errp); | |
353 | if (!bmp_param) { | |
354 | goto out; | |
355 | } | |
356 | ||
357 | *bitmap = block_dirty_bitmap_lookup(bmp_param->node, bmp_param->name, NULL, | |
358 | errp); | |
359 | if (!*bitmap) { | |
360 | goto out; | |
361 | } | |
362 | ||
363 | ret = true; | |
364 | ||
365 | out: | |
366 | qapi_free_BlockDirtyBitmap(bmp_param); | |
367 | visit_free(v); | |
368 | qobject_unref(bitmap_qdict); | |
369 | ||
370 | return ret; | |
371 | } | |
372 | ||
751cec7a VSO |
373 | static int cbw_open(BlockDriverState *bs, QDict *options, int flags, |
374 | Error **errp) | |
1f0cacb9 | 375 | { |
fe7ea40c | 376 | BDRVCopyBeforeWriteState *s = bs->opaque; |
5f3a3cd7 | 377 | BdrvDirtyBitmap *bitmap = NULL; |
af5bcd77 | 378 | int64_t cluster_size; |
1f0cacb9 | 379 | |
f44fd739 VSO |
380 | bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, |
381 | BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, | |
382 | false, errp); | |
383 | if (!bs->file) { | |
1f0cacb9 VSO |
384 | return -EINVAL; |
385 | } | |
386 | ||
f44fd739 VSO |
387 | s->target = bdrv_open_child(NULL, options, "target", bs, &child_of_bds, |
388 | BDRV_CHILD_DATA, false, errp); | |
389 | if (!s->target) { | |
1f0cacb9 VSO |
390 | return -EINVAL; |
391 | } | |
392 | ||
5f3a3cd7 VSO |
393 | if (!cbw_parse_bitmap_option(options, &bitmap, errp)) { |
394 | return -EINVAL; | |
395 | } | |
396 | ||
5a507426 VSO |
397 | bs->total_sectors = bs->file->bs->total_sectors; |
398 | bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | | |
399 | (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); | |
400 | bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | | |
401 | ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & | |
402 | bs->file->bs->supported_zero_flags); | |
403 | ||
5f3a3cd7 | 404 | s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp); |
fe7ea40c | 405 | if (!s->bcs) { |
1f0cacb9 VSO |
406 | error_prepend(errp, "Cannot create block-copy-state: "); |
407 | return -EINVAL; | |
408 | } | |
409 | ||
af5bcd77 VSO |
410 | cluster_size = block_copy_cluster_size(s->bcs); |
411 | ||
412 | s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp); | |
413 | if (!s->done_bitmap) { | |
414 | return -EINVAL; | |
415 | } | |
416 | bdrv_disable_dirty_bitmap(s->done_bitmap); | |
417 | ||
418 | /* s->access_bitmap starts equal to bcs bitmap */ | |
419 | s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp); | |
420 | if (!s->access_bitmap) { | |
421 | return -EINVAL; | |
422 | } | |
423 | bdrv_disable_dirty_bitmap(s->access_bitmap); | |
424 | bdrv_dirty_bitmap_merge_internal(s->access_bitmap, | |
425 | block_copy_dirty_bitmap(s->bcs), NULL, | |
426 | true); | |
427 | ||
428 | qemu_co_mutex_init(&s->lock); | |
429 | QLIST_INIT(&s->frozen_read_reqs); | |
430 | ||
1f0cacb9 VSO |
431 | return 0; |
432 | } | |
433 | ||
751cec7a VSO |
434 | static void cbw_close(BlockDriverState *bs) |
435 | { | |
436 | BDRVCopyBeforeWriteState *s = bs->opaque; | |
437 | ||
af5bcd77 VSO |
438 | bdrv_release_dirty_bitmap(s->access_bitmap); |
439 | bdrv_release_dirty_bitmap(s->done_bitmap); | |
440 | ||
751cec7a VSO |
441 | block_copy_state_free(s->bcs); |
442 | s->bcs = NULL; | |
443 | } | |
444 | ||
d003e0ae VSO |
445 | BlockDriver bdrv_cbw_filter = { |
446 | .format_name = "copy-before-write", | |
447 | .instance_size = sizeof(BDRVCopyBeforeWriteState), | |
7df7868b | 448 | |
751cec7a VSO |
449 | .bdrv_open = cbw_open, |
450 | .bdrv_close = cbw_close, | |
451 | ||
d003e0ae VSO |
452 | .bdrv_co_preadv = cbw_co_preadv, |
453 | .bdrv_co_pwritev = cbw_co_pwritev, | |
454 | .bdrv_co_pwrite_zeroes = cbw_co_pwrite_zeroes, | |
455 | .bdrv_co_pdiscard = cbw_co_pdiscard, | |
456 | .bdrv_co_flush = cbw_co_flush, | |
7df7868b | 457 | |
af5bcd77 VSO |
458 | .bdrv_co_preadv_snapshot = cbw_co_preadv_snapshot, |
459 | .bdrv_co_pdiscard_snapshot = cbw_co_pdiscard_snapshot, | |
460 | .bdrv_co_snapshot_block_status = cbw_co_snapshot_block_status, | |
461 | ||
d003e0ae | 462 | .bdrv_refresh_filename = cbw_refresh_filename, |
7df7868b | 463 | |
d003e0ae | 464 | .bdrv_child_perm = cbw_child_perm, |
7df7868b VSO |
465 | |
466 | .is_filter = true, | |
467 | }; | |
468 | ||
d003e0ae VSO |
469 | BlockDriverState *bdrv_cbw_append(BlockDriverState *source, |
470 | BlockDriverState *target, | |
471 | const char *filter_node_name, | |
d003e0ae VSO |
472 | BlockCopyState **bcs, |
473 | Error **errp) | |
7df7868b | 474 | { |
934aee14 | 475 | ERRP_GUARD(); |
d003e0ae | 476 | BDRVCopyBeforeWriteState *state; |
958a04bd | 477 | BlockDriverState *top; |
f44fd739 | 478 | QDict *opts; |
7df7868b | 479 | |
958a04bd | 480 | assert(source->total_sectors == target->total_sectors); |
377cc15b | 481 | GLOBAL_STATE_CODE(); |
958a04bd | 482 | |
f44fd739 | 483 | opts = qdict_new(); |
751cec7a VSO |
484 | qdict_put_str(opts, "driver", "copy-before-write"); |
485 | if (filter_node_name) { | |
486 | qdict_put_str(opts, "node-name", filter_node_name); | |
487 | } | |
f44fd739 VSO |
488 | qdict_put_str(opts, "file", bdrv_get_node_name(source)); |
489 | qdict_put_str(opts, "target", bdrv_get_node_name(target)); | |
490 | ||
751cec7a VSO |
491 | top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp); |
492 | if (!top) { | |
493 | return NULL; | |
7ddbce2d VSO |
494 | } |
495 | ||
751cec7a | 496 | state = top->opaque; |
7ddbce2d | 497 | *bcs = state->bcs; |
7df7868b VSO |
498 | |
499 | return top; | |
7df7868b VSO |
500 | } |
501 | ||
d003e0ae | 502 | void bdrv_cbw_drop(BlockDriverState *bs) |
7df7868b | 503 | { |
377cc15b | 504 | GLOBAL_STATE_CODE(); |
b75d64b3 | 505 | bdrv_drop_filter(bs, &error_abort); |
7df7868b | 506 | bdrv_unref(bs); |
7df7868b | 507 | } |
751cec7a VSO |
508 | |
509 | static void cbw_init(void) | |
510 | { | |
511 | bdrv_register(&bdrv_cbw_filter); | |
512 | } | |
513 | ||
514 | block_init(cbw_init); |