]>
Commit | Line | Data |
---|---|---|
33fa2222 VSO |
1 | /* |
2 | * preallocate filter driver | |
3 | * | |
4 | * The driver performs preallocate operation: it is injected above | |
5 | * some node, and before each write over EOF it does additional preallocating | |
6 | * write-zeroes request. | |
7 | * | |
8 | * Copyright (c) 2020 Virtuozzo International GmbH. | |
9 | * | |
10 | * Author: | |
11 | * Sementsov-Ogievskiy Vladimir <[email protected]> | |
12 | * | |
13 | * This program is free software; you can redistribute it and/or modify | |
14 | * it under the terms of the GNU General Public License as published by | |
15 | * the Free Software Foundation; either version 2 of the License, or | |
16 | * (at your option) any later version. | |
17 | * | |
18 | * This program is distributed in the hope that it will be useful, | |
19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | * GNU General Public License for more details. | |
22 | * | |
23 | * You should have received a copy of the GNU General Public License | |
24 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
25 | */ | |
26 | ||
27 | #include "qemu/osdep.h" | |
28 | ||
29 | #include "qapi/error.h" | |
30 | #include "qemu/module.h" | |
31 | #include "qemu/option.h" | |
32 | #include "qemu/units.h" | |
33 | #include "block/block_int.h" | |
34 | ||
35 | ||
/* User-tunable parameters of the preallocate filter. */
typedef struct PreallocateOpts {
    /* Extra bytes to preallocate past the end of an over-EOF write. */
    int64_t prealloc_size;
    /* The end of a preallocated area is rounded up to this many bytes. */
    int64_t prealloc_align;
} PreallocateOpts;
40 | ||
41 | typedef struct BDRVPreallocateState { | |
42 | PreallocateOpts opts; | |
43 | ||
44 | /* | |
45 | * Track real data end, to crop preallocation on close. If < 0 the status is | |
46 | * unknown. | |
47 | * | |
48 | * @data_end is a maximum of file size on open (or when we get write/resize | |
49 | * permissions) and all write request ends after it. So it's safe to | |
50 | * truncate to data_end if it is valid. | |
51 | */ | |
52 | int64_t data_end; | |
53 | ||
54 | /* | |
55 | * Start of trailing preallocated area which reads as zero. May be smaller | |
56 | * than data_end, if user does over-EOF write zero operation. If < 0 the | |
57 | * status is unknown. | |
58 | * | |
59 | * If both @zero_start and @file_end are valid, the region | |
60 | * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end | |
61 | * is not valid, @zero_start doesn't make much sense. | |
62 | */ | |
63 | int64_t zero_start; | |
64 | ||
65 | /* | |
66 | * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs), | |
67 | * to avoid extra lseek() calls on each write operation. If < 0 the status | |
68 | * is unknown. | |
69 | */ | |
70 | int64_t file_end; | |
71 | ||
72 | /* | |
73 | * All three states @data_end, @zero_start and @file_end are guaranteed to | |
74 | * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and | |
75 | * BLK_PERM_WRITE permissions on file child. | |
76 | */ | |
77 | } BDRVPreallocateState; | |
78 | ||
79 | #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align" | |
80 | #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size" | |
81 | static QemuOptsList runtime_opts = { | |
82 | .name = "preallocate", | |
83 | .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), | |
84 | .desc = { | |
85 | { | |
86 | .name = PREALLOCATE_OPT_PREALLOC_ALIGN, | |
87 | .type = QEMU_OPT_SIZE, | |
88 | .help = "on preallocation, align file length to this number, " | |
89 | "default 1M", | |
90 | }, | |
91 | { | |
92 | .name = PREALLOCATE_OPT_PREALLOC_SIZE, | |
93 | .type = QEMU_OPT_SIZE, | |
94 | .help = "how much to preallocate, default 128M", | |
95 | }, | |
96 | { /* end of list */ } | |
97 | }, | |
98 | }; | |
99 | ||
100 | static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options, | |
101 | BlockDriverState *child_bs, Error **errp) | |
102 | { | |
103 | QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); | |
104 | ||
105 | if (!qemu_opts_absorb_qdict(opts, options, errp)) { | |
106 | return false; | |
107 | } | |
108 | ||
109 | dest->prealloc_align = | |
110 | qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB); | |
111 | dest->prealloc_size = | |
112 | qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB); | |
113 | ||
114 | qemu_opts_del(opts); | |
115 | ||
116 | if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) { | |
117 | error_setg(errp, "prealloc-align parameter of preallocate filter " | |
118 | "is not aligned to %llu", BDRV_SECTOR_SIZE); | |
119 | return false; | |
120 | } | |
121 | ||
122 | if (!QEMU_IS_ALIGNED(dest->prealloc_align, | |
123 | child_bs->bl.request_alignment)) { | |
124 | error_setg(errp, "prealloc-align parameter of preallocate filter " | |
125 | "is not aligned to underlying node request alignment " | |
126 | "(%" PRIi32 ")", child_bs->bl.request_alignment); | |
127 | return false; | |
128 | } | |
129 | ||
130 | return true; | |
131 | } | |
132 | ||
133 | static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, | |
134 | Error **errp) | |
135 | { | |
136 | BDRVPreallocateState *s = bs->opaque; | |
137 | ||
138 | /* | |
139 | * s->data_end and friends should be initialized on permission update. | |
140 | * For this to work, mark them invalid. | |
141 | */ | |
142 | s->file_end = s->zero_start = s->data_end = -EINVAL; | |
143 | ||
144 | bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, | |
145 | BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, | |
146 | false, errp); | |
147 | if (!bs->file) { | |
148 | return -EINVAL; | |
149 | } | |
150 | ||
151 | if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) { | |
152 | return -EINVAL; | |
153 | } | |
154 | ||
155 | bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | | |
156 | (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); | |
157 | ||
158 | bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | | |
159 | ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & | |
160 | bs->file->bs->supported_zero_flags); | |
161 | ||
162 | return 0; | |
163 | } | |
164 | ||
165 | static void preallocate_close(BlockDriverState *bs) | |
166 | { | |
167 | int ret; | |
168 | BDRVPreallocateState *s = bs->opaque; | |
169 | ||
170 | if (s->data_end < 0) { | |
171 | return; | |
172 | } | |
173 | ||
174 | if (s->file_end < 0) { | |
175 | s->file_end = bdrv_getlength(bs->file->bs); | |
176 | if (s->file_end < 0) { | |
177 | return; | |
178 | } | |
179 | } | |
180 | ||
181 | if (s->data_end < s->file_end) { | |
182 | ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, | |
183 | NULL); | |
184 | s->file_end = ret < 0 ? ret : s->data_end; | |
185 | } | |
186 | } | |
187 | ||
188 | ||
/*
 * Handle reopen.
 *
 * We must implement the reopen handlers, otherwise reopen simply doesn't
 * work. They only handle the new options and don't care about the
 * preallocation state, as that is handled in the set/check permission
 * handlers.
 */
196 | ||
197 | static int preallocate_reopen_prepare(BDRVReopenState *reopen_state, | |
198 | BlockReopenQueue *queue, Error **errp) | |
199 | { | |
200 | PreallocateOpts *opts = g_new0(PreallocateOpts, 1); | |
201 | ||
202 | if (!preallocate_absorb_opts(opts, reopen_state->options, | |
203 | reopen_state->bs->file->bs, errp)) { | |
204 | g_free(opts); | |
205 | return -EINVAL; | |
206 | } | |
207 | ||
208 | reopen_state->opaque = opts; | |
209 | ||
210 | return 0; | |
211 | } | |
212 | ||
213 | static void preallocate_reopen_commit(BDRVReopenState *state) | |
214 | { | |
215 | BDRVPreallocateState *s = state->bs->opaque; | |
216 | ||
217 | s->opts = *(PreallocateOpts *)state->opaque; | |
218 | ||
219 | g_free(state->opaque); | |
220 | state->opaque = NULL; | |
221 | } | |
222 | ||
223 | static void preallocate_reopen_abort(BDRVReopenState *state) | |
224 | { | |
225 | g_free(state->opaque); | |
226 | state->opaque = NULL; | |
227 | } | |
228 | ||
229 | static coroutine_fn int preallocate_co_preadv_part( | |
230 | BlockDriverState *bs, uint64_t offset, uint64_t bytes, | |
231 | QEMUIOVector *qiov, size_t qiov_offset, int flags) | |
232 | { | |
233 | return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, | |
234 | flags); | |
235 | } | |
236 | ||
237 | static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs, | |
238 | int64_t offset, int bytes) | |
239 | { | |
240 | return bdrv_co_pdiscard(bs->file, offset, bytes); | |
241 | } | |
242 | ||
243 | static bool can_write_resize(uint64_t perm) | |
244 | { | |
245 | return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE); | |
246 | } | |
247 | ||
248 | static bool has_prealloc_perms(BlockDriverState *bs) | |
249 | { | |
250 | BDRVPreallocateState *s = bs->opaque; | |
251 | ||
252 | if (can_write_resize(bs->file->perm)) { | |
253 | assert(!(bs->file->shared_perm & BLK_PERM_WRITE)); | |
254 | assert(!(bs->file->shared_perm & BLK_PERM_RESIZE)); | |
255 | return true; | |
256 | } | |
257 | ||
258 | assert(s->data_end < 0); | |
259 | assert(s->zero_start < 0); | |
260 | assert(s->file_end < 0); | |
261 | return false; | |
262 | } | |
263 | ||
264 | /* | |
265 | * Call on each write. Returns true if @want_merge_zero is true and the region | |
266 | * [offset, offset + bytes) is zeroed (as a result of this call or earlier | |
267 | * preallocation). | |
268 | * | |
269 | * want_merge_zero is used to merge write-zero request with preallocation in | |
270 | * one bdrv_co_pwrite_zeroes() call. | |
271 | */ | |
272 | static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset, | |
273 | int64_t bytes, bool want_merge_zero) | |
274 | { | |
275 | BDRVPreallocateState *s = bs->opaque; | |
276 | int64_t end = offset + bytes; | |
277 | int64_t prealloc_start, prealloc_end; | |
278 | int ret; | |
279 | ||
280 | if (!has_prealloc_perms(bs)) { | |
281 | /* We don't have state neither should try to recover it */ | |
282 | return false; | |
283 | } | |
284 | ||
285 | if (s->data_end < 0) { | |
286 | s->data_end = bdrv_getlength(bs->file->bs); | |
287 | if (s->data_end < 0) { | |
288 | return false; | |
289 | } | |
290 | ||
291 | if (s->file_end < 0) { | |
292 | s->file_end = s->data_end; | |
293 | } | |
294 | } | |
295 | ||
296 | if (end <= s->data_end) { | |
297 | return false; | |
298 | } | |
299 | ||
300 | /* We have valid s->data_end, and request writes beyond it. */ | |
301 | ||
302 | s->data_end = end; | |
303 | if (s->zero_start < 0 || !want_merge_zero) { | |
304 | s->zero_start = end; | |
305 | } | |
306 | ||
307 | if (s->file_end < 0) { | |
308 | s->file_end = bdrv_getlength(bs->file->bs); | |
309 | if (s->file_end < 0) { | |
310 | return false; | |
311 | } | |
312 | } | |
313 | ||
314 | /* Now s->data_end, s->zero_start and s->file_end are valid. */ | |
315 | ||
316 | if (end <= s->file_end) { | |
317 | /* No preallocation needed. */ | |
318 | return want_merge_zero && offset >= s->zero_start; | |
319 | } | |
320 | ||
321 | /* Now we want new preallocation, as request writes beyond s->file_end. */ | |
322 | ||
323 | prealloc_start = want_merge_zero ? MIN(offset, s->file_end) : s->file_end; | |
324 | prealloc_end = QEMU_ALIGN_UP(end + s->opts.prealloc_size, | |
325 | s->opts.prealloc_align); | |
326 | ||
327 | ret = bdrv_co_pwrite_zeroes( | |
328 | bs->file, prealloc_start, prealloc_end - prealloc_start, | |
329 | BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT); | |
330 | if (ret < 0) { | |
331 | s->file_end = ret; | |
332 | return false; | |
333 | } | |
334 | ||
335 | s->file_end = prealloc_end; | |
336 | return want_merge_zero; | |
337 | } | |
338 | ||
339 | static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs, | |
340 | int64_t offset, int bytes, BdrvRequestFlags flags) | |
341 | { | |
342 | bool want_merge_zero = | |
343 | !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK)); | |
344 | if (handle_write(bs, offset, bytes, want_merge_zero)) { | |
345 | return 0; | |
346 | } | |
347 | ||
348 | return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); | |
349 | } | |
350 | ||
351 | static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs, | |
352 | uint64_t offset, | |
353 | uint64_t bytes, | |
354 | QEMUIOVector *qiov, | |
355 | size_t qiov_offset, | |
356 | int flags) | |
357 | { | |
358 | handle_write(bs, offset, bytes, false); | |
359 | ||
360 | return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, | |
361 | flags); | |
362 | } | |
363 | ||
364 | static int coroutine_fn | |
365 | preallocate_co_truncate(BlockDriverState *bs, int64_t offset, | |
366 | bool exact, PreallocMode prealloc, | |
367 | BdrvRequestFlags flags, Error **errp) | |
368 | { | |
369 | ERRP_GUARD(); | |
370 | BDRVPreallocateState *s = bs->opaque; | |
371 | int ret; | |
372 | ||
373 | if (s->data_end >= 0 && offset > s->data_end) { | |
374 | if (s->file_end < 0) { | |
375 | s->file_end = bdrv_getlength(bs->file->bs); | |
376 | if (s->file_end < 0) { | |
377 | error_setg(errp, "failed to get file length"); | |
378 | return s->file_end; | |
379 | } | |
380 | } | |
381 | ||
382 | if (prealloc == PREALLOC_MODE_FALLOC) { | |
383 | /* | |
384 | * If offset <= s->file_end, the task is already done, just | |
385 | * update s->data_end, to move part of "filter preallocation" | |
386 | * to "preallocation requested by user". | |
387 | * Otherwise just proceed to preallocate missing part. | |
388 | */ | |
389 | if (offset <= s->file_end) { | |
390 | s->data_end = offset; | |
391 | return 0; | |
392 | } | |
393 | } else { | |
394 | /* | |
395 | * We have to drop our preallocation, to | |
396 | * - avoid "Cannot use preallocation for shrinking files" in | |
397 | * case of offset < file_end | |
398 | * - give PREALLOC_MODE_OFF a chance to keep small disk | |
399 | * usage | |
400 | * - give PREALLOC_MODE_FULL a chance to actually write the | |
401 | * whole region as user expects | |
402 | */ | |
403 | if (s->file_end > s->data_end) { | |
404 | ret = bdrv_co_truncate(bs->file, s->data_end, true, | |
405 | PREALLOC_MODE_OFF, 0, errp); | |
406 | if (ret < 0) { | |
407 | s->file_end = ret; | |
408 | error_prepend(errp, "preallocate-filter: failed to drop " | |
409 | "write-zero preallocation: "); | |
410 | return ret; | |
411 | } | |
412 | s->file_end = s->data_end; | |
413 | } | |
414 | } | |
415 | ||
416 | s->data_end = offset; | |
417 | } | |
418 | ||
419 | ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); | |
420 | if (ret < 0) { | |
421 | s->file_end = s->zero_start = s->data_end = ret; | |
422 | return ret; | |
423 | } | |
424 | ||
425 | if (has_prealloc_perms(bs)) { | |
426 | s->file_end = s->zero_start = s->data_end = offset; | |
427 | } | |
428 | return 0; | |
429 | } | |
430 | ||
431 | static int coroutine_fn preallocate_co_flush(BlockDriverState *bs) | |
432 | { | |
433 | return bdrv_co_flush(bs->file->bs); | |
434 | } | |
435 | ||
436 | static int64_t preallocate_getlength(BlockDriverState *bs) | |
437 | { | |
438 | int64_t ret; | |
439 | BDRVPreallocateState *s = bs->opaque; | |
440 | ||
441 | if (s->data_end >= 0) { | |
442 | return s->data_end; | |
443 | } | |
444 | ||
445 | ret = bdrv_getlength(bs->file->bs); | |
446 | ||
447 | if (has_prealloc_perms(bs)) { | |
448 | s->file_end = s->zero_start = s->data_end = ret; | |
449 | } | |
450 | ||
451 | return ret; | |
452 | } | |
453 | ||
454 | static int preallocate_check_perm(BlockDriverState *bs, | |
455 | uint64_t perm, uint64_t shared, Error **errp) | |
456 | { | |
457 | BDRVPreallocateState *s = bs->opaque; | |
458 | ||
459 | if (s->data_end >= 0 && !can_write_resize(perm)) { | |
460 | /* | |
461 | * Lose permissions. | |
462 | * We should truncate in check_perm, as in set_perm bs->file->perm will | |
463 | * be already changed, and we should not violate it. | |
464 | */ | |
465 | if (s->file_end < 0) { | |
466 | s->file_end = bdrv_getlength(bs->file->bs); | |
467 | if (s->file_end < 0) { | |
468 | error_setg(errp, "Failed to get file length"); | |
469 | return s->file_end; | |
470 | } | |
471 | } | |
472 | ||
473 | if (s->data_end < s->file_end) { | |
474 | int ret = bdrv_truncate(bs->file, s->data_end, true, | |
475 | PREALLOC_MODE_OFF, 0, NULL); | |
476 | if (ret < 0) { | |
477 | error_setg(errp, "Failed to drop preallocation"); | |
478 | s->file_end = ret; | |
479 | return ret; | |
480 | } | |
481 | s->file_end = s->data_end; | |
482 | } | |
483 | } | |
484 | ||
485 | return 0; | |
486 | } | |
487 | ||
488 | static void preallocate_set_perm(BlockDriverState *bs, | |
489 | uint64_t perm, uint64_t shared) | |
490 | { | |
491 | BDRVPreallocateState *s = bs->opaque; | |
492 | ||
493 | if (can_write_resize(perm)) { | |
494 | if (s->data_end < 0) { | |
495 | s->data_end = s->file_end = s->zero_start = | |
496 | bdrv_getlength(bs->file->bs); | |
497 | } | |
498 | } else { | |
499 | /* | |
500 | * We drop our permissions, as well as allow shared | |
501 | * permissions (see preallocate_child_perm), anyone will be able to | |
502 | * change the child, so mark all states invalid. We'll regain control if | |
503 | * get good permissions back. | |
504 | */ | |
505 | s->data_end = s->file_end = s->zero_start = -EINVAL; | |
506 | } | |
507 | } | |
508 | ||
509 | static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, | |
510 | BdrvChildRole role, BlockReopenQueue *reopen_queue, | |
511 | uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) | |
512 | { | |
513 | bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); | |
514 | ||
515 | if (can_write_resize(perm)) { | |
516 | /* This should come by default, but let's enforce: */ | |
517 | *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE; | |
518 | ||
519 | /* | |
520 | * Don't share, to keep our states s->file_end, s->data_end and | |
521 | * s->zero_start valid. | |
522 | */ | |
523 | *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); | |
524 | } | |
525 | } | |
526 | ||
527 | BlockDriver bdrv_preallocate_filter = { | |
528 | .format_name = "preallocate", | |
529 | .instance_size = sizeof(BDRVPreallocateState), | |
530 | ||
531 | .bdrv_getlength = preallocate_getlength, | |
532 | .bdrv_open = preallocate_open, | |
533 | .bdrv_close = preallocate_close, | |
534 | ||
535 | .bdrv_reopen_prepare = preallocate_reopen_prepare, | |
536 | .bdrv_reopen_commit = preallocate_reopen_commit, | |
537 | .bdrv_reopen_abort = preallocate_reopen_abort, | |
538 | ||
539 | .bdrv_co_preadv_part = preallocate_co_preadv_part, | |
540 | .bdrv_co_pwritev_part = preallocate_co_pwritev_part, | |
541 | .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, | |
542 | .bdrv_co_pdiscard = preallocate_co_pdiscard, | |
543 | .bdrv_co_flush = preallocate_co_flush, | |
544 | .bdrv_co_truncate = preallocate_co_truncate, | |
545 | ||
546 | .bdrv_check_perm = preallocate_check_perm, | |
547 | .bdrv_set_perm = preallocate_set_perm, | |
548 | .bdrv_child_perm = preallocate_child_perm, | |
549 | ||
550 | .has_variable_length = true, | |
551 | .is_filter = true, | |
552 | }; | |
553 | ||
554 | static void bdrv_preallocate_init(void) | |
555 | { | |
556 | bdrv_register(&bdrv_preallocate_filter); | |
557 | } | |
558 | ||
559 | block_init(bdrv_preallocate_init); |