/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <[email protected]>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockBackend *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockMirrorBackingMode backing_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    uint64_t last_pause_ns;
    unsigned long *in_flight_bitmap;
    int in_flight;
    int64_t sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
    int target_cluster_sectors;
    int max_iov;
} MirrorBlockJob;

typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

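/* Translate an I/O error on the source (read) or target (write) into the
 * action configured by on_source_error/on_target_error, and mark the job
 * as no longer synced. */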
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&s->common, s->on_target_error,
                                      false, error);
    }
}

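/* Finish one copy operation: return its buffer chunks to the free list,
 * clear its chunks in the in-flight bitmap, account the progress, and wake
 * up the job coroutine if it is waiting for an operation to complete. */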
static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}

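/* Completion callback for writes to the target.  On failure, re-dirty the
 * affected sectors so that they are copied again, and record the error if
 * it has to be reported. */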
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}

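/* Completion callback for reads from the source.  On success, submit the
 * matching write to the target; on failure, handle the error and finish
 * the operation. */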
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                    0, mirror_write_complete, op);
}

static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against original. */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}

static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

/* Submit async read while handling COW.
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

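/* Submit an asynchronous write-zeroes or discard request for the given
 * sector range of the target. */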
static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_discard(s->target, sector_num, op->nb_sectors,
                        mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}

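/* Copy one batch of consecutive dirty chunks from the source to the target.
 * Returns the delay (in ns) requested by the rate limiter before the next
 * iteration may start. */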
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = blk_bs(s->common.blk);
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));

    sector_num = hbitmap_iter_next(&s->hbi);
    if (sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one, and wait for in flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t hbitmap_next;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        hbitmap_next = hbitmap_iter_next(&s->hbi);
        if (hbitmap_next > next_sector || hbitmap_next < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(&s->hbi, next_sector);
            hbitmap_next = hbitmap_iter_next(&s->hbi);
        }
        assert(hbitmap_next == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
    while (nb_chunks > 0 && sector_num < end) {
        int ret;
        int io_sectors, io_sectors_acct;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            io_sectors = nb_chunks * sectors_per_chunk;
        }

        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors,  &target_sector_num,
                                           &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        while (s->in_flight >= MAX_IN_FLIGHT) {
            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
            mirror_wait_for_io(s);
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            io_sectors_acct = io_sectors;
            break;
        case MIRROR_METHOD_ZERO:
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors,
                                      mirror_method == MIRROR_METHOD_DISCARD);
            if (write_zeroes_ok) {
                io_sectors_acct = 0;
            } else {
                io_sectors_acct = io_sectors;
            }
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
        }
    }
    return delay_ns;
}

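/* Slice s->buf into granularity-sized chunks and put them all on the
 * free list. */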
static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

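/* Wait until all in-flight mirror operations have completed. */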
static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}

typedef struct {
    int ret;
} MirrorExitData;

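/* Runs in the main loop after mirror_run finishes: switch the graph over to
 * the target if the job completed successfully, drop the blockers and
 * references taken by the job, and mark the block job as completed. */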
static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);

    /* Make sure that the source BDS doesn't go away until we have called
     * block_job_completed(). */
    bdrv_ref(src);

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain potential other users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_in_backing_chain(to_replace, target_bs);
        bdrv_drained_end(target_bs);

        /* We just changed the BDS the job BB refers to */
        blk_remove_bs(job->blk);
        blk_insert_bs(job->blk, src);
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_op_unblock_all(target_bs, s->common.blocker);
    blk_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
    if (qemu_get_aio_context() == bdrv_get_aio_context(src)) {
        aio_enable_external(iohandler_get_aio_context());
    }
    bdrv_unref(src);
}

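/* Yield to the main loop at least once per SLICE_TIME; otherwise just check
 * for a pause request. */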
static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (now - s->last_pause_ns > SLICE_TIME) {
        s->last_pause_ns = now;
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
    } else {
        block_job_pause_point(&s->common);
    }
}

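/* Set up the initial dirty bitmap: if the target does not already read as
 * zeroes, either mark the whole device dirty or zero it explicitly, then
 * mark dirty every range that is allocated in the source above @base. */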
static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
            return 0;
        }

        for (sector_num = 0; sector_num < end; ) {
            int nb_sectors = MIN(end - sector_num,
                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);

            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
                return 0;
            }

            if (s->in_flight >= MAX_IN_FLIGHT) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1);
                mirror_wait_for_io(s);
                continue;
            }

            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
            sector_num += nb_sectors;
        }

        mirror_drain(s);
    }

    /* First part, loop on the sectors and initialize the dirty bitmap.  */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        assert(n > 0);
        if (ret == 1) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}

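/* Main coroutine of the mirror job: set up buffers and bitmaps, perform the
 * initial copy, then keep copying dirty data until the job is cancelled or
 * completed, and finally defer the cleanup to mirror_exit in the main loop. */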
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    int64_t length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for a NULL string */
    int ret = 0;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for completion. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        block_job_pause_point(&s->common);

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = blk_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * mirror_populate runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_co_drain(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
    bdrv_drained_begin(bs);
    if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
        /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
         * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we
         * need a block layer API change to achieve this. */
        aio_disable_external(iohandler_get_aio_context());
    }
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

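/* Implementation of block-job-complete: prepare the target's backing file
 * according to the requested backing mode, block the node that is going to
 * be replaced, and tell the job coroutine to finish. */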
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    BlockDriverState *src, *target;

    src = blk_bs(job->blk);
    target = blk_bs(s->target);

    if (!s->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        int ret;

        assert(!target->backing);
        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
        if (ret < 0) {
            return;
        }
    }

    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target) != backing) {
            bdrv_set_backing_hd(target, backing);
        }
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

/* There is no matching mirror_resume() because mirror_run() will begin
 * iterating again when the job is resumed.
 */
static void coroutine_fn mirror_pause(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    mirror_drain(s);
}

static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    blk_set_aio_context(s->target, new_context);
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

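/* Common setup for mirror and active commit jobs: create the block job,
 * attach the target, create the dirty bitmap and kick off the job
 * coroutine. */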
static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                             BlockDriverState *target, const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
{
    MirrorBlockJob *s;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert ((granularity & (granularity - 1)) == 0);

    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

    s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->target = blk_new();
    blk_insert_bs(s->target, target);

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->is_none_mode = is_none_mode;
    s->backing_mode = backing_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

    bdrv_op_block_all(target, s->common.blocker);

    s->common.co = qemu_coroutine_create(mirror_run, s);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co);
}

void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(job_id, bs, target, replaces,
                     speed, granularity, buf_size, backing_mode,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
}

void commit_active_start(const char *job_id, BlockDriverState *bs,
                         BlockDriverState *base, int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
{
    int64_t length, base_length;
    int orig_base_flags;
    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        error_setg_errno(errp, -length,
                         "Unable to determine length of %s", bs->filename);
        goto error_restore_flags;
    }

    base_length = bdrv_getlength(base);
    if (base_length < 0) {
        error_setg_errno(errp, -base_length,
                         "Unable to determine length of %s", base->filename);
        goto error_restore_flags;
    }

    if (length > base_length) {
        ret = bdrv_truncate(base, length);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Top image %s is larger than base image %s, and "
                             "resize of base image failed",
                             bs->filename, base->filename);
            goto error_restore_flags;
        }
    }

    mirror_start_job(job_id, bs, base, NULL, speed, 0, 0,
                     MIRROR_LEAVE_BACKING_CHAIN,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
}