4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
30 #include "qemu/bitops.h"
31 #include "qemu/bitmap.h"
32 #include "qemu/timer.h"
33 #include "qemu/main-loop.h"
34 #include "migration/migration.h"
35 #include "exec/address-spaces.h"
36 #include "migration/page_cache.h"
37 #include "qemu/error-report.h"
39 #include "exec/ram_addr.h"
40 #include "qemu/rcu_queue.h"
42 #ifdef DEBUG_MIGRATION_RAM
43 #define DPRINTF(fmt, ...) \
44 do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
46 #define DPRINTF(fmt, ...) \
50 static int dirty_rate_high_cnt;
52 static uint64_t bitmap_sync_count;
54 /***********************************************************/
55 /* ram save/restore */
57 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
58 #define RAM_SAVE_FLAG_COMPRESS 0x02
59 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
60 #define RAM_SAVE_FLAG_PAGE 0x08
61 #define RAM_SAVE_FLAG_EOS 0x10
62 #define RAM_SAVE_FLAG_CONTINUE 0x20
63 #define RAM_SAVE_FLAG_XBZRLE 0x40
64 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
65 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
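/* These flags are OR-ed into the low (sub-page) bits of the ram_addr_t that
 * save_page_header() puts on the wire; the load side in ram_load() splits
 * them back apart, roughly (sketch):
 *
 *     addr  = qemu_get_be64(f);
 *     flags = addr & ~TARGET_PAGE_MASK;
 *     addr &= TARGET_PAGE_MASK;
 */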
67 static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
71 return buffer_find_nonzero_offset(p, size) == size;
74 /* This struct contains the XBZRLE cache and a static page
75 used by the compression */
77 /* buffer used for XBZRLE encoding */
79 /* buffer for storing page content */
81 /* Cache for XBZRLE, Protected by lock. */
86 /* buffer used for XBZRLE decoding */
87 static uint8_t *xbzrle_decoded_buf;
89 static void XBZRLE_cache_lock(void)
91 if (migrate_use_xbzrle())
92 qemu_mutex_lock(&XBZRLE.lock);
95 static void XBZRLE_cache_unlock(void)
97 if (migrate_use_xbzrle())
98 qemu_mutex_unlock(&XBZRLE.lock);
102 * called from qmp_migrate_set_cache_size in main thread, possibly while
103 * a migration is in progress.
104 * A running migration may be using the cache and might finish during this
105 * call, hence changes to the cache are protected by XBZRLE.lock.
107 int64_t xbzrle_cache_resize(int64_t new_size)
109 PageCache *new_cache;
112 if (new_size < TARGET_PAGE_SIZE) {
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
125 error_report("Error creating cache");
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
135 ret = pow2floor(new_size);
137 XBZRLE_cache_unlock();
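/* Rough usage sketch (hypothetical caller, e.g. qmp_migrate_set_cache_size):
 *
 *     int64_t used = xbzrle_cache_resize(new_size);
 *     if (used < 0) {
 *         // requested size below TARGET_PAGE_SIZE or cache allocation failed
 *     }
 *
 * On success the value actually applied, pow2floor(new_size), is returned.
 */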
141 /* accounting for migration statistics */
142 typedef struct AccountingInfo {
144 uint64_t skipped_pages;
147 uint64_t xbzrle_bytes;
148 uint64_t xbzrle_pages;
149 uint64_t xbzrle_cache_miss;
150 double xbzrle_cache_miss_rate;
151 uint64_t xbzrle_overflows;
154 static AccountingInfo acct_info;
156 static void acct_clear(void)
158 memset(&acct_info, 0, sizeof(acct_info));
161 uint64_t dup_mig_bytes_transferred(void)
163 return acct_info.dup_pages * TARGET_PAGE_SIZE;
166 uint64_t dup_mig_pages_transferred(void)
168 return acct_info.dup_pages;
171 uint64_t skipped_mig_bytes_transferred(void)
173 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
176 uint64_t skipped_mig_pages_transferred(void)
178 return acct_info.skipped_pages;
181 uint64_t norm_mig_bytes_transferred(void)
183 return acct_info.norm_pages * TARGET_PAGE_SIZE;
186 uint64_t norm_mig_pages_transferred(void)
188 return acct_info.norm_pages;
191 uint64_t xbzrle_mig_bytes_transferred(void)
193 return acct_info.xbzrle_bytes;
196 uint64_t xbzrle_mig_pages_transferred(void)
198 return acct_info.xbzrle_pages;
201 uint64_t xbzrle_mig_pages_cache_miss(void)
203 return acct_info.xbzrle_cache_miss;
206 double xbzrle_mig_cache_miss_rate(void)
208 return acct_info.xbzrle_cache_miss_rate;
211 uint64_t xbzrle_mig_pages_overflow(void)
213 return acct_info.xbzrle_overflows;
216 /* This is the last block that we have visited searching for dirty pages
218 static RAMBlock *last_seen_block;
219 /* This is the last block from where we have sent data */
220 static RAMBlock *last_sent_block;
221 static ram_addr_t last_offset;
222 static QemuMutex migration_bitmap_mutex;
223 static uint64_t migration_dirty_pages;
224 static uint32_t last_version;
225 static bool ram_bulk_stage;
227 /* used by the search for pages to send */
228 struct PageSearchStatus {
229 /* Current block being searched */
231 /* Current offset to search from */
233 /* Set once we wrap around */
236 typedef struct PageSearchStatus PageSearchStatus;
238 static struct BitmapRcu {
241 } *migration_bitmap_rcu;
243 struct CompressParam {
252 typedef struct CompressParam CompressParam;
254 struct DecompressParam {
262 typedef struct DecompressParam DecompressParam;
264 static CompressParam *comp_param;
265 static QemuThread *compress_threads;
266 /* comp_done_cond is used to wake up the migration thread when
267 * one of the compression threads has finished the compression.
268 * comp_done_lock is used together with comp_done_cond.
270 static QemuMutex *comp_done_lock;
271 static QemuCond *comp_done_cond;
272 /* The empty QEMUFileOps will be used by file in CompressParam */
273 static const QEMUFileOps empty_ops = { };
275 static bool compression_switch;
276 static bool quit_comp_thread;
277 static bool quit_decomp_thread;
278 static DecompressParam *decomp_param;
279 static QemuThread *decompress_threads;
280 static uint8_t *compressed_data_buf;
282 static int do_compress_ram_page(CompressParam *param);
284 static void *do_data_compress(void *opaque)
286 CompressParam *param = opaque;
288 while (!quit_comp_thread) {
289 qemu_mutex_lock(&param->mutex);
290 /* Re-check quit_comp_thread in case
291 * terminate_compression_threads() is called just before
292 * qemu_mutex_lock(&param->mutex) and after
293 * while (!quit_comp_thread); re-checking it here makes
294 * sure the compression thread terminates as expected.
296 while (!param->start && !quit_comp_thread) {
297 qemu_cond_wait(&param->cond, &param->mutex);
299 if (!quit_comp_thread) {
300 do_compress_ram_page(param);
302 param->start = false;
303 qemu_mutex_unlock(&param->mutex);
305 qemu_mutex_lock(comp_done_lock);
307 qemu_cond_signal(comp_done_cond);
308 qemu_mutex_unlock(comp_done_lock);
314 static inline void terminate_compression_threads(void)
316 int idx, thread_count;
318 thread_count = migrate_compress_threads();
319 quit_comp_thread = true;
320 for (idx = 0; idx < thread_count; idx++) {
321 qemu_mutex_lock(&comp_param[idx].mutex);
322 qemu_cond_signal(&comp_param[idx].cond);
323 qemu_mutex_unlock(&comp_param[idx].mutex);
327 void migrate_compress_threads_join(void)
331 if (!migrate_use_compression()) {
334 terminate_compression_threads();
335 thread_count = migrate_compress_threads();
336 for (i = 0; i < thread_count; i++) {
337 qemu_thread_join(compress_threads + i);
338 qemu_fclose(comp_param[i].file);
339 qemu_mutex_destroy(&comp_param[i].mutex);
340 qemu_cond_destroy(&comp_param[i].cond);
342 qemu_mutex_destroy(comp_done_lock);
343 qemu_cond_destroy(comp_done_cond);
344 g_free(compress_threads);
346 g_free(comp_done_cond);
347 g_free(comp_done_lock);
348 compress_threads = NULL;
350 comp_done_cond = NULL;
351 comp_done_lock = NULL;
354 void migrate_compress_threads_create(void)
358 if (!migrate_use_compression()) {
361 quit_comp_thread = false;
362 compression_switch = true;
363 thread_count = migrate_compress_threads();
364 compress_threads = g_new0(QemuThread, thread_count);
365 comp_param = g_new0(CompressParam, thread_count);
366 comp_done_cond = g_new0(QemuCond, 1);
367 comp_done_lock = g_new0(QemuMutex, 1);
368 qemu_cond_init(comp_done_cond);
369 qemu_mutex_init(comp_done_lock);
370 for (i = 0; i < thread_count; i++) {
371 /* comp_param[i].file is just used as a dummy buffer to save data; set
374 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375 comp_param[i].done = true;
376 qemu_mutex_init(&comp_param[i].mutex);
377 qemu_cond_init(&comp_param[i].cond);
378 qemu_thread_create(compress_threads + i, "compress",
379 do_data_compress, comp_param + i,
380 QEMU_THREAD_JOINABLE);
385 * save_page_header: Write page header to wire
387 * If this is the 1st block, it also writes the block identification
389 * Returns: Number of bytes written
391 * @f: QEMUFile where to send the data
392 * @block: block that contains the page we want to send
393 * @offset: offset inside the block for the page
394 * in the lower bits, it contains flags
396 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
400 qemu_put_be64(f, offset);
403 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
404 len = strlen(block->idstr);
405 qemu_put_byte(f, len);
406 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
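/* Wire layout sketch for a page header (illustrative):
 *
 *     be64   offset | flags            (flags live in the sub-page bits)
 *     byte   strlen(block->idstr)       } only when RAM_SAVE_FLAG_CONTINUE
 *     bytes  block->idstr               } is clear, i.e. for a new block
 *
 * Later pages of the same block set RAM_SAVE_FLAG_CONTINUE and omit the id
 * string, which is what the "1st block" note above refers to.
 */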
412 /* Reduce the amount of guest CPU execution to hopefully slow down memory writes.
413 * If the guest's dirty memory rate is reduced below the rate at which we can
414 * transfer pages to the destination, then we should be able to complete
415 * migration. Some workloads dirty memory far too fast and will not effectively
416 * converge, even with auto-converge.
418 static void mig_throttle_guest_down(void)
420 MigrationState *s = migrate_get_current();
421 uint64_t pct_initial =
422 s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
423 uint64_t pct_increment =
424 s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
426 /* We have not started throttling yet. Let's start it. */
427 if (!cpu_throttle_active()) {
428 cpu_throttle_set(pct_initial);
430 /* Throttling already on, just increase the rate */
431 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
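/* Worked example (hypothetical values): with pct_initial = 20 and
 * pct_increment = 10, the first call sets the CPU throttle percentage to 20,
 * and each subsequent call while throttling is active raises it to 30, 40,
 * and so on, until the dirty rate drops below the transfer rate.
 */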
435 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
436 * The important thing is that a stale (not-yet-0'd) page be replaced
438 * As a bonus, if the page wasn't in the cache it gets added so that
439 * when a small write is made into the 0'd page it gets XBZRLE sent
441 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
443 if (ram_bulk_stage || !migrate_use_xbzrle()) {
447 /* We don't care if this fails to allocate a new cache page
448 * as long as it updated an old one */
449 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
453 #define ENCODING_FLAG_XBZRLE 0x1
456 * save_xbzrle_page: compress and send current page
458 * Returns: 1 means that we wrote the page
459 * 0 means that page is identical to the one already sent
460 * -1 means that xbzrle would be longer than normal
462 * @f: QEMUFile where to send the data
465 * @block: block that contains the page we want to send
466 * @offset: offset inside the block for the page
467 * @last_stage: if we are at the completion stage
468 * @bytes_transferred: increase it with the number of transferred bytes
470 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
471 ram_addr_t current_addr, RAMBlock *block,
472 ram_addr_t offset, bool last_stage,
473 uint64_t *bytes_transferred)
475 int encoded_len = 0, bytes_xbzrle;
476 uint8_t *prev_cached_page;
478 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
479 acct_info.xbzrle_cache_miss++;
481 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
482 bitmap_sync_count) == -1) {
485 /* update *current_data when the page has been
486 inserted into cache */
487 *current_data = get_cached_data(XBZRLE.cache, current_addr);
493 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
495 /* save current buffer into memory */
496 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
498 /* XBZRLE encoding (if there is no overflow) */
499 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
500 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
502 if (encoded_len == 0) {
503 DPRINTF("Skipping unmodified page\n");
505 } else if (encoded_len == -1) {
506 DPRINTF("Overflow\n");
507 acct_info.xbzrle_overflows++;
508 /* update data in the cache */
510 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
511 *current_data = prev_cached_page;
516 /* we need to update the data in the cache, in order to get the same data */
518 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
521 /* Send XBZRLE based compressed page */
522 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
523 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
524 qemu_put_be16(f, encoded_len);
525 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
526 bytes_xbzrle += encoded_len + 1 + 2;
527 acct_info.xbzrle_pages++;
528 acct_info.xbzrle_bytes += bytes_xbzrle;
529 *bytes_transferred += bytes_xbzrle;
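/* On-wire sketch of an XBZRLE page (illustrative):
 *
 *     page header    (offset | RAM_SAVE_FLAG_XBZRLE)
 *     byte           ENCODING_FLAG_XBZRLE
 *     be16           encoded_len
 *     bytes          encoded_len bytes of XBZRLE-encoded data
 *
 * hence the "+ encoded_len + 1 + 2" in the accounting above.
 */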
534 /* Called with rcu_read_lock() to protect migration_bitmap */
536 ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
539 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
540 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
541 uint64_t rb_size = rb->used_length;
542 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
543 unsigned long *bitmap;
547 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
548 if (ram_bulk_stage && nr > base) {
551 next = find_next_bit(bitmap, size, nr);
555 clear_bit(next, bitmap);
556 migration_dirty_pages--;
558 return (next - base) << TARGET_PAGE_BITS;
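/* Worked example of the bit <-> address arithmetic above (illustrative,
 * assuming TARGET_PAGE_BITS == 12): for a block at rb->offset == 0x100000,
 * base is 0x100; if the next dirty bit found is bit 0x103, the function
 * returns (0x103 - 0x100) << 12 == 0x3000, the page's offset inside the
 * block.
 */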
561 /* Called with rcu_read_lock() to protect migration_bitmap */
562 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
564 unsigned long *bitmap;
565 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
566 migration_dirty_pages +=
567 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
570 /* FIXME: there are too many global variables used in the migration process. */
571 static int64_t start_time;
572 static int64_t bytes_xfer_prev;
573 static int64_t num_dirty_pages_period;
574 static uint64_t xbzrle_cache_miss_prev;
575 static uint64_t iterations_prev;
577 static void migration_bitmap_sync_init(void)
581 num_dirty_pages_period = 0;
582 xbzrle_cache_miss_prev = 0;
586 /* Called with iothread lock held, to protect ram_list.dirty_memory[] */
587 static void migration_bitmap_sync(void)
590 uint64_t num_dirty_pages_init = migration_dirty_pages;
591 MigrationState *s = migrate_get_current();
593 int64_t bytes_xfer_now;
597 if (!bytes_xfer_prev) {
598 bytes_xfer_prev = ram_bytes_transferred();
602 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
605 trace_migration_bitmap_sync_start();
606 address_space_sync_dirty_bitmap(&address_space_memory);
608 qemu_mutex_lock(&migration_bitmap_mutex);
610 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
611 migration_bitmap_sync_range(block->offset, block->used_length);
614 qemu_mutex_unlock(&migration_bitmap_mutex);
616 trace_migration_bitmap_sync_end(migration_dirty_pages
617 - num_dirty_pages_init);
618 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
619 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
621 /* more than 1 second = 1000 milliseconds */
622 if (end_time > start_time + 1000) {
623 if (migrate_auto_converge()) {
624 /* The following detection logic can be refined later. For now:
625 Check to see if the number of dirtied bytes is 50% more than the approximate
626 number of bytes that just got transferred since the last time we
627 were in this routine. If that happens twice, start or increase
629 bytes_xfer_now = ram_bytes_transferred();
631 if (s->dirty_pages_rate &&
632 (num_dirty_pages_period * TARGET_PAGE_SIZE >
633 (bytes_xfer_now - bytes_xfer_prev)/2) &&
634 (dirty_rate_high_cnt++ >= 2)) {
635 trace_migration_throttle();
636 dirty_rate_high_cnt = 0;
637 mig_throttle_guest_down();
639 bytes_xfer_prev = bytes_xfer_now;
642 if (migrate_use_xbzrle()) {
643 if (iterations_prev != acct_info.iterations) {
644 acct_info.xbzrle_cache_miss_rate =
645 (double)(acct_info.xbzrle_cache_miss -
646 xbzrle_cache_miss_prev) /
647 (acct_info.iterations - iterations_prev);
649 iterations_prev = acct_info.iterations;
650 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
652 s->dirty_pages_rate = num_dirty_pages_period * 1000
653 / (end_time - start_time);
654 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
655 start_time = end_time;
656 num_dirty_pages_period = 0;
658 s->dirty_sync_count = bitmap_sync_count;
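/* Worked example of the rate computation above (illustrative): if 25000
 * pages were dirtied over a 2000 ms period, dirty_pages_rate becomes
 * 25000 * 1000 / 2000 == 12500 pages/s, and dirty_bytes_rate is that value
 * multiplied by TARGET_PAGE_SIZE.
 */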
662 * save_zero_page: Send the zero page to the stream
664 * Returns: Number of pages written.
666 * @f: QEMUFile where to send the data
667 * @block: block that contains the page we want to send
668 * @offset: offset inside the block for the page
669 * @p: pointer to the page
670 * @bytes_transferred: increase it with the number of transferred bytes
672 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
673 uint8_t *p, uint64_t *bytes_transferred)
677 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
678 acct_info.dup_pages++;
679 *bytes_transferred += save_page_header(f, block,
680 offset | RAM_SAVE_FLAG_COMPRESS);
682 *bytes_transferred += 1;
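/* Wire sketch of a zero page (illustrative): just the page header with
 * RAM_SAVE_FLAG_COMPRESS set plus a single fill byte; the load side
 * (RAM_SAVE_FLAG_COMPRESS in ram_load) reads that byte and passes it to
 * ram_handle_compressed(), hence the "+= 1" above.
 */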
690 * ram_save_page: Send the given page to the stream
692 * Returns: Number of pages written.
694 * @f: QEMUFile where to send the data
695 * @block: block that contains the page we want to send
696 * @offset: offset inside the block for the page
697 * @last_stage: if we are at the completion stage
698 * @bytes_transferred: increase it with the number of transferred bytes
700 static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
701 bool last_stage, uint64_t *bytes_transferred)
705 ram_addr_t current_addr;
708 bool send_async = true;
710 p = block->host + offset;
712 /* When in doubt, send the page as normal */
714 ret = ram_control_save_page(f, block->offset,
715 offset, TARGET_PAGE_SIZE, &bytes_xmit);
717 *bytes_transferred += bytes_xmit;
723 current_addr = block->offset + offset;
725 if (block == last_sent_block) {
726 offset |= RAM_SAVE_FLAG_CONTINUE;
728 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
729 if (ret != RAM_SAVE_CONTROL_DELAYED) {
730 if (bytes_xmit > 0) {
731 acct_info.norm_pages++;
732 } else if (bytes_xmit == 0) {
733 acct_info.dup_pages++;
737 pages = save_zero_page(f, block, offset, p, bytes_transferred);
739 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
740 * page would be stale
742 xbzrle_cache_zero_page(current_addr);
743 } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
744 pages = save_xbzrle_page(f, &p, current_addr, block,
745 offset, last_stage, bytes_transferred);
747 /* Can't send this cached data async, since the cache page
748 * might get updated before it gets to the wire
755 /* XBZRLE overflow or normal page */
757 *bytes_transferred += save_page_header(f, block,
758 offset | RAM_SAVE_FLAG_PAGE);
760 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
762 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
764 *bytes_transferred += TARGET_PAGE_SIZE;
766 acct_info.norm_pages++;
769 XBZRLE_cache_unlock();
774 static int do_compress_ram_page(CompressParam *param)
776 int bytes_sent, blen;
778 RAMBlock *block = param->block;
779 ram_addr_t offset = param->offset;
781 p = block->host + (offset & TARGET_PAGE_MASK);
783 bytes_sent = save_page_header(param->file, block, offset |
784 RAM_SAVE_FLAG_COMPRESS_PAGE);
785 blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
786 migrate_compress_level());
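/* Wire sketch of a compressed page (illustrative): the usual page header
 * with RAM_SAVE_FLAG_COMPRESS_PAGE set, followed by the zlib data that
 * qemu_put_compression_data() emits at migrate_compress_level(); the load
 * side reads a be32 length and then the compressed payload (see
 * RAM_SAVE_FLAG_COMPRESS_PAGE in ram_load).
 */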
792 static inline void start_compression(CompressParam *param)
795 qemu_mutex_lock(&param->mutex);
797 qemu_cond_signal(&param->cond);
798 qemu_mutex_unlock(&param->mutex);
801 static inline void start_decompression(DecompressParam *param)
803 qemu_mutex_lock(&param->mutex);
805 qemu_cond_signal(&param->cond);
806 qemu_mutex_unlock(&param->mutex);
809 static uint64_t bytes_transferred;
811 static void flush_compressed_data(QEMUFile *f)
813 int idx, len, thread_count;
815 if (!migrate_use_compression()) {
818 thread_count = migrate_compress_threads();
819 for (idx = 0; idx < thread_count; idx++) {
820 if (!comp_param[idx].done) {
821 qemu_mutex_lock(comp_done_lock);
822 while (!comp_param[idx].done && !quit_comp_thread) {
823 qemu_cond_wait(comp_done_cond, comp_done_lock);
825 qemu_mutex_unlock(comp_done_lock);
827 if (!quit_comp_thread) {
828 len = qemu_put_qemu_file(f, comp_param[idx].file);
829 bytes_transferred += len;
834 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
837 param->block = block;
838 param->offset = offset;
841 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
843 uint64_t *bytes_transferred)
845 int idx, thread_count, bytes_xmit = -1, pages = -1;
847 thread_count = migrate_compress_threads();
848 qemu_mutex_lock(comp_done_lock);
850 for (idx = 0; idx < thread_count; idx++) {
851 if (comp_param[idx].done) {
852 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
853 set_compress_params(&comp_param[idx], block, offset);
854 start_compression(&comp_param[idx]);
856 acct_info.norm_pages++;
857 *bytes_transferred += bytes_xmit;
864 qemu_cond_wait(comp_done_cond, comp_done_lock);
867 qemu_mutex_unlock(comp_done_lock);
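/* In short, the loop above is a worker-pool dispatch: take the first idle
 * compression thread, flush its previously produced output into f, hand it
 * the new (block, offset), and if every thread is busy wait on
 * comp_done_cond until one of them signals completion.
 */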
873 * ram_save_compressed_page: compress the given page and send it to the stream
875 * Returns: Number of pages written.
877 * @f: QEMUFile where to send the data
878 * @block: block that contains the page we want to send
879 * @offset: offset inside the block for the page
880 * @last_stage: if we are at the completion stage
881 * @bytes_transferred: increase it with the number of transferred bytes
883 static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
884 ram_addr_t offset, bool last_stage,
885 uint64_t *bytes_transferred)
892 p = block->host + offset;
895 ret = ram_control_save_page(f, block->offset,
896 offset, TARGET_PAGE_SIZE, &bytes_xmit);
898 *bytes_transferred += bytes_xmit;
901 if (block == last_sent_block) {
902 offset |= RAM_SAVE_FLAG_CONTINUE;
904 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
905 if (ret != RAM_SAVE_CONTROL_DELAYED) {
906 if (bytes_xmit > 0) {
907 acct_info.norm_pages++;
908 } else if (bytes_xmit == 0) {
909 acct_info.dup_pages++;
913 /* When starting the process of a new block, the first page of
914 * the block should be sent out before other pages in the same
915 * block, and all the pages in the last block should have been sent
916 * out; keeping this order is important, because the 'cont' flag
917 * is used to avoid resending the block name.
919 if (block != last_sent_block) {
920 flush_compressed_data(f);
921 pages = save_zero_page(f, block, offset, p, bytes_transferred);
923 set_compress_params(&comp_param[0], block, offset);
924 /* Use the qemu thread to compress the data to make sure the
925 * first page is sent out before other pages
927 bytes_xmit = do_compress_ram_page(&comp_param[0]);
928 acct_info.norm_pages++;
929 qemu_put_qemu_file(f, comp_param[0].file);
930 *bytes_transferred += bytes_xmit;
934 pages = save_zero_page(f, block, offset, p, bytes_transferred);
936 pages = compress_page_with_multi_thread(f, block, offset,
946 * Find the next dirty page and update any state associated with
947 * the search process.
949 * Returns: True if a page is found
951 * @f: Current migration stream.
952 * @pss: Data about the state of the current dirty page scan.
953 * @*again: Set to false if the search has scanned the whole of RAM
955 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
958 pss->offset = migration_bitmap_find_and_reset_dirty(pss->block,
960 if (pss->complete_round && pss->block == last_seen_block &&
961 pss->offset >= last_offset) {
963 * We've been once around the RAM and haven't found anything.
969 if (pss->offset >= pss->block->used_length) {
970 /* Didn't find anything in this RAM Block */
972 pss->block = QLIST_NEXT_RCU(pss->block, next);
974 /* Hit the end of the list */
975 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
976 /* Flag that we've looped */
977 pss->complete_round = true;
978 ram_bulk_stage = false;
979 if (migrate_use_xbzrle()) {
980 /* If xbzrle is on, stop using the data compression at this
981 * point. In theory, xbzrle can do better than compression.
983 flush_compressed_data(f);
984 compression_switch = false;
987 /* Didn't find anything this time, but try again on the new block */
991 /* Can go around again, but... */
993 /* We've found something so probably don't need to */
999 * ram_find_and_save_block: Finds a dirty page and sends it to f
1001 * Called within an RCU critical section.
1003 * Returns: The number of pages written
1004 * 0 means no dirty pages
1006 * @f: QEMUFile where to send the data
1007 * @last_stage: if we are at the completion stage
1008 * @bytes_transferred: increase it with the number of transferred bytes
1011 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1012 uint64_t *bytes_transferred)
1014 PageSearchStatus pss;
1018 pss.block = last_seen_block;
1019 pss.offset = last_offset;
1020 pss.complete_round = false;
1023 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1027 found = find_dirty_block(f, &pss, &again);
1030 if (compression_switch && migrate_use_compression()) {
1031 pages = ram_save_compressed_page(f, pss.block, pss.offset,
1035 pages = ram_save_page(f, pss.block, pss.offset, last_stage,
1039 /* if page is unmodified, continue to the next */
1041 last_sent_block = pss.block;
1044 } while (!pages && again);
1046 last_seen_block = pss.block;
1047 last_offset = pss.offset;
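/* Sketch of the loop above (illustrative): keep calling find_dirty_block()
 * until either a page has actually been sent (pages > 0) or the search has
 * gone all the way around RAM without finding anything (again == false);
 * the resume point is then remembered in last_seen_block/last_offset.
 */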
1052 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1054 uint64_t pages = size / TARGET_PAGE_SIZE;
1056 acct_info.dup_pages += pages;
1058 acct_info.norm_pages += pages;
1059 bytes_transferred += size;
1060 qemu_update_position(f, size);
1064 static ram_addr_t ram_save_remaining(void)
1066 return migration_dirty_pages;
1069 uint64_t ram_bytes_remaining(void)
1071 return ram_save_remaining() * TARGET_PAGE_SIZE;
1074 uint64_t ram_bytes_transferred(void)
1076 return bytes_transferred;
1079 uint64_t ram_bytes_total(void)
1085 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1086 total += block->used_length;
1091 void free_xbzrle_decoded_buf(void)
1093 g_free(xbzrle_decoded_buf);
1094 xbzrle_decoded_buf = NULL;
1097 static void migration_bitmap_free(struct BitmapRcu *bmap)
1103 static void migration_end(void)
1105 /* The caller holds the iothread lock or is in a bh, so there is
1106 * no write race against this migration_bitmap
1108 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1109 atomic_rcu_set(&migration_bitmap_rcu, NULL);
1111 memory_global_dirty_log_stop();
1112 call_rcu(bitmap, migration_bitmap_free, rcu);
1115 XBZRLE_cache_lock();
1117 cache_fini(XBZRLE.cache);
1118 g_free(XBZRLE.encoded_buf);
1119 g_free(XBZRLE.current_buf);
1120 XBZRLE.cache = NULL;
1121 XBZRLE.encoded_buf = NULL;
1122 XBZRLE.current_buf = NULL;
1124 XBZRLE_cache_unlock();
1127 static void ram_migration_cancel(void *opaque)
1132 static void reset_ram_globals(void)
1134 last_seen_block = NULL;
1135 last_sent_block = NULL;
1137 last_version = ram_list.version;
1138 ram_bulk_stage = true;
1141 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1143 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1145 /* called in the QEMU main thread, so there is
1146 * no write race against this migration_bitmap
1148 if (migration_bitmap_rcu) {
1149 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1150 bitmap = g_new(struct BitmapRcu, 1);
1151 bitmap->bmap = bitmap_new(new);
1153 /* prevent migration_bitmap bits from being set
1154 * by migration_bitmap_sync_range() at the same time;
1155 * it is safe for migration if migration_bitmap bits are cleared
1158 qemu_mutex_lock(&migration_bitmap_mutex);
1159 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1160 bitmap_set(bitmap->bmap, old, new - old);
1161 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1162 qemu_mutex_unlock(&migration_bitmap_mutex);
1163 migration_dirty_pages += new - old;
1164 call_rcu(old_bitmap, migration_bitmap_free, rcu);
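/* Example (illustrative, old/new counted in pages as used above): growing
 * from old == 0x1000 to new == 0x1800 copies the first 0x1000 bits, sets the
 * 0x800 new bits so the added memory is migrated at least once, and raises
 * migration_dirty_pages by 0x800.
 */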
1168 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1169 * long-running RCU critical section. When RCU reclaims in the code
1170 * start to become numerous, it will be necessary to reduce the
1171 * granularity of these critical sections.
1174 static int ram_save_setup(QEMUFile *f, void *opaque)
1177 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1179 dirty_rate_high_cnt = 0;
1180 bitmap_sync_count = 0;
1181 migration_bitmap_sync_init();
1182 qemu_mutex_init(&migration_bitmap_mutex);
1184 if (migrate_use_xbzrle()) {
1185 XBZRLE_cache_lock();
1186 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1189 if (!XBZRLE.cache) {
1190 XBZRLE_cache_unlock();
1191 error_report("Error creating cache");
1194 XBZRLE_cache_unlock();
1196 /* We prefer not to abort if there is no memory */
1197 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1198 if (!XBZRLE.encoded_buf) {
1199 error_report("Error allocating encoded_buf");
1203 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1204 if (!XBZRLE.current_buf) {
1205 error_report("Error allocating current_buf");
1206 g_free(XBZRLE.encoded_buf);
1207 XBZRLE.encoded_buf = NULL;
1214 /* iothread lock needed for ram_list.dirty_memory[] */
1215 qemu_mutex_lock_iothread();
1216 qemu_mutex_lock_ramlist();
1218 bytes_transferred = 0;
1219 reset_ram_globals();
1221 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1222 migration_bitmap_rcu = g_new(struct BitmapRcu, 1);
1223 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1224 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1227 * Count the total number of pages used by ram blocks not including any
1228 * gaps due to alignment or unplugs.
1230 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1232 memory_global_dirty_log_start();
1233 migration_bitmap_sync();
1234 qemu_mutex_unlock_ramlist();
1235 qemu_mutex_unlock_iothread();
1237 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1239 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1240 qemu_put_byte(f, strlen(block->idstr));
1241 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1242 qemu_put_be64(f, block->used_length);
1247 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1248 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1250 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1255 static int ram_save_iterate(QEMUFile *f, void *opaque)
1263 if (ram_list.version != last_version) {
1264 reset_ram_globals();
1267 /* Read version before ram_list.blocks */
1270 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1272 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1274 while ((ret = qemu_file_rate_limit(f)) == 0) {
1277 pages = ram_find_and_save_block(f, false, &bytes_transferred);
1278 /* no more pages to send */
1282 pages_sent += pages;
1283 acct_info.iterations++;
1285 /* we want to check in the 1st loop, just in case it was the 1st time
1286 and we had to sync the dirty bitmap.
1287 qemu_clock_get_ns() is a bit expensive, so we only check every few
1290 if ((i & 63) == 0) {
1291 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1292 if (t1 > MAX_WAIT) {
1293 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
1300 flush_compressed_data(f);
1304 * Must occur before EOS (or any QEMUFile operation)
1305 * because of RDMA protocol.
1307 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1309 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1310 bytes_transferred += 8;
1312 ret = qemu_file_get_error(f);
1320 /* Called with iothread lock */
1321 static int ram_save_complete(QEMUFile *f, void *opaque)
1325 migration_bitmap_sync();
1327 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
1329 /* try transferring iterative blocks of memory */
1331 /* flush all remaining blocks regardless of rate limiting */
1335 pages = ram_find_and_save_block(f, true, &bytes_transferred);
1336 /* no more blocks to send */
1342 flush_compressed_data(f);
1343 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
1348 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1353 static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
1355 uint64_t remaining_size;
1357 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
1359 if (remaining_size < max_size) {
1360 qemu_mutex_lock_iothread();
1362 migration_bitmap_sync();
1364 qemu_mutex_unlock_iothread();
1365 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
1367 return remaining_size;
1370 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
1372 unsigned int xh_len;
1375 if (!xbzrle_decoded_buf) {
1376 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1379 /* extract RLE header */
1380 xh_flags = qemu_get_byte(f);
1381 xh_len = qemu_get_be16(f);
1383 if (xh_flags != ENCODING_FLAG_XBZRLE) {
1384 error_report("Failed to load XBZRLE page - wrong compression!");
1388 if (xh_len > TARGET_PAGE_SIZE) {
1389 error_report("Failed to load XBZRLE page - len overflow!");
1392 /* load data and decode */
1393 qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);
1396 if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
1397 TARGET_PAGE_SIZE) == -1) {
1398 error_report("Failed to load XBZRLE page - decode error!");
1405 /* Must be called from within an RCU critical section.
1406 * Returns a pointer from within the RCU-protected ram_list.
1408 static inline void *host_from_stream_offset(QEMUFile *f,
1412 static RAMBlock *block = NULL;
1416 if (flags & RAM_SAVE_FLAG_CONTINUE) {
1417 if (!block || block->max_length <= offset) {
1418 error_report("Ack, bad migration stream!");
1422 return block->host + offset;
1425 len = qemu_get_byte(f);
1426 qemu_get_buffer(f, (uint8_t *)id, len);
1429 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1430 if (!strncmp(id, block->idstr, sizeof(id)) &&
1431 block->max_length > offset) {
1432 return block->host + offset;
1436 error_report("Can't find block %s!", id);
1441 * If a page (or a whole RDMA chunk) has been
1442 * determined to be zero, then zap it.
1444 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
1446 if (ch != 0 || !is_zero_range(host, size)) {
1447 memset(host, ch, size);
1451 static void *do_data_decompress(void *opaque)
1453 DecompressParam *param = opaque;
1454 unsigned long pagesize;
1456 while (!quit_decomp_thread) {
1457 qemu_mutex_lock(&param->mutex);
1458 while (!param->start && !quit_decomp_thread) {
1459 qemu_cond_wait(&param->cond, &param->mutex);
1460 pagesize = TARGET_PAGE_SIZE;
1461 if (!quit_decomp_thread) {
1462 /* uncompress() may fail in some cases, especially
1463 * when the page is dirtied while being compressed; it's
1464 * not a problem because the dirty page will be retransferred
1465 * and uncompress() won't break the data in other pages.
1467 uncompress((Bytef *)param->des, &pagesize,
1468 (const Bytef *)param->compbuf, param->len);
1470 param->start = false;
1472 qemu_mutex_unlock(&param->mutex);
1478 void migrate_decompress_threads_create(void)
1480 int i, thread_count;
1482 thread_count = migrate_decompress_threads();
1483 decompress_threads = g_new0(QemuThread, thread_count);
1484 decomp_param = g_new0(DecompressParam, thread_count);
1485 compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
1486 quit_decomp_thread = false;
1487 for (i = 0; i < thread_count; i++) {
1488 qemu_mutex_init(&decomp_param[i].mutex);
1489 qemu_cond_init(&decomp_param[i].cond);
1490 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
1491 qemu_thread_create(decompress_threads + i, "decompress",
1492 do_data_decompress, decomp_param + i,
1493 QEMU_THREAD_JOINABLE);
1497 void migrate_decompress_threads_join(void)
1499 int i, thread_count;
1501 quit_decomp_thread = true;
1502 thread_count = migrate_decompress_threads();
1503 for (i = 0; i < thread_count; i++) {
1504 qemu_mutex_lock(&decomp_param[i].mutex);
1505 qemu_cond_signal(&decomp_param[i].cond);
1506 qemu_mutex_unlock(&decomp_param[i].mutex);
1508 for (i = 0; i < thread_count; i++) {
1509 qemu_thread_join(decompress_threads + i);
1510 qemu_mutex_destroy(&decomp_param[i].mutex);
1511 qemu_cond_destroy(&decomp_param[i].cond);
1512 g_free(decomp_param[i].compbuf);
1514 g_free(decompress_threads);
1515 g_free(decomp_param);
1516 g_free(compressed_data_buf);
1517 decompress_threads = NULL;
1518 decomp_param = NULL;
1519 compressed_data_buf = NULL;
1522 static void decompress_data_with_multi_threads(uint8_t *compbuf,
1523 void *host, int len)
1525 int idx, thread_count;
1527 thread_count = migrate_decompress_threads();
1529 for (idx = 0; idx < thread_count; idx++) {
1530 if (!decomp_param[idx].start) {
1531 memcpy(decomp_param[idx].compbuf, compbuf, len);
1532 decomp_param[idx].des = host;
1533 decomp_param[idx].len = len;
1534 start_decompression(&decomp_param[idx]);
1538 if (idx < thread_count) {
1544 static int ram_load(QEMUFile *f, void *opaque, int version_id)
1546 int flags = 0, ret = 0;
1547 static uint64_t seq_iter;
1552 if (version_id != 4) {
1556 /* This RCU critical section can be very long running.
1557 * When RCU reclaims in the code start to become numerous,
1558 * it will be necessary to reduce the granularity of this
1562 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
1563 ram_addr_t addr, total_ram_bytes;
1567 addr = qemu_get_be64(f);
1568 flags = addr & ~TARGET_PAGE_MASK;
1569 addr &= TARGET_PAGE_MASK;
1571 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
1572 case RAM_SAVE_FLAG_MEM_SIZE:
1573 /* Synchronize RAM block list */
1574 total_ram_bytes = addr;
1575 while (!ret && total_ram_bytes) {
1580 len = qemu_get_byte(f);
1581 qemu_get_buffer(f, (uint8_t *)id, len);
1583 length = qemu_get_be64(f);
1585 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1586 if (!strncmp(id, block->idstr, sizeof(id))) {
1587 if (length != block->used_length) {
1588 Error *local_err = NULL;
1590 ret = qemu_ram_resize(block->offset, length, &local_err);
1592 error_report_err(local_err);
1595 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
1602 error_report("Unknown ramblock \"%s\", cannot "
1603 "accept migration", id);
1607 total_ram_bytes -= length;
1610 case RAM_SAVE_FLAG_COMPRESS:
1611 host = host_from_stream_offset(f, addr, flags);
1613 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1617 ch = qemu_get_byte(f);
1618 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
1620 case RAM_SAVE_FLAG_PAGE:
1621 host = host_from_stream_offset(f, addr, flags);
1623 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1627 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
1629 case RAM_SAVE_FLAG_COMPRESS_PAGE:
1630 host = host_from_stream_offset(f, addr, flags);
1632 error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
1637 len = qemu_get_be32(f);
1638 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
1639 error_report("Invalid compressed data length: %d", len);
1643 qemu_get_buffer(f, compressed_data_buf, len);
1644 decompress_data_with_multi_threads(compressed_data_buf, host, len);
1646 case RAM_SAVE_FLAG_XBZRLE:
1647 host = host_from_stream_offset(f, addr, flags);
1649 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
1653 if (load_xbzrle(f, addr, host) < 0) {
1654 error_report("Failed to decompress XBZRLE page at "
1655 RAM_ADDR_FMT, addr);
1660 case RAM_SAVE_FLAG_EOS:
1664 if (flags & RAM_SAVE_FLAG_HOOK) {
1665 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
1667 error_report("Unknown combination of migration flags: %#x",
1673 ret = qemu_file_get_error(f);
1678 DPRINTF("Completed load of VM with exit code %d seq iteration "
1679 "%" PRIu64 "\n", ret, seq_iter);
1683 static SaveVMHandlers savevm_ram_handlers = {
1684 .save_live_setup = ram_save_setup,
1685 .save_live_iterate = ram_save_iterate,
1686 .save_live_complete = ram_save_complete,
1687 .save_live_pending = ram_save_pending,
1688 .load_state = ram_load,
1689 .cancel = ram_migration_cancel,
1692 void ram_mig_init(void)
1694 qemu_mutex_init(&XBZRLE.lock);
1695 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
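/* Note: the section version registered here (4) has to match the version_id
 * check at the top of ram_load(), otherwise an incoming stream is rejected.
 */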