4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 #include "qemu/osdep.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
38 #include "migration.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
59 /***********************************************************/
60 /* ram save/restore */
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63  * worked for pages that were filled with the same char. We switched
64  * it to only search for the zero value. And to avoid confusion with
65  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start with 0x100 next */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
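/*
 * These flag values are OR'ed into the low bits of the (target-page
 * aligned) page offset that save_page_header() writes, so a single
 * be64 on the wire carries both the offset and the flags.
 * RAM_SAVE_FLAG_CONTINUE means "same RAMBlock as the previous page",
 * in which case the block idstr is not resent.
 */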
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
80 return buffer_is_zero(p, size);
83 XBZRLECacheStats xbzrle_counters;
85 /* struct contains XBZRLE cache and a static page
86 used by the compression */
88 /* buffer used for XBZRLE encoding */
90 /* buffer for storing page content */
92 /* Cache for XBZRLE, Protected by lock. */
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
101 static void XBZRLE_cache_lock(void)
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
107 static void XBZRLE_cache_unlock(void)
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
114 * xbzrle_cache_resize: resize the xbzrle cache
116  * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119  * hence changes to the cache are protected by XBZRLE.lock.
121 * Returns 0 for success or -1 for error
123 * @new_size: new cache size
124 * @errp: set *errp if the check failed, with reason
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
128 PageCache *new_cache;
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
138 if (new_size == migrate_xbzrle_cache_size()) {
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
156 XBZRLE_cache_unlock();
160 /* Should be holding either ram_list.mutex, or the RCU lock. */
161 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
162 INTERNAL_RAMBLOCK_FOREACH(block) \
163 if (!qemu_ram_is_migratable(block)) {} else
165 #undef RAMBLOCK_FOREACH
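/*
 * Each RAMBlock's receivedmap holds one bit per target page and
 * records which pages of the block have already been received; it can
 * be sent over the migration stream by ramblock_recv_bitmap_send()
 * below.
 */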
167 static void ramblock_recv_map_init(void)
171 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
172 assert(!rb->receivedmap);
173 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
177 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
179 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
183 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
185 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
188 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
190 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
193 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
196 bitmap_set_atomic(rb->receivedmap,
197 ramblock_recv_bitmap_offset(host_addr, rb),
201 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
204  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes) + ending marker (8 bytes).
206  * Returns >0 with the number of sent bytes on success, or <0 on error.
208 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
209 const char *block_name)
211 RAMBlock *block = qemu_ram_block_by_name(block_name);
212 unsigned long *le_bitmap, nbits;
216 error_report("%s: invalid block name: %s", __func__, block_name);
220 nbits = block->used_length >> TARGET_PAGE_BITS;
223 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
224 * machines we may need 4 more bytes for padding (see below
225  * comment). So extend it a bit beforehand.
227 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
230 * Always use little endian when sending the bitmap. This is
231  * required because the source and destination VMs may not use the
232  * same endianness. (Note: big endian won't work.)
234 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
236 /* Size of the bitmap, in bytes */
240 * size is always aligned to 8 bytes for 64bit machines, but it
241 * may not be true for 32bit machines. We need this padding to
242  * make sure the migration can survive even between 32bit and 64bit machines.
245 size = ROUND_UP(size, 8);
247 qemu_put_be64(file, size);
248 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
250 * Mark as an end, in case the middle part is screwed up due to
251  * some "mysterious" reason.
253 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
258 if (qemu_file_get_error(file)) {
259 return qemu_file_get_error(file);
262 return size + sizeof(size);
266 * An outstanding page request, on the source, having been received
269 struct RAMSrcPageRequest {
274 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
277 /* State of RAM for migration */
279 /* QEMUFile used for this migration */
281 /* Last block that we have visited searching for dirty pages */
282 RAMBlock *last_seen_block;
283 /* Last block from where we have sent data */
284 RAMBlock *last_sent_block;
285 /* Last dirty target page we have sent */
286 ram_addr_t last_page;
287 /* last ram version we have seen */
288 uint32_t last_version;
289 /* We are in the first round */
291 /* How many times we have dirty too many pages */
292 int dirty_rate_high_cnt;
293 /* these variables are used for bitmap sync */
294 /* last time we did a full bitmap_sync */
295 int64_t time_last_bitmap_sync;
296 /* bytes transferred at start_time */
297 uint64_t bytes_xfer_prev;
298 /* number of dirty pages since start_time */
299 uint64_t num_dirty_pages_period;
300 /* xbzrle misses since the beginning of the period */
301 uint64_t xbzrle_cache_miss_prev;
302 /* number of iterations at the beginning of period */
303 uint64_t iterations_prev;
304 /* Iterations since start */
306 /* number of dirty bits in the bitmap */
307 uint64_t migration_dirty_pages;
308 /* protects modification of the bitmap */
309 QemuMutex bitmap_mutex;
310 /* The RAMBlock used in the last src_page_requests */
311 RAMBlock *last_req_rb;
312 /* Queue of outstanding page requests from the destination */
313 QemuMutex src_page_req_mutex;
314 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
316 typedef struct RAMState RAMState;
318 static RAMState *ram_state;
320 uint64_t ram_bytes_remaining(void)
322 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
326 MigrationStats ram_counters;
328 /* used by the search for pages to send */
329 struct PageSearchStatus {
330 /* Current block being searched */
332 /* Current page to search from */
334 /* Set once we wrap around */
337 typedef struct PageSearchStatus PageSearchStatus;
339 struct CompressParam {
348 /* internally used fields */
352 typedef struct CompressParam CompressParam;
354 struct DecompressParam {
364 typedef struct DecompressParam DecompressParam;
366 static CompressParam *comp_param;
367 static QemuThread *compress_threads;
368 /* comp_done_cond is used to wake up the migration thread when
369 * one of the compression threads has finished the compression.
370  * comp_done_lock is used together with comp_done_cond.
372 static QemuMutex comp_done_lock;
373 static QemuCond comp_done_cond;
374 /* The empty QEMUFileOps will be used by file in CompressParam */
375 static const QEMUFileOps empty_ops = { };
377 static QEMUFile *decomp_file;
378 static DecompressParam *decomp_param;
379 static QemuThread *decompress_threads;
380 static QemuMutex decomp_done_lock;
381 static QemuCond decomp_done_cond;
383 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
384 ram_addr_t offset, uint8_t *source_buf);
386 static void *do_data_compress(void *opaque)
388 CompressParam *param = opaque;
392 qemu_mutex_lock(¶m->mutex);
393 while (!param->quit) {
395 block = param->block;
396 offset = param->offset;
398 qemu_mutex_unlock(¶m->mutex);
400 do_compress_ram_page(param->file, ¶m->stream, block, offset,
403 qemu_mutex_lock(&comp_done_lock);
405 qemu_cond_signal(&comp_done_cond);
406 qemu_mutex_unlock(&comp_done_lock);
408 qemu_mutex_lock(¶m->mutex);
410 qemu_cond_wait(¶m->cond, ¶m->mutex);
413 qemu_mutex_unlock(¶m->mutex);
418 static inline void terminate_compression_threads(void)
420 int idx, thread_count;
422 thread_count = migrate_compress_threads();
424 for (idx = 0; idx < thread_count; idx++) {
425 qemu_mutex_lock(&comp_param[idx].mutex);
426 comp_param[idx].quit = true;
427 qemu_cond_signal(&comp_param[idx].cond);
428 qemu_mutex_unlock(&comp_param[idx].mutex);
432 static void compress_threads_save_cleanup(void)
436 if (!migrate_use_compression()) {
439 terminate_compression_threads();
440 thread_count = migrate_compress_threads();
441 for (i = 0; i < thread_count; i++) {
443  * we use it as an indicator which shows if the thread is
444 * properly init'd or not
446 if (!comp_param[i].file) {
449 qemu_thread_join(compress_threads + i);
450 qemu_mutex_destroy(&comp_param[i].mutex);
451 qemu_cond_destroy(&comp_param[i].cond);
452 deflateEnd(&comp_param[i].stream);
453 g_free(comp_param[i].originbuf);
454 qemu_fclose(comp_param[i].file);
455 comp_param[i].file = NULL;
457 qemu_mutex_destroy(&comp_done_lock);
458 qemu_cond_destroy(&comp_done_cond);
459 g_free(compress_threads);
461 compress_threads = NULL;
465 static int compress_threads_save_setup(void)
469 if (!migrate_use_compression()) {
472 thread_count = migrate_compress_threads();
473 compress_threads = g_new0(QemuThread, thread_count);
474 comp_param = g_new0(CompressParam, thread_count);
475 qemu_cond_init(&comp_done_cond);
476 qemu_mutex_init(&comp_done_lock);
477 for (i = 0; i < thread_count; i++) {
478 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
479 if (!comp_param[i].originbuf) {
483 if (deflateInit(&comp_param[i].stream,
484 migrate_compress_level()) != Z_OK) {
485 g_free(comp_param[i].originbuf);
489 /* comp_param[i].file is just used as a dummy buffer to save data,
490 * set its ops to empty.
492 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
493 comp_param[i].done = true;
494 comp_param[i].quit = false;
495 qemu_mutex_init(&comp_param[i].mutex);
496 qemu_cond_init(&comp_param[i].cond);
497 qemu_thread_create(compress_threads + i, "compress",
498 do_data_compress, comp_param + i,
499 QEMU_THREAD_JOINABLE);
504 compress_threads_save_cleanup();
510 #define MULTIFD_MAGIC 0x11223344U
511 #define MULTIFD_VERSION 1
516 unsigned char uuid[16]; /* QemuUUID */
518 } __attribute__((packed)) MultiFDInit_t;
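/*
 * Every multifd channel begins with a single MultiFDInit_t packet.
 * The destination uses magic/version to reject mismatched streams,
 * the uuid to check that the channel belongs to this source VM, and
 * the id to pick the matching MultiFDRecvParams slot.
 */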
521 /* these fields are not changed once the thread is created */
524 /* channel thread name */
526 /* channel thread id */
528 /* communication channel */
530 /* sem where to wait for more work */
532 /* this mutex protects the following parameters */
534 /* is this channel thread running */
536 /* should this thread finish */
541 /* these fields are not changed once the thread is created */
544 /* channel thread name */
546 /* channel thread id */
548 /* communication channel */
550 /* sem where to wait for more work */
552 /* this mutex protects the following parameters */
554 /* is this channel thread running */
556 /* should this thread finish */
560 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
565 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
566 msg.version = cpu_to_be32(MULTIFD_VERSION);
568 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
570 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
577 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
582 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
587 be32_to_cpus(&msg.magic);
588 be32_to_cpus(&msg.version);
590 if (msg.magic != MULTIFD_MAGIC) {
591 error_setg(errp, "multifd: received packet magic %x "
592 "expected %x", msg.magic, MULTIFD_MAGIC);
596 if (msg.version != MULTIFD_VERSION) {
597 error_setg(errp, "multifd: received packet version %d "
598 "expected %d", msg.version, MULTIFD_VERSION);
602 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
603 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
604 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
606 error_setg(errp, "multifd: received uuid '%s' and expected "
607 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
613 if (msg.id > migrate_multifd_channels()) {
614         error_setg(errp, "multifd: received channel id %d is greater "
615                    "than number of channels %d", msg.id, migrate_multifd_channels());
623 MultiFDSendParams *params;
624 /* number of created threads */
626 } *multifd_send_state;
628 static void multifd_send_terminate_threads(Error *err)
633 MigrationState *s = migrate_get_current();
634 migrate_set_error(s, err);
635 if (s->state == MIGRATION_STATUS_SETUP ||
636 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
637 s->state == MIGRATION_STATUS_DEVICE ||
638 s->state == MIGRATION_STATUS_ACTIVE) {
639 migrate_set_state(&s->state, s->state,
640 MIGRATION_STATUS_FAILED);
644 for (i = 0; i < migrate_multifd_channels(); i++) {
645 MultiFDSendParams *p = &multifd_send_state->params[i];
647 qemu_mutex_lock(&p->mutex);
649 qemu_sem_post(&p->sem);
650 qemu_mutex_unlock(&p->mutex);
654 int multifd_save_cleanup(Error **errp)
659 if (!migrate_use_multifd()) {
662 multifd_send_terminate_threads(NULL);
663 for (i = 0; i < migrate_multifd_channels(); i++) {
664 MultiFDSendParams *p = &multifd_send_state->params[i];
667 qemu_thread_join(&p->thread);
669 socket_send_channel_destroy(p->c);
671 qemu_mutex_destroy(&p->mutex);
672 qemu_sem_destroy(&p->sem);
676 g_free(multifd_send_state->params);
677 multifd_send_state->params = NULL;
678 g_free(multifd_send_state);
679 multifd_send_state = NULL;
683 static void *multifd_send_thread(void *opaque)
685 MultiFDSendParams *p = opaque;
686 Error *local_err = NULL;
688 if (multifd_send_initial_packet(p, &local_err) < 0) {
693 qemu_mutex_lock(&p->mutex);
695 qemu_mutex_unlock(&p->mutex);
698 qemu_mutex_unlock(&p->mutex);
699 qemu_sem_wait(&p->sem);
704 multifd_send_terminate_threads(local_err);
707 qemu_mutex_lock(&p->mutex);
709 qemu_mutex_unlock(&p->mutex);
714 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
716 MultiFDSendParams *p = opaque;
717 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
718 Error *local_err = NULL;
720 if (qio_task_propagate_error(task, &local_err)) {
721 if (multifd_save_cleanup(&local_err) != 0) {
722 migrate_set_error(migrate_get_current(), local_err);
725 p->c = QIO_CHANNEL(sioc);
726 qio_channel_set_delay(p->c, false);
728 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
729 QEMU_THREAD_JOINABLE);
731 atomic_inc(&multifd_send_state->count);
735 int multifd_save_setup(void)
740 if (!migrate_use_multifd()) {
743 thread_count = migrate_multifd_channels();
744 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
745 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
746 atomic_set(&multifd_send_state->count, 0);
747 for (i = 0; i < thread_count; i++) {
748 MultiFDSendParams *p = &multifd_send_state->params[i];
750 qemu_mutex_init(&p->mutex);
751 qemu_sem_init(&p->sem, 0);
754 p->name = g_strdup_printf("multifdsend_%d", i);
755 socket_send_channel_create(multifd_new_send_channel_async, p);
761 MultiFDRecvParams *params;
762 /* number of created threads */
764 } *multifd_recv_state;
766 static void multifd_recv_terminate_threads(Error *err)
771 MigrationState *s = migrate_get_current();
772 migrate_set_error(s, err);
773 if (s->state == MIGRATION_STATUS_SETUP ||
774 s->state == MIGRATION_STATUS_ACTIVE) {
775 migrate_set_state(&s->state, s->state,
776 MIGRATION_STATUS_FAILED);
780 for (i = 0; i < migrate_multifd_channels(); i++) {
781 MultiFDRecvParams *p = &multifd_recv_state->params[i];
783 qemu_mutex_lock(&p->mutex);
785 qemu_sem_post(&p->sem);
786 qemu_mutex_unlock(&p->mutex);
790 int multifd_load_cleanup(Error **errp)
795 if (!migrate_use_multifd()) {
798 multifd_recv_terminate_threads(NULL);
799 for (i = 0; i < migrate_multifd_channels(); i++) {
800 MultiFDRecvParams *p = &multifd_recv_state->params[i];
803 qemu_thread_join(&p->thread);
805 object_unref(OBJECT(p->c));
807 qemu_mutex_destroy(&p->mutex);
808 qemu_sem_destroy(&p->sem);
812 g_free(multifd_recv_state->params);
813 multifd_recv_state->params = NULL;
814 g_free(multifd_recv_state);
815 multifd_recv_state = NULL;
820 static void *multifd_recv_thread(void *opaque)
822 MultiFDRecvParams *p = opaque;
825 qemu_mutex_lock(&p->mutex);
827 qemu_mutex_unlock(&p->mutex);
830 qemu_mutex_unlock(&p->mutex);
831 qemu_sem_wait(&p->sem);
834 qemu_mutex_lock(&p->mutex);
836 qemu_mutex_unlock(&p->mutex);
841 int multifd_load_setup(void)
846 if (!migrate_use_multifd()) {
849 thread_count = migrate_multifd_channels();
850 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
851 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
852 atomic_set(&multifd_recv_state->count, 0);
853 for (i = 0; i < thread_count; i++) {
854 MultiFDRecvParams *p = &multifd_recv_state->params[i];
856 qemu_mutex_init(&p->mutex);
857 qemu_sem_init(&p->sem, 0);
860 p->name = g_strdup_printf("multifdrecv_%d", i);
865 bool multifd_recv_all_channels_created(void)
867 int thread_count = migrate_multifd_channels();
869 if (!migrate_use_multifd()) {
873 return thread_count == atomic_read(&multifd_recv_state->count);
876 void multifd_recv_new_channel(QIOChannel *ioc)
878 MultiFDRecvParams *p;
879 Error *local_err = NULL;
882 id = multifd_recv_initial_packet(ioc, &local_err);
884 multifd_recv_terminate_threads(local_err);
888 p = &multifd_recv_state->params[id];
890         error_setg(&local_err, "multifd: received id '%d' already setup",
892 multifd_recv_terminate_threads(local_err);
896 object_ref(OBJECT(ioc));
899 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
900 QEMU_THREAD_JOINABLE);
901 atomic_inc(&multifd_recv_state->count);
902 if (multifd_recv_state->count == migrate_multifd_channels()) {
903 migration_incoming_process();
908 * save_page_header: write page header to wire
910 * If this is the 1st block, it also writes the block identification
912 * Returns the number of bytes written
914 * @f: QEMUFile where to send the data
915 * @block: block that contains the page we want to send
916 * @offset: offset inside the block for the page
917 * in the lower bits, it contains flags
919 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
924 if (block == rs->last_sent_block) {
925 offset |= RAM_SAVE_FLAG_CONTINUE;
927 qemu_put_be64(f, offset);
930 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
931 len = strlen(block->idstr);
932 qemu_put_byte(f, len);
933 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
935 rs->last_sent_block = block;
941  * mig_throttle_guest_down: throttle down the guest
943 * Reduce amount of guest cpu execution to hopefully slow down memory
944 * writes. If guest dirty memory rate is reduced below the rate at
945 * which we can transfer pages to the destination then we should be
946 * able to complete migration. Some workloads dirty memory way too
947 * fast and will not effectively converge, even with auto-converge.
949 static void mig_throttle_guest_down(void)
951 MigrationState *s = migrate_get_current();
952 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
953 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
955 /* We have not started throttling yet. Let's start it. */
956 if (!cpu_throttle_active()) {
957 cpu_throttle_set(pct_initial);
959 /* Throttling already on, just increase the rate */
960 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
965 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
967 * @rs: current RAM state
968 * @current_addr: address for the zero page
970 * Update the xbzrle cache to reflect a page that's been sent as all 0.
971 * The important thing is that a stale (not-yet-0'd) page be replaced
973 * As a bonus, if the page wasn't in the cache it gets added so that
974 * when a small write is made into the 0'd page it gets XBZRLE sent.
976 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
978 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
982 /* We don't care if this fails to allocate a new cache page
983 * as long as it updated an old one */
984 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
985 ram_counters.dirty_sync_count);
988 #define ENCODING_FLAG_XBZRLE 0x1
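/* ENCODING_FLAG_XBZRLE is the first byte of the page body that follows
 * a RAM_SAVE_FLAG_XBZRLE page header on the wire. */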
991 * save_xbzrle_page: compress and send current page
993 * Returns: 1 means that we wrote the page
994 * 0 means that page is identical to the one already sent
995 * -1 means that xbzrle would be longer than normal
997 * @rs: current RAM state
998 * @current_data: pointer to the address of the page contents
999 * @current_addr: addr of the page
1000 * @block: block that contains the page we want to send
1001 * @offset: offset inside the block for the page
1002 * @last_stage: if we are at the completion stage
1004 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1005 ram_addr_t current_addr, RAMBlock *block,
1006 ram_addr_t offset, bool last_stage)
1008 int encoded_len = 0, bytes_xbzrle;
1009 uint8_t *prev_cached_page;
1011 if (!cache_is_cached(XBZRLE.cache, current_addr,
1012 ram_counters.dirty_sync_count)) {
1013 xbzrle_counters.cache_miss++;
1015 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1016 ram_counters.dirty_sync_count) == -1) {
1019 /* update *current_data when the page has been
1020 inserted into cache */
1021 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1027 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1029 /* save current buffer into memory */
1030 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1032 /* XBZRLE encoding (if there is no overflow) */
1033 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1034 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1036 if (encoded_len == 0) {
1037 trace_save_xbzrle_page_skipping();
1039 } else if (encoded_len == -1) {
1040 trace_save_xbzrle_page_overflow();
1041 xbzrle_counters.overflow++;
1042 /* update data in the cache */
1044 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1045 *current_data = prev_cached_page;
1050 /* we need to update the data in the cache, in order to get the same data */
1052 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1055 /* Send XBZRLE based compressed page */
1056 bytes_xbzrle = save_page_header(rs, rs->f, block,
1057 offset | RAM_SAVE_FLAG_XBZRLE);
1058 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1059 qemu_put_be16(rs->f, encoded_len);
1060 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
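/* account for the 1-byte encoding flag and the 2-byte encoded_len
 * written above, in addition to the encoded data itself */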
1061 bytes_xbzrle += encoded_len + 1 + 2;
1062 xbzrle_counters.pages++;
1063 xbzrle_counters.bytes += bytes_xbzrle;
1064 ram_counters.transferred += bytes_xbzrle;
1070 * migration_bitmap_find_dirty: find the next dirty page from start
1072 * Called with rcu_read_lock() to protect migration_bitmap
1074 * Returns the byte offset within memory region of the start of a dirty page
1076 * @rs: current RAM state
1077 * @rb: RAMBlock where to search for dirty pages
1078 * @start: page where we start the search
1081 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1082 unsigned long start)
1084 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1085 unsigned long *bitmap = rb->bmap;
1088 if (!qemu_ram_is_migratable(rb)) {
1092 if (rs->ram_bulk_stage && start > 0) {
1095 next = find_next_bit(bitmap, size, start);
1101 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1107 ret = test_and_clear_bit(page, rb->bmap);
1110 rs->migration_dirty_pages--;
1115 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1116 ram_addr_t start, ram_addr_t length)
1118 rs->migration_dirty_pages +=
1119 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1120 &rs->num_dirty_pages_period);
1124 * ram_pagesize_summary: calculate all the pagesizes of a VM
1126 * Returns a summary bitmap of the page sizes of all RAMBlocks
1128 * For VMs with just normal pages this is equivalent to the host page
1129 * size. If it's got some huge pages then it's the OR of all the
1130 * different page sizes.
1132 uint64_t ram_pagesize_summary(void)
1135 uint64_t summary = 0;
1137 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1138 summary |= block->page_size;
1144 static void migration_update_rates(RAMState *rs, int64_t end_time)
1146 uint64_t iter_count = rs->iterations - rs->iterations_prev;
1148 /* calculate period counters */
1149 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1150 / (end_time - rs->time_last_bitmap_sync);
1156 if (migrate_use_xbzrle()) {
1157 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1158 rs->xbzrle_cache_miss_prev) / iter_count;
1159 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1163 static void migration_bitmap_sync(RAMState *rs)
1167 uint64_t bytes_xfer_now;
1169 ram_counters.dirty_sync_count++;
1171 if (!rs->time_last_bitmap_sync) {
1172 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1175 trace_migration_bitmap_sync_start();
1176 memory_global_dirty_log_sync();
1178 qemu_mutex_lock(&rs->bitmap_mutex);
1180 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1181 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1184 qemu_mutex_unlock(&rs->bitmap_mutex);
1186 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1188 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1190     /* more than 1 second = 1000 milliseconds */
1191 if (end_time > rs->time_last_bitmap_sync + 1000) {
1192 bytes_xfer_now = ram_counters.transferred;
1194 /* During block migration the auto-converge logic incorrectly detects
1195 * that ram migration makes no progress. Avoid this by disabling the
1196 * throttling logic during the bulk phase of block migration. */
1197 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1198         /* The following detection logic can be refined later. For now:
1199            Check to see if the dirtied bytes are more than 50% of the approx.
1200            amount of bytes that just got transferred since the last time we
1201            were in this routine. If that happens twice, start or increase
1204 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1205 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1206 (++rs->dirty_rate_high_cnt >= 2)) {
1207 trace_migration_throttle();
1208 rs->dirty_rate_high_cnt = 0;
1209 mig_throttle_guest_down();
1213 migration_update_rates(rs, end_time);
1215 rs->iterations_prev = rs->iterations;
1217 /* reset period counters */
1218 rs->time_last_bitmap_sync = end_time;
1219 rs->num_dirty_pages_period = 0;
1220 rs->bytes_xfer_prev = bytes_xfer_now;
1222 if (migrate_use_events()) {
1223 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1228 * save_zero_page: send the zero page to the stream
1230 * Returns the number of pages written.
1232 * @rs: current RAM state
1233 * @block: block that contains the page we want to send
1234 * @offset: offset inside the block for the page
1236 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1238 uint8_t *p = block->host + offset;
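/* A zero page goes on the wire as just its page header plus a
 * single 0 byte, hence the extra 1 byte accounted below. */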
1241 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1242 ram_counters.duplicate++;
1243 ram_counters.transferred +=
1244 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1245 qemu_put_byte(rs->f, 0);
1246 ram_counters.transferred += 1;
1253 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1255 if (!migrate_release_ram() || !migration_in_postcopy()) {
1259 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1263 * @pages: the number of pages written by the control path,
1265 * > 0 - number of pages written
1267  * Returns true if the page has been saved, otherwise false is returned.
1269 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1272 uint64_t bytes_xmit = 0;
1276 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1278 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1283 ram_counters.transferred += bytes_xmit;
1287 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1291 if (bytes_xmit > 0) {
1292 ram_counters.normal++;
1293 } else if (bytes_xmit == 0) {
1294 ram_counters.duplicate++;
1301 * directly send the page to the stream
1303 * Returns the number of pages written.
1305 * @rs: current RAM state
1306 * @block: block that contains the page we want to send
1307 * @offset: offset inside the block for the page
1308 * @buf: the page to be sent
1309  * @async: send the page asynchronously
1311 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1312 uint8_t *buf, bool async)
1314 ram_counters.transferred += save_page_header(rs, rs->f, block,
1315 offset | RAM_SAVE_FLAG_PAGE);
1317 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1318 migrate_release_ram() &
1319 migration_in_postcopy());
1321 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1323 ram_counters.transferred += TARGET_PAGE_SIZE;
1324 ram_counters.normal++;
1329 * ram_save_page: send the given page to the stream
1331 * Returns the number of pages written.
1333 * >=0 - Number of pages written - this might legally be 0
1334 * if xbzrle noticed the page was the same.
1336 * @rs: current RAM state
1337 * @block: block that contains the page we want to send
1338 * @offset: offset inside the block for the page
1339 * @last_stage: if we are at the completion stage
1341 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1345 bool send_async = true;
1346 RAMBlock *block = pss->block;
1347 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1348 ram_addr_t current_addr = block->offset + offset;
1350 p = block->host + offset;
1351 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1353 XBZRLE_cache_lock();
1354 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1355 migrate_use_xbzrle()) {
1356 pages = save_xbzrle_page(rs, &p, current_addr, block,
1357 offset, last_stage);
1359 /* Can't send this cached data async, since the cache page
1360 * might get updated before it gets to the wire
1366 /* XBZRLE overflow or normal page */
1368 pages = save_normal_page(rs, block, offset, p, send_async);
1371 XBZRLE_cache_unlock();
1376 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1377 ram_addr_t offset, uint8_t *source_buf)
1379 RAMState *rs = ram_state;
1380 int bytes_sent, blen;
1381 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1383 bytes_sent = save_page_header(rs, f, block, offset |
1384 RAM_SAVE_FLAG_COMPRESS_PAGE);
1387  * copy it to an internal buffer to avoid it being modified by the VM,
1388  * so that we can catch the error during compression and
1391 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1392 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1395 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1396 error_report("compressed data failed!");
1399 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1405 static void flush_compressed_data(RAMState *rs)
1407 int idx, len, thread_count;
1409 if (!migrate_use_compression()) {
1412 thread_count = migrate_compress_threads();
1414 qemu_mutex_lock(&comp_done_lock);
1415 for (idx = 0; idx < thread_count; idx++) {
1416 while (!comp_param[idx].done) {
1417 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1420 qemu_mutex_unlock(&comp_done_lock);
1422 for (idx = 0; idx < thread_count; idx++) {
1423 qemu_mutex_lock(&comp_param[idx].mutex);
1424 if (!comp_param[idx].quit) {
1425 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1426 ram_counters.transferred += len;
1428 qemu_mutex_unlock(&comp_param[idx].mutex);
1432 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1435 param->block = block;
1436 param->offset = offset;
1439 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1442 int idx, thread_count, bytes_xmit = -1, pages = -1;
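/* Look for an idle compression thread and hand the page to it; if
 * none is available yet, block on comp_done_cond below until one
 * finishes. */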
1444 thread_count = migrate_compress_threads();
1445 qemu_mutex_lock(&comp_done_lock);
1447 for (idx = 0; idx < thread_count; idx++) {
1448 if (comp_param[idx].done) {
1449 comp_param[idx].done = false;
1450 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1451 qemu_mutex_lock(&comp_param[idx].mutex);
1452 set_compress_params(&comp_param[idx], block, offset);
1453 qemu_cond_signal(&comp_param[idx].cond);
1454 qemu_mutex_unlock(&comp_param[idx].mutex);
1456 ram_counters.normal++;
1457 ram_counters.transferred += bytes_xmit;
1464 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1467 qemu_mutex_unlock(&comp_done_lock);
1473 * find_dirty_block: find the next dirty page and update any state
1474 * associated with the search process.
1476  * Returns true if a page is found
1478 * @rs: current RAM state
1479 * @pss: data about the state of the current dirty page scan
1480 * @again: set to false if the search has scanned the whole of RAM
1482 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1484 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1485 if (pss->complete_round && pss->block == rs->last_seen_block &&
1486 pss->page >= rs->last_page) {
1488 * We've been once around the RAM and haven't found anything.
1494 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1495 /* Didn't find anything in this RAM Block */
1497 pss->block = QLIST_NEXT_RCU(pss->block, next);
1499 /* Hit the end of the list */
1500 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1501 /* Flag that we've looped */
1502 pss->complete_round = true;
1503 rs->ram_bulk_stage = false;
1504 if (migrate_use_xbzrle()) {
1505 /* If xbzrle is on, stop using the data compression at this
1506 * point. In theory, xbzrle can do better than compression.
1508 flush_compressed_data(rs);
1511 /* Didn't find anything this time, but try again on the new block */
1515 /* Can go around again, but... */
1517 /* We've found something so probably don't need to */
1523  * unqueue_page: gets a page off the queue
1525 * Helper for 'get_queued_page' - gets a page off the queue
1527 * Returns the block of the page (or NULL if none available)
1529 * @rs: current RAM state
1530 * @offset: used to return the offset within the RAMBlock
1532 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1534 RAMBlock *block = NULL;
1536 qemu_mutex_lock(&rs->src_page_req_mutex);
1537 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1538 struct RAMSrcPageRequest *entry =
1539 QSIMPLEQ_FIRST(&rs->src_page_requests);
1541 *offset = entry->offset;
1543 if (entry->len > TARGET_PAGE_SIZE) {
1544 entry->len -= TARGET_PAGE_SIZE;
1545 entry->offset += TARGET_PAGE_SIZE;
1547 memory_region_unref(block->mr);
1548 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1552 qemu_mutex_unlock(&rs->src_page_req_mutex);
1558  * get_queued_page: unqueue a page from the postcopy requests
1560 * Skips pages that are already sent (!dirty)
1562  * Returns true if a queued page is found
1564 * @rs: current RAM state
1565 * @pss: data about the state of the current dirty page scan
1567 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1574 block = unqueue_page(rs, &offset);
1576 * We're sending this page, and since it's postcopy nothing else
1577 * will dirty it, and we must make sure it doesn't get sent again
1578 * even if this queue request was received after the background
1579 * search already sent it.
1584 page = offset >> TARGET_PAGE_BITS;
1585 dirty = test_bit(page, block->bmap);
1587 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1588 page, test_bit(page, block->unsentmap));
1590 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1594 } while (block && !dirty);
1598 * As soon as we start servicing pages out of order, then we have
1599 * to kill the bulk stage, since the bulk stage assumes
1600 * in (migration_bitmap_find_and_reset_dirty) that every page is
1601 * dirty, that's no longer true.
1603 rs->ram_bulk_stage = false;
1606 * We want the background search to continue from the queued page
1607 * since the guest is likely to want other pages near to the page
1608 * it just requested.
1611 pss->page = offset >> TARGET_PAGE_BITS;
1618 * migration_page_queue_free: drop any remaining pages in the ram
1621 * It should be empty at the end anyway, but in error cases there may
1622  * be some left. In case any page is left, we drop it.
1625 static void migration_page_queue_free(RAMState *rs)
1627 struct RAMSrcPageRequest *mspr, *next_mspr;
1628 /* This queue generally should be empty - but in the case of a failed
1629  * migration it might have some droppings in.
1632 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1633 memory_region_unref(mspr->rb->mr);
1634 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1641 * ram_save_queue_pages: queue the page for transmission
1643 * A request from postcopy destination for example.
1645 * Returns zero on success or negative on error
1647  * @rbname: Name of the RAMBlock of the request. NULL means the
1648  * same as the last one.
1649 * @start: starting address from the start of the RAMBlock
1650 * @len: length (in bytes) to send
1652 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1655 RAMState *rs = ram_state;
1657 ram_counters.postcopy_requests++;
1660 /* Reuse last RAMBlock */
1661 ramblock = rs->last_req_rb;
1665 * Shouldn't happen, we can't reuse the last RAMBlock if
1666 * it's the 1st request.
1668 error_report("ram_save_queue_pages no previous block");
1672 ramblock = qemu_ram_block_by_name(rbname);
1675 /* We shouldn't be asked for a non-existent RAMBlock */
1676 error_report("ram_save_queue_pages no block '%s'", rbname);
1679 rs->last_req_rb = ramblock;
1681 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1682 if (start+len > ramblock->used_length) {
1683 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1684 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1685 __func__, start, len, ramblock->used_length);
1689 struct RAMSrcPageRequest *new_entry =
1690 g_malloc0(sizeof(struct RAMSrcPageRequest));
1691 new_entry->rb = ramblock;
1692 new_entry->offset = start;
1693 new_entry->len = len;
1695 memory_region_ref(ramblock->mr);
1696 qemu_mutex_lock(&rs->src_page_req_mutex);
1697 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1698 qemu_mutex_unlock(&rs->src_page_req_mutex);
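/*
 * save_page_use_compression: whether this page should go through the
 * compression threads. True only while compression is enabled and
 * either we are still in the bulk stage or xbzrle is disabled; once
 * xbzrle takes over after the first round, compression is skipped.
 */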
1708 static bool save_page_use_compression(RAMState *rs)
1710 if (!migrate_use_compression()) {
1715  * If xbzrle is on, stop using the data compression after the first
1716 * round of migration even if compression is enabled. In theory,
1717 * xbzrle can do better than compression.
1719 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1727 * ram_save_target_page: save one target page
1729 * Returns the number of pages written
1731 * @rs: current RAM state
1732 * @pss: data about the page we want to send
1733 * @last_stage: if we are at the completion stage
1735 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1738 RAMBlock *block = pss->block;
1739 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1742 if (control_save_page(rs, block, offset, &res)) {
1747 * When starting the process of a new block, the first page of
1748 * the block should be sent out before other pages in the same
1749  * block, and all the pages in the last block should have been sent
1750  * out. Keeping this order is important, because the 'cont' flag
1751 * is used to avoid resending the block name.
1753 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1754 flush_compressed_data(rs);
1757 res = save_zero_page(rs, block, offset);
1759 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1760 * page would be stale
1762 if (!save_page_use_compression(rs)) {
1763 XBZRLE_cache_lock();
1764 xbzrle_cache_zero_page(rs, block->offset + offset);
1765 XBZRLE_cache_unlock();
1767 ram_release_pages(block->idstr, offset, res);
1772 * Make sure the first page is sent out before other pages.
1774 * we post it as normal page as compression will take much
1777 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1778 return compress_page_with_multi_thread(rs, block, offset);
1781 return ram_save_page(rs, pss, last_stage);
1785 * ram_save_host_page: save a whole host page
1787 * Starting at *offset send pages up to the end of the current host
1788 * page. It's valid for the initial offset to point into the middle of
1789 * a host page in which case the remainder of the hostpage is sent.
1790 * Only dirty target pages are sent. Note that the host page size may
1791 * be a huge page for this block.
1792 * The saving stops at the boundary of the used_length of the block
1793 * if the RAMBlock isn't a multiple of the host page size.
1795 * Returns the number of pages written or negative on error
1797 * @rs: current RAM state
1799 * @pss: data about the page we want to send
1800 * @last_stage: if we are at the completion stage
1802 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1805 int tmppages, pages = 0;
1806 size_t pagesize_bits =
1807 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
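/* pagesize_bits is the number of target pages per host page for
 * this RAMBlock; greater than 1 when the block's host page size
 * (e.g. a hugepage) exceeds the target page size. */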
1809 if (!qemu_ram_is_migratable(pss->block)) {
1810 error_report("block %s should not be migrated !", pss->block->idstr);
1815     /* Check if the page is dirty, and send it if it is */
1816 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1821 tmppages = ram_save_target_page(rs, pss, last_stage);
1827 if (pss->block->unsentmap) {
1828 clear_bit(pss->page, pss->block->unsentmap);
1832 } while ((pss->page & (pagesize_bits - 1)) &&
1833 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1835 /* The offset we leave with is the last one we looked at */
1841 * ram_find_and_save_block: finds a dirty page and sends it to f
1843 * Called within an RCU critical section.
1845 * Returns the number of pages written where zero means no dirty pages
1847 * @rs: current RAM state
1848 * @last_stage: if we are at the completion stage
1850 * On systems where host-page-size > target-page-size it will send all the
1851 * pages in a host page that are dirty.
1854 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1856 PageSearchStatus pss;
1860 /* No dirty page as there is zero RAM */
1861 if (!ram_bytes_total()) {
1865 pss.block = rs->last_seen_block;
1866 pss.page = rs->last_page;
1867 pss.complete_round = false;
1870 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1875 found = get_queued_page(rs, &pss);
1878 /* priority queue empty, so just search for something dirty */
1879 found = find_dirty_block(rs, &pss, &again);
1883 pages = ram_save_host_page(rs, &pss, last_stage);
1885 } while (!pages && again);
1887 rs->last_seen_block = pss.block;
1888 rs->last_page = pss.page;
1893 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1895 uint64_t pages = size / TARGET_PAGE_SIZE;
1898 ram_counters.duplicate += pages;
1900 ram_counters.normal += pages;
1901 ram_counters.transferred += size;
1902 qemu_update_position(f, size);
1906 uint64_t ram_bytes_total(void)
1912 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1913 total += block->used_length;
1919 static void xbzrle_load_setup(void)
1921 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1924 static void xbzrle_load_cleanup(void)
1926 g_free(XBZRLE.decoded_buf);
1927 XBZRLE.decoded_buf = NULL;
1930 static void ram_state_cleanup(RAMState **rsp)
1933 migration_page_queue_free(*rsp);
1934 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1935 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1941 static void xbzrle_cleanup(void)
1943 XBZRLE_cache_lock();
1945 cache_fini(XBZRLE.cache);
1946 g_free(XBZRLE.encoded_buf);
1947 g_free(XBZRLE.current_buf);
1948 g_free(XBZRLE.zero_target_page);
1949 XBZRLE.cache = NULL;
1950 XBZRLE.encoded_buf = NULL;
1951 XBZRLE.current_buf = NULL;
1952 XBZRLE.zero_target_page = NULL;
1954 XBZRLE_cache_unlock();
1957 static void ram_save_cleanup(void *opaque)
1959 RAMState **rsp = opaque;
1962     /* The caller must hold the iothread lock or be in a bh, so there is
1963      * no writing race against this migration_bitmap
1965 memory_global_dirty_log_stop();
1967 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1968 g_free(block->bmap);
1970 g_free(block->unsentmap);
1971 block->unsentmap = NULL;
1975 compress_threads_save_cleanup();
1976 ram_state_cleanup(rsp);
1979 static void ram_state_reset(RAMState *rs)
1981 rs->last_seen_block = NULL;
1982 rs->last_sent_block = NULL;
1984 rs->last_version = ram_list.version;
1985 rs->ram_bulk_stage = true;
1988 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1991 * 'expected' is the value you expect the bitmap mostly to be full
1992 * of; it won't bother printing lines that are all this value.
1993 * If 'todump' is null the migration bitmap is dumped.
1995 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1996 unsigned long pages)
1999 int64_t linelen = 128;
2002 for (cur = 0; cur < pages; cur += linelen) {
2006 * Last line; catch the case where the line length
2007 * is longer than remaining ram
2009 if (cur + linelen > pages) {
2010 linelen = pages - cur;
2012 for (curb = 0; curb < linelen; curb++) {
2013 bool thisbit = test_bit(cur + curb, todump);
2014 linebuf[curb] = thisbit ? '1' : '.';
2015 found = found || (thisbit != expected);
2018 linebuf[curb] = '\0';
2019 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2024 /* **** functions for postcopy ***** */
2026 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2028 struct RAMBlock *block;
2030 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2031 unsigned long *bitmap = block->bmap;
2032 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2033 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2035 while (run_start < range) {
2036 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2037 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2038 (run_end - run_start) << TARGET_PAGE_BITS);
2039 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2045 * postcopy_send_discard_bm_ram: discard a RAMBlock
2047 * Returns zero on success
2049 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2050 * Note: At this point the 'unsentmap' is the processed bitmap combined
2051 * with the dirtymap; so a '1' means it's either dirty or unsent.
2053 * @ms: current migration state
2054 * @pds: state for postcopy
2055  * @block: block to discard
2058 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2059 PostcopyDiscardState *pds,
2062 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2063 unsigned long current;
2064 unsigned long *unsentmap = block->unsentmap;
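/*
 * Walk the unsentmap as runs of set bits: 'one' is the first
 * unsent/dirty page of a run, 'zero' the first clear bit after it,
 * and each run is sent to the destination as one discard range.
 */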
2066 for (current = 0; current < end; ) {
2067 unsigned long one = find_next_bit(unsentmap, end, current);
2070 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2071 unsigned long discard_length;
2074 discard_length = end - one;
2076 discard_length = zero - one;
2078 if (discard_length) {
2079 postcopy_discard_send_range(ms, pds, one, discard_length);
2081 current = one + discard_length;
2091 * postcopy_each_ram_send_discard: discard all RAMBlocks
2093 * Returns 0 for success or negative for error
2095 * Utility for the outgoing postcopy code.
2096 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2097 * passing it bitmap indexes and name.
2098 * (qemu_ram_foreach_block ends up passing unscaled lengths
2099 * which would mean postcopy code would have to deal with target page)
2101 * @ms: current migration state
2103 static int postcopy_each_ram_send_discard(MigrationState *ms)
2105 struct RAMBlock *block;
2108 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2109 PostcopyDiscardState *pds =
2110 postcopy_discard_send_init(ms, block->idstr);
2113 * Postcopy sends chunks of bitmap over the wire, but it
2114 * just needs indexes at this point, avoids it having
2115 * target page specific code.
2117 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2118 postcopy_discard_send_finish(ms, pds);
2128  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2130 * Helper for postcopy_chunk_hostpages; it's called twice to
2131 * canonicalize the two bitmaps, that are similar, but one is
2134 * Postcopy requires that all target pages in a hostpage are dirty or
2135 * clean, not a mix. This function canonicalizes the bitmaps.
2137 * @ms: current migration state
2138 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2139 * otherwise we need to canonicalize partially dirty host pages
2140 * @block: block that contains the page we want to canonicalize
2141 * @pds: state for postcopy
2143 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2145 PostcopyDiscardState *pds)
2147 RAMState *rs = ram_state;
2148 unsigned long *bitmap = block->bmap;
2149 unsigned long *unsentmap = block->unsentmap;
2150 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2151 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2152 unsigned long run_start;
2154 if (block->page_size == TARGET_PAGE_SIZE) {
2155 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2160 /* Find a sent page */
2161 run_start = find_next_zero_bit(unsentmap, pages, 0);
2163 /* Find a dirty page */
2164 run_start = find_next_bit(bitmap, pages, 0);
2167 while (run_start < pages) {
2168 bool do_fixup = false;
2169 unsigned long fixup_start_addr;
2170 unsigned long host_offset;
2173 * If the start of this run of pages is in the middle of a host
2174 * page, then we need to fixup this host page.
2176 host_offset = run_start % host_ratio;
2179 run_start -= host_offset;
2180 fixup_start_addr = run_start;
2181 /* For the next pass */
2182 run_start = run_start + host_ratio;
2184 /* Find the end of this run */
2185 unsigned long run_end;
2187 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2189 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2192 * If the end isn't at the start of a host page, then the
2193 * run doesn't finish at the end of a host page
2194 * and we need to discard.
2196 host_offset = run_end % host_ratio;
2199 fixup_start_addr = run_end - host_offset;
2201 * This host page has gone, the next loop iteration starts
2202 * from after the fixup
2204 run_start = fixup_start_addr + host_ratio;
2207 * No discards on this iteration, next loop starts from
2208 * next sent/dirty page
2210 run_start = run_end + 1;
2217 /* Tell the destination to discard this page */
2218 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2219 /* For the unsent_pass we:
2220 * discard partially sent pages
2221 * For the !unsent_pass (dirty) we:
2222 * discard partially dirty pages that were sent
2223 * (any partially sent pages were already discarded
2224 * by the previous unsent_pass)
2226 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2230 /* Clean up the bitmap */
2231 for (page = fixup_start_addr;
2232 page < fixup_start_addr + host_ratio; page++) {
2233 /* All pages in this host page are now not sent */
2234 set_bit(page, unsentmap);
2237 * Remark them as dirty, updating the count for any pages
2238 * that weren't previously dirty.
2240 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2245 /* Find the next sent page for the next iteration */
2246 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2248 /* Find the next dirty page for the next iteration */
2249 run_start = find_next_bit(bitmap, pages, run_start);
2255  * postcopy_chunk_hostpages: discard any partially sent host page
2257 * Utility for the outgoing postcopy code.
2259 * Discard any partially sent host-page size chunks, mark any partially
2260 * dirty host-page size chunks as all dirty. In this case the host-page
2261 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2263 * Returns zero on success
2265 * @ms: current migration state
2266 * @block: block we want to work with
2268 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2270 PostcopyDiscardState *pds =
2271 postcopy_discard_send_init(ms, block->idstr);
2273 /* First pass: Discard all partially sent host pages */
2274 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2276 * Second pass: Ensure that all partially dirty host pages are made
2279 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2281 postcopy_discard_send_finish(ms, pds);
2286 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2288 * Returns zero on success
2290  * Transmit the set of pages to be discarded after precopy to the target;
2291 * these are pages that:
2292 * a) Have been previously transmitted but are now dirty again
2293  * b) Pages that have never been transmitted; this ensures that
2294 * any pages on the destination that have been mapped by background
2295 * tasks get discarded (transparent huge pages is the specific concern)
2296 * Hopefully this is pretty sparse
2298 * @ms: current migration state
2300 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2302 RAMState *rs = ram_state;
2308 /* This should be our last sync, the src is now paused */
2309 migration_bitmap_sync(rs);
2311 /* Easiest way to make sure we don't resume in the middle of a host-page */
2312 rs->last_seen_block = NULL;
2313 rs->last_sent_block = NULL;
2316 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2317 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2318 unsigned long *bitmap = block->bmap;
2319 unsigned long *unsentmap = block->unsentmap;
2322 /* We don't have a safe way to resize the sentmap, so
2323 * if the bitmap was resized it will be NULL at this
2326 error_report("migration ram resized during precopy phase");
2330 /* Deal with TPS != HPS and huge pages */
2331 ret = postcopy_chunk_hostpages(ms, block);
2338 * Update the unsentmap to be unsentmap = unsentmap | dirty
2340 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2341 #ifdef DEBUG_POSTCOPY
2342 ram_debug_dump_bitmap(unsentmap, true, pages);
2345 trace_ram_postcopy_send_discard_bitmap();
2347 ret = postcopy_each_ram_send_discard(ms);
2354 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2356 * Returns zero on success
2358 * @rbname: name of the RAMBlock of the request. NULL means the
2359  * same as the last one.
2360 * @start: RAMBlock starting page
2361 * @length: RAMBlock size
2363 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2367 trace_ram_discard_range(rbname, start, length);
2370 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2373 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2377 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2378 length >> qemu_target_page_bits());
2379 ret = ram_block_discard_range(rb, start, length);
2388 * For every allocation, we will try not to crash the VM if the
2389 * allocation fails.
2391 static int xbzrle_init(void)
2393 Error *local_err = NULL;
2395 if (!migrate_use_xbzrle()) {
2399 XBZRLE_cache_lock();
2401 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2402 if (!XBZRLE.zero_target_page) {
2403 error_report("%s: Error allocating zero page", __func__);
2407 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2408 TARGET_PAGE_SIZE, &local_err);
2409 if (!XBZRLE.cache) {
2410 error_report_err(local_err);
2411 goto free_zero_page;
2414 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2415 if (!XBZRLE.encoded_buf) {
2416 error_report("%s: Error allocating encoded_buf", __func__);
2420 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2421 if (!XBZRLE.current_buf) {
2422 error_report("%s: Error allocating current_buf", __func__);
2423 goto free_encoded_buf;
2426 /* We are all good */
2427 XBZRLE_cache_unlock();
2431 g_free(XBZRLE.encoded_buf);
2432 XBZRLE.encoded_buf = NULL;
2434 cache_fini(XBZRLE.cache);
2435 XBZRLE.cache = NULL;
2437 g_free(XBZRLE.zero_target_page);
2438 XBZRLE.zero_target_page = NULL;
2440 XBZRLE_cache_unlock();
2444 static int ram_state_init(RAMState **rsp)
2446 *rsp = g_try_new0(RAMState, 1);
2449 error_report("%s: Init ramstate fail", __func__);
2453 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2454 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2455 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2458 * Count the total number of pages used by ram blocks not including any
2459 * gaps due to alignment or unplugs.
2461 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2463 ram_state_reset(*rsp);
2468 static void ram_list_init_bitmaps(void)
2471 unsigned long pages;
2473 /* Skip setting bitmap if there is no RAM */
2474 if (ram_bytes_total()) {
2475 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2476 pages = block->max_length >> TARGET_PAGE_BITS;
2477 block->bmap = bitmap_new(pages);
2478 bitmap_set(block->bmap, 0, pages);
2479 if (migrate_postcopy_ram()) {
2480 block->unsentmap = bitmap_new(pages);
2481 bitmap_set(block->unsentmap, 0, pages);
2487 static void ram_init_bitmaps(RAMState *rs)
2489 /* For memory_global_dirty_log_start below. */
2490 qemu_mutex_lock_iothread();
2491 qemu_mutex_lock_ramlist();
2494 ram_list_init_bitmaps();
2495 memory_global_dirty_log_start();
2496 migration_bitmap_sync(rs);
2499 qemu_mutex_unlock_ramlist();
2500 qemu_mutex_unlock_iothread();
2503 static int ram_init_all(RAMState **rsp)
2505 if (ram_state_init(rsp)) {
2509 if (xbzrle_init()) {
2510 ram_state_cleanup(rsp);
2514 ram_init_bitmaps(*rsp);
2519 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2525 * Postcopy is not using xbzrle/compression, so no need for that.
2526 * Also, since the source is already halted, we don't need to care
2527 * about dirty page logging either.
2530 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2531 pages += bitmap_count_one(block->bmap,
2532 block->used_length >> TARGET_PAGE_BITS);
2535 /* This may not be aligned with current bitmaps. Recalculate. */
2536 rs->migration_dirty_pages = pages;
2538 rs->last_seen_block = NULL;
2539 rs->last_sent_block = NULL;
2541 rs->last_version = ram_list.version;
2543 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2544 * matter what we have sent.
2546 rs->ram_bulk_stage = false;
2548 /* Update RAMState cache of output QEMUFile */
2551 trace_ram_state_resume_prepare(pages);
2555 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2556 * a long-running RCU critical section. When RCU reclaims in the code
2557 * start to become numerous, it will be necessary to reduce the
2558 * granularity of these critical sections.
2562 * ram_save_setup: Setup RAM for migration
2564 * Returns zero to indicate success and negative for error
2566 * @f: QEMUFile where to send the data
2567 * @opaque: RAMState pointer
2569 static int ram_save_setup(QEMUFile *f, void *opaque)
2571 RAMState **rsp = opaque;
2574 if (compress_threads_save_setup()) {
2578 /* migration has already setup the bitmap, reuse it. */
2579 if (!migration_in_colo_state()) {
2580 if (ram_init_all(rsp) != 0) {
2581 compress_threads_save_cleanup();
2589 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2591 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2592 qemu_put_byte(f, strlen(block->idstr));
2593 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2594 qemu_put_be64(f, block->used_length);
2595 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2596 qemu_put_be64(f, block->page_size);
2602 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2603 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2605 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
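/*
 * Illustration only, not part of the original ram.c: the stream packs
 * RAM_SAVE_FLAG_* bits into the low bits of the 64-bit address/size word,
 * which are zero for a page-aligned value - that is what the
 * qemu_put_be64(f, ... | RAM_SAVE_FLAG_MEM_SIZE) above and the
 * "flags = addr & ~TARGET_PAGE_MASK" on the load side assume.  Standalone
 * sketch with a hypothetical 4 KiB target page and the PAGE flag.
 */
#include <stdint.h>
#include <stdio.h>

#define TP_SIZE   0x1000ULL            /* hypothetical TARGET_PAGE_SIZE */
#define TP_MASK   (~(TP_SIZE - 1))     /* hypothetical TARGET_PAGE_MASK */
#define FLAG_PAGE 0x08ULL              /* mirrors RAM_SAVE_FLAG_PAGE */

int main(void)
{
    uint64_t addr = 0x7f0000ULL;       /* page-aligned offset in a block */
    uint64_t word = addr | FLAG_PAGE;  /* what the source writes */

    uint64_t flags = word & ~TP_MASK;  /* what the destination recovers */
    uint64_t offset = word & TP_MASK;

    printf("flags=%#llx offset=%#llx\n",
           (unsigned long long)flags, (unsigned long long)offset);
    return 0;
}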
2611 * ram_save_iterate: iterative stage for migration
2613 * Returns zero to indicate success and negative for error
2615 * @f: QEMUFile where to send the data
2616 * @opaque: RAMState pointer
2618 static int ram_save_iterate(QEMUFile *f, void *opaque)
2620 RAMState **temp = opaque;
2621 RAMState *rs = *temp;
2627 if (blk_mig_bulk_active()) {
2628 /* Avoid transferring ram during bulk phase of block migration as
2629 * the bulk phase will usually take a long time and transferring
2630 * ram updates during that time is pointless. */
2635 if (ram_list.version != rs->last_version) {
2636 ram_state_reset(rs);
2639 /* Read version before ram_list.blocks */
2642 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2644 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2646 while ((ret = qemu_file_rate_limit(f)) == 0) {
2649 pages = ram_find_and_save_block(rs, false);
2650 /* no more pages to send */
2657 /* we want to check in the 1st loop, just in case it was the 1st time
2658 and we had to sync the dirty bitmap.
2659 qemu_get_clock_ns() is a bit expensive, so we only check every few iterations.
2662 if ((i & 63) == 0) {
2663 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2664 if (t1 > MAX_WAIT) {
2665 trace_ram_save_iterate_big_wait(t1, i);
2671 flush_compressed_data(rs);
2675 * Must occur before EOS (or any QEMUFile operation)
2676 * because of RDMA protocol.
2678 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2681 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2682 ram_counters.transferred += 8;
2684 ret = qemu_file_get_error(f);
2693 * ram_save_complete: function called to send the remaining amount of ram
2695 * Returns zero to indicate success
2697 * Called with iothread lock
2699 * @f: QEMUFile where to send the data
2700 * @opaque: RAMState pointer
2702 static int ram_save_complete(QEMUFile *f, void *opaque)
2704 RAMState **temp = opaque;
2705 RAMState *rs = *temp;
2709 if (!migration_in_postcopy()) {
2710 migration_bitmap_sync(rs);
2713 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2715 /* try transferring iterative blocks of memory */
2717 /* flush all remaining blocks regardless of rate limiting */
2721 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2722 /* no more blocks to send */
2728 flush_compressed_data(rs);
2729 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2733 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2738 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2739 uint64_t *res_precopy_only,
2740 uint64_t *res_compatible,
2741 uint64_t *res_postcopy_only)
2743 RAMState **temp = opaque;
2744 RAMState *rs = *temp;
2745 uint64_t remaining_size;
2747 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2749 if (!migration_in_postcopy() &&
2750 remaining_size < max_size) {
2751 qemu_mutex_lock_iothread();
2753 migration_bitmap_sync(rs);
2755 qemu_mutex_unlock_iothread();
2756 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2759 if (migrate_postcopy_ram()) {
2760 /* We can do postcopy, and all the data is postcopiable */
2761 *res_compatible += remaining_size;
2763 *res_precopy_only += remaining_size;
2767 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2769 unsigned int xh_len;
2771 uint8_t *loaded_data;
2773 /* extract RLE header */
2774 xh_flags = qemu_get_byte(f);
2775 xh_len = qemu_get_be16(f);
2777 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2778 error_report("Failed to load XBZRLE page - wrong compression!");
2782 if (xh_len > TARGET_PAGE_SIZE) {
2783 error_report("Failed to load XBZRLE page - len overflow!");
2786 loaded_data = XBZRLE.decoded_buf;
2787 /* load data and decode */
2788 /* it can change loaded_data to point to an internal buffer */
2789 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2792 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2793 TARGET_PAGE_SIZE) == -1) {
2794 error_report("Failed to load XBZRLE page - decode error!");
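/*
 * Illustration only, not part of the original ram.c: the on-the-wire framing
 * that load_xbzrle() expects is a one-byte encoding flag, a big-endian
 * 16-bit encoded length, then that many bytes of XBZRLE data.  Standalone
 * parse of the two-field header from a raw buffer; ENC_XBZRLE is an assumed
 * stand-in for ENCODING_FLAG_XBZRLE.
 */
#include <stdint.h>
#include <stdio.h>

#define ENC_XBZRLE 0x1   /* assumed value, for this sketch only */

int main(void)
{
    /* header says 300 encoded bytes would follow on the real stream */
    const uint8_t wire[] = { ENC_XBZRLE, 0x01, 0x2c };

    uint8_t flags = wire[0];
    uint16_t len = (uint16_t)((wire[1] << 8) | wire[2]);   /* big endian */

    if (flags != ENC_XBZRLE) {
        fprintf(stderr, "wrong compression flag\n");
        return 1;
    }
    printf("encoded payload length: %u bytes\n", len);
    return 0;
}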
2802 * ram_block_from_stream: read a RAMBlock id from the migration stream
2804 * Must be called from within a rcu critical section.
2806 * Returns a pointer from within the RCU-protected ram_list.
2808 * @f: QEMUFile where to read the data from
2809 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2811 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2813 static RAMBlock *block = NULL;
2817 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2819 error_report("Ack, bad migration stream!");
2825 len = qemu_get_byte(f);
2826 qemu_get_buffer(f, (uint8_t *)id, len);
2829 block = qemu_ram_block_by_name(id);
2831 error_report("Can't find block %s", id);
2835 if (!qemu_ram_is_migratable(block)) {
2836 error_report("block %s should not be migrated !", id);
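/*
 * Illustration only, not part of the original ram.c: the CONTINUE
 * optimisation seen above from the sender's point of view.  When a page
 * comes from the same RAMBlock as the previous one, the source sets
 * RAM_SAVE_FLAG_CONTINUE and omits the idstr, which is why the loader can
 * cache the block pointer in a static variable.  The "stream" here is just
 * a printf; flag values mirror the RAM_SAVE_FLAG_* definitions, and
 * send_page_header() is a toy stand-in for the real sender-side helper.
 */
#include <stdio.h>
#include <string.h>

#define FLAG_CONTINUE 0x20u   /* mirrors RAM_SAVE_FLAG_CONTINUE */

static const char *last_block;

static void send_page_header(const char *block_id, unsigned flags)
{
    if (last_block && strcmp(block_id, last_block) == 0) {
        flags |= FLAG_CONTINUE;
        printf("flags=%#x (idstr omitted)\n", flags);
    } else {
        printf("flags=%#x idstr=%s\n", flags, block_id);
    }
    last_block = block_id;
}

int main(void)
{
    send_page_header("pc.ram", 0x08);   /* first page: idstr included */
    send_page_header("pc.ram", 0x08);   /* same block: CONTINUE, no idstr */
    return 0;
}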
2843 static inline void *host_from_ram_block_offset(RAMBlock *block,
2846 if (!offset_in_ramblock(block, offset)) {
2850 return block->host + offset;
2854 * ram_handle_compressed: handle the zero page case
2856 * If a page (or a whole RDMA chunk) has been
2857 * determined to be zero, then zap it.
2859 * @host: host address for the zero page
2860 * @ch: what the page is filled from. We only support zero
2861 * @size: size of the zero page
2863 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2865 if (ch != 0 || !is_zero_range(host, size)) {
2866 memset(host, ch, size);
2870 /* return the size after decompression, or negative value on error */
2872 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2873 const uint8_t *source, size_t source_len)
2877 err = inflateReset(stream);
2882 stream->avail_in = source_len;
2883 stream->next_in = (uint8_t *)source;
2884 stream->avail_out = dest_len;
2885 stream->next_out = dest;
2887 err = inflate(stream, Z_NO_FLUSH);
2888 if (err != Z_STREAM_END) {
2892 return stream->total_out;
2895 static void *do_data_decompress(void *opaque)
2897 DecompressParam *param = opaque;
2898 unsigned long pagesize;
2902 qemu_mutex_lock(¶m->mutex);
2903 while (!param->quit) {
2908 qemu_mutex_unlock(¶m->mutex);
2910 pagesize = TARGET_PAGE_SIZE;
2912 ret = qemu_uncompress_data(¶m->stream, des, pagesize,
2913 param->compbuf, len);
2914 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2915 error_report("decompress data failed");
2916 qemu_file_set_error(decomp_file, ret);
2919 qemu_mutex_lock(&decomp_done_lock);
2921 qemu_cond_signal(&decomp_done_cond);
2922 qemu_mutex_unlock(&decomp_done_lock);
2924 qemu_mutex_lock(&param->mutex);
2926 qemu_cond_wait(&param->cond, &param->mutex);
2929 qemu_mutex_unlock(&param->mutex);
2934 static int wait_for_decompress_done(void)
2936 int idx, thread_count;
2938 if (!migrate_use_compression()) {
2942 thread_count = migrate_decompress_threads();
2943 qemu_mutex_lock(&decomp_done_lock);
2944 for (idx = 0; idx < thread_count; idx++) {
2945 while (!decomp_param[idx].done) {
2946 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2949 qemu_mutex_unlock(&decomp_done_lock);
2950 return qemu_file_get_error(decomp_file);
2953 static void compress_threads_load_cleanup(void)
2955 int i, thread_count;
2957 if (!migrate_use_compression()) {
2960 thread_count = migrate_decompress_threads();
2961 for (i = 0; i < thread_count; i++) {
2963 * we use it as an indicator of whether the thread is
2964 * properly initialized or not
2966 if (!decomp_param[i].compbuf) {
2970 qemu_mutex_lock(&decomp_param[i].mutex);
2971 decomp_param[i].quit = true;
2972 qemu_cond_signal(&decomp_param[i].cond);
2973 qemu_mutex_unlock(&decomp_param[i].mutex);
2975 for (i = 0; i < thread_count; i++) {
2976 if (!decomp_param[i].compbuf) {
2980 qemu_thread_join(decompress_threads + i);
2981 qemu_mutex_destroy(&decomp_param[i].mutex);
2982 qemu_cond_destroy(&decomp_param[i].cond);
2983 inflateEnd(&decomp_param[i].stream);
2984 g_free(decomp_param[i].compbuf);
2985 decomp_param[i].compbuf = NULL;
2987 g_free(decompress_threads);
2988 g_free(decomp_param);
2989 decompress_threads = NULL;
2990 decomp_param = NULL;
2994 static int compress_threads_load_setup(QEMUFile *f)
2996 int i, thread_count;
2998 if (!migrate_use_compression()) {
3002 thread_count = migrate_decompress_threads();
3003 decompress_threads = g_new0(QemuThread, thread_count);
3004 decomp_param = g_new0(DecompressParam, thread_count);
3005 qemu_mutex_init(&decomp_done_lock);
3006 qemu_cond_init(&decomp_done_cond);
3008 for (i = 0; i < thread_count; i++) {
3009 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3013 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3014 qemu_mutex_init(&decomp_param[i].mutex);
3015 qemu_cond_init(&decomp_param[i].cond);
3016 decomp_param[i].done = true;
3017 decomp_param[i].quit = false;
3018 qemu_thread_create(decompress_threads + i, "decompress",
3019 do_data_decompress, decomp_param + i,
3020 QEMU_THREAD_JOINABLE);
3024 compress_threads_load_cleanup();
3028 static void decompress_data_with_multi_threads(QEMUFile *f,
3029 void *host, int len)
3031 int idx, thread_count;
3033 thread_count = migrate_decompress_threads();
3034 qemu_mutex_lock(&decomp_done_lock);
3036 for (idx = 0; idx < thread_count; idx++) {
3037 if (decomp_param[idx].done) {
3038 decomp_param[idx].done = false;
3039 qemu_mutex_lock(&decomp_param[idx].mutex);
3040 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3041 decomp_param[idx].des = host;
3042 decomp_param[idx].len = len;
3043 qemu_cond_signal(&decomp_param[idx].cond);
3044 qemu_mutex_unlock(&decomp_param[idx].mutex);
3048 if (idx < thread_count) {
3051 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3054 qemu_mutex_unlock(&decomp_done_lock);
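/*
 * Illustration only, not part of the original ram.c: the idle-worker
 * hand-off used above, reduced to one worker and plain pthreads.  The
 * dispatcher marks the slot busy (done = false), signals the worker, and
 * waits on a shared "done" condvar when no slot is free - the same shape as
 * decomp_done_cond/decomp_done_lock.  All names here are hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work_cond = PTHREAD_COND_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static bool done = true, quit = false;
static int job = -1;

static void *worker(void *arg)
{
    pthread_mutex_lock(&lock);
    while (!quit) {
        if (job >= 0) {
            int j = job;
            job = -1;
            pthread_mutex_unlock(&lock);
            printf("decompressed job %d\n", j);  /* stands in for inflate() */
            pthread_mutex_lock(&lock);
            done = true;                         /* slot is idle again */
            pthread_cond_signal(&done_cond);
        } else {
            pthread_cond_wait(&work_cond, &lock);
        }
    }
    pthread_mutex_unlock(&lock);
    return NULL;
}

static void dispatch(int j)
{
    pthread_mutex_lock(&lock);
    while (!done) {                    /* no idle worker: wait for one */
        pthread_cond_wait(&done_cond, &lock);
    }
    done = false;
    job = j;
    pthread_cond_signal(&work_cond);
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);
    for (int i = 0; i < 3; i++) {
        dispatch(i);
    }
    pthread_mutex_lock(&lock);
    while (!done) {                    /* same shape as wait_for_decompress_done() */
        pthread_cond_wait(&done_cond, &lock);
    }
    quit = true;
    pthread_cond_signal(&work_cond);
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    return 0;
}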
3058 * ram_load_setup: Setup RAM for migration incoming side
3060 * Returns zero to indicate success and negative for error
3062 * @f: QEMUFile where to receive the data
3063 * @opaque: RAMState pointer
3065 static int ram_load_setup(QEMUFile *f, void *opaque)
3067 if (compress_threads_load_setup(f)) {
3071 xbzrle_load_setup();
3072 ramblock_recv_map_init();
3076 static int ram_load_cleanup(void *opaque)
3079 xbzrle_load_cleanup();
3080 compress_threads_load_cleanup();
3082 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3083 g_free(rb->receivedmap);
3084 rb->receivedmap = NULL;
3090 * ram_postcopy_incoming_init: allocate postcopy data structures
3092 * Returns 0 for success and negative on error
3094 * @mis: current migration incoming state
3096 * Allocate data structures etc needed by incoming migration with
3097 * postcopy-ram. postcopy-ram's similarly named
3098 * postcopy_ram_incoming_init does the work.
3100 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3102 unsigned long ram_pages = last_ram_page();
3104 return postcopy_ram_incoming_init(mis, ram_pages);
3108 * ram_load_postcopy: load a page in postcopy case
3110 * Returns 0 for success or -errno in case of error
3112 * Called in postcopy mode by ram_load().
3113 * rcu_read_lock is taken prior to this being called.
3115 * @f: QEMUFile where to send the data
3117 static int ram_load_postcopy(QEMUFile *f)
3119 int flags = 0, ret = 0;
3120 bool place_needed = false;
3121 bool matching_page_sizes = false;
3122 MigrationIncomingState *mis = migration_incoming_get_current();
3123 /* Temporary page that is later 'placed' */
3124 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3125 void *last_host = NULL;
3126 bool all_zero = false;
3128 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3131 void *page_buffer = NULL;
3132 void *place_source = NULL;
3133 RAMBlock *block = NULL;
3136 addr = qemu_get_be64(f);
3139 * If qemu file error, we should stop here, and then "addr" is invalid.
3142 ret = qemu_file_get_error(f);
3147 flags = addr & ~TARGET_PAGE_MASK;
3148 addr &= TARGET_PAGE_MASK;
3150 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3151 place_needed = false;
3152 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3153 block = ram_block_from_stream(f, flags);
3155 host = host_from_ram_block_offset(block, addr);
3157 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3161 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3163 * Postcopy requires that we place whole host pages atomically;
3164 * these may be huge pages for RAMBlocks that are backed by
3166 * To make it atomic, the data is read into a temporary page
3167 * that's moved into place later.
3168 * The migration protocol uses, possibly smaller, target pages;
3169 * however, the source ensures it always sends all the components
3170 * of a host page in order.
3172 page_buffer = postcopy_host_page +
3173 ((uintptr_t)host & (block->page_size - 1));
3174 /* If all TP are zero then we can optimise the place */
3175 if (!((uintptr_t)host & (block->page_size - 1))) {
3178 /* not the 1st TP within the HP */
3179 if (host != (last_host + TARGET_PAGE_SIZE)) {
3180 error_report("Non-sequential target page %p/%p",
3189 * If it's the last part of a host page then we place the host page.
3192 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3193 (block->page_size - 1)) == 0;
3194 place_source = postcopy_host_page;
3198 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3199 case RAM_SAVE_FLAG_ZERO:
3200 ch = qemu_get_byte(f);
3201 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3207 case RAM_SAVE_FLAG_PAGE:
3209 if (!place_needed || !matching_page_sizes) {
3210 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3212 /* Avoids the qemu_file copy during postcopy, which is
3213 * going to do a copy later; can only do it when we
3214 * do this read in one go (matching page sizes)
3216 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3220 case RAM_SAVE_FLAG_EOS:
3224 error_report("Unknown combination of migration flags: %#x"
3225 " (postcopy mode)", flags);
3230 /* Check for any possible file errors */
3231 if (!ret && qemu_file_get_error(f)) {
3232 ret = qemu_file_get_error(f);
3235 if (!ret && place_needed) {
3236 /* This gets called at the last target page in the host page */
3237 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3240 ret = postcopy_place_page_zero(mis, place_dest,
3243 ret = postcopy_place_page(mis, place_dest,
3244 place_source, block);
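/*
 * Illustration only, not part of the original ram.c: when the RAMBlock is
 * backed by 2 MiB huge pages and the target page is 4 KiB (both assumed
 * here), the loop above buffers 512 target pages into postcopy_host_page
 * and only "places" on the last one.  Standalone check of the
 * place_needed / place_dest arithmetic with those sizes.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uintptr_t tp = 0x1000;       /* assumed TARGET_PAGE_SIZE */
    const uintptr_t hp = 0x200000;     /* assumed block->page_size (2 MiB) */
    uintptr_t hp_start = 0x40000000;   /* start of some host page */
    int placed = 0;

    for (uintptr_t host = hp_start; host < hp_start + hp; host += tp) {
        int place_needed = ((host + tp) & (hp - 1)) == 0;

        if (place_needed) {
            uintptr_t place_dest = host + tp - hp;  /* back to hp_start */
            printf("place at %#lx after %lu target pages\n",
                   (unsigned long)place_dest,
                   (unsigned long)((host - hp_start) / tp + 1));
            placed = 1;
        }
    }
    return placed ? 0 : 1;
}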
3252 static bool postcopy_is_advised(void)
3254 PostcopyState ps = postcopy_state_get();
3255 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3258 static bool postcopy_is_running(void)
3260 PostcopyState ps = postcopy_state_get();
3261 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3264 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3266 int flags = 0, ret = 0, invalid_flags = 0;
3267 static uint64_t seq_iter;
3270 * If the system is running in postcopy mode, page inserts to host memory must be atomic.
3273 bool postcopy_running = postcopy_is_running();
3274 /* ADVISE is earlier; it shows the source has the postcopy capability on */
3275 bool postcopy_advised = postcopy_is_advised();
3279 if (version_id != 4) {
3283 if (!migrate_use_compression()) {
3284 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3286 /* This RCU critical section can be very long running.
3287 * When RCU reclaims in the code start to become numerous,
3288 * it will be necessary to reduce the granularity of this critical section.
3293 if (postcopy_running) {
3294 ret = ram_load_postcopy(f);
3297 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3298 ram_addr_t addr, total_ram_bytes;
3302 addr = qemu_get_be64(f);
3303 flags = addr & ~TARGET_PAGE_MASK;
3304 addr &= TARGET_PAGE_MASK;
3306 if (flags & invalid_flags) {
3307 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3308 error_report("Received an unexpected compressed page");
3315 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3316 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3317 RAMBlock *block = ram_block_from_stream(f, flags);
3319 host = host_from_ram_block_offset(block, addr);
3321 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3325 ramblock_recv_bitmap_set(block, host);
3326 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3329 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3330 case RAM_SAVE_FLAG_MEM_SIZE:
3331 /* Synchronize RAM block list */
3332 total_ram_bytes = addr;
3333 while (!ret && total_ram_bytes) {
3338 len = qemu_get_byte(f);
3339 qemu_get_buffer(f, (uint8_t *)id, len);
3341 length = qemu_get_be64(f);
3343 block = qemu_ram_block_by_name(id);
3344 if (block && !qemu_ram_is_migratable(block)) {
3345 error_report("block %s should not be migrated !", id);
3348 if (length != block->used_length) {
3349 Error *local_err = NULL;
3351 ret = qemu_ram_resize(block, length,
3354 error_report_err(local_err);
3357 /* For postcopy we need to check hugepage sizes match */
3358 if (postcopy_advised &&
3359 block->page_size != qemu_host_page_size) {
3360 uint64_t remote_page_size = qemu_get_be64(f);
3361 if (remote_page_size != block->page_size) {
3362 error_report("Mismatched RAM page size %s "
3363 "(local) %zd != %" PRId64,
3364 id, block->page_size,
3369 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3372 error_report("Unknown ramblock \"%s\", cannot "
3373 "accept migration", id);
3377 total_ram_bytes -= length;
3381 case RAM_SAVE_FLAG_ZERO:
3382 ch = qemu_get_byte(f);
3383 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3386 case RAM_SAVE_FLAG_PAGE:
3387 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3390 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3391 len = qemu_get_be32(f);
3392 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3393 error_report("Invalid compressed data length: %d", len);
3397 decompress_data_with_multi_threads(f, host, len);
3400 case RAM_SAVE_FLAG_XBZRLE:
3401 if (load_xbzrle(f, addr, host) < 0) {
3402 error_report("Failed to decompress XBZRLE page at "
3403 RAM_ADDR_FMT, addr);
3408 case RAM_SAVE_FLAG_EOS:
3412 if (flags & RAM_SAVE_FLAG_HOOK) {
3413 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3415 error_report("Unknown combination of migration flags: %#x",
3421 ret = qemu_file_get_error(f);
3425 ret |= wait_for_decompress_done();
3427 trace_ram_load_complete(ret, seq_iter);
3431 static bool ram_has_postcopy(void *opaque)
3433 return migrate_postcopy_ram();
3436 /* Sync all the dirty bitmaps with the destination VM. */
3437 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3440 QEMUFile *file = s->to_dst_file;
3441 int ramblock_count = 0;
3443 trace_ram_dirty_bitmap_sync_start();
3445 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3446 qemu_savevm_send_recv_bitmap(file, block->idstr);
3447 trace_ram_dirty_bitmap_request(block->idstr);
3451 trace_ram_dirty_bitmap_sync_wait();
3453 /* Wait until all the ramblocks' dirty bitmaps have been synced */
3454 while (ramblock_count--) {
3455 qemu_sem_wait(&s->rp_state.rp_sem);
3458 trace_ram_dirty_bitmap_sync_complete();
3463 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3465 qemu_sem_post(&s->rp_state.rp_sem);
3469 * Read the received bitmap and invert it to form the initial dirty bitmap.
3470 * This is only used when the postcopy migration is paused but wants
3471 * to resume from a middle point.
3473 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3476 QEMUFile *file = s->rp_state.from_dst_file;
3477 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3478 uint64_t local_size = nbits / 8;
3479 uint64_t size, end_mark;
3481 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3483 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3484 error_report("%s: incorrect state %s", __func__,
3485 MigrationStatus_str(s->state));
3490 * Note: see comments in ramblock_recv_bitmap_send() on why we
3491 * need the endianness conversion, and the padding.
3493 local_size = ROUND_UP(local_size, 8);
3496 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3498 size = qemu_get_be64(file);
3500 /* The size of the bitmap should match our ramblock */
3501 if (size != local_size) {
3502 error_report("%s: ramblock '%s' bitmap size mismatch "
3503 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3504 block->idstr, size, local_size);
3509 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3510 end_mark = qemu_get_be64(file);
3512 ret = qemu_file_get_error(file);
3513 if (ret || size != local_size) {
3514 error_report("%s: read bitmap failed for ramblock '%s': %d"
3515 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3516 __func__, block->idstr, ret, local_size, size);
3521 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3522 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3523 __func__, block->idstr, end_mark);
3529 * Endianness conversion. We are in postcopy (though paused).
3530 * The dirty bitmap won't change. We can directly modify it.
3532 bitmap_from_le(block->bmap, le_bitmap, nbits);
3535 * What we received is the "received bitmap". Invert it to get the initial
3536 * dirty bitmap for this ramblock.
3538 bitmap_complement(block->bmap, block->bmap, nbits);
3540 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3543 * We have successfully synced the bitmap for the current ramblock. If this is
3544 * the last one to sync, we need to notify the main send thread.
3546 ram_dirty_bitmap_reload_notify(s);
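/*
 * Illustration only, not part of the original ram.c: the size/padding
 * arithmetic of the received-bitmap message parsed above, plus the
 * "received -> dirty" inversion, for a hypothetical 6 GiB RAMBlock with
 * 4 KiB target pages.  ROUND_UP here is a generic stand-in for QEMU's macro.
 */
#include <stdint.h>
#include <stdio.h>

#define ROUND_UP(n, d) ((((n) + (d) - 1) / (d)) * (d))

int main(void)
{
    uint64_t used_length = 6ULL << 30;            /* 6 GiB block */
    uint64_t nbits = used_length >> 12;           /* pages, assumed 4 KiB each */
    uint64_t local_size = ROUND_UP(nbits / 8, 8); /* bytes expected on the wire */

    printf("nbits=%llu wire bytes=%llu\n",
           (unsigned long long)nbits, (unsigned long long)local_size);

    /* A page the destination has already received is clean for the resumed
     * migration, hence the bitmap_complement() above: dirty = ~received. */
    uint8_t received = 0xF0;                      /* 8 example pages */
    uint8_t dirty = (uint8_t)~received;           /* pages still to re-send */
    printf("received=%#x -> dirty=%#x\n", received, dirty);
    return 0;
}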
3554 static int ram_resume_prepare(MigrationState *s, void *opaque)
3556 RAMState *rs = *(RAMState **)opaque;
3559 ret = ram_dirty_bitmap_sync_all(s, rs);
3564 ram_state_resume_prepare(rs, s->to_dst_file);
3569 static SaveVMHandlers savevm_ram_handlers = {
3570 .save_setup = ram_save_setup,
3571 .save_live_iterate = ram_save_iterate,
3572 .save_live_complete_postcopy = ram_save_complete,
3573 .save_live_complete_precopy = ram_save_complete,
3574 .has_postcopy = ram_has_postcopy,
3575 .save_live_pending = ram_save_pending,
3576 .load_state = ram_load,
3577 .save_cleanup = ram_save_cleanup,
3578 .load_setup = ram_load_setup,
3579 .load_cleanup = ram_load_cleanup,
3580 .resume_prepare = ram_resume_prepare,
3583 void ram_mig_init(void)
3585 qemu_mutex_init(&XBZRLE.lock);
3586 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);