4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 #include "qemu/osdep.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
38 #include "migration.h"
40 #include "migration/register.h"
41 #include "migration/misc.h"
42 #include "qemu-file.h"
43 #include "postcopy-ram.h"
44 #include "page_cache.h"
45 #include "qemu/error-report.h"
46 #include "qapi/error.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
55 #include "sysemu/sysemu.h"
56 #include "qemu/uuid.h"
59 /***********************************************************/
60 /* ram save/restore */
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
68 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO 0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE 0x08
72 #define RAM_SAVE_FLAG_EOS 0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE 0x40
75 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
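/*
 * Rough sketch of how these flags reach the wire (see save_page_header()
 * below): the page offset is target-page aligned, so its low bits are
 * free, and a flag is simply OR'ed into the same 64-bit value, e.g.
 *
 *     offset | RAM_SAVE_FLAG_ZERO      zero page record
 *     offset | RAM_SAVE_FLAG_XBZRLE    XBZRLE-compressed record
 *     offset | RAM_SAVE_FLAG_CONTINUE  same RAMBlock as the previous page
 *
 * The receiving side is expected to mask the flag bits off again to
 * recover the real offset.
 */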
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
80 return buffer_is_zero(p, size);
83 XBZRLECacheStats xbzrle_counters;
85 /* This struct contains the XBZRLE cache and a static page
86 used by the compression */
88 /* buffer used for XBZRLE encoding */
90 /* buffer for storing page content */
92 /* Cache for XBZRLE, Protected by lock. */
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
97 /* buffer used for XBZRLE decoding */
101 static void XBZRLE_cache_lock(void)
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
107 static void XBZRLE_cache_unlock(void)
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
114 * xbzrle_cache_resize: resize the xbzrle cache
116 * This function is called from qmp_migrate_set_cache_size in the main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by XBZRLE.lock.
121 * Returns 0 for success or -1 for error
123 * @new_size: new cache size
124 * @errp: set *errp if the check failed, with reason
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
128 PageCache *new_cache;
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
138 if (new_size == migrate_xbzrle_cache_size()) {
145 if (XBZRLE.cache != NULL) {
146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
156 XBZRLE_cache_unlock();
160 /* Should be holding either ram_list.mutex, or the RCU lock. */
161 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
162 INTERNAL_RAMBLOCK_FOREACH(block) \
163 if (!qemu_ram_is_migratable(block)) {} else
165 #undef RAMBLOCK_FOREACH
167 static void ramblock_recv_map_init(void)
171 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
172 assert(!rb->receivedmap);
173 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
177 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
179 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
183 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
185 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
188 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
190 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
193 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
196 bitmap_set_atomic(rb->receivedmap,
197 ramblock_recv_bitmap_offset(host_addr, rb),
201 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
204 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
206 * Returns >0 (the number of bytes sent) on success, or <0 on error.
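 *
 * Sketch of the on-wire layout produced below (all sizes in bytes):
 *
 *     be64  size                 bitmap size, rounded up to 8 bytes
 *     u8    bitmap[size]         receivedmap converted to little endian
 *     be64  ending marker        RAMBLOCK_RECV_BITMAP_ENDING, sanity check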
208 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
209 const char *block_name)
211 RAMBlock *block = qemu_ram_block_by_name(block_name);
212 unsigned long *le_bitmap, nbits;
216 error_report("%s: invalid block name: %s", __func__, block_name);
220 nbits = block->used_length >> TARGET_PAGE_BITS;
223 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
224 * machines we may need 4 more bytes for padding (see below
225 * comment). So extend it a bit beforehand.
227 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
230 * Always use little endian when sending the bitmap. This is
231 * required when source and destination VMs are not using the
232 * same endianness. (Note: big endian won't work.)
234 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
236 /* Size of the bitmap, in bytes */
240 * size is always aligned to 8 bytes for 64bit machines, but it
241 * may not be true for 32bit machines. We need this padding to
242 * make sure the migration can survive even between 32bit and
245 size = ROUND_UP(size, 8);
247 qemu_put_be64(file, size);
248 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
250 * Mark as an end, in case the middle part is screwed up due to
251 * some "mysterious" reason.
253 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
258 if (qemu_file_get_error(file)) {
259 return qemu_file_get_error(file);
262 return size + sizeof(size);
266 * An outstanding page request, on the source, having been received
269 struct RAMSrcPageRequest {
274 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
277 /* State of RAM for migration */
279 /* QEMUFile used for this migration */
281 /* Last block that we have visited searching for dirty pages */
282 RAMBlock *last_seen_block;
283 /* Last block from where we have sent data */
284 RAMBlock *last_sent_block;
285 /* Last dirty target page we have sent */
286 ram_addr_t last_page;
287 /* last ram version we have seen */
288 uint32_t last_version;
289 /* We are in the first round */
291 /* How many times the dirty page rate has been too high */
292 int dirty_rate_high_cnt;
293 /* these variables are used for bitmap sync */
294 /* last time we did a full bitmap_sync */
295 int64_t time_last_bitmap_sync;
296 /* bytes transferred at start_time */
297 uint64_t bytes_xfer_prev;
298 /* number of dirty pages since start_time */
299 uint64_t num_dirty_pages_period;
300 /* xbzrle misses since the beginning of the period */
301 uint64_t xbzrle_cache_miss_prev;
302 /* number of iterations at the beginning of period */
303 uint64_t iterations_prev;
304 /* Iterations since start */
306 /* number of dirty bits in the bitmap */
307 uint64_t migration_dirty_pages;
308 /* protects modification of the bitmap */
309 QemuMutex bitmap_mutex;
310 /* The RAMBlock used in the last src_page_requests */
311 RAMBlock *last_req_rb;
312 /* Queue of outstanding page requests from the destination */
313 QemuMutex src_page_req_mutex;
314 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
316 typedef struct RAMState RAMState;
318 static RAMState *ram_state;
320 uint64_t ram_bytes_remaining(void)
322 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
326 MigrationStats ram_counters;
328 /* used by the search for pages to send */
329 struct PageSearchStatus {
330 /* Current block being searched */
332 /* Current page to search from */
334 /* Set once we wrap around */
337 typedef struct PageSearchStatus PageSearchStatus;
339 struct CompressParam {
348 /* internally used fields */
352 typedef struct CompressParam CompressParam;
354 struct DecompressParam {
364 typedef struct DecompressParam DecompressParam;
366 static CompressParam *comp_param;
367 static QemuThread *compress_threads;
368 /* comp_done_cond is used to wake up the migration thread when
369 * one of the compression threads has finished the compression.
370 * comp_done_lock is used to co-work with comp_done_cond.
372 static QemuMutex comp_done_lock;
373 static QemuCond comp_done_cond;
374 /* The empty QEMUFileOps will be used by file in CompressParam */
375 static const QEMUFileOps empty_ops = { };
377 static QEMUFile *decomp_file;
378 static DecompressParam *decomp_param;
379 static QemuThread *decompress_threads;
380 static QemuMutex decomp_done_lock;
381 static QemuCond decomp_done_cond;
383 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
384 ram_addr_t offset, uint8_t *source_buf);
386 static void *do_data_compress(void *opaque)
388 CompressParam *param = opaque;
392 qemu_mutex_lock(¶m->mutex);
393 while (!param->quit) {
395 block = param->block;
396 offset = param->offset;
398 qemu_mutex_unlock(¶m->mutex);
400 do_compress_ram_page(param->file, ¶m->stream, block, offset,
403 qemu_mutex_lock(&comp_done_lock);
405 qemu_cond_signal(&comp_done_cond);
406 qemu_mutex_unlock(&comp_done_lock);
408 qemu_mutex_lock(¶m->mutex);
410 qemu_cond_wait(¶m->cond, ¶m->mutex);
413 qemu_mutex_unlock(¶m->mutex);
418 static inline void terminate_compression_threads(void)
420 int idx, thread_count;
422 thread_count = migrate_compress_threads();
424 for (idx = 0; idx < thread_count; idx++) {
425 qemu_mutex_lock(&comp_param[idx].mutex);
426 comp_param[idx].quit = true;
427 qemu_cond_signal(&comp_param[idx].cond);
428 qemu_mutex_unlock(&comp_param[idx].mutex);
432 static void compress_threads_save_cleanup(void)
436 if (!migrate_use_compression()) {
439 terminate_compression_threads();
440 thread_count = migrate_compress_threads();
441 for (i = 0; i < thread_count; i++) {
443 * we use it as an indicator which shows whether the thread is
444 * properly initialized or not
446 if (!comp_param[i].file) {
449 qemu_thread_join(compress_threads + i);
450 qemu_mutex_destroy(&comp_param[i].mutex);
451 qemu_cond_destroy(&comp_param[i].cond);
452 deflateEnd(&comp_param[i].stream);
453 g_free(comp_param[i].originbuf);
454 qemu_fclose(comp_param[i].file);
455 comp_param[i].file = NULL;
457 qemu_mutex_destroy(&comp_done_lock);
458 qemu_cond_destroy(&comp_done_cond);
459 g_free(compress_threads);
461 compress_threads = NULL;
465 static int compress_threads_save_setup(void)
469 if (!migrate_use_compression()) {
472 thread_count = migrate_compress_threads();
473 compress_threads = g_new0(QemuThread, thread_count);
474 comp_param = g_new0(CompressParam, thread_count);
475 qemu_cond_init(&comp_done_cond);
476 qemu_mutex_init(&comp_done_lock);
477 for (i = 0; i < thread_count; i++) {
478 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
479 if (!comp_param[i].originbuf) {
483 if (deflateInit(&comp_param[i].stream,
484 migrate_compress_level()) != Z_OK) {
485 g_free(comp_param[i].originbuf);
489 /* comp_param[i].file is just used as a dummy buffer to save data,
490 * set its ops to empty.
492 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
493 comp_param[i].done = true;
494 comp_param[i].quit = false;
495 qemu_mutex_init(&comp_param[i].mutex);
496 qemu_cond_init(&comp_param[i].cond);
497 qemu_thread_create(compress_threads + i, "compress",
498 do_data_compress, comp_param + i,
499 QEMU_THREAD_JOINABLE);
504 compress_threads_save_cleanup();
510 #define MULTIFD_MAGIC 0x11223344U
511 #define MULTIFD_VERSION 1
516 unsigned char uuid[16]; /* QemuUUID */
518 } __attribute__((packed)) MultiFDInit_t;
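/*
 * Informal sketch of the multifd handshake: every send channel first
 * writes one MultiFDInit_t with qio_channel_write_all() (magic and
 * version in big endian, the channel id and the source's QemuUUID),
 * and multifd_recv_initial_packet() validates all of them before the
 * channel is handed to a recv thread.  The exact field ordering is
 * elided here, so treat this as a description, not a wire spec.
 */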
521 /* these fields are not changed once the thread is created */
524 /* channel thread name */
526 /* channel thread id */
528 /* communication channel */
530 /* sem where to wait for more work */
532 /* this mutex protects the following parameters */
534 /* is this channel thread running */
536 /* should this thread finish */
541 /* these fields are not changed once the thread is created */
544 /* channel thread name */
546 /* channel thread id */
548 /* communication channel */
550 /* sem where to wait for more work */
552 /* this mutex protects the following parameters */
554 /* is this channel thread running */
556 /* should this thread finish */
560 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
565 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
566 msg.version = cpu_to_be32(MULTIFD_VERSION);
568 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
570 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
577 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
582 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
587 be32_to_cpus(&msg.magic);
588 be32_to_cpus(&msg.version);
590 if (msg.magic != MULTIFD_MAGIC) {
591 error_setg(errp, "multifd: received packet magic %x "
592 "expected %x", msg.magic, MULTIFD_MAGIC);
596 if (msg.version != MULTIFD_VERSION) {
597 error_setg(errp, "multifd: received packet version %d "
598 "expected %d", msg.version, MULTIFD_VERSION);
602 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
603 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
604 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
606 error_setg(errp, "multifd: received uuid '%s' and expected "
607 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
613 if (msg.id > migrate_multifd_channels()) {
614 error_setg(errp, "multifd: received channel id %d "
615 "expected at most %d", msg.id, migrate_multifd_channels());
623 MultiFDSendParams *params;
624 /* number of created threads */
626 } *multifd_send_state;
628 static void multifd_send_terminate_threads(Error *err)
633 MigrationState *s = migrate_get_current();
634 migrate_set_error(s, err);
635 if (s->state == MIGRATION_STATUS_SETUP ||
636 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
637 s->state == MIGRATION_STATUS_DEVICE ||
638 s->state == MIGRATION_STATUS_ACTIVE) {
639 migrate_set_state(&s->state, s->state,
640 MIGRATION_STATUS_FAILED);
644 for (i = 0; i < migrate_multifd_channels(); i++) {
645 MultiFDSendParams *p = &multifd_send_state->params[i];
647 qemu_mutex_lock(&p->mutex);
649 qemu_sem_post(&p->sem);
650 qemu_mutex_unlock(&p->mutex);
654 int multifd_save_cleanup(Error **errp)
659 if (!migrate_use_multifd()) {
662 multifd_send_terminate_threads(NULL);
663 for (i = 0; i < migrate_multifd_channels(); i++) {
664 MultiFDSendParams *p = &multifd_send_state->params[i];
667 qemu_thread_join(&p->thread);
669 socket_send_channel_destroy(p->c);
671 qemu_mutex_destroy(&p->mutex);
672 qemu_sem_destroy(&p->sem);
676 g_free(multifd_send_state->params);
677 multifd_send_state->params = NULL;
678 g_free(multifd_send_state);
679 multifd_send_state = NULL;
683 static void *multifd_send_thread(void *opaque)
685 MultiFDSendParams *p = opaque;
686 Error *local_err = NULL;
688 if (multifd_send_initial_packet(p, &local_err) < 0) {
693 qemu_mutex_lock(&p->mutex);
695 qemu_mutex_unlock(&p->mutex);
698 qemu_mutex_unlock(&p->mutex);
699 qemu_sem_wait(&p->sem);
704 multifd_send_terminate_threads(local_err);
707 qemu_mutex_lock(&p->mutex);
709 qemu_mutex_unlock(&p->mutex);
714 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
716 MultiFDSendParams *p = opaque;
717 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
718 Error *local_err = NULL;
720 if (qio_task_propagate_error(task, &local_err)) {
721 if (multifd_save_cleanup(&local_err) != 0) {
722 migrate_set_error(migrate_get_current(), local_err);
725 p->c = QIO_CHANNEL(sioc);
726 qio_channel_set_delay(p->c, false);
728 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
729 QEMU_THREAD_JOINABLE);
731 atomic_inc(&multifd_send_state->count);
735 int multifd_save_setup(void)
740 if (!migrate_use_multifd()) {
743 thread_count = migrate_multifd_channels();
744 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
745 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
746 atomic_set(&multifd_send_state->count, 0);
747 for (i = 0; i < thread_count; i++) {
748 MultiFDSendParams *p = &multifd_send_state->params[i];
750 qemu_mutex_init(&p->mutex);
751 qemu_sem_init(&p->sem, 0);
754 p->name = g_strdup_printf("multifdsend_%d", i);
755 socket_send_channel_create(multifd_new_send_channel_async, p);
761 MultiFDRecvParams *params;
762 /* number of created threads */
764 } *multifd_recv_state;
766 static void multifd_recv_terminate_threads(Error *err)
771 MigrationState *s = migrate_get_current();
772 migrate_set_error(s, err);
773 if (s->state == MIGRATION_STATUS_SETUP ||
774 s->state == MIGRATION_STATUS_ACTIVE) {
775 migrate_set_state(&s->state, s->state,
776 MIGRATION_STATUS_FAILED);
780 for (i = 0; i < migrate_multifd_channels(); i++) {
781 MultiFDRecvParams *p = &multifd_recv_state->params[i];
783 qemu_mutex_lock(&p->mutex);
785 qemu_sem_post(&p->sem);
786 qemu_mutex_unlock(&p->mutex);
790 int multifd_load_cleanup(Error **errp)
795 if (!migrate_use_multifd()) {
798 multifd_recv_terminate_threads(NULL);
799 for (i = 0; i < migrate_multifd_channels(); i++) {
800 MultiFDRecvParams *p = &multifd_recv_state->params[i];
803 qemu_thread_join(&p->thread);
805 object_unref(OBJECT(p->c));
807 qemu_mutex_destroy(&p->mutex);
808 qemu_sem_destroy(&p->sem);
812 g_free(multifd_recv_state->params);
813 multifd_recv_state->params = NULL;
814 g_free(multifd_recv_state);
815 multifd_recv_state = NULL;
820 static void *multifd_recv_thread(void *opaque)
822 MultiFDRecvParams *p = opaque;
825 qemu_mutex_lock(&p->mutex);
827 qemu_mutex_unlock(&p->mutex);
830 qemu_mutex_unlock(&p->mutex);
831 qemu_sem_wait(&p->sem);
834 qemu_mutex_lock(&p->mutex);
836 qemu_mutex_unlock(&p->mutex);
841 int multifd_load_setup(void)
846 if (!migrate_use_multifd()) {
849 thread_count = migrate_multifd_channels();
850 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
851 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
852 atomic_set(&multifd_recv_state->count, 0);
853 for (i = 0; i < thread_count; i++) {
854 MultiFDRecvParams *p = &multifd_recv_state->params[i];
856 qemu_mutex_init(&p->mutex);
857 qemu_sem_init(&p->sem, 0);
860 p->name = g_strdup_printf("multifdrecv_%d", i);
865 bool multifd_recv_all_channels_created(void)
867 int thread_count = migrate_multifd_channels();
869 if (!migrate_use_multifd()) {
873 return thread_count == atomic_read(&multifd_recv_state->count);
876 void multifd_recv_new_channel(QIOChannel *ioc)
878 MultiFDRecvParams *p;
879 Error *local_err = NULL;
882 id = multifd_recv_initial_packet(ioc, &local_err);
884 multifd_recv_terminate_threads(local_err);
888 p = &multifd_recv_state->params[id];
890 error_setg(&local_err, "multifd: received id '%d' already setup",
892 multifd_recv_terminate_threads(local_err);
896 object_ref(OBJECT(ioc));
899 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
900 QEMU_THREAD_JOINABLE);
901 atomic_inc(&multifd_recv_state->count);
902 if (multifd_recv_state->count == migrate_multifd_channels()) {
903 migration_incoming_process();
908 * save_page_header: write page header to wire
910 * If this is the 1st block, it also writes the block identification
912 * Returns the number of bytes written
914 * @f: QEMUFile where to send the data
915 * @block: block that contains the page we want to send
916 * @offset: offset inside the block for the page;
917 * the lower bits contain flags
919 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
924 if (block == rs->last_sent_block) {
925 offset |= RAM_SAVE_FLAG_CONTINUE;
927 qemu_put_be64(f, offset);
930 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
931 len = strlen(block->idstr);
932 qemu_put_byte(f, len);
933 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
935 rs->last_sent_block = block;
941 * mig_throttle_guest_down: throttle down the guest
943 * Reduce the amount of guest CPU execution to hopefully slow down memory
944 * writes. If guest dirty memory rate is reduced below the rate at
945 * which we can transfer pages to the destination then we should be
946 * able to complete migration. Some workloads dirty memory way too
947 * fast and will not effectively converge, even with auto-converge.
949 static void mig_throttle_guest_down(void)
951 MigrationState *s = migrate_get_current();
952 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
953 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
955 /* We have not started throttling yet. Let's start it. */
956 if (!cpu_throttle_active()) {
957 cpu_throttle_set(pct_initial);
959 /* Throttling already on, just increase the rate */
960 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
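/*
 * Illustrative numbers only (the real values come from the migration
 * parameters): with cpu_throttle_initial=20 and cpu_throttle_increment=10
 * the throttle grows 20% -> 30% -> 40% -> ... on successive calls,
 * until migration converges or the throttle is capped.
 */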
965 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
967 * @rs: current RAM state
968 * @current_addr: address for the zero page
970 * Update the xbzrle cache to reflect a page that's been sent as all 0.
971 * The important thing is that a stale (not-yet-0'd) page be replaced
973 * As a bonus, if the page wasn't in the cache it gets added so that
974 * when a small write is made into the 0'd page it gets XBZRLE sent.
976 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
978 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
982 /* We don't care if this fails to allocate a new cache page
983 * as long as it updated an old one */
984 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
985 ram_counters.dirty_sync_count);
988 #define ENCODING_FLAG_XBZRLE 0x1
991 * save_xbzrle_page: compress and send current page
993 * Returns: 1 means that we wrote the page
994 * 0 means that page is identical to the one already sent
995 * -1 means that xbzrle would be longer than normal
997 * @rs: current RAM state
998 * @current_data: pointer to the address of the page contents
999 * @current_addr: addr of the page
1000 * @block: block that contains the page we want to send
1001 * @offset: offset inside the block for the page
1002 * @last_stage: if we are at the completion stage
1004 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1005 ram_addr_t current_addr, RAMBlock *block,
1006 ram_addr_t offset, bool last_stage)
1008 int encoded_len = 0, bytes_xbzrle;
1009 uint8_t *prev_cached_page;
1011 if (!cache_is_cached(XBZRLE.cache, current_addr,
1012 ram_counters.dirty_sync_count)) {
1013 xbzrle_counters.cache_miss++;
1015 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1016 ram_counters.dirty_sync_count) == -1) {
1019 /* update *current_data when the page has been
1020 inserted into cache */
1021 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1027 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1029 /* save current buffer into memory */
1030 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1032 /* XBZRLE encoding (if there is no overflow) */
1033 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1034 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1036 if (encoded_len == 0) {
1037 trace_save_xbzrle_page_skipping();
1039 } else if (encoded_len == -1) {
1040 trace_save_xbzrle_page_overflow();
1041 xbzrle_counters.overflow++;
1042 /* update data in the cache */
1044 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1045 *current_data = prev_cached_page;
1050 /* we need to update the data in the cache, in order to get the same data */
1052 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1055 /* Send XBZRLE based compressed page */
1056 bytes_xbzrle = save_page_header(rs, rs->f, block,
1057 offset | RAM_SAVE_FLAG_XBZRLE);
1058 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1059 qemu_put_be16(rs->f, encoded_len);
1060 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1061 bytes_xbzrle += encoded_len + 1 + 2;
1062 xbzrle_counters.pages++;
1063 xbzrle_counters.bytes += bytes_xbzrle;
1064 ram_counters.transferred += bytes_xbzrle;
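/*
 * For reference, the record emitted above is, in order: the page header
 * (offset | RAM_SAVE_FLAG_XBZRLE), one ENCODING_FLAG_XBZRLE byte, a be16
 * encoded length, and encoded_len bytes of XBZRLE data - which is why the
 * accounting adds "encoded_len + 1 + 2" on top of the header size.
 */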
1070 * migration_bitmap_find_dirty: find the next dirty page from start
1072 * Called with rcu_read_lock() to protect migration_bitmap
1074 * Returns the page offset (in target pages) within the RAMBlock of the next dirty page
1076 * @rs: current RAM state
1077 * @rb: RAMBlock where to search for dirty pages
1078 * @start: page where we start the search
1081 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1082 unsigned long start)
1084 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1085 unsigned long *bitmap = rb->bmap;
1088 if (!qemu_ram_is_migratable(rb)) {
1092 if (rs->ram_bulk_stage && start > 0) {
1095 next = find_next_bit(bitmap, size, start);
1101 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1107 ret = test_and_clear_bit(page, rb->bmap);
1110 rs->migration_dirty_pages--;
1115 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1116 ram_addr_t start, ram_addr_t length)
1118 rs->migration_dirty_pages +=
1119 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1120 &rs->num_dirty_pages_period);
1124 * ram_pagesize_summary: calculate all the pagesizes of a VM
1126 * Returns a summary bitmap of the page sizes of all RAMBlocks
1128 * For VMs with just normal pages this is equivalent to the host page
1129 * size. If it's got some huge pages then it's the OR of all the
1130 * different page sizes.
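 *
 * As a hypothetical example, a guest backed by 4KiB normal pages plus
 * 2MiB hugepages would report 0x1000 | 0x200000 = 0x201000 here.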
1132 uint64_t ram_pagesize_summary(void)
1135 uint64_t summary = 0;
1137 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1138 summary |= block->page_size;
1144 static void migration_update_rates(RAMState *rs, int64_t end_time)
1146 uint64_t iter_count = rs->iterations - rs->iterations_prev;
1148 /* calculate period counters */
1149 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1150 / (end_time - rs->time_last_bitmap_sync);
1156 if (migrate_use_xbzrle()) {
1157 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1158 rs->xbzrle_cache_miss_prev) / iter_count;
1159 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1163 static void migration_bitmap_sync(RAMState *rs)
1167 uint64_t bytes_xfer_now;
1169 ram_counters.dirty_sync_count++;
1171 if (!rs->time_last_bitmap_sync) {
1172 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1175 trace_migration_bitmap_sync_start();
1176 memory_global_dirty_log_sync();
1178 qemu_mutex_lock(&rs->bitmap_mutex);
1180 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1181 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1183 ram_counters.remaining = ram_bytes_remaining();
1185 qemu_mutex_unlock(&rs->bitmap_mutex);
1187 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1189 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1191 /* more than 1 second = 1000 milliseconds */
1192 if (end_time > rs->time_last_bitmap_sync + 1000) {
1193 bytes_xfer_now = ram_counters.transferred;
1195 /* During block migration the auto-converge logic incorrectly detects
1196 * that ram migration makes no progress. Avoid this by disabling the
1197 * throttling logic during the bulk phase of block migration. */
1198 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1199 /* The following detection logic can be refined later. For now:
1200 Check to see if the dirtied bytes are 50% more than the approx.
1201 amount of bytes that just got transferred since the last time we
1202 were in this routine. If that happens twice, start or increase
1205 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1206 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1207 (++rs->dirty_rate_high_cnt >= 2)) {
1208 trace_migration_throttle();
1209 rs->dirty_rate_high_cnt = 0;
1210 mig_throttle_guest_down();
1214 migration_update_rates(rs, end_time);
1216 rs->iterations_prev = rs->iterations;
1218 /* reset period counters */
1219 rs->time_last_bitmap_sync = end_time;
1220 rs->num_dirty_pages_period = 0;
1221 rs->bytes_xfer_prev = bytes_xfer_now;
1223 if (migrate_use_events()) {
1224 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1229 * save_zero_page: send the zero page to the stream
1231 * Returns the number of pages written.
1233 * @rs: current RAM state
1234 * @block: block that contains the page we want to send
1235 * @offset: offset inside the block for the page
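 *
 * On the wire a zero page is just the page header with RAM_SAVE_FLAG_ZERO
 * set plus a single zero byte, hence the "+= 1" in the accounting below.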
1237 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1239 uint8_t *p = block->host + offset;
1242 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1243 ram_counters.duplicate++;
1244 ram_counters.transferred +=
1245 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1246 qemu_put_byte(rs->f, 0);
1247 ram_counters.transferred += 1;
1254 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1256 if (!migrate_release_ram() || !migration_in_postcopy()) {
1260 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1264 * @pages: the number of pages written by the control path,
1266 * > 0 - number of pages written
1268 * Return true if the page has been saved, otherwise return false.
1270 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1273 uint64_t bytes_xmit = 0;
1277 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1279 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1284 ram_counters.transferred += bytes_xmit;
1288 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1292 if (bytes_xmit > 0) {
1293 ram_counters.normal++;
1294 } else if (bytes_xmit == 0) {
1295 ram_counters.duplicate++;
1302 * directly send the page to the stream
1304 * Returns the number of pages written.
1306 * @rs: current RAM state
1307 * @block: block that contains the page we want to send
1308 * @offset: offset inside the block for the page
1309 * @buf: the page to be sent
1310 * @async: send the page asynchronously
1312 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1313 uint8_t *buf, bool async)
1315 ram_counters.transferred += save_page_header(rs, rs->f, block,
1316 offset | RAM_SAVE_FLAG_PAGE);
1318 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1319 migrate_release_ram() &&
1320 migration_in_postcopy());
1322 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1324 ram_counters.transferred += TARGET_PAGE_SIZE;
1325 ram_counters.normal++;
1330 * ram_save_page: send the given page to the stream
1332 * Returns the number of pages written.
1334 * >=0 - Number of pages written - this might legally be 0
1335 * if xbzrle noticed the page was the same.
1337 * @rs: current RAM state
1338 * @block: block that contains the page we want to send
1339 * @offset: offset inside the block for the page
1340 * @last_stage: if we are at the completion stage
1342 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1346 bool send_async = true;
1347 RAMBlock *block = pss->block;
1348 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1349 ram_addr_t current_addr = block->offset + offset;
1351 p = block->host + offset;
1352 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1354 XBZRLE_cache_lock();
1355 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1356 migrate_use_xbzrle()) {
1357 pages = save_xbzrle_page(rs, &p, current_addr, block,
1358 offset, last_stage);
1360 /* Can't send this cached data async, since the cache page
1361 * might get updated before it gets to the wire
1367 /* XBZRLE overflow or normal page */
1369 pages = save_normal_page(rs, block, offset, p, send_async);
1372 XBZRLE_cache_unlock();
1377 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1378 ram_addr_t offset, uint8_t *source_buf)
1380 RAMState *rs = ram_state;
1381 int bytes_sent, blen;
1382 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1384 bytes_sent = save_page_header(rs, f, block, offset |
1385 RAM_SAVE_FLAG_COMPRESS_PAGE);
1388 * copy it to an internal buffer to avoid it being modified by the VM
1389 * so that we can catch the error during compression and
1392 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1393 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1396 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1397 error_report("compressed data failed!");
1400 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1406 static void flush_compressed_data(RAMState *rs)
1408 int idx, len, thread_count;
1410 if (!migrate_use_compression()) {
1413 thread_count = migrate_compress_threads();
1415 qemu_mutex_lock(&comp_done_lock);
1416 for (idx = 0; idx < thread_count; idx++) {
1417 while (!comp_param[idx].done) {
1418 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1421 qemu_mutex_unlock(&comp_done_lock);
1423 for (idx = 0; idx < thread_count; idx++) {
1424 qemu_mutex_lock(&comp_param[idx].mutex);
1425 if (!comp_param[idx].quit) {
1426 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1427 ram_counters.transferred += len;
1429 qemu_mutex_unlock(&comp_param[idx].mutex);
1433 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1436 param->block = block;
1437 param->offset = offset;
1440 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1443 int idx, thread_count, bytes_xmit = -1, pages = -1;
1445 thread_count = migrate_compress_threads();
1446 qemu_mutex_lock(&comp_done_lock);
1448 for (idx = 0; idx < thread_count; idx++) {
1449 if (comp_param[idx].done) {
1450 comp_param[idx].done = false;
1451 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1452 qemu_mutex_lock(&comp_param[idx].mutex);
1453 set_compress_params(&comp_param[idx], block, offset);
1454 qemu_cond_signal(&comp_param[idx].cond);
1455 qemu_mutex_unlock(&comp_param[idx].mutex);
1457 ram_counters.normal++;
1458 ram_counters.transferred += bytes_xmit;
1465 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1468 qemu_mutex_unlock(&comp_done_lock);
1474 * find_dirty_block: find the next dirty page and update any state
1475 * associated with the search process.
1477 * Returns true if a dirty page is found
1479 * @rs: current RAM state
1480 * @pss: data about the state of the current dirty page scan
1481 * @again: set to false if the search has scanned the whole of RAM
1483 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1485 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1486 if (pss->complete_round && pss->block == rs->last_seen_block &&
1487 pss->page >= rs->last_page) {
1489 * We've been once around the RAM and haven't found anything.
1495 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1496 /* Didn't find anything in this RAM Block */
1498 pss->block = QLIST_NEXT_RCU(pss->block, next);
1500 /* Hit the end of the list */
1501 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1502 /* Flag that we've looped */
1503 pss->complete_round = true;
1504 rs->ram_bulk_stage = false;
1505 if (migrate_use_xbzrle()) {
1506 /* If xbzrle is on, stop using the data compression at this
1507 * point. In theory, xbzrle can do better than compression.
1509 flush_compressed_data(rs);
1512 /* Didn't find anything this time, but try again on the new block */
1516 /* Can go around again, but... */
1518 /* We've found something so probably don't need to */
1524 * unqueue_page: gets a page off the queue
1526 * Helper for 'get_queued_page' - gets a page off the queue
1528 * Returns the block of the page (or NULL if none available)
1530 * @rs: current RAM state
1531 * @offset: used to return the offset within the RAMBlock
1533 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1535 RAMBlock *block = NULL;
1537 qemu_mutex_lock(&rs->src_page_req_mutex);
1538 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1539 struct RAMSrcPageRequest *entry =
1540 QSIMPLEQ_FIRST(&rs->src_page_requests);
1542 *offset = entry->offset;
1544 if (entry->len > TARGET_PAGE_SIZE) {
1545 entry->len -= TARGET_PAGE_SIZE;
1546 entry->offset += TARGET_PAGE_SIZE;
1548 memory_region_unref(block->mr);
1549 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1551 migration_consume_urgent_request();
1554 qemu_mutex_unlock(&rs->src_page_req_mutex);
1560 * get_queued_page: unqueue a page from the postcopy requests
1562 * Skips pages that are already sent (!dirty)
1564 * Returns true if a queued page is found
1566 * @rs: current RAM state
1567 * @pss: data about the state of the current dirty page scan
1569 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1576 block = unqueue_page(rs, &offset);
1578 * We're sending this page, and since it's postcopy nothing else
1579 * will dirty it, and we must make sure it doesn't get sent again
1580 * even if this queue request was received after the background
1581 * search already sent it.
1586 page = offset >> TARGET_PAGE_BITS;
1587 dirty = test_bit(page, block->bmap);
1589 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1590 page, test_bit(page, block->unsentmap));
1592 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1596 } while (block && !dirty);
1600 * As soon as we start servicing pages out of order, we have
1601 * to kill the bulk stage, since the bulk stage assumes
1602 * (in migration_bitmap_find_and_reset_dirty) that every page is
1603 * dirty; that's no longer true.
1605 rs->ram_bulk_stage = false;
1608 * We want the background search to continue from the queued page
1609 * since the guest is likely to want other pages near to the page
1610 * it just requested.
1613 pss->page = offset >> TARGET_PAGE_BITS;
1620 * migration_page_queue_free: drop any remaining pages in the ram
1623 * It should be empty at the end anyway, but in error cases there may
1624 * be some left. If any page is left, we drop it.
1627 static void migration_page_queue_free(RAMState *rs)
1629 struct RAMSrcPageRequest *mspr, *next_mspr;
1630 /* This queue should generally be empty - but in the case of a failed
1631 * migration it might have some droppings in it.
1634 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1635 memory_region_unref(mspr->rb->mr);
1636 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1643 * ram_save_queue_pages: queue the page for transmission
1645 * A request from postcopy destination for example.
1647 * Returns zero on success or negative on error
1649 * @rbname: Name of the RAMBlock of the request. NULL means the
1650 * same as the last one.
1651 * @start: starting address from the start of the RAMBlock
1652 * @len: length (in bytes) to send
1654 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1657 RAMState *rs = ram_state;
1659 ram_counters.postcopy_requests++;
1662 /* Reuse last RAMBlock */
1663 ramblock = rs->last_req_rb;
1667 * Shouldn't happen, we can't reuse the last RAMBlock if
1668 * it's the 1st request.
1670 error_report("ram_save_queue_pages no previous block");
1674 ramblock = qemu_ram_block_by_name(rbname);
1677 /* We shouldn't be asked for a non-existent RAMBlock */
1678 error_report("ram_save_queue_pages no block '%s'", rbname);
1681 rs->last_req_rb = ramblock;
1683 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1684 if (start+len > ramblock->used_length) {
1685 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1686 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1687 __func__, start, len, ramblock->used_length);
1691 struct RAMSrcPageRequest *new_entry =
1692 g_malloc0(sizeof(struct RAMSrcPageRequest));
1693 new_entry->rb = ramblock;
1694 new_entry->offset = start;
1695 new_entry->len = len;
1697 memory_region_ref(ramblock->mr);
1698 qemu_mutex_lock(&rs->src_page_req_mutex);
1699 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1700 migration_make_urgent_request();
1701 qemu_mutex_unlock(&rs->src_page_req_mutex);
1711 static bool save_page_use_compression(RAMState *rs)
1713 if (!migrate_use_compression()) {
1718 * If xbzrle is on, stop using the data compression after the first
1719 * round of migration even if compression is enabled. In theory,
1720 * xbzrle can do better than compression.
1722 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1730 * ram_save_target_page: save one target page
1732 * Returns the number of pages written
1734 * @rs: current RAM state
1735 * @pss: data about the page we want to send
1736 * @last_stage: if we are at the completion stage
1738 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1741 RAMBlock *block = pss->block;
1742 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1745 if (control_save_page(rs, block, offset, &res)) {
1750 * When starting the process of a new block, the first page of
1751 * the block should be sent out before other pages in the same
1752 * block, and all the pages in the last block should have been sent
1753 * out; keeping this order is important, because the 'cont' flag
1754 * is used to avoid resending the block name.
1756 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1757 flush_compressed_data(rs);
1760 res = save_zero_page(rs, block, offset);
1762 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1763 * page would be stale
1765 if (!save_page_use_compression(rs)) {
1766 XBZRLE_cache_lock();
1767 xbzrle_cache_zero_page(rs, block->offset + offset);
1768 XBZRLE_cache_unlock();
1770 ram_release_pages(block->idstr, offset, res);
1775 * Make sure the first page is sent out before other pages.
1777 * we post it as a normal page as compression will take much
1780 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1781 return compress_page_with_multi_thread(rs, block, offset);
1784 return ram_save_page(rs, pss, last_stage);
1788 * ram_save_host_page: save a whole host page
1790 * Starting at *offset send pages up to the end of the current host
1791 * page. It's valid for the initial offset to point into the middle of
1792 * a host page in which case the remainder of the hostpage is sent.
1793 * Only dirty target pages are sent. Note that the host page size may
1794 * be a huge page for this block.
1795 * The saving stops at the boundary of the used_length of the block
1796 * if the RAMBlock isn't a multiple of the host page size.
1798 * Returns the number of pages written or negative on error
1800 * @rs: current RAM state
1801 * @ms: current migration state
1802 * @pss: data about the page we want to send
1803 * @last_stage: if we are at the completion stage
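 *
 * As a rough example, with a 2MiB hugepage-backed block and 4KiB target
 * pages, pagesize_bits below is 512, so up to 512 dirty target pages are
 * sent before this function returns.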
1805 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1808 int tmppages, pages = 0;
1809 size_t pagesize_bits =
1810 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1812 if (!qemu_ram_is_migratable(pss->block)) {
1813 error_report("block %s should not be migrated !", pss->block->idstr);
1818 /* Check if the page is dirty and, if it is, send it */
1819 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1824 tmppages = ram_save_target_page(rs, pss, last_stage);
1830 if (pss->block->unsentmap) {
1831 clear_bit(pss->page, pss->block->unsentmap);
1835 } while ((pss->page & (pagesize_bits - 1)) &&
1836 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1838 /* The offset we leave with is the last one we looked at */
1844 * ram_find_and_save_block: finds a dirty page and sends it to f
1846 * Called within an RCU critical section.
1848 * Returns the number of pages written where zero means no dirty pages
1850 * @rs: current RAM state
1851 * @last_stage: if we are at the completion stage
1853 * On systems where host-page-size > target-page-size it will send all the
1854 * pages in a host page that are dirty.
1857 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1859 PageSearchStatus pss;
1863 /* No dirty page as there is zero RAM */
1864 if (!ram_bytes_total()) {
1868 pss.block = rs->last_seen_block;
1869 pss.page = rs->last_page;
1870 pss.complete_round = false;
1873 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1878 found = get_queued_page(rs, &pss);
1881 /* priority queue empty, so just search for something dirty */
1882 found = find_dirty_block(rs, &pss, &again);
1886 pages = ram_save_host_page(rs, &pss, last_stage);
1888 } while (!pages && again);
1890 rs->last_seen_block = pss.block;
1891 rs->last_page = pss.page;
1896 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1898 uint64_t pages = size / TARGET_PAGE_SIZE;
1901 ram_counters.duplicate += pages;
1903 ram_counters.normal += pages;
1904 ram_counters.transferred += size;
1905 qemu_update_position(f, size);
1909 uint64_t ram_bytes_total(void)
1915 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1916 total += block->used_length;
1922 static void xbzrle_load_setup(void)
1924 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1927 static void xbzrle_load_cleanup(void)
1929 g_free(XBZRLE.decoded_buf);
1930 XBZRLE.decoded_buf = NULL;
1933 static void ram_state_cleanup(RAMState **rsp)
1936 migration_page_queue_free(*rsp);
1937 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1938 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1944 static void xbzrle_cleanup(void)
1946 XBZRLE_cache_lock();
1948 cache_fini(XBZRLE.cache);
1949 g_free(XBZRLE.encoded_buf);
1950 g_free(XBZRLE.current_buf);
1951 g_free(XBZRLE.zero_target_page);
1952 XBZRLE.cache = NULL;
1953 XBZRLE.encoded_buf = NULL;
1954 XBZRLE.current_buf = NULL;
1955 XBZRLE.zero_target_page = NULL;
1957 XBZRLE_cache_unlock();
1960 static void ram_save_cleanup(void *opaque)
1962 RAMState **rsp = opaque;
1965 /* the caller must hold the iothread lock or be in a bh, so there is
1966 * no write race against this migration_bitmap
1968 memory_global_dirty_log_stop();
1970 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1971 g_free(block->bmap);
1973 g_free(block->unsentmap);
1974 block->unsentmap = NULL;
1978 compress_threads_save_cleanup();
1979 ram_state_cleanup(rsp);
1982 static void ram_state_reset(RAMState *rs)
1984 rs->last_seen_block = NULL;
1985 rs->last_sent_block = NULL;
1987 rs->last_version = ram_list.version;
1988 rs->ram_bulk_stage = true;
1991 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1994 * 'expected' is the value you expect the bitmap mostly to be full
1995 * of; it won't bother printing lines that are all this value.
1996 * If 'todump' is null the migration bitmap is dumped.
1998 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1999 unsigned long pages)
2002 int64_t linelen = 128;
2005 for (cur = 0; cur < pages; cur += linelen) {
2009 * Last line; catch the case where the line length
2010 * is longer than remaining ram
2012 if (cur + linelen > pages) {
2013 linelen = pages - cur;
2015 for (curb = 0; curb < linelen; curb++) {
2016 bool thisbit = test_bit(cur + curb, todump);
2017 linebuf[curb] = thisbit ? '1' : '.';
2018 found = found || (thisbit != expected);
2021 linebuf[curb] = '\0';
2022 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2027 /* **** functions for postcopy ***** */
2029 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2031 struct RAMBlock *block;
2033 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2034 unsigned long *bitmap = block->bmap;
2035 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2036 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2038 while (run_start < range) {
2039 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2040 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2041 (run_end - run_start) << TARGET_PAGE_BITS);
2042 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2048 * postcopy_send_discard_bm_ram: discard a RAMBlock
2050 * Returns zero on success
2052 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2053 * Note: At this point the 'unsentmap' is the processed bitmap combined
2054 * with the dirtymap; so a '1' means it's either dirty or unsent.
2056 * @ms: current migration state
2057 * @pds: state for postcopy
2058 * @block: RAMBlock to discard
2061 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2062 PostcopyDiscardState *pds,
2065 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2066 unsigned long current;
2067 unsigned long *unsentmap = block->unsentmap;
2069 for (current = 0; current < end; ) {
2070 unsigned long one = find_next_bit(unsentmap, end, current);
2073 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2074 unsigned long discard_length;
2077 discard_length = end - one;
2079 discard_length = zero - one;
2081 if (discard_length) {
2082 postcopy_discard_send_range(ms, pds, one, discard_length);
2084 current = one + discard_length;
2094 * postcopy_each_ram_send_discard: discard all RAMBlocks
2096 * Returns 0 for success or negative for error
2098 * Utility for the outgoing postcopy code.
2099 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2100 * passing it bitmap indexes and name.
2101 * (qemu_ram_foreach_block ends up passing unscaled lengths
2102 * which would mean postcopy code would have to deal with target page)
2104 * @ms: current migration state
2106 static int postcopy_each_ram_send_discard(MigrationState *ms)
2108 struct RAMBlock *block;
2111 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2112 PostcopyDiscardState *pds =
2113 postcopy_discard_send_init(ms, block->idstr);
2116 * Postcopy sends chunks of bitmap over the wire, but it
2117 * just needs indexes at this point, avoids it having
2118 * target page specific code.
2120 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2121 postcopy_discard_send_finish(ms, pds);
2131 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2133 * Helper for postcopy_chunk_hostpages; it's called twice to
2134 * canonicalize the two bitmaps, that are similar, but one is
2137 * Postcopy requires that all target pages in a hostpage are dirty or
2138 * clean, not a mix. This function canonicalizes the bitmaps.
2140 * @ms: current migration state
2141 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2142 * otherwise we need to canonicalize partially dirty host pages
2143 * @block: block that contains the page we want to canonicalize
2144 * @pds: state for postcopy
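 *
 * A small illustration: with host_ratio = 4 (e.g. a 16KiB host page made
 * of four 4KiB target pages), a host page whose target pages are only
 * partly dirty/unsent is re-marked as entirely unsent and dirty, and
 * (when needed) a discard for the whole host page is sent to the
 * destination.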
2146 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2148 PostcopyDiscardState *pds)
2150 RAMState *rs = ram_state;
2151 unsigned long *bitmap = block->bmap;
2152 unsigned long *unsentmap = block->unsentmap;
2153 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2154 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2155 unsigned long run_start;
2157 if (block->page_size == TARGET_PAGE_SIZE) {
2158 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2163 /* Find a sent page */
2164 run_start = find_next_zero_bit(unsentmap, pages, 0);
2166 /* Find a dirty page */
2167 run_start = find_next_bit(bitmap, pages, 0);
2170 while (run_start < pages) {
2171 bool do_fixup = false;
2172 unsigned long fixup_start_addr;
2173 unsigned long host_offset;
2176 * If the start of this run of pages is in the middle of a host
2177 * page, then we need to fixup this host page.
2179 host_offset = run_start % host_ratio;
2182 run_start -= host_offset;
2183 fixup_start_addr = run_start;
2184 /* For the next pass */
2185 run_start = run_start + host_ratio;
2187 /* Find the end of this run */
2188 unsigned long run_end;
2190 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2192 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2195 * If the end isn't at the start of a host page, then the
2196 * run doesn't finish at the end of a host page
2197 * and we need to discard.
2199 host_offset = run_end % host_ratio;
2202 fixup_start_addr = run_end - host_offset;
2204 * This host page has gone, the next loop iteration starts
2205 * from after the fixup
2207 run_start = fixup_start_addr + host_ratio;
2210 * No discards on this iteration, next loop starts from
2211 * next sent/dirty page
2213 run_start = run_end + 1;
2220 /* Tell the destination to discard this page */
2221 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2222 /* For the unsent_pass we:
2223 * discard partially sent pages
2224 * For the !unsent_pass (dirty) we:
2225 * discard partially dirty pages that were sent
2226 * (any partially sent pages were already discarded
2227 * by the previous unsent_pass)
2229 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2233 /* Clean up the bitmap */
2234 for (page = fixup_start_addr;
2235 page < fixup_start_addr + host_ratio; page++) {
2236 /* All pages in this host page are now not sent */
2237 set_bit(page, unsentmap);
2240 * Remark them as dirty, updating the count for any pages
2241 * that weren't previously dirty.
2243 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2248 /* Find the next sent page for the next iteration */
2249 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2251 /* Find the next dirty page for the next iteration */
2252 run_start = find_next_bit(bitmap, pages, run_start);
2258 * postcopy_chunk_hostpages: discard any partially sent host page
2260 * Utility for the outgoing postcopy code.
2262 * Discard any partially sent host-page size chunks, mark any partially
2263 * dirty host-page size chunks as all dirty. In this case the host-page
2264 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2266 * Returns zero on success
2268 * @ms: current migration state
2269 * @block: block we want to work with
2271 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2273 PostcopyDiscardState *pds =
2274 postcopy_discard_send_init(ms, block->idstr);
2276 /* First pass: Discard all partially sent host pages */
2277 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2279 * Second pass: Ensure that all partially dirty host pages are made
2282 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2284 postcopy_discard_send_finish(ms, pds);
2289 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2291 * Returns zero on success
2293 * Transmit the set of pages to be discarded after precopy to the target;
2294 * these are pages that:
2295 * a) Have been previously transmitted but are now dirty again
2296 * b) Pages that have never been transmitted, this ensures that
2297 * any pages on the destination that have been mapped by background
2298 * tasks get discarded (transparent huge pages is the specific concern)
2299 * Hopefully this is pretty sparse
2301 * @ms: current migration state
2303 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2305 RAMState *rs = ram_state;
2311 /* This should be our last sync, the src is now paused */
2312 migration_bitmap_sync(rs);
2314 /* Easiest way to make sure we don't resume in the middle of a host-page */
2315 rs->last_seen_block = NULL;
2316 rs->last_sent_block = NULL;
2319 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2320 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2321 unsigned long *bitmap = block->bmap;
2322 unsigned long *unsentmap = block->unsentmap;
2325 /* We don't have a safe way to resize the sentmap, so
2326 * if the bitmap was resized it will be NULL at this
2329 error_report("migration ram resized during precopy phase");
2333 /* Deal with TPS != HPS and huge pages */
2334 ret = postcopy_chunk_hostpages(ms, block);
2341 * Update the unsentmap to be unsentmap = unsentmap | dirty
2343 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2344 #ifdef DEBUG_POSTCOPY
2345 ram_debug_dump_bitmap(unsentmap, true, pages);
2348 trace_ram_postcopy_send_discard_bitmap();
2350 ret = postcopy_each_ram_send_discard(ms);
2357 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2359 * Returns zero on success
2361 * @rbname: name of the RAMBlock of the request. NULL means the
2362 * same as the last one.
2363 * @start: byte offset within the RAMBlock
2364 * @length: length (in bytes) to discard
2366 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2370 trace_ram_discard_range(rbname, start, length);
2373 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2376 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2380 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2381 length >> qemu_target_page_bits());
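/* Clearing the receivedmap bits makes these pages count as not yet
 * received, so they can be populated again, e.g. by postcopy. */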
2382 ret = ram_block_discard_range(rb, start, length);
2391 * For every allocation, we try not to crash the VM if the
2392 * allocation fails.
2394 static int xbzrle_init(void)
2396 Error *local_err = NULL;
2398 if (!migrate_use_xbzrle()) {
2402 XBZRLE_cache_lock();
2404 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2405 if (!XBZRLE.zero_target_page) {
2406 error_report("%s: Error allocating zero page", __func__);
2410 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2411 TARGET_PAGE_SIZE, &local_err);
2412 if (!XBZRLE.cache) {
2413 error_report_err(local_err);
2414 goto free_zero_page;
2417 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2418 if (!XBZRLE.encoded_buf) {
2419 error_report("%s: Error allocating encoded_buf", __func__);
2423 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2424 if (!XBZRLE.current_buf) {
2425 error_report("%s: Error allocating current_buf", __func__);
2426 goto free_encoded_buf;
2429 /* We are all good */
2430 XBZRLE_cache_unlock();
2434 g_free(XBZRLE.encoded_buf);
2435 XBZRLE.encoded_buf = NULL;
2437 cache_fini(XBZRLE.cache);
2438 XBZRLE.cache = NULL;
2440 g_free(XBZRLE.zero_target_page);
2441 XBZRLE.zero_target_page = NULL;
2443 XBZRLE_cache_unlock();
2447 static int ram_state_init(RAMState **rsp)
2449 *rsp = g_try_new0(RAMState, 1);
2452 error_report("%s: Init ramstate fail", __func__);
2456 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2457 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2458 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2461 * Count the total number of pages used by ram blocks not including any
2462 * gaps due to alignment or unplugs.
2464 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2466 ram_state_reset(*rsp);
2471 static void ram_list_init_bitmaps(void)
2474 unsigned long pages;
2476 /* Skip setting bitmap if there is no RAM */
2477 if (ram_bytes_total()) {
2478 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2479 pages = block->max_length >> TARGET_PAGE_BITS;
2480 block->bmap = bitmap_new(pages);
2481 bitmap_set(block->bmap, 0, pages);
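/* Every page starts out dirty, so the first pass transmits all of RAM. */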
2482 if (migrate_postcopy_ram()) {
2483 block->unsentmap = bitmap_new(pages);
2484 bitmap_set(block->unsentmap, 0, pages);
2490 static void ram_init_bitmaps(RAMState *rs)
2492 /* For memory_global_dirty_log_start below. */
2493 qemu_mutex_lock_iothread();
2494 qemu_mutex_lock_ramlist();
2497 ram_list_init_bitmaps();
2498 memory_global_dirty_log_start();
2499 migration_bitmap_sync(rs);
2502 qemu_mutex_unlock_ramlist();
2503 qemu_mutex_unlock_iothread();
2506 static int ram_init_all(RAMState **rsp)
2508 if (ram_state_init(rsp)) {
2512 if (xbzrle_init()) {
2513 ram_state_cleanup(rsp);
2517 ram_init_bitmaps(*rsp);
2522 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2528 * Postcopy is not using xbzrle/compression, so no need for that.
2529 * Also, since the source is already halted, we don't need to care
2530 * about dirty page logging either.
2533 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2534 pages += bitmap_count_one(block->bmap,
2535 block->used_length >> TARGET_PAGE_BITS);
2538 /* This may not be aligned with current bitmaps. Recalculate. */
2539 rs->migration_dirty_pages = pages;
2541 rs->last_seen_block = NULL;
2542 rs->last_sent_block = NULL;
2544 rs->last_version = ram_list.version;
2546 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2547 * matter what we have sent.
2549 rs->ram_bulk_stage = false;
2551 /* Update RAMState cache of output QEMUFile */
2554 trace_ram_state_resume_prepare(pages);
2558 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2559 * a long-running RCU critical section. When RCU reclaims in the code
2560 * start to become numerous it will be necessary to reduce the
2561 * granularity of these critical sections.
2565 * ram_save_setup: Setup RAM for migration
2567 * Returns zero to indicate success and negative for error
2569 * @f: QEMUFile where to send the data
2570 * @opaque: RAMState pointer
2572 static int ram_save_setup(QEMUFile *f, void *opaque)
2574 RAMState **rsp = opaque;
2577 if (compress_threads_save_setup()) {
2581 /* migration has already setup the bitmap, reuse it. */
2582 if (!migration_in_colo_state()) {
2583 if (ram_init_all(rsp) != 0) {
2584 compress_threads_save_cleanup();
2592 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2594 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2595 qemu_put_byte(f, strlen(block->idstr));
2596 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2597 qemu_put_be64(f, block->used_length);
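/* The page size is only sent for blocks whose page size differs from the
 * host page size; ram_load() cross-checks it when postcopy has been advised. */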
2598 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2599 qemu_put_be64(f, block->page_size);
2605 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2606 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2608 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2614 * ram_save_iterate: iterative stage for migration
2616 * Returns zero to indicate success and negative for error
2618 * @f: QEMUFile where to send the data
2619 * @opaque: RAMState pointer
2621 static int ram_save_iterate(QEMUFile *f, void *opaque)
2623 RAMState **temp = opaque;
2624 RAMState *rs = *temp;
2630 if (blk_mig_bulk_active()) {
2631 /* Avoid transferring ram during bulk phase of block migration as
2632 * the bulk phase will usually take a long time and transferring
2633 * ram updates during that time is pointless. */
2638 if (ram_list.version != rs->last_version) {
2639 ram_state_reset(rs);
2642 /* Read version before ram_list.blocks */
2645 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2647 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2649 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2650 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2653 if (qemu_file_get_error(f)) {
2657 pages = ram_find_and_save_block(rs, false);
2658 /* no more pages to send */
2665 /* we want to check in the 1st loop, just in case it was the 1st time
2666 and we had to sync the dirty bitmap.
2667 qemu_clock_get_ns() is a bit expensive, so we only check every few iterations.
2670 if ((i & 63) == 0) {
2671 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2672 if (t1 > MAX_WAIT) {
2673 trace_ram_save_iterate_big_wait(t1, i);
2679 flush_compressed_data(rs);
2683 * Must occur before EOS (or any QEMUFile operation)
2684 * because of RDMA protocol.
2686 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2689 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
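/* the 8 bytes below account for the EOS marker just written */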
2690 ram_counters.transferred += 8;
2692 ret = qemu_file_get_error(f);
2701 * ram_save_complete: function called to send the remaining amount of ram
2703 * Returns zero to indicate success
2705 * Called with iothread lock
2707 * @f: QEMUFile where to send the data
2708 * @opaque: RAMState pointer
2710 static int ram_save_complete(QEMUFile *f, void *opaque)
2712 RAMState **temp = opaque;
2713 RAMState *rs = *temp;
2717 if (!migration_in_postcopy()) {
2718 migration_bitmap_sync(rs);
2721 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2723 /* try transferring iterative blocks of memory */
2725 /* flush all remaining blocks regardless of rate limiting */
2729 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2730 /* no more blocks to send */
2736 flush_compressed_data(rs);
2737 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2741 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2746 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2747 uint64_t *res_precopy_only,
2748 uint64_t *res_compatible,
2749 uint64_t *res_postcopy_only)
2751 RAMState **temp = opaque;
2752 RAMState *rs = *temp;
2753 uint64_t remaining_size;
2755 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2757 if (!migration_in_postcopy() &&
2758 remaining_size < max_size) {
2759 qemu_mutex_lock_iothread();
2761 migration_bitmap_sync(rs);
2763 qemu_mutex_unlock_iothread();
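/* recompute: the sync above may have added newly dirtied pages */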
2764 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2767 if (migrate_postcopy_ram()) {
2768 /* We can do postcopy, and all the data is postcopiable */
2769 *res_compatible += remaining_size;
2771 *res_precopy_only += remaining_size;
2775 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2777 unsigned int xh_len;
2779 uint8_t *loaded_data;
2781 /* extract RLE header */
2782 xh_flags = qemu_get_byte(f);
2783 xh_len = qemu_get_be16(f);
2785 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2786 error_report("Failed to load XBZRLE page - wrong compression!");
2790 if (xh_len > TARGET_PAGE_SIZE) {
2791 error_report("Failed to load XBZRLE page - len overflow!");
2794 loaded_data = XBZRLE.decoded_buf;
2795 /* load data and decode */
2796 /* it can change loaded_data to point to an internal buffer */
2797 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2800 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2801 TARGET_PAGE_SIZE) == -1) {
2802 error_report("Failed to load XBZRLE page - decode error!");
2810 * ram_block_from_stream: read a RAMBlock id from the migration stream
2812 * Must be called from within a rcu critical section.
2814 * Returns a pointer from within the RCU-protected ram_list.
2816 * @f: QEMUFile where to read the data from
2817 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2819 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2821 static RAMBlock *block = NULL;
2825 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2827 error_report("Ack, bad migration stream!");
2833 len = qemu_get_byte(f);
2834 qemu_get_buffer(f, (uint8_t *)id, len);
2837 block = qemu_ram_block_by_name(id);
2839 error_report("Can't find block %s", id);
2843 if (!qemu_ram_is_migratable(block)) {
2844 error_report("block %s should not be migrated !", id);
2851 static inline void *host_from_ram_block_offset(RAMBlock *block,
2854 if (!offset_in_ramblock(block, offset)) {
2858 return block->host + offset;
2862 * ram_handle_compressed: handle the zero page case
2864 * If a page (or a whole RDMA chunk) has been
2865 * determined to be zero, then zap it.
2867 * @host: host address for the zero page
2868 * @ch: what the page is filled from. We only support zero
2869 * @size: size of the zero page
2871 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2873 if (ch != 0 || !is_zero_range(host, size)) {
2874 memset(host, ch, size);
2878 /* return the size after decompression, or a negative value on error */
2880 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2881 const uint8_t *source, size_t source_len)
2885 err = inflateReset(stream);
2890 stream->avail_in = source_len;
2891 stream->next_in = (uint8_t *)source;
2892 stream->avail_out = dest_len;
2893 stream->next_out = dest;
2895 err = inflate(stream, Z_NO_FLUSH);
2896 if (err != Z_STREAM_END) {
2900 return stream->total_out;
2903 static void *do_data_decompress(void *opaque)
2905 DecompressParam *param = opaque;
2906 unsigned long pagesize;
2910 qemu_mutex_lock(&param->mutex);
2911 while (!param->quit) {
2916 qemu_mutex_unlock(&param->mutex);
2918 pagesize = TARGET_PAGE_SIZE;
2920 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2921 param->compbuf, len);
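/* A failed decompression is only treated as fatal when
 * decompress_error_check is set; presumably older senders could
 * produce streams that occasionally fail to decompress. */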
2922 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2923 error_report("decompress data failed");
2924 qemu_file_set_error(decomp_file, ret);
2927 qemu_mutex_lock(&decomp_done_lock);
2929 qemu_cond_signal(&decomp_done_cond);
2930 qemu_mutex_unlock(&decomp_done_lock);
2932 qemu_mutex_lock(&param->mutex);
2934 qemu_cond_wait(&param->cond, &param->mutex);
2937 qemu_mutex_unlock(&param->mutex);
2942 static int wait_for_decompress_done(void)
2944 int idx, thread_count;
2946 if (!migrate_use_compression()) {
2950 thread_count = migrate_decompress_threads();
2951 qemu_mutex_lock(&decomp_done_lock);
2952 for (idx = 0; idx < thread_count; idx++) {
2953 while (!decomp_param[idx].done) {
2954 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2957 qemu_mutex_unlock(&decomp_done_lock);
2958 return qemu_file_get_error(decomp_file);
2961 static void compress_threads_load_cleanup(void)
2963 int i, thread_count;
2965 if (!migrate_use_compression()) {
2968 thread_count = migrate_decompress_threads();
2969 for (i = 0; i < thread_count; i++) {
2971 * we use it as an indicator of whether the thread is
2972 * properly initialized or not
2974 if (!decomp_param[i].compbuf) {
2978 qemu_mutex_lock(&decomp_param[i].mutex);
2979 decomp_param[i].quit = true;
2980 qemu_cond_signal(&decomp_param[i].cond);
2981 qemu_mutex_unlock(&decomp_param[i].mutex);
2983 for (i = 0; i < thread_count; i++) {
2984 if (!decomp_param[i].compbuf) {
2988 qemu_thread_join(decompress_threads + i);
2989 qemu_mutex_destroy(&decomp_param[i].mutex);
2990 qemu_cond_destroy(&decomp_param[i].cond);
2991 inflateEnd(&decomp_param[i].stream);
2992 g_free(decomp_param[i].compbuf);
2993 decomp_param[i].compbuf = NULL;
2995 g_free(decompress_threads);
2996 g_free(decomp_param);
2997 decompress_threads = NULL;
2998 decomp_param = NULL;
3002 static int compress_threads_load_setup(QEMUFile *f)
3004 int i, thread_count;
3006 if (!migrate_use_compression()) {
3010 thread_count = migrate_decompress_threads();
3011 decompress_threads = g_new0(QemuThread, thread_count);
3012 decomp_param = g_new0(DecompressParam, thread_count);
3013 qemu_mutex_init(&decomp_done_lock);
3014 qemu_cond_init(&decomp_done_cond);
3016 for (i = 0; i < thread_count; i++) {
3017 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3021 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3022 qemu_mutex_init(&decomp_param[i].mutex);
3023 qemu_cond_init(&decomp_param[i].cond);
3024 decomp_param[i].done = true;
3025 decomp_param[i].quit = false;
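/* done == true means the worker is free to take a new page; compbuf also
 * doubles as the "properly initialized" marker used by
 * compress_threads_load_cleanup(). */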
3026 qemu_thread_create(decompress_threads + i, "decompress",
3027 do_data_decompress, decomp_param + i,
3028 QEMU_THREAD_JOINABLE);
3032 compress_threads_load_cleanup();
3036 static void decompress_data_with_multi_threads(QEMUFile *f,
3037 void *host, int len)
3039 int idx, thread_count;
3041 thread_count = migrate_decompress_threads();
3042 qemu_mutex_lock(&decomp_done_lock);
3044 for (idx = 0; idx < thread_count; idx++) {
3045 if (decomp_param[idx].done) {
3046 decomp_param[idx].done = false;
3047 qemu_mutex_lock(&decomp_param[idx].mutex);
3048 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3049 decomp_param[idx].des = host;
3050 decomp_param[idx].len = len;
3051 qemu_cond_signal(&decomp_param[idx].cond);
3052 qemu_mutex_unlock(&decomp_param[idx].mutex);
3056 if (idx < thread_count) {
3059 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3062 qemu_mutex_unlock(&decomp_done_lock);
3066 * ram_load_setup: Setup RAM for migration incoming side
3068 * Returns zero to indicate success and negative for error
3070 * @f: QEMUFile where to receive the data
3071 * @opaque: RAMState pointer
3073 static int ram_load_setup(QEMUFile *f, void *opaque)
3075 if (compress_threads_load_setup(f)) {
3079 xbzrle_load_setup();
3080 ramblock_recv_map_init();
3084 static int ram_load_cleanup(void *opaque)
3087 xbzrle_load_cleanup();
3088 compress_threads_load_cleanup();
3090 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3091 g_free(rb->receivedmap);
3092 rb->receivedmap = NULL;
3098 * ram_postcopy_incoming_init: allocate postcopy data structures
3100 * Returns 0 for success and negative if there was an error
3102 * @mis: current migration incoming state
3104 * Allocate data structures etc needed by incoming migration with
3105 * postcopy-ram. postcopy-ram's similarly named
3106 * postcopy_ram_incoming_init does the work.
3108 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3110 unsigned long ram_pages = last_ram_page();
3112 return postcopy_ram_incoming_init(mis, ram_pages);
3116 * ram_load_postcopy: load a page in postcopy case
3118 * Returns 0 for success or -errno in case of error
3120 * Called in postcopy mode by ram_load().
3121 * rcu_read_lock is taken prior to this being called.
3123 * @f: QEMUFile where to send the data
3125 static int ram_load_postcopy(QEMUFile *f)
3127 int flags = 0, ret = 0;
3128 bool place_needed = false;
3129 bool matching_page_sizes = false;
3130 MigrationIncomingState *mis = migration_incoming_get_current();
3131 /* Temporary page that is later 'placed' */
3132 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3133 void *last_host = NULL;
3134 bool all_zero = false;
3136 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3139 void *page_buffer = NULL;
3140 void *place_source = NULL;
3141 RAMBlock *block = NULL;
3144 addr = qemu_get_be64(f);
3147 * If there is a QEMU file error, we should stop here; "addr" may be invalid.
3150 ret = qemu_file_get_error(f);
3155 flags = addr & ~TARGET_PAGE_MASK;
3156 addr &= TARGET_PAGE_MASK;
3158 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3159 place_needed = false;
3160 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3161 block = ram_block_from_stream(f, flags);
3163 host = host_from_ram_block_offset(block, addr);
3165 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3169 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
3171 * Postcopy requires that we place whole host pages atomically;
3172 * these may be huge pages for RAMBlocks that are backed by
3174 * To make it atomic, the data is read into a temporary page
3175 * that's moved into place later.
3176 * The migration protocol uses, possibly smaller, target pages;
3177 * however, the source ensures it always sends all the components
3178 * of a host page in order.
3180 page_buffer = postcopy_host_page +
3181 ((uintptr_t)host & (block->page_size - 1));
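/* page_buffer now points at this target page's slot within the temporary host page */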
3182 /* If all target pages are zero then we can optimise the placement */
3183 if (!((uintptr_t)host & (block->page_size - 1))) {
3186 /* not the 1st target page within the host page */
3187 if (host != (last_host + TARGET_PAGE_SIZE)) {
3188 error_report("Non-sequential target page %p/%p",
3197 * If it's the last part of a host page then we place the host page.
3200 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3201 (block->page_size - 1)) == 0;
3202 place_source = postcopy_host_page;
3206 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3207 case RAM_SAVE_FLAG_ZERO:
3208 ch = qemu_get_byte(f);
3209 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3215 case RAM_SAVE_FLAG_PAGE:
3217 if (!place_needed || !matching_page_sizes) {
3218 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3220 /* Avoids the qemu_file copy during postcopy, which is
3221 * going to do a copy later; can only do it when we
3222 * do this read in one go (matching page sizes)
3224 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3228 case RAM_SAVE_FLAG_EOS:
3232 error_report("Unknown combination of migration flags: %#x"
3233 " (postcopy mode)", flags);
3238 /* Detect any possible file errors */
3239 if (!ret && qemu_file_get_error(f)) {
3240 ret = qemu_file_get_error(f);
3243 if (!ret && place_needed) {
3244 /* This gets called at the last target page in the host page */
3245 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
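/* host is the last target page of the host page, so step back to the
 * start of that host page for placement */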
3248 ret = postcopy_place_page_zero(mis, place_dest,
3251 ret = postcopy_place_page(mis, place_dest,
3252 place_source, block);
3260 static bool postcopy_is_advised(void)
3262 PostcopyState ps = postcopy_state_get();
3263 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3266 static bool postcopy_is_running(void)
3268 PostcopyState ps = postcopy_state_get();
3269 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3272 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3274 int flags = 0, ret = 0, invalid_flags = 0;
3275 static uint64_t seq_iter;
3278 * If the system is running in postcopy mode, page inserts to host memory must be atomic.
3281 bool postcopy_running = postcopy_is_running();
3282 /* ADVISE comes earlier; it shows that the source has the postcopy capability enabled */
3283 bool postcopy_advised = postcopy_is_advised();
3287 if (version_id != 4) {
3291 if (!migrate_use_compression()) {
3292 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3294 /* This RCU critical section can be very long running.
3295 * When RCU reclaims in the code start to become numerous,
3296 * it will be necessary to reduce the granularity of this critical section.
3301 if (postcopy_running) {
3302 ret = ram_load_postcopy(f);
3305 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3306 ram_addr_t addr, total_ram_bytes;
3310 addr = qemu_get_be64(f);
3311 flags = addr & ~TARGET_PAGE_MASK;
3312 addr &= TARGET_PAGE_MASK;
3314 if (flags & invalid_flags) {
3315 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3316 error_report("Received an unexpected compressed page");
3323 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3324 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3325 RAMBlock *block = ram_block_from_stream(f, flags);
3327 host = host_from_ram_block_offset(block, addr);
3329 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3333 ramblock_recv_bitmap_set(block, host);
3334 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3337 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3338 case RAM_SAVE_FLAG_MEM_SIZE:
3339 /* Synchronize RAM block list */
3340 total_ram_bytes = addr;
3341 while (!ret && total_ram_bytes) {
3346 len = qemu_get_byte(f);
3347 qemu_get_buffer(f, (uint8_t *)id, len);
3349 length = qemu_get_be64(f);
3351 block = qemu_ram_block_by_name(id);
3352 if (block && !qemu_ram_is_migratable(block)) {
3353 error_report("block %s should not be migrated !", id);
3356 if (length != block->used_length) {
3357 Error *local_err = NULL;
3359 ret = qemu_ram_resize(block, length,
3362 error_report_err(local_err);
3365 /* For postcopy we need to check hugepage sizes match */
3366 if (postcopy_advised &&
3367 block->page_size != qemu_host_page_size) {
3368 uint64_t remote_page_size = qemu_get_be64(f);
3369 if (remote_page_size != block->page_size) {
3370 error_report("Mismatched RAM page size %s "
3371 "(local) %zd != %" PRId64,
3372 id, block->page_size,
3377 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3380 error_report("Unknown ramblock \"%s\", cannot "
3381 "accept migration", id);
3385 total_ram_bytes -= length;
3389 case RAM_SAVE_FLAG_ZERO:
3390 ch = qemu_get_byte(f);
3391 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3394 case RAM_SAVE_FLAG_PAGE:
3395 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3398 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3399 len = qemu_get_be32(f);
3400 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3401 error_report("Invalid compressed data length: %d", len);
3405 decompress_data_with_multi_threads(f, host, len);
3408 case RAM_SAVE_FLAG_XBZRLE:
3409 if (load_xbzrle(f, addr, host) < 0) {
3410 error_report("Failed to decompress XBZRLE page at "
3411 RAM_ADDR_FMT, addr);
3416 case RAM_SAVE_FLAG_EOS:
3420 if (flags & RAM_SAVE_FLAG_HOOK) {
3421 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3423 error_report("Unknown combination of migration flags: %#x",
3429 ret = qemu_file_get_error(f);
3433 ret |= wait_for_decompress_done();
3435 trace_ram_load_complete(ret, seq_iter);
3439 static bool ram_has_postcopy(void *opaque)
3441 return migrate_postcopy_ram();
3444 /* Sync all the dirty bitmaps with the destination VM. */
3445 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3448 QEMUFile *file = s->to_dst_file;
3449 int ramblock_count = 0;
3451 trace_ram_dirty_bitmap_sync_start();
3453 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3454 qemu_savevm_send_recv_bitmap(file, block->idstr);
3455 trace_ram_dirty_bitmap_request(block->idstr);
3459 trace_ram_dirty_bitmap_sync_wait();
3461 /* Wait until all the ramblocks' dirty bitmaps are synced */
3462 while (ramblock_count--) {
3463 qemu_sem_wait(&s->rp_state.rp_sem);
3466 trace_ram_dirty_bitmap_sync_complete();
3471 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3473 qemu_sem_post(&s->rp_state.rp_sem);
3477 * Read the received bitmap and invert it to form the initial dirty bitmap.
3478 * This is only used when the postcopy migration is paused but wants
3479 * to resume from a middle point.
3481 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3484 QEMUFile *file = s->rp_state.from_dst_file;
3485 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3486 uint64_t local_size = nbits / 8;
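/* local_size is the bitmap size in bytes; it is rounded up below to match
 * the sender's 8-byte padding */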
3487 uint64_t size, end_mark;
3489 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3491 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3492 error_report("%s: incorrect state %s", __func__,
3493 MigrationStatus_str(s->state));
3498 * Note: see comments in ramblock_recv_bitmap_send() on why we
3499 * need the endianness conversion, and the padding.
3501 local_size = ROUND_UP(local_size, 8);
3504 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3506 size = qemu_get_be64(file);
3508 /* The size of the bitmap should match with our ramblock */
3509 if (size != local_size) {
3510 error_report("%s: ramblock '%s' bitmap size mismatch "
3511 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3512 block->idstr, size, local_size);
3517 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3518 end_mark = qemu_get_be64(file);
3520 ret = qemu_file_get_error(file);
3521 if (ret || size != local_size) {
3522 error_report("%s: read bitmap failed for ramblock '%s': %d"
3523 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3524 __func__, block->idstr, ret, local_size, size);
3529 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3530 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3531 __func__, block->idstr, end_mark);
3537 * Endianness conversion. We are in postcopy (though paused).
3538 * The dirty bitmap won't change. We can directly modify it.
3540 bitmap_from_le(block->bmap, le_bitmap, nbits);
3543 * What we received is the "received bitmap". Invert it to form the
3544 * initial dirty bitmap for this ramblock.
3546 bitmap_complement(block->bmap, block->bmap, nbits);
3548 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3551 * We succeeded in syncing the bitmap for the current ramblock. If this is
3552 * the last one to sync, we need to notify the main send thread.
3554 ram_dirty_bitmap_reload_notify(s);
3562 static int ram_resume_prepare(MigrationState *s, void *opaque)
3564 RAMState *rs = *(RAMState **)opaque;
3567 ret = ram_dirty_bitmap_sync_all(s, rs);
3572 ram_state_resume_prepare(rs, s->to_dst_file);
3577 static SaveVMHandlers savevm_ram_handlers = {
3578 .save_setup = ram_save_setup,
3579 .save_live_iterate = ram_save_iterate,
3580 .save_live_complete_postcopy = ram_save_complete,
3581 .save_live_complete_precopy = ram_save_complete,
3582 .has_postcopy = ram_has_postcopy,
3583 .save_live_pending = ram_save_pending,
3584 .load_state = ram_load,
3585 .save_cleanup = ram_save_cleanup,
3586 .load_setup = ram_load_setup,
3587 .load_cleanup = ram_load_cleanup,
3588 .resume_prepare = ram_resume_prepare,
3591 void ram_mig_init(void)
3593 qemu_mutex_init(&XBZRLE.lock);
3594 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
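/* the version number 4 must match the version_id check in ram_load() */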