56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <[email protected]>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
f348b6d1 30#include "qemu/cutils.h"
56e93d26
JQ
31#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
7205c9ec 33#include "qemu/main-loop.h"
709e3fe8 34#include "xbzrle.h"
7b1e1a22 35#include "ram.h"
6666c96a 36#include "migration.h"
f2a8f0a6 37#include "migration/register.h"
7b1e1a22 38#include "migration/misc.h"
08a0aee1 39#include "qemu-file.h"
be07b0ac 40#include "postcopy-ram.h"
53d37d36 41#include "page_cache.h"
56e93d26 42#include "qemu/error-report.h"
e688df6b 43#include "qapi/error.h"
ab7cbb0b 44#include "qapi/qapi-types-migration.h"
9af23989 45#include "qapi/qapi-events-migration.h"
8acabf69 46#include "qapi/qmp/qerror.h"
56e93d26 47#include "trace.h"
56e93d26 48#include "exec/ram_addr.h"
f9494614 49#include "exec/target_page.h"
56e93d26 50#include "qemu/rcu_queue.h"
a91246c9 51#include "migration/colo.h"
53d37d36 52#include "block.h"
b0c3cf94 53#include "sysemu/cpu-throttle.h"
edd090c7 54#include "savevm.h"
b9ee2f7d 55#include "qemu/iov.h"
d32ca5ad 56#include "multifd.h"
278e2f55
AG
57#include "sysemu/runstate.h"
58
e5fdf920
LS
59#include "hw/boards.h" /* for machine_dump_guest_core() */
60
278e2f55
AG
61#if defined(__linux__)
62#include "qemu/userfaultfd.h"
63#endif /* defined(__linux__) */
56e93d26 64
56e93d26
JQ
65/***********************************************************/
66/* ram save/restore */
67
bb890ed5
JQ
68/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS. It
69 * worked for pages that were filled with the same char. We switched
70 * it to only search for the zero value. And to avoid confusion with
71 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
72 */
73
56e93d26 74#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 75#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
76#define RAM_SAVE_FLAG_MEM_SIZE 0x04
77#define RAM_SAVE_FLAG_PAGE 0x08
78#define RAM_SAVE_FLAG_EOS 0x10
79#define RAM_SAVE_FLAG_CONTINUE 0x20
80#define RAM_SAVE_FLAG_XBZRLE 0x40
81/* 0x80 is reserved in migration.h, start with 0x100 next */
82#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
83
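/*
 * Illustration of how these flags are used on the wire (see
 * save_page_header() below): each page is preceded by a be64 word that
 * ORs the page offset with its flags.  E.g. a zero page at offset
 * 0x2000 of a block that was already announced is sent as
 * 0x2000 | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE == 0x2022,
 * followed by a single zero byte.
 */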
9360447d
JQ
84XBZRLECacheStats xbzrle_counters;
85
56e93d26
JQ
86/* struct contains XBZRLE cache and a static page
87 used by the compression */
88static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
c00e0928
JQ
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
f265e0e4
JQ
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
56e93d26
JQ
100} XBZRLE;
101
56e93d26
JQ
102static void XBZRLE_cache_lock(void)
103{
f4c51a6b 104 if (migrate_use_xbzrle()) {
56e93d26 105 qemu_mutex_lock(&XBZRLE.lock);
f4c51a6b 106 }
56e93d26
JQ
107}
108
109static void XBZRLE_cache_unlock(void)
110{
f4c51a6b 111 if (migrate_use_xbzrle()) {
56e93d26 112 qemu_mutex_unlock(&XBZRLE.lock);
f4c51a6b 113 }
56e93d26
JQ
114}
115
3d0684b2
JQ
116/**
117 * xbzrle_cache_resize: resize the xbzrle cache
118 *
cbde7be9 119 * This function is called from migrate_params_apply in the main
3d0684b2
JQ
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
123 *
c9dede2d 124 * Returns 0 for success or -1 for error
3d0684b2
JQ
125 *
126 * @new_size: new cache size
8acabf69 127 * @errp: set *errp with the reason if the check fails
56e93d26 128 */
8b9407a0 129int xbzrle_cache_resize(uint64_t new_size, Error **errp)
56e93d26
JQ
130{
131 PageCache *new_cache;
c9dede2d 132 int64_t ret = 0;
56e93d26 133
8acabf69
JQ
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
139 }
140
2a313e5c
JQ
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
c9dede2d 143 return 0;
2a313e5c
JQ
144 }
145
56e93d26
JQ
146 XBZRLE_cache_lock();
147
148 if (XBZRLE.cache != NULL) {
80f8dfde 149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 150 if (!new_cache) {
56e93d26
JQ
151 ret = -1;
152 goto out;
153 }
154
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
157 }
56e93d26
JQ
158out:
159 XBZRLE_cache_unlock();
160 return ret;
161}
162
3ded54b1 163bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
164{
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
167}
168
343f632c
DDAG
169#undef RAMBLOCK_FOREACH
170
fbd162e6
YK
171int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
172{
173 RAMBlock *block;
174 int ret = 0;
175
89ac5a1d
DDAG
176 RCU_READ_LOCK_GUARD();
177
fbd162e6
YK
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
182 }
183 }
fbd162e6
YK
184 return ret;
185}
186
f9494614
AP
187static void ramblock_recv_map_init(void)
188{
189 RAMBlock *rb;
190
fbd162e6 191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
194 }
195}
196
197int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
198{
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
201}
202
1cba9f6e
DDAG
203bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
204{
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
206}
207
f9494614
AP
208void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
209{
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
211}
212
213void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
215{
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
219}
220
a335debb
PX
221#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
222
223/*
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
225 *
226 * Returns the number of bytes sent (>0) on success, or <0 on error.
227 */
228int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
230{
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
234
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
238 }
239
898ba906 240 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
a335debb
PX
241
242 /*
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see below
245 * comment). So extend it a bit beforehand.
246 */
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
248
249 /*
250 * Always use little endian when sending the bitmap. This is
251 * required when the source and destination VMs are not using the
3a4452d8 252 * same endianness. (Note: big endian won't work.)
a335debb
PX
253 */
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
255
256 /* Size of the bitmap, in bytes */
a725ef9f 257 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
258
259 /*
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
264 */
265 size = ROUND_UP(size, 8);
266
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
269 /*
270 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 271 * some "mysterious" reason.
a335debb
PX
272 */
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
275
bf269906 276 g_free(le_bitmap);
a335debb
PX
277
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
280 }
281
282 return size + sizeof(size);
283}
284
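/*
 * Worked example for the function above, assuming 4 KiB target pages:
 * a RAMBlock with a postcopy_length of 1 GiB gives nbits = 262144 and
 * a 32768-byte bitmap, already a multiple of 8; a 400 KiB block gives
 * nbits = 100 and a 13-byte bitmap that is padded up to 16 bytes so
 * 32-bit and 64-bit QEMUs agree on the stream layout.  The returned
 * value is size + 8 for the leading length field; the trailing
 * RAMBLOCK_RECV_BITMAP_ENDING marker is not counted.
 */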
ec481c6c
JQ
285/*
286 * An outstanding page request, on the source, having been received
287 * and queued
288 */
289struct RAMSrcPageRequest {
290 RAMBlock *rb;
291 hwaddr offset;
292 hwaddr len;
293
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
295};
296
6f37bb8b
JQ
297/* State of RAM for migration */
298struct RAMState {
204b88b8
JQ
299 /* QEMUFile used for this migration */
300 QEMUFile *f;
278e2f55
AG
301 /* UFFD file descriptor, used in 'write-tracking' migration */
302 int uffdio_fd;
6f37bb8b
JQ
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock *last_seen_block;
305 /* Last block from where we have sent data */
306 RAMBlock *last_sent_block;
269ace29
JQ
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page;
6f37bb8b
JQ
309 /* last ram version we have seen */
310 uint32_t last_version;
8d820d6f
JQ
311 /* How many times we have dirtied too many pages */
312 int dirty_rate_high_cnt;
f664da80
JQ
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
eac74159 316 /* bytes transferred at start_time */
c4bdf0cf 317 uint64_t bytes_xfer_prev;
a66cd90c 318 /* number of dirty pages since start_time */
68908ed6 319 uint64_t num_dirty_pages_period;
b5833fde
JQ
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
1a373522
DH
326 /* Start using XBZRLE (e.g., after the first round). */
327 bool xbzrle_enabled;
05931ec5
JQ
328 /* Are we on the last stage of migration */
329 bool last_stage;
76e03000
XG
330 /* compression statistics since the beginning of the period */
331 /* number of times there was no free thread to compress data */
332 uint64_t compress_thread_busy_prev;
333 /* amount of bytes after compression */
334 uint64_t compressed_size_prev;
335 /* amount of compressed pages */
336 uint64_t compress_pages_prev;
337
be8b02ed
XG
338 /* total handled target pages at the beginning of period */
339 uint64_t target_page_count_prev;
340 /* total handled target pages since start */
341 uint64_t target_page_count;
9360447d 342 /* number of dirty bits in the bitmap */
2dfaf12e 343 uint64_t migration_dirty_pages;
386a907b 344 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 345 QemuMutex bitmap_mutex;
68a098f3
JQ
346 /* The RAMBlock used in the last src_page_requests */
347 RAMBlock *last_req_rb;
ec481c6c
JQ
348 /* Queue of outstanding page requests from the destination */
349 QemuMutex src_page_req_mutex;
b58deb34 350 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
351};
352typedef struct RAMState RAMState;
353
53518d94 354static RAMState *ram_state;
6f37bb8b 355
bd227060
WW
356static NotifierWithReturnList precopy_notifier_list;
357
358void precopy_infrastructure_init(void)
359{
360 notifier_with_return_list_init(&precopy_notifier_list);
361}
362
363void precopy_add_notifier(NotifierWithReturn *n)
364{
365 notifier_with_return_list_add(&precopy_notifier_list, n);
366}
367
368void precopy_remove_notifier(NotifierWithReturn *n)
369{
370 notifier_with_return_remove(n);
371}
372
373int precopy_notify(PrecopyNotifyReason reason, Error **errp)
374{
375 PrecopyNotifyData pnd;
376 pnd.reason = reason;
377 pnd.errp = errp;
378
379 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
380}
381
9edabd4d 382uint64_t ram_bytes_remaining(void)
2f4fde93 383{
bae416e5
DDAG
384 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
385 0;
2f4fde93
JQ
386}
387
9360447d 388MigrationStats ram_counters;
96506894 389
4c2d0f6d
DE
390static void ram_transferred_add(uint64_t bytes)
391{
ae680668
DE
392 if (runstate_is_running()) {
393 ram_counters.precopy_bytes += bytes;
394 } else if (migration_in_postcopy()) {
395 ram_counters.postcopy_bytes += bytes;
396 } else {
397 ram_counters.downtime_bytes += bytes;
398 }
4c2d0f6d
DE
399 ram_counters.transferred += bytes;
400}
401
b8fb8cb7
DDAG
402/* used by the search for pages to send */
403struct PageSearchStatus {
404 /* Current block being searched */
405 RAMBlock *block;
a935e30f
JQ
406 /* Current page to search from */
407 unsigned long page;
b8fb8cb7
DDAG
408 /* Set once we wrap around */
409 bool complete_round;
410};
411typedef struct PageSearchStatus PageSearchStatus;
412
76e03000
XG
413CompressionStats compression_counters;
414
56e93d26 415struct CompressParam {
56e93d26 416 bool done;
90e56fb4 417 bool quit;
5e5fdcff 418 bool zero_page;
56e93d26
JQ
419 QEMUFile *file;
420 QemuMutex mutex;
421 QemuCond cond;
422 RAMBlock *block;
423 ram_addr_t offset;
34ab9e97
XG
424
425 /* internally used fields */
dcaf446e 426 z_stream stream;
34ab9e97 427 uint8_t *originbuf;
56e93d26
JQ
428};
429typedef struct CompressParam CompressParam;
430
431struct DecompressParam {
73a8912b 432 bool done;
90e56fb4 433 bool quit;
56e93d26
JQ
434 QemuMutex mutex;
435 QemuCond cond;
436 void *des;
d341d9f3 437 uint8_t *compbuf;
56e93d26 438 int len;
797ca154 439 z_stream stream;
56e93d26
JQ
440};
441typedef struct DecompressParam DecompressParam;
442
443static CompressParam *comp_param;
444static QemuThread *compress_threads;
445/* comp_done_cond is used to wake up the migration thread when
446 * one of the compression threads has finished the compression.
447 * comp_done_lock is used together with comp_done_cond.
448 */
0d9f9a5c
LL
449static QemuMutex comp_done_lock;
450static QemuCond comp_done_cond;
56e93d26
JQ
451/* The empty QEMUFileOps will be used by file in CompressParam */
452static const QEMUFileOps empty_ops = { };
453
34ab9e97 454static QEMUFile *decomp_file;
56e93d26
JQ
455static DecompressParam *decomp_param;
456static QemuThread *decompress_threads;
73a8912b
LL
457static QemuMutex decomp_done_lock;
458static QemuCond decomp_done_cond;
56e93d26 459
5e5fdcff 460static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 461 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
462
463static void *do_data_compress(void *opaque)
464{
465 CompressParam *param = opaque;
a7a9a88f
LL
466 RAMBlock *block;
467 ram_addr_t offset;
5e5fdcff 468 bool zero_page;
56e93d26 469
a7a9a88f 470 qemu_mutex_lock(&param->mutex);
90e56fb4 471 while (!param->quit) {
a7a9a88f
LL
472 if (param->block) {
473 block = param->block;
474 offset = param->offset;
475 param->block = NULL;
476 qemu_mutex_unlock(&param->mutex);
477
5e5fdcff
XG
478 zero_page = do_compress_ram_page(param->file, &param->stream,
479 block, offset, param->originbuf);
a7a9a88f 480
0d9f9a5c 481 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 482 param->done = true;
5e5fdcff 483 param->zero_page = zero_page;
0d9f9a5c
LL
484 qemu_cond_signal(&comp_done_cond);
485 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
486
487 qemu_mutex_lock(&param->mutex);
488 } else {
56e93d26
JQ
489 qemu_cond_wait(&param->cond, &param->mutex);
490 }
56e93d26 491 }
a7a9a88f 492 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
493
494 return NULL;
495}
496
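/*
 * Hand-off between the migration thread and the worker above, in short:
 * the migration thread fills param->block/param->offset under
 * param->mutex and signals param->cond; the worker compresses the page
 * into its private QEMUFile buffer, then sets param->done under
 * comp_done_lock and signals comp_done_cond, at which point
 * compress_page_with_multi_thread() or flush_compressed_data() drains
 * that buffer into rs->f from the migration thread.
 */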
f0afa331 497static void compress_threads_save_cleanup(void)
56e93d26
JQ
498{
499 int i, thread_count;
500
05306935 501 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
502 return;
503 }
05306935 504
56e93d26
JQ
505 thread_count = migrate_compress_threads();
506 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
507 /*
508 * we use it as an indicator which shows if the thread is
509 * properly init'd or not
510 */
511 if (!comp_param[i].file) {
512 break;
513 }
05306935
FL
514
515 qemu_mutex_lock(&comp_param[i].mutex);
516 comp_param[i].quit = true;
517 qemu_cond_signal(&comp_param[i].cond);
518 qemu_mutex_unlock(&comp_param[i].mutex);
519
56e93d26 520 qemu_thread_join(compress_threads + i);
56e93d26
JQ
521 qemu_mutex_destroy(&comp_param[i].mutex);
522 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 523 deflateEnd(&comp_param[i].stream);
34ab9e97 524 g_free(comp_param[i].originbuf);
dcaf446e
XG
525 qemu_fclose(comp_param[i].file);
526 comp_param[i].file = NULL;
56e93d26 527 }
0d9f9a5c
LL
528 qemu_mutex_destroy(&comp_done_lock);
529 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
530 g_free(compress_threads);
531 g_free(comp_param);
56e93d26
JQ
532 compress_threads = NULL;
533 comp_param = NULL;
56e93d26
JQ
534}
535
dcaf446e 536static int compress_threads_save_setup(void)
56e93d26
JQ
537{
538 int i, thread_count;
539
540 if (!migrate_use_compression()) {
dcaf446e 541 return 0;
56e93d26 542 }
56e93d26
JQ
543 thread_count = migrate_compress_threads();
544 compress_threads = g_new0(QemuThread, thread_count);
545 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
546 qemu_cond_init(&comp_done_cond);
547 qemu_mutex_init(&comp_done_lock);
56e93d26 548 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
549 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550 if (!comp_param[i].originbuf) {
551 goto exit;
552 }
553
dcaf446e
XG
554 if (deflateInit(&comp_param[i].stream,
555 migrate_compress_level()) != Z_OK) {
34ab9e97 556 g_free(comp_param[i].originbuf);
dcaf446e
XG
557 goto exit;
558 }
559
e110aa91
C
560 /* comp_param[i].file is just used as a dummy buffer to save data,
561 * so its ops are set to empty.
56e93d26 562 */
c6ad5be7 563 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
56e93d26 564 comp_param[i].done = true;
90e56fb4 565 comp_param[i].quit = false;
56e93d26
JQ
566 qemu_mutex_init(&comp_param[i].mutex);
567 qemu_cond_init(&comp_param[i].cond);
568 qemu_thread_create(compress_threads + i, "compress",
569 do_data_compress, comp_param + i,
570 QEMU_THREAD_JOINABLE);
571 }
dcaf446e
XG
572 return 0;
573
574exit:
575 compress_threads_save_cleanup();
576 return -1;
56e93d26
JQ
577}
578
579/**
3d0684b2 580 * save_page_header: write page header to wire
56e93d26
JQ
581 *
582 * If this is the 1st block, it also writes the block identification
583 *
3d0684b2 584 * Returns the number of bytes written
56e93d26
JQ
585 *
586 * @f: QEMUFile where to send the data
587 * @block: block that contains the page we want to send
588 * @offset: offset inside the block for the page
589 * in the lower bits, it contains flags
590 */
2bf3aa85
JQ
591static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
592 ram_addr_t offset)
56e93d26 593{
9f5f380b 594 size_t size, len;
56e93d26 595
24795694
JQ
596 if (block == rs->last_sent_block) {
597 offset |= RAM_SAVE_FLAG_CONTINUE;
598 }
2bf3aa85 599 qemu_put_be64(f, offset);
56e93d26
JQ
600 size = 8;
601
602 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 603 len = strlen(block->idstr);
2bf3aa85
JQ
604 qemu_put_byte(f, len);
605 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 606 size += 1 + len;
24795694 607 rs->last_sent_block = block;
56e93d26
JQ
608 }
609 return size;
610}
611
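/*
 * Resulting wire layout, for illustration:
 *
 *   first page of a block:  be64(offset | flags), u8 len, idstr[len]
 *   subsequent pages:       be64(offset | flags | RAM_SAVE_FLAG_CONTINUE)
 *
 * hence the returned size is either 8 or 9 + strlen(idstr).
 */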
3d0684b2 612/**
179a8080 613 * mig_throttle_guest_down: throttle down the guest
3d0684b2
JQ
614 *
615 * Reduce amount of guest cpu execution to hopefully slow down memory
616 * writes. If guest dirty memory rate is reduced below the rate at
617 * which we can transfer pages to the destination then we should be
618 * able to complete migration. Some workloads dirty memory way too
619 * fast and will not effectively converge, even with auto-converge.
070afca2 620 */
cbbf8182
KZ
621static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
622 uint64_t bytes_dirty_threshold)
070afca2
JH
623{
624 MigrationState *s = migrate_get_current();
2594f56d 625 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
cbbf8182
KZ
626 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
627 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
4cbc9c7f 628 int pct_max = s->parameters.max_cpu_throttle;
070afca2 629
cbbf8182
KZ
630 uint64_t throttle_now = cpu_throttle_get_percentage();
631 uint64_t cpu_now, cpu_ideal, throttle_inc;
632
070afca2
JH
633 /* We have not started throttling yet. Let's start it. */
634 if (!cpu_throttle_active()) {
635 cpu_throttle_set(pct_initial);
636 } else {
637 /* Throttling already on, just increase the rate */
cbbf8182
KZ
638 if (!pct_tailslow) {
639 throttle_inc = pct_increment;
640 } else {
641 /* Compute the ideal CPU percentage used by the guest, which may
642 * make the dirty rate match the dirty rate threshold. */
643 cpu_now = 100 - throttle_now;
644 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
645 bytes_dirty_period);
646 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
647 }
648 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
649 }
650}
651
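/*
 * Numeric example of the tail-slow path above: if the guest is already
 * throttled to 80% (cpu_now = 20) and the period dirtied only slightly
 * more than was transferred (bytes_dirty_threshold / bytes_dirty_period
 * ~= 0.9), then cpu_ideal = 20 * 0.9 = 18 and the increment is
 * MIN(20 - 18, pct_increment) = 2, raising the throttle to 82% instead
 * of the full pct_increment jump the non-tailslow path would apply.
 */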
91fe9a8d
RL
652void mig_throttle_counter_reset(void)
653{
654 RAMState *rs = ram_state;
655
656 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
657 rs->num_dirty_pages_period = 0;
658 rs->bytes_xfer_prev = ram_counters.transferred;
659}
660
3d0684b2
JQ
661/**
662 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
663 *
6f37bb8b 664 * @rs: current RAM state
3d0684b2
JQ
665 * @current_addr: address for the zero page
666 *
667 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
668 * The important thing is that a stale (not-yet-0'd) page be replaced
669 * by the new data.
670 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 671 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 672 */
6f37bb8b 673static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 674{
1a373522 675 if (!rs->xbzrle_enabled) {
56e93d26
JQ
676 return;
677 }
678
679 /* We don't care if this fails to allocate a new cache page
680 * as long as it updated an old one */
c00e0928 681 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 682 ram_counters.dirty_sync_count);
56e93d26
JQ
683}
684
685#define ENCODING_FLAG_XBZRLE 0x1
686
687/**
688 * save_xbzrle_page: compress and send current page
689 *
690 * Returns: 1 means that we wrote the page
691 * 0 means that page is identical to the one already sent
692 * -1 means that xbzrle would be longer than normal
693 *
5a987738 694 * @rs: current RAM state
3d0684b2
JQ
695 * @current_data: pointer to the address of the page contents
696 * @current_addr: addr of the page
56e93d26
JQ
697 * @block: block that contains the page we want to send
698 * @offset: offset inside the block for the page
56e93d26 699 */
204b88b8 700static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 701 ram_addr_t current_addr, RAMBlock *block,
05931ec5 702 ram_addr_t offset)
56e93d26
JQ
703{
704 int encoded_len = 0, bytes_xbzrle;
705 uint8_t *prev_cached_page;
706
9360447d
JQ
707 if (!cache_is_cached(XBZRLE.cache, current_addr,
708 ram_counters.dirty_sync_count)) {
709 xbzrle_counters.cache_miss++;
05931ec5 710 if (!rs->last_stage) {
56e93d26 711 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 712 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
713 return -1;
714 } else {
715 /* update *current_data when the page has been
716 inserted into cache */
717 *current_data = get_cached_data(XBZRLE.cache, current_addr);
718 }
719 }
720 return -1;
721 }
722
e460a4b1
WW
723 /*
724 * Reaching here means the page has hit the xbzrle cache, no matter what
725 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 726 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
727 *
728 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
729 * 2nd page turns out to be skipped (i.e. no new bytes written to the
730 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
731 * skipped page included. In this way, the encoding rate can tell if the
732 * guest page is good for xbzrle encoding.
733 */
734 xbzrle_counters.pages++;
56e93d26
JQ
735 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
736
737 /* save current buffer into memory */
738 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
739
740 /* XBZRLE encoding (if there is no overflow) */
741 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
742 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
743 TARGET_PAGE_SIZE);
ca353803
WY
744
745 /*
746 * Update the cache contents, so that it corresponds to the data
747 * sent, in all cases except where we skip the page.
748 */
05931ec5 749 if (!rs->last_stage && encoded_len != 0) {
ca353803
WY
750 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
751 /*
752 * In the case where we couldn't compress, ensure that the caller
753 * sends the data from the cache, since the guest might have
754 * changed the RAM since we copied it.
755 */
756 *current_data = prev_cached_page;
757 }
758
56e93d26 759 if (encoded_len == 0) {
55c4446b 760 trace_save_xbzrle_page_skipping();
56e93d26
JQ
761 return 0;
762 } else if (encoded_len == -1) {
55c4446b 763 trace_save_xbzrle_page_overflow();
9360447d 764 xbzrle_counters.overflow++;
e460a4b1 765 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
766 return -1;
767 }
768
56e93d26 769 /* Send XBZRLE based compressed page */
2bf3aa85 770 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
771 offset | RAM_SAVE_FLAG_XBZRLE);
772 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
773 qemu_put_be16(rs->f, encoded_len);
774 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 775 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
776 /*
777 * Like compressed_size (please see update_compress_thread_counts),
778 * the xbzrle encoded bytes don't count the 8 byte header with
779 * RAM_SAVE_FLAG_CONTINUE.
780 */
781 xbzrle_counters.bytes += bytes_xbzrle - 8;
4c2d0f6d 782 ram_transferred_add(bytes_xbzrle);
56e93d26
JQ
783
784 return 1;
785}
786
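/*
 * Accounting illustration for save_xbzrle_page(): a page in an already
 * announced block whose delta encodes to 100 bytes costs 8 (header) +
 * 1 (ENCODING_FLAG_XBZRLE) + 2 (be16 length) + 100 = 111 bytes on the
 * wire, while xbzrle_counters.bytes grows by only 103 because the
 * 8-byte header is excluded, mirroring how compressed_size is counted.
 */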
3d0684b2
JQ
787/**
788 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 789 *
a5f7b1a6 790 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 791 *
6f37bb8b 792 * @rs: current RAM state
3d0684b2 793 * @rb: RAMBlock where to search for dirty pages
a935e30f 794 * @start: page where we start the search
f3f491fc 795 */
56e93d26 796static inline
a935e30f 797unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 798 unsigned long start)
56e93d26 799{
6b6712ef
JQ
800 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
801 unsigned long *bitmap = rb->bmap;
56e93d26 802
fbd162e6 803 if (ramblock_is_ignored(rb)) {
b895de50
CLG
804 return size;
805 }
806
1a373522 807 return find_next_bit(bitmap, size, start);
56e93d26
JQ
808}
809
1230a25f 810static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
3143577d
WW
811 unsigned long page)
812{
813 uint8_t shift;
814 hwaddr size, start;
815
816 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
817 return;
818 }
819
820 shift = rb->clear_bmap_shift;
821 /*
822 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
823 * can make things easier sometimes since then start address
824 * of the small chunk will always be 64 pages aligned so the
825 * bitmap will always be aligned to unsigned long. We should
826 * even be able to remove this restriction but I'm simply
827 * keeping it.
828 */
829 assert(shift >= 6);
830
831 size = 1ULL << (TARGET_PAGE_BITS + shift);
7648297d 832 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
3143577d
WW
833 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
834 memory_region_clear_dirty_bitmap(rb->mr, start, size);
835}
836
837static void
1230a25f 838migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
3143577d
WW
839 unsigned long start,
840 unsigned long npages)
841{
842 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
843 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
844 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
845
846 /*
847 * Clear pages from start to start + npages - 1, so the end boundary is
848 * exclusive.
849 */
850 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
1230a25f 851 migration_clear_memory_region_dirty_bitmap(rb, i);
3143577d
WW
852 }
853}
854
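/*
 * Illustration, assuming 4 KiB target pages and a clear_bmap_shift of
 * 18 (i.e. 1 GiB clear chunks): clearing pages [100, 100 + 300000)
 * gives chunk_start = 0 and chunk_end = 524288, so the region's remote
 * dirty bitmap is cleared in two whole chunks covering the range.
 */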
a6a83cef
RL
855/*
856 * colo_bitmap_find_dirty: find contiguous dirty pages from start
857 *
858 * Returns the page offset within memory region of the start of the contiguous
859 * dirty pages
860 *
861 * @rs: current RAM state
862 * @rb: RAMBlock where to search for dirty pages
863 * @start: page where we start the search
864 * @num: the number of contiguous dirty pages
865 */
866static inline
867unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
868 unsigned long start, unsigned long *num)
869{
870 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
871 unsigned long *bitmap = rb->bmap;
872 unsigned long first, next;
873
874 *num = 0;
875
876 if (ramblock_is_ignored(rb)) {
877 return size;
878 }
879
880 first = find_next_bit(bitmap, size, start);
881 if (first >= size) {
882 return first;
883 }
884 next = find_next_zero_bit(bitmap, size, first + 1);
885 assert(next >= first);
886 *num = next - first;
887 return first;
888}
889
06b10688 890static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
891 RAMBlock *rb,
892 unsigned long page)
a82d593b
DDAG
893{
894 bool ret;
a82d593b 895
002cad6b
PX
896 /*
897 * Clear dirty bitmap if needed. This _must_ be called before we
898 * send any page in the chunk, because we need to make sure
899 * we can capture further page content changes when we sync the
900 * dirty log the next time. So as long as we are going to send any
901 * of the pages in the chunk we clear the remote dirty bitmap for all.
902 * Clearing it earlier won't be a problem, but clearing it too late will.
903 */
1230a25f 904 migration_clear_memory_region_dirty_bitmap(rb, page);
002cad6b 905
6b6712ef 906 ret = test_and_clear_bit(page, rb->bmap);
a82d593b 907 if (ret) {
0d8ec885 908 rs->migration_dirty_pages--;
a82d593b 909 }
386a907b 910
a82d593b
DDAG
911 return ret;
912}
913
be39b4cd
DH
914static void dirty_bitmap_clear_section(MemoryRegionSection *section,
915 void *opaque)
916{
917 const hwaddr offset = section->offset_within_region;
918 const hwaddr size = int128_get64(section->size);
919 const unsigned long start = offset >> TARGET_PAGE_BITS;
920 const unsigned long npages = size >> TARGET_PAGE_BITS;
921 RAMBlock *rb = section->mr->ram_block;
922 uint64_t *cleared_bits = opaque;
923
924 /*
925 * We don't grab ram_state->bitmap_mutex because we expect to run
926 * only when starting migration or during postcopy recovery where
927 * we don't have concurrent access.
928 */
929 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
930 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
931 }
932 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
933 bitmap_clear(rb->bmap, start, npages);
934}
935
936/*
937 * Exclude all dirty pages from migration that fall into a discarded range as
938 * managed by a RamDiscardManager responsible for the mapped memory region of
939 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
940 *
941 * Discarded pages ("logically unplugged") have undefined content and must
942 * not get migrated, because even reading these pages for migration might
943 * result in undesired behavior.
944 *
945 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
946 *
947 * Note: The result is only stable while migrating (precopy/postcopy).
948 */
949static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
950{
951 uint64_t cleared_bits = 0;
952
953 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
954 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
955 MemoryRegionSection section = {
956 .mr = rb->mr,
957 .offset_within_region = 0,
958 .size = int128_make64(qemu_ram_get_used_length(rb)),
959 };
960
961 ram_discard_manager_replay_discarded(rdm, &section,
962 dirty_bitmap_clear_section,
963 &cleared_bits);
964 }
965 return cleared_bits;
966}
967
9470c5e0
DH
968/*
969 * Check if a host-page aligned page falls into a discarded range as managed by
970 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
971 *
972 * Note: The result is only stable while migrating (precopy/postcopy).
973 */
974bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
975{
976 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
977 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
978 MemoryRegionSection section = {
979 .mr = rb->mr,
980 .offset_within_region = start,
981 .size = int128_make64(qemu_ram_pagesize(rb)),
982 };
983
984 return !ram_discard_manager_is_populated(rdm, &section);
985 }
986 return false;
987}
988
267691b6 989/* Called with RCU critical section */
7a3e9571 990static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 991{
fb613580
KZ
992 uint64_t new_dirty_pages =
993 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
994
995 rs->migration_dirty_pages += new_dirty_pages;
996 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
997}
998
3d0684b2
JQ
999/**
1000 * ram_pagesize_summary: calculate all the pagesizes of a VM
1001 *
1002 * Returns a summary bitmap of the page sizes of all RAMBlocks
1003 *
1004 * For VMs with just normal pages this is equivalent to the host page
1005 * size. If it's got some huge pages then it's the OR of all the
1006 * different page sizes.
e8ca1db2
DDAG
1007 */
1008uint64_t ram_pagesize_summary(void)
1009{
1010 RAMBlock *block;
1011 uint64_t summary = 0;
1012
fbd162e6 1013 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1014 summary |= block->page_size;
1015 }
1016
1017 return summary;
1018}
1019
aecbfe9c
XG
1020uint64_t ram_get_total_transferred_pages(void)
1021{
1022 return ram_counters.normal + ram_counters.duplicate +
1023 compression_counters.pages + xbzrle_counters.pages;
1024}
1025
b734035b
XG
1026static void migration_update_rates(RAMState *rs, int64_t end_time)
1027{
be8b02ed 1028 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1029 double compressed_size;
b734035b
XG
1030
1031 /* calculate period counters */
1032 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1033 / (end_time - rs->time_last_bitmap_sync);
1034
be8b02ed 1035 if (!page_count) {
b734035b
XG
1036 return;
1037 }
1038
1039 if (migrate_use_xbzrle()) {
e460a4b1
WW
1040 double encoded_size, unencoded_size;
1041
b734035b 1042 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1043 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 1044 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
1045 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1046 TARGET_PAGE_SIZE;
1047 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 1048 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 1049 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
1050 } else {
1051 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1052 }
1053 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1054 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 1055 }
76e03000
XG
1056
1057 if (migrate_use_compression()) {
1058 compression_counters.busy_rate = (double)(compression_counters.busy -
1059 rs->compress_thread_busy_prev) / page_count;
1060 rs->compress_thread_busy_prev = compression_counters.busy;
1061
1062 compressed_size = compression_counters.compressed_size -
1063 rs->compressed_size_prev;
1064 if (compressed_size) {
1065 double uncompressed_size = (compression_counters.pages -
1066 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1067
1068 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1069 compression_counters.compression_rate =
1070 uncompressed_size / compressed_size;
1071
1072 rs->compress_pages_prev = compression_counters.pages;
1073 rs->compressed_size_prev = compression_counters.compressed_size;
1074 }
1075 }
b734035b
XG
1076}
1077
dc14a470
KZ
1078static void migration_trigger_throttle(RAMState *rs)
1079{
1080 MigrationState *s = migrate_get_current();
1081 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1082
1083 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1084 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1085 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1086
1087 /* During block migration the auto-converge logic incorrectly detects
1088 * that ram migration makes no progress. Avoid this by disabling the
1089 * throttling logic during the bulk phase of block migration. */
1090 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1091 /* The following detection logic can be refined later. For now:
1092 Check to see if the ratio between dirtied bytes and the approx.
1093 amount of bytes that just got transferred since the last time
1094 we were in this routine reaches the threshold. If that happens
1095 twice, start or increase throttling. */
1096
1097 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1098 (++rs->dirty_rate_high_cnt >= 2)) {
1099 trace_migration_throttle();
1100 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
1101 mig_throttle_guest_down(bytes_dirty_period,
1102 bytes_dirty_threshold);
dc14a470
KZ
1103 }
1104 }
1105}
1106
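/*
 * Example of the trigger above: with throttle_trigger_threshold at,
 * say, its usual 50 and 1 GiB transferred during the last period,
 * bytes_dirty_threshold is 512 MiB; throttling only starts (or is
 * tightened) once more than that was dirtied in two such periods.
 */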
8d820d6f 1107static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1108{
1109 RAMBlock *block;
56e93d26 1110 int64_t end_time;
56e93d26 1111
9360447d 1112 ram_counters.dirty_sync_count++;
56e93d26 1113
f664da80
JQ
1114 if (!rs->time_last_bitmap_sync) {
1115 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1116 }
1117
1118 trace_migration_bitmap_sync_start();
9c1f8f44 1119 memory_global_dirty_log_sync();
56e93d26 1120
108cfae0 1121 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1122 WITH_RCU_READ_LOCK_GUARD() {
1123 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1124 ramblock_sync_dirty_bitmap(rs, block);
1125 }
1126 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1127 }
108cfae0 1128 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1129
9458a9a1 1130 memory_global_after_dirty_log_sync();
a66cd90c 1131 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1132
56e93d26
JQ
1133 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1134
1135 /* more than 1 second = 1000 milliseconds */
f664da80 1136 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1137 migration_trigger_throttle(rs);
070afca2 1138
b734035b
XG
1139 migration_update_rates(rs, end_time);
1140
be8b02ed 1141 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1142
1143 /* reset period counters */
f664da80 1144 rs->time_last_bitmap_sync = end_time;
a66cd90c 1145 rs->num_dirty_pages_period = 0;
dc14a470 1146 rs->bytes_xfer_prev = ram_counters.transferred;
56e93d26 1147 }
4addcd4f 1148 if (migrate_use_events()) {
3ab72385 1149 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1150 }
56e93d26
JQ
1151}
1152
bd227060
WW
1153static void migration_bitmap_sync_precopy(RAMState *rs)
1154{
1155 Error *local_err = NULL;
1156
1157 /*
1158 * The current notifier usage is just an optimization for migration, so we
1159 * don't stop the normal migration process in the error case.
1160 */
1161 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1162 error_report_err(local_err);
b4a1733c 1163 local_err = NULL;
bd227060
WW
1164 }
1165
1166 migration_bitmap_sync(rs);
1167
1168 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1169 error_report_err(local_err);
1170 }
1171}
1172
47fe16ff
JQ
1173static void ram_release_page(const char *rbname, uint64_t offset)
1174{
1175 if (!migrate_release_ram() || !migration_in_postcopy()) {
1176 return;
1177 }
1178
1179 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1180}
1181
6c97ec5f
XG
1182/**
1183 * save_zero_page_to_file: send the zero page to the file
1184 *
1185 * Returns the size of data written to the file, 0 means the page is not
1186 * a zero page
1187 *
1188 * @rs: current RAM state
1189 * @file: the file where the data is saved
1190 * @block: block that contains the page we want to send
1191 * @offset: offset inside the block for the page
1192 */
1193static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1194 RAMBlock *block, ram_addr_t offset)
1195{
1196 uint8_t *p = block->host + offset;
1197 int len = 0;
1198
bad452a7 1199 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
6c97ec5f
XG
1200 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1201 qemu_put_byte(file, 0);
1202 len += 1;
47fe16ff 1203 ram_release_page(block->idstr, offset);
6c97ec5f
XG
1204 }
1205 return len;
1206}
1207
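/*
 * Illustration: with the block already announced, a zero page costs
 * 9 bytes on the wire (the 8-byte header plus one zero byte); the
 * first page of a block additionally carries the idstr, so len becomes
 * 10 + strlen(idstr).
 */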
56e93d26 1208/**
3d0684b2 1209 * save_zero_page: send the zero page to the stream
56e93d26 1210 *
3d0684b2 1211 * Returns the number of pages written.
56e93d26 1212 *
f7ccd61b 1213 * @rs: current RAM state
56e93d26
JQ
1214 * @block: block that contains the page we want to send
1215 * @offset: offset inside the block for the page
56e93d26 1216 */
7faccdc3 1217static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1218{
6c97ec5f 1219 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1220
6c97ec5f 1221 if (len) {
9360447d 1222 ram_counters.duplicate++;
4c2d0f6d 1223 ram_transferred_add(len);
6c97ec5f 1224 return 1;
56e93d26 1225 }
6c97ec5f 1226 return -1;
56e93d26
JQ
1227}
1228
059ff0fb
XG
1229/*
1230 * @pages: the number of pages written by the control path,
1231 * < 0 - error
1232 * > 0 - number of pages written
1233 *
1234 * Return true if the page has been saved, otherwise false is returned.
1235 */
1236static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1237 int *pages)
1238{
1239 uint64_t bytes_xmit = 0;
1240 int ret;
1241
1242 *pages = -1;
1243 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1244 &bytes_xmit);
1245 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1246 return false;
1247 }
1248
1249 if (bytes_xmit) {
4c2d0f6d 1250 ram_transferred_add(bytes_xmit);
059ff0fb
XG
1251 *pages = 1;
1252 }
1253
1254 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1255 return true;
1256 }
1257
1258 if (bytes_xmit > 0) {
1259 ram_counters.normal++;
1260 } else if (bytes_xmit == 0) {
1261 ram_counters.duplicate++;
1262 }
1263
1264 return true;
1265}
1266
65dacaa0
XG
1267/*
1268 * directly send the page to the stream
1269 *
1270 * Returns the number of pages written.
1271 *
1272 * @rs: current RAM state
1273 * @block: block that contains the page we want to send
1274 * @offset: offset inside the block for the page
1275 * @buf: the page to be sent
1277 * @async: send the page asynchronously
1277 */
1278static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1279 uint8_t *buf, bool async)
1280{
4c2d0f6d
DE
1281 ram_transferred_add(save_page_header(rs, rs->f, block,
1282 offset | RAM_SAVE_FLAG_PAGE));
65dacaa0
XG
1283 if (async) {
1284 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1285 migrate_release_ram() &
1286 migration_in_postcopy());
1287 } else {
1288 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1289 }
4c2d0f6d 1290 ram_transferred_add(TARGET_PAGE_SIZE);
65dacaa0
XG
1291 ram_counters.normal++;
1292 return 1;
1293}
1294
56e93d26 1295/**
3d0684b2 1296 * ram_save_page: send the given page to the stream
56e93d26 1297 *
3d0684b2 1298 * Returns the number of pages written.
3fd3c4b3
DDAG
1299 * < 0 - error
1300 * >=0 - Number of pages written - this might legally be 0
1301 * if xbzrle noticed the page was the same.
56e93d26 1302 *
6f37bb8b 1303 * @rs: current RAM state
56e93d26
JQ
1304 * @block: block that contains the page we want to send
1305 * @offset: offset inside the block for the page
56e93d26 1306 */
05931ec5 1307static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
56e93d26
JQ
1308{
1309 int pages = -1;
56e93d26 1310 uint8_t *p;
56e93d26 1311 bool send_async = true;
a08f6890 1312 RAMBlock *block = pss->block;
8bba004c 1313 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1314 ram_addr_t current_addr = block->offset + offset;
56e93d26 1315
2f68e399 1316 p = block->host + offset;
1db9d8e5 1317 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1318
56e93d26 1319 XBZRLE_cache_lock();
1a373522 1320 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
059ff0fb 1321 pages = save_xbzrle_page(rs, &p, current_addr, block,
05931ec5
JQ
1322 offset);
1323 if (!rs->last_stage) {
059ff0fb
XG
1324 /* Can't send this cached data async, since the cache page
1325 * might get updated before it gets to the wire
56e93d26 1326 */
059ff0fb 1327 send_async = false;
56e93d26
JQ
1328 }
1329 }
1330
1331 /* XBZRLE overflow or normal page */
1332 if (pages == -1) {
65dacaa0 1333 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1334 }
1335
1336 XBZRLE_cache_unlock();
1337
1338 return pages;
1339}
1340
b9ee2f7d
JQ
1341static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1342 ram_addr_t offset)
1343{
67a4c891 1344 if (multifd_queue_page(rs->f, block, offset) < 0) {
713f762a
IR
1345 return -1;
1346 }
b9ee2f7d
JQ
1347 ram_counters.normal++;
1348
1349 return 1;
1350}
1351
5e5fdcff 1352static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1353 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1354{
53518d94 1355 RAMState *rs = ram_state;
20d549cb 1356 uint8_t *p = block->host + offset;
6ef3771c 1357 int ret;
56e93d26 1358
5e5fdcff 1359 if (save_zero_page_to_file(rs, f, block, offset)) {
e7f2e190 1360 return true;
5e5fdcff
XG
1361 }
1362
6ef3771c 1363 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1364
1365 /*
1366 * copy it to an internal buffer to avoid it being modified by the VM,
1367 * so that we can catch errors during compression and
1368 * decompression
1369 */
1370 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1371 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1372 if (ret < 0) {
1373 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1374 error_report("compressed data failed!");
b3be2896 1375 }
e7f2e190 1376 return false;
5e5fdcff
XG
1377}
1378
1379static void
1380update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1381{
4c2d0f6d 1382 ram_transferred_add(bytes_xmit);
76e03000 1383
5e5fdcff
XG
1384 if (param->zero_page) {
1385 ram_counters.duplicate++;
76e03000 1386 return;
5e5fdcff 1387 }
76e03000
XG
1388
1389 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1390 compression_counters.compressed_size += bytes_xmit - 8;
1391 compression_counters.pages++;
56e93d26
JQ
1392}
1393
32b05495
XG
1394static bool save_page_use_compression(RAMState *rs);
1395
ce25d337 1396static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1397{
1398 int idx, len, thread_count;
1399
32b05495 1400 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1401 return;
1402 }
1403 thread_count = migrate_compress_threads();
a7a9a88f 1404
0d9f9a5c 1405 qemu_mutex_lock(&comp_done_lock);
56e93d26 1406 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1407 while (!comp_param[idx].done) {
0d9f9a5c 1408 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1409 }
a7a9a88f 1410 }
0d9f9a5c 1411 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1412
1413 for (idx = 0; idx < thread_count; idx++) {
1414 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1415 if (!comp_param[idx].quit) {
ce25d337 1416 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
1417 /*
1418 * it's safe to fetch zero_page without holding comp_done_lock
1419 * as there is no further request submitted to the thread,
1420 * i.e., the thread should be waiting for a request at this point.
1421 */
1422 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1423 }
a7a9a88f 1424 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1425 }
1426}
1427
1428static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1429 ram_addr_t offset)
1430{
1431 param->block = block;
1432 param->offset = offset;
1433}
1434
ce25d337
JQ
1435static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1436 ram_addr_t offset)
56e93d26
JQ
1437{
1438 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1439 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
1440
1441 thread_count = migrate_compress_threads();
0d9f9a5c 1442 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1443retry:
1444 for (idx = 0; idx < thread_count; idx++) {
1445 if (comp_param[idx].done) {
1446 comp_param[idx].done = false;
1447 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1448 qemu_mutex_lock(&comp_param[idx].mutex);
1449 set_compress_params(&comp_param[idx], block, offset);
1450 qemu_cond_signal(&comp_param[idx].cond);
1451 qemu_mutex_unlock(&comp_param[idx].mutex);
1452 pages = 1;
5e5fdcff 1453 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1454 break;
56e93d26
JQ
1455 }
1456 }
1d58872a
XG
1457
1458 /*
1459 * wait for the free thread if the user specifies 'compress-wait-thread',
1460 * otherwise we will post the page out in the main thread as normal page.
1461 */
1462 if (pages < 0 && wait) {
1463 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1464 goto retry;
1465 }
0d9f9a5c 1466 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1467
1468 return pages;
1469}
1470
3d0684b2
JQ
1471/**
1472 * find_dirty_block: find the next dirty page and update any state
1473 * associated with the search process.
b9e60928 1474 *
a5f7b1a6 1475 * Returns true if a page is found
b9e60928 1476 *
6f37bb8b 1477 * @rs: current RAM state
3d0684b2
JQ
1478 * @pss: data about the state of the current dirty page scan
1479 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1480 */
f20e2865 1481static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 1482{
f20e2865 1483 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 1484 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1485 pss->page >= rs->last_page) {
b9e60928
DDAG
1486 /*
1487 * We've been once around the RAM and haven't found anything.
1488 * Give up.
1489 */
1490 *again = false;
1491 return false;
1492 }
542147f4
DH
1493 if (!offset_in_ramblock(pss->block,
1494 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1495 /* Didn't find anything in this RAM Block */
a935e30f 1496 pss->page = 0;
b9e60928
DDAG
1497 pss->block = QLIST_NEXT_RCU(pss->block, next);
1498 if (!pss->block) {
48df9d80
XG
1499 /*
1500 * If memory migration starts over, we will meet a dirtied page
1501 * which may still exist in the compression threads' ring, so we
1502 * should flush the compressed data to make sure the new page
1503 * is not overwritten by the old one in the destination.
1504 *
1505 * Also, if xbzrle is on, stop using the data compression at this
1506 * point. In theory, xbzrle can do better than compression.
1507 */
1508 flush_compressed_data(rs);
1509
b9e60928
DDAG
1510 /* Hit the end of the list */
1511 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1512 /* Flag that we've looped */
1513 pss->complete_round = true;
1a373522
DH
1514 /* After the first round, enable XBZRLE. */
1515 if (migrate_use_xbzrle()) {
1516 rs->xbzrle_enabled = true;
1517 }
b9e60928
DDAG
1518 }
1519 /* Didn't find anything this time, but try again on the new block */
1520 *again = true;
1521 return false;
1522 } else {
1523 /* Can go around again, but... */
1524 *again = true;
1525 /* We've found something so probably don't need to */
1526 return true;
1527 }
1528}
1529
3d0684b2
JQ
1530/**
1531 * unqueue_page: gets a page off the queue
1532 *
a82d593b 1533 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1534 *
3d0684b2
JQ
1535 * Returns the block of the page (or NULL if none available)
1536 *
ec481c6c 1537 * @rs: current RAM state
3d0684b2 1538 * @offset: used to return the offset within the RAMBlock
a82d593b 1539 */
f20e2865 1540static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
1541{
1542 RAMBlock *block = NULL;
1543
ae526e32
XG
1544 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1545 return NULL;
1546 }
1547
6e8a355d 1548 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
ec481c6c
JQ
1549 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1550 struct RAMSrcPageRequest *entry =
1551 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1552 block = entry->rb;
1553 *offset = entry->offset;
a82d593b
DDAG
1554
1555 if (entry->len > TARGET_PAGE_SIZE) {
1556 entry->len -= TARGET_PAGE_SIZE;
1557 entry->offset += TARGET_PAGE_SIZE;
1558 } else {
1559 memory_region_unref(block->mr);
ec481c6c 1560 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 1561 g_free(entry);
e03a34f8 1562 migration_consume_urgent_request();
a82d593b
DDAG
1563 }
1564 }
a82d593b
DDAG
1565
1566 return block;
1567}
1568
278e2f55
AG
1569#if defined(__linux__)
1570/**
1571 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1572 * is found, return RAM block pointer and page offset
1573 *
1574 * Returns pointer to the RAMBlock containing faulting page,
1575 * NULL if no write faults are pending
1576 *
1577 * @rs: current RAM state
1578 * @offset: page offset from the beginning of the block
1579 */
1580static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1581{
1582 struct uffd_msg uffd_msg;
1583 void *page_address;
82ea3e3b 1584 RAMBlock *block;
278e2f55
AG
1585 int res;
1586
1587 if (!migrate_background_snapshot()) {
1588 return NULL;
1589 }
1590
1591 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1592 if (res <= 0) {
1593 return NULL;
1594 }
1595
1596 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1597 block = qemu_ram_block_from_host(page_address, false, offset);
1598 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1599 return block;
278e2f55
AG
1600}
1601
1602/**
1603 * ram_save_release_protection: release UFFD write protection after
1604 * a range of pages has been saved
1605 *
1606 * @rs: current RAM state
1607 * @pss: page-search-status structure
1608 * @start_page: index of the first page in the range relative to pss->block
1609 *
1610 * Returns 0 on success, negative value in case of an error
1611*/
1612static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1613 unsigned long start_page)
1614{
1615 int res = 0;
1616
1617 /* Check if page is from UFFD-managed region. */
1618 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1619 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1620 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1621
1622 /* Flush async buffers before un-protect. */
1623 qemu_fflush(rs->f);
1624 /* Un-protect memory range. */
1625 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1626 false, false);
1627 }
1628
1629 return res;
1630}
1631
1632/* ram_write_tracking_available: check if kernel supports required UFFD features
1633 *
1634 * Returns true if supported, false otherwise
1635 */
1636bool ram_write_tracking_available(void)
1637{
1638 uint64_t uffd_features;
1639 int res;
1640
1641 res = uffd_query_features(&uffd_features);
1642 return (res == 0 &&
1643 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1644}
1645
1646/* ram_write_tracking_compatible: check if guest configuration is
1647 * compatible with 'write-tracking'
1648 *
1649 * Returns true if compatible, false otherwise
1650 */
1651bool ram_write_tracking_compatible(void)
1652{
1653 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1654 int uffd_fd;
82ea3e3b 1655 RAMBlock *block;
278e2f55
AG
1656 bool ret = false;
1657
1658 /* Open UFFD file descriptor */
1659 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1660 if (uffd_fd < 0) {
1661 return false;
1662 }
1663
1664 RCU_READ_LOCK_GUARD();
1665
82ea3e3b 1666 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1667 uint64_t uffd_ioctls;
1668
1669 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1670 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1671 continue;
1672 }
1673 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1674 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1675 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1676 goto out;
1677 }
1678 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1679 goto out;
1680 }
1681 }
1682 ret = true;
1683
1684out:
1685 uffd_close_fd(uffd_fd);
1686 return ret;
1687}
1688
f7b9dcfb
DH
1689static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1690 ram_addr_t size)
1691{
1692 /*
1693 * We read one byte of each page; this will preallocate page tables if
1694 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1695 * where no page was populated yet. This might require adaptation when
1696 * supporting other mappings, like shmem.
1697 */
1698 for (; offset < size; offset += block->page_size) {
1699 char tmp = *((char *)block->host + offset);
1700
1701 /* Don't optimize the read out */
1702 asm volatile("" : "+r" (tmp));
1703 }
1704}
1705
6fee3a1f
DH
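/*
 * populate_read_section: RamDiscardManager replay callback; populates the
 * populated (non-discarded) part of a RAMBlock described by @section.
 */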
1706static inline int populate_read_section(MemoryRegionSection *section,
1707 void *opaque)
1708{
1709 const hwaddr size = int128_get64(section->size);
1710 hwaddr offset = section->offset_within_region;
1711 RAMBlock *block = section->mr->ram_block;
1712
1713 populate_read_range(block, offset, size);
1714 return 0;
1715}
1716
eeccb99c 1717/*
f7b9dcfb
DH
1718 * ram_block_populate_read: preallocate page tables and populate pages in the
1719 * RAM block by reading a byte of each page.
eeccb99c
AG
1720 *
1721 * Since it's solely used for the userfault_fd WP feature, here we just
1722 * hardcode the page size to qemu_real_host_page_size.
1723 *
82ea3e3b 1724 * @rb: RAM block to populate
eeccb99c 1725 */
6fee3a1f 1726static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1727{
6fee3a1f
DH
1728 /*
1729 * Skip populating all pages that fall into a discarded range as managed by
1730 * a RamDiscardManager responsible for the mapped memory region of the
1731 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1732 * must not get populated automatically. We don't have to track
1733 * modifications via userfaultfd WP reliably, because these pages will
1734 * not be part of the migration stream either way -- see
1735 * ramblock_dirty_bitmap_exclude_discarded_pages().
1736 *
1737 * Note: The result is only stable while migrating (precopy/postcopy).
1738 */
1739 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1740 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1741 MemoryRegionSection section = {
1742 .mr = rb->mr,
1743 .offset_within_region = 0,
1744 .size = rb->mr->size,
1745 };
1746
1747 ram_discard_manager_replay_populated(rdm, &section,
1748 populate_read_section, NULL);
1749 } else {
1750 populate_read_range(rb, 0, rb->used_length);
1751 }
eeccb99c
AG
1752}
1753
1754/*
1755 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1756 */
1757void ram_write_tracking_prepare(void)
1758{
82ea3e3b 1759 RAMBlock *block;
eeccb99c
AG
1760
1761 RCU_READ_LOCK_GUARD();
1762
82ea3e3b 1763 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1764 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1765 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1766 continue;
1767 }
1768
1769 /*
1770 * Populate pages of the RAM block before enabling userfault_fd
1771 * write protection.
1772 *
1773 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1774 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1775 * pages with pte_none() entries in page table.
1776 */
f7b9dcfb 1777 ram_block_populate_read(block);
eeccb99c
AG
1778 }
1779}
1780
278e2f55
AG
1781/*
1782 * ram_write_tracking_start: start UFFD-WP memory tracking
1783 *
1784 * Returns 0 for success or negative value in case of error
1785 */
1786int ram_write_tracking_start(void)
1787{
1788 int uffd_fd;
1789 RAMState *rs = ram_state;
82ea3e3b 1790 RAMBlock *block;
278e2f55
AG
1791
1792 /* Open UFFD file descriptor */
1793 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1794 if (uffd_fd < 0) {
1795 return uffd_fd;
1796 }
1797 rs->uffdio_fd = uffd_fd;
1798
1799 RCU_READ_LOCK_GUARD();
1800
82ea3e3b 1801 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1802 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1803 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1804 continue;
1805 }
1806
1807 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1808 if (uffd_register_memory(rs->uffdio_fd, block->host,
1809 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1810 goto fail;
1811 }
1812 /* Apply UFFD write protection to the block memory range */
82ea3e3b
AG
1813 if (uffd_change_protection(rs->uffdio_fd, block->host,
1814 block->max_length, true, false)) {
278e2f55
AG
1815 goto fail;
1816 }
82ea3e3b
AG
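 /* Mark the block as write-protected and pin its MemoryRegion until tracking stops. */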
1817 block->flags |= RAM_UF_WRITEPROTECT;
1818 memory_region_ref(block->mr);
278e2f55 1819
82ea3e3b
AG
1820 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1821 block->host, block->max_length);
278e2f55
AG
1822 }
1823
1824 return 0;
1825
1826fail:
1827 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1828
82ea3e3b
AG
1829 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1830 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1831 continue;
1832 }
1833 /*
1834 * In case some memory block failed to be write-protected,
1835 * remove protection from and unregister all RAM blocks that succeeded
1836 */
82ea3e3b
AG
1837 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1838 false, false);
1839 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1840 /* Cleanup flags and remove reference */
82ea3e3b
AG
1841 block->flags &= ~RAM_UF_WRITEPROTECT;
1842 memory_region_unref(block->mr);
278e2f55
AG
1843 }
1844
1845 uffd_close_fd(uffd_fd);
1846 rs->uffdio_fd = -1;
1847 return -1;
1848}
1849
1850/**
1851 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1852 */
1853void ram_write_tracking_stop(void)
1854{
1855 RAMState *rs = ram_state;
82ea3e3b 1856 RAMBlock *block;
278e2f55
AG
1857
1858 RCU_READ_LOCK_GUARD();
1859
82ea3e3b
AG
1860 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1861 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1862 continue;
1863 }
1864 /* Remove protection and unregister all affected RAM blocks */
82ea3e3b
AG
1865 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1866 false, false);
1867 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1868
82ea3e3b
AG
1869 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1870 block->host, block->max_length);
278e2f55
AG
1871
1872 /* Cleanup flags and remove reference */
82ea3e3b
AG
1873 block->flags &= ~RAM_UF_WRITEPROTECT;
1874 memory_region_unref(block->mr);
278e2f55
AG
1875 }
1876
1877 /* Finally close UFFD file descriptor */
1878 uffd_close_fd(rs->uffdio_fd);
1879 rs->uffdio_fd = -1;
1880}
1881
1882#else
1883/* No target OS support, stubs just fail or ignore */
1884
1885static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1886{
1887 (void) rs;
1888 (void) offset;
1889
1890 return NULL;
1891}
1892
1893static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1894 unsigned long start_page)
1895{
1896 (void) rs;
1897 (void) pss;
1898 (void) start_page;
1899
1900 return 0;
1901}
1902
1903bool ram_write_tracking_available(void)
1904{
1905 return false;
1906}
1907
1908bool ram_write_tracking_compatible(void)
1909{
1910 assert(0);
1911 return false;
1912}
1913
1914int ram_write_tracking_start(void)
1915{
1916 assert(0);
1917 return -1;
1918}
1919
1920void ram_write_tracking_stop(void)
1921{
1922 assert(0);
1923}
1924#endif /* defined(__linux__) */
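/*
 * Rough call order of the write-tracking API above, as used by the
 * background snapshot path (illustrative sketch, not a definitive contract):
 *
 *   if (ram_write_tracking_available() && ram_write_tracking_compatible()) {
 *       ram_write_tracking_prepare();   // populate pages before protecting
 *       ram_write_tracking_start();     // register + write-protect RAM blocks
 *       ...save RAM, releasing protection on ranges as they are written out...
 *       ram_write_tracking_stop();      // un-protect, unregister, close uffd
 *   }
 */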
1925
3d0684b2 1926/**
ff1543af 1927 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
1928 *
1929 * Skips pages that are already sent (!dirty)
a82d593b 1930 *
a5f7b1a6 1931 * Returns true if a queued page is found
a82d593b 1932 *
6f37bb8b 1933 * @rs: current RAM state
3d0684b2 1934 * @pss: data about the state of the current dirty page scan
a82d593b 1935 */
f20e2865 1936static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1937{
1938 RAMBlock *block;
1939 ram_addr_t offset;
1940 bool dirty;
1941
1942 do {
f20e2865 1943 block = unqueue_page(rs, &offset);
a82d593b
DDAG
1944 /*
1945 * We're sending this page, and since it's postcopy nothing else
1946 * will dirty it, and we must make sure it doesn't get sent again
1947 * even if this queue request was received after the background
1948 * search already sent it.
1949 */
1950 if (block) {
f20e2865
JQ
1951 unsigned long page;
1952
6b6712ef
JQ
1953 page = offset >> TARGET_PAGE_BITS;
1954 dirty = test_bit(page, block->bmap);
a82d593b 1955 if (!dirty) {
06b10688 1956 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
64737606 1957 page);
a82d593b 1958 } else {
f20e2865 1959 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
1960 }
1961 }
1962
1963 } while (block && !dirty);
1964
278e2f55
AG
1965 if (!block) {
1966 /*
1967 * Poll write faults too if background snapshot is enabled; that's
1968 * when vCPUs may be blocked on write-protected pages.
1969 */
1970 block = poll_fault_page(rs, &offset);
1971 }
1972
a82d593b 1973 if (block) {
a82d593b
DDAG
1974 /*
1975 * We want the background search to continue from the queued page
1976 * since the guest is likely to want other pages near to the page
1977 * it just requested.
1978 */
1979 pss->block = block;
a935e30f 1980 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
1981
1982 /*
1983 * This unqueued page would break the "one round" check, even if
1984 * it's really rare.
1985 */
1986 pss->complete_round = false;
a82d593b
DDAG
1987 }
1988
1989 return !!block;
1990}
1991
6c595cde 1992/**
5e58f968
JQ
1993 * migration_page_queue_free: drop any remaining pages in the ram
1994 * request queue
6c595cde 1995 *
3d0684b2
JQ
1996 * It should be empty at the end anyway, but in error cases there may
1997 * be some left; if any pages remain, we drop them.
1998 *
6c595cde 1999 */
83c13382 2000static void migration_page_queue_free(RAMState *rs)
6c595cde 2001{
ec481c6c 2002 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2003 /* This queue should generally be empty - but in the case of a failed
2004 * migration it might contain some leftovers.
2005 */
89ac5a1d 2006 RCU_READ_LOCK_GUARD();
ec481c6c 2007 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2008 memory_region_unref(mspr->rb->mr);
ec481c6c 2009 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2010 g_free(mspr);
2011 }
6c595cde
DDAG
2012}
2013
2014/**
3d0684b2
JQ
2015 * ram_save_queue_pages: queue the page for transmission
2016 *
2017 * A request from postcopy destination for example.
2018 *
2019 * Returns zero on success or negative on error
2020 *
3d0684b2
JQ
2021 * @rbname: Name of the RAMBlock of the request. NULL means the
2022 * same as the last one.
2023 * @start: starting address from the start of the RAMBlock
2024 * @len: length (in bytes) to send
6c595cde 2025 */
96506894 2026int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2027{
2028 RAMBlock *ramblock;
53518d94 2029 RAMState *rs = ram_state;
6c595cde 2030
9360447d 2031 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2032 RCU_READ_LOCK_GUARD();
2033
6c595cde
DDAG
2034 if (!rbname) {
2035 /* Reuse last RAMBlock */
68a098f3 2036 ramblock = rs->last_req_rb;
6c595cde
DDAG
2037
2038 if (!ramblock) {
2039 /*
2040 * Shouldn't happen, we can't reuse the last RAMBlock if
2041 * it's the 1st request.
2042 */
2043 error_report("ram_save_queue_pages no previous block");
03acb4e9 2044 return -1;
6c595cde
DDAG
2045 }
2046 } else {
2047 ramblock = qemu_ram_block_by_name(rbname);
2048
2049 if (!ramblock) {
2050 /* We shouldn't be asked for a non-existent RAMBlock */
2051 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2052 return -1;
6c595cde 2053 }
68a098f3 2054 rs->last_req_rb = ramblock;
6c595cde
DDAG
2055 }
2056 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2057 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2058 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2059 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2060 __func__, start, len, ramblock->used_length);
03acb4e9 2061 return -1;
6c595cde
DDAG
2062 }
2063
ec481c6c
JQ
2064 struct RAMSrcPageRequest *new_entry =
2065 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2066 new_entry->rb = ramblock;
2067 new_entry->offset = start;
2068 new_entry->len = len;
2069
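 /* Pin the MemoryRegion while the request sits on the queue; dropped when it is consumed or the queue is freed. */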
2070 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2071 qemu_mutex_lock(&rs->src_page_req_mutex);
2072 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2073 migration_make_urgent_request();
ec481c6c 2074 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2075
2076 return 0;
6c595cde
DDAG
2077}
2078
d7400a34
XG
2079static bool save_page_use_compression(RAMState *rs)
2080{
2081 if (!migrate_use_compression()) {
2082 return false;
2083 }
2084
2085 /*
1a373522
DH
2086 * If xbzrle is enabled (e.g., after first round of migration), stop
2087 * using the data compression. In theory, xbzrle can do better than
2088 * compression.
d7400a34 2089 */
1a373522
DH
2090 if (rs->xbzrle_enabled) {
2091 return false;
d7400a34
XG
2092 }
2093
1a373522 2094 return true;
d7400a34
XG
2095}
2096
5e5fdcff
XG
2097/*
2098 * try to compress the page before posting it out, return true if the page
2099 * has been properly handled by compression, otherwise needs other
2100 * paths to handle it
2101 */
2102static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2103{
2104 if (!save_page_use_compression(rs)) {
2105 return false;
2106 }
2107
2108 /*
2109 * When starting the process of a new block, the first page of
2110 * the block should be sent out before other pages in the same
2111 * block, and all the pages in the last block should have been sent
2112 * out. Keeping this order is important, because the 'cont' flag
2113 * is used to avoid resending the block name.
2114 *
2115 * We post the first page as a normal page as compression will take
2116 * a lot of CPU resources.
2117 */
2118 if (block != rs->last_sent_block) {
2119 flush_compressed_data(rs);
2120 return false;
2121 }
2122
2123 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2124 return true;
2125 }
2126
76e03000 2127 compression_counters.busy++;
5e5fdcff
XG
2128 return false;
2129}
2130
a82d593b 2131/**
3d0684b2 2132 * ram_save_target_page: save one target page
a82d593b 2133 *
3d0684b2 2134 * Returns the number of pages written
a82d593b 2135 *
6f37bb8b 2136 * @rs: current RAM state
3d0684b2 2137 * @pss: data about the page we want to send
a82d593b 2138 */
05931ec5 2139static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2140{
a8ec91f9 2141 RAMBlock *block = pss->block;
8bba004c 2142 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2143 int res;
2144
2145 if (control_save_page(rs, block, offset, &res)) {
2146 return res;
2147 }
2148
5e5fdcff
XG
2149 if (save_compress_page(rs, block, offset)) {
2150 return 1;
d7400a34
XG
2151 }
2152
2153 res = save_zero_page(rs, block, offset);
2154 if (res > 0) {
2155 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2156 * page would be stale
2157 */
2158 if (!save_page_use_compression(rs)) {
2159 XBZRLE_cache_lock();
2160 xbzrle_cache_zero_page(rs, block->offset + offset);
2161 XBZRLE_cache_unlock();
2162 }
d7400a34
XG
2163 return res;
2164 }
2165
da3f56cb 2166 /*
c6b3a2e0
WY
2167 * Do not use multifd for:
2168 * 1. Compression, as the first page in a new block should be posted out
2169 * before sending the compressed page
2170 * 2. Postcopy, as one whole host page should be placed in one chunk
da3f56cb 2171 */
c6b3a2e0
WY
2172 if (!save_page_use_compression(rs) && migrate_use_multifd()
2173 && !migration_in_postcopy()) {
b9ee2f7d 2174 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2175 }
2176
05931ec5 2177 return ram_save_page(rs, pss);
a82d593b
DDAG
2178}
2179
2180/**
3d0684b2 2181 * ram_save_host_page: save a whole host page
a82d593b 2182 *
3d0684b2
JQ
2183 * Starting at *offset send pages up to the end of the current host
2184 * page. It's valid for the initial offset to point into the middle of
2185 * a host page in which case the remainder of the hostpage is sent.
2186 * Only dirty target pages are sent. Note that the host page size may
2187 * be a huge page for this block.
1eb3fc0a
DDAG
2188 * The saving stops at the boundary of the used_length of the block
2189 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2190 *
3d0684b2
JQ
2191 * Returns the number of pages written or negative on error
2192 *
6f37bb8b 2193 * @rs: current RAM state
3d0684b2 2194 * @pss: data about the page we want to send
a82d593b 2195 */
05931ec5 2196static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2197{
2198 int tmppages, pages = 0;
a935e30f
JQ
2199 size_t pagesize_bits =
2200 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
ba1b7c81
KJ
2201 unsigned long hostpage_boundary =
2202 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
278e2f55
AG
2203 unsigned long start_page = pss->page;
2204 int res;
4c011c37 2205
fbd162e6 2206 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2207 error_report("block %s should not be migrated !", pss->block->idstr);
2208 return 0;
2209 }
2210
a82d593b 2211 do {
1faa5665 2212 /* Check if the page is dirty and, if it is, send it */
ba1b7c81 2213 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
05931ec5 2214 tmppages = ram_save_target_page(rs, pss);
ba1b7c81
KJ
2215 if (tmppages < 0) {
2216 return tmppages;
2217 }
a82d593b 2218
ba1b7c81
KJ
2219 pages += tmppages;
2220 /*
2221 * Allow rate limiting to happen in the middle of huge pages if
2222 * something is sent in the current iteration.
2223 */
2224 if (pagesize_bits > 1 && tmppages > 0) {
2225 migration_rate_limit();
2226 }
23feba90 2227 }
ba1b7c81
KJ
2228 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2229 } while ((pss->page < hostpage_boundary) &&
8bba004c
AR
2230 offset_in_ramblock(pss->block,
2231 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
ba1b7c81 2232 /* The offset we leave with is the min boundary of host page and block */
258f5c98 2233 pss->page = MIN(pss->page, hostpage_boundary);
278e2f55
AG
2234
2235 res = ram_save_release_protection(rs, pss, start_page);
2236 return (res < 0 ? res : pages);
a82d593b 2237}
6c595cde 2238
56e93d26 2239/**
3d0684b2 2240 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2241 *
2242 * Called within an RCU critical section.
2243 *
e8f3735f
XG
2244 * Returns the number of pages written where zero means no dirty pages,
2245 * or negative on error
56e93d26 2246 *
6f37bb8b 2247 * @rs: current RAM state
a82d593b
DDAG
2248 *
2249 * On systems where host-page-size > target-page-size it will send all the
2250 * pages in a host page that are dirty.
56e93d26 2251 */
05931ec5 2252static int ram_find_and_save_block(RAMState *rs)
56e93d26 2253{
b8fb8cb7 2254 PageSearchStatus pss;
56e93d26 2255 int pages = 0;
b9e60928 2256 bool again, found;
56e93d26 2257
0827b9e9
AA
2258 /* No dirty page as there is zero RAM */
2259 if (!ram_bytes_total()) {
2260 return pages;
2261 }
2262
6f37bb8b 2263 pss.block = rs->last_seen_block;
a935e30f 2264 pss.page = rs->last_page;
b8fb8cb7
DDAG
2265 pss.complete_round = false;
2266
2267 if (!pss.block) {
2268 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2269 }
56e93d26 2270
b9e60928 2271 do {
a82d593b 2272 again = true;
f20e2865 2273 found = get_queued_page(rs, &pss);
b9e60928 2274
a82d593b
DDAG
2275 if (!found) {
2276 /* priority queue empty, so just search for something dirty */
f20e2865 2277 found = find_dirty_block(rs, &pss, &again);
a82d593b 2278 }
f3f491fc 2279
a82d593b 2280 if (found) {
05931ec5 2281 pages = ram_save_host_page(rs, &pss);
56e93d26 2282 }
b9e60928 2283 } while (!pages && again);
56e93d26 2284
6f37bb8b 2285 rs->last_seen_block = pss.block;
a935e30f 2286 rs->last_page = pss.page;
56e93d26
JQ
2287
2288 return pages;
2289}
2290
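/*
 * acct_update_position: update RAM migration counters (and the file position)
 * for data written outside the normal page-saving path, e.g. by RDMA.
 */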
2291void acct_update_position(QEMUFile *f, size_t size, bool zero)
2292{
2293 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2294
56e93d26 2295 if (zero) {
9360447d 2296 ram_counters.duplicate += pages;
56e93d26 2297 } else {
9360447d 2298 ram_counters.normal += pages;
4c2d0f6d 2299 ram_transferred_add(size);
56e93d26
JQ
2300 qemu_update_position(f, size);
2301 }
2302}
2303
fbd162e6 2304static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2305{
2306 RAMBlock *block;
2307 uint64_t total = 0;
2308
89ac5a1d
DDAG
2309 RCU_READ_LOCK_GUARD();
2310
fbd162e6
YK
2311 if (count_ignored) {
2312 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2313 total += block->used_length;
2314 }
2315 } else {
2316 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2317 total += block->used_length;
2318 }
99e15582 2319 }
56e93d26
JQ
2320 return total;
2321}
2322
fbd162e6
YK
2323uint64_t ram_bytes_total(void)
2324{
2325 return ram_bytes_total_common(false);
2326}
2327
f265e0e4 2328static void xbzrle_load_setup(void)
56e93d26 2329{
f265e0e4 2330 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2331}
2332
f265e0e4
JQ
2333static void xbzrle_load_cleanup(void)
2334{
2335 g_free(XBZRLE.decoded_buf);
2336 XBZRLE.decoded_buf = NULL;
2337}
2338
7d7c96be
PX
2339static void ram_state_cleanup(RAMState **rsp)
2340{
b9ccaf6d
DDAG
2341 if (*rsp) {
2342 migration_page_queue_free(*rsp);
2343 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2344 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2345 g_free(*rsp);
2346 *rsp = NULL;
2347 }
7d7c96be
PX
2348}
2349
84593a08
PX
2350static void xbzrle_cleanup(void)
2351{
2352 XBZRLE_cache_lock();
2353 if (XBZRLE.cache) {
2354 cache_fini(XBZRLE.cache);
2355 g_free(XBZRLE.encoded_buf);
2356 g_free(XBZRLE.current_buf);
2357 g_free(XBZRLE.zero_target_page);
2358 XBZRLE.cache = NULL;
2359 XBZRLE.encoded_buf = NULL;
2360 XBZRLE.current_buf = NULL;
2361 XBZRLE.zero_target_page = NULL;
2362 }
2363 XBZRLE_cache_unlock();
2364}
2365
f265e0e4 2366static void ram_save_cleanup(void *opaque)
56e93d26 2367{
53518d94 2368 RAMState **rsp = opaque;
6b6712ef 2369 RAMBlock *block;
eb859c53 2370
278e2f55
AG
2371 /* We don't use dirty log with background snapshots */
2372 if (!migrate_background_snapshot()) {
2373 /* The caller holds the iothread lock or is in a bh, so there is
2374 * no write race against the migration bitmap
2375 */
63b41db4
HH
2376 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2377 /*
2378 * do not stop dirty log without starting it, since
2379 * memory_global_dirty_log_stop will assert that
2380 * memory_global_dirty_log_start/stop are used in pairs
2381 */
2382 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2383 }
278e2f55 2384 }
6b6712ef 2385
fbd162e6 2386 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2387 g_free(block->clear_bmap);
2388 block->clear_bmap = NULL;
6b6712ef
JQ
2389 g_free(block->bmap);
2390 block->bmap = NULL;
56e93d26
JQ
2391 }
2392
84593a08 2393 xbzrle_cleanup();
f0afa331 2394 compress_threads_save_cleanup();
7d7c96be 2395 ram_state_cleanup(rsp);
56e93d26
JQ
2396}
2397
6f37bb8b 2398static void ram_state_reset(RAMState *rs)
56e93d26 2399{
6f37bb8b
JQ
2400 rs->last_seen_block = NULL;
2401 rs->last_sent_block = NULL;
269ace29 2402 rs->last_page = 0;
6f37bb8b 2403 rs->last_version = ram_list.version;
1a373522 2404 rs->xbzrle_enabled = false;
56e93d26
JQ
2405}
2406
2407#define MAX_WAIT 50 /* ms, half buffered_file limit */
2408
e0b266f0
DDAG
2409/* **** functions for postcopy ***** */
2410
ced1c616
PB
2411void ram_postcopy_migrated_memory_release(MigrationState *ms)
2412{
2413 struct RAMBlock *block;
ced1c616 2414
fbd162e6 2415 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2416 unsigned long *bitmap = block->bmap;
2417 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2418 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2419
2420 while (run_start < range) {
2421 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2422 ram_discard_range(block->idstr,
2423 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2424 ((ram_addr_t)(run_end - run_start))
2425 << TARGET_PAGE_BITS);
ced1c616
PB
2426 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2427 }
2428 }
2429}
2430
3d0684b2
JQ
2431/**
2432 * postcopy_send_discard_bm_ram: discard a RAMBlock
2433 *
e0b266f0 2434 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2435 *
2436 * @ms: current migration state
89dab31b 2437 * @block: RAMBlock to discard
e0b266f0 2438 */
9e7d1223 2439static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2440{
6b6712ef 2441 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2442 unsigned long current;
1e7cf8c3 2443 unsigned long *bitmap = block->bmap;
e0b266f0 2444
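 /* Walk the dirty bitmap and send each contiguous run of dirty pages as one discard range. */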
6b6712ef 2445 for (current = 0; current < end; ) {
1e7cf8c3 2446 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2447 unsigned long zero, discard_length;
e0b266f0 2448
33a5cb62
WY
2449 if (one >= end) {
2450 break;
2451 }
e0b266f0 2452
1e7cf8c3 2453 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2454
2455 if (zero >= end) {
2456 discard_length = end - one;
e0b266f0 2457 } else {
33a5cb62
WY
2458 discard_length = zero - one;
2459 }
810cf2bb 2460 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2461 current = one + discard_length;
e0b266f0 2462 }
e0b266f0
DDAG
2463}
2464
f30c2e5b
PX
2465static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2466
3d0684b2
JQ
2467/**
2468 * postcopy_each_ram_send_discard: discard all RAMBlocks
2469 *
e0b266f0
DDAG
2470 * Utility for the outgoing postcopy code.
2471 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2472 * passing it bitmap indexes and name.
e0b266f0
DDAG
2473 * (qemu_ram_foreach_block ends up passing unscaled lengths
2474 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2475 *
2476 * @ms: current migration state
e0b266f0 2477 */
739fcc1b 2478static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2479{
2480 struct RAMBlock *block;
e0b266f0 2481
fbd162e6 2482 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2483 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2484
f30c2e5b
PX
2485 /*
2486 * Deal with TPS != HPS and huge pages. It discard any partially sent
2487 * host-page size chunks, mark any partially dirty host-page size
2488 * chunks as all dirty. In this case the host-page is the host-page
2489 * for the particular RAMBlock, i.e. it might be a huge page.
2490 */
2491 postcopy_chunk_hostpages_pass(ms, block);
2492
e0b266f0
DDAG
2493 /*
2494 * Postcopy sends chunks of bitmap over the wire, but it
2495 * just needs indexes at this point, avoids it having
2496 * target page specific code.
2497 */
739fcc1b 2498 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2499 postcopy_discard_send_finish(ms);
e0b266f0 2500 }
e0b266f0
DDAG
2501}
2502
3d0684b2 2503/**
8324ef86 2504 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2505 *
2506 * Helper for postcopy_chunk_hostpages; it's called twice to
2507 * canonicalize the two bitmaps, that are similar, but one is
2508 * inverted.
99e314eb 2509 *
3d0684b2
JQ
2510 * Postcopy requires that all target pages in a hostpage are dirty or
2511 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2512 *
3d0684b2 2513 * @ms: current migration state
3d0684b2 2514 * @block: block that contains the page we want to canonicalize
99e314eb 2515 */
1e7cf8c3 2516static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2517{
53518d94 2518 RAMState *rs = ram_state;
6b6712ef 2519 unsigned long *bitmap = block->bmap;
29c59172 2520 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2521 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2522 unsigned long run_start;
2523
29c59172
DDAG
2524 if (block->page_size == TARGET_PAGE_SIZE) {
2525 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2526 return;
2527 }
2528
1e7cf8c3
WY
2529 /* Find a dirty page */
2530 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2531
6b6712ef 2532 while (run_start < pages) {
99e314eb
DDAG
2533
2534 /*
2535 * If the start of this run of pages is in the middle of a host
2536 * page, then we need to fixup this host page.
2537 */
9dec3cc3 2538 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2539 /* Find the end of this run */
1e7cf8c3 2540 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2541 /*
2542 * If the end isn't at the start of a host page, then the
2543 * run doesn't finish at the end of a host page
2544 * and we need to discard.
2545 */
99e314eb
DDAG
2546 }
2547
9dec3cc3 2548 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2549 unsigned long page;
dad45ab2
WY
2550 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2551 host_ratio);
2552 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2553
99e314eb
DDAG
2554 /* Clean up the bitmap */
2555 for (page = fixup_start_addr;
2556 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2557 /*
2558 * Remark them as dirty, updating the count for any pages
2559 * that weren't previously dirty.
2560 */
0d8ec885 2561 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2562 }
2563 }
2564
1e7cf8c3
WY
2565 /* Find the next dirty page for the next iteration */
2566 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2567 }
2568}
2569
3d0684b2
JQ
2570/**
2571 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2572 *
e0b266f0
DDAG
2573 * Transmit the set of pages to be discarded after precopy to the target
2574 * these are pages that:
2575 * a) Have been previously transmitted but are now dirty again
2576 * b) Pages that have never been transmitted, this ensures that
2577 * any pages on the destination that have been mapped by background
2578 * tasks get discarded (transparent huge pages is the specific concern)
2579 * Hopefully this is pretty sparse
3d0684b2
JQ
2580 *
2581 * @ms: current migration state
e0b266f0 2582 */
739fcc1b 2583void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2584{
53518d94 2585 RAMState *rs = ram_state;
e0b266f0 2586
89ac5a1d 2587 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2588
2589 /* This should be our last sync, the src is now paused */
eb859c53 2590 migration_bitmap_sync(rs);
e0b266f0 2591
6b6712ef
JQ
2592 /* Easiest way to make sure we don't resume in the middle of a host-page */
2593 rs->last_seen_block = NULL;
2594 rs->last_sent_block = NULL;
2595 rs->last_page = 0;
e0b266f0 2596
739fcc1b 2597 postcopy_each_ram_send_discard(ms);
e0b266f0 2598
739fcc1b 2599 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2600}
2601
3d0684b2
JQ
2602/**
2603 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2604 *
3d0684b2 2605 * Returns zero on success
e0b266f0 2606 *
36449157
JQ
2607 * @rbname: name of the RAMBlock of the request. NULL means the
2608 * same as the last one.
3d0684b2
JQ
2609 * @start: RAMBlock starting page
2610 * @length: RAMBlock size
e0b266f0 2611 */
aaa2064c 2612int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2613{
36449157 2614 trace_ram_discard_range(rbname, start, length);
d3a5038c 2615
89ac5a1d 2616 RCU_READ_LOCK_GUARD();
36449157 2617 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2618
2619 if (!rb) {
36449157 2620 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2621 return -1;
e0b266f0
DDAG
2622 }
2623
814bb08f
PX
2624 /*
2625 * On source VM, we don't need to update the received bitmap since
2626 * we don't even have one.
2627 */
2628 if (rb->receivedmap) {
2629 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2630 length >> qemu_target_page_bits());
2631 }
2632
03acb4e9 2633 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2634}
2635
84593a08
PX
2636/*
2637 * For every allocation, we will try not to crash the VM if the
2638 * allocation fails.
2639 */
2640static int xbzrle_init(void)
2641{
2642 Error *local_err = NULL;
2643
2644 if (!migrate_use_xbzrle()) {
2645 return 0;
2646 }
2647
2648 XBZRLE_cache_lock();
2649
2650 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2651 if (!XBZRLE.zero_target_page) {
2652 error_report("%s: Error allocating zero page", __func__);
2653 goto err_out;
2654 }
2655
2656 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2657 TARGET_PAGE_SIZE, &local_err);
2658 if (!XBZRLE.cache) {
2659 error_report_err(local_err);
2660 goto free_zero_page;
2661 }
2662
2663 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2664 if (!XBZRLE.encoded_buf) {
2665 error_report("%s: Error allocating encoded_buf", __func__);
2666 goto free_cache;
2667 }
2668
2669 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2670 if (!XBZRLE.current_buf) {
2671 error_report("%s: Error allocating current_buf", __func__);
2672 goto free_encoded_buf;
2673 }
2674
2675 /* We are all good */
2676 XBZRLE_cache_unlock();
2677 return 0;
2678
2679free_encoded_buf:
2680 g_free(XBZRLE.encoded_buf);
2681 XBZRLE.encoded_buf = NULL;
2682free_cache:
2683 cache_fini(XBZRLE.cache);
2684 XBZRLE.cache = NULL;
2685free_zero_page:
2686 g_free(XBZRLE.zero_target_page);
2687 XBZRLE.zero_target_page = NULL;
2688err_out:
2689 XBZRLE_cache_unlock();
2690 return -ENOMEM;
2691}
2692
53518d94 2693static int ram_state_init(RAMState **rsp)
56e93d26 2694{
7d00ee6a
PX
2695 *rsp = g_try_new0(RAMState, 1);
2696
2697 if (!*rsp) {
2698 error_report("%s: Init ramstate fail", __func__);
2699 return -1;
2700 }
53518d94
JQ
2701
2702 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2703 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2704 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2705
7d00ee6a 2706 /*
40c4d4a8
IR
2707 * Count the total number of pages used by ram blocks not including any
2708 * gaps due to alignment or unplugs.
03158519 2709 * This must match the initial values of the dirty bitmap.
7d00ee6a 2710 */
40c4d4a8 2711 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
2712 ram_state_reset(*rsp);
2713
2714 return 0;
2715}
2716
d6eff5d7 2717static void ram_list_init_bitmaps(void)
7d00ee6a 2718{
002cad6b 2719 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
2720 RAMBlock *block;
2721 unsigned long pages;
002cad6b 2722 uint8_t shift;
56e93d26 2723
0827b9e9
AA
2724 /* Skip setting bitmap if there is no RAM */
2725 if (ram_bytes_total()) {
002cad6b
PX
2726 shift = ms->clear_bitmap_shift;
2727 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2728 error_report("clear_bitmap_shift (%u) too big, using "
2729 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2730 shift = CLEAR_BITMAP_SHIFT_MAX;
2731 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2732 error_report("clear_bitmap_shift (%u) too small, using "
2733 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2734 shift = CLEAR_BITMAP_SHIFT_MIN;
2735 }
2736
fbd162e6 2737 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 2738 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
2739 /*
2740 * The initial dirty bitmap for migration must be set with all
2741 * ones to make sure we'll migrate every guest RAM page to
2742 * destination.
40c4d4a8
IR
2743 * Here we set RAMBlock.bmap all to 1 because when restarting a
2744 * new migration after a failed migration, ram_list.
2745 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2746 * guest memory.
03158519 2747 */
6b6712ef 2748 block->bmap = bitmap_new(pages);
40c4d4a8 2749 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
2750 block->clear_bmap_shift = shift;
2751 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 2752 }
f3f491fc 2753 }
d6eff5d7
PX
2754}
2755
be39b4cd
DH
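/*
 * Exclude pages discarded through a RamDiscardManager ("logically unplugged")
 * from the per-block dirty bitmaps and from the dirty-page count.
 */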
2756static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2757{
2758 unsigned long pages;
2759 RAMBlock *rb;
2760
2761 RCU_READ_LOCK_GUARD();
2762
2763 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2764 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2765 rs->migration_dirty_pages -= pages;
2766 }
2767}
2768
d6eff5d7
PX
2769static void ram_init_bitmaps(RAMState *rs)
2770{
2771 /* For memory_global_dirty_log_start below. */
2772 qemu_mutex_lock_iothread();
2773 qemu_mutex_lock_ramlist();
f3f491fc 2774
89ac5a1d
DDAG
2775 WITH_RCU_READ_LOCK_GUARD() {
2776 ram_list_init_bitmaps();
278e2f55
AG
2777 /* We don't use dirty log with background snapshots */
2778 if (!migrate_background_snapshot()) {
63b41db4 2779 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
2780 migration_bitmap_sync_precopy(rs);
2781 }
89ac5a1d 2782 }
56e93d26 2783 qemu_mutex_unlock_ramlist();
49877834 2784 qemu_mutex_unlock_iothread();
be39b4cd
DH
2785
2786 /*
2787 * After an eventual first bitmap sync, fixup the initial bitmap
2788 * containing all 1s to exclude any discarded pages from migration.
2789 */
2790 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
2791}
2792
2793static int ram_init_all(RAMState **rsp)
2794{
2795 if (ram_state_init(rsp)) {
2796 return -1;
2797 }
2798
2799 if (xbzrle_init()) {
2800 ram_state_cleanup(rsp);
2801 return -1;
2802 }
2803
2804 ram_init_bitmaps(*rsp);
a91246c9
HZ
2805
2806 return 0;
2807}
2808
08614f34
PX
2809static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2810{
2811 RAMBlock *block;
2812 uint64_t pages = 0;
2813
2814 /*
2815 * Postcopy is not using xbzrle/compression, so no need for that.
2816 * Also, since the source is already halted, we don't need to care
2817 * about dirty page logging either.
2818 */
2819
fbd162e6 2820 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
2821 pages += bitmap_count_one(block->bmap,
2822 block->used_length >> TARGET_PAGE_BITS);
2823 }
2824
2825 /* This may not be aligned with current bitmaps. Recalculate. */
2826 rs->migration_dirty_pages = pages;
2827
1a373522 2828 ram_state_reset(rs);
08614f34
PX
2829
2830 /* Update RAMState cache of output QEMUFile */
2831 rs->f = out;
2832
2833 trace_ram_state_resume_prepare(pages);
2834}
2835
6bcb05fc
WW
2836/*
2837 * This function clears bits of the free pages reported by the caller from the
2838 * migration dirty bitmap. @addr is the host address corresponding to the
2839 * start of the continuous guest free pages, and @len is the total bytes of
2840 * those pages.
2841 */
2842void qemu_guest_free_page_hint(void *addr, size_t len)
2843{
2844 RAMBlock *block;
2845 ram_addr_t offset;
2846 size_t used_len, start, npages;
2847 MigrationState *s = migrate_get_current();
2848
2849 /* This function is currently expected to be used during live migration */
2850 if (!migration_is_setup_or_active(s->state)) {
2851 return;
2852 }
2853
2854 for (; len > 0; len -= used_len, addr += used_len) {
2855 block = qemu_ram_block_from_host(addr, false, &offset);
2856 if (unlikely(!block || offset >= block->used_length)) {
2857 /*
2858 * The implementation might not support RAMBlock resize during
2859 * live migration, but it could happen in theory with future
2860 * updates. So we add a check here to capture that case.
2861 */
2862 error_report_once("%s unexpected error", __func__);
2863 return;
2864 }
2865
2866 if (len <= block->used_length - offset) {
2867 used_len = len;
2868 } else {
2869 used_len = block->used_length - offset;
2870 }
2871
2872 start = offset >> TARGET_PAGE_BITS;
2873 npages = used_len >> TARGET_PAGE_BITS;
2874
2875 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
2876 /*
2877 * The skipped free pages are equivalent to having been sent from clear_bmap's
2878 * perspective, so clear the bits from the memory region bitmap which
2879 * are initially set. Otherwise those skipped pages will be sent in
2880 * the next round after syncing from the memory region bitmap.
2881 */
1230a25f 2882 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
2883 ram_state->migration_dirty_pages -=
2884 bitmap_count_one_with_offset(block->bmap, start, npages);
2885 bitmap_clear(block->bmap, start, npages);
2886 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2887 }
2888}
2889
3d0684b2
JQ
2890/*
2891 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2892 * long-running RCU critical section. When rcu-reclaims in the code
2893 * start to become numerous it will be necessary to reduce the
2894 * granularity of these critical sections.
2895 */
2896
3d0684b2
JQ
2897/**
2898 * ram_save_setup: Setup RAM for migration
2899 *
2900 * Returns zero to indicate success and negative for error
2901 *
2902 * @f: QEMUFile where to send the data
2903 * @opaque: RAMState pointer
2904 */
a91246c9
HZ
2905static int ram_save_setup(QEMUFile *f, void *opaque)
2906{
53518d94 2907 RAMState **rsp = opaque;
a91246c9
HZ
2908 RAMBlock *block;
2909
dcaf446e
XG
2910 if (compress_threads_save_setup()) {
2911 return -1;
2912 }
2913
a91246c9
HZ
2914 /* migration has already setup the bitmap, reuse it. */
2915 if (!migration_in_colo_state()) {
7d00ee6a 2916 if (ram_init_all(rsp) != 0) {
dcaf446e 2917 compress_threads_save_cleanup();
a91246c9 2918 return -1;
53518d94 2919 }
a91246c9 2920 }
53518d94 2921 (*rsp)->f = f;
a91246c9 2922
0e6ebd48
DDAG
2923 WITH_RCU_READ_LOCK_GUARD() {
2924 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 2925
0e6ebd48
DDAG
2926 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2927 qemu_put_byte(f, strlen(block->idstr));
2928 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2929 qemu_put_be64(f, block->used_length);
2930 if (migrate_postcopy_ram() && block->page_size !=
2931 qemu_host_page_size) {
2932 qemu_put_be64(f, block->page_size);
2933 }
2934 if (migrate_ignore_shared()) {
2935 qemu_put_be64(f, block->mr->addr);
2936 }
fbd162e6 2937 }
56e93d26
JQ
2938 }
2939
56e93d26
JQ
2940 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2941 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2942
99f2c6fb 2943 multifd_send_sync_main(f);
56e93d26 2944 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 2945 qemu_fflush(f);
56e93d26
JQ
2946
2947 return 0;
2948}
2949
3d0684b2
JQ
2950/**
2951 * ram_save_iterate: iterative stage for migration
2952 *
2953 * Returns zero to indicate success and negative for error
2954 *
2955 * @f: QEMUFile where to send the data
2956 * @opaque: RAMState pointer
2957 */
56e93d26
JQ
2958static int ram_save_iterate(QEMUFile *f, void *opaque)
2959{
53518d94
JQ
2960 RAMState **temp = opaque;
2961 RAMState *rs = *temp;
3d4095b2 2962 int ret = 0;
56e93d26
JQ
2963 int i;
2964 int64_t t0;
5c90308f 2965 int done = 0;
56e93d26 2966
b2557345
PL
2967 if (blk_mig_bulk_active()) {
2968 /* Avoid transferring ram during bulk phase of block migration as
2969 * the bulk phase will usually take a long time and transferring
2970 * ram updates during that time is pointless. */
2971 goto out;
2972 }
2973
63268c49
PX
2974 /*
2975 * We'll hold this lock for a little while, but that's okay for two reasons.
2976 * Firstly, the only other thread that can take it is the one calling
2977 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2978 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2979 * guarantees that we'll at least release it on a regular basis.
2980 */
2981 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
2982 WITH_RCU_READ_LOCK_GUARD() {
2983 if (ram_list.version != rs->last_version) {
2984 ram_state_reset(rs);
2985 }
56e93d26 2986
89ac5a1d
DDAG
2987 /* Read version before ram_list.blocks */
2988 smp_rmb();
56e93d26 2989
89ac5a1d 2990 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 2991
89ac5a1d
DDAG
2992 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2993 i = 0;
2994 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2995 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2996 int pages;
e03a34f8 2997
89ac5a1d
DDAG
2998 if (qemu_file_get_error(f)) {
2999 break;
3000 }
e8f3735f 3001
05931ec5 3002 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3003 /* no more pages to send */
3004 if (pages == 0) {
3005 done = 1;
3006 break;
3007 }
e8f3735f 3008
89ac5a1d
DDAG
3009 if (pages < 0) {
3010 qemu_file_set_error(f, pages);
56e93d26
JQ
3011 break;
3012 }
89ac5a1d
DDAG
3013
3014 rs->target_page_count += pages;
3015
644acf99
WY
3016 /*
3017 * During postcopy, it is necessary to make sure one whole host
3018 * page is sent in one chunk.
3019 */
3020 if (migrate_postcopy_ram()) {
3021 flush_compressed_data(rs);
3022 }
3023
89ac5a1d
DDAG
3024 /*
3025 * we want to check in the 1st loop, just in case it was the 1st
3026 * time and we had to sync the dirty bitmap.
3027 * qemu_clock_get_ns() is a bit expensive, so we only check every
3028 * few iterations
3029 */
3030 if ((i & 63) == 0) {
3031 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3032 1000000;
3033 if (t1 > MAX_WAIT) {
3034 trace_ram_save_iterate_big_wait(t1, i);
3035 break;
3036 }
3037 }
3038 i++;
56e93d26 3039 }
56e93d26 3040 }
63268c49 3041 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3042
3043 /*
3044 * Must occur before EOS (or any QEMUFile operation)
3045 * because of RDMA protocol.
3046 */
3047 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3048
b2557345 3049out:
b69a0227
JQ
3050 if (ret >= 0
3051 && migration_is_setup_or_active(migrate_get_current()->state)) {
99f2c6fb 3052 multifd_send_sync_main(rs->f);
3d4095b2
JQ
3053 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3054 qemu_fflush(f);
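 /* Account for the 8-byte RAM_SAVE_FLAG_EOS marker written above. */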
4c2d0f6d 3055 ram_transferred_add(8);
56e93d26 3056
3d4095b2
JQ
3057 ret = qemu_file_get_error(f);
3058 }
56e93d26
JQ
3059 if (ret < 0) {
3060 return ret;
3061 }
3062
5c90308f 3063 return done;
56e93d26
JQ
3064}
3065
3d0684b2
JQ
3066/**
3067 * ram_save_complete: function called to send the remaining amount of ram
3068 *
e8f3735f 3069 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3070 *
3071 * Called with iothread lock
3072 *
3073 * @f: QEMUFile where to send the data
3074 * @opaque: RAMState pointer
3075 */
56e93d26
JQ
3076static int ram_save_complete(QEMUFile *f, void *opaque)
3077{
53518d94
JQ
3078 RAMState **temp = opaque;
3079 RAMState *rs = *temp;
e8f3735f 3080 int ret = 0;
6f37bb8b 3081
05931ec5
JQ
3082 rs->last_stage = !migration_in_colo_state();
3083
89ac5a1d
DDAG
3084 WITH_RCU_READ_LOCK_GUARD() {
3085 if (!migration_in_postcopy()) {
3086 migration_bitmap_sync_precopy(rs);
3087 }
56e93d26 3088
89ac5a1d 3089 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3090
89ac5a1d 3091 /* try transferring iterative blocks of memory */
56e93d26 3092
89ac5a1d
DDAG
3093 /* flush all remaining blocks regardless of rate limiting */
3094 while (true) {
3095 int pages;
56e93d26 3096
05931ec5 3097 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3098 /* no more blocks to send */
3099 if (pages == 0) {
3100 break;
3101 }
3102 if (pages < 0) {
3103 ret = pages;
3104 break;
3105 }
e8f3735f 3106 }
56e93d26 3107
89ac5a1d
DDAG
3108 flush_compressed_data(rs);
3109 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3110 }
d09a6fde 3111
3d4095b2 3112 if (ret >= 0) {
99f2c6fb 3113 multifd_send_sync_main(rs->f);
3d4095b2
JQ
3114 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3115 qemu_fflush(f);
3116 }
56e93d26 3117
e8f3735f 3118 return ret;
56e93d26
JQ
3119}
3120
c31b098f 3121static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3122 uint64_t *res_precopy_only,
3123 uint64_t *res_compatible,
3124 uint64_t *res_postcopy_only)
56e93d26 3125{
53518d94
JQ
3126 RAMState **temp = opaque;
3127 RAMState *rs = *temp;
56e93d26
JQ
3128 uint64_t remaining_size;
3129
9edabd4d 3130 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3131
5727309d 3132 if (!migration_in_postcopy() &&
663e6c1d 3133 remaining_size < max_size) {
56e93d26 3134 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3135 WITH_RCU_READ_LOCK_GUARD() {
3136 migration_bitmap_sync_precopy(rs);
3137 }
56e93d26 3138 qemu_mutex_unlock_iothread();
9edabd4d 3139 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3140 }
c31b098f 3141
86e1167e
VSO
3142 if (migrate_postcopy_ram()) {
3143 /* We can do postcopy, and all the data is postcopiable */
47995026 3144 *res_compatible += remaining_size;
86e1167e 3145 } else {
47995026 3146 *res_precopy_only += remaining_size;
86e1167e 3147 }
56e93d26
JQ
3148}
3149
3150static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3151{
3152 unsigned int xh_len;
3153 int xh_flags;
063e760a 3154 uint8_t *loaded_data;
56e93d26 3155
56e93d26
JQ
3156 /* extract RLE header */
3157 xh_flags = qemu_get_byte(f);
3158 xh_len = qemu_get_be16(f);
3159
3160 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3161 error_report("Failed to load XBZRLE page - wrong compression!");
3162 return -1;
3163 }
3164
3165 if (xh_len > TARGET_PAGE_SIZE) {
3166 error_report("Failed to load XBZRLE page - len overflow!");
3167 return -1;
3168 }
f265e0e4 3169 loaded_data = XBZRLE.decoded_buf;
56e93d26 3170 /* load data and decode */
f265e0e4 3171 /* it can change loaded_data to point to an internal buffer */
063e760a 3172 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3173
3174 /* decode RLE */
063e760a 3175 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3176 TARGET_PAGE_SIZE) == -1) {
3177 error_report("Failed to load XBZRLE page - decode error!");
3178 return -1;
3179 }
3180
3181 return 0;
3182}
3183
3d0684b2
JQ
3184/**
3185 * ram_block_from_stream: read a RAMBlock id from the migration stream
3186 *
3187 * Must be called from within a rcu critical section.
3188 *
56e93d26 3189 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3190 *
3d0684b2
JQ
3191 * @f: QEMUFile where to read the data from
3192 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3193 */
3d0684b2 3194static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26 3195{
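 /* Cached across calls: reused when RAM_SAVE_FLAG_CONTINUE means "same block as before". */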
49324e93 3196 static RAMBlock *block;
56e93d26
JQ
3197 char id[256];
3198 uint8_t len;
3199
3200 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3201 if (!block) {
56e93d26
JQ
3202 error_report("Ack, bad migration stream!");
3203 return NULL;
3204 }
4c4bad48 3205 return block;
56e93d26
JQ
3206 }
3207
3208 len = qemu_get_byte(f);
3209 qemu_get_buffer(f, (uint8_t *)id, len);
3210 id[len] = 0;
3211
e3dd7493 3212 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3213 if (!block) {
3214 error_report("Can't find block %s", id);
3215 return NULL;
56e93d26
JQ
3216 }
3217
fbd162e6 3218 if (ramblock_is_ignored(block)) {
b895de50
CLG
3219 error_report("block %s should not be migrated !", id);
3220 return NULL;
3221 }
3222
4c4bad48
HZ
3223 return block;
3224}
3225
3226static inline void *host_from_ram_block_offset(RAMBlock *block,
3227 ram_addr_t offset)
3228{
3229 if (!offset_in_ramblock(block, offset)) {
3230 return NULL;
3231 }
3232
3233 return block->host + offset;
56e93d26
JQ
3234}
3235
6a23f639
DH
3236static void *host_page_from_ram_block_offset(RAMBlock *block,
3237 ram_addr_t offset)
3238{
3239 /* Note: Explicitly no check against offset_in_ramblock(). */
3240 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3241 block->page_size);
3242}
3243
3244static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3245 ram_addr_t offset)
3246{
3247 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3248}
3249
13af18f2 3250static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3251 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3252{
3253 if (!offset_in_ramblock(block, offset)) {
3254 return NULL;
3255 }
3256 if (!block->colo_cache) {
3257 error_report("%s: colo_cache is NULL in block :%s",
3258 __func__, block->idstr);
3259 return NULL;
3260 }
7d9acafa
ZC
3261
3262 /*
3263 * During a colo checkpoint, we need a bitmap of these migrated pages.
3264 * It helps us decide which pages in the ram cache should be flushed
3265 * into the VM's RAM later.
3266 */
8af66371
HZ
3267 if (record_bitmap &&
3268 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3269 ram_state->migration_dirty_pages++;
3270 }
13af18f2
ZC
3271 return block->colo_cache + offset;
3272}
3273
3d0684b2
JQ
3274/**
3275 * ram_handle_compressed: handle the zero page case
3276 *
56e93d26
JQ
3277 * If a page (or a whole RDMA chunk) has been
3278 * determined to be zero, then zap it.
3d0684b2
JQ
3279 *
3280 * @host: host address for the zero page
3281 * @ch: what the page is filled from. We only support zero
3282 * @size: size of the zero page
56e93d26
JQ
3283 */
3284void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3285{
bad452a7 3286 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3287 memset(host, ch, size);
3288 }
3289}
3290
797ca154
XG
3291/* return the size after decompression, or negative value on error */
3292static int
3293qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3294 const uint8_t *source, size_t source_len)
3295{
3296 int err;
3297
3298 err = inflateReset(stream);
3299 if (err != Z_OK) {
3300 return -1;
3301 }
3302
3303 stream->avail_in = source_len;
3304 stream->next_in = (uint8_t *)source;
3305 stream->avail_out = dest_len;
3306 stream->next_out = dest;
3307
3308 err = inflate(stream, Z_NO_FLUSH);
3309 if (err != Z_STREAM_END) {
3310 return -1;
3311 }
3312
3313 return stream->total_out;
3314}
3315
56e93d26
JQ
3316static void *do_data_decompress(void *opaque)
3317{
3318 DecompressParam *param = opaque;
3319 unsigned long pagesize;
33d151f4 3320 uint8_t *des;
34ab9e97 3321 int len, ret;
56e93d26 3322
33d151f4 3323 qemu_mutex_lock(&param->mutex);
90e56fb4 3324 while (!param->quit) {
33d151f4
LL
3325 if (param->des) {
3326 des = param->des;
3327 len = param->len;
3328 param->des = 0;
3329 qemu_mutex_unlock(&param->mutex);
3330
56e93d26 3331 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3332
3333 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3334 param->compbuf, len);
f548222c 3335 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3336 error_report("decompress data failed");
3337 qemu_file_set_error(decomp_file, ret);
3338 }
73a8912b 3339
33d151f4
LL
3340 qemu_mutex_lock(&decomp_done_lock);
3341 param->done = true;
3342 qemu_cond_signal(&decomp_done_cond);
3343 qemu_mutex_unlock(&decomp_done_lock);
3344
3345 qemu_mutex_lock(&param->mutex);
3346 } else {
3347 qemu_cond_wait(&param->cond, &param->mutex);
3348 }
56e93d26 3349 }
33d151f4 3350 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3351
3352 return NULL;
3353}
3354
34ab9e97 3355static int wait_for_decompress_done(void)
5533b2e9
LL
3356{
3357 int idx, thread_count;
3358
3359 if (!migrate_use_compression()) {
34ab9e97 3360 return 0;
5533b2e9
LL
3361 }
3362
3363 thread_count = migrate_decompress_threads();
3364 qemu_mutex_lock(&decomp_done_lock);
3365 for (idx = 0; idx < thread_count; idx++) {
3366 while (!decomp_param[idx].done) {
3367 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3368 }
3369 }
3370 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3371 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3372}
3373
f0afa331 3374static void compress_threads_load_cleanup(void)
56e93d26
JQ
3375{
3376 int i, thread_count;
3377
3416ab5b
JQ
3378 if (!migrate_use_compression()) {
3379 return;
3380 }
56e93d26
JQ
3381 thread_count = migrate_decompress_threads();
3382 for (i = 0; i < thread_count; i++) {
797ca154
XG
3383 /*
3384 * we use it as an indicator which shows if the thread is
3385 * properly init'd or not
3386 */
3387 if (!decomp_param[i].compbuf) {
3388 break;
3389 }
3390
56e93d26 3391 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3392 decomp_param[i].quit = true;
56e93d26
JQ
3393 qemu_cond_signal(&decomp_param[i].cond);
3394 qemu_mutex_unlock(&decomp_param[i].mutex);
3395 }
3396 for (i = 0; i < thread_count; i++) {
797ca154
XG
3397 if (!decomp_param[i].compbuf) {
3398 break;
3399 }
3400
56e93d26
JQ
3401 qemu_thread_join(decompress_threads + i);
3402 qemu_mutex_destroy(&decomp_param[i].mutex);
3403 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3404 inflateEnd(&decomp_param[i].stream);
56e93d26 3405 g_free(decomp_param[i].compbuf);
797ca154 3406 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3407 }
3408 g_free(decompress_threads);
3409 g_free(decomp_param);
56e93d26
JQ
3410 decompress_threads = NULL;
3411 decomp_param = NULL;
34ab9e97 3412 decomp_file = NULL;
56e93d26
JQ
3413}
3414
34ab9e97 3415static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3416{
3417 int i, thread_count;
3418
3419 if (!migrate_use_compression()) {
3420 return 0;
3421 }
3422
3423 thread_count = migrate_decompress_threads();
3424 decompress_threads = g_new0(QemuThread, thread_count);
3425 decomp_param = g_new0(DecompressParam, thread_count);
3426 qemu_mutex_init(&decomp_done_lock);
3427 qemu_cond_init(&decomp_done_cond);
34ab9e97 3428 decomp_file = f;
797ca154
XG
3429 for (i = 0; i < thread_count; i++) {
3430 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3431 goto exit;
3432 }
3433
3434 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3435 qemu_mutex_init(&decomp_param[i].mutex);
3436 qemu_cond_init(&decomp_param[i].cond);
3437 decomp_param[i].done = true;
3438 decomp_param[i].quit = false;
3439 qemu_thread_create(decompress_threads + i, "decompress",
3440 do_data_decompress, decomp_param + i,
3441 QEMU_THREAD_JOINABLE);
3442 }
3443 return 0;
3444exit:
3445 compress_threads_load_cleanup();
3446 return -1;
3447}
3448
c1bc6626 3449static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3450 void *host, int len)
3451{
3452 int idx, thread_count;
3453
3454 thread_count = migrate_decompress_threads();
37396950 3455 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3456 while (true) {
3457 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3458 if (decomp_param[idx].done) {
33d151f4
LL
3459 decomp_param[idx].done = false;
3460 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3461 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3462 decomp_param[idx].des = host;
3463 decomp_param[idx].len = len;
33d151f4
LL
3464 qemu_cond_signal(&decomp_param[idx].cond);
3465 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3466 break;
3467 }
3468 }
3469 if (idx < thread_count) {
3470 break;
73a8912b
LL
3471 } else {
3472 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3473 }
3474 }
3475}
3476
b70cb3b4
RL
3477static void colo_init_ram_state(void)
3478{
3479 ram_state_init(&ram_state);
b70cb3b4
RL
3480}
3481
13af18f2
ZC
3482/*
3483 * colo cache: this is for the secondary VM. We cache the whole
3484 * memory of the secondary VM; the global lock must be held to
3485 * call this helper.
3486 */
3487int colo_init_ram_cache(void)
3488{
3489 RAMBlock *block;
3490
44901b5a
PB
3491 WITH_RCU_READ_LOCK_GUARD() {
3492 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3493 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3494 NULL, false, false);
44901b5a
PB
3495 if (!block->colo_cache) {
3496 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3497 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3498 block->used_length);
3499 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3500 if (block->colo_cache) {
3501 qemu_anon_ram_free(block->colo_cache, block->used_length);
3502 block->colo_cache = NULL;
3503 }
89ac5a1d 3504 }
44901b5a 3505 return -errno;
89ac5a1d 3506 }
e5fdf920
LS
3507 if (!machine_dump_guest_core(current_machine)) {
3508 qemu_madvise(block->colo_cache, block->used_length,
3509 QEMU_MADV_DONTDUMP);
3510 }
13af18f2 3511 }
13af18f2 3512 }
44901b5a 3513
7d9acafa
ZC
3514 /*
3515 * Record the dirty pages sent by the PVM. We use this dirty bitmap
3516 * to decide which pages in the cache should be flushed into the SVM's
3517 * RAM. Here we use the same name 'ram_bitmap' as for migration.
3518 */
3519 if (ram_bytes_total()) {
3520 RAMBlock *block;
3521
fbd162e6 3522 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3523 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3524 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3525 }
3526 }
7d9acafa 3527
b70cb3b4 3528 colo_init_ram_state();
13af18f2 3529 return 0;
13af18f2
ZC
3530}
3531
0393031a
HZ
3532/* TODO: duplicated with ram_init_bitmaps */
3533void colo_incoming_start_dirty_log(void)
3534{
3535 RAMBlock *block = NULL;
3536 /* For memory_global_dirty_log_start below. */
3537 qemu_mutex_lock_iothread();
3538 qemu_mutex_lock_ramlist();
3539
3540 memory_global_dirty_log_sync();
3541 WITH_RCU_READ_LOCK_GUARD() {
3542 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3543 ramblock_sync_dirty_bitmap(ram_state, block);
3544 /* Discard this dirty bitmap record */
3545 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3546 }
63b41db4 3547 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3548 }
3549 ram_state->migration_dirty_pages = 0;
3550 qemu_mutex_unlock_ramlist();
3551 qemu_mutex_unlock_iothread();
3552}
3553
13af18f2
ZC
3554/* The global lock must be held to call this helper */
3555void colo_release_ram_cache(void)
3556{
3557 RAMBlock *block;
3558
63b41db4 3559 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3560 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3561 g_free(block->bmap);
3562 block->bmap = NULL;
3563 }
3564
89ac5a1d
DDAG
3565 WITH_RCU_READ_LOCK_GUARD() {
3566 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3567 if (block->colo_cache) {
3568 qemu_anon_ram_free(block->colo_cache, block->used_length);
3569 block->colo_cache = NULL;
3570 }
13af18f2
ZC
3571 }
3572 }
0393031a 3573 ram_state_cleanup(&ram_state);
13af18f2
ZC
3574}
3575
f265e0e4
JQ
3576/**
3577 * ram_load_setup: Setup RAM for migration incoming side
3578 *
3579 * Returns zero to indicate success and negative for error
3580 *
3581 * @f: QEMUFile where to receive the data
3582 * @opaque: RAMState pointer
3583 */
3584static int ram_load_setup(QEMUFile *f, void *opaque)
3585{
34ab9e97 3586 if (compress_threads_load_setup(f)) {
797ca154
XG
3587 return -1;
3588 }
3589
f265e0e4 3590 xbzrle_load_setup();
f9494614 3591 ramblock_recv_map_init();
13af18f2 3592
f265e0e4
JQ
3593 return 0;
3594}
3595
3596static int ram_load_cleanup(void *opaque)
3597{
f9494614 3598 RAMBlock *rb;
56eb90af 3599
fbd162e6 3600 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3601 qemu_ram_block_writeback(rb);
56eb90af
JH
3602 }
3603
f265e0e4 3604 xbzrle_load_cleanup();
f0afa331 3605 compress_threads_load_cleanup();
f9494614 3606
fbd162e6 3607 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3608 g_free(rb->receivedmap);
3609 rb->receivedmap = NULL;
3610 }
13af18f2 3611
f265e0e4
JQ
3612 return 0;
3613}
3614
3d0684b2
JQ
3615/**
3616 * ram_postcopy_incoming_init: allocate postcopy data structures
3617 *
3618 * Returns 0 for success and negative if there was an error
3619 *
3620 * @mis: current migration incoming state
3621 *
3622 * Allocate data structures etc needed by incoming migration with
3623 * postcopy-ram. postcopy-ram's similarly named
3624 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3625 */
3626int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3627{
c136180c 3628 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3629}
3630
3d0684b2
JQ
3631/**
3632 * ram_load_postcopy: load a page in postcopy case
3633 *
3634 * Returns 0 for success or -errno in case of error
3635 *
a7180877
DDAG
3636 * Called in postcopy mode by ram_load().
3637 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3638 *
3639 * @f: QEMUFile from which to receive the data
a7180877
DDAG
3640 */
3641static int ram_load_postcopy(QEMUFile *f)
3642{
3643 int flags = 0, ret = 0;
3644 bool place_needed = false;
1aa83678 3645 bool matches_target_page_size = false;
a7180877
DDAG
3646 MigrationIncomingState *mis = migration_incoming_get_current();
3647 /* Temporary page that is later 'placed' */
3414322a 3648 void *postcopy_host_page = mis->postcopy_tmp_page;
6a23f639 3649 void *host_page = NULL;
ddf35bdf 3650 bool all_zero = true;
4cbb3c63 3651 int target_pages = 0;
a7180877
DDAG
3652
3653 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3654 ram_addr_t addr;
a7180877
DDAG
3655 void *page_buffer = NULL;
3656 void *place_source = NULL;
df9ff5e1 3657 RAMBlock *block = NULL;
a7180877 3658 uint8_t ch;
644acf99 3659 int len;
a7180877
DDAG
3660
3661 addr = qemu_get_be64(f);
7a9ddfbf
PX
3662
3663 /*
3664 * If qemu file error, we should stop here, and then "addr"
3665 * may be invalid
3666 */
3667 ret = qemu_file_get_error(f);
3668 if (ret) {
3669 break;
3670 }
3671
a7180877
DDAG
3672 flags = addr & ~TARGET_PAGE_MASK;
3673 addr &= TARGET_PAGE_MASK;
3674
3675 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
644acf99
WY
3676 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3677 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
df9ff5e1 3678 block = ram_block_from_stream(f, flags);
6a23f639
DH
3679 if (!block) {
3680 ret = -EINVAL;
3681 break;
3682 }
4c4bad48 3683
898ba906
DH
3684 /*
3685 * Relying on used_length is racy and can result in false positives.
3686 * We might place pages beyond used_length in case RAM was shrunk
3687 * while in postcopy, which is fine - trying to place via
3688 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3689 */
3690 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
3691 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3692 ret = -EINVAL;
3693 break;
3694 }
4cbb3c63 3695 target_pages++;
1aa83678 3696 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3697 /*
28abd200
DDAG
3698 * Postcopy requires that we place whole host pages atomically;
3699 * these may be huge pages for RAMBlocks that are backed by
3700 * hugetlbfs.
a7180877
DDAG
3701 * To make it atomic, the data is read into a temporary page
3702 * that's moved into place later.
3703 * The migration protocol uses possibly smaller target pages;
3704 * however, the source ensures it always sends all the components
91ba442f 3705 * of a host page in one chunk.
a7180877
DDAG
3706 */
3707 page_buffer = postcopy_host_page +
6a23f639
DH
3708 host_page_offset_from_ram_block_offset(block, addr);
3709 /* If all TP are zero then we can optimise the place */
e5e73b0f 3710 if (target_pages == 1) {
6a23f639
DH
3711 host_page = host_page_from_ram_block_offset(block, addr);
3712 } else if (host_page != host_page_from_ram_block_offset(block,
3713 addr)) {
c53b7ddc 3714 /* not the 1st TP within the HP */
6a23f639
DH
3715 error_report("Non-same host page %p/%p", host_page,
3716 host_page_from_ram_block_offset(block, addr));
3717 ret = -EINVAL;
3718 break;
a7180877
DDAG
3719 }
3720
3721 /*
3722 * If it's the last part of a host page then we place the host
3723 * page
3724 */
4cbb3c63
WY
3725 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3726 place_needed = true;
4cbb3c63 3727 }
a7180877
DDAG
3728 place_source = postcopy_host_page;
3729 }
3730
3731 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3732 case RAM_SAVE_FLAG_ZERO:
a7180877 3733 ch = qemu_get_byte(f);
2e36bc1b
WY
3734 /*
3735 * We can skip filling page_buffer when this is a zero page
3736 * and (block->page_size == TARGET_PAGE_SIZE).
3737 */
3738 if (ch || !matches_target_page_size) {
3739 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3740 }
a7180877
DDAG
3741 if (ch) {
3742 all_zero = false;
3743 }
3744 break;
3745
3746 case RAM_SAVE_FLAG_PAGE:
3747 all_zero = false;
1aa83678
PX
3748 if (!matches_target_page_size) {
3749 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
3750 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3751 } else {
1aa83678
PX
3752 /*
3753 * For small pages that matches target page size, we
3754 * avoid the qemu_file copy. Instead we directly use
3755 * the buffer of QEMUFile to place the page. Note: we
3756 * cannot do any QEMUFile operation before using that
3757 * buffer to make sure the buffer is valid when
3758 * placing the page.
a7180877
DDAG
3759 */
3760 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3761 TARGET_PAGE_SIZE);
3762 }
3763 break;
644acf99
WY
3764 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3765 all_zero = false;
3766 len = qemu_get_be32(f);
3767 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3768 error_report("Invalid compressed data length: %d", len);
3769 ret = -EINVAL;
3770 break;
3771 }
3772 decompress_data_with_multi_threads(f, page_buffer, len);
3773 break;
3774
a7180877
DDAG
3775 case RAM_SAVE_FLAG_EOS:
3776 /* normal exit */
6df264ac 3777 multifd_recv_sync_main();
a7180877
DDAG
3778 break;
3779 default:
29fccade 3780 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
3781 " (postcopy mode)", flags);
3782 ret = -EINVAL;
7a9ddfbf
PX
3783 break;
3784 }
3785
644acf99
WY
3786 /* Got the whole host page, wait for decompress before placing. */
3787 if (place_needed) {
3788 ret |= wait_for_decompress_done();
3789 }
3790
7a9ddfbf
PX
3791 /* Detect for any possible file errors */
3792 if (!ret && qemu_file_get_error(f)) {
3793 ret = qemu_file_get_error(f);
a7180877
DDAG
3794 }
3795
7a9ddfbf 3796 if (!ret && place_needed) {
a7180877 3797 if (all_zero) {
6a23f639 3798 ret = postcopy_place_page_zero(mis, host_page, block);
a7180877 3799 } else {
6a23f639
DH
3800 ret = postcopy_place_page(mis, host_page, place_source,
3801 block);
a7180877 3802 }
ddf35bdf
DH
3803 place_needed = false;
3804 target_pages = 0;
3805 /* Assume we have a zero page until we detect something different */
3806 all_zero = true;
a7180877 3807 }
a7180877
DDAG
3808 }
3809
3810 return ret;
3811}
3812
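/*
 * Editorial worked example: for a hugetlbfs-backed RAMBlock with 2 MiB
 * host pages and 4 KiB target pages, block->page_size / TARGET_PAGE_SIZE
 * == 512, so the loop above gathers 512 target-page records into
 * mis->postcopy_tmp_page before place_needed turns true and the whole
 * host page is placed atomically (UFFDIO_COPY, or the zero-page path
 * when all_zero held for every component).
 */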
acab30b8
DHB
3813static bool postcopy_is_advised(void)
3814{
3815 PostcopyState ps = postcopy_state_get();
3816 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3817}
3818
3819static bool postcopy_is_running(void)
3820{
3821 PostcopyState ps = postcopy_state_get();
3822 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3823}
3824
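/*
 * Editorial note: both range checks above rely on the declaration order
 * of PostcopyState (assumed here to be NONE, ADVISE, DISCARD, LISTENING,
 * RUNNING, END), so postcopy_is_running() is true exactly for the
 * LISTENING and RUNNING states.
 */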
e6f4aa18
ZC
3825/*
3826 * Flush content of RAM cache into SVM's memory.
3827 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
3828 */
24fa16f8 3829void colo_flush_ram_cache(void)
e6f4aa18
ZC
3830{
3831 RAMBlock *block = NULL;
3832 void *dst_host;
3833 void *src_host;
3834 unsigned long offset = 0;
3835
d1955d22 3836 memory_global_dirty_log_sync();
89ac5a1d
DDAG
3837 WITH_RCU_READ_LOCK_GUARD() {
3838 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3839 ramblock_sync_dirty_bitmap(ram_state, block);
3840 }
d1955d22 3841 }
d1955d22 3842
e6f4aa18 3843 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
3844 WITH_RCU_READ_LOCK_GUARD() {
3845 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 3846
89ac5a1d 3847 while (block) {
a6a83cef 3848 unsigned long num = 0;
e6f4aa18 3849
a6a83cef 3850 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
3851 if (!offset_in_ramblock(block,
3852 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 3853 offset = 0;
a6a83cef 3854 num = 0;
89ac5a1d
DDAG
3855 block = QLIST_NEXT_RCU(block, next);
3856 } else {
a6a83cef
RL
3857 unsigned long i = 0;
3858
3859 for (i = 0; i < num; i++) {
3860 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3861 }
8bba004c
AR
3862 dst_host = block->host
3863 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3864 src_host = block->colo_cache
3865 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
3866 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3867 offset += num;
89ac5a1d 3868 }
e6f4aa18
ZC
3869 }
3870 }
e6f4aa18
ZC
3871 trace_colo_flush_ram_cache_end();
3872}
3873
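/*
 * Editorial worked example: with 4 KiB target pages (TARGET_PAGE_BITS ==
 * 12), a dirty run reported by colo_bitmap_find_dirty() as offset == 0x40,
 * num == 4 copies 16 KiB from block->colo_cache + 0x40000 to
 * block->host + 0x40000 in one memcpy(), then the scan resumes at
 * offset 0x44.
 */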
10da4a36
WY
3874/**
3875 * ram_load_precopy: load pages in precopy case
3876 *
3877 * Returns 0 for success or -errno in case of error
3878 *
3879 * Called in precopy mode by ram_load().
3880 * rcu_read_lock is taken prior to this being called.
3881 *
3883 * @f: QEMUFile from which to receive the data
3883 */
3884static int ram_load_precopy(QEMUFile *f)
56e93d26 3885{
e65cec5e 3886 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 3887 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 3888 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
3889 if (!migrate_use_compression()) {
3890 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3891 }
a7180877 3892
10da4a36 3893 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3894 ram_addr_t addr, total_ram_bytes;
0393031a 3895 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
3896 uint8_t ch;
3897
e65cec5e
YK
3898 /*
3899 * Yield periodically to let the main loop run, but an iteration of
3900 * the main loop is expensive, so only do it every so many iterations.
3901 */
3902 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3903 aio_co_schedule(qemu_get_current_aio_context(),
3904 qemu_coroutine_self());
3905 qemu_coroutine_yield();
3906 }
3907 i++;
3908
56e93d26
JQ
3909 addr = qemu_get_be64(f);
3910 flags = addr & ~TARGET_PAGE_MASK;
3911 addr &= TARGET_PAGE_MASK;
3912
edc60127
JQ
3913 if (flags & invalid_flags) {
3914 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3915 error_report("Received an unexpected compressed page");
3916 }
3917
3918 ret = -EINVAL;
3919 break;
3920 }
3921
bb890ed5 3922 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3923 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
3924 RAMBlock *block = ram_block_from_stream(f, flags);
3925
0393031a 3926 host = host_from_ram_block_offset(block, addr);
13af18f2 3927 /*
0393031a
HZ
3928 * After entering the COLO stage, we should not load pages into the
3929 * SVM's memory directly; we put them into colo_cache first.
3930 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3931 * Previously, we copied all this memory in the COLO preparation
3932 * stage, during which the VM had to be stopped, a time-consuming
3933 * process. Here we optimize it by backing up every page during the
3934 * migration process while COLO is enabled. Although this slows the
3935 * migration down slightly, it clearly reduces the downtime compared
3936 * to backing up all of the SVM's memory in the COLO preparation stage.
13af18f2 3937 */
0393031a
HZ
3938 if (migration_incoming_colo_enabled()) {
3939 if (migration_incoming_in_colo_state()) {
3940 /* In COLO stage, put all pages into cache temporarily */
8af66371 3941 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
3942 } else {
3943 /*
3944 * In migration stage but before COLO stage,
3945 * Put all pages into both cache and SVM's memory.
3946 */
8af66371 3947 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 3948 }
13af18f2 3949 }
a776aa15
DDAG
3950 if (!host) {
3951 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3952 ret = -EINVAL;
3953 break;
3954 }
13af18f2
ZC
3955 if (!migration_incoming_in_colo_state()) {
3956 ramblock_recv_bitmap_set(block, host);
3957 }
3958
1db9d8e5 3959 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
3960 }
3961
56e93d26
JQ
3962 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3963 case RAM_SAVE_FLAG_MEM_SIZE:
3964 /* Synchronize RAM block list */
3965 total_ram_bytes = addr;
3966 while (!ret && total_ram_bytes) {
3967 RAMBlock *block;
56e93d26
JQ
3968 char id[256];
3969 ram_addr_t length;
3970
3971 len = qemu_get_byte(f);
3972 qemu_get_buffer(f, (uint8_t *)id, len);
3973 id[len] = 0;
3974 length = qemu_get_be64(f);
3975
e3dd7493 3976 block = qemu_ram_block_by_name(id);
b895de50
CLG
3977 if (block && !qemu_ram_is_migratable(block)) {
3978 error_report("block %s should not be migrated !", id);
3979 ret = -EINVAL;
3980 } else if (block) {
e3dd7493
DDAG
3981 if (length != block->used_length) {
3982 Error *local_err = NULL;
56e93d26 3983
fa53a0e5 3984 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
3985 &local_err);
3986 if (local_err) {
3987 error_report_err(local_err);
56e93d26 3988 }
56e93d26 3989 }
ef08fb38 3990 /* For postcopy we need to check hugepage sizes match */
e846b746 3991 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
3992 block->page_size != qemu_host_page_size) {
3993 uint64_t remote_page_size = qemu_get_be64(f);
3994 if (remote_page_size != block->page_size) {
3995 error_report("Mismatched RAM page size %s "
3996 "(local) %zd != %" PRId64,
3997 id, block->page_size,
3998 remote_page_size);
3999 ret = -EINVAL;
4000 }
4001 }
fbd162e6
YK
4002 if (migrate_ignore_shared()) {
4003 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4004 if (ramblock_is_ignored(block) &&
4005 block->mr->addr != addr) {
4006 error_report("Mismatched GPAs for block %s "
4007 "%" PRId64 "!= %" PRId64,
4008 id, (uint64_t)addr,
4009 (uint64_t)block->mr->addr);
4010 ret = -EINVAL;
4011 }
4012 }
e3dd7493
DDAG
4013 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4014 block->idstr);
4015 } else {
56e93d26
JQ
4016 error_report("Unknown ramblock \"%s\", cannot "
4017 "accept migration", id);
4018 ret = -EINVAL;
4019 }
4020
4021 total_ram_bytes -= length;
4022 }
4023 break;
a776aa15 4024
bb890ed5 4025 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4026 ch = qemu_get_byte(f);
4027 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4028 break;
a776aa15 4029
56e93d26 4030 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4031 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4032 break;
56e93d26 4033
a776aa15 4034 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4035 len = qemu_get_be32(f);
4036 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4037 error_report("Invalid compressed data length: %d", len);
4038 ret = -EINVAL;
4039 break;
4040 }
c1bc6626 4041 decompress_data_with_multi_threads(f, host, len);
56e93d26 4042 break;
a776aa15 4043
56e93d26 4044 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4045 if (load_xbzrle(f, addr, host) < 0) {
4046 error_report("Failed to decompress XBZRLE page at "
4047 RAM_ADDR_FMT, addr);
4048 ret = -EINVAL;
4049 break;
4050 }
4051 break;
4052 case RAM_SAVE_FLAG_EOS:
4053 /* normal exit */
6df264ac 4054 multifd_recv_sync_main();
56e93d26
JQ
4055 break;
4056 default:
4057 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4058 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4059 } else {
29fccade 4060 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
4061 flags);
4062 ret = -EINVAL;
4063 }
4064 }
4065 if (!ret) {
4066 ret = qemu_file_get_error(f);
4067 }
0393031a
HZ
4068 if (!ret && host_bak) {
4069 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4070 }
56e93d26
JQ
4071 }
4072
ca1a6b70 4073 ret |= wait_for_decompress_done();
10da4a36
WY
4074 return ret;
4075}
4076
4077static int ram_load(QEMUFile *f, void *opaque, int version_id)
4078{
4079 int ret = 0;
4080 static uint64_t seq_iter;
4081 /*
4082 * If system is running in postcopy mode, page inserts to host memory must
4083 * be atomic
4084 */
4085 bool postcopy_running = postcopy_is_running();
4086
4087 seq_iter++;
4088
4089 if (version_id != 4) {
4090 return -EINVAL;
4091 }
4092
4093 /*
4094 * This RCU critical section can be very long running.
4095 * When RCU reclamations in the code start to become numerous,
4096 * it will be necessary to reduce the granularity of this
4097 * critical section.
4098 */
89ac5a1d
DDAG
4099 WITH_RCU_READ_LOCK_GUARD() {
4100 if (postcopy_running) {
4101 ret = ram_load_postcopy(f);
4102 } else {
4103 ret = ram_load_precopy(f);
4104 }
10da4a36 4105 }
55c4446b 4106 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4107
56e93d26
JQ
4108 return ret;
4109}
4110
c6467627
VSO
4111static bool ram_has_postcopy(void *opaque)
4112{
469dd51b 4113 RAMBlock *rb;
fbd162e6 4114 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4115 if (ramblock_is_pmem(rb)) {
4116 info_report("Block: %s, host: %p is nvdimm memory, postcopy "
4117 "is not supported now!", rb->idstr, rb->host);
4118 return false;
4119 }
4120 }
4121
c6467627
VSO
4122 return migrate_postcopy_ram();
4123}
4124
edd090c7
PX
4125/* Sync all the dirty bitmap with destination VM. */
4126static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4127{
4128 RAMBlock *block;
4129 QEMUFile *file = s->to_dst_file;
4130 int ramblock_count = 0;
4131
4132 trace_ram_dirty_bitmap_sync_start();
4133
fbd162e6 4134 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4135 qemu_savevm_send_recv_bitmap(file, block->idstr);
4136 trace_ram_dirty_bitmap_request(block->idstr);
4137 ramblock_count++;
4138 }
4139
4140 trace_ram_dirty_bitmap_sync_wait();
4141
4142 /* Wait until all the ramblocks' dirty bitmap synced */
4143 while (ramblock_count--) {
4144 qemu_sem_wait(&s->rp_state.rp_sem);
4145 }
4146
4147 trace_ram_dirty_bitmap_sync_complete();
4148
4149 return 0;
4150}
4151
4152static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4153{
4154 qemu_sem_post(&s->rp_state.rp_sem);
4155}
4156
a335debb
PX
4157/*
4158 * Read the received bitmap, revert it as the initial dirty bitmap.
4159 * This is only used when the postcopy migration is paused but wants
4160 * to resume from a middle point.
4161 */
4162int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4163{
4164 int ret = -EINVAL;
43044ac0 4165 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4166 QEMUFile *file = s->rp_state.from_dst_file;
4167 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4168 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4169 uint64_t size, end_mark;
4170
4171 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4172
4173 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4174 error_report("%s: incorrect state %s", __func__,
4175 MigrationStatus_str(s->state));
4176 return -EINVAL;
4177 }
4178
4179 /*
4180 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4181 * need the endianness conversion, and the paddings.
a335debb
PX
4182 */
4183 local_size = ROUND_UP(local_size, 8);
4184
4185 /* Add paddings */
4186 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4187
4188 size = qemu_get_be64(file);
4189
4190 /* The size of the bitmap should match with our ramblock */
4191 if (size != local_size) {
4192 error_report("%s: ramblock '%s' bitmap size mismatch "
4193 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4194 block->idstr, size, local_size);
4195 ret = -EINVAL;
4196 goto out;
4197 }
4198
4199 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4200 end_mark = qemu_get_be64(file);
4201
4202 ret = qemu_file_get_error(file);
4203 if (ret || size != local_size) {
4204 error_report("%s: read bitmap failed for ramblock '%s': %d"
4205 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4206 __func__, block->idstr, ret, local_size, size);
4207 ret = -EIO;
4208 goto out;
4209 }
4210
4211 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4212 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4213 __func__, block->idstr, end_mark);
4214 ret = -EINVAL;
4215 goto out;
4216 }
4217
4218 /*
3a4452d8 4219 * Endianness conversion. We are during postcopy (though paused).
a335debb
PX
4220 * The dirty bitmap won't change. We can directly modify it.
4221 */
4222 bitmap_from_le(block->bmap, le_bitmap, nbits);
4223
4224 /*
4225 * What we received is "received bitmap". Revert it as the initial
4226 * dirty bitmap for this ramblock.
4227 */
4228 bitmap_complement(block->bmap, block->bmap, nbits);
4229
be39b4cd
DH
4230 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4231 ramblock_dirty_bitmap_clear_discarded_pages(block);
4232
4233 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4234 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4235
edd090c7
PX
4236 /*
4237 * We succeeded in syncing the bitmap for the current ramblock. If this is
4238 * the last one to sync, we need to notify the main send thread.
4239 */
4240 ram_dirty_bitmap_reload_notify(s);
4241
a335debb
PX
4242 ret = 0;
4243out:
bf269906 4244 g_free(le_bitmap);
a335debb
PX
4245 return ret;
4246}
4247
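/*
 * Editorial sketch of the return-path stream layout consumed above (as
 * produced by ramblock_recv_bitmap_send() on the destination):
 *
 *     be64   size        bitmap payload length, rounded up to 8 bytes
 *     bytes  le_bitmap   used_length >> TARGET_PAGE_BITS bits, little
 *                        endian, padded out to `size' bytes
 *     be64   end_mark    must equal RAMBLOCK_RECV_BITMAP_ENDING
 *
 * The received bits mean "page already received", hence the
 * bitmap_complement() above to turn them back into pages still to send.
 */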
edd090c7
PX
4248static int ram_resume_prepare(MigrationState *s, void *opaque)
4249{
4250 RAMState *rs = *(RAMState **)opaque;
08614f34 4251 int ret;
edd090c7 4252
08614f34
PX
4253 ret = ram_dirty_bitmap_sync_all(s, rs);
4254 if (ret) {
4255 return ret;
4256 }
4257
4258 ram_state_resume_prepare(rs, s->to_dst_file);
4259
4260 return 0;
edd090c7
PX
4261}
4262
56e93d26 4263static SaveVMHandlers savevm_ram_handlers = {
9907e842 4264 .save_setup = ram_save_setup,
56e93d26 4265 .save_live_iterate = ram_save_iterate,
763c906b 4266 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4267 .save_live_complete_precopy = ram_save_complete,
c6467627 4268 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4269 .save_live_pending = ram_save_pending,
4270 .load_state = ram_load,
f265e0e4
JQ
4271 .save_cleanup = ram_save_cleanup,
4272 .load_setup = ram_load_setup,
4273 .load_cleanup = ram_load_cleanup,
edd090c7 4274 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4275};
4276
c7c0e724
DH
4277static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4278 size_t old_size, size_t new_size)
4279{
cc61c703 4280 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4281 ram_addr_t offset;
4282 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4283 Error *err = NULL;
4284
4285 if (ramblock_is_ignored(rb)) {
4286 return;
4287 }
4288
4289 if (!migration_is_idle()) {
4290 /*
4291 * Precopy code on the source cannot deal with the size of RAM blocks
4292 * changing at random points in time - especially after sending the
4293 * RAM block sizes in the migration stream, they must no longer change.
4294 * Abort and indicate a proper reason.
4295 */
4296 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4297 migration_cancel(err);
c7c0e724 4298 error_free(err);
c7c0e724 4299 }
cc61c703
DH
4300
4301 switch (ps) {
4302 case POSTCOPY_INCOMING_ADVISE:
4303 /*
4304 * Update what ram_postcopy_incoming_init()->init_range() does at the
4305 * time postcopy was advised. Syncing RAM blocks with the source will
4306 * result in RAM resizes.
4307 */
4308 if (old_size < new_size) {
4309 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4310 error_report("RAM block '%s' discard of resized RAM failed",
4311 rb->idstr);
4312 }
4313 }
898ba906 4314 rb->postcopy_length = new_size;
cc61c703
DH
4315 break;
4316 case POSTCOPY_INCOMING_NONE:
4317 case POSTCOPY_INCOMING_RUNNING:
4318 case POSTCOPY_INCOMING_END:
4319 /*
4320 * Once our guest is running, postcopy no longer cares about
4321 * resizes. When growing, the new memory was not available on the
4322 * source, no handler needed.
4323 */
4324 break;
4325 default:
4326 error_report("RAM block '%s' resized during postcopy state: %d",
4327 rb->idstr, ps);
4328 exit(-1);
4329 }
c7c0e724
DH
4330}
4331
4332static RAMBlockNotifier ram_mig_ram_notifier = {
4333 .ram_block_resized = ram_mig_ram_block_resized,
4334};
4335
56e93d26
JQ
4336void ram_mig_init(void)
4337{
4338 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4339 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4340 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4341}
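/*
 * Editorial sketch, hypothetical code: any other live-migratable state
 * can hook into the same machinery by filling a SaveVMHandlers and
 * registering it the way ram_mig_init() does above.  "foo", the foo_*
 * callbacks and foo_state are placeholders.
 *
 *     static SaveVMHandlers savevm_foo_handlers = {
 *         .save_setup = foo_save_setup,
 *         .save_live_iterate = foo_save_iterate,
 *         .save_live_complete_precopy = foo_save_complete,
 *         .load_state = foo_load,
 *     };
 *
 *     void foo_mig_init(void)
 *     {
 *         register_savevm_live("foo", 0, 1, &savevm_foo_handlers,
 *                              &foo_state);
 *     }
 */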