migration/ram.c (qemu.git)
migration: Move temp page setup and cleanup into separate functions
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <[email protected]>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
f348b6d1 30#include "qemu/cutils.h"
56e93d26
JQ
31#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
7205c9ec 33#include "qemu/main-loop.h"
709e3fe8 34#include "xbzrle.h"
7b1e1a22 35#include "ram.h"
6666c96a 36#include "migration.h"
f2a8f0a6 37#include "migration/register.h"
7b1e1a22 38#include "migration/misc.h"
08a0aee1 39#include "qemu-file.h"
be07b0ac 40#include "postcopy-ram.h"
53d37d36 41#include "page_cache.h"
56e93d26 42#include "qemu/error-report.h"
e688df6b 43#include "qapi/error.h"
ab7cbb0b 44#include "qapi/qapi-types-migration.h"
9af23989 45#include "qapi/qapi-events-migration.h"
8acabf69 46#include "qapi/qmp/qerror.h"
56e93d26 47#include "trace.h"
56e93d26 48#include "exec/ram_addr.h"
f9494614 49#include "exec/target_page.h"
56e93d26 50#include "qemu/rcu_queue.h"
a91246c9 51#include "migration/colo.h"
53d37d36 52#include "block.h"
b0c3cf94 53#include "sysemu/cpu-throttle.h"
edd090c7 54#include "savevm.h"
b9ee2f7d 55#include "qemu/iov.h"
d32ca5ad 56#include "multifd.h"
278e2f55
AG
57#include "sysemu/runstate.h"
58
e5fdf920
LS
59#include "hw/boards.h" /* for machine_dump_guest_core() */
60
278e2f55
AG
61#if defined(__linux__)
62#include "qemu/userfaultfd.h"
63#endif /* defined(__linux__) */
56e93d26 64
56e93d26
JQ
65/***********************************************************/
66/* ram save/restore */
67
bb890ed5
JQ
68/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
69 * worked for pages that were filled with the same char. We switched
70 * it to only search for the zero value, and renamed it to avoid
71 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
72 */
73
56e93d26 74#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 75#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
76#define RAM_SAVE_FLAG_MEM_SIZE 0x04
77#define RAM_SAVE_FLAG_PAGE 0x08
78#define RAM_SAVE_FLAG_EOS 0x10
79#define RAM_SAVE_FLAG_CONTINUE 0x20
80#define RAM_SAVE_FLAG_XBZRLE 0x40
81/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
82#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
83
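/*
 * Illustrative sketch (not part of the original file): RAM offsets sent on
 * the wire are TARGET_PAGE_SIZE aligned, so their low bits are zero and the
 * RAM_SAVE_FLAG_* values above can be OR-ed into the same 64-bit word, as
 * save_page_header() does further down.
 */
static inline uint64_t example_encode_offset_and_flags(ram_addr_t offset,
                                                       uint64_t flags)
{
    /* e.g. example_encode_offset_and_flags(0x2000, RAM_SAVE_FLAG_ZERO) */
    return (uint64_t)offset | flags;
}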
9360447d
JQ
84XBZRLECacheStats xbzrle_counters;
85
56e93d26
JQ
86/* struct contains XBZRLE cache and a static page
87 used by the compression */
88static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
c00e0928
JQ
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
f265e0e4
JQ
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
56e93d26
JQ
100} XBZRLE;
101
56e93d26
JQ
102static void XBZRLE_cache_lock(void)
103{
f4c51a6b 104 if (migrate_use_xbzrle()) {
56e93d26 105 qemu_mutex_lock(&XBZRLE.lock);
f4c51a6b 106 }
56e93d26
JQ
107}
108
109static void XBZRLE_cache_unlock(void)
110{
f4c51a6b 111 if (migrate_use_xbzrle()) {
56e93d26 112 qemu_mutex_unlock(&XBZRLE.lock);
f4c51a6b 113 }
56e93d26
JQ
114}
115
3d0684b2
JQ
116/**
117 * xbzrle_cache_resize: resize the xbzrle cache
118 *
cbde7be9 119 * This function is called from migrate_params_apply in main
3d0684b2
JQ
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
123 *
c9dede2d 124 * Returns 0 for success or -1 for error
3d0684b2
JQ
125 *
126 * @new_size: new cache size
8acabf69 127 * @errp: set *errp if the check failed, with reason
56e93d26 128 */
8b9407a0 129int xbzrle_cache_resize(uint64_t new_size, Error **errp)
56e93d26
JQ
130{
131 PageCache *new_cache;
c9dede2d 132 int64_t ret = 0;
56e93d26 133
8acabf69
JQ
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
139 }
140
2a313e5c
JQ
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
c9dede2d 143 return 0;
2a313e5c
JQ
144 }
145
56e93d26
JQ
146 XBZRLE_cache_lock();
147
148 if (XBZRLE.cache != NULL) {
80f8dfde 149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 150 if (!new_cache) {
56e93d26
JQ
151 ret = -1;
152 goto out;
153 }
154
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
157 }
56e93d26
JQ
158out:
159 XBZRLE_cache_unlock();
160 return ret;
161}
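/*
 * Usage sketch (assumed caller shape, not copied from this file): the resize
 * is driven from the main thread while the migration parameters are applied,
 * roughly like:
 *
 *     Error *local_err = NULL;
 *     if (xbzrle_cache_resize(params->xbzrle_cache_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *     }
 */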
162
3ded54b1 163bool ramblock_is_ignored(RAMBlock *block)
fbd162e6
YK
164{
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
167}
168
343f632c
DDAG
169#undef RAMBLOCK_FOREACH
170
fbd162e6
YK
171int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
172{
173 RAMBlock *block;
174 int ret = 0;
175
89ac5a1d
DDAG
176 RCU_READ_LOCK_GUARD();
177
fbd162e6
YK
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
182 }
183 }
fbd162e6
YK
184 return ret;
185}
186
f9494614
AP
187static void ramblock_recv_map_init(void)
188{
189 RAMBlock *rb;
190
fbd162e6 191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
194 }
195}
196
197int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
198{
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
201}
202
1cba9f6e
DDAG
203bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
204{
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
206}
207
f9494614
AP
208void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
209{
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
211}
212
213void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
215{
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
219}
220
a335debb
PX
221#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
222
223/*
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
225 *
226 * Returns the number of bytes sent (>0) on success, or <0 on error.
227 */
228int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
230{
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
234
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
238 }
239
898ba906 240 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
a335debb
PX
241
242 /*
243 * Make sure the tmp bitmap buffer is big enough; e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see the comment
245 * below), so extend it a bit beforehand.
246 */
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
248
249 /*
250 * Always use little endian when sending the bitmap, so that it
251 * works even when the source and destination VMs are not using the
3a4452d8 252 * same endianness. (Note: big endian won't work.)
a335debb
PX
253 */
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
255
256 /* Size of the bitmap, in bytes */
a725ef9f 257 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
258
259 /*
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
264 */
265 size = ROUND_UP(size, 8);
266
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
269 /*
270 * Mark as an end, in case the middle part is screwed up due to
3a4452d8 271 * some "mysterious" reason.
a335debb
PX
272 */
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
275
bf269906 276 g_free(le_bitmap);
a335debb
PX
277
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
280 }
281
282 return size + sizeof(size);
283}
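/*
 * Illustrative counterpart (a simplified sketch, not the code QEMU actually
 * runs on the destination): a reader of the stream written above consumes
 * the padded size word, the little-endian bitmap and the ending marker, in
 * that order.
 */
static int example_recv_bitmap_read(QEMUFile *file, uint8_t *le_bitmap,
                                    uint64_t expected_size)
{
    uint64_t size = qemu_get_be64(file);    /* size, already ROUND_UP'ed to 8 */
    uint64_t end_mark;

    if (size != expected_size) {
        return -1;
    }
    qemu_get_buffer(file, le_bitmap, size); /* the bitmap itself */
    end_mark = qemu_get_be64(file);         /* RAMBLOCK_RECV_BITMAP_ENDING */
    return end_mark == RAMBLOCK_RECV_BITMAP_ENDING ? 0 : -1;
}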
284
ec481c6c
JQ
285/*
286 * An outstanding page request, on the source, having been received
287 * and queued
288 */
289struct RAMSrcPageRequest {
290 RAMBlock *rb;
291 hwaddr offset;
292 hwaddr len;
293
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
295};
296
6f37bb8b
JQ
297/* State of RAM for migration */
298struct RAMState {
204b88b8
JQ
299 /* QEMUFile used for this migration */
300 QEMUFile *f;
278e2f55
AG
301 /* UFFD file descriptor, used in 'write-tracking' migration */
302 int uffdio_fd;
6f37bb8b
JQ
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock *last_seen_block;
305 /* Last block from where we have sent data */
306 RAMBlock *last_sent_block;
269ace29
JQ
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page;
6f37bb8b
JQ
309 /* last ram version we have seen */
310 uint32_t last_version;
8d820d6f
JQ
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt;
f664da80
JQ
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
eac74159 316 /* bytes transferred at start_time */
c4bdf0cf 317 uint64_t bytes_xfer_prev;
a66cd90c 318 /* number of dirty pages since start_time */
68908ed6 319 uint64_t num_dirty_pages_period;
b5833fde
JQ
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
e460a4b1
WW
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
1a373522
DH
326 /* Start using XBZRLE (e.g., after the first round). */
327 bool xbzrle_enabled;
05931ec5
JQ
328 /* Are we on the last stage of migration */
329 bool last_stage;
76e03000
XG
330 /* compression statistics since the beginning of the period */
331 /* amount of count that no free thread to compress data */
332 uint64_t compress_thread_busy_prev;
333 /* amount bytes after compression */
334 uint64_t compressed_size_prev;
335 /* amount of compressed pages */
336 uint64_t compress_pages_prev;
337
be8b02ed
XG
338 /* total handled target pages at the beginning of period */
339 uint64_t target_page_count_prev;
340 /* total handled target pages since start */
341 uint64_t target_page_count;
9360447d 342 /* number of dirty bits in the bitmap */
2dfaf12e 343 uint64_t migration_dirty_pages;
386a907b 344 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 345 QemuMutex bitmap_mutex;
68a098f3
JQ
346 /* The RAMBlock used in the last src_page_requests */
347 RAMBlock *last_req_rb;
ec481c6c
JQ
348 /* Queue of outstanding page requests from the destination */
349 QemuMutex src_page_req_mutex;
b58deb34 350 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
351};
352typedef struct RAMState RAMState;
353
53518d94 354static RAMState *ram_state;
6f37bb8b 355
bd227060
WW
356static NotifierWithReturnList precopy_notifier_list;
357
a1fe28df
PX
358/* Whether postcopy has queued requests? */
359static bool postcopy_has_request(RAMState *rs)
360{
361 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
362}
363
bd227060
WW
364void precopy_infrastructure_init(void)
365{
366 notifier_with_return_list_init(&precopy_notifier_list);
367}
368
369void precopy_add_notifier(NotifierWithReturn *n)
370{
371 notifier_with_return_list_add(&precopy_notifier_list, n);
372}
373
374void precopy_remove_notifier(NotifierWithReturn *n)
375{
376 notifier_with_return_remove(n);
377}
378
379int precopy_notify(PrecopyNotifyReason reason, Error **errp)
380{
381 PrecopyNotifyData pnd;
382 pnd.reason = reason;
383 pnd.errp = errp;
384
385 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
386}
387
9edabd4d 388uint64_t ram_bytes_remaining(void)
2f4fde93 389{
bae416e5
DDAG
390 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
391 0;
2f4fde93
JQ
392}
393
9360447d 394MigrationStats ram_counters;
96506894 395
4c2d0f6d
DE
396static void ram_transferred_add(uint64_t bytes)
397{
ae680668
DE
398 if (runstate_is_running()) {
399 ram_counters.precopy_bytes += bytes;
400 } else if (migration_in_postcopy()) {
401 ram_counters.postcopy_bytes += bytes;
402 } else {
403 ram_counters.downtime_bytes += bytes;
404 }
4c2d0f6d
DE
405 ram_counters.transferred += bytes;
406}
407
b8fb8cb7
DDAG
408/* used by the search for pages to send */
409struct PageSearchStatus {
410 /* Current block being searched */
411 RAMBlock *block;
a935e30f
JQ
412 /* Current page to search from */
413 unsigned long page;
b8fb8cb7
DDAG
414 /* Set once we wrap around */
415 bool complete_round;
416};
417typedef struct PageSearchStatus PageSearchStatus;
418
76e03000
XG
419CompressionStats compression_counters;
420
56e93d26 421struct CompressParam {
56e93d26 422 bool done;
90e56fb4 423 bool quit;
5e5fdcff 424 bool zero_page;
56e93d26
JQ
425 QEMUFile *file;
426 QemuMutex mutex;
427 QemuCond cond;
428 RAMBlock *block;
429 ram_addr_t offset;
34ab9e97
XG
430
431 /* internally used fields */
dcaf446e 432 z_stream stream;
34ab9e97 433 uint8_t *originbuf;
56e93d26
JQ
434};
435typedef struct CompressParam CompressParam;
436
437struct DecompressParam {
73a8912b 438 bool done;
90e56fb4 439 bool quit;
56e93d26
JQ
440 QemuMutex mutex;
441 QemuCond cond;
442 void *des;
d341d9f3 443 uint8_t *compbuf;
56e93d26 444 int len;
797ca154 445 z_stream stream;
56e93d26
JQ
446};
447typedef struct DecompressParam DecompressParam;
448
449static CompressParam *comp_param;
450static QemuThread *compress_threads;
451/* comp_done_cond is used to wake up the migration thread when
452 * one of the compression threads has finished the compression.
453 * comp_done_lock is used together with comp_done_cond.
454 */
0d9f9a5c
LL
455static QemuMutex comp_done_lock;
456static QemuCond comp_done_cond;
56e93d26
JQ
457/* The empty QEMUFileOps will be used by file in CompressParam */
458static const QEMUFileOps empty_ops = { };
459
34ab9e97 460static QEMUFile *decomp_file;
56e93d26
JQ
461static DecompressParam *decomp_param;
462static QemuThread *decompress_threads;
73a8912b
LL
463static QemuMutex decomp_done_lock;
464static QemuCond decomp_done_cond;
56e93d26 465
5e5fdcff 466static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 467 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
468
469static void *do_data_compress(void *opaque)
470{
471 CompressParam *param = opaque;
a7a9a88f
LL
472 RAMBlock *block;
473 ram_addr_t offset;
5e5fdcff 474 bool zero_page;
56e93d26 475
a7a9a88f 476 qemu_mutex_lock(&param->mutex);
90e56fb4 477 while (!param->quit) {
a7a9a88f
LL
478 if (param->block) {
479 block = param->block;
480 offset = param->offset;
481 param->block = NULL;
482 qemu_mutex_unlock(&param->mutex);
483
5e5fdcff
XG
484 zero_page = do_compress_ram_page(param->file, &param->stream,
485 block, offset, param->originbuf);
a7a9a88f 486
0d9f9a5c 487 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 488 param->done = true;
5e5fdcff 489 param->zero_page = zero_page;
0d9f9a5c
LL
490 qemu_cond_signal(&comp_done_cond);
491 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
492
493 qemu_mutex_lock(&param->mutex);
494 } else {
56e93d26
JQ
495 qemu_cond_wait(&param->cond, &param->mutex);
496 }
56e93d26 497 }
a7a9a88f 498 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
499
500 return NULL;
501}
502
f0afa331 503static void compress_threads_save_cleanup(void)
56e93d26
JQ
504{
505 int i, thread_count;
506
05306935 507 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
508 return;
509 }
05306935 510
56e93d26
JQ
511 thread_count = migrate_compress_threads();
512 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
513 /*
514 * we use it as an indicator of whether the thread has been
515 * properly initialized or not
516 */
517 if (!comp_param[i].file) {
518 break;
519 }
05306935
FL
520
521 qemu_mutex_lock(&comp_param[i].mutex);
522 comp_param[i].quit = true;
523 qemu_cond_signal(&comp_param[i].cond);
524 qemu_mutex_unlock(&comp_param[i].mutex);
525
56e93d26 526 qemu_thread_join(compress_threads + i);
56e93d26
JQ
527 qemu_mutex_destroy(&comp_param[i].mutex);
528 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 529 deflateEnd(&comp_param[i].stream);
34ab9e97 530 g_free(comp_param[i].originbuf);
dcaf446e
XG
531 qemu_fclose(comp_param[i].file);
532 comp_param[i].file = NULL;
56e93d26 533 }
0d9f9a5c
LL
534 qemu_mutex_destroy(&comp_done_lock);
535 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
536 g_free(compress_threads);
537 g_free(comp_param);
56e93d26
JQ
538 compress_threads = NULL;
539 comp_param = NULL;
56e93d26
JQ
540}
541
dcaf446e 542static int compress_threads_save_setup(void)
56e93d26
JQ
543{
544 int i, thread_count;
545
546 if (!migrate_use_compression()) {
dcaf446e 547 return 0;
56e93d26 548 }
56e93d26
JQ
549 thread_count = migrate_compress_threads();
550 compress_threads = g_new0(QemuThread, thread_count);
551 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
552 qemu_cond_init(&comp_done_cond);
553 qemu_mutex_init(&comp_done_lock);
56e93d26 554 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
555 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
556 if (!comp_param[i].originbuf) {
557 goto exit;
558 }
559
dcaf446e
XG
560 if (deflateInit(&comp_param[i].stream,
561 migrate_compress_level()) != Z_OK) {
34ab9e97 562 g_free(comp_param[i].originbuf);
dcaf446e
XG
563 goto exit;
564 }
565
e110aa91
C
566 /* comp_param[i].file is just used as a dummy buffer to save data,
567 * set its ops to empty.
56e93d26 568 */
c6ad5be7 569 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
56e93d26 570 comp_param[i].done = true;
90e56fb4 571 comp_param[i].quit = false;
56e93d26
JQ
572 qemu_mutex_init(&comp_param[i].mutex);
573 qemu_cond_init(&comp_param[i].cond);
574 qemu_thread_create(compress_threads + i, "compress",
575 do_data_compress, comp_param + i,
576 QEMU_THREAD_JOINABLE);
577 }
dcaf446e
XG
578 return 0;
579
580exit:
581 compress_threads_save_cleanup();
582 return -1;
56e93d26
JQ
583}
584
585/**
3d0684b2 586 * save_page_header: write page header to wire
56e93d26
JQ
587 *
588 * If this is the 1st block, it also writes the block identification
589 *
3d0684b2 590 * Returns the number of bytes written
56e93d26
JQ
591 *
592 * @f: QEMUFile where to send the data
593 * @block: block that contains the page we want to send
594 * @offset: offset inside the block for the page
595 * in the lower bits, it contains flags
596 */
2bf3aa85
JQ
597static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
598 ram_addr_t offset)
56e93d26 599{
9f5f380b 600 size_t size, len;
56e93d26 601
24795694
JQ
602 if (block == rs->last_sent_block) {
603 offset |= RAM_SAVE_FLAG_CONTINUE;
604 }
2bf3aa85 605 qemu_put_be64(f, offset);
56e93d26
JQ
606 size = 8;
607
608 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 609 len = strlen(block->idstr);
2bf3aa85
JQ
610 qemu_put_byte(f, len);
611 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 612 size += 1 + len;
24795694 613 rs->last_sent_block = block;
56e93d26
JQ
614 }
615 return size;
616}
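/*
 * Wire layout produced by save_page_header() (annotation for clarity):
 *
 *   [ offset | RAM_SAVE_FLAG_* : 8 bytes ][ idstr len : 1 byte ][ idstr ]
 *
 * The idstr length byte and the idstr itself are only present when the block
 * changes, i.e. when RAM_SAVE_FLAG_CONTINUE is not set in the offset word.
 */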
617
3d0684b2 618/**
179a8080 619 * mig_throttle_guest_down: throttle down the guest
3d0684b2
JQ
620 *
621 * Reduce amount of guest cpu execution to hopefully slow down memory
622 * writes. If guest dirty memory rate is reduced below the rate at
623 * which we can transfer pages to the destination then we should be
624 * able to complete migration. Some workloads dirty memory way too
625 * fast and will not effectively converge, even with auto-converge.
070afca2 626 */
cbbf8182
KZ
627static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
628 uint64_t bytes_dirty_threshold)
070afca2
JH
629{
630 MigrationState *s = migrate_get_current();
2594f56d 631 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
cbbf8182
KZ
632 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
633 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
4cbc9c7f 634 int pct_max = s->parameters.max_cpu_throttle;
070afca2 635
cbbf8182
KZ
636 uint64_t throttle_now = cpu_throttle_get_percentage();
637 uint64_t cpu_now, cpu_ideal, throttle_inc;
638
070afca2
JH
639 /* We have not started throttling yet. Let's start it. */
640 if (!cpu_throttle_active()) {
641 cpu_throttle_set(pct_initial);
642 } else {
643 /* Throttling already on, just increase the rate */
cbbf8182
KZ
644 if (!pct_tailslow) {
645 throttle_inc = pct_increment;
646 } else {
647 /* Compute the ideal CPU percentage used by Guest, which may
648 * make the dirty rate match the dirty rate threshold. */
649 cpu_now = 100 - throttle_now;
650 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
651 bytes_dirty_period);
652 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
653 }
654 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
070afca2
JH
655 }
656}
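/*
 * Worked example of the "tailslow" branch above (numbers are illustrative):
 * with throttle_now = 20 the guest still gets cpu_now = 80. If the guest
 * dirtied twice as much as could be transferred in the period
 * (bytes_dirty_period == 2 * bytes_dirty_threshold), then
 * cpu_ideal = 80 * (1 / 2) = 40 and throttle_inc = MIN(80 - 40, pct_increment),
 * i.e. the increment is capped so the throttle does not overshoot the point
 * where the dirty rate would already match the threshold.
 */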
657
91fe9a8d
RL
658void mig_throttle_counter_reset(void)
659{
660 RAMState *rs = ram_state;
661
662 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
663 rs->num_dirty_pages_period = 0;
664 rs->bytes_xfer_prev = ram_counters.transferred;
665}
666
3d0684b2
JQ
667/**
668 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
669 *
6f37bb8b 670 * @rs: current RAM state
3d0684b2
JQ
671 * @current_addr: address for the zero page
672 *
673 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
674 * The important thing is that a stale (not-yet-0'd) page be replaced
675 * by the new data.
676 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 677 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 678 */
6f37bb8b 679static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 680{
1a373522 681 if (!rs->xbzrle_enabled) {
56e93d26
JQ
682 return;
683 }
684
685 /* We don't care if this fails to allocate a new cache page
686 * as long as it updated an old one */
c00e0928 687 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 688 ram_counters.dirty_sync_count);
56e93d26
JQ
689}
690
691#define ENCODING_FLAG_XBZRLE 0x1
692
693/**
694 * save_xbzrle_page: compress and send current page
695 *
696 * Returns: 1 means that we wrote the page
697 * 0 means that page is identical to the one already sent
698 * -1 means that xbzrle would be longer than normal
699 *
5a987738 700 * @rs: current RAM state
3d0684b2
JQ
701 * @current_data: pointer to the address of the page contents
702 * @current_addr: addr of the page
56e93d26
JQ
703 * @block: block that contains the page we want to send
704 * @offset: offset inside the block for the page
56e93d26 705 */
204b88b8 706static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 707 ram_addr_t current_addr, RAMBlock *block,
05931ec5 708 ram_addr_t offset)
56e93d26
JQ
709{
710 int encoded_len = 0, bytes_xbzrle;
711 uint8_t *prev_cached_page;
712
9360447d
JQ
713 if (!cache_is_cached(XBZRLE.cache, current_addr,
714 ram_counters.dirty_sync_count)) {
715 xbzrle_counters.cache_miss++;
05931ec5 716 if (!rs->last_stage) {
56e93d26 717 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 718 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
719 return -1;
720 } else {
721 /* update *current_data when the page has been
722 inserted into cache */
723 *current_data = get_cached_data(XBZRLE.cache, current_addr);
724 }
725 }
726 return -1;
727 }
728
e460a4b1
WW
729 /*
730 * Reaching here means the page has hit the xbzrle cache, no matter what
731 * encoding result it is (normal encoding, overflow or skipping the page),
3a4452d8 732 * count the page as encoded. This is used to calculate the encoding rate.
e460a4b1
WW
733 *
734 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
735 * 2nd page turns out to be skipped (i.e. no new bytes written to the
736 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
737 * skipped page included. In this way, the encoding rate can tell if the
738 * guest page is good for xbzrle encoding.
739 */
740 xbzrle_counters.pages++;
56e93d26
JQ
741 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
742
743 /* save current buffer into memory */
744 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
745
746 /* XBZRLE encoding (if there is no overflow) */
747 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
748 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
749 TARGET_PAGE_SIZE);
ca353803
WY
750
751 /*
752 * Update the cache contents, so that it corresponds to the data
753 * sent, in all cases except where we skip the page.
754 */
05931ec5 755 if (!rs->last_stage && encoded_len != 0) {
ca353803
WY
756 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
757 /*
758 * In the case where we couldn't compress, ensure that the caller
759 * sends the data from the cache, since the guest might have
760 * changed the RAM since we copied it.
761 */
762 *current_data = prev_cached_page;
763 }
764
56e93d26 765 if (encoded_len == 0) {
55c4446b 766 trace_save_xbzrle_page_skipping();
56e93d26
JQ
767 return 0;
768 } else if (encoded_len == -1) {
55c4446b 769 trace_save_xbzrle_page_overflow();
9360447d 770 xbzrle_counters.overflow++;
e460a4b1 771 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
56e93d26
JQ
772 return -1;
773 }
774
56e93d26 775 /* Send XBZRLE based compressed page */
2bf3aa85 776 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
777 offset | RAM_SAVE_FLAG_XBZRLE);
778 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
779 qemu_put_be16(rs->f, encoded_len);
780 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 781 bytes_xbzrle += encoded_len + 1 + 2;
e460a4b1
WW
782 /*
783 * Like compressed_size (please see update_compress_thread_counts),
784 * the xbzrle encoded bytes don't count the 8 byte header with
785 * RAM_SAVE_FLAG_CONTINUE.
786 */
787 xbzrle_counters.bytes += bytes_xbzrle - 8;
4c2d0f6d 788 ram_transferred_add(bytes_xbzrle);
56e93d26
JQ
789
790 return 1;
791}
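/*
 * Byte accounting example for the path above (illustrative): a page that
 * XBZRLE-encodes to 100 bytes costs the save_page_header() output (8 bytes,
 * plus the idstr on a block change) + 1 byte ENCODING_FLAG_XBZRLE + 2 bytes
 * encoded length + 100 bytes payload. xbzrle_counters.bytes records
 * bytes_xbzrle - 8, i.e. the cost without the 8-byte header word.
 */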
792
3d0684b2
JQ
793/**
794 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 795 *
a5f7b1a6 796 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 797 *
6f37bb8b 798 * @rs: current RAM state
3d0684b2 799 * @rb: RAMBlock where to search for dirty pages
a935e30f 800 * @start: page where we start the search
f3f491fc 801 */
56e93d26 802static inline
a935e30f 803unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 804 unsigned long start)
56e93d26 805{
6b6712ef
JQ
806 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
807 unsigned long *bitmap = rb->bmap;
56e93d26 808
fbd162e6 809 if (ramblock_is_ignored(rb)) {
b895de50
CLG
810 return size;
811 }
812
1a373522 813 return find_next_bit(bitmap, size, start);
56e93d26
JQ
814}
815
1230a25f 816static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
3143577d
WW
817 unsigned long page)
818{
819 uint8_t shift;
820 hwaddr size, start;
821
822 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
823 return;
824 }
825
826 shift = rb->clear_bmap_shift;
827 /*
828 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
829 * can make things easier sometimes since then start address
830 * of the small chunk will always be 64 pages aligned so the
831 * bitmap will always be aligned to unsigned long. We should
832 * even be able to remove this restriction but I'm simply
833 * keeping it.
834 */
835 assert(shift >= 6);
836
837 size = 1ULL << (TARGET_PAGE_BITS + shift);
7648297d 838 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
3143577d
WW
839 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
840 memory_region_clear_dirty_bitmap(rb->mr, start, size);
841}
842
843static void
1230a25f 844migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
3143577d
WW
845 unsigned long start,
846 unsigned long npages)
847{
848 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
849 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
850 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
851
852 /*
853 * Clear pages from start to start + npages - 1, so the end boundary is
854 * exclusive.
855 */
856 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
1230a25f 857 migration_clear_memory_region_dirty_bitmap(rb, i);
3143577d
WW
858 }
859}
860
a6a83cef
RL
861/*
862 * colo_bitmap_find_dirty: find contiguous dirty pages from start
863 *
864 * Returns the page offset within the memory region of the start of the
865 * contiguous dirty pages
866 *
867 * @rs: current RAM state
868 * @rb: RAMBlock where to search for dirty pages
869 * @start: page where we start the search
870 * @num: the number of contiguous dirty pages
871 */
872static inline
873unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
874 unsigned long start, unsigned long *num)
875{
876 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
877 unsigned long *bitmap = rb->bmap;
878 unsigned long first, next;
879
880 *num = 0;
881
882 if (ramblock_is_ignored(rb)) {
883 return size;
884 }
885
886 first = find_next_bit(bitmap, size, start);
887 if (first >= size) {
888 return first;
889 }
890 next = find_next_zero_bit(bitmap, size, first + 1);
891 assert(next >= first);
892 *num = next - first;
893 return first;
894}
895
06b10688 896static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
897 RAMBlock *rb,
898 unsigned long page)
a82d593b
DDAG
899{
900 bool ret;
a82d593b 901
002cad6b
PX
902 /*
903 * Clear dirty bitmap if needed. This _must_ be called before we
904 * send any of the pages in the chunk, because we need to make sure
905 * we can capture further page content changes when we sync the dirty
906 * log the next time. So as long as we are going to send any of
907 * the pages in the chunk we clear the remote dirty bitmap for all.
908 * Clearing it earlier won't be a problem, but clearing it too late will.
909 */
1230a25f 910 migration_clear_memory_region_dirty_bitmap(rb, page);
002cad6b 911
6b6712ef 912 ret = test_and_clear_bit(page, rb->bmap);
a82d593b 913 if (ret) {
0d8ec885 914 rs->migration_dirty_pages--;
a82d593b 915 }
386a907b 916
a82d593b
DDAG
917 return ret;
918}
919
be39b4cd
DH
920static void dirty_bitmap_clear_section(MemoryRegionSection *section,
921 void *opaque)
922{
923 const hwaddr offset = section->offset_within_region;
924 const hwaddr size = int128_get64(section->size);
925 const unsigned long start = offset >> TARGET_PAGE_BITS;
926 const unsigned long npages = size >> TARGET_PAGE_BITS;
927 RAMBlock *rb = section->mr->ram_block;
928 uint64_t *cleared_bits = opaque;
929
930 /*
931 * We don't grab ram_state->bitmap_mutex because we expect to run
932 * only when starting migration or during postcopy recovery where
933 * we don't have concurrent access.
934 */
935 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
936 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
937 }
938 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
939 bitmap_clear(rb->bmap, start, npages);
940}
941
942/*
943 * Exclude all dirty pages from migration that fall into a discarded range as
944 * managed by a RamDiscardManager responsible for the mapped memory region of
945 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
946 *
947 * Discarded pages ("logically unplugged") have undefined content and must
948 * not get migrated, because even reading these pages for migration might
949 * result in undesired behavior.
950 *
951 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
952 *
953 * Note: The result is only stable while migrating (precopy/postcopy).
954 */
955static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
956{
957 uint64_t cleared_bits = 0;
958
959 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
960 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
961 MemoryRegionSection section = {
962 .mr = rb->mr,
963 .offset_within_region = 0,
964 .size = int128_make64(qemu_ram_get_used_length(rb)),
965 };
966
967 ram_discard_manager_replay_discarded(rdm, &section,
968 dirty_bitmap_clear_section,
969 &cleared_bits);
970 }
971 return cleared_bits;
972}
973
9470c5e0
DH
974/*
975 * Check if a host-page aligned page falls into a discarded range as managed by
976 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
977 *
978 * Note: The result is only stable while migrating (precopy/postcopy).
979 */
980bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
981{
982 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
983 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
984 MemoryRegionSection section = {
985 .mr = rb->mr,
986 .offset_within_region = start,
987 .size = int128_make64(qemu_ram_pagesize(rb)),
988 };
989
990 return !ram_discard_manager_is_populated(rdm, &section);
991 }
992 return false;
993}
994
267691b6 995/* Called with RCU critical section */
7a3e9571 996static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 997{
fb613580
KZ
998 uint64_t new_dirty_pages =
999 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1000
1001 rs->migration_dirty_pages += new_dirty_pages;
1002 rs->num_dirty_pages_period += new_dirty_pages;
56e93d26
JQ
1003}
1004
3d0684b2
JQ
1005/**
1006 * ram_pagesize_summary: calculate all the pagesizes of a VM
1007 *
1008 * Returns a summary bitmap of the page sizes of all RAMBlocks
1009 *
1010 * For VMs with just normal pages this is equivalent to the host page
1011 * size. If the VM has some huge pages then it's the OR of all the
1012 * different page sizes.
e8ca1db2
DDAG
1013 */
1014uint64_t ram_pagesize_summary(void)
1015{
1016 RAMBlock *block;
1017 uint64_t summary = 0;
1018
fbd162e6 1019 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1020 summary |= block->page_size;
1021 }
1022
1023 return summary;
1024}
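/*
 * Example (illustrative): a guest backed by 4 KiB pages plus 2 MiB hugepages
 * yields a summary of 0x1000 | 0x200000 = 0x201000, so callers can tell at a
 * glance that more than one page size is in use.
 */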
1025
aecbfe9c
XG
1026uint64_t ram_get_total_transferred_pages(void)
1027{
1028 return ram_counters.normal + ram_counters.duplicate +
1029 compression_counters.pages + xbzrle_counters.pages;
1030}
1031
b734035b
XG
1032static void migration_update_rates(RAMState *rs, int64_t end_time)
1033{
be8b02ed 1034 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1035 double compressed_size;
b734035b
XG
1036
1037 /* calculate period counters */
1038 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1039 / (end_time - rs->time_last_bitmap_sync);
1040
be8b02ed 1041 if (!page_count) {
b734035b
XG
1042 return;
1043 }
1044
1045 if (migrate_use_xbzrle()) {
e460a4b1
WW
1046 double encoded_size, unencoded_size;
1047
b734035b 1048 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1049 rs->xbzrle_cache_miss_prev) / page_count;
b734035b 1050 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
e460a4b1
WW
1051 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1052 TARGET_PAGE_SIZE;
1053 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
92271402 1054 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
e460a4b1 1055 xbzrle_counters.encoding_rate = 0;
e460a4b1
WW
1056 } else {
1057 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1058 }
1059 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1060 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
b734035b 1061 }
76e03000
XG
1062
1063 if (migrate_use_compression()) {
1064 compression_counters.busy_rate = (double)(compression_counters.busy -
1065 rs->compress_thread_busy_prev) / page_count;
1066 rs->compress_thread_busy_prev = compression_counters.busy;
1067
1068 compressed_size = compression_counters.compressed_size -
1069 rs->compressed_size_prev;
1070 if (compressed_size) {
1071 double uncompressed_size = (compression_counters.pages -
1072 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1073
1074 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1075 compression_counters.compression_rate =
1076 uncompressed_size / compressed_size;
1077
1078 rs->compress_pages_prev = compression_counters.pages;
1079 rs->compressed_size_prev = compression_counters.compressed_size;
1080 }
1081 }
b734035b
XG
1082}
1083
dc14a470
KZ
1084static void migration_trigger_throttle(RAMState *rs)
1085{
1086 MigrationState *s = migrate_get_current();
1087 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1088
1089 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1090 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1091 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1092
1093 /* During block migration the auto-converge logic incorrectly detects
1094 * that ram migration makes no progress. Avoid this by disabling the
1095 * throttling logic during the bulk phase of block migration. */
1096 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1097 /* The following detection logic can be refined later. For now:
1098 Check to see if the ratio between dirtied bytes and the approx.
1099 amount of bytes that just got transferred since the last time
1100 we were in this routine reaches the threshold. If that happens
1101 twice, start or increase throttling. */
1102
1103 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1104 (++rs->dirty_rate_high_cnt >= 2)) {
1105 trace_migration_throttle();
1106 rs->dirty_rate_high_cnt = 0;
cbbf8182
KZ
1107 mig_throttle_guest_down(bytes_dirty_period,
1108 bytes_dirty_threshold);
dc14a470
KZ
1109 }
1110 }
1111}
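/*
 * Numeric example for the trigger above (illustrative): with
 * throttle_trigger_threshold = 50 and 200 MB transferred in the last period,
 * bytes_dirty_threshold is 100 MB. Once the guest has dirtied more than that
 * within a period twice, mig_throttle_guest_down() starts or increases
 * throttling.
 */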
1112
8d820d6f 1113static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1114{
1115 RAMBlock *block;
56e93d26 1116 int64_t end_time;
56e93d26 1117
9360447d 1118 ram_counters.dirty_sync_count++;
56e93d26 1119
f664da80
JQ
1120 if (!rs->time_last_bitmap_sync) {
1121 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1122 }
1123
1124 trace_migration_bitmap_sync_start();
9c1f8f44 1125 memory_global_dirty_log_sync();
56e93d26 1126
108cfae0 1127 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1128 WITH_RCU_READ_LOCK_GUARD() {
1129 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1130 ramblock_sync_dirty_bitmap(rs, block);
1131 }
1132 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1133 }
108cfae0 1134 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1135
9458a9a1 1136 memory_global_after_dirty_log_sync();
a66cd90c 1137 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1138
56e93d26
JQ
1139 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1140
1141 /* more than 1 second = 1000 milliseconds */
f664da80 1142 if (end_time > rs->time_last_bitmap_sync + 1000) {
dc14a470 1143 migration_trigger_throttle(rs);
070afca2 1144
b734035b
XG
1145 migration_update_rates(rs, end_time);
1146
be8b02ed 1147 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1148
1149 /* reset period counters */
f664da80 1150 rs->time_last_bitmap_sync = end_time;
a66cd90c 1151 rs->num_dirty_pages_period = 0;
dc14a470 1152 rs->bytes_xfer_prev = ram_counters.transferred;
56e93d26 1153 }
4addcd4f 1154 if (migrate_use_events()) {
3ab72385 1155 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1156 }
56e93d26
JQ
1157}
1158
bd227060
WW
1159static void migration_bitmap_sync_precopy(RAMState *rs)
1160{
1161 Error *local_err = NULL;
1162
1163 /*
1164 * The current notifier usage is just an optimization to migration, so we
1165 * don't stop the normal migration process in the error case.
1166 */
1167 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1168 error_report_err(local_err);
b4a1733c 1169 local_err = NULL;
bd227060
WW
1170 }
1171
1172 migration_bitmap_sync(rs);
1173
1174 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1175 error_report_err(local_err);
1176 }
1177}
1178
47fe16ff
JQ
1179static void ram_release_page(const char *rbname, uint64_t offset)
1180{
1181 if (!migrate_release_ram() || !migration_in_postcopy()) {
1182 return;
1183 }
1184
1185 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1186}
1187
6c97ec5f
XG
1188/**
1189 * save_zero_page_to_file: send the zero page to the file
1190 *
1191 * Returns the size of the data written to the file; 0 means the page is not
1192 * a zero page
1193 *
1194 * @rs: current RAM state
1195 * @file: the file where the data is saved
1196 * @block: block that contains the page we want to send
1197 * @offset: offset inside the block for the page
1198 */
1199static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1200 RAMBlock *block, ram_addr_t offset)
1201{
1202 uint8_t *p = block->host + offset;
1203 int len = 0;
1204
bad452a7 1205 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
6c97ec5f
XG
1206 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1207 qemu_put_byte(file, 0);
1208 len += 1;
47fe16ff 1209 ram_release_page(block->idstr, offset);
6c97ec5f
XG
1210 }
1211 return len;
1212}
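/*
 * Cost example (illustrative): a zero page on an already-announced block is
 * sent as the 8-byte header with RAM_SAVE_FLAG_ZERO set plus a single zero
 * byte, i.e. 9 bytes on the wire instead of TARGET_PAGE_SIZE.
 */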
1213
56e93d26 1214/**
3d0684b2 1215 * save_zero_page: send the zero page to the stream
56e93d26 1216 *
3d0684b2 1217 * Returns the number of pages written.
56e93d26 1218 *
f7ccd61b 1219 * @rs: current RAM state
56e93d26
JQ
1220 * @block: block that contains the page we want to send
1221 * @offset: offset inside the block for the page
56e93d26 1222 */
7faccdc3 1223static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1224{
6c97ec5f 1225 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1226
6c97ec5f 1227 if (len) {
9360447d 1228 ram_counters.duplicate++;
4c2d0f6d 1229 ram_transferred_add(len);
6c97ec5f 1230 return 1;
56e93d26 1231 }
6c97ec5f 1232 return -1;
56e93d26
JQ
1233}
1234
059ff0fb
XG
1235/*
1236 * @pages: the number of pages written by the control path,
1237 * < 0 - error
1238 * > 0 - number of pages written
1239 *
1240 * Return true if the page has been saved, otherwise false.
1241 */
1242static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1243 int *pages)
1244{
1245 uint64_t bytes_xmit = 0;
1246 int ret;
1247
1248 *pages = -1;
1249 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1250 &bytes_xmit);
1251 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1252 return false;
1253 }
1254
1255 if (bytes_xmit) {
4c2d0f6d 1256 ram_transferred_add(bytes_xmit);
059ff0fb
XG
1257 *pages = 1;
1258 }
1259
1260 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1261 return true;
1262 }
1263
1264 if (bytes_xmit > 0) {
1265 ram_counters.normal++;
1266 } else if (bytes_xmit == 0) {
1267 ram_counters.duplicate++;
1268 }
1269
1270 return true;
1271}
1272
65dacaa0
XG
1273/*
1274 * directly send the page to the stream
1275 *
1276 * Returns the number of pages written.
1277 *
1278 * @rs: current RAM state
1279 * @block: block that contains the page we want to send
1280 * @offset: offset inside the block for the page
1281 * @buf: the page to be sent
1283 * @async: send the page asynchronously
1283 */
1284static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1285 uint8_t *buf, bool async)
1286{
4c2d0f6d
DE
1287 ram_transferred_add(save_page_header(rs, rs->f, block,
1288 offset | RAM_SAVE_FLAG_PAGE));
65dacaa0
XG
1289 if (async) {
1290 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1291 migrate_release_ram() &
1292 migration_in_postcopy());
1293 } else {
1294 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1295 }
4c2d0f6d 1296 ram_transferred_add(TARGET_PAGE_SIZE);
65dacaa0
XG
1297 ram_counters.normal++;
1298 return 1;
1299}
1300
56e93d26 1301/**
3d0684b2 1302 * ram_save_page: send the given page to the stream
56e93d26 1303 *
3d0684b2 1304 * Returns the number of pages written.
3fd3c4b3
DDAG
1305 * < 0 - error
1306 * >=0 - Number of pages written - this might legally be 0
1307 * if xbzrle noticed the page was the same.
56e93d26 1308 *
6f37bb8b 1309 * @rs: current RAM state
56e93d26
JQ
1310 * @block: block that contains the page we want to send
1311 * @offset: offset inside the block for the page
56e93d26 1312 */
05931ec5 1313static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
56e93d26
JQ
1314{
1315 int pages = -1;
56e93d26 1316 uint8_t *p;
56e93d26 1317 bool send_async = true;
a08f6890 1318 RAMBlock *block = pss->block;
8bba004c 1319 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 1320 ram_addr_t current_addr = block->offset + offset;
56e93d26 1321
2f68e399 1322 p = block->host + offset;
1db9d8e5 1323 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1324
56e93d26 1325 XBZRLE_cache_lock();
1a373522 1326 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
059ff0fb 1327 pages = save_xbzrle_page(rs, &p, current_addr, block,
05931ec5
JQ
1328 offset);
1329 if (!rs->last_stage) {
059ff0fb
XG
1330 /* Can't send this cached data async, since the cache page
1331 * might get updated before it gets to the wire
56e93d26 1332 */
059ff0fb 1333 send_async = false;
56e93d26
JQ
1334 }
1335 }
1336
1337 /* XBZRLE overflow or normal page */
1338 if (pages == -1) {
65dacaa0 1339 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1340 }
1341
1342 XBZRLE_cache_unlock();
1343
1344 return pages;
1345}
1346
b9ee2f7d
JQ
1347static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1348 ram_addr_t offset)
1349{
67a4c891 1350 if (multifd_queue_page(rs->f, block, offset) < 0) {
713f762a
IR
1351 return -1;
1352 }
b9ee2f7d
JQ
1353 ram_counters.normal++;
1354
1355 return 1;
1356}
1357
5e5fdcff 1358static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1359 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1360{
53518d94 1361 RAMState *rs = ram_state;
20d549cb 1362 uint8_t *p = block->host + offset;
6ef3771c 1363 int ret;
56e93d26 1364
5e5fdcff 1365 if (save_zero_page_to_file(rs, f, block, offset)) {
e7f2e190 1366 return true;
5e5fdcff
XG
1367 }
1368
6ef3771c 1369 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1370
1371 /*
1372 * copy it to an internal buffer to avoid it being modified by the VM,
1373 * so that we can catch errors during compression and
1374 * decompression
1375 */
1376 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1377 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1378 if (ret < 0) {
1379 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1380 error_report("compressed data failed!");
b3be2896 1381 }
e7f2e190 1382 return false;
5e5fdcff
XG
1383}
1384
1385static void
1386update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1387{
4c2d0f6d 1388 ram_transferred_add(bytes_xmit);
76e03000 1389
5e5fdcff
XG
1390 if (param->zero_page) {
1391 ram_counters.duplicate++;
76e03000 1392 return;
5e5fdcff 1393 }
76e03000
XG
1394
1395 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1396 compression_counters.compressed_size += bytes_xmit - 8;
1397 compression_counters.pages++;
56e93d26
JQ
1398}
1399
32b05495
XG
1400static bool save_page_use_compression(RAMState *rs);
1401
ce25d337 1402static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1403{
1404 int idx, len, thread_count;
1405
32b05495 1406 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1407 return;
1408 }
1409 thread_count = migrate_compress_threads();
a7a9a88f 1410
0d9f9a5c 1411 qemu_mutex_lock(&comp_done_lock);
56e93d26 1412 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1413 while (!comp_param[idx].done) {
0d9f9a5c 1414 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1415 }
a7a9a88f 1416 }
0d9f9a5c 1417 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1418
1419 for (idx = 0; idx < thread_count; idx++) {
1420 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1421 if (!comp_param[idx].quit) {
ce25d337 1422 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
1423 /*
1424 * it's safe to fetch zero_page without holding comp_done_lock
1425 * as there is no further request submitted to the thread,
1426 * i.e., the thread should be waiting for a request at this point.
1427 */
1428 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1429 }
a7a9a88f 1430 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1431 }
1432}
1433
1434static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1435 ram_addr_t offset)
1436{
1437 param->block = block;
1438 param->offset = offset;
1439}
1440
ce25d337
JQ
1441static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1442 ram_addr_t offset)
56e93d26
JQ
1443{
1444 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 1445 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
1446
1447 thread_count = migrate_compress_threads();
0d9f9a5c 1448 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
1449retry:
1450 for (idx = 0; idx < thread_count; idx++) {
1451 if (comp_param[idx].done) {
1452 comp_param[idx].done = false;
1453 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1454 qemu_mutex_lock(&comp_param[idx].mutex);
1455 set_compress_params(&comp_param[idx], block, offset);
1456 qemu_cond_signal(&comp_param[idx].cond);
1457 qemu_mutex_unlock(&comp_param[idx].mutex);
1458 pages = 1;
5e5fdcff 1459 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 1460 break;
56e93d26
JQ
1461 }
1462 }
1d58872a
XG
1463
1464 /*
1465 * wait for a free thread if the user specifies 'compress-wait-thread',
1466 * otherwise we will post the page out in the main thread as a normal page.
1467 */
1468 if (pages < 0 && wait) {
1469 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1470 goto retry;
1471 }
0d9f9a5c 1472 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1473
1474 return pages;
1475}
1476
3d0684b2
JQ
1477/**
1478 * find_dirty_block: find the next dirty page and update any state
1479 * associated with the search process.
b9e60928 1480 *
a5f7b1a6 1481 * Returns true if a page is found
b9e60928 1482 *
6f37bb8b 1483 * @rs: current RAM state
3d0684b2
JQ
1484 * @pss: data about the state of the current dirty page scan
1485 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1486 */
f20e2865 1487static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 1488{
f20e2865 1489 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 1490 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1491 pss->page >= rs->last_page) {
b9e60928
DDAG
1492 /*
1493 * We've been once around the RAM and haven't found anything.
1494 * Give up.
1495 */
1496 *again = false;
1497 return false;
1498 }
542147f4
DH
1499 if (!offset_in_ramblock(pss->block,
1500 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
b9e60928 1501 /* Didn't find anything in this RAM Block */
a935e30f 1502 pss->page = 0;
b9e60928
DDAG
1503 pss->block = QLIST_NEXT_RCU(pss->block, next);
1504 if (!pss->block) {
48df9d80
XG
1505 /*
1506 * If memory migration starts over, we will meet a dirtied page
1507 * which may still exist in the compression threads' ring, so we
1508 * should flush the compressed data to make sure the new page
1509 * is not overwritten by the old one in the destination.
1510 *
1511 * Also, if xbzrle is on, stop using the data compression at this
1512 * point. In theory, xbzrle can do better than compression.
1513 */
1514 flush_compressed_data(rs);
1515
b9e60928
DDAG
1516 /* Hit the end of the list */
1517 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1518 /* Flag that we've looped */
1519 pss->complete_round = true;
1a373522
DH
1520 /* After the first round, enable XBZRLE. */
1521 if (migrate_use_xbzrle()) {
1522 rs->xbzrle_enabled = true;
1523 }
b9e60928
DDAG
1524 }
1525 /* Didn't find anything this time, but try again on the new block */
1526 *again = true;
1527 return false;
1528 } else {
1529 /* Can go around again, but... */
1530 *again = true;
1531 /* We've found something so probably don't need to */
1532 return true;
1533 }
1534}
1535
3d0684b2
JQ
1536/**
1537 * unqueue_page: gets a page off the queue
1538 *
a82d593b 1539 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1540 *
3d0684b2
JQ
1541 * Returns the block of the page (or NULL if none available)
1542 *
ec481c6c 1543 * @rs: current RAM state
3d0684b2 1544 * @offset: used to return the offset within the RAMBlock
a82d593b 1545 */
f20e2865 1546static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b 1547{
a1fe28df 1548 struct RAMSrcPageRequest *entry;
a82d593b 1549 RAMBlock *block = NULL;
cfd66f30 1550 size_t page_size;
a82d593b 1551
a1fe28df 1552 if (!postcopy_has_request(rs)) {
ae526e32
XG
1553 return NULL;
1554 }
1555
6e8a355d 1556 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
a1fe28df
PX
1557
1558 /*
1559 * This should _never_ change even after we take the lock, because no one
1560 * should be taking anything off the request list other than us.
1561 */
1562 assert(postcopy_has_request(rs));
1563
1564 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1565 block = entry->rb;
1566 *offset = entry->offset;
cfd66f30
PX
1567 page_size = qemu_ram_pagesize(block);
1568 /* Each page request should only be multiple page size of the ramblock */
1569 assert((entry->len % page_size) == 0);
a1fe28df 1570
cfd66f30
PX
1571 if (entry->len > page_size) {
1572 entry->len -= page_size;
1573 entry->offset += page_size;
a1fe28df
PX
1574 } else {
1575 memory_region_unref(block->mr);
1576 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1577 g_free(entry);
1578 migration_consume_urgent_request();
a82d593b 1579 }
a82d593b 1580
cfd66f30
PX
1581 trace_unqueue_page(block->idstr, *offset,
1582 test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1583
a82d593b
DDAG
1584 return block;
1585}
1586
278e2f55
AG
1587#if defined(__linux__)
1588/**
1589 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1590 * is found, return RAM block pointer and page offset
1591 *
1592 * Returns pointer to the RAMBlock containing faulting page,
1593 * NULL if no write faults are pending
1594 *
1595 * @rs: current RAM state
1596 * @offset: page offset from the beginning of the block
1597 */
1598static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1599{
1600 struct uffd_msg uffd_msg;
1601 void *page_address;
82ea3e3b 1602 RAMBlock *block;
278e2f55
AG
1603 int res;
1604
1605 if (!migrate_background_snapshot()) {
1606 return NULL;
1607 }
1608
1609 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1610 if (res <= 0) {
1611 return NULL;
1612 }
1613
1614 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
82ea3e3b
AG
1615 block = qemu_ram_block_from_host(page_address, false, offset);
1616 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1617 return block;
278e2f55
AG
1618}
1619
1620/**
1621 * ram_save_release_protection: release UFFD write protection after
1622 * a range of pages has been saved
1623 *
1624 * @rs: current RAM state
1625 * @pss: page-search-status structure
1626 * @start_page: index of the first page in the range relative to pss->block
1627 *
1628 * Returns 0 on success, negative value in case of an error
1629*/
1630static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1631 unsigned long start_page)
1632{
1633 int res = 0;
1634
1635 /* Check if page is from UFFD-managed region. */
1636 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1637 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
258f5c98 1638 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
278e2f55
AG
1639
1640 /* Flush async buffers before un-protect. */
1641 qemu_fflush(rs->f);
1642 /* Un-protect memory range. */
1643 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1644 false, false);
1645 }
1646
1647 return res;
1648}
1649
1650/* ram_write_tracking_available: check if kernel supports required UFFD features
1651 *
1652 * Returns true if supported, false otherwise
1653 */
1654bool ram_write_tracking_available(void)
1655{
1656 uint64_t uffd_features;
1657 int res;
1658
1659 res = uffd_query_features(&uffd_features);
1660 return (res == 0 &&
1661 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1662}
1663
1664/* ram_write_tracking_compatible: check if guest configuration is
1665 * compatible with 'write-tracking'
1666 *
1667 * Returns true if compatible, false otherwise
1668 */
1669bool ram_write_tracking_compatible(void)
1670{
1671 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1672 int uffd_fd;
82ea3e3b 1673 RAMBlock *block;
278e2f55
AG
1674 bool ret = false;
1675
1676 /* Open UFFD file descriptor */
1677 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1678 if (uffd_fd < 0) {
1679 return false;
1680 }
1681
1682 RCU_READ_LOCK_GUARD();
1683
82ea3e3b 1684 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55
AG
1685 uint64_t uffd_ioctls;
1686
1687 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1688 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1689 continue;
1690 }
1691 /* Try to register block memory via UFFD-IO to track writes */
82ea3e3b 1692 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
278e2f55
AG
1693 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1694 goto out;
1695 }
1696 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1697 goto out;
1698 }
1699 }
1700 ret = true;
1701
1702out:
1703 uffd_close_fd(uffd_fd);
1704 return ret;
1705}
1706
f7b9dcfb
DH
1707static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1708 ram_addr_t size)
1709{
1710 /*
1711 * We read one byte of each page; this will preallocate page tables if
1712 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1713 * where no page was populated yet. This might require adaptation when
1714 * supporting other mappings, like shmem.
1715 */
1716 for (; offset < size; offset += block->page_size) {
1717 char tmp = *((char *)block->host + offset);
1718
1719 /* Don't optimize the read out */
1720 asm volatile("" : "+r" (tmp));
1721 }
1722}
1723
6fee3a1f
DH
1724static inline int populate_read_section(MemoryRegionSection *section,
1725 void *opaque)
1726{
1727 const hwaddr size = int128_get64(section->size);
1728 hwaddr offset = section->offset_within_region;
1729 RAMBlock *block = section->mr->ram_block;
1730
1731 populate_read_range(block, offset, size);
1732 return 0;
1733}
1734
eeccb99c 1735/*
f7b9dcfb
DH
1736 * ram_block_populate_read: preallocate page tables and populate pages in the
1737 * RAM block by reading a byte of each page.
eeccb99c
AG
1738 *
1739 * Since it's solely used for userfault_fd WP feature, here we just
1740 * hardcode page size to qemu_real_host_page_size.
1741 *
82ea3e3b 1742 * @rb: RAM block to populate
eeccb99c 1743 */
6fee3a1f 1744static void ram_block_populate_read(RAMBlock *rb)
eeccb99c 1745{
6fee3a1f
DH
1746 /*
1747 * Skip populating all pages that fall into a discarded range as managed by
1748 * a RamDiscardManager responsible for the mapped memory region of the
1749 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1750 * must not get populated automatically. We don't have to track
1751 * modifications via userfaultfd WP reliably, because these pages will
1752 * not be part of the migration stream either way -- see
1753 * ramblock_dirty_bitmap_exclude_discarded_pages().
1754 *
1755 * Note: The result is only stable while migrating (precopy/postcopy).
1756 */
1757 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1758 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1759 MemoryRegionSection section = {
1760 .mr = rb->mr,
1761 .offset_within_region = 0,
1762 .size = rb->mr->size,
1763 };
1764
1765 ram_discard_manager_replay_populated(rdm, &section,
1766 populate_read_section, NULL);
1767 } else {
1768 populate_read_range(rb, 0, rb->used_length);
1769 }
eeccb99c
AG
1770}
1771
1772/*
1773 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1774 */
1775void ram_write_tracking_prepare(void)
1776{
82ea3e3b 1777 RAMBlock *block;
eeccb99c
AG
1778
1779 RCU_READ_LOCK_GUARD();
1780
82ea3e3b 1781 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
eeccb99c 1782 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1783 if (block->mr->readonly || block->mr->rom_device) {
eeccb99c
AG
1784 continue;
1785 }
1786
1787 /*
1788 * Populate pages of the RAM block before enabling userfault_fd
1789 * write protection.
1790 *
1791 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1792 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1793 * pages with pte_none() entries in page table.
1794 */
f7b9dcfb 1795 ram_block_populate_read(block);
eeccb99c
AG
1796 }
1797}
1798
278e2f55
AG
1799/*
1800 * ram_write_tracking_start: start UFFD-WP memory tracking
1801 *
1802 * Returns 0 for success or negative value in case of error
1803 */
1804int ram_write_tracking_start(void)
1805{
1806 int uffd_fd;
1807 RAMState *rs = ram_state;
82ea3e3b 1808 RAMBlock *block;
278e2f55
AG
1809
1810 /* Open UFFD file descriptor */
1811 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1812 if (uffd_fd < 0) {
1813 return uffd_fd;
1814 }
1815 rs->uffdio_fd = uffd_fd;
1816
1817 RCU_READ_LOCK_GUARD();
1818
82ea3e3b 1819 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
278e2f55 1820 /* Nothing to do with read-only and MMIO-writable regions */
82ea3e3b 1821 if (block->mr->readonly || block->mr->rom_device) {
278e2f55
AG
1822 continue;
1823 }
1824
1825 /* Register block memory with UFFD to track writes */
82ea3e3b
AG
1826 if (uffd_register_memory(rs->uffdio_fd, block->host,
1827 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
278e2f55
AG
1828 goto fail;
1829 }
1830 /* Apply UFFD write protection to the block memory range */
82ea3e3b
AG
1831 if (uffd_change_protection(rs->uffdio_fd, block->host,
1832 block->max_length, true, false)) {
278e2f55
AG
1833 goto fail;
1834 }
82ea3e3b
AG
1835 block->flags |= RAM_UF_WRITEPROTECT;
1836 memory_region_ref(block->mr);
278e2f55 1837
82ea3e3b
AG
1838 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1839 block->host, block->max_length);
278e2f55
AG
1840 }
1841
1842 return 0;
1843
1844fail:
1845 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1846
82ea3e3b
AG
1847 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1848 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1849 continue;
1850 }
1851 /*
1852 * In case some memory block failed to be write-protected,
1853 * remove protection and unregister all RAM blocks that succeeded
1854 */
82ea3e3b
AG
1855 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1856 false, false);
1857 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1858 /* Cleanup flags and remove reference */
82ea3e3b
AG
1859 block->flags &= ~RAM_UF_WRITEPROTECT;
1860 memory_region_unref(block->mr);
278e2f55
AG
1861 }
1862
1863 uffd_close_fd(uffd_fd);
1864 rs->uffdio_fd = -1;
1865 return -1;
1866}
1867
1868/**
1869 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1870 */
1871void ram_write_tracking_stop(void)
1872{
1873 RAMState *rs = ram_state;
82ea3e3b 1874 RAMBlock *block;
278e2f55
AG
1875
1876 RCU_READ_LOCK_GUARD();
1877
82ea3e3b
AG
1878 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1879 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
278e2f55
AG
1880 continue;
1881 }
1882 /* Remove protection and unregister all affected RAM blocks */
82ea3e3b
AG
1883 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1884 false, false);
1885 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
278e2f55 1886
82ea3e3b
AG
1887 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1888 block->host, block->max_length);
278e2f55
AG
1889
1890 /* Cleanup flags and remove reference */
82ea3e3b
AG
1891 block->flags &= ~RAM_UF_WRITEPROTECT;
1892 memory_region_unref(block->mr);
278e2f55
AG
1893 }
1894
1895 /* Finally close UFFD file descriptor */
1896 uffd_close_fd(rs->uffdio_fd);
1897 rs->uffdio_fd = -1;
1898}
1899
1900#else
1901/* No target OS support, stubs just fail or ignore */
1902
1903static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1904{
1905 (void) rs;
1906 (void) offset;
1907
1908 return NULL;
1909}
1910
1911static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1912 unsigned long start_page)
1913{
1914 (void) rs;
1915 (void) pss;
1916 (void) start_page;
1917
1918 return 0;
1919}
1920
1921bool ram_write_tracking_available(void)
1922{
1923 return false;
1924}
1925
1926bool ram_write_tracking_compatible(void)
1927{
1928 assert(0);
1929 return false;
1930}
1931
1932int ram_write_tracking_start(void)
1933{
1934 assert(0);
1935 return -1;
1936}
1937
1938void ram_write_tracking_stop(void)
1939{
1940 assert(0);
1941}
1942#endif /* defined(__linux__) */
1943
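/*
 * Illustrative sketch (not part of the original source): the expected
 * call order of the UFFD write-tracking helpers above when taking a
 * background snapshot.  Error handling is reduced to early returns.
 */
static int example_background_snapshot_flow(void)
{
    if (!ram_write_tracking_available() ||
        !ram_write_tracking_compatible()) {
        return -1;
    }
    /* Populate pages so UFFDIO_WRITEPROTECT doesn't skip pte_none() entries */
    ram_write_tracking_prepare();
    /* Register RAM blocks with userfaultfd and write-protect them */
    if (ram_write_tracking_start()) {
        return -1;
    }
    /*
     * ... the RAM saving loop runs here; ram_save_release_protection()
     * unprotects each saved range, and poll_fault_page() feeds pages that
     * blocked vcpus are waiting on back into the scan ...
     */
    ram_write_tracking_stop();
    return 0;
}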
3d0684b2 1944/**
ff1543af 1945 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
1946 *
1947 * Skips pages that are already sent (!dirty)
a82d593b 1948 *
a5f7b1a6 1949 * Returns true if a queued page is found
a82d593b 1950 *
6f37bb8b 1951 * @rs: current RAM state
3d0684b2 1952 * @pss: data about the state of the current dirty page scan
a82d593b 1953 */
f20e2865 1954static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1955{
1956 RAMBlock *block;
1957 ram_addr_t offset;
a82d593b 1958
cfd66f30 1959 block = unqueue_page(rs, &offset);
a82d593b 1960
278e2f55
AG
1961 if (!block) {
1962 /*
1963 * Poll write faults too if background snapshot is enabled; that's
1964 * when vcpus may have been blocked by write-protected pages.
1965 */
1966 block = poll_fault_page(rs, &offset);
1967 }
1968
a82d593b 1969 if (block) {
a82d593b
DDAG
1970 /*
1971 * We want the background search to continue from the queued page
1972 * since the guest is likely to want other pages near to the page
1973 * it just requested.
1974 */
1975 pss->block = block;
a935e30f 1976 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
1977
1978 /*
1979 * This unqueued page would break the "one round" check, even if
1980 * it is really rare.
1981 */
1982 pss->complete_round = false;
a82d593b
DDAG
1983 }
1984
1985 return !!block;
1986}
1987
6c595cde 1988/**
5e58f968
JQ
1989 * migration_page_queue_free: drop any remaining pages in the ram
1990 * request queue
6c595cde 1991 *
3d0684b2
JQ
1992 * It should be empty at the end anyway, but in error cases there may
1993 * be some left. In case any page is left, we drop it.
1994 *
6c595cde 1995 */
83c13382 1996static void migration_page_queue_free(RAMState *rs)
6c595cde 1997{
ec481c6c 1998 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
1999 /* This queue generally should be empty - but in the case of a failed
2000 * migration it might have some leftovers in it.
2001 */
89ac5a1d 2002 RCU_READ_LOCK_GUARD();
ec481c6c 2003 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2004 memory_region_unref(mspr->rb->mr);
ec481c6c 2005 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2006 g_free(mspr);
2007 }
6c595cde
DDAG
2008}
2009
2010/**
3d0684b2
JQ
2011 * ram_save_queue_pages: queue the page for transmission
2012 *
2013 * A request from postcopy destination for example.
2014 *
2015 * Returns zero on success or negative on error
2016 *
3d0684b2
JQ
2017 * @rbname: Name of the RAMBlock of the request. NULL means the
2018 * same as the last one.
2019 * @start: starting address from the start of the RAMBlock
2020 * @len: length (in bytes) to send
6c595cde 2021 */
96506894 2022int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2023{
2024 RAMBlock *ramblock;
53518d94 2025 RAMState *rs = ram_state;
6c595cde 2026
9360447d 2027 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2028 RCU_READ_LOCK_GUARD();
2029
6c595cde
DDAG
2030 if (!rbname) {
2031 /* Reuse last RAMBlock */
68a098f3 2032 ramblock = rs->last_req_rb;
6c595cde
DDAG
2033
2034 if (!ramblock) {
2035 /*
2036 * Shouldn't happen, we can't reuse the last RAMBlock if
2037 * it's the 1st request.
2038 */
2039 error_report("ram_save_queue_pages no previous block");
03acb4e9 2040 return -1;
6c595cde
DDAG
2041 }
2042 } else {
2043 ramblock = qemu_ram_block_by_name(rbname);
2044
2045 if (!ramblock) {
2046 /* We shouldn't be asked for a non-existent RAMBlock */
2047 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2048 return -1;
6c595cde 2049 }
68a098f3 2050 rs->last_req_rb = ramblock;
6c595cde
DDAG
2051 }
2052 trace_ram_save_queue_pages(ramblock->idstr, start, len);
542147f4 2053 if (!offset_in_ramblock(ramblock, start + len - 1)) {
9458ad6b
JQ
2054 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2055 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2056 __func__, start, len, ramblock->used_length);
03acb4e9 2057 return -1;
6c595cde
DDAG
2058 }
2059
ec481c6c
JQ
2060 struct RAMSrcPageRequest *new_entry =
2061 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2062 new_entry->rb = ramblock;
2063 new_entry->offset = start;
2064 new_entry->len = len;
2065
2066 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2067 qemu_mutex_lock(&rs->src_page_req_mutex);
2068 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2069 migration_make_urgent_request();
ec481c6c 2070 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2071
2072 return 0;
6c595cde
DDAG
2073}
2074
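/*
 * Illustrative sketch (not part of the original source): queueing a
 * single target page of a named RAMBlock, as the postcopy destination's
 * page-request path would.  The names mirror the function above.
 */
static int example_request_one_page(const char *rbname, ram_addr_t start)
{
    return ram_save_queue_pages(rbname, start, TARGET_PAGE_SIZE);
}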
d7400a34
XG
2075static bool save_page_use_compression(RAMState *rs)
2076{
2077 if (!migrate_use_compression()) {
2078 return false;
2079 }
2080
2081 /*
1a373522
DH
2082 * If xbzrle is enabled (e.g., after first round of migration), stop
2083 * using the data compression. In theory, xbzrle can do better than
2084 * compression.
d7400a34 2085 */
1a373522
DH
2086 if (rs->xbzrle_enabled) {
2087 return false;
d7400a34
XG
2088 }
2089
1a373522 2090 return true;
d7400a34
XG
2091}
2092
5e5fdcff
XG
2093/*
2094 * try to compress the page before posting it out, return true if the page
2095 * has been properly handled by compression, otherwise it needs other
2096 * paths to handle it
2097 */
2098static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2099{
2100 if (!save_page_use_compression(rs)) {
2101 return false;
2102 }
2103
2104 /*
2105 * When starting the process of a new block, the first page of
2106 * the block should be sent out before other pages in the same
2107 * block, and all the pages in the last block should have been sent
2108 * out. Keeping this order is important, because the 'cont' flag
2109 * is used to avoid resending the block name.
2110 *
2111 * We post the first page as a normal page, as compression will take
2112 * a lot of CPU resources.
2113 */
2114 if (block != rs->last_sent_block) {
2115 flush_compressed_data(rs);
2116 return false;
2117 }
2118
2119 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2120 return true;
2121 }
2122
76e03000 2123 compression_counters.busy++;
5e5fdcff
XG
2124 return false;
2125}
2126
a82d593b 2127/**
3d0684b2 2128 * ram_save_target_page: save one target page
a82d593b 2129 *
3d0684b2 2130 * Returns the number of pages written
a82d593b 2131 *
6f37bb8b 2132 * @rs: current RAM state
3d0684b2 2133 * @pss: data about the page we want to send
a82d593b 2134 */
05931ec5 2135static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
a82d593b 2136{
a8ec91f9 2137 RAMBlock *block = pss->block;
8bba004c 2138 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2139 int res;
2140
2141 if (control_save_page(rs, block, offset, &res)) {
2142 return res;
2143 }
2144
5e5fdcff
XG
2145 if (save_compress_page(rs, block, offset)) {
2146 return 1;
d7400a34
XG
2147 }
2148
2149 res = save_zero_page(rs, block, offset);
2150 if (res > 0) {
2151 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2152 * page would be stale
2153 */
2154 if (!save_page_use_compression(rs)) {
2155 XBZRLE_cache_lock();
2156 xbzrle_cache_zero_page(rs, block->offset + offset);
2157 XBZRLE_cache_unlock();
2158 }
d7400a34
XG
2159 return res;
2160 }
2161
da3f56cb 2162 /*
c6b3a2e0
WY
2163 * Do not use multifd for:
2164 * 1. Compression as the first page in the new block should be posted out
2165 * before sending the compressed page
2166 * 2. In postcopy as one whole host page should be placed
da3f56cb 2167 */
c6b3a2e0
WY
2168 if (!save_page_use_compression(rs) && migrate_use_multifd()
2169 && !migration_in_postcopy()) {
b9ee2f7d 2170 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2171 }
2172
05931ec5 2173 return ram_save_page(rs, pss);
a82d593b
DDAG
2174}
2175
2176/**
3d0684b2 2177 * ram_save_host_page: save a whole host page
a82d593b 2178 *
3d0684b2
JQ
2179 * Starting at pss->page, send pages up to the end of the current host
2180 * page. It's valid for the initial offset to point into the middle of
2181 * a host page in which case the remainder of the hostpage is sent.
2182 * Only dirty target pages are sent. Note that the host page size may
2183 * be a huge page for this block.
1eb3fc0a
DDAG
2184 * The saving stops at the boundary of the used_length of the block
2185 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2186 *
3d0684b2
JQ
2187 * Returns the number of pages written or negative on error
2188 *
6f37bb8b 2189 * @rs: current RAM state
3d0684b2 2190 * @pss: data about the page we want to send
a82d593b 2191 */
05931ec5 2192static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2193{
2194 int tmppages, pages = 0;
a935e30f
JQ
2195 size_t pagesize_bits =
2196 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
ba1b7c81
KJ
2197 unsigned long hostpage_boundary =
2198 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
278e2f55
AG
2199 unsigned long start_page = pss->page;
2200 int res;
4c011c37 2201
fbd162e6 2202 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2203 error_report("block %s should not be migrated !", pss->block->idstr);
2204 return 0;
2205 }
2206
a82d593b 2207 do {
1faa5665 2208 /* Check if the page is dirty and if so send it */
ba1b7c81 2209 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
05931ec5 2210 tmppages = ram_save_target_page(rs, pss);
ba1b7c81
KJ
2211 if (tmppages < 0) {
2212 return tmppages;
2213 }
a82d593b 2214
ba1b7c81
KJ
2215 pages += tmppages;
2216 /*
2217 * Allow rate limiting to happen in the middle of huge pages if
2218 * something is sent in the current iteration.
2219 */
2220 if (pagesize_bits > 1 && tmppages > 0) {
2221 migration_rate_limit();
2222 }
23feba90 2223 }
ba1b7c81
KJ
2224 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2225 } while ((pss->page < hostpage_boundary) &&
8bba004c
AR
2226 offset_in_ramblock(pss->block,
2227 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
ba1b7c81 2228 /* The offset we leave with is the min boundary of host page and block */
258f5c98 2229 pss->page = MIN(pss->page, hostpage_boundary);
278e2f55
AG
2230
2231 res = ram_save_release_protection(rs, pss, start_page);
2232 return (res < 0 ? res : pages);
a82d593b 2233}
6c595cde 2234
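/*
 * Illustrative sketch (not part of the original source): the boundary
 * arithmetic used by ram_save_host_page() above.  With 2 MiB host huge
 * pages and 4 KiB target pages, pagesize_bits is 512, so a scan that
 * starts at target page 1000 stops at the host-page boundary 1024.
 */
static unsigned long example_hostpage_boundary(PageSearchStatus *pss)
{
    size_t pagesize_bits = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    return QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
}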
56e93d26 2235/**
3d0684b2 2236 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2237 *
2238 * Called within an RCU critical section.
2239 *
e8f3735f
XG
2240 * Returns the number of pages written where zero means no dirty pages,
2241 * or negative on error
56e93d26 2242 *
6f37bb8b 2243 * @rs: current RAM state
a82d593b
DDAG
2244 *
2245 * On systems where host-page-size > target-page-size it will send all the
2246 * pages in a host page that are dirty.
56e93d26 2247 */
05931ec5 2248static int ram_find_and_save_block(RAMState *rs)
56e93d26 2249{
b8fb8cb7 2250 PageSearchStatus pss;
56e93d26 2251 int pages = 0;
b9e60928 2252 bool again, found;
56e93d26 2253
0827b9e9
AA
2254 /* No dirty page as there is zero RAM */
2255 if (!ram_bytes_total()) {
2256 return pages;
2257 }
2258
6f37bb8b 2259 pss.block = rs->last_seen_block;
a935e30f 2260 pss.page = rs->last_page;
b8fb8cb7
DDAG
2261 pss.complete_round = false;
2262
2263 if (!pss.block) {
2264 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2265 }
56e93d26 2266
b9e60928 2267 do {
a82d593b 2268 again = true;
f20e2865 2269 found = get_queued_page(rs, &pss);
b9e60928 2270
a82d593b
DDAG
2271 if (!found) {
2272 /* priority queue empty, so just search for something dirty */
f20e2865 2273 found = find_dirty_block(rs, &pss, &again);
a82d593b 2274 }
f3f491fc 2275
a82d593b 2276 if (found) {
05931ec5 2277 pages = ram_save_host_page(rs, &pss);
56e93d26 2278 }
b9e60928 2279 } while (!pages && again);
56e93d26 2280
6f37bb8b 2281 rs->last_seen_block = pss.block;
a935e30f 2282 rs->last_page = pss.page;
56e93d26
JQ
2283
2284 return pages;
2285}
2286
2287void acct_update_position(QEMUFile *f, size_t size, bool zero)
2288{
2289 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2290
56e93d26 2291 if (zero) {
9360447d 2292 ram_counters.duplicate += pages;
56e93d26 2293 } else {
9360447d 2294 ram_counters.normal += pages;
4c2d0f6d 2295 ram_transferred_add(size);
56e93d26
JQ
2296 qemu_update_position(f, size);
2297 }
2298}
2299
fbd162e6 2300static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2301{
2302 RAMBlock *block;
2303 uint64_t total = 0;
2304
89ac5a1d
DDAG
2305 RCU_READ_LOCK_GUARD();
2306
fbd162e6
YK
2307 if (count_ignored) {
2308 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2309 total += block->used_length;
2310 }
2311 } else {
2312 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2313 total += block->used_length;
2314 }
99e15582 2315 }
56e93d26
JQ
2316 return total;
2317}
2318
fbd162e6
YK
2319uint64_t ram_bytes_total(void)
2320{
2321 return ram_bytes_total_common(false);
2322}
2323
f265e0e4 2324static void xbzrle_load_setup(void)
56e93d26 2325{
f265e0e4 2326 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2327}
2328
f265e0e4
JQ
2329static void xbzrle_load_cleanup(void)
2330{
2331 g_free(XBZRLE.decoded_buf);
2332 XBZRLE.decoded_buf = NULL;
2333}
2334
7d7c96be
PX
2335static void ram_state_cleanup(RAMState **rsp)
2336{
b9ccaf6d
DDAG
2337 if (*rsp) {
2338 migration_page_queue_free(*rsp);
2339 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2340 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2341 g_free(*rsp);
2342 *rsp = NULL;
2343 }
7d7c96be
PX
2344}
2345
84593a08
PX
2346static void xbzrle_cleanup(void)
2347{
2348 XBZRLE_cache_lock();
2349 if (XBZRLE.cache) {
2350 cache_fini(XBZRLE.cache);
2351 g_free(XBZRLE.encoded_buf);
2352 g_free(XBZRLE.current_buf);
2353 g_free(XBZRLE.zero_target_page);
2354 XBZRLE.cache = NULL;
2355 XBZRLE.encoded_buf = NULL;
2356 XBZRLE.current_buf = NULL;
2357 XBZRLE.zero_target_page = NULL;
2358 }
2359 XBZRLE_cache_unlock();
2360}
2361
f265e0e4 2362static void ram_save_cleanup(void *opaque)
56e93d26 2363{
53518d94 2364 RAMState **rsp = opaque;
6b6712ef 2365 RAMBlock *block;
eb859c53 2366
278e2f55
AG
2367 /* We don't use dirty log with background snapshots */
2368 if (!migrate_background_snapshot()) {
2369 /* the caller holds the iothread lock or is in a bh, so there is
2370 * no writing race against the migration bitmap
2371 */
63b41db4
HH
2372 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2373 /*
2374 * do not stop dirty log without starting it, since
2375 * memory_global_dirty_log_stop will assert that
2376 * memory_global_dirty_log_start/stop are used in pairs
2377 */
2378 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2379 }
278e2f55 2380 }
6b6712ef 2381
fbd162e6 2382 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2383 g_free(block->clear_bmap);
2384 block->clear_bmap = NULL;
6b6712ef
JQ
2385 g_free(block->bmap);
2386 block->bmap = NULL;
56e93d26
JQ
2387 }
2388
84593a08 2389 xbzrle_cleanup();
f0afa331 2390 compress_threads_save_cleanup();
7d7c96be 2391 ram_state_cleanup(rsp);
56e93d26
JQ
2392}
2393
6f37bb8b 2394static void ram_state_reset(RAMState *rs)
56e93d26 2395{
6f37bb8b
JQ
2396 rs->last_seen_block = NULL;
2397 rs->last_sent_block = NULL;
269ace29 2398 rs->last_page = 0;
6f37bb8b 2399 rs->last_version = ram_list.version;
1a373522 2400 rs->xbzrle_enabled = false;
56e93d26
JQ
2401}
2402
2403#define MAX_WAIT 50 /* ms, half buffered_file limit */
2404
e0b266f0
DDAG
2405/* **** functions for postcopy ***** */
2406
ced1c616
PB
2407void ram_postcopy_migrated_memory_release(MigrationState *ms)
2408{
2409 struct RAMBlock *block;
ced1c616 2410
fbd162e6 2411 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2412 unsigned long *bitmap = block->bmap;
2413 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2414 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2415
2416 while (run_start < range) {
2417 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2418 ram_discard_range(block->idstr,
2419 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2420 ((ram_addr_t)(run_end - run_start))
2421 << TARGET_PAGE_BITS);
ced1c616
PB
2422 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2423 }
2424 }
2425}
2426
3d0684b2
JQ
2427/**
2428 * postcopy_send_discard_bm_ram: discard a RAMBlock
2429 *
e0b266f0 2430 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2431 *
2432 * @ms: current migration state
89dab31b 2433 * @block: RAMBlock to discard
e0b266f0 2434 */
9e7d1223 2435static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2436{
6b6712ef 2437 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2438 unsigned long current;
1e7cf8c3 2439 unsigned long *bitmap = block->bmap;
e0b266f0 2440
6b6712ef 2441 for (current = 0; current < end; ) {
1e7cf8c3 2442 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2443 unsigned long zero, discard_length;
e0b266f0 2444
33a5cb62
WY
2445 if (one >= end) {
2446 break;
2447 }
e0b266f0 2448
1e7cf8c3 2449 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2450
2451 if (zero >= end) {
2452 discard_length = end - one;
e0b266f0 2453 } else {
33a5cb62
WY
2454 discard_length = zero - one;
2455 }
810cf2bb 2456 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2457 current = one + discard_length;
e0b266f0 2458 }
e0b266f0
DDAG
2459}
2460
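/*
 * Illustrative sketch (not part of the original source): the run-length
 * walk above as a stand-alone helper.  For a bitmap with dirty bits at
 * indexes 2..5 it reports one range starting at 2 with length 4.
 */
static unsigned long example_next_dirty_run(unsigned long *bitmap,
                                            unsigned long end,
                                            unsigned long start,
                                            unsigned long *length)
{
    unsigned long one = find_next_bit(bitmap, end, start);
    unsigned long zero;

    if (one >= end) {
        *length = 0;
        return end;
    }
    zero = find_next_zero_bit(bitmap, end, one + 1);
    *length = (zero >= end ? end : zero) - one;
    return one;
}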
f30c2e5b
PX
2461static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2462
3d0684b2
JQ
2463/**
2464 * postcopy_each_ram_send_discard: discard all RAMBlocks
2465 *
e0b266f0
DDAG
2466 * Utility for the outgoing postcopy code.
2467 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2468 * passing it bitmap indexes and name.
e0b266f0
DDAG
2469 * (qemu_ram_foreach_block ends up passing unscaled lengths
2470 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2471 *
2472 * @ms: current migration state
e0b266f0 2473 */
739fcc1b 2474static void postcopy_each_ram_send_discard(MigrationState *ms)
e0b266f0
DDAG
2475{
2476 struct RAMBlock *block;
e0b266f0 2477
fbd162e6 2478 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2479 postcopy_discard_send_init(ms, block->idstr);
e0b266f0 2480
f30c2e5b
PX
2481 /*
2482 * Deal with TPS != HPS and huge pages. It discards any partially sent
2483 * host-page size chunks and marks any partially dirty host-page size
2484 * chunks as all dirty. In this case the host-page is the host-page
2485 * for the particular RAMBlock, i.e. it might be a huge page.
2486 */
2487 postcopy_chunk_hostpages_pass(ms, block);
2488
e0b266f0
DDAG
2489 /*
2490 * Postcopy sends chunks of bitmap over the wire, but it
2491 * just needs indexes at this point, avoids it having
2492 * target page specific code.
2493 */
739fcc1b 2494 postcopy_send_discard_bm_ram(ms, block);
810cf2bb 2495 postcopy_discard_send_finish(ms);
e0b266f0 2496 }
e0b266f0
DDAG
2497}
2498
3d0684b2 2499/**
8324ef86 2500 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2501 *
2502 * Helper for postcopy_chunk_hostpages; it's called twice to
2503 * canonicalize the two bitmaps, that are similar, but one is
2504 * inverted.
99e314eb 2505 *
3d0684b2
JQ
2506 * Postcopy requires that all target pages in a hostpage are dirty or
2507 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2508 *
3d0684b2 2509 * @ms: current migration state
3d0684b2 2510 * @block: block that contains the page we want to canonicalize
99e314eb 2511 */
1e7cf8c3 2512static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2513{
53518d94 2514 RAMState *rs = ram_state;
6b6712ef 2515 unsigned long *bitmap = block->bmap;
29c59172 2516 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2517 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2518 unsigned long run_start;
2519
29c59172
DDAG
2520 if (block->page_size == TARGET_PAGE_SIZE) {
2521 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2522 return;
2523 }
2524
1e7cf8c3
WY
2525 /* Find a dirty page */
2526 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2527
6b6712ef 2528 while (run_start < pages) {
99e314eb
DDAG
2529
2530 /*
2531 * If the start of this run of pages is in the middle of a host
2532 * page, then we need to fixup this host page.
2533 */
9dec3cc3 2534 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2535 /* Find the end of this run */
1e7cf8c3 2536 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2537 /*
2538 * If the end isn't at the start of a host page, then the
2539 * run doesn't finish at the end of a host page
2540 * and we need to discard.
2541 */
99e314eb
DDAG
2542 }
2543
9dec3cc3 2544 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2545 unsigned long page;
dad45ab2
WY
2546 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2547 host_ratio);
2548 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2549
99e314eb
DDAG
2550 /* Clean up the bitmap */
2551 for (page = fixup_start_addr;
2552 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2553 /*
2554 * Remark them as dirty, updating the count for any pages
2555 * that weren't previously dirty.
2556 */
0d8ec885 2557 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2558 }
2559 }
2560
1e7cf8c3
WY
2561 /* Find the next dirty page for the next iteration */
2562 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2563 }
2564}
2565
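/*
 * Illustrative sketch (not part of the original source): the alignment
 * arithmetic used above.  With 2 MiB host pages and 4 KiB target pages
 * host_ratio is 512, so a dirty run starting at target page 700 causes
 * target pages 512..1023 to be re-marked dirty.
 */
static void example_fixup_span(unsigned long run_start,
                               unsigned int host_ratio,
                               unsigned long *fixup_start,
                               unsigned long *fixup_end)
{
    *fixup_start = QEMU_ALIGN_DOWN(run_start, host_ratio);
    *fixup_end = *fixup_start + host_ratio;
}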
3d0684b2
JQ
2566/**
2567 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2568 *
e0b266f0
DDAG
2569 * Transmit the set of pages to be discarded after precopy to the target;
2570 * these are pages that:
2571 * a) Have been previously transmitted but are now dirty again
2572 * b) Pages that have never been transmitted, this ensures that
2573 * any pages on the destination that have been mapped by background
2574 * tasks get discarded (transparent huge pages is the specific concern)
2575 * Hopefully this is pretty sparse
3d0684b2
JQ
2576 *
2577 * @ms: current migration state
e0b266f0 2578 */
739fcc1b 2579void ram_postcopy_send_discard_bitmap(MigrationState *ms)
e0b266f0 2580{
53518d94 2581 RAMState *rs = ram_state;
e0b266f0 2582
89ac5a1d 2583 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
2584
2585 /* This should be our last sync, the src is now paused */
eb859c53 2586 migration_bitmap_sync(rs);
e0b266f0 2587
6b6712ef
JQ
2588 /* Easiest way to make sure we don't resume in the middle of a host-page */
2589 rs->last_seen_block = NULL;
2590 rs->last_sent_block = NULL;
2591 rs->last_page = 0;
e0b266f0 2592
739fcc1b 2593 postcopy_each_ram_send_discard(ms);
e0b266f0 2594
739fcc1b 2595 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2596}
2597
3d0684b2
JQ
2598/**
2599 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2600 *
3d0684b2 2601 * Returns zero on success
e0b266f0 2602 *
36449157
JQ
2603 * @rbname: name of the RAMBlock of the request. NULL means the
2604 * same as the last one.
3d0684b2
JQ
2605 * @start: RAMBlock starting page
2606 * @length: RAMBlock size
e0b266f0 2607 */
aaa2064c 2608int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 2609{
36449157 2610 trace_ram_discard_range(rbname, start, length);
d3a5038c 2611
89ac5a1d 2612 RCU_READ_LOCK_GUARD();
36449157 2613 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2614
2615 if (!rb) {
36449157 2616 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 2617 return -1;
e0b266f0
DDAG
2618 }
2619
814bb08f
PX
2620 /*
2621 * On source VM, we don't need to update the received bitmap since
2622 * we don't even have one.
2623 */
2624 if (rb->receivedmap) {
2625 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2626 length >> qemu_target_page_bits());
2627 }
2628
03acb4e9 2629 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2630}
2631
84593a08
PX
2632/*
2633 * For every allocation, we will try not to crash the VM if the
2634 * allocation fails.
2635 */
2636static int xbzrle_init(void)
2637{
2638 Error *local_err = NULL;
2639
2640 if (!migrate_use_xbzrle()) {
2641 return 0;
2642 }
2643
2644 XBZRLE_cache_lock();
2645
2646 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2647 if (!XBZRLE.zero_target_page) {
2648 error_report("%s: Error allocating zero page", __func__);
2649 goto err_out;
2650 }
2651
2652 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2653 TARGET_PAGE_SIZE, &local_err);
2654 if (!XBZRLE.cache) {
2655 error_report_err(local_err);
2656 goto free_zero_page;
2657 }
2658
2659 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2660 if (!XBZRLE.encoded_buf) {
2661 error_report("%s: Error allocating encoded_buf", __func__);
2662 goto free_cache;
2663 }
2664
2665 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2666 if (!XBZRLE.current_buf) {
2667 error_report("%s: Error allocating current_buf", __func__);
2668 goto free_encoded_buf;
2669 }
2670
2671 /* We are all good */
2672 XBZRLE_cache_unlock();
2673 return 0;
2674
2675free_encoded_buf:
2676 g_free(XBZRLE.encoded_buf);
2677 XBZRLE.encoded_buf = NULL;
2678free_cache:
2679 cache_fini(XBZRLE.cache);
2680 XBZRLE.cache = NULL;
2681free_zero_page:
2682 g_free(XBZRLE.zero_target_page);
2683 XBZRLE.zero_target_page = NULL;
2684err_out:
2685 XBZRLE_cache_unlock();
2686 return -ENOMEM;
2687}
2688
53518d94 2689static int ram_state_init(RAMState **rsp)
56e93d26 2690{
7d00ee6a
PX
2691 *rsp = g_try_new0(RAMState, 1);
2692
2693 if (!*rsp) {
2694 error_report("%s: Init ramstate fail", __func__);
2695 return -1;
2696 }
53518d94
JQ
2697
2698 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2699 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2700 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2701
7d00ee6a 2702 /*
40c4d4a8
IR
2703 * Count the total number of pages used by ram blocks not including any
2704 * gaps due to alignment or unplugs.
03158519 2705 * This must match with the initial values of dirty bitmap.
7d00ee6a 2706 */
40c4d4a8 2707 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
2708 ram_state_reset(*rsp);
2709
2710 return 0;
2711}
2712
d6eff5d7 2713static void ram_list_init_bitmaps(void)
7d00ee6a 2714{
002cad6b 2715 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
2716 RAMBlock *block;
2717 unsigned long pages;
002cad6b 2718 uint8_t shift;
56e93d26 2719
0827b9e9
AA
2720 /* Skip setting bitmap if there is no RAM */
2721 if (ram_bytes_total()) {
002cad6b
PX
2722 shift = ms->clear_bitmap_shift;
2723 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2724 error_report("clear_bitmap_shift (%u) too big, using "
2725 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2726 shift = CLEAR_BITMAP_SHIFT_MAX;
2727 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2728 error_report("clear_bitmap_shift (%u) too small, using "
2729 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2730 shift = CLEAR_BITMAP_SHIFT_MIN;
2731 }
2732
fbd162e6 2733 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 2734 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
2735 /*
2736 * The initial dirty bitmap for migration must be set with all
2737 * ones to make sure we'll migrate every guest RAM page to
2738 * destination.
40c4d4a8
IR
2739 * Here we set RAMBlock.bmap all to 1 because when we restart a
2740 * new migration after a failed migration, ram_list.
2741 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2742 * guest memory.
03158519 2743 */
6b6712ef 2744 block->bmap = bitmap_new(pages);
40c4d4a8 2745 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
2746 block->clear_bmap_shift = shift;
2747 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 2748 }
f3f491fc 2749 }
d6eff5d7
PX
2750}
2751
be39b4cd
DH
2752static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2753{
2754 unsigned long pages;
2755 RAMBlock *rb;
2756
2757 RCU_READ_LOCK_GUARD();
2758
2759 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2760 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2761 rs->migration_dirty_pages -= pages;
2762 }
2763}
2764
d6eff5d7
PX
2765static void ram_init_bitmaps(RAMState *rs)
2766{
2767 /* For memory_global_dirty_log_start below. */
2768 qemu_mutex_lock_iothread();
2769 qemu_mutex_lock_ramlist();
f3f491fc 2770
89ac5a1d
DDAG
2771 WITH_RCU_READ_LOCK_GUARD() {
2772 ram_list_init_bitmaps();
278e2f55
AG
2773 /* We don't use dirty log with background snapshots */
2774 if (!migrate_background_snapshot()) {
63b41db4 2775 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
278e2f55
AG
2776 migration_bitmap_sync_precopy(rs);
2777 }
89ac5a1d 2778 }
56e93d26 2779 qemu_mutex_unlock_ramlist();
49877834 2780 qemu_mutex_unlock_iothread();
be39b4cd
DH
2781
2782 /*
2783 * After an eventual first bitmap sync, fixup the initial bitmap
2784 * containing all 1s to exclude any discarded pages from migration.
2785 */
2786 migration_bitmap_clear_discarded_pages(rs);
d6eff5d7
PX
2787}
2788
2789static int ram_init_all(RAMState **rsp)
2790{
2791 if (ram_state_init(rsp)) {
2792 return -1;
2793 }
2794
2795 if (xbzrle_init()) {
2796 ram_state_cleanup(rsp);
2797 return -1;
2798 }
2799
2800 ram_init_bitmaps(*rsp);
a91246c9
HZ
2801
2802 return 0;
2803}
2804
08614f34
PX
2805static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2806{
2807 RAMBlock *block;
2808 uint64_t pages = 0;
2809
2810 /*
2811 * Postcopy is not using xbzrle/compression, so no need for that.
2812 * Also, since the source is already halted, we don't need to care
2813 * about dirty page logging either.
2814 */
2815
fbd162e6 2816 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
2817 pages += bitmap_count_one(block->bmap,
2818 block->used_length >> TARGET_PAGE_BITS);
2819 }
2820
2821 /* This may not be aligned with current bitmaps. Recalculate. */
2822 rs->migration_dirty_pages = pages;
2823
1a373522 2824 ram_state_reset(rs);
08614f34
PX
2825
2826 /* Update RAMState cache of output QEMUFile */
2827 rs->f = out;
2828
2829 trace_ram_state_resume_prepare(pages);
2830}
2831
6bcb05fc
WW
2832/*
2833 * This function clears bits of the free pages reported by the caller from the
2834 * migration dirty bitmap. @addr is the host address corresponding to the
2835 * start of the continuous guest free pages, and @len is the total bytes of
2836 * those pages.
2837 */
2838void qemu_guest_free_page_hint(void *addr, size_t len)
2839{
2840 RAMBlock *block;
2841 ram_addr_t offset;
2842 size_t used_len, start, npages;
2843 MigrationState *s = migrate_get_current();
2844
2845 /* This function is currently expected to be used during live migration */
2846 if (!migration_is_setup_or_active(s->state)) {
2847 return;
2848 }
2849
2850 for (; len > 0; len -= used_len, addr += used_len) {
2851 block = qemu_ram_block_from_host(addr, false, &offset);
2852 if (unlikely(!block || offset >= block->used_length)) {
2853 /*
2854 * The implementation might not support RAMBlock resize during
2855 * live migration, but it could happen in theory with future
2856 * updates. So we add a check here to capture that case.
2857 */
2858 error_report_once("%s unexpected error", __func__);
2859 return;
2860 }
2861
2862 if (len <= block->used_length - offset) {
2863 used_len = len;
2864 } else {
2865 used_len = block->used_length - offset;
2866 }
2867
2868 start = offset >> TARGET_PAGE_BITS;
2869 npages = used_len >> TARGET_PAGE_BITS;
2870
2871 qemu_mutex_lock(&ram_state->bitmap_mutex);
3143577d
WW
2872 /*
2873 * The skipped free pages are equivalent to having been sent from clear_bmap's
2874 * perspective, so clear the bits from the memory region bitmap which
2875 * are initially set. Otherwise those skipped pages will be sent in
2876 * the next round after syncing from the memory region bitmap.
2877 */
1230a25f 2878 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
6bcb05fc
WW
2879 ram_state->migration_dirty_pages -=
2880 bitmap_count_one_with_offset(block->bmap, start, npages);
2881 bitmap_clear(block->bmap, start, npages);
2882 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2883 }
2884}
2885
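/*
 * Illustrative sketch (not part of the original source): how a free-page
 * hinting backend (e.g. virtio-balloon free page reporting) might call
 * the helper above for a run of guest-free pages starting at host_addr.
 */
static void example_report_free_run(void *host_addr, size_t npages)
{
    qemu_guest_free_page_hint(host_addr, npages * TARGET_PAGE_SIZE);
}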
3d0684b2
JQ
2886/*
2887 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2888 * a long-running RCU critical section. When RCU reclaims in the code
2889 * start to become numerous it will be necessary to reduce the
2890 * granularity of these critical sections.
2891 */
2892
3d0684b2
JQ
2893/**
2894 * ram_save_setup: Setup RAM for migration
2895 *
2896 * Returns zero to indicate success and negative for error
2897 *
2898 * @f: QEMUFile where to send the data
2899 * @opaque: RAMState pointer
2900 */
a91246c9
HZ
2901static int ram_save_setup(QEMUFile *f, void *opaque)
2902{
53518d94 2903 RAMState **rsp = opaque;
a91246c9
HZ
2904 RAMBlock *block;
2905
dcaf446e
XG
2906 if (compress_threads_save_setup()) {
2907 return -1;
2908 }
2909
a91246c9
HZ
2910 /* migration has already setup the bitmap, reuse it. */
2911 if (!migration_in_colo_state()) {
7d00ee6a 2912 if (ram_init_all(rsp) != 0) {
dcaf446e 2913 compress_threads_save_cleanup();
a91246c9 2914 return -1;
53518d94 2915 }
a91246c9 2916 }
53518d94 2917 (*rsp)->f = f;
a91246c9 2918
0e6ebd48
DDAG
2919 WITH_RCU_READ_LOCK_GUARD() {
2920 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 2921
0e6ebd48
DDAG
2922 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2923 qemu_put_byte(f, strlen(block->idstr));
2924 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2925 qemu_put_be64(f, block->used_length);
2926 if (migrate_postcopy_ram() && block->page_size !=
2927 qemu_host_page_size) {
2928 qemu_put_be64(f, block->page_size);
2929 }
2930 if (migrate_ignore_shared()) {
2931 qemu_put_be64(f, block->mr->addr);
2932 }
fbd162e6 2933 }
56e93d26
JQ
2934 }
2935
56e93d26
JQ
2936 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2937 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2938
99f2c6fb 2939 multifd_send_sync_main(f);
56e93d26 2940 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 2941 qemu_fflush(f);
56e93d26
JQ
2942
2943 return 0;
2944}
2945
3d0684b2
JQ
2946/**
2947 * ram_save_iterate: iterative stage for migration
2948 *
2949 * Returns zero to indicate success and negative for error
2950 *
2951 * @f: QEMUFile where to send the data
2952 * @opaque: RAMState pointer
2953 */
56e93d26
JQ
2954static int ram_save_iterate(QEMUFile *f, void *opaque)
2955{
53518d94
JQ
2956 RAMState **temp = opaque;
2957 RAMState *rs = *temp;
3d4095b2 2958 int ret = 0;
56e93d26
JQ
2959 int i;
2960 int64_t t0;
5c90308f 2961 int done = 0;
56e93d26 2962
b2557345
PL
2963 if (blk_mig_bulk_active()) {
2964 /* Avoid transferring ram during bulk phase of block migration as
2965 * the bulk phase will usually take a long time and transferring
2966 * ram updates during that time is pointless. */
2967 goto out;
2968 }
2969
63268c49
PX
2970 /*
2971 * We'll hold this lock a little longer than usual, but it's okay for two
2972 * reasons.  Firstly, the only other thread that could take it is the one
2973 * calling qemu_guest_free_page_hint(), which should be rare; secondly, see
2974 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2975 * guarantees that we release it on a regular basis.
2976 */
2977 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
2978 WITH_RCU_READ_LOCK_GUARD() {
2979 if (ram_list.version != rs->last_version) {
2980 ram_state_reset(rs);
2981 }
56e93d26 2982
89ac5a1d
DDAG
2983 /* Read version before ram_list.blocks */
2984 smp_rmb();
56e93d26 2985
89ac5a1d 2986 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 2987
89ac5a1d
DDAG
2988 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2989 i = 0;
2990 while ((ret = qemu_file_rate_limit(f)) == 0 ||
a1fe28df 2991 postcopy_has_request(rs)) {
89ac5a1d 2992 int pages;
e03a34f8 2993
89ac5a1d
DDAG
2994 if (qemu_file_get_error(f)) {
2995 break;
2996 }
e8f3735f 2997
05931ec5 2998 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
2999 /* no more pages to send */
3000 if (pages == 0) {
3001 done = 1;
3002 break;
3003 }
e8f3735f 3004
89ac5a1d
DDAG
3005 if (pages < 0) {
3006 qemu_file_set_error(f, pages);
56e93d26
JQ
3007 break;
3008 }
89ac5a1d
DDAG
3009
3010 rs->target_page_count += pages;
3011
644acf99
WY
3012 /*
3013 * During postcopy, it is necessary to make sure one whole host
3014 * page is sent in one chunk.
3015 */
3016 if (migrate_postcopy_ram()) {
3017 flush_compressed_data(rs);
3018 }
3019
89ac5a1d
DDAG
3020 /*
3021 * we want to check in the 1st loop, just in case it was the 1st
3022 * time and we had to sync the dirty bitmap.
3023 * qemu_clock_get_ns() is a bit expensive, so we only check every
3024 * few iterations
3025 */
3026 if ((i & 63) == 0) {
3027 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3028 1000000;
3029 if (t1 > MAX_WAIT) {
3030 trace_ram_save_iterate_big_wait(t1, i);
3031 break;
3032 }
3033 }
3034 i++;
56e93d26 3035 }
56e93d26 3036 }
63268c49 3037 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26
JQ
3038
3039 /*
3040 * Must occur before EOS (or any QEMUFile operation)
3041 * because of RDMA protocol.
3042 */
3043 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3044
b2557345 3045out:
b69a0227
JQ
3046 if (ret >= 0
3047 && migration_is_setup_or_active(migrate_get_current()->state)) {
99f2c6fb 3048 multifd_send_sync_main(rs->f);
3d4095b2
JQ
3049 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3050 qemu_fflush(f);
4c2d0f6d 3051 ram_transferred_add(8);
56e93d26 3052
3d4095b2
JQ
3053 ret = qemu_file_get_error(f);
3054 }
56e93d26
JQ
3055 if (ret < 0) {
3056 return ret;
3057 }
3058
5c90308f 3059 return done;
56e93d26
JQ
3060}
3061
3d0684b2
JQ
3062/**
3063 * ram_save_complete: function called to send the remaining amount of ram
3064 *
e8f3735f 3065 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3066 *
3067 * Called with iothread lock
3068 *
3069 * @f: QEMUFile where to send the data
3070 * @opaque: RAMState pointer
3071 */
56e93d26
JQ
3072static int ram_save_complete(QEMUFile *f, void *opaque)
3073{
53518d94
JQ
3074 RAMState **temp = opaque;
3075 RAMState *rs = *temp;
e8f3735f 3076 int ret = 0;
6f37bb8b 3077
05931ec5
JQ
3078 rs->last_stage = !migration_in_colo_state();
3079
89ac5a1d
DDAG
3080 WITH_RCU_READ_LOCK_GUARD() {
3081 if (!migration_in_postcopy()) {
3082 migration_bitmap_sync_precopy(rs);
3083 }
56e93d26 3084
89ac5a1d 3085 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3086
89ac5a1d 3087 /* try transferring iterative blocks of memory */
56e93d26 3088
89ac5a1d
DDAG
3089 /* flush all remaining blocks regardless of rate limiting */
3090 while (true) {
3091 int pages;
56e93d26 3092
05931ec5 3093 pages = ram_find_and_save_block(rs);
89ac5a1d
DDAG
3094 /* no more blocks to send */
3095 if (pages == 0) {
3096 break;
3097 }
3098 if (pages < 0) {
3099 ret = pages;
3100 break;
3101 }
e8f3735f 3102 }
56e93d26 3103
89ac5a1d
DDAG
3104 flush_compressed_data(rs);
3105 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3106 }
d09a6fde 3107
3d4095b2 3108 if (ret >= 0) {
99f2c6fb 3109 multifd_send_sync_main(rs->f);
3d4095b2
JQ
3110 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3111 qemu_fflush(f);
3112 }
56e93d26 3113
e8f3735f 3114 return ret;
56e93d26
JQ
3115}
3116
c31b098f 3117static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3118 uint64_t *res_precopy_only,
3119 uint64_t *res_compatible,
3120 uint64_t *res_postcopy_only)
56e93d26 3121{
53518d94
JQ
3122 RAMState **temp = opaque;
3123 RAMState *rs = *temp;
56e93d26
JQ
3124 uint64_t remaining_size;
3125
9edabd4d 3126 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3127
5727309d 3128 if (!migration_in_postcopy() &&
663e6c1d 3129 remaining_size < max_size) {
56e93d26 3130 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3131 WITH_RCU_READ_LOCK_GUARD() {
3132 migration_bitmap_sync_precopy(rs);
3133 }
56e93d26 3134 qemu_mutex_unlock_iothread();
9edabd4d 3135 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3136 }
c31b098f 3137
86e1167e
VSO
3138 if (migrate_postcopy_ram()) {
3139 /* We can do postcopy, and all the data is postcopiable */
47995026 3140 *res_compatible += remaining_size;
86e1167e 3141 } else {
47995026 3142 *res_precopy_only += remaining_size;
86e1167e 3143 }
56e93d26
JQ
3144}
3145
3146static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3147{
3148 unsigned int xh_len;
3149 int xh_flags;
063e760a 3150 uint8_t *loaded_data;
56e93d26 3151
56e93d26
JQ
3152 /* extract RLE header */
3153 xh_flags = qemu_get_byte(f);
3154 xh_len = qemu_get_be16(f);
3155
3156 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3157 error_report("Failed to load XBZRLE page - wrong compression!");
3158 return -1;
3159 }
3160
3161 if (xh_len > TARGET_PAGE_SIZE) {
3162 error_report("Failed to load XBZRLE page - len overflow!");
3163 return -1;
3164 }
f265e0e4 3165 loaded_data = XBZRLE.decoded_buf;
56e93d26 3166 /* load data and decode */
f265e0e4 3167 /* it can change loaded_data to point to an internal buffer */
063e760a 3168 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3169
3170 /* decode RLE */
063e760a 3171 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3172 TARGET_PAGE_SIZE) == -1) {
3173 error_report("Failed to load XBZRLE page - decode error!");
3174 return -1;
3175 }
3176
3177 return 0;
3178}
3179
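/*
 * Illustrative sketch (an assumption mirroring the load path above, not
 * taken from the original source): the on-the-wire framing consumed by
 * load_xbzrle() -- a one-byte flag, a big-endian 16-bit encoded length,
 * then the encoded bytes.
 */
static void example_put_xbzrle_frame(QEMUFile *f, const uint8_t *enc,
                                     uint16_t enc_len)
{
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, enc_len);
    qemu_put_buffer(f, enc, enc_len);
}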
3d0684b2
JQ
3180/**
3181 * ram_block_from_stream: read a RAMBlock id from the migration stream
3182 *
3183 * Must be called from within a rcu critical section.
3184 *
56e93d26 3185 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3186 *
3d0684b2
JQ
3187 * @f: QEMUFile where to read the data from
3188 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3189 */
3d0684b2 3190static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26 3191{
49324e93 3192 static RAMBlock *block;
56e93d26
JQ
3193 char id[256];
3194 uint8_t len;
3195
3196 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3197 if (!block) {
56e93d26
JQ
3198 error_report("Ack, bad migration stream!");
3199 return NULL;
3200 }
4c4bad48 3201 return block;
56e93d26
JQ
3202 }
3203
3204 len = qemu_get_byte(f);
3205 qemu_get_buffer(f, (uint8_t *)id, len);
3206 id[len] = 0;
3207
e3dd7493 3208 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3209 if (!block) {
3210 error_report("Can't find block %s", id);
3211 return NULL;
56e93d26
JQ
3212 }
3213
fbd162e6 3214 if (ramblock_is_ignored(block)) {
b895de50
CLG
3215 error_report("block %s should not be migrated !", id);
3216 return NULL;
3217 }
3218
4c4bad48
HZ
3219 return block;
3220}
3221
3222static inline void *host_from_ram_block_offset(RAMBlock *block,
3223 ram_addr_t offset)
3224{
3225 if (!offset_in_ramblock(block, offset)) {
3226 return NULL;
3227 }
3228
3229 return block->host + offset;
56e93d26
JQ
3230}
3231
6a23f639
DH
3232static void *host_page_from_ram_block_offset(RAMBlock *block,
3233 ram_addr_t offset)
3234{
3235 /* Note: Explicitly no check against offset_in_ramblock(). */
3236 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3237 block->page_size);
3238}
3239
3240static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3241 ram_addr_t offset)
3242{
3243 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3244}
3245
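/*
 * Illustrative sketch (not part of the original source): with a RAMBlock
 * backed by 2 MiB pages (block->page_size == 0x200000), an offset of
 * 0x201234 maps to the host page starting at block->host + 0x200000 and
 * an in-host-page offset of 0x1234, matching the two helpers above.
 */
static void example_split_host_page(RAMBlock *block, ram_addr_t offset,
                                    void **page_start, ram_addr_t *in_page)
{
    *page_start = host_page_from_ram_block_offset(block, offset);
    *in_page = host_page_offset_from_ram_block_offset(block, offset);
}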
13af18f2 3246static inline void *colo_cache_from_block_offset(RAMBlock *block,
8af66371 3247 ram_addr_t offset, bool record_bitmap)
13af18f2
ZC
3248{
3249 if (!offset_in_ramblock(block, offset)) {
3250 return NULL;
3251 }
3252 if (!block->colo_cache) {
3253 error_report("%s: colo_cache is NULL in block :%s",
3254 __func__, block->idstr);
3255 return NULL;
3256 }
7d9acafa
ZC
3257
3258 /*
3259 * During a COLO checkpoint, we need the bitmap of these migrated pages.
3260 * It helps us decide which pages in the ram cache should be flushed
3261 * into VM's RAM later.
3262 */
8af66371
HZ
3263 if (record_bitmap &&
3264 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
7d9acafa
ZC
3265 ram_state->migration_dirty_pages++;
3266 }
13af18f2
ZC
3267 return block->colo_cache + offset;
3268}
3269
3d0684b2
JQ
3270/**
3271 * ram_handle_compressed: handle the zero page case
3272 *
56e93d26
JQ
3273 * If a page (or a whole RDMA chunk) has been
3274 * determined to be zero, then zap it.
3d0684b2
JQ
3275 *
3276 * @host: host address for the zero page
3277 * @ch: what the page is filled from. We only support zero
3278 * @size: size of the zero page
56e93d26
JQ
3279 */
3280void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3281{
bad452a7 3282 if (ch != 0 || !buffer_is_zero(host, size)) {
56e93d26
JQ
3283 memset(host, ch, size);
3284 }
3285}
3286
797ca154
XG
3287/* return the size after decompression, or negative value on error */
3288static int
3289qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3290 const uint8_t *source, size_t source_len)
3291{
3292 int err;
3293
3294 err = inflateReset(stream);
3295 if (err != Z_OK) {
3296 return -1;
3297 }
3298
3299 stream->avail_in = source_len;
3300 stream->next_in = (uint8_t *)source;
3301 stream->avail_out = dest_len;
3302 stream->next_out = dest;
3303
3304 err = inflate(stream, Z_NO_FLUSH);
3305 if (err != Z_STREAM_END) {
3306 return -1;
3307 }
3308
3309 return stream->total_out;
3310}
3311
56e93d26
JQ
3312static void *do_data_decompress(void *opaque)
3313{
3314 DecompressParam *param = opaque;
3315 unsigned long pagesize;
33d151f4 3316 uint8_t *des;
34ab9e97 3317 int len, ret;
56e93d26 3318
33d151f4 3319 qemu_mutex_lock(&param->mutex);
90e56fb4 3320 while (!param->quit) {
33d151f4
LL
3321 if (param->des) {
3322 des = param->des;
3323 len = param->len;
3324 param->des = 0;
3325 qemu_mutex_unlock(&param->mutex);
3326
56e93d26 3327 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3328
3329 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3330 param->compbuf, len);
f548222c 3331 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3332 error_report("decompress data failed");
3333 qemu_file_set_error(decomp_file, ret);
3334 }
73a8912b 3335
33d151f4
LL
3336 qemu_mutex_lock(&decomp_done_lock);
3337 param->done = true;
3338 qemu_cond_signal(&decomp_done_cond);
3339 qemu_mutex_unlock(&decomp_done_lock);
3340
3341 qemu_mutex_lock(&param->mutex);
3342 } else {
3343 qemu_cond_wait(&param->cond, &param->mutex);
3344 }
56e93d26 3345 }
33d151f4 3346 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3347
3348 return NULL;
3349}
3350
34ab9e97 3351static int wait_for_decompress_done(void)
5533b2e9
LL
3352{
3353 int idx, thread_count;
3354
3355 if (!migrate_use_compression()) {
34ab9e97 3356 return 0;
5533b2e9
LL
3357 }
3358
3359 thread_count = migrate_decompress_threads();
3360 qemu_mutex_lock(&decomp_done_lock);
3361 for (idx = 0; idx < thread_count; idx++) {
3362 while (!decomp_param[idx].done) {
3363 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3364 }
3365 }
3366 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3367 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3368}
3369
f0afa331 3370static void compress_threads_load_cleanup(void)
56e93d26
JQ
3371{
3372 int i, thread_count;
3373
3416ab5b
JQ
3374 if (!migrate_use_compression()) {
3375 return;
3376 }
56e93d26
JQ
3377 thread_count = migrate_decompress_threads();
3378 for (i = 0; i < thread_count; i++) {
797ca154
XG
3379 /*
3380 * we use it as an indicator of whether the thread is
3381 * properly init'd or not
3382 */
3383 if (!decomp_param[i].compbuf) {
3384 break;
3385 }
3386
56e93d26 3387 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3388 decomp_param[i].quit = true;
56e93d26
JQ
3389 qemu_cond_signal(&decomp_param[i].cond);
3390 qemu_mutex_unlock(&decomp_param[i].mutex);
3391 }
3392 for (i = 0; i < thread_count; i++) {
797ca154
XG
3393 if (!decomp_param[i].compbuf) {
3394 break;
3395 }
3396
56e93d26
JQ
3397 qemu_thread_join(decompress_threads + i);
3398 qemu_mutex_destroy(&decomp_param[i].mutex);
3399 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3400 inflateEnd(&decomp_param[i].stream);
56e93d26 3401 g_free(decomp_param[i].compbuf);
797ca154 3402 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3403 }
3404 g_free(decompress_threads);
3405 g_free(decomp_param);
56e93d26
JQ
3406 decompress_threads = NULL;
3407 decomp_param = NULL;
34ab9e97 3408 decomp_file = NULL;
56e93d26
JQ
3409}
3410
34ab9e97 3411static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3412{
3413 int i, thread_count;
3414
3415 if (!migrate_use_compression()) {
3416 return 0;
3417 }
3418
3419 thread_count = migrate_decompress_threads();
3420 decompress_threads = g_new0(QemuThread, thread_count);
3421 decomp_param = g_new0(DecompressParam, thread_count);
3422 qemu_mutex_init(&decomp_done_lock);
3423 qemu_cond_init(&decomp_done_cond);
34ab9e97 3424 decomp_file = f;
797ca154
XG
3425 for (i = 0; i < thread_count; i++) {
3426 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3427 goto exit;
3428 }
3429
3430 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3431 qemu_mutex_init(&decomp_param[i].mutex);
3432 qemu_cond_init(&decomp_param[i].cond);
3433 decomp_param[i].done = true;
3434 decomp_param[i].quit = false;
3435 qemu_thread_create(decompress_threads + i, "decompress",
3436 do_data_decompress, decomp_param + i,
3437 QEMU_THREAD_JOINABLE);
3438 }
3439 return 0;
3440exit:
3441 compress_threads_load_cleanup();
3442 return -1;
3443}
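/*
 * Handshake between the load path and the decompress threads (as
 * implemented above and in do_data_decompress()):
 *  - each DecompressParam has its own mutex/cond protecting 'des', 'len'
 *    and 'quit'; setting 'des' hands a compressed page to the worker,
 *    and 'quit' asks it to exit;
 *  - 'done' is protected by decomp_done_lock/decomp_done_cond and is used
 *    by decompress_data_with_multi_threads() to find an idle worker and
 *    by wait_for_decompress_done() to drain all of them.
 */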
3444
c1bc6626 3445static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3446 void *host, int len)
3447{
3448 int idx, thread_count;
3449
3450 thread_count = migrate_decompress_threads();
37396950 3451 QEMU_LOCK_GUARD(&decomp_done_lock);
56e93d26
JQ
3452 while (true) {
3453 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3454 if (decomp_param[idx].done) {
33d151f4
LL
3455 decomp_param[idx].done = false;
3456 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3457 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3458 decomp_param[idx].des = host;
3459 decomp_param[idx].len = len;
33d151f4
LL
3460 qemu_cond_signal(&decomp_param[idx].cond);
3461 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3462 break;
3463 }
3464 }
3465 if (idx < thread_count) {
3466 break;
73a8912b
LL
3467 } else {
3468 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3469 }
3470 }
3471}
3472
b70cb3b4
RL
3473static void colo_init_ram_state(void)
3474{
3475 ram_state_init(&ram_state);
b70cb3b4
RL
3476}
3477
13af18f2
ZC
3478/*
3479 * colo cache: this is for the secondary VM, we cache the whole
3480 * memory of the secondary VM. The global lock must be held when
3481 * calling this helper.
3482 */
3483int colo_init_ram_cache(void)
3484{
3485 RAMBlock *block;
3486
44901b5a
PB
3487 WITH_RCU_READ_LOCK_GUARD() {
3488 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3489 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
8dbe22c6 3490 NULL, false, false);
44901b5a
PB
3491 if (!block->colo_cache) {
3492 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3493 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3494 block->used_length);
3495 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3496 if (block->colo_cache) {
3497 qemu_anon_ram_free(block->colo_cache, block->used_length);
3498 block->colo_cache = NULL;
3499 }
89ac5a1d 3500 }
44901b5a 3501 return -errno;
89ac5a1d 3502 }
e5fdf920
LS
3503 if (!machine_dump_guest_core(current_machine)) {
3504 qemu_madvise(block->colo_cache, block->used_length,
3505 QEMU_MADV_DONTDUMP);
3506 }
13af18f2 3507 }
13af18f2 3508 }
44901b5a 3509
7d9acafa
ZC
3510 /*
3511 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3512 * decide which pages in the cache should be flushed into the SVM's RAM.
3513 * Here we use the same name 'ram_bitmap' as for migration.
3514 */
3515 if (ram_bytes_total()) {
3516 RAMBlock *block;
3517
fbd162e6 3518 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa 3519 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
7d9acafa 3520 block->bmap = bitmap_new(pages);
7d9acafa
ZC
3521 }
3522 }
7d9acafa 3523
b70cb3b4 3524 colo_init_ram_state();
13af18f2 3525 return 0;
13af18f2
ZC
3526}
3527
0393031a
HZ
3528/* TODO: duplicated with ram_init_bitmaps */
3529void colo_incoming_start_dirty_log(void)
3530{
3531 RAMBlock *block = NULL;
3532 /* For memory_global_dirty_log_start below. */
3533 qemu_mutex_lock_iothread();
3534 qemu_mutex_lock_ramlist();
3535
3536 memory_global_dirty_log_sync();
3537 WITH_RCU_READ_LOCK_GUARD() {
3538 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3539 ramblock_sync_dirty_bitmap(ram_state, block);
3540 /* Discard this dirty bitmap record */
3541 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3542 }
63b41db4 3543 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
0393031a
HZ
3544 }
3545 ram_state->migration_dirty_pages = 0;
3546 qemu_mutex_unlock_ramlist();
3547 qemu_mutex_unlock_iothread();
3548}
3549
13af18f2
ZC
3550/* The global lock must be held to call this helper */
3551void colo_release_ram_cache(void)
3552{
3553 RAMBlock *block;
3554
63b41db4 3555 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
fbd162e6 3556 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3557 g_free(block->bmap);
3558 block->bmap = NULL;
3559 }
3560
89ac5a1d
DDAG
3561 WITH_RCU_READ_LOCK_GUARD() {
3562 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3563 if (block->colo_cache) {
3564 qemu_anon_ram_free(block->colo_cache, block->used_length);
3565 block->colo_cache = NULL;
3566 }
13af18f2
ZC
3567 }
3568 }
0393031a 3569 ram_state_cleanup(&ram_state);
13af18f2
ZC
3570}
3571
f265e0e4
JQ
3572/**
3573 * ram_load_setup: Setup RAM for migration incoming side
3574 *
3575 * Returns zero to indicate success and negative for error
3576 *
3577 * @f: QEMUFile where to receive the data
3578 * @opaque: RAMState pointer
3579 */
3580static int ram_load_setup(QEMUFile *f, void *opaque)
3581{
34ab9e97 3582 if (compress_threads_load_setup(f)) {
797ca154
XG
3583 return -1;
3584 }
3585
f265e0e4 3586 xbzrle_load_setup();
f9494614 3587 ramblock_recv_map_init();
13af18f2 3588
f265e0e4
JQ
3589 return 0;
3590}
3591
3592static int ram_load_cleanup(void *opaque)
3593{
f9494614 3594 RAMBlock *rb;
56eb90af 3595
fbd162e6 3596 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 3597 qemu_ram_block_writeback(rb);
56eb90af
JH
3598 }
3599
f265e0e4 3600 xbzrle_load_cleanup();
f0afa331 3601 compress_threads_load_cleanup();
f9494614 3602
fbd162e6 3603 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3604 g_free(rb->receivedmap);
3605 rb->receivedmap = NULL;
3606 }
13af18f2 3607
f265e0e4
JQ
3608 return 0;
3609}
3610
3d0684b2
JQ
3611/**
3612 * ram_postcopy_incoming_init: allocate postcopy data structures
3613 *
3614 * Returns 0 for success and negative on error
3615 *
3616 * @mis: current migration incoming state
3617 *
3618 * Allocate the data structures etc. needed by incoming migration with
3619 * postcopy-ram. postcopy-ram's similarly named
3620 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3621 */
3622int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3623{
c136180c 3624 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3625}
3626
3d0684b2
JQ
3627/**
3628 * ram_load_postcopy: load a page in postcopy case
3629 *
3630 * Returns 0 for success or -errno in case of error
3631 *
a7180877
DDAG
3632 * Called in postcopy mode by ram_load().
3633 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3634 *
3635 * @f: QEMUFile to receive the data from
a7180877
DDAG
3636 */
3637static int ram_load_postcopy(QEMUFile *f)
3638{
3639 int flags = 0, ret = 0;
3640 bool place_needed = false;
1aa83678 3641 bool matches_target_page_size = false;
a7180877
DDAG
3642 MigrationIncomingState *mis = migration_incoming_get_current();
3643 /* Temporary page that is later 'placed' */
3414322a 3644 void *postcopy_host_page = mis->postcopy_tmp_page;
6a23f639 3645 void *host_page = NULL;
ddf35bdf 3646 bool all_zero = true;
4cbb3c63 3647 int target_pages = 0;
a7180877
DDAG
3648
3649 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3650 ram_addr_t addr;
a7180877
DDAG
3651 void *page_buffer = NULL;
3652 void *place_source = NULL;
df9ff5e1 3653 RAMBlock *block = NULL;
a7180877 3654 uint8_t ch;
644acf99 3655 int len;
a7180877
DDAG
3656
3657 addr = qemu_get_be64(f);
7a9ddfbf
PX
3658
3659 /*
3660 * If there is a QEMU file error, we should stop here; in that
3661 * case "addr" may be invalid
3662 */
3663 ret = qemu_file_get_error(f);
3664 if (ret) {
3665 break;
3666 }
3667
a7180877
DDAG
3668 flags = addr & ~TARGET_PAGE_MASK;
3669 addr &= TARGET_PAGE_MASK;
3670
3671 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
644acf99
WY
3672 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3673 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
df9ff5e1 3674 block = ram_block_from_stream(f, flags);
6a23f639
DH
3675 if (!block) {
3676 ret = -EINVAL;
3677 break;
3678 }
4c4bad48 3679
898ba906
DH
3680 /*
3681 * Relying on used_length is racy and can result in false positives.
3682 * We might place pages beyond used_length in case RAM was shrunk
3683 * while in postcopy, which is fine - trying to place via
3684 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3685 */
3686 if (!block->host || addr >= block->postcopy_length) {
a7180877
DDAG
3687 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3688 ret = -EINVAL;
3689 break;
3690 }
4cbb3c63 3691 target_pages++;
1aa83678 3692 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3693 /*
28abd200
DDAG
3694 * Postcopy requires that we place whole host pages atomically;
3695 * these may be huge pages for RAMBlocks that are backed by
3696 * hugetlbfs.
a7180877
DDAG
3697 * To make it atomic, the data is read into a temporary page
3698 * that's moved into place later.
3699 * The migration protocol uses, possibly smaller, target pages;
3700 * however, the source ensures it always sends all the components
91ba442f 3701 * of a host page in one chunk.
a7180877
DDAG
3702 */
3703 page_buffer = postcopy_host_page +
6a23f639
DH
3704 host_page_offset_from_ram_block_offset(block, addr);
3705 /* If all target pages are zero then we can optimise the place */
e5e73b0f 3706 if (target_pages == 1) {
6a23f639
DH
3707 host_page = host_page_from_ram_block_offset(block, addr);
3708 } else if (host_page != host_page_from_ram_block_offset(block,
3709 addr)) {
c53b7ddc 3710 /* not the first target page within the host page */
6a23f639
DH
3711 error_report("Non-same host page %p/%p", host_page,
3712 host_page_from_ram_block_offset(block, addr));
3713 ret = -EINVAL;
3714 break;
a7180877
DDAG
3715 }
3716
3717 /*
3718 * If it's the last part of a host page then we place the host
3719 * page
3720 */
4cbb3c63
WY
3721 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3722 place_needed = true;
4cbb3c63 3723 }
a7180877
DDAG
3724 place_source = postcopy_host_page;
3725 }
3726
3727 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3728 case RAM_SAVE_FLAG_ZERO:
a7180877 3729 ch = qemu_get_byte(f);
2e36bc1b
WY
3730 /*
3731 * We can skip setting page_buffer when
3732 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3733 */
3734 if (ch || !matches_target_page_size) {
3735 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3736 }
a7180877
DDAG
3737 if (ch) {
3738 all_zero = false;
3739 }
3740 break;
3741
3742 case RAM_SAVE_FLAG_PAGE:
3743 all_zero = false;
1aa83678
PX
3744 if (!matches_target_page_size) {
3745 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
3746 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3747 } else {
1aa83678
PX
3748 /*
3749 * For small pages that matches target page size, we
3750 * avoid the qemu_file copy. Instead we directly use
3751 * the buffer of QEMUFile to place the page. Note: we
3752 * cannot do any QEMUFile operation before using that
3753 * buffer to make sure the buffer is valid when
3754 * placing the page.
a7180877
DDAG
3755 */
3756 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3757 TARGET_PAGE_SIZE);
3758 }
3759 break;
644acf99
WY
3760 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3761 all_zero = false;
3762 len = qemu_get_be32(f);
3763 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3764 error_report("Invalid compressed data length: %d", len);
3765 ret = -EINVAL;
3766 break;
3767 }
3768 decompress_data_with_multi_threads(f, page_buffer, len);
3769 break;
3770
a7180877
DDAG
3771 case RAM_SAVE_FLAG_EOS:
3772 /* normal exit */
6df264ac 3773 multifd_recv_sync_main();
a7180877
DDAG
3774 break;
3775 default:
29fccade 3776 error_report("Unknown combination of migration flags: 0x%x"
a7180877
DDAG
3777 " (postcopy mode)", flags);
3778 ret = -EINVAL;
7a9ddfbf
PX
3779 break;
3780 }
3781
644acf99
WY
3782 /* Got the whole host page, wait for decompress before placing. */
3783 if (place_needed) {
3784 ret |= wait_for_decompress_done();
3785 }
3786
7a9ddfbf
PX
3787 /* Detect for any possible file errors */
3788 if (!ret && qemu_file_get_error(f)) {
3789 ret = qemu_file_get_error(f);
a7180877
DDAG
3790 }
3791
7a9ddfbf 3792 if (!ret && place_needed) {
a7180877 3793 if (all_zero) {
6a23f639 3794 ret = postcopy_place_page_zero(mis, host_page, block);
a7180877 3795 } else {
6a23f639
DH
3796 ret = postcopy_place_page(mis, host_page, place_source,
3797 block);
a7180877 3798 }
ddf35bdf
DH
3799 place_needed = false;
3800 target_pages = 0;
3801 /* Assume we have a zero page until we detect something different */
3802 all_zero = true;
a7180877 3803 }
a7180877
DDAG
3804 }
3805
3806 return ret;
3807}
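/*
 * Summary of the host-page assembly done above: target pages are
 * accumulated into mis->postcopy_tmp_page until a whole host page
 * (block->page_size / TARGET_PAGE_SIZE target pages) has been received,
 * and only then is it placed atomically into guest memory with
 * postcopy_place_page() or, for all-zero pages, postcopy_place_page_zero().
 */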
3808
acab30b8
DHB
3809static bool postcopy_is_advised(void)
3810{
3811 PostcopyState ps = postcopy_state_get();
3812 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3813}
3814
3815static bool postcopy_is_running(void)
3816{
3817 PostcopyState ps = postcopy_state_get();
3818 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3819}
3820
e6f4aa18
ZC
3821/*
3822 * Flush content of RAM cache into SVM's memory.
3823 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3824 */
24fa16f8 3825void colo_flush_ram_cache(void)
e6f4aa18
ZC
3826{
3827 RAMBlock *block = NULL;
3828 void *dst_host;
3829 void *src_host;
3830 unsigned long offset = 0;
3831
d1955d22 3832 memory_global_dirty_log_sync();
89ac5a1d
DDAG
3833 WITH_RCU_READ_LOCK_GUARD() {
3834 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3835 ramblock_sync_dirty_bitmap(ram_state, block);
3836 }
d1955d22 3837 }
d1955d22 3838
e6f4aa18 3839 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
3840 WITH_RCU_READ_LOCK_GUARD() {
3841 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 3842
89ac5a1d 3843 while (block) {
a6a83cef 3844 unsigned long num = 0;
e6f4aa18 3845
a6a83cef 3846 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
542147f4
DH
3847 if (!offset_in_ramblock(block,
3848 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
89ac5a1d 3849 offset = 0;
a6a83cef 3850 num = 0;
89ac5a1d
DDAG
3851 block = QLIST_NEXT_RCU(block, next);
3852 } else {
a6a83cef
RL
3853 unsigned long i = 0;
3854
3855 for (i = 0; i < num; i++) {
3856 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3857 }
8bba004c
AR
3858 dst_host = block->host
3859 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3860 src_host = block->colo_cache
3861 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
a6a83cef
RL
3862 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3863 offset += num;
89ac5a1d 3864 }
e6f4aa18
ZC
3865 }
3866 }
e6f4aa18
ZC
3867 trace_colo_flush_ram_cache_end();
3868}
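/*
 * In short: colo_flush_ram_cache() synchronizes the dirty log, then walks
 * each ramblock's dirty bitmap with colo_bitmap_find_dirty() and copies
 * every run of dirty pages from block->colo_cache back into the SVM's
 * RAM (block->host), one memcpy per run.
 */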
3869
10da4a36
WY
3870/**
3871 * ram_load_precopy: load pages in precopy case
3872 *
3873 * Returns 0 for success or -errno in case of error
3874 *
3875 * Called in precopy mode by ram_load().
3876 * rcu_read_lock is taken prior to this being called.
3877 *
3878 * @f: QEMUFile to receive the data from
3879 */
3880static int ram_load_precopy(QEMUFile *f)
56e93d26 3881{
e65cec5e 3882 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 3883 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 3884 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
3885 if (!migrate_use_compression()) {
3886 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3887 }
a7180877 3888
10da4a36 3889 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3890 ram_addr_t addr, total_ram_bytes;
0393031a 3891 void *host = NULL, *host_bak = NULL;
56e93d26
JQ
3892 uint8_t ch;
3893
e65cec5e
YK
3894 /*
3895 * Yield periodically to let the main loop run, but an iteration of
3896 * the main loop is expensive, so only do it every so many iterations
3897 */
3898 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3899 aio_co_schedule(qemu_get_current_aio_context(),
3900 qemu_coroutine_self());
3901 qemu_coroutine_yield();
3902 }
3903 i++;
3904
56e93d26
JQ
3905 addr = qemu_get_be64(f);
3906 flags = addr & ~TARGET_PAGE_MASK;
3907 addr &= TARGET_PAGE_MASK;
3908
edc60127
JQ
3909 if (flags & invalid_flags) {
3910 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3911 error_report("Received an unexpected compressed page");
3912 }
3913
3914 ret = -EINVAL;
3915 break;
3916 }
3917
bb890ed5 3918 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3919 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
3920 RAMBlock *block = ram_block_from_stream(f, flags);
3921
0393031a 3922 host = host_from_ram_block_offset(block, addr);
13af18f2 3923 /*
0393031a
HZ
3924 * After entering the COLO stage, we should not load pages into the
3925 * SVM's memory directly; we put them into colo_cache first.
3926 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3927 * Previously, we copied all this memory in the COLO preparation
3928 * stage, which required stopping the VM and was time-consuming.
3929 * Here we optimize it by backing up every page during the migration
3930 * process while COLO is enabled. This slows the migration down
3931 * somewhat, but it clearly reduces the downtime of backing up all
3932 * of the SVM's memory in the COLO preparation stage.
13af18f2 3933 */
0393031a
HZ
3934 if (migration_incoming_colo_enabled()) {
3935 if (migration_incoming_in_colo_state()) {
3936 /* In COLO stage, put all pages into cache temporarily */
8af66371 3937 host = colo_cache_from_block_offset(block, addr, true);
0393031a
HZ
3938 } else {
3939 /*
3940 * In migration stage but before COLO stage,
3941 * Put all pages into both cache and SVM's memory.
3942 */
8af66371 3943 host_bak = colo_cache_from_block_offset(block, addr, false);
0393031a 3944 }
13af18f2 3945 }
a776aa15
DDAG
3946 if (!host) {
3947 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3948 ret = -EINVAL;
3949 break;
3950 }
13af18f2
ZC
3951 if (!migration_incoming_in_colo_state()) {
3952 ramblock_recv_bitmap_set(block, host);
3953 }
3954
1db9d8e5 3955 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
3956 }
3957
56e93d26
JQ
3958 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3959 case RAM_SAVE_FLAG_MEM_SIZE:
3960 /* Synchronize RAM block list */
3961 total_ram_bytes = addr;
3962 while (!ret && total_ram_bytes) {
3963 RAMBlock *block;
56e93d26
JQ
3964 char id[256];
3965 ram_addr_t length;
3966
3967 len = qemu_get_byte(f);
3968 qemu_get_buffer(f, (uint8_t *)id, len);
3969 id[len] = 0;
3970 length = qemu_get_be64(f);
3971
e3dd7493 3972 block = qemu_ram_block_by_name(id);
b895de50
CLG
3973 if (block && !qemu_ram_is_migratable(block)) {
3974 error_report("block %s should not be migrated!", id);
3975 ret = -EINVAL;
3976 } else if (block) {
e3dd7493
DDAG
3977 if (length != block->used_length) {
3978 Error *local_err = NULL;
56e93d26 3979
fa53a0e5 3980 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
3981 &local_err);
3982 if (local_err) {
3983 error_report_err(local_err);
56e93d26 3984 }
56e93d26 3985 }
ef08fb38 3986 /* For postcopy we need to check hugepage sizes match */
e846b746 3987 if (postcopy_advised && migrate_postcopy_ram() &&
ef08fb38
DDAG
3988 block->page_size != qemu_host_page_size) {
3989 uint64_t remote_page_size = qemu_get_be64(f);
3990 if (remote_page_size != block->page_size) {
3991 error_report("Mismatched RAM page size %s "
3992 "(local) %zd != %" PRId64,
3993 id, block->page_size,
3994 remote_page_size);
3995 ret = -EINVAL;
3996 }
3997 }
fbd162e6
YK
3998 if (migrate_ignore_shared()) {
3999 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4000 if (ramblock_is_ignored(block) &&
4001 block->mr->addr != addr) {
4002 error_report("Mismatched GPAs for block %s "
4003 "%" PRId64 "!= %" PRId64,
4004 id, (uint64_t)addr,
4005 (uint64_t)block->mr->addr);
4006 ret = -EINVAL;
4007 }
4008 }
e3dd7493
DDAG
4009 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4010 block->idstr);
4011 } else {
56e93d26
JQ
4012 error_report("Unknown ramblock \"%s\", cannot "
4013 "accept migration", id);
4014 ret = -EINVAL;
4015 }
4016
4017 total_ram_bytes -= length;
4018 }
4019 break;
a776aa15 4020
bb890ed5 4021 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4022 ch = qemu_get_byte(f);
4023 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4024 break;
a776aa15 4025
56e93d26 4026 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4027 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4028 break;
56e93d26 4029
a776aa15 4030 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4031 len = qemu_get_be32(f);
4032 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4033 error_report("Invalid compressed data length: %d", len);
4034 ret = -EINVAL;
4035 break;
4036 }
c1bc6626 4037 decompress_data_with_multi_threads(f, host, len);
56e93d26 4038 break;
a776aa15 4039
56e93d26 4040 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4041 if (load_xbzrle(f, addr, host) < 0) {
4042 error_report("Failed to decompress XBZRLE page at "
4043 RAM_ADDR_FMT, addr);
4044 ret = -EINVAL;
4045 break;
4046 }
4047 break;
4048 case RAM_SAVE_FLAG_EOS:
4049 /* normal exit */
6df264ac 4050 multifd_recv_sync_main();
56e93d26
JQ
4051 break;
4052 default:
4053 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4054 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26 4055 } else {
29fccade 4056 error_report("Unknown combination of migration flags: 0x%x",
56e93d26
JQ
4057 flags);
4058 ret = -EINVAL;
4059 }
4060 }
4061 if (!ret) {
4062 ret = qemu_file_get_error(f);
4063 }
0393031a
HZ
4064 if (!ret && host_bak) {
4065 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4066 }
56e93d26
JQ
4067 }
4068
ca1a6b70 4069 ret |= wait_for_decompress_done();
10da4a36
WY
4070 return ret;
4071}
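/*
 * For reference, the precopy stream record layout parsed above is:
 *
 *   be64  addr | flags             flags occupy the bits below TARGET_PAGE_BITS
 *   payload, depending on flags:
 *     RAM_SAVE_FLAG_MEM_SIZE        per block: u8 idlen, id bytes, be64 length
 *                                   (plus optional page-size / GPA fields)
 *     RAM_SAVE_FLAG_ZERO            u8 fill byte
 *     RAM_SAVE_FLAG_PAGE            TARGET_PAGE_SIZE raw bytes
 *     RAM_SAVE_FLAG_COMPRESS_PAGE   be32 len, then len compressed bytes
 *     RAM_SAVE_FLAG_XBZRLE          handled by load_xbzrle()
 *     RAM_SAVE_FLAG_HOOK            delegated to ram_control_load_hook()
 *     RAM_SAVE_FLAG_EOS             end of section
 */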
4072
4073static int ram_load(QEMUFile *f, void *opaque, int version_id)
4074{
4075 int ret = 0;
4076 static uint64_t seq_iter;
4077 /*
4078 * If system is running in postcopy mode, page inserts to host memory must
4079 * be atomic
4080 */
4081 bool postcopy_running = postcopy_is_running();
4082
4083 seq_iter++;
4084
4085 if (version_id != 4) {
4086 return -EINVAL;
4087 }
4088
4089 /*
4090 * This RCU critical section can be very long running.
4091 * When RCU reclaims in the code start to become numerous,
4092 * it will be necessary to reduce the granularity of this
4093 * critical section.
4094 */
89ac5a1d
DDAG
4095 WITH_RCU_READ_LOCK_GUARD() {
4096 if (postcopy_running) {
4097 ret = ram_load_postcopy(f);
4098 } else {
4099 ret = ram_load_precopy(f);
4100 }
10da4a36 4101 }
55c4446b 4102 trace_ram_load_complete(ret, seq_iter);
e6f4aa18 4103
56e93d26
JQ
4104 return ret;
4105}
4106
c6467627
VSO
4107static bool ram_has_postcopy(void *opaque)
4108{
469dd51b 4109 RAMBlock *rb;
fbd162e6 4110 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4111 if (ramblock_is_pmem(rb)) {
4112 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4113 "is not supported now!", rb->idstr, rb->host);
4114 return false;
4115 }
4116 }
4117
c6467627
VSO
4118 return migrate_postcopy_ram();
4119}
4120
edd090c7
PX
4121/* Sync all the dirty bitmap with destination VM. */
4122static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4123{
4124 RAMBlock *block;
4125 QEMUFile *file = s->to_dst_file;
4126 int ramblock_count = 0;
4127
4128 trace_ram_dirty_bitmap_sync_start();
4129
fbd162e6 4130 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4131 qemu_savevm_send_recv_bitmap(file, block->idstr);
4132 trace_ram_dirty_bitmap_request(block->idstr);
4133 ramblock_count++;
4134 }
4135
4136 trace_ram_dirty_bitmap_sync_wait();
4137
4138 /* Wait until all the ramblocks' dirty bitmaps are synced */
4139 while (ramblock_count--) {
4140 qemu_sem_wait(&s->rp_state.rp_sem);
4141 }
4142
4143 trace_ram_dirty_bitmap_sync_complete();
4144
4145 return 0;
4146}
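/*
 * The sync above is a simple request/response handshake: one
 * qemu_savevm_send_recv_bitmap() request is sent per migratable ramblock,
 * and the thread then waits on rp_state.rp_sem once per block; each
 * completed ram_dirty_bitmap_reload() posts the semaphore via
 * ram_dirty_bitmap_reload_notify() below.
 */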
4147
4148static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4149{
4150 qemu_sem_post(&s->rp_state.rp_sem);
4151}
4152
a335debb
PX
4153/*
4154 * Read the received bitmap, revert it as the initial dirty bitmap.
4155 * This is only used when the postcopy migration is paused but wants
4156 * to resume from a middle point.
4157 */
4158int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4159{
4160 int ret = -EINVAL;
43044ac0 4161 /* from_dst_file is always valid because we're within rp_thread */
a335debb
PX
4162 QEMUFile *file = s->rp_state.from_dst_file;
4163 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4164 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4165 uint64_t size, end_mark;
4166
4167 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4168
4169 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4170 error_report("%s: incorrect state %s", __func__,
4171 MigrationStatus_str(s->state));
4172 return -EINVAL;
4173 }
4174
4175 /*
4176 * Note: see comments in ramblock_recv_bitmap_send() on why we
3a4452d8 4177 * need the endianness conversion, and the paddings.
a335debb
PX
4178 */
4179 local_size = ROUND_UP(local_size, 8);
4180
4181 /* Add paddings */
4182 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4183
4184 size = qemu_get_be64(file);
4185
4186 /* The size of the bitmap should match with our ramblock */
4187 if (size != local_size) {
4188 error_report("%s: ramblock '%s' bitmap size mismatch "
4189 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4190 block->idstr, size, local_size);
4191 ret = -EINVAL;
4192 goto out;
4193 }
4194
4195 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4196 end_mark = qemu_get_be64(file);
4197
4198 ret = qemu_file_get_error(file);
4199 if (ret || size != local_size) {
4200 error_report("%s: read bitmap failed for ramblock '%s': %d"
4201 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4202 __func__, block->idstr, ret, local_size, size);
4203 ret = -EIO;
4204 goto out;
4205 }
4206
4207 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
af3bbbe9 4208 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
a335debb
PX
4209 __func__, block->idstr, end_mark);
4210 ret = -EINVAL;
4211 goto out;
4212 }
4213
4214 /*
3a4452d8 4215 * Endianness conversion. We are during postcopy (though paused).
a335debb
PX
4216 * The dirty bitmap won't change. We can directly modify it.
4217 */
4218 bitmap_from_le(block->bmap, le_bitmap, nbits);
4219
4220 /*
4221 * What we received is "received bitmap". Revert it as the initial
4222 * dirty bitmap for this ramblock.
4223 */
4224 bitmap_complement(block->bmap, block->bmap, nbits);
4225
be39b4cd
DH
4226 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4227 ramblock_dirty_bitmap_clear_discarded_pages(block);
4228
4229 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
a335debb
PX
4230 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4231
edd090c7
PX
4232 /*
4233 * We have successfully synced the bitmap for the current ramblock. If
4234 * this is the last one to sync, we need to notify the main send thread.
4235 */
4236 ram_dirty_bitmap_reload_notify(s);
4237
a335debb
PX
4238 ret = 0;
4239out:
bf269906 4240 g_free(le_bitmap);
a335debb
PX
4241 return ret;
4242}
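/*
 * Wire format consumed by ram_dirty_bitmap_reload(), per ramblock:
 *
 *   be64   size      bitmap size in bytes; must equal the local
 *                    (used_length >> TARGET_PAGE_BITS) bits, rounded up
 *                    to a multiple of 8 bytes
 *   bytes  bitmap    little-endian bitmap, 'size' bytes
 *   be64   end_mark  must be RAMBLOCK_RECV_BITMAP_ENDING
 */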
4243
edd090c7
PX
4244static int ram_resume_prepare(MigrationState *s, void *opaque)
4245{
4246 RAMState *rs = *(RAMState **)opaque;
08614f34 4247 int ret;
edd090c7 4248
08614f34
PX
4249 ret = ram_dirty_bitmap_sync_all(s, rs);
4250 if (ret) {
4251 return ret;
4252 }
4253
4254 ram_state_resume_prepare(rs, s->to_dst_file);
4255
4256 return 0;
edd090c7
PX
4257}
4258
56e93d26 4259static SaveVMHandlers savevm_ram_handlers = {
9907e842 4260 .save_setup = ram_save_setup,
56e93d26 4261 .save_live_iterate = ram_save_iterate,
763c906b 4262 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4263 .save_live_complete_precopy = ram_save_complete,
c6467627 4264 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4265 .save_live_pending = ram_save_pending,
4266 .load_state = ram_load,
f265e0e4
JQ
4267 .save_cleanup = ram_save_cleanup,
4268 .load_setup = ram_load_setup,
4269 .load_cleanup = ram_load_cleanup,
edd090c7 4270 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4271};
4272
c7c0e724
DH
4273static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4274 size_t old_size, size_t new_size)
4275{
cc61c703 4276 PostcopyState ps = postcopy_state_get();
c7c0e724
DH
4277 ram_addr_t offset;
4278 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4279 Error *err = NULL;
4280
4281 if (ramblock_is_ignored(rb)) {
4282 return;
4283 }
4284
4285 if (!migration_is_idle()) {
4286 /*
4287 * Precopy code on the source cannot deal with the size of RAM blocks
4288 * changing at random points in time - especially after sending the
4289 * RAM block sizes in the migration stream, they must no longer change.
4290 * Abort and indicate a proper reason.
4291 */
4292 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
458fecca 4293 migration_cancel(err);
c7c0e724 4294 error_free(err);
c7c0e724 4295 }
cc61c703
DH
4296
4297 switch (ps) {
4298 case POSTCOPY_INCOMING_ADVISE:
4299 /*
4300 * Update what ram_postcopy_incoming_init()->init_range() does at the
4301 * time postcopy was advised. Syncing RAM blocks with the source will
4302 * result in RAM resizes.
4303 */
4304 if (old_size < new_size) {
4305 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4306 error_report("RAM block '%s' discard of resized RAM failed",
4307 rb->idstr);
4308 }
4309 }
898ba906 4310 rb->postcopy_length = new_size;
cc61c703
DH
4311 break;
4312 case POSTCOPY_INCOMING_NONE:
4313 case POSTCOPY_INCOMING_RUNNING:
4314 case POSTCOPY_INCOMING_END:
4315 /*
4316 * Once our guest is running, postcopy does no longer care about
4317 * resizes. When growing, the new memory was not available on the
4318 * source, no handler needed.
4319 */
4320 break;
4321 default:
4322 error_report("RAM block '%s' resized during postcopy state: %d",
4323 rb->idstr, ps);
4324 exit(-1);
4325 }
c7c0e724
DH
4326}
4327
4328static RAMBlockNotifier ram_mig_ram_notifier = {
4329 .ram_block_resized = ram_mig_ram_block_resized,
4330};
4331
56e93d26
JQ
4332void ram_mig_init(void)
4333{
4334 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4335 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
c7c0e724 4336 ram_block_notifier_add(&ram_mig_ram_notifier);
56e93d26 4337}