]> Git Repo - qemu.git/blame - migration/ram.c
migration/postcopy: enable random order target page arrival
[qemu.git] / migration / ram.c
CommitLineData
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <[email protected]>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
33c11879 30#include "cpu.h"
56e93d26 31#include <zlib.h>
f348b6d1 32#include "qemu/cutils.h"
56e93d26
JQ
33#include "qemu/bitops.h"
34#include "qemu/bitmap.h"
7205c9ec 35#include "qemu/main-loop.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
71bb07db 39#include "socket.h"
f2a8f0a6 40#include "migration/register.h"
7b1e1a22 41#include "migration/misc.h"
08a0aee1 42#include "qemu-file.h"
be07b0ac 43#include "postcopy-ram.h"
53d37d36 44#include "page_cache.h"
56e93d26 45#include "qemu/error-report.h"
e688df6b 46#include "qapi/error.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
af8b7d2b
JQ
55#include "sysemu/sysemu.h"
56#include "qemu/uuid.h"
edd090c7 57#include "savevm.h"
b9ee2f7d 58#include "qemu/iov.h"
56e93d26 59
56e93d26
JQ
60/***********************************************************/
61/* ram save/restore */
62
bb890ed5
JQ
63/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
64 * worked for pages that where filled with the same char. We switched
65 * it to only search for the zero value. And to avoid confusion with
66 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
67 */
68
56e93d26 69#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 70#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
71#define RAM_SAVE_FLAG_MEM_SIZE 0x04
72#define RAM_SAVE_FLAG_PAGE 0x08
73#define RAM_SAVE_FLAG_EOS 0x10
74#define RAM_SAVE_FLAG_CONTINUE 0x20
75#define RAM_SAVE_FLAG_XBZRLE 0x40
76/* 0x80 is reserved in migration.h start with 0x100 next */
77#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
78
56e93d26
JQ
79static inline bool is_zero_range(uint8_t *p, uint64_t size)
80{
a1febc49 81 return buffer_is_zero(p, size);
56e93d26
JQ
82}
83
9360447d
JQ
84XBZRLECacheStats xbzrle_counters;
85
56e93d26
JQ
86/* struct contains XBZRLE cache and a static page
87 used by the compression */
88static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
c00e0928
JQ
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
f265e0e4
JQ
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
56e93d26
JQ
100} XBZRLE;
101
56e93d26
JQ
102static void XBZRLE_cache_lock(void)
103{
104 if (migrate_use_xbzrle())
105 qemu_mutex_lock(&XBZRLE.lock);
106}
107
108static void XBZRLE_cache_unlock(void)
109{
110 if (migrate_use_xbzrle())
111 qemu_mutex_unlock(&XBZRLE.lock);
112}
113
3d0684b2
JQ
114/**
115 * xbzrle_cache_resize: resize the xbzrle cache
116 *
117 * This function is called from qmp_migrate_set_cache_size in main
118 * thread, possibly while a migration is in progress. A running
119 * migration may be using the cache and might finish during this call,
120 * hence changes to the cache are protected by XBZRLE.lock().
121 *
c9dede2d 122 * Returns 0 for success or -1 for error
3d0684b2
JQ
123 *
124 * @new_size: new cache size
8acabf69 125 * @errp: set *errp if the check failed, with reason
56e93d26 126 */
c9dede2d 127int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
128{
129 PageCache *new_cache;
c9dede2d 130 int64_t ret = 0;
56e93d26 131
8acabf69
JQ
132 /* Check for truncation */
133 if (new_size != (size_t)new_size) {
134 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
135 "exceeding address space");
136 return -1;
137 }
138
2a313e5c
JQ
139 if (new_size == migrate_xbzrle_cache_size()) {
140 /* nothing to do */
c9dede2d 141 return 0;
2a313e5c
JQ
142 }
143
56e93d26
JQ
144 XBZRLE_cache_lock();
145
146 if (XBZRLE.cache != NULL) {
80f8dfde 147 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 148 if (!new_cache) {
56e93d26
JQ
149 ret = -1;
150 goto out;
151 }
152
153 cache_fini(XBZRLE.cache);
154 XBZRLE.cache = new_cache;
155 }
56e93d26
JQ
156out:
157 XBZRLE_cache_unlock();
158 return ret;
159}
160
fbd162e6
YK
161static bool ramblock_is_ignored(RAMBlock *block)
162{
163 return !qemu_ram_is_migratable(block) ||
164 (migrate_ignore_shared() && qemu_ram_is_shared(block));
165}
166
b895de50 167/* Should be holding either ram_list.mutex, or the RCU lock. */
fbd162e6
YK
168#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
169 INTERNAL_RAMBLOCK_FOREACH(block) \
170 if (ramblock_is_ignored(block)) {} else
171
b895de50 172#define RAMBLOCK_FOREACH_MIGRATABLE(block) \
343f632c 173 INTERNAL_RAMBLOCK_FOREACH(block) \
b895de50
CLG
174 if (!qemu_ram_is_migratable(block)) {} else
175
343f632c
DDAG
176#undef RAMBLOCK_FOREACH
177
fbd162e6
YK
178int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
179{
180 RAMBlock *block;
181 int ret = 0;
182
89ac5a1d
DDAG
183 RCU_READ_LOCK_GUARD();
184
fbd162e6
YK
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
fbd162e6
YK
191 return ret;
192}
193
f9494614
AP
194static void ramblock_recv_map_init(void)
195{
196 RAMBlock *rb;
197
fbd162e6 198 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
199 assert(!rb->receivedmap);
200 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
201 }
202}
203
204int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
205{
206 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
207 rb->receivedmap);
208}
209
1cba9f6e
DDAG
210bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
211{
212 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
213}
214
f9494614
AP
215void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
216{
217 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
218}
219
220void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
221 size_t nr)
222{
223 bitmap_set_atomic(rb->receivedmap,
224 ramblock_recv_bitmap_offset(host_addr, rb),
225 nr);
226}
227
a335debb
PX
228#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
229
230/*
231 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
232 *
233 * Returns >0 if success with sent bytes, or <0 if error.
234 */
235int64_t ramblock_recv_bitmap_send(QEMUFile *file,
236 const char *block_name)
237{
238 RAMBlock *block = qemu_ram_block_by_name(block_name);
239 unsigned long *le_bitmap, nbits;
240 uint64_t size;
241
242 if (!block) {
243 error_report("%s: invalid block name: %s", __func__, block_name);
244 return -1;
245 }
246
247 nbits = block->used_length >> TARGET_PAGE_BITS;
248
249 /*
250 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
251 * machines we may need 4 more bytes for padding (see below
252 * comment). So extend it a bit before hand.
253 */
254 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
255
256 /*
257 * Always use little endian when sending the bitmap. This is
258 * required that when source and destination VMs are not using the
259 * same endianess. (Note: big endian won't work.)
260 */
261 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
262
263 /* Size of the bitmap, in bytes */
a725ef9f 264 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
265
266 /*
267 * size is always aligned to 8 bytes for 64bit machines, but it
268 * may not be true for 32bit machines. We need this padding to
269 * make sure the migration can survive even between 32bit and
270 * 64bit machines.
271 */
272 size = ROUND_UP(size, 8);
273
274 qemu_put_be64(file, size);
275 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
276 /*
277 * Mark as an end, in case the middle part is screwed up due to
278 * some "misterious" reason.
279 */
280 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
281 qemu_fflush(file);
282
bf269906 283 g_free(le_bitmap);
a335debb
PX
284
285 if (qemu_file_get_error(file)) {
286 return qemu_file_get_error(file);
287 }
288
289 return size + sizeof(size);
290}
291
ec481c6c
JQ
292/*
293 * An outstanding page request, on the source, having been received
294 * and queued
295 */
296struct RAMSrcPageRequest {
297 RAMBlock *rb;
298 hwaddr offset;
299 hwaddr len;
300
301 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
302};
303
6f37bb8b
JQ
304/* State of RAM for migration */
305struct RAMState {
204b88b8
JQ
306 /* QEMUFile used for this migration */
307 QEMUFile *f;
6f37bb8b
JQ
308 /* Last block that we have visited searching for dirty pages */
309 RAMBlock *last_seen_block;
310 /* Last block from where we have sent data */
311 RAMBlock *last_sent_block;
269ace29
JQ
312 /* Last dirty target page we have sent */
313 ram_addr_t last_page;
6f37bb8b
JQ
314 /* last ram version we have seen */
315 uint32_t last_version;
316 /* We are in the first round */
317 bool ram_bulk_stage;
6eeb63f7
WW
318 /* The free page optimization is enabled */
319 bool fpo_enabled;
8d820d6f
JQ
320 /* How many times we have dirty too many pages */
321 int dirty_rate_high_cnt;
f664da80
JQ
322 /* these variables are used for bitmap sync */
323 /* last time we did a full bitmap_sync */
324 int64_t time_last_bitmap_sync;
eac74159 325 /* bytes transferred at start_time */
c4bdf0cf 326 uint64_t bytes_xfer_prev;
a66cd90c 327 /* number of dirty pages since start_time */
68908ed6 328 uint64_t num_dirty_pages_period;
b5833fde
JQ
329 /* xbzrle misses since the beginning of the period */
330 uint64_t xbzrle_cache_miss_prev;
76e03000
XG
331
332 /* compression statistics since the beginning of the period */
333 /* amount of count that no free thread to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* amount bytes after compression */
336 uint64_t compressed_size_prev;
337 /* amount of compressed pages */
338 uint64_t compress_pages_prev;
339
be8b02ed
XG
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
9360447d 344 /* number of dirty bits in the bitmap */
2dfaf12e 345 uint64_t migration_dirty_pages;
386a907b 346 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 347 QemuMutex bitmap_mutex;
68a098f3
JQ
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
ec481c6c
JQ
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
b58deb34 352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
353};
354typedef struct RAMState RAMState;
355
53518d94 356static RAMState *ram_state;
6f37bb8b 357
bd227060
WW
358static NotifierWithReturnList precopy_notifier_list;
359
360void precopy_infrastructure_init(void)
361{
362 notifier_with_return_list_init(&precopy_notifier_list);
363}
364
365void precopy_add_notifier(NotifierWithReturn *n)
366{
367 notifier_with_return_list_add(&precopy_notifier_list, n);
368}
369
370void precopy_remove_notifier(NotifierWithReturn *n)
371{
372 notifier_with_return_remove(n);
373}
374
375int precopy_notify(PrecopyNotifyReason reason, Error **errp)
376{
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
380
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382}
383
6eeb63f7
WW
384void precopy_enable_free_page_optimization(void)
385{
386 if (!ram_state) {
387 return;
388 }
389
390 ram_state->fpo_enabled = true;
391}
392
9edabd4d 393uint64_t ram_bytes_remaining(void)
2f4fde93 394{
bae416e5
DDAG
395 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
396 0;
2f4fde93
JQ
397}
398
9360447d 399MigrationStats ram_counters;
96506894 400
b8fb8cb7
DDAG
401/* used by the search for pages to send */
402struct PageSearchStatus {
403 /* Current block being searched */
404 RAMBlock *block;
a935e30f
JQ
405 /* Current page to search from */
406 unsigned long page;
b8fb8cb7
DDAG
407 /* Set once we wrap around */
408 bool complete_round;
409};
410typedef struct PageSearchStatus PageSearchStatus;
411
76e03000
XG
412CompressionStats compression_counters;
413
56e93d26 414struct CompressParam {
56e93d26 415 bool done;
90e56fb4 416 bool quit;
5e5fdcff 417 bool zero_page;
56e93d26
JQ
418 QEMUFile *file;
419 QemuMutex mutex;
420 QemuCond cond;
421 RAMBlock *block;
422 ram_addr_t offset;
34ab9e97
XG
423
424 /* internally used fields */
dcaf446e 425 z_stream stream;
34ab9e97 426 uint8_t *originbuf;
56e93d26
JQ
427};
428typedef struct CompressParam CompressParam;
429
430struct DecompressParam {
73a8912b 431 bool done;
90e56fb4 432 bool quit;
56e93d26
JQ
433 QemuMutex mutex;
434 QemuCond cond;
435 void *des;
d341d9f3 436 uint8_t *compbuf;
56e93d26 437 int len;
797ca154 438 z_stream stream;
56e93d26
JQ
439};
440typedef struct DecompressParam DecompressParam;
441
442static CompressParam *comp_param;
443static QemuThread *compress_threads;
444/* comp_done_cond is used to wake up the migration thread when
445 * one of the compression threads has finished the compression.
446 * comp_done_lock is used to co-work with comp_done_cond.
447 */
0d9f9a5c
LL
448static QemuMutex comp_done_lock;
449static QemuCond comp_done_cond;
56e93d26
JQ
450/* The empty QEMUFileOps will be used by file in CompressParam */
451static const QEMUFileOps empty_ops = { };
452
34ab9e97 453static QEMUFile *decomp_file;
56e93d26
JQ
454static DecompressParam *decomp_param;
455static QemuThread *decompress_threads;
73a8912b
LL
456static QemuMutex decomp_done_lock;
457static QemuCond decomp_done_cond;
56e93d26 458
5e5fdcff 459static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 460 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
461
462static void *do_data_compress(void *opaque)
463{
464 CompressParam *param = opaque;
a7a9a88f
LL
465 RAMBlock *block;
466 ram_addr_t offset;
5e5fdcff 467 bool zero_page;
56e93d26 468
a7a9a88f 469 qemu_mutex_lock(&param->mutex);
90e56fb4 470 while (!param->quit) {
a7a9a88f
LL
471 if (param->block) {
472 block = param->block;
473 offset = param->offset;
474 param->block = NULL;
475 qemu_mutex_unlock(&param->mutex);
476
5e5fdcff
XG
477 zero_page = do_compress_ram_page(param->file, &param->stream,
478 block, offset, param->originbuf);
a7a9a88f 479
0d9f9a5c 480 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 481 param->done = true;
5e5fdcff 482 param->zero_page = zero_page;
0d9f9a5c
LL
483 qemu_cond_signal(&comp_done_cond);
484 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
485
486 qemu_mutex_lock(&param->mutex);
487 } else {
56e93d26
JQ
488 qemu_cond_wait(&param->cond, &param->mutex);
489 }
56e93d26 490 }
a7a9a88f 491 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
492
493 return NULL;
494}
495
f0afa331 496static void compress_threads_save_cleanup(void)
56e93d26
JQ
497{
498 int i, thread_count;
499
05306935 500 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
501 return;
502 }
05306935 503
56e93d26
JQ
504 thread_count = migrate_compress_threads();
505 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
506 /*
507 * we use it as a indicator which shows if the thread is
508 * properly init'd or not
509 */
510 if (!comp_param[i].file) {
511 break;
512 }
05306935
FL
513
514 qemu_mutex_lock(&comp_param[i].mutex);
515 comp_param[i].quit = true;
516 qemu_cond_signal(&comp_param[i].cond);
517 qemu_mutex_unlock(&comp_param[i].mutex);
518
56e93d26 519 qemu_thread_join(compress_threads + i);
56e93d26
JQ
520 qemu_mutex_destroy(&comp_param[i].mutex);
521 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 522 deflateEnd(&comp_param[i].stream);
34ab9e97 523 g_free(comp_param[i].originbuf);
dcaf446e
XG
524 qemu_fclose(comp_param[i].file);
525 comp_param[i].file = NULL;
56e93d26 526 }
0d9f9a5c
LL
527 qemu_mutex_destroy(&comp_done_lock);
528 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
529 g_free(compress_threads);
530 g_free(comp_param);
56e93d26
JQ
531 compress_threads = NULL;
532 comp_param = NULL;
56e93d26
JQ
533}
534
dcaf446e 535static int compress_threads_save_setup(void)
56e93d26
JQ
536{
537 int i, thread_count;
538
539 if (!migrate_use_compression()) {
dcaf446e 540 return 0;
56e93d26 541 }
56e93d26
JQ
542 thread_count = migrate_compress_threads();
543 compress_threads = g_new0(QemuThread, thread_count);
544 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
545 qemu_cond_init(&comp_done_cond);
546 qemu_mutex_init(&comp_done_lock);
56e93d26 547 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
548 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
549 if (!comp_param[i].originbuf) {
550 goto exit;
551 }
552
dcaf446e
XG
553 if (deflateInit(&comp_param[i].stream,
554 migrate_compress_level()) != Z_OK) {
34ab9e97 555 g_free(comp_param[i].originbuf);
dcaf446e
XG
556 goto exit;
557 }
558
e110aa91
C
559 /* comp_param[i].file is just used as a dummy buffer to save data,
560 * set its ops to empty.
56e93d26
JQ
561 */
562 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
563 comp_param[i].done = true;
90e56fb4 564 comp_param[i].quit = false;
56e93d26
JQ
565 qemu_mutex_init(&comp_param[i].mutex);
566 qemu_cond_init(&comp_param[i].cond);
567 qemu_thread_create(compress_threads + i, "compress",
568 do_data_compress, comp_param + i,
569 QEMU_THREAD_JOINABLE);
570 }
dcaf446e
XG
571 return 0;
572
573exit:
574 compress_threads_save_cleanup();
575 return -1;
56e93d26
JQ
576}
577
f986c3d2
JQ
578/* Multiple fd's */
579
af8b7d2b
JQ
580#define MULTIFD_MAGIC 0x11223344U
581#define MULTIFD_VERSION 1
582
6df264ac
JQ
583#define MULTIFD_FLAG_SYNC (1 << 0)
584
efd1a1d6 585/* This value needs to be a multiple of qemu_target_page_size() */
4b0c7264 586#define MULTIFD_PACKET_SIZE (512 * 1024)
efd1a1d6 587
af8b7d2b
JQ
588typedef struct {
589 uint32_t magic;
590 uint32_t version;
591 unsigned char uuid[16]; /* QemuUUID */
592 uint8_t id;
5fbd8b4b
JQ
593 uint8_t unused1[7]; /* Reserved for future use */
594 uint64_t unused2[4]; /* Reserved for future use */
af8b7d2b
JQ
595} __attribute__((packed)) MultiFDInit_t;
596
2a26c979
JQ
597typedef struct {
598 uint32_t magic;
599 uint32_t version;
600 uint32_t flags;
6f862692
JQ
601 /* maximum number of allocated pages */
602 uint32_t pages_alloc;
603 uint32_t pages_used;
2a34ee59
JQ
604 /* size of the next packet that contains pages */
605 uint32_t next_packet_size;
2a26c979 606 uint64_t packet_num;
5fbd8b4b 607 uint64_t unused[4]; /* Reserved for future use */
2a26c979
JQ
608 char ramblock[256];
609 uint64_t offset[];
610} __attribute__((packed)) MultiFDPacket_t;
611
34c55a94
JQ
612typedef struct {
613 /* number of used pages */
614 uint32_t used;
615 /* number of allocated pages */
616 uint32_t allocated;
617 /* global number of generated multifd packets */
618 uint64_t packet_num;
619 /* offset of each page */
620 ram_addr_t *offset;
621 /* pointer to each page */
622 struct iovec *iov;
623 RAMBlock *block;
624} MultiFDPages_t;
625
8c4598f2
JQ
626typedef struct {
627 /* this fields are not changed once the thread is created */
628 /* channel number */
f986c3d2 629 uint8_t id;
8c4598f2 630 /* channel thread name */
f986c3d2 631 char *name;
8c4598f2 632 /* channel thread id */
f986c3d2 633 QemuThread thread;
8c4598f2 634 /* communication channel */
60df2d4a 635 QIOChannel *c;
8c4598f2 636 /* sem where to wait for more work */
f986c3d2 637 QemuSemaphore sem;
8c4598f2 638 /* this mutex protects the following parameters */
f986c3d2 639 QemuMutex mutex;
8c4598f2 640 /* is this channel thread running */
66770707 641 bool running;
8c4598f2 642 /* should this thread finish */
f986c3d2 643 bool quit;
0beb5ed3
JQ
644 /* thread has work to do */
645 int pending_job;
34c55a94
JQ
646 /* array of pages to sent */
647 MultiFDPages_t *pages;
2a26c979
JQ
648 /* packet allocated len */
649 uint32_t packet_len;
650 /* pointer to the packet */
651 MultiFDPacket_t *packet;
652 /* multifd flags for each packet */
653 uint32_t flags;
2a34ee59
JQ
654 /* size of the next packet that contains pages */
655 uint32_t next_packet_size;
2a26c979
JQ
656 /* global number of generated multifd packets */
657 uint64_t packet_num;
408ea6ae
JQ
658 /* thread local variables */
659 /* packets sent through this channel */
660 uint64_t num_packets;
661 /* pages sent through this channel */
662 uint64_t num_pages;
18cdcea3
JQ
663 /* syncs main thread and channels */
664 QemuSemaphore sem_sync;
8c4598f2
JQ
665} MultiFDSendParams;
666
667typedef struct {
668 /* this fields are not changed once the thread is created */
669 /* channel number */
670 uint8_t id;
671 /* channel thread name */
672 char *name;
673 /* channel thread id */
674 QemuThread thread;
675 /* communication channel */
676 QIOChannel *c;
8c4598f2
JQ
677 /* this mutex protects the following parameters */
678 QemuMutex mutex;
679 /* is this channel thread running */
680 bool running;
3c3ca25d
JQ
681 /* should this thread finish */
682 bool quit;
34c55a94
JQ
683 /* array of pages to receive */
684 MultiFDPages_t *pages;
2a26c979
JQ
685 /* packet allocated len */
686 uint32_t packet_len;
687 /* pointer to the packet */
688 MultiFDPacket_t *packet;
689 /* multifd flags for each packet */
690 uint32_t flags;
691 /* global number of generated multifd packets */
692 uint64_t packet_num;
408ea6ae 693 /* thread local variables */
2a34ee59
JQ
694 /* size of the next packet that contains pages */
695 uint32_t next_packet_size;
408ea6ae
JQ
696 /* packets sent through this channel */
697 uint64_t num_packets;
698 /* pages sent through this channel */
699 uint64_t num_pages;
6df264ac
JQ
700 /* syncs main thread and channels */
701 QemuSemaphore sem_sync;
8c4598f2 702} MultiFDRecvParams;
f986c3d2 703
af8b7d2b
JQ
704static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
705{
d069bcca 706 MultiFDInit_t msg = {};
af8b7d2b
JQ
707 int ret;
708
709 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
710 msg.version = cpu_to_be32(MULTIFD_VERSION);
711 msg.id = p->id;
712 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
713
714 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
715 if (ret != 0) {
716 return -1;
717 }
718 return 0;
719}
720
721static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
722{
723 MultiFDInit_t msg;
724 int ret;
725
726 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
727 if (ret != 0) {
728 return -1;
729 }
730
341ba0df
PM
731 msg.magic = be32_to_cpu(msg.magic);
732 msg.version = be32_to_cpu(msg.version);
af8b7d2b
JQ
733
734 if (msg.magic != MULTIFD_MAGIC) {
735 error_setg(errp, "multifd: received packet magic %x "
736 "expected %x", msg.magic, MULTIFD_MAGIC);
737 return -1;
738 }
739
740 if (msg.version != MULTIFD_VERSION) {
741 error_setg(errp, "multifd: received packet version %d "
742 "expected %d", msg.version, MULTIFD_VERSION);
743 return -1;
744 }
745
746 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
747 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
748 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
749
750 error_setg(errp, "multifd: received uuid '%s' and expected "
751 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
752 g_free(uuid);
753 g_free(msg_uuid);
754 return -1;
755 }
756
757 if (msg.id > migrate_multifd_channels()) {
758 error_setg(errp, "multifd: received channel version %d "
759 "expected %d", msg.version, MULTIFD_VERSION);
760 return -1;
761 }
762
763 return msg.id;
764}
765
34c55a94
JQ
766static MultiFDPages_t *multifd_pages_init(size_t size)
767{
768 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
769
770 pages->allocated = size;
771 pages->iov = g_new0(struct iovec, size);
772 pages->offset = g_new0(ram_addr_t, size);
773
774 return pages;
775}
776
777static void multifd_pages_clear(MultiFDPages_t *pages)
778{
779 pages->used = 0;
780 pages->allocated = 0;
781 pages->packet_num = 0;
782 pages->block = NULL;
783 g_free(pages->iov);
784 pages->iov = NULL;
785 g_free(pages->offset);
786 pages->offset = NULL;
787 g_free(pages);
788}
789
2a26c979
JQ
790static void multifd_send_fill_packet(MultiFDSendParams *p)
791{
792 MultiFDPacket_t *packet = p->packet;
793 int i;
794
2a26c979 795 packet->flags = cpu_to_be32(p->flags);
f2148c4c 796 packet->pages_alloc = cpu_to_be32(p->pages->allocated);
6f862692 797 packet->pages_used = cpu_to_be32(p->pages->used);
2a34ee59 798 packet->next_packet_size = cpu_to_be32(p->next_packet_size);
2a26c979
JQ
799 packet->packet_num = cpu_to_be64(p->packet_num);
800
801 if (p->pages->block) {
802 strncpy(packet->ramblock, p->pages->block->idstr, 256);
803 }
804
805 for (i = 0; i < p->pages->used; i++) {
806 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
807 }
808}
809
810static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
811{
812 MultiFDPacket_t *packet = p->packet;
7ed379b2 813 uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
2a26c979
JQ
814 RAMBlock *block;
815 int i;
816
341ba0df 817 packet->magic = be32_to_cpu(packet->magic);
2a26c979
JQ
818 if (packet->magic != MULTIFD_MAGIC) {
819 error_setg(errp, "multifd: received packet "
820 "magic %x and expected magic %x",
821 packet->magic, MULTIFD_MAGIC);
822 return -1;
823 }
824
341ba0df 825 packet->version = be32_to_cpu(packet->version);
2a26c979
JQ
826 if (packet->version != MULTIFD_VERSION) {
827 error_setg(errp, "multifd: received packet "
828 "version %d and expected version %d",
829 packet->version, MULTIFD_VERSION);
830 return -1;
831 }
832
833 p->flags = be32_to_cpu(packet->flags);
834
6f862692 835 packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
7ed379b2 836 /*
d884e77b 837 * If we received a packet that is 100 times bigger than expected
7ed379b2
JQ
838 * just stop migration. It is a magic number.
839 */
840 if (packet->pages_alloc > pages_max * 100) {
2a26c979 841 error_setg(errp, "multifd: received packet "
7ed379b2
JQ
842 "with size %d and expected a maximum size of %d",
843 packet->pages_alloc, pages_max * 100) ;
2a26c979
JQ
844 return -1;
845 }
7ed379b2
JQ
846 /*
847 * We received a packet that is bigger than expected but inside
848 * reasonable limits (see previous comment). Just reallocate.
849 */
850 if (packet->pages_alloc > p->pages->allocated) {
851 multifd_pages_clear(p->pages);
f151f8ac 852 p->pages = multifd_pages_init(packet->pages_alloc);
7ed379b2 853 }
2a26c979 854
6f862692
JQ
855 p->pages->used = be32_to_cpu(packet->pages_used);
856 if (p->pages->used > packet->pages_alloc) {
2a26c979 857 error_setg(errp, "multifd: received packet "
6f862692
JQ
858 "with %d pages and expected maximum pages are %d",
859 p->pages->used, packet->pages_alloc) ;
2a26c979
JQ
860 return -1;
861 }
862
2a34ee59 863 p->next_packet_size = be32_to_cpu(packet->next_packet_size);
2a26c979
JQ
864 p->packet_num = be64_to_cpu(packet->packet_num);
865
e4f1bea2
MAL
866 if (p->pages->used == 0) {
867 return 0;
868 }
869
870 /* make sure that ramblock is 0 terminated */
871 packet->ramblock[255] = 0;
872 block = qemu_ram_block_by_name(packet->ramblock);
873 if (!block) {
874 error_setg(errp, "multifd: unknown ram block %s",
875 packet->ramblock);
876 return -1;
2a26c979
JQ
877 }
878
879 for (i = 0; i < p->pages->used; i++) {
880 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
881
882 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
883 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
884 " (max " RAM_ADDR_FMT ")",
885 offset, block->max_length);
886 return -1;
887 }
888 p->pages->iov[i].iov_base = block->host + offset;
889 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
890 }
891
892 return 0;
893}
894
f986c3d2
JQ
895struct {
896 MultiFDSendParams *params;
34c55a94
JQ
897 /* array of pages to sent */
898 MultiFDPages_t *pages;
6df264ac
JQ
899 /* global number of generated multifd packets */
900 uint64_t packet_num;
b9ee2f7d
JQ
901 /* send channels ready */
902 QemuSemaphore channels_ready;
4d65a621
JQ
903 /*
904 * Have we already run terminate threads. There is a race when it
905 * happens that we got one error while we are exiting.
906 * We will use atomic operations. Only valid values are 0 and 1.
907 */
908 int exiting;
f986c3d2
JQ
909} *multifd_send_state;
910
b9ee2f7d
JQ
911/*
912 * How we use multifd_send_state->pages and channel->pages?
913 *
914 * We create a pages for each channel, and a main one. Each time that
915 * we need to send a batch of pages we interchange the ones between
916 * multifd_send_state and the channel that is sending it. There are
917 * two reasons for that:
918 * - to not have to do so many mallocs during migration
919 * - to make easier to know what to free at the end of migration
920 *
921 * This way we always know who is the owner of each "pages" struct,
a5f7b1a6 922 * and we don't need any locking. It belongs to the migration thread
b9ee2f7d
JQ
923 * or to the channel thread. Switching is safe because the migration
924 * thread is using the channel mutex when changing it, and the channel
925 * have to had finish with its own, otherwise pending_job can't be
926 * false.
927 */
928
1b81c974 929static int multifd_send_pages(RAMState *rs)
b9ee2f7d
JQ
930{
931 int i;
932 static int next_channel;
933 MultiFDSendParams *p = NULL; /* make happy gcc */
934 MultiFDPages_t *pages = multifd_send_state->pages;
935 uint64_t transferred;
936
4d65a621
JQ
937 if (atomic_read(&multifd_send_state->exiting)) {
938 return -1;
939 }
940
b9ee2f7d
JQ
941 qemu_sem_wait(&multifd_send_state->channels_ready);
942 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
943 p = &multifd_send_state->params[i];
944
945 qemu_mutex_lock(&p->mutex);
713f762a
IR
946 if (p->quit) {
947 error_report("%s: channel %d has already quit!", __func__, i);
948 qemu_mutex_unlock(&p->mutex);
949 return -1;
950 }
b9ee2f7d
JQ
951 if (!p->pending_job) {
952 p->pending_job++;
953 next_channel = (i + 1) % migrate_multifd_channels();
954 break;
955 }
956 qemu_mutex_unlock(&p->mutex);
957 }
958 p->pages->used = 0;
959
960 p->packet_num = multifd_send_state->packet_num++;
961 p->pages->block = NULL;
962 multifd_send_state->pages = p->pages;
963 p->pages = pages;
4fcefd44 964 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
1b81c974 965 qemu_file_update_transfer(rs->f, transferred);
b9ee2f7d
JQ
966 ram_counters.multifd_bytes += transferred;
967 ram_counters.transferred += transferred;;
968 qemu_mutex_unlock(&p->mutex);
969 qemu_sem_post(&p->sem);
713f762a
IR
970
971 return 1;
b9ee2f7d
JQ
972}
973
1b81c974 974static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
b9ee2f7d
JQ
975{
976 MultiFDPages_t *pages = multifd_send_state->pages;
977
978 if (!pages->block) {
979 pages->block = block;
980 }
981
982 if (pages->block == block) {
983 pages->offset[pages->used] = offset;
984 pages->iov[pages->used].iov_base = block->host + offset;
985 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
986 pages->used++;
987
988 if (pages->used < pages->allocated) {
713f762a 989 return 1;
b9ee2f7d
JQ
990 }
991 }
992
1b81c974 993 if (multifd_send_pages(rs) < 0) {
713f762a
IR
994 return -1;
995 }
b9ee2f7d
JQ
996
997 if (pages->block != block) {
1b81c974 998 return multifd_queue_page(rs, block, offset);
b9ee2f7d 999 }
713f762a
IR
1000
1001 return 1;
b9ee2f7d
JQ
1002}
1003
66770707 1004static void multifd_send_terminate_threads(Error *err)
f986c3d2
JQ
1005{
1006 int i;
1007
5558c91a
JQ
1008 trace_multifd_send_terminate_threads(err != NULL);
1009
7a169d74
JQ
1010 if (err) {
1011 MigrationState *s = migrate_get_current();
1012 migrate_set_error(s, err);
1013 if (s->state == MIGRATION_STATUS_SETUP ||
1014 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1015 s->state == MIGRATION_STATUS_DEVICE ||
1016 s->state == MIGRATION_STATUS_ACTIVE) {
1017 migrate_set_state(&s->state, s->state,
1018 MIGRATION_STATUS_FAILED);
1019 }
1020 }
1021
4d65a621
JQ
1022 /*
1023 * We don't want to exit each threads twice. Depending on where
1024 * we get the error, or if there are two independent errors in two
1025 * threads at the same time, we can end calling this function
1026 * twice.
1027 */
1028 if (atomic_xchg(&multifd_send_state->exiting, 1)) {
1029 return;
1030 }
1031
66770707 1032 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1033 MultiFDSendParams *p = &multifd_send_state->params[i];
1034
1035 qemu_mutex_lock(&p->mutex);
1036 p->quit = true;
1037 qemu_sem_post(&p->sem);
1038 qemu_mutex_unlock(&p->mutex);
1039 }
1040}
1041
1398b2e3 1042void multifd_save_cleanup(void)
f986c3d2
JQ
1043{
1044 int i;
f986c3d2
JQ
1045
1046 if (!migrate_use_multifd()) {
1398b2e3 1047 return;
f986c3d2 1048 }
66770707
JQ
1049 multifd_send_terminate_threads(NULL);
1050 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1051 MultiFDSendParams *p = &multifd_send_state->params[i];
1052
66770707
JQ
1053 if (p->running) {
1054 qemu_thread_join(&p->thread);
1055 }
60df2d4a
JQ
1056 socket_send_channel_destroy(p->c);
1057 p->c = NULL;
f986c3d2
JQ
1058 qemu_mutex_destroy(&p->mutex);
1059 qemu_sem_destroy(&p->sem);
18cdcea3 1060 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1061 g_free(p->name);
1062 p->name = NULL;
34c55a94
JQ
1063 multifd_pages_clear(p->pages);
1064 p->pages = NULL;
2a26c979
JQ
1065 p->packet_len = 0;
1066 g_free(p->packet);
1067 p->packet = NULL;
f986c3d2 1068 }
b9ee2f7d 1069 qemu_sem_destroy(&multifd_send_state->channels_ready);
f986c3d2
JQ
1070 g_free(multifd_send_state->params);
1071 multifd_send_state->params = NULL;
34c55a94
JQ
1072 multifd_pages_clear(multifd_send_state->pages);
1073 multifd_send_state->pages = NULL;
f986c3d2
JQ
1074 g_free(multifd_send_state);
1075 multifd_send_state = NULL;
f986c3d2
JQ
1076}
1077
1b81c974 1078static void multifd_send_sync_main(RAMState *rs)
6df264ac
JQ
1079{
1080 int i;
1081
1082 if (!migrate_use_multifd()) {
1083 return;
1084 }
b9ee2f7d 1085 if (multifd_send_state->pages->used) {
1b81c974 1086 if (multifd_send_pages(rs) < 0) {
713f762a
IR
1087 error_report("%s: multifd_send_pages fail", __func__);
1088 return;
1089 }
b9ee2f7d 1090 }
6df264ac
JQ
1091 for (i = 0; i < migrate_multifd_channels(); i++) {
1092 MultiFDSendParams *p = &multifd_send_state->params[i];
1093
1094 trace_multifd_send_sync_main_signal(p->id);
1095
1096 qemu_mutex_lock(&p->mutex);
b9ee2f7d 1097
713f762a
IR
1098 if (p->quit) {
1099 error_report("%s: channel %d has already quit", __func__, i);
1100 qemu_mutex_unlock(&p->mutex);
1101 return;
1102 }
1103
b9ee2f7d 1104 p->packet_num = multifd_send_state->packet_num++;
6df264ac
JQ
1105 p->flags |= MULTIFD_FLAG_SYNC;
1106 p->pending_job++;
1b81c974 1107 qemu_file_update_transfer(rs->f, p->packet_len);
81507f6b
IR
1108 ram_counters.multifd_bytes += p->packet_len;
1109 ram_counters.transferred += p->packet_len;
6df264ac
JQ
1110 qemu_mutex_unlock(&p->mutex);
1111 qemu_sem_post(&p->sem);
1112 }
1113 for (i = 0; i < migrate_multifd_channels(); i++) {
1114 MultiFDSendParams *p = &multifd_send_state->params[i];
1115
1116 trace_multifd_send_sync_main_wait(p->id);
18cdcea3 1117 qemu_sem_wait(&p->sem_sync);
6df264ac
JQ
1118 }
1119 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1120}
1121
f986c3d2
JQ
1122static void *multifd_send_thread(void *opaque)
1123{
1124 MultiFDSendParams *p = opaque;
af8b7d2b 1125 Error *local_err = NULL;
a3ec6b7d
IR
1126 int ret = 0;
1127 uint32_t flags = 0;
af8b7d2b 1128
408ea6ae 1129 trace_multifd_send_thread_start(p->id);
74637e6f 1130 rcu_register_thread();
408ea6ae 1131
af8b7d2b 1132 if (multifd_send_initial_packet(p, &local_err) < 0) {
2f4aefd3 1133 ret = -1;
af8b7d2b
JQ
1134 goto out;
1135 }
408ea6ae
JQ
1136 /* initial packet */
1137 p->num_packets = 1;
f986c3d2
JQ
1138
1139 while (true) {
d82628e4 1140 qemu_sem_wait(&p->sem);
4d65a621
JQ
1141
1142 if (atomic_read(&multifd_send_state->exiting)) {
1143 break;
1144 }
f986c3d2 1145 qemu_mutex_lock(&p->mutex);
0beb5ed3
JQ
1146
1147 if (p->pending_job) {
1148 uint32_t used = p->pages->used;
1149 uint64_t packet_num = p->packet_num;
a3ec6b7d 1150 flags = p->flags;
0beb5ed3 1151
2a34ee59 1152 p->next_packet_size = used * qemu_target_page_size();
0beb5ed3
JQ
1153 multifd_send_fill_packet(p);
1154 p->flags = 0;
1155 p->num_packets++;
1156 p->num_pages += used;
0beb5ed3
JQ
1157 qemu_mutex_unlock(&p->mutex);
1158
2a34ee59
JQ
1159 trace_multifd_send(p->id, packet_num, used, flags,
1160 p->next_packet_size);
0beb5ed3 1161
8b2db7f5
JQ
1162 ret = qio_channel_write_all(p->c, (void *)p->packet,
1163 p->packet_len, &local_err);
1164 if (ret != 0) {
1165 break;
1166 }
1167
ad24c7cb
JQ
1168 if (used) {
1169 ret = qio_channel_writev_all(p->c, p->pages->iov,
1170 used, &local_err);
1171 if (ret != 0) {
1172 break;
1173 }
8b2db7f5 1174 }
0beb5ed3
JQ
1175
1176 qemu_mutex_lock(&p->mutex);
1177 p->pending_job--;
1178 qemu_mutex_unlock(&p->mutex);
6df264ac
JQ
1179
1180 if (flags & MULTIFD_FLAG_SYNC) {
18cdcea3 1181 qemu_sem_post(&p->sem_sync);
6df264ac 1182 }
b9ee2f7d 1183 qemu_sem_post(&multifd_send_state->channels_ready);
0beb5ed3 1184 } else if (p->quit) {
f986c3d2
JQ
1185 qemu_mutex_unlock(&p->mutex);
1186 break;
6df264ac
JQ
1187 } else {
1188 qemu_mutex_unlock(&p->mutex);
1189 /* sometimes there are spurious wakeups */
f986c3d2 1190 }
f986c3d2
JQ
1191 }
1192
af8b7d2b
JQ
1193out:
1194 if (local_err) {
7dd59d01 1195 trace_multifd_send_error(p->id);
af8b7d2b
JQ
1196 multifd_send_terminate_threads(local_err);
1197 }
1198
a3ec6b7d
IR
1199 /*
1200 * Error happen, I will exit, but I can't just leave, tell
1201 * who pay attention to me.
1202 */
1203 if (ret != 0) {
2f4aefd3 1204 qemu_sem_post(&p->sem_sync);
a3ec6b7d
IR
1205 qemu_sem_post(&multifd_send_state->channels_ready);
1206 }
1207
66770707
JQ
1208 qemu_mutex_lock(&p->mutex);
1209 p->running = false;
1210 qemu_mutex_unlock(&p->mutex);
1211
74637e6f 1212 rcu_unregister_thread();
408ea6ae
JQ
1213 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1214
f986c3d2
JQ
1215 return NULL;
1216}
1217
60df2d4a
JQ
1218static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1219{
1220 MultiFDSendParams *p = opaque;
1221 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1222 Error *local_err = NULL;
1223
7dd59d01 1224 trace_multifd_new_send_channel_async(p->id);
60df2d4a 1225 if (qio_task_propagate_error(task, &local_err)) {
1398b2e3
FL
1226 migrate_set_error(migrate_get_current(), local_err);
1227 multifd_save_cleanup();
60df2d4a
JQ
1228 } else {
1229 p->c = QIO_CHANNEL(sioc);
1230 qio_channel_set_delay(p->c, false);
1231 p->running = true;
1232 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1233 QEMU_THREAD_JOINABLE);
60df2d4a
JQ
1234 }
1235}
1236
f986c3d2
JQ
1237int multifd_save_setup(void)
1238{
1239 int thread_count;
efd1a1d6 1240 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
f986c3d2
JQ
1241 uint8_t i;
1242
1243 if (!migrate_use_multifd()) {
1244 return 0;
1245 }
1246 thread_count = migrate_multifd_channels();
1247 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1248 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
34c55a94 1249 multifd_send_state->pages = multifd_pages_init(page_count);
b9ee2f7d 1250 qemu_sem_init(&multifd_send_state->channels_ready, 0);
4d65a621 1251 atomic_set(&multifd_send_state->exiting, 0);
34c55a94 1252
f986c3d2
JQ
1253 for (i = 0; i < thread_count; i++) {
1254 MultiFDSendParams *p = &multifd_send_state->params[i];
1255
1256 qemu_mutex_init(&p->mutex);
1257 qemu_sem_init(&p->sem, 0);
18cdcea3 1258 qemu_sem_init(&p->sem_sync, 0);
f986c3d2 1259 p->quit = false;
0beb5ed3 1260 p->pending_job = 0;
f986c3d2 1261 p->id = i;
34c55a94 1262 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1263 p->packet_len = sizeof(MultiFDPacket_t)
1264 + sizeof(ram_addr_t) * page_count;
1265 p->packet = g_malloc0(p->packet_len);
9985e1f4
WY
1266 p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
1267 p->packet->version = cpu_to_be32(MULTIFD_VERSION);
f986c3d2 1268 p->name = g_strdup_printf("multifdsend_%d", i);
60df2d4a 1269 socket_send_channel_create(multifd_new_send_channel_async, p);
f986c3d2
JQ
1270 }
1271 return 0;
1272}
1273
f986c3d2
JQ
1274struct {
1275 MultiFDRecvParams *params;
1276 /* number of created threads */
1277 int count;
6df264ac
JQ
1278 /* syncs main thread and channels */
1279 QemuSemaphore sem_sync;
1280 /* global number of generated multifd packets */
1281 uint64_t packet_num;
f986c3d2
JQ
1282} *multifd_recv_state;
1283
66770707 1284static void multifd_recv_terminate_threads(Error *err)
f986c3d2
JQ
1285{
1286 int i;
1287
5558c91a
JQ
1288 trace_multifd_recv_terminate_threads(err != NULL);
1289
7a169d74
JQ
1290 if (err) {
1291 MigrationState *s = migrate_get_current();
1292 migrate_set_error(s, err);
1293 if (s->state == MIGRATION_STATUS_SETUP ||
1294 s->state == MIGRATION_STATUS_ACTIVE) {
1295 migrate_set_state(&s->state, s->state,
1296 MIGRATION_STATUS_FAILED);
1297 }
1298 }
1299
66770707 1300 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1301 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1302
1303 qemu_mutex_lock(&p->mutex);
3c3ca25d 1304 p->quit = true;
7a5cc33c
JQ
1305 /* We could arrive here for two reasons:
1306 - normal quit, i.e. everything went fine, just finished
1307 - error quit: We close the channels so the channel threads
1308 finish the qio_channel_read_all_eof() */
1309 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
f986c3d2
JQ
1310 qemu_mutex_unlock(&p->mutex);
1311 }
1312}
1313
1314int multifd_load_cleanup(Error **errp)
1315{
1316 int i;
1317 int ret = 0;
1318
1319 if (!migrate_use_multifd()) {
1320 return 0;
1321 }
66770707
JQ
1322 multifd_recv_terminate_threads(NULL);
1323 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1324 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1325
66770707 1326 if (p->running) {
3c3ca25d 1327 p->quit = true;
f193bc0c
IR
1328 /*
1329 * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1330 * however try to wakeup it without harm in cleanup phase.
1331 */
1332 qemu_sem_post(&p->sem_sync);
66770707
JQ
1333 qemu_thread_join(&p->thread);
1334 }
60df2d4a
JQ
1335 object_unref(OBJECT(p->c));
1336 p->c = NULL;
f986c3d2 1337 qemu_mutex_destroy(&p->mutex);
6df264ac 1338 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1339 g_free(p->name);
1340 p->name = NULL;
34c55a94
JQ
1341 multifd_pages_clear(p->pages);
1342 p->pages = NULL;
2a26c979
JQ
1343 p->packet_len = 0;
1344 g_free(p->packet);
1345 p->packet = NULL;
f986c3d2 1346 }
6df264ac 1347 qemu_sem_destroy(&multifd_recv_state->sem_sync);
f986c3d2
JQ
1348 g_free(multifd_recv_state->params);
1349 multifd_recv_state->params = NULL;
1350 g_free(multifd_recv_state);
1351 multifd_recv_state = NULL;
1352
1353 return ret;
1354}
1355
6df264ac
JQ
1356static void multifd_recv_sync_main(void)
1357{
1358 int i;
1359
1360 if (!migrate_use_multifd()) {
1361 return;
1362 }
1363 for (i = 0; i < migrate_multifd_channels(); i++) {
1364 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1365
6df264ac
JQ
1366 trace_multifd_recv_sync_main_wait(p->id);
1367 qemu_sem_wait(&multifd_recv_state->sem_sync);
77568ea7
WY
1368 }
1369 for (i = 0; i < migrate_multifd_channels(); i++) {
1370 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1371
6df264ac
JQ
1372 qemu_mutex_lock(&p->mutex);
1373 if (multifd_recv_state->packet_num < p->packet_num) {
1374 multifd_recv_state->packet_num = p->packet_num;
1375 }
1376 qemu_mutex_unlock(&p->mutex);
6df264ac 1377 trace_multifd_recv_sync_main_signal(p->id);
6df264ac
JQ
1378 qemu_sem_post(&p->sem_sync);
1379 }
1380 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1381}
1382
f986c3d2
JQ
1383static void *multifd_recv_thread(void *opaque)
1384{
1385 MultiFDRecvParams *p = opaque;
2a26c979
JQ
1386 Error *local_err = NULL;
1387 int ret;
f986c3d2 1388
408ea6ae 1389 trace_multifd_recv_thread_start(p->id);
74637e6f 1390 rcu_register_thread();
408ea6ae 1391
f986c3d2 1392 while (true) {
6df264ac
JQ
1393 uint32_t used;
1394 uint32_t flags;
0beb5ed3 1395
3c3ca25d
JQ
1396 if (p->quit) {
1397 break;
1398 }
1399
8b2db7f5
JQ
1400 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1401 p->packet_len, &local_err);
1402 if (ret == 0) { /* EOF */
1403 break;
1404 }
1405 if (ret == -1) { /* Error */
1406 break;
1407 }
2a26c979 1408
6df264ac
JQ
1409 qemu_mutex_lock(&p->mutex);
1410 ret = multifd_recv_unfill_packet(p, &local_err);
1411 if (ret) {
f986c3d2
JQ
1412 qemu_mutex_unlock(&p->mutex);
1413 break;
1414 }
6df264ac
JQ
1415
1416 used = p->pages->used;
1417 flags = p->flags;
2a34ee59
JQ
1418 trace_multifd_recv(p->id, p->packet_num, used, flags,
1419 p->next_packet_size);
6df264ac
JQ
1420 p->num_packets++;
1421 p->num_pages += used;
f986c3d2 1422 qemu_mutex_unlock(&p->mutex);
6df264ac 1423
ad24c7cb
JQ
1424 if (used) {
1425 ret = qio_channel_readv_all(p->c, p->pages->iov,
1426 used, &local_err);
1427 if (ret != 0) {
1428 break;
1429 }
8b2db7f5
JQ
1430 }
1431
6df264ac
JQ
1432 if (flags & MULTIFD_FLAG_SYNC) {
1433 qemu_sem_post(&multifd_recv_state->sem_sync);
1434 qemu_sem_wait(&p->sem_sync);
1435 }
f986c3d2
JQ
1436 }
1437
d82628e4
JQ
1438 if (local_err) {
1439 multifd_recv_terminate_threads(local_err);
1440 }
66770707
JQ
1441 qemu_mutex_lock(&p->mutex);
1442 p->running = false;
1443 qemu_mutex_unlock(&p->mutex);
1444
74637e6f 1445 rcu_unregister_thread();
408ea6ae
JQ
1446 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1447
f986c3d2
JQ
1448 return NULL;
1449}
1450
1451int multifd_load_setup(void)
1452{
1453 int thread_count;
efd1a1d6 1454 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
f986c3d2
JQ
1455 uint8_t i;
1456
1457 if (!migrate_use_multifd()) {
1458 return 0;
1459 }
1460 thread_count = migrate_multifd_channels();
1461 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1462 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
66770707 1463 atomic_set(&multifd_recv_state->count, 0);
6df264ac 1464 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
34c55a94 1465
f986c3d2
JQ
1466 for (i = 0; i < thread_count; i++) {
1467 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1468
1469 qemu_mutex_init(&p->mutex);
6df264ac 1470 qemu_sem_init(&p->sem_sync, 0);
3c3ca25d 1471 p->quit = false;
f986c3d2 1472 p->id = i;
34c55a94 1473 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1474 p->packet_len = sizeof(MultiFDPacket_t)
1475 + sizeof(ram_addr_t) * page_count;
1476 p->packet = g_malloc0(p->packet_len);
f986c3d2 1477 p->name = g_strdup_printf("multifdrecv_%d", i);
f986c3d2
JQ
1478 }
1479 return 0;
1480}
1481
62c1e0ca
JQ
1482bool multifd_recv_all_channels_created(void)
1483{
1484 int thread_count = migrate_multifd_channels();
1485
1486 if (!migrate_use_multifd()) {
1487 return true;
1488 }
1489
1490 return thread_count == atomic_read(&multifd_recv_state->count);
1491}
1492
49ed0d24
FL
1493/*
1494 * Try to receive all multifd channels to get ready for the migration.
1495 * - Return true and do not set @errp when correctly receving all channels;
1496 * - Return false and do not set @errp when correctly receiving the current one;
1497 * - Return false and set @errp when failing to receive the current channel.
1498 */
1499bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
71bb07db 1500{
60df2d4a 1501 MultiFDRecvParams *p;
af8b7d2b
JQ
1502 Error *local_err = NULL;
1503 int id;
60df2d4a 1504
af8b7d2b
JQ
1505 id = multifd_recv_initial_packet(ioc, &local_err);
1506 if (id < 0) {
1507 multifd_recv_terminate_threads(local_err);
49ed0d24
FL
1508 error_propagate_prepend(errp, local_err,
1509 "failed to receive packet"
1510 " via multifd channel %d: ",
1511 atomic_read(&multifd_recv_state->count));
81e62053 1512 return false;
af8b7d2b 1513 }
7dd59d01 1514 trace_multifd_recv_new_channel(id);
af8b7d2b
JQ
1515
1516 p = &multifd_recv_state->params[id];
1517 if (p->c != NULL) {
1518 error_setg(&local_err, "multifd: received id '%d' already setup'",
1519 id);
1520 multifd_recv_terminate_threads(local_err);
49ed0d24 1521 error_propagate(errp, local_err);
81e62053 1522 return false;
af8b7d2b 1523 }
60df2d4a
JQ
1524 p->c = ioc;
1525 object_ref(OBJECT(ioc));
408ea6ae
JQ
1526 /* initial packet */
1527 p->num_packets = 1;
60df2d4a
JQ
1528
1529 p->running = true;
1530 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1531 QEMU_THREAD_JOINABLE);
1532 atomic_inc(&multifd_recv_state->count);
49ed0d24
FL
1533 return atomic_read(&multifd_recv_state->count) ==
1534 migrate_multifd_channels();
71bb07db
JQ
1535}
1536
56e93d26 1537/**
3d0684b2 1538 * save_page_header: write page header to wire
56e93d26
JQ
1539 *
1540 * If this is the 1st block, it also writes the block identification
1541 *
3d0684b2 1542 * Returns the number of bytes written
56e93d26
JQ
1543 *
1544 * @f: QEMUFile where to send the data
1545 * @block: block that contains the page we want to send
1546 * @offset: offset inside the block for the page
1547 * in the lower bits, it contains flags
1548 */
2bf3aa85
JQ
1549static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1550 ram_addr_t offset)
56e93d26 1551{
9f5f380b 1552 size_t size, len;
56e93d26 1553
24795694
JQ
1554 if (block == rs->last_sent_block) {
1555 offset |= RAM_SAVE_FLAG_CONTINUE;
1556 }
2bf3aa85 1557 qemu_put_be64(f, offset);
56e93d26
JQ
1558 size = 8;
1559
1560 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 1561 len = strlen(block->idstr);
2bf3aa85
JQ
1562 qemu_put_byte(f, len);
1563 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 1564 size += 1 + len;
24795694 1565 rs->last_sent_block = block;
56e93d26
JQ
1566 }
1567 return size;
1568}
1569
3d0684b2
JQ
1570/**
1571 * mig_throttle_guest_down: throotle down the guest
1572 *
1573 * Reduce amount of guest cpu execution to hopefully slow down memory
1574 * writes. If guest dirty memory rate is reduced below the rate at
1575 * which we can transfer pages to the destination then we should be
1576 * able to complete migration. Some workloads dirty memory way too
1577 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
1578 */
1579static void mig_throttle_guest_down(void)
1580{
1581 MigrationState *s = migrate_get_current();
2594f56d
DB
1582 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1583 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
4cbc9c7f 1584 int pct_max = s->parameters.max_cpu_throttle;
070afca2
JH
1585
1586 /* We have not started throttling yet. Let's start it. */
1587 if (!cpu_throttle_active()) {
1588 cpu_throttle_set(pct_initial);
1589 } else {
1590 /* Throttling already on, just increase the rate */
4cbc9c7f
LQ
1591 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1592 pct_max));
070afca2
JH
1593 }
1594}
1595
3d0684b2
JQ
1596/**
1597 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1598 *
6f37bb8b 1599 * @rs: current RAM state
3d0684b2
JQ
1600 * @current_addr: address for the zero page
1601 *
1602 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
1603 * The important thing is that a stale (not-yet-0'd) page be replaced
1604 * by the new data.
1605 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 1606 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 1607 */
6f37bb8b 1608static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 1609{
6f37bb8b 1610 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
1611 return;
1612 }
1613
1614 /* We don't care if this fails to allocate a new cache page
1615 * as long as it updated an old one */
c00e0928 1616 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 1617 ram_counters.dirty_sync_count);
56e93d26
JQ
1618}
1619
1620#define ENCODING_FLAG_XBZRLE 0x1
1621
1622/**
1623 * save_xbzrle_page: compress and send current page
1624 *
1625 * Returns: 1 means that we wrote the page
1626 * 0 means that page is identical to the one already sent
1627 * -1 means that xbzrle would be longer than normal
1628 *
5a987738 1629 * @rs: current RAM state
3d0684b2
JQ
1630 * @current_data: pointer to the address of the page contents
1631 * @current_addr: addr of the page
56e93d26
JQ
1632 * @block: block that contains the page we want to send
1633 * @offset: offset inside the block for the page
1634 * @last_stage: if we are at the completion stage
56e93d26 1635 */
204b88b8 1636static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 1637 ram_addr_t current_addr, RAMBlock *block,
072c2511 1638 ram_addr_t offset, bool last_stage)
56e93d26
JQ
1639{
1640 int encoded_len = 0, bytes_xbzrle;
1641 uint8_t *prev_cached_page;
1642
9360447d
JQ
1643 if (!cache_is_cached(XBZRLE.cache, current_addr,
1644 ram_counters.dirty_sync_count)) {
1645 xbzrle_counters.cache_miss++;
56e93d26
JQ
1646 if (!last_stage) {
1647 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 1648 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
1649 return -1;
1650 } else {
1651 /* update *current_data when the page has been
1652 inserted into cache */
1653 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1654 }
1655 }
1656 return -1;
1657 }
1658
1659 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1660
1661 /* save current buffer into memory */
1662 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1663
1664 /* XBZRLE encoding (if there is no overflow) */
1665 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1666 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1667 TARGET_PAGE_SIZE);
ca353803
WY
1668
1669 /*
1670 * Update the cache contents, so that it corresponds to the data
1671 * sent, in all cases except where we skip the page.
1672 */
1673 if (!last_stage && encoded_len != 0) {
1674 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1675 /*
1676 * In the case where we couldn't compress, ensure that the caller
1677 * sends the data from the cache, since the guest might have
1678 * changed the RAM since we copied it.
1679 */
1680 *current_data = prev_cached_page;
1681 }
1682
56e93d26 1683 if (encoded_len == 0) {
55c4446b 1684 trace_save_xbzrle_page_skipping();
56e93d26
JQ
1685 return 0;
1686 } else if (encoded_len == -1) {
55c4446b 1687 trace_save_xbzrle_page_overflow();
9360447d 1688 xbzrle_counters.overflow++;
56e93d26
JQ
1689 return -1;
1690 }
1691
56e93d26 1692 /* Send XBZRLE based compressed page */
2bf3aa85 1693 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
1694 offset | RAM_SAVE_FLAG_XBZRLE);
1695 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1696 qemu_put_be16(rs->f, encoded_len);
1697 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 1698 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
1699 xbzrle_counters.pages++;
1700 xbzrle_counters.bytes += bytes_xbzrle;
1701 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
1702
1703 return 1;
1704}
1705
3d0684b2
JQ
1706/**
1707 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 1708 *
a5f7b1a6 1709 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 1710 *
6f37bb8b 1711 * @rs: current RAM state
3d0684b2 1712 * @rb: RAMBlock where to search for dirty pages
a935e30f 1713 * @start: page where we start the search
f3f491fc 1714 */
56e93d26 1715static inline
a935e30f 1716unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 1717 unsigned long start)
56e93d26 1718{
6b6712ef
JQ
1719 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1720 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
1721 unsigned long next;
1722
fbd162e6 1723 if (ramblock_is_ignored(rb)) {
b895de50
CLG
1724 return size;
1725 }
1726
6eeb63f7
WW
1727 /*
1728 * When the free page optimization is enabled, we need to check the bitmap
1729 * to send the non-free pages rather than all the pages in the bulk stage.
1730 */
1731 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
6b6712ef 1732 next = start + 1;
56e93d26 1733 } else {
6b6712ef 1734 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
1735 }
1736
6b6712ef 1737 return next;
56e93d26
JQ
1738}
1739
06b10688 1740static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
1741 RAMBlock *rb,
1742 unsigned long page)
a82d593b
DDAG
1743{
1744 bool ret;
a82d593b 1745
386a907b 1746 qemu_mutex_lock(&rs->bitmap_mutex);
002cad6b
PX
1747
1748 /*
1749 * Clear dirty bitmap if needed. This _must_ be called before we
1750 * send any of the page in the chunk because we need to make sure
1751 * we can capture further page content changes when we sync dirty
1752 * log the next time. So as long as we are going to send any of
1753 * the page in the chunk we clear the remote dirty bitmap for all.
1754 * Clearing it earlier won't be a problem, but too late will.
1755 */
1756 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1757 uint8_t shift = rb->clear_bmap_shift;
1758 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1759 hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1760
1761 /*
1762 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1763 * can make things easier sometimes since then start address
1764 * of the small chunk will always be 64 pages aligned so the
1765 * bitmap will always be aligned to unsigned long. We should
1766 * even be able to remove this restriction but I'm simply
1767 * keeping it.
1768 */
1769 assert(shift >= 6);
1770 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1771 memory_region_clear_dirty_bitmap(rb->mr, start, size);
1772 }
1773
6b6712ef 1774 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
1775
1776 if (ret) {
0d8ec885 1777 rs->migration_dirty_pages--;
a82d593b 1778 }
386a907b
WW
1779 qemu_mutex_unlock(&rs->bitmap_mutex);
1780
a82d593b
DDAG
1781 return ret;
1782}
1783
267691b6 1784/* Called with RCU critical section */
7a3e9571 1785static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1786{
0d8ec885 1787 rs->migration_dirty_pages +=
5d0980a4 1788 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
0d8ec885 1789 &rs->num_dirty_pages_period);
56e93d26
JQ
1790}
1791
3d0684b2
JQ
1792/**
1793 * ram_pagesize_summary: calculate all the pagesizes of a VM
1794 *
1795 * Returns a summary bitmap of the page sizes of all RAMBlocks
1796 *
1797 * For VMs with just normal pages this is equivalent to the host page
1798 * size. If it's got some huge pages then it's the OR of all the
1799 * different page sizes.
e8ca1db2
DDAG
1800 */
1801uint64_t ram_pagesize_summary(void)
1802{
1803 RAMBlock *block;
1804 uint64_t summary = 0;
1805
fbd162e6 1806 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1807 summary |= block->page_size;
1808 }
1809
1810 return summary;
1811}
1812
aecbfe9c
XG
1813uint64_t ram_get_total_transferred_pages(void)
1814{
1815 return ram_counters.normal + ram_counters.duplicate +
1816 compression_counters.pages + xbzrle_counters.pages;
1817}
1818
b734035b
XG
1819static void migration_update_rates(RAMState *rs, int64_t end_time)
1820{
be8b02ed 1821 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1822 double compressed_size;
b734035b
XG
1823
1824 /* calculate period counters */
1825 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1826 / (end_time - rs->time_last_bitmap_sync);
1827
be8b02ed 1828 if (!page_count) {
b734035b
XG
1829 return;
1830 }
1831
1832 if (migrate_use_xbzrle()) {
1833 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1834 rs->xbzrle_cache_miss_prev) / page_count;
b734035b
XG
1835 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1836 }
76e03000
XG
1837
1838 if (migrate_use_compression()) {
1839 compression_counters.busy_rate = (double)(compression_counters.busy -
1840 rs->compress_thread_busy_prev) / page_count;
1841 rs->compress_thread_busy_prev = compression_counters.busy;
1842
1843 compressed_size = compression_counters.compressed_size -
1844 rs->compressed_size_prev;
1845 if (compressed_size) {
1846 double uncompressed_size = (compression_counters.pages -
1847 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1848
1849 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1850 compression_counters.compression_rate =
1851 uncompressed_size / compressed_size;
1852
1853 rs->compress_pages_prev = compression_counters.pages;
1854 rs->compressed_size_prev = compression_counters.compressed_size;
1855 }
1856 }
b734035b
XG
1857}
1858
8d820d6f 1859static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1860{
1861 RAMBlock *block;
56e93d26 1862 int64_t end_time;
c4bdf0cf 1863 uint64_t bytes_xfer_now;
56e93d26 1864
9360447d 1865 ram_counters.dirty_sync_count++;
56e93d26 1866
f664da80
JQ
1867 if (!rs->time_last_bitmap_sync) {
1868 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1869 }
1870
1871 trace_migration_bitmap_sync_start();
9c1f8f44 1872 memory_global_dirty_log_sync();
56e93d26 1873
108cfae0 1874 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1875 WITH_RCU_READ_LOCK_GUARD() {
1876 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1877 ramblock_sync_dirty_bitmap(rs, block);
1878 }
1879 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1880 }
108cfae0 1881 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1882
9458a9a1 1883 memory_global_after_dirty_log_sync();
a66cd90c 1884 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1885
56e93d26
JQ
1886 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1887
1888 /* more than 1 second = 1000 millisecons */
f664da80 1889 if (end_time > rs->time_last_bitmap_sync + 1000) {
9360447d 1890 bytes_xfer_now = ram_counters.transferred;
d693c6f1 1891
9ac78b61
PL
1892 /* During block migration the auto-converge logic incorrectly detects
1893 * that ram migration makes no progress. Avoid this by disabling the
1894 * throttling logic during the bulk phase of block migration. */
1895 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
56e93d26
JQ
1896 /* The following detection logic can be refined later. For now:
1897 Check to see if the dirtied bytes is 50% more than the approx.
1898 amount of bytes that just got transferred since the last time we
070afca2
JH
1899 were in this routine. If that happens twice, start or increase
1900 throttling */
070afca2 1901
d693c6f1 1902 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 1903 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 1904 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 1905 trace_migration_throttle();
8d820d6f 1906 rs->dirty_rate_high_cnt = 0;
070afca2 1907 mig_throttle_guest_down();
d693c6f1 1908 }
56e93d26 1909 }
070afca2 1910
b734035b
XG
1911 migration_update_rates(rs, end_time);
1912
be8b02ed 1913 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1914
1915 /* reset period counters */
f664da80 1916 rs->time_last_bitmap_sync = end_time;
a66cd90c 1917 rs->num_dirty_pages_period = 0;
d2a4d85a 1918 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 1919 }
4addcd4f 1920 if (migrate_use_events()) {
3ab72385 1921 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1922 }
56e93d26
JQ
1923}
1924
bd227060
WW
1925static void migration_bitmap_sync_precopy(RAMState *rs)
1926{
1927 Error *local_err = NULL;
1928
1929 /*
1930 * The current notifier usage is just an optimization to migration, so we
1931 * don't stop the normal migration process in the error case.
1932 */
1933 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1934 error_report_err(local_err);
1935 }
1936
1937 migration_bitmap_sync(rs);
1938
1939 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1940 error_report_err(local_err);
1941 }
1942}
1943
6c97ec5f
XG
1944/**
1945 * save_zero_page_to_file: send the zero page to the file
1946 *
1947 * Returns the size of data written to the file, 0 means the page is not
1948 * a zero page
1949 *
1950 * @rs: current RAM state
1951 * @file: the file where the data is saved
1952 * @block: block that contains the page we want to send
1953 * @offset: offset inside the block for the page
1954 */
1955static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1956 RAMBlock *block, ram_addr_t offset)
1957{
1958 uint8_t *p = block->host + offset;
1959 int len = 0;
1960
1961 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1962 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1963 qemu_put_byte(file, 0);
1964 len += 1;
1965 }
1966 return len;
1967}
1968
56e93d26 1969/**
3d0684b2 1970 * save_zero_page: send the zero page to the stream
56e93d26 1971 *
3d0684b2 1972 * Returns the number of pages written.
56e93d26 1973 *
f7ccd61b 1974 * @rs: current RAM state
56e93d26
JQ
1975 * @block: block that contains the page we want to send
1976 * @offset: offset inside the block for the page
56e93d26 1977 */
7faccdc3 1978static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1979{
6c97ec5f 1980 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1981
6c97ec5f 1982 if (len) {
9360447d 1983 ram_counters.duplicate++;
6c97ec5f
XG
1984 ram_counters.transferred += len;
1985 return 1;
56e93d26 1986 }
6c97ec5f 1987 return -1;
56e93d26
JQ
1988}
1989
5727309d 1990static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1991{
5727309d 1992 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1993 return;
1994 }
1995
aaa2064c 1996 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
1997}
1998
059ff0fb
XG
1999/*
2000 * @pages: the number of pages written by the control path,
2001 * < 0 - error
2002 * > 0 - number of pages written
2003 *
2004 * Return true if the pages has been saved, otherwise false is returned.
2005 */
2006static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2007 int *pages)
2008{
2009 uint64_t bytes_xmit = 0;
2010 int ret;
2011
2012 *pages = -1;
2013 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
2014 &bytes_xmit);
2015 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
2016 return false;
2017 }
2018
2019 if (bytes_xmit) {
2020 ram_counters.transferred += bytes_xmit;
2021 *pages = 1;
2022 }
2023
2024 if (ret == RAM_SAVE_CONTROL_DELAYED) {
2025 return true;
2026 }
2027
2028 if (bytes_xmit > 0) {
2029 ram_counters.normal++;
2030 } else if (bytes_xmit == 0) {
2031 ram_counters.duplicate++;
2032 }
2033
2034 return true;
2035}
2036
65dacaa0
XG
2037/*
2038 * directly send the page to the stream
2039 *
2040 * Returns the number of pages written.
2041 *
2042 * @rs: current RAM state
2043 * @block: block that contains the page we want to send
2044 * @offset: offset inside the block for the page
2045 * @buf: the page to be sent
2046 * @async: send to page asyncly
2047 */
2048static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2049 uint8_t *buf, bool async)
2050{
2051 ram_counters.transferred += save_page_header(rs, rs->f, block,
2052 offset | RAM_SAVE_FLAG_PAGE);
2053 if (async) {
2054 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2055 migrate_release_ram() &
2056 migration_in_postcopy());
2057 } else {
2058 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2059 }
2060 ram_counters.transferred += TARGET_PAGE_SIZE;
2061 ram_counters.normal++;
2062 return 1;
2063}
2064
56e93d26 2065/**
3d0684b2 2066 * ram_save_page: send the given page to the stream
56e93d26 2067 *
3d0684b2 2068 * Returns the number of pages written.
3fd3c4b3
DDAG
2069 * < 0 - error
2070 * >=0 - Number of pages written - this might legally be 0
2071 * if xbzrle noticed the page was the same.
56e93d26 2072 *
6f37bb8b 2073 * @rs: current RAM state
56e93d26
JQ
2074 * @block: block that contains the page we want to send
2075 * @offset: offset inside the block for the page
2076 * @last_stage: if we are at the completion stage
56e93d26 2077 */
a0a8aa14 2078static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
2079{
2080 int pages = -1;
56e93d26 2081 uint8_t *p;
56e93d26 2082 bool send_async = true;
a08f6890 2083 RAMBlock *block = pss->block;
a935e30f 2084 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
059ff0fb 2085 ram_addr_t current_addr = block->offset + offset;
56e93d26 2086
2f68e399 2087 p = block->host + offset;
1db9d8e5 2088 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 2089
56e93d26 2090 XBZRLE_cache_lock();
d7400a34
XG
2091 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2092 migrate_use_xbzrle()) {
059ff0fb
XG
2093 pages = save_xbzrle_page(rs, &p, current_addr, block,
2094 offset, last_stage);
2095 if (!last_stage) {
2096 /* Can't send this cached data async, since the cache page
2097 * might get updated before it gets to the wire
56e93d26 2098 */
059ff0fb 2099 send_async = false;
56e93d26
JQ
2100 }
2101 }
2102
2103 /* XBZRLE overflow or normal page */
2104 if (pages == -1) {
65dacaa0 2105 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
2106 }
2107
2108 XBZRLE_cache_unlock();
2109
2110 return pages;
2111}
2112
b9ee2f7d
JQ
2113static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2114 ram_addr_t offset)
2115{
1b81c974 2116 if (multifd_queue_page(rs, block, offset) < 0) {
713f762a
IR
2117 return -1;
2118 }
b9ee2f7d
JQ
2119 ram_counters.normal++;
2120
2121 return 1;
2122}
2123
5e5fdcff 2124static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 2125 ram_addr_t offset, uint8_t *source_buf)
56e93d26 2126{
53518d94 2127 RAMState *rs = ram_state;
a7a9a88f 2128 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 2129 bool zero_page = false;
6ef3771c 2130 int ret;
56e93d26 2131
5e5fdcff
XG
2132 if (save_zero_page_to_file(rs, f, block, offset)) {
2133 zero_page = true;
2134 goto exit;
2135 }
2136
6ef3771c 2137 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
2138
2139 /*
2140 * copy it to a internal buffer to avoid it being modified by VM
2141 * so that we can catch up the error during compression and
2142 * decompression
2143 */
2144 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
2145 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2146 if (ret < 0) {
2147 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 2148 error_report("compressed data failed!");
5e5fdcff 2149 return false;
b3be2896 2150 }
56e93d26 2151
5e5fdcff 2152exit:
6ef3771c 2153 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
2154 return zero_page;
2155}
2156
2157static void
2158update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2159{
76e03000
XG
2160 ram_counters.transferred += bytes_xmit;
2161
5e5fdcff
XG
2162 if (param->zero_page) {
2163 ram_counters.duplicate++;
76e03000 2164 return;
5e5fdcff 2165 }
76e03000
XG
2166
2167 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2168 compression_counters.compressed_size += bytes_xmit - 8;
2169 compression_counters.pages++;
56e93d26
JQ
2170}
2171
32b05495
XG
2172static bool save_page_use_compression(RAMState *rs);
2173
ce25d337 2174static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
2175{
2176 int idx, len, thread_count;
2177
32b05495 2178 if (!save_page_use_compression(rs)) {
56e93d26
JQ
2179 return;
2180 }
2181 thread_count = migrate_compress_threads();
a7a9a88f 2182
0d9f9a5c 2183 qemu_mutex_lock(&comp_done_lock);
56e93d26 2184 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 2185 while (!comp_param[idx].done) {
0d9f9a5c 2186 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 2187 }
a7a9a88f 2188 }
0d9f9a5c 2189 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
2190
2191 for (idx = 0; idx < thread_count; idx++) {
2192 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 2193 if (!comp_param[idx].quit) {
ce25d337 2194 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
2195 /*
2196 * it's safe to fetch zero_page without holding comp_done_lock
2197 * as there is no further request submitted to the thread,
2198 * i.e, the thread should be waiting for a request at this point.
2199 */
2200 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 2201 }
a7a9a88f 2202 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
2203 }
2204}
2205
2206static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2207 ram_addr_t offset)
2208{
2209 param->block = block;
2210 param->offset = offset;
2211}
2212
ce25d337
JQ
2213static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2214 ram_addr_t offset)
56e93d26
JQ
2215{
2216 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 2217 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
2218
2219 thread_count = migrate_compress_threads();
0d9f9a5c 2220 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
2221retry:
2222 for (idx = 0; idx < thread_count; idx++) {
2223 if (comp_param[idx].done) {
2224 comp_param[idx].done = false;
2225 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2226 qemu_mutex_lock(&comp_param[idx].mutex);
2227 set_compress_params(&comp_param[idx], block, offset);
2228 qemu_cond_signal(&comp_param[idx].cond);
2229 qemu_mutex_unlock(&comp_param[idx].mutex);
2230 pages = 1;
5e5fdcff 2231 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 2232 break;
56e93d26
JQ
2233 }
2234 }
1d58872a
XG
2235
2236 /*
2237 * wait for the free thread if the user specifies 'compress-wait-thread',
2238 * otherwise we will post the page out in the main thread as normal page.
2239 */
2240 if (pages < 0 && wait) {
2241 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2242 goto retry;
2243 }
0d9f9a5c 2244 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
2245
2246 return pages;
2247}
2248
3d0684b2
JQ
2249/**
2250 * find_dirty_block: find the next dirty page and update any state
2251 * associated with the search process.
b9e60928 2252 *
a5f7b1a6 2253 * Returns true if a page is found
b9e60928 2254 *
6f37bb8b 2255 * @rs: current RAM state
3d0684b2
JQ
2256 * @pss: data about the state of the current dirty page scan
2257 * @again: set to false if the search has scanned the whole of RAM
b9e60928 2258 */
f20e2865 2259static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 2260{
f20e2865 2261 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 2262 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 2263 pss->page >= rs->last_page) {
b9e60928
DDAG
2264 /*
2265 * We've been once around the RAM and haven't found anything.
2266 * Give up.
2267 */
2268 *again = false;
2269 return false;
2270 }
a935e30f 2271 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 2272 /* Didn't find anything in this RAM Block */
a935e30f 2273 pss->page = 0;
b9e60928
DDAG
2274 pss->block = QLIST_NEXT_RCU(pss->block, next);
2275 if (!pss->block) {
48df9d80
XG
2276 /*
2277 * If memory migration starts over, we will meet a dirtied page
2278 * which may still exists in compression threads's ring, so we
2279 * should flush the compressed data to make sure the new page
2280 * is not overwritten by the old one in the destination.
2281 *
2282 * Also If xbzrle is on, stop using the data compression at this
2283 * point. In theory, xbzrle can do better than compression.
2284 */
2285 flush_compressed_data(rs);
2286
b9e60928
DDAG
2287 /* Hit the end of the list */
2288 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2289 /* Flag that we've looped */
2290 pss->complete_round = true;
6f37bb8b 2291 rs->ram_bulk_stage = false;
b9e60928
DDAG
2292 }
2293 /* Didn't find anything this time, but try again on the new block */
2294 *again = true;
2295 return false;
2296 } else {
2297 /* Can go around again, but... */
2298 *again = true;
2299 /* We've found something so probably don't need to */
2300 return true;
2301 }
2302}
2303
3d0684b2
JQ
2304/**
2305 * unqueue_page: gets a page of the queue
2306 *
a82d593b 2307 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 2308 *
3d0684b2
JQ
2309 * Returns the block of the page (or NULL if none available)
2310 *
ec481c6c 2311 * @rs: current RAM state
3d0684b2 2312 * @offset: used to return the offset within the RAMBlock
a82d593b 2313 */
f20e2865 2314static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
2315{
2316 RAMBlock *block = NULL;
2317
ae526e32
XG
2318 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2319 return NULL;
2320 }
2321
ec481c6c
JQ
2322 qemu_mutex_lock(&rs->src_page_req_mutex);
2323 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2324 struct RAMSrcPageRequest *entry =
2325 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
2326 block = entry->rb;
2327 *offset = entry->offset;
a82d593b
DDAG
2328
2329 if (entry->len > TARGET_PAGE_SIZE) {
2330 entry->len -= TARGET_PAGE_SIZE;
2331 entry->offset += TARGET_PAGE_SIZE;
2332 } else {
2333 memory_region_unref(block->mr);
ec481c6c 2334 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 2335 g_free(entry);
e03a34f8 2336 migration_consume_urgent_request();
a82d593b
DDAG
2337 }
2338 }
ec481c6c 2339 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
2340
2341 return block;
2342}
2343
3d0684b2 2344/**
ff1543af 2345 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2346 *
2347 * Skips pages that are already sent (!dirty)
a82d593b 2348 *
a5f7b1a6 2349 * Returns true if a queued page is found
a82d593b 2350 *
6f37bb8b 2351 * @rs: current RAM state
3d0684b2 2352 * @pss: data about the state of the current dirty page scan
a82d593b 2353 */
f20e2865 2354static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2355{
2356 RAMBlock *block;
2357 ram_addr_t offset;
2358 bool dirty;
2359
2360 do {
f20e2865 2361 block = unqueue_page(rs, &offset);
a82d593b
DDAG
2362 /*
2363 * We're sending this page, and since it's postcopy nothing else
2364 * will dirty it, and we must make sure it doesn't get sent again
2365 * even if this queue request was received after the background
2366 * search already sent it.
2367 */
2368 if (block) {
f20e2865
JQ
2369 unsigned long page;
2370
6b6712ef
JQ
2371 page = offset >> TARGET_PAGE_BITS;
2372 dirty = test_bit(page, block->bmap);
a82d593b 2373 if (!dirty) {
06b10688 2374 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
64737606 2375 page);
a82d593b 2376 } else {
f20e2865 2377 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
2378 }
2379 }
2380
2381 } while (block && !dirty);
2382
2383 if (block) {
2384 /*
2385 * As soon as we start servicing pages out of order, then we have
2386 * to kill the bulk stage, since the bulk stage assumes
2387 * in (migration_bitmap_find_and_reset_dirty) that every page is
2388 * dirty, that's no longer true.
2389 */
6f37bb8b 2390 rs->ram_bulk_stage = false;
a82d593b
DDAG
2391
2392 /*
2393 * We want the background search to continue from the queued page
2394 * since the guest is likely to want other pages near to the page
2395 * it just requested.
2396 */
2397 pss->block = block;
a935e30f 2398 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2399
2400 /*
2401 * This unqueued page would break the "one round" check, even is
2402 * really rare.
2403 */
2404 pss->complete_round = false;
a82d593b
DDAG
2405 }
2406
2407 return !!block;
2408}
2409
6c595cde 2410/**
5e58f968
JQ
2411 * migration_page_queue_free: drop any remaining pages in the ram
2412 * request queue
6c595cde 2413 *
3d0684b2
JQ
2414 * It should be empty at the end anyway, but in error cases there may
2415 * be some left. in case that there is any page left, we drop it.
2416 *
6c595cde 2417 */
83c13382 2418static void migration_page_queue_free(RAMState *rs)
6c595cde 2419{
ec481c6c 2420 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2421 /* This queue generally should be empty - but in the case of a failed
2422 * migration might have some droppings in.
2423 */
89ac5a1d 2424 RCU_READ_LOCK_GUARD();
ec481c6c 2425 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2426 memory_region_unref(mspr->rb->mr);
ec481c6c 2427 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2428 g_free(mspr);
2429 }
6c595cde
DDAG
2430}
2431
2432/**
3d0684b2
JQ
2433 * ram_save_queue_pages: queue the page for transmission
2434 *
2435 * A request from postcopy destination for example.
2436 *
2437 * Returns zero on success or negative on error
2438 *
3d0684b2
JQ
2439 * @rbname: Name of the RAMBLock of the request. NULL means the
2440 * same that last one.
2441 * @start: starting address from the start of the RAMBlock
2442 * @len: length (in bytes) to send
6c595cde 2443 */
96506894 2444int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2445{
2446 RAMBlock *ramblock;
53518d94 2447 RAMState *rs = ram_state;
6c595cde 2448
9360447d 2449 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2450 RCU_READ_LOCK_GUARD();
2451
6c595cde
DDAG
2452 if (!rbname) {
2453 /* Reuse last RAMBlock */
68a098f3 2454 ramblock = rs->last_req_rb;
6c595cde
DDAG
2455
2456 if (!ramblock) {
2457 /*
2458 * Shouldn't happen, we can't reuse the last RAMBlock if
2459 * it's the 1st request.
2460 */
2461 error_report("ram_save_queue_pages no previous block");
03acb4e9 2462 return -1;
6c595cde
DDAG
2463 }
2464 } else {
2465 ramblock = qemu_ram_block_by_name(rbname);
2466
2467 if (!ramblock) {
2468 /* We shouldn't be asked for a non-existent RAMBlock */
2469 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2470 return -1;
6c595cde 2471 }
68a098f3 2472 rs->last_req_rb = ramblock;
6c595cde
DDAG
2473 }
2474 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2475 if (start+len > ramblock->used_length) {
9458ad6b
JQ
2476 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2477 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2478 __func__, start, len, ramblock->used_length);
03acb4e9 2479 return -1;
6c595cde
DDAG
2480 }
2481
ec481c6c
JQ
2482 struct RAMSrcPageRequest *new_entry =
2483 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2484 new_entry->rb = ramblock;
2485 new_entry->offset = start;
2486 new_entry->len = len;
2487
2488 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2489 qemu_mutex_lock(&rs->src_page_req_mutex);
2490 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2491 migration_make_urgent_request();
ec481c6c 2492 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2493
2494 return 0;
6c595cde
DDAG
2495}
2496
d7400a34
XG
2497static bool save_page_use_compression(RAMState *rs)
2498{
2499 if (!migrate_use_compression()) {
2500 return false;
2501 }
2502
2503 /*
2504 * If xbzrle is on, stop using the data compression after first
2505 * round of migration even if compression is enabled. In theory,
2506 * xbzrle can do better than compression.
2507 */
2508 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2509 return true;
2510 }
2511
2512 return false;
2513}
2514
5e5fdcff
XG
2515/*
2516 * try to compress the page before posting it out, return true if the page
2517 * has been properly handled by compression, otherwise needs other
2518 * paths to handle it
2519 */
2520static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2521{
2522 if (!save_page_use_compression(rs)) {
2523 return false;
2524 }
2525
2526 /*
2527 * When starting the process of a new block, the first page of
2528 * the block should be sent out before other pages in the same
2529 * block, and all the pages in last block should have been sent
2530 * out, keeping this order is important, because the 'cont' flag
2531 * is used to avoid resending the block name.
2532 *
2533 * We post the fist page as normal page as compression will take
2534 * much CPU resource.
2535 */
2536 if (block != rs->last_sent_block) {
2537 flush_compressed_data(rs);
2538 return false;
2539 }
2540
2541 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2542 return true;
2543 }
2544
76e03000 2545 compression_counters.busy++;
5e5fdcff
XG
2546 return false;
2547}
2548
a82d593b 2549/**
3d0684b2 2550 * ram_save_target_page: save one target page
a82d593b 2551 *
3d0684b2 2552 * Returns the number of pages written
a82d593b 2553 *
6f37bb8b 2554 * @rs: current RAM state
3d0684b2 2555 * @pss: data about the page we want to send
a82d593b 2556 * @last_stage: if we are at the completion stage
a82d593b 2557 */
a0a8aa14 2558static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2559 bool last_stage)
a82d593b 2560{
a8ec91f9
XG
2561 RAMBlock *block = pss->block;
2562 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2563 int res;
2564
2565 if (control_save_page(rs, block, offset, &res)) {
2566 return res;
2567 }
2568
5e5fdcff
XG
2569 if (save_compress_page(rs, block, offset)) {
2570 return 1;
d7400a34
XG
2571 }
2572
2573 res = save_zero_page(rs, block, offset);
2574 if (res > 0) {
2575 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2576 * page would be stale
2577 */
2578 if (!save_page_use_compression(rs)) {
2579 XBZRLE_cache_lock();
2580 xbzrle_cache_zero_page(rs, block->offset + offset);
2581 XBZRLE_cache_unlock();
2582 }
2583 ram_release_pages(block->idstr, offset, res);
2584 return res;
2585 }
2586
da3f56cb 2587 /*
5e5fdcff
XG
2588 * do not use multifd for compression as the first page in the new
2589 * block should be posted out before sending the compressed page
da3f56cb 2590 */
5e5fdcff 2591 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
b9ee2f7d 2592 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2593 }
2594
1faa5665 2595 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
2596}
2597
2598/**
3d0684b2 2599 * ram_save_host_page: save a whole host page
a82d593b 2600 *
3d0684b2
JQ
2601 * Starting at *offset send pages up to the end of the current host
2602 * page. It's valid for the initial offset to point into the middle of
2603 * a host page in which case the remainder of the hostpage is sent.
2604 * Only dirty target pages are sent. Note that the host page size may
2605 * be a huge page for this block.
1eb3fc0a
DDAG
2606 * The saving stops at the boundary of the used_length of the block
2607 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2608 *
3d0684b2
JQ
2609 * Returns the number of pages written or negative on error
2610 *
6f37bb8b 2611 * @rs: current RAM state
3d0684b2 2612 * @ms: current migration state
3d0684b2 2613 * @pss: data about the page we want to send
a82d593b 2614 * @last_stage: if we are at the completion stage
a82d593b 2615 */
a0a8aa14 2616static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2617 bool last_stage)
a82d593b
DDAG
2618{
2619 int tmppages, pages = 0;
a935e30f
JQ
2620 size_t pagesize_bits =
2621 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 2622
fbd162e6 2623 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2624 error_report("block %s should not be migrated !", pss->block->idstr);
2625 return 0;
2626 }
2627
a82d593b 2628 do {
1faa5665
XG
2629 /* Check the pages is dirty and if it is send it */
2630 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2631 pss->page++;
2632 continue;
2633 }
2634
f20e2865 2635 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
2636 if (tmppages < 0) {
2637 return tmppages;
2638 }
2639
2640 pages += tmppages;
a935e30f 2641 pss->page++;
97e1e067
DDAG
2642 /* Allow rate limiting to happen in the middle of huge pages */
2643 migration_rate_limit();
1eb3fc0a
DDAG
2644 } while ((pss->page & (pagesize_bits - 1)) &&
2645 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
2646
2647 /* The offset we leave with is the last one we looked at */
a935e30f 2648 pss->page--;
a82d593b
DDAG
2649 return pages;
2650}
6c595cde 2651
56e93d26 2652/**
3d0684b2 2653 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2654 *
2655 * Called within an RCU critical section.
2656 *
e8f3735f
XG
2657 * Returns the number of pages written where zero means no dirty pages,
2658 * or negative on error
56e93d26 2659 *
6f37bb8b 2660 * @rs: current RAM state
56e93d26 2661 * @last_stage: if we are at the completion stage
a82d593b
DDAG
2662 *
2663 * On systems where host-page-size > target-page-size it will send all the
2664 * pages in a host page that are dirty.
56e93d26
JQ
2665 */
2666
ce25d337 2667static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 2668{
b8fb8cb7 2669 PageSearchStatus pss;
56e93d26 2670 int pages = 0;
b9e60928 2671 bool again, found;
56e93d26 2672
0827b9e9
AA
2673 /* No dirty page as there is zero RAM */
2674 if (!ram_bytes_total()) {
2675 return pages;
2676 }
2677
6f37bb8b 2678 pss.block = rs->last_seen_block;
a935e30f 2679 pss.page = rs->last_page;
b8fb8cb7
DDAG
2680 pss.complete_round = false;
2681
2682 if (!pss.block) {
2683 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2684 }
56e93d26 2685
b9e60928 2686 do {
a82d593b 2687 again = true;
f20e2865 2688 found = get_queued_page(rs, &pss);
b9e60928 2689
a82d593b
DDAG
2690 if (!found) {
2691 /* priority queue empty, so just search for something dirty */
f20e2865 2692 found = find_dirty_block(rs, &pss, &again);
a82d593b 2693 }
f3f491fc 2694
a82d593b 2695 if (found) {
f20e2865 2696 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 2697 }
b9e60928 2698 } while (!pages && again);
56e93d26 2699
6f37bb8b 2700 rs->last_seen_block = pss.block;
a935e30f 2701 rs->last_page = pss.page;
56e93d26
JQ
2702
2703 return pages;
2704}
2705
2706void acct_update_position(QEMUFile *f, size_t size, bool zero)
2707{
2708 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2709
56e93d26 2710 if (zero) {
9360447d 2711 ram_counters.duplicate += pages;
56e93d26 2712 } else {
9360447d
JQ
2713 ram_counters.normal += pages;
2714 ram_counters.transferred += size;
56e93d26
JQ
2715 qemu_update_position(f, size);
2716 }
2717}
2718
fbd162e6 2719static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2720{
2721 RAMBlock *block;
2722 uint64_t total = 0;
2723
89ac5a1d
DDAG
2724 RCU_READ_LOCK_GUARD();
2725
fbd162e6
YK
2726 if (count_ignored) {
2727 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2728 total += block->used_length;
2729 }
2730 } else {
2731 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2732 total += block->used_length;
2733 }
99e15582 2734 }
56e93d26
JQ
2735 return total;
2736}
2737
fbd162e6
YK
2738uint64_t ram_bytes_total(void)
2739{
2740 return ram_bytes_total_common(false);
2741}
2742
f265e0e4 2743static void xbzrle_load_setup(void)
56e93d26 2744{
f265e0e4 2745 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2746}
2747
f265e0e4
JQ
2748static void xbzrle_load_cleanup(void)
2749{
2750 g_free(XBZRLE.decoded_buf);
2751 XBZRLE.decoded_buf = NULL;
2752}
2753
7d7c96be
PX
2754static void ram_state_cleanup(RAMState **rsp)
2755{
b9ccaf6d
DDAG
2756 if (*rsp) {
2757 migration_page_queue_free(*rsp);
2758 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2759 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2760 g_free(*rsp);
2761 *rsp = NULL;
2762 }
7d7c96be
PX
2763}
2764
84593a08
PX
2765static void xbzrle_cleanup(void)
2766{
2767 XBZRLE_cache_lock();
2768 if (XBZRLE.cache) {
2769 cache_fini(XBZRLE.cache);
2770 g_free(XBZRLE.encoded_buf);
2771 g_free(XBZRLE.current_buf);
2772 g_free(XBZRLE.zero_target_page);
2773 XBZRLE.cache = NULL;
2774 XBZRLE.encoded_buf = NULL;
2775 XBZRLE.current_buf = NULL;
2776 XBZRLE.zero_target_page = NULL;
2777 }
2778 XBZRLE_cache_unlock();
2779}
2780
f265e0e4 2781static void ram_save_cleanup(void *opaque)
56e93d26 2782{
53518d94 2783 RAMState **rsp = opaque;
6b6712ef 2784 RAMBlock *block;
eb859c53 2785
2ff64038 2786 /* caller have hold iothread lock or is in a bh, so there is
4633456c 2787 * no writing race against the migration bitmap
2ff64038 2788 */
6b6712ef
JQ
2789 memory_global_dirty_log_stop();
2790
fbd162e6 2791 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2792 g_free(block->clear_bmap);
2793 block->clear_bmap = NULL;
6b6712ef
JQ
2794 g_free(block->bmap);
2795 block->bmap = NULL;
56e93d26
JQ
2796 }
2797
84593a08 2798 xbzrle_cleanup();
f0afa331 2799 compress_threads_save_cleanup();
7d7c96be 2800 ram_state_cleanup(rsp);
56e93d26
JQ
2801}
2802
6f37bb8b 2803static void ram_state_reset(RAMState *rs)
56e93d26 2804{
6f37bb8b
JQ
2805 rs->last_seen_block = NULL;
2806 rs->last_sent_block = NULL;
269ace29 2807 rs->last_page = 0;
6f37bb8b
JQ
2808 rs->last_version = ram_list.version;
2809 rs->ram_bulk_stage = true;
6eeb63f7 2810 rs->fpo_enabled = false;
56e93d26
JQ
2811}
2812
2813#define MAX_WAIT 50 /* ms, half buffered_file limit */
2814
4f2e4252
DDAG
2815/*
2816 * 'expected' is the value you expect the bitmap mostly to be full
2817 * of; it won't bother printing lines that are all this value.
2818 * If 'todump' is null the migration bitmap is dumped.
2819 */
6b6712ef
JQ
2820void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2821 unsigned long pages)
4f2e4252 2822{
4f2e4252
DDAG
2823 int64_t cur;
2824 int64_t linelen = 128;
2825 char linebuf[129];
2826
6b6712ef 2827 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
2828 int64_t curb;
2829 bool found = false;
2830 /*
2831 * Last line; catch the case where the line length
2832 * is longer than remaining ram
2833 */
6b6712ef
JQ
2834 if (cur + linelen > pages) {
2835 linelen = pages - cur;
4f2e4252
DDAG
2836 }
2837 for (curb = 0; curb < linelen; curb++) {
2838 bool thisbit = test_bit(cur + curb, todump);
2839 linebuf[curb] = thisbit ? '1' : '.';
2840 found = found || (thisbit != expected);
2841 }
2842 if (found) {
2843 linebuf[curb] = '\0';
2844 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2845 }
2846 }
2847}
2848
e0b266f0
DDAG
2849/* **** functions for postcopy ***** */
2850
ced1c616
PB
2851void ram_postcopy_migrated_memory_release(MigrationState *ms)
2852{
2853 struct RAMBlock *block;
ced1c616 2854
fbd162e6 2855 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2856 unsigned long *bitmap = block->bmap;
2857 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2858 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2859
2860 while (run_start < range) {
2861 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 2862 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
2863 (run_end - run_start) << TARGET_PAGE_BITS);
2864 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2865 }
2866 }
2867}
2868
3d0684b2
JQ
2869/**
2870 * postcopy_send_discard_bm_ram: discard a RAMBlock
2871 *
2872 * Returns zero on success
2873 *
e0b266f0 2874 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2875 *
2876 * @ms: current migration state
89dab31b 2877 * @block: RAMBlock to discard
e0b266f0 2878 */
810cf2bb 2879static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2880{
6b6712ef 2881 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2882 unsigned long current;
1e7cf8c3 2883 unsigned long *bitmap = block->bmap;
e0b266f0 2884
6b6712ef 2885 for (current = 0; current < end; ) {
1e7cf8c3 2886 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2887 unsigned long zero, discard_length;
e0b266f0 2888
33a5cb62
WY
2889 if (one >= end) {
2890 break;
2891 }
e0b266f0 2892
1e7cf8c3 2893 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2894
2895 if (zero >= end) {
2896 discard_length = end - one;
e0b266f0 2897 } else {
33a5cb62
WY
2898 discard_length = zero - one;
2899 }
810cf2bb 2900 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2901 current = one + discard_length;
e0b266f0
DDAG
2902 }
2903
2904 return 0;
2905}
2906
3d0684b2
JQ
2907/**
2908 * postcopy_each_ram_send_discard: discard all RAMBlocks
2909 *
2910 * Returns 0 for success or negative for error
2911 *
e0b266f0
DDAG
2912 * Utility for the outgoing postcopy code.
2913 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2914 * passing it bitmap indexes and name.
e0b266f0
DDAG
2915 * (qemu_ram_foreach_block ends up passing unscaled lengths
2916 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2917 *
2918 * @ms: current migration state
e0b266f0
DDAG
2919 */
2920static int postcopy_each_ram_send_discard(MigrationState *ms)
2921{
2922 struct RAMBlock *block;
2923 int ret;
2924
fbd162e6 2925 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2926 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2927
2928 /*
2929 * Postcopy sends chunks of bitmap over the wire, but it
2930 * just needs indexes at this point, avoids it having
2931 * target page specific code.
2932 */
810cf2bb
WY
2933 ret = postcopy_send_discard_bm_ram(ms, block);
2934 postcopy_discard_send_finish(ms);
e0b266f0
DDAG
2935 if (ret) {
2936 return ret;
2937 }
2938 }
2939
2940 return 0;
2941}
2942
3d0684b2 2943/**
8324ef86 2944 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2945 *
2946 * Helper for postcopy_chunk_hostpages; it's called twice to
2947 * canonicalize the two bitmaps, that are similar, but one is
2948 * inverted.
99e314eb 2949 *
3d0684b2
JQ
2950 * Postcopy requires that all target pages in a hostpage are dirty or
2951 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2952 *
3d0684b2 2953 * @ms: current migration state
3d0684b2 2954 * @block: block that contains the page we want to canonicalize
99e314eb 2955 */
1e7cf8c3 2956static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2957{
53518d94 2958 RAMState *rs = ram_state;
6b6712ef 2959 unsigned long *bitmap = block->bmap;
29c59172 2960 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2961 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2962 unsigned long run_start;
2963
29c59172
DDAG
2964 if (block->page_size == TARGET_PAGE_SIZE) {
2965 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2966 return;
2967 }
2968
1e7cf8c3
WY
2969 /* Find a dirty page */
2970 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2971
6b6712ef 2972 while (run_start < pages) {
99e314eb
DDAG
2973
2974 /*
2975 * If the start of this run of pages is in the middle of a host
2976 * page, then we need to fixup this host page.
2977 */
9dec3cc3 2978 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2979 /* Find the end of this run */
1e7cf8c3 2980 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2981 /*
2982 * If the end isn't at the start of a host page, then the
2983 * run doesn't finish at the end of a host page
2984 * and we need to discard.
2985 */
99e314eb
DDAG
2986 }
2987
9dec3cc3 2988 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 2989 unsigned long page;
dad45ab2
WY
2990 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2991 host_ratio);
2992 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 2993
99e314eb
DDAG
2994 /* Clean up the bitmap */
2995 for (page = fixup_start_addr;
2996 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
2997 /*
2998 * Remark them as dirty, updating the count for any pages
2999 * that weren't previously dirty.
3000 */
0d8ec885 3001 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
3002 }
3003 }
3004
1e7cf8c3
WY
3005 /* Find the next dirty page for the next iteration */
3006 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
3007 }
3008}
3009
3d0684b2 3010/**
89dab31b 3011 * postcopy_chunk_hostpages: discard any partially sent host page
3d0684b2 3012 *
99e314eb
DDAG
3013 * Utility for the outgoing postcopy code.
3014 *
3015 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
3016 * dirty host-page size chunks as all dirty. In this case the host-page
3017 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 3018 *
3d0684b2
JQ
3019 * Returns zero on success
3020 *
3021 * @ms: current migration state
6b6712ef 3022 * @block: block we want to work with
99e314eb 3023 */
6b6712ef 3024static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 3025{
810cf2bb 3026 postcopy_discard_send_init(ms, block->idstr);
99e314eb 3027
6b6712ef 3028 /*
1e7cf8c3 3029 * Ensure that all partially dirty host pages are made fully dirty.
6b6712ef 3030 */
1e7cf8c3 3031 postcopy_chunk_hostpages_pass(ms, block);
99e314eb 3032
810cf2bb 3033 postcopy_discard_send_finish(ms);
99e314eb
DDAG
3034 return 0;
3035}
3036
3d0684b2
JQ
3037/**
3038 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3039 *
3040 * Returns zero on success
3041 *
e0b266f0
DDAG
3042 * Transmit the set of pages to be discarded after precopy to the target
3043 * these are pages that:
3044 * a) Have been previously transmitted but are now dirty again
3045 * b) Pages that have never been transmitted, this ensures that
3046 * any pages on the destination that have been mapped by background
3047 * tasks get discarded (transparent huge pages is the specific concern)
3048 * Hopefully this is pretty sparse
3d0684b2
JQ
3049 *
3050 * @ms: current migration state
e0b266f0
DDAG
3051 */
3052int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3053{
53518d94 3054 RAMState *rs = ram_state;
6b6712ef 3055 RAMBlock *block;
e0b266f0 3056 int ret;
e0b266f0 3057
89ac5a1d 3058 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
3059
3060 /* This should be our last sync, the src is now paused */
eb859c53 3061 migration_bitmap_sync(rs);
e0b266f0 3062
6b6712ef
JQ
3063 /* Easiest way to make sure we don't resume in the middle of a host-page */
3064 rs->last_seen_block = NULL;
3065 rs->last_sent_block = NULL;
3066 rs->last_page = 0;
e0b266f0 3067
fbd162e6 3068 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
3069 /* Deal with TPS != HPS and huge pages */
3070 ret = postcopy_chunk_hostpages(ms, block);
3071 if (ret) {
6b6712ef
JQ
3072 return ret;
3073 }
e0b266f0 3074
e0b266f0 3075#ifdef DEBUG_POSTCOPY
1e7cf8c3
WY
3076 ram_debug_dump_bitmap(block->bmap, true,
3077 block->used_length >> TARGET_PAGE_BITS);
e0b266f0 3078#endif
6b6712ef
JQ
3079 }
3080 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
3081
3082 ret = postcopy_each_ram_send_discard(ms);
e0b266f0
DDAG
3083
3084 return ret;
3085}
3086
3d0684b2
JQ
3087/**
3088 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 3089 *
3d0684b2 3090 * Returns zero on success
e0b266f0 3091 *
36449157
JQ
3092 * @rbname: name of the RAMBlock of the request. NULL means the
3093 * same that last one.
3d0684b2
JQ
3094 * @start: RAMBlock starting page
3095 * @length: RAMBlock size
e0b266f0 3096 */
aaa2064c 3097int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 3098{
36449157 3099 trace_ram_discard_range(rbname, start, length);
d3a5038c 3100
89ac5a1d 3101 RCU_READ_LOCK_GUARD();
36449157 3102 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
3103
3104 if (!rb) {
36449157 3105 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 3106 return -1;
e0b266f0
DDAG
3107 }
3108
814bb08f
PX
3109 /*
3110 * On source VM, we don't need to update the received bitmap since
3111 * we don't even have one.
3112 */
3113 if (rb->receivedmap) {
3114 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3115 length >> qemu_target_page_bits());
3116 }
3117
03acb4e9 3118 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
3119}
3120
84593a08
PX
3121/*
3122 * For every allocation, we will try not to crash the VM if the
3123 * allocation failed.
3124 */
3125static int xbzrle_init(void)
3126{
3127 Error *local_err = NULL;
3128
3129 if (!migrate_use_xbzrle()) {
3130 return 0;
3131 }
3132
3133 XBZRLE_cache_lock();
3134
3135 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3136 if (!XBZRLE.zero_target_page) {
3137 error_report("%s: Error allocating zero page", __func__);
3138 goto err_out;
3139 }
3140
3141 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3142 TARGET_PAGE_SIZE, &local_err);
3143 if (!XBZRLE.cache) {
3144 error_report_err(local_err);
3145 goto free_zero_page;
3146 }
3147
3148 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3149 if (!XBZRLE.encoded_buf) {
3150 error_report("%s: Error allocating encoded_buf", __func__);
3151 goto free_cache;
3152 }
3153
3154 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3155 if (!XBZRLE.current_buf) {
3156 error_report("%s: Error allocating current_buf", __func__);
3157 goto free_encoded_buf;
3158 }
3159
3160 /* We are all good */
3161 XBZRLE_cache_unlock();
3162 return 0;
3163
3164free_encoded_buf:
3165 g_free(XBZRLE.encoded_buf);
3166 XBZRLE.encoded_buf = NULL;
3167free_cache:
3168 cache_fini(XBZRLE.cache);
3169 XBZRLE.cache = NULL;
3170free_zero_page:
3171 g_free(XBZRLE.zero_target_page);
3172 XBZRLE.zero_target_page = NULL;
3173err_out:
3174 XBZRLE_cache_unlock();
3175 return -ENOMEM;
3176}
3177
53518d94 3178static int ram_state_init(RAMState **rsp)
56e93d26 3179{
7d00ee6a
PX
3180 *rsp = g_try_new0(RAMState, 1);
3181
3182 if (!*rsp) {
3183 error_report("%s: Init ramstate fail", __func__);
3184 return -1;
3185 }
53518d94
JQ
3186
3187 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3188 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3189 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3190
7d00ee6a 3191 /*
40c4d4a8
IR
3192 * Count the total number of pages used by ram blocks not including any
3193 * gaps due to alignment or unplugs.
03158519 3194 * This must match with the initial values of dirty bitmap.
7d00ee6a 3195 */
40c4d4a8 3196 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
3197 ram_state_reset(*rsp);
3198
3199 return 0;
3200}
3201
d6eff5d7 3202static void ram_list_init_bitmaps(void)
7d00ee6a 3203{
002cad6b 3204 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3205 RAMBlock *block;
3206 unsigned long pages;
002cad6b 3207 uint8_t shift;
56e93d26 3208
0827b9e9
AA
3209 /* Skip setting bitmap if there is no RAM */
3210 if (ram_bytes_total()) {
002cad6b
PX
3211 shift = ms->clear_bitmap_shift;
3212 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3213 error_report("clear_bitmap_shift (%u) too big, using "
3214 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3215 shift = CLEAR_BITMAP_SHIFT_MAX;
3216 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3217 error_report("clear_bitmap_shift (%u) too small, using "
3218 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3219 shift = CLEAR_BITMAP_SHIFT_MIN;
3220 }
3221
fbd162e6 3222 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3223 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3224 /*
3225 * The initial dirty bitmap for migration must be set with all
3226 * ones to make sure we'll migrate every guest RAM page to
3227 * destination.
40c4d4a8
IR
3228 * Here we set RAMBlock.bmap all to 1 because when rebegin a
3229 * new migration after a failed migration, ram_list.
3230 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3231 * guest memory.
03158519 3232 */
6b6712ef 3233 block->bmap = bitmap_new(pages);
40c4d4a8 3234 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3235 block->clear_bmap_shift = shift;
3236 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3237 }
f3f491fc 3238 }
d6eff5d7
PX
3239}
3240
3241static void ram_init_bitmaps(RAMState *rs)
3242{
3243 /* For memory_global_dirty_log_start below. */
3244 qemu_mutex_lock_iothread();
3245 qemu_mutex_lock_ramlist();
f3f491fc 3246
89ac5a1d
DDAG
3247 WITH_RCU_READ_LOCK_GUARD() {
3248 ram_list_init_bitmaps();
3249 memory_global_dirty_log_start();
3250 migration_bitmap_sync_precopy(rs);
3251 }
56e93d26 3252 qemu_mutex_unlock_ramlist();
49877834 3253 qemu_mutex_unlock_iothread();
d6eff5d7
PX
3254}
3255
3256static int ram_init_all(RAMState **rsp)
3257{
3258 if (ram_state_init(rsp)) {
3259 return -1;
3260 }
3261
3262 if (xbzrle_init()) {
3263 ram_state_cleanup(rsp);
3264 return -1;
3265 }
3266
3267 ram_init_bitmaps(*rsp);
a91246c9
HZ
3268
3269 return 0;
3270}
3271
08614f34
PX
3272static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3273{
3274 RAMBlock *block;
3275 uint64_t pages = 0;
3276
3277 /*
3278 * Postcopy is not using xbzrle/compression, so no need for that.
3279 * Also, since source are already halted, we don't need to care
3280 * about dirty page logging as well.
3281 */
3282
fbd162e6 3283 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3284 pages += bitmap_count_one(block->bmap,
3285 block->used_length >> TARGET_PAGE_BITS);
3286 }
3287
3288 /* This may not be aligned with current bitmaps. Recalculate. */
3289 rs->migration_dirty_pages = pages;
3290
3291 rs->last_seen_block = NULL;
3292 rs->last_sent_block = NULL;
3293 rs->last_page = 0;
3294 rs->last_version = ram_list.version;
3295 /*
3296 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3297 * matter what we have sent.
3298 */
3299 rs->ram_bulk_stage = false;
3300
3301 /* Update RAMState cache of output QEMUFile */
3302 rs->f = out;
3303
3304 trace_ram_state_resume_prepare(pages);
3305}
3306
6bcb05fc
WW
3307/*
3308 * This function clears bits of the free pages reported by the caller from the
3309 * migration dirty bitmap. @addr is the host address corresponding to the
3310 * start of the continuous guest free pages, and @len is the total bytes of
3311 * those pages.
3312 */
3313void qemu_guest_free_page_hint(void *addr, size_t len)
3314{
3315 RAMBlock *block;
3316 ram_addr_t offset;
3317 size_t used_len, start, npages;
3318 MigrationState *s = migrate_get_current();
3319
3320 /* This function is currently expected to be used during live migration */
3321 if (!migration_is_setup_or_active(s->state)) {
3322 return;
3323 }
3324
3325 for (; len > 0; len -= used_len, addr += used_len) {
3326 block = qemu_ram_block_from_host(addr, false, &offset);
3327 if (unlikely(!block || offset >= block->used_length)) {
3328 /*
3329 * The implementation might not support RAMBlock resize during
3330 * live migration, but it could happen in theory with future
3331 * updates. So we add a check here to capture that case.
3332 */
3333 error_report_once("%s unexpected error", __func__);
3334 return;
3335 }
3336
3337 if (len <= block->used_length - offset) {
3338 used_len = len;
3339 } else {
3340 used_len = block->used_length - offset;
3341 }
3342
3343 start = offset >> TARGET_PAGE_BITS;
3344 npages = used_len >> TARGET_PAGE_BITS;
3345
3346 qemu_mutex_lock(&ram_state->bitmap_mutex);
3347 ram_state->migration_dirty_pages -=
3348 bitmap_count_one_with_offset(block->bmap, start, npages);
3349 bitmap_clear(block->bmap, start, npages);
3350 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3351 }
3352}
3353
3d0684b2
JQ
3354/*
3355 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3356 * long-running RCU critical section. When rcu-reclaims in the code
3357 * start to become numerous it will be necessary to reduce the
3358 * granularity of these critical sections.
3359 */
3360
3d0684b2
JQ
3361/**
3362 * ram_save_setup: Setup RAM for migration
3363 *
3364 * Returns zero to indicate success and negative for error
3365 *
3366 * @f: QEMUFile where to send the data
3367 * @opaque: RAMState pointer
3368 */
a91246c9
HZ
3369static int ram_save_setup(QEMUFile *f, void *opaque)
3370{
53518d94 3371 RAMState **rsp = opaque;
a91246c9
HZ
3372 RAMBlock *block;
3373
dcaf446e
XG
3374 if (compress_threads_save_setup()) {
3375 return -1;
3376 }
3377
a91246c9
HZ
3378 /* migration has already setup the bitmap, reuse it. */
3379 if (!migration_in_colo_state()) {
7d00ee6a 3380 if (ram_init_all(rsp) != 0) {
dcaf446e 3381 compress_threads_save_cleanup();
a91246c9 3382 return -1;
53518d94 3383 }
a91246c9 3384 }
53518d94 3385 (*rsp)->f = f;
a91246c9 3386
0e6ebd48
DDAG
3387 WITH_RCU_READ_LOCK_GUARD() {
3388 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3389
0e6ebd48
DDAG
3390 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3391 qemu_put_byte(f, strlen(block->idstr));
3392 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3393 qemu_put_be64(f, block->used_length);
3394 if (migrate_postcopy_ram() && block->page_size !=
3395 qemu_host_page_size) {
3396 qemu_put_be64(f, block->page_size);
3397 }
3398 if (migrate_ignore_shared()) {
3399 qemu_put_be64(f, block->mr->addr);
3400 }
fbd162e6 3401 }
56e93d26
JQ
3402 }
3403
56e93d26
JQ
3404 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3405 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3406
1b81c974 3407 multifd_send_sync_main(*rsp);
56e93d26 3408 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3409 qemu_fflush(f);
56e93d26
JQ
3410
3411 return 0;
3412}
3413
3d0684b2
JQ
3414/**
3415 * ram_save_iterate: iterative stage for migration
3416 *
3417 * Returns zero to indicate success and negative for error
3418 *
3419 * @f: QEMUFile where to send the data
3420 * @opaque: RAMState pointer
3421 */
56e93d26
JQ
3422static int ram_save_iterate(QEMUFile *f, void *opaque)
3423{
53518d94
JQ
3424 RAMState **temp = opaque;
3425 RAMState *rs = *temp;
56e93d26
JQ
3426 int ret;
3427 int i;
3428 int64_t t0;
5c90308f 3429 int done = 0;
56e93d26 3430
b2557345
PL
3431 if (blk_mig_bulk_active()) {
3432 /* Avoid transferring ram during bulk phase of block migration as
3433 * the bulk phase will usually take a long time and transferring
3434 * ram updates during that time is pointless. */
3435 goto out;
3436 }
3437
89ac5a1d
DDAG
3438 WITH_RCU_READ_LOCK_GUARD() {
3439 if (ram_list.version != rs->last_version) {
3440 ram_state_reset(rs);
3441 }
56e93d26 3442
89ac5a1d
DDAG
3443 /* Read version before ram_list.blocks */
3444 smp_rmb();
56e93d26 3445
89ac5a1d 3446 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3447
89ac5a1d
DDAG
3448 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3449 i = 0;
3450 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3451 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3452 int pages;
e03a34f8 3453
89ac5a1d
DDAG
3454 if (qemu_file_get_error(f)) {
3455 break;
3456 }
e8f3735f 3457
89ac5a1d
DDAG
3458 pages = ram_find_and_save_block(rs, false);
3459 /* no more pages to sent */
3460 if (pages == 0) {
3461 done = 1;
3462 break;
3463 }
e8f3735f 3464
89ac5a1d
DDAG
3465 if (pages < 0) {
3466 qemu_file_set_error(f, pages);
56e93d26
JQ
3467 break;
3468 }
89ac5a1d
DDAG
3469
3470 rs->target_page_count += pages;
3471
3472 /*
3473 * we want to check in the 1st loop, just in case it was the 1st
3474 * time and we had to sync the dirty bitmap.
3475 * qemu_clock_get_ns() is a bit expensive, so we only check each
3476 * some iterations
3477 */
3478 if ((i & 63) == 0) {
3479 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3480 1000000;
3481 if (t1 > MAX_WAIT) {
3482 trace_ram_save_iterate_big_wait(t1, i);
3483 break;
3484 }
3485 }
3486 i++;
56e93d26 3487 }
56e93d26 3488 }
56e93d26
JQ
3489
3490 /*
3491 * Must occur before EOS (or any QEMUFile operation)
3492 * because of RDMA protocol.
3493 */
3494 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3495
b2557345 3496out:
1b81c974 3497 multifd_send_sync_main(rs);
56e93d26 3498 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3499 qemu_fflush(f);
9360447d 3500 ram_counters.transferred += 8;
56e93d26
JQ
3501
3502 ret = qemu_file_get_error(f);
3503 if (ret < 0) {
3504 return ret;
3505 }
3506
5c90308f 3507 return done;
56e93d26
JQ
3508}
3509
3d0684b2
JQ
3510/**
3511 * ram_save_complete: function called to send the remaining amount of ram
3512 *
e8f3735f 3513 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3514 *
3515 * Called with iothread lock
3516 *
3517 * @f: QEMUFile where to send the data
3518 * @opaque: RAMState pointer
3519 */
56e93d26
JQ
3520static int ram_save_complete(QEMUFile *f, void *opaque)
3521{
53518d94
JQ
3522 RAMState **temp = opaque;
3523 RAMState *rs = *temp;
e8f3735f 3524 int ret = 0;
6f37bb8b 3525
89ac5a1d
DDAG
3526 WITH_RCU_READ_LOCK_GUARD() {
3527 if (!migration_in_postcopy()) {
3528 migration_bitmap_sync_precopy(rs);
3529 }
56e93d26 3530
89ac5a1d 3531 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3532
89ac5a1d 3533 /* try transferring iterative blocks of memory */
56e93d26 3534
89ac5a1d
DDAG
3535 /* flush all remaining blocks regardless of rate limiting */
3536 while (true) {
3537 int pages;
56e93d26 3538
89ac5a1d
DDAG
3539 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3540 /* no more blocks to sent */
3541 if (pages == 0) {
3542 break;
3543 }
3544 if (pages < 0) {
3545 ret = pages;
3546 break;
3547 }
e8f3735f 3548 }
56e93d26 3549
89ac5a1d
DDAG
3550 flush_compressed_data(rs);
3551 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3552 }
d09a6fde 3553
1b81c974 3554 multifd_send_sync_main(rs);
56e93d26 3555 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3556 qemu_fflush(f);
56e93d26 3557
e8f3735f 3558 return ret;
56e93d26
JQ
3559}
3560
c31b098f 3561static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3562 uint64_t *res_precopy_only,
3563 uint64_t *res_compatible,
3564 uint64_t *res_postcopy_only)
56e93d26 3565{
53518d94
JQ
3566 RAMState **temp = opaque;
3567 RAMState *rs = *temp;
56e93d26
JQ
3568 uint64_t remaining_size;
3569
9edabd4d 3570 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3571
5727309d 3572 if (!migration_in_postcopy() &&
663e6c1d 3573 remaining_size < max_size) {
56e93d26 3574 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3575 WITH_RCU_READ_LOCK_GUARD() {
3576 migration_bitmap_sync_precopy(rs);
3577 }
56e93d26 3578 qemu_mutex_unlock_iothread();
9edabd4d 3579 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3580 }
c31b098f 3581
86e1167e
VSO
3582 if (migrate_postcopy_ram()) {
3583 /* We can do postcopy, and all the data is postcopiable */
47995026 3584 *res_compatible += remaining_size;
86e1167e 3585 } else {
47995026 3586 *res_precopy_only += remaining_size;
86e1167e 3587 }
56e93d26
JQ
3588}
3589
3590static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3591{
3592 unsigned int xh_len;
3593 int xh_flags;
063e760a 3594 uint8_t *loaded_data;
56e93d26 3595
56e93d26
JQ
3596 /* extract RLE header */
3597 xh_flags = qemu_get_byte(f);
3598 xh_len = qemu_get_be16(f);
3599
3600 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3601 error_report("Failed to load XBZRLE page - wrong compression!");
3602 return -1;
3603 }
3604
3605 if (xh_len > TARGET_PAGE_SIZE) {
3606 error_report("Failed to load XBZRLE page - len overflow!");
3607 return -1;
3608 }
f265e0e4 3609 loaded_data = XBZRLE.decoded_buf;
56e93d26 3610 /* load data and decode */
f265e0e4 3611 /* it can change loaded_data to point to an internal buffer */
063e760a 3612 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3613
3614 /* decode RLE */
063e760a 3615 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3616 TARGET_PAGE_SIZE) == -1) {
3617 error_report("Failed to load XBZRLE page - decode error!");
3618 return -1;
3619 }
3620
3621 return 0;
3622}
3623
3d0684b2
JQ
3624/**
3625 * ram_block_from_stream: read a RAMBlock id from the migration stream
3626 *
3627 * Must be called from within a rcu critical section.
3628 *
56e93d26 3629 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3630 *
3d0684b2
JQ
3631 * @f: QEMUFile where to read the data from
3632 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3633 */
3d0684b2 3634static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
3635{
3636 static RAMBlock *block = NULL;
3637 char id[256];
3638 uint8_t len;
3639
3640 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3641 if (!block) {
56e93d26
JQ
3642 error_report("Ack, bad migration stream!");
3643 return NULL;
3644 }
4c4bad48 3645 return block;
56e93d26
JQ
3646 }
3647
3648 len = qemu_get_byte(f);
3649 qemu_get_buffer(f, (uint8_t *)id, len);
3650 id[len] = 0;
3651
e3dd7493 3652 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3653 if (!block) {
3654 error_report("Can't find block %s", id);
3655 return NULL;
56e93d26
JQ
3656 }
3657
fbd162e6 3658 if (ramblock_is_ignored(block)) {
b895de50
CLG
3659 error_report("block %s should not be migrated !", id);
3660 return NULL;
3661 }
3662
4c4bad48
HZ
3663 return block;
3664}
3665
3666static inline void *host_from_ram_block_offset(RAMBlock *block,
3667 ram_addr_t offset)
3668{
3669 if (!offset_in_ramblock(block, offset)) {
3670 return NULL;
3671 }
3672
3673 return block->host + offset;
56e93d26
JQ
3674}
3675
13af18f2
ZC
3676static inline void *colo_cache_from_block_offset(RAMBlock *block,
3677 ram_addr_t offset)
3678{
3679 if (!offset_in_ramblock(block, offset)) {
3680 return NULL;
3681 }
3682 if (!block->colo_cache) {
3683 error_report("%s: colo_cache is NULL in block :%s",
3684 __func__, block->idstr);
3685 return NULL;
3686 }
7d9acafa
ZC
3687
3688 /*
3689 * During colo checkpoint, we need bitmap of these migrated pages.
3690 * It help us to decide which pages in ram cache should be flushed
3691 * into VM's RAM later.
3692 */
3693 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3694 ram_state->migration_dirty_pages++;
3695 }
13af18f2
ZC
3696 return block->colo_cache + offset;
3697}
3698
3d0684b2
JQ
3699/**
3700 * ram_handle_compressed: handle the zero page case
3701 *
56e93d26
JQ
3702 * If a page (or a whole RDMA chunk) has been
3703 * determined to be zero, then zap it.
3d0684b2
JQ
3704 *
3705 * @host: host address for the zero page
3706 * @ch: what the page is filled from. We only support zero
3707 * @size: size of the zero page
56e93d26
JQ
3708 */
3709void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3710{
3711 if (ch != 0 || !is_zero_range(host, size)) {
3712 memset(host, ch, size);
3713 }
3714}
3715
797ca154
XG
3716/* return the size after decompression, or negative value on error */
3717static int
3718qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3719 const uint8_t *source, size_t source_len)
3720{
3721 int err;
3722
3723 err = inflateReset(stream);
3724 if (err != Z_OK) {
3725 return -1;
3726 }
3727
3728 stream->avail_in = source_len;
3729 stream->next_in = (uint8_t *)source;
3730 stream->avail_out = dest_len;
3731 stream->next_out = dest;
3732
3733 err = inflate(stream, Z_NO_FLUSH);
3734 if (err != Z_STREAM_END) {
3735 return -1;
3736 }
3737
3738 return stream->total_out;
3739}
3740
56e93d26
JQ
3741static void *do_data_decompress(void *opaque)
3742{
3743 DecompressParam *param = opaque;
3744 unsigned long pagesize;
33d151f4 3745 uint8_t *des;
34ab9e97 3746 int len, ret;
56e93d26 3747
33d151f4 3748 qemu_mutex_lock(&param->mutex);
90e56fb4 3749 while (!param->quit) {
33d151f4
LL
3750 if (param->des) {
3751 des = param->des;
3752 len = param->len;
3753 param->des = 0;
3754 qemu_mutex_unlock(&param->mutex);
3755
56e93d26 3756 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3757
3758 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3759 param->compbuf, len);
f548222c 3760 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3761 error_report("decompress data failed");
3762 qemu_file_set_error(decomp_file, ret);
3763 }
73a8912b 3764
33d151f4
LL
3765 qemu_mutex_lock(&decomp_done_lock);
3766 param->done = true;
3767 qemu_cond_signal(&decomp_done_cond);
3768 qemu_mutex_unlock(&decomp_done_lock);
3769
3770 qemu_mutex_lock(&param->mutex);
3771 } else {
3772 qemu_cond_wait(&param->cond, &param->mutex);
3773 }
56e93d26 3774 }
33d151f4 3775 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3776
3777 return NULL;
3778}
3779
34ab9e97 3780static int wait_for_decompress_done(void)
5533b2e9
LL
3781{
3782 int idx, thread_count;
3783
3784 if (!migrate_use_compression()) {
34ab9e97 3785 return 0;
5533b2e9
LL
3786 }
3787
3788 thread_count = migrate_decompress_threads();
3789 qemu_mutex_lock(&decomp_done_lock);
3790 for (idx = 0; idx < thread_count; idx++) {
3791 while (!decomp_param[idx].done) {
3792 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3793 }
3794 }
3795 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3796 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3797}
3798
f0afa331 3799static void compress_threads_load_cleanup(void)
56e93d26
JQ
3800{
3801 int i, thread_count;
3802
3416ab5b
JQ
3803 if (!migrate_use_compression()) {
3804 return;
3805 }
56e93d26
JQ
3806 thread_count = migrate_decompress_threads();
3807 for (i = 0; i < thread_count; i++) {
797ca154
XG
3808 /*
3809 * we use it as a indicator which shows if the thread is
3810 * properly init'd or not
3811 */
3812 if (!decomp_param[i].compbuf) {
3813 break;
3814 }
3815
56e93d26 3816 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3817 decomp_param[i].quit = true;
56e93d26
JQ
3818 qemu_cond_signal(&decomp_param[i].cond);
3819 qemu_mutex_unlock(&decomp_param[i].mutex);
3820 }
3821 for (i = 0; i < thread_count; i++) {
797ca154
XG
3822 if (!decomp_param[i].compbuf) {
3823 break;
3824 }
3825
56e93d26
JQ
3826 qemu_thread_join(decompress_threads + i);
3827 qemu_mutex_destroy(&decomp_param[i].mutex);
3828 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3829 inflateEnd(&decomp_param[i].stream);
56e93d26 3830 g_free(decomp_param[i].compbuf);
797ca154 3831 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3832 }
3833 g_free(decompress_threads);
3834 g_free(decomp_param);
56e93d26
JQ
3835 decompress_threads = NULL;
3836 decomp_param = NULL;
34ab9e97 3837 decomp_file = NULL;
56e93d26
JQ
3838}
3839
34ab9e97 3840static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3841{
3842 int i, thread_count;
3843
3844 if (!migrate_use_compression()) {
3845 return 0;
3846 }
3847
3848 thread_count = migrate_decompress_threads();
3849 decompress_threads = g_new0(QemuThread, thread_count);
3850 decomp_param = g_new0(DecompressParam, thread_count);
3851 qemu_mutex_init(&decomp_done_lock);
3852 qemu_cond_init(&decomp_done_cond);
34ab9e97 3853 decomp_file = f;
797ca154
XG
3854 for (i = 0; i < thread_count; i++) {
3855 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3856 goto exit;
3857 }
3858
3859 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3860 qemu_mutex_init(&decomp_param[i].mutex);
3861 qemu_cond_init(&decomp_param[i].cond);
3862 decomp_param[i].done = true;
3863 decomp_param[i].quit = false;
3864 qemu_thread_create(decompress_threads + i, "decompress",
3865 do_data_decompress, decomp_param + i,
3866 QEMU_THREAD_JOINABLE);
3867 }
3868 return 0;
3869exit:
3870 compress_threads_load_cleanup();
3871 return -1;
3872}
3873
c1bc6626 3874static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3875 void *host, int len)
3876{
3877 int idx, thread_count;
3878
3879 thread_count = migrate_decompress_threads();
73a8912b 3880 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
3881 while (true) {
3882 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3883 if (decomp_param[idx].done) {
33d151f4
LL
3884 decomp_param[idx].done = false;
3885 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3886 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3887 decomp_param[idx].des = host;
3888 decomp_param[idx].len = len;
33d151f4
LL
3889 qemu_cond_signal(&decomp_param[idx].cond);
3890 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3891 break;
3892 }
3893 }
3894 if (idx < thread_count) {
3895 break;
73a8912b
LL
3896 } else {
3897 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3898 }
3899 }
73a8912b 3900 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3901}
3902
13af18f2
ZC
3903/*
3904 * colo cache: this is for secondary VM, we cache the whole
3905 * memory of the secondary VM, it is need to hold the global lock
3906 * to call this helper.
3907 */
3908int colo_init_ram_cache(void)
3909{
3910 RAMBlock *block;
3911
44901b5a
PB
3912 WITH_RCU_READ_LOCK_GUARD() {
3913 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3914 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3915 NULL,
3916 false);
3917 if (!block->colo_cache) {
3918 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3919 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3920 block->used_length);
3921 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3922 if (block->colo_cache) {
3923 qemu_anon_ram_free(block->colo_cache, block->used_length);
3924 block->colo_cache = NULL;
3925 }
89ac5a1d 3926 }
44901b5a 3927 return -errno;
89ac5a1d 3928 }
44901b5a 3929 memcpy(block->colo_cache, block->host, block->used_length);
13af18f2 3930 }
13af18f2 3931 }
44901b5a 3932
7d9acafa
ZC
3933 /*
3934 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3935 * with to decide which page in cache should be flushed into SVM's RAM. Here
3936 * we use the same name 'ram_bitmap' as for migration.
3937 */
3938 if (ram_bytes_total()) {
3939 RAMBlock *block;
3940
fbd162e6 3941 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3942 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3943
3944 block->bmap = bitmap_new(pages);
3945 bitmap_set(block->bmap, 0, pages);
3946 }
3947 }
3948 ram_state = g_new0(RAMState, 1);
3949 ram_state->migration_dirty_pages = 0;
c6e5bafb 3950 qemu_mutex_init(&ram_state->bitmap_mutex);
d1955d22 3951 memory_global_dirty_log_start();
7d9acafa 3952
13af18f2 3953 return 0;
13af18f2
ZC
3954}
3955
3956/* It is need to hold the global lock to call this helper */
3957void colo_release_ram_cache(void)
3958{
3959 RAMBlock *block;
3960
d1955d22 3961 memory_global_dirty_log_stop();
fbd162e6 3962 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3963 g_free(block->bmap);
3964 block->bmap = NULL;
3965 }
3966
89ac5a1d
DDAG
3967 WITH_RCU_READ_LOCK_GUARD() {
3968 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3969 if (block->colo_cache) {
3970 qemu_anon_ram_free(block->colo_cache, block->used_length);
3971 block->colo_cache = NULL;
3972 }
13af18f2
ZC
3973 }
3974 }
c6e5bafb 3975 qemu_mutex_destroy(&ram_state->bitmap_mutex);
7d9acafa
ZC
3976 g_free(ram_state);
3977 ram_state = NULL;
13af18f2
ZC
3978}
3979
f265e0e4
JQ
3980/**
3981 * ram_load_setup: Setup RAM for migration incoming side
3982 *
3983 * Returns zero to indicate success and negative for error
3984 *
3985 * @f: QEMUFile where to receive the data
3986 * @opaque: RAMState pointer
3987 */
3988static int ram_load_setup(QEMUFile *f, void *opaque)
3989{
34ab9e97 3990 if (compress_threads_load_setup(f)) {
797ca154
XG
3991 return -1;
3992 }
3993
f265e0e4 3994 xbzrle_load_setup();
f9494614 3995 ramblock_recv_map_init();
13af18f2 3996
f265e0e4
JQ
3997 return 0;
3998}
3999
4000static int ram_load_cleanup(void *opaque)
4001{
f9494614 4002 RAMBlock *rb;
56eb90af 4003
fbd162e6 4004 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 4005 qemu_ram_block_writeback(rb);
56eb90af
JH
4006 }
4007
f265e0e4 4008 xbzrle_load_cleanup();
f0afa331 4009 compress_threads_load_cleanup();
f9494614 4010
fbd162e6 4011 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
4012 g_free(rb->receivedmap);
4013 rb->receivedmap = NULL;
4014 }
13af18f2 4015
f265e0e4
JQ
4016 return 0;
4017}
4018
3d0684b2
JQ
4019/**
4020 * ram_postcopy_incoming_init: allocate postcopy data structures
4021 *
4022 * Returns 0 for success and negative if there was one error
4023 *
4024 * @mis: current migration incoming state
4025 *
4026 * Allocate data structures etc needed by incoming migration with
4027 * postcopy-ram. postcopy-ram's similarly names
4028 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
4029 */
4030int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4031{
c136180c 4032 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
4033}
4034
3d0684b2
JQ
4035/**
4036 * ram_load_postcopy: load a page in postcopy case
4037 *
4038 * Returns 0 for success or -errno in case of error
4039 *
a7180877
DDAG
4040 * Called in postcopy mode by ram_load().
4041 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
4042 *
4043 * @f: QEMUFile where to send the data
a7180877
DDAG
4044 */
4045static int ram_load_postcopy(QEMUFile *f)
4046{
4047 int flags = 0, ret = 0;
4048 bool place_needed = false;
1aa83678 4049 bool matches_target_page_size = false;
a7180877
DDAG
4050 MigrationIncomingState *mis = migration_incoming_get_current();
4051 /* Temporary page that is later 'placed' */
3414322a 4052 void *postcopy_host_page = mis->postcopy_tmp_page;
91ba442f 4053 void *this_host = NULL;
a3b6ff6d 4054 bool all_zero = false;
4cbb3c63 4055 int target_pages = 0;
a7180877
DDAG
4056
4057 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4058 ram_addr_t addr;
4059 void *host = NULL;
4060 void *page_buffer = NULL;
4061 void *place_source = NULL;
df9ff5e1 4062 RAMBlock *block = NULL;
a7180877 4063 uint8_t ch;
a7180877
DDAG
4064
4065 addr = qemu_get_be64(f);
7a9ddfbf
PX
4066
4067 /*
4068 * If qemu file error, we should stop here, and then "addr"
4069 * may be invalid
4070 */
4071 ret = qemu_file_get_error(f);
4072 if (ret) {
4073 break;
4074 }
4075
a7180877
DDAG
4076 flags = addr & ~TARGET_PAGE_MASK;
4077 addr &= TARGET_PAGE_MASK;
4078
4079 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4080 place_needed = false;
bb890ed5 4081 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 4082 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
4083
4084 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
4085 if (!host) {
4086 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4087 ret = -EINVAL;
4088 break;
4089 }
4cbb3c63 4090 target_pages++;
1aa83678 4091 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4092 /*
28abd200
DDAG
4093 * Postcopy requires that we place whole host pages atomically;
4094 * these may be huge pages for RAMBlocks that are backed by
4095 * hugetlbfs.
a7180877
DDAG
4096 * To make it atomic, the data is read into a temporary page
4097 * that's moved into place later.
4098 * The migration protocol uses, possibly smaller, target-pages
4099 * however the source ensures it always sends all the components
91ba442f 4100 * of a host page in one chunk.
a7180877
DDAG
4101 */
4102 page_buffer = postcopy_host_page +
28abd200 4103 ((uintptr_t)host & (block->page_size - 1));
a7180877 4104 /* If all TP are zero then we can optimise the place */
e5e73b0f 4105 if (target_pages == 1) {
a7180877 4106 all_zero = true;
91ba442f
WY
4107 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
4108 block->page_size);
c53b7ddc
DDAG
4109 } else {
4110 /* not the 1st TP within the HP */
91ba442f
WY
4111 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
4112 (uintptr_t)this_host) {
4113 error_report("Non-same host page %p/%p",
4114 host, this_host);
c53b7ddc
DDAG
4115 ret = -EINVAL;
4116 break;
4117 }
a7180877
DDAG
4118 }
4119
4120 /*
4121 * If it's the last part of a host page then we place the host
4122 * page
4123 */
4cbb3c63
WY
4124 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
4125 place_needed = true;
4126 target_pages = 0;
4127 }
a7180877
DDAG
4128 place_source = postcopy_host_page;
4129 }
4130
4131 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4132 case RAM_SAVE_FLAG_ZERO:
a7180877 4133 ch = qemu_get_byte(f);
2e36bc1b
WY
4134 /*
4135 * Can skip to set page_buffer when
4136 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4137 */
4138 if (ch || !matches_target_page_size) {
4139 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4140 }
a7180877
DDAG
4141 if (ch) {
4142 all_zero = false;
4143 }
4144 break;
4145
4146 case RAM_SAVE_FLAG_PAGE:
4147 all_zero = false;
1aa83678
PX
4148 if (!matches_target_page_size) {
4149 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4150 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4151 } else {
1aa83678
PX
4152 /*
4153 * For small pages that matches target page size, we
4154 * avoid the qemu_file copy. Instead we directly use
4155 * the buffer of QEMUFile to place the page. Note: we
4156 * cannot do any QEMUFile operation before using that
4157 * buffer to make sure the buffer is valid when
4158 * placing the page.
a7180877
DDAG
4159 */
4160 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4161 TARGET_PAGE_SIZE);
4162 }
4163 break;
4164 case RAM_SAVE_FLAG_EOS:
4165 /* normal exit */
6df264ac 4166 multifd_recv_sync_main();
a7180877
DDAG
4167 break;
4168 default:
4169 error_report("Unknown combination of migration flags: %#x"
4170 " (postcopy mode)", flags);
4171 ret = -EINVAL;
7a9ddfbf
PX
4172 break;
4173 }
4174
4175 /* Detect for any possible file errors */
4176 if (!ret && qemu_file_get_error(f)) {
4177 ret = qemu_file_get_error(f);
a7180877
DDAG
4178 }
4179
7a9ddfbf 4180 if (!ret && place_needed) {
a7180877 4181 /* This gets called at the last target page in the host page */
91ba442f
WY
4182 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
4183 block->page_size);
df9ff5e1 4184
a7180877 4185 if (all_zero) {
df9ff5e1 4186 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 4187 block);
a7180877 4188 } else {
df9ff5e1 4189 ret = postcopy_place_page(mis, place_dest,
8be4620b 4190 place_source, block);
a7180877
DDAG
4191 }
4192 }
a7180877
DDAG
4193 }
4194
4195 return ret;
4196}
4197
acab30b8
DHB
4198static bool postcopy_is_advised(void)
4199{
4200 PostcopyState ps = postcopy_state_get();
4201 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4202}
4203
4204static bool postcopy_is_running(void)
4205{
4206 PostcopyState ps = postcopy_state_get();
4207 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4208}
4209
e6f4aa18
ZC
4210/*
4211 * Flush content of RAM cache into SVM's memory.
4212 * Only flush the pages that be dirtied by PVM or SVM or both.
4213 */
4214static void colo_flush_ram_cache(void)
4215{
4216 RAMBlock *block = NULL;
4217 void *dst_host;
4218 void *src_host;
4219 unsigned long offset = 0;
4220
d1955d22 4221 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4222 WITH_RCU_READ_LOCK_GUARD() {
4223 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4224 ramblock_sync_dirty_bitmap(ram_state, block);
4225 }
d1955d22 4226 }
d1955d22 4227
e6f4aa18 4228 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4229 WITH_RCU_READ_LOCK_GUARD() {
4230 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4231
89ac5a1d
DDAG
4232 while (block) {
4233 offset = migration_bitmap_find_dirty(ram_state, block, offset);
e6f4aa18 4234
89ac5a1d
DDAG
4235 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4236 offset = 0;
4237 block = QLIST_NEXT_RCU(block, next);
4238 } else {
4239 migration_bitmap_clear_dirty(ram_state, block, offset);
4240 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4241 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4242 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4243 }
e6f4aa18
ZC
4244 }
4245 }
e6f4aa18
ZC
4246 trace_colo_flush_ram_cache_end();
4247}
4248
10da4a36
WY
4249/**
4250 * ram_load_precopy: load pages in precopy case
4251 *
4252 * Returns 0 for success or -errno in case of error
4253 *
4254 * Called in precopy mode by ram_load().
4255 * rcu_read_lock is taken prior to this being called.
4256 *
4257 * @f: QEMUFile where to send the data
4258 */
4259static int ram_load_precopy(QEMUFile *f)
56e93d26 4260{
e65cec5e 4261 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4262 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 4263 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
4264 if (!migrate_use_compression()) {
4265 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4266 }
a7180877 4267
10da4a36 4268 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4269 ram_addr_t addr, total_ram_bytes;
a776aa15 4270 void *host = NULL;
56e93d26
JQ
4271 uint8_t ch;
4272
e65cec5e
YK
4273 /*
4274 * Yield periodically to let main loop run, but an iteration of
4275 * the main loop is expensive, so do it each some iterations
4276 */
4277 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4278 aio_co_schedule(qemu_get_current_aio_context(),
4279 qemu_coroutine_self());
4280 qemu_coroutine_yield();
4281 }
4282 i++;
4283
56e93d26
JQ
4284 addr = qemu_get_be64(f);
4285 flags = addr & ~TARGET_PAGE_MASK;
4286 addr &= TARGET_PAGE_MASK;
4287
edc60127
JQ
4288 if (flags & invalid_flags) {
4289 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4290 error_report("Received an unexpected compressed page");
4291 }
4292
4293 ret = -EINVAL;
4294 break;
4295 }
4296
bb890ed5 4297 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4298 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
4299 RAMBlock *block = ram_block_from_stream(f, flags);
4300
13af18f2
ZC
4301 /*
4302 * After going into COLO, we should load the Page into colo_cache.
4303 */
4304 if (migration_incoming_in_colo_state()) {
4305 host = colo_cache_from_block_offset(block, addr);
4306 } else {
4307 host = host_from_ram_block_offset(block, addr);
4308 }
a776aa15
DDAG
4309 if (!host) {
4310 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4311 ret = -EINVAL;
4312 break;
4313 }
13af18f2
ZC
4314
4315 if (!migration_incoming_in_colo_state()) {
4316 ramblock_recv_bitmap_set(block, host);
4317 }
4318
1db9d8e5 4319 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4320 }
4321
56e93d26
JQ
4322 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4323 case RAM_SAVE_FLAG_MEM_SIZE:
4324 /* Synchronize RAM block list */
4325 total_ram_bytes = addr;
4326 while (!ret && total_ram_bytes) {
4327 RAMBlock *block;
56e93d26
JQ
4328 char id[256];
4329 ram_addr_t length;
4330
4331 len = qemu_get_byte(f);
4332 qemu_get_buffer(f, (uint8_t *)id, len);
4333 id[len] = 0;
4334 length = qemu_get_be64(f);
4335
e3dd7493 4336 block = qemu_ram_block_by_name(id);
b895de50
CLG
4337 if (block && !qemu_ram_is_migratable(block)) {
4338 error_report("block %s should not be migrated !", id);
4339 ret = -EINVAL;
4340 } else if (block) {
e3dd7493
DDAG
4341 if (length != block->used_length) {
4342 Error *local_err = NULL;
56e93d26 4343
fa53a0e5 4344 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4345 &local_err);
4346 if (local_err) {
4347 error_report_err(local_err);
56e93d26 4348 }
56e93d26 4349 }
ef08fb38
DDAG
4350 /* For postcopy we need to check hugepage sizes match */
4351 if (postcopy_advised &&
4352 block->page_size != qemu_host_page_size) {
4353 uint64_t remote_page_size = qemu_get_be64(f);
4354 if (remote_page_size != block->page_size) {
4355 error_report("Mismatched RAM page size %s "
4356 "(local) %zd != %" PRId64,
4357 id, block->page_size,
4358 remote_page_size);
4359 ret = -EINVAL;
4360 }
4361 }
fbd162e6
YK
4362 if (migrate_ignore_shared()) {
4363 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4364 if (ramblock_is_ignored(block) &&
4365 block->mr->addr != addr) {
4366 error_report("Mismatched GPAs for block %s "
4367 "%" PRId64 "!= %" PRId64,
4368 id, (uint64_t)addr,
4369 (uint64_t)block->mr->addr);
4370 ret = -EINVAL;
4371 }
4372 }
e3dd7493
DDAG
4373 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4374 block->idstr);
4375 } else {
56e93d26
JQ
4376 error_report("Unknown ramblock \"%s\", cannot "
4377 "accept migration", id);
4378 ret = -EINVAL;
4379 }
4380
4381 total_ram_bytes -= length;
4382 }
4383 break;
a776aa15 4384
bb890ed5 4385 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4386 ch = qemu_get_byte(f);
4387 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4388 break;
a776aa15 4389
56e93d26 4390 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4391 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4392 break;
56e93d26 4393
a776aa15 4394 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4395 len = qemu_get_be32(f);
4396 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4397 error_report("Invalid compressed data length: %d", len);
4398 ret = -EINVAL;
4399 break;
4400 }
c1bc6626 4401 decompress_data_with_multi_threads(f, host, len);
56e93d26 4402 break;
a776aa15 4403
56e93d26 4404 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4405 if (load_xbzrle(f, addr, host) < 0) {
4406 error_report("Failed to decompress XBZRLE page at "
4407 RAM_ADDR_FMT, addr);
4408 ret = -EINVAL;
4409 break;
4410 }
4411 break;
4412 case RAM_SAVE_FLAG_EOS:
4413 /* normal exit */
6df264ac 4414 multifd_recv_sync_main();
56e93d26
JQ
4415 break;
4416 default:
4417 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4418 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
4419 } else {
4420 error_report("Unknown combination of migration flags: %#x",
4421 flags);
4422 ret = -EINVAL;
4423 }
4424 }
4425 if (!ret) {
4426 ret = qemu_file_get_error(f);
4427 }
4428 }
4429
ca1a6b70 4430 ret |= wait_for_decompress_done();
10da4a36
WY
4431 return ret;
4432}
4433
4434static int ram_load(QEMUFile *f, void *opaque, int version_id)
4435{
4436 int ret = 0;
4437 static uint64_t seq_iter;
4438 /*
4439 * If system is running in postcopy mode, page inserts to host memory must
4440 * be atomic
4441 */
4442 bool postcopy_running = postcopy_is_running();
4443
4444 seq_iter++;
4445
4446 if (version_id != 4) {
4447 return -EINVAL;
4448 }
4449
4450 /*
4451 * This RCU critical section can be very long running.
4452 * When RCU reclaims in the code start to become numerous,
4453 * it will be necessary to reduce the granularity of this
4454 * critical section.
4455 */
89ac5a1d
DDAG
4456 WITH_RCU_READ_LOCK_GUARD() {
4457 if (postcopy_running) {
4458 ret = ram_load_postcopy(f);
4459 } else {
4460 ret = ram_load_precopy(f);
4461 }
10da4a36 4462 }
55c4446b 4463 trace_ram_load_complete(ret, seq_iter);
e6f4aa18
ZC
4464
4465 if (!ret && migration_incoming_in_colo_state()) {
4466 colo_flush_ram_cache();
4467 }
56e93d26
JQ
4468 return ret;
4469}
4470
c6467627
VSO
4471static bool ram_has_postcopy(void *opaque)
4472{
469dd51b 4473 RAMBlock *rb;
fbd162e6 4474 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4475 if (ramblock_is_pmem(rb)) {
4476 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4477 "is not supported now!", rb->idstr, rb->host);
4478 return false;
4479 }
4480 }
4481
c6467627
VSO
4482 return migrate_postcopy_ram();
4483}
4484
edd090c7
PX
4485/* Sync all the dirty bitmap with destination VM. */
4486static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4487{
4488 RAMBlock *block;
4489 QEMUFile *file = s->to_dst_file;
4490 int ramblock_count = 0;
4491
4492 trace_ram_dirty_bitmap_sync_start();
4493
fbd162e6 4494 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4495 qemu_savevm_send_recv_bitmap(file, block->idstr);
4496 trace_ram_dirty_bitmap_request(block->idstr);
4497 ramblock_count++;
4498 }
4499
4500 trace_ram_dirty_bitmap_sync_wait();
4501
4502 /* Wait until all the ramblocks' dirty bitmap synced */
4503 while (ramblock_count--) {
4504 qemu_sem_wait(&s->rp_state.rp_sem);
4505 }
4506
4507 trace_ram_dirty_bitmap_sync_complete();
4508
4509 return 0;
4510}
4511
4512static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4513{
4514 qemu_sem_post(&s->rp_state.rp_sem);
4515}
4516
a335debb
PX
4517/*
4518 * Read the received bitmap, revert it as the initial dirty bitmap.
4519 * This is only used when the postcopy migration is paused but wants
4520 * to resume from a middle point.
4521 */
4522int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4523{
4524 int ret = -EINVAL;
4525 QEMUFile *file = s->rp_state.from_dst_file;
4526 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4527 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4528 uint64_t size, end_mark;
4529
4530 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4531
4532 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4533 error_report("%s: incorrect state %s", __func__,
4534 MigrationStatus_str(s->state));
4535 return -EINVAL;
4536 }
4537
4538 /*
4539 * Note: see comments in ramblock_recv_bitmap_send() on why we
4540 * need the endianess convertion, and the paddings.
4541 */
4542 local_size = ROUND_UP(local_size, 8);
4543
4544 /* Add paddings */
4545 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4546
4547 size = qemu_get_be64(file);
4548
4549 /* The size of the bitmap should match with our ramblock */
4550 if (size != local_size) {
4551 error_report("%s: ramblock '%s' bitmap size mismatch "
4552 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4553 block->idstr, size, local_size);
4554 ret = -EINVAL;
4555 goto out;
4556 }
4557
4558 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4559 end_mark = qemu_get_be64(file);
4560
4561 ret = qemu_file_get_error(file);
4562 if (ret || size != local_size) {
4563 error_report("%s: read bitmap failed for ramblock '%s': %d"
4564 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4565 __func__, block->idstr, ret, local_size, size);
4566 ret = -EIO;
4567 goto out;
4568 }
4569
4570 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4571 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4572 __func__, block->idstr, end_mark);
4573 ret = -EINVAL;
4574 goto out;
4575 }
4576
4577 /*
4578 * Endianess convertion. We are during postcopy (though paused).
4579 * The dirty bitmap won't change. We can directly modify it.
4580 */
4581 bitmap_from_le(block->bmap, le_bitmap, nbits);
4582
4583 /*
4584 * What we received is "received bitmap". Revert it as the initial
4585 * dirty bitmap for this ramblock.
4586 */
4587 bitmap_complement(block->bmap, block->bmap, nbits);
4588
4589 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4590
edd090c7
PX
4591 /*
4592 * We succeeded to sync bitmap for current ramblock. If this is
4593 * the last one to sync, we need to notify the main send thread.
4594 */
4595 ram_dirty_bitmap_reload_notify(s);
4596
a335debb
PX
4597 ret = 0;
4598out:
bf269906 4599 g_free(le_bitmap);
a335debb
PX
4600 return ret;
4601}
4602
edd090c7
PX
4603static int ram_resume_prepare(MigrationState *s, void *opaque)
4604{
4605 RAMState *rs = *(RAMState **)opaque;
08614f34 4606 int ret;
edd090c7 4607
08614f34
PX
4608 ret = ram_dirty_bitmap_sync_all(s, rs);
4609 if (ret) {
4610 return ret;
4611 }
4612
4613 ram_state_resume_prepare(rs, s->to_dst_file);
4614
4615 return 0;
edd090c7
PX
4616}
4617
56e93d26 4618static SaveVMHandlers savevm_ram_handlers = {
9907e842 4619 .save_setup = ram_save_setup,
56e93d26 4620 .save_live_iterate = ram_save_iterate,
763c906b 4621 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4622 .save_live_complete_precopy = ram_save_complete,
c6467627 4623 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4624 .save_live_pending = ram_save_pending,
4625 .load_state = ram_load,
f265e0e4
JQ
4626 .save_cleanup = ram_save_cleanup,
4627 .load_setup = ram_load_setup,
4628 .load_cleanup = ram_load_cleanup,
edd090c7 4629 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4630};
4631
4632void ram_mig_init(void)
4633{
4634 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4635 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 4636}
This page took 1.122631 seconds and 4 git commands to generate.