]> Git Repo - qemu.git/blame - migration/ram.c
qemu-file: Don't do IO after shutdown
[qemu.git] / migration / ram.c
CommitLineData
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <[email protected]>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
33c11879 30#include "cpu.h"
56e93d26 31#include <zlib.h>
f348b6d1 32#include "qemu/cutils.h"
56e93d26
JQ
33#include "qemu/bitops.h"
34#include "qemu/bitmap.h"
7205c9ec 35#include "qemu/main-loop.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
71bb07db 39#include "socket.h"
f2a8f0a6 40#include "migration/register.h"
7b1e1a22 41#include "migration/misc.h"
08a0aee1 42#include "qemu-file.h"
be07b0ac 43#include "postcopy-ram.h"
53d37d36 44#include "page_cache.h"
56e93d26 45#include "qemu/error-report.h"
e688df6b 46#include "qapi/error.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
af8b7d2b
JQ
55#include "sysemu/sysemu.h"
56#include "qemu/uuid.h"
edd090c7 57#include "savevm.h"
b9ee2f7d 58#include "qemu/iov.h"
56e93d26 59
56e93d26
JQ
60/***********************************************************/
61/* ram save/restore */
62
bb890ed5
JQ
63/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
64 * worked for pages that where filled with the same char. We switched
65 * it to only search for the zero value. And to avoid confusion with
66 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
67 */
68
56e93d26 69#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 70#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
71#define RAM_SAVE_FLAG_MEM_SIZE 0x04
72#define RAM_SAVE_FLAG_PAGE 0x08
73#define RAM_SAVE_FLAG_EOS 0x10
74#define RAM_SAVE_FLAG_CONTINUE 0x20
75#define RAM_SAVE_FLAG_XBZRLE 0x40
76/* 0x80 is reserved in migration.h start with 0x100 next */
77#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
78
56e93d26
JQ
79static inline bool is_zero_range(uint8_t *p, uint64_t size)
80{
a1febc49 81 return buffer_is_zero(p, size);
56e93d26
JQ
82}
83
9360447d
JQ
84XBZRLECacheStats xbzrle_counters;
85
56e93d26
JQ
86/* struct contains XBZRLE cache and a static page
87 used by the compression */
88static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
c00e0928
JQ
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
f265e0e4
JQ
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
56e93d26
JQ
100} XBZRLE;
101
56e93d26
JQ
102static void XBZRLE_cache_lock(void)
103{
104 if (migrate_use_xbzrle())
105 qemu_mutex_lock(&XBZRLE.lock);
106}
107
108static void XBZRLE_cache_unlock(void)
109{
110 if (migrate_use_xbzrle())
111 qemu_mutex_unlock(&XBZRLE.lock);
112}
113
3d0684b2
JQ
114/**
115 * xbzrle_cache_resize: resize the xbzrle cache
116 *
117 * This function is called from qmp_migrate_set_cache_size in main
118 * thread, possibly while a migration is in progress. A running
119 * migration may be using the cache and might finish during this call,
120 * hence changes to the cache are protected by XBZRLE.lock().
121 *
c9dede2d 122 * Returns 0 for success or -1 for error
3d0684b2
JQ
123 *
124 * @new_size: new cache size
8acabf69 125 * @errp: set *errp if the check failed, with reason
56e93d26 126 */
c9dede2d 127int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
128{
129 PageCache *new_cache;
c9dede2d 130 int64_t ret = 0;
56e93d26 131
8acabf69
JQ
132 /* Check for truncation */
133 if (new_size != (size_t)new_size) {
134 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
135 "exceeding address space");
136 return -1;
137 }
138
2a313e5c
JQ
139 if (new_size == migrate_xbzrle_cache_size()) {
140 /* nothing to do */
c9dede2d 141 return 0;
2a313e5c
JQ
142 }
143
56e93d26
JQ
144 XBZRLE_cache_lock();
145
146 if (XBZRLE.cache != NULL) {
80f8dfde 147 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 148 if (!new_cache) {
56e93d26
JQ
149 ret = -1;
150 goto out;
151 }
152
153 cache_fini(XBZRLE.cache);
154 XBZRLE.cache = new_cache;
155 }
56e93d26
JQ
156out:
157 XBZRLE_cache_unlock();
158 return ret;
159}
160
fbd162e6
YK
161static bool ramblock_is_ignored(RAMBlock *block)
162{
163 return !qemu_ram_is_migratable(block) ||
164 (migrate_ignore_shared() && qemu_ram_is_shared(block));
165}
166
b895de50 167/* Should be holding either ram_list.mutex, or the RCU lock. */
fbd162e6
YK
168#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
169 INTERNAL_RAMBLOCK_FOREACH(block) \
170 if (ramblock_is_ignored(block)) {} else
171
b895de50 172#define RAMBLOCK_FOREACH_MIGRATABLE(block) \
343f632c 173 INTERNAL_RAMBLOCK_FOREACH(block) \
b895de50
CLG
174 if (!qemu_ram_is_migratable(block)) {} else
175
343f632c
DDAG
176#undef RAMBLOCK_FOREACH
177
fbd162e6
YK
178int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
179{
180 RAMBlock *block;
181 int ret = 0;
182
89ac5a1d
DDAG
183 RCU_READ_LOCK_GUARD();
184
fbd162e6
YK
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
fbd162e6
YK
191 return ret;
192}
193
f9494614
AP
194static void ramblock_recv_map_init(void)
195{
196 RAMBlock *rb;
197
fbd162e6 198 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
199 assert(!rb->receivedmap);
200 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
201 }
202}
203
204int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
205{
206 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
207 rb->receivedmap);
208}
209
1cba9f6e
DDAG
210bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
211{
212 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
213}
214
f9494614
AP
215void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
216{
217 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
218}
219
220void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
221 size_t nr)
222{
223 bitmap_set_atomic(rb->receivedmap,
224 ramblock_recv_bitmap_offset(host_addr, rb),
225 nr);
226}
227
a335debb
PX
228#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
229
230/*
231 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
232 *
233 * Returns >0 if success with sent bytes, or <0 if error.
234 */
235int64_t ramblock_recv_bitmap_send(QEMUFile *file,
236 const char *block_name)
237{
238 RAMBlock *block = qemu_ram_block_by_name(block_name);
239 unsigned long *le_bitmap, nbits;
240 uint64_t size;
241
242 if (!block) {
243 error_report("%s: invalid block name: %s", __func__, block_name);
244 return -1;
245 }
246
247 nbits = block->used_length >> TARGET_PAGE_BITS;
248
249 /*
250 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
251 * machines we may need 4 more bytes for padding (see below
252 * comment). So extend it a bit before hand.
253 */
254 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
255
256 /*
257 * Always use little endian when sending the bitmap. This is
258 * required that when source and destination VMs are not using the
259 * same endianess. (Note: big endian won't work.)
260 */
261 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
262
263 /* Size of the bitmap, in bytes */
a725ef9f 264 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
265
266 /*
267 * size is always aligned to 8 bytes for 64bit machines, but it
268 * may not be true for 32bit machines. We need this padding to
269 * make sure the migration can survive even between 32bit and
270 * 64bit machines.
271 */
272 size = ROUND_UP(size, 8);
273
274 qemu_put_be64(file, size);
275 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
276 /*
277 * Mark as an end, in case the middle part is screwed up due to
278 * some "misterious" reason.
279 */
280 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
281 qemu_fflush(file);
282
bf269906 283 g_free(le_bitmap);
a335debb
PX
284
285 if (qemu_file_get_error(file)) {
286 return qemu_file_get_error(file);
287 }
288
289 return size + sizeof(size);
290}
291
ec481c6c
JQ
292/*
293 * An outstanding page request, on the source, having been received
294 * and queued
295 */
296struct RAMSrcPageRequest {
297 RAMBlock *rb;
298 hwaddr offset;
299 hwaddr len;
300
301 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
302};
303
6f37bb8b
JQ
304/* State of RAM for migration */
305struct RAMState {
204b88b8
JQ
306 /* QEMUFile used for this migration */
307 QEMUFile *f;
6f37bb8b
JQ
308 /* Last block that we have visited searching for dirty pages */
309 RAMBlock *last_seen_block;
310 /* Last block from where we have sent data */
311 RAMBlock *last_sent_block;
269ace29
JQ
312 /* Last dirty target page we have sent */
313 ram_addr_t last_page;
6f37bb8b
JQ
314 /* last ram version we have seen */
315 uint32_t last_version;
316 /* We are in the first round */
317 bool ram_bulk_stage;
6eeb63f7
WW
318 /* The free page optimization is enabled */
319 bool fpo_enabled;
8d820d6f
JQ
320 /* How many times we have dirty too many pages */
321 int dirty_rate_high_cnt;
f664da80
JQ
322 /* these variables are used for bitmap sync */
323 /* last time we did a full bitmap_sync */
324 int64_t time_last_bitmap_sync;
eac74159 325 /* bytes transferred at start_time */
c4bdf0cf 326 uint64_t bytes_xfer_prev;
a66cd90c 327 /* number of dirty pages since start_time */
68908ed6 328 uint64_t num_dirty_pages_period;
b5833fde
JQ
329 /* xbzrle misses since the beginning of the period */
330 uint64_t xbzrle_cache_miss_prev;
76e03000
XG
331
332 /* compression statistics since the beginning of the period */
333 /* amount of count that no free thread to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* amount bytes after compression */
336 uint64_t compressed_size_prev;
337 /* amount of compressed pages */
338 uint64_t compress_pages_prev;
339
be8b02ed
XG
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
9360447d 344 /* number of dirty bits in the bitmap */
2dfaf12e 345 uint64_t migration_dirty_pages;
386a907b 346 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 347 QemuMutex bitmap_mutex;
68a098f3
JQ
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
ec481c6c
JQ
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
b58deb34 352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
353};
354typedef struct RAMState RAMState;
355
53518d94 356static RAMState *ram_state;
6f37bb8b 357
bd227060
WW
358static NotifierWithReturnList precopy_notifier_list;
359
360void precopy_infrastructure_init(void)
361{
362 notifier_with_return_list_init(&precopy_notifier_list);
363}
364
365void precopy_add_notifier(NotifierWithReturn *n)
366{
367 notifier_with_return_list_add(&precopy_notifier_list, n);
368}
369
370void precopy_remove_notifier(NotifierWithReturn *n)
371{
372 notifier_with_return_remove(n);
373}
374
375int precopy_notify(PrecopyNotifyReason reason, Error **errp)
376{
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
380
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382}
383
6eeb63f7
WW
384void precopy_enable_free_page_optimization(void)
385{
386 if (!ram_state) {
387 return;
388 }
389
390 ram_state->fpo_enabled = true;
391}
392
9edabd4d 393uint64_t ram_bytes_remaining(void)
2f4fde93 394{
bae416e5
DDAG
395 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
396 0;
2f4fde93
JQ
397}
398
9360447d 399MigrationStats ram_counters;
96506894 400
b8fb8cb7
DDAG
401/* used by the search for pages to send */
402struct PageSearchStatus {
403 /* Current block being searched */
404 RAMBlock *block;
a935e30f
JQ
405 /* Current page to search from */
406 unsigned long page;
b8fb8cb7
DDAG
407 /* Set once we wrap around */
408 bool complete_round;
409};
410typedef struct PageSearchStatus PageSearchStatus;
411
76e03000
XG
412CompressionStats compression_counters;
413
56e93d26 414struct CompressParam {
56e93d26 415 bool done;
90e56fb4 416 bool quit;
5e5fdcff 417 bool zero_page;
56e93d26
JQ
418 QEMUFile *file;
419 QemuMutex mutex;
420 QemuCond cond;
421 RAMBlock *block;
422 ram_addr_t offset;
34ab9e97
XG
423
424 /* internally used fields */
dcaf446e 425 z_stream stream;
34ab9e97 426 uint8_t *originbuf;
56e93d26
JQ
427};
428typedef struct CompressParam CompressParam;
429
430struct DecompressParam {
73a8912b 431 bool done;
90e56fb4 432 bool quit;
56e93d26
JQ
433 QemuMutex mutex;
434 QemuCond cond;
435 void *des;
d341d9f3 436 uint8_t *compbuf;
56e93d26 437 int len;
797ca154 438 z_stream stream;
56e93d26
JQ
439};
440typedef struct DecompressParam DecompressParam;
441
442static CompressParam *comp_param;
443static QemuThread *compress_threads;
444/* comp_done_cond is used to wake up the migration thread when
445 * one of the compression threads has finished the compression.
446 * comp_done_lock is used to co-work with comp_done_cond.
447 */
0d9f9a5c
LL
448static QemuMutex comp_done_lock;
449static QemuCond comp_done_cond;
56e93d26
JQ
450/* The empty QEMUFileOps will be used by file in CompressParam */
451static const QEMUFileOps empty_ops = { };
452
34ab9e97 453static QEMUFile *decomp_file;
56e93d26
JQ
454static DecompressParam *decomp_param;
455static QemuThread *decompress_threads;
73a8912b
LL
456static QemuMutex decomp_done_lock;
457static QemuCond decomp_done_cond;
56e93d26 458
5e5fdcff 459static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 460 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
461
462static void *do_data_compress(void *opaque)
463{
464 CompressParam *param = opaque;
a7a9a88f
LL
465 RAMBlock *block;
466 ram_addr_t offset;
5e5fdcff 467 bool zero_page;
56e93d26 468
a7a9a88f 469 qemu_mutex_lock(&param->mutex);
90e56fb4 470 while (!param->quit) {
a7a9a88f
LL
471 if (param->block) {
472 block = param->block;
473 offset = param->offset;
474 param->block = NULL;
475 qemu_mutex_unlock(&param->mutex);
476
5e5fdcff
XG
477 zero_page = do_compress_ram_page(param->file, &param->stream,
478 block, offset, param->originbuf);
a7a9a88f 479
0d9f9a5c 480 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 481 param->done = true;
5e5fdcff 482 param->zero_page = zero_page;
0d9f9a5c
LL
483 qemu_cond_signal(&comp_done_cond);
484 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
485
486 qemu_mutex_lock(&param->mutex);
487 } else {
56e93d26
JQ
488 qemu_cond_wait(&param->cond, &param->mutex);
489 }
56e93d26 490 }
a7a9a88f 491 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
492
493 return NULL;
494}
495
f0afa331 496static void compress_threads_save_cleanup(void)
56e93d26
JQ
497{
498 int i, thread_count;
499
05306935 500 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
501 return;
502 }
05306935 503
56e93d26
JQ
504 thread_count = migrate_compress_threads();
505 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
506 /*
507 * we use it as a indicator which shows if the thread is
508 * properly init'd or not
509 */
510 if (!comp_param[i].file) {
511 break;
512 }
05306935
FL
513
514 qemu_mutex_lock(&comp_param[i].mutex);
515 comp_param[i].quit = true;
516 qemu_cond_signal(&comp_param[i].cond);
517 qemu_mutex_unlock(&comp_param[i].mutex);
518
56e93d26 519 qemu_thread_join(compress_threads + i);
56e93d26
JQ
520 qemu_mutex_destroy(&comp_param[i].mutex);
521 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 522 deflateEnd(&comp_param[i].stream);
34ab9e97 523 g_free(comp_param[i].originbuf);
dcaf446e
XG
524 qemu_fclose(comp_param[i].file);
525 comp_param[i].file = NULL;
56e93d26 526 }
0d9f9a5c
LL
527 qemu_mutex_destroy(&comp_done_lock);
528 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
529 g_free(compress_threads);
530 g_free(comp_param);
56e93d26
JQ
531 compress_threads = NULL;
532 comp_param = NULL;
56e93d26
JQ
533}
534
dcaf446e 535static int compress_threads_save_setup(void)
56e93d26
JQ
536{
537 int i, thread_count;
538
539 if (!migrate_use_compression()) {
dcaf446e 540 return 0;
56e93d26 541 }
56e93d26
JQ
542 thread_count = migrate_compress_threads();
543 compress_threads = g_new0(QemuThread, thread_count);
544 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
545 qemu_cond_init(&comp_done_cond);
546 qemu_mutex_init(&comp_done_lock);
56e93d26 547 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
548 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
549 if (!comp_param[i].originbuf) {
550 goto exit;
551 }
552
dcaf446e
XG
553 if (deflateInit(&comp_param[i].stream,
554 migrate_compress_level()) != Z_OK) {
34ab9e97 555 g_free(comp_param[i].originbuf);
dcaf446e
XG
556 goto exit;
557 }
558
e110aa91
C
559 /* comp_param[i].file is just used as a dummy buffer to save data,
560 * set its ops to empty.
56e93d26
JQ
561 */
562 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
563 comp_param[i].done = true;
90e56fb4 564 comp_param[i].quit = false;
56e93d26
JQ
565 qemu_mutex_init(&comp_param[i].mutex);
566 qemu_cond_init(&comp_param[i].cond);
567 qemu_thread_create(compress_threads + i, "compress",
568 do_data_compress, comp_param + i,
569 QEMU_THREAD_JOINABLE);
570 }
dcaf446e
XG
571 return 0;
572
573exit:
574 compress_threads_save_cleanup();
575 return -1;
56e93d26
JQ
576}
577
f986c3d2
JQ
578/* Multiple fd's */
579
af8b7d2b
JQ
580#define MULTIFD_MAGIC 0x11223344U
581#define MULTIFD_VERSION 1
582
6df264ac
JQ
583#define MULTIFD_FLAG_SYNC (1 << 0)
584
efd1a1d6 585/* This value needs to be a multiple of qemu_target_page_size() */
4b0c7264 586#define MULTIFD_PACKET_SIZE (512 * 1024)
efd1a1d6 587
af8b7d2b
JQ
588typedef struct {
589 uint32_t magic;
590 uint32_t version;
591 unsigned char uuid[16]; /* QemuUUID */
592 uint8_t id;
5fbd8b4b
JQ
593 uint8_t unused1[7]; /* Reserved for future use */
594 uint64_t unused2[4]; /* Reserved for future use */
af8b7d2b
JQ
595} __attribute__((packed)) MultiFDInit_t;
596
2a26c979
JQ
597typedef struct {
598 uint32_t magic;
599 uint32_t version;
600 uint32_t flags;
6f862692
JQ
601 /* maximum number of allocated pages */
602 uint32_t pages_alloc;
603 uint32_t pages_used;
2a34ee59
JQ
604 /* size of the next packet that contains pages */
605 uint32_t next_packet_size;
2a26c979 606 uint64_t packet_num;
5fbd8b4b 607 uint64_t unused[4]; /* Reserved for future use */
2a26c979
JQ
608 char ramblock[256];
609 uint64_t offset[];
610} __attribute__((packed)) MultiFDPacket_t;
611
34c55a94
JQ
612typedef struct {
613 /* number of used pages */
614 uint32_t used;
615 /* number of allocated pages */
616 uint32_t allocated;
617 /* global number of generated multifd packets */
618 uint64_t packet_num;
619 /* offset of each page */
620 ram_addr_t *offset;
621 /* pointer to each page */
622 struct iovec *iov;
623 RAMBlock *block;
624} MultiFDPages_t;
625
8c4598f2
JQ
626typedef struct {
627 /* this fields are not changed once the thread is created */
628 /* channel number */
f986c3d2 629 uint8_t id;
8c4598f2 630 /* channel thread name */
f986c3d2 631 char *name;
8c4598f2 632 /* channel thread id */
f986c3d2 633 QemuThread thread;
8c4598f2 634 /* communication channel */
60df2d4a 635 QIOChannel *c;
8c4598f2 636 /* sem where to wait for more work */
f986c3d2 637 QemuSemaphore sem;
8c4598f2 638 /* this mutex protects the following parameters */
f986c3d2 639 QemuMutex mutex;
8c4598f2 640 /* is this channel thread running */
66770707 641 bool running;
8c4598f2 642 /* should this thread finish */
f986c3d2 643 bool quit;
0beb5ed3
JQ
644 /* thread has work to do */
645 int pending_job;
34c55a94
JQ
646 /* array of pages to sent */
647 MultiFDPages_t *pages;
2a26c979
JQ
648 /* packet allocated len */
649 uint32_t packet_len;
650 /* pointer to the packet */
651 MultiFDPacket_t *packet;
652 /* multifd flags for each packet */
653 uint32_t flags;
2a34ee59
JQ
654 /* size of the next packet that contains pages */
655 uint32_t next_packet_size;
2a26c979
JQ
656 /* global number of generated multifd packets */
657 uint64_t packet_num;
408ea6ae
JQ
658 /* thread local variables */
659 /* packets sent through this channel */
660 uint64_t num_packets;
661 /* pages sent through this channel */
662 uint64_t num_pages;
18cdcea3
JQ
663 /* syncs main thread and channels */
664 QemuSemaphore sem_sync;
8c4598f2
JQ
665} MultiFDSendParams;
666
667typedef struct {
668 /* this fields are not changed once the thread is created */
669 /* channel number */
670 uint8_t id;
671 /* channel thread name */
672 char *name;
673 /* channel thread id */
674 QemuThread thread;
675 /* communication channel */
676 QIOChannel *c;
8c4598f2
JQ
677 /* this mutex protects the following parameters */
678 QemuMutex mutex;
679 /* is this channel thread running */
680 bool running;
3c3ca25d
JQ
681 /* should this thread finish */
682 bool quit;
34c55a94
JQ
683 /* array of pages to receive */
684 MultiFDPages_t *pages;
2a26c979
JQ
685 /* packet allocated len */
686 uint32_t packet_len;
687 /* pointer to the packet */
688 MultiFDPacket_t *packet;
689 /* multifd flags for each packet */
690 uint32_t flags;
691 /* global number of generated multifd packets */
692 uint64_t packet_num;
408ea6ae 693 /* thread local variables */
2a34ee59
JQ
694 /* size of the next packet that contains pages */
695 uint32_t next_packet_size;
408ea6ae
JQ
696 /* packets sent through this channel */
697 uint64_t num_packets;
698 /* pages sent through this channel */
699 uint64_t num_pages;
6df264ac
JQ
700 /* syncs main thread and channels */
701 QemuSemaphore sem_sync;
8c4598f2 702} MultiFDRecvParams;
f986c3d2 703
af8b7d2b
JQ
704static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
705{
d069bcca 706 MultiFDInit_t msg = {};
af8b7d2b
JQ
707 int ret;
708
709 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
710 msg.version = cpu_to_be32(MULTIFD_VERSION);
711 msg.id = p->id;
712 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
713
714 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
715 if (ret != 0) {
716 return -1;
717 }
718 return 0;
719}
720
721static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
722{
723 MultiFDInit_t msg;
724 int ret;
725
726 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
727 if (ret != 0) {
728 return -1;
729 }
730
341ba0df
PM
731 msg.magic = be32_to_cpu(msg.magic);
732 msg.version = be32_to_cpu(msg.version);
af8b7d2b
JQ
733
734 if (msg.magic != MULTIFD_MAGIC) {
735 error_setg(errp, "multifd: received packet magic %x "
736 "expected %x", msg.magic, MULTIFD_MAGIC);
737 return -1;
738 }
739
740 if (msg.version != MULTIFD_VERSION) {
741 error_setg(errp, "multifd: received packet version %d "
742 "expected %d", msg.version, MULTIFD_VERSION);
743 return -1;
744 }
745
746 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
747 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
748 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
749
750 error_setg(errp, "multifd: received uuid '%s' and expected "
751 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
752 g_free(uuid);
753 g_free(msg_uuid);
754 return -1;
755 }
756
757 if (msg.id > migrate_multifd_channels()) {
758 error_setg(errp, "multifd: received channel version %d "
759 "expected %d", msg.version, MULTIFD_VERSION);
760 return -1;
761 }
762
763 return msg.id;
764}
765
34c55a94
JQ
766static MultiFDPages_t *multifd_pages_init(size_t size)
767{
768 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
769
770 pages->allocated = size;
771 pages->iov = g_new0(struct iovec, size);
772 pages->offset = g_new0(ram_addr_t, size);
773
774 return pages;
775}
776
777static void multifd_pages_clear(MultiFDPages_t *pages)
778{
779 pages->used = 0;
780 pages->allocated = 0;
781 pages->packet_num = 0;
782 pages->block = NULL;
783 g_free(pages->iov);
784 pages->iov = NULL;
785 g_free(pages->offset);
786 pages->offset = NULL;
787 g_free(pages);
788}
789
2a26c979
JQ
790static void multifd_send_fill_packet(MultiFDSendParams *p)
791{
792 MultiFDPacket_t *packet = p->packet;
793 int i;
794
2a26c979 795 packet->flags = cpu_to_be32(p->flags);
f2148c4c 796 packet->pages_alloc = cpu_to_be32(p->pages->allocated);
6f862692 797 packet->pages_used = cpu_to_be32(p->pages->used);
2a34ee59 798 packet->next_packet_size = cpu_to_be32(p->next_packet_size);
2a26c979
JQ
799 packet->packet_num = cpu_to_be64(p->packet_num);
800
801 if (p->pages->block) {
802 strncpy(packet->ramblock, p->pages->block->idstr, 256);
803 }
804
805 for (i = 0; i < p->pages->used; i++) {
ddac5cb2
JQ
806 /* there are architectures where ram_addr_t is 32 bit */
807 uint64_t temp = p->pages->offset[i];
808
809 packet->offset[i] = cpu_to_be64(temp);
2a26c979
JQ
810 }
811}
812
813static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
814{
815 MultiFDPacket_t *packet = p->packet;
7ed379b2 816 uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
2a26c979
JQ
817 RAMBlock *block;
818 int i;
819
341ba0df 820 packet->magic = be32_to_cpu(packet->magic);
2a26c979
JQ
821 if (packet->magic != MULTIFD_MAGIC) {
822 error_setg(errp, "multifd: received packet "
823 "magic %x and expected magic %x",
824 packet->magic, MULTIFD_MAGIC);
825 return -1;
826 }
827
341ba0df 828 packet->version = be32_to_cpu(packet->version);
2a26c979
JQ
829 if (packet->version != MULTIFD_VERSION) {
830 error_setg(errp, "multifd: received packet "
831 "version %d and expected version %d",
832 packet->version, MULTIFD_VERSION);
833 return -1;
834 }
835
836 p->flags = be32_to_cpu(packet->flags);
837
6f862692 838 packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
7ed379b2 839 /*
d884e77b 840 * If we received a packet that is 100 times bigger than expected
7ed379b2
JQ
841 * just stop migration. It is a magic number.
842 */
843 if (packet->pages_alloc > pages_max * 100) {
2a26c979 844 error_setg(errp, "multifd: received packet "
7ed379b2
JQ
845 "with size %d and expected a maximum size of %d",
846 packet->pages_alloc, pages_max * 100) ;
2a26c979
JQ
847 return -1;
848 }
7ed379b2
JQ
849 /*
850 * We received a packet that is bigger than expected but inside
851 * reasonable limits (see previous comment). Just reallocate.
852 */
853 if (packet->pages_alloc > p->pages->allocated) {
854 multifd_pages_clear(p->pages);
f151f8ac 855 p->pages = multifd_pages_init(packet->pages_alloc);
7ed379b2 856 }
2a26c979 857
6f862692
JQ
858 p->pages->used = be32_to_cpu(packet->pages_used);
859 if (p->pages->used > packet->pages_alloc) {
2a26c979 860 error_setg(errp, "multifd: received packet "
6f862692
JQ
861 "with %d pages and expected maximum pages are %d",
862 p->pages->used, packet->pages_alloc) ;
2a26c979
JQ
863 return -1;
864 }
865
2a34ee59 866 p->next_packet_size = be32_to_cpu(packet->next_packet_size);
2a26c979
JQ
867 p->packet_num = be64_to_cpu(packet->packet_num);
868
e4f1bea2
MAL
869 if (p->pages->used == 0) {
870 return 0;
871 }
872
873 /* make sure that ramblock is 0 terminated */
874 packet->ramblock[255] = 0;
875 block = qemu_ram_block_by_name(packet->ramblock);
876 if (!block) {
877 error_setg(errp, "multifd: unknown ram block %s",
878 packet->ramblock);
879 return -1;
2a26c979
JQ
880 }
881
882 for (i = 0; i < p->pages->used; i++) {
ddac5cb2 883 uint64_t offset = be64_to_cpu(packet->offset[i]);
2a26c979
JQ
884
885 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
ddac5cb2 886 error_setg(errp, "multifd: offset too long %" PRIu64
2a26c979
JQ
887 " (max " RAM_ADDR_FMT ")",
888 offset, block->max_length);
889 return -1;
890 }
891 p->pages->iov[i].iov_base = block->host + offset;
892 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
893 }
894
895 return 0;
896}
897
f986c3d2
JQ
898struct {
899 MultiFDSendParams *params;
34c55a94
JQ
900 /* array of pages to sent */
901 MultiFDPages_t *pages;
6df264ac
JQ
902 /* global number of generated multifd packets */
903 uint64_t packet_num;
b9ee2f7d
JQ
904 /* send channels ready */
905 QemuSemaphore channels_ready;
4d65a621
JQ
906 /*
907 * Have we already run terminate threads. There is a race when it
908 * happens that we got one error while we are exiting.
909 * We will use atomic operations. Only valid values are 0 and 1.
910 */
911 int exiting;
f986c3d2
JQ
912} *multifd_send_state;
913
b9ee2f7d
JQ
914/*
915 * How we use multifd_send_state->pages and channel->pages?
916 *
917 * We create a pages for each channel, and a main one. Each time that
918 * we need to send a batch of pages we interchange the ones between
919 * multifd_send_state and the channel that is sending it. There are
920 * two reasons for that:
921 * - to not have to do so many mallocs during migration
922 * - to make easier to know what to free at the end of migration
923 *
924 * This way we always know who is the owner of each "pages" struct,
a5f7b1a6 925 * and we don't need any locking. It belongs to the migration thread
b9ee2f7d
JQ
926 * or to the channel thread. Switching is safe because the migration
927 * thread is using the channel mutex when changing it, and the channel
928 * have to had finish with its own, otherwise pending_job can't be
929 * false.
930 */
931
1b81c974 932static int multifd_send_pages(RAMState *rs)
b9ee2f7d
JQ
933{
934 int i;
935 static int next_channel;
936 MultiFDSendParams *p = NULL; /* make happy gcc */
937 MultiFDPages_t *pages = multifd_send_state->pages;
938 uint64_t transferred;
939
4d65a621
JQ
940 if (atomic_read(&multifd_send_state->exiting)) {
941 return -1;
942 }
943
b9ee2f7d
JQ
944 qemu_sem_wait(&multifd_send_state->channels_ready);
945 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
946 p = &multifd_send_state->params[i];
947
948 qemu_mutex_lock(&p->mutex);
713f762a
IR
949 if (p->quit) {
950 error_report("%s: channel %d has already quit!", __func__, i);
951 qemu_mutex_unlock(&p->mutex);
952 return -1;
953 }
b9ee2f7d
JQ
954 if (!p->pending_job) {
955 p->pending_job++;
956 next_channel = (i + 1) % migrate_multifd_channels();
957 break;
958 }
959 qemu_mutex_unlock(&p->mutex);
960 }
eab54aa7
WY
961 assert(!p->pages->used);
962 assert(!p->pages->block);
b9ee2f7d
JQ
963
964 p->packet_num = multifd_send_state->packet_num++;
b9ee2f7d
JQ
965 multifd_send_state->pages = p->pages;
966 p->pages = pages;
4fcefd44 967 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
1b81c974 968 qemu_file_update_transfer(rs->f, transferred);
b9ee2f7d
JQ
969 ram_counters.multifd_bytes += transferred;
970 ram_counters.transferred += transferred;;
971 qemu_mutex_unlock(&p->mutex);
972 qemu_sem_post(&p->sem);
713f762a
IR
973
974 return 1;
b9ee2f7d
JQ
975}
976
1b81c974 977static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
b9ee2f7d
JQ
978{
979 MultiFDPages_t *pages = multifd_send_state->pages;
980
981 if (!pages->block) {
982 pages->block = block;
983 }
984
985 if (pages->block == block) {
986 pages->offset[pages->used] = offset;
987 pages->iov[pages->used].iov_base = block->host + offset;
988 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
989 pages->used++;
990
991 if (pages->used < pages->allocated) {
713f762a 992 return 1;
b9ee2f7d
JQ
993 }
994 }
995
1b81c974 996 if (multifd_send_pages(rs) < 0) {
713f762a
IR
997 return -1;
998 }
b9ee2f7d
JQ
999
1000 if (pages->block != block) {
1b81c974 1001 return multifd_queue_page(rs, block, offset);
b9ee2f7d 1002 }
713f762a
IR
1003
1004 return 1;
b9ee2f7d
JQ
1005}
1006
66770707 1007static void multifd_send_terminate_threads(Error *err)
f986c3d2
JQ
1008{
1009 int i;
1010
5558c91a
JQ
1011 trace_multifd_send_terminate_threads(err != NULL);
1012
7a169d74
JQ
1013 if (err) {
1014 MigrationState *s = migrate_get_current();
1015 migrate_set_error(s, err);
1016 if (s->state == MIGRATION_STATUS_SETUP ||
1017 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1018 s->state == MIGRATION_STATUS_DEVICE ||
1019 s->state == MIGRATION_STATUS_ACTIVE) {
1020 migrate_set_state(&s->state, s->state,
1021 MIGRATION_STATUS_FAILED);
1022 }
1023 }
1024
4d65a621
JQ
1025 /*
1026 * We don't want to exit each threads twice. Depending on where
1027 * we get the error, or if there are two independent errors in two
1028 * threads at the same time, we can end calling this function
1029 * twice.
1030 */
1031 if (atomic_xchg(&multifd_send_state->exiting, 1)) {
1032 return;
1033 }
1034
66770707 1035 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1036 MultiFDSendParams *p = &multifd_send_state->params[i];
1037
1038 qemu_mutex_lock(&p->mutex);
1039 p->quit = true;
1040 qemu_sem_post(&p->sem);
1041 qemu_mutex_unlock(&p->mutex);
1042 }
1043}
1044
1398b2e3 1045void multifd_save_cleanup(void)
f986c3d2
JQ
1046{
1047 int i;
f986c3d2
JQ
1048
1049 if (!migrate_use_multifd()) {
1398b2e3 1050 return;
f986c3d2 1051 }
66770707
JQ
1052 multifd_send_terminate_threads(NULL);
1053 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1054 MultiFDSendParams *p = &multifd_send_state->params[i];
1055
66770707
JQ
1056 if (p->running) {
1057 qemu_thread_join(&p->thread);
1058 }
9560a48e
JC
1059 }
1060 for (i = 0; i < migrate_multifd_channels(); i++) {
1061 MultiFDSendParams *p = &multifd_send_state->params[i];
1062
60df2d4a
JQ
1063 socket_send_channel_destroy(p->c);
1064 p->c = NULL;
f986c3d2
JQ
1065 qemu_mutex_destroy(&p->mutex);
1066 qemu_sem_destroy(&p->sem);
18cdcea3 1067 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1068 g_free(p->name);
1069 p->name = NULL;
34c55a94
JQ
1070 multifd_pages_clear(p->pages);
1071 p->pages = NULL;
2a26c979
JQ
1072 p->packet_len = 0;
1073 g_free(p->packet);
1074 p->packet = NULL;
f986c3d2 1075 }
b9ee2f7d 1076 qemu_sem_destroy(&multifd_send_state->channels_ready);
f986c3d2
JQ
1077 g_free(multifd_send_state->params);
1078 multifd_send_state->params = NULL;
34c55a94
JQ
1079 multifd_pages_clear(multifd_send_state->pages);
1080 multifd_send_state->pages = NULL;
f986c3d2
JQ
1081 g_free(multifd_send_state);
1082 multifd_send_state = NULL;
f986c3d2
JQ
1083}
1084
1b81c974 1085static void multifd_send_sync_main(RAMState *rs)
6df264ac
JQ
1086{
1087 int i;
1088
1089 if (!migrate_use_multifd()) {
1090 return;
1091 }
b9ee2f7d 1092 if (multifd_send_state->pages->used) {
1b81c974 1093 if (multifd_send_pages(rs) < 0) {
713f762a
IR
1094 error_report("%s: multifd_send_pages fail", __func__);
1095 return;
1096 }
b9ee2f7d 1097 }
6df264ac
JQ
1098 for (i = 0; i < migrate_multifd_channels(); i++) {
1099 MultiFDSendParams *p = &multifd_send_state->params[i];
1100
1101 trace_multifd_send_sync_main_signal(p->id);
1102
1103 qemu_mutex_lock(&p->mutex);
b9ee2f7d 1104
713f762a
IR
1105 if (p->quit) {
1106 error_report("%s: channel %d has already quit", __func__, i);
1107 qemu_mutex_unlock(&p->mutex);
1108 return;
1109 }
1110
b9ee2f7d 1111 p->packet_num = multifd_send_state->packet_num++;
6df264ac
JQ
1112 p->flags |= MULTIFD_FLAG_SYNC;
1113 p->pending_job++;
1b81c974 1114 qemu_file_update_transfer(rs->f, p->packet_len);
81507f6b
IR
1115 ram_counters.multifd_bytes += p->packet_len;
1116 ram_counters.transferred += p->packet_len;
6df264ac
JQ
1117 qemu_mutex_unlock(&p->mutex);
1118 qemu_sem_post(&p->sem);
1119 }
1120 for (i = 0; i < migrate_multifd_channels(); i++) {
1121 MultiFDSendParams *p = &multifd_send_state->params[i];
1122
1123 trace_multifd_send_sync_main_wait(p->id);
18cdcea3 1124 qemu_sem_wait(&p->sem_sync);
6df264ac
JQ
1125 }
1126 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1127}
1128
f986c3d2
JQ
1129static void *multifd_send_thread(void *opaque)
1130{
1131 MultiFDSendParams *p = opaque;
af8b7d2b 1132 Error *local_err = NULL;
a3ec6b7d
IR
1133 int ret = 0;
1134 uint32_t flags = 0;
af8b7d2b 1135
408ea6ae 1136 trace_multifd_send_thread_start(p->id);
74637e6f 1137 rcu_register_thread();
408ea6ae 1138
af8b7d2b 1139 if (multifd_send_initial_packet(p, &local_err) < 0) {
2f4aefd3 1140 ret = -1;
af8b7d2b
JQ
1141 goto out;
1142 }
408ea6ae
JQ
1143 /* initial packet */
1144 p->num_packets = 1;
f986c3d2
JQ
1145
1146 while (true) {
d82628e4 1147 qemu_sem_wait(&p->sem);
4d65a621
JQ
1148
1149 if (atomic_read(&multifd_send_state->exiting)) {
1150 break;
1151 }
f986c3d2 1152 qemu_mutex_lock(&p->mutex);
0beb5ed3
JQ
1153
1154 if (p->pending_job) {
1155 uint32_t used = p->pages->used;
1156 uint64_t packet_num = p->packet_num;
a3ec6b7d 1157 flags = p->flags;
0beb5ed3 1158
2a34ee59 1159 p->next_packet_size = used * qemu_target_page_size();
0beb5ed3
JQ
1160 multifd_send_fill_packet(p);
1161 p->flags = 0;
1162 p->num_packets++;
1163 p->num_pages += used;
eab54aa7
WY
1164 p->pages->used = 0;
1165 p->pages->block = NULL;
0beb5ed3
JQ
1166 qemu_mutex_unlock(&p->mutex);
1167
2a34ee59
JQ
1168 trace_multifd_send(p->id, packet_num, used, flags,
1169 p->next_packet_size);
0beb5ed3 1170
8b2db7f5
JQ
1171 ret = qio_channel_write_all(p->c, (void *)p->packet,
1172 p->packet_len, &local_err);
1173 if (ret != 0) {
1174 break;
1175 }
1176
ad24c7cb
JQ
1177 if (used) {
1178 ret = qio_channel_writev_all(p->c, p->pages->iov,
1179 used, &local_err);
1180 if (ret != 0) {
1181 break;
1182 }
8b2db7f5 1183 }
0beb5ed3
JQ
1184
1185 qemu_mutex_lock(&p->mutex);
1186 p->pending_job--;
1187 qemu_mutex_unlock(&p->mutex);
6df264ac
JQ
1188
1189 if (flags & MULTIFD_FLAG_SYNC) {
18cdcea3 1190 qemu_sem_post(&p->sem_sync);
6df264ac 1191 }
b9ee2f7d 1192 qemu_sem_post(&multifd_send_state->channels_ready);
0beb5ed3 1193 } else if (p->quit) {
f986c3d2
JQ
1194 qemu_mutex_unlock(&p->mutex);
1195 break;
6df264ac
JQ
1196 } else {
1197 qemu_mutex_unlock(&p->mutex);
1198 /* sometimes there are spurious wakeups */
f986c3d2 1199 }
f986c3d2
JQ
1200 }
1201
af8b7d2b
JQ
1202out:
1203 if (local_err) {
7dd59d01 1204 trace_multifd_send_error(p->id);
af8b7d2b
JQ
1205 multifd_send_terminate_threads(local_err);
1206 }
1207
a3ec6b7d
IR
1208 /*
1209 * Error happen, I will exit, but I can't just leave, tell
1210 * who pay attention to me.
1211 */
1212 if (ret != 0) {
2f4aefd3 1213 qemu_sem_post(&p->sem_sync);
a3ec6b7d
IR
1214 qemu_sem_post(&multifd_send_state->channels_ready);
1215 }
1216
66770707
JQ
1217 qemu_mutex_lock(&p->mutex);
1218 p->running = false;
1219 qemu_mutex_unlock(&p->mutex);
1220
74637e6f 1221 rcu_unregister_thread();
408ea6ae
JQ
1222 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1223
f986c3d2
JQ
1224 return NULL;
1225}
1226
60df2d4a
JQ
1227static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1228{
1229 MultiFDSendParams *p = opaque;
1230 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1231 Error *local_err = NULL;
1232
7dd59d01 1233 trace_multifd_new_send_channel_async(p->id);
60df2d4a 1234 if (qio_task_propagate_error(task, &local_err)) {
1398b2e3
FL
1235 migrate_set_error(migrate_get_current(), local_err);
1236 multifd_save_cleanup();
60df2d4a
JQ
1237 } else {
1238 p->c = QIO_CHANNEL(sioc);
1239 qio_channel_set_delay(p->c, false);
1240 p->running = true;
1241 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1242 QEMU_THREAD_JOINABLE);
60df2d4a
JQ
1243 }
1244}
1245
f986c3d2
JQ
1246int multifd_save_setup(void)
1247{
1248 int thread_count;
efd1a1d6 1249 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
f986c3d2
JQ
1250 uint8_t i;
1251
1252 if (!migrate_use_multifd()) {
1253 return 0;
1254 }
1255 thread_count = migrate_multifd_channels();
1256 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1257 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
34c55a94 1258 multifd_send_state->pages = multifd_pages_init(page_count);
b9ee2f7d 1259 qemu_sem_init(&multifd_send_state->channels_ready, 0);
4d65a621 1260 atomic_set(&multifd_send_state->exiting, 0);
34c55a94 1261
f986c3d2
JQ
1262 for (i = 0; i < thread_count; i++) {
1263 MultiFDSendParams *p = &multifd_send_state->params[i];
1264
1265 qemu_mutex_init(&p->mutex);
1266 qemu_sem_init(&p->sem, 0);
18cdcea3 1267 qemu_sem_init(&p->sem_sync, 0);
f986c3d2 1268 p->quit = false;
0beb5ed3 1269 p->pending_job = 0;
f986c3d2 1270 p->id = i;
34c55a94 1271 p->pages = multifd_pages_init(page_count);
2a26c979 1272 p->packet_len = sizeof(MultiFDPacket_t)
ddac5cb2 1273 + sizeof(uint64_t) * page_count;
2a26c979 1274 p->packet = g_malloc0(p->packet_len);
9985e1f4
WY
1275 p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
1276 p->packet->version = cpu_to_be32(MULTIFD_VERSION);
f986c3d2 1277 p->name = g_strdup_printf("multifdsend_%d", i);
60df2d4a 1278 socket_send_channel_create(multifd_new_send_channel_async, p);
f986c3d2
JQ
1279 }
1280 return 0;
1281}
1282
f986c3d2
JQ
1283struct {
1284 MultiFDRecvParams *params;
1285 /* number of created threads */
1286 int count;
6df264ac
JQ
1287 /* syncs main thread and channels */
1288 QemuSemaphore sem_sync;
1289 /* global number of generated multifd packets */
1290 uint64_t packet_num;
f986c3d2
JQ
1291} *multifd_recv_state;
1292
66770707 1293static void multifd_recv_terminate_threads(Error *err)
f986c3d2
JQ
1294{
1295 int i;
1296
5558c91a
JQ
1297 trace_multifd_recv_terminate_threads(err != NULL);
1298
7a169d74
JQ
1299 if (err) {
1300 MigrationState *s = migrate_get_current();
1301 migrate_set_error(s, err);
1302 if (s->state == MIGRATION_STATUS_SETUP ||
1303 s->state == MIGRATION_STATUS_ACTIVE) {
1304 migrate_set_state(&s->state, s->state,
1305 MIGRATION_STATUS_FAILED);
1306 }
1307 }
1308
66770707 1309 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1310 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1311
1312 qemu_mutex_lock(&p->mutex);
3c3ca25d 1313 p->quit = true;
7a5cc33c
JQ
1314 /* We could arrive here for two reasons:
1315 - normal quit, i.e. everything went fine, just finished
1316 - error quit: We close the channels so the channel threads
1317 finish the qio_channel_read_all_eof() */
f76e32eb
JC
1318 if (p->c) {
1319 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1320 }
f986c3d2
JQ
1321 qemu_mutex_unlock(&p->mutex);
1322 }
1323}
1324
1325int multifd_load_cleanup(Error **errp)
1326{
1327 int i;
1328 int ret = 0;
1329
1330 if (!migrate_use_multifd()) {
1331 return 0;
1332 }
66770707
JQ
1333 multifd_recv_terminate_threads(NULL);
1334 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1335 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1336
66770707 1337 if (p->running) {
3c3ca25d 1338 p->quit = true;
f193bc0c
IR
1339 /*
1340 * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1341 * however try to wakeup it without harm in cleanup phase.
1342 */
1343 qemu_sem_post(&p->sem_sync);
66770707
JQ
1344 qemu_thread_join(&p->thread);
1345 }
9560a48e
JC
1346 }
1347 for (i = 0; i < migrate_multifd_channels(); i++) {
1348 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1349
60df2d4a
JQ
1350 object_unref(OBJECT(p->c));
1351 p->c = NULL;
f986c3d2 1352 qemu_mutex_destroy(&p->mutex);
6df264ac 1353 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1354 g_free(p->name);
1355 p->name = NULL;
34c55a94
JQ
1356 multifd_pages_clear(p->pages);
1357 p->pages = NULL;
2a26c979
JQ
1358 p->packet_len = 0;
1359 g_free(p->packet);
1360 p->packet = NULL;
f986c3d2 1361 }
6df264ac 1362 qemu_sem_destroy(&multifd_recv_state->sem_sync);
f986c3d2
JQ
1363 g_free(multifd_recv_state->params);
1364 multifd_recv_state->params = NULL;
1365 g_free(multifd_recv_state);
1366 multifd_recv_state = NULL;
1367
1368 return ret;
1369}
1370
6df264ac
JQ
1371static void multifd_recv_sync_main(void)
1372{
1373 int i;
1374
1375 if (!migrate_use_multifd()) {
1376 return;
1377 }
1378 for (i = 0; i < migrate_multifd_channels(); i++) {
1379 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1380
6df264ac
JQ
1381 trace_multifd_recv_sync_main_wait(p->id);
1382 qemu_sem_wait(&multifd_recv_state->sem_sync);
77568ea7
WY
1383 }
1384 for (i = 0; i < migrate_multifd_channels(); i++) {
1385 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1386
6df264ac
JQ
1387 qemu_mutex_lock(&p->mutex);
1388 if (multifd_recv_state->packet_num < p->packet_num) {
1389 multifd_recv_state->packet_num = p->packet_num;
1390 }
1391 qemu_mutex_unlock(&p->mutex);
6df264ac 1392 trace_multifd_recv_sync_main_signal(p->id);
6df264ac
JQ
1393 qemu_sem_post(&p->sem_sync);
1394 }
1395 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1396}
1397
f986c3d2
JQ
1398static void *multifd_recv_thread(void *opaque)
1399{
1400 MultiFDRecvParams *p = opaque;
2a26c979
JQ
1401 Error *local_err = NULL;
1402 int ret;
f986c3d2 1403
408ea6ae 1404 trace_multifd_recv_thread_start(p->id);
74637e6f 1405 rcu_register_thread();
408ea6ae 1406
f986c3d2 1407 while (true) {
6df264ac
JQ
1408 uint32_t used;
1409 uint32_t flags;
0beb5ed3 1410
3c3ca25d
JQ
1411 if (p->quit) {
1412 break;
1413 }
1414
8b2db7f5
JQ
1415 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1416 p->packet_len, &local_err);
1417 if (ret == 0) { /* EOF */
1418 break;
1419 }
1420 if (ret == -1) { /* Error */
1421 break;
1422 }
2a26c979 1423
6df264ac
JQ
1424 qemu_mutex_lock(&p->mutex);
1425 ret = multifd_recv_unfill_packet(p, &local_err);
1426 if (ret) {
f986c3d2
JQ
1427 qemu_mutex_unlock(&p->mutex);
1428 break;
1429 }
6df264ac
JQ
1430
1431 used = p->pages->used;
1432 flags = p->flags;
2a34ee59
JQ
1433 trace_multifd_recv(p->id, p->packet_num, used, flags,
1434 p->next_packet_size);
6df264ac
JQ
1435 p->num_packets++;
1436 p->num_pages += used;
f986c3d2 1437 qemu_mutex_unlock(&p->mutex);
6df264ac 1438
ad24c7cb
JQ
1439 if (used) {
1440 ret = qio_channel_readv_all(p->c, p->pages->iov,
1441 used, &local_err);
1442 if (ret != 0) {
1443 break;
1444 }
8b2db7f5
JQ
1445 }
1446
6df264ac
JQ
1447 if (flags & MULTIFD_FLAG_SYNC) {
1448 qemu_sem_post(&multifd_recv_state->sem_sync);
1449 qemu_sem_wait(&p->sem_sync);
1450 }
f986c3d2
JQ
1451 }
1452
d82628e4
JQ
1453 if (local_err) {
1454 multifd_recv_terminate_threads(local_err);
1455 }
66770707
JQ
1456 qemu_mutex_lock(&p->mutex);
1457 p->running = false;
1458 qemu_mutex_unlock(&p->mutex);
1459
74637e6f 1460 rcu_unregister_thread();
408ea6ae
JQ
1461 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1462
f986c3d2
JQ
1463 return NULL;
1464}
1465
1466int multifd_load_setup(void)
1467{
1468 int thread_count;
efd1a1d6 1469 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
f986c3d2
JQ
1470 uint8_t i;
1471
1472 if (!migrate_use_multifd()) {
1473 return 0;
1474 }
1475 thread_count = migrate_multifd_channels();
1476 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1477 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
66770707 1478 atomic_set(&multifd_recv_state->count, 0);
6df264ac 1479 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
34c55a94 1480
f986c3d2
JQ
1481 for (i = 0; i < thread_count; i++) {
1482 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1483
1484 qemu_mutex_init(&p->mutex);
6df264ac 1485 qemu_sem_init(&p->sem_sync, 0);
3c3ca25d 1486 p->quit = false;
f986c3d2 1487 p->id = i;
34c55a94 1488 p->pages = multifd_pages_init(page_count);
2a26c979 1489 p->packet_len = sizeof(MultiFDPacket_t)
ddac5cb2 1490 + sizeof(uint64_t) * page_count;
2a26c979 1491 p->packet = g_malloc0(p->packet_len);
f986c3d2 1492 p->name = g_strdup_printf("multifdrecv_%d", i);
f986c3d2
JQ
1493 }
1494 return 0;
1495}
1496
62c1e0ca
JQ
1497bool multifd_recv_all_channels_created(void)
1498{
1499 int thread_count = migrate_multifd_channels();
1500
1501 if (!migrate_use_multifd()) {
1502 return true;
1503 }
1504
1505 return thread_count == atomic_read(&multifd_recv_state->count);
1506}
1507
49ed0d24
FL
1508/*
1509 * Try to receive all multifd channels to get ready for the migration.
1510 * - Return true and do not set @errp when correctly receving all channels;
1511 * - Return false and do not set @errp when correctly receiving the current one;
1512 * - Return false and set @errp when failing to receive the current channel.
1513 */
1514bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
71bb07db 1515{
60df2d4a 1516 MultiFDRecvParams *p;
af8b7d2b
JQ
1517 Error *local_err = NULL;
1518 int id;
60df2d4a 1519
af8b7d2b
JQ
1520 id = multifd_recv_initial_packet(ioc, &local_err);
1521 if (id < 0) {
1522 multifd_recv_terminate_threads(local_err);
49ed0d24
FL
1523 error_propagate_prepend(errp, local_err,
1524 "failed to receive packet"
1525 " via multifd channel %d: ",
1526 atomic_read(&multifd_recv_state->count));
81e62053 1527 return false;
af8b7d2b 1528 }
7dd59d01 1529 trace_multifd_recv_new_channel(id);
af8b7d2b
JQ
1530
1531 p = &multifd_recv_state->params[id];
1532 if (p->c != NULL) {
1533 error_setg(&local_err, "multifd: received id '%d' already setup'",
1534 id);
1535 multifd_recv_terminate_threads(local_err);
49ed0d24 1536 error_propagate(errp, local_err);
81e62053 1537 return false;
af8b7d2b 1538 }
60df2d4a
JQ
1539 p->c = ioc;
1540 object_ref(OBJECT(ioc));
408ea6ae
JQ
1541 /* initial packet */
1542 p->num_packets = 1;
60df2d4a
JQ
1543
1544 p->running = true;
1545 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1546 QEMU_THREAD_JOINABLE);
1547 atomic_inc(&multifd_recv_state->count);
49ed0d24
FL
1548 return atomic_read(&multifd_recv_state->count) ==
1549 migrate_multifd_channels();
71bb07db
JQ
1550}
1551
56e93d26 1552/**
3d0684b2 1553 * save_page_header: write page header to wire
56e93d26
JQ
1554 *
1555 * If this is the 1st block, it also writes the block identification
1556 *
3d0684b2 1557 * Returns the number of bytes written
56e93d26
JQ
1558 *
1559 * @f: QEMUFile where to send the data
1560 * @block: block that contains the page we want to send
1561 * @offset: offset inside the block for the page
1562 * in the lower bits, it contains flags
1563 */
2bf3aa85
JQ
1564static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1565 ram_addr_t offset)
56e93d26 1566{
9f5f380b 1567 size_t size, len;
56e93d26 1568
24795694
JQ
1569 if (block == rs->last_sent_block) {
1570 offset |= RAM_SAVE_FLAG_CONTINUE;
1571 }
2bf3aa85 1572 qemu_put_be64(f, offset);
56e93d26
JQ
1573 size = 8;
1574
1575 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 1576 len = strlen(block->idstr);
2bf3aa85
JQ
1577 qemu_put_byte(f, len);
1578 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 1579 size += 1 + len;
24795694 1580 rs->last_sent_block = block;
56e93d26
JQ
1581 }
1582 return size;
1583}
1584
3d0684b2
JQ
1585/**
1586 * mig_throttle_guest_down: throotle down the guest
1587 *
1588 * Reduce amount of guest cpu execution to hopefully slow down memory
1589 * writes. If guest dirty memory rate is reduced below the rate at
1590 * which we can transfer pages to the destination then we should be
1591 * able to complete migration. Some workloads dirty memory way too
1592 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
1593 */
1594static void mig_throttle_guest_down(void)
1595{
1596 MigrationState *s = migrate_get_current();
2594f56d
DB
1597 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1598 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
4cbc9c7f 1599 int pct_max = s->parameters.max_cpu_throttle;
070afca2
JH
1600
1601 /* We have not started throttling yet. Let's start it. */
1602 if (!cpu_throttle_active()) {
1603 cpu_throttle_set(pct_initial);
1604 } else {
1605 /* Throttling already on, just increase the rate */
4cbc9c7f
LQ
1606 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1607 pct_max));
070afca2
JH
1608 }
1609}
1610
3d0684b2
JQ
1611/**
1612 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1613 *
6f37bb8b 1614 * @rs: current RAM state
3d0684b2
JQ
1615 * @current_addr: address for the zero page
1616 *
1617 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
1618 * The important thing is that a stale (not-yet-0'd) page be replaced
1619 * by the new data.
1620 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 1621 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 1622 */
6f37bb8b 1623static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 1624{
6f37bb8b 1625 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
1626 return;
1627 }
1628
1629 /* We don't care if this fails to allocate a new cache page
1630 * as long as it updated an old one */
c00e0928 1631 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 1632 ram_counters.dirty_sync_count);
56e93d26
JQ
1633}
1634
1635#define ENCODING_FLAG_XBZRLE 0x1
1636
1637/**
1638 * save_xbzrle_page: compress and send current page
1639 *
1640 * Returns: 1 means that we wrote the page
1641 * 0 means that page is identical to the one already sent
1642 * -1 means that xbzrle would be longer than normal
1643 *
5a987738 1644 * @rs: current RAM state
3d0684b2
JQ
1645 * @current_data: pointer to the address of the page contents
1646 * @current_addr: addr of the page
56e93d26
JQ
1647 * @block: block that contains the page we want to send
1648 * @offset: offset inside the block for the page
1649 * @last_stage: if we are at the completion stage
56e93d26 1650 */
204b88b8 1651static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 1652 ram_addr_t current_addr, RAMBlock *block,
072c2511 1653 ram_addr_t offset, bool last_stage)
56e93d26
JQ
1654{
1655 int encoded_len = 0, bytes_xbzrle;
1656 uint8_t *prev_cached_page;
1657
9360447d
JQ
1658 if (!cache_is_cached(XBZRLE.cache, current_addr,
1659 ram_counters.dirty_sync_count)) {
1660 xbzrle_counters.cache_miss++;
56e93d26
JQ
1661 if (!last_stage) {
1662 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 1663 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
1664 return -1;
1665 } else {
1666 /* update *current_data when the page has been
1667 inserted into cache */
1668 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1669 }
1670 }
1671 return -1;
1672 }
1673
1674 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1675
1676 /* save current buffer into memory */
1677 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1678
1679 /* XBZRLE encoding (if there is no overflow) */
1680 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1681 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1682 TARGET_PAGE_SIZE);
ca353803
WY
1683
1684 /*
1685 * Update the cache contents, so that it corresponds to the data
1686 * sent, in all cases except where we skip the page.
1687 */
1688 if (!last_stage && encoded_len != 0) {
1689 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1690 /*
1691 * In the case where we couldn't compress, ensure that the caller
1692 * sends the data from the cache, since the guest might have
1693 * changed the RAM since we copied it.
1694 */
1695 *current_data = prev_cached_page;
1696 }
1697
56e93d26 1698 if (encoded_len == 0) {
55c4446b 1699 trace_save_xbzrle_page_skipping();
56e93d26
JQ
1700 return 0;
1701 } else if (encoded_len == -1) {
55c4446b 1702 trace_save_xbzrle_page_overflow();
9360447d 1703 xbzrle_counters.overflow++;
56e93d26
JQ
1704 return -1;
1705 }
1706
56e93d26 1707 /* Send XBZRLE based compressed page */
2bf3aa85 1708 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
1709 offset | RAM_SAVE_FLAG_XBZRLE);
1710 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1711 qemu_put_be16(rs->f, encoded_len);
1712 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 1713 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
1714 xbzrle_counters.pages++;
1715 xbzrle_counters.bytes += bytes_xbzrle;
1716 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
1717
1718 return 1;
1719}
1720
3d0684b2
JQ
1721/**
1722 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 1723 *
a5f7b1a6 1724 * Returns the page offset within memory region of the start of a dirty page
3d0684b2 1725 *
6f37bb8b 1726 * @rs: current RAM state
3d0684b2 1727 * @rb: RAMBlock where to search for dirty pages
a935e30f 1728 * @start: page where we start the search
f3f491fc 1729 */
56e93d26 1730static inline
a935e30f 1731unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 1732 unsigned long start)
56e93d26 1733{
6b6712ef
JQ
1734 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1735 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
1736 unsigned long next;
1737
fbd162e6 1738 if (ramblock_is_ignored(rb)) {
b895de50
CLG
1739 return size;
1740 }
1741
6eeb63f7
WW
1742 /*
1743 * When the free page optimization is enabled, we need to check the bitmap
1744 * to send the non-free pages rather than all the pages in the bulk stage.
1745 */
1746 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
6b6712ef 1747 next = start + 1;
56e93d26 1748 } else {
6b6712ef 1749 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
1750 }
1751
6b6712ef 1752 return next;
56e93d26
JQ
1753}
1754
06b10688 1755static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
1756 RAMBlock *rb,
1757 unsigned long page)
a82d593b
DDAG
1758{
1759 bool ret;
a82d593b 1760
386a907b 1761 qemu_mutex_lock(&rs->bitmap_mutex);
002cad6b
PX
1762
1763 /*
1764 * Clear dirty bitmap if needed. This _must_ be called before we
1765 * send any of the page in the chunk because we need to make sure
1766 * we can capture further page content changes when we sync dirty
1767 * log the next time. So as long as we are going to send any of
1768 * the page in the chunk we clear the remote dirty bitmap for all.
1769 * Clearing it earlier won't be a problem, but too late will.
1770 */
1771 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1772 uint8_t shift = rb->clear_bmap_shift;
1773 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
8bba004c 1774 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
002cad6b
PX
1775
1776 /*
1777 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1778 * can make things easier sometimes since then start address
1779 * of the small chunk will always be 64 pages aligned so the
1780 * bitmap will always be aligned to unsigned long. We should
1781 * even be able to remove this restriction but I'm simply
1782 * keeping it.
1783 */
1784 assert(shift >= 6);
1785 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1786 memory_region_clear_dirty_bitmap(rb->mr, start, size);
1787 }
1788
6b6712ef 1789 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
1790
1791 if (ret) {
0d8ec885 1792 rs->migration_dirty_pages--;
a82d593b 1793 }
386a907b
WW
1794 qemu_mutex_unlock(&rs->bitmap_mutex);
1795
a82d593b
DDAG
1796 return ret;
1797}
1798
267691b6 1799/* Called with RCU critical section */
7a3e9571 1800static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
56e93d26 1801{
0d8ec885 1802 rs->migration_dirty_pages +=
5d0980a4 1803 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
0d8ec885 1804 &rs->num_dirty_pages_period);
56e93d26
JQ
1805}
1806
3d0684b2
JQ
1807/**
1808 * ram_pagesize_summary: calculate all the pagesizes of a VM
1809 *
1810 * Returns a summary bitmap of the page sizes of all RAMBlocks
1811 *
1812 * For VMs with just normal pages this is equivalent to the host page
1813 * size. If it's got some huge pages then it's the OR of all the
1814 * different page sizes.
e8ca1db2
DDAG
1815 */
1816uint64_t ram_pagesize_summary(void)
1817{
1818 RAMBlock *block;
1819 uint64_t summary = 0;
1820
fbd162e6 1821 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1822 summary |= block->page_size;
1823 }
1824
1825 return summary;
1826}
1827
aecbfe9c
XG
1828uint64_t ram_get_total_transferred_pages(void)
1829{
1830 return ram_counters.normal + ram_counters.duplicate +
1831 compression_counters.pages + xbzrle_counters.pages;
1832}
1833
b734035b
XG
1834static void migration_update_rates(RAMState *rs, int64_t end_time)
1835{
be8b02ed 1836 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1837 double compressed_size;
b734035b
XG
1838
1839 /* calculate period counters */
1840 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1841 / (end_time - rs->time_last_bitmap_sync);
1842
be8b02ed 1843 if (!page_count) {
b734035b
XG
1844 return;
1845 }
1846
1847 if (migrate_use_xbzrle()) {
1848 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1849 rs->xbzrle_cache_miss_prev) / page_count;
b734035b
XG
1850 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1851 }
76e03000
XG
1852
1853 if (migrate_use_compression()) {
1854 compression_counters.busy_rate = (double)(compression_counters.busy -
1855 rs->compress_thread_busy_prev) / page_count;
1856 rs->compress_thread_busy_prev = compression_counters.busy;
1857
1858 compressed_size = compression_counters.compressed_size -
1859 rs->compressed_size_prev;
1860 if (compressed_size) {
1861 double uncompressed_size = (compression_counters.pages -
1862 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1863
1864 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1865 compression_counters.compression_rate =
1866 uncompressed_size / compressed_size;
1867
1868 rs->compress_pages_prev = compression_counters.pages;
1869 rs->compressed_size_prev = compression_counters.compressed_size;
1870 }
1871 }
b734035b
XG
1872}
1873
8d820d6f 1874static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1875{
1876 RAMBlock *block;
56e93d26 1877 int64_t end_time;
c4bdf0cf 1878 uint64_t bytes_xfer_now;
56e93d26 1879
9360447d 1880 ram_counters.dirty_sync_count++;
56e93d26 1881
f664da80
JQ
1882 if (!rs->time_last_bitmap_sync) {
1883 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1884 }
1885
1886 trace_migration_bitmap_sync_start();
9c1f8f44 1887 memory_global_dirty_log_sync();
56e93d26 1888
108cfae0 1889 qemu_mutex_lock(&rs->bitmap_mutex);
89ac5a1d
DDAG
1890 WITH_RCU_READ_LOCK_GUARD() {
1891 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1892 ramblock_sync_dirty_bitmap(rs, block);
1893 }
1894 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1895 }
108cfae0 1896 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1897
9458a9a1 1898 memory_global_after_dirty_log_sync();
a66cd90c 1899 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1900
56e93d26
JQ
1901 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1902
1903 /* more than 1 second = 1000 millisecons */
f664da80 1904 if (end_time > rs->time_last_bitmap_sync + 1000) {
9360447d 1905 bytes_xfer_now = ram_counters.transferred;
d693c6f1 1906
9ac78b61
PL
1907 /* During block migration the auto-converge logic incorrectly detects
1908 * that ram migration makes no progress. Avoid this by disabling the
1909 * throttling logic during the bulk phase of block migration. */
1910 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
56e93d26
JQ
1911 /* The following detection logic can be refined later. For now:
1912 Check to see if the dirtied bytes is 50% more than the approx.
1913 amount of bytes that just got transferred since the last time we
070afca2
JH
1914 were in this routine. If that happens twice, start or increase
1915 throttling */
070afca2 1916
d693c6f1 1917 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 1918 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 1919 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 1920 trace_migration_throttle();
8d820d6f 1921 rs->dirty_rate_high_cnt = 0;
070afca2 1922 mig_throttle_guest_down();
d693c6f1 1923 }
56e93d26 1924 }
070afca2 1925
b734035b
XG
1926 migration_update_rates(rs, end_time);
1927
be8b02ed 1928 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1929
1930 /* reset period counters */
f664da80 1931 rs->time_last_bitmap_sync = end_time;
a66cd90c 1932 rs->num_dirty_pages_period = 0;
d2a4d85a 1933 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 1934 }
4addcd4f 1935 if (migrate_use_events()) {
3ab72385 1936 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1937 }
56e93d26
JQ
1938}
1939
bd227060
WW
1940static void migration_bitmap_sync_precopy(RAMState *rs)
1941{
1942 Error *local_err = NULL;
1943
1944 /*
1945 * The current notifier usage is just an optimization to migration, so we
1946 * don't stop the normal migration process in the error case.
1947 */
1948 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1949 error_report_err(local_err);
1950 }
1951
1952 migration_bitmap_sync(rs);
1953
1954 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1955 error_report_err(local_err);
1956 }
1957}
1958
6c97ec5f
XG
1959/**
1960 * save_zero_page_to_file: send the zero page to the file
1961 *
1962 * Returns the size of data written to the file, 0 means the page is not
1963 * a zero page
1964 *
1965 * @rs: current RAM state
1966 * @file: the file where the data is saved
1967 * @block: block that contains the page we want to send
1968 * @offset: offset inside the block for the page
1969 */
1970static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1971 RAMBlock *block, ram_addr_t offset)
1972{
1973 uint8_t *p = block->host + offset;
1974 int len = 0;
1975
1976 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1977 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1978 qemu_put_byte(file, 0);
1979 len += 1;
1980 }
1981 return len;
1982}
1983
56e93d26 1984/**
3d0684b2 1985 * save_zero_page: send the zero page to the stream
56e93d26 1986 *
3d0684b2 1987 * Returns the number of pages written.
56e93d26 1988 *
f7ccd61b 1989 * @rs: current RAM state
56e93d26
JQ
1990 * @block: block that contains the page we want to send
1991 * @offset: offset inside the block for the page
56e93d26 1992 */
7faccdc3 1993static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1994{
6c97ec5f 1995 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1996
6c97ec5f 1997 if (len) {
9360447d 1998 ram_counters.duplicate++;
6c97ec5f
XG
1999 ram_counters.transferred += len;
2000 return 1;
56e93d26 2001 }
6c97ec5f 2002 return -1;
56e93d26
JQ
2003}
2004
5727309d 2005static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 2006{
5727309d 2007 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
2008 return;
2009 }
2010
8bba004c 2011 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
53f09a10
PB
2012}
2013
059ff0fb
XG
2014/*
2015 * @pages: the number of pages written by the control path,
2016 * < 0 - error
2017 * > 0 - number of pages written
2018 *
2019 * Return true if the pages has been saved, otherwise false is returned.
2020 */
2021static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2022 int *pages)
2023{
2024 uint64_t bytes_xmit = 0;
2025 int ret;
2026
2027 *pages = -1;
2028 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
2029 &bytes_xmit);
2030 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
2031 return false;
2032 }
2033
2034 if (bytes_xmit) {
2035 ram_counters.transferred += bytes_xmit;
2036 *pages = 1;
2037 }
2038
2039 if (ret == RAM_SAVE_CONTROL_DELAYED) {
2040 return true;
2041 }
2042
2043 if (bytes_xmit > 0) {
2044 ram_counters.normal++;
2045 } else if (bytes_xmit == 0) {
2046 ram_counters.duplicate++;
2047 }
2048
2049 return true;
2050}
2051
65dacaa0
XG
2052/*
2053 * directly send the page to the stream
2054 *
2055 * Returns the number of pages written.
2056 *
2057 * @rs: current RAM state
2058 * @block: block that contains the page we want to send
2059 * @offset: offset inside the block for the page
2060 * @buf: the page to be sent
2061 * @async: send to page asyncly
2062 */
2063static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2064 uint8_t *buf, bool async)
2065{
2066 ram_counters.transferred += save_page_header(rs, rs->f, block,
2067 offset | RAM_SAVE_FLAG_PAGE);
2068 if (async) {
2069 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2070 migrate_release_ram() &
2071 migration_in_postcopy());
2072 } else {
2073 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2074 }
2075 ram_counters.transferred += TARGET_PAGE_SIZE;
2076 ram_counters.normal++;
2077 return 1;
2078}
2079
56e93d26 2080/**
3d0684b2 2081 * ram_save_page: send the given page to the stream
56e93d26 2082 *
3d0684b2 2083 * Returns the number of pages written.
3fd3c4b3
DDAG
2084 * < 0 - error
2085 * >=0 - Number of pages written - this might legally be 0
2086 * if xbzrle noticed the page was the same.
56e93d26 2087 *
6f37bb8b 2088 * @rs: current RAM state
56e93d26
JQ
2089 * @block: block that contains the page we want to send
2090 * @offset: offset inside the block for the page
2091 * @last_stage: if we are at the completion stage
56e93d26 2092 */
a0a8aa14 2093static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
2094{
2095 int pages = -1;
56e93d26 2096 uint8_t *p;
56e93d26 2097 bool send_async = true;
a08f6890 2098 RAMBlock *block = pss->block;
8bba004c 2099 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
059ff0fb 2100 ram_addr_t current_addr = block->offset + offset;
56e93d26 2101
2f68e399 2102 p = block->host + offset;
1db9d8e5 2103 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 2104
56e93d26 2105 XBZRLE_cache_lock();
d7400a34
XG
2106 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2107 migrate_use_xbzrle()) {
059ff0fb
XG
2108 pages = save_xbzrle_page(rs, &p, current_addr, block,
2109 offset, last_stage);
2110 if (!last_stage) {
2111 /* Can't send this cached data async, since the cache page
2112 * might get updated before it gets to the wire
56e93d26 2113 */
059ff0fb 2114 send_async = false;
56e93d26
JQ
2115 }
2116 }
2117
2118 /* XBZRLE overflow or normal page */
2119 if (pages == -1) {
65dacaa0 2120 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
2121 }
2122
2123 XBZRLE_cache_unlock();
2124
2125 return pages;
2126}
2127
b9ee2f7d
JQ
2128static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2129 ram_addr_t offset)
2130{
1b81c974 2131 if (multifd_queue_page(rs, block, offset) < 0) {
713f762a
IR
2132 return -1;
2133 }
b9ee2f7d
JQ
2134 ram_counters.normal++;
2135
2136 return 1;
2137}
2138
5e5fdcff 2139static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 2140 ram_addr_t offset, uint8_t *source_buf)
56e93d26 2141{
53518d94 2142 RAMState *rs = ram_state;
a7a9a88f 2143 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 2144 bool zero_page = false;
6ef3771c 2145 int ret;
56e93d26 2146
5e5fdcff
XG
2147 if (save_zero_page_to_file(rs, f, block, offset)) {
2148 zero_page = true;
2149 goto exit;
2150 }
2151
6ef3771c 2152 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
2153
2154 /*
2155 * copy it to a internal buffer to avoid it being modified by VM
2156 * so that we can catch up the error during compression and
2157 * decompression
2158 */
2159 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
2160 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2161 if (ret < 0) {
2162 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 2163 error_report("compressed data failed!");
5e5fdcff 2164 return false;
b3be2896 2165 }
56e93d26 2166
5e5fdcff 2167exit:
6ef3771c 2168 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
2169 return zero_page;
2170}
2171
2172static void
2173update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2174{
76e03000
XG
2175 ram_counters.transferred += bytes_xmit;
2176
5e5fdcff
XG
2177 if (param->zero_page) {
2178 ram_counters.duplicate++;
76e03000 2179 return;
5e5fdcff 2180 }
76e03000
XG
2181
2182 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2183 compression_counters.compressed_size += bytes_xmit - 8;
2184 compression_counters.pages++;
56e93d26
JQ
2185}
2186
32b05495
XG
2187static bool save_page_use_compression(RAMState *rs);
2188
ce25d337 2189static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
2190{
2191 int idx, len, thread_count;
2192
32b05495 2193 if (!save_page_use_compression(rs)) {
56e93d26
JQ
2194 return;
2195 }
2196 thread_count = migrate_compress_threads();
a7a9a88f 2197
0d9f9a5c 2198 qemu_mutex_lock(&comp_done_lock);
56e93d26 2199 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 2200 while (!comp_param[idx].done) {
0d9f9a5c 2201 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 2202 }
a7a9a88f 2203 }
0d9f9a5c 2204 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
2205
2206 for (idx = 0; idx < thread_count; idx++) {
2207 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 2208 if (!comp_param[idx].quit) {
ce25d337 2209 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
2210 /*
2211 * it's safe to fetch zero_page without holding comp_done_lock
2212 * as there is no further request submitted to the thread,
2213 * i.e, the thread should be waiting for a request at this point.
2214 */
2215 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 2216 }
a7a9a88f 2217 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
2218 }
2219}
2220
2221static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2222 ram_addr_t offset)
2223{
2224 param->block = block;
2225 param->offset = offset;
2226}
2227
ce25d337
JQ
2228static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2229 ram_addr_t offset)
56e93d26
JQ
2230{
2231 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 2232 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
2233
2234 thread_count = migrate_compress_threads();
0d9f9a5c 2235 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
2236retry:
2237 for (idx = 0; idx < thread_count; idx++) {
2238 if (comp_param[idx].done) {
2239 comp_param[idx].done = false;
2240 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2241 qemu_mutex_lock(&comp_param[idx].mutex);
2242 set_compress_params(&comp_param[idx], block, offset);
2243 qemu_cond_signal(&comp_param[idx].cond);
2244 qemu_mutex_unlock(&comp_param[idx].mutex);
2245 pages = 1;
5e5fdcff 2246 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 2247 break;
56e93d26
JQ
2248 }
2249 }
1d58872a
XG
2250
2251 /*
2252 * wait for the free thread if the user specifies 'compress-wait-thread',
2253 * otherwise we will post the page out in the main thread as normal page.
2254 */
2255 if (pages < 0 && wait) {
2256 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2257 goto retry;
2258 }
0d9f9a5c 2259 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
2260
2261 return pages;
2262}
2263
3d0684b2
JQ
2264/**
2265 * find_dirty_block: find the next dirty page and update any state
2266 * associated with the search process.
b9e60928 2267 *
a5f7b1a6 2268 * Returns true if a page is found
b9e60928 2269 *
6f37bb8b 2270 * @rs: current RAM state
3d0684b2
JQ
2271 * @pss: data about the state of the current dirty page scan
2272 * @again: set to false if the search has scanned the whole of RAM
b9e60928 2273 */
f20e2865 2274static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 2275{
f20e2865 2276 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 2277 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 2278 pss->page >= rs->last_page) {
b9e60928
DDAG
2279 /*
2280 * We've been once around the RAM and haven't found anything.
2281 * Give up.
2282 */
2283 *again = false;
2284 return false;
2285 }
8bba004c
AR
2286 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
2287 >= pss->block->used_length) {
b9e60928 2288 /* Didn't find anything in this RAM Block */
a935e30f 2289 pss->page = 0;
b9e60928
DDAG
2290 pss->block = QLIST_NEXT_RCU(pss->block, next);
2291 if (!pss->block) {
48df9d80
XG
2292 /*
2293 * If memory migration starts over, we will meet a dirtied page
2294 * which may still exists in compression threads's ring, so we
2295 * should flush the compressed data to make sure the new page
2296 * is not overwritten by the old one in the destination.
2297 *
2298 * Also If xbzrle is on, stop using the data compression at this
2299 * point. In theory, xbzrle can do better than compression.
2300 */
2301 flush_compressed_data(rs);
2302
b9e60928
DDAG
2303 /* Hit the end of the list */
2304 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2305 /* Flag that we've looped */
2306 pss->complete_round = true;
6f37bb8b 2307 rs->ram_bulk_stage = false;
b9e60928
DDAG
2308 }
2309 /* Didn't find anything this time, but try again on the new block */
2310 *again = true;
2311 return false;
2312 } else {
2313 /* Can go around again, but... */
2314 *again = true;
2315 /* We've found something so probably don't need to */
2316 return true;
2317 }
2318}
2319
3d0684b2
JQ
2320/**
2321 * unqueue_page: gets a page of the queue
2322 *
a82d593b 2323 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 2324 *
3d0684b2
JQ
2325 * Returns the block of the page (or NULL if none available)
2326 *
ec481c6c 2327 * @rs: current RAM state
3d0684b2 2328 * @offset: used to return the offset within the RAMBlock
a82d593b 2329 */
f20e2865 2330static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
2331{
2332 RAMBlock *block = NULL;
2333
ae526e32
XG
2334 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2335 return NULL;
2336 }
2337
ec481c6c
JQ
2338 qemu_mutex_lock(&rs->src_page_req_mutex);
2339 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2340 struct RAMSrcPageRequest *entry =
2341 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
2342 block = entry->rb;
2343 *offset = entry->offset;
a82d593b
DDAG
2344
2345 if (entry->len > TARGET_PAGE_SIZE) {
2346 entry->len -= TARGET_PAGE_SIZE;
2347 entry->offset += TARGET_PAGE_SIZE;
2348 } else {
2349 memory_region_unref(block->mr);
ec481c6c 2350 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 2351 g_free(entry);
e03a34f8 2352 migration_consume_urgent_request();
a82d593b
DDAG
2353 }
2354 }
ec481c6c 2355 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
2356
2357 return block;
2358}
2359
3d0684b2 2360/**
ff1543af 2361 * get_queued_page: unqueue a page from the postcopy requests
3d0684b2
JQ
2362 *
2363 * Skips pages that are already sent (!dirty)
a82d593b 2364 *
a5f7b1a6 2365 * Returns true if a queued page is found
a82d593b 2366 *
6f37bb8b 2367 * @rs: current RAM state
3d0684b2 2368 * @pss: data about the state of the current dirty page scan
a82d593b 2369 */
f20e2865 2370static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2371{
2372 RAMBlock *block;
2373 ram_addr_t offset;
2374 bool dirty;
2375
2376 do {
f20e2865 2377 block = unqueue_page(rs, &offset);
a82d593b
DDAG
2378 /*
2379 * We're sending this page, and since it's postcopy nothing else
2380 * will dirty it, and we must make sure it doesn't get sent again
2381 * even if this queue request was received after the background
2382 * search already sent it.
2383 */
2384 if (block) {
f20e2865
JQ
2385 unsigned long page;
2386
6b6712ef
JQ
2387 page = offset >> TARGET_PAGE_BITS;
2388 dirty = test_bit(page, block->bmap);
a82d593b 2389 if (!dirty) {
06b10688 2390 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
64737606 2391 page);
a82d593b 2392 } else {
f20e2865 2393 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
2394 }
2395 }
2396
2397 } while (block && !dirty);
2398
2399 if (block) {
2400 /*
2401 * As soon as we start servicing pages out of order, then we have
2402 * to kill the bulk stage, since the bulk stage assumes
2403 * in (migration_bitmap_find_and_reset_dirty) that every page is
2404 * dirty, that's no longer true.
2405 */
6f37bb8b 2406 rs->ram_bulk_stage = false;
a82d593b
DDAG
2407
2408 /*
2409 * We want the background search to continue from the queued page
2410 * since the guest is likely to want other pages near to the page
2411 * it just requested.
2412 */
2413 pss->block = block;
a935e30f 2414 pss->page = offset >> TARGET_PAGE_BITS;
422314e7
WY
2415
2416 /*
2417 * This unqueued page would break the "one round" check, even is
2418 * really rare.
2419 */
2420 pss->complete_round = false;
a82d593b
DDAG
2421 }
2422
2423 return !!block;
2424}
2425
6c595cde 2426/**
5e58f968
JQ
2427 * migration_page_queue_free: drop any remaining pages in the ram
2428 * request queue
6c595cde 2429 *
3d0684b2
JQ
2430 * It should be empty at the end anyway, but in error cases there may
2431 * be some left. in case that there is any page left, we drop it.
2432 *
6c595cde 2433 */
83c13382 2434static void migration_page_queue_free(RAMState *rs)
6c595cde 2435{
ec481c6c 2436 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2437 /* This queue generally should be empty - but in the case of a failed
2438 * migration might have some droppings in.
2439 */
89ac5a1d 2440 RCU_READ_LOCK_GUARD();
ec481c6c 2441 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2442 memory_region_unref(mspr->rb->mr);
ec481c6c 2443 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2444 g_free(mspr);
2445 }
6c595cde
DDAG
2446}
2447
2448/**
3d0684b2
JQ
2449 * ram_save_queue_pages: queue the page for transmission
2450 *
2451 * A request from postcopy destination for example.
2452 *
2453 * Returns zero on success or negative on error
2454 *
3d0684b2
JQ
2455 * @rbname: Name of the RAMBLock of the request. NULL means the
2456 * same that last one.
2457 * @start: starting address from the start of the RAMBlock
2458 * @len: length (in bytes) to send
6c595cde 2459 */
96506894 2460int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2461{
2462 RAMBlock *ramblock;
53518d94 2463 RAMState *rs = ram_state;
6c595cde 2464
9360447d 2465 ram_counters.postcopy_requests++;
89ac5a1d
DDAG
2466 RCU_READ_LOCK_GUARD();
2467
6c595cde
DDAG
2468 if (!rbname) {
2469 /* Reuse last RAMBlock */
68a098f3 2470 ramblock = rs->last_req_rb;
6c595cde
DDAG
2471
2472 if (!ramblock) {
2473 /*
2474 * Shouldn't happen, we can't reuse the last RAMBlock if
2475 * it's the 1st request.
2476 */
2477 error_report("ram_save_queue_pages no previous block");
03acb4e9 2478 return -1;
6c595cde
DDAG
2479 }
2480 } else {
2481 ramblock = qemu_ram_block_by_name(rbname);
2482
2483 if (!ramblock) {
2484 /* We shouldn't be asked for a non-existent RAMBlock */
2485 error_report("ram_save_queue_pages no block '%s'", rbname);
03acb4e9 2486 return -1;
6c595cde 2487 }
68a098f3 2488 rs->last_req_rb = ramblock;
6c595cde
DDAG
2489 }
2490 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2491 if (start+len > ramblock->used_length) {
9458ad6b
JQ
2492 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2493 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde 2494 __func__, start, len, ramblock->used_length);
03acb4e9 2495 return -1;
6c595cde
DDAG
2496 }
2497
ec481c6c
JQ
2498 struct RAMSrcPageRequest *new_entry =
2499 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2500 new_entry->rb = ramblock;
2501 new_entry->offset = start;
2502 new_entry->len = len;
2503
2504 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2505 qemu_mutex_lock(&rs->src_page_req_mutex);
2506 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2507 migration_make_urgent_request();
ec481c6c 2508 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2509
2510 return 0;
6c595cde
DDAG
2511}
2512
d7400a34
XG
2513static bool save_page_use_compression(RAMState *rs)
2514{
2515 if (!migrate_use_compression()) {
2516 return false;
2517 }
2518
2519 /*
2520 * If xbzrle is on, stop using the data compression after first
2521 * round of migration even if compression is enabled. In theory,
2522 * xbzrle can do better than compression.
2523 */
2524 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2525 return true;
2526 }
2527
2528 return false;
2529}
2530
5e5fdcff
XG
2531/*
2532 * try to compress the page before posting it out, return true if the page
2533 * has been properly handled by compression, otherwise needs other
2534 * paths to handle it
2535 */
2536static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2537{
2538 if (!save_page_use_compression(rs)) {
2539 return false;
2540 }
2541
2542 /*
2543 * When starting the process of a new block, the first page of
2544 * the block should be sent out before other pages in the same
2545 * block, and all the pages in last block should have been sent
2546 * out, keeping this order is important, because the 'cont' flag
2547 * is used to avoid resending the block name.
2548 *
2549 * We post the fist page as normal page as compression will take
2550 * much CPU resource.
2551 */
2552 if (block != rs->last_sent_block) {
2553 flush_compressed_data(rs);
2554 return false;
2555 }
2556
2557 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2558 return true;
2559 }
2560
76e03000 2561 compression_counters.busy++;
5e5fdcff
XG
2562 return false;
2563}
2564
a82d593b 2565/**
3d0684b2 2566 * ram_save_target_page: save one target page
a82d593b 2567 *
3d0684b2 2568 * Returns the number of pages written
a82d593b 2569 *
6f37bb8b 2570 * @rs: current RAM state
3d0684b2 2571 * @pss: data about the page we want to send
a82d593b 2572 * @last_stage: if we are at the completion stage
a82d593b 2573 */
a0a8aa14 2574static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2575 bool last_stage)
a82d593b 2576{
a8ec91f9 2577 RAMBlock *block = pss->block;
8bba004c 2578 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
a8ec91f9
XG
2579 int res;
2580
2581 if (control_save_page(rs, block, offset, &res)) {
2582 return res;
2583 }
2584
5e5fdcff
XG
2585 if (save_compress_page(rs, block, offset)) {
2586 return 1;
d7400a34
XG
2587 }
2588
2589 res = save_zero_page(rs, block, offset);
2590 if (res > 0) {
2591 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2592 * page would be stale
2593 */
2594 if (!save_page_use_compression(rs)) {
2595 XBZRLE_cache_lock();
2596 xbzrle_cache_zero_page(rs, block->offset + offset);
2597 XBZRLE_cache_unlock();
2598 }
2599 ram_release_pages(block->idstr, offset, res);
2600 return res;
2601 }
2602
da3f56cb 2603 /*
c6b3a2e0
WY
2604 * Do not use multifd for:
2605 * 1. Compression as the first page in the new block should be posted out
2606 * before sending the compressed page
2607 * 2. In postcopy as one whole host page should be placed
da3f56cb 2608 */
c6b3a2e0
WY
2609 if (!save_page_use_compression(rs) && migrate_use_multifd()
2610 && !migration_in_postcopy()) {
b9ee2f7d 2611 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2612 }
2613
1faa5665 2614 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
2615}
2616
2617/**
3d0684b2 2618 * ram_save_host_page: save a whole host page
a82d593b 2619 *
3d0684b2
JQ
2620 * Starting at *offset send pages up to the end of the current host
2621 * page. It's valid for the initial offset to point into the middle of
2622 * a host page in which case the remainder of the hostpage is sent.
2623 * Only dirty target pages are sent. Note that the host page size may
2624 * be a huge page for this block.
1eb3fc0a
DDAG
2625 * The saving stops at the boundary of the used_length of the block
2626 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2627 *
3d0684b2
JQ
2628 * Returns the number of pages written or negative on error
2629 *
6f37bb8b 2630 * @rs: current RAM state
3d0684b2 2631 * @ms: current migration state
3d0684b2 2632 * @pss: data about the page we want to send
a82d593b 2633 * @last_stage: if we are at the completion stage
a82d593b 2634 */
a0a8aa14 2635static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2636 bool last_stage)
a82d593b
DDAG
2637{
2638 int tmppages, pages = 0;
a935e30f
JQ
2639 size_t pagesize_bits =
2640 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 2641
fbd162e6 2642 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2643 error_report("block %s should not be migrated !", pss->block->idstr);
2644 return 0;
2645 }
2646
a82d593b 2647 do {
1faa5665
XG
2648 /* Check the pages is dirty and if it is send it */
2649 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2650 pss->page++;
2651 continue;
2652 }
2653
f20e2865 2654 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
2655 if (tmppages < 0) {
2656 return tmppages;
2657 }
2658
2659 pages += tmppages;
a935e30f 2660 pss->page++;
97e1e067
DDAG
2661 /* Allow rate limiting to happen in the middle of huge pages */
2662 migration_rate_limit();
1eb3fc0a 2663 } while ((pss->page & (pagesize_bits - 1)) &&
8bba004c
AR
2664 offset_in_ramblock(pss->block,
2665 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
a82d593b
DDAG
2666
2667 /* The offset we leave with is the last one we looked at */
a935e30f 2668 pss->page--;
a82d593b
DDAG
2669 return pages;
2670}
6c595cde 2671
56e93d26 2672/**
3d0684b2 2673 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2674 *
2675 * Called within an RCU critical section.
2676 *
e8f3735f
XG
2677 * Returns the number of pages written where zero means no dirty pages,
2678 * or negative on error
56e93d26 2679 *
6f37bb8b 2680 * @rs: current RAM state
56e93d26 2681 * @last_stage: if we are at the completion stage
a82d593b
DDAG
2682 *
2683 * On systems where host-page-size > target-page-size it will send all the
2684 * pages in a host page that are dirty.
56e93d26
JQ
2685 */
2686
ce25d337 2687static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 2688{
b8fb8cb7 2689 PageSearchStatus pss;
56e93d26 2690 int pages = 0;
b9e60928 2691 bool again, found;
56e93d26 2692
0827b9e9
AA
2693 /* No dirty page as there is zero RAM */
2694 if (!ram_bytes_total()) {
2695 return pages;
2696 }
2697
6f37bb8b 2698 pss.block = rs->last_seen_block;
a935e30f 2699 pss.page = rs->last_page;
b8fb8cb7
DDAG
2700 pss.complete_round = false;
2701
2702 if (!pss.block) {
2703 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2704 }
56e93d26 2705
b9e60928 2706 do {
a82d593b 2707 again = true;
f20e2865 2708 found = get_queued_page(rs, &pss);
b9e60928 2709
a82d593b
DDAG
2710 if (!found) {
2711 /* priority queue empty, so just search for something dirty */
f20e2865 2712 found = find_dirty_block(rs, &pss, &again);
a82d593b 2713 }
f3f491fc 2714
a82d593b 2715 if (found) {
f20e2865 2716 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 2717 }
b9e60928 2718 } while (!pages && again);
56e93d26 2719
6f37bb8b 2720 rs->last_seen_block = pss.block;
a935e30f 2721 rs->last_page = pss.page;
56e93d26
JQ
2722
2723 return pages;
2724}
2725
2726void acct_update_position(QEMUFile *f, size_t size, bool zero)
2727{
2728 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2729
56e93d26 2730 if (zero) {
9360447d 2731 ram_counters.duplicate += pages;
56e93d26 2732 } else {
9360447d
JQ
2733 ram_counters.normal += pages;
2734 ram_counters.transferred += size;
56e93d26
JQ
2735 qemu_update_position(f, size);
2736 }
2737}
2738
fbd162e6 2739static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2740{
2741 RAMBlock *block;
2742 uint64_t total = 0;
2743
89ac5a1d
DDAG
2744 RCU_READ_LOCK_GUARD();
2745
fbd162e6
YK
2746 if (count_ignored) {
2747 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2748 total += block->used_length;
2749 }
2750 } else {
2751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2752 total += block->used_length;
2753 }
99e15582 2754 }
56e93d26
JQ
2755 return total;
2756}
2757
fbd162e6
YK
2758uint64_t ram_bytes_total(void)
2759{
2760 return ram_bytes_total_common(false);
2761}
2762
f265e0e4 2763static void xbzrle_load_setup(void)
56e93d26 2764{
f265e0e4 2765 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2766}
2767
f265e0e4
JQ
2768static void xbzrle_load_cleanup(void)
2769{
2770 g_free(XBZRLE.decoded_buf);
2771 XBZRLE.decoded_buf = NULL;
2772}
2773
7d7c96be
PX
2774static void ram_state_cleanup(RAMState **rsp)
2775{
b9ccaf6d
DDAG
2776 if (*rsp) {
2777 migration_page_queue_free(*rsp);
2778 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2779 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2780 g_free(*rsp);
2781 *rsp = NULL;
2782 }
7d7c96be
PX
2783}
2784
84593a08
PX
2785static void xbzrle_cleanup(void)
2786{
2787 XBZRLE_cache_lock();
2788 if (XBZRLE.cache) {
2789 cache_fini(XBZRLE.cache);
2790 g_free(XBZRLE.encoded_buf);
2791 g_free(XBZRLE.current_buf);
2792 g_free(XBZRLE.zero_target_page);
2793 XBZRLE.cache = NULL;
2794 XBZRLE.encoded_buf = NULL;
2795 XBZRLE.current_buf = NULL;
2796 XBZRLE.zero_target_page = NULL;
2797 }
2798 XBZRLE_cache_unlock();
2799}
2800
f265e0e4 2801static void ram_save_cleanup(void *opaque)
56e93d26 2802{
53518d94 2803 RAMState **rsp = opaque;
6b6712ef 2804 RAMBlock *block;
eb859c53 2805
2ff64038 2806 /* caller have hold iothread lock or is in a bh, so there is
4633456c 2807 * no writing race against the migration bitmap
2ff64038 2808 */
6b6712ef
JQ
2809 memory_global_dirty_log_stop();
2810
fbd162e6 2811 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
002cad6b
PX
2812 g_free(block->clear_bmap);
2813 block->clear_bmap = NULL;
6b6712ef
JQ
2814 g_free(block->bmap);
2815 block->bmap = NULL;
56e93d26
JQ
2816 }
2817
84593a08 2818 xbzrle_cleanup();
f0afa331 2819 compress_threads_save_cleanup();
7d7c96be 2820 ram_state_cleanup(rsp);
56e93d26
JQ
2821}
2822
6f37bb8b 2823static void ram_state_reset(RAMState *rs)
56e93d26 2824{
6f37bb8b
JQ
2825 rs->last_seen_block = NULL;
2826 rs->last_sent_block = NULL;
269ace29 2827 rs->last_page = 0;
6f37bb8b
JQ
2828 rs->last_version = ram_list.version;
2829 rs->ram_bulk_stage = true;
6eeb63f7 2830 rs->fpo_enabled = false;
56e93d26
JQ
2831}
2832
2833#define MAX_WAIT 50 /* ms, half buffered_file limit */
2834
4f2e4252
DDAG
2835/*
2836 * 'expected' is the value you expect the bitmap mostly to be full
2837 * of; it won't bother printing lines that are all this value.
2838 * If 'todump' is null the migration bitmap is dumped.
2839 */
6b6712ef
JQ
2840void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2841 unsigned long pages)
4f2e4252 2842{
4f2e4252
DDAG
2843 int64_t cur;
2844 int64_t linelen = 128;
2845 char linebuf[129];
2846
6b6712ef 2847 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
2848 int64_t curb;
2849 bool found = false;
2850 /*
2851 * Last line; catch the case where the line length
2852 * is longer than remaining ram
2853 */
6b6712ef
JQ
2854 if (cur + linelen > pages) {
2855 linelen = pages - cur;
4f2e4252
DDAG
2856 }
2857 for (curb = 0; curb < linelen; curb++) {
2858 bool thisbit = test_bit(cur + curb, todump);
2859 linebuf[curb] = thisbit ? '1' : '.';
2860 found = found || (thisbit != expected);
2861 }
2862 if (found) {
2863 linebuf[curb] = '\0';
2864 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2865 }
2866 }
2867}
2868
e0b266f0
DDAG
2869/* **** functions for postcopy ***** */
2870
ced1c616
PB
2871void ram_postcopy_migrated_memory_release(MigrationState *ms)
2872{
2873 struct RAMBlock *block;
ced1c616 2874
fbd162e6 2875 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2876 unsigned long *bitmap = block->bmap;
2877 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2878 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2879
2880 while (run_start < range) {
2881 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
8bba004c
AR
2882 ram_discard_range(block->idstr,
2883 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2884 ((ram_addr_t)(run_end - run_start))
2885 << TARGET_PAGE_BITS);
ced1c616
PB
2886 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2887 }
2888 }
2889}
2890
3d0684b2
JQ
2891/**
2892 * postcopy_send_discard_bm_ram: discard a RAMBlock
2893 *
2894 * Returns zero on success
2895 *
e0b266f0 2896 * Callback from postcopy_each_ram_send_discard for each RAMBlock
3d0684b2
JQ
2897 *
2898 * @ms: current migration state
89dab31b 2899 * @block: RAMBlock to discard
e0b266f0 2900 */
810cf2bb 2901static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
e0b266f0 2902{
6b6712ef 2903 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2904 unsigned long current;
1e7cf8c3 2905 unsigned long *bitmap = block->bmap;
e0b266f0 2906
6b6712ef 2907 for (current = 0; current < end; ) {
1e7cf8c3 2908 unsigned long one = find_next_bit(bitmap, end, current);
33a5cb62 2909 unsigned long zero, discard_length;
e0b266f0 2910
33a5cb62
WY
2911 if (one >= end) {
2912 break;
2913 }
e0b266f0 2914
1e7cf8c3 2915 zero = find_next_zero_bit(bitmap, end, one + 1);
33a5cb62
WY
2916
2917 if (zero >= end) {
2918 discard_length = end - one;
e0b266f0 2919 } else {
33a5cb62
WY
2920 discard_length = zero - one;
2921 }
810cf2bb 2922 postcopy_discard_send_range(ms, one, discard_length);
33a5cb62 2923 current = one + discard_length;
e0b266f0
DDAG
2924 }
2925
2926 return 0;
2927}
2928
3d0684b2
JQ
2929/**
2930 * postcopy_each_ram_send_discard: discard all RAMBlocks
2931 *
2932 * Returns 0 for success or negative for error
2933 *
e0b266f0
DDAG
2934 * Utility for the outgoing postcopy code.
2935 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2936 * passing it bitmap indexes and name.
e0b266f0
DDAG
2937 * (qemu_ram_foreach_block ends up passing unscaled lengths
2938 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2939 *
2940 * @ms: current migration state
e0b266f0
DDAG
2941 */
2942static int postcopy_each_ram_send_discard(MigrationState *ms)
2943{
2944 struct RAMBlock *block;
2945 int ret;
2946
fbd162e6 2947 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
810cf2bb 2948 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2949
2950 /*
2951 * Postcopy sends chunks of bitmap over the wire, but it
2952 * just needs indexes at this point, avoids it having
2953 * target page specific code.
2954 */
810cf2bb
WY
2955 ret = postcopy_send_discard_bm_ram(ms, block);
2956 postcopy_discard_send_finish(ms);
e0b266f0
DDAG
2957 if (ret) {
2958 return ret;
2959 }
2960 }
2961
2962 return 0;
2963}
2964
3d0684b2 2965/**
8324ef86 2966 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3d0684b2
JQ
2967 *
2968 * Helper for postcopy_chunk_hostpages; it's called twice to
2969 * canonicalize the two bitmaps, that are similar, but one is
2970 * inverted.
99e314eb 2971 *
3d0684b2
JQ
2972 * Postcopy requires that all target pages in a hostpage are dirty or
2973 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2974 *
3d0684b2 2975 * @ms: current migration state
3d0684b2 2976 * @block: block that contains the page we want to canonicalize
99e314eb 2977 */
1e7cf8c3 2978static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
99e314eb 2979{
53518d94 2980 RAMState *rs = ram_state;
6b6712ef 2981 unsigned long *bitmap = block->bmap;
29c59172 2982 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2983 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2984 unsigned long run_start;
2985
29c59172
DDAG
2986 if (block->page_size == TARGET_PAGE_SIZE) {
2987 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2988 return;
2989 }
2990
1e7cf8c3
WY
2991 /* Find a dirty page */
2992 run_start = find_next_bit(bitmap, pages, 0);
99e314eb 2993
6b6712ef 2994 while (run_start < pages) {
99e314eb
DDAG
2995
2996 /*
2997 * If the start of this run of pages is in the middle of a host
2998 * page, then we need to fixup this host page.
2999 */
9dec3cc3 3000 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 3001 /* Find the end of this run */
1e7cf8c3 3002 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
3003 /*
3004 * If the end isn't at the start of a host page, then the
3005 * run doesn't finish at the end of a host page
3006 * and we need to discard.
3007 */
99e314eb
DDAG
3008 }
3009
9dec3cc3 3010 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
99e314eb 3011 unsigned long page;
dad45ab2
WY
3012 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
3013 host_ratio);
3014 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
99e314eb 3015
99e314eb
DDAG
3016 /* Clean up the bitmap */
3017 for (page = fixup_start_addr;
3018 page < fixup_start_addr + host_ratio; page++) {
99e314eb
DDAG
3019 /*
3020 * Remark them as dirty, updating the count for any pages
3021 * that weren't previously dirty.
3022 */
0d8ec885 3023 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
3024 }
3025 }
3026
1e7cf8c3
WY
3027 /* Find the next dirty page for the next iteration */
3028 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
3029 }
3030}
3031
3d0684b2 3032/**
89dab31b 3033 * postcopy_chunk_hostpages: discard any partially sent host page
3d0684b2 3034 *
99e314eb
DDAG
3035 * Utility for the outgoing postcopy code.
3036 *
3037 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
3038 * dirty host-page size chunks as all dirty. In this case the host-page
3039 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 3040 *
3d0684b2
JQ
3041 * Returns zero on success
3042 *
3043 * @ms: current migration state
6b6712ef 3044 * @block: block we want to work with
99e314eb 3045 */
6b6712ef 3046static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 3047{
810cf2bb 3048 postcopy_discard_send_init(ms, block->idstr);
99e314eb 3049
6b6712ef 3050 /*
1e7cf8c3 3051 * Ensure that all partially dirty host pages are made fully dirty.
6b6712ef 3052 */
1e7cf8c3 3053 postcopy_chunk_hostpages_pass(ms, block);
99e314eb 3054
810cf2bb 3055 postcopy_discard_send_finish(ms);
99e314eb
DDAG
3056 return 0;
3057}
3058
3d0684b2
JQ
3059/**
3060 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3061 *
3062 * Returns zero on success
3063 *
e0b266f0
DDAG
3064 * Transmit the set of pages to be discarded after precopy to the target
3065 * these are pages that:
3066 * a) Have been previously transmitted but are now dirty again
3067 * b) Pages that have never been transmitted, this ensures that
3068 * any pages on the destination that have been mapped by background
3069 * tasks get discarded (transparent huge pages is the specific concern)
3070 * Hopefully this is pretty sparse
3d0684b2
JQ
3071 *
3072 * @ms: current migration state
e0b266f0
DDAG
3073 */
3074int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3075{
53518d94 3076 RAMState *rs = ram_state;
6b6712ef 3077 RAMBlock *block;
e0b266f0 3078 int ret;
e0b266f0 3079
89ac5a1d 3080 RCU_READ_LOCK_GUARD();
e0b266f0
DDAG
3081
3082 /* This should be our last sync, the src is now paused */
eb859c53 3083 migration_bitmap_sync(rs);
e0b266f0 3084
6b6712ef
JQ
3085 /* Easiest way to make sure we don't resume in the middle of a host-page */
3086 rs->last_seen_block = NULL;
3087 rs->last_sent_block = NULL;
3088 rs->last_page = 0;
e0b266f0 3089
fbd162e6 3090 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
3091 /* Deal with TPS != HPS and huge pages */
3092 ret = postcopy_chunk_hostpages(ms, block);
3093 if (ret) {
6b6712ef
JQ
3094 return ret;
3095 }
e0b266f0 3096
e0b266f0 3097#ifdef DEBUG_POSTCOPY
1e7cf8c3
WY
3098 ram_debug_dump_bitmap(block->bmap, true,
3099 block->used_length >> TARGET_PAGE_BITS);
e0b266f0 3100#endif
6b6712ef
JQ
3101 }
3102 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
3103
3104 ret = postcopy_each_ram_send_discard(ms);
e0b266f0
DDAG
3105
3106 return ret;
3107}
3108
3d0684b2
JQ
3109/**
3110 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 3111 *
3d0684b2 3112 * Returns zero on success
e0b266f0 3113 *
36449157
JQ
3114 * @rbname: name of the RAMBlock of the request. NULL means the
3115 * same that last one.
3d0684b2
JQ
3116 * @start: RAMBlock starting page
3117 * @length: RAMBlock size
e0b266f0 3118 */
aaa2064c 3119int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0 3120{
36449157 3121 trace_ram_discard_range(rbname, start, length);
d3a5038c 3122
89ac5a1d 3123 RCU_READ_LOCK_GUARD();
36449157 3124 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
3125
3126 if (!rb) {
36449157 3127 error_report("ram_discard_range: Failed to find block '%s'", rbname);
03acb4e9 3128 return -1;
e0b266f0
DDAG
3129 }
3130
814bb08f
PX
3131 /*
3132 * On source VM, we don't need to update the received bitmap since
3133 * we don't even have one.
3134 */
3135 if (rb->receivedmap) {
3136 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3137 length >> qemu_target_page_bits());
3138 }
3139
03acb4e9 3140 return ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
3141}
3142
84593a08
PX
3143/*
3144 * For every allocation, we will try not to crash the VM if the
3145 * allocation failed.
3146 */
3147static int xbzrle_init(void)
3148{
3149 Error *local_err = NULL;
3150
3151 if (!migrate_use_xbzrle()) {
3152 return 0;
3153 }
3154
3155 XBZRLE_cache_lock();
3156
3157 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3158 if (!XBZRLE.zero_target_page) {
3159 error_report("%s: Error allocating zero page", __func__);
3160 goto err_out;
3161 }
3162
3163 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3164 TARGET_PAGE_SIZE, &local_err);
3165 if (!XBZRLE.cache) {
3166 error_report_err(local_err);
3167 goto free_zero_page;
3168 }
3169
3170 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3171 if (!XBZRLE.encoded_buf) {
3172 error_report("%s: Error allocating encoded_buf", __func__);
3173 goto free_cache;
3174 }
3175
3176 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3177 if (!XBZRLE.current_buf) {
3178 error_report("%s: Error allocating current_buf", __func__);
3179 goto free_encoded_buf;
3180 }
3181
3182 /* We are all good */
3183 XBZRLE_cache_unlock();
3184 return 0;
3185
3186free_encoded_buf:
3187 g_free(XBZRLE.encoded_buf);
3188 XBZRLE.encoded_buf = NULL;
3189free_cache:
3190 cache_fini(XBZRLE.cache);
3191 XBZRLE.cache = NULL;
3192free_zero_page:
3193 g_free(XBZRLE.zero_target_page);
3194 XBZRLE.zero_target_page = NULL;
3195err_out:
3196 XBZRLE_cache_unlock();
3197 return -ENOMEM;
3198}
3199
53518d94 3200static int ram_state_init(RAMState **rsp)
56e93d26 3201{
7d00ee6a
PX
3202 *rsp = g_try_new0(RAMState, 1);
3203
3204 if (!*rsp) {
3205 error_report("%s: Init ramstate fail", __func__);
3206 return -1;
3207 }
53518d94
JQ
3208
3209 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3210 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3211 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3212
7d00ee6a 3213 /*
40c4d4a8
IR
3214 * Count the total number of pages used by ram blocks not including any
3215 * gaps due to alignment or unplugs.
03158519 3216 * This must match with the initial values of dirty bitmap.
7d00ee6a 3217 */
40c4d4a8 3218 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
7d00ee6a
PX
3219 ram_state_reset(*rsp);
3220
3221 return 0;
3222}
3223
d6eff5d7 3224static void ram_list_init_bitmaps(void)
7d00ee6a 3225{
002cad6b 3226 MigrationState *ms = migrate_get_current();
d6eff5d7
PX
3227 RAMBlock *block;
3228 unsigned long pages;
002cad6b 3229 uint8_t shift;
56e93d26 3230
0827b9e9
AA
3231 /* Skip setting bitmap if there is no RAM */
3232 if (ram_bytes_total()) {
002cad6b
PX
3233 shift = ms->clear_bitmap_shift;
3234 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3235 error_report("clear_bitmap_shift (%u) too big, using "
3236 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3237 shift = CLEAR_BITMAP_SHIFT_MAX;
3238 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3239 error_report("clear_bitmap_shift (%u) too small, using "
3240 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3241 shift = CLEAR_BITMAP_SHIFT_MIN;
3242 }
3243
fbd162e6 3244 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3245 pages = block->max_length >> TARGET_PAGE_BITS;
03158519
WY
3246 /*
3247 * The initial dirty bitmap for migration must be set with all
3248 * ones to make sure we'll migrate every guest RAM page to
3249 * destination.
40c4d4a8
IR
3250 * Here we set RAMBlock.bmap all to 1 because when rebegin a
3251 * new migration after a failed migration, ram_list.
3252 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3253 * guest memory.
03158519 3254 */
6b6712ef 3255 block->bmap = bitmap_new(pages);
40c4d4a8 3256 bitmap_set(block->bmap, 0, pages);
002cad6b
PX
3257 block->clear_bmap_shift = shift;
3258 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
0827b9e9 3259 }
f3f491fc 3260 }
d6eff5d7
PX
3261}
3262
3263static void ram_init_bitmaps(RAMState *rs)
3264{
3265 /* For memory_global_dirty_log_start below. */
3266 qemu_mutex_lock_iothread();
3267 qemu_mutex_lock_ramlist();
f3f491fc 3268
89ac5a1d
DDAG
3269 WITH_RCU_READ_LOCK_GUARD() {
3270 ram_list_init_bitmaps();
3271 memory_global_dirty_log_start();
3272 migration_bitmap_sync_precopy(rs);
3273 }
56e93d26 3274 qemu_mutex_unlock_ramlist();
49877834 3275 qemu_mutex_unlock_iothread();
d6eff5d7
PX
3276}
3277
3278static int ram_init_all(RAMState **rsp)
3279{
3280 if (ram_state_init(rsp)) {
3281 return -1;
3282 }
3283
3284 if (xbzrle_init()) {
3285 ram_state_cleanup(rsp);
3286 return -1;
3287 }
3288
3289 ram_init_bitmaps(*rsp);
a91246c9
HZ
3290
3291 return 0;
3292}
3293
08614f34
PX
3294static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3295{
3296 RAMBlock *block;
3297 uint64_t pages = 0;
3298
3299 /*
3300 * Postcopy is not using xbzrle/compression, so no need for that.
3301 * Also, since source are already halted, we don't need to care
3302 * about dirty page logging as well.
3303 */
3304
fbd162e6 3305 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3306 pages += bitmap_count_one(block->bmap,
3307 block->used_length >> TARGET_PAGE_BITS);
3308 }
3309
3310 /* This may not be aligned with current bitmaps. Recalculate. */
3311 rs->migration_dirty_pages = pages;
3312
3313 rs->last_seen_block = NULL;
3314 rs->last_sent_block = NULL;
3315 rs->last_page = 0;
3316 rs->last_version = ram_list.version;
3317 /*
3318 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3319 * matter what we have sent.
3320 */
3321 rs->ram_bulk_stage = false;
3322
3323 /* Update RAMState cache of output QEMUFile */
3324 rs->f = out;
3325
3326 trace_ram_state_resume_prepare(pages);
3327}
3328
6bcb05fc
WW
3329/*
3330 * This function clears bits of the free pages reported by the caller from the
3331 * migration dirty bitmap. @addr is the host address corresponding to the
3332 * start of the continuous guest free pages, and @len is the total bytes of
3333 * those pages.
3334 */
3335void qemu_guest_free_page_hint(void *addr, size_t len)
3336{
3337 RAMBlock *block;
3338 ram_addr_t offset;
3339 size_t used_len, start, npages;
3340 MigrationState *s = migrate_get_current();
3341
3342 /* This function is currently expected to be used during live migration */
3343 if (!migration_is_setup_or_active(s->state)) {
3344 return;
3345 }
3346
3347 for (; len > 0; len -= used_len, addr += used_len) {
3348 block = qemu_ram_block_from_host(addr, false, &offset);
3349 if (unlikely(!block || offset >= block->used_length)) {
3350 /*
3351 * The implementation might not support RAMBlock resize during
3352 * live migration, but it could happen in theory with future
3353 * updates. So we add a check here to capture that case.
3354 */
3355 error_report_once("%s unexpected error", __func__);
3356 return;
3357 }
3358
3359 if (len <= block->used_length - offset) {
3360 used_len = len;
3361 } else {
3362 used_len = block->used_length - offset;
3363 }
3364
3365 start = offset >> TARGET_PAGE_BITS;
3366 npages = used_len >> TARGET_PAGE_BITS;
3367
3368 qemu_mutex_lock(&ram_state->bitmap_mutex);
3369 ram_state->migration_dirty_pages -=
3370 bitmap_count_one_with_offset(block->bmap, start, npages);
3371 bitmap_clear(block->bmap, start, npages);
3372 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3373 }
3374}
3375
3d0684b2
JQ
3376/*
3377 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3378 * long-running RCU critical section. When rcu-reclaims in the code
3379 * start to become numerous it will be necessary to reduce the
3380 * granularity of these critical sections.
3381 */
3382
3d0684b2
JQ
3383/**
3384 * ram_save_setup: Setup RAM for migration
3385 *
3386 * Returns zero to indicate success and negative for error
3387 *
3388 * @f: QEMUFile where to send the data
3389 * @opaque: RAMState pointer
3390 */
a91246c9
HZ
3391static int ram_save_setup(QEMUFile *f, void *opaque)
3392{
53518d94 3393 RAMState **rsp = opaque;
a91246c9
HZ
3394 RAMBlock *block;
3395
dcaf446e
XG
3396 if (compress_threads_save_setup()) {
3397 return -1;
3398 }
3399
a91246c9
HZ
3400 /* migration has already setup the bitmap, reuse it. */
3401 if (!migration_in_colo_state()) {
7d00ee6a 3402 if (ram_init_all(rsp) != 0) {
dcaf446e 3403 compress_threads_save_cleanup();
a91246c9 3404 return -1;
53518d94 3405 }
a91246c9 3406 }
53518d94 3407 (*rsp)->f = f;
a91246c9 3408
0e6ebd48
DDAG
3409 WITH_RCU_READ_LOCK_GUARD() {
3410 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3411
0e6ebd48
DDAG
3412 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3413 qemu_put_byte(f, strlen(block->idstr));
3414 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3415 qemu_put_be64(f, block->used_length);
3416 if (migrate_postcopy_ram() && block->page_size !=
3417 qemu_host_page_size) {
3418 qemu_put_be64(f, block->page_size);
3419 }
3420 if (migrate_ignore_shared()) {
3421 qemu_put_be64(f, block->mr->addr);
3422 }
fbd162e6 3423 }
56e93d26
JQ
3424 }
3425
56e93d26
JQ
3426 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3427 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3428
1b81c974 3429 multifd_send_sync_main(*rsp);
56e93d26 3430 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3431 qemu_fflush(f);
56e93d26
JQ
3432
3433 return 0;
3434}
3435
3d0684b2
JQ
3436/**
3437 * ram_save_iterate: iterative stage for migration
3438 *
3439 * Returns zero to indicate success and negative for error
3440 *
3441 * @f: QEMUFile where to send the data
3442 * @opaque: RAMState pointer
3443 */
56e93d26
JQ
3444static int ram_save_iterate(QEMUFile *f, void *opaque)
3445{
53518d94
JQ
3446 RAMState **temp = opaque;
3447 RAMState *rs = *temp;
3d4095b2 3448 int ret = 0;
56e93d26
JQ
3449 int i;
3450 int64_t t0;
5c90308f 3451 int done = 0;
56e93d26 3452
b2557345
PL
3453 if (blk_mig_bulk_active()) {
3454 /* Avoid transferring ram during bulk phase of block migration as
3455 * the bulk phase will usually take a long time and transferring
3456 * ram updates during that time is pointless. */
3457 goto out;
3458 }
3459
89ac5a1d
DDAG
3460 WITH_RCU_READ_LOCK_GUARD() {
3461 if (ram_list.version != rs->last_version) {
3462 ram_state_reset(rs);
3463 }
56e93d26 3464
89ac5a1d
DDAG
3465 /* Read version before ram_list.blocks */
3466 smp_rmb();
56e93d26 3467
89ac5a1d 3468 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
56e93d26 3469
89ac5a1d
DDAG
3470 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3471 i = 0;
3472 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3473 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3474 int pages;
e03a34f8 3475
89ac5a1d
DDAG
3476 if (qemu_file_get_error(f)) {
3477 break;
3478 }
e8f3735f 3479
89ac5a1d
DDAG
3480 pages = ram_find_and_save_block(rs, false);
3481 /* no more pages to sent */
3482 if (pages == 0) {
3483 done = 1;
3484 break;
3485 }
e8f3735f 3486
89ac5a1d
DDAG
3487 if (pages < 0) {
3488 qemu_file_set_error(f, pages);
56e93d26
JQ
3489 break;
3490 }
89ac5a1d
DDAG
3491
3492 rs->target_page_count += pages;
3493
644acf99
WY
3494 /*
3495 * During postcopy, it is necessary to make sure one whole host
3496 * page is sent in one chunk.
3497 */
3498 if (migrate_postcopy_ram()) {
3499 flush_compressed_data(rs);
3500 }
3501
89ac5a1d
DDAG
3502 /*
3503 * we want to check in the 1st loop, just in case it was the 1st
3504 * time and we had to sync the dirty bitmap.
3505 * qemu_clock_get_ns() is a bit expensive, so we only check each
3506 * some iterations
3507 */
3508 if ((i & 63) == 0) {
3509 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3510 1000000;
3511 if (t1 > MAX_WAIT) {
3512 trace_ram_save_iterate_big_wait(t1, i);
3513 break;
3514 }
3515 }
3516 i++;
56e93d26 3517 }
56e93d26 3518 }
56e93d26
JQ
3519
3520 /*
3521 * Must occur before EOS (or any QEMUFile operation)
3522 * because of RDMA protocol.
3523 */
3524 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3525
b2557345 3526out:
3d4095b2
JQ
3527 if (ret >= 0) {
3528 multifd_send_sync_main(rs);
3529 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3530 qemu_fflush(f);
3531 ram_counters.transferred += 8;
56e93d26 3532
3d4095b2
JQ
3533 ret = qemu_file_get_error(f);
3534 }
56e93d26
JQ
3535 if (ret < 0) {
3536 return ret;
3537 }
3538
5c90308f 3539 return done;
56e93d26
JQ
3540}
3541
3d0684b2
JQ
3542/**
3543 * ram_save_complete: function called to send the remaining amount of ram
3544 *
e8f3735f 3545 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3546 *
3547 * Called with iothread lock
3548 *
3549 * @f: QEMUFile where to send the data
3550 * @opaque: RAMState pointer
3551 */
56e93d26
JQ
3552static int ram_save_complete(QEMUFile *f, void *opaque)
3553{
53518d94
JQ
3554 RAMState **temp = opaque;
3555 RAMState *rs = *temp;
e8f3735f 3556 int ret = 0;
6f37bb8b 3557
89ac5a1d
DDAG
3558 WITH_RCU_READ_LOCK_GUARD() {
3559 if (!migration_in_postcopy()) {
3560 migration_bitmap_sync_precopy(rs);
3561 }
56e93d26 3562
89ac5a1d 3563 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
56e93d26 3564
89ac5a1d 3565 /* try transferring iterative blocks of memory */
56e93d26 3566
89ac5a1d
DDAG
3567 /* flush all remaining blocks regardless of rate limiting */
3568 while (true) {
3569 int pages;
56e93d26 3570
89ac5a1d
DDAG
3571 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3572 /* no more blocks to sent */
3573 if (pages == 0) {
3574 break;
3575 }
3576 if (pages < 0) {
3577 ret = pages;
3578 break;
3579 }
e8f3735f 3580 }
56e93d26 3581
89ac5a1d
DDAG
3582 flush_compressed_data(rs);
3583 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3584 }
d09a6fde 3585
3d4095b2
JQ
3586 if (ret >= 0) {
3587 multifd_send_sync_main(rs);
3588 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3589 qemu_fflush(f);
3590 }
56e93d26 3591
e8f3735f 3592 return ret;
56e93d26
JQ
3593}
3594
c31b098f 3595static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3596 uint64_t *res_precopy_only,
3597 uint64_t *res_compatible,
3598 uint64_t *res_postcopy_only)
56e93d26 3599{
53518d94
JQ
3600 RAMState **temp = opaque;
3601 RAMState *rs = *temp;
56e93d26
JQ
3602 uint64_t remaining_size;
3603
9edabd4d 3604 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3605
5727309d 3606 if (!migration_in_postcopy() &&
663e6c1d 3607 remaining_size < max_size) {
56e93d26 3608 qemu_mutex_lock_iothread();
89ac5a1d
DDAG
3609 WITH_RCU_READ_LOCK_GUARD() {
3610 migration_bitmap_sync_precopy(rs);
3611 }
56e93d26 3612 qemu_mutex_unlock_iothread();
9edabd4d 3613 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3614 }
c31b098f 3615
86e1167e
VSO
3616 if (migrate_postcopy_ram()) {
3617 /* We can do postcopy, and all the data is postcopiable */
47995026 3618 *res_compatible += remaining_size;
86e1167e 3619 } else {
47995026 3620 *res_precopy_only += remaining_size;
86e1167e 3621 }
56e93d26
JQ
3622}
3623
3624static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3625{
3626 unsigned int xh_len;
3627 int xh_flags;
063e760a 3628 uint8_t *loaded_data;
56e93d26 3629
56e93d26
JQ
3630 /* extract RLE header */
3631 xh_flags = qemu_get_byte(f);
3632 xh_len = qemu_get_be16(f);
3633
3634 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3635 error_report("Failed to load XBZRLE page - wrong compression!");
3636 return -1;
3637 }
3638
3639 if (xh_len > TARGET_PAGE_SIZE) {
3640 error_report("Failed to load XBZRLE page - len overflow!");
3641 return -1;
3642 }
f265e0e4 3643 loaded_data = XBZRLE.decoded_buf;
56e93d26 3644 /* load data and decode */
f265e0e4 3645 /* it can change loaded_data to point to an internal buffer */
063e760a 3646 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3647
3648 /* decode RLE */
063e760a 3649 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3650 TARGET_PAGE_SIZE) == -1) {
3651 error_report("Failed to load XBZRLE page - decode error!");
3652 return -1;
3653 }
3654
3655 return 0;
3656}
3657
3d0684b2
JQ
3658/**
3659 * ram_block_from_stream: read a RAMBlock id from the migration stream
3660 *
3661 * Must be called from within a rcu critical section.
3662 *
56e93d26 3663 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3664 *
3d0684b2
JQ
3665 * @f: QEMUFile where to read the data from
3666 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 3667 */
3d0684b2 3668static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
3669{
3670 static RAMBlock *block = NULL;
3671 char id[256];
3672 uint8_t len;
3673
3674 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3675 if (!block) {
56e93d26
JQ
3676 error_report("Ack, bad migration stream!");
3677 return NULL;
3678 }
4c4bad48 3679 return block;
56e93d26
JQ
3680 }
3681
3682 len = qemu_get_byte(f);
3683 qemu_get_buffer(f, (uint8_t *)id, len);
3684 id[len] = 0;
3685
e3dd7493 3686 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3687 if (!block) {
3688 error_report("Can't find block %s", id);
3689 return NULL;
56e93d26
JQ
3690 }
3691
fbd162e6 3692 if (ramblock_is_ignored(block)) {
b895de50
CLG
3693 error_report("block %s should not be migrated !", id);
3694 return NULL;
3695 }
3696
4c4bad48
HZ
3697 return block;
3698}
3699
3700static inline void *host_from_ram_block_offset(RAMBlock *block,
3701 ram_addr_t offset)
3702{
3703 if (!offset_in_ramblock(block, offset)) {
3704 return NULL;
3705 }
3706
3707 return block->host + offset;
56e93d26
JQ
3708}
3709
13af18f2
ZC
3710static inline void *colo_cache_from_block_offset(RAMBlock *block,
3711 ram_addr_t offset)
3712{
3713 if (!offset_in_ramblock(block, offset)) {
3714 return NULL;
3715 }
3716 if (!block->colo_cache) {
3717 error_report("%s: colo_cache is NULL in block :%s",
3718 __func__, block->idstr);
3719 return NULL;
3720 }
7d9acafa
ZC
3721
3722 /*
3723 * During colo checkpoint, we need bitmap of these migrated pages.
3724 * It help us to decide which pages in ram cache should be flushed
3725 * into VM's RAM later.
3726 */
3727 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3728 ram_state->migration_dirty_pages++;
3729 }
13af18f2
ZC
3730 return block->colo_cache + offset;
3731}
3732
3d0684b2
JQ
3733/**
3734 * ram_handle_compressed: handle the zero page case
3735 *
56e93d26
JQ
3736 * If a page (or a whole RDMA chunk) has been
3737 * determined to be zero, then zap it.
3d0684b2
JQ
3738 *
3739 * @host: host address for the zero page
3740 * @ch: what the page is filled from. We only support zero
3741 * @size: size of the zero page
56e93d26
JQ
3742 */
3743void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3744{
3745 if (ch != 0 || !is_zero_range(host, size)) {
3746 memset(host, ch, size);
3747 }
3748}
3749
797ca154
XG
3750/* return the size after decompression, or negative value on error */
3751static int
3752qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3753 const uint8_t *source, size_t source_len)
3754{
3755 int err;
3756
3757 err = inflateReset(stream);
3758 if (err != Z_OK) {
3759 return -1;
3760 }
3761
3762 stream->avail_in = source_len;
3763 stream->next_in = (uint8_t *)source;
3764 stream->avail_out = dest_len;
3765 stream->next_out = dest;
3766
3767 err = inflate(stream, Z_NO_FLUSH);
3768 if (err != Z_STREAM_END) {
3769 return -1;
3770 }
3771
3772 return stream->total_out;
3773}
3774
56e93d26
JQ
3775static void *do_data_decompress(void *opaque)
3776{
3777 DecompressParam *param = opaque;
3778 unsigned long pagesize;
33d151f4 3779 uint8_t *des;
34ab9e97 3780 int len, ret;
56e93d26 3781
33d151f4 3782 qemu_mutex_lock(&param->mutex);
90e56fb4 3783 while (!param->quit) {
33d151f4
LL
3784 if (param->des) {
3785 des = param->des;
3786 len = param->len;
3787 param->des = 0;
3788 qemu_mutex_unlock(&param->mutex);
3789
56e93d26 3790 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3791
3792 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3793 param->compbuf, len);
f548222c 3794 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3795 error_report("decompress data failed");
3796 qemu_file_set_error(decomp_file, ret);
3797 }
73a8912b 3798
33d151f4
LL
3799 qemu_mutex_lock(&decomp_done_lock);
3800 param->done = true;
3801 qemu_cond_signal(&decomp_done_cond);
3802 qemu_mutex_unlock(&decomp_done_lock);
3803
3804 qemu_mutex_lock(&param->mutex);
3805 } else {
3806 qemu_cond_wait(&param->cond, &param->mutex);
3807 }
56e93d26 3808 }
33d151f4 3809 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3810
3811 return NULL;
3812}
3813
34ab9e97 3814static int wait_for_decompress_done(void)
5533b2e9
LL
3815{
3816 int idx, thread_count;
3817
3818 if (!migrate_use_compression()) {
34ab9e97 3819 return 0;
5533b2e9
LL
3820 }
3821
3822 thread_count = migrate_decompress_threads();
3823 qemu_mutex_lock(&decomp_done_lock);
3824 for (idx = 0; idx < thread_count; idx++) {
3825 while (!decomp_param[idx].done) {
3826 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3827 }
3828 }
3829 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3830 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3831}
3832
f0afa331 3833static void compress_threads_load_cleanup(void)
56e93d26
JQ
3834{
3835 int i, thread_count;
3836
3416ab5b
JQ
3837 if (!migrate_use_compression()) {
3838 return;
3839 }
56e93d26
JQ
3840 thread_count = migrate_decompress_threads();
3841 for (i = 0; i < thread_count; i++) {
797ca154
XG
3842 /*
3843 * we use it as a indicator which shows if the thread is
3844 * properly init'd or not
3845 */
3846 if (!decomp_param[i].compbuf) {
3847 break;
3848 }
3849
56e93d26 3850 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3851 decomp_param[i].quit = true;
56e93d26
JQ
3852 qemu_cond_signal(&decomp_param[i].cond);
3853 qemu_mutex_unlock(&decomp_param[i].mutex);
3854 }
3855 for (i = 0; i < thread_count; i++) {
797ca154
XG
3856 if (!decomp_param[i].compbuf) {
3857 break;
3858 }
3859
56e93d26
JQ
3860 qemu_thread_join(decompress_threads + i);
3861 qemu_mutex_destroy(&decomp_param[i].mutex);
3862 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3863 inflateEnd(&decomp_param[i].stream);
56e93d26 3864 g_free(decomp_param[i].compbuf);
797ca154 3865 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3866 }
3867 g_free(decompress_threads);
3868 g_free(decomp_param);
56e93d26
JQ
3869 decompress_threads = NULL;
3870 decomp_param = NULL;
34ab9e97 3871 decomp_file = NULL;
56e93d26
JQ
3872}
3873
34ab9e97 3874static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3875{
3876 int i, thread_count;
3877
3878 if (!migrate_use_compression()) {
3879 return 0;
3880 }
3881
3882 thread_count = migrate_decompress_threads();
3883 decompress_threads = g_new0(QemuThread, thread_count);
3884 decomp_param = g_new0(DecompressParam, thread_count);
3885 qemu_mutex_init(&decomp_done_lock);
3886 qemu_cond_init(&decomp_done_cond);
34ab9e97 3887 decomp_file = f;
797ca154
XG
3888 for (i = 0; i < thread_count; i++) {
3889 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3890 goto exit;
3891 }
3892
3893 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3894 qemu_mutex_init(&decomp_param[i].mutex);
3895 qemu_cond_init(&decomp_param[i].cond);
3896 decomp_param[i].done = true;
3897 decomp_param[i].quit = false;
3898 qemu_thread_create(decompress_threads + i, "decompress",
3899 do_data_decompress, decomp_param + i,
3900 QEMU_THREAD_JOINABLE);
3901 }
3902 return 0;
3903exit:
3904 compress_threads_load_cleanup();
3905 return -1;
3906}
3907
c1bc6626 3908static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3909 void *host, int len)
3910{
3911 int idx, thread_count;
3912
3913 thread_count = migrate_decompress_threads();
73a8912b 3914 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
3915 while (true) {
3916 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3917 if (decomp_param[idx].done) {
33d151f4
LL
3918 decomp_param[idx].done = false;
3919 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3920 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3921 decomp_param[idx].des = host;
3922 decomp_param[idx].len = len;
33d151f4
LL
3923 qemu_cond_signal(&decomp_param[idx].cond);
3924 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3925 break;
3926 }
3927 }
3928 if (idx < thread_count) {
3929 break;
73a8912b
LL
3930 } else {
3931 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3932 }
3933 }
73a8912b 3934 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3935}
3936
13af18f2
ZC
3937/*
3938 * colo cache: this is for secondary VM, we cache the whole
3939 * memory of the secondary VM, it is need to hold the global lock
3940 * to call this helper.
3941 */
3942int colo_init_ram_cache(void)
3943{
3944 RAMBlock *block;
3945
44901b5a
PB
3946 WITH_RCU_READ_LOCK_GUARD() {
3947 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3948 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3949 NULL,
3950 false);
3951 if (!block->colo_cache) {
3952 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3953 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3954 block->used_length);
3955 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3956 if (block->colo_cache) {
3957 qemu_anon_ram_free(block->colo_cache, block->used_length);
3958 block->colo_cache = NULL;
3959 }
89ac5a1d 3960 }
44901b5a 3961 return -errno;
89ac5a1d 3962 }
44901b5a 3963 memcpy(block->colo_cache, block->host, block->used_length);
13af18f2 3964 }
13af18f2 3965 }
44901b5a 3966
7d9acafa
ZC
3967 /*
3968 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3969 * with to decide which page in cache should be flushed into SVM's RAM. Here
3970 * we use the same name 'ram_bitmap' as for migration.
3971 */
3972 if (ram_bytes_total()) {
3973 RAMBlock *block;
3974
fbd162e6 3975 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3976 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3977
3978 block->bmap = bitmap_new(pages);
3979 bitmap_set(block->bmap, 0, pages);
3980 }
3981 }
3982 ram_state = g_new0(RAMState, 1);
3983 ram_state->migration_dirty_pages = 0;
c6e5bafb 3984 qemu_mutex_init(&ram_state->bitmap_mutex);
d1955d22 3985 memory_global_dirty_log_start();
7d9acafa 3986
13af18f2 3987 return 0;
13af18f2
ZC
3988}
3989
3990/* It is need to hold the global lock to call this helper */
3991void colo_release_ram_cache(void)
3992{
3993 RAMBlock *block;
3994
d1955d22 3995 memory_global_dirty_log_stop();
fbd162e6 3996 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3997 g_free(block->bmap);
3998 block->bmap = NULL;
3999 }
4000
89ac5a1d
DDAG
4001 WITH_RCU_READ_LOCK_GUARD() {
4002 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4003 if (block->colo_cache) {
4004 qemu_anon_ram_free(block->colo_cache, block->used_length);
4005 block->colo_cache = NULL;
4006 }
13af18f2
ZC
4007 }
4008 }
c6e5bafb 4009 qemu_mutex_destroy(&ram_state->bitmap_mutex);
7d9acafa
ZC
4010 g_free(ram_state);
4011 ram_state = NULL;
13af18f2
ZC
4012}
4013
f265e0e4
JQ
4014/**
4015 * ram_load_setup: Setup RAM for migration incoming side
4016 *
4017 * Returns zero to indicate success and negative for error
4018 *
4019 * @f: QEMUFile where to receive the data
4020 * @opaque: RAMState pointer
4021 */
4022static int ram_load_setup(QEMUFile *f, void *opaque)
4023{
34ab9e97 4024 if (compress_threads_load_setup(f)) {
797ca154
XG
4025 return -1;
4026 }
4027
f265e0e4 4028 xbzrle_load_setup();
f9494614 4029 ramblock_recv_map_init();
13af18f2 4030
f265e0e4
JQ
4031 return 0;
4032}
4033
4034static int ram_load_cleanup(void *opaque)
4035{
f9494614 4036 RAMBlock *rb;
56eb90af 4037
fbd162e6 4038 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
bd108a44 4039 qemu_ram_block_writeback(rb);
56eb90af
JH
4040 }
4041
f265e0e4 4042 xbzrle_load_cleanup();
f0afa331 4043 compress_threads_load_cleanup();
f9494614 4044
fbd162e6 4045 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
4046 g_free(rb->receivedmap);
4047 rb->receivedmap = NULL;
4048 }
13af18f2 4049
f265e0e4
JQ
4050 return 0;
4051}
4052
3d0684b2
JQ
4053/**
4054 * ram_postcopy_incoming_init: allocate postcopy data structures
4055 *
4056 * Returns 0 for success and negative if there was one error
4057 *
4058 * @mis: current migration incoming state
4059 *
4060 * Allocate data structures etc needed by incoming migration with
4061 * postcopy-ram. postcopy-ram's similarly names
4062 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
4063 */
4064int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4065{
c136180c 4066 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
4067}
4068
3d0684b2
JQ
4069/**
4070 * ram_load_postcopy: load a page in postcopy case
4071 *
4072 * Returns 0 for success or -errno in case of error
4073 *
a7180877
DDAG
4074 * Called in postcopy mode by ram_load().
4075 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
4076 *
4077 * @f: QEMUFile where to send the data
a7180877
DDAG
4078 */
4079static int ram_load_postcopy(QEMUFile *f)
4080{
4081 int flags = 0, ret = 0;
4082 bool place_needed = false;
1aa83678 4083 bool matches_target_page_size = false;
a7180877
DDAG
4084 MigrationIncomingState *mis = migration_incoming_get_current();
4085 /* Temporary page that is later 'placed' */
3414322a 4086 void *postcopy_host_page = mis->postcopy_tmp_page;
91ba442f 4087 void *this_host = NULL;
a3b6ff6d 4088 bool all_zero = false;
4cbb3c63 4089 int target_pages = 0;
a7180877
DDAG
4090
4091 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4092 ram_addr_t addr;
4093 void *host = NULL;
4094 void *page_buffer = NULL;
4095 void *place_source = NULL;
df9ff5e1 4096 RAMBlock *block = NULL;
a7180877 4097 uint8_t ch;
644acf99 4098 int len;
a7180877
DDAG
4099
4100 addr = qemu_get_be64(f);
7a9ddfbf
PX
4101
4102 /*
4103 * If qemu file error, we should stop here, and then "addr"
4104 * may be invalid
4105 */
4106 ret = qemu_file_get_error(f);
4107 if (ret) {
4108 break;
4109 }
4110
a7180877
DDAG
4111 flags = addr & ~TARGET_PAGE_MASK;
4112 addr &= TARGET_PAGE_MASK;
4113
4114 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4115 place_needed = false;
644acf99
WY
4116 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4117 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
df9ff5e1 4118 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
4119
4120 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
4121 if (!host) {
4122 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4123 ret = -EINVAL;
4124 break;
4125 }
4cbb3c63 4126 target_pages++;
1aa83678 4127 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 4128 /*
28abd200
DDAG
4129 * Postcopy requires that we place whole host pages atomically;
4130 * these may be huge pages for RAMBlocks that are backed by
4131 * hugetlbfs.
a7180877
DDAG
4132 * To make it atomic, the data is read into a temporary page
4133 * that's moved into place later.
4134 * The migration protocol uses, possibly smaller, target-pages
4135 * however the source ensures it always sends all the components
91ba442f 4136 * of a host page in one chunk.
a7180877
DDAG
4137 */
4138 page_buffer = postcopy_host_page +
28abd200 4139 ((uintptr_t)host & (block->page_size - 1));
a7180877 4140 /* If all TP are zero then we can optimise the place */
e5e73b0f 4141 if (target_pages == 1) {
a7180877 4142 all_zero = true;
91ba442f
WY
4143 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
4144 block->page_size);
c53b7ddc
DDAG
4145 } else {
4146 /* not the 1st TP within the HP */
91ba442f
WY
4147 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
4148 (uintptr_t)this_host) {
4149 error_report("Non-same host page %p/%p",
4150 host, this_host);
c53b7ddc
DDAG
4151 ret = -EINVAL;
4152 break;
4153 }
a7180877
DDAG
4154 }
4155
4156 /*
4157 * If it's the last part of a host page then we place the host
4158 * page
4159 */
4cbb3c63
WY
4160 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
4161 place_needed = true;
4162 target_pages = 0;
4163 }
a7180877
DDAG
4164 place_source = postcopy_host_page;
4165 }
4166
4167 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4168 case RAM_SAVE_FLAG_ZERO:
a7180877 4169 ch = qemu_get_byte(f);
2e36bc1b
WY
4170 /*
4171 * Can skip to set page_buffer when
4172 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4173 */
4174 if (ch || !matches_target_page_size) {
4175 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4176 }
a7180877
DDAG
4177 if (ch) {
4178 all_zero = false;
4179 }
4180 break;
4181
4182 case RAM_SAVE_FLAG_PAGE:
4183 all_zero = false;
1aa83678
PX
4184 if (!matches_target_page_size) {
4185 /* For huge pages, we always use temporary buffer */
a7180877
DDAG
4186 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4187 } else {
1aa83678
PX
4188 /*
4189 * For small pages that matches target page size, we
4190 * avoid the qemu_file copy. Instead we directly use
4191 * the buffer of QEMUFile to place the page. Note: we
4192 * cannot do any QEMUFile operation before using that
4193 * buffer to make sure the buffer is valid when
4194 * placing the page.
a7180877
DDAG
4195 */
4196 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4197 TARGET_PAGE_SIZE);
4198 }
4199 break;
644acf99
WY
4200 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4201 all_zero = false;
4202 len = qemu_get_be32(f);
4203 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4204 error_report("Invalid compressed data length: %d", len);
4205 ret = -EINVAL;
4206 break;
4207 }
4208 decompress_data_with_multi_threads(f, page_buffer, len);
4209 break;
4210
a7180877
DDAG
4211 case RAM_SAVE_FLAG_EOS:
4212 /* normal exit */
6df264ac 4213 multifd_recv_sync_main();
a7180877
DDAG
4214 break;
4215 default:
4216 error_report("Unknown combination of migration flags: %#x"
4217 " (postcopy mode)", flags);
4218 ret = -EINVAL;
7a9ddfbf
PX
4219 break;
4220 }
4221
644acf99
WY
4222 /* Got the whole host page, wait for decompress before placing. */
4223 if (place_needed) {
4224 ret |= wait_for_decompress_done();
4225 }
4226
7a9ddfbf
PX
4227 /* Detect for any possible file errors */
4228 if (!ret && qemu_file_get_error(f)) {
4229 ret = qemu_file_get_error(f);
a7180877
DDAG
4230 }
4231
7a9ddfbf 4232 if (!ret && place_needed) {
a7180877 4233 /* This gets called at the last target page in the host page */
91ba442f
WY
4234 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
4235 block->page_size);
df9ff5e1 4236
a7180877 4237 if (all_zero) {
df9ff5e1 4238 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 4239 block);
a7180877 4240 } else {
df9ff5e1 4241 ret = postcopy_place_page(mis, place_dest,
8be4620b 4242 place_source, block);
a7180877
DDAG
4243 }
4244 }
a7180877
DDAG
4245 }
4246
4247 return ret;
4248}
4249
acab30b8
DHB
4250static bool postcopy_is_advised(void)
4251{
4252 PostcopyState ps = postcopy_state_get();
4253 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4254}
4255
4256static bool postcopy_is_running(void)
4257{
4258 PostcopyState ps = postcopy_state_get();
4259 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4260}
4261
e6f4aa18
ZC
4262/*
4263 * Flush content of RAM cache into SVM's memory.
4264 * Only flush the pages that be dirtied by PVM or SVM or both.
4265 */
4266static void colo_flush_ram_cache(void)
4267{
4268 RAMBlock *block = NULL;
4269 void *dst_host;
4270 void *src_host;
4271 unsigned long offset = 0;
4272
d1955d22 4273 memory_global_dirty_log_sync();
89ac5a1d
DDAG
4274 WITH_RCU_READ_LOCK_GUARD() {
4275 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4276 ramblock_sync_dirty_bitmap(ram_state, block);
4277 }
d1955d22 4278 }
d1955d22 4279
e6f4aa18 4280 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
89ac5a1d
DDAG
4281 WITH_RCU_READ_LOCK_GUARD() {
4282 block = QLIST_FIRST_RCU(&ram_list.blocks);
e6f4aa18 4283
89ac5a1d
DDAG
4284 while (block) {
4285 offset = migration_bitmap_find_dirty(ram_state, block, offset);
e6f4aa18 4286
8bba004c
AR
4287 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
4288 >= block->used_length) {
89ac5a1d
DDAG
4289 offset = 0;
4290 block = QLIST_NEXT_RCU(block, next);
4291 } else {
4292 migration_bitmap_clear_dirty(ram_state, block, offset);
8bba004c
AR
4293 dst_host = block->host
4294 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4295 src_host = block->colo_cache
4296 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
89ac5a1d
DDAG
4297 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4298 }
e6f4aa18
ZC
4299 }
4300 }
e6f4aa18
ZC
4301 trace_colo_flush_ram_cache_end();
4302}
4303
10da4a36
WY
4304/**
4305 * ram_load_precopy: load pages in precopy case
4306 *
4307 * Returns 0 for success or -errno in case of error
4308 *
4309 * Called in precopy mode by ram_load().
4310 * rcu_read_lock is taken prior to this being called.
4311 *
4312 * @f: QEMUFile where to send the data
4313 */
4314static int ram_load_precopy(QEMUFile *f)
56e93d26 4315{
e65cec5e 4316 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
ef08fb38 4317 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 4318 bool postcopy_advised = postcopy_is_advised();
edc60127
JQ
4319 if (!migrate_use_compression()) {
4320 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4321 }
a7180877 4322
10da4a36 4323 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4324 ram_addr_t addr, total_ram_bytes;
a776aa15 4325 void *host = NULL;
56e93d26
JQ
4326 uint8_t ch;
4327
e65cec5e
YK
4328 /*
4329 * Yield periodically to let main loop run, but an iteration of
4330 * the main loop is expensive, so do it each some iterations
4331 */
4332 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4333 aio_co_schedule(qemu_get_current_aio_context(),
4334 qemu_coroutine_self());
4335 qemu_coroutine_yield();
4336 }
4337 i++;
4338
56e93d26
JQ
4339 addr = qemu_get_be64(f);
4340 flags = addr & ~TARGET_PAGE_MASK;
4341 addr &= TARGET_PAGE_MASK;
4342
edc60127
JQ
4343 if (flags & invalid_flags) {
4344 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4345 error_report("Received an unexpected compressed page");
4346 }
4347
4348 ret = -EINVAL;
4349 break;
4350 }
4351
bb890ed5 4352 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4353 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
4354 RAMBlock *block = ram_block_from_stream(f, flags);
4355
13af18f2
ZC
4356 /*
4357 * After going into COLO, we should load the Page into colo_cache.
4358 */
4359 if (migration_incoming_in_colo_state()) {
4360 host = colo_cache_from_block_offset(block, addr);
4361 } else {
4362 host = host_from_ram_block_offset(block, addr);
4363 }
a776aa15
DDAG
4364 if (!host) {
4365 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4366 ret = -EINVAL;
4367 break;
4368 }
13af18f2
ZC
4369
4370 if (!migration_incoming_in_colo_state()) {
4371 ramblock_recv_bitmap_set(block, host);
4372 }
4373
1db9d8e5 4374 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4375 }
4376
56e93d26
JQ
4377 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4378 case RAM_SAVE_FLAG_MEM_SIZE:
4379 /* Synchronize RAM block list */
4380 total_ram_bytes = addr;
4381 while (!ret && total_ram_bytes) {
4382 RAMBlock *block;
56e93d26
JQ
4383 char id[256];
4384 ram_addr_t length;
4385
4386 len = qemu_get_byte(f);
4387 qemu_get_buffer(f, (uint8_t *)id, len);
4388 id[len] = 0;
4389 length = qemu_get_be64(f);
4390
e3dd7493 4391 block = qemu_ram_block_by_name(id);
b895de50
CLG
4392 if (block && !qemu_ram_is_migratable(block)) {
4393 error_report("block %s should not be migrated !", id);
4394 ret = -EINVAL;
4395 } else if (block) {
e3dd7493
DDAG
4396 if (length != block->used_length) {
4397 Error *local_err = NULL;
56e93d26 4398
fa53a0e5 4399 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4400 &local_err);
4401 if (local_err) {
4402 error_report_err(local_err);
56e93d26 4403 }
56e93d26 4404 }
ef08fb38
DDAG
4405 /* For postcopy we need to check hugepage sizes match */
4406 if (postcopy_advised &&
4407 block->page_size != qemu_host_page_size) {
4408 uint64_t remote_page_size = qemu_get_be64(f);
4409 if (remote_page_size != block->page_size) {
4410 error_report("Mismatched RAM page size %s "
4411 "(local) %zd != %" PRId64,
4412 id, block->page_size,
4413 remote_page_size);
4414 ret = -EINVAL;
4415 }
4416 }
fbd162e6
YK
4417 if (migrate_ignore_shared()) {
4418 hwaddr addr = qemu_get_be64(f);
fbd162e6
YK
4419 if (ramblock_is_ignored(block) &&
4420 block->mr->addr != addr) {
4421 error_report("Mismatched GPAs for block %s "
4422 "%" PRId64 "!= %" PRId64,
4423 id, (uint64_t)addr,
4424 (uint64_t)block->mr->addr);
4425 ret = -EINVAL;
4426 }
4427 }
e3dd7493
DDAG
4428 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4429 block->idstr);
4430 } else {
56e93d26
JQ
4431 error_report("Unknown ramblock \"%s\", cannot "
4432 "accept migration", id);
4433 ret = -EINVAL;
4434 }
4435
4436 total_ram_bytes -= length;
4437 }
4438 break;
a776aa15 4439
bb890ed5 4440 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4441 ch = qemu_get_byte(f);
4442 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4443 break;
a776aa15 4444
56e93d26 4445 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4446 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4447 break;
56e93d26 4448
a776aa15 4449 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4450 len = qemu_get_be32(f);
4451 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4452 error_report("Invalid compressed data length: %d", len);
4453 ret = -EINVAL;
4454 break;
4455 }
c1bc6626 4456 decompress_data_with_multi_threads(f, host, len);
56e93d26 4457 break;
a776aa15 4458
56e93d26 4459 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4460 if (load_xbzrle(f, addr, host) < 0) {
4461 error_report("Failed to decompress XBZRLE page at "
4462 RAM_ADDR_FMT, addr);
4463 ret = -EINVAL;
4464 break;
4465 }
4466 break;
4467 case RAM_SAVE_FLAG_EOS:
4468 /* normal exit */
6df264ac 4469 multifd_recv_sync_main();
56e93d26
JQ
4470 break;
4471 default:
4472 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4473 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
4474 } else {
4475 error_report("Unknown combination of migration flags: %#x",
4476 flags);
4477 ret = -EINVAL;
4478 }
4479 }
4480 if (!ret) {
4481 ret = qemu_file_get_error(f);
4482 }
4483 }
4484
ca1a6b70 4485 ret |= wait_for_decompress_done();
10da4a36
WY
4486 return ret;
4487}
4488
4489static int ram_load(QEMUFile *f, void *opaque, int version_id)
4490{
4491 int ret = 0;
4492 static uint64_t seq_iter;
4493 /*
4494 * If system is running in postcopy mode, page inserts to host memory must
4495 * be atomic
4496 */
4497 bool postcopy_running = postcopy_is_running();
4498
4499 seq_iter++;
4500
4501 if (version_id != 4) {
4502 return -EINVAL;
4503 }
4504
4505 /*
4506 * This RCU critical section can be very long running.
4507 * When RCU reclaims in the code start to become numerous,
4508 * it will be necessary to reduce the granularity of this
4509 * critical section.
4510 */
89ac5a1d
DDAG
4511 WITH_RCU_READ_LOCK_GUARD() {
4512 if (postcopy_running) {
4513 ret = ram_load_postcopy(f);
4514 } else {
4515 ret = ram_load_precopy(f);
4516 }
10da4a36 4517 }
55c4446b 4518 trace_ram_load_complete(ret, seq_iter);
e6f4aa18
ZC
4519
4520 if (!ret && migration_incoming_in_colo_state()) {
4521 colo_flush_ram_cache();
4522 }
56e93d26
JQ
4523 return ret;
4524}
4525
c6467627
VSO
4526static bool ram_has_postcopy(void *opaque)
4527{
469dd51b 4528 RAMBlock *rb;
fbd162e6 4529 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4530 if (ramblock_is_pmem(rb)) {
4531 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4532 "is not supported now!", rb->idstr, rb->host);
4533 return false;
4534 }
4535 }
4536
c6467627
VSO
4537 return migrate_postcopy_ram();
4538}
4539
edd090c7
PX
4540/* Sync all the dirty bitmap with destination VM. */
4541static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4542{
4543 RAMBlock *block;
4544 QEMUFile *file = s->to_dst_file;
4545 int ramblock_count = 0;
4546
4547 trace_ram_dirty_bitmap_sync_start();
4548
fbd162e6 4549 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4550 qemu_savevm_send_recv_bitmap(file, block->idstr);
4551 trace_ram_dirty_bitmap_request(block->idstr);
4552 ramblock_count++;
4553 }
4554
4555 trace_ram_dirty_bitmap_sync_wait();
4556
4557 /* Wait until all the ramblocks' dirty bitmap synced */
4558 while (ramblock_count--) {
4559 qemu_sem_wait(&s->rp_state.rp_sem);
4560 }
4561
4562 trace_ram_dirty_bitmap_sync_complete();
4563
4564 return 0;
4565}
4566
4567static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4568{
4569 qemu_sem_post(&s->rp_state.rp_sem);
4570}
4571
a335debb
PX
4572/*
4573 * Read the received bitmap, revert it as the initial dirty bitmap.
4574 * This is only used when the postcopy migration is paused but wants
4575 * to resume from a middle point.
4576 */
4577int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4578{
4579 int ret = -EINVAL;
4580 QEMUFile *file = s->rp_state.from_dst_file;
4581 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4582 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4583 uint64_t size, end_mark;
4584
4585 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4586
4587 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4588 error_report("%s: incorrect state %s", __func__,
4589 MigrationStatus_str(s->state));
4590 return -EINVAL;
4591 }
4592
4593 /*
4594 * Note: see comments in ramblock_recv_bitmap_send() on why we
4595 * need the endianess convertion, and the paddings.
4596 */
4597 local_size = ROUND_UP(local_size, 8);
4598
4599 /* Add paddings */
4600 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4601
4602 size = qemu_get_be64(file);
4603
4604 /* The size of the bitmap should match with our ramblock */
4605 if (size != local_size) {
4606 error_report("%s: ramblock '%s' bitmap size mismatch "
4607 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4608 block->idstr, size, local_size);
4609 ret = -EINVAL;
4610 goto out;
4611 }
4612
4613 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4614 end_mark = qemu_get_be64(file);
4615
4616 ret = qemu_file_get_error(file);
4617 if (ret || size != local_size) {
4618 error_report("%s: read bitmap failed for ramblock '%s': %d"
4619 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4620 __func__, block->idstr, ret, local_size, size);
4621 ret = -EIO;
4622 goto out;
4623 }
4624
4625 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4626 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4627 __func__, block->idstr, end_mark);
4628 ret = -EINVAL;
4629 goto out;
4630 }
4631
4632 /*
4633 * Endianess convertion. We are during postcopy (though paused).
4634 * The dirty bitmap won't change. We can directly modify it.
4635 */
4636 bitmap_from_le(block->bmap, le_bitmap, nbits);
4637
4638 /*
4639 * What we received is "received bitmap". Revert it as the initial
4640 * dirty bitmap for this ramblock.
4641 */
4642 bitmap_complement(block->bmap, block->bmap, nbits);
4643
4644 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4645
edd090c7
PX
4646 /*
4647 * We succeeded to sync bitmap for current ramblock. If this is
4648 * the last one to sync, we need to notify the main send thread.
4649 */
4650 ram_dirty_bitmap_reload_notify(s);
4651
a335debb
PX
4652 ret = 0;
4653out:
bf269906 4654 g_free(le_bitmap);
a335debb
PX
4655 return ret;
4656}
4657
edd090c7
PX
4658static int ram_resume_prepare(MigrationState *s, void *opaque)
4659{
4660 RAMState *rs = *(RAMState **)opaque;
08614f34 4661 int ret;
edd090c7 4662
08614f34
PX
4663 ret = ram_dirty_bitmap_sync_all(s, rs);
4664 if (ret) {
4665 return ret;
4666 }
4667
4668 ram_state_resume_prepare(rs, s->to_dst_file);
4669
4670 return 0;
edd090c7
PX
4671}
4672
56e93d26 4673static SaveVMHandlers savevm_ram_handlers = {
9907e842 4674 .save_setup = ram_save_setup,
56e93d26 4675 .save_live_iterate = ram_save_iterate,
763c906b 4676 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4677 .save_live_complete_precopy = ram_save_complete,
c6467627 4678 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4679 .save_live_pending = ram_save_pending,
4680 .load_state = ram_load,
f265e0e4
JQ
4681 .save_cleanup = ram_save_cleanup,
4682 .load_setup = ram_load_setup,
4683 .load_cleanup = ram_load_cleanup,
edd090c7 4684 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4685};
4686
4687void ram_mig_init(void)
4688{
4689 qemu_mutex_init(&XBZRLE.lock);
ce62df53 4690 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 4691}
This page took 1.070927 seconds and 4 git commands to generate.