56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <[email protected]>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
33c11879 30#include "cpu.h"
56e93d26 31#include <zlib.h>
f348b6d1 32#include "qemu/cutils.h"
56e93d26
JQ
33#include "qemu/bitops.h"
34#include "qemu/bitmap.h"
7205c9ec 35#include "qemu/main-loop.h"
56eb90af 36#include "qemu/pmem.h"
709e3fe8 37#include "xbzrle.h"
7b1e1a22 38#include "ram.h"
6666c96a 39#include "migration.h"
71bb07db 40#include "socket.h"
f2a8f0a6 41#include "migration/register.h"
7b1e1a22 42#include "migration/misc.h"
08a0aee1 43#include "qemu-file.h"
be07b0ac 44#include "postcopy-ram.h"
53d37d36 45#include "page_cache.h"
56e93d26 46#include "qemu/error-report.h"
e688df6b 47#include "qapi/error.h"
9af23989 48#include "qapi/qapi-events-migration.h"
8acabf69 49#include "qapi/qmp/qerror.h"
56e93d26 50#include "trace.h"
56e93d26 51#include "exec/ram_addr.h"
f9494614 52#include "exec/target_page.h"
56e93d26 53#include "qemu/rcu_queue.h"
a91246c9 54#include "migration/colo.h"
53d37d36 55#include "block.h"
af8b7d2b
JQ
56#include "sysemu/sysemu.h"
57#include "qemu/uuid.h"
edd090c7 58#include "savevm.h"
b9ee2f7d 59#include "qemu/iov.h"
56e93d26 60
56e93d26
JQ
61/***********************************************************/
62/* ram save/restore */
63
bb890ed5
JQ
64/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, which
 65 * worked for pages that were filled with the same char. We switched
 66 * it to only search for the zero value. It was renamed to avoid
 67 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 68 */
69
56e93d26 70#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 71#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
72#define RAM_SAVE_FLAG_MEM_SIZE 0x04
73#define RAM_SAVE_FLAG_PAGE 0x08
74#define RAM_SAVE_FLAG_EOS 0x10
75#define RAM_SAVE_FLAG_CONTINUE 0x20
76#define RAM_SAVE_FLAG_XBZRLE 0x40
77/* 0x80 is reserved in migration.h; start with 0x100 next */
78#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
79
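/*
 * Illustrative sketch (not part of the original file): the RAM_SAVE_FLAG_*
 * values above are OR'ed into the low bits of the page offset that
 * save_page_header() writes to the stream; this works because offsets are
 * always TARGET_PAGE_SIZE aligned.  A receiver can split the two like this
 * (hypothetical helper name):
 */
static inline void example_split_wire_addr(uint64_t addr, ram_addr_t *offset,
                                           unsigned int *flags)
{
    *flags = addr & ~TARGET_PAGE_MASK;   /* low bits: RAM_SAVE_FLAG_* */
    *offset = addr & TARGET_PAGE_MASK;   /* page-aligned offset in the block */
}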
56e93d26
JQ
80static inline bool is_zero_range(uint8_t *p, uint64_t size)
81{
a1febc49 82 return buffer_is_zero(p, size);
56e93d26
JQ
83}
84
9360447d
JQ
85XBZRLECacheStats xbzrle_counters;
86
56e93d26
JQ
87/* struct containing the XBZRLE cache and a static page
 88 used by the compression */
89static struct {
90 /* buffer used for XBZRLE encoding */
91 uint8_t *encoded_buf;
92 /* buffer for storing page content */
93 uint8_t *current_buf;
94 /* Cache for XBZRLE, Protected by lock. */
95 PageCache *cache;
96 QemuMutex lock;
c00e0928
JQ
97 /* it will store a page full of zeros */
98 uint8_t *zero_target_page;
f265e0e4
JQ
99 /* buffer used for XBZRLE decoding */
100 uint8_t *decoded_buf;
56e93d26
JQ
101} XBZRLE;
102
56e93d26
JQ
103static void XBZRLE_cache_lock(void)
104{
105 if (migrate_use_xbzrle())
106 qemu_mutex_lock(&XBZRLE.lock);
107}
108
109static void XBZRLE_cache_unlock(void)
110{
111 if (migrate_use_xbzrle())
112 qemu_mutex_unlock(&XBZRLE.lock);
113}
114
3d0684b2
JQ
115/**
116 * xbzrle_cache_resize: resize the xbzrle cache
117 *
118 * This function is called from qmp_migrate_set_cache_size in main
119 * thread, possibly while a migration is in progress. A running
120 * migration may be using the cache and might finish during this call,
121 * hence changes to the cache are protected by XBZRLE.lock().
122 *
c9dede2d 123 * Returns 0 for success or -1 for error
3d0684b2
JQ
124 *
125 * @new_size: new cache size
8acabf69 126 * @errp: set *errp if the check failed, with reason
56e93d26 127 */
c9dede2d 128int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
129{
130 PageCache *new_cache;
c9dede2d 131 int64_t ret = 0;
56e93d26 132
8acabf69
JQ
133 /* Check for truncation */
134 if (new_size != (size_t)new_size) {
135 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136 "exceeding address space");
137 return -1;
138 }
139
2a313e5c
JQ
140 if (new_size == migrate_xbzrle_cache_size()) {
141 /* nothing to do */
c9dede2d 142 return 0;
2a313e5c
JQ
143 }
144
56e93d26
JQ
145 XBZRLE_cache_lock();
146
147 if (XBZRLE.cache != NULL) {
80f8dfde 148 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 149 if (!new_cache) {
56e93d26
JQ
150 ret = -1;
151 goto out;
152 }
153
154 cache_fini(XBZRLE.cache);
155 XBZRLE.cache = new_cache;
156 }
56e93d26
JQ
157out:
158 XBZRLE_cache_unlock();
159 return ret;
160}
161
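/*
 * Usage sketch (not part of the original file): a caller, e.g. a QMP
 * handler, is expected to pass an Error pointer and report it on failure.
 * The size below is only an example value:
 *
 *     Error *err = NULL;
 *
 *     if (xbzrle_cache_resize(64 * 1024 * 1024, &err) < 0) {
 *         error_report_err(err);
 *     }
 */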
fbd162e6
YK
162static bool ramblock_is_ignored(RAMBlock *block)
163{
164 return !qemu_ram_is_migratable(block) ||
165 (migrate_ignore_shared() && qemu_ram_is_shared(block));
166}
167
b895de50 168/* Should be holding either ram_list.mutex, or the RCU lock. */
fbd162e6
YK
169#define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
170 INTERNAL_RAMBLOCK_FOREACH(block) \
171 if (ramblock_is_ignored(block)) {} else
172
b895de50 173#define RAMBLOCK_FOREACH_MIGRATABLE(block) \
343f632c 174 INTERNAL_RAMBLOCK_FOREACH(block) \
b895de50
CLG
175 if (!qemu_ram_is_migratable(block)) {} else
176
343f632c
DDAG
177#undef RAMBLOCK_FOREACH
178
fbd162e6
YK
179int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180{
181 RAMBlock *block;
182 int ret = 0;
183
184 rcu_read_lock();
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
191 rcu_read_unlock();
192 return ret;
193}
194
f9494614
AP
195static void ramblock_recv_map_init(void)
196{
197 RAMBlock *rb;
198
fbd162e6 199 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
200 assert(!rb->receivedmap);
201 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202 }
203}
204
205int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206{
207 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208 rb->receivedmap);
209}
210
1cba9f6e
DDAG
211bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212{
213 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214}
215
f9494614
AP
216void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217{
218 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219}
220
221void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222 size_t nr)
223{
224 bitmap_set_atomic(rb->receivedmap,
225 ramblock_recv_bitmap_offset(host_addr, rb),
226 nr);
227}
228
a335debb
PX
229#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
230
231/*
232 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233 *
234 * Returns the number of bytes sent (>0) on success, or <0 on error.
235 */
236int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237 const char *block_name)
238{
239 RAMBlock *block = qemu_ram_block_by_name(block_name);
240 unsigned long *le_bitmap, nbits;
241 uint64_t size;
242
243 if (!block) {
244 error_report("%s: invalid block name: %s", __func__, block_name);
245 return -1;
246 }
247
248 nbits = block->used_length >> TARGET_PAGE_BITS;
249
250 /*
 251 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 252 * machines we may need 4 more bytes for padding (see below
 253 * comment). So extend it a bit beforehand.
 254 */
255 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256
257 /*
 258 * Always use little endian when sending the bitmap, so that
 259 * migration works even when the source and destination VMs
 260 * do not use the same endianness. (Note: big endian won't work.)
 261 */
262 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263
264 /* Size of the bitmap, in bytes */
a725ef9f 265 size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
266
267 /*
268 * size is always aligned to 8 bytes for 64bit machines, but it
269 * may not be true for 32bit machines. We need this padding to
270 * make sure the migration can survive even between 32bit and
271 * 64bit machines.
272 */
273 size = ROUND_UP(size, 8);
274
275 qemu_put_be64(file, size);
276 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277 /*
 278 * Mark as an end, in case the middle part is screwed up due to
 279 * some "mysterious" reason.
 280 */
281 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282 qemu_fflush(file);
283
bf269906 284 g_free(le_bitmap);
a335debb
PX
285
286 if (qemu_file_get_error(file)) {
287 return qemu_file_get_error(file);
288 }
289
290 return size + sizeof(size);
291}
292
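/*
 * Illustrative layout (not part of the original file) of the stream that
 * ramblock_recv_bitmap_send() produces for a block of nbits target pages:
 *
 *     be64  size                      (DIV_ROUND_UP(nbits, 8), rounded up to 8)
 *     size  bytes of little-endian bitmap
 *     be64  RAMBLOCK_RECV_BITMAP_ENDING
 *
 * For example, a 1 GiB block with 4 KiB target pages has nbits = 262144,
 * so size = 32768 and the function returns 32768 + 8 (the trailing ending
 * marker is not counted in the return value).
 */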
ec481c6c
JQ
293/*
294 * An outstanding page request, on the source, having been received
295 * and queued
296 */
297struct RAMSrcPageRequest {
298 RAMBlock *rb;
299 hwaddr offset;
300 hwaddr len;
301
302 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303};
304
6f37bb8b
JQ
305/* State of RAM for migration */
306struct RAMState {
204b88b8
JQ
307 /* QEMUFile used for this migration */
308 QEMUFile *f;
6f37bb8b
JQ
309 /* Last block that we have visited searching for dirty pages */
310 RAMBlock *last_seen_block;
311 /* Last block from where we have sent data */
312 RAMBlock *last_sent_block;
269ace29
JQ
313 /* Last dirty target page we have sent */
314 ram_addr_t last_page;
6f37bb8b
JQ
315 /* last ram version we have seen */
316 uint32_t last_version;
317 /* We are in the first round */
318 bool ram_bulk_stage;
8d820d6f
JQ
319 /* How many times we have dirty too many pages */
320 int dirty_rate_high_cnt;
f664da80
JQ
321 /* these variables are used for bitmap sync */
322 /* last time we did a full bitmap_sync */
323 int64_t time_last_bitmap_sync;
eac74159 324 /* bytes transferred at start_time */
c4bdf0cf 325 uint64_t bytes_xfer_prev;
a66cd90c 326 /* number of dirty pages since start_time */
68908ed6 327 uint64_t num_dirty_pages_period;
b5833fde
JQ
328 /* xbzrle misses since the beginning of the period */
329 uint64_t xbzrle_cache_miss_prev;
76e03000
XG
330
331 /* compression statistics since the beginning of the period */
332 /* count of times that no free thread was available to compress data */
 333 uint64_t compress_thread_busy_prev;
 334 /* amount of bytes after compression */
 335 uint64_t compressed_size_prev;
 336 /* number of compressed pages */
337 uint64_t compress_pages_prev;
338
be8b02ed
XG
339 /* total handled target pages at the beginning of period */
340 uint64_t target_page_count_prev;
341 /* total handled target pages since start */
342 uint64_t target_page_count;
9360447d 343 /* number of dirty bits in the bitmap */
2dfaf12e 344 uint64_t migration_dirty_pages;
386a907b 345 /* Protects modification of the bitmap and migration dirty pages */
108cfae0 346 QemuMutex bitmap_mutex;
68a098f3
JQ
347 /* The RAMBlock used in the last src_page_requests */
348 RAMBlock *last_req_rb;
ec481c6c
JQ
349 /* Queue of outstanding page requests from the destination */
350 QemuMutex src_page_req_mutex;
b58deb34 351 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
352};
353typedef struct RAMState RAMState;
354
53518d94 355static RAMState *ram_state;
6f37bb8b 356
9edabd4d 357uint64_t ram_bytes_remaining(void)
2f4fde93 358{
bae416e5
DDAG
359 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
360 0;
2f4fde93
JQ
361}
362
9360447d 363MigrationStats ram_counters;
96506894 364
b8fb8cb7
DDAG
365/* used by the search for pages to send */
366struct PageSearchStatus {
367 /* Current block being searched */
368 RAMBlock *block;
a935e30f
JQ
369 /* Current page to search from */
370 unsigned long page;
b8fb8cb7
DDAG
371 /* Set once we wrap around */
372 bool complete_round;
373};
374typedef struct PageSearchStatus PageSearchStatus;
375
76e03000
XG
376CompressionStats compression_counters;
377
56e93d26 378struct CompressParam {
56e93d26 379 bool done;
90e56fb4 380 bool quit;
5e5fdcff 381 bool zero_page;
56e93d26
JQ
382 QEMUFile *file;
383 QemuMutex mutex;
384 QemuCond cond;
385 RAMBlock *block;
386 ram_addr_t offset;
34ab9e97
XG
387
388 /* internally used fields */
dcaf446e 389 z_stream stream;
34ab9e97 390 uint8_t *originbuf;
56e93d26
JQ
391};
392typedef struct CompressParam CompressParam;
393
394struct DecompressParam {
73a8912b 395 bool done;
90e56fb4 396 bool quit;
56e93d26
JQ
397 QemuMutex mutex;
398 QemuCond cond;
399 void *des;
d341d9f3 400 uint8_t *compbuf;
56e93d26 401 int len;
797ca154 402 z_stream stream;
56e93d26
JQ
403};
404typedef struct DecompressParam DecompressParam;
405
406static CompressParam *comp_param;
407static QemuThread *compress_threads;
408/* comp_done_cond is used to wake up the migration thread when
 409 * one of the compression threads has finished the compression.
 410 * comp_done_lock is used together with comp_done_cond.
 411 */
0d9f9a5c
LL
412static QemuMutex comp_done_lock;
413static QemuCond comp_done_cond;
56e93d26
JQ
414/* The empty QEMUFileOps will be used by file in CompressParam */
415static const QEMUFileOps empty_ops = { };
416
34ab9e97 417static QEMUFile *decomp_file;
56e93d26
JQ
418static DecompressParam *decomp_param;
419static QemuThread *decompress_threads;
73a8912b
LL
420static QemuMutex decomp_done_lock;
421static QemuCond decomp_done_cond;
56e93d26 422
5e5fdcff 423static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 424 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
425
426static void *do_data_compress(void *opaque)
427{
428 CompressParam *param = opaque;
a7a9a88f
LL
429 RAMBlock *block;
430 ram_addr_t offset;
5e5fdcff 431 bool zero_page;
56e93d26 432
a7a9a88f 433 qemu_mutex_lock(&param->mutex);
90e56fb4 434 while (!param->quit) {
a7a9a88f
LL
435 if (param->block) {
436 block = param->block;
437 offset = param->offset;
438 param->block = NULL;
439 qemu_mutex_unlock(&param->mutex);
440
5e5fdcff
XG
441 zero_page = do_compress_ram_page(param->file, &param->stream,
442 block, offset, param->originbuf);
a7a9a88f 443
0d9f9a5c 444 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 445 param->done = true;
5e5fdcff 446 param->zero_page = zero_page;
0d9f9a5c
LL
447 qemu_cond_signal(&comp_done_cond);
448 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
449
450 qemu_mutex_lock(&param->mutex);
451 } else {
56e93d26
JQ
452 qemu_cond_wait(&param->cond, &param->mutex);
453 }
56e93d26 454 }
a7a9a88f 455 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
456
457 return NULL;
458}
459
f0afa331 460static void compress_threads_save_cleanup(void)
56e93d26
JQ
461{
462 int i, thread_count;
463
05306935 464 if (!migrate_use_compression() || !comp_param) {
56e93d26
JQ
465 return;
466 }
05306935 467
56e93d26
JQ
468 thread_count = migrate_compress_threads();
469 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
470 /*
 471 * we use it as an indicator of whether the thread is
 472 * properly initialized or not
 473 */
474 if (!comp_param[i].file) {
475 break;
476 }
05306935
FL
477
478 qemu_mutex_lock(&comp_param[i].mutex);
479 comp_param[i].quit = true;
480 qemu_cond_signal(&comp_param[i].cond);
481 qemu_mutex_unlock(&comp_param[i].mutex);
482
56e93d26 483 qemu_thread_join(compress_threads + i);
56e93d26
JQ
484 qemu_mutex_destroy(&comp_param[i].mutex);
485 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 486 deflateEnd(&comp_param[i].stream);
34ab9e97 487 g_free(comp_param[i].originbuf);
dcaf446e
XG
488 qemu_fclose(comp_param[i].file);
489 comp_param[i].file = NULL;
56e93d26 490 }
0d9f9a5c
LL
491 qemu_mutex_destroy(&comp_done_lock);
492 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
493 g_free(compress_threads);
494 g_free(comp_param);
56e93d26
JQ
495 compress_threads = NULL;
496 comp_param = NULL;
56e93d26
JQ
497}
498
dcaf446e 499static int compress_threads_save_setup(void)
56e93d26
JQ
500{
501 int i, thread_count;
502
503 if (!migrate_use_compression()) {
dcaf446e 504 return 0;
56e93d26 505 }
56e93d26
JQ
506 thread_count = migrate_compress_threads();
507 compress_threads = g_new0(QemuThread, thread_count);
508 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
509 qemu_cond_init(&comp_done_cond);
510 qemu_mutex_init(&comp_done_lock);
56e93d26 511 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
512 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
513 if (!comp_param[i].originbuf) {
514 goto exit;
515 }
516
dcaf446e
XG
517 if (deflateInit(&comp_param[i].stream,
518 migrate_compress_level()) != Z_OK) {
34ab9e97 519 g_free(comp_param[i].originbuf);
dcaf446e
XG
520 goto exit;
521 }
522
e110aa91
C
523 /* comp_param[i].file is just used as a dummy buffer to save data,
524 * set its ops to empty.
56e93d26
JQ
525 */
526 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
527 comp_param[i].done = true;
90e56fb4 528 comp_param[i].quit = false;
56e93d26
JQ
529 qemu_mutex_init(&comp_param[i].mutex);
530 qemu_cond_init(&comp_param[i].cond);
531 qemu_thread_create(compress_threads + i, "compress",
532 do_data_compress, comp_param + i,
533 QEMU_THREAD_JOINABLE);
534 }
dcaf446e
XG
535 return 0;
536
537exit:
538 compress_threads_save_cleanup();
539 return -1;
56e93d26
JQ
540}
541
f986c3d2
JQ
542/* Multiple fd's */
543
af8b7d2b
JQ
544#define MULTIFD_MAGIC 0x11223344U
545#define MULTIFD_VERSION 1
546
6df264ac
JQ
547#define MULTIFD_FLAG_SYNC (1 << 0)
548
af8b7d2b
JQ
549typedef struct {
550 uint32_t magic;
551 uint32_t version;
552 unsigned char uuid[16]; /* QemuUUID */
553 uint8_t id;
554} __attribute__((packed)) MultiFDInit_t;
555
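/*
 * Note (not part of the original file): because of the packed attribute this
 * handshake message is exactly 4 + 4 + 16 + 1 = 25 bytes.  Every channel
 * sends one of these first, so the destination can verify the connection
 * belongs to the same VM (uuid) and map it to a channel id.
 */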
2a26c979
JQ
556typedef struct {
557 uint32_t magic;
558 uint32_t version;
559 uint32_t flags;
560 uint32_t size;
561 uint32_t used;
562 uint64_t packet_num;
563 char ramblock[256];
564 uint64_t offset[];
565} __attribute__((packed)) MultiFDPacket_t;
566
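/*
 * Sizing sketch (not part of the original file): the packet carries one
 * be64 offset per page of a send batch, so both sides allocate
 *
 *     packet_len = sizeof(MultiFDPacket_t)
 *                + sizeof(ram_addr_t) * migrate_multifd_page_count();
 *
 * which is how multifd_save_setup() and multifd_load_setup() size p->packet.
 */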
34c55a94
JQ
567typedef struct {
568 /* number of used pages */
569 uint32_t used;
570 /* number of allocated pages */
571 uint32_t allocated;
572 /* global number of generated multifd packets */
573 uint64_t packet_num;
574 /* offset of each page */
575 ram_addr_t *offset;
576 /* pointer to each page */
577 struct iovec *iov;
578 RAMBlock *block;
579} MultiFDPages_t;
580
8c4598f2
JQ
581typedef struct {
582 /* these fields are not changed once the thread is created */
583 /* channel number */
f986c3d2 584 uint8_t id;
8c4598f2 585 /* channel thread name */
f986c3d2 586 char *name;
8c4598f2 587 /* channel thread id */
f986c3d2 588 QemuThread thread;
8c4598f2 589 /* communication channel */
60df2d4a 590 QIOChannel *c;
8c4598f2 591 /* sem where to wait for more work */
f986c3d2 592 QemuSemaphore sem;
8c4598f2 593 /* this mutex protects the following parameters */
f986c3d2 594 QemuMutex mutex;
8c4598f2 595 /* is this channel thread running */
66770707 596 bool running;
8c4598f2 597 /* should this thread finish */
f986c3d2 598 bool quit;
0beb5ed3
JQ
599 /* thread has work to do */
600 int pending_job;
34c55a94
JQ
601 /* array of pages to send */
602 MultiFDPages_t *pages;
2a26c979
JQ
603 /* packet allocated len */
604 uint32_t packet_len;
605 /* pointer to the packet */
606 MultiFDPacket_t *packet;
607 /* multifd flags for each packet */
608 uint32_t flags;
609 /* global number of generated multifd packets */
610 uint64_t packet_num;
408ea6ae
JQ
611 /* thread local variables */
612 /* packets sent through this channel */
613 uint64_t num_packets;
614 /* pages sent through this channel */
615 uint64_t num_pages;
6df264ac
JQ
616 /* syncs main thread and channels */
617 QemuSemaphore sem_sync;
8c4598f2
JQ
618} MultiFDSendParams;
619
620typedef struct {
621 /* these fields are not changed once the thread is created */
622 /* channel number */
623 uint8_t id;
624 /* channel thread name */
625 char *name;
626 /* channel thread id */
627 QemuThread thread;
628 /* communication channel */
629 QIOChannel *c;
8c4598f2
JQ
630 /* this mutex protects the following parameters */
631 QemuMutex mutex;
632 /* is this channel thread running */
633 bool running;
34c55a94
JQ
634 /* array of pages to receive */
635 MultiFDPages_t *pages;
2a26c979
JQ
636 /* packet allocated len */
637 uint32_t packet_len;
638 /* pointer to the packet */
639 MultiFDPacket_t *packet;
640 /* multifd flags for each packet */
641 uint32_t flags;
642 /* global number of generated multifd packets */
643 uint64_t packet_num;
408ea6ae
JQ
644 /* thread local variables */
645 /* packets received through this channel */
 646 uint64_t num_packets;
 647 /* pages received through this channel */
 648 uint64_t num_pages;
6df264ac
JQ
649 /* syncs main thread and channels */
650 QemuSemaphore sem_sync;
8c4598f2 651} MultiFDRecvParams;
f986c3d2 652
af8b7d2b
JQ
653static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
654{
655 MultiFDInit_t msg;
656 int ret;
657
658 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
659 msg.version = cpu_to_be32(MULTIFD_VERSION);
660 msg.id = p->id;
661 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
662
663 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
664 if (ret != 0) {
665 return -1;
666 }
667 return 0;
668}
669
670static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
671{
672 MultiFDInit_t msg;
673 int ret;
674
675 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
676 if (ret != 0) {
677 return -1;
678 }
679
341ba0df
PM
680 msg.magic = be32_to_cpu(msg.magic);
681 msg.version = be32_to_cpu(msg.version);
af8b7d2b
JQ
682
683 if (msg.magic != MULTIFD_MAGIC) {
684 error_setg(errp, "multifd: received packet magic %x "
685 "expected %x", msg.magic, MULTIFD_MAGIC);
686 return -1;
687 }
688
689 if (msg.version != MULTIFD_VERSION) {
690 error_setg(errp, "multifd: received packet version %d "
691 "expected %d", msg.version, MULTIFD_VERSION);
692 return -1;
693 }
694
695 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
696 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
697 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
698
699 error_setg(errp, "multifd: received uuid '%s' and expected "
700 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
701 g_free(uuid);
702 g_free(msg_uuid);
703 return -1;
704 }
705
706 if (msg.id > migrate_multifd_channels()) {
707 error_setg(errp, "multifd: received channel id %d "
 708 "expected a value below %d", msg.id, migrate_multifd_channels());
709 return -1;
710 }
711
712 return msg.id;
713}
714
34c55a94
JQ
715static MultiFDPages_t *multifd_pages_init(size_t size)
716{
717 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
718
719 pages->allocated = size;
720 pages->iov = g_new0(struct iovec, size);
721 pages->offset = g_new0(ram_addr_t, size);
722
723 return pages;
724}
725
726static void multifd_pages_clear(MultiFDPages_t *pages)
727{
728 pages->used = 0;
729 pages->allocated = 0;
730 pages->packet_num = 0;
731 pages->block = NULL;
732 g_free(pages->iov);
733 pages->iov = NULL;
734 g_free(pages->offset);
735 pages->offset = NULL;
736 g_free(pages);
737}
738
2a26c979
JQ
739static void multifd_send_fill_packet(MultiFDSendParams *p)
740{
741 MultiFDPacket_t *packet = p->packet;
742 int i;
743
744 packet->magic = cpu_to_be32(MULTIFD_MAGIC);
745 packet->version = cpu_to_be32(MULTIFD_VERSION);
746 packet->flags = cpu_to_be32(p->flags);
747 packet->size = cpu_to_be32(migrate_multifd_page_count());
748 packet->used = cpu_to_be32(p->pages->used);
749 packet->packet_num = cpu_to_be64(p->packet_num);
750
751 if (p->pages->block) {
752 strncpy(packet->ramblock, p->pages->block->idstr, 256);
753 }
754
755 for (i = 0; i < p->pages->used; i++) {
756 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
757 }
758}
759
760static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
761{
762 MultiFDPacket_t *packet = p->packet;
763 RAMBlock *block;
764 int i;
765
341ba0df 766 packet->magic = be32_to_cpu(packet->magic);
2a26c979
JQ
767 if (packet->magic != MULTIFD_MAGIC) {
768 error_setg(errp, "multifd: received packet "
769 "magic %x and expected magic %x",
770 packet->magic, MULTIFD_MAGIC);
771 return -1;
772 }
773
341ba0df 774 packet->version = be32_to_cpu(packet->version);
2a26c979
JQ
775 if (packet->version != MULTIFD_VERSION) {
776 error_setg(errp, "multifd: received packet "
777 "version %d and expected version %d",
778 packet->version, MULTIFD_VERSION);
779 return -1;
780 }
781
782 p->flags = be32_to_cpu(packet->flags);
783
341ba0df 784 packet->size = be32_to_cpu(packet->size);
2a26c979
JQ
785 if (packet->size > migrate_multifd_page_count()) {
786 error_setg(errp, "multifd: received packet "
787 "with size %d and expected maximum size %d",
788 packet->size, migrate_multifd_page_count());
789 return -1;
790 }
791
792 p->pages->used = be32_to_cpu(packet->used);
793 if (p->pages->used > packet->size) {
794 error_setg(errp, "multifd: received packet "
 795 "with %d pages and expected maximum of %d pages",
 796 p->pages->used, packet->size);
797 return -1;
798 }
799
800 p->packet_num = be64_to_cpu(packet->packet_num);
801
802 if (p->pages->used) {
803 /* make sure that ramblock is 0 terminated */
804 packet->ramblock[255] = 0;
805 block = qemu_ram_block_by_name(packet->ramblock);
806 if (!block) {
807 error_setg(errp, "multifd: unknown ram block %s",
808 packet->ramblock);
809 return -1;
810 }
811 }
812
813 for (i = 0; i < p->pages->used; i++) {
814 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
815
816 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
817 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
818 " (max " RAM_ADDR_FMT ")",
819 offset, block->max_length);
820 return -1;
821 }
822 p->pages->iov[i].iov_base = block->host + offset;
823 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
824 }
825
826 return 0;
827}
828
f986c3d2
JQ
829struct {
830 MultiFDSendParams *params;
831 /* number of created threads */
832 int count;
34c55a94
JQ
833 /* array of pages to send */
834 MultiFDPages_t *pages;
6df264ac
JQ
835 /* syncs main thread and channels */
836 QemuSemaphore sem_sync;
837 /* global number of generated multifd packets */
838 uint64_t packet_num;
b9ee2f7d
JQ
839 /* send channels ready */
840 QemuSemaphore channels_ready;
f986c3d2
JQ
841} *multifd_send_state;
842
b9ee2f7d
JQ
843/*
 844 * How do we use multifd_send_state->pages and channel->pages?
 845 *
 846 * We create a pages array for each channel, and a main one. Each time
 847 * we need to send a batch of pages we exchange the one in
 848 * multifd_send_state with the one in the channel that is sending it.
 849 * There are two reasons for that:
 850 * - to avoid doing so many mallocs during migration
 851 * - to make it easier to know what to free at the end of migration
 852 *
 853 * This way we always know who is the owner of each "pages" struct,
 854 * and we don't need any locking. It belongs to the migration thread
 855 * or to the channel thread. Switching is safe because the migration
 856 * thread uses the channel mutex when changing it, and the channel
 857 * thread must have finished with its own, otherwise pending_job can't
 858 * be false.
 859 */
860
861static void multifd_send_pages(void)
862{
863 int i;
864 static int next_channel;
865 MultiFDSendParams *p = NULL; /* make happy gcc */
866 MultiFDPages_t *pages = multifd_send_state->pages;
867 uint64_t transferred;
868
869 qemu_sem_wait(&multifd_send_state->channels_ready);
870 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
871 p = &multifd_send_state->params[i];
872
873 qemu_mutex_lock(&p->mutex);
874 if (!p->pending_job) {
875 p->pending_job++;
876 next_channel = (i + 1) % migrate_multifd_channels();
877 break;
878 }
879 qemu_mutex_unlock(&p->mutex);
880 }
881 p->pages->used = 0;
882
883 p->packet_num = multifd_send_state->packet_num++;
884 p->pages->block = NULL;
885 multifd_send_state->pages = p->pages;
886 p->pages = pages;
4fcefd44 887 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
b9ee2f7d
JQ
888 ram_counters.multifd_bytes += transferred;
889 ram_counters.transferred += transferred;
890 qemu_mutex_unlock(&p->mutex);
891 qemu_sem_post(&p->sem);
892}
893
894static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
895{
896 MultiFDPages_t *pages = multifd_send_state->pages;
897
898 if (!pages->block) {
899 pages->block = block;
900 }
901
902 if (pages->block == block) {
903 pages->offset[pages->used] = offset;
904 pages->iov[pages->used].iov_base = block->host + offset;
905 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
906 pages->used++;
907
908 if (pages->used < pages->allocated) {
909 return;
910 }
911 }
912
913 multifd_send_pages();
914
915 if (pages->block != block) {
916 multifd_queue_page(block, offset);
917 }
918}
919
66770707 920static void multifd_send_terminate_threads(Error *err)
f986c3d2
JQ
921{
922 int i;
923
7a169d74
JQ
924 if (err) {
925 MigrationState *s = migrate_get_current();
926 migrate_set_error(s, err);
927 if (s->state == MIGRATION_STATUS_SETUP ||
928 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
929 s->state == MIGRATION_STATUS_DEVICE ||
930 s->state == MIGRATION_STATUS_ACTIVE) {
931 migrate_set_state(&s->state, s->state,
932 MIGRATION_STATUS_FAILED);
933 }
934 }
935
66770707 936 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
937 MultiFDSendParams *p = &multifd_send_state->params[i];
938
939 qemu_mutex_lock(&p->mutex);
940 p->quit = true;
941 qemu_sem_post(&p->sem);
942 qemu_mutex_unlock(&p->mutex);
943 }
944}
945
1398b2e3 946void multifd_save_cleanup(void)
f986c3d2
JQ
947{
948 int i;
f986c3d2
JQ
949
950 if (!migrate_use_multifd()) {
1398b2e3 951 return;
f986c3d2 952 }
66770707
JQ
953 multifd_send_terminate_threads(NULL);
954 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
955 MultiFDSendParams *p = &multifd_send_state->params[i];
956
66770707
JQ
957 if (p->running) {
958 qemu_thread_join(&p->thread);
959 }
60df2d4a
JQ
960 socket_send_channel_destroy(p->c);
961 p->c = NULL;
f986c3d2
JQ
962 qemu_mutex_destroy(&p->mutex);
963 qemu_sem_destroy(&p->sem);
6df264ac 964 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
965 g_free(p->name);
966 p->name = NULL;
34c55a94
JQ
967 multifd_pages_clear(p->pages);
968 p->pages = NULL;
2a26c979
JQ
969 p->packet_len = 0;
970 g_free(p->packet);
971 p->packet = NULL;
f986c3d2 972 }
b9ee2f7d 973 qemu_sem_destroy(&multifd_send_state->channels_ready);
6df264ac 974 qemu_sem_destroy(&multifd_send_state->sem_sync);
f986c3d2
JQ
975 g_free(multifd_send_state->params);
976 multifd_send_state->params = NULL;
34c55a94
JQ
977 multifd_pages_clear(multifd_send_state->pages);
978 multifd_send_state->pages = NULL;
f986c3d2
JQ
979 g_free(multifd_send_state);
980 multifd_send_state = NULL;
f986c3d2
JQ
981}
982
6df264ac
JQ
983static void multifd_send_sync_main(void)
984{
985 int i;
986
987 if (!migrate_use_multifd()) {
988 return;
989 }
b9ee2f7d
JQ
990 if (multifd_send_state->pages->used) {
991 multifd_send_pages();
992 }
6df264ac
JQ
993 for (i = 0; i < migrate_multifd_channels(); i++) {
994 MultiFDSendParams *p = &multifd_send_state->params[i];
995
996 trace_multifd_send_sync_main_signal(p->id);
997
998 qemu_mutex_lock(&p->mutex);
b9ee2f7d
JQ
999
1000 p->packet_num = multifd_send_state->packet_num++;
6df264ac
JQ
1001 p->flags |= MULTIFD_FLAG_SYNC;
1002 p->pending_job++;
1003 qemu_mutex_unlock(&p->mutex);
1004 qemu_sem_post(&p->sem);
1005 }
1006 for (i = 0; i < migrate_multifd_channels(); i++) {
1007 MultiFDSendParams *p = &multifd_send_state->params[i];
1008
1009 trace_multifd_send_sync_main_wait(p->id);
1010 qemu_sem_wait(&multifd_send_state->sem_sync);
1011 }
1012 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1013}
1014
f986c3d2
JQ
1015static void *multifd_send_thread(void *opaque)
1016{
1017 MultiFDSendParams *p = opaque;
af8b7d2b 1018 Error *local_err = NULL;
8b2db7f5 1019 int ret;
af8b7d2b 1020
408ea6ae 1021 trace_multifd_send_thread_start(p->id);
74637e6f 1022 rcu_register_thread();
408ea6ae 1023
af8b7d2b
JQ
1024 if (multifd_send_initial_packet(p, &local_err) < 0) {
1025 goto out;
1026 }
408ea6ae
JQ
1027 /* initial packet */
1028 p->num_packets = 1;
f986c3d2
JQ
1029
1030 while (true) {
d82628e4 1031 qemu_sem_wait(&p->sem);
f986c3d2 1032 qemu_mutex_lock(&p->mutex);
0beb5ed3
JQ
1033
1034 if (p->pending_job) {
1035 uint32_t used = p->pages->used;
1036 uint64_t packet_num = p->packet_num;
1037 uint32_t flags = p->flags;
1038
1039 multifd_send_fill_packet(p);
1040 p->flags = 0;
1041 p->num_packets++;
1042 p->num_pages += used;
1043 p->pages->used = 0;
1044 qemu_mutex_unlock(&p->mutex);
1045
1046 trace_multifd_send(p->id, packet_num, used, flags);
1047
8b2db7f5
JQ
1048 ret = qio_channel_write_all(p->c, (void *)p->packet,
1049 p->packet_len, &local_err);
1050 if (ret != 0) {
1051 break;
1052 }
1053
1054 ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1055 if (ret != 0) {
1056 break;
1057 }
0beb5ed3
JQ
1058
1059 qemu_mutex_lock(&p->mutex);
1060 p->pending_job--;
1061 qemu_mutex_unlock(&p->mutex);
6df264ac
JQ
1062
1063 if (flags & MULTIFD_FLAG_SYNC) {
1064 qemu_sem_post(&multifd_send_state->sem_sync);
1065 }
b9ee2f7d 1066 qemu_sem_post(&multifd_send_state->channels_ready);
0beb5ed3 1067 } else if (p->quit) {
f986c3d2
JQ
1068 qemu_mutex_unlock(&p->mutex);
1069 break;
6df264ac
JQ
1070 } else {
1071 qemu_mutex_unlock(&p->mutex);
1072 /* sometimes there are spurious wakeups */
f986c3d2 1073 }
f986c3d2
JQ
1074 }
1075
af8b7d2b
JQ
1076out:
1077 if (local_err) {
1078 multifd_send_terminate_threads(local_err);
1079 }
1080
66770707
JQ
1081 qemu_mutex_lock(&p->mutex);
1082 p->running = false;
1083 qemu_mutex_unlock(&p->mutex);
1084
74637e6f 1085 rcu_unregister_thread();
408ea6ae
JQ
1086 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1087
f986c3d2
JQ
1088 return NULL;
1089}
1090
60df2d4a
JQ
1091static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1092{
1093 MultiFDSendParams *p = opaque;
1094 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1095 Error *local_err = NULL;
1096
1097 if (qio_task_propagate_error(task, &local_err)) {
1398b2e3
FL
1098 migrate_set_error(migrate_get_current(), local_err);
1099 multifd_save_cleanup();
60df2d4a
JQ
1100 } else {
1101 p->c = QIO_CHANNEL(sioc);
1102 qio_channel_set_delay(p->c, false);
1103 p->running = true;
1104 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1105 QEMU_THREAD_JOINABLE);
1106
1107 atomic_inc(&multifd_send_state->count);
1108 }
1109}
1110
f986c3d2
JQ
1111int multifd_save_setup(void)
1112{
1113 int thread_count;
34c55a94 1114 uint32_t page_count = migrate_multifd_page_count();
f986c3d2
JQ
1115 uint8_t i;
1116
1117 if (!migrate_use_multifd()) {
1118 return 0;
1119 }
1120 thread_count = migrate_multifd_channels();
1121 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1122 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
66770707 1123 atomic_set(&multifd_send_state->count, 0);
34c55a94 1124 multifd_send_state->pages = multifd_pages_init(page_count);
6df264ac 1125 qemu_sem_init(&multifd_send_state->sem_sync, 0);
b9ee2f7d 1126 qemu_sem_init(&multifd_send_state->channels_ready, 0);
34c55a94 1127
f986c3d2
JQ
1128 for (i = 0; i < thread_count; i++) {
1129 MultiFDSendParams *p = &multifd_send_state->params[i];
1130
1131 qemu_mutex_init(&p->mutex);
1132 qemu_sem_init(&p->sem, 0);
6df264ac 1133 qemu_sem_init(&p->sem_sync, 0);
f986c3d2 1134 p->quit = false;
0beb5ed3 1135 p->pending_job = 0;
f986c3d2 1136 p->id = i;
34c55a94 1137 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1138 p->packet_len = sizeof(MultiFDPacket_t)
1139 + sizeof(ram_addr_t) * page_count;
1140 p->packet = g_malloc0(p->packet_len);
f986c3d2 1141 p->name = g_strdup_printf("multifdsend_%d", i);
60df2d4a 1142 socket_send_channel_create(multifd_new_send_channel_async, p);
f986c3d2
JQ
1143 }
1144 return 0;
1145}
1146
f986c3d2
JQ
1147struct {
1148 MultiFDRecvParams *params;
1149 /* number of created threads */
1150 int count;
6df264ac
JQ
1151 /* syncs main thread and channels */
1152 QemuSemaphore sem_sync;
1153 /* global number of generated multifd packets */
1154 uint64_t packet_num;
f986c3d2
JQ
1155} *multifd_recv_state;
1156
66770707 1157static void multifd_recv_terminate_threads(Error *err)
f986c3d2
JQ
1158{
1159 int i;
1160
7a169d74
JQ
1161 if (err) {
1162 MigrationState *s = migrate_get_current();
1163 migrate_set_error(s, err);
1164 if (s->state == MIGRATION_STATUS_SETUP ||
1165 s->state == MIGRATION_STATUS_ACTIVE) {
1166 migrate_set_state(&s->state, s->state,
1167 MIGRATION_STATUS_FAILED);
1168 }
1169 }
1170
66770707 1171 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1172 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1173
1174 qemu_mutex_lock(&p->mutex);
7a5cc33c
JQ
1175 /* We could arrive here for two reasons:
1176 - normal quit, i.e. everything went fine, just finished
1177 - error quit: We close the channels so the channel threads
1178 finish the qio_channel_read_all_eof() */
1179 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
f986c3d2
JQ
1180 qemu_mutex_unlock(&p->mutex);
1181 }
1182}
1183
1184int multifd_load_cleanup(Error **errp)
1185{
1186 int i;
1187 int ret = 0;
1188
1189 if (!migrate_use_multifd()) {
1190 return 0;
1191 }
66770707
JQ
1192 multifd_recv_terminate_threads(NULL);
1193 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
1194 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1195
66770707
JQ
1196 if (p->running) {
1197 qemu_thread_join(&p->thread);
1198 }
60df2d4a
JQ
1199 object_unref(OBJECT(p->c));
1200 p->c = NULL;
f986c3d2 1201 qemu_mutex_destroy(&p->mutex);
6df264ac 1202 qemu_sem_destroy(&p->sem_sync);
f986c3d2
JQ
1203 g_free(p->name);
1204 p->name = NULL;
34c55a94
JQ
1205 multifd_pages_clear(p->pages);
1206 p->pages = NULL;
2a26c979
JQ
1207 p->packet_len = 0;
1208 g_free(p->packet);
1209 p->packet = NULL;
f986c3d2 1210 }
6df264ac 1211 qemu_sem_destroy(&multifd_recv_state->sem_sync);
f986c3d2
JQ
1212 g_free(multifd_recv_state->params);
1213 multifd_recv_state->params = NULL;
1214 g_free(multifd_recv_state);
1215 multifd_recv_state = NULL;
1216
1217 return ret;
1218}
1219
6df264ac
JQ
1220static void multifd_recv_sync_main(void)
1221{
1222 int i;
1223
1224 if (!migrate_use_multifd()) {
1225 return;
1226 }
1227 for (i = 0; i < migrate_multifd_channels(); i++) {
1228 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1229
6df264ac
JQ
1230 trace_multifd_recv_sync_main_wait(p->id);
1231 qemu_sem_wait(&multifd_recv_state->sem_sync);
1232 qemu_mutex_lock(&p->mutex);
1233 if (multifd_recv_state->packet_num < p->packet_num) {
1234 multifd_recv_state->packet_num = p->packet_num;
1235 }
1236 qemu_mutex_unlock(&p->mutex);
1237 }
1238 for (i = 0; i < migrate_multifd_channels(); i++) {
1239 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1240
1241 trace_multifd_recv_sync_main_signal(p->id);
6df264ac
JQ
1242 qemu_sem_post(&p->sem_sync);
1243 }
1244 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1245}
1246
f986c3d2
JQ
1247static void *multifd_recv_thread(void *opaque)
1248{
1249 MultiFDRecvParams *p = opaque;
2a26c979
JQ
1250 Error *local_err = NULL;
1251 int ret;
f986c3d2 1252
408ea6ae 1253 trace_multifd_recv_thread_start(p->id);
74637e6f 1254 rcu_register_thread();
408ea6ae 1255
f986c3d2 1256 while (true) {
6df264ac
JQ
1257 uint32_t used;
1258 uint32_t flags;
0beb5ed3 1259
8b2db7f5
JQ
1260 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1261 p->packet_len, &local_err);
1262 if (ret == 0) { /* EOF */
1263 break;
1264 }
1265 if (ret == -1) { /* Error */
1266 break;
1267 }
2a26c979 1268
6df264ac
JQ
1269 qemu_mutex_lock(&p->mutex);
1270 ret = multifd_recv_unfill_packet(p, &local_err);
1271 if (ret) {
f986c3d2
JQ
1272 qemu_mutex_unlock(&p->mutex);
1273 break;
1274 }
6df264ac
JQ
1275
1276 used = p->pages->used;
1277 flags = p->flags;
1278 trace_multifd_recv(p->id, p->packet_num, used, flags);
6df264ac
JQ
1279 p->num_packets++;
1280 p->num_pages += used;
f986c3d2 1281 qemu_mutex_unlock(&p->mutex);
6df264ac 1282
8b2db7f5
JQ
1283 ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1284 if (ret != 0) {
1285 break;
1286 }
1287
6df264ac
JQ
1288 if (flags & MULTIFD_FLAG_SYNC) {
1289 qemu_sem_post(&multifd_recv_state->sem_sync);
1290 qemu_sem_wait(&p->sem_sync);
1291 }
f986c3d2
JQ
1292 }
1293
d82628e4
JQ
1294 if (local_err) {
1295 multifd_recv_terminate_threads(local_err);
1296 }
66770707
JQ
1297 qemu_mutex_lock(&p->mutex);
1298 p->running = false;
1299 qemu_mutex_unlock(&p->mutex);
1300
74637e6f 1301 rcu_unregister_thread();
408ea6ae
JQ
1302 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1303
f986c3d2
JQ
1304 return NULL;
1305}
1306
1307int multifd_load_setup(void)
1308{
1309 int thread_count;
34c55a94 1310 uint32_t page_count = migrate_multifd_page_count();
f986c3d2
JQ
1311 uint8_t i;
1312
1313 if (!migrate_use_multifd()) {
1314 return 0;
1315 }
1316 thread_count = migrate_multifd_channels();
1317 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1318 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
66770707 1319 atomic_set(&multifd_recv_state->count, 0);
6df264ac 1320 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
34c55a94 1321
f986c3d2
JQ
1322 for (i = 0; i < thread_count; i++) {
1323 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1324
1325 qemu_mutex_init(&p->mutex);
6df264ac 1326 qemu_sem_init(&p->sem_sync, 0);
f986c3d2 1327 p->id = i;
34c55a94 1328 p->pages = multifd_pages_init(page_count);
2a26c979
JQ
1329 p->packet_len = sizeof(MultiFDPacket_t)
1330 + sizeof(ram_addr_t) * page_count;
1331 p->packet = g_malloc0(p->packet_len);
f986c3d2 1332 p->name = g_strdup_printf("multifdrecv_%d", i);
f986c3d2
JQ
1333 }
1334 return 0;
1335}
1336
62c1e0ca
JQ
1337bool multifd_recv_all_channels_created(void)
1338{
1339 int thread_count = migrate_multifd_channels();
1340
1341 if (!migrate_use_multifd()) {
1342 return true;
1343 }
1344
1345 return thread_count == atomic_read(&multifd_recv_state->count);
1346}
1347
49ed0d24
FL
1348/*
1349 * Try to receive all multifd channels to get ready for the migration.
1350 * - Return true and do not set @errp when correctly receiving all channels;
1351 * - Return false and do not set @errp when correctly receiving the current one;
1352 * - Return false and set @errp when failing to receive the current channel.
1353 */
1354bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
71bb07db 1355{
60df2d4a 1356 MultiFDRecvParams *p;
af8b7d2b
JQ
1357 Error *local_err = NULL;
1358 int id;
60df2d4a 1359
af8b7d2b
JQ
1360 id = multifd_recv_initial_packet(ioc, &local_err);
1361 if (id < 0) {
1362 multifd_recv_terminate_threads(local_err);
49ed0d24
FL
1363 error_propagate_prepend(errp, local_err,
1364 "failed to receive packet"
1365 " via multifd channel %d: ",
1366 atomic_read(&multifd_recv_state->count));
81e62053 1367 return false;
af8b7d2b
JQ
1368 }
1369
1370 p = &multifd_recv_state->params[id];
1371 if (p->c != NULL) {
1372 error_setg(&local_err, "multifd: received id '%d' already setup'",
1373 id);
1374 multifd_recv_terminate_threads(local_err);
49ed0d24 1375 error_propagate(errp, local_err);
81e62053 1376 return false;
af8b7d2b 1377 }
60df2d4a
JQ
1378 p->c = ioc;
1379 object_ref(OBJECT(ioc));
408ea6ae
JQ
1380 /* initial packet */
1381 p->num_packets = 1;
60df2d4a
JQ
1382
1383 p->running = true;
1384 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1385 QEMU_THREAD_JOINABLE);
1386 atomic_inc(&multifd_recv_state->count);
49ed0d24
FL
1387 return atomic_read(&multifd_recv_state->count) ==
1388 migrate_multifd_channels();
71bb07db
JQ
1389}
1390
56e93d26 1391/**
3d0684b2 1392 * save_page_header: write page header to wire
56e93d26
JQ
1393 *
1394 * If this is the 1st block, it also writes the block identification
1395 *
3d0684b2 1396 * Returns the number of bytes written
56e93d26
JQ
1397 *
1398 * @f: QEMUFile where to send the data
1399 * @block: block that contains the page we want to send
1400 * @offset: offset inside the block for the page;
 1401 * the lower bits contain RAM_SAVE_FLAG_* flags
1402 */
2bf3aa85
JQ
1403static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1404 ram_addr_t offset)
56e93d26 1405{
9f5f380b 1406 size_t size, len;
56e93d26 1407
24795694
JQ
1408 if (block == rs->last_sent_block) {
1409 offset |= RAM_SAVE_FLAG_CONTINUE;
1410 }
2bf3aa85 1411 qemu_put_be64(f, offset);
56e93d26
JQ
1412 size = 8;
1413
1414 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 1415 len = strlen(block->idstr);
2bf3aa85
JQ
1416 qemu_put_byte(f, len);
1417 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 1418 size += 1 + len;
24795694 1419 rs->last_sent_block = block;
56e93d26
JQ
1420 }
1421 return size;
1422}
1423
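/*
 * Illustrative layout (not part of the original file) of what
 * save_page_header() just wrote:
 *
 *     be64  offset | RAM_SAVE_FLAG_*                   (always, 8 bytes)
 *     u8    strlen(block->idstr)    (only if FLAG_CONTINUE is not set)
 *     len   bytes of block->idstr   (only if FLAG_CONTINUE is not set)
 *
 * hence the returned size is 8 for a continued block and 8 + 1 + len when
 * the block name has to be (re)sent.
 */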
3d0684b2
JQ
1424/**
1425 * mig_throttle_guest_down: throttle down the guest
 1426 *
 1427 * Reduce the amount of guest CPU execution to hopefully slow down memory
1428 * writes. If guest dirty memory rate is reduced below the rate at
1429 * which we can transfer pages to the destination then we should be
1430 * able to complete migration. Some workloads dirty memory way too
1431 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
1432 */
1433static void mig_throttle_guest_down(void)
1434{
1435 MigrationState *s = migrate_get_current();
2594f56d
DB
1436 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1437 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
4cbc9c7f 1438 int pct_max = s->parameters.max_cpu_throttle;
070afca2
JH
1439
1440 /* We have not started throttling yet. Let's start it. */
1441 if (!cpu_throttle_active()) {
1442 cpu_throttle_set(pct_initial);
1443 } else {
1444 /* Throttling already on, just increase the rate */
4cbc9c7f
LQ
1445 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
1446 pct_max));
070afca2
JH
1447 }
1448}
1449
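/*
 * Worked example (not part of the original file), assuming the default
 * parameters cpu_throttle_initial=20, cpu_throttle_increment=10 and
 * max_cpu_throttle=99: successive calls set the CPU throttle to
 * 20%, 30%, 40%, ... and cap it at 99%.
 */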
3d0684b2
JQ
1450/**
1451 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1452 *
6f37bb8b 1453 * @rs: current RAM state
3d0684b2
JQ
1454 * @current_addr: address for the zero page
1455 *
1456 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
1457 * The important thing is that a stale (not-yet-0'd) page be replaced
1458 * by the new data.
1459 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 1460 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 1461 */
6f37bb8b 1462static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 1463{
6f37bb8b 1464 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
1465 return;
1466 }
1467
1468 /* We don't care if this fails to allocate a new cache page
1469 * as long as it updated an old one */
c00e0928 1470 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 1471 ram_counters.dirty_sync_count);
56e93d26
JQ
1472}
1473
1474#define ENCODING_FLAG_XBZRLE 0x1
1475
1476/**
1477 * save_xbzrle_page: compress and send current page
1478 *
1479 * Returns: 1 means that we wrote the page
1480 * 0 means that page is identical to the one already sent
1481 * -1 means that xbzrle would be longer than normal
1482 *
5a987738 1483 * @rs: current RAM state
3d0684b2
JQ
1484 * @current_data: pointer to the address of the page contents
1485 * @current_addr: addr of the page
56e93d26
JQ
1486 * @block: block that contains the page we want to send
1487 * @offset: offset inside the block for the page
1488 * @last_stage: if we are at the completion stage
56e93d26 1489 */
204b88b8 1490static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 1491 ram_addr_t current_addr, RAMBlock *block,
072c2511 1492 ram_addr_t offset, bool last_stage)
56e93d26
JQ
1493{
1494 int encoded_len = 0, bytes_xbzrle;
1495 uint8_t *prev_cached_page;
1496
9360447d
JQ
1497 if (!cache_is_cached(XBZRLE.cache, current_addr,
1498 ram_counters.dirty_sync_count)) {
1499 xbzrle_counters.cache_miss++;
56e93d26
JQ
1500 if (!last_stage) {
1501 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 1502 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
1503 return -1;
1504 } else {
1505 /* update *current_data when the page has been
1506 inserted into cache */
1507 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1508 }
1509 }
1510 return -1;
1511 }
1512
1513 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1514
1515 /* save current buffer into memory */
1516 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1517
1518 /* XBZRLE encoding (if there is no overflow) */
1519 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1520 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1521 TARGET_PAGE_SIZE);
1522 if (encoded_len == 0) {
55c4446b 1523 trace_save_xbzrle_page_skipping();
56e93d26
JQ
1524 return 0;
1525 } else if (encoded_len == -1) {
55c4446b 1526 trace_save_xbzrle_page_overflow();
9360447d 1527 xbzrle_counters.overflow++;
56e93d26
JQ
1528 /* update data in the cache */
1529 if (!last_stage) {
1530 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1531 *current_data = prev_cached_page;
1532 }
1533 return -1;
1534 }
1535
1536 /* we need to update the data in the cache, in order to get the same data */
1537 if (!last_stage) {
1538 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1539 }
1540
1541 /* Send XBZRLE based compressed page */
2bf3aa85 1542 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
1543 offset | RAM_SAVE_FLAG_XBZRLE);
1544 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1545 qemu_put_be16(rs->f, encoded_len);
1546 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 1547 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
1548 xbzrle_counters.pages++;
1549 xbzrle_counters.bytes += bytes_xbzrle;
1550 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
1551
1552 return 1;
1553}
1554
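/*
 * Illustrative layout (not part of the original file) of the record that
 * save_xbzrle_page() emits:
 *
 *     save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *     u8    ENCODING_FLAG_XBZRLE
 *     be16  encoded_len
 *     encoded_len bytes of XBZRLE-encoded delta
 *
 * which is where the "encoded_len + 1 + 2" added to bytes_xbzrle comes from.
 */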
3d0684b2
JQ
1555/**
1556 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 1557 *
3d0684b2
JQ
1558 * Called with rcu_read_lock() to protect migration_bitmap
1559 *
1560 * Returns the offset (in pages) within the memory region of the next dirty page
1561 *
6f37bb8b 1562 * @rs: current RAM state
3d0684b2 1563 * @rb: RAMBlock where to search for dirty pages
a935e30f 1564 * @start: page where we start the search
f3f491fc 1565 */
56e93d26 1566static inline
a935e30f 1567unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 1568 unsigned long start)
56e93d26 1569{
6b6712ef
JQ
1570 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1571 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
1572 unsigned long next;
1573
fbd162e6 1574 if (ramblock_is_ignored(rb)) {
b895de50
CLG
1575 return size;
1576 }
1577
6b6712ef
JQ
1578 if (rs->ram_bulk_stage && start > 0) {
1579 next = start + 1;
56e93d26 1580 } else {
6b6712ef 1581 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
1582 }
1583
6b6712ef 1584 return next;
56e93d26
JQ
1585}
1586
06b10688 1587static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
1588 RAMBlock *rb,
1589 unsigned long page)
a82d593b
DDAG
1590{
1591 bool ret;
a82d593b 1592
386a907b 1593 qemu_mutex_lock(&rs->bitmap_mutex);
6b6712ef 1594 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
1595
1596 if (ret) {
0d8ec885 1597 rs->migration_dirty_pages--;
a82d593b 1598 }
386a907b
WW
1599 qemu_mutex_unlock(&rs->bitmap_mutex);
1600
a82d593b
DDAG
1601 return ret;
1602}
1603
15440dd5
JQ
1604static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1605 ram_addr_t start, ram_addr_t length)
56e93d26 1606{
0d8ec885 1607 rs->migration_dirty_pages +=
6b6712ef 1608 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
0d8ec885 1609 &rs->num_dirty_pages_period);
56e93d26
JQ
1610}
1611
3d0684b2
JQ
1612/**
1613 * ram_pagesize_summary: calculate all the pagesizes of a VM
1614 *
1615 * Returns a summary bitmap of the page sizes of all RAMBlocks
1616 *
1617 * For VMs with just normal pages this is equivalent to the host page
1618 * size. If it's got some huge pages then it's the OR of all the
1619 * different page sizes.
e8ca1db2
DDAG
1620 */
1621uint64_t ram_pagesize_summary(void)
1622{
1623 RAMBlock *block;
1624 uint64_t summary = 0;
1625
fbd162e6 1626 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
e8ca1db2
DDAG
1627 summary |= block->page_size;
1628 }
1629
1630 return summary;
1631}
1632
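/*
 * Example (not part of the original file): a VM backed by normal 4 KiB pages
 * plus a 2 MiB hugepage region yields 0x1000 | 0x200000 = 0x201000, so
 * callers can tell at a glance that more than one page size is in use.
 */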
aecbfe9c
XG
1633uint64_t ram_get_total_transferred_pages(void)
1634{
1635 return ram_counters.normal + ram_counters.duplicate +
1636 compression_counters.pages + xbzrle_counters.pages;
1637}
1638
b734035b
XG
1639static void migration_update_rates(RAMState *rs, int64_t end_time)
1640{
be8b02ed 1641 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
76e03000 1642 double compressed_size;
b734035b
XG
1643
1644 /* calculate period counters */
1645 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1646 / (end_time - rs->time_last_bitmap_sync);
1647
be8b02ed 1648 if (!page_count) {
b734035b
XG
1649 return;
1650 }
1651
1652 if (migrate_use_xbzrle()) {
1653 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
be8b02ed 1654 rs->xbzrle_cache_miss_prev) / page_count;
b734035b
XG
1655 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1656 }
76e03000
XG
1657
1658 if (migrate_use_compression()) {
1659 compression_counters.busy_rate = (double)(compression_counters.busy -
1660 rs->compress_thread_busy_prev) / page_count;
1661 rs->compress_thread_busy_prev = compression_counters.busy;
1662
1663 compressed_size = compression_counters.compressed_size -
1664 rs->compressed_size_prev;
1665 if (compressed_size) {
1666 double uncompressed_size = (compression_counters.pages -
1667 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1668
1669 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1670 compression_counters.compression_rate =
1671 uncompressed_size / compressed_size;
1672
1673 rs->compress_pages_prev = compression_counters.pages;
1674 rs->compressed_size_prev = compression_counters.compressed_size;
1675 }
1676 }
b734035b
XG
1677}
1678
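/*
 * Worked example (not part of the original file): if 10000 target pages were
 * handled during the period and XBZRLE recorded 2500 new cache misses, then
 * xbzrle_counters.cache_miss_rate = 2500.0 / 10000 = 0.25.  Similarly the
 * compression rate is uncompressed bytes / compressed bytes for the pages
 * compressed during the same period.
 */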
8d820d6f 1679static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1680{
1681 RAMBlock *block;
56e93d26 1682 int64_t end_time;
c4bdf0cf 1683 uint64_t bytes_xfer_now;
56e93d26 1684
9360447d 1685 ram_counters.dirty_sync_count++;
56e93d26 1686
f664da80
JQ
1687 if (!rs->time_last_bitmap_sync) {
1688 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1689 }
1690
1691 trace_migration_bitmap_sync_start();
9c1f8f44 1692 memory_global_dirty_log_sync();
56e93d26 1693
108cfae0 1694 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26 1695 rcu_read_lock();
fbd162e6 1696 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
15440dd5 1697 migration_bitmap_sync_range(rs, block, 0, block->used_length);
56e93d26 1698 }
650af890 1699 ram_counters.remaining = ram_bytes_remaining();
56e93d26 1700 rcu_read_unlock();
108cfae0 1701 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1702
a66cd90c 1703 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1704
56e93d26
JQ
1705 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1706
1707 /* more than 1 second = 1000 milliseconds */
f664da80 1708 if (end_time > rs->time_last_bitmap_sync + 1000) {
9360447d 1709 bytes_xfer_now = ram_counters.transferred;
d693c6f1 1710
9ac78b61
PL
1711 /* During block migration the auto-converge logic incorrectly detects
1712 * that ram migration makes no progress. Avoid this by disabling the
1713 * throttling logic during the bulk phase of block migration. */
1714 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
56e93d26
JQ
1715 /* The following detection logic can be refined later. For now:
 1716 Check to see if the dirtied bytes are more than half of the approx.
 1717 amount of bytes that just got transferred since the last time we
070afca2
JH
1718 were in this routine. If that happens twice, start or increase
1719 throttling */
070afca2 1720
d693c6f1 1721 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 1722 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 1723 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 1724 trace_migration_throttle();
8d820d6f 1725 rs->dirty_rate_high_cnt = 0;
070afca2 1726 mig_throttle_guest_down();
d693c6f1 1727 }
56e93d26 1728 }
070afca2 1729
b734035b
XG
1730 migration_update_rates(rs, end_time);
1731
be8b02ed 1732 rs->target_page_count_prev = rs->target_page_count;
d693c6f1
FF
1733
1734 /* reset period counters */
f664da80 1735 rs->time_last_bitmap_sync = end_time;
a66cd90c 1736 rs->num_dirty_pages_period = 0;
d2a4d85a 1737 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 1738 }
4addcd4f 1739 if (migrate_use_events()) {
3ab72385 1740 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
4addcd4f 1741 }
56e93d26
JQ
1742}
1743
6c97ec5f
XG
1744/**
1745 * save_zero_page_to_file: send the zero page to the file
1746 *
1747 * Returns the size of the data written to the file, or 0 if the page is
1748 * not a zero page
1749 *
1750 * @rs: current RAM state
1751 * @file: the file where the data is saved
1752 * @block: block that contains the page we want to send
1753 * @offset: offset inside the block for the page
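 *
 * On the wire a zero page is just the page header with RAM_SAVE_FLAG_ZERO
 * set plus a single zero byte, so the returned length is the header size
 * plus one.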
1754 */
1755static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1756 RAMBlock *block, ram_addr_t offset)
1757{
1758 uint8_t *p = block->host + offset;
1759 int len = 0;
1760
1761 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1762 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1763 qemu_put_byte(file, 0);
1764 len += 1;
1765 }
1766 return len;
1767}
1768
56e93d26 1769/**
3d0684b2 1770 * save_zero_page: send the zero page to the stream
56e93d26 1771 *
3d0684b2 1772 * Returns the number of pages written.
56e93d26 1773 *
f7ccd61b 1774 * @rs: current RAM state
56e93d26
JQ
1775 * @block: block that contains the page we want to send
1776 * @offset: offset inside the block for the page
56e93d26 1777 */
7faccdc3 1778static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1779{
6c97ec5f 1780 int len = save_zero_page_to_file(rs, rs->f, block, offset);
56e93d26 1781
6c97ec5f 1782 if (len) {
9360447d 1783 ram_counters.duplicate++;
6c97ec5f
XG
1784 ram_counters.transferred += len;
1785 return 1;
56e93d26 1786 }
6c97ec5f 1787 return -1;
56e93d26
JQ
1788}
1789
5727309d 1790static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1791{
5727309d 1792 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1793 return;
1794 }
1795
aaa2064c 1796 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
1797}
1798
059ff0fb
XG
1799/*
1800 * @pages: the number of pages written by the control path,
1801 * < 0 - error
1802 * > 0 - number of pages written
1803 *
1804 * Return true if the page has been saved, otherwise false.
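 * A "saved" page here means the control path (the transport hook, used
 * e.g. by the RDMA transport) took responsibility for it, including the
 * RAM_SAVE_CONTROL_DELAYED case where the actual transfer happens later.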
1805 */
1806static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1807 int *pages)
1808{
1809 uint64_t bytes_xmit = 0;
1810 int ret;
1811
1812 *pages = -1;
1813 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1814 &bytes_xmit);
1815 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1816 return false;
1817 }
1818
1819 if (bytes_xmit) {
1820 ram_counters.transferred += bytes_xmit;
1821 *pages = 1;
1822 }
1823
1824 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1825 return true;
1826 }
1827
1828 if (bytes_xmit > 0) {
1829 ram_counters.normal++;
1830 } else if (bytes_xmit == 0) {
1831 ram_counters.duplicate++;
1832 }
1833
1834 return true;
1835}
1836
65dacaa0
XG
1837/*
1838 * directly send the page to the stream
1839 *
1840 * Returns the number of pages written.
1841 *
1842 * @rs: current RAM state
1843 * @block: block that contains the page we want to send
1844 * @offset: offset inside the block for the page
1845 * @buf: the page to be sent
1846 * @async: send the page asynchronously
1847 */
1848static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1849 uint8_t *buf, bool async)
1850{
1851 ram_counters.transferred += save_page_header(rs, rs->f, block,
1852 offset | RAM_SAVE_FLAG_PAGE);
1853 if (async) {
1854 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1855 migrate_release_ram() &
1856 migration_in_postcopy());
1857 } else {
1858 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1859 }
1860 ram_counters.transferred += TARGET_PAGE_SIZE;
1861 ram_counters.normal++;
1862 return 1;
1863}
1864
56e93d26 1865/**
3d0684b2 1866 * ram_save_page: send the given page to the stream
56e93d26 1867 *
3d0684b2 1868 * Returns the number of pages written.
3fd3c4b3
DDAG
1869 * < 0 - error
1870 * >=0 - Number of pages written - this might legally be 0
1871 * if xbzrle noticed the page was the same.
56e93d26 1872 *
6f37bb8b 1873 * @rs: current RAM state
56e93d26
JQ
1874 * @block: block that contains the page we want to send
1875 * @offset: offset inside the block for the page
1876 * @last_stage: if we are at the completion stage
56e93d26 1877 */
a0a8aa14 1878static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
1879{
1880 int pages = -1;
56e93d26 1881 uint8_t *p;
56e93d26 1882 bool send_async = true;
a08f6890 1883 RAMBlock *block = pss->block;
a935e30f 1884 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
059ff0fb 1885 ram_addr_t current_addr = block->offset + offset;
56e93d26 1886
2f68e399 1887 p = block->host + offset;
1db9d8e5 1888 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1889
56e93d26 1890 XBZRLE_cache_lock();
d7400a34
XG
1891 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1892 migrate_use_xbzrle()) {
059ff0fb
XG
1893 pages = save_xbzrle_page(rs, &p, current_addr, block,
1894 offset, last_stage);
1895 if (!last_stage) {
1896 /* Can't send this cached data async, since the cache page
1897 * might get updated before it gets to the wire
56e93d26 1898 */
059ff0fb 1899 send_async = false;
56e93d26
JQ
1900 }
1901 }
1902
1903 /* XBZRLE overflow or normal page */
1904 if (pages == -1) {
65dacaa0 1905 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1906 }
1907
1908 XBZRLE_cache_unlock();
1909
1910 return pages;
1911}
1912
b9ee2f7d
JQ
1913static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1914 ram_addr_t offset)
1915{
b9ee2f7d 1916 multifd_queue_page(block, offset);
b9ee2f7d
JQ
1917 ram_counters.normal++;
1918
1919 return 1;
1920}
1921
5e5fdcff 1922static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
6ef3771c 1923 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1924{
53518d94 1925 RAMState *rs = ram_state;
a7a9a88f 1926 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
5e5fdcff 1927 bool zero_page = false;
6ef3771c 1928 int ret;
56e93d26 1929
5e5fdcff
XG
1930 if (save_zero_page_to_file(rs, f, block, offset)) {
1931 zero_page = true;
1932 goto exit;
1933 }
1934
6ef3771c 1935 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1936
1937 /*
1938 * copy it to an internal buffer to avoid it being modified by the VM
1939 * so that we can catch errors during compression and
1940 * decompression
1941 */
1942 memcpy(source_buf, p, TARGET_PAGE_SIZE);
6ef3771c
XG
1943 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1944 if (ret < 0) {
1945 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
b3be2896 1946 error_report("compressed data failed!");
5e5fdcff 1947 return false;
b3be2896 1948 }
56e93d26 1949
5e5fdcff 1950exit:
6ef3771c 1951 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
5e5fdcff
XG
1952 return zero_page;
1953}
1954
1955static void
1956update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1957{
76e03000
XG
1958 ram_counters.transferred += bytes_xmit;
1959
5e5fdcff
XG
1960 if (param->zero_page) {
1961 ram_counters.duplicate++;
76e03000 1962 return;
5e5fdcff 1963 }
76e03000
XG
1964
1965 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1966 compression_counters.compressed_size += bytes_xmit - 8;
1967 compression_counters.pages++;
56e93d26
JQ
1968}
1969
32b05495
XG
1970static bool save_page_use_compression(RAMState *rs);
1971
ce25d337 1972static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1973{
1974 int idx, len, thread_count;
1975
32b05495 1976 if (!save_page_use_compression(rs)) {
56e93d26
JQ
1977 return;
1978 }
1979 thread_count = migrate_compress_threads();
a7a9a88f 1980
0d9f9a5c 1981 qemu_mutex_lock(&comp_done_lock);
56e93d26 1982 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1983 while (!comp_param[idx].done) {
0d9f9a5c 1984 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1985 }
a7a9a88f 1986 }
0d9f9a5c 1987 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1988
1989 for (idx = 0; idx < thread_count; idx++) {
1990 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1991 if (!comp_param[idx].quit) {
ce25d337 1992 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
5e5fdcff
XG
1993 /*
1994 * it's safe to fetch zero_page without holding comp_done_lock
1995 * as there is no further request submitted to the thread,
1996 * i.e., the thread should be waiting for a request at this point.
1997 */
1998 update_compress_thread_counts(&comp_param[idx], len);
56e93d26 1999 }
a7a9a88f 2000 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
2001 }
2002}
2003
2004static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2005 ram_addr_t offset)
2006{
2007 param->block = block;
2008 param->offset = offset;
2009}
2010
ce25d337
JQ
2011static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2012 ram_addr_t offset)
56e93d26
JQ
2013{
2014 int idx, thread_count, bytes_xmit = -1, pages = -1;
1d58872a 2015 bool wait = migrate_compress_wait_thread();
56e93d26
JQ
2016
2017 thread_count = migrate_compress_threads();
0d9f9a5c 2018 qemu_mutex_lock(&comp_done_lock);
1d58872a
XG
2019retry:
2020 for (idx = 0; idx < thread_count; idx++) {
2021 if (comp_param[idx].done) {
2022 comp_param[idx].done = false;
2023 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2024 qemu_mutex_lock(&comp_param[idx].mutex);
2025 set_compress_params(&comp_param[idx], block, offset);
2026 qemu_cond_signal(&comp_param[idx].cond);
2027 qemu_mutex_unlock(&comp_param[idx].mutex);
2028 pages = 1;
5e5fdcff 2029 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
56e93d26 2030 break;
56e93d26
JQ
2031 }
2032 }
1d58872a
XG
2033
2034 /*
2035 * wait for the free thread if the user specifies 'compress-wait-thread',
2036 * otherwise we will post the page out in the main thread as a normal page.
2037 */
2038 if (pages < 0 && wait) {
2039 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2040 goto retry;
2041 }
0d9f9a5c 2042 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
2043
2044 return pages;
2045}
2046
3d0684b2
JQ
2047/**
2048 * find_dirty_block: find the next dirty page and update any state
2049 * associated with the search process.
b9e60928 2050 *
3d0684b2 2051 * Returns whether a page is found
b9e60928 2052 *
6f37bb8b 2053 * @rs: current RAM state
3d0684b2
JQ
2054 * @pss: data about the state of the current dirty page scan
2055 * @again: set to false if the search has scanned the whole of RAM
b9e60928 2056 */
f20e2865 2057static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 2058{
f20e2865 2059 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 2060 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 2061 pss->page >= rs->last_page) {
b9e60928
DDAG
2062 /*
2063 * We've been once around the RAM and haven't found anything.
2064 * Give up.
2065 */
2066 *again = false;
2067 return false;
2068 }
a935e30f 2069 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 2070 /* Didn't find anything in this RAM Block */
a935e30f 2071 pss->page = 0;
b9e60928
DDAG
2072 pss->block = QLIST_NEXT_RCU(pss->block, next);
2073 if (!pss->block) {
48df9d80
XG
2074 /*
2075 * If memory migration starts over, we will meet a dirtied page
2076 * which may still exist in the compression threads' ring, so we
2077 * should flush the compressed data to make sure the new page
2078 * is not overwritten by the old one in the destination.
2079 *
2080 * Also, if xbzrle is on, stop using the data compression at this
2081 * point. In theory, xbzrle can do better than compression.
2082 */
2083 flush_compressed_data(rs);
2084
b9e60928
DDAG
2085 /* Hit the end of the list */
2086 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2087 /* Flag that we've looped */
2088 pss->complete_round = true;
6f37bb8b 2089 rs->ram_bulk_stage = false;
b9e60928
DDAG
2090 }
2091 /* Didn't find anything this time, but try again on the new block */
2092 *again = true;
2093 return false;
2094 } else {
2095 /* Can go around again, but... */
2096 *again = true;
2097 /* We've found something so probably don't need to */
2098 return true;
2099 }
2100}
2101
3d0684b2
JQ
2102/**
2103 * unqueue_page: gets a page off the queue
2104 *
a82d593b 2105 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 2106 *
3d0684b2
JQ
2107 * Returns the block of the page (or NULL if none available)
2108 *
ec481c6c 2109 * @rs: current RAM state
3d0684b2 2110 * @offset: used to return the offset within the RAMBlock
a82d593b 2111 */
f20e2865 2112static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
2113{
2114 RAMBlock *block = NULL;
2115
ae526e32
XG
2116 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2117 return NULL;
2118 }
2119
ec481c6c
JQ
2120 qemu_mutex_lock(&rs->src_page_req_mutex);
2121 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2122 struct RAMSrcPageRequest *entry =
2123 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
2124 block = entry->rb;
2125 *offset = entry->offset;
a82d593b
DDAG
2126
2127 if (entry->len > TARGET_PAGE_SIZE) {
2128 entry->len -= TARGET_PAGE_SIZE;
2129 entry->offset += TARGET_PAGE_SIZE;
2130 } else {
2131 memory_region_unref(block->mr);
ec481c6c 2132 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b 2133 g_free(entry);
e03a34f8 2134 migration_consume_urgent_request();
a82d593b
DDAG
2135 }
2136 }
ec481c6c 2137 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
2138
2139 return block;
2140}
2141
3d0684b2
JQ
2142/**
2143 * get_queued_page: unqueue a page from the postcopy requests
2144 *
2145 * Skips pages that are already sent (!dirty)
a82d593b 2146 *
3d0684b2 2147 * Returns whether a queued page is found
a82d593b 2148 *
6f37bb8b 2149 * @rs: current RAM state
3d0684b2 2150 * @pss: data about the state of the current dirty page scan
a82d593b 2151 */
f20e2865 2152static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
2153{
2154 RAMBlock *block;
2155 ram_addr_t offset;
2156 bool dirty;
2157
2158 do {
f20e2865 2159 block = unqueue_page(rs, &offset);
a82d593b
DDAG
2160 /*
2161 * We're sending this page, and since it's postcopy nothing else
2162 * will dirty it, and we must make sure it doesn't get sent again
2163 * even if this queue request was received after the background
2164 * search already sent it.
2165 */
2166 if (block) {
f20e2865
JQ
2167 unsigned long page;
2168
6b6712ef
JQ
2169 page = offset >> TARGET_PAGE_BITS;
2170 dirty = test_bit(page, block->bmap);
a82d593b 2171 if (!dirty) {
06b10688 2172 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
6b6712ef 2173 page, test_bit(page, block->unsentmap));
a82d593b 2174 } else {
f20e2865 2175 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
2176 }
2177 }
2178
2179 } while (block && !dirty);
2180
2181 if (block) {
2182 /*
2183 * As soon as we start servicing pages out of order, then we have
2184 * to kill the bulk stage, since the bulk stage assumes
2185 * in (migration_bitmap_find_and_reset_dirty) that every page is
2186 * dirty, that's no longer true.
2187 */
6f37bb8b 2188 rs->ram_bulk_stage = false;
a82d593b
DDAG
2189
2190 /*
2191 * We want the background search to continue from the queued page
2192 * since the guest is likely to want other pages near to the page
2193 * it just requested.
2194 */
2195 pss->block = block;
a935e30f 2196 pss->page = offset >> TARGET_PAGE_BITS;
a82d593b
DDAG
2197 }
2198
2199 return !!block;
2200}
2201
6c595cde 2202/**
5e58f968
JQ
2203 * migration_page_queue_free: drop any remaining pages in the ram
2204 * request queue
6c595cde 2205 *
3d0684b2
JQ
2206 * It should be empty at the end anyway, but in error cases there may
2207 * be some left. In case any pages are left, we drop them.
2208 *
6c595cde 2209 */
83c13382 2210static void migration_page_queue_free(RAMState *rs)
6c595cde 2211{
ec481c6c 2212 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
2213 /* This queue should generally be empty - but a failed
2214 * migration might have left some entries in it.
2215 */
2216 rcu_read_lock();
ec481c6c 2217 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 2218 memory_region_unref(mspr->rb->mr);
ec481c6c 2219 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
2220 g_free(mspr);
2221 }
2222 rcu_read_unlock();
2223}
2224
2225/**
3d0684b2
JQ
2226 * ram_save_queue_pages: queue the page for transmission
2227 *
2228 * A request from postcopy destination for example.
2229 *
2230 * Returns zero on success or negative on error
2231 *
3d0684b2
JQ
2232 * @rbname: Name of the RAMBlock of the request. NULL means the
2233 * same as the last one.
2234 * @start: starting address from the start of the RAMBlock
2235 * @len: length (in bytes) to send
6c595cde 2236 */
96506894 2237int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
2238{
2239 RAMBlock *ramblock;
53518d94 2240 RAMState *rs = ram_state;
6c595cde 2241
9360447d 2242 ram_counters.postcopy_requests++;
6c595cde
DDAG
2243 rcu_read_lock();
2244 if (!rbname) {
2245 /* Reuse last RAMBlock */
68a098f3 2246 ramblock = rs->last_req_rb;
6c595cde
DDAG
2247
2248 if (!ramblock) {
2249 /*
2250 * Shouldn't happen, we can't reuse the last RAMBlock if
2251 * it's the 1st request.
2252 */
2253 error_report("ram_save_queue_pages no previous block");
2254 goto err;
2255 }
2256 } else {
2257 ramblock = qemu_ram_block_by_name(rbname);
2258
2259 if (!ramblock) {
2260 /* We shouldn't be asked for a non-existent RAMBlock */
2261 error_report("ram_save_queue_pages no block '%s'", rbname);
2262 goto err;
2263 }
68a098f3 2264 rs->last_req_rb = ramblock;
6c595cde
DDAG
2265 }
2266 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2267 if (start+len > ramblock->used_length) {
9458ad6b
JQ
2268 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2269 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
2270 __func__, start, len, ramblock->used_length);
2271 goto err;
2272 }
2273
ec481c6c
JQ
2274 struct RAMSrcPageRequest *new_entry =
2275 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
2276 new_entry->rb = ramblock;
2277 new_entry->offset = start;
2278 new_entry->len = len;
2279
2280 memory_region_ref(ramblock->mr);
ec481c6c
JQ
2281 qemu_mutex_lock(&rs->src_page_req_mutex);
2282 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
e03a34f8 2283 migration_make_urgent_request();
ec481c6c 2284 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
2285 rcu_read_unlock();
2286
2287 return 0;
2288
2289err:
2290 rcu_read_unlock();
2291 return -1;
2292}
2293
d7400a34
XG
2294static bool save_page_use_compression(RAMState *rs)
2295{
2296 if (!migrate_use_compression()) {
2297 return false;
2298 }
2299
2300 /*
2301 * If xbzrle is on, stop using the data compression after the first
2302 * round of migration even if compression is enabled. In theory,
2303 * xbzrle can do better than compression.
2304 */
2305 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2306 return true;
2307 }
2308
2309 return false;
2310}
2311
5e5fdcff
XG
2312/*
2313 * Try to compress the page before posting it out. Return true if the page
2314 * has been properly handled by compression; otherwise it needs to be
2315 * handled by other paths.
2316 */
2317static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2318{
2319 if (!save_page_use_compression(rs)) {
2320 return false;
2321 }
2322
2323 /*
2324 * When starting the process of a new block, the first page of
2325 * the block should be sent out before other pages in the same
2326 * block, and all the pages in the last block should have been sent
2327 * out. Keeping this order is important, because the 'cont' flag
2328 * is used to avoid resending the block name.
2329 *
2330 * We post the first page as a normal page since compression takes
2331 * a lot of CPU.
2332 */
2333 if (block != rs->last_sent_block) {
2334 flush_compressed_data(rs);
2335 return false;
2336 }
2337
2338 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2339 return true;
2340 }
2341
76e03000 2342 compression_counters.busy++;
5e5fdcff
XG
2343 return false;
2344}
2345
a82d593b 2346/**
3d0684b2 2347 * ram_save_target_page: save one target page
a82d593b 2348 *
3d0684b2 2349 * Returns the number of pages written
a82d593b 2350 *
6f37bb8b 2351 * @rs: current RAM state
3d0684b2 2352 * @pss: data about the page we want to send
a82d593b 2353 * @last_stage: if we are at the completion stage
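 *
 * The page is offered in order to the control path (e.g. RDMA), the
 * compression threads, the zero-page check, multifd, and finally the
 * plain/xbzrle path in ram_save_page().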
a82d593b 2354 */
a0a8aa14 2355static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2356 bool last_stage)
a82d593b 2357{
a8ec91f9
XG
2358 RAMBlock *block = pss->block;
2359 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2360 int res;
2361
2362 if (control_save_page(rs, block, offset, &res)) {
2363 return res;
2364 }
2365
5e5fdcff
XG
2366 if (save_compress_page(rs, block, offset)) {
2367 return 1;
d7400a34
XG
2368 }
2369
2370 res = save_zero_page(rs, block, offset);
2371 if (res > 0) {
2372 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2373 * page would be stale
2374 */
2375 if (!save_page_use_compression(rs)) {
2376 XBZRLE_cache_lock();
2377 xbzrle_cache_zero_page(rs, block->offset + offset);
2378 XBZRLE_cache_unlock();
2379 }
2380 ram_release_pages(block->idstr, offset, res);
2381 return res;
2382 }
2383
da3f56cb 2384 /*
5e5fdcff
XG
2385 * do not use multifd for compression as the first page in the new
2386 * block should be posted out before sending the compressed page
da3f56cb 2387 */
5e5fdcff 2388 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
b9ee2f7d 2389 return ram_save_multifd_page(rs, block, offset);
a82d593b
DDAG
2390 }
2391
1faa5665 2392 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
2393}
2394
2395/**
3d0684b2 2396 * ram_save_host_page: save a whole host page
a82d593b 2397 *
3d0684b2
JQ
2398 * Starting at *offset send pages up to the end of the current host
2399 * page. It's valid for the initial offset to point into the middle of
2400 * a host page in which case the remainder of the hostpage is sent.
2401 * Only dirty target pages are sent. Note that the host page size may
2402 * be a huge page for this block.
1eb3fc0a
DDAG
2403 * The saving stops at the boundary of the used_length of the block
2404 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 2405 *
3d0684b2
JQ
2406 * Returns the number of pages written or negative on error
2407 *
6f37bb8b 2408 * @rs: current RAM state
3d0684b2 2410 * @pss: data about the page we want to send
a82d593b 2411 * @last_stage: if we are at the completion stage
a82d593b 2412 */
a0a8aa14 2413static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 2414 bool last_stage)
a82d593b
DDAG
2415{
2416 int tmppages, pages = 0;
a935e30f
JQ
2417 size_t pagesize_bits =
2418 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 2419
fbd162e6 2420 if (ramblock_is_ignored(pss->block)) {
b895de50
CLG
2421 error_report("block %s should not be migrated !", pss->block->idstr);
2422 return 0;
2423 }
2424
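/*
 * Illustrative numbers: with 4 KiB target pages on a 2 MiB hugepage
 * block, pagesize_bits is 512, so the loop below covers at most 512
 * target pages before stopping at the host-page boundary.
 */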
a82d593b 2425 do {
1faa5665
XG
2426 /* Check if the page is dirty, and if it is, send it */
2427 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2428 pss->page++;
2429 continue;
2430 }
2431
f20e2865 2432 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
2433 if (tmppages < 0) {
2434 return tmppages;
2435 }
2436
2437 pages += tmppages;
1faa5665
XG
2438 if (pss->block->unsentmap) {
2439 clear_bit(pss->page, pss->block->unsentmap);
2440 }
2441
a935e30f 2442 pss->page++;
1eb3fc0a
DDAG
2443 } while ((pss->page & (pagesize_bits - 1)) &&
2444 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
2445
2446 /* The offset we leave with is the last one we looked at */
a935e30f 2447 pss->page--;
a82d593b
DDAG
2448 return pages;
2449}
6c595cde 2450
56e93d26 2451/**
3d0684b2 2452 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
2453 *
2454 * Called within an RCU critical section.
2455 *
e8f3735f
XG
2456 * Returns the number of pages written where zero means no dirty pages,
2457 * or negative on error
56e93d26 2458 *
6f37bb8b 2459 * @rs: current RAM state
56e93d26 2460 * @last_stage: if we are at the completion stage
a82d593b
DDAG
2461 *
2462 * On systems where host-page-size > target-page-size it will send all the
2463 * pages in a host page that are dirty.
56e93d26
JQ
2464 */
2465
ce25d337 2466static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 2467{
b8fb8cb7 2468 PageSearchStatus pss;
56e93d26 2469 int pages = 0;
b9e60928 2470 bool again, found;
56e93d26 2471
0827b9e9
AA
2472 /* No dirty page as there is zero RAM */
2473 if (!ram_bytes_total()) {
2474 return pages;
2475 }
2476
6f37bb8b 2477 pss.block = rs->last_seen_block;
a935e30f 2478 pss.page = rs->last_page;
b8fb8cb7
DDAG
2479 pss.complete_round = false;
2480
2481 if (!pss.block) {
2482 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2483 }
56e93d26 2484
b9e60928 2485 do {
a82d593b 2486 again = true;
f20e2865 2487 found = get_queued_page(rs, &pss);
b9e60928 2488
a82d593b
DDAG
2489 if (!found) {
2490 /* priority queue empty, so just search for something dirty */
f20e2865 2491 found = find_dirty_block(rs, &pss, &again);
a82d593b 2492 }
f3f491fc 2493
a82d593b 2494 if (found) {
f20e2865 2495 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 2496 }
b9e60928 2497 } while (!pages && again);
56e93d26 2498
6f37bb8b 2499 rs->last_seen_block = pss.block;
a935e30f 2500 rs->last_page = pss.page;
56e93d26
JQ
2501
2502 return pages;
2503}
2504
2505void acct_update_position(QEMUFile *f, size_t size, bool zero)
2506{
2507 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 2508
56e93d26 2509 if (zero) {
9360447d 2510 ram_counters.duplicate += pages;
56e93d26 2511 } else {
9360447d
JQ
2512 ram_counters.normal += pages;
2513 ram_counters.transferred += size;
56e93d26
JQ
2514 qemu_update_position(f, size);
2515 }
2516}
2517
fbd162e6 2518static uint64_t ram_bytes_total_common(bool count_ignored)
56e93d26
JQ
2519{
2520 RAMBlock *block;
2521 uint64_t total = 0;
2522
2523 rcu_read_lock();
fbd162e6
YK
2524 if (count_ignored) {
2525 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2526 total += block->used_length;
2527 }
2528 } else {
2529 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2530 total += block->used_length;
2531 }
99e15582 2532 }
56e93d26
JQ
2533 rcu_read_unlock();
2534 return total;
2535}
2536
fbd162e6
YK
2537uint64_t ram_bytes_total(void)
2538{
2539 return ram_bytes_total_common(false);
2540}
2541
f265e0e4 2542static void xbzrle_load_setup(void)
56e93d26 2543{
f265e0e4 2544 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
2545}
2546
f265e0e4
JQ
2547static void xbzrle_load_cleanup(void)
2548{
2549 g_free(XBZRLE.decoded_buf);
2550 XBZRLE.decoded_buf = NULL;
2551}
2552
7d7c96be
PX
2553static void ram_state_cleanup(RAMState **rsp)
2554{
b9ccaf6d
DDAG
2555 if (*rsp) {
2556 migration_page_queue_free(*rsp);
2557 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2558 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2559 g_free(*rsp);
2560 *rsp = NULL;
2561 }
7d7c96be
PX
2562}
2563
84593a08
PX
2564static void xbzrle_cleanup(void)
2565{
2566 XBZRLE_cache_lock();
2567 if (XBZRLE.cache) {
2568 cache_fini(XBZRLE.cache);
2569 g_free(XBZRLE.encoded_buf);
2570 g_free(XBZRLE.current_buf);
2571 g_free(XBZRLE.zero_target_page);
2572 XBZRLE.cache = NULL;
2573 XBZRLE.encoded_buf = NULL;
2574 XBZRLE.current_buf = NULL;
2575 XBZRLE.zero_target_page = NULL;
2576 }
2577 XBZRLE_cache_unlock();
2578}
2579
f265e0e4 2580static void ram_save_cleanup(void *opaque)
56e93d26 2581{
53518d94 2582 RAMState **rsp = opaque;
6b6712ef 2583 RAMBlock *block;
eb859c53 2584
2ff64038
LZ
2585 /* The caller must hold the iothread lock or be in a bh, so there is
2586 * no write race against this migration bitmap
2587 */
6b6712ef
JQ
2588 memory_global_dirty_log_stop();
2589
fbd162e6 2590 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2591 g_free(block->bmap);
2592 block->bmap = NULL;
2593 g_free(block->unsentmap);
2594 block->unsentmap = NULL;
56e93d26
JQ
2595 }
2596
84593a08 2597 xbzrle_cleanup();
f0afa331 2598 compress_threads_save_cleanup();
7d7c96be 2599 ram_state_cleanup(rsp);
56e93d26
JQ
2600}
2601
6f37bb8b 2602static void ram_state_reset(RAMState *rs)
56e93d26 2603{
6f37bb8b
JQ
2604 rs->last_seen_block = NULL;
2605 rs->last_sent_block = NULL;
269ace29 2606 rs->last_page = 0;
6f37bb8b
JQ
2607 rs->last_version = ram_list.version;
2608 rs->ram_bulk_stage = true;
56e93d26
JQ
2609}
2610
2611#define MAX_WAIT 50 /* ms, half buffered_file limit */
2612
4f2e4252
DDAG
2613/*
2614 * 'expected' is the value you expect the bitmap mostly to be full
2615 * of; it won't bother printing lines that are all this value.
2616 * If 'todump' is null the migration bitmap is dumped.
2617 */
6b6712ef
JQ
2618void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2619 unsigned long pages)
4f2e4252 2620{
4f2e4252
DDAG
2621 int64_t cur;
2622 int64_t linelen = 128;
2623 char linebuf[129];
2624
6b6712ef 2625 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
2626 int64_t curb;
2627 bool found = false;
2628 /*
2629 * Last line; catch the case where the line length
2630 * is longer than remaining ram
2631 */
6b6712ef
JQ
2632 if (cur + linelen > pages) {
2633 linelen = pages - cur;
4f2e4252
DDAG
2634 }
2635 for (curb = 0; curb < linelen; curb++) {
2636 bool thisbit = test_bit(cur + curb, todump);
2637 linebuf[curb] = thisbit ? '1' : '.';
2638 found = found || (thisbit != expected);
2639 }
2640 if (found) {
2641 linebuf[curb] = '\0';
2642 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2643 }
2644 }
2645}
2646
e0b266f0
DDAG
2647/* **** functions for postcopy ***** */
2648
ced1c616
PB
2649void ram_postcopy_migrated_memory_release(MigrationState *ms)
2650{
2651 struct RAMBlock *block;
ced1c616 2652
fbd162e6 2653 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2654 unsigned long *bitmap = block->bmap;
2655 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2656 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2657
2658 while (run_start < range) {
2659 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 2660 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
2661 (run_end - run_start) << TARGET_PAGE_BITS);
2662 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2663 }
2664 }
2665}
2666
3d0684b2
JQ
2667/**
2668 * postcopy_send_discard_bm_ram: discard a RAMBlock
2669 *
2670 * Returns zero on success
2671 *
e0b266f0
DDAG
2672 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2673 * Note: At this point the 'unsentmap' is the processed bitmap combined
2674 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
2675 *
2676 * @ms: current migration state
2677 * @pds: state for postcopy
2678 * @start: RAMBlock starting page
2679 * @length: RAMBlock size
e0b266f0
DDAG
2680 */
2681static int postcopy_send_discard_bm_ram(MigrationState *ms,
2682 PostcopyDiscardState *pds,
6b6712ef 2683 RAMBlock *block)
e0b266f0 2684{
6b6712ef 2685 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2686 unsigned long current;
6b6712ef 2687 unsigned long *unsentmap = block->unsentmap;
e0b266f0 2688
6b6712ef 2689 for (current = 0; current < end; ) {
e0b266f0
DDAG
2690 unsigned long one = find_next_bit(unsentmap, end, current);
2691
2692 if (one <= end) {
2693 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2694 unsigned long discard_length;
2695
2696 if (zero >= end) {
2697 discard_length = end - one;
2698 } else {
2699 discard_length = zero - one;
2700 }
d688c62d
DDAG
2701 if (discard_length) {
2702 postcopy_discard_send_range(ms, pds, one, discard_length);
2703 }
e0b266f0
DDAG
2704 current = one + discard_length;
2705 } else {
2706 current = one;
2707 }
2708 }
2709
2710 return 0;
2711}
2712
3d0684b2
JQ
2713/**
2714 * postcopy_each_ram_send_discard: discard all RAMBlocks
2715 *
2716 * Returns 0 for success or negative for error
2717 *
e0b266f0
DDAG
2718 * Utility for the outgoing postcopy code.
2719 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2720 * passing it bitmap indexes and name.
e0b266f0
DDAG
2721 * (qemu_ram_foreach_block ends up passing unscaled lengths
2722 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2723 *
2724 * @ms: current migration state
e0b266f0
DDAG
2725 */
2726static int postcopy_each_ram_send_discard(MigrationState *ms)
2727{
2728 struct RAMBlock *block;
2729 int ret;
2730
fbd162e6 2731 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2732 PostcopyDiscardState *pds =
2733 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2734
2735 /*
2736 * Postcopy sends chunks of bitmap over the wire, but it
2737 * just needs indexes at this point; this avoids it having
2738 * target-page-specific code.
2739 */
6b6712ef 2740 ret = postcopy_send_discard_bm_ram(ms, pds, block);
e0b266f0
DDAG
2741 postcopy_discard_send_finish(ms, pds);
2742 if (ret) {
2743 return ret;
2744 }
2745 }
2746
2747 return 0;
2748}
2749
3d0684b2
JQ
2750/**
2751 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2752 *
2753 * Helper for postcopy_chunk_hostpages; it's called twice to
2754 * canonicalize the two bitmaps, that are similar, but one is
2755 * inverted.
99e314eb 2756 *
3d0684b2
JQ
2757 * Postcopy requires that all target pages in a hostpage are dirty or
2758 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2759 *
3d0684b2
JQ
2760 * @ms: current migration state
2761 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2762 * otherwise we need to canonicalize partially dirty host pages
2763 * @block: block that contains the page we want to canonicalize
2764 * @pds: state for postcopy
99e314eb
DDAG
2765 */
2766static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2767 RAMBlock *block,
2768 PostcopyDiscardState *pds)
2769{
53518d94 2770 RAMState *rs = ram_state;
6b6712ef
JQ
2771 unsigned long *bitmap = block->bmap;
2772 unsigned long *unsentmap = block->unsentmap;
29c59172 2773 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2774 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2775 unsigned long run_start;
2776
29c59172
DDAG
2777 if (block->page_size == TARGET_PAGE_SIZE) {
2778 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2779 return;
2780 }
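/*
 * Worked example (illustrative): with 4 KiB target pages on a 2 MiB
 * hugepage block, host_ratio is 512. A run starting at target page 700
 * has host_offset = 700 % 512 = 188, so the fixup below rewinds to page
 * 512 and the whole host page (pages 512..1023) is marked unsent and
 * dirty.
 */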
2781
99e314eb
DDAG
2782 if (unsent_pass) {
2783 /* Find a sent page */
6b6712ef 2784 run_start = find_next_zero_bit(unsentmap, pages, 0);
99e314eb
DDAG
2785 } else {
2786 /* Find a dirty page */
6b6712ef 2787 run_start = find_next_bit(bitmap, pages, 0);
99e314eb
DDAG
2788 }
2789
6b6712ef 2790 while (run_start < pages) {
99e314eb
DDAG
2791 bool do_fixup = false;
2792 unsigned long fixup_start_addr;
2793 unsigned long host_offset;
2794
2795 /*
2796 * If the start of this run of pages is in the middle of a host
2797 * page, then we need to fixup this host page.
2798 */
2799 host_offset = run_start % host_ratio;
2800 if (host_offset) {
2801 do_fixup = true;
2802 run_start -= host_offset;
2803 fixup_start_addr = run_start;
2804 /* For the next pass */
2805 run_start = run_start + host_ratio;
2806 } else {
2807 /* Find the end of this run */
2808 unsigned long run_end;
2809 if (unsent_pass) {
6b6712ef 2810 run_end = find_next_bit(unsentmap, pages, run_start + 1);
99e314eb 2811 } else {
6b6712ef 2812 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2813 }
2814 /*
2815 * If the end isn't at the start of a host page, then the
2816 * run doesn't finish at the end of a host page
2817 * and we need to discard.
2818 */
2819 host_offset = run_end % host_ratio;
2820 if (host_offset) {
2821 do_fixup = true;
2822 fixup_start_addr = run_end - host_offset;
2823 /*
2824 * This host page has gone, the next loop iteration starts
2825 * from after the fixup
2826 */
2827 run_start = fixup_start_addr + host_ratio;
2828 } else {
2829 /*
2830 * No discards on this iteration, next loop starts from
2831 * next sent/dirty page
2832 */
2833 run_start = run_end + 1;
2834 }
2835 }
2836
2837 if (do_fixup) {
2838 unsigned long page;
2839
2840 /* Tell the destination to discard this page */
2841 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2842 /* For the unsent_pass we:
2843 * discard partially sent pages
2844 * For the !unsent_pass (dirty) we:
2845 * discard partially dirty pages that were sent
2846 * (any partially sent pages were already discarded
2847 * by the previous unsent_pass)
2848 */
2849 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2850 host_ratio);
2851 }
2852
2853 /* Clean up the bitmap */
2854 for (page = fixup_start_addr;
2855 page < fixup_start_addr + host_ratio; page++) {
2856 /* All pages in this host page are now not sent */
2857 set_bit(page, unsentmap);
2858
2859 /*
2860 * Remark them as dirty, updating the count for any pages
2861 * that weren't previously dirty.
2862 */
0d8ec885 2863 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2864 }
2865 }
2866
2867 if (unsent_pass) {
2868 /* Find the next sent page for the next iteration */
6b6712ef 2869 run_start = find_next_zero_bit(unsentmap, pages, run_start);
99e314eb
DDAG
2870 } else {
2871 /* Find the next dirty page for the next iteration */
6b6712ef 2872 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2873 }
2874 }
2875}
2876
3d0684b2
JQ
2877/**
2878 * postcopy_chunk_hostpages: discard any partially sent host page
2879 *
99e314eb
DDAG
2880 * Utility for the outgoing postcopy code.
2881 *
2882 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2883 * dirty host-page size chunks as all dirty. In this case the host-page
2884 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2885 *
3d0684b2
JQ
2886 * Returns zero on success
2887 *
2888 * @ms: current migration state
6b6712ef 2889 * @block: block we want to work with
99e314eb 2890 */
6b6712ef 2891static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2892{
6b6712ef
JQ
2893 PostcopyDiscardState *pds =
2894 postcopy_discard_send_init(ms, block->idstr);
99e314eb 2895
6b6712ef
JQ
2896 /* First pass: Discard all partially sent host pages */
2897 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2898 /*
2899 * Second pass: Ensure that all partially dirty host pages are made
2900 * fully dirty.
2901 */
2902 postcopy_chunk_hostpages_pass(ms, false, block, pds);
99e314eb 2903
6b6712ef 2904 postcopy_discard_send_finish(ms, pds);
99e314eb
DDAG
2905 return 0;
2906}
2907
3d0684b2
JQ
2908/**
2909 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2910 *
2911 * Returns zero on success
2912 *
e0b266f0
DDAG
2913 * Transmit the set of pages to be discarded after precopy to the target
2914 * these are pages that:
2915 * a) Have been previously transmitted but are now dirty again
2916 * b) Pages that have never been transmitted; this ensures that
2917 * any pages on the destination that have been mapped by background
2918 * tasks get discarded (transparent huge pages is the specific concern)
2919 * Hopefully this is pretty sparse
3d0684b2
JQ
2920 *
2921 * @ms: current migration state
e0b266f0
DDAG
2922 */
2923int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2924{
53518d94 2925 RAMState *rs = ram_state;
6b6712ef 2926 RAMBlock *block;
e0b266f0 2927 int ret;
e0b266f0
DDAG
2928
2929 rcu_read_lock();
2930
2931 /* This should be our last sync, the src is now paused */
eb859c53 2932 migration_bitmap_sync(rs);
e0b266f0 2933
6b6712ef
JQ
2934 /* Easiest way to make sure we don't resume in the middle of a host-page */
2935 rs->last_seen_block = NULL;
2936 rs->last_sent_block = NULL;
2937 rs->last_page = 0;
e0b266f0 2938
fbd162e6 2939 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
6b6712ef
JQ
2940 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2941 unsigned long *bitmap = block->bmap;
2942 unsigned long *unsentmap = block->unsentmap;
2943
2944 if (!unsentmap) {
2945 /* We don't have a safe way to resize the sentmap, so
2946 * if the bitmap was resized it will be NULL at this
2947 * point.
2948 */
2949 error_report("migration ram resized during precopy phase");
2950 rcu_read_unlock();
2951 return -EINVAL;
2952 }
2953 /* Deal with TPS != HPS and huge pages */
2954 ret = postcopy_chunk_hostpages(ms, block);
2955 if (ret) {
2956 rcu_read_unlock();
2957 return ret;
2958 }
e0b266f0 2959
6b6712ef
JQ
2960 /*
2961 * Update the unsentmap to be unsentmap = unsentmap | dirty
2962 */
2963 bitmap_or(unsentmap, unsentmap, bitmap, pages);
e0b266f0 2964#ifdef DEBUG_POSTCOPY
6b6712ef 2965 ram_debug_dump_bitmap(unsentmap, true, pages);
e0b266f0 2966#endif
6b6712ef
JQ
2967 }
2968 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2969
2970 ret = postcopy_each_ram_send_discard(ms);
2971 rcu_read_unlock();
2972
2973 return ret;
2974}
2975
3d0684b2
JQ
2976/**
2977 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2978 *
3d0684b2 2979 * Returns zero on success
e0b266f0 2980 *
36449157
JQ
2981 * @rbname: name of the RAMBlock of the request. NULL means the
2982 * same as the last one.
3d0684b2
JQ
2983 * @start: RAMBlock starting page
2984 * @length: RAMBlock size
e0b266f0 2985 */
aaa2064c 2986int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
2987{
2988 int ret = -1;
2989
36449157 2990 trace_ram_discard_range(rbname, start, length);
d3a5038c 2991
e0b266f0 2992 rcu_read_lock();
36449157 2993 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2994
2995 if (!rb) {
36449157 2996 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
2997 goto err;
2998 }
2999
814bb08f
PX
3000 /*
3001 * On source VM, we don't need to update the received bitmap since
3002 * we don't even have one.
3003 */
3004 if (rb->receivedmap) {
3005 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3006 length >> qemu_target_page_bits());
3007 }
3008
d3a5038c 3009 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
3010
3011err:
3012 rcu_read_unlock();
3013
3014 return ret;
3015}
3016
84593a08
PX
3017/*
3018 * For every allocation, we will try not to crash the VM if the
3019 * allocation fails.
3020 */
3021static int xbzrle_init(void)
3022{
3023 Error *local_err = NULL;
3024
3025 if (!migrate_use_xbzrle()) {
3026 return 0;
3027 }
3028
3029 XBZRLE_cache_lock();
3030
3031 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3032 if (!XBZRLE.zero_target_page) {
3033 error_report("%s: Error allocating zero page", __func__);
3034 goto err_out;
3035 }
3036
3037 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3038 TARGET_PAGE_SIZE, &local_err);
3039 if (!XBZRLE.cache) {
3040 error_report_err(local_err);
3041 goto free_zero_page;
3042 }
3043
3044 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3045 if (!XBZRLE.encoded_buf) {
3046 error_report("%s: Error allocating encoded_buf", __func__);
3047 goto free_cache;
3048 }
3049
3050 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3051 if (!XBZRLE.current_buf) {
3052 error_report("%s: Error allocating current_buf", __func__);
3053 goto free_encoded_buf;
3054 }
3055
3056 /* We are all good */
3057 XBZRLE_cache_unlock();
3058 return 0;
3059
3060free_encoded_buf:
3061 g_free(XBZRLE.encoded_buf);
3062 XBZRLE.encoded_buf = NULL;
3063free_cache:
3064 cache_fini(XBZRLE.cache);
3065 XBZRLE.cache = NULL;
3066free_zero_page:
3067 g_free(XBZRLE.zero_target_page);
3068 XBZRLE.zero_target_page = NULL;
3069err_out:
3070 XBZRLE_cache_unlock();
3071 return -ENOMEM;
3072}
3073
53518d94 3074static int ram_state_init(RAMState **rsp)
56e93d26 3075{
7d00ee6a
PX
3076 *rsp = g_try_new0(RAMState, 1);
3077
3078 if (!*rsp) {
3079 error_report("%s: Init ramstate fail", __func__);
3080 return -1;
3081 }
53518d94
JQ
3082
3083 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3084 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3085 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 3086
7d00ee6a
PX
3087 /*
3088 * Count the total number of pages used by ram blocks not including any
3089 * gaps due to alignment or unplugs.
3090 */
3091 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3092
3093 ram_state_reset(*rsp);
3094
3095 return 0;
3096}
3097
d6eff5d7 3098static void ram_list_init_bitmaps(void)
7d00ee6a 3099{
d6eff5d7
PX
3100 RAMBlock *block;
3101 unsigned long pages;
56e93d26 3102
0827b9e9
AA
3103 /* Skip setting bitmap if there is no RAM */
3104 if (ram_bytes_total()) {
fbd162e6 3105 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d6eff5d7 3106 pages = block->max_length >> TARGET_PAGE_BITS;
6b6712ef
JQ
3107 block->bmap = bitmap_new(pages);
3108 bitmap_set(block->bmap, 0, pages);
3109 if (migrate_postcopy_ram()) {
3110 block->unsentmap = bitmap_new(pages);
3111 bitmap_set(block->unsentmap, 0, pages);
3112 }
0827b9e9 3113 }
f3f491fc 3114 }
d6eff5d7
PX
3115}
3116
3117static void ram_init_bitmaps(RAMState *rs)
3118{
3119 /* For memory_global_dirty_log_start below. */
3120 qemu_mutex_lock_iothread();
3121 qemu_mutex_lock_ramlist();
3122 rcu_read_lock();
f3f491fc 3123
d6eff5d7 3124 ram_list_init_bitmaps();
56e93d26 3125 memory_global_dirty_log_start();
d6eff5d7
PX
3126 migration_bitmap_sync(rs);
3127
3128 rcu_read_unlock();
56e93d26 3129 qemu_mutex_unlock_ramlist();
49877834 3130 qemu_mutex_unlock_iothread();
d6eff5d7
PX
3131}
3132
3133static int ram_init_all(RAMState **rsp)
3134{
3135 if (ram_state_init(rsp)) {
3136 return -1;
3137 }
3138
3139 if (xbzrle_init()) {
3140 ram_state_cleanup(rsp);
3141 return -1;
3142 }
3143
3144 ram_init_bitmaps(*rsp);
a91246c9
HZ
3145
3146 return 0;
3147}
3148
08614f34
PX
3149static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3150{
3151 RAMBlock *block;
3152 uint64_t pages = 0;
3153
3154 /*
3155 * Postcopy is not using xbzrle/compression, so no need for that.
3156 * Also, since the source is already halted, we don't need to care
3157 * about dirty page logging either.
3158 */
3159
fbd162e6 3160 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
08614f34
PX
3161 pages += bitmap_count_one(block->bmap,
3162 block->used_length >> TARGET_PAGE_BITS);
3163 }
3164
3165 /* This may not be aligned with current bitmaps. Recalculate. */
3166 rs->migration_dirty_pages = pages;
3167
3168 rs->last_seen_block = NULL;
3169 rs->last_sent_block = NULL;
3170 rs->last_page = 0;
3171 rs->last_version = ram_list.version;
3172 /*
3173 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3174 * matter what we have sent.
3175 */
3176 rs->ram_bulk_stage = false;
3177
3178 /* Update RAMState cache of output QEMUFile */
3179 rs->f = out;
3180
3181 trace_ram_state_resume_prepare(pages);
3182}
3183
6bcb05fc
WW
3184/*
3185 * This function clears bits of the free pages reported by the caller from the
3186 * migration dirty bitmap. @addr is the host address corresponding to the
3187 * start of the contiguous guest free pages, and @len is the total bytes of
3188 * those pages.
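 *
 * Illustrative example, assuming 4 KiB target pages: a 2 MiB hint that
 * starts 1 MiB into a RAMBlock clears 512 bits starting at bit 256 of
 * that block's dirty bitmap, and migration_dirty_pages drops by however
 * many of those bits were actually set.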
3189 */
3190void qemu_guest_free_page_hint(void *addr, size_t len)
3191{
3192 RAMBlock *block;
3193 ram_addr_t offset;
3194 size_t used_len, start, npages;
3195 MigrationState *s = migrate_get_current();
3196
3197 /* This function is currently expected to be used during live migration */
3198 if (!migration_is_setup_or_active(s->state)) {
3199 return;
3200 }
3201
3202 for (; len > 0; len -= used_len, addr += used_len) {
3203 block = qemu_ram_block_from_host(addr, false, &offset);
3204 if (unlikely(!block || offset >= block->used_length)) {
3205 /*
3206 * The implementation might not support RAMBlock resize during
3207 * live migration, but it could happen in theory with future
3208 * updates. So we add a check here to capture that case.
3209 */
3210 error_report_once("%s unexpected error", __func__);
3211 return;
3212 }
3213
3214 if (len <= block->used_length - offset) {
3215 used_len = len;
3216 } else {
3217 used_len = block->used_length - offset;
3218 }
3219
3220 start = offset >> TARGET_PAGE_BITS;
3221 npages = used_len >> TARGET_PAGE_BITS;
3222
3223 qemu_mutex_lock(&ram_state->bitmap_mutex);
3224 ram_state->migration_dirty_pages -=
3225 bitmap_count_one_with_offset(block->bmap, start, npages);
3226 bitmap_clear(block->bmap, start, npages);
3227 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3228 }
3229}
3230
3d0684b2
JQ
3231/*
3232 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
3233 * a long-running RCU critical section. When RCU reclaims in the code
3234 * start to become numerous it will be necessary to reduce the
3235 * granularity of these critical sections.
3236 */
3237
3d0684b2
JQ
3238/**
3239 * ram_save_setup: Setup RAM for migration
3240 *
3241 * Returns zero to indicate success and negative for error
3242 *
3243 * @f: QEMUFile where to send the data
3244 * @opaque: RAMState pointer
3245 */
a91246c9
HZ
3246static int ram_save_setup(QEMUFile *f, void *opaque)
3247{
53518d94 3248 RAMState **rsp = opaque;
a91246c9
HZ
3249 RAMBlock *block;
3250
dcaf446e
XG
3251 if (compress_threads_save_setup()) {
3252 return -1;
3253 }
3254
a91246c9
HZ
3255 /* migration has already set up the bitmap, reuse it. */
3256 if (!migration_in_colo_state()) {
7d00ee6a 3257 if (ram_init_all(rsp) != 0) {
dcaf446e 3258 compress_threads_save_cleanup();
a91246c9 3259 return -1;
53518d94 3260 }
a91246c9 3261 }
53518d94 3262 (*rsp)->f = f;
a91246c9
HZ
3263
3264 rcu_read_lock();
56e93d26 3265
fbd162e6 3266 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
56e93d26 3267
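    /*
     * The rest of the setup stream is, for each migratable block: the
     * idstr length, the idstr itself, the used_length, optionally the
     * page size (for postcopy with non-standard host page sizes) and,
     * with ignore-shared, the block address and its ignored flag.
     */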
b895de50 3268 RAMBLOCK_FOREACH_MIGRATABLE(block) {
56e93d26
JQ
3269 qemu_put_byte(f, strlen(block->idstr));
3270 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3271 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
3272 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3273 qemu_put_be64(f, block->page_size);
3274 }
fbd162e6
YK
3275 if (migrate_ignore_shared()) {
3276 qemu_put_be64(f, block->mr->addr);
3277 qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
3278 }
56e93d26
JQ
3279 }
3280
3281 rcu_read_unlock();
3282
3283 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3284 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3285
6df264ac 3286 multifd_send_sync_main();
56e93d26 3287 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3288 qemu_fflush(f);
56e93d26
JQ
3289
3290 return 0;
3291}
3292
3d0684b2
JQ
3293/**
3294 * ram_save_iterate: iterative stage for migration
3295 *
3296 * Returns zero to indicate success and negative for error
3297 *
3298 * @f: QEMUFile where to send the data
3299 * @opaque: RAMState pointer
3300 */
56e93d26
JQ
3301static int ram_save_iterate(QEMUFile *f, void *opaque)
3302{
53518d94
JQ
3303 RAMState **temp = opaque;
3304 RAMState *rs = *temp;
56e93d26
JQ
3305 int ret;
3306 int i;
3307 int64_t t0;
5c90308f 3308 int done = 0;
56e93d26 3309
b2557345
PL
3310 if (blk_mig_bulk_active()) {
3311 /* Avoid transferring ram during bulk phase of block migration as
3312 * the bulk phase will usually take a long time and transferring
3313 * ram updates during that time is pointless. */
3314 goto out;
3315 }
3316
56e93d26 3317 rcu_read_lock();
6f37bb8b
JQ
3318 if (ram_list.version != rs->last_version) {
3319 ram_state_reset(rs);
56e93d26
JQ
3320 }
3321
3322 /* Read version before ram_list.blocks */
3323 smp_rmb();
3324
3325 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3326
3327 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3328 i = 0;
e03a34f8
DDAG
3329 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3330 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
56e93d26
JQ
3331 int pages;
3332
e03a34f8
DDAG
3333 if (qemu_file_get_error(f)) {
3334 break;
3335 }
3336
ce25d337 3337 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
3338 /* no more pages to send */
3339 if (pages == 0) {
5c90308f 3340 done = 1;
56e93d26
JQ
3341 break;
3342 }
e8f3735f
XG
3343
3344 if (pages < 0) {
3345 qemu_file_set_error(f, pages);
3346 break;
3347 }
3348
be8b02ed 3349 rs->target_page_count += pages;
070afca2 3350
56e93d26
JQ
3351 /* we want to check in the 1st loop, just in case it was the 1st time
3352 and we had to sync the dirty bitmap.
3353 qemu_get_clock_ns() is a bit expensive, so we only check each some
3354 iterations
3355 */
3356 if ((i & 63) == 0) {
3357 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3358 if (t1 > MAX_WAIT) {
55c4446b 3359 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
3360 break;
3361 }
3362 }
3363 i++;
3364 }
56e93d26
JQ
3365 rcu_read_unlock();
3366
3367 /*
3368 * Must occur before EOS (or any QEMUFile operation)
3369 * because of RDMA protocol.
3370 */
3371 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3372
6df264ac 3373 multifd_send_sync_main();
b2557345 3374out:
56e93d26 3375 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3376 qemu_fflush(f);
9360447d 3377 ram_counters.transferred += 8;
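    /* The RAM_SAVE_FLAG_EOS marker just written is a single be64, hence 8. */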
56e93d26
JQ
3378
3379 ret = qemu_file_get_error(f);
3380 if (ret < 0) {
3381 return ret;
3382 }
3383
5c90308f 3384 return done;
56e93d26
JQ
3385}
3386
3d0684b2
JQ
3387/**
3388 * ram_save_complete: function called to send the remaining amount of ram
3389 *
e8f3735f 3390 * Returns zero to indicate success or negative on error
3d0684b2
JQ
3391 *
3392 * Called with iothread lock
3393 *
3394 * @f: QEMUFile where to send the data
3395 * @opaque: RAMState pointer
3396 */
56e93d26
JQ
3397static int ram_save_complete(QEMUFile *f, void *opaque)
3398{
53518d94
JQ
3399 RAMState **temp = opaque;
3400 RAMState *rs = *temp;
e8f3735f 3401 int ret = 0;
6f37bb8b 3402
56e93d26
JQ
3403 rcu_read_lock();
3404
5727309d 3405 if (!migration_in_postcopy()) {
8d820d6f 3406 migration_bitmap_sync(rs);
663e6c1d 3407 }
56e93d26
JQ
3408
3409 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3410
3411 /* try transferring iterative blocks of memory */
3412
3413 /* flush all remaining blocks regardless of rate limiting */
3414 while (true) {
3415 int pages;
3416
ce25d337 3417 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
3418 /* no more blocks to send */
3419 if (pages == 0) {
3420 break;
3421 }
e8f3735f
XG
3422 if (pages < 0) {
3423 ret = pages;
3424 break;
3425 }
56e93d26
JQ
3426 }
3427
ce25d337 3428 flush_compressed_data(rs);
56e93d26 3429 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
3430
3431 rcu_read_unlock();
d09a6fde 3432
6df264ac 3433 multifd_send_sync_main();
56e93d26 3434 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
35374cbd 3435 qemu_fflush(f);
56e93d26 3436
e8f3735f 3437 return ret;
56e93d26
JQ
3438}
3439
c31b098f 3440static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
3441 uint64_t *res_precopy_only,
3442 uint64_t *res_compatible,
3443 uint64_t *res_postcopy_only)
56e93d26 3444{
53518d94
JQ
3445 RAMState **temp = opaque;
3446 RAMState *rs = *temp;
56e93d26
JQ
3447 uint64_t remaining_size;
3448
9edabd4d 3449 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
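    /*
     * Illustrative numbers: 250000 dirty target pages of 4 KiB make
     * remaining_size roughly 1 GiB; this is what gets reported back to
     * the migration core below.
     */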
56e93d26 3450
5727309d 3451 if (!migration_in_postcopy() &&
663e6c1d 3452 remaining_size < max_size) {
56e93d26
JQ
3453 qemu_mutex_lock_iothread();
3454 rcu_read_lock();
8d820d6f 3455 migration_bitmap_sync(rs);
56e93d26
JQ
3456 rcu_read_unlock();
3457 qemu_mutex_unlock_iothread();
9edabd4d 3458 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 3459 }
c31b098f 3460
86e1167e
VSO
3461 if (migrate_postcopy_ram()) {
3462 /* We can do postcopy, and all the data is postcopiable */
47995026 3463 *res_compatible += remaining_size;
86e1167e 3464 } else {
47995026 3465 *res_precopy_only += remaining_size;
86e1167e 3466 }
56e93d26
JQ
3467}
3468
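/*
 * The XBZRLE payload on the wire is a one-byte encoding flag, a be16
 * length, and then the encoded data, which is decoded over the existing
 * contents of @host.
 */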
3469static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3470{
3471 unsigned int xh_len;
3472 int xh_flags;
063e760a 3473 uint8_t *loaded_data;
56e93d26 3474
56e93d26
JQ
3475 /* extract RLE header */
3476 xh_flags = qemu_get_byte(f);
3477 xh_len = qemu_get_be16(f);
3478
3479 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3480 error_report("Failed to load XBZRLE page - wrong compression!");
3481 return -1;
3482 }
3483
3484 if (xh_len > TARGET_PAGE_SIZE) {
3485 error_report("Failed to load XBZRLE page - len overflow!");
3486 return -1;
3487 }
f265e0e4 3488 loaded_data = XBZRLE.decoded_buf;
56e93d26 3489 /* load data and decode */
f265e0e4 3490 /* it can change loaded_data to point to an internal buffer */
063e760a 3491 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
3492
3493 /* decode RLE */
063e760a 3494 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
3495 TARGET_PAGE_SIZE) == -1) {
3496 error_report("Failed to load XBZRLE page - decode error!");
3497 return -1;
3498 }
3499
3500 return 0;
3501}
3502
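/*
 * A standalone sketch of the wire framing that load_xbzrle() parses: one
 * byte carrying the encoding flag, a big-endian 16-bit payload length, then
 * the XBZRLE payload itself (produced by xbzrle_encode_buffer() on the save
 * side, which is not shown here).  The buffer-based helper and its
 * example_ name are illustrative only.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static size_t example_frame_xbzrle(uint8_t *out, size_t out_len,
                                   uint8_t encoding_flag,
                                   const uint8_t *payload, uint16_t payload_len)
{
    if (out_len < (size_t)3 + payload_len) {
        return 0;                          /* not enough room for the frame */
    }
    out[0] = encoding_flag;                /* e.g. ENCODING_FLAG_XBZRLE */
    out[1] = (uint8_t)(payload_len >> 8);  /* big-endian 16-bit length */
    out[2] = (uint8_t)(payload_len & 0xff);
    memcpy(out + 3, payload, payload_len);
    return 3 + payload_len;
}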
3d0684b2
JQ
3503/**
3504 * ram_block_from_stream: read a RAMBlock id from the migration stream
3505 *
3506 * Must be called from within a rcu critical section.
3507 *
56e93d26 3508 * Returns a pointer from within the RCU-protected ram_list.
a7180877 3509 *
3d0684b2
JQ
3510 * @f: QEMUFile where to read the data from
3511 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
a7180877 3512 */
3d0684b2 3513static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
3514{
3515 static RAMBlock *block = NULL;
3516 char id[256];
3517 uint8_t len;
3518
3519 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 3520 if (!block) {
56e93d26
JQ
3521 error_report("Ack, bad migration stream!");
3522 return NULL;
3523 }
4c4bad48 3524 return block;
56e93d26
JQ
3525 }
3526
3527 len = qemu_get_byte(f);
3528 qemu_get_buffer(f, (uint8_t *)id, len);
3529 id[len] = 0;
3530
e3dd7493 3531 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
3532 if (!block) {
3533 error_report("Can't find block %s", id);
3534 return NULL;
56e93d26
JQ
3535 }
3536
fbd162e6 3537 if (ramblock_is_ignored(block)) {
b895de50
CLG
3538 error_report("block %s should not be migrated !", id);
3539 return NULL;
3540 }
3541
4c4bad48
HZ
3542 return block;
3543}
3544
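/*
 * A standalone sketch of the block-id header that ram_block_from_stream()
 * reads: a one-byte length followed by the idstr bytes (no trailing NUL on
 * the wire).  When a page belongs to the same block as the previous one,
 * the sender sets RAM_SAVE_FLAG_CONTINUE in the page flags and omits this
 * header entirely.  The encoder below writes into a plain buffer rather
 * than a QEMUFile so it stays self-contained; the example_ name is
 * illustrative.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Returns the number of bytes written, or -1 if it does not fit. */
static int example_put_block_id(uint8_t *buf, size_t buflen, const char *idstr)
{
    size_t len = strlen(idstr);

    if (len > 255 || buflen < 1 + len) {
        return -1;
    }
    buf[0] = (uint8_t)len;          /* one-byte length prefix */
    memcpy(buf + 1, idstr, len);    /* idstr bytes, NUL added by the reader */
    return (int)(1 + len);
}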
3545static inline void *host_from_ram_block_offset(RAMBlock *block,
3546 ram_addr_t offset)
3547{
3548 if (!offset_in_ramblock(block, offset)) {
3549 return NULL;
3550 }
3551
3552 return block->host + offset;
56e93d26
JQ
3553}
3554
13af18f2
ZC
3555static inline void *colo_cache_from_block_offset(RAMBlock *block,
3556 ram_addr_t offset)
3557{
3558 if (!offset_in_ramblock(block, offset)) {
3559 return NULL;
3560 }
3561 if (!block->colo_cache) {
3562 error_report("%s: colo_cache is NULL in block :%s",
3563 __func__, block->idstr);
3564 return NULL;
3565 }
7d9acafa
ZC
3566
3567 /*
3568 * During a COLO checkpoint we need a bitmap of these migrated pages.
3569 * It helps us decide which pages in the RAM cache should be flushed
3570 * into the SVM's RAM later.
3571 */
3572 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3573 ram_state->migration_dirty_pages++;
3574 }
13af18f2
ZC
3575 return block->colo_cache + offset;
3576}
3577
3d0684b2
JQ
3578/**
3579 * ram_handle_compressed: handle the zero page case
3580 *
56e93d26
JQ
3581 * If a page (or a whole RDMA chunk) has been
3582 * determined to be zero, then zap it.
3d0684b2
JQ
3583 *
3584 * @host: host address for the zero page
3585 * @ch: the byte the page is filled with; only zero is supported
3586 * @size: size of the zero page
56e93d26
JQ
3587 */
3588void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3589{
3590 if (ch != 0 || !is_zero_range(host, size)) {
3591 memset(host, ch, size);
3592 }
3593}
3594
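/*
 * A standalone illustration of the "zap" above: the write is skipped when
 * the destination range is already zero, so untouched anonymous memory on
 * the destination is not dirtied (or even allocated) just to store zeroes.
 * The example_ helpers are illustrative, not QEMU code.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static bool example_range_is_zero(const uint8_t *p, size_t size)
{
    /* all bytes equal p[0], and p[0] is zero */
    return size == 0 || (p[0] == 0 && memcmp(p, p + 1, size - 1) == 0);
}

static void example_zap_page(uint8_t *host, uint8_t ch, size_t size)
{
    if (ch != 0 || !example_range_is_zero(host, size)) {
        memset(host, ch, size);     /* only touch the page when necessary */
    }
}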
797ca154
XG
3595/* return the size after decompression, or negative value on error */
3596static int
3597qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3598 const uint8_t *source, size_t source_len)
3599{
3600 int err;
3601
3602 err = inflateReset(stream);
3603 if (err != Z_OK) {
3604 return -1;
3605 }
3606
3607 stream->avail_in = source_len;
3608 stream->next_in = (uint8_t *)source;
3609 stream->avail_out = dest_len;
3610 stream->next_out = dest;
3611
3612 err = inflate(stream, Z_NO_FLUSH);
3613 if (err != Z_STREAM_END) {
3614 return -1;
3615 }
3616
3617 return stream->total_out;
3618}
3619
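/*
 * A standalone zlib sketch of producing data that qemu_uncompress_data()
 * can inflate: one page compressed into a compressBound()-sized buffer in
 * the default zlib format.  This is not QEMU's save-side implementation
 * (that uses per-thread z_streams elsewhere in this file); it only
 * illustrates the data format, and the example_ name is illustrative.
 */
#include <stddef.h>
#include <stdint.h>
#include <zlib.h>

/* Returns the compressed length, or a negative zlib error code. */
static long example_compress_page(uint8_t *dst, size_t dst_len,
                                  const uint8_t *page, size_t page_size)
{
    uLongf out_len = dst_len;
    int err = compress2(dst, &out_len, page, page_size, Z_BEST_SPEED);

    if (err != Z_OK) {
        return err;                 /* Z_BUF_ERROR, Z_MEM_ERROR, ... */
    }
    return (long)out_len;
}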
56e93d26
JQ
3620static void *do_data_decompress(void *opaque)
3621{
3622 DecompressParam *param = opaque;
3623 unsigned long pagesize;
33d151f4 3624 uint8_t *des;
34ab9e97 3625 int len, ret;
56e93d26 3626
33d151f4 3627 qemu_mutex_lock(&param->mutex);
90e56fb4 3628 while (!param->quit) {
33d151f4
LL
3629 if (param->des) {
3630 des = param->des;
3631 len = param->len;
3632 param->des = 0;
3633 qemu_mutex_unlock(&param->mutex);
3634
56e93d26 3635 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
3636
3637 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3638 param->compbuf, len);
f548222c 3639 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
3640 error_report("decompress data failed");
3641 qemu_file_set_error(decomp_file, ret);
3642 }
73a8912b 3643
33d151f4
LL
3644 qemu_mutex_lock(&decomp_done_lock);
3645 param->done = true;
3646 qemu_cond_signal(&decomp_done_cond);
3647 qemu_mutex_unlock(&decomp_done_lock);
3648
3649 qemu_mutex_lock(&param->mutex);
3650 } else {
3651 qemu_cond_wait(&param->cond, &param->mutex);
3652 }
56e93d26 3653 }
33d151f4 3654 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
3655
3656 return NULL;
3657}
3658
34ab9e97 3659static int wait_for_decompress_done(void)
5533b2e9
LL
3660{
3661 int idx, thread_count;
3662
3663 if (!migrate_use_compression()) {
34ab9e97 3664 return 0;
5533b2e9
LL
3665 }
3666
3667 thread_count = migrate_decompress_threads();
3668 qemu_mutex_lock(&decomp_done_lock);
3669 for (idx = 0; idx < thread_count; idx++) {
3670 while (!decomp_param[idx].done) {
3671 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3672 }
3673 }
3674 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 3675 return qemu_file_get_error(decomp_file);
5533b2e9
LL
3676}
3677
f0afa331 3678static void compress_threads_load_cleanup(void)
56e93d26
JQ
3679{
3680 int i, thread_count;
3681
3416ab5b
JQ
3682 if (!migrate_use_compression()) {
3683 return;
3684 }
56e93d26
JQ
3685 thread_count = migrate_decompress_threads();
3686 for (i = 0; i < thread_count; i++) {
797ca154
XG
3687 /*
3688 * we use it as an indicator of whether the thread has been
3689 * properly initialized or not
3690 */
3691 if (!decomp_param[i].compbuf) {
3692 break;
3693 }
3694
56e93d26 3695 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 3696 decomp_param[i].quit = true;
56e93d26
JQ
3697 qemu_cond_signal(&decomp_param[i].cond);
3698 qemu_mutex_unlock(&decomp_param[i].mutex);
3699 }
3700 for (i = 0; i < thread_count; i++) {
797ca154
XG
3701 if (!decomp_param[i].compbuf) {
3702 break;
3703 }
3704
56e93d26
JQ
3705 qemu_thread_join(decompress_threads + i);
3706 qemu_mutex_destroy(&decomp_param[i].mutex);
3707 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 3708 inflateEnd(&decomp_param[i].stream);
56e93d26 3709 g_free(decomp_param[i].compbuf);
797ca154 3710 decomp_param[i].compbuf = NULL;
56e93d26
JQ
3711 }
3712 g_free(decompress_threads);
3713 g_free(decomp_param);
56e93d26
JQ
3714 decompress_threads = NULL;
3715 decomp_param = NULL;
34ab9e97 3716 decomp_file = NULL;
56e93d26
JQ
3717}
3718
34ab9e97 3719static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
3720{
3721 int i, thread_count;
3722
3723 if (!migrate_use_compression()) {
3724 return 0;
3725 }
3726
3727 thread_count = migrate_decompress_threads();
3728 decompress_threads = g_new0(QemuThread, thread_count);
3729 decomp_param = g_new0(DecompressParam, thread_count);
3730 qemu_mutex_init(&decomp_done_lock);
3731 qemu_cond_init(&decomp_done_cond);
34ab9e97 3732 decomp_file = f;
797ca154
XG
3733 for (i = 0; i < thread_count; i++) {
3734 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3735 goto exit;
3736 }
3737
3738 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3739 qemu_mutex_init(&decomp_param[i].mutex);
3740 qemu_cond_init(&decomp_param[i].cond);
3741 decomp_param[i].done = true;
3742 decomp_param[i].quit = false;
3743 qemu_thread_create(decompress_threads + i, "decompress",
3744 do_data_decompress, decomp_param + i,
3745 QEMU_THREAD_JOINABLE);
3746 }
3747 return 0;
3748exit:
3749 compress_threads_load_cleanup();
3750 return -1;
3751}
3752
c1bc6626 3753static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3754 void *host, int len)
3755{
3756 int idx, thread_count;
3757
3758 thread_count = migrate_decompress_threads();
73a8912b 3759 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
3760 while (true) {
3761 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3762 if (decomp_param[idx].done) {
33d151f4
LL
3763 decomp_param[idx].done = false;
3764 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3765 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3766 decomp_param[idx].des = host;
3767 decomp_param[idx].len = len;
33d151f4
LL
3768 qemu_cond_signal(&decomp_param[idx].cond);
3769 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3770 break;
3771 }
3772 }
3773 if (idx < thread_count) {
3774 break;
73a8912b
LL
3775 } else {
3776 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3777 }
3778 }
73a8912b 3779 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3780}
3781
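/*
 * A condensed, pthread-based sketch of the hand-off protocol used by
 * decompress_data_with_multi_threads() and do_data_decompress(): the feeder
 * claims an idle worker under a shared "done" lock, passes the job under
 * the worker's own mutex, and the worker flips "done" back and signals the
 * shared condition variable when finished.  One worker only, example_
 * names, and the struct fields are assumed to be initialized (done = true,
 * quit = false, job = NULL) before the thread starts.
 */
#include <pthread.h>
#include <stdbool.h>

struct example_worker {
    pthread_mutex_t mutex;              /* protects job and quit */
    pthread_cond_t cond;                /* "new job available" */
    void *job;
    bool quit;
    bool done;                          /* protected by example_done_lock */
};

static pthread_mutex_t example_done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t example_done_cond = PTHREAD_COND_INITIALIZER;

static void example_feed(struct example_worker *w, void *job)
{
    pthread_mutex_lock(&example_done_lock);
    while (!w->done) {                  /* wait for the worker to go idle */
        pthread_cond_wait(&example_done_cond, &example_done_lock);
    }
    w->done = false;
    pthread_mutex_unlock(&example_done_lock);

    pthread_mutex_lock(&w->mutex);      /* hand over the job */
    w->job = job;
    pthread_cond_signal(&w->cond);
    pthread_mutex_unlock(&w->mutex);
}

static void *example_worker_thread(void *opaque)
{
    struct example_worker *w = opaque;

    pthread_mutex_lock(&w->mutex);
    while (!w->quit) {
        if (w->job) {
            void *job = w->job;
            w->job = NULL;
            pthread_mutex_unlock(&w->mutex);

            (void)job;                  /* ... decompress the job here ... */

            pthread_mutex_lock(&example_done_lock);
            w->done = true;
            pthread_cond_signal(&example_done_cond);
            pthread_mutex_unlock(&example_done_lock);

            pthread_mutex_lock(&w->mutex);
        } else {
            pthread_cond_wait(&w->cond, &w->mutex);
        }
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}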
13af18f2
ZC
3782/*
3783 * colo cache: this is for the secondary VM; we cache the whole
3784 * memory of the secondary VM, and the global lock must be held
3785 * to call this helper.
3786 */
3787int colo_init_ram_cache(void)
3788{
3789 RAMBlock *block;
3790
3791 rcu_read_lock();
fbd162e6 3792 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
13af18f2
ZC
3793 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3794 NULL,
3795 false);
3796 if (!block->colo_cache) {
3797 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3798 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3799 block->used_length);
3800 goto out_locked;
3801 }
3802 memcpy(block->colo_cache, block->host, block->used_length);
3803 }
3804 rcu_read_unlock();
7d9acafa
ZC
3805 /*
3806 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3807 * to decide which pages in the cache should be flushed into the SVM's RAM.
3808 * Here we use the same name 'ram_bitmap' as for migration.
3809 */
3810 if (ram_bytes_total()) {
3811 RAMBlock *block;
3812
fbd162e6 3813 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3814 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3815
3816 block->bmap = bitmap_new(pages);
3817 bitmap_set(block->bmap, 0, pages);
3818 }
3819 }
3820 ram_state = g_new0(RAMState, 1);
3821 ram_state->migration_dirty_pages = 0;
d1955d22 3822 memory_global_dirty_log_start();
7d9acafa 3823
13af18f2
ZC
3824 return 0;
3825
3826out_locked:
7d9acafa 3827
fbd162e6 3828 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
13af18f2
ZC
3829 if (block->colo_cache) {
3830 qemu_anon_ram_free(block->colo_cache, block->used_length);
3831 block->colo_cache = NULL;
3832 }
3833 }
3834
3835 rcu_read_unlock();
3836 return -errno;
3837}
3838
3839/* The global lock must be held to call this helper */
3840void colo_release_ram_cache(void)
3841{
3842 RAMBlock *block;
3843
d1955d22 3844 memory_global_dirty_log_stop();
fbd162e6 3845 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
7d9acafa
ZC
3846 g_free(block->bmap);
3847 block->bmap = NULL;
3848 }
3849
13af18f2 3850 rcu_read_lock();
7d9acafa 3851
fbd162e6 3852 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
13af18f2
ZC
3853 if (block->colo_cache) {
3854 qemu_anon_ram_free(block->colo_cache, block->used_length);
3855 block->colo_cache = NULL;
3856 }
3857 }
7d9acafa 3858
13af18f2 3859 rcu_read_unlock();
7d9acafa
ZC
3860 g_free(ram_state);
3861 ram_state = NULL;
13af18f2
ZC
3862}
3863
f265e0e4
JQ
3864/**
3865 * ram_load_setup: Setup RAM for migration incoming side
3866 *
3867 * Returns zero to indicate success and negative for error
3868 *
3869 * @f: QEMUFile where to receive the data
3870 * @opaque: RAMState pointer
3871 */
3872static int ram_load_setup(QEMUFile *f, void *opaque)
3873{
34ab9e97 3874 if (compress_threads_load_setup(f)) {
797ca154
XG
3875 return -1;
3876 }
3877
f265e0e4 3878 xbzrle_load_setup();
f9494614 3879 ramblock_recv_map_init();
13af18f2 3880
f265e0e4
JQ
3881 return 0;
3882}
3883
3884static int ram_load_cleanup(void *opaque)
3885{
f9494614 3886 RAMBlock *rb;
56eb90af 3887
fbd162e6 3888 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
56eb90af
JH
3889 if (ramblock_is_pmem(rb)) {
3890 pmem_persist(rb->host, rb->used_length);
3891 }
3892 }
3893
f265e0e4 3894 xbzrle_load_cleanup();
f0afa331 3895 compress_threads_load_cleanup();
f9494614 3896
fbd162e6 3897 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
f9494614
AP
3898 g_free(rb->receivedmap);
3899 rb->receivedmap = NULL;
3900 }
13af18f2 3901
f265e0e4
JQ
3902 return 0;
3903}
3904
3d0684b2
JQ
3905/**
3906 * ram_postcopy_incoming_init: allocate postcopy data structures
3907 *
3908 * Returns 0 for success and negative if there was an error
3909 *
3910 * @mis: current migration incoming state
3911 *
3912 * Allocate data structures etc needed by incoming migration with
3913 * postcopy-ram. postcopy-ram's similarly named
3914 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3915 */
3916int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3917{
c136180c 3918 return postcopy_ram_incoming_init(mis);
1caddf8a
DDAG
3919}
3920
3d0684b2
JQ
3921/**
3922 * ram_load_postcopy: load a page in postcopy case
3923 *
3924 * Returns 0 for success or -errno in case of error
3925 *
a7180877
DDAG
3926 * Called in postcopy mode by ram_load().
3927 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3928 *
3929 * @f: QEMUFile to receive the data from
a7180877
DDAG
3930 */
3931static int ram_load_postcopy(QEMUFile *f)
3932{
3933 int flags = 0, ret = 0;
3934 bool place_needed = false;
1aa83678 3935 bool matches_target_page_size = false;
a7180877
DDAG
3936 MigrationIncomingState *mis = migration_incoming_get_current();
3937 /* Temporary page that is later 'placed' */
3938 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 3939 void *last_host = NULL;
a3b6ff6d 3940 bool all_zero = false;
a7180877
DDAG
3941
3942 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3943 ram_addr_t addr;
3944 void *host = NULL;
3945 void *page_buffer = NULL;
3946 void *place_source = NULL;
df9ff5e1 3947 RAMBlock *block = NULL;
a7180877 3948 uint8_t ch;
a7180877
DDAG
3949
3950 addr = qemu_get_be64(f);
7a9ddfbf
PX
3951
3952 /*
3953 * If qemu file error, we should stop here, and then "addr"
3954 * may be invalid
3955 */
3956 ret = qemu_file_get_error(f);
3957 if (ret) {
3958 break;
3959 }
3960
a7180877
DDAG
3961 flags = addr & ~TARGET_PAGE_MASK;
3962 addr &= TARGET_PAGE_MASK;
3963
3964 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3965 place_needed = false;
bb890ed5 3966 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 3967 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
3968
3969 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
3970 if (!host) {
3971 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3972 ret = -EINVAL;
3973 break;
3974 }
1aa83678 3975 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
a7180877 3976 /*
28abd200
DDAG
3977 * Postcopy requires that we place whole host pages atomically;
3978 * these may be huge pages for RAMBlocks that are backed by
3979 * hugetlbfs.
a7180877
DDAG
3980 * To make it atomic, the data is read into a temporary page
3981 * that's moved into place later.
3982 * The migration protocol uses, possibly smaller, target pages;
3983 * however, the source ensures it always sends all the components
3984 * of a host page in order.
3985 */
3986 page_buffer = postcopy_host_page +
28abd200 3987 ((uintptr_t)host & (block->page_size - 1));
a7180877 3988 /* If all TP are zero then we can optimise the place */
28abd200 3989 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 3990 all_zero = true;
c53b7ddc
DDAG
3991 } else {
3992 /* not the 1st TP within the HP */
3993 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 3994 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
3995 host, last_host);
3996 ret = -EINVAL;
3997 break;
3998 }
a7180877
DDAG
3999 }
4000
c53b7ddc 4001
a7180877
DDAG
4002 /*
4003 * If it's the last part of a host page then we place the host
4004 * page
4005 */
4006 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 4007 (block->page_size - 1)) == 0;
a7180877
DDAG
4008 place_source = postcopy_host_page;
4009 }
c53b7ddc 4010 last_host = host;
a7180877
DDAG
4011
4012 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 4013 case RAM_SAVE_FLAG_ZERO:
a7180877
DDAG
4014 ch = qemu_get_byte(f);
4015 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4016 if (ch) {
4017 all_zero = false;
4018 }
4019 break;
4020
4021 case RAM_SAVE_FLAG_PAGE:
4022 all_zero = false;
1aa83678
PX
4023 if (!matches_target_page_size) {
4024 /* For huge pages, we always use a temporary buffer */
a7180877
DDAG
4025 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4026 } else {
1aa83678
PX
4027 /*
4028 * For small pages that match the target page size, we
4029 * avoid the qemu_file copy. Instead we directly use
4030 * the QEMUFile's buffer to place the page. Note: we
4031 * cannot do any QEMUFile operation before using that
4032 * buffer, to make sure it is still valid when
4033 * placing the page.
a7180877
DDAG
4034 */
4035 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4036 TARGET_PAGE_SIZE);
4037 }
4038 break;
4039 case RAM_SAVE_FLAG_EOS:
4040 /* normal exit */
6df264ac 4041 multifd_recv_sync_main();
a7180877
DDAG
4042 break;
4043 default:
4044 error_report("Unknown combination of migration flags: %#x"
4045 " (postcopy mode)", flags);
4046 ret = -EINVAL;
7a9ddfbf
PX
4047 break;
4048 }
4049
4050 /* Detect for any possible file errors */
4051 if (!ret && qemu_file_get_error(f)) {
4052 ret = qemu_file_get_error(f);
a7180877
DDAG
4053 }
4054
7a9ddfbf 4055 if (!ret && place_needed) {
a7180877 4056 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
4057 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4058
a7180877 4059 if (all_zero) {
df9ff5e1 4060 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 4061 block);
a7180877 4062 } else {
df9ff5e1 4063 ret = postcopy_place_page(mis, place_dest,
8be4620b 4064 place_source, block);
a7180877
DDAG
4065 }
4066 }
a7180877
DDAG
4067 }
4068
4069 return ret;
4070}
4071
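/*
 * A standalone worked example of the host-page assembly arithmetic used in
 * ram_load_postcopy(): for a hugetlbfs-backed block (say 2 MiB host pages)
 * and 4 KiB target pages, the first target page of a host page resets the
 * temporary buffer, and only the last one triggers the atomic placement.
 * The names and the sizes in the comment are illustrative assumptions.
 */
#include <stdbool.h>
#include <stdint.h>

struct example_tp_position {
    uintptr_t buffer_offset;     /* where this target page lands in the tmp page */
    bool first_in_host_page;     /* start of a new host page: reset "all zero" */
    bool place_needed;           /* last target page: place the host page now */
};

static struct example_tp_position
example_locate(uintptr_t host, uintptr_t host_page_size,
               uintptr_t target_page_size)
{
    struct example_tp_position pos;

    pos.buffer_offset = host & (host_page_size - 1);
    pos.first_in_host_page = pos.buffer_offset == 0;
    pos.place_needed =
        ((host + target_page_size) & (host_page_size - 1)) == 0;
    return pos;
}

/*
 * e.g. host_page_size = 2 MiB, target_page_size = 4 KiB:
 *   host offset 0x000000 -> first_in_host_page, buffer_offset 0
 *   host offset 0x001000 -> buffer_offset 0x1000
 *   host offset 0x1ff000 -> buffer_offset 0x1ff000 and place_needed
 */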
acab30b8
DHB
4072static bool postcopy_is_advised(void)
4073{
4074 PostcopyState ps = postcopy_state_get();
4075 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4076}
4077
4078static bool postcopy_is_running(void)
4079{
4080 PostcopyState ps = postcopy_state_get();
4081 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4082}
4083
e6f4aa18
ZC
4084/*
4085 * Flush the contents of the RAM cache into the SVM's memory.
4086 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4087 */
4088static void colo_flush_ram_cache(void)
4089{
4090 RAMBlock *block = NULL;
4091 void *dst_host;
4092 void *src_host;
4093 unsigned long offset = 0;
4094
d1955d22
HZ
4095 memory_global_dirty_log_sync();
4096 rcu_read_lock();
fbd162e6 4097 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
d1955d22
HZ
4098 migration_bitmap_sync_range(ram_state, block, 0, block->used_length);
4099 }
4100 rcu_read_unlock();
4101
e6f4aa18
ZC
4102 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4103 rcu_read_lock();
4104 block = QLIST_FIRST_RCU(&ram_list.blocks);
4105
4106 while (block) {
4107 offset = migration_bitmap_find_dirty(ram_state, block, offset);
4108
4109 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4110 offset = 0;
4111 block = QLIST_NEXT_RCU(block, next);
4112 } else {
4113 migration_bitmap_clear_dirty(ram_state, block, offset);
4114 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4115 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4116 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4117 }
4118 }
4119
4120 rcu_read_unlock();
4121 trace_colo_flush_ram_cache_end();
4122}
4123
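/*
 * A standalone sketch of the "test a dirty bit, clear it, copy the page"
 * walk that colo_flush_ram_cache() performs, using a plain bit array
 * instead of QEMU's bitmap helpers.  The page size and example_ names are
 * illustrative assumptions.
 */
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define EXAMPLE_PAGE_SIZE 4096u

static bool example_test_and_clear(unsigned long *bmap, size_t bit)
{
    size_t bits_per_long = sizeof(unsigned long) * CHAR_BIT;
    unsigned long mask = 1UL << (bit % bits_per_long);
    size_t word = bit / bits_per_long;
    bool was_set = bmap[word] & mask;

    bmap[word] &= ~mask;
    return was_set;
}

static void example_flush_cache(uint8_t *ram, const uint8_t *cache,
                                unsigned long *bmap, size_t npages)
{
    for (size_t page = 0; page < npages; page++) {
        if (example_test_and_clear(bmap, page)) {
            memcpy(ram + page * EXAMPLE_PAGE_SIZE,
                   cache + page * EXAMPLE_PAGE_SIZE,
                   EXAMPLE_PAGE_SIZE);
        }
    }
}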
56e93d26
JQ
4124static int ram_load(QEMUFile *f, void *opaque, int version_id)
4125{
edc60127 4126 int flags = 0, ret = 0, invalid_flags = 0;
56e93d26
JQ
4127 static uint64_t seq_iter;
4128 int len = 0;
a7180877
DDAG
4129 /*
4130 * If the system is running in postcopy mode, page inserts into host
4131 * memory must be atomic
4132 */
acab30b8 4133 bool postcopy_running = postcopy_is_running();
ef08fb38 4134 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
acab30b8 4135 bool postcopy_advised = postcopy_is_advised();
56e93d26
JQ
4136
4137 seq_iter++;
4138
4139 if (version_id != 4) {
4140 ret = -EINVAL;
4141 }
4142
edc60127
JQ
4143 if (!migrate_use_compression()) {
4144 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4145 }
56e93d26
JQ
4146 /* This RCU critical section can be very long running.
4147 * When RCU reclaims in the code start to become numerous,
4148 * it will be necessary to reduce the granularity of this
4149 * critical section.
4150 */
4151 rcu_read_lock();
a7180877
DDAG
4152
4153 if (postcopy_running) {
4154 ret = ram_load_postcopy(f);
4155 }
4156
4157 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 4158 ram_addr_t addr, total_ram_bytes;
a776aa15 4159 void *host = NULL;
56e93d26
JQ
4160 uint8_t ch;
4161
4162 addr = qemu_get_be64(f);
4163 flags = addr & ~TARGET_PAGE_MASK;
4164 addr &= TARGET_PAGE_MASK;
4165
edc60127
JQ
4166 if (flags & invalid_flags) {
4167 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4168 error_report("Received an unexpected compressed page");
4169 }
4170
4171 ret = -EINVAL;
4172 break;
4173 }
4174
bb890ed5 4175 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 4176 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
4177 RAMBlock *block = ram_block_from_stream(f, flags);
4178
13af18f2
ZC
4179 /*
4180 * After going into COLO, we should load the Page into colo_cache.
4181 */
4182 if (migration_incoming_in_colo_state()) {
4183 host = colo_cache_from_block_offset(block, addr);
4184 } else {
4185 host = host_from_ram_block_offset(block, addr);
4186 }
a776aa15
DDAG
4187 if (!host) {
4188 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4189 ret = -EINVAL;
4190 break;
4191 }
13af18f2
ZC
4192
4193 if (!migration_incoming_in_colo_state()) {
4194 ramblock_recv_bitmap_set(block, host);
4195 }
4196
1db9d8e5 4197 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
4198 }
4199
56e93d26
JQ
4200 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4201 case RAM_SAVE_FLAG_MEM_SIZE:
4202 /* Synchronize RAM block list */
4203 total_ram_bytes = addr;
4204 while (!ret && total_ram_bytes) {
4205 RAMBlock *block;
56e93d26
JQ
4206 char id[256];
4207 ram_addr_t length;
4208
4209 len = qemu_get_byte(f);
4210 qemu_get_buffer(f, (uint8_t *)id, len);
4211 id[len] = 0;
4212 length = qemu_get_be64(f);
4213
e3dd7493 4214 block = qemu_ram_block_by_name(id);
b895de50
CLG
4215 if (block && !qemu_ram_is_migratable(block)) {
4216 error_report("block %s should not be migrated !", id);
4217 ret = -EINVAL;
4218 } else if (block) {
e3dd7493
DDAG
4219 if (length != block->used_length) {
4220 Error *local_err = NULL;
56e93d26 4221
fa53a0e5 4222 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
4223 &local_err);
4224 if (local_err) {
4225 error_report_err(local_err);
56e93d26 4226 }
56e93d26 4227 }
ef08fb38
DDAG
4228 /* For postcopy we need to check hugepage sizes match */
4229 if (postcopy_advised &&
4230 block->page_size != qemu_host_page_size) {
4231 uint64_t remote_page_size = qemu_get_be64(f);
4232 if (remote_page_size != block->page_size) {
4233 error_report("Mismatched RAM page size %s "
4234 "(local) %zd != %" PRId64,
4235 id, block->page_size,
4236 remote_page_size);
4237 ret = -EINVAL;
4238 }
4239 }
fbd162e6
YK
4240 if (migrate_ignore_shared()) {
4241 hwaddr addr = qemu_get_be64(f);
4242 bool ignored = qemu_get_byte(f);
4243 if (ignored != ramblock_is_ignored(block)) {
4244 error_report("RAM block %s should %s be migrated",
4245 id, ignored ? "" : "not");
4246 ret = -EINVAL;
4247 }
4248 if (ramblock_is_ignored(block) &&
4249 block->mr->addr != addr) {
4250 error_report("Mismatched GPAs for block %s "
4251 "%" PRId64 "!= %" PRId64,
4252 id, (uint64_t)addr,
4253 (uint64_t)block->mr->addr);
4254 ret = -EINVAL;
4255 }
4256 }
e3dd7493
DDAG
4257 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4258 block->idstr);
4259 } else {
56e93d26
JQ
4260 error_report("Unknown ramblock \"%s\", cannot "
4261 "accept migration", id);
4262 ret = -EINVAL;
4263 }
4264
4265 total_ram_bytes -= length;
4266 }
4267 break;
a776aa15 4268
bb890ed5 4269 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
4270 ch = qemu_get_byte(f);
4271 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4272 break;
a776aa15 4273
56e93d26 4274 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
4275 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4276 break;
56e93d26 4277
a776aa15 4278 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
4279 len = qemu_get_be32(f);
4280 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4281 error_report("Invalid compressed data length: %d", len);
4282 ret = -EINVAL;
4283 break;
4284 }
c1bc6626 4285 decompress_data_with_multi_threads(f, host, len);
56e93d26 4286 break;
a776aa15 4287
56e93d26 4288 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
4289 if (load_xbzrle(f, addr, host) < 0) {
4290 error_report("Failed to decompress XBZRLE page at "
4291 RAM_ADDR_FMT, addr);
4292 ret = -EINVAL;
4293 break;
4294 }
4295 break;
4296 case RAM_SAVE_FLAG_EOS:
4297 /* normal exit */
6df264ac 4298 multifd_recv_sync_main();
56e93d26
JQ
4299 break;
4300 default:
4301 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 4302 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
4303 } else {
4304 error_report("Unknown combination of migration flags: %#x",
4305 flags);
4306 ret = -EINVAL;
4307 }
4308 }
4309 if (!ret) {
4310 ret = qemu_file_get_error(f);
4311 }
4312 }
4313
34ab9e97 4314 ret |= wait_for_decompress_done();
56e93d26 4315 rcu_read_unlock();
55c4446b 4316 trace_ram_load_complete(ret, seq_iter);
e6f4aa18
ZC
4317
4318 if (!ret && migration_incoming_in_colo_state()) {
4319 colo_flush_ram_cache();
4320 }
56e93d26
JQ
4321 return ret;
4322}
4323
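/*
 * A standalone sketch of the chunk header packing that both ram_load() and
 * ram_load_postcopy() rely on: each chunk starts with a 64-bit value whose
 * bits below the target page size carry the RAM_SAVE_FLAG_* bits, while
 * the page-aligned remainder is the offset within the current RAMBlock.
 * A 4 KiB target page is assumed here for illustration; names are not QEMU's.
 */
#include <stdint.h>

#define EXAMPLE_TARGET_PAGE_BITS 12
#define EXAMPLE_TARGET_PAGE_MASK \
    (~((UINT64_C(1) << EXAMPLE_TARGET_PAGE_BITS) - 1))

static uint64_t example_pack_header(uint64_t page_offset, unsigned flags)
{
    return (page_offset & EXAMPLE_TARGET_PAGE_MASK) | flags;
}

static void example_unpack_header(uint64_t header, uint64_t *page_offset,
                                  unsigned *flags)
{
    *flags = (unsigned)(header & ~EXAMPLE_TARGET_PAGE_MASK);
    *page_offset = header & EXAMPLE_TARGET_PAGE_MASK;
}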
c6467627
VSO
4324static bool ram_has_postcopy(void *opaque)
4325{
469dd51b 4326 RAMBlock *rb;
fbd162e6 4327 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
469dd51b
JH
4328 if (ramblock_is_pmem(rb)) {
4329 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4330 "is not supported now!", rb->idstr, rb->host);
4331 return false;
4332 }
4333 }
4334
c6467627
VSO
4335 return migrate_postcopy_ram();
4336}
4337
edd090c7
PX
4338/* Sync all the dirty bitmaps with the destination VM. */
4339static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4340{
4341 RAMBlock *block;
4342 QEMUFile *file = s->to_dst_file;
4343 int ramblock_count = 0;
4344
4345 trace_ram_dirty_bitmap_sync_start();
4346
fbd162e6 4347 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
edd090c7
PX
4348 qemu_savevm_send_recv_bitmap(file, block->idstr);
4349 trace_ram_dirty_bitmap_request(block->idstr);
4350 ramblock_count++;
4351 }
4352
4353 trace_ram_dirty_bitmap_sync_wait();
4354
4355 /* Wait until all the ramblocks' dirty bitmap synced */
4356 while (ramblock_count--) {
4357 qemu_sem_wait(&s->rp_state.rp_sem);
4358 }
4359
4360 trace_ram_dirty_bitmap_sync_complete();
4361
4362 return 0;
4363}
4364
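/*
 * A standalone sketch of the request/acknowledge pattern above: the send
 * side asks for one bitmap per ramblock and then waits on a semaphore that
 * many times; every completed reload posts it once (compare
 * ram_dirty_bitmap_reload_notify() below).  POSIX semaphores and the
 * example_ name are used here only for illustration.
 */
#include <semaphore.h>

static void example_wait_for_replies(sem_t *reply_sem, int requests_sent)
{
    while (requests_sent--) {
        sem_wait(reply_sem);    /* one post arrives per reloaded bitmap */
    }
}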
4365static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4366{
4367 qemu_sem_post(&s->rp_state.rp_sem);
4368}
4369
a335debb
PX
4370/*
4371 * Read the received bitmap and invert it to form the initial dirty bitmap.
4372 * This is only used when the postcopy migration is paused and wants
4373 * to resume from a middle point.
4374 */
4375int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4376{
4377 int ret = -EINVAL;
4378 QEMUFile *file = s->rp_state.from_dst_file;
4379 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
a725ef9f 4380 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
a335debb
PX
4381 uint64_t size, end_mark;
4382
4383 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4384
4385 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4386 error_report("%s: incorrect state %s", __func__,
4387 MigrationStatus_str(s->state));
4388 return -EINVAL;
4389 }
4390
4391 /*
4392 * Note: see comments in ramblock_recv_bitmap_send() on why we
4393 * need the endianness conversion, and the padding.
4394 */
4395 local_size = ROUND_UP(local_size, 8);
4396
4397 /* Add paddings */
4398 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4399
4400 size = qemu_get_be64(file);
4401
4402 /* The size of the bitmap should match with our ramblock */
4403 if (size != local_size) {
4404 error_report("%s: ramblock '%s' bitmap size mismatch "
4405 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4406 block->idstr, size, local_size);
4407 ret = -EINVAL;
4408 goto out;
4409 }
4410
4411 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4412 end_mark = qemu_get_be64(file);
4413
4414 ret = qemu_file_get_error(file);
4415 if (ret || size != local_size) {
4416 error_report("%s: read bitmap failed for ramblock '%s': %d"
4417 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4418 __func__, block->idstr, ret, local_size, size);
4419 ret = -EIO;
4420 goto out;
4421 }
4422
4423 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4424 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4425 __func__, block->idstr, end_mark);
4426 ret = -EINVAL;
4427 goto out;
4428 }
4429
4430 /*
4431 * Endianness conversion. We are in postcopy (though paused).
4432 * The dirty bitmap won't change, so we can directly modify it.
4433 */
4434 bitmap_from_le(block->bmap, le_bitmap, nbits);
4435
4436 /*
4437 * What we received is the "received bitmap". Invert it to form the initial
4438 * dirty bitmap for this ramblock.
4439 */
4440 bitmap_complement(block->bmap, block->bmap, nbits);
4441
4442 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4443
edd090c7
PX
4444 /*
4445 * We succeeded in syncing the bitmap for the current ramblock. If this is
4446 * the last one to sync, we need to notify the main send thread.
4447 */
4448 ram_dirty_bitmap_reload_notify(s);
4449
a335debb
PX
4450 ret = 0;
4451out:
bf269906 4452 g_free(le_bitmap);
a335debb
PX
4453 return ret;
4454}
4455
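/*
 * A standalone worked example of the size and padding arithmetic used when
 * reloading a received bitmap: the sender rounds the byte size up to a
 * multiple of 8 to keep the stream 64-bit aligned, so the receiver
 * allocates an extra long of padding before reading.  Names and the sizes
 * in the comment are illustrative.
 */
#include <stdint.h>

static uint64_t example_bitmap_wire_size(uint64_t ramblock_bytes,
                                         uint64_t target_page_size)
{
    uint64_t nbits = ramblock_bytes / target_page_size;  /* one bit per page */
    uint64_t bytes = (nbits + 7) / 8;                    /* DIV_ROUND_UP(nbits, 8) */

    return (bytes + 7) & ~UINT64_C(7);                   /* ROUND_UP(bytes, 8) */
}

/*
 * e.g. a 1 GiB block with 4 KiB pages: nbits = 262144, so 32768 bytes (already
 * a multiple of 8) travel on the wire, followed by the 64-bit end mark that is
 * checked against RAMBLOCK_RECV_BITMAP_ENDING.
 */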
edd090c7
PX
4456static int ram_resume_prepare(MigrationState *s, void *opaque)
4457{
4458 RAMState *rs = *(RAMState **)opaque;
08614f34 4459 int ret;
edd090c7 4460
08614f34
PX
4461 ret = ram_dirty_bitmap_sync_all(s, rs);
4462 if (ret) {
4463 return ret;
4464 }
4465
4466 ram_state_resume_prepare(rs, s->to_dst_file);
4467
4468 return 0;
edd090c7
PX
4469}
4470
56e93d26 4471static SaveVMHandlers savevm_ram_handlers = {
9907e842 4472 .save_setup = ram_save_setup,
56e93d26 4473 .save_live_iterate = ram_save_iterate,
763c906b 4474 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 4475 .save_live_complete_precopy = ram_save_complete,
c6467627 4476 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
4477 .save_live_pending = ram_save_pending,
4478 .load_state = ram_load,
f265e0e4
JQ
4479 .save_cleanup = ram_save_cleanup,
4480 .load_setup = ram_load_setup,
4481 .load_cleanup = ram_load_cleanup,
edd090c7 4482 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
4483};
4484
4485void ram_mig_init(void)
4486{
4487 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 4488 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 4489}