[qemu.git] / migration / ram.c
blame at: Merge remote-tracking branch 'remotes/stefanha/tags/block-pull-request' into staging
56e93d26
JQ
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
76cc7b58
JQ
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <[email protected]>
56e93d26
JQ
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
e688df6b 28
1393a485 29#include "qemu/osdep.h"
33c11879 30#include "cpu.h"
56e93d26 31#include <zlib.h>
f348b6d1 32#include "qemu/cutils.h"
56e93d26
JQ
33#include "qemu/bitops.h"
34#include "qemu/bitmap.h"
7205c9ec 35#include "qemu/main-loop.h"
709e3fe8 36#include "xbzrle.h"
7b1e1a22 37#include "ram.h"
6666c96a 38#include "migration.h"
71bb07db 39#include "socket.h"
f2a8f0a6 40#include "migration/register.h"
7b1e1a22 41#include "migration/misc.h"
08a0aee1 42#include "qemu-file.h"
be07b0ac 43#include "postcopy-ram.h"
53d37d36 44#include "page_cache.h"
56e93d26 45#include "qemu/error-report.h"
e688df6b 46#include "qapi/error.h"
9af23989 47#include "qapi/qapi-events-migration.h"
8acabf69 48#include "qapi/qmp/qerror.h"
56e93d26 49#include "trace.h"
56e93d26 50#include "exec/ram_addr.h"
f9494614 51#include "exec/target_page.h"
56e93d26 52#include "qemu/rcu_queue.h"
a91246c9 53#include "migration/colo.h"
53d37d36 54#include "block.h"
af8b7d2b
JQ
55#include "sysemu/sysemu.h"
56#include "qemu/uuid.h"
edd090c7 57#include "savevm.h"
56e93d26 58
56e93d26
JQ
59/***********************************************************/
60/* ram save/restore */
61
bb890ed5
JQ
62/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63 * worked for pages that were filled with the same char. We switched
64 * it to only search for the zero value, and renamed it to avoid
65 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
66 */
67
56e93d26 68#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
bb890ed5 69#define RAM_SAVE_FLAG_ZERO 0x02
56e93d26
JQ
70#define RAM_SAVE_FLAG_MEM_SIZE 0x04
71#define RAM_SAVE_FLAG_PAGE 0x08
72#define RAM_SAVE_FLAG_EOS 0x10
73#define RAM_SAVE_FLAG_CONTINUE 0x20
74#define RAM_SAVE_FLAG_XBZRLE 0x40
75/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
76#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
77
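/*
 * Editor's illustrative sketch, not part of the original file: page
 * offsets are TARGET_PAGE_SIZE aligned, so the RAM_SAVE_FLAG_* bits
 * above travel in the low bits of the same 64-bit header word put on
 * the wire by save_page_header().  A hypothetical decoder would split
 * them like this:
 */
#if 0 /* example only */
static void example_split_page_header(uint64_t header, ram_addr_t *offset,
                                      int *flags)
{
    *flags  = header & ~TARGET_PAGE_MASK;  /* RAM_SAVE_FLAG_* bits */
    *offset = header & TARGET_PAGE_MASK;   /* aligned page offset  */
}
#endif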
56e93d26
JQ
78static inline bool is_zero_range(uint8_t *p, uint64_t size)
79{
a1febc49 80 return buffer_is_zero(p, size);
56e93d26
JQ
81}
82
9360447d
JQ
83XBZRLECacheStats xbzrle_counters;
84
56e93d26
JQ
85/* This struct contains the XBZRLE cache and a static page
86 used by the compression */
87static struct {
88 /* buffer used for XBZRLE encoding */
89 uint8_t *encoded_buf;
90 /* buffer for storing page content */
91 uint8_t *current_buf;
92 /* Cache for XBZRLE, Protected by lock. */
93 PageCache *cache;
94 QemuMutex lock;
c00e0928
JQ
95 /* it will store a page full of zeros */
96 uint8_t *zero_target_page;
f265e0e4
JQ
97 /* buffer used for XBZRLE decoding */
98 uint8_t *decoded_buf;
56e93d26
JQ
99} XBZRLE;
100
56e93d26
JQ
101static void XBZRLE_cache_lock(void)
102{
103 if (migrate_use_xbzrle())
104 qemu_mutex_lock(&XBZRLE.lock);
105}
106
107static void XBZRLE_cache_unlock(void)
108{
109 if (migrate_use_xbzrle())
110 qemu_mutex_unlock(&XBZRLE.lock);
111}
112
3d0684b2
JQ
113/**
114 * xbzrle_cache_resize: resize the xbzrle cache
115 *
116 * This function is called from qmp_migrate_set_cache_size in main
117 * thread, possibly while a migration is in progress. A running
118 * migration may be using the cache and might finish during this call,
119 * hence changes to the cache are protected by XBZRLE.lock.
120 *
c9dede2d 121 * Returns 0 for success or -1 for error
3d0684b2
JQ
122 *
123 * @new_size: new cache size
8acabf69 124 * @errp: set to the failure reason if the check fails
56e93d26 125 */
c9dede2d 126int xbzrle_cache_resize(int64_t new_size, Error **errp)
56e93d26
JQ
127{
128 PageCache *new_cache;
c9dede2d 129 int64_t ret = 0;
56e93d26 130
8acabf69
JQ
131 /* Check for truncation */
132 if (new_size != (size_t)new_size) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeding address space");
135 return -1;
136 }
137
2a313e5c
JQ
138 if (new_size == migrate_xbzrle_cache_size()) {
139 /* nothing to do */
c9dede2d 140 return 0;
2a313e5c
JQ
141 }
142
56e93d26
JQ
143 XBZRLE_cache_lock();
144
145 if (XBZRLE.cache != NULL) {
80f8dfde 146 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
56e93d26 147 if (!new_cache) {
56e93d26
JQ
148 ret = -1;
149 goto out;
150 }
151
152 cache_fini(XBZRLE.cache);
153 XBZRLE.cache = new_cache;
154 }
56e93d26
JQ
155out:
156 XBZRLE_cache_unlock();
157 return ret;
158}
159
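/*
 * Editor's usage sketch (assumed caller, for illustration only): the
 * QMP cache-size handler mentioned above is expected to call this with
 * a caller-owned Error pointer, roughly as follows.
 */
#if 0 /* example only */
static void example_resize_xbzrle_cache(void)
{
    Error *local_err = NULL;

    if (xbzrle_cache_resize(64 * 1024 * 1024, &local_err) < 0) {
        error_report_err(local_err);
    }
}
#endif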
b895de50
CLG
160/* Should be holding either ram_list.mutex, or the RCU lock. */
161#define RAMBLOCK_FOREACH_MIGRATABLE(block) \
162 RAMBLOCK_FOREACH(block) \
163 if (!qemu_ram_is_migratable(block)) {} else
164
f9494614
AP
165static void ramblock_recv_map_init(void)
166{
167 RAMBlock *rb;
168
b895de50 169 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
f9494614
AP
170 assert(!rb->receivedmap);
171 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
172 }
173}
174
175int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
176{
177 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
178 rb->receivedmap);
179}
180
1cba9f6e
DDAG
181bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
182{
183 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
184}
185
f9494614
AP
186void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
187{
188 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
189}
190
191void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
192 size_t nr)
193{
194 bitmap_set_atomic(rb->receivedmap,
195 ramblock_recv_bitmap_offset(host_addr, rb),
196 nr);
197}
198
a335debb
PX
199#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
200
201/*
202 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
203 *
204 * Returns >0 if success with sent bytes, or <0 if error.
205 */
206int64_t ramblock_recv_bitmap_send(QEMUFile *file,
207 const char *block_name)
208{
209 RAMBlock *block = qemu_ram_block_by_name(block_name);
210 unsigned long *le_bitmap, nbits;
211 uint64_t size;
212
213 if (!block) {
214 error_report("%s: invalid block name: %s", __func__, block_name);
215 return -1;
216 }
217
218 nbits = block->used_length >> TARGET_PAGE_BITS;
219
220 /*
221 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
222 * machines we may need 4 more bytes for padding (see below
223 * comment). So extend it a bit beforehand.
224 */
225 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
226
227 /*
228 * Always use little endian when sending the bitmap. This is
229 * required so that it works when source and destination VMs are not
230 * using the same endianness. (Note: big endian won't work.)
231 */
232 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
233
234 /* Size of the bitmap, in bytes */
235 size = nbits / 8;
236
237 /*
238 * size is always aligned to 8 bytes for 64bit machines, but it
239 * may not be true for 32bit machines. We need this padding to
240 * make sure the migration can survive even between 32bit and
241 * 64bit machines.
242 */
243 size = ROUND_UP(size, 8);
244
245 qemu_put_be64(file, size);
246 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
247 /*
248 * Mark the end, in case the middle part is screwed up for
249 * some "mysterious" reason.
250 */
251 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
252 qemu_fflush(file);
253
bf269906 254 g_free(le_bitmap);
a335debb
PX
255
256 if (qemu_file_get_error(file)) {
257 return qemu_file_get_error(file);
258 }
259
260 return size + sizeof(size);
261}
262
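/*
 * Editor's note, summarizing the wire layout produced above (not part
 * of the original file):
 *
 *   8 bytes     be64 size of the bitmap in bytes (rounded up to 8)
 *   size bytes  the receivedmap, converted to little endian
 *   8 bytes     be64 end marker RAMBLOCK_RECV_BITMAP_ENDING
 *
 * which is why the function returns size + sizeof(size) on success.
 */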
ec481c6c
JQ
263/*
264 * An outstanding page request, on the source, having been received
265 * and queued
266 */
267struct RAMSrcPageRequest {
268 RAMBlock *rb;
269 hwaddr offset;
270 hwaddr len;
271
272 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
273};
274
6f37bb8b
JQ
275/* State of RAM for migration */
276struct RAMState {
204b88b8
JQ
277 /* QEMUFile used for this migration */
278 QEMUFile *f;
6f37bb8b
JQ
279 /* Last block that we have visited searching for dirty pages */
280 RAMBlock *last_seen_block;
281 /* Last block from where we have sent data */
282 RAMBlock *last_sent_block;
269ace29
JQ
283 /* Last dirty target page we have sent */
284 ram_addr_t last_page;
6f37bb8b
JQ
285 /* last ram version we have seen */
286 uint32_t last_version;
287 /* We are in the first round */
288 bool ram_bulk_stage;
8d820d6f
JQ
289 /* How many times we have dirty too many pages */
290 int dirty_rate_high_cnt;
f664da80
JQ
291 /* these variables are used for bitmap sync */
292 /* last time we did a full bitmap_sync */
293 int64_t time_last_bitmap_sync;
eac74159 294 /* bytes transferred at start_time */
c4bdf0cf 295 uint64_t bytes_xfer_prev;
a66cd90c 296 /* number of dirty pages since start_time */
68908ed6 297 uint64_t num_dirty_pages_period;
b5833fde
JQ
298 /* xbzrle misses since the beginning of the period */
299 uint64_t xbzrle_cache_miss_prev;
36040d9c
JQ
300 /* number of iterations at the beginning of period */
301 uint64_t iterations_prev;
23b28c3c
JQ
302 /* Iterations since start */
303 uint64_t iterations;
9360447d 304 /* number of dirty bits in the bitmap */
2dfaf12e
PX
305 uint64_t migration_dirty_pages;
306 /* protects modification of the bitmap */
108cfae0 307 QemuMutex bitmap_mutex;
68a098f3
JQ
308 /* The RAMBlock used in the last src_page_requests */
309 RAMBlock *last_req_rb;
ec481c6c
JQ
310 /* Queue of outstanding page requests from the destination */
311 QemuMutex src_page_req_mutex;
312 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
6f37bb8b
JQ
313};
314typedef struct RAMState RAMState;
315
53518d94 316static RAMState *ram_state;
6f37bb8b 317
9edabd4d 318uint64_t ram_bytes_remaining(void)
2f4fde93 319{
bae416e5
DDAG
320 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
321 0;
2f4fde93
JQ
322}
323
9360447d 324MigrationStats ram_counters;
96506894 325
b8fb8cb7
DDAG
326/* used by the search for pages to send */
327struct PageSearchStatus {
328 /* Current block being searched */
329 RAMBlock *block;
a935e30f
JQ
330 /* Current page to search from */
331 unsigned long page;
b8fb8cb7
DDAG
332 /* Set once we wrap around */
333 bool complete_round;
334};
335typedef struct PageSearchStatus PageSearchStatus;
336
56e93d26 337struct CompressParam {
56e93d26 338 bool done;
90e56fb4 339 bool quit;
56e93d26
JQ
340 QEMUFile *file;
341 QemuMutex mutex;
342 QemuCond cond;
343 RAMBlock *block;
344 ram_addr_t offset;
34ab9e97
XG
345
346 /* internally used fields */
dcaf446e 347 z_stream stream;
34ab9e97 348 uint8_t *originbuf;
56e93d26
JQ
349};
350typedef struct CompressParam CompressParam;
351
352struct DecompressParam {
73a8912b 353 bool done;
90e56fb4 354 bool quit;
56e93d26
JQ
355 QemuMutex mutex;
356 QemuCond cond;
357 void *des;
d341d9f3 358 uint8_t *compbuf;
56e93d26 359 int len;
797ca154 360 z_stream stream;
56e93d26
JQ
361};
362typedef struct DecompressParam DecompressParam;
363
364static CompressParam *comp_param;
365static QemuThread *compress_threads;
366/* comp_done_cond is used to wake up the migration thread when
367 * one of the compression threads has finished the compression.
368 * comp_done_lock is used together with comp_done_cond.
369 */
0d9f9a5c
LL
370static QemuMutex comp_done_lock;
371static QemuCond comp_done_cond;
56e93d26
JQ
372/* The empty QEMUFileOps will be used by file in CompressParam */
373static const QEMUFileOps empty_ops = { };
374
34ab9e97 375static QEMUFile *decomp_file;
56e93d26
JQ
376static DecompressParam *decomp_param;
377static QemuThread *decompress_threads;
73a8912b
LL
378static QemuMutex decomp_done_lock;
379static QemuCond decomp_done_cond;
56e93d26 380
dcaf446e 381static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
34ab9e97 382 ram_addr_t offset, uint8_t *source_buf);
56e93d26
JQ
383
384static void *do_data_compress(void *opaque)
385{
386 CompressParam *param = opaque;
a7a9a88f
LL
387 RAMBlock *block;
388 ram_addr_t offset;
56e93d26 389
a7a9a88f 390 qemu_mutex_lock(&param->mutex);
90e56fb4 391 while (!param->quit) {
a7a9a88f
LL
392 if (param->block) {
393 block = param->block;
394 offset = param->offset;
395 param->block = NULL;
396 qemu_mutex_unlock(&param->mutex);
397
34ab9e97
XG
398 do_compress_ram_page(param->file, &param->stream, block, offset,
399 param->originbuf);
a7a9a88f 400
0d9f9a5c 401 qemu_mutex_lock(&comp_done_lock);
a7a9a88f 402 param->done = true;
0d9f9a5c
LL
403 qemu_cond_signal(&comp_done_cond);
404 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
405
406 qemu_mutex_lock(&param->mutex);
407 } else {
56e93d26
JQ
408 qemu_cond_wait(&param->cond, &param->mutex);
409 }
56e93d26 410 }
a7a9a88f 411 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
412
413 return NULL;
414}
415
416static inline void terminate_compression_threads(void)
417{
418 int idx, thread_count;
419
420 thread_count = migrate_compress_threads();
3d0684b2 421
56e93d26
JQ
422 for (idx = 0; idx < thread_count; idx++) {
423 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 424 comp_param[idx].quit = true;
56e93d26
JQ
425 qemu_cond_signal(&comp_param[idx].cond);
426 qemu_mutex_unlock(&comp_param[idx].mutex);
427 }
428}
429
f0afa331 430static void compress_threads_save_cleanup(void)
56e93d26
JQ
431{
432 int i, thread_count;
433
434 if (!migrate_use_compression()) {
435 return;
436 }
437 terminate_compression_threads();
438 thread_count = migrate_compress_threads();
439 for (i = 0; i < thread_count; i++) {
dcaf446e
XG
440 /*
441 * we use it as an indicator of whether the thread is
442 * properly initialized or not
443 */
444 if (!comp_param[i].file) {
445 break;
446 }
56e93d26 447 qemu_thread_join(compress_threads + i);
56e93d26
JQ
448 qemu_mutex_destroy(&comp_param[i].mutex);
449 qemu_cond_destroy(&comp_param[i].cond);
dcaf446e 450 deflateEnd(&comp_param[i].stream);
34ab9e97 451 g_free(comp_param[i].originbuf);
dcaf446e
XG
452 qemu_fclose(comp_param[i].file);
453 comp_param[i].file = NULL;
56e93d26 454 }
0d9f9a5c
LL
455 qemu_mutex_destroy(&comp_done_lock);
456 qemu_cond_destroy(&comp_done_cond);
56e93d26
JQ
457 g_free(compress_threads);
458 g_free(comp_param);
56e93d26
JQ
459 compress_threads = NULL;
460 comp_param = NULL;
56e93d26
JQ
461}
462
dcaf446e 463static int compress_threads_save_setup(void)
56e93d26
JQ
464{
465 int i, thread_count;
466
467 if (!migrate_use_compression()) {
dcaf446e 468 return 0;
56e93d26 469 }
56e93d26
JQ
470 thread_count = migrate_compress_threads();
471 compress_threads = g_new0(QemuThread, thread_count);
472 comp_param = g_new0(CompressParam, thread_count);
0d9f9a5c
LL
473 qemu_cond_init(&comp_done_cond);
474 qemu_mutex_init(&comp_done_lock);
56e93d26 475 for (i = 0; i < thread_count; i++) {
34ab9e97
XG
476 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
477 if (!comp_param[i].originbuf) {
478 goto exit;
479 }
480
dcaf446e
XG
481 if (deflateInit(&comp_param[i].stream,
482 migrate_compress_level()) != Z_OK) {
34ab9e97 483 g_free(comp_param[i].originbuf);
dcaf446e
XG
484 goto exit;
485 }
486
e110aa91
C
487 /* comp_param[i].file is just used as a dummy buffer to save data,
488 * set its ops to empty.
56e93d26
JQ
489 */
490 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
491 comp_param[i].done = true;
90e56fb4 492 comp_param[i].quit = false;
56e93d26
JQ
493 qemu_mutex_init(&comp_param[i].mutex);
494 qemu_cond_init(&comp_param[i].cond);
495 qemu_thread_create(compress_threads + i, "compress",
496 do_data_compress, comp_param + i,
497 QEMU_THREAD_JOINABLE);
498 }
dcaf446e
XG
499 return 0;
500
501exit:
502 compress_threads_save_cleanup();
503 return -1;
56e93d26
JQ
504}
505
f986c3d2
JQ
506/* Multiple fd's */
507
af8b7d2b
JQ
508#define MULTIFD_MAGIC 0x11223344U
509#define MULTIFD_VERSION 1
510
511typedef struct {
512 uint32_t magic;
513 uint32_t version;
514 unsigned char uuid[16]; /* QemuUUID */
515 uint8_t id;
516} __attribute__((packed)) MultiFDInit_t;
517
8c4598f2
JQ
518typedef struct {
519 /* these fields are not changed once the thread is created */
520 /* channel number */
f986c3d2 521 uint8_t id;
8c4598f2 522 /* channel thread name */
f986c3d2 523 char *name;
8c4598f2 524 /* channel thread id */
f986c3d2 525 QemuThread thread;
8c4598f2 526 /* communication channel */
60df2d4a 527 QIOChannel *c;
8c4598f2 528 /* sem where to wait for more work */
f986c3d2 529 QemuSemaphore sem;
8c4598f2 530 /* this mutex protects the following parameters */
f986c3d2 531 QemuMutex mutex;
8c4598f2 532 /* is this channel thread running */
66770707 533 bool running;
8c4598f2 534 /* should this thread finish */
f986c3d2 535 bool quit;
8c4598f2
JQ
536} MultiFDSendParams;
537
538typedef struct {
539 /* these fields are not changed once the thread is created */
540 /* channel number */
541 uint8_t id;
542 /* channel thread name */
543 char *name;
544 /* channel thread id */
545 QemuThread thread;
546 /* communication channel */
547 QIOChannel *c;
548 /* sem where to wait for more work */
549 QemuSemaphore sem;
550 /* this mutex protects the following parameters */
551 QemuMutex mutex;
552 /* is this channel thread running */
553 bool running;
554 /* should this thread finish */
555 bool quit;
556} MultiFDRecvParams;
f986c3d2 557
af8b7d2b
JQ
558static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
559{
560 MultiFDInit_t msg;
561 int ret;
562
563 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
564 msg.version = cpu_to_be32(MULTIFD_VERSION);
565 msg.id = p->id;
566 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
567
568 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
569 if (ret != 0) {
570 return -1;
571 }
572 return 0;
573}
574
575static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
576{
577 MultiFDInit_t msg;
578 int ret;
579
580 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
581 if (ret != 0) {
582 return -1;
583 }
584
585 be32_to_cpus(&msg.magic);
586 be32_to_cpus(&msg.version);
587
588 if (msg.magic != MULTIFD_MAGIC) {
589 error_setg(errp, "multifd: received packet magic %x "
590 "expected %x", msg.magic, MULTIFD_MAGIC);
591 return -1;
592 }
593
594 if (msg.version != MULTIFD_VERSION) {
595 error_setg(errp, "multifd: received packet version %d "
596 "expected %d", msg.version, MULTIFD_VERSION);
597 return -1;
598 }
599
600 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
601 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
602 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
603
604 error_setg(errp, "multifd: received uuid '%s' and expected "
605 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
606 g_free(uuid);
607 g_free(msg_uuid);
608 return -1;
609 }
610
611 if (msg.id >= migrate_multifd_channels()) {
612 error_setg(errp, "multifd: received channel id %d is "
613 "out of range (%d channels)", msg.id, migrate_multifd_channels());
614 return -1;
615 }
616
617 return msg.id;
618}
619
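/*
 * Editor's note (not part of the original file): since MultiFDInit_t is
 * packed, the per-channel handshake above is 25 bytes on the wire:
 *
 *    4 bytes  magic   (big endian, MULTIFD_MAGIC)
 *    4 bytes  version (big endian, MULTIFD_VERSION)
 *   16 bytes  raw QemuUUID of the source
 *    1 byte   channel id
 */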
f986c3d2
JQ
620struct {
621 MultiFDSendParams *params;
622 /* number of created threads */
623 int count;
624} *multifd_send_state;
625
66770707 626static void multifd_send_terminate_threads(Error *err)
f986c3d2
JQ
627{
628 int i;
629
7a169d74
JQ
630 if (err) {
631 MigrationState *s = migrate_get_current();
632 migrate_set_error(s, err);
633 if (s->state == MIGRATION_STATUS_SETUP ||
634 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
635 s->state == MIGRATION_STATUS_DEVICE ||
636 s->state == MIGRATION_STATUS_ACTIVE) {
637 migrate_set_state(&s->state, s->state,
638 MIGRATION_STATUS_FAILED);
639 }
640 }
641
66770707 642 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
643 MultiFDSendParams *p = &multifd_send_state->params[i];
644
645 qemu_mutex_lock(&p->mutex);
646 p->quit = true;
647 qemu_sem_post(&p->sem);
648 qemu_mutex_unlock(&p->mutex);
649 }
650}
651
652int multifd_save_cleanup(Error **errp)
653{
654 int i;
655 int ret = 0;
656
657 if (!migrate_use_multifd()) {
658 return 0;
659 }
66770707
JQ
660 multifd_send_terminate_threads(NULL);
661 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
662 MultiFDSendParams *p = &multifd_send_state->params[i];
663
66770707
JQ
664 if (p->running) {
665 qemu_thread_join(&p->thread);
666 }
60df2d4a
JQ
667 socket_send_channel_destroy(p->c);
668 p->c = NULL;
f986c3d2
JQ
669 qemu_mutex_destroy(&p->mutex);
670 qemu_sem_destroy(&p->sem);
671 g_free(p->name);
672 p->name = NULL;
673 }
674 g_free(multifd_send_state->params);
675 multifd_send_state->params = NULL;
676 g_free(multifd_send_state);
677 multifd_send_state = NULL;
678 return ret;
679}
680
681static void *multifd_send_thread(void *opaque)
682{
683 MultiFDSendParams *p = opaque;
af8b7d2b
JQ
684 Error *local_err = NULL;
685
686 if (multifd_send_initial_packet(p, &local_err) < 0) {
687 goto out;
688 }
f986c3d2
JQ
689
690 while (true) {
691 qemu_mutex_lock(&p->mutex);
692 if (p->quit) {
693 qemu_mutex_unlock(&p->mutex);
694 break;
695 }
696 qemu_mutex_unlock(&p->mutex);
697 qemu_sem_wait(&p->sem);
698 }
699
af8b7d2b
JQ
700out:
701 if (local_err) {
702 multifd_send_terminate_threads(local_err);
703 }
704
66770707
JQ
705 qemu_mutex_lock(&p->mutex);
706 p->running = false;
707 qemu_mutex_unlock(&p->mutex);
708
f986c3d2
JQ
709 return NULL;
710}
711
60df2d4a
JQ
712static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
713{
714 MultiFDSendParams *p = opaque;
715 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
716 Error *local_err = NULL;
717
718 if (qio_task_propagate_error(task, &local_err)) {
719 if (multifd_save_cleanup(&local_err) != 0) {
720 migrate_set_error(migrate_get_current(), local_err);
721 }
722 } else {
723 p->c = QIO_CHANNEL(sioc);
724 qio_channel_set_delay(p->c, false);
725 p->running = true;
726 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
727 QEMU_THREAD_JOINABLE);
728
729 atomic_inc(&multifd_send_state->count);
730 }
731}
732
f986c3d2
JQ
733int multifd_save_setup(void)
734{
735 int thread_count;
736 uint8_t i;
737
738 if (!migrate_use_multifd()) {
739 return 0;
740 }
741 thread_count = migrate_multifd_channels();
742 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
743 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
66770707 744 atomic_set(&multifd_send_state->count, 0);
f986c3d2
JQ
745 for (i = 0; i < thread_count; i++) {
746 MultiFDSendParams *p = &multifd_send_state->params[i];
747
748 qemu_mutex_init(&p->mutex);
749 qemu_sem_init(&p->sem, 0);
750 p->quit = false;
751 p->id = i;
752 p->name = g_strdup_printf("multifdsend_%d", i);
60df2d4a 753 socket_send_channel_create(multifd_new_send_channel_async, p);
f986c3d2
JQ
754 }
755 return 0;
756}
757
f986c3d2
JQ
758struct {
759 MultiFDRecvParams *params;
760 /* number of created threads */
761 int count;
762} *multifd_recv_state;
763
66770707 764static void multifd_recv_terminate_threads(Error *err)
f986c3d2
JQ
765{
766 int i;
767
7a169d74
JQ
768 if (err) {
769 MigrationState *s = migrate_get_current();
770 migrate_set_error(s, err);
771 if (s->state == MIGRATION_STATUS_SETUP ||
772 s->state == MIGRATION_STATUS_ACTIVE) {
773 migrate_set_state(&s->state, s->state,
774 MIGRATION_STATUS_FAILED);
775 }
776 }
777
66770707 778 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
779 MultiFDRecvParams *p = &multifd_recv_state->params[i];
780
781 qemu_mutex_lock(&p->mutex);
782 p->quit = true;
783 qemu_sem_post(&p->sem);
784 qemu_mutex_unlock(&p->mutex);
785 }
786}
787
788int multifd_load_cleanup(Error **errp)
789{
790 int i;
791 int ret = 0;
792
793 if (!migrate_use_multifd()) {
794 return 0;
795 }
66770707
JQ
796 multifd_recv_terminate_threads(NULL);
797 for (i = 0; i < migrate_multifd_channels(); i++) {
f986c3d2
JQ
798 MultiFDRecvParams *p = &multifd_recv_state->params[i];
799
66770707
JQ
800 if (p->running) {
801 qemu_thread_join(&p->thread);
802 }
60df2d4a
JQ
803 object_unref(OBJECT(p->c));
804 p->c = NULL;
f986c3d2
JQ
805 qemu_mutex_destroy(&p->mutex);
806 qemu_sem_destroy(&p->sem);
807 g_free(p->name);
808 p->name = NULL;
809 }
810 g_free(multifd_recv_state->params);
811 multifd_recv_state->params = NULL;
812 g_free(multifd_recv_state);
813 multifd_recv_state = NULL;
814
815 return ret;
816}
817
818static void *multifd_recv_thread(void *opaque)
819{
820 MultiFDRecvParams *p = opaque;
821
822 while (true) {
823 qemu_mutex_lock(&p->mutex);
824 if (p->quit) {
825 qemu_mutex_unlock(&p->mutex);
826 break;
827 }
828 qemu_mutex_unlock(&p->mutex);
829 qemu_sem_wait(&p->sem);
830 }
831
66770707
JQ
832 qemu_mutex_lock(&p->mutex);
833 p->running = false;
834 qemu_mutex_unlock(&p->mutex);
835
f986c3d2
JQ
836 return NULL;
837}
838
839int multifd_load_setup(void)
840{
841 int thread_count;
842 uint8_t i;
843
844 if (!migrate_use_multifd()) {
845 return 0;
846 }
847 thread_count = migrate_multifd_channels();
848 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
849 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
66770707 850 atomic_set(&multifd_recv_state->count, 0);
f986c3d2
JQ
851 for (i = 0; i < thread_count; i++) {
852 MultiFDRecvParams *p = &multifd_recv_state->params[i];
853
854 qemu_mutex_init(&p->mutex);
855 qemu_sem_init(&p->sem, 0);
856 p->quit = false;
857 p->id = i;
858 p->name = g_strdup_printf("multifdrecv_%d", i);
f986c3d2
JQ
859 }
860 return 0;
861}
862
62c1e0ca
JQ
863bool multifd_recv_all_channels_created(void)
864{
865 int thread_count = migrate_multifd_channels();
866
867 if (!migrate_use_multifd()) {
868 return true;
869 }
870
871 return thread_count == atomic_read(&multifd_recv_state->count);
872}
873
71bb07db
JQ
874void multifd_recv_new_channel(QIOChannel *ioc)
875{
60df2d4a 876 MultiFDRecvParams *p;
af8b7d2b
JQ
877 Error *local_err = NULL;
878 int id;
60df2d4a 879
af8b7d2b
JQ
880 id = multifd_recv_initial_packet(ioc, &local_err);
881 if (id < 0) {
882 multifd_recv_terminate_threads(local_err);
883 return;
884 }
885
886 p = &multifd_recv_state->params[id];
887 if (p->c != NULL) {
888 error_setg(&local_err, "multifd: received id '%d' already setup'",
889 id);
890 multifd_recv_terminate_threads(local_err);
891 return;
892 }
60df2d4a
JQ
893 p->c = ioc;
894 object_ref(OBJECT(ioc));
895
896 p->running = true;
897 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
898 QEMU_THREAD_JOINABLE);
899 atomic_inc(&multifd_recv_state->count);
36c2f8be
JQ
900 if (multifd_recv_state->count == migrate_multifd_channels()) {
901 migration_incoming_process();
902 }
71bb07db
JQ
903}
904
56e93d26 905/**
3d0684b2 906 * save_page_header: write page header to wire
56e93d26
JQ
907 *
908 * If the block differs from the one last sent, it also writes the block identification
909 *
3d0684b2 910 * Returns the number of bytes written
56e93d26
JQ
911 *
912 * @f: QEMUFile where to send the data
913 * @block: block that contains the page we want to send
914 * @offset: offset inside the block for the page
915 * in the lower bits, it contains flags
916 */
2bf3aa85
JQ
917static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
918 ram_addr_t offset)
56e93d26 919{
9f5f380b 920 size_t size, len;
56e93d26 921
24795694
JQ
922 if (block == rs->last_sent_block) {
923 offset |= RAM_SAVE_FLAG_CONTINUE;
924 }
2bf3aa85 925 qemu_put_be64(f, offset);
56e93d26
JQ
926 size = 8;
927
928 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
9f5f380b 929 len = strlen(block->idstr);
2bf3aa85
JQ
930 qemu_put_byte(f, len);
931 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
9f5f380b 932 size += 1 + len;
24795694 933 rs->last_sent_block = block;
56e93d26
JQ
934 }
935 return size;
936}
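/*
 * Editor's worked example (not part of the original file): the first
 * page sent from a block goes out as
 *
 *   be64 (offset | flags)  +  1 byte idstr length  +  idstr bytes
 *
 * while subsequent pages of the same block carry RAM_SAVE_FLAG_CONTINUE
 * and omit the block name, which is why the returned size is either
 * 8 + 1 + len or just 8.
 */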
937
3d0684b2
JQ
938/**
939 * mig_throttle_guest_down: throttle down the guest
940 *
941 * Reduce amount of guest cpu execution to hopefully slow down memory
942 * writes. If guest dirty memory rate is reduced below the rate at
943 * which we can transfer pages to the destination then we should be
944 * able to complete migration. Some workloads dirty memory way too
945 * fast and will not effectively converge, even with auto-converge.
070afca2
JH
946 */
947static void mig_throttle_guest_down(void)
948{
949 MigrationState *s = migrate_get_current();
2594f56d
DB
950 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
951 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
070afca2
JH
952
953 /* We have not started throttling yet. Let's start it. */
954 if (!cpu_throttle_active()) {
955 cpu_throttle_set(pct_initial);
956 } else {
957 /* Throttling already on, just increase the rate */
958 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
959 }
960}
961
3d0684b2
JQ
962/**
963 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
964 *
6f37bb8b 965 * @rs: current RAM state
3d0684b2
JQ
966 * @current_addr: address for the zero page
967 *
968 * Update the xbzrle cache to reflect a page that's been sent as all 0.
56e93d26
JQ
969 * The important thing is that a stale (not-yet-0'd) page be replaced
970 * by the new data.
971 * As a bonus, if the page wasn't in the cache it gets added so that
3d0684b2 972 * when a small write is made into the 0'd page it gets XBZRLE sent.
56e93d26 973 */
6f37bb8b 974static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
56e93d26 975{
6f37bb8b 976 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
56e93d26
JQ
977 return;
978 }
979
980 /* We don't care if this fails to allocate a new cache page
981 * as long as it updated an old one */
c00e0928 982 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
9360447d 983 ram_counters.dirty_sync_count);
56e93d26
JQ
984}
985
986#define ENCODING_FLAG_XBZRLE 0x1
987
988/**
989 * save_xbzrle_page: compress and send current page
990 *
991 * Returns: 1 means that we wrote the page
992 * 0 means that page is identical to the one already sent
993 * -1 means that xbzrle would be longer than normal
994 *
5a987738 995 * @rs: current RAM state
3d0684b2
JQ
996 * @current_data: pointer to the address of the page contents
997 * @current_addr: addr of the page
56e93d26
JQ
998 * @block: block that contains the page we want to send
999 * @offset: offset inside the block for the page
1000 * @last_stage: if we are at the completion stage
56e93d26 1001 */
204b88b8 1002static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
56e93d26 1003 ram_addr_t current_addr, RAMBlock *block,
072c2511 1004 ram_addr_t offset, bool last_stage)
56e93d26
JQ
1005{
1006 int encoded_len = 0, bytes_xbzrle;
1007 uint8_t *prev_cached_page;
1008
9360447d
JQ
1009 if (!cache_is_cached(XBZRLE.cache, current_addr,
1010 ram_counters.dirty_sync_count)) {
1011 xbzrle_counters.cache_miss++;
56e93d26
JQ
1012 if (!last_stage) {
1013 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
9360447d 1014 ram_counters.dirty_sync_count) == -1) {
56e93d26
JQ
1015 return -1;
1016 } else {
1017 /* update *current_data when the page has been
1018 inserted into cache */
1019 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1020 }
1021 }
1022 return -1;
1023 }
1024
1025 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1026
1027 /* save current buffer into memory */
1028 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1029
1030 /* XBZRLE encoding (if there is no overflow) */
1031 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1032 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1033 TARGET_PAGE_SIZE);
1034 if (encoded_len == 0) {
55c4446b 1035 trace_save_xbzrle_page_skipping();
56e93d26
JQ
1036 return 0;
1037 } else if (encoded_len == -1) {
55c4446b 1038 trace_save_xbzrle_page_overflow();
9360447d 1039 xbzrle_counters.overflow++;
56e93d26
JQ
1040 /* update data in the cache */
1041 if (!last_stage) {
1042 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1043 *current_data = prev_cached_page;
1044 }
1045 return -1;
1046 }
1047
1048 /* we need to update the data in the cache, in order to get the same data */
1049 if (!last_stage) {
1050 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1051 }
1052
1053 /* Send XBZRLE based compressed page */
2bf3aa85 1054 bytes_xbzrle = save_page_header(rs, rs->f, block,
204b88b8
JQ
1055 offset | RAM_SAVE_FLAG_XBZRLE);
1056 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1057 qemu_put_be16(rs->f, encoded_len);
1058 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
56e93d26 1059 bytes_xbzrle += encoded_len + 1 + 2;
9360447d
JQ
1060 xbzrle_counters.pages++;
1061 xbzrle_counters.bytes += bytes_xbzrle;
1062 ram_counters.transferred += bytes_xbzrle;
56e93d26
JQ
1063
1064 return 1;
1065}
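/*
 * Editor's note (not part of the original file): the accounting
 * "encoded_len + 1 + 2" above corresponds to the XBZRLE page layout:
 *
 *   page header with RAM_SAVE_FLAG_XBZRLE set
 *   1 byte   ENCODING_FLAG_XBZRLE
 *   2 bytes  be16 encoded_len
 *   encoded_len bytes of XBZRLE-encoded data
 */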
1066
3d0684b2
JQ
1067/**
1068 * migration_bitmap_find_dirty: find the next dirty page from start
f3f491fc 1069 *
3d0684b2
JQ
1070 * Called with rcu_read_lock() to protect migration_bitmap
1071 *
1072 * Returns the byte offset within memory region of the start of a dirty page
1073 *
6f37bb8b 1074 * @rs: current RAM state
3d0684b2 1075 * @rb: RAMBlock where to search for dirty pages
a935e30f 1076 * @start: page where we start the search
f3f491fc 1077 */
56e93d26 1078static inline
a935e30f 1079unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
f20e2865 1080 unsigned long start)
56e93d26 1081{
6b6712ef
JQ
1082 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1083 unsigned long *bitmap = rb->bmap;
56e93d26
JQ
1084 unsigned long next;
1085
b895de50
CLG
1086 if (!qemu_ram_is_migratable(rb)) {
1087 return size;
1088 }
1089
6b6712ef
JQ
1090 if (rs->ram_bulk_stage && start > 0) {
1091 next = start + 1;
56e93d26 1092 } else {
6b6712ef 1093 next = find_next_bit(bitmap, size, start);
56e93d26
JQ
1094 }
1095
6b6712ef 1096 return next;
56e93d26
JQ
1097}
1098
06b10688 1099static inline bool migration_bitmap_clear_dirty(RAMState *rs,
f20e2865
JQ
1100 RAMBlock *rb,
1101 unsigned long page)
a82d593b
DDAG
1102{
1103 bool ret;
a82d593b 1104
6b6712ef 1105 ret = test_and_clear_bit(page, rb->bmap);
a82d593b
DDAG
1106
1107 if (ret) {
0d8ec885 1108 rs->migration_dirty_pages--;
a82d593b
DDAG
1109 }
1110 return ret;
1111}
1112
15440dd5
JQ
1113static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1114 ram_addr_t start, ram_addr_t length)
56e93d26 1115{
0d8ec885 1116 rs->migration_dirty_pages +=
6b6712ef 1117 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
0d8ec885 1118 &rs->num_dirty_pages_period);
56e93d26
JQ
1119}
1120
3d0684b2
JQ
1121/**
1122 * ram_pagesize_summary: calculate all the pagesizes of a VM
1123 *
1124 * Returns a summary bitmap of the page sizes of all RAMBlocks
1125 *
1126 * For VMs with just normal pages this is equivalent to the host page
1127 * size. If it's got some huge pages then it's the OR of all the
1128 * different page sizes.
e8ca1db2
DDAG
1129 */
1130uint64_t ram_pagesize_summary(void)
1131{
1132 RAMBlock *block;
1133 uint64_t summary = 0;
1134
b895de50 1135 RAMBLOCK_FOREACH_MIGRATABLE(block) {
e8ca1db2
DDAG
1136 summary |= block->page_size;
1137 }
1138
1139 return summary;
1140}
1141
8d820d6f 1142static void migration_bitmap_sync(RAMState *rs)
56e93d26
JQ
1143{
1144 RAMBlock *block;
56e93d26 1145 int64_t end_time;
c4bdf0cf 1146 uint64_t bytes_xfer_now;
56e93d26 1147
9360447d 1148 ram_counters.dirty_sync_count++;
56e93d26 1149
f664da80
JQ
1150 if (!rs->time_last_bitmap_sync) {
1151 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
56e93d26
JQ
1152 }
1153
1154 trace_migration_bitmap_sync_start();
9c1f8f44 1155 memory_global_dirty_log_sync();
56e93d26 1156
108cfae0 1157 qemu_mutex_lock(&rs->bitmap_mutex);
56e93d26 1158 rcu_read_lock();
b895de50 1159 RAMBLOCK_FOREACH_MIGRATABLE(block) {
15440dd5 1160 migration_bitmap_sync_range(rs, block, 0, block->used_length);
56e93d26
JQ
1161 }
1162 rcu_read_unlock();
108cfae0 1163 qemu_mutex_unlock(&rs->bitmap_mutex);
56e93d26 1164
a66cd90c 1165 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1ffb5dfd 1166
56e93d26
JQ
1167 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1168
1169 /* more than 1 second = 1000 milliseconds */
f664da80 1170 if (end_time > rs->time_last_bitmap_sync + 1000) {
d693c6f1 1171 /* calculate period counters */
9360447d 1172 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
d693c6f1 1173 / (end_time - rs->time_last_bitmap_sync);
9360447d 1174 bytes_xfer_now = ram_counters.transferred;
d693c6f1 1175
9ac78b61
PL
1176 /* During block migration the auto-converge logic incorrectly detects
1177 * that ram migration makes no progress. Avoid this by disabling the
1178 * throttling logic during the bulk phase of block migration. */
1179 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
56e93d26
JQ
1180 /* The following detection logic can be refined later. For now:
1181 Check to see if the dirtied bytes exceed 50% of the approx.
1182 amount of bytes that just got transferred since the last time we
070afca2
JH
1183 were in this routine. If that happens twice, start or increase
1184 throttling */
070afca2 1185
d693c6f1 1186 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
eac74159 1187 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
b4a3c64b 1188 (++rs->dirty_rate_high_cnt >= 2)) {
56e93d26 1189 trace_migration_throttle();
8d820d6f 1190 rs->dirty_rate_high_cnt = 0;
070afca2 1191 mig_throttle_guest_down();
d693c6f1 1192 }
56e93d26 1193 }
070afca2 1194
56e93d26 1195 if (migrate_use_xbzrle()) {
23b28c3c 1196 if (rs->iterations_prev != rs->iterations) {
9360447d
JQ
1197 xbzrle_counters.cache_miss_rate =
1198 (double)(xbzrle_counters.cache_miss -
b5833fde 1199 rs->xbzrle_cache_miss_prev) /
23b28c3c 1200 (rs->iterations - rs->iterations_prev);
56e93d26 1201 }
23b28c3c 1202 rs->iterations_prev = rs->iterations;
9360447d 1203 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
56e93d26 1204 }
d693c6f1
FF
1205
1206 /* reset period counters */
f664da80 1207 rs->time_last_bitmap_sync = end_time;
a66cd90c 1208 rs->num_dirty_pages_period = 0;
d2a4d85a 1209 rs->bytes_xfer_prev = bytes_xfer_now;
56e93d26 1210 }
4addcd4f 1211 if (migrate_use_events()) {
9360447d 1212 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
4addcd4f 1213 }
56e93d26
JQ
1214}
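/*
 * Editor's worked example (numbers invented for illustration): if the
 * last period transferred 100 MB but the guest dirtied 60 MB of RAM,
 * then 60 MB > 100 MB / 2, so dirty_rate_high_cnt is bumped; after two
 * such periods in a row mig_throttle_guest_down() starts or increases
 * CPU throttling.
 */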
1215
1216/**
3d0684b2 1217 * save_zero_page: send the zero page to the stream
56e93d26 1218 *
3d0684b2 1219 * Returns the number of pages written.
56e93d26 1220 *
f7ccd61b 1221 * @rs: current RAM state
56e93d26
JQ
1222 * @block: block that contains the page we want to send
1223 * @offset: offset inside the block for the page
56e93d26 1224 */
7faccdc3 1225static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
56e93d26 1226{
7faccdc3 1227 uint8_t *p = block->host + offset;
56e93d26
JQ
1228 int pages = -1;
1229
1230 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
9360447d
JQ
1231 ram_counters.duplicate++;
1232 ram_counters.transferred +=
bb890ed5 1233 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
ce25d337 1234 qemu_put_byte(rs->f, 0);
9360447d 1235 ram_counters.transferred += 1;
56e93d26
JQ
1236 pages = 1;
1237 }
1238
1239 return pages;
1240}
1241
5727309d 1242static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
53f09a10 1243{
5727309d 1244 if (!migrate_release_ram() || !migration_in_postcopy()) {
53f09a10
PB
1245 return;
1246 }
1247
aaa2064c 1248 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
53f09a10
PB
1249}
1250
059ff0fb
XG
1251/*
1252 * @pages: the number of pages written by the control path,
1253 * < 0 - error
1254 * > 0 - number of pages written
1255 *
1256 * Returns true if the page has been saved, otherwise false.
1257 */
1258static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1259 int *pages)
1260{
1261 uint64_t bytes_xmit = 0;
1262 int ret;
1263
1264 *pages = -1;
1265 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1266 &bytes_xmit);
1267 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1268 return false;
1269 }
1270
1271 if (bytes_xmit) {
1272 ram_counters.transferred += bytes_xmit;
1273 *pages = 1;
1274 }
1275
1276 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1277 return true;
1278 }
1279
1280 if (bytes_xmit > 0) {
1281 ram_counters.normal++;
1282 } else if (bytes_xmit == 0) {
1283 ram_counters.duplicate++;
1284 }
1285
1286 return true;
1287}
1288
65dacaa0
XG
1289/*
1290 * directly send the page to the stream
1291 *
1292 * Returns the number of pages written.
1293 *
1294 * @rs: current RAM state
1295 * @block: block that contains the page we want to send
1296 * @offset: offset inside the block for the page
1297 * @buf: the page to be sent
1298 * @async: send the page asynchronously
1299 */
1300static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1301 uint8_t *buf, bool async)
1302{
1303 ram_counters.transferred += save_page_header(rs, rs->f, block,
1304 offset | RAM_SAVE_FLAG_PAGE);
1305 if (async) {
1306 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1307 migrate_release_ram() &
1308 migration_in_postcopy());
1309 } else {
1310 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1311 }
1312 ram_counters.transferred += TARGET_PAGE_SIZE;
1313 ram_counters.normal++;
1314 return 1;
1315}
1316
56e93d26 1317/**
3d0684b2 1318 * ram_save_page: send the given page to the stream
56e93d26 1319 *
3d0684b2 1320 * Returns the number of pages written.
3fd3c4b3
DDAG
1321 * < 0 - error
1322 * >=0 - Number of pages written - this might legally be 0
1323 * if xbzrle noticed the page was the same.
56e93d26 1324 *
6f37bb8b 1325 * @rs: current RAM state
56e93d26
JQ
1326 * @block: block that contains the page we want to send
1327 * @offset: offset inside the block for the page
1328 * @last_stage: if we are at the completion stage
56e93d26 1329 */
a0a8aa14 1330static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
56e93d26
JQ
1331{
1332 int pages = -1;
56e93d26 1333 uint8_t *p;
56e93d26 1334 bool send_async = true;
a08f6890 1335 RAMBlock *block = pss->block;
a935e30f 1336 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
059ff0fb 1337 ram_addr_t current_addr = block->offset + offset;
56e93d26 1338
2f68e399 1339 p = block->host + offset;
1db9d8e5 1340 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
56e93d26 1341
56e93d26 1342 XBZRLE_cache_lock();
d7400a34
XG
1343 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1344 migrate_use_xbzrle()) {
059ff0fb
XG
1345 pages = save_xbzrle_page(rs, &p, current_addr, block,
1346 offset, last_stage);
1347 if (!last_stage) {
1348 /* Can't send this cached data async, since the cache page
1349 * might get updated before it gets to the wire
56e93d26 1350 */
059ff0fb 1351 send_async = false;
56e93d26
JQ
1352 }
1353 }
1354
1355 /* XBZRLE overflow or normal page */
1356 if (pages == -1) {
65dacaa0 1357 pages = save_normal_page(rs, block, offset, p, send_async);
56e93d26
JQ
1358 }
1359
1360 XBZRLE_cache_unlock();
1361
1362 return pages;
1363}
1364
dcaf446e 1365static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
34ab9e97 1366 ram_addr_t offset, uint8_t *source_buf)
56e93d26 1367{
53518d94 1368 RAMState *rs = ram_state;
56e93d26 1369 int bytes_sent, blen;
a7a9a88f 1370 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
56e93d26 1371
2bf3aa85 1372 bytes_sent = save_page_header(rs, f, block, offset |
56e93d26 1373 RAM_SAVE_FLAG_COMPRESS_PAGE);
34ab9e97
XG
1374
1375 /*
1376 * copy it to an internal buffer to avoid it being modified by the VM
1377 * so that we can catch any error during compression and
1378 * decompression
1379 */
1380 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1381 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
b3be2896
LL
1382 if (blen < 0) {
1383 bytes_sent = 0;
1384 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1385 error_report("compressed data failed!");
1386 } else {
1387 bytes_sent += blen;
5727309d 1388 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
b3be2896 1389 }
56e93d26
JQ
1390
1391 return bytes_sent;
1392}
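/*
 * Editor's note (not part of the original file): a compressed page is
 * therefore sent as the usual page header with
 * RAM_SAVE_FLAG_COMPRESS_PAGE set, followed by the output of
 * qemu_put_compression_data() for the deflated TARGET_PAGE_SIZE body.
 */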
1393
ce25d337 1394static void flush_compressed_data(RAMState *rs)
56e93d26
JQ
1395{
1396 int idx, len, thread_count;
1397
1398 if (!migrate_use_compression()) {
1399 return;
1400 }
1401 thread_count = migrate_compress_threads();
a7a9a88f 1402
0d9f9a5c 1403 qemu_mutex_lock(&comp_done_lock);
56e93d26 1404 for (idx = 0; idx < thread_count; idx++) {
a7a9a88f 1405 while (!comp_param[idx].done) {
0d9f9a5c 1406 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26 1407 }
a7a9a88f 1408 }
0d9f9a5c 1409 qemu_mutex_unlock(&comp_done_lock);
a7a9a88f
LL
1410
1411 for (idx = 0; idx < thread_count; idx++) {
1412 qemu_mutex_lock(&comp_param[idx].mutex);
90e56fb4 1413 if (!comp_param[idx].quit) {
ce25d337 1414 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
9360447d 1415 ram_counters.transferred += len;
56e93d26 1416 }
a7a9a88f 1417 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26
JQ
1418 }
1419}
1420
1421static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1422 ram_addr_t offset)
1423{
1424 param->block = block;
1425 param->offset = offset;
1426}
1427
ce25d337
JQ
1428static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1429 ram_addr_t offset)
56e93d26
JQ
1430{
1431 int idx, thread_count, bytes_xmit = -1, pages = -1;
1432
1433 thread_count = migrate_compress_threads();
0d9f9a5c 1434 qemu_mutex_lock(&comp_done_lock);
56e93d26
JQ
1435 while (true) {
1436 for (idx = 0; idx < thread_count; idx++) {
1437 if (comp_param[idx].done) {
a7a9a88f 1438 comp_param[idx].done = false;
ce25d337 1439 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
a7a9a88f 1440 qemu_mutex_lock(&comp_param[idx].mutex);
56e93d26 1441 set_compress_params(&comp_param[idx], block, offset);
a7a9a88f
LL
1442 qemu_cond_signal(&comp_param[idx].cond);
1443 qemu_mutex_unlock(&comp_param[idx].mutex);
56e93d26 1444 pages = 1;
9360447d
JQ
1445 ram_counters.normal++;
1446 ram_counters.transferred += bytes_xmit;
56e93d26
JQ
1447 break;
1448 }
1449 }
1450 if (pages > 0) {
1451 break;
1452 } else {
0d9f9a5c 1453 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
56e93d26
JQ
1454 }
1455 }
0d9f9a5c 1456 qemu_mutex_unlock(&comp_done_lock);
56e93d26
JQ
1457
1458 return pages;
1459}
1460
3d0684b2
JQ
1461/**
1462 * find_dirty_block: find the next dirty page and update any state
1463 * associated with the search process.
b9e60928 1464 *
3d0684b2 1465 * Returns if a page is found
b9e60928 1466 *
6f37bb8b 1467 * @rs: current RAM state
3d0684b2
JQ
1468 * @pss: data about the state of the current dirty page scan
1469 * @again: set to false if the search has scanned the whole of RAM
b9e60928 1470 */
f20e2865 1471static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
b9e60928 1472{
f20e2865 1473 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
6f37bb8b 1474 if (pss->complete_round && pss->block == rs->last_seen_block &&
a935e30f 1475 pss->page >= rs->last_page) {
b9e60928
DDAG
1476 /*
1477 * We've been once around the RAM and haven't found anything.
1478 * Give up.
1479 */
1480 *again = false;
1481 return false;
1482 }
a935e30f 1483 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
b9e60928 1484 /* Didn't find anything in this RAM Block */
a935e30f 1485 pss->page = 0;
b9e60928
DDAG
1486 pss->block = QLIST_NEXT_RCU(pss->block, next);
1487 if (!pss->block) {
1488 /* Hit the end of the list */
1489 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1490 /* Flag that we've looped */
1491 pss->complete_round = true;
6f37bb8b 1492 rs->ram_bulk_stage = false;
b9e60928
DDAG
1493 if (migrate_use_xbzrle()) {
1494 /* If xbzrle is on, stop using the data compression at this
1495 * point. In theory, xbzrle can do better than compression.
1496 */
ce25d337 1497 flush_compressed_data(rs);
b9e60928
DDAG
1498 }
1499 }
1500 /* Didn't find anything this time, but try again on the new block */
1501 *again = true;
1502 return false;
1503 } else {
1504 /* Can go around again, but... */
1505 *again = true;
1506 /* We've found something so probably don't need to */
1507 return true;
1508 }
1509}
1510
3d0684b2
JQ
1511/**
1512 * unqueue_page: gets a page off the queue
1513 *
a82d593b 1514 * Helper for 'get_queued_page' - gets a page off the queue
a82d593b 1515 *
3d0684b2
JQ
1516 * Returns the block of the page (or NULL if none available)
1517 *
ec481c6c 1518 * @rs: current RAM state
3d0684b2 1519 * @offset: used to return the offset within the RAMBlock
a82d593b 1520 */
f20e2865 1521static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
a82d593b
DDAG
1522{
1523 RAMBlock *block = NULL;
1524
ec481c6c
JQ
1525 qemu_mutex_lock(&rs->src_page_req_mutex);
1526 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1527 struct RAMSrcPageRequest *entry =
1528 QSIMPLEQ_FIRST(&rs->src_page_requests);
a82d593b
DDAG
1529 block = entry->rb;
1530 *offset = entry->offset;
a82d593b
DDAG
1531
1532 if (entry->len > TARGET_PAGE_SIZE) {
1533 entry->len -= TARGET_PAGE_SIZE;
1534 entry->offset += TARGET_PAGE_SIZE;
1535 } else {
1536 memory_region_unref(block->mr);
ec481c6c 1537 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
a82d593b
DDAG
1538 g_free(entry);
1539 }
1540 }
ec481c6c 1541 qemu_mutex_unlock(&rs->src_page_req_mutex);
a82d593b
DDAG
1542
1543 return block;
1544}
1545
3d0684b2
JQ
1546/**
1547 * get_queued_page: unqueue a page from the postcopy requests
1548 *
1549 * Skips pages that are already sent (!dirty)
a82d593b 1550 *
3d0684b2 1551 * Returns if a queued page is found
a82d593b 1552 *
6f37bb8b 1553 * @rs: current RAM state
3d0684b2 1554 * @pss: data about the state of the current dirty page scan
a82d593b 1555 */
f20e2865 1556static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
a82d593b
DDAG
1557{
1558 RAMBlock *block;
1559 ram_addr_t offset;
1560 bool dirty;
1561
1562 do {
f20e2865 1563 block = unqueue_page(rs, &offset);
a82d593b
DDAG
1564 /*
1565 * We're sending this page, and since it's postcopy nothing else
1566 * will dirty it, and we must make sure it doesn't get sent again
1567 * even if this queue request was received after the background
1568 * search already sent it.
1569 */
1570 if (block) {
f20e2865
JQ
1571 unsigned long page;
1572
6b6712ef
JQ
1573 page = offset >> TARGET_PAGE_BITS;
1574 dirty = test_bit(page, block->bmap);
a82d593b 1575 if (!dirty) {
06b10688 1576 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
6b6712ef 1577 page, test_bit(page, block->unsentmap));
a82d593b 1578 } else {
f20e2865 1579 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
a82d593b
DDAG
1580 }
1581 }
1582
1583 } while (block && !dirty);
1584
1585 if (block) {
1586 /*
1587 * As soon as we start servicing pages out of order, we have
1588 * to kill the bulk stage, since the bulk stage assumes
1589 * (in migration_bitmap_find_dirty) that every page is
1590 * dirty, which is no longer true.
1591 */
6f37bb8b 1592 rs->ram_bulk_stage = false;
a82d593b
DDAG
1593
1594 /*
1595 * We want the background search to continue from the queued page
1596 * since the guest is likely to want other pages near to the page
1597 * it just requested.
1598 */
1599 pss->block = block;
a935e30f 1600 pss->page = offset >> TARGET_PAGE_BITS;
a82d593b
DDAG
1601 }
1602
1603 return !!block;
1604}
1605
6c595cde 1606/**
5e58f968
JQ
1607 * migration_page_queue_free: drop any remaining pages in the ram
1608 * request queue
6c595cde 1609 *
3d0684b2
JQ
1610 * It should be empty at the end anyway, but in error cases there may
1611 * be some left. If any pages are left, we drop them.
1612 *
6c595cde 1613 */
83c13382 1614static void migration_page_queue_free(RAMState *rs)
6c595cde 1615{
ec481c6c 1616 struct RAMSrcPageRequest *mspr, *next_mspr;
6c595cde
DDAG
1617 /* This queue generally should be empty - but in the case of a failed
1618 * migration it might have some entries left over.
1619 */
1620 rcu_read_lock();
ec481c6c 1621 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
6c595cde 1622 memory_region_unref(mspr->rb->mr);
ec481c6c 1623 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
6c595cde
DDAG
1624 g_free(mspr);
1625 }
1626 rcu_read_unlock();
1627}
1628
1629/**
3d0684b2
JQ
1630 * ram_save_queue_pages: queue the page for transmission
1631 *
1632 * A request from postcopy destination for example.
1633 *
1634 * Returns zero on success or negative on error
1635 *
3d0684b2
JQ
1636 * @rbname: Name of the RAMBlock of the request. NULL means the
1637 * same as the last one.
1638 * @start: starting address from the start of the RAMBlock
1639 * @len: length (in bytes) to send
6c595cde 1640 */
96506894 1641int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
6c595cde
DDAG
1642{
1643 RAMBlock *ramblock;
53518d94 1644 RAMState *rs = ram_state;
6c595cde 1645
9360447d 1646 ram_counters.postcopy_requests++;
6c595cde
DDAG
1647 rcu_read_lock();
1648 if (!rbname) {
1649 /* Reuse last RAMBlock */
68a098f3 1650 ramblock = rs->last_req_rb;
6c595cde
DDAG
1651
1652 if (!ramblock) {
1653 /*
1654 * Shouldn't happen, we can't reuse the last RAMBlock if
1655 * it's the 1st request.
1656 */
1657 error_report("ram_save_queue_pages no previous block");
1658 goto err;
1659 }
1660 } else {
1661 ramblock = qemu_ram_block_by_name(rbname);
1662
1663 if (!ramblock) {
1664 /* We shouldn't be asked for a non-existent RAMBlock */
1665 error_report("ram_save_queue_pages no block '%s'", rbname);
1666 goto err;
1667 }
68a098f3 1668 rs->last_req_rb = ramblock;
6c595cde
DDAG
1669 }
1670 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1671 if (start+len > ramblock->used_length) {
9458ad6b
JQ
1672 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1673 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
6c595cde
DDAG
1674 __func__, start, len, ramblock->used_length);
1675 goto err;
1676 }
1677
ec481c6c
JQ
1678 struct RAMSrcPageRequest *new_entry =
1679 g_malloc0(sizeof(struct RAMSrcPageRequest));
6c595cde
DDAG
1680 new_entry->rb = ramblock;
1681 new_entry->offset = start;
1682 new_entry->len = len;
1683
1684 memory_region_ref(ramblock->mr);
ec481c6c
JQ
1685 qemu_mutex_lock(&rs->src_page_req_mutex);
1686 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1687 qemu_mutex_unlock(&rs->src_page_req_mutex);
6c595cde
DDAG
1688 rcu_read_unlock();
1689
1690 return 0;
1691
1692err:
1693 rcu_read_unlock();
1694 return -1;
1695}
1696
d7400a34
XG
1697static bool save_page_use_compression(RAMState *rs)
1698{
1699 if (!migrate_use_compression()) {
1700 return false;
1701 }
1702
1703 /*
1704 * If xbzrle is on, stop using the data compression after first
1705 * round of migration even if compression is enabled. In theory,
1706 * xbzrle can do better than compression.
1707 */
1708 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1709 return true;
1710 }
1711
1712 return false;
1713}
1714
a82d593b 1715/**
3d0684b2 1716 * ram_save_target_page: save one target page
a82d593b 1717 *
3d0684b2 1718 * Returns the number of pages written
a82d593b 1719 *
6f37bb8b 1720 * @rs: current RAM state
3d0684b2 1721 * @pss: data about the page we want to send
a82d593b 1722 * @last_stage: if we are at the completion stage
a82d593b 1723 */
a0a8aa14 1724static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1725 bool last_stage)
a82d593b 1726{
a8ec91f9
XG
1727 RAMBlock *block = pss->block;
1728 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1729 int res;
1730
1731 if (control_save_page(rs, block, offset, &res)) {
1732 return res;
1733 }
1734
1faa5665 1735 /*
d7400a34
XG
1736 * When starting the process of a new block, the first page of
1737 * the block should be sent out before other pages in the same
1738 * block, and all the pages in the last block should have been sent
1739 * out. Keeping this order is important, because the 'cont' flag
1740 * is used to avoid resending the block name.
1faa5665 1741 */
d7400a34
XG
1742 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1743 flush_compressed_data(rs);
1744 }
1745
1746 res = save_zero_page(rs, block, offset);
1747 if (res > 0) {
1748 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1749 * page would be stale
1750 */
1751 if (!save_page_use_compression(rs)) {
1752 XBZRLE_cache_lock();
1753 xbzrle_cache_zero_page(rs, block->offset + offset);
1754 XBZRLE_cache_unlock();
1755 }
1756 ram_release_pages(block->idstr, offset, res);
1757 return res;
1758 }
1759
da3f56cb
XG
1760 /*
1761 * Make sure the first page is sent out before other pages.
1762 *
1763 * We post it as a normal page, as compressing it would take too
1764 * much CPU time.
1765 */
1766 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
701b1876 1767 return compress_page_with_multi_thread(rs, block, offset);
a82d593b
DDAG
1768 }
1769
1faa5665 1770 return ram_save_page(rs, pss, last_stage);
a82d593b
DDAG
1771}
1772
1773/**
3d0684b2 1774 * ram_save_host_page: save a whole host page
a82d593b 1775 *
3d0684b2
JQ
1776 * Starting at *offset send pages up to the end of the current host
1777 * page. It's valid for the initial offset to point into the middle of
1778 * a host page, in which case the remainder of the host page is sent.
1779 * Only dirty target pages are sent. Note that the host page size may
1780 * be a huge page for this block.
1eb3fc0a
DDAG
1781 * The saving stops at the boundary of the used_length of the block
1782 * if the RAMBlock isn't a multiple of the host page size.
a82d593b 1783 *
3d0684b2
JQ
1784 * Returns the number of pages written or negative on error
1785 *
6f37bb8b 1786 * @rs: current RAM state
3d0684b2 1788 * @pss: data about the page we want to send
a82d593b 1789 * @last_stage: if we are at the completion stage
a82d593b 1790 */
a0a8aa14 1791static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
f20e2865 1792 bool last_stage)
a82d593b
DDAG
1793{
1794 int tmppages, pages = 0;
a935e30f
JQ
1795 size_t pagesize_bits =
1796 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
4c011c37 1797
b895de50
CLG
1798 if (!qemu_ram_is_migratable(pss->block)) {
1799 error_report("block %s should not be migrated !", pss->block->idstr);
1800 return 0;
1801 }
1802
a82d593b 1803 do {
1faa5665
XG
1804 /* Check whether the page is dirty and, if it is, send it */
1805 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1806 pss->page++;
1807 continue;
1808 }
1809
f20e2865 1810 tmppages = ram_save_target_page(rs, pss, last_stage);
a82d593b
DDAG
1811 if (tmppages < 0) {
1812 return tmppages;
1813 }
1814
1815 pages += tmppages;
1faa5665
XG
1816 if (pss->block->unsentmap) {
1817 clear_bit(pss->page, pss->block->unsentmap);
1818 }
1819
a935e30f 1820 pss->page++;
1eb3fc0a
DDAG
1821 } while ((pss->page & (pagesize_bits - 1)) &&
1822 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
a82d593b
DDAG
1823
1824 /* The offset we leave with is the last one we looked at */
a935e30f 1825 pss->page--;
a82d593b
DDAG
1826 return pages;
1827}
6c595cde 1828
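/*
 * Worked example (illustrative, assuming 4 KiB target pages): for a RAMBlock
 * backed by 2 MiB hugepages, pagesize_bits above is 2 MiB / 4 KiB = 512, so
 * one call to ram_save_host_page() keeps sending dirty target pages until
 * pss->page crosses a 512-page boundary (or runs past the block's
 * used_length), i.e. it covers at most one whole host page per call.
 */
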
56e93d26 1829/**
3d0684b2 1830 * ram_find_and_save_block: finds a dirty page and sends it to f
56e93d26
JQ
1831 *
1832 * Called within an RCU critical section.
1833 *
3d0684b2 1834 * Returns the number of pages written where zero means no dirty pages
56e93d26 1835 *
6f37bb8b 1836 * @rs: current RAM state
56e93d26 1837 * @last_stage: if we are at the completion stage
a82d593b
DDAG
1838 *
1839 * On systems where host-page-size > target-page-size it will send all the
1840 * pages in a host page that are dirty.
56e93d26
JQ
1841 */
1842
ce25d337 1843static int ram_find_and_save_block(RAMState *rs, bool last_stage)
56e93d26 1844{
b8fb8cb7 1845 PageSearchStatus pss;
56e93d26 1846 int pages = 0;
b9e60928 1847 bool again, found;
56e93d26 1848
0827b9e9
AA
1849 /* No dirty page as there is zero RAM */
1850 if (!ram_bytes_total()) {
1851 return pages;
1852 }
1853
6f37bb8b 1854 pss.block = rs->last_seen_block;
a935e30f 1855 pss.page = rs->last_page;
b8fb8cb7
DDAG
1856 pss.complete_round = false;
1857
1858 if (!pss.block) {
1859 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1860 }
56e93d26 1861
b9e60928 1862 do {
a82d593b 1863 again = true;
f20e2865 1864 found = get_queued_page(rs, &pss);
b9e60928 1865
a82d593b
DDAG
1866 if (!found) {
1867 /* priority queue empty, so just search for something dirty */
f20e2865 1868 found = find_dirty_block(rs, &pss, &again);
a82d593b 1869 }
f3f491fc 1870
a82d593b 1871 if (found) {
f20e2865 1872 pages = ram_save_host_page(rs, &pss, last_stage);
56e93d26 1873 }
b9e60928 1874 } while (!pages && again);
56e93d26 1875
6f37bb8b 1876 rs->last_seen_block = pss.block;
a935e30f 1877 rs->last_page = pss.page;
56e93d26
JQ
1878
1879 return pages;
1880}
1881
1882void acct_update_position(QEMUFile *f, size_t size, bool zero)
1883{
1884 uint64_t pages = size / TARGET_PAGE_SIZE;
f7ccd61b 1885
56e93d26 1886 if (zero) {
9360447d 1887 ram_counters.duplicate += pages;
56e93d26 1888 } else {
9360447d
JQ
1889 ram_counters.normal += pages;
1890 ram_counters.transferred += size;
56e93d26
JQ
1891 qemu_update_position(f, size);
1892 }
1893}
1894
56e93d26
JQ
1895uint64_t ram_bytes_total(void)
1896{
1897 RAMBlock *block;
1898 uint64_t total = 0;
1899
1900 rcu_read_lock();
b895de50 1901 RAMBLOCK_FOREACH_MIGRATABLE(block) {
56e93d26 1902 total += block->used_length;
99e15582 1903 }
56e93d26
JQ
1904 rcu_read_unlock();
1905 return total;
1906}
1907
f265e0e4 1908static void xbzrle_load_setup(void)
56e93d26 1909{
f265e0e4 1910 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
56e93d26
JQ
1911}
1912
f265e0e4
JQ
1913static void xbzrle_load_cleanup(void)
1914{
1915 g_free(XBZRLE.decoded_buf);
1916 XBZRLE.decoded_buf = NULL;
1917}
1918
7d7c96be
PX
1919static void ram_state_cleanup(RAMState **rsp)
1920{
b9ccaf6d
DDAG
1921 if (*rsp) {
1922 migration_page_queue_free(*rsp);
1923 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1924 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1925 g_free(*rsp);
1926 *rsp = NULL;
1927 }
7d7c96be
PX
1928}
1929
84593a08
PX
1930static void xbzrle_cleanup(void)
1931{
1932 XBZRLE_cache_lock();
1933 if (XBZRLE.cache) {
1934 cache_fini(XBZRLE.cache);
1935 g_free(XBZRLE.encoded_buf);
1936 g_free(XBZRLE.current_buf);
1937 g_free(XBZRLE.zero_target_page);
1938 XBZRLE.cache = NULL;
1939 XBZRLE.encoded_buf = NULL;
1940 XBZRLE.current_buf = NULL;
1941 XBZRLE.zero_target_page = NULL;
1942 }
1943 XBZRLE_cache_unlock();
1944}
1945
f265e0e4 1946static void ram_save_cleanup(void *opaque)
56e93d26 1947{
53518d94 1948 RAMState **rsp = opaque;
6b6712ef 1949 RAMBlock *block;
eb859c53 1950
2ff64038
LZ
1951 /* The caller must hold the iothread lock or be in a bottom half, so
1952 * there is no write race against this migration bitmap
1953 */
6b6712ef
JQ
1954 memory_global_dirty_log_stop();
1955
b895de50 1956 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
1957 g_free(block->bmap);
1958 block->bmap = NULL;
1959 g_free(block->unsentmap);
1960 block->unsentmap = NULL;
56e93d26
JQ
1961 }
1962
84593a08 1963 xbzrle_cleanup();
f0afa331 1964 compress_threads_save_cleanup();
7d7c96be 1965 ram_state_cleanup(rsp);
56e93d26
JQ
1966}
1967
6f37bb8b 1968static void ram_state_reset(RAMState *rs)
56e93d26 1969{
6f37bb8b
JQ
1970 rs->last_seen_block = NULL;
1971 rs->last_sent_block = NULL;
269ace29 1972 rs->last_page = 0;
6f37bb8b
JQ
1973 rs->last_version = ram_list.version;
1974 rs->ram_bulk_stage = true;
56e93d26
JQ
1975}
1976
1977#define MAX_WAIT 50 /* ms, half buffered_file limit */
1978
4f2e4252
DDAG
1979/*
1980 * 'expected' is the value you expect the bitmap mostly to be full
1981 * of; it won't bother printing lines that are all this value.
1982 * 'todump' is the bitmap to dump and must not be NULL.
1983 */
6b6712ef
JQ
1984void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1985 unsigned long pages)
4f2e4252 1986{
4f2e4252
DDAG
1987 int64_t cur;
1988 int64_t linelen = 128;
1989 char linebuf[129];
1990
6b6712ef 1991 for (cur = 0; cur < pages; cur += linelen) {
4f2e4252
DDAG
1992 int64_t curb;
1993 bool found = false;
1994 /*
1995 * Last line; catch the case where the line length
1996 * is longer than remaining ram
1997 */
6b6712ef
JQ
1998 if (cur + linelen > pages) {
1999 linelen = pages - cur;
4f2e4252
DDAG
2000 }
2001 for (curb = 0; curb < linelen; curb++) {
2002 bool thisbit = test_bit(cur + curb, todump);
2003 linebuf[curb] = thisbit ? '1' : '.';
2004 found = found || (thisbit != expected);
2005 }
2006 if (found) {
2007 linebuf[curb] = '\0';
2008 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2009 }
2010 }
2011}
2012
e0b266f0
DDAG
2013/* **** functions for postcopy ***** */
2014
ced1c616
PB
2015void ram_postcopy_migrated_memory_release(MigrationState *ms)
2016{
2017 struct RAMBlock *block;
ced1c616 2018
b895de50 2019 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2020 unsigned long *bitmap = block->bmap;
2021 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2022 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
ced1c616
PB
2023
2024 while (run_start < range) {
2025 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
aaa2064c 2026 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
ced1c616
PB
2027 (run_end - run_start) << TARGET_PAGE_BITS);
2028 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2029 }
2030 }
2031}
2032
3d0684b2
JQ
2033/**
2034 * postcopy_send_discard_bm_ram: discard a RAMBlock
2035 *
2036 * Returns zero on success
2037 *
e0b266f0
DDAG
2038 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2039 * Note: At this point the 'unsentmap' is the processed bitmap combined
2040 * with the dirtymap; so a '1' means it's either dirty or unsent.
3d0684b2
JQ
2041 *
2042 * @ms: current migration state
2043 * @pds: state for postcopy
2044 * @block: RAMBlock to discard
e0b266f0
DDAG
2046 */
2047static int postcopy_send_discard_bm_ram(MigrationState *ms,
2048 PostcopyDiscardState *pds,
6b6712ef 2049 RAMBlock *block)
e0b266f0 2050{
6b6712ef 2051 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
e0b266f0 2052 unsigned long current;
6b6712ef 2053 unsigned long *unsentmap = block->unsentmap;
e0b266f0 2054
6b6712ef 2055 for (current = 0; current < end; ) {
e0b266f0
DDAG
2056 unsigned long one = find_next_bit(unsentmap, end, current);
2057
2058 if (one <= end) {
2059 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2060 unsigned long discard_length;
2061
2062 if (zero >= end) {
2063 discard_length = end - one;
2064 } else {
2065 discard_length = zero - one;
2066 }
d688c62d
DDAG
2067 if (discard_length) {
2068 postcopy_discard_send_range(ms, pds, one, discard_length);
2069 }
e0b266f0
DDAG
2070 current = one + discard_length;
2071 } else {
2072 current = one;
2073 }
2074 }
2075
2076 return 0;
2077}
2078
3d0684b2
JQ
2079/**
2080 * postcopy_each_ram_send_discard: discard all RAMBlocks
2081 *
2082 * Returns 0 for success or negative for error
2083 *
e0b266f0
DDAG
2084 * Utility for the outgoing postcopy code.
2085 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2086 * passing it bitmap indexes and name.
e0b266f0
DDAG
2087 * (qemu_ram_foreach_block ends up passing unscaled lengths
2088 * which would mean postcopy code would have to deal with target page)
3d0684b2
JQ
2089 *
2090 * @ms: current migration state
e0b266f0
DDAG
2091 */
2092static int postcopy_each_ram_send_discard(MigrationState *ms)
2093{
2094 struct RAMBlock *block;
2095 int ret;
2096
b895de50 2097 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2098 PostcopyDiscardState *pds =
2099 postcopy_discard_send_init(ms, block->idstr);
e0b266f0
DDAG
2100
2101 /*
2102 * Postcopy sends chunks of bitmap over the wire, but it
2103 * just needs indexes at this point, avoids it having
2104 * target page specific code.
2105 */
6b6712ef 2106 ret = postcopy_send_discard_bm_ram(ms, pds, block);
e0b266f0
DDAG
2107 postcopy_discard_send_finish(ms, pds);
2108 if (ret) {
2109 return ret;
2110 }
2111 }
2112
2113 return 0;
2114}
2115
3d0684b2
JQ
2116/**
2117 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2118 *
2119 * Helper for postcopy_chunk_hostpages; it's called twice to
2120 * canonicalize the two bitmaps, which are similar but one is
2121 * inverted.
99e314eb 2122 *
3d0684b2
JQ
2123 * Postcopy requires that all target pages in a hostpage are dirty or
2124 * clean, not a mix. This function canonicalizes the bitmaps.
99e314eb 2125 *
3d0684b2
JQ
2126 * @ms: current migration state
2127 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2128 * otherwise we need to canonicalize partially dirty host pages
2129 * @block: block that contains the page we want to canonicalize
2130 * @pds: state for postcopy
99e314eb
DDAG
2131 */
2132static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2133 RAMBlock *block,
2134 PostcopyDiscardState *pds)
2135{
53518d94 2136 RAMState *rs = ram_state;
6b6712ef
JQ
2137 unsigned long *bitmap = block->bmap;
2138 unsigned long *unsentmap = block->unsentmap;
29c59172 2139 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
6b6712ef 2140 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
99e314eb
DDAG
2141 unsigned long run_start;
2142
29c59172
DDAG
2143 if (block->page_size == TARGET_PAGE_SIZE) {
2144 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2145 return;
2146 }
2147
99e314eb
DDAG
2148 if (unsent_pass) {
2149 /* Find a sent page */
6b6712ef 2150 run_start = find_next_zero_bit(unsentmap, pages, 0);
99e314eb
DDAG
2151 } else {
2152 /* Find a dirty page */
6b6712ef 2153 run_start = find_next_bit(bitmap, pages, 0);
99e314eb
DDAG
2154 }
2155
6b6712ef 2156 while (run_start < pages) {
99e314eb
DDAG
2157 bool do_fixup = false;
2158 unsigned long fixup_start_addr;
2159 unsigned long host_offset;
2160
2161 /*
2162 * If the start of this run of pages is in the middle of a host
2163 * page, then we need to fixup this host page.
2164 */
2165 host_offset = run_start % host_ratio;
2166 if (host_offset) {
2167 do_fixup = true;
2168 run_start -= host_offset;
2169 fixup_start_addr = run_start;
2170 /* For the next pass */
2171 run_start = run_start + host_ratio;
2172 } else {
2173 /* Find the end of this run */
2174 unsigned long run_end;
2175 if (unsent_pass) {
6b6712ef 2176 run_end = find_next_bit(unsentmap, pages, run_start + 1);
99e314eb 2177 } else {
6b6712ef 2178 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
99e314eb
DDAG
2179 }
2180 /*
2181 * If the end isn't at the start of a host page, then the
2182 * run doesn't finish at the end of a host page
2183 * and we need to discard.
2184 */
2185 host_offset = run_end % host_ratio;
2186 if (host_offset) {
2187 do_fixup = true;
2188 fixup_start_addr = run_end - host_offset;
2189 /*
2190 * This host page has gone, the next loop iteration starts
2191 * from after the fixup
2192 */
2193 run_start = fixup_start_addr + host_ratio;
2194 } else {
2195 /*
2196 * No discards on this iteration, next loop starts from
2197 * next sent/dirty page
2198 */
2199 run_start = run_end + 1;
2200 }
2201 }
2202
2203 if (do_fixup) {
2204 unsigned long page;
2205
2206 /* Tell the destination to discard this page */
2207 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2208 /* For the unsent_pass we:
2209 * discard partially sent pages
2210 * For the !unsent_pass (dirty) we:
2211 * discard partially dirty pages that were sent
2212 * (any partially sent pages were already discarded
2213 * by the previous unsent_pass)
2214 */
2215 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2216 host_ratio);
2217 }
2218
2219 /* Clean up the bitmap */
2220 for (page = fixup_start_addr;
2221 page < fixup_start_addr + host_ratio; page++) {
2222 /* All pages in this host page are now not sent */
2223 set_bit(page, unsentmap);
2224
2225 /*
2226 * Remark them as dirty, updating the count for any pages
2227 * that weren't previously dirty.
2228 */
0d8ec885 2229 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
99e314eb
DDAG
2230 }
2231 }
2232
2233 if (unsent_pass) {
2234 /* Find the next sent page for the next iteration */
6b6712ef 2235 run_start = find_next_zero_bit(unsentmap, pages, run_start);
99e314eb
DDAG
2236 } else {
2237 /* Find the next dirty page for the next iteration */
6b6712ef 2238 run_start = find_next_bit(bitmap, pages, run_start);
99e314eb
DDAG
2239 }
2240 }
2241}
2242
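/*
 * Worked example (illustrative): with 16 KiB host pages and 4 KiB target
 * pages, host_ratio is 4. If a run of dirty target pages starts at page 6,
 * host_offset = 6 % 4 = 2, so the pass rewinds to page 4, tells the
 * destination to discard that whole host page, and marks target pages 4..7
 * dirty/unsent, keeping every host page uniform.
 */
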
3d0684b2
JQ
2243/**
2244 * postcopy_chunk_hostpages: discard any partially sent host page
2245 *
99e314eb
DDAG
2246 * Utility for the outgoing postcopy code.
2247 *
2248 * Discard any partially sent host-page size chunks, mark any partially
29c59172
DDAG
2249 * dirty host-page size chunks as all dirty. In this case the host-page
2250 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
99e314eb 2251 *
3d0684b2
JQ
2252 * Returns zero on success
2253 *
2254 * @ms: current migration state
6b6712ef 2255 * @block: block we want to work with
99e314eb 2256 */
6b6712ef 2257static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
99e314eb 2258{
6b6712ef
JQ
2259 PostcopyDiscardState *pds =
2260 postcopy_discard_send_init(ms, block->idstr);
99e314eb 2261
6b6712ef
JQ
2262 /* First pass: Discard all partially sent host pages */
2263 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2264 /*
2265 * Second pass: Ensure that all partially dirty host pages are made
2266 * fully dirty.
2267 */
2268 postcopy_chunk_hostpages_pass(ms, false, block, pds);
99e314eb 2269
6b6712ef 2270 postcopy_discard_send_finish(ms, pds);
99e314eb
DDAG
2271 return 0;
2272}
2273
3d0684b2
JQ
2274/**
2275 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2276 *
2277 * Returns zero on success
2278 *
e0b266f0
DDAG
2279 * Transmit the set of pages to be discarded after precopy to the target
2280 * these are pages that:
2281 * a) Have been previously transmitted but are now dirty again
2282 * b) Pages that have never been transmitted, this ensures that
2283 * any pages on the destination that have been mapped by background
2284 * tasks get discarded (transparent huge pages is the specific concern)
2285 * Hopefully this is pretty sparse
3d0684b2
JQ
2286 *
2287 * @ms: current migration state
e0b266f0
DDAG
2288 */
2289int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2290{
53518d94 2291 RAMState *rs = ram_state;
6b6712ef 2292 RAMBlock *block;
e0b266f0 2293 int ret;
e0b266f0
DDAG
2294
2295 rcu_read_lock();
2296
2297 /* This should be our last sync, the src is now paused */
eb859c53 2298 migration_bitmap_sync(rs);
e0b266f0 2299
6b6712ef
JQ
2300 /* Easiest way to make sure we don't resume in the middle of a host-page */
2301 rs->last_seen_block = NULL;
2302 rs->last_sent_block = NULL;
2303 rs->last_page = 0;
e0b266f0 2304
b895de50 2305 RAMBLOCK_FOREACH_MIGRATABLE(block) {
6b6712ef
JQ
2306 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2307 unsigned long *bitmap = block->bmap;
2308 unsigned long *unsentmap = block->unsentmap;
2309
2310 if (!unsentmap) {
2311 /* We don't have a safe way to resize the sentmap, so
2312 * if the bitmap was resized it will be NULL at this
2313 * point.
2314 */
2315 error_report("migration ram resized during precopy phase");
2316 rcu_read_unlock();
2317 return -EINVAL;
2318 }
2319 /* Deal with TPS != HPS and huge pages */
2320 ret = postcopy_chunk_hostpages(ms, block);
2321 if (ret) {
2322 rcu_read_unlock();
2323 return ret;
2324 }
e0b266f0 2325
6b6712ef
JQ
2326 /*
2327 * Update the unsentmap to be unsentmap = unsentmap | dirty
2328 */
2329 bitmap_or(unsentmap, unsentmap, bitmap, pages);
e0b266f0 2330#ifdef DEBUG_POSTCOPY
6b6712ef 2331 ram_debug_dump_bitmap(unsentmap, true, pages);
e0b266f0 2332#endif
6b6712ef
JQ
2333 }
2334 trace_ram_postcopy_send_discard_bitmap();
e0b266f0
DDAG
2335
2336 ret = postcopy_each_ram_send_discard(ms);
2337 rcu_read_unlock();
2338
2339 return ret;
2340}
2341
3d0684b2
JQ
2342/**
2343 * ram_discard_range: discard dirtied pages at the beginning of postcopy
e0b266f0 2344 *
3d0684b2 2345 * Returns zero on success
e0b266f0 2346 *
36449157
JQ
2347 * @rbname: name of the RAMBlock of the request. NULL means the
2348 * same as the last one.
3d0684b2
JQ
2349 * @start: byte offset of the start of the range within the RAMBlock
2350 * @length: length of the range in bytes
e0b266f0 2351 */
aaa2064c 2352int ram_discard_range(const char *rbname, uint64_t start, size_t length)
e0b266f0
DDAG
2353{
2354 int ret = -1;
2355
36449157 2356 trace_ram_discard_range(rbname, start, length);
d3a5038c 2357
e0b266f0 2358 rcu_read_lock();
36449157 2359 RAMBlock *rb = qemu_ram_block_by_name(rbname);
e0b266f0
DDAG
2360
2361 if (!rb) {
36449157 2362 error_report("ram_discard_range: Failed to find block '%s'", rbname);
e0b266f0
DDAG
2363 goto err;
2364 }
2365
f9494614
AP
2366 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2367 length >> qemu_target_page_bits());
d3a5038c 2368 ret = ram_block_discard_range(rb, start, length);
e0b266f0
DDAG
2369
2370err:
2371 rcu_read_unlock();
2372
2373 return ret;
2374}
2375
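/*
 * Usage sketch (illustrative only): dropping a single target page of a
 * hypothetical "pc.ram" block at a 2 MiB offset. The wrapper and its
 * constants are invented for the example.
 */
static int example_discard_one_page(void)
{
    /* start and length are byte values within the named RAMBlock */
    return ram_discard_range("pc.ram", 2 * 1024 * 1024, TARGET_PAGE_SIZE);
}
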
84593a08
PX
2376/*
2377 * For every allocation, we will try not to crash the VM if the
2378 * allocation failed.
2379 */
2380static int xbzrle_init(void)
2381{
2382 Error *local_err = NULL;
2383
2384 if (!migrate_use_xbzrle()) {
2385 return 0;
2386 }
2387
2388 XBZRLE_cache_lock();
2389
2390 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2391 if (!XBZRLE.zero_target_page) {
2392 error_report("%s: Error allocating zero page", __func__);
2393 goto err_out;
2394 }
2395
2396 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2397 TARGET_PAGE_SIZE, &local_err);
2398 if (!XBZRLE.cache) {
2399 error_report_err(local_err);
2400 goto free_zero_page;
2401 }
2402
2403 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2404 if (!XBZRLE.encoded_buf) {
2405 error_report("%s: Error allocating encoded_buf", __func__);
2406 goto free_cache;
2407 }
2408
2409 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2410 if (!XBZRLE.current_buf) {
2411 error_report("%s: Error allocating current_buf", __func__);
2412 goto free_encoded_buf;
2413 }
2414
2415 /* We are all good */
2416 XBZRLE_cache_unlock();
2417 return 0;
2418
2419free_encoded_buf:
2420 g_free(XBZRLE.encoded_buf);
2421 XBZRLE.encoded_buf = NULL;
2422free_cache:
2423 cache_fini(XBZRLE.cache);
2424 XBZRLE.cache = NULL;
2425free_zero_page:
2426 g_free(XBZRLE.zero_target_page);
2427 XBZRLE.zero_target_page = NULL;
2428err_out:
2429 XBZRLE_cache_unlock();
2430 return -ENOMEM;
2431}
2432
53518d94 2433static int ram_state_init(RAMState **rsp)
56e93d26 2434{
7d00ee6a
PX
2435 *rsp = g_try_new0(RAMState, 1);
2436
2437 if (!*rsp) {
2438 error_report("%s: Init ramstate fail", __func__);
2439 return -1;
2440 }
53518d94
JQ
2441
2442 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2443 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2444 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
56e93d26 2445
7d00ee6a
PX
2446 /*
2447 * Count the total number of pages used by ram blocks not including any
2448 * gaps due to alignment or unplugs.
2449 */
2450 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2451
2452 ram_state_reset(*rsp);
2453
2454 return 0;
2455}
2456
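/*
 * Worked example (illustrative, assuming 4 KiB target pages): a guest with
 * 4 GiB of RAM starts with migration_dirty_pages = 4 GiB / 4 KiB = 1048576,
 * i.e. every page is considered dirty until the first bitmap sync and the
 * bulk stage sends them all once.
 */
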
d6eff5d7 2457static void ram_list_init_bitmaps(void)
7d00ee6a 2458{
d6eff5d7
PX
2459 RAMBlock *block;
2460 unsigned long pages;
56e93d26 2461
0827b9e9
AA
2462 /* Skip setting bitmap if there is no RAM */
2463 if (ram_bytes_total()) {
b895de50 2464 RAMBLOCK_FOREACH_MIGRATABLE(block) {
d6eff5d7 2465 pages = block->max_length >> TARGET_PAGE_BITS;
6b6712ef
JQ
2466 block->bmap = bitmap_new(pages);
2467 bitmap_set(block->bmap, 0, pages);
2468 if (migrate_postcopy_ram()) {
2469 block->unsentmap = bitmap_new(pages);
2470 bitmap_set(block->unsentmap, 0, pages);
2471 }
0827b9e9 2472 }
f3f491fc 2473 }
d6eff5d7
PX
2474}
2475
2476static void ram_init_bitmaps(RAMState *rs)
2477{
2478 /* For memory_global_dirty_log_start below. */
2479 qemu_mutex_lock_iothread();
2480 qemu_mutex_lock_ramlist();
2481 rcu_read_lock();
f3f491fc 2482
d6eff5d7 2483 ram_list_init_bitmaps();
56e93d26 2484 memory_global_dirty_log_start();
d6eff5d7
PX
2485 migration_bitmap_sync(rs);
2486
2487 rcu_read_unlock();
56e93d26 2488 qemu_mutex_unlock_ramlist();
49877834 2489 qemu_mutex_unlock_iothread();
d6eff5d7
PX
2490}
2491
2492static int ram_init_all(RAMState **rsp)
2493{
2494 if (ram_state_init(rsp)) {
2495 return -1;
2496 }
2497
2498 if (xbzrle_init()) {
2499 ram_state_cleanup(rsp);
2500 return -1;
2501 }
2502
2503 ram_init_bitmaps(*rsp);
a91246c9
HZ
2504
2505 return 0;
2506}
2507
08614f34
PX
2508static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2509{
2510 RAMBlock *block;
2511 uint64_t pages = 0;
2512
2513 /*
2514 * Postcopy is not using xbzrle/compression, so no need for that.
2515 * Also, since source are already halted, we don't need to care
2516 * about dirty page logging as well.
2517 */
2518
2519 RAMBLOCK_FOREACH(block) {
2520 pages += bitmap_count_one(block->bmap,
2521 block->used_length >> TARGET_PAGE_BITS);
2522 }
2523
2524 /* This may not be aligned with current bitmaps. Recalculate. */
2525 rs->migration_dirty_pages = pages;
2526
2527 rs->last_seen_block = NULL;
2528 rs->last_sent_block = NULL;
2529 rs->last_page = 0;
2530 rs->last_version = ram_list.version;
2531 /*
2532 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2533 * matter what we have sent.
2534 */
2535 rs->ram_bulk_stage = false;
2536
2537 /* Update RAMState cache of output QEMUFile */
2538 rs->f = out;
2539
2540 trace_ram_state_resume_prepare(pages);
2541}
2542
3d0684b2
JQ
2543/*
2544 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
a91246c9
HZ
2545 * long-running RCU critical section. When rcu-reclaims in the code
2546 * start to become numerous it will be necessary to reduce the
2547 * granularity of these critical sections.
2548 */
2549
3d0684b2
JQ
2550/**
2551 * ram_save_setup: Setup RAM for migration
2552 *
2553 * Returns zero to indicate success and negative for error
2554 *
2555 * @f: QEMUFile where to send the data
2556 * @opaque: RAMState pointer
2557 */
a91246c9
HZ
2558static int ram_save_setup(QEMUFile *f, void *opaque)
2559{
53518d94 2560 RAMState **rsp = opaque;
a91246c9
HZ
2561 RAMBlock *block;
2562
dcaf446e
XG
2563 if (compress_threads_save_setup()) {
2564 return -1;
2565 }
2566
a91246c9
HZ
2567 /* migration has already setup the bitmap, reuse it. */
2568 if (!migration_in_colo_state()) {
7d00ee6a 2569 if (ram_init_all(rsp) != 0) {
dcaf446e 2570 compress_threads_save_cleanup();
a91246c9 2571 return -1;
53518d94 2572 }
a91246c9 2573 }
53518d94 2574 (*rsp)->f = f;
a91246c9
HZ
2575
2576 rcu_read_lock();
56e93d26
JQ
2577
2578 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2579
b895de50 2580 RAMBLOCK_FOREACH_MIGRATABLE(block) {
56e93d26
JQ
2581 qemu_put_byte(f, strlen(block->idstr));
2582 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2583 qemu_put_be64(f, block->used_length);
ef08fb38
DDAG
2584 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2585 qemu_put_be64(f, block->page_size);
2586 }
56e93d26
JQ
2587 }
2588
2589 rcu_read_unlock();
2590
2591 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2592 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2593
2594 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2595
2596 return 0;
2597}
2598
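/*
 * Stream layout produced by ram_save_setup() above (summary of the code,
 * not a new format definition):
 *
 *   be64: ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable RAMBlock:
 *       byte:  strlen(idstr)
 *       bytes: idstr (not NUL terminated)
 *       be64:  used_length
 *       be64:  page_size   (only if postcopy is enabled and it differs
 *                           from qemu_host_page_size)
 *   be64: RAM_SAVE_FLAG_EOS
 */
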
3d0684b2
JQ
2599/**
2600 * ram_save_iterate: iterative stage for migration
2601 *
2602 * Returns zero to indicate success and negative for error
2603 *
2604 * @f: QEMUFile where to send the data
2605 * @opaque: RAMState pointer
2606 */
56e93d26
JQ
2607static int ram_save_iterate(QEMUFile *f, void *opaque)
2608{
53518d94
JQ
2609 RAMState **temp = opaque;
2610 RAMState *rs = *temp;
56e93d26
JQ
2611 int ret;
2612 int i;
2613 int64_t t0;
5c90308f 2614 int done = 0;
56e93d26 2615
b2557345
PL
2616 if (blk_mig_bulk_active()) {
2617 /* Avoid transferring ram during bulk phase of block migration as
2618 * the bulk phase will usually take a long time and transferring
2619 * ram updates during that time is pointless. */
2620 goto out;
2621 }
2622
56e93d26 2623 rcu_read_lock();
6f37bb8b
JQ
2624 if (ram_list.version != rs->last_version) {
2625 ram_state_reset(rs);
56e93d26
JQ
2626 }
2627
2628 /* Read version before ram_list.blocks */
2629 smp_rmb();
2630
2631 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2632
2633 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2634 i = 0;
2635 while ((ret = qemu_file_rate_limit(f)) == 0) {
2636 int pages;
2637
ce25d337 2638 pages = ram_find_and_save_block(rs, false);
56e93d26
JQ
2639 /* no more pages to send */
2640 if (pages == 0) {
5c90308f 2641 done = 1;
56e93d26
JQ
2642 break;
2643 }
23b28c3c 2644 rs->iterations++;
070afca2 2645
56e93d26
JQ
2646 /* we want to check in the 1st loop, just in case it was the 1st time
2647 and we had to sync the dirty bitmap.
2648 qemu_get_clock_ns() is a bit expensive, so we only check once
2649 every few iterations
2650 */
2651 if ((i & 63) == 0) {
2652 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2653 if (t1 > MAX_WAIT) {
55c4446b 2654 trace_ram_save_iterate_big_wait(t1, i);
56e93d26
JQ
2655 break;
2656 }
2657 }
2658 i++;
2659 }
ce25d337 2660 flush_compressed_data(rs);
56e93d26
JQ
2661 rcu_read_unlock();
2662
2663 /*
2664 * Must occur before EOS (or any QEMUFile operation)
2665 * because of RDMA protocol.
2666 */
2667 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2668
b2557345 2669out:
56e93d26 2670 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
9360447d 2671 ram_counters.transferred += 8;
56e93d26
JQ
2672
2673 ret = qemu_file_get_error(f);
2674 if (ret < 0) {
2675 return ret;
2676 }
2677
5c90308f 2678 return done;
56e93d26
JQ
2679}
2680
3d0684b2
JQ
2681/**
2682 * ram_save_complete: function called to send the remaining amount of ram
2683 *
2684 * Returns zero to indicate success
2685 *
2686 * Called with iothread lock
2687 *
2688 * @f: QEMUFile where to send the data
2689 * @opaque: RAMState pointer
2690 */
56e93d26
JQ
2691static int ram_save_complete(QEMUFile *f, void *opaque)
2692{
53518d94
JQ
2693 RAMState **temp = opaque;
2694 RAMState *rs = *temp;
6f37bb8b 2695
56e93d26
JQ
2696 rcu_read_lock();
2697
5727309d 2698 if (!migration_in_postcopy()) {
8d820d6f 2699 migration_bitmap_sync(rs);
663e6c1d 2700 }
56e93d26
JQ
2701
2702 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2703
2704 /* try transferring iterative blocks of memory */
2705
2706 /* flush all remaining blocks regardless of rate limiting */
2707 while (true) {
2708 int pages;
2709
ce25d337 2710 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
56e93d26
JQ
2711 /* no more blocks to send */
2712 if (pages == 0) {
2713 break;
2714 }
2715 }
2716
ce25d337 2717 flush_compressed_data(rs);
56e93d26 2718 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
56e93d26
JQ
2719
2720 rcu_read_unlock();
d09a6fde 2721
56e93d26
JQ
2722 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2723
2724 return 0;
2725}
2726
c31b098f 2727static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
47995026
VSO
2728 uint64_t *res_precopy_only,
2729 uint64_t *res_compatible,
2730 uint64_t *res_postcopy_only)
56e93d26 2731{
53518d94
JQ
2732 RAMState **temp = opaque;
2733 RAMState *rs = *temp;
56e93d26
JQ
2734 uint64_t remaining_size;
2735
9edabd4d 2736 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2737
5727309d 2738 if (!migration_in_postcopy() &&
663e6c1d 2739 remaining_size < max_size) {
56e93d26
JQ
2740 qemu_mutex_lock_iothread();
2741 rcu_read_lock();
8d820d6f 2742 migration_bitmap_sync(rs);
56e93d26
JQ
2743 rcu_read_unlock();
2744 qemu_mutex_unlock_iothread();
9edabd4d 2745 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
56e93d26 2746 }
c31b098f 2747
86e1167e
VSO
2748 if (migrate_postcopy_ram()) {
2749 /* We can do postcopy, and all the data is postcopiable */
47995026 2750 *res_compatible += remaining_size;
86e1167e 2751 } else {
47995026 2752 *res_precopy_only += remaining_size;
86e1167e 2753 }
56e93d26
JQ
2754}
2755
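/*
 * Worked example (illustrative, 4 KiB target pages): with 25600 pages still
 * dirty, remaining_size is 25600 * 4 KiB = 100 MiB. If that is below
 * max_size and we are not in postcopy, the bitmap is re-synced under the
 * iothread lock before the estimate is reported.
 */
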
2756static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2757{
2758 unsigned int xh_len;
2759 int xh_flags;
063e760a 2760 uint8_t *loaded_data;
56e93d26 2761
56e93d26
JQ
2762 /* extract RLE header */
2763 xh_flags = qemu_get_byte(f);
2764 xh_len = qemu_get_be16(f);
2765
2766 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2767 error_report("Failed to load XBZRLE page - wrong compression!");
2768 return -1;
2769 }
2770
2771 if (xh_len > TARGET_PAGE_SIZE) {
2772 error_report("Failed to load XBZRLE page - len overflow!");
2773 return -1;
2774 }
f265e0e4 2775 loaded_data = XBZRLE.decoded_buf;
56e93d26 2776 /* load data and decode */
f265e0e4 2777 /* it can change loaded_data to point to an internal buffer */
063e760a 2778 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
56e93d26
JQ
2779
2780 /* decode RLE */
063e760a 2781 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
56e93d26
JQ
2782 TARGET_PAGE_SIZE) == -1) {
2783 error_report("Failed to load XBZRLE page - decode error!");
2784 return -1;
2785 }
2786
2787 return 0;
2788}
2789
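/*
 * XBZRLE page layout consumed by load_xbzrle() above (summary):
 *
 *   byte:  ENCODING_FLAG_XBZRLE
 *   be16:  xh_len (encoded length, at most TARGET_PAGE_SIZE)
 *   bytes: xh_len bytes of encoded data, decoded in place against the
 *          current contents of the target page at 'host'
 */
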
3d0684b2
JQ
2790/**
2791 * ram_block_from_stream: read a RAMBlock id from the migration stream
2792 *
2793 * Must be called from within a rcu critical section.
2794 *
56e93d26 2795 * Returns a pointer from within the RCU-protected ram_list.
a7180877 2796 *
3d0684b2
JQ
2797 * @f: QEMUFile where to read the data from
2798 * @flags: Page flags (mostly to see if it's a continuation of previous block)
a7180877 2799 */
3d0684b2 2800static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
56e93d26
JQ
2801{
2802 static RAMBlock *block = NULL;
2803 char id[256];
2804 uint8_t len;
2805
2806 if (flags & RAM_SAVE_FLAG_CONTINUE) {
4c4bad48 2807 if (!block) {
56e93d26
JQ
2808 error_report("Ack, bad migration stream!");
2809 return NULL;
2810 }
4c4bad48 2811 return block;
56e93d26
JQ
2812 }
2813
2814 len = qemu_get_byte(f);
2815 qemu_get_buffer(f, (uint8_t *)id, len);
2816 id[len] = 0;
2817
e3dd7493 2818 block = qemu_ram_block_by_name(id);
4c4bad48
HZ
2819 if (!block) {
2820 error_report("Can't find block %s", id);
2821 return NULL;
56e93d26
JQ
2822 }
2823
b895de50
CLG
2824 if (!qemu_ram_is_migratable(block)) {
2825 error_report("block %s should not be migrated !", id);
2826 return NULL;
2827 }
2828
4c4bad48
HZ
2829 return block;
2830}
2831
2832static inline void *host_from_ram_block_offset(RAMBlock *block,
2833 ram_addr_t offset)
2834{
2835 if (!offset_in_ramblock(block, offset)) {
2836 return NULL;
2837 }
2838
2839 return block->host + offset;
56e93d26
JQ
2840}
2841
3d0684b2
JQ
2842/**
2843 * ram_handle_compressed: handle the zero page case
2844 *
56e93d26
JQ
2845 * If a page (or a whole RDMA chunk) has been
2846 * determined to be zero, then zap it.
3d0684b2
JQ
2847 *
2848 * @host: host address for the zero page
2849 * @ch: what the page is filled from. We only support zero
2850 * @size: size of the zero page
56e93d26
JQ
2851 */
2852void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2853{
2854 if (ch != 0 || !is_zero_range(host, size)) {
2855 memset(host, ch, size);
2856 }
2857}
2858
797ca154
XG
2859/* return the size after decompression, or negative value on error */
2860static int
2861qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2862 const uint8_t *source, size_t source_len)
2863{
2864 int err;
2865
2866 err = inflateReset(stream);
2867 if (err != Z_OK) {
2868 return -1;
2869 }
2870
2871 stream->avail_in = source_len;
2872 stream->next_in = (uint8_t *)source;
2873 stream->avail_out = dest_len;
2874 stream->next_out = dest;
2875
2876 err = inflate(stream, Z_NO_FLUSH);
2877 if (err != Z_STREAM_END) {
2878 return -1;
2879 }
2880
2881 return stream->total_out;
2882}
2883
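/*
 * Usage sketch (illustrative only): decompressing one page with a
 * caller-owned z_stream. The stream must have been prepared with
 * inflateInit(), as compress_threads_load_setup() below does for each
 * decompress thread; the wrapper itself is invented for the example.
 */
static int example_uncompress_page(z_stream *stream, uint8_t *page,
                                   const uint8_t *compbuf, size_t len)
{
    /* returns the number of decompressed bytes or a negative value on error */
    return qemu_uncompress_data(stream, page, TARGET_PAGE_SIZE, compbuf, len);
}
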
56e93d26
JQ
2884static void *do_data_decompress(void *opaque)
2885{
2886 DecompressParam *param = opaque;
2887 unsigned long pagesize;
33d151f4 2888 uint8_t *des;
34ab9e97 2889 int len, ret;
56e93d26 2890
33d151f4 2891 qemu_mutex_lock(&param->mutex);
90e56fb4 2892 while (!param->quit) {
33d151f4
LL
2893 if (param->des) {
2894 des = param->des;
2895 len = param->len;
2896 param->des = 0;
2897 qemu_mutex_unlock(&param->mutex);
2898
56e93d26 2899 pagesize = TARGET_PAGE_SIZE;
34ab9e97
XG
2900
2901 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2902 param->compbuf, len);
f548222c 2903 if (ret < 0 && migrate_get_current()->decompress_error_check) {
34ab9e97
XG
2904 error_report("decompress data failed");
2905 qemu_file_set_error(decomp_file, ret);
2906 }
73a8912b 2907
33d151f4
LL
2908 qemu_mutex_lock(&decomp_done_lock);
2909 param->done = true;
2910 qemu_cond_signal(&decomp_done_cond);
2911 qemu_mutex_unlock(&decomp_done_lock);
2912
2913 qemu_mutex_lock(&param->mutex);
2914 } else {
2915 qemu_cond_wait(&param->cond, &param->mutex);
2916 }
56e93d26 2917 }
33d151f4 2918 qemu_mutex_unlock(&param->mutex);
56e93d26
JQ
2919
2920 return NULL;
2921}
2922
34ab9e97 2923static int wait_for_decompress_done(void)
5533b2e9
LL
2924{
2925 int idx, thread_count;
2926
2927 if (!migrate_use_compression()) {
34ab9e97 2928 return 0;
5533b2e9
LL
2929 }
2930
2931 thread_count = migrate_decompress_threads();
2932 qemu_mutex_lock(&decomp_done_lock);
2933 for (idx = 0; idx < thread_count; idx++) {
2934 while (!decomp_param[idx].done) {
2935 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2936 }
2937 }
2938 qemu_mutex_unlock(&decomp_done_lock);
34ab9e97 2939 return qemu_file_get_error(decomp_file);
5533b2e9
LL
2940}
2941
f0afa331 2942static void compress_threads_load_cleanup(void)
56e93d26
JQ
2943{
2944 int i, thread_count;
2945
3416ab5b
JQ
2946 if (!migrate_use_compression()) {
2947 return;
2948 }
56e93d26
JQ
2949 thread_count = migrate_decompress_threads();
2950 for (i = 0; i < thread_count; i++) {
797ca154
XG
2951 /*
2952 * we use it as an indicator of whether the thread is
2953 * properly initialized or not
2954 */
2955 if (!decomp_param[i].compbuf) {
2956 break;
2957 }
2958
56e93d26 2959 qemu_mutex_lock(&decomp_param[i].mutex);
90e56fb4 2960 decomp_param[i].quit = true;
56e93d26
JQ
2961 qemu_cond_signal(&decomp_param[i].cond);
2962 qemu_mutex_unlock(&decomp_param[i].mutex);
2963 }
2964 for (i = 0; i < thread_count; i++) {
797ca154
XG
2965 if (!decomp_param[i].compbuf) {
2966 break;
2967 }
2968
56e93d26
JQ
2969 qemu_thread_join(decompress_threads + i);
2970 qemu_mutex_destroy(&decomp_param[i].mutex);
2971 qemu_cond_destroy(&decomp_param[i].cond);
797ca154 2972 inflateEnd(&decomp_param[i].stream);
56e93d26 2973 g_free(decomp_param[i].compbuf);
797ca154 2974 decomp_param[i].compbuf = NULL;
56e93d26
JQ
2975 }
2976 g_free(decompress_threads);
2977 g_free(decomp_param);
56e93d26
JQ
2978 decompress_threads = NULL;
2979 decomp_param = NULL;
34ab9e97 2980 decomp_file = NULL;
56e93d26
JQ
2981}
2982
34ab9e97 2983static int compress_threads_load_setup(QEMUFile *f)
797ca154
XG
2984{
2985 int i, thread_count;
2986
2987 if (!migrate_use_compression()) {
2988 return 0;
2989 }
2990
2991 thread_count = migrate_decompress_threads();
2992 decompress_threads = g_new0(QemuThread, thread_count);
2993 decomp_param = g_new0(DecompressParam, thread_count);
2994 qemu_mutex_init(&decomp_done_lock);
2995 qemu_cond_init(&decomp_done_cond);
34ab9e97 2996 decomp_file = f;
797ca154
XG
2997 for (i = 0; i < thread_count; i++) {
2998 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2999 goto exit;
3000 }
3001
3002 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3003 qemu_mutex_init(&decomp_param[i].mutex);
3004 qemu_cond_init(&decomp_param[i].cond);
3005 decomp_param[i].done = true;
3006 decomp_param[i].quit = false;
3007 qemu_thread_create(decompress_threads + i, "decompress",
3008 do_data_decompress, decomp_param + i,
3009 QEMU_THREAD_JOINABLE);
3010 }
3011 return 0;
3012exit:
3013 compress_threads_load_cleanup();
3014 return -1;
3015}
3016
c1bc6626 3017static void decompress_data_with_multi_threads(QEMUFile *f,
56e93d26
JQ
3018 void *host, int len)
3019{
3020 int idx, thread_count;
3021
3022 thread_count = migrate_decompress_threads();
73a8912b 3023 qemu_mutex_lock(&decomp_done_lock);
56e93d26
JQ
3024 while (true) {
3025 for (idx = 0; idx < thread_count; idx++) {
73a8912b 3026 if (decomp_param[idx].done) {
33d151f4
LL
3027 decomp_param[idx].done = false;
3028 qemu_mutex_lock(&decomp_param[idx].mutex);
c1bc6626 3029 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
56e93d26
JQ
3030 decomp_param[idx].des = host;
3031 decomp_param[idx].len = len;
33d151f4
LL
3032 qemu_cond_signal(&decomp_param[idx].cond);
3033 qemu_mutex_unlock(&decomp_param[idx].mutex);
56e93d26
JQ
3034 break;
3035 }
3036 }
3037 if (idx < thread_count) {
3038 break;
73a8912b
LL
3039 } else {
3040 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
56e93d26
JQ
3041 }
3042 }
73a8912b 3043 qemu_mutex_unlock(&decomp_done_lock);
56e93d26
JQ
3044}
3045
f265e0e4
JQ
3046/**
3047 * ram_load_setup: Setup RAM for migration incoming side
3048 *
3049 * Returns zero to indicate success and negative for error
3050 *
3051 * @f: QEMUFile where to receive the data
3052 * @opaque: RAMState pointer
3053 */
3054static int ram_load_setup(QEMUFile *f, void *opaque)
3055{
34ab9e97 3056 if (compress_threads_load_setup(f)) {
797ca154
XG
3057 return -1;
3058 }
3059
f265e0e4 3060 xbzrle_load_setup();
f9494614 3061 ramblock_recv_map_init();
f265e0e4
JQ
3062 return 0;
3063}
3064
3065static int ram_load_cleanup(void *opaque)
3066{
f9494614 3067 RAMBlock *rb;
f265e0e4 3068 xbzrle_load_cleanup();
f0afa331 3069 compress_threads_load_cleanup();
f9494614 3070
b895de50 3071 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
f9494614
AP
3072 g_free(rb->receivedmap);
3073 rb->receivedmap = NULL;
3074 }
f265e0e4
JQ
3075 return 0;
3076}
3077
3d0684b2
JQ
3078/**
3079 * ram_postcopy_incoming_init: allocate postcopy data structures
3080 *
3081 * Returns 0 for success and negative if there was an error
3082 *
3083 * @mis: current migration incoming state
3084 *
3085 * Allocate data structures etc needed by incoming migration with
3086 * postcopy-ram. postcopy-ram's similarly named
3087 * postcopy_ram_incoming_init does the work.
1caddf8a
DDAG
3088 */
3089int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3090{
b8c48993 3091 unsigned long ram_pages = last_ram_page();
1caddf8a
DDAG
3092
3093 return postcopy_ram_incoming_init(mis, ram_pages);
3094}
3095
3d0684b2
JQ
3096/**
3097 * ram_load_postcopy: load a page in postcopy case
3098 *
3099 * Returns 0 for success or -errno in case of error
3100 *
a7180877
DDAG
3101 * Called in postcopy mode by ram_load().
3102 * rcu_read_lock is taken prior to this being called.
3d0684b2
JQ
3103 *
3104 * @f: QEMUFile where to send the data
a7180877
DDAG
3105 */
3106static int ram_load_postcopy(QEMUFile *f)
3107{
3108 int flags = 0, ret = 0;
3109 bool place_needed = false;
28abd200 3110 bool matching_page_sizes = false;
a7180877
DDAG
3111 MigrationIncomingState *mis = migration_incoming_get_current();
3112 /* Temporary page that is later 'placed' */
3113 void *postcopy_host_page = postcopy_get_tmp_page(mis);
c53b7ddc 3114 void *last_host = NULL;
a3b6ff6d 3115 bool all_zero = false;
a7180877
DDAG
3116
3117 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3118 ram_addr_t addr;
3119 void *host = NULL;
3120 void *page_buffer = NULL;
3121 void *place_source = NULL;
df9ff5e1 3122 RAMBlock *block = NULL;
a7180877 3123 uint8_t ch;
a7180877
DDAG
3124
3125 addr = qemu_get_be64(f);
7a9ddfbf
PX
3126
3127 /*
3128 * If qemu file error, we should stop here, and then "addr"
3129 * may be invalid
3130 */
3131 ret = qemu_file_get_error(f);
3132 if (ret) {
3133 break;
3134 }
3135
a7180877
DDAG
3136 flags = addr & ~TARGET_PAGE_MASK;
3137 addr &= TARGET_PAGE_MASK;
3138
3139 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3140 place_needed = false;
bb890ed5 3141 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
df9ff5e1 3142 block = ram_block_from_stream(f, flags);
4c4bad48
HZ
3143
3144 host = host_from_ram_block_offset(block, addr);
a7180877
DDAG
3145 if (!host) {
3146 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3147 ret = -EINVAL;
3148 break;
3149 }
28abd200 3150 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
a7180877 3151 /*
28abd200
DDAG
3152 * Postcopy requires that we place whole host pages atomically;
3153 * these may be huge pages for RAMBlocks that are backed by
3154 * hugetlbfs.
a7180877
DDAG
3155 * To make it atomic, the data is read into a temporary page
3156 * that's moved into place later.
3157 * The migration protocol uses, possibly smaller, target pages;
3158 * however, the source ensures it always sends all the components
3159 * of a host page in order.
3160 */
3161 page_buffer = postcopy_host_page +
28abd200 3162 ((uintptr_t)host & (block->page_size - 1));
a7180877 3163 /* If all TP are zero then we can optimise the place */
28abd200 3164 if (!((uintptr_t)host & (block->page_size - 1))) {
a7180877 3165 all_zero = true;
c53b7ddc
DDAG
3166 } else {
3167 /* not the 1st TP within the HP */
3168 if (host != (last_host + TARGET_PAGE_SIZE)) {
9af9e0fe 3169 error_report("Non-sequential target page %p/%p",
c53b7ddc
DDAG
3170 host, last_host);
3171 ret = -EINVAL;
3172 break;
3173 }
a7180877
DDAG
3174 }
3175
c53b7ddc 3176
a7180877
DDAG
3177 /*
3178 * If it's the last part of a host page then we place the host
3179 * page
3180 */
3181 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
28abd200 3182 (block->page_size - 1)) == 0;
a7180877
DDAG
3183 place_source = postcopy_host_page;
3184 }
c53b7ddc 3185 last_host = host;
a7180877
DDAG
3186
3187 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
bb890ed5 3188 case RAM_SAVE_FLAG_ZERO:
a7180877
DDAG
3189 ch = qemu_get_byte(f);
3190 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3191 if (ch) {
3192 all_zero = false;
3193 }
3194 break;
3195
3196 case RAM_SAVE_FLAG_PAGE:
3197 all_zero = false;
3198 if (!place_needed || !matching_page_sizes) {
3199 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3200 } else {
3201 /* Avoids the qemu_file copy during postcopy, which is
3202 * going to do a copy later; can only do it when we
3203 * do this read in one go (matching page sizes)
3204 */
3205 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3206 TARGET_PAGE_SIZE);
3207 }
3208 break;
3209 case RAM_SAVE_FLAG_EOS:
3210 /* normal exit */
3211 break;
3212 default:
3213 error_report("Unknown combination of migration flags: %#x"
3214 " (postcopy mode)", flags);
3215 ret = -EINVAL;
7a9ddfbf
PX
3216 break;
3217 }
3218
3219 /* Detect for any possible file errors */
3220 if (!ret && qemu_file_get_error(f)) {
3221 ret = qemu_file_get_error(f);
a7180877
DDAG
3222 }
3223
7a9ddfbf 3224 if (!ret && place_needed) {
a7180877 3225 /* This gets called at the last target page in the host page */
df9ff5e1
DDAG
3226 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3227
a7180877 3228 if (all_zero) {
df9ff5e1 3229 ret = postcopy_place_page_zero(mis, place_dest,
8be4620b 3230 block);
a7180877 3231 } else {
df9ff5e1 3232 ret = postcopy_place_page(mis, place_dest,
8be4620b 3233 place_source, block);
a7180877
DDAG
3234 }
3235 }
a7180877
DDAG
3236 }
3237
3238 return ret;
3239}
3240
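/*
 * Worked example (illustrative, 4 KiB target pages on a 2 MiB hugepage
 * block): the 512 target pages of one host page are accumulated in
 * postcopy_host_page; place_needed only becomes true on the 512th page, and
 * place_dest = host + TARGET_PAGE_SIZE - block->page_size points back at the
 * start of the host page, which is then placed atomically.
 */
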
acab30b8
DHB
3241static bool postcopy_is_advised(void)
3242{
3243 PostcopyState ps = postcopy_state_get();
3244 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3245}
3246
3247static bool postcopy_is_running(void)
3248{
3249 PostcopyState ps = postcopy_state_get();
3250 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3251}
3252
56e93d26
JQ
3253static int ram_load(QEMUFile *f, void *opaque, int version_id)
3254{
edc60127 3255 int flags = 0, ret = 0, invalid_flags = 0;
56e93d26
JQ
3256 static uint64_t seq_iter;
3257 int len = 0;
a7180877
DDAG
3258 /*
3259 * If system is running in postcopy mode, page inserts to host memory must
3260 * be atomic
3261 */
acab30b8 3262 bool postcopy_running = postcopy_is_running();
ef08fb38 3263 /* ADVISE is earlier, it shows the source has the postcopy capability on */
acab30b8 3264 bool postcopy_advised = postcopy_is_advised();
56e93d26
JQ
3265
3266 seq_iter++;
3267
3268 if (version_id != 4) {
3269 ret = -EINVAL;
3270 }
3271
edc60127
JQ
3272 if (!migrate_use_compression()) {
3273 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3274 }
56e93d26
JQ
3275 /* This RCU critical section can be very long running.
3276 * When RCU reclaims in the code start to become numerous,
3277 * it will be necessary to reduce the granularity of this
3278 * critical section.
3279 */
3280 rcu_read_lock();
a7180877
DDAG
3281
3282 if (postcopy_running) {
3283 ret = ram_load_postcopy(f);
3284 }
3285
3286 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
56e93d26 3287 ram_addr_t addr, total_ram_bytes;
a776aa15 3288 void *host = NULL;
56e93d26
JQ
3289 uint8_t ch;
3290
3291 addr = qemu_get_be64(f);
3292 flags = addr & ~TARGET_PAGE_MASK;
3293 addr &= TARGET_PAGE_MASK;
3294
edc60127
JQ
3295 if (flags & invalid_flags) {
3296 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3297 error_report("Received an unexpected compressed page");
3298 }
3299
3300 ret = -EINVAL;
3301 break;
3302 }
3303
bb890ed5 3304 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
a776aa15 3305 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4c4bad48
HZ
3306 RAMBlock *block = ram_block_from_stream(f, flags);
3307
3308 host = host_from_ram_block_offset(block, addr);
a776aa15
DDAG
3309 if (!host) {
3310 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3311 ret = -EINVAL;
3312 break;
3313 }
f9494614 3314 ramblock_recv_bitmap_set(block, host);
1db9d8e5 3315 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
a776aa15
DDAG
3316 }
3317
56e93d26
JQ
3318 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3319 case RAM_SAVE_FLAG_MEM_SIZE:
3320 /* Synchronize RAM block list */
3321 total_ram_bytes = addr;
3322 while (!ret && total_ram_bytes) {
3323 RAMBlock *block;
56e93d26
JQ
3324 char id[256];
3325 ram_addr_t length;
3326
3327 len = qemu_get_byte(f);
3328 qemu_get_buffer(f, (uint8_t *)id, len);
3329 id[len] = 0;
3330 length = qemu_get_be64(f);
3331
e3dd7493 3332 block = qemu_ram_block_by_name(id);
b895de50
CLG
3333 if (block && !qemu_ram_is_migratable(block)) {
3334 error_report("block %s should not be migrated !", id);
3335 ret = -EINVAL;
3336 } else if (block) {
e3dd7493
DDAG
3337 if (length != block->used_length) {
3338 Error *local_err = NULL;
56e93d26 3339
fa53a0e5 3340 ret = qemu_ram_resize(block, length,
e3dd7493
DDAG
3341 &local_err);
3342 if (local_err) {
3343 error_report_err(local_err);
56e93d26 3344 }
56e93d26 3345 }
ef08fb38
DDAG
3346 /* For postcopy we need to check hugepage sizes match */
3347 if (postcopy_advised &&
3348 block->page_size != qemu_host_page_size) {
3349 uint64_t remote_page_size = qemu_get_be64(f);
3350 if (remote_page_size != block->page_size) {
3351 error_report("Mismatched RAM page size %s "
3352 "(local) %zd != %" PRId64,
3353 id, block->page_size,
3354 remote_page_size);
3355 ret = -EINVAL;
3356 }
3357 }
e3dd7493
DDAG
3358 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3359 block->idstr);
3360 } else {
56e93d26
JQ
3361 error_report("Unknown ramblock \"%s\", cannot "
3362 "accept migration", id);
3363 ret = -EINVAL;
3364 }
3365
3366 total_ram_bytes -= length;
3367 }
3368 break;
a776aa15 3369
bb890ed5 3370 case RAM_SAVE_FLAG_ZERO:
56e93d26
JQ
3371 ch = qemu_get_byte(f);
3372 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3373 break;
a776aa15 3374
56e93d26 3375 case RAM_SAVE_FLAG_PAGE:
56e93d26
JQ
3376 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3377 break;
56e93d26 3378
a776aa15 3379 case RAM_SAVE_FLAG_COMPRESS_PAGE:
56e93d26
JQ
3380 len = qemu_get_be32(f);
3381 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3382 error_report("Invalid compressed data length: %d", len);
3383 ret = -EINVAL;
3384 break;
3385 }
c1bc6626 3386 decompress_data_with_multi_threads(f, host, len);
56e93d26 3387 break;
a776aa15 3388
56e93d26 3389 case RAM_SAVE_FLAG_XBZRLE:
56e93d26
JQ
3390 if (load_xbzrle(f, addr, host) < 0) {
3391 error_report("Failed to decompress XBZRLE page at "
3392 RAM_ADDR_FMT, addr);
3393 ret = -EINVAL;
3394 break;
3395 }
3396 break;
3397 case RAM_SAVE_FLAG_EOS:
3398 /* normal exit */
3399 break;
3400 default:
3401 if (flags & RAM_SAVE_FLAG_HOOK) {
632e3a5c 3402 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
56e93d26
JQ
3403 } else {
3404 error_report("Unknown combination of migration flags: %#x",
3405 flags);
3406 ret = -EINVAL;
3407 }
3408 }
3409 if (!ret) {
3410 ret = qemu_file_get_error(f);
3411 }
3412 }
3413
34ab9e97 3414 ret |= wait_for_decompress_done();
56e93d26 3415 rcu_read_unlock();
55c4446b 3416 trace_ram_load_complete(ret, seq_iter);
56e93d26
JQ
3417 return ret;
3418}
3419
c6467627
VSO
3420static bool ram_has_postcopy(void *opaque)
3421{
3422 return migrate_postcopy_ram();
3423}
3424
edd090c7
PX
3425/* Sync all the dirty bitmap with destination VM. */
3426static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3427{
3428 RAMBlock *block;
3429 QEMUFile *file = s->to_dst_file;
3430 int ramblock_count = 0;
3431
3432 trace_ram_dirty_bitmap_sync_start();
3433
3434 RAMBLOCK_FOREACH(block) {
3435 qemu_savevm_send_recv_bitmap(file, block->idstr);
3436 trace_ram_dirty_bitmap_request(block->idstr);
3437 ramblock_count++;
3438 }
3439
3440 trace_ram_dirty_bitmap_sync_wait();
3441
3442 /* Wait until all the ramblocks' dirty bitmap synced */
3443 while (ramblock_count--) {
3444 qemu_sem_wait(&s->rp_state.rp_sem);
3445 }
3446
3447 trace_ram_dirty_bitmap_sync_complete();
3448
3449 return 0;
3450}
3451
3452static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3453{
3454 qemu_sem_post(&s->rp_state.rp_sem);
3455}
3456
a335debb
PX
3457/*
3458 * Read the received bitmap, revert it as the initial dirty bitmap.
3459 * This is only used when the postcopy migration is paused but wants
3460 * to resume from a middle point.
3461 */
3462int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3463{
3464 int ret = -EINVAL;
3465 QEMUFile *file = s->rp_state.from_dst_file;
3466 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3467 uint64_t local_size = nbits / 8;
3468 uint64_t size, end_mark;
3469
3470 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3471
3472 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3473 error_report("%s: incorrect state %s", __func__,
3474 MigrationStatus_str(s->state));
3475 return -EINVAL;
3476 }
3477
3478 /*
3479 * Note: see comments in ramblock_recv_bitmap_send() on why we
3480 * need the endianness conversion, and the paddings.
3481 */
3482 local_size = ROUND_UP(local_size, 8);
3483
3484 /* Add paddings */
3485 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3486
3487 size = qemu_get_be64(file);
3488
3489 /* The size of the bitmap should match with our ramblock */
3490 if (size != local_size) {
3491 error_report("%s: ramblock '%s' bitmap size mismatch "
3492 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3493 block->idstr, size, local_size);
3494 ret = -EINVAL;
3495 goto out;
3496 }
3497
3498 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3499 end_mark = qemu_get_be64(file);
3500
3501 ret = qemu_file_get_error(file);
3502 if (ret || size != local_size) {
3503 error_report("%s: read bitmap failed for ramblock '%s': %d"
3504 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3505 __func__, block->idstr, ret, local_size, size);
3506 ret = -EIO;
3507 goto out;
3508 }
3509
3510 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3511 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3512 __func__, block->idstr, end_mark);
3513 ret = -EINVAL;
3514 goto out;
3515 }
3516
3517 /*
3518 * Endianness conversion. We are in postcopy (though paused).
3519 * The dirty bitmap won't change. We can directly modify it.
3520 */
3521 bitmap_from_le(block->bmap, le_bitmap, nbits);
3522
3523 /*
3524 * What we received is the "received bitmap". Invert it to get the
3525 * initial dirty bitmap for this ramblock.
3526 */
3527 bitmap_complement(block->bmap, block->bmap, nbits);
3528
3529 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3530
edd090c7
PX
3531 /*
3532 * We succeeded to sync bitmap for current ramblock. If this is
3533 * the last one to sync, we need to notify the main send thread.
3534 */
3535 ram_dirty_bitmap_reload_notify(s);
3536
a335debb
PX
3537 ret = 0;
3538out:
bf269906 3539 g_free(le_bitmap);
a335debb
PX
3540 return ret;
3541}
3542
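/*
 * Worked example (illustrative, 4 KiB target pages): for a RAMBlock with a
 * used_length of 1 GiB, nbits is 262144 and local_size is 262144 / 8 =
 * 32768 bytes (already a multiple of 8). The stream then carries be64(size),
 * the 32768 bitmap bytes in little-endian long order, and a be64 end mark
 * that must equal RAMBLOCK_RECV_BITMAP_ENDING.
 */
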
edd090c7
PX
3543static int ram_resume_prepare(MigrationState *s, void *opaque)
3544{
3545 RAMState *rs = *(RAMState **)opaque;
08614f34 3546 int ret;
edd090c7 3547
08614f34
PX
3548 ret = ram_dirty_bitmap_sync_all(s, rs);
3549 if (ret) {
3550 return ret;
3551 }
3552
3553 ram_state_resume_prepare(rs, s->to_dst_file);
3554
3555 return 0;
edd090c7
PX
3556}
3557
56e93d26 3558static SaveVMHandlers savevm_ram_handlers = {
9907e842 3559 .save_setup = ram_save_setup,
56e93d26 3560 .save_live_iterate = ram_save_iterate,
763c906b 3561 .save_live_complete_postcopy = ram_save_complete,
a3e06c3d 3562 .save_live_complete_precopy = ram_save_complete,
c6467627 3563 .has_postcopy = ram_has_postcopy,
56e93d26
JQ
3564 .save_live_pending = ram_save_pending,
3565 .load_state = ram_load,
f265e0e4
JQ
3566 .save_cleanup = ram_save_cleanup,
3567 .load_setup = ram_load_setup,
3568 .load_cleanup = ram_load_cleanup,
edd090c7 3569 .resume_prepare = ram_resume_prepare,
56e93d26
JQ
3570};
3571
3572void ram_mig_init(void)
3573{
3574 qemu_mutex_init(&XBZRLE.lock);
6f37bb8b 3575 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
56e93d26 3576}