#include <sys/eventfd.h>
#include <linux/userfaultfd.h>
+typedef struct PostcopyBlocktimeContext {
+ /* time when the page fault was initiated, per vCPU */
+ uint32_t *page_fault_vcpu_time;
+ /* page address per vCPU */
+ uintptr_t *vcpu_addr;
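+ /* total time during which all vCPUs were blocked simultaneously */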
+ uint32_t total_blocktime;
+ /* blocktime per vCPU */
+ uint32_t *vcpu_blocktime;
+ /* point in time when last page fault was initiated */
+ uint32_t last_begin;
+ /* number of vCPUs currently suspended */
+ int smp_cpus_down;
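+ /* point in time when blocktime tracking started, in ms */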
+ uint64_t start_time;
+
+ /*
+ * Handler for the exit event, needed to release the
+ * whole blocktime_ctx
+ */
+ Notifier exit_notifier;
+} PostcopyBlocktimeContext;
+
+static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
+{
+ g_free(ctx->page_fault_vcpu_time);
+ g_free(ctx->vcpu_addr);
+ g_free(ctx->vcpu_blocktime);
+ g_free(ctx);
+}
+
+static void migration_exit_cb(Notifier *n, void *data)
+{
+ PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
+ exit_notifier);
+ destroy_blocktime_context(ctx);
+}
+
+static struct PostcopyBlocktimeContext *blocktime_context_new(void)
+{
+ PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
+ ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
+ ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
+ ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
+
+ ctx->exit_notifier.notify = migration_exit_cb;
+ ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ qemu_add_exit_notifier(&ctx->exit_notifier);
+ return ctx;
+}
+
+static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
+{
+ uint32List *list = NULL, *entry = NULL;
+ int i;
+
+ for (i = smp_cpus - 1; i >= 0; i--) {
+ entry = g_new0(uint32List, 1);
+ entry->value = ctx->vcpu_blocktime[i];
+ entry->next = list;
+ list = entry;
+ }
+
+ return list;
+}
+
+/*
+ * Populate MigrationInfo from postcopy's blocktime context.
+ * It does nothing unless the postcopy-blocktime capability was set
+ * (and hence the blocktime context was allocated).
+ *
+ * @info: pointer to MigrationInfo to populate
+ */
+void fill_destination_postcopy_migration_info(MigrationInfo *info)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
+
+ if (!bc) {
+ return;
+ }
+
+ info->has_postcopy_blocktime = true;
+ info->postcopy_blocktime = bc->total_blocktime;
+ info->has_postcopy_vcpu_blocktime = true;
+ info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
+}
+
+static uint32_t get_postcopy_total_blocktime(void)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
+
+ if (!bc) {
+ return 0;
+ }
+
+ return bc->total_blocktime;
+}
/**
* receive_ufd_features: check userfault fd features, to request only supported
}
}
+#ifdef UFFD_FEATURE_THREAD_ID
+ if (migrate_postcopy_blocktime() && mis &&
+ UFFD_FEATURE_THREAD_ID & supported_features) {
+ /* the kernel supports this feature */
+ /* don't create blocktime_ctx if it already exists */
+ if (!mis->blocktime_ctx) {
+ mis->blocktime_ctx = blocktime_context_new();
+ }
+
+ asked_features |= UFFD_FEATURE_THREAD_ID;
+ }
+#endif
+
/*
* request features, even if asked_features is 0, due to
* kernel expects UFFD_API before UFFDIO_REGISTER, per
RAMBlock *rb = qemu_ram_block_by_name(block_name);
size_t pagesize = qemu_ram_pagesize(rb);
- if (qemu_ram_is_shared(rb)) {
- error_report("Postcopy on shared RAM (%s) is not yet supported",
- block_name);
- return 1;
- }
-
if (length % pagesize) {
error_report("Postcopy requires RAM blocks to be a page size multiple,"
" block %s is 0x" RAM_ADDR_FMT " bytes with a "
}
/* We don't support postcopy with shared RAM yet */
- if (qemu_ram_foreach_block(test_ramblock_postcopiable, NULL)) {
+ if (qemu_ram_foreach_migratable_block(test_ramblock_postcopiable, NULL)) {
goto out;
}
* postcopy later; must be called prior to any precopy.
* called from arch_init's similarly named ram_postcopy_incoming_init
*/
-int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
- if (qemu_ram_foreach_block(init_range, NULL)) {
+ if (qemu_ram_foreach_migratable_block(init_range, NULL)) {
return -1;
}
trace_postcopy_ram_incoming_cleanup_entry();
if (mis->have_fault_thread) {
- if (qemu_ram_foreach_block(cleanup_range, mis)) {
+ Error *local_err = NULL;
+
+ if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
+ error_report_err(local_err);
+ return -1;
+ }
+
+ if (qemu_ram_foreach_migratable_block(cleanup_range, mis)) {
return -1;
}
/* Let the fault thread quit */
munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
mis->postcopy_tmp_zero_page = NULL;
}
+ trace_postcopy_ram_incoming_cleanup_blocktime(
+ get_postcopy_total_blocktime());
+
trace_postcopy_ram_incoming_cleanup_exit();
return 0;
}
*/
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
- if (qemu_ram_foreach_block(nhp_range, mis)) {
+ if (qemu_ram_foreach_migratable_block(nhp_range, mis)) {
return -1;
}
/*
* Mark the given area of RAM as requiring notification to unwritten areas
- * Used as a callback on qemu_ram_foreach_block.
+ * Used as a callback on qemu_ram_foreach_migratable_block.
* host_addr: Base of area to mark
* offset: Offset in the whole ram arena
* length: Length of the section
return 0;
}
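+/*
+ * Wake any thread in the shared-RAM client behind pcfd that is blocked
+ * waiting for the host page containing client_addr.
+ */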
+int postcopy_wake_shared(struct PostCopyFD *pcfd,
+ uint64_t client_addr,
+ RAMBlock *rb)
+{
+ size_t pagesize = qemu_ram_pagesize(rb);
+ struct uffdio_range range;
+ int ret;
+ trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
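+ /* wake everything waiting on the host page that contains client_addr */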
+ range.start = client_addr & ~(pagesize - 1);
+ range.len = pagesize;
+ ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
+ if (ret) {
+ error_report("%s: Failed to wake: %zx in %s (%s)",
+ __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
+ strerror(errno));
+ }
+ return ret;
+}
+
/*
* Callback from shared fault handlers to ask for a page,
* the page must be specified by a RAMBlock and an offset in that rb
trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
rb_offset);
- /* TODO: Check bitmap to see if we already have the page */
+ if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
+ trace_postcopy_request_shared_page_present(pcfd->idstr,
+ qemu_ram_get_idstr(rb), rb_offset);
+ return postcopy_wake_shared(pcfd, client_addr, rb);
+ }
if (rb != mis->last_rb) {
mis->last_rb = rb;
migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
return 0;
}
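+/*
+ * Map the faulting thread id reported by userfaultfd to a vCPU index;
+ * returns -1 if the fault did not come from a vCPU thread.
+ */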
+static int get_mem_fault_cpu_index(uint32_t pid)
+{
+ CPUState *cpu_iter;
+
+ CPU_FOREACH(cpu_iter) {
+ if (cpu_iter->thread_id == pid) {
+ trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
+ return cpu_iter->cpu_index;
+ }
+ }
+ trace_get_mem_fault_cpu_index(-1, pid);
+ return -1;
+}
+
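+/*
+ * Returns the time since blocktime tracking started, in ms, truncated
+ * to 32 bits.  Never returns 0, since 0 is used elsewhere to mean
+ * "no fault in progress".
+ */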
+static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
+{
+ int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
+ dc->start_time;
+ return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
+}
+
+/*
+ * This function is called when a page fault occurs.  It tracks the
+ * amount of time a vCPU is blocked by the fault.
+ *
+ * @addr: faulted host virtual address
+ * @ptid: faulted process thread id
+ * @rb: ramblock appropriate to addr
+ */
+static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
+ RAMBlock *rb)
+{
+ int cpu, already_received;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
+ uint32_t low_time_offset;
+
+ if (!dc || ptid == 0) {
+ return;
+ }
+ cpu = get_mem_fault_cpu_index(ptid);
+ if (cpu < 0) {
+ return;
+ }
+
+ low_time_offset = get_low_time_offset(dc);
+ if (dc->vcpu_addr[cpu] == 0) {
+ atomic_inc(&dc->smp_cpus_down);
+ }
+
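+ /* these fields are read concurrently by mark_postcopy_blocktime_end(),
+ * hence the atomic accesses */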
+ atomic_xchg(&dc->last_begin, low_time_offset);
+ atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
+ atomic_xchg(&dc->vcpu_addr[cpu], addr);
+
+ /* check it here, not at the beginning of the function,
+ * because the check could occur earlier than bitmap_set in
+ * qemu_ufd_copy_ioctl */
+ already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
+ if (already_received) {
+ atomic_xchg(&dc->vcpu_addr[cpu], 0);
+ atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
+ atomic_dec(&dc->smp_cpus_down);
+ }
+ trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
+ cpu, already_received);
+}
+
+/*
+ * This function calculates the blocktime per vCPU and traces it.
+ * It also accumulates the total blocktime, i.e. the time during
+ * which every vCPU was blocked at once.
+ *
+ * Assume we have 3 CPUs
+ *
+ *      S1        E1           S1               E1
+ * -----***********------------xxx***************------------------------> CPU1
+ *
+ *             S2                E2
+ * ------------****************xxx---------------------------------------> CPU2
+ *
+ *                         S3            E3
+ * ------------------------****xxx********-------------------------------> CPU3
+ *
+ * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
+ * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1
+ *         doesn't include CPU3
+ * S3,S1,E2 - this sequence includes all CPUs, so the overlap S1,E2 is
+ *            part of the total blocktime.
+ * S1 - this is where last_begin points
+ * Legend:
+ *              * - blocktime per vCPU
+ *              x - overlapped blocktime (total blocktime)
+ *
+ * @addr: host virtual address
+ */
+static void mark_postcopy_blocktime_end(uintptr_t addr)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
+ int i, affected_cpu = 0;
+ bool vcpu_total_blocktime = false;
+ uint32_t read_vcpu_time, low_time_offset;
+
+ if (!dc) {
+ return;
+ }
+
+ low_time_offset = get_low_time_offset(dc);
+ /* look up the cpu in order to clear it;
+ * this algorithm looks straightforward, but it's not optimal:
+ * a better one would keep a tree or hash where the key is the
+ * address and the value is a list of blocked vCPUs */
+ for (i = 0; i < smp_cpus; i++) {
+ uint32_t vcpu_blocktime = 0;
+
+ read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
+ if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
+ read_vcpu_time == 0) {
+ continue;
+ }
+ atomic_xchg(&dc->vcpu_addr[i], 0);
+ vcpu_blocktime = low_time_offset - read_vcpu_time;
+ affected_cpu += 1;
+ /* we need to know whether this mark_postcopy_blocktime_end() is
+ * for a page that really faulted; the other possible case is a
+ * prefetched page, and in that case we shouldn't be here */
+ if (!vcpu_total_blocktime &&
+ atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
+ vcpu_total_blocktime = true;
+ }
+ /* continue the loop, since one page could affect several vCPUs */
+ dc->vcpu_blocktime[i] += vcpu_blocktime;
+ }
+
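+ /* the vCPUs that were waiting on this page are no longer blocked */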
+ atomic_sub(&dc->smp_cpus_down, affected_cpu);
+ if (vcpu_total_blocktime) {
+ dc->total_blocktime += low_time_offset - atomic_fetch_add(
+ &dc->last_begin, 0);
+ }
+ trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
+ affected_cpu);
+}
+
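+/*
+ * Block the fault thread until postcopy_pause_sem_fault is posted to
+ * indicate that the migration can continue.
+ */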
+static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
+{
+ trace_postcopy_pause_fault_thread();
+
+ qemu_sem_wait(&mis->postcopy_pause_sem_fault);
+
+ trace_postcopy_pause_fault_thread_continued();
+
+ return true;
+}
+
/*
* Handle faults detected by the USERFAULT markings
*/
break;
}
+ if (!mis->to_src_file) {
+ /*
+ * Possibly someone has already told us, via the event fd, that
+ * the return path is broken. We should hold here until the
+ * channel is rebuilt.
+ */
+ if (postcopy_pause_fault_thread(mis)) {
+ mis->last_rb = NULL;
+ /* Continue to read the userfaultfd */
+ } else {
+ error_report("%s: paused but don't allow to continue",
+ __func__);
+ break;
+ }
+ }
+
if (pfd[1].revents) {
uint64_t tmp64 = 0;
rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
qemu_ram_get_idstr(rb),
- rb_offset);
+ rb_offset,
+ msg.arg.pagefault.feat.ptid);
+ mark_postcopy_blocktime_begin(
+ (uintptr_t)(msg.arg.pagefault.address),
+ msg.arg.pagefault.feat.ptid, rb);
+
+retry:
/*
* Send the request to the source - we want to request one
* of our host page sizes (which is >= TPS)
*/
if (rb != mis->last_rb) {
mis->last_rb = rb;
- migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
- rb_offset, qemu_ram_pagesize(rb));
+ ret = migrate_send_rp_req_pages(mis,
+ qemu_ram_get_idstr(rb),
+ rb_offset,
+ qemu_ram_pagesize(rb));
} else {
/* Save some space */
- migrate_send_rp_req_pages(mis, NULL,
- rb_offset, qemu_ram_pagesize(rb));
+ ret = migrate_send_rp_req_pages(mis,
+ NULL,
+ rb_offset,
+ qemu_ram_pagesize(rb));
+ }
+
+ if (ret) {
+ /* May be network failure, try to wait for recovery */
+ if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
+ /* We got reconnected somehow, try to continue */
+ mis->last_rb = NULL;
+ goto retry;
+ } else {
+ /* This is an unavoidable fault */
+ error_report("%s: migrate_send_rp_req_pages() get %d",
+ __func__, ret);
+ break;
+ }
}
}
}
}
trace_postcopy_ram_fault_thread_exit();
+ g_free(pfd);
return NULL;
}
mis->have_fault_thread = true;
/* Mark so that we get notified of accesses to unwritten areas */
- if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
+ if (qemu_ram_foreach_migratable_block(ram_block_enable_notify, mis)) {
return -1;
}
if (!ret) {
ramblock_recv_bitmap_set_range(rb, host_addr,
pagesize / qemu_target_page_size());
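+ /* the page is in place now; stop accounting blocktime for any
+ * vCPUs that were waiting on it */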
+ mark_postcopy_blocktime_end((uintptr_t)host_addr);
+
}
return ret;
}
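+/*
+ * Ask every registered shared-RAM handler to wake whatever it has
+ * waiting on the page at the given offset within rb.
+ */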
+int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
+{
+ int i;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ GArray *pcrfds = mis->postcopy_remote_fds;
+
+ for (i = 0; i < pcrfds->len; i++) {
+ struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
+ int ret = cur->waker(cur, rb, offset);
+ if (ret) {
+ return ret;
+ }
+ }
+ return 0;
+}
+
/*
* Place a host page (from) at (host) atomically
* returns 0 on success
}
trace_postcopy_place_page(host);
- return 0;
+ return postcopy_notify_shared_wake(rb,
+ qemu_ram_block_host_offset(rb, host));
}
/*
return -e;
}
+ return postcopy_notify_shared_wake(rb,
+ qemu_ram_block_host_offset(rb,
+ host));
} else {
/* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
if (!mis->postcopy_tmp_zero_page) {
return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
rb);
}
-
- return 0;
}
/*
#else
/* No target OS support, stubs just fail */
+void fill_destination_postcopy_migration_info(MigrationInfo *info)
+{
+}
+
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
error_report("%s: No OS support", __func__);
return false;
}
-int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
error_report("postcopy_ram_incoming_init: No OS support");
return -1;
return NULL;
}
+int postcopy_wake_shared(struct PostCopyFD *pcfd,
+ uint64_t client_addr,
+ RAMBlock *rb)
+{
+ assert(0);
+ return -1;
+}
#endif
/* ------------------------------------------------------------------------- */