/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"
-
+#include "qemu-common.h"
+#include "qemu/config-file.h"
+#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
+#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
+#include "sysemu/hw_accel.h"
#include "sysemu/kvm.h"
+#include "sysemu/hax.h"
#include "qmp-commands.h"
+#include "exec/exec-all.h"
#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
+#include "tcg.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"
-#ifndef _WIN32
-#include "qemu/compatfd.h"
-#endif
-
#ifdef CONFIG_LINUX
#include <sys/prctl.h>
#endif /* CONFIG_LINUX */
-static CPUState *next_cpu;
int64_t max_delay;
int64_t max_advance;
} TimersState;
static TimersState timers_state;
+bool mttcg_enabled;
+
+/*
+ * We default to false if we know other options have been enabled
+ * which are currently incompatible with MTTCG. Otherwise when each
+ * guest (target) has been updated to support:
+ * - atomic instructions
+ * - memory ordering primitives (barriers)
+ * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
+ *
+ * Once a guest architecture has been converted to the new primitives
+ * there are two remaining limitations to check.
+ *
+ * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
+ * - The host must have a stronger memory order than the guest
+ *
+ * It may be possible in future to support strong guests on weak hosts
+ * but that will require tagging all load/stores in a guest with their
+ * implicit memory order requirements which would likely slow things
+ * down a lot.
+ */
+
+static bool check_tcg_memory_orders_compatible(void)
+{
+#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
+ return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
+#else
+ return false;
+#endif
+}
+
+static bool default_mttcg_enabled(void)
+{
+ QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
+ const char *rr = qemu_opt_get(icount_opts, "rr");
+
+ if (rr || TCG_OVERSIZED_GUEST) {
+ return false;
+ } else {
+#ifdef TARGET_SUPPORTS_MTTCG
+ return check_tcg_memory_orders_compatible();
+#else
+ return false;
+#endif
+ }
+}
+
+void qemu_tcg_configure(QemuOpts *opts, Error **errp)
+{
+ const char *t = qemu_opt_get(opts, "thread");
+ if (t) {
+ if (strcmp(t, "multi") == 0) {
+ if (TCG_OVERSIZED_GUEST) {
+ error_setg(errp, "No MTTCG when guest word size > hosts");
+ } else {
+ if (!check_tcg_memory_orders_compatible()) {
+ error_report("Guest expects a stronger memory ordering "
+ "than the host provides");
+ error_printf("This may cause strange/hard to debug errors");
+ }
+ mttcg_enabled = true;
+ }
+ } else if (strcmp(t, "single") == 0) {
+ mttcg_enabled = false;
+ } else {
+ error_setg(errp, "Invalid 'thread' setting %s", t);
+ }
+ } else {
+ mttcg_enabled = default_mttcg_enabled();
+ }
+}
int64_t cpu_get_icount_raw(void)
{
return icount << icount_time_shift;
}
-/* return the host CPU cycle counter and handle stop/restart */
-/* Caller must hold the BQL */
+/* return the time elapsed in VM between vm_start and vm_stop. Unless
+ * icount is active, cpu_get_ticks() uses units of the host CPU cycle
+ * counter.
+ *
+ * Caller must hold the BQL
+ */
int64_t cpu_get_ticks(void)
{
int64_t ticks;
static int64_t cpu_get_clock_locked(void)
{
- int64_t ticks;
+ int64_t time;
- ticks = timers_state.cpu_clock_offset;
+ time = timers_state.cpu_clock_offset;
if (timers_state.cpu_ticks_enabled) {
- ticks += get_clock();
+ time += get_clock();
}
- return ticks;
+ return time;
}
-/* return the host CPU monotonic timer and handle stop/restart */
+/* Return the monotonic time elapsed in VM, i.e.,
+ * the time between vm_start and vm_stop
+ */
int64_t cpu_get_clock(void)
{
int64_t ti;
}
/* enable cpu_get_ticks()
- * Caller must hold BQL which server as mutex for vm_clock_seqlock.
+ * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
*/
void cpu_enable_ticks(void)
{
/* Here, the really thing protected by seqlock is cpu_clock_offset. */
- seqlock_write_lock(&timers_state.vm_clock_seqlock);
+ seqlock_write_begin(&timers_state.vm_clock_seqlock);
if (!timers_state.cpu_ticks_enabled) {
timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
timers_state.cpu_clock_offset -= get_clock();
timers_state.cpu_ticks_enabled = 1;
}
- seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+ seqlock_write_end(&timers_state.vm_clock_seqlock);
}
/* disable cpu_get_ticks() : the clock is stopped. You must not call
* cpu_get_ticks() after that.
- * Caller must hold BQL which server as mutex for vm_clock_seqlock.
+ * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
*/
void cpu_disable_ticks(void)
{
/* Here, the really thing protected by seqlock is cpu_clock_offset. */
- seqlock_write_lock(&timers_state.vm_clock_seqlock);
+ seqlock_write_begin(&timers_state.vm_clock_seqlock);
if (timers_state.cpu_ticks_enabled) {
timers_state.cpu_ticks_offset += cpu_get_host_ticks();
timers_state.cpu_clock_offset = cpu_get_clock_locked();
timers_state.cpu_ticks_enabled = 0;
}
- seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+ seqlock_write_end(&timers_state.vm_clock_seqlock);
}
/* Correlation between real and virtual time is always going to be
fairly approximate, so ignore small variation.
When the guest is idle real and virtual time will be aligned in
the IO wait loop. */
-#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
+#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
static void icount_adjust(void)
{
return;
}
- seqlock_write_lock(&timers_state.vm_clock_seqlock);
+ seqlock_write_begin(&timers_state.vm_clock_seqlock);
cur_time = cpu_get_clock_locked();
cur_icount = cpu_get_icount_locked();
last_delta = delta;
timers_state.qemu_icount_bias = cur_icount
- (timers_state.qemu_icount << icount_time_shift);
- seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+ seqlock_write_end(&timers_state.vm_clock_seqlock);
}
static void icount_adjust_rt(void *opaque)
{
timer_mod(icount_vm_timer,
qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
- get_ticks_per_sec() / 10);
+ NANOSECONDS_PER_SECOND / 10);
icount_adjust();
}
static void icount_warp_rt(void)
{
+ unsigned seq;
+ int64_t warp_start;
+
/* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
* changes from -1 to another value, so the race here is okay.
*/
- if (atomic_read(&vm_clock_warp_start) == -1) {
+ do {
+ seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
+ warp_start = vm_clock_warp_start;
+ } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
+
+ if (warp_start == -1) {
return;
}
- seqlock_write_lock(&timers_state.vm_clock_seqlock);
+ seqlock_write_begin(&timers_state.vm_clock_seqlock);
if (runstate_is_running()) {
int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
cpu_get_clock_locked());
timers_state.qemu_icount_bias += warp_delta;
}
vm_clock_warp_start = -1;
- seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+ seqlock_write_end(&timers_state.vm_clock_seqlock);
if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
- seqlock_write_lock(&timers_state.vm_clock_seqlock);
+ seqlock_write_begin(&timers_state.vm_clock_seqlock);
timers_state.qemu_icount_bias += warp;
- seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+ seqlock_write_end(&timers_state.vm_clock_seqlock);
qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
* It is useful when we want a deterministic execution time,
* isolated from host latencies.
*/
- seqlock_write_lock(&timers_state.vm_clock_seqlock);
+ seqlock_write_begin(&timers_state.vm_clock_seqlock);
timers_state.qemu_icount_bias += deadline;
- seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+ seqlock_write_end(&timers_state.vm_clock_seqlock);
qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
} else {
/*
* you will not be sending network packets continuously instead of
* every 100ms.
*/
- seqlock_write_lock(&timers_state.vm_clock_seqlock);
+ seqlock_write_begin(&timers_state.vm_clock_seqlock);
if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
vm_clock_warp_start = clock;
}
- seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+ seqlock_write_end(&timers_state.vm_clock_seqlock);
timer_mod_anticipate(icount_warp_timer, clock + deadline);
}
} else if (deadline == 0) {
}
};
-static void cpu_throttle_thread(void *opaque)
+static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
- CPUState *cpu = opaque;
double pct;
double throttle_ratio;
long sleeptime_ns;
}
CPU_FOREACH(cpu) {
if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
- async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
+ async_run_on_cpu(cpu, cpu_throttle_thread,
+ RUN_ON_CPU_NULL);
}
}
void cpu_ticks_init(void)
{
- seqlock_init(&timers_state.vm_clock_seqlock, NULL);
+ seqlock_init(&timers_state.vm_clock_seqlock);
vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
cpu_throttle_timer_tick, NULL);
icount_adjust_vm, NULL);
timer_mod(icount_vm_timer,
qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
- get_ticks_per_sec() / 10);
+ NANOSECONDS_PER_SECOND / 10);
+}
+
+/***********************************************************/
+/* TCG vCPU kick timer
+ *
+ * The kick timer is responsible for moving single threaded vCPU
+ * emulation on to the next vCPU. If more than one vCPU is running a
+ * timer event with force a cpu->exit so the next vCPU can get
+ * scheduled.
+ *
+ * The timer is removed if all vCPUs are idle and restarted again once
+ * idleness is complete.
+ */
+
+static QEMUTimer *tcg_kick_vcpu_timer;
+static CPUState *tcg_current_rr_cpu;
+
+#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
+
+static inline int64_t qemu_tcg_next_kick(void)
+{
+ return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
+}
+
+/* Kick the currently round-robin scheduled vCPU */
+static void qemu_cpu_kick_rr_cpu(void)
+{
+ CPUState *cpu;
+ do {
+ cpu = atomic_mb_read(&tcg_current_rr_cpu);
+ if (cpu) {
+ cpu_exit(cpu);
+ }
+ } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
+}
+
+static void kick_tcg_thread(void *opaque)
+{
+ timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+ qemu_cpu_kick_rr_cpu();
+}
+
+static void start_tcg_kick_timer(void)
+{
+ if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
+ tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+ kick_tcg_thread, NULL);
+ timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
+ }
+}
+
+static void stop_tcg_kick_timer(void)
+{
+ if (tcg_kick_vcpu_timer) {
+ timer_del(tcg_kick_vcpu_timer);
+ tcg_kick_vcpu_timer = NULL;
+ }
}
/***********************************************************/
}
bdrv_drain_all();
+ replay_disable_events();
ret = bdrv_flush_all();
return ret;
raise(SIGBUS);
sigemptyset(&set);
sigaddset(&set, SIGBUS);
- sigprocmask(SIG_UNBLOCK, &set, NULL);
+ pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}
perror("Failed to re-raise SIGBUS!\n");
abort();
}
-static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
- void *ctx)
+static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
{
- if (kvm_on_sigbus(siginfo->ssi_code,
- (void *)(intptr_t)siginfo->ssi_addr)) {
+ if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
sigbus_reraise();
}
+
+ if (current_cpu) {
+ /* Called asynchronously in VCPU thread. */
+ if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
+ sigbus_reraise();
+ }
+ } else {
+ /* Called synchronously (via signalfd) in main thread. */
+ if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
+ sigbus_reraise();
+ }
+ }
}
static void qemu_init_sigbus(void)
memset(&action, 0, sizeof(action));
action.sa_flags = SA_SIGINFO;
- action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+ action.sa_sigaction = sigbus_handler;
sigaction(SIGBUS, &action, NULL);
prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
-
-static void qemu_kvm_eat_signals(CPUState *cpu)
-{
- struct timespec ts = { 0, 0 };
- siginfo_t siginfo;
- sigset_t waitset;
- sigset_t chkset;
- int r;
-
- sigemptyset(&waitset);
- sigaddset(&waitset, SIG_IPI);
- sigaddset(&waitset, SIGBUS);
-
- do {
- r = sigtimedwait(&waitset, &siginfo, &ts);
- if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
- perror("sigtimedwait");
- exit(1);
- }
-
- switch (r) {
- case SIGBUS:
- if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
- sigbus_reraise();
- }
- break;
- default:
- break;
- }
-
- r = sigpending(&chkset);
- if (r == -1) {
- perror("sigpending");
- exit(1);
- }
- } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
-}
-
#else /* !CONFIG_LINUX */
-
static void qemu_init_sigbus(void)
{
}
-
-static void qemu_kvm_eat_signals(CPUState *cpu)
-{
-}
#endif /* !CONFIG_LINUX */
-#ifndef _WIN32
-static void dummy_signal(int sig)
-{
-}
-
-static void qemu_kvm_init_cpu_signals(CPUState *cpu)
-{
- int r;
- sigset_t set;
- struct sigaction sigact;
-
- memset(&sigact, 0, sizeof(sigact));
- sigact.sa_handler = dummy_signal;
- sigaction(SIG_IPI, &sigact, NULL);
-
- pthread_sigmask(SIG_BLOCK, NULL, &set);
- sigdelset(&set, SIG_IPI);
- sigdelset(&set, SIGBUS);
- r = kvm_set_signal_mask(cpu, &set);
- if (r) {
- fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
- exit(1);
- }
-}
-
-#else /* _WIN32 */
-static void qemu_kvm_init_cpu_signals(CPUState *cpu)
-{
- abort();
-}
-#endif /* _WIN32 */
-
static QemuMutex qemu_global_mutex;
-static QemuCond qemu_io_proceeded_cond;
-static unsigned iothread_requesting_mutex;
static QemuThread io_thread;
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
-static QemuCond qemu_work_cond;
void qemu_init_cpu_loop(void)
{
qemu_init_sigbus();
qemu_cond_init(&qemu_cpu_cond);
qemu_cond_init(&qemu_pause_cond);
- qemu_cond_init(&qemu_work_cond);
- qemu_cond_init(&qemu_io_proceeded_cond);
qemu_mutex_init(&qemu_global_mutex);
qemu_thread_get_self(&io_thread);
}
-void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
- struct qemu_work_item wi;
-
- if (qemu_cpu_is_self(cpu)) {
- func(data);
- return;
- }
-
- wi.func = func;
- wi.data = data;
- wi.free = false;
-
- qemu_mutex_lock(&cpu->work_mutex);
- if (cpu->queued_work_first == NULL) {
- cpu->queued_work_first = &wi;
- } else {
- cpu->queued_work_last->next = &wi;
- }
- cpu->queued_work_last = &wi;
- wi.next = NULL;
- wi.done = false;
- qemu_mutex_unlock(&cpu->work_mutex);
-
- qemu_cpu_kick(cpu);
- while (!atomic_mb_read(&wi.done)) {
- CPUState *self_cpu = current_cpu;
-
- qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
- current_cpu = self_cpu;
- }
+ do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
-void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
- struct qemu_work_item *wi;
-
- if (qemu_cpu_is_self(cpu)) {
- func(data);
- return;
+ if (kvm_destroy_vcpu(cpu) < 0) {
+ error_report("kvm_destroy_vcpu failed");
+ exit(EXIT_FAILURE);
}
-
- wi = g_malloc0(sizeof(struct qemu_work_item));
- wi->func = func;
- wi->data = data;
- wi->free = true;
-
- qemu_mutex_lock(&cpu->work_mutex);
- if (cpu->queued_work_first == NULL) {
- cpu->queued_work_first = wi;
- } else {
- cpu->queued_work_last->next = wi;
- }
- cpu->queued_work_last = wi;
- wi->next = NULL;
- wi->done = false;
- qemu_mutex_unlock(&cpu->work_mutex);
-
- qemu_cpu_kick(cpu);
}
-static void flush_queued_work(CPUState *cpu)
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
- struct qemu_work_item *wi;
-
- if (cpu->queued_work_first == NULL) {
- return;
- }
-
- qemu_mutex_lock(&cpu->work_mutex);
- while (cpu->queued_work_first != NULL) {
- wi = cpu->queued_work_first;
- cpu->queued_work_first = wi->next;
- if (!cpu->queued_work_first) {
- cpu->queued_work_last = NULL;
- }
- qemu_mutex_unlock(&cpu->work_mutex);
- wi->func(wi->data);
- qemu_mutex_lock(&cpu->work_mutex);
- if (wi->free) {
- g_free(wi);
- } else {
- atomic_mb_set(&wi->done, true);
- }
- }
- qemu_mutex_unlock(&cpu->work_mutex);
- qemu_cond_broadcast(&qemu_work_cond);
}
static void qemu_wait_io_event_common(CPUState *cpu)
{
+ atomic_mb_set(&cpu->thread_kicked, false);
if (cpu->stop) {
cpu->stop = false;
cpu->stopped = true;
qemu_cond_broadcast(&qemu_pause_cond);
}
- flush_queued_work(cpu);
- cpu->thread_kicked = false;
+ process_queued_cpu_work(cpu);
+}
+
+static bool qemu_tcg_should_sleep(CPUState *cpu)
+{
+ if (mttcg_enabled) {
+ return cpu_thread_is_idle(cpu);
+ } else {
+ return all_cpu_threads_idle();
+ }
}
static void qemu_tcg_wait_io_event(CPUState *cpu)
{
- while (all_cpu_threads_idle()) {
+ while (qemu_tcg_should_sleep(cpu)) {
+ stop_tcg_kick_timer();
qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
}
- while (iothread_requesting_mutex) {
- qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
- }
+ start_tcg_kick_timer();
- CPU_FOREACH(cpu) {
- qemu_wait_io_event_common(cpu);
- }
+ qemu_wait_io_event_common(cpu);
}
static void qemu_kvm_wait_io_event(CPUState *cpu)
qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
}
- qemu_kvm_eat_signals(cpu);
qemu_wait_io_event_common(cpu);
}
exit(1);
}
- qemu_kvm_init_cpu_signals(cpu);
+ kvm_init_cpu_signals(cpu);
/* signal CPU creation */
cpu->created = true;
qemu_cond_signal(&qemu_cpu_cond);
- while (1) {
+ do {
if (cpu_can_run(cpu)) {
r = kvm_cpu_exec(cpu);
if (r == EXCP_DEBUG) {
}
}
qemu_kvm_wait_io_event(cpu);
- }
+ } while (!cpu->unplug || cpu_can_run(cpu));
+ qemu_kvm_destroy_vcpu(cpu);
+ cpu->created = false;
+ qemu_cond_signal(&qemu_cpu_cond);
+ qemu_mutex_unlock_iothread();
return NULL;
}
qemu_thread_get_self(cpu->thread);
cpu->thread_id = qemu_get_thread_id();
cpu->can_do_io = 1;
+ current_cpu = cpu;
sigemptyset(&waitset);
sigaddset(&waitset, SIG_IPI);
cpu->created = true;
qemu_cond_signal(&qemu_cpu_cond);
- current_cpu = cpu;
while (1) {
- current_cpu = NULL;
qemu_mutex_unlock_iothread();
do {
int sig;
exit(1);
}
qemu_mutex_lock_iothread();
- current_cpu = cpu;
qemu_wait_io_event_common(cpu);
}
#endif
}
-static void tcg_exec_all(void);
+static int64_t tcg_get_icount_limit(void)
+{
+ int64_t deadline;
-static void *qemu_tcg_cpu_thread_fn(void *arg)
+ if (replay_mode != REPLAY_MODE_PLAY) {
+ deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+ /* Maintain prior (possibly buggy) behaviour where if no deadline
+ * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
+ * INT32_MAX nanoseconds ahead, we still use INT32_MAX
+ * nanoseconds.
+ */
+ if ((deadline < 0) || (deadline > INT32_MAX)) {
+ deadline = INT32_MAX;
+ }
+
+ return qemu_icount_round(deadline);
+ } else {
+ return replay_get_instructions();
+ }
+}
+
+static void handle_icount_deadline(void)
+{
+ if (use_icount) {
+ int64_t deadline =
+ qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+ if (deadline == 0) {
+ qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+ }
+ }
+}
+
+static int tcg_cpu_exec(CPUState *cpu)
+{
+ int ret;
+#ifdef CONFIG_PROFILER
+ int64_t ti;
+#endif
+
+#ifdef CONFIG_PROFILER
+ ti = profile_getclock();
+#endif
+ if (use_icount) {
+ int64_t count;
+ int decr;
+ timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+ + cpu->icount_extra);
+ cpu->icount_decr.u16.low = 0;
+ cpu->icount_extra = 0;
+ count = tcg_get_icount_limit();
+ timers_state.qemu_icount += count;
+ decr = (count > 0xffff) ? 0xffff : count;
+ count -= decr;
+ cpu->icount_decr.u16.low = decr;
+ cpu->icount_extra = count;
+ }
+ qemu_mutex_unlock_iothread();
+ cpu_exec_start(cpu);
+ ret = cpu_exec(cpu);
+ cpu_exec_end(cpu);
+ qemu_mutex_lock_iothread();
+#ifdef CONFIG_PROFILER
+ tcg_time += profile_getclock() - ti;
+#endif
+ if (use_icount) {
+ /* Fold pending instructions back into the
+ instruction counter, and clear the interrupt flag. */
+ timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+ + cpu->icount_extra);
+ cpu->icount_decr.u32 = 0;
+ cpu->icount_extra = 0;
+ replay_account_executed_instructions();
+ }
+ return ret;
+}
+
+/* Destroy any remaining vCPUs which have been unplugged and have
+ * finished running
+ */
+static void deal_with_unplugged_cpus(void)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ if (cpu->unplug && !cpu_can_run(cpu)) {
+ qemu_tcg_destroy_vcpu(cpu);
+ cpu->created = false;
+ qemu_cond_signal(&qemu_cpu_cond);
+ break;
+ }
+ }
+}
+
+/* Single-threaded TCG
+ *
+ * In the single-threaded case each vCPU is simulated in turn. If
+ * there is more than a single vCPU we create a simple timer to kick
+ * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
+ * This is done explicitly rather than relying on side-effects
+ * elsewhere.
+ */
+
+static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
CPUState *cpu = arg;
/* process any pending work */
CPU_FOREACH(cpu) {
+ current_cpu = cpu;
qemu_wait_io_event_common(cpu);
}
}
+ start_tcg_kick_timer();
+
+ cpu = first_cpu;
+
/* process any pending work */
- atomic_mb_set(&exit_request, 1);
+ cpu->exit_request = 1;
+
+ while (1) {
+ /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
+ qemu_account_warp_timer();
+
+ if (!cpu) {
+ cpu = first_cpu;
+ }
+
+ while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
+
+ atomic_mb_set(&tcg_current_rr_cpu, cpu);
+ current_cpu = cpu;
+
+ qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
+ (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
+
+ if (cpu_can_run(cpu)) {
+ int r;
+ r = tcg_cpu_exec(cpu);
+ if (r == EXCP_DEBUG) {
+ cpu_handle_guest_debug(cpu);
+ break;
+ } else if (r == EXCP_ATOMIC) {
+ qemu_mutex_unlock_iothread();
+ cpu_exec_step_atomic(cpu);
+ qemu_mutex_lock_iothread();
+ break;
+ }
+ } else if (cpu->stop) {
+ if (cpu->unplug) {
+ cpu = CPU_NEXT(cpu);
+ }
+ break;
+ }
+
+ cpu = CPU_NEXT(cpu);
+ } /* while (cpu && !cpu->exit_request).. */
+
+ /* Does not need atomic_mb_set because a spurious wakeup is okay. */
+ atomic_set(&tcg_current_rr_cpu, NULL);
+
+ if (cpu && cpu->exit_request) {
+ atomic_mb_set(&cpu->exit_request, 0);
+ }
+
+ handle_icount_deadline();
+
+ qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
+ deal_with_unplugged_cpus();
+ }
+
+ return NULL;
+}
+
+static void *qemu_hax_cpu_thread_fn(void *arg)
+{
+ CPUState *cpu = arg;
+ int r;
+ qemu_thread_get_self(cpu->thread);
+ qemu_mutex_lock(&qemu_global_mutex);
+
+ cpu->thread_id = qemu_get_thread_id();
+ cpu->created = true;
+ cpu->halted = 0;
+ current_cpu = cpu;
+
+ hax_init_vcpu(cpu);
+ qemu_cond_signal(&qemu_cpu_cond);
while (1) {
- tcg_exec_all();
+ if (cpu_can_run(cpu)) {
+ r = hax_smp_cpu_exec(cpu);
+ if (r == EXCP_DEBUG) {
+ cpu_handle_guest_debug(cpu);
+ }
+ }
+
+ while (cpu_thread_is_idle(cpu)) {
+ qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
+ }
+#ifdef _WIN32
+ SleepEx(0, TRUE);
+#endif
+ qemu_wait_io_event_common(cpu);
+ }
+ return NULL;
+}
+
+#ifdef _WIN32
+static void CALLBACK dummy_apc_func(ULONG_PTR unused)
+{
+}
+#endif
+
+/* Multi-threaded TCG
+ *
+ * In the multi-threaded case each vCPU has its own thread. The TLS
+ * variable current_cpu can be used deep in the code to find the
+ * current CPUState for a given thread.
+ */
+
+static void *qemu_tcg_cpu_thread_fn(void *arg)
+{
+ CPUState *cpu = arg;
- if (use_icount) {
- int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+ rcu_register_thread();
+
+ qemu_mutex_lock_iothread();
+ qemu_thread_get_self(cpu->thread);
- if (deadline == 0) {
- qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+ cpu->thread_id = qemu_get_thread_id();
+ cpu->created = true;
+ cpu->can_do_io = 1;
+ current_cpu = cpu;
+ qemu_cond_signal(&qemu_cpu_cond);
+
+ /* process any pending work */
+ cpu->exit_request = 1;
+
+ while (1) {
+ if (cpu_can_run(cpu)) {
+ int r;
+ r = tcg_cpu_exec(cpu);
+ switch (r) {
+ case EXCP_DEBUG:
+ cpu_handle_guest_debug(cpu);
+ break;
+ case EXCP_HALTED:
+ /* during start-up the vCPU is reset and the thread is
+ * kicked several times. If we don't ensure we go back
+ * to sleep in the halted state we won't cleanly
+ * start-up when the vCPU is enabled.
+ *
+ * cpu->halted should ensure we sleep in wait_io_event
+ */
+ g_assert(cpu->halted);
+ break;
+ case EXCP_ATOMIC:
+ qemu_mutex_unlock_iothread();
+ cpu_exec_step_atomic(cpu);
+ qemu_mutex_lock_iothread();
+ default:
+ /* Ignore everything else? */
+ break;
}
}
- qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+
+ handle_icount_deadline();
+
+ atomic_mb_set(&cpu->exit_request, 0);
+ qemu_tcg_wait_io_event(cpu);
}
return NULL;
exit(1);
}
#else /* _WIN32 */
- abort();
-#endif
-}
-
-static void qemu_cpu_kick_no_halt(void)
-{
- CPUState *cpu;
- /* Ensure whatever caused the exit has reached the CPU threads before
- * writing exit_request.
- */
- atomic_mb_set(&exit_request, 1);
- cpu = atomic_mb_read(&tcg_current_cpu);
- if (cpu) {
- cpu_exit(cpu);
+ if (!qemu_cpu_is_self(cpu)) {
+ if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
+ fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
+ __func__, GetLastError());
+ exit(1);
+ }
}
+#endif
}
void qemu_cpu_kick(CPUState *cpu)
{
qemu_cond_broadcast(cpu->halt_cond);
if (tcg_enabled()) {
- qemu_cpu_kick_no_halt();
+ cpu_exit(cpu);
+ /* NOP unless doing single-thread RR */
+ qemu_cpu_kick_rr_cpu();
} else {
+ if (hax_enabled()) {
+ /*
+ * FIXME: race condition with the exit_request check in
+ * hax_vcpu_hax_exec
+ */
+ cpu->exit_request = 1;
+ }
qemu_cpu_kick_thread(cpu);
}
}
void qemu_mutex_lock_iothread(void)
{
- atomic_inc(&iothread_requesting_mutex);
- /* In the simple case there is no need to bump the VCPU thread out of
- * TCG code execution.
- */
- if (!tcg_enabled() || qemu_in_vcpu_thread() ||
- !first_cpu || !first_cpu->created) {
- qemu_mutex_lock(&qemu_global_mutex);
- atomic_dec(&iothread_requesting_mutex);
- } else {
- if (qemu_mutex_trylock(&qemu_global_mutex)) {
- qemu_cpu_kick_no_halt();
- qemu_mutex_lock(&qemu_global_mutex);
- }
- atomic_dec(&iothread_requesting_mutex);
- qemu_cond_broadcast(&qemu_io_proceeded_cond);
- }
+ g_assert(!qemu_mutex_iothread_locked());
+ qemu_mutex_lock(&qemu_global_mutex);
iothread_locked = true;
}
void qemu_mutex_unlock_iothread(void)
{
+ g_assert(qemu_mutex_iothread_locked());
iothread_locked = false;
qemu_mutex_unlock(&qemu_global_mutex);
}
-static int all_vcpus_paused(void)
+static bool all_vcpus_paused(void)
{
CPUState *cpu;
CPU_FOREACH(cpu) {
if (!cpu->stopped) {
- return 0;
+ return false;
}
}
- return 1;
+ return true;
}
void pause_all_vcpus(void)
if (qemu_in_vcpu_thread()) {
cpu_stop_current();
- if (!kvm_enabled()) {
- CPU_FOREACH(cpu) {
- cpu->stop = false;
- cpu->stopped = true;
- }
- return;
- }
}
while (!all_vcpus_paused()) {
}
}
+void cpu_remove(CPUState *cpu)
+{
+ cpu->stop = true;
+ cpu->unplug = true;
+ qemu_cpu_kick(cpu);
+}
+
+void cpu_remove_sync(CPUState *cpu)
+{
+ cpu_remove(cpu);
+ while (cpu->created) {
+ qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
+ }
+}
+
/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
char thread_name[VCPU_THREAD_NAME_SIZE];
- static QemuCond *tcg_halt_cond;
- static QemuThread *tcg_cpu_thread;
+ static QemuCond *single_tcg_halt_cond;
+ static QemuThread *single_tcg_cpu_thread;
- /* share a single thread for all cpus with TCG */
- if (!tcg_cpu_thread) {
+ if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
cpu->thread = g_malloc0(sizeof(QemuThread));
cpu->halt_cond = g_malloc0(sizeof(QemuCond));
qemu_cond_init(cpu->halt_cond);
- tcg_halt_cond = cpu->halt_cond;
- snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
+
+ if (qemu_tcg_mttcg_enabled()) {
+ /* create a thread per vCPU with TCG (MTTCG) */
+ parallel_cpus = true;
+ snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
cpu->cpu_index);
- qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
- cpu, QEMU_THREAD_JOINABLE);
+
+ qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
+ cpu, QEMU_THREAD_JOINABLE);
+
+ } else {
+ /* share a single thread for all cpus with TCG */
+ snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
+ qemu_thread_create(cpu->thread, thread_name,
+ qemu_tcg_rr_cpu_thread_fn,
+ cpu, QEMU_THREAD_JOINABLE);
+
+ single_tcg_halt_cond = cpu->halt_cond;
+ single_tcg_cpu_thread = cpu->thread;
+ }
#ifdef _WIN32
cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
while (!cpu->created) {
qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
}
- tcg_cpu_thread = cpu->thread;
} else {
- cpu->thread = tcg_cpu_thread;
- cpu->halt_cond = tcg_halt_cond;
+ /* For non-MTTCG cases we share the thread */
+ cpu->thread = single_tcg_cpu_thread;
+ cpu->halt_cond = single_tcg_halt_cond;
+ }
+}
+
+static void qemu_hax_start_vcpu(CPUState *cpu)
+{
+ char thread_name[VCPU_THREAD_NAME_SIZE];
+
+ cpu->thread = g_malloc0(sizeof(QemuThread));
+ cpu->halt_cond = g_malloc0(sizeof(QemuCond));
+ qemu_cond_init(cpu->halt_cond);
+
+ snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
+ cpu->cpu_index);
+ qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
+ cpu, QEMU_THREAD_JOINABLE);
+#ifdef _WIN32
+ cpu->hThread = qemu_thread_get_handle(cpu->thread);
+#endif
+ while (!cpu->created) {
+ qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
}
}
if (kvm_enabled()) {
qemu_kvm_start_vcpu(cpu);
+ } else if (hax_enabled()) {
+ qemu_hax_start_vcpu(cpu);
} else if (tcg_enabled()) {
qemu_tcg_init_vcpu(cpu);
} else {
return do_vm_stop(state);
}
-/* does a state transition even if the VM is already stopped,
- current state is forgotten forever */
-int vm_stop_force_state(RunState state)
+/**
+ * Prepare for (re)starting the VM.
+ * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
+ * running or in case of an error condition), 0 otherwise.
+ */
+int vm_prepare_start(void)
{
- if (runstate_is_running()) {
- return vm_stop(state);
- } else {
- runstate_set(state);
+ RunState requested;
+ int res = 0;
- bdrv_drain_all();
- /* Make sure to return an error if the flush in a previous vm_stop()
- * failed. */
- return bdrv_flush_all();
+ qemu_vmstop_requested(&requested);
+ if (runstate_is_running() && requested == RUN_STATE__MAX) {
+ return -1;
}
-}
-
-static int64_t tcg_get_icount_limit(void)
-{
- int64_t deadline;
-
- if (replay_mode != REPLAY_MODE_PLAY) {
- deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
- /* Maintain prior (possibly buggy) behaviour where if no deadline
- * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
- * INT32_MAX nanoseconds ahead, we still use INT32_MAX
- * nanoseconds.
- */
- if ((deadline < 0) || (deadline > INT32_MAX)) {
- deadline = INT32_MAX;
- }
-
- return qemu_icount_round(deadline);
+ /* Ensure that a STOP/RESUME pair of events is emitted if a
+ * vmstop request was pending. The BLOCK_IO_ERROR event, for
+ * example, according to documentation is always followed by
+ * the STOP event.
+ */
+ if (runstate_is_running()) {
+ qapi_event_send_stop(&error_abort);
+ res = -1;
} else {
- return replay_get_instructions();
+ replay_enable_events();
+ cpu_enable_ticks();
+ runstate_set(RUN_STATE_RUNNING);
+ vm_state_notify(1, RUN_STATE_RUNNING);
}
+
+ /* We are sending this now, but the CPUs will be resumed shortly later */
+ qapi_event_send_resume(&error_abort);
+ return res;
}
-static int tcg_cpu_exec(CPUState *cpu)
+void vm_start(void)
{
- int ret;
-#ifdef CONFIG_PROFILER
- int64_t ti;
-#endif
-
-#ifdef CONFIG_PROFILER
- ti = profile_getclock();
-#endif
- if (use_icount) {
- int64_t count;
- int decr;
- timers_state.qemu_icount -= (cpu->icount_decr.u16.low
- + cpu->icount_extra);
- cpu->icount_decr.u16.low = 0;
- cpu->icount_extra = 0;
- count = tcg_get_icount_limit();
- timers_state.qemu_icount += count;
- decr = (count > 0xffff) ? 0xffff : count;
- count -= decr;
- cpu->icount_decr.u16.low = decr;
- cpu->icount_extra = count;
- }
- ret = cpu_exec(cpu);
-#ifdef CONFIG_PROFILER
- tcg_time += profile_getclock() - ti;
-#endif
- if (use_icount) {
- /* Fold pending instructions back into the
- instruction counter, and clear the interrupt flag. */
- timers_state.qemu_icount -= (cpu->icount_decr.u16.low
- + cpu->icount_extra);
- cpu->icount_decr.u32 = 0;
- cpu->icount_extra = 0;
- replay_account_executed_instructions();
+ if (!vm_prepare_start()) {
+ resume_all_vcpus();
}
- return ret;
}
-static void tcg_exec_all(void)
+/* does a state transition even if the VM is already stopped,
+ current state is forgotten forever */
+int vm_stop_force_state(RunState state)
{
- int r;
-
- /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
- qemu_account_warp_timer();
-
- if (next_cpu == NULL) {
- next_cpu = first_cpu;
- }
- for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
- CPUState *cpu = next_cpu;
-
- qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
- (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
+ if (runstate_is_running()) {
+ return vm_stop(state);
+ } else {
+ runstate_set(state);
- if (cpu_can_run(cpu)) {
- r = tcg_cpu_exec(cpu);
- if (r == EXCP_DEBUG) {
- cpu_handle_guest_debug(cpu);
- break;
- }
- } else if (cpu->stop || cpu->stopped) {
- break;
- }
+ bdrv_drain_all();
+ /* Make sure to return an error if the flush in a previous vm_stop()
+ * failed. */
+ return bdrv_flush_all();
}
-
- /* Pairs with smp_wmb in qemu_cpu_kick. */
- atomic_mb_set(&exit_request, 0);
}
void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
void qmp_inject_nmi(Error **errp)
{
-#if defined(TARGET_I386)
- CPUState *cs;
-
- CPU_FOREACH(cs) {
- X86CPU *cpu = X86_CPU(cs);
-
- if (!cpu->apic_state) {
- cpu_interrupt(cs, CPU_INTERRUPT_NMI);
- } else {
- apic_deliver_nmi(cpu->apic_state);
- }
- }
-#else
nmi_monitor_handle(monitor_get_cpu_index(), errp);
-#endif
}
void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)