4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
29 #include "monitor/monitor.h"
30 #include "qapi/qmp/qerror.h"
31 #include "qemu/error-report.h"
32 #include "sysemu/sysemu.h"
33 #include "sysemu/block-backend.h"
34 #include "exec/gdbstub.h"
35 #include "sysemu/dma.h"
36 #include "sysemu/kvm.h"
37 #include "qmp-commands.h"
38 #include "exec/exec-all.h"
40 #include "qemu/thread.h"
41 #include "sysemu/cpus.h"
42 #include "sysemu/qtest.h"
43 #include "qemu/main-loop.h"
44 #include "qemu/bitmap.h"
45 #include "qemu/seqlock.h"
46 #include "qapi-event.h"
48 #include "sysemu/replay.h"
51 #include "qemu/compatfd.h"
56 #include <sys/prctl.h>
59 #define PR_MCE_KILL 33
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
70 #endif /* CONFIG_LINUX */
75 /* vcpu throttling controls */
76 static QEMUTimer *throttle_timer;
77 static unsigned int throttle_percentage;
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
83 bool cpu_is_stopped(CPUState *cpu)
85 return cpu->stopped || !runstate_is_running();
88 static bool cpu_thread_is_idle(CPUState *cpu)
90 if (cpu->stop || cpu->queued_work_first) {
93 if (cpu_is_stopped(cpu)) {
96 if (!cpu->halted || cpu_has_work(cpu) ||
97 kvm_halt_in_kernel()) {
103 static bool all_cpu_threads_idle(void)
108 if (!cpu_thread_is_idle(cpu)) {
115 /***********************************************************/
116 /* guest cycle counter */
118 /* Protected by TimersState seqlock */
120 static bool icount_sleep = true;
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks. */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
131 typedef struct TimersState {
132 /* Protected by BQL. */
133 int64_t cpu_ticks_prev;
134 int64_t cpu_ticks_offset;
136 /* cpu_clock_offset can be read out of BQL, so protect it with
139 QemuSeqLock vm_clock_seqlock;
140 int64_t cpu_clock_offset;
141 int32_t cpu_ticks_enabled;
144 /* Compensate for varying guest execution speed. */
145 int64_t qemu_icount_bias;
146 /* Only written by TCG thread */
150 static TimersState timers_state;
152 int64_t cpu_get_icount_raw(void)
155 CPUState *cpu = current_cpu;
157 icount = timers_state.qemu_icount;
159 if (!cpu->can_do_io) {
160 fprintf(stderr, "Bad icount read\n");
163 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
168 /* Return the virtual CPU time, based on the instruction counter. */
169 static int64_t cpu_get_icount_locked(void)
171 int64_t icount = cpu_get_icount_raw();
172 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
175 int64_t cpu_get_icount(void)
181 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
182 icount = cpu_get_icount_locked();
183 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
188 int64_t cpu_icount_to_ns(int64_t icount)
190 return icount << icount_time_shift;
193 /* return the time elapsed in VM between vm_start and vm_stop. Unless
194 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
197 * Caller must hold the BQL
199 int64_t cpu_get_ticks(void)
204 return cpu_get_icount();
207 ticks = timers_state.cpu_ticks_offset;
208 if (timers_state.cpu_ticks_enabled) {
209 ticks += cpu_get_host_ticks();
212 if (timers_state.cpu_ticks_prev > ticks) {
213 /* Note: non increasing ticks may happen if the host uses
215 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
216 ticks = timers_state.cpu_ticks_prev;
219 timers_state.cpu_ticks_prev = ticks;
223 static int64_t cpu_get_clock_locked(void)
227 time = timers_state.cpu_clock_offset;
228 if (timers_state.cpu_ticks_enabled) {
235 /* Return the monotonic time elapsed in VM, i.e.,
236 * the time between vm_start and vm_stop
238 int64_t cpu_get_clock(void)
244 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
245 ti = cpu_get_clock_locked();
246 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
251 /* enable cpu_get_ticks()
252 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
254 void cpu_enable_ticks(void)
256 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
257 seqlock_write_begin(&timers_state.vm_clock_seqlock);
258 if (!timers_state.cpu_ticks_enabled) {
259 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
260 timers_state.cpu_clock_offset -= get_clock();
261 timers_state.cpu_ticks_enabled = 1;
263 seqlock_write_end(&timers_state.vm_clock_seqlock);
266 /* disable cpu_get_ticks() : the clock is stopped. You must not call
267 * cpu_get_ticks() after that.
268 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
270 void cpu_disable_ticks(void)
272 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
273 seqlock_write_begin(&timers_state.vm_clock_seqlock);
274 if (timers_state.cpu_ticks_enabled) {
275 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
276 timers_state.cpu_clock_offset = cpu_get_clock_locked();
277 timers_state.cpu_ticks_enabled = 0;
279 seqlock_write_end(&timers_state.vm_clock_seqlock);
282 /* Correlation between real and virtual time is always going to be
283 fairly approximate, so ignore small variation.
284 When the guest is idle real and virtual time will be aligned in
286 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
288 static void icount_adjust(void)
294 /* Protected by TimersState mutex. */
295 static int64_t last_delta;
297 /* If the VM is not running, then do nothing. */
298 if (!runstate_is_running()) {
302 seqlock_write_begin(&timers_state.vm_clock_seqlock);
303 cur_time = cpu_get_clock_locked();
304 cur_icount = cpu_get_icount_locked();
306 delta = cur_icount - cur_time;
307 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
309 && last_delta + ICOUNT_WOBBLE < delta * 2
310 && icount_time_shift > 0) {
311 /* The guest is getting too far ahead. Slow time down. */
315 && last_delta - ICOUNT_WOBBLE > delta * 2
316 && icount_time_shift < MAX_ICOUNT_SHIFT) {
317 /* The guest is getting too far behind. Speed time up. */
321 timers_state.qemu_icount_bias = cur_icount
322 - (timers_state.qemu_icount << icount_time_shift);
323 seqlock_write_end(&timers_state.vm_clock_seqlock);
326 static void icount_adjust_rt(void *opaque)
328 timer_mod(icount_rt_timer,
329 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
333 static void icount_adjust_vm(void *opaque)
335 timer_mod(icount_vm_timer,
336 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
337 NANOSECONDS_PER_SECOND / 10);
341 static int64_t qemu_icount_round(int64_t count)
343 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
346 static void icount_warp_rt(void)
351 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
352 * changes from -1 to another value, so the race here is okay.
355 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
356 warp_start = vm_clock_warp_start;
357 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
359 if (warp_start == -1) {
363 seqlock_write_begin(&timers_state.vm_clock_seqlock);
364 if (runstate_is_running()) {
365 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
366 cpu_get_clock_locked());
369 warp_delta = clock - vm_clock_warp_start;
370 if (use_icount == 2) {
372 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
373 * far ahead of real time.
375 int64_t cur_icount = cpu_get_icount_locked();
376 int64_t delta = clock - cur_icount;
377 warp_delta = MIN(warp_delta, delta);
379 timers_state.qemu_icount_bias += warp_delta;
381 vm_clock_warp_start = -1;
382 seqlock_write_end(&timers_state.vm_clock_seqlock);
384 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
385 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
389 static void icount_timer_cb(void *opaque)
391 /* No need for a checkpoint because the timer already synchronizes
392 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
397 void qtest_clock_warp(int64_t dest)
399 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
400 AioContext *aio_context;
401 assert(qtest_enabled());
402 aio_context = qemu_get_aio_context();
403 while (clock < dest) {
404 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
405 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
407 seqlock_write_begin(&timers_state.vm_clock_seqlock);
408 timers_state.qemu_icount_bias += warp;
409 seqlock_write_end(&timers_state.vm_clock_seqlock);
411 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
412 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
413 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
415 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
418 void qemu_start_warp_timer(void)
427 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
428 * do not fire, so computing the deadline does not make sense.
430 if (!runstate_is_running()) {
434 /* warp clock deterministically in record/replay mode */
435 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
439 if (!all_cpu_threads_idle()) {
443 if (qtest_enabled()) {
444 /* When testing, qtest commands advance icount. */
448 /* We want to use the earliest deadline from ALL vm_clocks */
449 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
450 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
452 static bool notified;
453 if (!icount_sleep && !notified) {
454 error_report("WARNING: icount sleep disabled and no active timers");
462 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
463 * sleep. Otherwise, the CPU might be waiting for a future timer
464 * interrupt to wake it up, but the interrupt never comes because
465 * the vCPU isn't running any insns and thus doesn't advance the
466 * QEMU_CLOCK_VIRTUAL.
470 * We never let VCPUs sleep in no sleep icount mode.
471 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
472 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
473 * It is useful when we want a deterministic execution time,
474 * isolated from host latencies.
476 seqlock_write_begin(&timers_state.vm_clock_seqlock);
477 timers_state.qemu_icount_bias += deadline;
478 seqlock_write_end(&timers_state.vm_clock_seqlock);
479 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
482 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
483 * "real" time, (related to the time left until the next event) has
484 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
485 * This prevents the warps from being visible externally; for example,
486 * you will not be sending network packets continuously instead of
489 seqlock_write_begin(&timers_state.vm_clock_seqlock);
490 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
491 vm_clock_warp_start = clock;
493 seqlock_write_end(&timers_state.vm_clock_seqlock);
494 timer_mod_anticipate(icount_warp_timer, clock + deadline);
496 } else if (deadline == 0) {
497 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
501 static void qemu_account_warp_timer(void)
503 if (!use_icount || !icount_sleep) {
507 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
508 * do not fire, so computing the deadline does not make sense.
510 if (!runstate_is_running()) {
514 /* warp clock deterministically in record/replay mode */
515 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
519 timer_del(icount_warp_timer);
523 static bool icount_state_needed(void *opaque)
529 * This is a subsection for icount migration.
531 static const VMStateDescription icount_vmstate_timers = {
532 .name = "timer/icount",
534 .minimum_version_id = 1,
535 .needed = icount_state_needed,
536 .fields = (VMStateField[]) {
537 VMSTATE_INT64(qemu_icount_bias, TimersState),
538 VMSTATE_INT64(qemu_icount, TimersState),
539 VMSTATE_END_OF_LIST()
543 static const VMStateDescription vmstate_timers = {
546 .minimum_version_id = 1,
547 .fields = (VMStateField[]) {
548 VMSTATE_INT64(cpu_ticks_offset, TimersState),
549 VMSTATE_INT64(dummy, TimersState),
550 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
551 VMSTATE_END_OF_LIST()
553 .subsections = (const VMStateDescription*[]) {
554 &icount_vmstate_timers,
/*
 * Work item scheduled on a vCPU by cpu_throttle_timer_tick() to make
 * that vCPU sleep for part of each timeslice.
 *
 * @cpu:    vCPU whose throttle_thread_scheduled flag is cleared here
 * @opaque: not referenced by the statements shown below
 */
559 static void cpu_throttle_thread(CPUState *cpu, void *opaque)
562 double throttle_ratio;
/* Checks whether a non-zero throttle percentage is still set. */
565 if (!cpu_throttle_get_percentage()) {
/* pct is the throttle percentage expressed as a fraction. */
569 pct = (double)cpu_throttle_get_percentage()/100;
/* Sleep time relative to run time: pct / (1 - pct). */
570 throttle_ratio = pct / (1 - pct);
571 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
/* The iothread lock is released for the duration of the sleep. */
573 qemu_mutex_unlock_iothread();
/* Clear the flag so the timer tick can schedule this work again. */
574 atomic_set(&cpu->throttle_thread_scheduled, 0);
575 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
576 qemu_mutex_lock_iothread();
/*
 * Callback of throttle_timer (registered in cpu_ticks_init()).
 * Schedules cpu_throttle_thread() on vCPUs and re-arms the timer.
 */
579 static void cpu_throttle_timer_tick(void *opaque)
584 /* Stop the timer if needed */
585 if (!cpu_throttle_get_percentage()) {
/*
 * atomic_xchg() returns the previous flag value, so the work item is
 * queued only if one is not already scheduled for this vCPU.
 */
589 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
590 async_run_on_cpu(cpu, cpu_throttle_thread, NULL);
/* Next tick: now + CPU_THROTTLE_TIMESLICE_NS / (1 - pct). */
594 pct = (double)cpu_throttle_get_percentage()/100;
595 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
596 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
599 void cpu_throttle_set(int new_throttle_pct)
601 /* Ensure throttle percentage is within valid range */
602 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
603 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
605 atomic_set(&throttle_percentage, new_throttle_pct);
607 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
608 CPU_THROTTLE_TIMESLICE_NS);
611 void cpu_throttle_stop(void)
613 atomic_set(&throttle_percentage, 0);
/* Return true when a non-zero throttle percentage is currently set. */
bool cpu_throttle_active(void)
{
    const int current_pct = cpu_throttle_get_percentage();

    return current_pct != 0;
}
621 int cpu_throttle_get_percentage(void)
623 return atomic_read(&throttle_percentage);
/*
 * Set up the timer state owned by this file: initialise the seqlock in
 * timers_state, register timers_state for VM state save/load using
 * vmstate_timers, and create the vCPU throttle timer.
 */
626 void cpu_ticks_init(void)
628 seqlock_init(&timers_state.vm_clock_seqlock);
629 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
/* The timer is only created here; cpu_throttle_set() arms it. */
630 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
631 cpu_throttle_timer_tick, NULL);
634 void configure_icount(QemuOpts *opts, Error **errp)
637 char *rem_str = NULL;
639 option = qemu_opt_get(opts, "shift");
641 if (qemu_opt_get(opts, "align") != NULL) {
642 error_setg(errp, "Please specify shift option when using align");
647 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
649 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
650 icount_timer_cb, NULL);
653 icount_align_option = qemu_opt_get_bool(opts, "align", false);
655 if (icount_align_option && !icount_sleep) {
656 error_setg(errp, "align=on and sleep=off are incompatible");
658 if (strcmp(option, "auto") != 0) {
660 icount_time_shift = strtol(option, &rem_str, 0);
661 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
662 error_setg(errp, "icount: Invalid shift value");
666 } else if (icount_align_option) {
667 error_setg(errp, "shift=auto and align=on are incompatible");
668 } else if (!icount_sleep) {
669 error_setg(errp, "shift=auto and sleep=off are incompatible");
674 /* 125MIPS seems a reasonable initial guess at the guest speed.
675 It will be corrected fairly quickly anyway. */
676 icount_time_shift = 3;
678 /* Have both realtime and virtual time triggers for speed adjustment.
679 The realtime trigger catches emulated time passing too slowly,
680 the virtual time trigger catches emulated time passing too fast.
681 Realtime triggers occur even when idle, so use them less frequently
683 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
684 icount_adjust_rt, NULL);
685 timer_mod(icount_rt_timer,
686 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
687 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
688 icount_adjust_vm, NULL);
689 timer_mod(icount_vm_timer,
690 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
691 NANOSECONDS_PER_SECOND / 10);
694 /***********************************************************/
695 void hw_error(const char *fmt, ...)
701 fprintf(stderr, "qemu: hardware error: ");
702 vfprintf(stderr, fmt, ap);
703 fprintf(stderr, "\n");
705 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
706 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
712 void cpu_synchronize_all_states(void)
717 cpu_synchronize_state(cpu);
721 void cpu_synchronize_all_post_reset(void)
726 cpu_synchronize_post_reset(cpu);
730 void cpu_synchronize_all_post_init(void)
735 cpu_synchronize_post_init(cpu);
739 static int do_vm_stop(RunState state)
743 if (runstate_is_running()) {
747 vm_state_notify(0, state);
748 qapi_event_send_stop(&error_abort);
752 replay_disable_events();
753 ret = bdrv_flush_all();
758 static bool cpu_can_run(CPUState *cpu)
763 if (cpu_is_stopped(cpu)) {
769 static void cpu_handle_guest_debug(CPUState *cpu)
771 gdb_set_stop_cpu(cpu);
772 qemu_system_debug_request();
777 static void sigbus_reraise(void)
780 struct sigaction action;
782 memset(&action, 0, sizeof(action));
783 action.sa_handler = SIG_DFL;
784 if (!sigaction(SIGBUS, &action, NULL)) {
787 sigaddset(&set, SIGBUS);
788 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
790 perror("Failed to re-raise SIGBUS!\n");
794 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
797 if (kvm_on_sigbus(siginfo->ssi_code,
798 (void *)(intptr_t)siginfo->ssi_addr)) {
803 static void qemu_init_sigbus(void)
805 struct sigaction action;
807 memset(&action, 0, sizeof(action));
808 action.sa_flags = SA_SIGINFO;
809 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
810 sigaction(SIGBUS, &action, NULL);
812 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
815 static void qemu_kvm_eat_signals(CPUState *cpu)
817 struct timespec ts = { 0, 0 };
823 sigemptyset(&waitset);
824 sigaddset(&waitset, SIG_IPI);
825 sigaddset(&waitset, SIGBUS);
828 r = sigtimedwait(&waitset, &siginfo, &ts);
829 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
830 perror("sigtimedwait");
836 if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
844 r = sigpending(&chkset);
846 perror("sigpending");
849 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
852 #else /* !CONFIG_LINUX */
854 static void qemu_init_sigbus(void)
858 static void qemu_kvm_eat_signals(CPUState *cpu)
861 #endif /* !CONFIG_LINUX */
864 static void dummy_signal(int sig)
868 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
872 struct sigaction sigact;
874 memset(&sigact, 0, sizeof(sigact));
875 sigact.sa_handler = dummy_signal;
876 sigaction(SIG_IPI, &sigact, NULL);
878 pthread_sigmask(SIG_BLOCK, NULL, &set);
879 sigdelset(&set, SIG_IPI);
880 sigdelset(&set, SIGBUS);
881 r = kvm_set_signal_mask(cpu, &set);
883 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
889 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
895 static QemuMutex qemu_global_mutex;
896 static QemuCond qemu_io_proceeded_cond;
897 static unsigned iothread_requesting_mutex;
899 static QemuThread io_thread;
902 static QemuCond qemu_cpu_cond;
904 static QemuCond qemu_pause_cond;
906 void qemu_init_cpu_loop(void)
909 qemu_cond_init(&qemu_cpu_cond);
910 qemu_cond_init(&qemu_pause_cond);
911 qemu_cond_init(&qemu_io_proceeded_cond);
912 qemu_mutex_init(&qemu_global_mutex);
914 qemu_thread_get_self(&io_thread);
917 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
919 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
922 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
924 if (kvm_destroy_vcpu(cpu) < 0) {
925 error_report("kvm_destroy_vcpu failed");
930 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
934 static void qemu_wait_io_event_common(CPUState *cpu)
939 qemu_cond_broadcast(&qemu_pause_cond);
941 process_queued_cpu_work(cpu);
942 cpu->thread_kicked = false;
945 static void qemu_tcg_wait_io_event(CPUState *cpu)
947 while (all_cpu_threads_idle()) {
948 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
951 while (iothread_requesting_mutex) {
952 qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
956 qemu_wait_io_event_common(cpu);
960 static void qemu_kvm_wait_io_event(CPUState *cpu)
962 while (cpu_thread_is_idle(cpu)) {
963 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
966 qemu_kvm_eat_signals(cpu);
967 qemu_wait_io_event_common(cpu);
970 static void *qemu_kvm_cpu_thread_fn(void *arg)
975 rcu_register_thread();
977 qemu_mutex_lock_iothread();
978 qemu_thread_get_self(cpu->thread);
979 cpu->thread_id = qemu_get_thread_id();
983 r = kvm_init_vcpu(cpu);
985 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
989 qemu_kvm_init_cpu_signals(cpu);
991 /* signal CPU creation */
993 qemu_cond_signal(&qemu_cpu_cond);
996 if (cpu_can_run(cpu)) {
997 r = kvm_cpu_exec(cpu);
998 if (r == EXCP_DEBUG) {
999 cpu_handle_guest_debug(cpu);
1002 qemu_kvm_wait_io_event(cpu);
1003 } while (!cpu->unplug || cpu_can_run(cpu));
1005 qemu_kvm_destroy_vcpu(cpu);
1006 cpu->created = false;
1007 qemu_cond_signal(&qemu_cpu_cond);
1008 qemu_mutex_unlock_iothread();
1012 static void *qemu_dummy_cpu_thread_fn(void *arg)
1015 fprintf(stderr, "qtest is not supported under Windows\n");
1018 CPUState *cpu = arg;
1022 rcu_register_thread();
1024 qemu_mutex_lock_iothread();
1025 qemu_thread_get_self(cpu->thread);
1026 cpu->thread_id = qemu_get_thread_id();
1029 sigemptyset(&waitset);
1030 sigaddset(&waitset, SIG_IPI);
1032 /* signal CPU creation */
1033 cpu->created = true;
1034 qemu_cond_signal(&qemu_cpu_cond);
1039 qemu_mutex_unlock_iothread();
1042 r = sigwait(&waitset, &sig);
1043 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1048 qemu_mutex_lock_iothread();
1050 qemu_wait_io_event_common(cpu);
1057 static int64_t tcg_get_icount_limit(void)
1061 if (replay_mode != REPLAY_MODE_PLAY) {
1062 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1064 /* Maintain prior (possibly buggy) behaviour where if no deadline
1065 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1066 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1069 if ((deadline < 0) || (deadline > INT32_MAX)) {
1070 deadline = INT32_MAX;
1073 return qemu_icount_round(deadline);
1075 return replay_get_instructions();
1079 static void handle_icount_deadline(void)
1083 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1085 if (deadline == 0) {
1086 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1091 static int tcg_cpu_exec(CPUState *cpu)
1094 #ifdef CONFIG_PROFILER
1098 #ifdef CONFIG_PROFILER
1099 ti = profile_getclock();
1104 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1105 + cpu->icount_extra);
1106 cpu->icount_decr.u16.low = 0;
1107 cpu->icount_extra = 0;
1108 count = tcg_get_icount_limit();
1109 timers_state.qemu_icount += count;
1110 decr = (count > 0xffff) ? 0xffff : count;
1112 cpu->icount_decr.u16.low = decr;
1113 cpu->icount_extra = count;
1115 cpu_exec_start(cpu);
1116 ret = cpu_exec(cpu);
1118 #ifdef CONFIG_PROFILER
1119 tcg_time += profile_getclock() - ti;
1122 /* Fold pending instructions back into the
1123 instruction counter, and clear the interrupt flag. */
1124 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1125 + cpu->icount_extra);
1126 cpu->icount_decr.u32 = 0;
1127 cpu->icount_extra = 0;
1128 replay_account_executed_instructions();
1133 /* Destroy any remaining vCPUs which have been unplugged and have
1136 static void deal_with_unplugged_cpus(void)
1141 if (cpu->unplug && !cpu_can_run(cpu)) {
1142 qemu_tcg_destroy_vcpu(cpu);
1143 cpu->created = false;
1144 qemu_cond_signal(&qemu_cpu_cond);
1150 static void *qemu_tcg_cpu_thread_fn(void *arg)
1152 CPUState *cpu = arg;
1154 rcu_register_thread();
1156 qemu_mutex_lock_iothread();
1157 qemu_thread_get_self(cpu->thread);
1160 cpu->thread_id = qemu_get_thread_id();
1161 cpu->created = true;
1164 qemu_cond_signal(&qemu_cpu_cond);
1166 /* wait for initial kick-off after machine start */
1167 while (first_cpu->stopped) {
1168 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1170 /* process any pending work */
1172 qemu_wait_io_event_common(cpu);
1176 /* process any pending work */
1177 atomic_mb_set(&exit_request, 1);
1182 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1183 qemu_account_warp_timer();
1189 for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {
1191 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1192 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1194 if (cpu_can_run(cpu)) {
1196 r = tcg_cpu_exec(cpu);
1197 if (r == EXCP_DEBUG) {
1198 cpu_handle_guest_debug(cpu);
1201 } else if (cpu->stop || cpu->stopped) {
1203 cpu = CPU_NEXT(cpu);
1210 /* Pairs with smp_wmb in qemu_cpu_kick. */
1211 atomic_mb_set(&exit_request, 0);
1213 handle_icount_deadline();
1215 qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
1216 deal_with_unplugged_cpus();
1222 static void qemu_cpu_kick_thread(CPUState *cpu)
1227 if (cpu->thread_kicked) {
1230 cpu->thread_kicked = true;
1231 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1233 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1241 static void qemu_cpu_kick_no_halt(void)
1244 /* Ensure whatever caused the exit has reached the CPU threads before
1245 * writing exit_request.
1247 atomic_mb_set(&exit_request, 1);
1248 cpu = atomic_mb_read(&tcg_current_cpu);
1254 void qemu_cpu_kick(CPUState *cpu)
1256 qemu_cond_broadcast(cpu->halt_cond);
1257 if (tcg_enabled()) {
1258 qemu_cpu_kick_no_halt();
1260 qemu_cpu_kick_thread(cpu);
1264 void qemu_cpu_kick_self(void)
1266 assert(current_cpu);
1267 qemu_cpu_kick_thread(current_cpu);
1270 bool qemu_cpu_is_self(CPUState *cpu)
1272 return qemu_thread_is_self(cpu->thread);
1275 bool qemu_in_vcpu_thread(void)
1277 return current_cpu && qemu_cpu_is_self(current_cpu);
1280 static __thread bool iothread_locked = false;
1282 bool qemu_mutex_iothread_locked(void)
1284 return iothread_locked;
1287 void qemu_mutex_lock_iothread(void)
1289 atomic_inc(&iothread_requesting_mutex);
1290 /* In the simple case there is no need to bump the VCPU thread out of
1291 * TCG code execution.
1293 if (!tcg_enabled() || qemu_in_vcpu_thread() ||
1294 !first_cpu || !first_cpu->created) {
1295 qemu_mutex_lock(&qemu_global_mutex);
1296 atomic_dec(&iothread_requesting_mutex);
1298 if (qemu_mutex_trylock(&qemu_global_mutex)) {
1299 qemu_cpu_kick_no_halt();
1300 qemu_mutex_lock(&qemu_global_mutex);
1302 atomic_dec(&iothread_requesting_mutex);
1303 qemu_cond_broadcast(&qemu_io_proceeded_cond);
1305 iothread_locked = true;
1308 void qemu_mutex_unlock_iothread(void)
1310 iothread_locked = false;
1311 qemu_mutex_unlock(&qemu_global_mutex);
1314 static bool all_vcpus_paused(void)
1319 if (!cpu->stopped) {
1327 void pause_all_vcpus(void)
1331 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1337 if (qemu_in_vcpu_thread()) {
1339 if (!kvm_enabled()) {
1342 cpu->stopped = true;
1348 while (!all_vcpus_paused()) {
1349 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1356 void cpu_resume(CPUState *cpu)
1359 cpu->stopped = false;
1363 void resume_all_vcpus(void)
1367 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1373 void cpu_remove(CPUState *cpu)
1380 void cpu_remove_sync(CPUState *cpu)
1383 while (cpu->created) {
1384 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1388 /* For temporary buffers for forming a name */
1389 #define VCPU_THREAD_NAME_SIZE 16
1391 static void qemu_tcg_init_vcpu(CPUState *cpu)
1393 char thread_name[VCPU_THREAD_NAME_SIZE];
1394 static QemuCond *tcg_halt_cond;
1395 static QemuThread *tcg_cpu_thread;
1397 /* share a single thread for all cpus with TCG */
1398 if (!tcg_cpu_thread) {
1399 cpu->thread = g_malloc0(sizeof(QemuThread));
1400 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1401 qemu_cond_init(cpu->halt_cond);
1402 tcg_halt_cond = cpu->halt_cond;
1403 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1405 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1406 cpu, QEMU_THREAD_JOINABLE);
1408 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1410 while (!cpu->created) {
1411 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1413 tcg_cpu_thread = cpu->thread;
1415 cpu->thread = tcg_cpu_thread;
1416 cpu->halt_cond = tcg_halt_cond;
1420 static void qemu_kvm_start_vcpu(CPUState *cpu)
1422 char thread_name[VCPU_THREAD_NAME_SIZE];
1424 cpu->thread = g_malloc0(sizeof(QemuThread));
1425 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1426 qemu_cond_init(cpu->halt_cond);
1427 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1429 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1430 cpu, QEMU_THREAD_JOINABLE);
1431 while (!cpu->created) {
1432 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1436 static void qemu_dummy_start_vcpu(CPUState *cpu)
1438 char thread_name[VCPU_THREAD_NAME_SIZE];
1440 cpu->thread = g_malloc0(sizeof(QemuThread));
1441 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1442 qemu_cond_init(cpu->halt_cond);
1443 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1445 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1446 QEMU_THREAD_JOINABLE);
1447 while (!cpu->created) {
1448 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1452 void qemu_init_vcpu(CPUState *cpu)
1454 cpu->nr_cores = smp_cores;
1455 cpu->nr_threads = smp_threads;
1456 cpu->stopped = true;
1459 /* If the target cpu hasn't set up any address spaces itself,
1460 * give it the default one.
1462 AddressSpace *as = address_space_init_shareable(cpu->memory,
1465 cpu_address_space_init(cpu, as, 0);
1468 if (kvm_enabled()) {
1469 qemu_kvm_start_vcpu(cpu);
1470 } else if (tcg_enabled()) {
1471 qemu_tcg_init_vcpu(cpu);
1473 qemu_dummy_start_vcpu(cpu);
1477 void cpu_stop_current(void)
1480 current_cpu->stop = false;
1481 current_cpu->stopped = true;
1482 cpu_exit(current_cpu);
1483 qemu_cond_broadcast(&qemu_pause_cond);
1487 int vm_stop(RunState state)
1489 if (qemu_in_vcpu_thread()) {
1490 qemu_system_vmstop_request_prepare();
1491 qemu_system_vmstop_request(state);
1493 * FIXME: should not return to device code in case
1494 * vm_stop() has been requested.
1500 return do_vm_stop(state);
1503 /* does a state transition even if the VM is already stopped,
1504 current state is forgotten forever */
1505 int vm_stop_force_state(RunState state)
1507 if (runstate_is_running()) {
1508 return vm_stop(state);
1510 runstate_set(state);
1513 /* Make sure to return an error if the flush in a previous vm_stop()
1515 return bdrv_flush_all();
1519 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1521 /* XXX: implement xxx_cpu_list for targets that still miss it */
1522 #if defined(cpu_list)
1523 cpu_list(f, cpu_fprintf);
1527 CpuInfoList *qmp_query_cpus(Error **errp)
1529 CpuInfoList *head = NULL, *cur_item = NULL;
1534 #if defined(TARGET_I386)
1535 X86CPU *x86_cpu = X86_CPU(cpu);
1536 CPUX86State *env = &x86_cpu->env;
1537 #elif defined(TARGET_PPC)
1538 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1539 CPUPPCState *env = &ppc_cpu->env;
1540 #elif defined(TARGET_SPARC)
1541 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1542 CPUSPARCState *env = &sparc_cpu->env;
1543 #elif defined(TARGET_MIPS)
1544 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1545 CPUMIPSState *env = &mips_cpu->env;
1546 #elif defined(TARGET_TRICORE)
1547 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1548 CPUTriCoreState *env = &tricore_cpu->env;
1551 cpu_synchronize_state(cpu);
1553 info = g_malloc0(sizeof(*info));
1554 info->value = g_malloc0(sizeof(*info->value));
1555 info->value->CPU = cpu->cpu_index;
1556 info->value->current = (cpu == first_cpu);
1557 info->value->halted = cpu->halted;
1558 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1559 info->value->thread_id = cpu->thread_id;
1560 #if defined(TARGET_I386)
1561 info->value->arch = CPU_INFO_ARCH_X86;
1562 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1563 #elif defined(TARGET_PPC)
1564 info->value->arch = CPU_INFO_ARCH_PPC;
1565 info->value->u.ppc.nip = env->nip;
1566 #elif defined(TARGET_SPARC)
1567 info->value->arch = CPU_INFO_ARCH_SPARC;
1568 info->value->u.q_sparc.pc = env->pc;
1569 info->value->u.q_sparc.npc = env->npc;
1570 #elif defined(TARGET_MIPS)
1571 info->value->arch = CPU_INFO_ARCH_MIPS;
1572 info->value->u.q_mips.PC = env->active_tc.PC;
1573 #elif defined(TARGET_TRICORE)
1574 info->value->arch = CPU_INFO_ARCH_TRICORE;
1575 info->value->u.tricore.PC = env->PC;
1577 info->value->arch = CPU_INFO_ARCH_OTHER;
1580 /* XXX: waiting for the qapi to support GSList */
1582 head = cur_item = info;
1584 cur_item->next = info;
1592 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1593 bool has_cpu, int64_t cpu_index, Error **errp)
1599 int64_t orig_addr = addr, orig_size = size;
1605 cpu = qemu_get_cpu(cpu_index);
1607 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1612 f = fopen(filename, "wb");
1614 error_setg_file_open(errp, errno, filename);
1622 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1623 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1624 " specified", orig_addr, orig_size);
1627 if (fwrite(buf, 1, l, f) != l) {
1628 error_setg(errp, QERR_IO_ERROR);
1639 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1646 f = fopen(filename, "wb");
1648 error_setg_file_open(errp, errno, filename);
1656 cpu_physical_memory_read(addr, buf, l);
1657 if (fwrite(buf, 1, l, f) != l) {
1658 error_setg(errp, QERR_IO_ERROR);
1669 void qmp_inject_nmi(Error **errp)
1671 nmi_monitor_handle(monitor_get_cpu_index(), errp);
1674 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1680 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
1681 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1682 if (icount_align_option) {
1683 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
1684 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
1686 cpu_fprintf(f, "Max guest delay NA\n");
1687 cpu_fprintf(f, "Max guest advance NA\n");