4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
47 #include "qemu/thread.h"
48 #include "qemu/plugin.h"
49 #include "sysemu/cpus.h"
50 #include "sysemu/qtest.h"
51 #include "qemu/main-loop.h"
52 #include "qemu/option.h"
53 #include "qemu/bitmap.h"
54 #include "qemu/seqlock.h"
55 #include "qemu/guest-random.h"
58 #include "sysemu/replay.h"
59 #include "sysemu/runstate.h"
60 #include "hw/boards.h"
65 #include <sys/prctl.h>
68 #define PR_MCE_KILL 33
71 #ifndef PR_MCE_KILL_SET
72 #define PR_MCE_KILL_SET 1
75 #ifndef PR_MCE_KILL_EARLY
76 #define PR_MCE_KILL_EARLY 1
79 #endif /* CONFIG_LINUX */
81 static QemuMutex qemu_global_mutex;
86 /* vcpu throttling controls */
87 static QEMUTimer *throttle_timer;
88 static unsigned int throttle_percentage;
90 #define CPU_THROTTLE_PCT_MIN 1
91 #define CPU_THROTTLE_PCT_MAX 99
92 #define CPU_THROTTLE_TIMESLICE_NS 10000000
94 bool cpu_is_stopped(CPUState *cpu)
96 return cpu->stopped || !runstate_is_running();
99 static bool cpu_thread_is_idle(CPUState *cpu)
101 if (cpu->stop || cpu->queued_work_first) {
104 if (cpu_is_stopped(cpu)) {
107 if (!cpu->halted || cpu_has_work(cpu) ||
108 kvm_halt_in_kernel()) {
114 static bool all_cpu_threads_idle(void)
119 if (!cpu_thread_is_idle(cpu)) {
126 /***********************************************************/
127 /* guest cycle counter */
129 /* Protected by TimersState seqlock */
131 static bool icount_sleep = true;
132 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
133 #define MAX_ICOUNT_SHIFT 10
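/*
 * With icount, virtual time advances by (1 << icount_time_shift) ns per
 * executed instruction (see cpu_icount_to_ns() below), so the shift bounds
 * the emulated speed.  For example:
 *   shift = 10  ->  1024 ns/insn  ->  ~0.98 MIPS (the 1MIPS floor above)
 *   shift =  3  ->     8 ns/insn  ->   125  MIPS (the default in configure_icount())
 */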
135 typedef struct TimersState {
136 /* Protected by BQL. */
137 int64_t cpu_ticks_prev;
138 int64_t cpu_ticks_offset;
140 /* Protect fields that can be read outside the
141 * BQL and written from multiple threads.
143 QemuSeqLock vm_clock_seqlock;
144 QemuSpin vm_clock_lock;
146 int16_t cpu_ticks_enabled;
148 /* Conversion factor from emulated instructions to virtual clock ticks. */
149 int16_t icount_time_shift;
151 /* Compensate for varying guest execution speed. */
152 int64_t qemu_icount_bias;
154 int64_t vm_clock_warp_start;
155 int64_t cpu_clock_offset;
157 /* Only written by TCG thread */
158 int64_t qemu_icount;
160 /* for adjusting icount */
161 QEMUTimer *icount_rt_timer;
162 QEMUTimer *icount_vm_timer;
163 QEMUTimer *icount_warp_timer;
166 static TimersState timers_state;
170 /* The current number of executed instructions is based on what we
171 * originally budgeted minus the current state of the decrementing
172 * icount counters in extra/u16.low.
174 static int64_t cpu_get_icount_executed(CPUState *cpu)
176 return (cpu->icount_budget -
177 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
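/*
 * Worked example: with icount_budget = 10000, icount_extra = 4000 not yet
 * handed to the decrementer and icount_decr.u16.low = 100 still pending in
 * the current block, the vCPU has executed 10000 - (100 + 4000) = 5900
 * instructions so far.
 */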
181 * Update the global shared timers_state.qemu_icount to take into
182 * account executed instructions. This is done by the TCG vCPU
183 * thread so the main-loop can see time has moved forward.
185 static void cpu_update_icount_locked(CPUState *cpu)
187 int64_t executed = cpu_get_icount_executed(cpu);
188 cpu->icount_budget -= executed;
190 atomic_set_i64(&timers_state.qemu_icount,
191 timers_state.qemu_icount + executed);
195 * Update the global shared timers_state.qemu_icount to take into
196 * account executed instructions. This is done by the TCG vCPU
197 * thread so the main-loop can see time has moved forward.
199 void cpu_update_icount(CPUState *cpu)
201 seqlock_write_lock(&timers_state.vm_clock_seqlock,
202 &timers_state.vm_clock_lock);
203 cpu_update_icount_locked(cpu);
204 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
205 &timers_state.vm_clock_lock);
208 static int64_t cpu_get_icount_raw_locked(void)
210 CPUState *cpu = current_cpu;
212 if (cpu && cpu->running) {
213 if (!cpu->can_do_io) {
214 error_report("Bad icount read");
217 /* Take into account what has run */
218 cpu_update_icount_locked(cpu);
220 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
221 return atomic_read_i64(&timers_state.qemu_icount);
224 static int64_t cpu_get_icount_locked(void)
226 int64_t icount = cpu_get_icount_raw_locked();
227 return atomic_read_i64(&timers_state.qemu_icount_bias) +
228 cpu_icount_to_ns(icount);
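/*
 * i.e. virtual time in ns is reconstructed as
 *   qemu_icount_bias + (qemu_icount << icount_time_shift)
 * where the bias absorbs clock warps and the speed adjustments made by
 * icount_adjust() below.
 */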
231 int64_t cpu_get_icount_raw(void)
237 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
238 icount = cpu_get_icount_raw_locked();
239 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
244 /* Return the virtual CPU time, based on the instruction counter. */
245 int64_t cpu_get_icount(void)
251 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
252 icount = cpu_get_icount_locked();
253 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
258 int64_t cpu_icount_to_ns(int64_t icount)
260 return icount << atomic_read(&timers_state.icount_time_shift);
263 static int64_t cpu_get_ticks_locked(void)
265 int64_t ticks = timers_state.cpu_ticks_offset;
266 if (timers_state.cpu_ticks_enabled) {
267 ticks += cpu_get_host_ticks();
270 if (timers_state.cpu_ticks_prev > ticks) {
271 /* Non-increasing ticks may happen if the host uses software suspend. */
272 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
273 ticks = timers_state.cpu_ticks_prev;
276 timers_state.cpu_ticks_prev = ticks;
280 /* return the time elapsed in VM between vm_start and vm_stop. Unless
281 * icount is active, cpu_get_ticks() uses units of the host CPU cycle counter.
284 int64_t cpu_get_ticks(void)
289 return cpu_get_icount();
292 qemu_spin_lock(&timers_state.vm_clock_lock);
293 ticks = cpu_get_ticks_locked();
294 qemu_spin_unlock(&timers_state.vm_clock_lock);
298 static int64_t cpu_get_clock_locked(void)
302 time = timers_state.cpu_clock_offset;
303 if (timers_state.cpu_ticks_enabled) {
310 /* Return the monotonic time elapsed in VM, i.e.,
311 * the time between vm_start and vm_stop
313 int64_t cpu_get_clock(void)
319 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
320 ti = cpu_get_clock_locked();
321 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
326 /* enable cpu_get_ticks()
327 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
329 void cpu_enable_ticks(void)
331 seqlock_write_lock(&timers_state.vm_clock_seqlock,
332 &timers_state.vm_clock_lock);
333 if (!timers_state.cpu_ticks_enabled) {
334 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
335 timers_state.cpu_clock_offset -= get_clock();
336 timers_state.cpu_ticks_enabled = 1;
338 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
339 &timers_state.vm_clock_lock);
342 /* disable cpu_get_ticks() : the clock is stopped. You must not call
343 * cpu_get_ticks() after that.
344 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
346 void cpu_disable_ticks(void)
348 seqlock_write_lock(&timers_state.vm_clock_seqlock,
349 &timers_state.vm_clock_lock);
350 if (timers_state.cpu_ticks_enabled) {
351 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
352 timers_state.cpu_clock_offset = cpu_get_clock_locked();
353 timers_state.cpu_ticks_enabled = 0;
355 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
356 &timers_state.vm_clock_lock);
359 /* Correlation between real and virtual time is always going to be
360 fairly approximate, so ignore small variation.
361 When the guest is idle real and virtual time will be aligned in the IO wait loop.
363 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
365 static void icount_adjust(void)
371 /* Protected by TimersState mutex. */
372 static int64_t last_delta;
374 /* If the VM is not running, then do nothing. */
375 if (!runstate_is_running()) {
379 seqlock_write_lock(&timers_state.vm_clock_seqlock,
380 &timers_state.vm_clock_lock);
381 cur_time = cpu_get_clock_locked();
382 cur_icount = cpu_get_icount_locked();
384 delta = cur_icount - cur_time;
385 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
386 if (delta > 0
387 && last_delta + ICOUNT_WOBBLE < delta * 2
388 && timers_state.icount_time_shift > 0) {
389 /* The guest is getting too far ahead. Slow time down. */
390 atomic_set(&timers_state.icount_time_shift,
391 timers_state.icount_time_shift - 1);
393 if (delta < 0
394 && last_delta - ICOUNT_WOBBLE > delta * 2
395 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
396 /* The guest is getting too far behind. Speed time up. */
397 atomic_set(&timers_state.icount_time_shift,
398 timers_state.icount_time_shift + 1);
401 atomic_set_i64(&timers_state.qemu_icount_bias,
402 cur_icount - (timers_state.qemu_icount
403 << timers_state.icount_time_shift));
404 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
405 &timers_state.vm_clock_lock);
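/*
 * Example of a single adjustment step: with ICOUNT_WOBBLE = 100 ms, if the
 * previous period ended with the guest 20 ms ahead (last_delta) and this one
 * ends 70 ms ahead (delta), then last_delta + ICOUNT_WOBBLE = 120 ms is less
 * than delta * 2 = 140 ms, so icount_time_shift is decremented (provided it
 * is still positive) and each instruction now accounts for half as many
 * virtual nanoseconds.
 */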
408 static void icount_adjust_rt(void *opaque)
410 timer_mod(timers_state.icount_rt_timer,
411 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
415 static void icount_adjust_vm(void *opaque)
417 timer_mod(timers_state.icount_vm_timer,
418 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
419 NANOSECONDS_PER_SECOND / 10);
423 static int64_t qemu_icount_round(int64_t count)
425 int shift = atomic_read(&timers_state.icount_time_shift);
426 return (count + (1 << shift) - 1) >> shift;
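/*
 * e.g. with icount_time_shift = 3, a 100 ns deadline rounds up to
 * (100 + 7) >> 3 = 13 instructions of budget.
 */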
429 static void icount_warp_rt(void)
434 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
435 * changes from -1 to another value, so the race here is okay.
438 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
439 warp_start = timers_state.vm_clock_warp_start;
440 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
442 if (warp_start == -1) {
446 seqlock_write_lock(&timers_state.vm_clock_seqlock,
447 &timers_state.vm_clock_lock);
448 if (runstate_is_running()) {
449 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
450 cpu_get_clock_locked());
453 warp_delta = clock - timers_state.vm_clock_warp_start;
454 if (use_icount == 2) {
456 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
457 * far ahead of real time.
459 int64_t cur_icount = cpu_get_icount_locked();
460 int64_t delta = clock - cur_icount;
461 warp_delta = MIN(warp_delta, delta);
463 atomic_set_i64(&timers_state.qemu_icount_bias,
464 timers_state.qemu_icount_bias + warp_delta);
466 timers_state.vm_clock_warp_start = -1;
467 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
468 &timers_state.vm_clock_lock);
470 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
471 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
475 static void icount_timer_cb(void *opaque)
477 /* No need for a checkpoint because the timer already synchronizes
478 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
483 void qtest_clock_warp(int64_t dest)
485 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
486 AioContext *aio_context;
487 assert(qtest_enabled());
488 aio_context = qemu_get_aio_context();
489 while (clock < dest) {
490 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
491 QEMU_TIMER_ATTR_ALL);
492 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
494 seqlock_write_lock(&timers_state.vm_clock_seqlock,
495 &timers_state.vm_clock_lock);
496 atomic_set_i64(&timers_state.qemu_icount_bias,
497 timers_state.qemu_icount_bias + warp);
498 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
499 &timers_state.vm_clock_lock);
501 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
502 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
503 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
505 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
508 void qemu_start_warp_timer(void)
517 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
518 * do not fire, so computing the deadline does not make sense.
520 if (!runstate_is_running()) {
524 if (replay_mode != REPLAY_MODE_PLAY) {
525 if (!all_cpu_threads_idle()) {
529 if (qtest_enabled()) {
530 /* When testing, qtest commands advance icount. */
534 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
536 /* warp clock deterministically in record/replay mode */
537 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
538 /* vCPU is sleeping and warp can't be started.
539 It is probably a race condition: the notification sent
540 to the vCPU was processed in advance and the vCPU went to sleep.
541 Therefore we have to wake it up so it can do something. */
542 if (replay_has_checkpoint()) {
543 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
549 /* We want to use the earliest deadline from ALL vm_clocks */
550 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
551 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
552 ~QEMU_TIMER_ATTR_EXTERNAL);
554 static bool notified;
555 if (!icount_sleep && !notified) {
556 warn_report("icount sleep disabled and no active timers");
564 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
565 * sleep. Otherwise, the CPU might be waiting for a future timer
566 * interrupt to wake it up, but the interrupt never comes because
567 * the vCPU isn't running any insns and thus doesn't advance the
568 * QEMU_CLOCK_VIRTUAL.
572 * We never let vCPUs sleep in no-sleep icount mode.
573 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
574 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
575 * It is useful when we want a deterministic execution time,
576 * isolated from host latencies.
578 seqlock_write_lock(&timers_state.vm_clock_seqlock,
579 &timers_state.vm_clock_lock);
580 atomic_set_i64(&timers_state.qemu_icount_bias,
581 timers_state.qemu_icount_bias + deadline);
582 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
583 &timers_state.vm_clock_lock);
584 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
587 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
588 * "real" time (related to the time left until the next event) has
589 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
590 * This keeps the warps from being visible externally; for example,
591 * you will not be sending network packets continuously instead of every 100ms.
594 seqlock_write_lock(&timers_state.vm_clock_seqlock,
595 &timers_state.vm_clock_lock);
596 if (timers_state.vm_clock_warp_start == -1
597 || timers_state.vm_clock_warp_start > clock) {
598 timers_state.vm_clock_warp_start = clock;
600 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
601 &timers_state.vm_clock_lock);
602 timer_mod_anticipate(timers_state.icount_warp_timer,
605 } else if (deadline == 0) {
606 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
610 static void qemu_account_warp_timer(void)
612 if (!use_icount || !icount_sleep) {
616 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
617 * do not fire, so computing the deadline does not make sense.
619 if (!runstate_is_running()) {
623 /* warp clock deterministically in record/replay mode */
624 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
628 timer_del(timers_state.icount_warp_timer);
632 static bool icount_state_needed(void *opaque)
637 static bool warp_timer_state_needed(void *opaque)
639 TimersState *s = opaque;
640 return s->icount_warp_timer != NULL;
643 static bool adjust_timers_state_needed(void *opaque)
645 TimersState *s = opaque;
646 return s->icount_rt_timer != NULL;
650 * The subsection for warp timer migration is optional, because the timer may not be created
652 static const VMStateDescription icount_vmstate_warp_timer = {
653 .name = "timer/icount/warp_timer",
655 .minimum_version_id = 1,
656 .needed = warp_timer_state_needed,
657 .fields = (VMStateField[]) {
658 VMSTATE_INT64(vm_clock_warp_start, TimersState),
659 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
660 VMSTATE_END_OF_LIST()
664 static const VMStateDescription icount_vmstate_adjust_timers = {
665 .name = "timer/icount/timers",
667 .minimum_version_id = 1,
668 .needed = adjust_timers_state_needed,
669 .fields = (VMStateField[]) {
670 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
671 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
672 VMSTATE_END_OF_LIST()
677 * This is a subsection for icount migration.
679 static const VMStateDescription icount_vmstate_timers = {
680 .name = "timer/icount",
682 .minimum_version_id = 1,
683 .needed = icount_state_needed,
684 .fields = (VMStateField[]) {
685 VMSTATE_INT64(qemu_icount_bias, TimersState),
686 VMSTATE_INT64(qemu_icount, TimersState),
687 VMSTATE_END_OF_LIST()
689 .subsections = (const VMStateDescription*[]) {
690 &icount_vmstate_warp_timer,
691 &icount_vmstate_adjust_timers,
696 static const VMStateDescription vmstate_timers = {
699 .minimum_version_id = 1,
700 .fields = (VMStateField[]) {
701 VMSTATE_INT64(cpu_ticks_offset, TimersState),
703 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
704 VMSTATE_END_OF_LIST()
706 .subsections = (const VMStateDescription*[]) {
707 &icount_vmstate_timers,
712 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
715 double throttle_ratio;
716 int64_t sleeptime_ns, endtime_ns;
718 if (!cpu_throttle_get_percentage()) {
722 pct = (double)cpu_throttle_get_percentage()/100;
723 throttle_ratio = pct / (1 - pct);
724 /* Add 1ns to fix double's rounding error (like 0.9999999...) */
725 sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
726 endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
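/*
 * For example, at 50% throttle pct = 0.5, throttle_ratio = 1 and the vCPU
 * sleeps roughly 10 ms for every 10 ms timeslice it runs; at the 99%
 * maximum, throttle_ratio = 99 and each timeslice is followed by ~990 ms
 * of sleep.
 */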
727 while (sleeptime_ns > 0 && !cpu->stop) {
728 if (sleeptime_ns > SCALE_MS) {
729 qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
730 sleeptime_ns / SCALE_MS);
732 qemu_mutex_unlock_iothread();
733 g_usleep(sleeptime_ns / SCALE_US);
734 qemu_mutex_lock_iothread();
736 sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
738 atomic_set(&cpu->throttle_thread_scheduled, 0);
741 static void cpu_throttle_timer_tick(void *opaque)
746 /* Stop the timer if needed */
747 if (!cpu_throttle_get_percentage()) {
751 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
752 async_run_on_cpu(cpu, cpu_throttle_thread,
757 pct = (double)cpu_throttle_get_percentage()/100;
758 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
759 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
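/*
 * The rearm interval above stretches with the throttle: at 50% the timer
 * re-fires every 10 ms / 0.5 = 20 ms (one timeslice of work plus the sleep
 * computed in cpu_throttle_thread()), while at 99% it re-fires only once
 * per second.
 */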
762 void cpu_throttle_set(int new_throttle_pct)
764 /* Ensure throttle percentage is within valid range */
765 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
766 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
768 atomic_set(&throttle_percentage, new_throttle_pct);
770 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
771 CPU_THROTTLE_TIMESLICE_NS);
774 void cpu_throttle_stop(void)
776 atomic_set(&throttle_percentage, 0);
779 bool cpu_throttle_active(void)
781 return (cpu_throttle_get_percentage() != 0);
784 int cpu_throttle_get_percentage(void)
786 return atomic_read(&throttle_percentage);
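/*
 * Illustrative sketch (not part of this file) of how the throttling helpers
 * above are meant to be combined by a caller such as migration auto-converge:
 *
 *     cpu_throttle_set(50);                     // clamped to [1, 99], timer armed
 *     assert(cpu_throttle_active());            // true while percentage != 0
 *     int pct = cpu_throttle_get_percentage();  // returns 50
 *     cpu_throttle_stop();                      // back to 0, timer lapses
 */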
789 void cpu_ticks_init(void)
791 seqlock_init(&timers_state.vm_clock_seqlock);
792 qemu_spin_init(&timers_state.vm_clock_lock);
793 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
794 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
795 cpu_throttle_timer_tick, NULL);
798 void configure_icount(QemuOpts *opts, Error **errp)
800 const char *option = qemu_opt_get(opts, "shift");
801 bool sleep = qemu_opt_get_bool(opts, "sleep", true);
802 bool align = qemu_opt_get_bool(opts, "align", false);
803 long time_shift = -1;
804 char *rem_str = NULL;
806 if (!option && qemu_opt_get(opts, "align")) {
807 error_setg(errp, "Please specify shift option when using align");
811 if (align && !sleep) {
812 error_setg(errp, "align=on and sleep=off are incompatible");
816 if (strcmp(option, "auto") != 0) {
818 time_shift = strtol(option, &rem_str, 0);
819 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
820 error_setg(errp, "icount: Invalid shift value");
823 } else if (icount_align_option) {
824 error_setg(errp, "shift=auto and align=on are incompatible");
826 } else if (!icount_sleep) {
827 error_setg(errp, "shift=auto and sleep=off are incompatible");
831 icount_sleep = sleep;
833 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
834 icount_timer_cb, NULL);
837 icount_align_option = align;
839 if (time_shift >= 0) {
840 timers_state.icount_time_shift = time_shift;
847 /* 125MIPS seems a reasonable initial guess at the guest speed.
848 It will be corrected fairly quickly anyway. */
849 timers_state.icount_time_shift = 3;
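/* i.e. 2^3 = 8 ns of virtual time per instruction, or 10^9 / 8 = 125 million
 * guest instructions per emulated second. */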
851 /* Have both realtime and virtual time triggers for speed adjustment.
852 The realtime trigger catches emulated time passing too slowly,
853 the virtual time trigger catches emulated time passing too fast.
854 Realtime triggers occur even when idle, so use them less frequently than VM triggers.
856 timers_state.vm_clock_warp_start = -1;
857 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
858 icount_adjust_rt, NULL);
859 timer_mod(timers_state.icount_rt_timer,
860 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
861 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
862 icount_adjust_vm, NULL);
863 timer_mod(timers_state.icount_vm_timer,
864 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
865 NANOSECONDS_PER_SECOND / 10);
868 /***********************************************************/
869 /* TCG vCPU kick timer
871 * The kick timer is responsible for moving single-threaded vCPU
872 * emulation on to the next vCPU. If more than one vCPU is running, a
873 * timer event will force a cpu->exit so the next vCPU can get a chance to run.
876 * The timer is removed when all vCPUs are idle and restarted once
877 * any vCPU becomes runnable again.
880 static QEMUTimer *tcg_kick_vcpu_timer;
881 static CPUState *tcg_current_rr_cpu;
883 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
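/* i.e. in round-robin mode a vCPU that never exits on its own is forced out
 * after at most ~100 ms so the remaining vCPUs can make progress. */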
885 static inline int64_t qemu_tcg_next_kick(void)
887 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
890 /* Kick the currently round-robin scheduled vCPU on to the next one */
891 static void qemu_cpu_kick_rr_next_cpu(void)
895 cpu = atomic_mb_read(&tcg_current_rr_cpu);
899 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
902 /* Kick all RR vCPUs */
903 static void qemu_cpu_kick_rr_cpus(void)
912 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
916 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
918 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
923 if (qemu_in_vcpu_thread()) {
924 /* A CPU is currently running; kick it back out to the
925 * tcg_cpu_exec() loop so it will recalculate its
926 * icount deadline immediately.
928 qemu_cpu_kick(current_cpu);
929 } else if (first_cpu) {
930 /* qemu_cpu_kick is not enough to kick a halted CPU out of
931 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
932 * causes cpu_thread_is_idle to return false. This way,
933 * handle_icount_deadline can run.
934 * If we have no CPUs at all for some reason, we don't
935 * need to do anything.
937 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
941 static void kick_tcg_thread(void *opaque)
943 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
944 qemu_cpu_kick_rr_next_cpu();
947 static void start_tcg_kick_timer(void)
949 assert(!mttcg_enabled);
950 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
951 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
952 kick_tcg_thread, NULL);
954 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
955 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
959 static void stop_tcg_kick_timer(void)
961 assert(!mttcg_enabled);
962 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
963 timer_del(tcg_kick_vcpu_timer);
967 /***********************************************************/
968 void hw_error(const char *fmt, ...)
974 fprintf(stderr, "qemu: hardware error: ");
975 vfprintf(stderr, fmt, ap);
976 fprintf(stderr, "\n");
978 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
979 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
985 void cpu_synchronize_all_states(void)
990 cpu_synchronize_state(cpu);
991 /* TODO: move to cpu_synchronize_state() */
993 hvf_cpu_synchronize_state(cpu);
998 void cpu_synchronize_all_post_reset(void)
1003 cpu_synchronize_post_reset(cpu);
1004 /* TODO: move to cpu_synchronize_post_reset() */
1005 if (hvf_enabled()) {
1006 hvf_cpu_synchronize_post_reset(cpu);
1011 void cpu_synchronize_all_post_init(void)
1016 cpu_synchronize_post_init(cpu);
1017 /* TODO: move to cpu_synchronize_post_init() */
1018 if (hvf_enabled()) {
1019 hvf_cpu_synchronize_post_init(cpu);
1024 void cpu_synchronize_all_pre_loadvm(void)
1029 cpu_synchronize_pre_loadvm(cpu);
1033 static int do_vm_stop(RunState state, bool send_stop)
1037 if (runstate_is_running()) {
1038 runstate_set(state);
1039 cpu_disable_ticks();
1041 vm_state_notify(0, state);
1043 qapi_event_send_stop();
1048 ret = bdrv_flush_all();
1053 /* Special vm_stop() variant for terminating the process. Historically clients
1054 * did not expect a QMP STOP event and so we need to retain compatibility.
1056 int vm_shutdown(void)
1058 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1061 static bool cpu_can_run(CPUState *cpu)
1066 if (cpu_is_stopped(cpu)) {
1072 static void cpu_handle_guest_debug(CPUState *cpu)
1074 gdb_set_stop_cpu(cpu);
1075 qemu_system_debug_request();
1076 cpu->stopped = true;
1080 static void sigbus_reraise(void)
1083 struct sigaction action;
1085 memset(&action, 0, sizeof(action));
1086 action.sa_handler = SIG_DFL;
1087 if (!sigaction(SIGBUS, &action, NULL)) {
1090 sigaddset(&set, SIGBUS);
1091 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1093 perror("Failed to re-raise SIGBUS!\n");
1097 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1099 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1104 /* Called asynchronously in VCPU thread. */
1105 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1109 /* Called synchronously (via signalfd) in main thread. */
1110 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1116 static void qemu_init_sigbus(void)
1118 struct sigaction action;
1120 memset(&action, 0, sizeof(action));
1121 action.sa_flags = SA_SIGINFO;
1122 action.sa_sigaction = sigbus_handler;
1123 sigaction(SIGBUS, &action, NULL);
1125 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1127 #else /* !CONFIG_LINUX */
1128 static void qemu_init_sigbus(void)
1131 #endif /* !CONFIG_LINUX */
1133 static QemuThread io_thread;
1136 static QemuCond qemu_cpu_cond;
1138 static QemuCond qemu_pause_cond;
1140 void qemu_init_cpu_loop(void)
1143 qemu_cond_init(&qemu_cpu_cond);
1144 qemu_cond_init(&qemu_pause_cond);
1145 qemu_mutex_init(&qemu_global_mutex);
1147 qemu_thread_get_self(&io_thread);
1150 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1152 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1155 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1157 if (kvm_destroy_vcpu(cpu) < 0) {
1158 error_report("kvm_destroy_vcpu failed");
1163 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1167 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1169 g_assert(qemu_cpu_is_self(cpu));
1171 cpu->stopped = true;
1175 qemu_cond_broadcast(&qemu_pause_cond);
1178 static void qemu_wait_io_event_common(CPUState *cpu)
1180 atomic_mb_set(&cpu->thread_kicked, false);
1182 qemu_cpu_stop(cpu, false);
1184 process_queued_cpu_work(cpu);
1187 static void qemu_tcg_rr_wait_io_event(void)
1191 while (all_cpu_threads_idle()) {
1192 stop_tcg_kick_timer();
1193 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1196 start_tcg_kick_timer();
1199 qemu_wait_io_event_common(cpu);
1203 static void qemu_wait_io_event(CPUState *cpu)
1207 while (cpu_thread_is_idle(cpu)) {
1210 qemu_plugin_vcpu_idle_cb(cpu);
1212 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1215 qemu_plugin_vcpu_resume_cb(cpu);
1219 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1220 if (!tcg_enabled()) {
1224 qemu_wait_io_event_common(cpu);
1227 static void *qemu_kvm_cpu_thread_fn(void *arg)
1229 CPUState *cpu = arg;
1232 rcu_register_thread();
1234 qemu_mutex_lock_iothread();
1235 qemu_thread_get_self(cpu->thread);
1236 cpu->thread_id = qemu_get_thread_id();
1240 r = kvm_init_vcpu(cpu);
1242 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1246 kvm_init_cpu_signals(cpu);
1248 /* signal CPU creation */
1249 cpu->created = true;
1250 qemu_cond_signal(&qemu_cpu_cond);
1251 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1254 if (cpu_can_run(cpu)) {
1255 r = kvm_cpu_exec(cpu);
1256 if (r == EXCP_DEBUG) {
1257 cpu_handle_guest_debug(cpu);
1260 qemu_wait_io_event(cpu);
1261 } while (!cpu->unplug || cpu_can_run(cpu));
1263 qemu_kvm_destroy_vcpu(cpu);
1264 cpu->created = false;
1265 qemu_cond_signal(&qemu_cpu_cond);
1266 qemu_mutex_unlock_iothread();
1267 rcu_unregister_thread();
1271 static void *qemu_dummy_cpu_thread_fn(void *arg)
1274 error_report("qtest is not supported under Windows");
1277 CPUState *cpu = arg;
1281 rcu_register_thread();
1283 qemu_mutex_lock_iothread();
1284 qemu_thread_get_self(cpu->thread);
1285 cpu->thread_id = qemu_get_thread_id();
1289 sigemptyset(&waitset);
1290 sigaddset(&waitset, SIG_IPI);
1292 /* signal CPU creation */
1293 cpu->created = true;
1294 qemu_cond_signal(&qemu_cpu_cond);
1295 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1298 qemu_mutex_unlock_iothread();
1301 r = sigwait(&waitset, &sig);
1302 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1307 qemu_mutex_lock_iothread();
1308 qemu_wait_io_event(cpu);
1309 } while (!cpu->unplug);
1311 qemu_mutex_unlock_iothread();
1312 rcu_unregister_thread();
1317 static int64_t tcg_get_icount_limit(void)
1321 if (replay_mode != REPLAY_MODE_PLAY) {
1323 * Include all the timers, because they may need attention.
1324 * Overly long CPU execution may create unnecessary delays in the UI.
1326 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1327 QEMU_TIMER_ATTR_ALL);
1328 /* Check realtime timers, because they help with input processing */
1329 deadline = qemu_soonest_timeout(deadline,
1330 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1331 QEMU_TIMER_ATTR_ALL));
1333 /* Maintain prior (possibly buggy) behaviour where if no deadline
1334 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1335 * INT32_MAX nanoseconds ahead, we still use INT32_MAX nanoseconds.
1338 if ((deadline < 0) || (deadline > INT32_MAX)) {
1339 deadline = INT32_MAX;
1342 return qemu_icount_round(deadline);
1344 return replay_get_instructions();
1348 static void handle_icount_deadline(void)
1350 assert(qemu_in_vcpu_thread());
1352 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1353 QEMU_TIMER_ATTR_ALL);
1355 if (deadline == 0) {
1356 /* Wake up other AioContexts. */
1357 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1358 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1363 static void prepare_icount_for_run(CPUState *cpu)
1368 /* These should always be cleared by process_icount_data after
1369 * each vCPU execution. However u16.high can be raised
1370 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1372 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1373 g_assert(cpu->icount_extra == 0);
1375 cpu->icount_budget = tcg_get_icount_limit();
1376 insns_left = MIN(0xffff, cpu->icount_budget);
1377 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1378 cpu->icount_extra = cpu->icount_budget - insns_left;
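/*
 * For example, a budget of 200000 instructions is split into
 * icount_decr.u16.low = 0xffff (65535) for the fast path inside translated
 * code and icount_extra = 134465 to be handed out on later refills, while a
 * budget of 5000 fits entirely in the 16-bit decrementer and leaves
 * icount_extra = 0.
 */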
1380 replay_mutex_lock();
1384 static void process_icount_data(CPUState *cpu)
1387 /* Account for executed instructions */
1388 cpu_update_icount(cpu);
1390 /* Reset the counters */
1391 cpu_neg(cpu)->icount_decr.u16.low = 0;
1392 cpu->icount_extra = 0;
1393 cpu->icount_budget = 0;
1395 replay_account_executed_instructions();
1397 replay_mutex_unlock();
1402 static int tcg_cpu_exec(CPUState *cpu)
1405 #ifdef CONFIG_PROFILER
1409 assert(tcg_enabled());
1410 #ifdef CONFIG_PROFILER
1411 ti = profile_getclock();
1413 cpu_exec_start(cpu);
1414 ret = cpu_exec(cpu);
1416 #ifdef CONFIG_PROFILER
1417 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1418 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1423 /* Destroy any remaining vCPUs which have been unplugged and have finished running
1426 static void deal_with_unplugged_cpus(void)
1431 if (cpu->unplug && !cpu_can_run(cpu)) {
1432 qemu_tcg_destroy_vcpu(cpu);
1433 cpu->created = false;
1434 qemu_cond_signal(&qemu_cpu_cond);
1440 /* Single-threaded TCG
1442 * In the single-threaded case each vCPU is simulated in turn. If
1443 * there is more than a single vCPU we create a simple timer to kick
1444 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1445 * This is done explicitly rather than relying on side-effects elsewhere.
1449 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1451 CPUState *cpu = arg;
1453 assert(tcg_enabled());
1454 rcu_register_thread();
1455 tcg_register_thread();
1457 qemu_mutex_lock_iothread();
1458 qemu_thread_get_self(cpu->thread);
1460 cpu->thread_id = qemu_get_thread_id();
1461 cpu->created = true;
1463 qemu_cond_signal(&qemu_cpu_cond);
1464 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1466 /* wait for initial kick-off after machine start */
1467 while (first_cpu->stopped) {
1468 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1470 /* process any pending work */
1473 qemu_wait_io_event_common(cpu);
1477 start_tcg_kick_timer();
1481 /* process any pending work */
1482 cpu->exit_request = 1;
1485 qemu_mutex_unlock_iothread();
1486 replay_mutex_lock();
1487 qemu_mutex_lock_iothread();
1488 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1489 qemu_account_warp_timer();
1491 /* Run the timers here. This is much more efficient than
1492 * waking up the I/O thread and waiting for completion.
1494 handle_icount_deadline();
1496 replay_mutex_unlock();
1502 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1504 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1507 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1508 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1510 if (cpu_can_run(cpu)) {
1513 qemu_mutex_unlock_iothread();
1514 prepare_icount_for_run(cpu);
1516 r = tcg_cpu_exec(cpu);
1518 process_icount_data(cpu);
1519 qemu_mutex_lock_iothread();
1521 if (r == EXCP_DEBUG) {
1522 cpu_handle_guest_debug(cpu);
1524 } else if (r == EXCP_ATOMIC) {
1525 qemu_mutex_unlock_iothread();
1526 cpu_exec_step_atomic(cpu);
1527 qemu_mutex_lock_iothread();
1530 } else if (cpu->stop) {
1532 cpu = CPU_NEXT(cpu);
1537 cpu = CPU_NEXT(cpu);
1538 } /* while (cpu && !cpu->exit_request).. */
1540 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1541 atomic_set(&tcg_current_rr_cpu, NULL);
1543 if (cpu && cpu->exit_request) {
1544 atomic_mb_set(&cpu->exit_request, 0);
1547 if (use_icount && all_cpu_threads_idle()) {
1549 * When all cpus are sleeping (e.g. in WFI), to avoid a deadlock
1550 * in the main_loop, wake it up in order to start the warp timer.
1552 qemu_notify_event();
1555 qemu_tcg_rr_wait_io_event();
1556 deal_with_unplugged_cpus();
1559 rcu_unregister_thread();
1563 static void *qemu_hax_cpu_thread_fn(void *arg)
1565 CPUState *cpu = arg;
1568 rcu_register_thread();
1569 qemu_mutex_lock_iothread();
1570 qemu_thread_get_self(cpu->thread);
1572 cpu->thread_id = qemu_get_thread_id();
1573 cpu->created = true;
1577 qemu_cond_signal(&qemu_cpu_cond);
1578 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1581 if (cpu_can_run(cpu)) {
1582 r = hax_smp_cpu_exec(cpu);
1583 if (r == EXCP_DEBUG) {
1584 cpu_handle_guest_debug(cpu);
1588 qemu_wait_io_event(cpu);
1589 } while (!cpu->unplug || cpu_can_run(cpu));
1590 rcu_unregister_thread();
1594 /* The HVF-specific vCPU thread function. This one should only run when the host
1595 * CPU supports the VMX "unrestricted guest" feature. */
1596 static void *qemu_hvf_cpu_thread_fn(void *arg)
1598 CPUState *cpu = arg;
1602 assert(hvf_enabled());
1604 rcu_register_thread();
1606 qemu_mutex_lock_iothread();
1607 qemu_thread_get_self(cpu->thread);
1609 cpu->thread_id = qemu_get_thread_id();
1615 /* signal CPU creation */
1616 cpu->created = true;
1617 qemu_cond_signal(&qemu_cpu_cond);
1618 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1621 if (cpu_can_run(cpu)) {
1622 r = hvf_vcpu_exec(cpu);
1623 if (r == EXCP_DEBUG) {
1624 cpu_handle_guest_debug(cpu);
1627 qemu_wait_io_event(cpu);
1628 } while (!cpu->unplug || cpu_can_run(cpu));
1630 hvf_vcpu_destroy(cpu);
1631 cpu->created = false;
1632 qemu_cond_signal(&qemu_cpu_cond);
1633 qemu_mutex_unlock_iothread();
1634 rcu_unregister_thread();
1638 static void *qemu_whpx_cpu_thread_fn(void *arg)
1640 CPUState *cpu = arg;
1643 rcu_register_thread();
1645 qemu_mutex_lock_iothread();
1646 qemu_thread_get_self(cpu->thread);
1647 cpu->thread_id = qemu_get_thread_id();
1650 r = whpx_init_vcpu(cpu);
1652 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1656 /* signal CPU creation */
1657 cpu->created = true;
1658 qemu_cond_signal(&qemu_cpu_cond);
1659 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1662 if (cpu_can_run(cpu)) {
1663 r = whpx_vcpu_exec(cpu);
1664 if (r == EXCP_DEBUG) {
1665 cpu_handle_guest_debug(cpu);
1668 while (cpu_thread_is_idle(cpu)) {
1669 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1671 qemu_wait_io_event_common(cpu);
1672 } while (!cpu->unplug || cpu_can_run(cpu));
1674 whpx_destroy_vcpu(cpu);
1675 cpu->created = false;
1676 qemu_cond_signal(&qemu_cpu_cond);
1677 qemu_mutex_unlock_iothread();
1678 rcu_unregister_thread();
1683 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1688 /* Multi-threaded TCG
1690 * In the multi-threaded case each vCPU has its own thread. The TLS
1691 * variable current_cpu can be used deep in the code to find the
1692 * current CPUState for a given thread.
1695 static void *qemu_tcg_cpu_thread_fn(void *arg)
1697 CPUState *cpu = arg;
1699 assert(tcg_enabled());
1700 g_assert(!use_icount);
1702 rcu_register_thread();
1703 tcg_register_thread();
1705 qemu_mutex_lock_iothread();
1706 qemu_thread_get_self(cpu->thread);
1708 cpu->thread_id = qemu_get_thread_id();
1709 cpu->created = true;
1712 qemu_cond_signal(&qemu_cpu_cond);
1713 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1715 /* process any pending work */
1716 cpu->exit_request = 1;
1719 if (cpu_can_run(cpu)) {
1721 qemu_mutex_unlock_iothread();
1722 r = tcg_cpu_exec(cpu);
1723 qemu_mutex_lock_iothread();
1726 cpu_handle_guest_debug(cpu);
1729 /* during start-up the vCPU is reset and the thread is
1730 * kicked several times. If we don't ensure we go back
1731 * to sleep in the halted state we won't cleanly
1732 * start up when the vCPU is enabled.
1734 * cpu->halted should ensure we sleep in wait_io_event
1736 g_assert(cpu->halted);
1739 qemu_mutex_unlock_iothread();
1740 cpu_exec_step_atomic(cpu);
1741 qemu_mutex_lock_iothread();
1743 /* Ignore everything else? */
1748 atomic_mb_set(&cpu->exit_request, 0);
1749 qemu_wait_io_event(cpu);
1750 } while (!cpu->unplug || cpu_can_run(cpu));
1752 qemu_tcg_destroy_vcpu(cpu);
1753 cpu->created = false;
1754 qemu_cond_signal(&qemu_cpu_cond);
1755 qemu_mutex_unlock_iothread();
1756 rcu_unregister_thread();
1760 static void qemu_cpu_kick_thread(CPUState *cpu)
1765 if (cpu->thread_kicked) {
1768 cpu->thread_kicked = true;
1769 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1770 if (err && err != ESRCH) {
1771 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1775 if (!qemu_cpu_is_self(cpu)) {
1776 if (whpx_enabled()) {
1777 whpx_vcpu_kick(cpu);
1778 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1779 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1780 __func__, GetLastError());
1787 void qemu_cpu_kick(CPUState *cpu)
1789 qemu_cond_broadcast(cpu->halt_cond);
1790 if (tcg_enabled()) {
1791 if (qemu_tcg_mttcg_enabled()) {
1794 qemu_cpu_kick_rr_cpus();
1797 if (hax_enabled()) {
1799 * FIXME: race condition with the exit_request check in
1802 cpu->exit_request = 1;
1804 qemu_cpu_kick_thread(cpu);
1808 void qemu_cpu_kick_self(void)
1810 assert(current_cpu);
1811 qemu_cpu_kick_thread(current_cpu);
1814 bool qemu_cpu_is_self(CPUState *cpu)
1816 return qemu_thread_is_self(cpu->thread);
1819 bool qemu_in_vcpu_thread(void)
1821 return current_cpu && qemu_cpu_is_self(current_cpu);
1824 static __thread bool iothread_locked = false;
1826 bool qemu_mutex_iothread_locked(void)
1828 return iothread_locked;
1832 * The BQL is taken from so many places that it is worth profiling the
1833 * callers directly, instead of funneling them all through a single function.
1835 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1837 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1839 g_assert(!qemu_mutex_iothread_locked());
1840 bql_lock(&qemu_global_mutex, file, line);
1841 iothread_locked = true;
1844 void qemu_mutex_unlock_iothread(void)
1846 g_assert(qemu_mutex_iothread_locked());
1847 iothread_locked = false;
1848 qemu_mutex_unlock(&qemu_global_mutex);
1851 void qemu_cond_wait_iothread(QemuCond *cond)
1853 qemu_cond_wait(cond, &qemu_global_mutex);
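/*
 * A minimal sketch (not taken from a real caller) of how code running outside
 * a vCPU thread is expected to use the BQL helpers above:
 *
 *     if (!qemu_mutex_iothread_locked()) {
 *         qemu_mutex_lock_iothread();
 *         // ... touch state shared with the vCPU threads ...
 *         qemu_mutex_unlock_iothread();
 *     }
 */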
1856 static bool all_vcpus_paused(void)
1861 if (!cpu->stopped) {
1869 void pause_all_vcpus(void)
1873 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1875 if (qemu_cpu_is_self(cpu)) {
1876 qemu_cpu_stop(cpu, true);
1883 /* We need to drop the replay_lock so any vCPU threads woken up
1884 * can finish their replay tasks
1886 replay_mutex_unlock();
1888 while (!all_vcpus_paused()) {
1889 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1895 qemu_mutex_unlock_iothread();
1896 replay_mutex_lock();
1897 qemu_mutex_lock_iothread();
1900 void cpu_resume(CPUState *cpu)
1903 cpu->stopped = false;
1907 void resume_all_vcpus(void)
1911 if (!runstate_is_running()) {
1915 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1921 void cpu_remove_sync(CPUState *cpu)
1926 qemu_mutex_unlock_iothread();
1927 qemu_thread_join(cpu->thread);
1928 qemu_mutex_lock_iothread();
1931 /* Size of temporary buffers used for forming a vCPU thread name */
1932 #define VCPU_THREAD_NAME_SIZE 16
1934 static void qemu_tcg_init_vcpu(CPUState *cpu)
1936 char thread_name[VCPU_THREAD_NAME_SIZE];
1937 static QemuCond *single_tcg_halt_cond;
1938 static QemuThread *single_tcg_cpu_thread;
1939 static int tcg_region_inited;
1941 assert(tcg_enabled());
1943 * Initialize TCG regions--once. Now is a good time, because:
1944 * (1) TCG's init context, prologue and target globals have been set up.
1945 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1946 * -accel flag is processed, so the check doesn't work then).
1948 if (!tcg_region_inited) {
1949 tcg_region_inited = 1;
1953 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1954 cpu->thread = g_malloc0(sizeof(QemuThread));
1955 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1956 qemu_cond_init(cpu->halt_cond);
1958 if (qemu_tcg_mttcg_enabled()) {
1959 /* create a thread per vCPU with TCG (MTTCG) */
1960 parallel_cpus = true;
1961 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1964 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1965 cpu, QEMU_THREAD_JOINABLE);
1968 /* share a single thread for all cpus with TCG */
1969 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1970 qemu_thread_create(cpu->thread, thread_name,
1971 qemu_tcg_rr_cpu_thread_fn,
1972 cpu, QEMU_THREAD_JOINABLE);
1974 single_tcg_halt_cond = cpu->halt_cond;
1975 single_tcg_cpu_thread = cpu->thread;
1978 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1981 /* For non-MTTCG cases we share the thread */
1982 cpu->thread = single_tcg_cpu_thread;
1983 cpu->halt_cond = single_tcg_halt_cond;
1984 cpu->thread_id = first_cpu->thread_id;
1986 cpu->created = true;
1990 static void qemu_hax_start_vcpu(CPUState *cpu)
1992 char thread_name[VCPU_THREAD_NAME_SIZE];
1994 cpu->thread = g_malloc0(sizeof(QemuThread));
1995 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1996 qemu_cond_init(cpu->halt_cond);
1998 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2000 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2001 cpu, QEMU_THREAD_JOINABLE);
2003 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2007 static void qemu_kvm_start_vcpu(CPUState *cpu)
2009 char thread_name[VCPU_THREAD_NAME_SIZE];
2011 cpu->thread = g_malloc0(sizeof(QemuThread));
2012 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2013 qemu_cond_init(cpu->halt_cond);
2014 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2016 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2017 cpu, QEMU_THREAD_JOINABLE);
2020 static void qemu_hvf_start_vcpu(CPUState *cpu)
2022 char thread_name[VCPU_THREAD_NAME_SIZE];
2024 /* HVF currently does not support TCG, and only runs in
2025 * unrestricted-guest mode. */
2026 assert(hvf_enabled());
2028 cpu->thread = g_malloc0(sizeof(QemuThread));
2029 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2030 qemu_cond_init(cpu->halt_cond);
2032 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2034 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2035 cpu, QEMU_THREAD_JOINABLE);
2038 static void qemu_whpx_start_vcpu(CPUState *cpu)
2040 char thread_name[VCPU_THREAD_NAME_SIZE];
2042 cpu->thread = g_malloc0(sizeof(QemuThread));
2043 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2044 qemu_cond_init(cpu->halt_cond);
2045 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2047 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2048 cpu, QEMU_THREAD_JOINABLE);
2050 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2054 static void qemu_dummy_start_vcpu(CPUState *cpu)
2056 char thread_name[VCPU_THREAD_NAME_SIZE];
2058 cpu->thread = g_malloc0(sizeof(QemuThread));
2059 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2060 qemu_cond_init(cpu->halt_cond);
2061 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2063 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2064 QEMU_THREAD_JOINABLE);
2067 void qemu_init_vcpu(CPUState *cpu)
2069 MachineState *ms = MACHINE(qdev_get_machine());
2071 cpu->nr_cores = ms->smp.cores;
2072 cpu->nr_threads = ms->smp.threads;
2073 cpu->stopped = true;
2074 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2077 /* If the target cpu hasn't set up any address spaces itself,
2078 * give it the default one.
2081 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2084 if (kvm_enabled()) {
2085 qemu_kvm_start_vcpu(cpu);
2086 } else if (hax_enabled()) {
2087 qemu_hax_start_vcpu(cpu);
2088 } else if (hvf_enabled()) {
2089 qemu_hvf_start_vcpu(cpu);
2090 } else if (tcg_enabled()) {
2091 qemu_tcg_init_vcpu(cpu);
2092 } else if (whpx_enabled()) {
2093 qemu_whpx_start_vcpu(cpu);
2095 qemu_dummy_start_vcpu(cpu);
2098 while (!cpu->created) {
2099 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2103 void cpu_stop_current(void)
2106 current_cpu->stop = true;
2107 cpu_exit(current_cpu);
2111 int vm_stop(RunState state)
2113 if (qemu_in_vcpu_thread()) {
2114 qemu_system_vmstop_request_prepare();
2115 qemu_system_vmstop_request(state);
2117 * FIXME: should not return to device code in case
2118 * vm_stop() has been requested.
2124 return do_vm_stop(state, true);
2128 * Prepare for (re)starting the VM.
2129 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2130 * running or in case of an error condition), 0 otherwise.
2132 int vm_prepare_start(void)
2136 qemu_vmstop_requested(&requested);
2137 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2141 /* Ensure that a STOP/RESUME pair of events is emitted if a
2142 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2143 * example, is documented to always be followed by the STOP event.
2146 if (runstate_is_running()) {
2147 qapi_event_send_stop();
2148 qapi_event_send_resume();
2152 /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2153 qapi_event_send_resume();
2156 runstate_set(RUN_STATE_RUNNING);
2157 vm_state_notify(1, RUN_STATE_RUNNING);
2163 if (!vm_prepare_start()) {
2168 /* Does a state transition even if the VM is already stopped;
2169 the current state is forgotten forever */
2170 int vm_stop_force_state(RunState state)
2172 if (runstate_is_running()) {
2173 return vm_stop(state);
2175 runstate_set(state);
2178 /* Make sure to return an error if the flush in a previous vm_stop() failed.
2180 return bdrv_flush_all();
2184 void list_cpus(const char *optarg)
2186 /* XXX: implement xxx_cpu_list for targets that still miss it */
2187 #if defined(cpu_list)
2192 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2193 bool has_cpu, int64_t cpu_index, Error **errp)
2199 int64_t orig_addr = addr, orig_size = size;
2205 cpu = qemu_get_cpu(cpu_index);
2207 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2212 f = fopen(filename, "wb");
2214 error_setg_file_open(errp, errno, filename);
2222 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2223 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2224 " specified", orig_addr, orig_size);
2227 if (fwrite(buf, 1, l, f) != l) {
2228 error_setg(errp, QERR_IO_ERROR);
2239 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2246 f = fopen(filename, "wb");
2248 error_setg_file_open(errp, errno, filename);
2256 cpu_physical_memory_read(addr, buf, l);
2257 if (fwrite(buf, 1, l, f) != l) {
2258 error_setg(errp, QERR_IO_ERROR);
2269 void qmp_inject_nmi(Error **errp)
2271 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2274 void dump_drift_info(void)
2280 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2281 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2282 if (icount_align_option) {
2283 qemu_printf("Max guest delay %"PRIi64" ms\n",
2284 -max_delay / SCALE_MS);
2285 qemu_printf("Max guest advance %"PRIi64" ms\n",
2286 max_advance / SCALE_MS);
2288 qemu_printf("Max guest delay NA\n");
2289 qemu_printf("Max guest advance NA\n");