4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "qemu/cutils.h"
29 #include "migration/vmstate.h"
30 #include "monitor/monitor.h"
31 #include "qapi/error.h"
32 #include "qapi/qapi-commands-misc.h"
33 #include "qapi/qapi-events-run-state.h"
34 #include "qapi/qmp/qerror.h"
35 #include "qemu/error-report.h"
36 #include "qemu/qemu-print.h"
37 #include "sysemu/tcg.h"
38 #include "sysemu/block-backend.h"
39 #include "exec/gdbstub.h"
40 #include "sysemu/dma.h"
41 #include "sysemu/hw_accel.h"
42 #include "sysemu/kvm.h"
43 #include "sysemu/hax.h"
44 #include "sysemu/hvf.h"
45 #include "sysemu/whpx.h"
46 #include "exec/exec-all.h"
48 #include "qemu/thread.h"
49 #include "qemu/plugin.h"
50 #include "sysemu/cpus.h"
51 #include "sysemu/qtest.h"
52 #include "qemu/main-loop.h"
53 #include "qemu/option.h"
54 #include "qemu/bitmap.h"
55 #include "qemu/seqlock.h"
56 #include "qemu/guest-random.h"
59 #include "sysemu/replay.h"
60 #include "sysemu/runstate.h"
61 #include "hw/boards.h"
66 #include <sys/prctl.h>
/*
 * Fallback values for the prctl() machine-check constants that
 * qemu_init_sigbus() passes to prctl(PR_MCE_KILL, ...).
 */
69 #define PR_MCE_KILL 33
72 #ifndef PR_MCE_KILL_SET
73 #define PR_MCE_KILL_SET 1
76 #ifndef PR_MCE_KILL_EARLY
77 #define PR_MCE_KILL_EARLY 1
80 #endif /* CONFIG_LINUX */
/*
 * Mutex handed to run_on_cpu() and to every condition-variable wait on
 * halt_cond in this file.
 */
82 static QemuMutex qemu_global_mutex;
87 /* vcpu throttling controls */
/* Timer driving cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
88 static QEMUTimer *throttle_timer;
/* Current throttle setting; 0 means throttling is off. Accessed atomically. */
89 static unsigned int throttle_percentage;
/* cpu_throttle_set() clamps requests into [PCT_MIN, PCT_MAX]. */
91 #define CPU_THROTTLE_PCT_MIN 1
92 #define CPU_THROTTLE_PCT_MAX 99
/* Base scheduling period for the throttle timer: 10,000,000 ns. */
93 #define CPU_THROTTLE_TIMESLICE_NS 10000000
/*
 * Report whether @cpu counts as stopped: either its own `stopped` flag is
 * set, or the VM as a whole is not in the running state.
 */
95 bool cpu_is_stopped(CPUState *cpu)
97 return cpu->stopped || !runstate_is_running();
/*
 * Test whether @cpu's work_list queue is empty.  The queue is inspected
 * while holding cpu->work_mutex.
 */
100 static inline bool cpu_work_list_empty(CPUState *cpu)
104 qemu_mutex_lock(&cpu->work_mutex);
105 ret = QSIMPLEQ_EMPTY(&cpu->work_list);
106 qemu_mutex_unlock(&cpu->work_mutex);
/*
 * Idle test for a single vCPU thread.  The checks run in this order:
 * a pending stop request or queued work; the stopped state; and finally
 * whether the CPU is not halted, has work, or KVM handles halt in-kernel.
 */
110 static bool cpu_thread_is_idle(CPUState *cpu)
112 if (cpu->stop || !cpu_work_list_empty(cpu)) {
115 if (cpu_is_stopped(cpu)) {
118 if (!cpu->halted || cpu_has_work(cpu) ||
119 kvm_halt_in_kernel()) {
/*
 * Aggregate idle test built on cpu_thread_is_idle().  Used by the warp
 * timer and by the round-robin wait loop to decide whether anything can run.
 */
125 static bool all_cpu_threads_idle(void)
130 if (!cpu_thread_is_idle(cpu)) {
137 /***********************************************************/
138 /* guest cycle counter */
140 /* Protected by TimersState seqlock */
/* Set from the "sleep" option in configure_icount(); defaults to true. */
142 static bool icount_sleep = true;
143 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
/* Upper bound for icount_time_shift (option parsing and icount_adjust()). */
144 #define MAX_ICOUNT_SHIFT 10
/*
 * Clock and instruction-counter bookkeeping shared by the helpers in this
 * file.  A single instance, timers_state, is defined after the struct; it is
 * also the object registered for migration in cpu_ticks_init().
 */
146 typedef struct TimersState {
147 /* Protected by BQL. */
/* cpu_ticks_prev: last value produced by cpu_get_ticks_locked(). */
148 int64_t cpu_ticks_prev;
/* cpu_ticks_offset: added to the host tick counter by cpu_get_ticks_locked(). */
149 int64_t cpu_ticks_offset;
151 /* Protect fields that can be respectively read outside the
152 * BQL, and written from multiple threads.
154 QemuSeqLock vm_clock_seqlock;
155 QemuSpin vm_clock_lock;
157 int16_t cpu_ticks_enabled;
159 /* Conversion factor from emulated instructions to virtual clock ticks. */
160 int16_t icount_time_shift;
162 /* Compensate for varying guest execution speed. */
163 int64_t qemu_icount_bias;
165 int64_t vm_clock_warp_start;
166 int64_t cpu_clock_offset;
168 /* Only written by TCG thread */
171 /* for adjusting icount */
172 QEMUTimer *icount_rt_timer;
173 QEMUTimer *icount_vm_timer;
174 QEMUTimer *icount_warp_timer;
177 static TimersState timers_state;
/*
 * Return how many instructions @cpu has executed out of its current budget:
 * icount_budget minus what is still pending in icount_decr.u16.low and
 * icount_extra.
 */
181 /* The current number of executed instructions is based on what we
182 * originally budgeted minus the current state of the decrementing
183 * icount counters in extra/u16.low.
185 static int64_t cpu_get_icount_executed(CPUState *cpu)
187 return (cpu->icount_budget -
188 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
/*
 * Move the instructions @cpu has executed from its budget into the global
 * counter: icount_budget is reduced by the executed amount and the same
 * amount is added to timers_state.qemu_icount (stored with a 64-bit atomic
 * set).  Callers in this file invoke it inside a vm_clock_seqlock section.
 */
192 * Update the global shared timer_state.qemu_icount to take into
193 * account executed instructions. This is done by the TCG vCPU
194 * thread so the main-loop can see time has moved forward.
196 static void cpu_update_icount_locked(CPUState *cpu)
198 int64_t executed = cpu_get_icount_executed(cpu);
199 cpu->icount_budget -= executed;
201 atomic_set_i64(&timers_state.qemu_icount,
202 timers_state.qemu_icount + executed);
/*
 * Locking wrapper around cpu_update_icount_locked(): takes the vm_clock
 * seqlock for writing (together with vm_clock_lock), performs the update,
 * and releases it.
 */
206 * Update the global shared timer_state.qemu_icount to take into
207 * account executed instructions. This is done by the TCG vCPU
208 * thread so the main-loop can see time has moved forward.
210 void cpu_update_icount(CPUState *cpu)
212 seqlock_write_lock(&timers_state.vm_clock_seqlock,
213 &timers_state.vm_clock_lock);
214 cpu_update_icount_locked(cpu);
215 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
216 &timers_state.vm_clock_lock);
/*
 * Return the raw instruction count (timers_state.qemu_icount).  When called
 * on a vCPU that is currently running, the instructions it has executed so
 * far are folded in first; "Bad icount read" is reported if that vCPU has
 * can_do_io clear.
 */
219 static int64_t cpu_get_icount_raw_locked(void)
221 CPUState *cpu = current_cpu;
223 if (cpu && cpu->running) {
224 if (!cpu->can_do_io) {
225 error_report("Bad icount read");
228 /* Take into account what has run */
229 cpu_update_icount_locked(cpu);
231 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
232 return atomic_read_i64(&timers_state.qemu_icount);
/*
 * Return the icount-based virtual time: qemu_icount_bias plus the raw
 * instruction count converted by cpu_icount_to_ns().
 */
235 static int64_t cpu_get_icount_locked(void)
237 int64_t icount = cpu_get_icount_raw_locked();
238 return atomic_read_i64(&timers_state.qemu_icount_bias) +
239 cpu_icount_to_ns(icount);
/*
 * Seqlock reader for the raw instruction count: samples
 * cpu_get_icount_raw_locked() and retries while seqlock_read_retry() reports
 * that a writer interfered.
 */
242 int64_t cpu_get_icount_raw(void)
248 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
249 icount = cpu_get_icount_raw_locked();
250 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
/*
 * Seqlock reader for cpu_get_icount_locked(); the sample is retried while
 * seqlock_read_retry() reports that a writer interfered.
 */
255 /* Return the virtual CPU time, based on the instruction counter. */
256 int64_t cpu_get_icount(void)
262 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
263 icount = cpu_get_icount_locked();
264 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
/*
 * Scale an instruction count by 2^icount_time_shift (the shift is read
 * atomically), giving the corresponding amount of virtual time.
 */
269 int64_t cpu_icount_to_ns(int64_t icount)
271 return icount << atomic_read(&timers_state.icount_time_shift);
/*
 * Compute the guest tick count: cpu_ticks_offset, plus the host tick counter
 * while ticks are enabled.  If the result would be lower than the previous
 * one, the offset is raised by the difference and the previous value is
 * used, so the sequence never decreases.  The result is remembered in
 * cpu_ticks_prev.
 */
274 static int64_t cpu_get_ticks_locked(void)
276 int64_t ticks = timers_state.cpu_ticks_offset;
277 if (timers_state.cpu_ticks_enabled) {
278 ticks += cpu_get_host_ticks();
281 if (timers_state.cpu_ticks_prev > ticks) {
282 /* Non increasing ticks may happen if the host uses software suspend. */
283 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
284 ticks = timers_state.cpu_ticks_prev;
287 timers_state.cpu_ticks_prev = ticks;
/*
 * Public tick accessor.  Two sources are visible: cpu_get_icount(), and
 * cpu_get_ticks_locked() sampled while holding the vm_clock_lock spinlock.
 * NOTE(review): the icount path is presumably selected by use_icount -
 * confirm.
 */
291 /* return the time elapsed in VM between vm_start and vm_stop. Unless
292 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
295 int64_t cpu_get_ticks(void)
300 return cpu_get_icount();
303 qemu_spin_lock(&timers_state.vm_clock_lock);
304 ticks = cpu_get_ticks_locked();
305 qemu_spin_unlock(&timers_state.vm_clock_lock);
/*
 * Clock value for callers already inside a vm_clock_seqlock section: starts
 * from cpu_clock_offset and adjusts it only while ticks are enabled.
 * NOTE(review): the adjustment presumably adds get_clock(), mirroring the
 * subtraction done in cpu_enable_ticks() - confirm.
 */
309 static int64_t cpu_get_clock_locked(void)
313 time = timers_state.cpu_clock_offset;
314 if (timers_state.cpu_ticks_enabled) {
/*
 * Seqlock reader for cpu_get_clock_locked(); the sample is retried while
 * seqlock_read_retry() reports that a writer interfered.
 */
321 /* Return the monotonic time elapsed in VM, i.e.,
322 * the time between vm_start and vm_stop
324 int64_t cpu_get_clock(void)
330 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
331 ti = cpu_get_clock_locked();
332 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
/*
 * Start the guest tick and clock counters.  Inside a seqlock write section,
 * and only if they are not already enabled, the current host tick count and
 * get_clock() value are subtracted from the respective offsets and
 * cpu_ticks_enabled is set to 1.
 */
337 /* enable cpu_get_ticks()
338 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
340 void cpu_enable_ticks(void)
342 seqlock_write_lock(&timers_state.vm_clock_seqlock,
343 &timers_state.vm_clock_lock);
344 if (!timers_state.cpu_ticks_enabled) {
345 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
346 timers_state.cpu_clock_offset -= get_clock();
347 timers_state.cpu_ticks_enabled = 1;
349 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
350 &timers_state.vm_clock_lock);
/*
 * Freeze the guest tick and clock counters.  Inside a seqlock write section,
 * and only if they are currently enabled, the host tick count is added back
 * into cpu_ticks_offset, cpu_clock_offset is set to the current
 * cpu_get_clock_locked() value, and cpu_ticks_enabled is cleared.
 */
353 /* disable cpu_get_ticks() : the clock is stopped. You must not call
354 * cpu_get_ticks() after that.
355 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
357 void cpu_disable_ticks(void)
359 seqlock_write_lock(&timers_state.vm_clock_seqlock,
360 &timers_state.vm_clock_lock);
361 if (timers_state.cpu_ticks_enabled) {
362 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
363 timers_state.cpu_clock_offset = cpu_get_clock_locked();
364 timers_state.cpu_ticks_enabled = 0;
366 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
367 &timers_state.vm_clock_lock);
/*
 * Tolerance used by icount_adjust() when comparing successive deltas:
 * one tenth of NANOSECONDS_PER_SECOND.
 */
370 /* Correlation between real and virtual time is always going to be
371 fairly approximate, so ignore small variation.
372 When the guest is idle real and virtual time will be aligned in
374 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
/*
 * Adaptive-icount speed correction.  Does nothing while the VM is stopped.
 * Otherwise, inside a seqlock write section, it compares the icount-based
 * time with the (replay-aware) VIRTUAL_RT clock and moves icount_time_shift
 * one step down when the guest is running ahead or one step up when it lags,
 * within [0, MAX_ICOUNT_SHIFT].  qemu_icount_bias is then recomputed as
 * cur_icount - (qemu_icount << shift), so the current icount time is
 * unchanged by the new shift.
 */
376 static void icount_adjust(void)
382 /* Protected by TimersState mutex. */
383 static int64_t last_delta;
385 /* If the VM is not running, then do nothing. */
386 if (!runstate_is_running()) {
390 seqlock_write_lock(&timers_state.vm_clock_seqlock,
391 &timers_state.vm_clock_lock);
392 cur_time = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
393 cpu_get_clock_locked());
394 cur_icount = cpu_get_icount_locked();
396 delta = cur_icount - cur_time;
397 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
399 && last_delta + ICOUNT_WOBBLE < delta * 2
400 && timers_state.icount_time_shift > 0) {
401 /* The guest is getting too far ahead. Slow time down. */
402 atomic_set(&timers_state.icount_time_shift,
403 timers_state.icount_time_shift - 1);
406 && last_delta - ICOUNT_WOBBLE > delta * 2
407 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
408 /* The guest is getting too far behind. Speed time up. */
409 atomic_set(&timers_state.icount_time_shift,
410 timers_state.icount_time_shift + 1);
413 atomic_set_i64(&timers_state.qemu_icount_bias,
414 cur_icount - (timers_state.qemu_icount
415 << timers_state.icount_time_shift));
416 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
417 &timers_state.vm_clock_lock);
/*
 * Callback of icount_rt_timer (installed in configure_icount()).  Re-arms
 * the timer 1000 ms ahead on QEMU_CLOCK_VIRTUAL_RT.
 * NOTE(review): presumably also runs icount_adjust() - confirm.
 */
420 static void icount_adjust_rt(void *opaque)
422 timer_mod(timers_state.icount_rt_timer,
423 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
/*
 * Callback of icount_vm_timer (installed in configure_icount()).  Re-arms
 * the timer a tenth of a second ahead on QEMU_CLOCK_VIRTUAL.
 * NOTE(review): presumably also runs icount_adjust() - confirm.
 */
427 static void icount_adjust_vm(void *opaque)
429 timer_mod(timers_state.icount_vm_timer,
430 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
431 NANOSECONDS_PER_SECOND / 10);
/*
 * Inverse of cpu_icount_to_ns(), rounding up: returns
 * ceil(count / 2^icount_time_shift).
 */
435 static int64_t qemu_icount_round(int64_t count)
437 int shift = atomic_read(&timers_state.icount_time_shift);
438 return (count + (1 << shift) - 1) >> shift;
/*
 * Apply a pending clock warp.  vm_clock_warp_start is sampled under the
 * seqlock; -1 means no warp is pending.  Otherwise, inside a write section
 * and only while the VM is running, the VIRTUAL_RT time elapsed since the
 * warp started is added to qemu_icount_bias; in adaptive mode
 * (use_icount == 2) the amount is capped at clock - cur_icount.
 * vm_clock_warp_start is then reset to -1, and QEMU_CLOCK_VIRTUAL is
 * notified if it has expired timers.
 */
441 static void icount_warp_rt(void)
446 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
447 * changes from -1 to another value, so the race here is okay.
450 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
451 warp_start = timers_state.vm_clock_warp_start;
452 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
454 if (warp_start == -1) {
458 seqlock_write_lock(&timers_state.vm_clock_seqlock,
459 &timers_state.vm_clock_lock);
460 if (runstate_is_running()) {
461 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
462 cpu_get_clock_locked());
465 warp_delta = clock - timers_state.vm_clock_warp_start;
466 if (use_icount == 2) {
468 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
469 * far ahead of real time.
471 int64_t cur_icount = cpu_get_icount_locked();
472 int64_t delta = clock - cur_icount;
473 warp_delta = MIN(warp_delta, delta);
475 atomic_set_i64(&timers_state.qemu_icount_bias,
476 timers_state.qemu_icount_bias + warp_delta);
478 timers_state.vm_clock_warp_start = -1;
479 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
480 &timers_state.vm_clock_lock);
482 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
483 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
/*
 * Callback installed on icount_warp_timer by configure_icount().
 * NOTE(review): presumably applies the warp via icount_warp_rt() - confirm.
 */
487 static void icount_timer_cb(void *opaque)
489 /* No need for a checkpoint because the timer already synchronizes
490 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
/*
 * Advance QEMU_CLOCK_VIRTUAL to @dest on behalf of qtest (asserts that qtest
 * is enabled).  While the clock is behind @dest, each step adds to
 * qemu_icount_bias the sooner of "time remaining" and "next timer deadline",
 * runs the QEMU_CLOCK_VIRTUAL timers of the main loop and of the main
 * AioContext, and re-reads the clock.  The clock is notified at the end.
 */
495 void qtest_clock_warp(int64_t dest)
497 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
498 AioContext *aio_context;
499 assert(qtest_enabled());
500 aio_context = qemu_get_aio_context();
501 while (clock < dest) {
502 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
503 QEMU_TIMER_ATTR_ALL);
504 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
506 seqlock_write_lock(&timers_state.vm_clock_seqlock,
507 &timers_state.vm_clock_lock);
508 atomic_set_i64(&timers_state.qemu_icount_bias,
509 timers_state.qemu_icount_bias + warp);
510 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
511 &timers_state.vm_clock_lock);
513 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
514 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
515 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
517 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
/*
 * Keep QEMU_CLOCK_VIRTUAL advancing while the vCPUs are idle in icount mode.
 * After the early checks (VM stopped, CPUs not idle, qtest, replay
 * checkpoint refused) it looks up the next QEMU_CLOCK_VIRTUAL deadline,
 * ignoring timers marked QEMU_TIMER_ATTR_EXTERNAL.  It then either adds that
 * deadline to qemu_icount_bias at once (no-sleep mode), or records
 * vm_clock_warp_start and arms icount_warp_timer so the warp is applied once
 * real time has passed.  A deadline of 0 only triggers a clock notification.
 */
520 void qemu_start_warp_timer(void)
529 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
530 * do not fire, so computing the deadline does not make sense.
532 if (!runstate_is_running()) {
536 if (replay_mode != REPLAY_MODE_PLAY) {
537 if (!all_cpu_threads_idle()) {
541 if (qtest_enabled()) {
542 /* When testing, qtest commands advance icount. */
546 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
548 /* warp clock deterministically in record/replay mode */
549 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
550 /* vCPU is sleeping and warp can't be started.
551 It is probably a race condition: notification sent
552 to vCPU was processed in advance and vCPU went to sleep.
553 Therefore we have to wake it up for doing something. */
554 if (replay_has_checkpoint()) {
555 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
561 /* We want to use the earliest deadline from ALL vm_clocks */
562 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
563 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
564 ~QEMU_TIMER_ATTR_EXTERNAL);
566 static bool notified;
567 if (!icount_sleep && !notified) {
568 warn_report("icount sleep disabled and no active timers");
576 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
577 * sleep. Otherwise, the CPU might be waiting for a future timer
578 * interrupt to wake it up, but the interrupt never comes because
579 * the vCPU isn't running any insns and thus doesn't advance the
580 * QEMU_CLOCK_VIRTUAL.
584 * We never let VCPUs sleep in no sleep icount mode.
585 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
586 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
587 * It is useful when we want a deterministic execution time,
588 * isolated from host latencies.
590 seqlock_write_lock(&timers_state.vm_clock_seqlock,
591 &timers_state.vm_clock_lock);
592 atomic_set_i64(&timers_state.qemu_icount_bias,
593 timers_state.qemu_icount_bias + deadline);
594 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
595 &timers_state.vm_clock_lock);
596 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
599 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
600 * "real" time, (related to the time left until the next event) has
601 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
602 * This avoids that the warps are visible externally; for example,
603 * you will not be sending network packets continuously instead of
606 seqlock_write_lock(&timers_state.vm_clock_seqlock,
607 &timers_state.vm_clock_lock);
608 if (timers_state.vm_clock_warp_start == -1
609 || timers_state.vm_clock_warp_start > clock) {
610 timers_state.vm_clock_warp_start = clock;
612 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
613 &timers_state.vm_clock_lock);
614 timer_mod_anticipate(timers_state.icount_warp_timer,
617 } else if (deadline == 0) {
618 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
/*
 * Cancel the pending warp timer.  Three guards come first: icount with sleep
 * must be in use, the VM must be running, and the
 * CHECKPOINT_CLOCK_WARP_ACCOUNT replay checkpoint must succeed.  Only then
 * is icount_warp_timer deleted.
 */
622 static void qemu_account_warp_timer(void)
624 if (!use_icount || !icount_sleep) {
628 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
629 * do not fire, so computing the deadline does not make sense.
631 if (!runstate_is_running()) {
635 /* warp clock deterministically in record/replay mode */
636 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
640 timer_del(timers_state.icount_warp_timer);
/* .needed callback of the "timer/icount" subsection (icount_vmstate_timers). */
644 static bool icount_state_needed(void *opaque)
/*
 * .needed callback of the warp-timer subsection: true only when the
 * TimersState passed as @opaque has an icount_warp_timer.
 */
649 static bool warp_timer_state_needed(void *opaque)
651 TimersState *s = opaque;
652 return s->icount_warp_timer != NULL;
/*
 * .needed callback of the adjustment-timers subsection: true only when the
 * TimersState passed as @opaque has an icount_rt_timer.
 */
655 static bool adjust_timers_state_needed(void *opaque)
657 TimersState *s = opaque;
658 return s->icount_rt_timer != NULL;
/*
 * .needed callback of the shift subsection: true only when use_icount is 2,
 * the value icount_warp_rt() treats as adaptive mode.
 */
661 static bool shift_state_needed(void *opaque)
663 return use_icount == 2;
/*
 * Migration subsection "timer/icount/warp_timer": carries
 * vm_clock_warp_start and the icount_warp_timer, and is emitted only when
 * warp_timer_state_needed() returns true.
 */
667 * Subsection for warp timer migration is optional, because may not be created
669 static const VMStateDescription icount_vmstate_warp_timer = {
670 .name = "timer/icount/warp_timer",
672 .minimum_version_id = 1,
673 .needed = warp_timer_state_needed,
674 .fields = (VMStateField[]) {
675 VMSTATE_INT64(vm_clock_warp_start, TimersState),
676 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
677 VMSTATE_END_OF_LIST()
/*
 * Migration subsection "timer/icount/timers": carries the two adjustment
 * timers (icount_rt_timer, icount_vm_timer), and is emitted only when
 * adjust_timers_state_needed() returns true.
 */
681 static const VMStateDescription icount_vmstate_adjust_timers = {
682 .name = "timer/icount/timers",
684 .minimum_version_id = 1,
685 .needed = adjust_timers_state_needed,
686 .fields = (VMStateField[]) {
687 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
688 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
689 VMSTATE_END_OF_LIST()
/*
 * Migration subsection "timer/icount/shift": carries icount_time_shift, and
 * is emitted only when shift_state_needed() returns true.
 */
693 static const VMStateDescription icount_vmstate_shift = {
694 .name = "timer/icount/shift",
696 .minimum_version_id = 1,
697 .needed = shift_state_needed,
698 .fields = (VMStateField[]) {
699 VMSTATE_INT16(icount_time_shift, TimersState),
700 VMSTATE_END_OF_LIST()
/*
 * Migration subsection "timer/icount": carries qemu_icount_bias and
 * qemu_icount, gated by icount_state_needed(), and nests the warp-timer,
 * adjustment-timer and shift subsections.
 */
705 * This is a subsection for icount migration.
707 static const VMStateDescription icount_vmstate_timers = {
708 .name = "timer/icount",
710 .minimum_version_id = 1,
711 .needed = icount_state_needed,
712 .fields = (VMStateField[]) {
713 VMSTATE_INT64(qemu_icount_bias, TimersState),
714 VMSTATE_INT64(qemu_icount, TimersState),
715 VMSTATE_END_OF_LIST()
717 .subsections = (const VMStateDescription*[]) {
718 &icount_vmstate_warp_timer,
719 &icount_vmstate_adjust_timers,
720 &icount_vmstate_shift,
/*
 * Top-level migration description for TimersState, registered in
 * cpu_ticks_init().  Carries cpu_ticks_offset and, from version 2 on,
 * cpu_clock_offset; the icount state travels in a subsection.
 */
725 static const VMStateDescription vmstate_timers = {
728 .minimum_version_id = 1,
729 .fields = (VMStateField[]) {
730 VMSTATE_INT64(cpu_ticks_offset, TimersState),
732 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
733 VMSTATE_END_OF_LIST()
735 .subsections = (const VMStateDescription*[]) {
736 &icount_vmstate_timers,
/*
 * Work item queued on a vCPU by cpu_throttle_timer_tick(): makes the vCPU
 * sleep for pct / (1 - pct) timeslices, where pct is the throttle percentage
 * as a fraction.  Does nothing further when the percentage is 0.
 *
 * The sleep loop ends when the time is up or cpu->stop is set.  Two ways of
 * sleeping are visible: a timed wait on cpu->halt_cond for remaining times
 * above SCALE_MS, and g_usleep() with the iothread lock dropped.  On exit,
 * throttle_thread_scheduled is cleared so the timer may queue the item again.
 */
741 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
744 double throttle_ratio;
745 int64_t sleeptime_ns, endtime_ns;
747 if (!cpu_throttle_get_percentage()) {
751 pct = (double)cpu_throttle_get_percentage()/100;
752 throttle_ratio = pct / (1 - pct);
753 /* Add 1ns to fix double's rounding error (like 0.9999999...) */
754 sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
755 endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
756 while (sleeptime_ns > 0 && !cpu->stop) {
757 if (sleeptime_ns > SCALE_MS) {
758 qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
759 sleeptime_ns / SCALE_MS);
761 qemu_mutex_unlock_iothread();
762 g_usleep(sleeptime_ns / SCALE_US);
763 qemu_mutex_lock_iothread();
765 sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
767 atomic_set(&cpu->throttle_thread_scheduled, 0);
/*
 * throttle_timer callback.  Unless throttling has been switched off, it
 * queues cpu_throttle_thread() on a CPU that does not already have it
 * scheduled (the atomic exchange on throttle_thread_scheduled is the guard).
 * It then re-arms itself CPU_THROTTLE_TIMESLICE_NS / (1 - pct) ahead on
 * QEMU_CLOCK_VIRTUAL_RT.
 */
770 static void cpu_throttle_timer_tick(void *opaque)
775 /* Stop the timer if needed */
776 if (!cpu_throttle_get_percentage()) {
780 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
781 async_run_on_cpu(cpu, cpu_throttle_thread,
786 pct = (double)cpu_throttle_get_percentage()/100;
787 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
788 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
/*
 * Set the throttle percentage.  @new_throttle_pct is clamped into
 * [CPU_THROTTLE_PCT_MIN, CPU_THROTTLE_PCT_MAX] and stored atomically, then
 * throttle_timer is armed one timeslice ahead.
 */
791 void cpu_throttle_set(int new_throttle_pct)
793 /* Ensure throttle percentage is within valid range */
794 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
795 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
797 atomic_set(&throttle_percentage, new_throttle_pct);
799 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
800 CPU_THROTTLE_TIMESLICE_NS);
/*
 * Turn throttling off by storing 0 as the percentage.  The timer callback
 * and the throttle work item both check for 0 before acting.
 */
803 void cpu_throttle_stop(void)
805 atomic_set(&throttle_percentage, 0);
/* Report whether throttling is on, i.e. the percentage is non-zero. */
808 bool cpu_throttle_active(void)
810 return (cpu_throttle_get_percentage() != 0);
/* Atomically read the current throttle percentage (0 when off). */
813 int cpu_throttle_get_percentage(void)
815 return atomic_read(&throttle_percentage);
/*
 * One-time setup of the timekeeping state: initialises the vm_clock seqlock
 * and spinlock, registers timers_state for migration, and creates the
 * throttle timer on QEMU_CLOCK_VIRTUAL_RT.
 */
818 void cpu_ticks_init(void)
820 seqlock_init(&timers_state.vm_clock_seqlock);
821 qemu_spin_init(&timers_state.vm_clock_lock);
822 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
823 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
824 cpu_throttle_timer_tick, NULL);
/*
 * Parse the icount options "shift", "sleep" and "align" from @opts and set
 * up instruction counting.  Problems are reported through @errp.
 *
 * A numeric shift must lie in [0, MAX_ICOUNT_SHIFT].  shift=auto selects the
 * adaptive mode, which starts from a shift of 3 and arms the realtime and
 * virtual-time adjustment timers.  align=on requires sleep=on, and
 * shift=auto may not be combined with align=on or sleep=off.
 */
827 void configure_icount(QemuOpts *opts, Error **errp)
829 const char *option = qemu_opt_get(opts, "shift");
830 bool sleep = qemu_opt_get_bool(opts, "sleep", true);
831 bool align = qemu_opt_get_bool(opts, "align", false);
832 long time_shift = -1;
835 if (qemu_opt_get(opts, "align") != NULL) {
836 error_setg(errp, "Please specify shift option when using align");
841 if (align && !sleep) {
842 error_setg(errp, "align=on and sleep=off are incompatible");
846 if (strcmp(option, "auto") != 0) {
847 if (qemu_strtol(option, NULL, 0, &time_shift) < 0
848 || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
849 error_setg(errp, "icount: Invalid shift value");
/*
 * Validate the options being parsed, not the icount_align_option and
 * icount_sleep globals: those are only assigned further down, so testing
 * them here would look at values from before this call.
 */
852 } else if (align) {
853 error_setg(errp, "shift=auto and align=on are incompatible");
855 } else if (!sleep) {
856 error_setg(errp, "shift=auto and sleep=off are incompatible");
860 icount_sleep = sleep;
862 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
863 icount_timer_cb, NULL);
866 icount_align_option = align;
868 if (time_shift >= 0) {
869 timers_state.icount_time_shift = time_shift;
876 /* 125MIPS seems a reasonable initial guess at the guest speed.
877 It will be corrected fairly quickly anyway. */
878 timers_state.icount_time_shift = 3;
880 /* Have both realtime and virtual time triggers for speed adjustment.
881 The realtime trigger catches emulated time passing too slowly,
882 the virtual time trigger catches emulated time passing too fast.
883 Realtime triggers occur even when idle, so use them less frequently
885 timers_state.vm_clock_warp_start = -1;
886 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
887 icount_adjust_rt, NULL);
888 timer_mod(timers_state.icount_rt_timer,
889 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
890 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
891 icount_adjust_vm, NULL);
892 timer_mod(timers_state.icount_vm_timer,
893 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
894 NANOSECONDS_PER_SECOND / 10);
897 /***********************************************************/
898 /* TCG vCPU kick timer
900 * The kick timer is responsible for moving single threaded vCPU
901 * emulation on to the next vCPU. If more than one vCPU is running a
902 * timer event will force a cpu->exit so the next vCPU can get
905 * The timer is removed if all vCPUs are idle and restarted again once
906 * idleness is complete.
/* Kick timer; created lazily by start_tcg_kick_timer(). */
909 static QEMUTimer *tcg_kick_vcpu_timer;
/* Sampled by qemu_cpu_kick_rr_next_cpu() with atomic_mb_read(). */
910 static CPUState *tcg_current_rr_cpu;
/* Interval between kicks: one tenth of NANOSECONDS_PER_SECOND. */
912 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
/*
 * Expiry time for the next kick: the current QEMU_CLOCK_VIRTUAL time plus
 * TCG_KICK_PERIOD.
 */
914 static inline int64_t qemu_tcg_next_kick(void)
916 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
/*
 * Samples tcg_current_rr_cpu and repeats for as long as the value has
 * changed by the end of the iteration, so the kick lands on whichever vCPU
 * is current once the loop exits.
 */
919 /* Kick the currently round-robin scheduled vCPU to next */
920 static void qemu_cpu_kick_rr_next_cpu(void)
924 cpu = atomic_mb_read(&tcg_current_rr_cpu);
928 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
/*
 * Counterpart of qemu_cpu_kick_rr_next_cpu() that targets every round-robin
 * vCPU rather than only the current one.
 */
931 /* Kick all RR vCPUs */
932 static void qemu_cpu_kick_rr_cpus(void)
/*
 * Work item queued by qemu_timer_notify_cb() via async_run_on_cpu(); per the
 * comment there, queuing it makes cpu_thread_is_idle() return false for the
 * target CPU.
 */
941 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
/*
 * Clock notification hook.  Only acts for QEMU_CLOCK_VIRTUAL while icount is
 * in use.  From a vCPU thread it kicks the current CPU; from any other
 * thread it queues do_nothing() on first_cpu, if there is one, so that a
 * halted CPU leaves its wait.
 */
945 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
947 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
952 if (qemu_in_vcpu_thread()) {
953 /* A CPU is currently running; kick it back out to the
954 * tcg_cpu_exec() loop so it will recalculate its
955 * icount deadline immediately.
957 qemu_cpu_kick(current_cpu);
958 } else if (first_cpu) {
959 /* qemu_cpu_kick is not enough to kick a halted CPU out of
960 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
961 * causes cpu_thread_is_idle to return false. This way,
962 * handle_icount_deadline can run.
963 * If we have no CPUs at all for some reason, we don't
964 * need to do anything.
966 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
/*
 * Callback of tcg_kick_vcpu_timer: re-arms the timer for the next period,
 * then kicks the vCPU currently scheduled by the round-robin loop.
 */
970 static void kick_tcg_thread(void *opaque)
972 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
973 qemu_cpu_kick_rr_next_cpu();
/*
 * Ensure the kick timer is running (single-threaded TCG only; asserts that
 * MTTCG is off).  The timer is created on first use, and only when there is
 * a CPU after first_cpu.  It is armed if it exists and is not pending.
 */
976 static void start_tcg_kick_timer(void)
978 assert(!mttcg_enabled);
979 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
980 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
981 kick_tcg_thread, NULL);
983 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
984 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
/*
 * Cancel the kick timer if it exists and is pending (single-threaded TCG
 * only; asserts that MTTCG is off).
 */
988 static void stop_tcg_kick_timer(void)
990 assert(!mttcg_enabled);
991 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
992 timer_del(tcg_kick_vcpu_timer);
996 /***********************************************************/
/*
 * Report a fatal hardware emulation error on stderr: prints the
 * printf-style message prefixed with "qemu: hardware error: ", followed by
 * a per-CPU state dump (including FPU state) labelled with the CPU index.
 */
997 void hw_error(const char *fmt, ...)
1003 fprintf(stderr, "qemu: hardware error: ");
1004 vfprintf(stderr, fmt, ap);
1005 fprintf(stderr, "\n");
1007 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1008 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
/* Apply cpu_synchronize_state() to the CPUs ("all", per the name). */
1014 void cpu_synchronize_all_states(void)
1019 cpu_synchronize_state(cpu);
/* Apply cpu_synchronize_post_reset() to the CPUs ("all", per the name). */
1023 void cpu_synchronize_all_post_reset(void)
1028 cpu_synchronize_post_reset(cpu);
/* Apply cpu_synchronize_post_init() to the CPUs ("all", per the name). */
1032 void cpu_synchronize_all_post_init(void)
1037 cpu_synchronize_post_init(cpu);
/* Apply cpu_synchronize_pre_loadvm() to the CPUs ("all", per the name). */
1041 void cpu_synchronize_all_pre_loadvm(void)
1046 cpu_synchronize_pre_loadvm(cpu);
/*
 * Stop the VM and enter run state @state.  When the VM is running: sets the
 * run state, freezes the tick/clock counters, notifies state-change
 * listeners, and may emit the QMP STOP event (@send_stop is presumably the
 * gate - confirm).  The result of bdrv_flush_all() is stored in ret.
 */
1050 static int do_vm_stop(RunState state, bool send_stop)
1054 if (runstate_is_running()) {
1055 runstate_set(state);
1056 cpu_disable_ticks();
1058 vm_state_notify(0, state);
1060 qapi_event_send_stop();
1065 ret = bdrv_flush_all();
/*
 * Stop the VM into RUN_STATE_SHUTDOWN, passing send_stop=false to
 * do_vm_stop().  Returns do_vm_stop()'s result.
 */
1070 /* Special vm_stop() variant for terminating the process. Historically clients
1071 * did not expect a QMP STOP event and so we need to retain compatibility.
1073 int vm_shutdown(void)
1075 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
/*
 * Decide whether @cpu may execute guest code.  The stopped state, as
 * reported by cpu_is_stopped(), is one of the conditions examined.
 */
1078 static bool cpu_can_run(CPUState *cpu)
1083 if (cpu_is_stopped(cpu)) {
/*
 * Handle a guest debug exit: records @cpu as the gdb stop CPU, raises a
 * system debug request, and marks the CPU stopped.
 */
1089 static void cpu_handle_guest_debug(CPUState *cpu)
1091 gdb_set_stop_cpu(cpu);
1092 qemu_system_debug_request();
1093 cpu->stopped = true;
/*
 * Restore the default SIGBUS disposition and, when that succeeds, unblock
 * SIGBUS for this thread.  The "Failed to re-raise SIGBUS!" message is
 * printed with perror() if execution gets that far.
 * NOTE(review): perror() appends ": <error>\n" itself, so the "\n" inside
 * the message splits the output across two lines - confirm this is intended.
 */
1097 static void sigbus_reraise(void)
1100 struct sigaction action;
1102 memset(&action, 0, sizeof(action));
1103 action.sa_handler = SIG_DFL;
1104 if (!sigaction(SIGBUS, &action, NULL)) {
1107 sigaddset(&set, SIGBUS);
1108 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1110 perror("Failed to re-raise SIGBUS!\n");
/*
 * SIGBUS handler installed by qemu_init_sigbus().  SIGBUS codes other than
 * BUS_MCEERR_AO / BUS_MCEERR_AR are handled separately from machine-check
 * reports.  Machine-check reports are forwarded to KVM: through
 * kvm_on_sigbus_vcpu() for the asynchronous vCPU-thread case, and through
 * kvm_on_sigbus() for the synchronous main-thread case.
 */
1114 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1116 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1121 /* Called asynchronously in VCPU thread. */
1122 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1126 /* Called synchronously (via signalfd) in main thread. */
1127 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
/*
 * Install sigbus_handler() for SIGBUS with SA_SIGINFO, and issue
 * prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, ...) to set the
 * process's machine-check kill policy.
 */
1133 static void qemu_init_sigbus(void)
1135 struct sigaction action;
1137 memset(&action, 0, sizeof(action));
1138 action.sa_flags = SA_SIGINFO;
1139 action.sa_sigaction = sigbus_handler;
1140 sigaction(SIGBUS, &action, NULL);
1142 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1144 #else /* !CONFIG_LINUX */
/* Variant of qemu_init_sigbus() used when CONFIG_LINUX is not defined. */
1145 static void qemu_init_sigbus(void)
1148 #endif /* !CONFIG_LINUX */
/* Identity of the thread that ran qemu_init_cpu_loop(). */
1150 static QemuThread io_thread;
/* Signalled whenever a vCPU thread sets or clears cpu->created. */
1153 static QemuCond qemu_cpu_cond;
/* Broadcast by qemu_cpu_stop(). */
1155 static QemuCond qemu_pause_cond;
/*
 * Initialise the synchronisation objects used by the vCPU threads (the two
 * condition variables and the global mutex), and record the calling thread
 * as io_thread.
 */
1157 void qemu_init_cpu_loop(void)
1160 qemu_cond_init(&qemu_cpu_cond);
1161 qemu_cond_init(&qemu_pause_cond);
1162 qemu_mutex_init(&qemu_global_mutex);
1164 qemu_thread_get_self(&io_thread);
/*
 * Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * the global mutex.
 */
1167 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1169 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
/*
 * Destroy the KVM vCPU backing @cpu, reporting "kvm_destroy_vcpu failed" if
 * kvm_destroy_vcpu() returns a negative value.
 */
1172 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1174 if (kvm_destroy_vcpu(cpu) < 0) {
1175 error_report("kvm_destroy_vcpu failed");
/* TCG teardown hook; called by deal_with_unplugged_cpus(). */
1180 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
/*
 * Mark @cpu stopped and wake every waiter on qemu_pause_cond.  Must be
 * called on @cpu's own thread (asserted).
 */
1184 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1186 g_assert(qemu_cpu_is_self(cpu));
1188 cpu->stopped = true;
1192 qemu_cond_broadcast(&qemu_pause_cond);
/*
 * Bookkeeping shared by the wait-for-event helpers: clears the
 * thread_kicked flag (with a memory barrier), stops the CPU through
 * qemu_cpu_stop(cpu, false) when required, and runs the work queued for
 * @cpu.
 */
1195 static void qemu_wait_io_event_common(CPUState *cpu)
1197 atomic_mb_set(&cpu->thread_kicked, false);
1199 qemu_cpu_stop(cpu, false);
1201 process_queued_cpu_work(cpu);
/*
 * Wait step of the round-robin TCG thread.  While every vCPU thread is idle
 * it stops the kick timer and sleeps on first_cpu's halt_cond.  Once there
 * is something to do, the kick timer is restarted and the common wait
 * bookkeeping is run.
 */
1204 static void qemu_tcg_rr_wait_io_event(void)
1208 while (all_cpu_threads_idle()) {
1209 stop_tcg_kick_timer();
1210 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1213 start_tcg_kick_timer();
1216 qemu_wait_io_event_common(cpu);
/*
 * Wait step of a per-vCPU thread.  Sleeps on @cpu's halt_cond for as long as
 * cpu_thread_is_idle() holds, calling the plugin idle and resume callbacks
 * around the sleep, then runs the common wait bookkeeping.
 */
1220 static void qemu_wait_io_event(CPUState *cpu)
1224 while (cpu_thread_is_idle(cpu)) {
1227 qemu_plugin_vcpu_idle_cb(cpu);
1229 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1232 qemu_plugin_vcpu_resume_cb(cpu);
1236 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1237 if (!tcg_enabled()) {
1241 qemu_wait_io_event_common(cpu);
/*
 * Thread body of a KVM vCPU.  Under the iothread lock it records the thread
 * identity, initialises the KVM vCPU and its signals, and announces creation
 * through qemu_cpu_cond.  It then alternates kvm_cpu_exec() (when the CPU
 * can run, with EXCP_DEBUG routed to cpu_handle_guest_debug()) with
 * qemu_wait_io_event(), until the CPU is unplugged and can no longer run.
 * On the way out it destroys the vCPU, clears `created` and signals
 * qemu_cpu_cond again.
 */
1244 static void *qemu_kvm_cpu_thread_fn(void *arg)
1246 CPUState *cpu = arg;
1249 rcu_register_thread();
1251 qemu_mutex_lock_iothread();
1252 qemu_thread_get_self(cpu->thread);
1253 cpu->thread_id = qemu_get_thread_id();
1257 r = kvm_init_vcpu(cpu);
1259 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1263 kvm_init_cpu_signals(cpu);
1265 /* signal CPU creation */
1266 cpu->created = true;
1267 qemu_cond_signal(&qemu_cpu_cond);
1268 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1271 if (cpu_can_run(cpu)) {
1272 r = kvm_cpu_exec(cpu);
1273 if (r == EXCP_DEBUG) {
1274 cpu_handle_guest_debug(cpu);
1277 qemu_wait_io_event(cpu);
1278 } while (!cpu->unplug || cpu_can_run(cpu));
1280 qemu_kvm_destroy_vcpu(cpu);
1281 cpu->created = false;
1282 qemu_cond_signal(&qemu_cpu_cond);
1283 qemu_mutex_unlock_iothread();
1284 rcu_unregister_thread();
/*
 * Thread body of a vCPU that executes no guest code (its error message ties
 * it to qtest).  After announcing creation, each iteration drops the
 * iothread lock, blocks in sigwait() for SIG_IPI, retakes the lock and
 * services pending work through qemu_wait_io_event(), until the CPU is
 * unplugged.
 * NOTE(review): POSIX sigwait() reports failure through its return value (an
 * error number) rather than -1/errno, so the retry condition on r == -1
 * looks like it can never be true - confirm.
 */
1288 static void *qemu_dummy_cpu_thread_fn(void *arg)
1291 error_report("qtest is not supported under Windows");
1294 CPUState *cpu = arg;
1298 rcu_register_thread();
1300 qemu_mutex_lock_iothread();
1301 qemu_thread_get_self(cpu->thread);
1302 cpu->thread_id = qemu_get_thread_id();
1306 sigemptyset(&waitset);
1307 sigaddset(&waitset, SIG_IPI);
1309 /* signal CPU creation */
1310 cpu->created = true;
1311 qemu_cond_signal(&qemu_cpu_cond);
1312 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1315 qemu_mutex_unlock_iothread();
1318 r = sigwait(&waitset, &sig);
1319 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1324 qemu_mutex_lock_iothread();
1325 qemu_wait_io_event(cpu);
1326 } while (!cpu->unplug);
1328 qemu_mutex_unlock_iothread();
1329 rcu_unregister_thread();
/*
 * Instruction budget for the next execution slice.  Outside replay-play mode
 * it is the soonest QEMU_CLOCK_VIRTUAL / QEMU_CLOCK_REALTIME timer deadline,
 * replaced by INT32_MAX when negative or larger than INT32_MAX, and
 * converted to instructions with qemu_icount_round().  The other visible
 * return takes the budget from replay_get_instructions().
 */
1334 static int64_t tcg_get_icount_limit(void)
1338 if (replay_mode != REPLAY_MODE_PLAY) {
1340 * Include all the timers, because they may need an attention.
1341 * Too long CPU execution may create unnecessary delay in UI.
1343 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1344 QEMU_TIMER_ATTR_ALL);
1345 /* Check realtime timers, because they help with input processing */
1346 deadline = qemu_soonest_timeout(deadline,
1347 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1348 QEMU_TIMER_ATTR_ALL));
1350 /* Maintain prior (possibly buggy) behaviour where if no deadline
1351 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1352 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1355 if ((deadline < 0) || (deadline > INT32_MAX)) {
1356 deadline = INT32_MAX;
1359 return qemu_icount_round(deadline);
1361 return replay_get_instructions();
/*
 * Notify QEMU_CLOCK_VIRTUAL and then run its expired timers from the
 * calling thread.
 */
1365 static void notify_aio_contexts(void)
1367 /* Wake up other AioContexts. */
1368 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1369 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
/*
 * Must be called from a vCPU thread (asserted).  Reads the
 * QEMU_CLOCK_VIRTUAL deadline across all timer attributes and, when it is
 * exactly 0, calls notify_aio_contexts().
 */
1372 static void handle_icount_deadline(void)
1374 assert(qemu_in_vcpu_thread());
1376 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1377 QEMU_TIMER_ATTR_ALL);
1379 if (deadline == 0) {
1380 notify_aio_contexts();
/*
 * Load the icount counters of @cpu before a TCG execution slice.
 *
 * Asserts that icount_decr.u16.low and cpu->icount_extra are both 0, then
 * fetches a new budget from tcg_get_icount_limit().  At most 0xffff
 * instructions go into the 16-bit icount_decr.u16.low field; the rest of
 * the budget is kept in cpu->icount_extra.
 *
 * Takes the replay mutex; process_icount_data() releases it.  If the
 * budget is 0 and replay_has_checkpoint() is true, notify_aio_contexts()
 * is called.
 */
1385 static void prepare_icount_for_run(CPUState *cpu)
1390 /* These should always be cleared by process_icount_data after
1391 * each vCPU execution. However u16.high can be raised
1392 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1394 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1395 g_assert(cpu->icount_extra == 0);
1397 cpu->icount_budget = tcg_get_icount_limit();
1398 insns_left = MIN(0xffff, cpu->icount_budget);
1399 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1400 cpu->icount_extra = cpu->icount_budget - insns_left;
1402 replay_mutex_lock();
1404 if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
1405 notify_aio_contexts();
/*
 * Counterpart of prepare_icount_for_run(), called after a TCG execution
 * slice of @cpu.
 *
 * Folds the executed instructions into the global count with
 * cpu_update_icount(), zeroes icount_decr.u16.low, cpu->icount_extra and
 * cpu->icount_budget, calls replay_account_executed_instructions() and
 * finally releases the replay mutex.
 */
1410 static void process_icount_data(CPUState *cpu)
1413 /* Account for executed instructions */
1414 cpu_update_icount(cpu);
1416 /* Reset the counters */
1417 cpu_neg(cpu)->icount_decr.u16.low = 0;
1418 cpu->icount_extra = 0;
1419 cpu->icount_budget = 0;
1421 replay_account_executed_instructions();
/* Pairs with the replay_mutex_lock() in prepare_icount_for_run(). */
1423 replay_mutex_unlock();
/*
 * Run guest code for @cpu: asserts that TCG is enabled, calls
 * cpu_exec_start() and then cpu_exec().
 *
 * When CONFIG_PROFILER is defined, the time spent (measured with
 * profile_getclock()) is added to tcg_ctx->prof.cpu_exec_time.
 *
 * NOTE(review): callers compare the result with EXCP_DEBUG and
 * EXCP_ATOMIC, so presumably the value of cpu_exec() is what is
 * returned -- confirm.
 */
1428 static int tcg_cpu_exec(CPUState *cpu)
1431 #ifdef CONFIG_PROFILER
1435 assert(tcg_enabled());
1436 #ifdef CONFIG_PROFILER
1437 ti = profile_getclock();
1439 cpu_exec_start(cpu);
1440 ret = cpu_exec(cpu);
1442 #ifdef CONFIG_PROFILER
1443 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1444 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
/*
 * For a vCPU with cpu->unplug set that can no longer run
 * (!cpu_can_run()), destroy it with qemu_tcg_destroy_vcpu(), clear
 * cpu->created and signal qemu_cpu_cond.
 * NOTE(review): presumably applied to every vCPU in turn -- confirm.
 */
1449 /* Destroy any remaining vCPUs which have been unplugged and have
1452 static void deal_with_unplugged_cpus(void)
1457 if (cpu->unplug && !cpu_can_run(cpu)) {
1458 qemu_tcg_destroy_vcpu(cpu);
1459 cpu->created = false;
1460 qemu_cond_signal(&qemu_cpu_cond);
/*
 * Thread entry point for round-robin (single-threaded) TCG.
 *
 * @arg: the CPUState the thread was created for.
 *
 * Start-up: registers the thread with RCU and TCG, takes the iothread
 * lock, records the thread identity in @cpu, sets cpu->created, signals
 * qemu_cpu_cond, and then waits on first_cpu->halt_cond while
 * first_cpu->stopped is set, handling pending work with
 * qemu_wait_io_event_common().  The kick timer is started and
 * cpu->exit_request is set to 1 before the main loop.
 *
 * Main loop, per iteration:
 *  - drops the iothread lock, takes the replay mutex, then re-takes the
 *    iothread lock (so the replay mutex is acquired first);
 *  - calls qemu_account_warp_timer() and handle_icount_deadline(), then
 *    releases the replay mutex;
 *  - walks the vCPUs with CPU_NEXT() while the current one has an empty
 *    work list and no exit_request.  The current vCPU is published in
 *    tcg_current_rr_cpu and QEMU_CLOCK_VIRTUAL is enabled unless
 *    SSTEP_NOTIMER is set in cpu->singlestep_enabled.  A runnable vCPU is
 *    executed outside the iothread lock between prepare_icount_for_run()
 *    and process_icount_data(); EXCP_DEBUG is handed to
 *    cpu_handle_guest_debug() and EXCP_ATOMIC is retried with
 *    cpu_exec_step_atomic(), again outside the lock;
 *  - clears tcg_current_rr_cpu and any pending cpu->exit_request;
 *  - with use_icount, calls qemu_notify_event() when all vCPU threads
 *    are idle;
 *  - calls qemu_tcg_rr_wait_io_event() and deal_with_unplugged_cpus().
 *
 * The last statement unregisters the thread from RCU.
 */
1466 /* Single-threaded TCG
1468 * In the single-threaded case each vCPU is simulated in turn. If
1469 * there is more than a single vCPU we create a simple timer to kick
1470 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1471 * This is done explicitly rather than relying on side-effects
1475 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1477 CPUState *cpu = arg;
1479 assert(tcg_enabled());
1480 rcu_register_thread();
1481 tcg_register_thread();
1483 qemu_mutex_lock_iothread();
1484 qemu_thread_get_self(cpu->thread);
1486 cpu->thread_id = qemu_get_thread_id();
1487 cpu->created = true;
1489 qemu_cond_signal(&qemu_cpu_cond);
1490 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1492 /* wait for initial kick-off after machine start */
1493 while (first_cpu->stopped) {
1494 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1496 /* process any pending work */
1499 qemu_wait_io_event_common(cpu);
1503 start_tcg_kick_timer();
1507 /* process any pending work */
1508 cpu->exit_request = 1;
1511 qemu_mutex_unlock_iothread();
1512 replay_mutex_lock();
1513 qemu_mutex_lock_iothread();
1514 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1515 qemu_account_warp_timer();
1517 /* Run the timers here. This is much more efficient than
1518 * waking up the I/O thread and waiting for completion.
1520 handle_icount_deadline();
1522 replay_mutex_unlock();
1528 while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
1530 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1533 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1534 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1536 if (cpu_can_run(cpu)) {
1539 qemu_mutex_unlock_iothread();
1540 prepare_icount_for_run(cpu);
1542 r = tcg_cpu_exec(cpu);
1544 process_icount_data(cpu);
1545 qemu_mutex_lock_iothread();
1547 if (r == EXCP_DEBUG) {
1548 cpu_handle_guest_debug(cpu);
1550 } else if (r == EXCP_ATOMIC) {
1551 qemu_mutex_unlock_iothread();
1552 cpu_exec_step_atomic(cpu);
1553 qemu_mutex_lock_iothread();
1556 } else if (cpu->stop) {
1558 cpu = CPU_NEXT(cpu);
1563 cpu = CPU_NEXT(cpu);
1564 } /* while (cpu && !cpu->exit_request).. */
1566 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1567 atomic_set(&tcg_current_rr_cpu, NULL);
1569 if (cpu && cpu->exit_request) {
1570 atomic_mb_set(&cpu->exit_request, 0);
1573 if (use_icount && all_cpu_threads_idle()) {
1575 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1576 * in the main_loop, wake it up in order to start the warp timer.
1578 qemu_notify_event();
1581 qemu_tcg_rr_wait_io_event();
1582 deal_with_unplugged_cpus();
1585 rcu_unregister_thread();
/*
 * Thread entry point for a HAX vCPU.
 *
 * @arg: the CPUState served by this thread.
 *
 * Registers with RCU, takes the iothread lock, records the thread
 * identity in @cpu, sets cpu->created and signals qemu_cpu_cond.  The
 * loop then runs the guest with hax_smp_cpu_exec() whenever
 * cpu_can_run() is true, forwards EXCP_DEBUG to cpu_handle_guest_debug()
 * and sleeps in qemu_wait_io_event().  It ends once cpu->unplug is set
 * and the vCPU can no longer run.
 *
 * NOTE(review): unlike the HVF, WHPX and TCG thread functions in this
 * file, the exit path here shows no qemu_mutex_unlock_iothread(), no
 * vCPU destroy call and no "cpu->created = false" before
 * rcu_unregister_thread() -- confirm whether that is intentional.
 */
1589 static void *qemu_hax_cpu_thread_fn(void *arg)
1591 CPUState *cpu = arg;
1594 rcu_register_thread();
1595 qemu_mutex_lock_iothread();
1596 qemu_thread_get_self(cpu->thread);
1598 cpu->thread_id = qemu_get_thread_id();
1599 cpu->created = true;
1603 qemu_cond_signal(&qemu_cpu_cond);
1604 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1607 if (cpu_can_run(cpu)) {
1608 r = hax_smp_cpu_exec(cpu);
1609 if (r == EXCP_DEBUG) {
1610 cpu_handle_guest_debug(cpu);
1614 qemu_wait_io_event(cpu);
1615 } while (!cpu->unplug || cpu_can_run(cpu));
1616 rcu_unregister_thread();
/*
 * Thread entry point for an HVF vCPU.
 *
 * @arg: the CPUState served by this thread.
 *
 * Asserts that HVF is enabled, registers with RCU, takes the iothread
 * lock and records the thread identity in @cpu.  After signalling
 * creation through cpu->created and qemu_cpu_cond, it loops: run the
 * guest with hvf_vcpu_exec() when cpu_can_run() is true, forward
 * EXCP_DEBUG to cpu_handle_guest_debug(), then sleep in
 * qemu_wait_io_event().  The loop ends once cpu->unplug is set and the
 * vCPU can no longer run.
 *
 * Teardown: hvf_vcpu_destroy(), cpu->created cleared, qemu_cpu_cond
 * signalled, iothread lock released, RCU unregistered.
 */
1620 /* The HVF-specific vCPU thread function. This one should only run when the host
1621 * CPU supports the VMX "unrestricted guest" feature. */
1622 static void *qemu_hvf_cpu_thread_fn(void *arg)
1624 CPUState *cpu = arg;
1628 assert(hvf_enabled());
1630 rcu_register_thread();
1632 qemu_mutex_lock_iothread();
1633 qemu_thread_get_self(cpu->thread);
1635 cpu->thread_id = qemu_get_thread_id();
1641 /* signal CPU creation */
1642 cpu->created = true;
1643 qemu_cond_signal(&qemu_cpu_cond);
1644 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1647 if (cpu_can_run(cpu)) {
1648 r = hvf_vcpu_exec(cpu);
1649 if (r == EXCP_DEBUG) {
1650 cpu_handle_guest_debug(cpu);
1653 qemu_wait_io_event(cpu);
1654 } while (!cpu->unplug || cpu_can_run(cpu));
/* Teardown mirrors start-up: destroy, then tell waiters the vCPU is gone. */
1656 hvf_vcpu_destroy(cpu);
1657 cpu->created = false;
1658 qemu_cond_signal(&qemu_cpu_cond);
1659 qemu_mutex_unlock_iothread();
1660 rcu_unregister_thread();
/*
 * Thread entry point for a WHPX vCPU.
 *
 * @arg: the CPUState served by this thread.
 *
 * Registers with RCU, takes the iothread lock, records the thread
 * identity in @cpu and initialises the vCPU with whpx_init_vcpu().  A
 * failure is reported on stderr using strerror(-r), i.e. the result is
 * treated as a negative errno value.
 *
 * After signalling creation through cpu->created and qemu_cpu_cond, it
 * loops: run the guest with whpx_vcpu_exec() when cpu_can_run() is true,
 * forward EXCP_DEBUG to cpu_handle_guest_debug(), wait on cpu->halt_cond
 * while cpu_thread_is_idle(), then call qemu_wait_io_event_common().
 * The loop ends once cpu->unplug is set and the vCPU can no longer run.
 *
 * Teardown: whpx_destroy_vcpu(), cpu->created cleared, qemu_cpu_cond
 * signalled, iothread lock released, RCU unregistered.
 */
1664 static void *qemu_whpx_cpu_thread_fn(void *arg)
1666 CPUState *cpu = arg;
1669 rcu_register_thread();
1671 qemu_mutex_lock_iothread();
1672 qemu_thread_get_self(cpu->thread);
1673 cpu->thread_id = qemu_get_thread_id();
1676 r = whpx_init_vcpu(cpu);
1678 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1682 /* signal CPU creation */
1683 cpu->created = true;
1684 qemu_cond_signal(&qemu_cpu_cond);
1685 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1688 if (cpu_can_run(cpu)) {
1689 r = whpx_vcpu_exec(cpu);
1690 if (r == EXCP_DEBUG) {
1691 cpu_handle_guest_debug(cpu);
/* Open-coded idle wait; the other accelerators call qemu_wait_io_event(). */
1694 while (cpu_thread_is_idle(cpu)) {
1695 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1697 qemu_wait_io_event_common(cpu);
1698 } while (!cpu->unplug || cpu_can_run(cpu));
1700 whpx_destroy_vcpu(cpu);
1701 cpu->created = false;
1702 qemu_cond_signal(&qemu_cpu_cond);
1703 qemu_mutex_unlock_iothread();
1704 rcu_unregister_thread();
/*
 * APC callback that qemu_cpu_kick_thread() queues on a vCPU thread with
 * QueueUserAPC().  @unused is the ULONG_PTR argument of the APC.
 * NOTE(review): presumably the function does nothing and exists only so
 * that queuing it wakes the target thread -- confirm.
 */
1709 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
/*
 * Thread entry point for one vCPU under multi-threaded TCG.
 *
 * @arg: the CPUState served by this thread.
 *
 * Asserts that TCG is enabled and that use_icount is off, registers the
 * thread with RCU and TCG, takes the iothread lock, records the thread
 * identity in @cpu, sets cpu->created and signals qemu_cpu_cond.
 * cpu->exit_request is set to 1 before the loop.
 *
 * Loop: when cpu_can_run() is true the guest is executed with
 * tcg_cpu_exec() outside the iothread lock.  The visible result handling
 * calls cpu_handle_guest_debug(), asserts cpu->halted, and runs
 * cpu_exec_step_atomic() outside the lock.
 * NOTE(review): presumably these correspond to distinct EXCP_* results
 * of tcg_cpu_exec() -- confirm which.
 * Each iteration then clears cpu->exit_request and sleeps in
 * qemu_wait_io_event().  The loop ends once cpu->unplug is set and the
 * vCPU can no longer run.
 *
 * Teardown: qemu_tcg_destroy_vcpu(), cpu->created cleared, qemu_cpu_cond
 * signalled, iothread lock released, RCU unregistered.
 */
1714 /* Multi-threaded TCG
1716 * In the multi-threaded case each vCPU has its own thread. The TLS
1717 * variable current_cpu can be used deep in the code to find the
1718 * current CPUState for a given thread.
1721 static void *qemu_tcg_cpu_thread_fn(void *arg)
1723 CPUState *cpu = arg;
1725 assert(tcg_enabled());
1726 g_assert(!use_icount);
1728 rcu_register_thread();
1729 tcg_register_thread();
1731 qemu_mutex_lock_iothread();
1732 qemu_thread_get_self(cpu->thread);
1734 cpu->thread_id = qemu_get_thread_id();
1735 cpu->created = true;
1738 qemu_cond_signal(&qemu_cpu_cond);
1739 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1741 /* process any pending work */
1742 cpu->exit_request = 1;
1745 if (cpu_can_run(cpu)) {
1747 qemu_mutex_unlock_iothread();
1748 r = tcg_cpu_exec(cpu);
1749 qemu_mutex_lock_iothread();
1752 cpu_handle_guest_debug(cpu);
1755 /* during start-up the vCPU is reset and the thread is
1756 * kicked several times. If we don't ensure we go back
1757 * to sleep in the halted state we won't cleanly
1758 * start-up when the vCPU is enabled.
1760 * cpu->halted should ensure we sleep in wait_io_event
1762 g_assert(cpu->halted);
1765 qemu_mutex_unlock_iothread();
1766 cpu_exec_step_atomic(cpu);
1767 qemu_mutex_lock_iothread();
1769 /* Ignore everything else? */
1774 atomic_mb_set(&cpu->exit_request, 0);
1775 qemu_wait_io_event(cpu);
1776 } while (!cpu->unplug || cpu_can_run(cpu));
1778 qemu_tcg_destroy_vcpu(cpu);
1779 cpu->created = false;
1780 qemu_cond_signal(&qemu_cpu_cond);
1781 qemu_mutex_unlock_iothread();
1782 rcu_unregister_thread();
/*
 * Interrupt the host thread that runs @cpu.
 *
 * Signal-based path: cpu->thread_kicked is tested first and then set,
 * after which SIG_IPI is sent with pthread_kill().  Any error other than
 * ESRCH is reported on stderr.
 * NOTE(review): presumably an already-kicked vCPU returns early so that
 * only one signal is pending at a time -- confirm.
 *
 * Windows-API path: when the caller is not the vCPU thread itself, the
 * vCPU is kicked with whpx_vcpu_kick() if WHPX is enabled, otherwise by
 * queuing dummy_apc_func on cpu->hThread.  A QueueUserAPC() failure is
 * reported with GetLastError().
 */
1786 static void qemu_cpu_kick_thread(CPUState *cpu)
1791 if (cpu->thread_kicked) {
1794 cpu->thread_kicked = true;
1795 err = pthread_kill(cpu->thread->thread, SIG_IPI);
/* ESRCH is deliberately not treated as an error. */
1796 if (err && err != ESRCH) {
1797 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1801 if (!qemu_cpu_is_self(cpu)) {
1802 if (whpx_enabled()) {
1803 whpx_vcpu_kick(cpu);
1804 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1805 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1806 __func__, GetLastError());
/*
 * Wake up @cpu.
 *
 * Always broadcasts cpu->halt_cond first.  Under TCG the code then tests
 * qemu_tcg_mttcg_enabled() and calls qemu_cpu_kick_rr_cpus().
 * NOTE(review): presumably the round-robin kick is the non-MTTCG
 * branch -- confirm.
 * The remaining statements set cpu->exit_request when HAX is enabled and
 * call qemu_cpu_kick_thread().
 * NOTE(review): presumably these form the non-TCG branch -- confirm.
 */
1813 void qemu_cpu_kick(CPUState *cpu)
1815 qemu_cond_broadcast(cpu->halt_cond);
1816 if (tcg_enabled()) {
1817 if (qemu_tcg_mttcg_enabled()) {
1820 qemu_cpu_kick_rr_cpus();
1823 if (hax_enabled()) {
1825 * FIXME: race condition with the exit_request check in
1828 cpu->exit_request = 1;
1830 qemu_cpu_kick_thread(cpu);
/*
 * Kick the vCPU bound to the calling thread.  current_cpu must be
 * non-NULL (asserted); it is passed to qemu_cpu_kick_thread().
 */
1834 void qemu_cpu_kick_self(void)
1836 assert(current_cpu);
1837 qemu_cpu_kick_thread(current_cpu);
/*
 * Returns: the result of qemu_thread_is_self() for cpu->thread, i.e.
 * whether the calling thread is the one recorded for @cpu.
 */
1840 bool qemu_cpu_is_self(CPUState *cpu)
1842 return qemu_thread_is_self(cpu->thread);
/*
 * Returns: true when current_cpu is set and the calling thread is that
 * vCPU's thread according to qemu_cpu_is_self(); false otherwise.
 */
1845 bool qemu_in_vcpu_thread(void)
1847 return current_cpu && qemu_cpu_is_self(current_cpu);
/*
 * Per-thread flag (declared __thread).  It is set by
 * qemu_mutex_lock_iothread_impl() and cleared by
 * qemu_mutex_unlock_iothread().
 */
1850 static __thread bool iothread_locked = false;
/*
 * Returns: the calling thread's iothread_locked flag.
 */
1852 bool qemu_mutex_iothread_locked(void)
1854 return iothread_locked;
/*
 * Acquire qemu_global_mutex for the calling thread.
 *
 * @file: source file of the call site, forwarded to the lock function.
 * @line: source line of the call site, forwarded to the lock function.
 *
 * The lock function is read atomically from qemu_bql_mutex_lock_func.
 * The calling thread must not already hold the lock (asserted through
 * qemu_mutex_iothread_locked()).  On return the thread's iothread_locked
 * flag is true.
 */
1858 * The BQL is taken from so many places that it is worth profiling the
1859 * callers directly, instead of funneling them all through a single function.
1861 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1863 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1865 g_assert(!qemu_mutex_iothread_locked());
1866 bql_lock(&qemu_global_mutex, file, line);
1867 iothread_locked = true;
/*
 * Release qemu_global_mutex.  The calling thread must currently hold it
 * (asserted through qemu_mutex_iothread_locked()).  The thread's
 * iothread_locked flag is cleared before the mutex is released.
 */
1870 void qemu_mutex_unlock_iothread(void)
1872 g_assert(qemu_mutex_iothread_locked());
1873 iothread_locked = false;
1874 qemu_mutex_unlock(&qemu_global_mutex);
/*
 * Wait on @cond with qemu_global_mutex as the associated mutex.
 */
1877 void qemu_cond_wait_iothread(QemuCond *cond)
1879 qemu_cond_wait(cond, &qemu_global_mutex);
/*
 * Tests the cpu->stopped flag of vCPUs.  pause_all_vcpus() waits until
 * this function returns true.
 * NOTE(review): presumably returns false as soon as one vCPU is not
 * stopped and true otherwise -- confirm.
 */
1882 static bool all_vcpus_paused(void)
1887 if (!cpu->stopped) {
/*
 * Stop vCPUs and wait until all_vcpus_paused() reports true.
 *
 * Disables QEMU_CLOCK_VIRTUAL first.  A vCPU whose thread is the caller
 * is stopped directly with qemu_cpu_stop(cpu, true).
 * NOTE(review): presumably the other vCPUs are asked to stop and then
 * kicked -- confirm.
 *
 * The replay mutex is released while waiting on qemu_pause_cond (with
 * qemu_global_mutex).  Afterwards the iothread lock is dropped, the
 * replay mutex is re-taken and the iothread lock is re-acquired, so the
 * replay mutex is always taken before the iothread lock.
 */
1895 void pause_all_vcpus(void)
1899 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1901 if (qemu_cpu_is_self(cpu)) {
1902 qemu_cpu_stop(cpu, true);
1909 /* We need to drop the replay_lock so any vCPU threads woken up
1910 * can finish their replay tasks
1912 replay_mutex_unlock();
1914 while (!all_vcpus_paused()) {
1915 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1921 qemu_mutex_unlock_iothread();
1922 replay_mutex_lock();
1923 qemu_mutex_lock_iothread();
/*
 * Resume @cpu: clears cpu->stopped.
 * NOTE(review): presumably also clears the stop request and kicks the
 * vCPU -- confirm.
 */
1926 void cpu_resume(CPUState *cpu)
1929 cpu->stopped = false;
/*
 * Counterpart of pause_all_vcpus(): checks runstate_is_running() and
 * re-enables QEMU_CLOCK_VIRTUAL.
 * NOTE(review): presumably returns early when the VM is not running and
 * otherwise resumes every vCPU -- confirm.
 */
1933 void resume_all_vcpus(void)
1937 if (!runstate_is_running()) {
1941 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
/*
 * Wait for the thread of @cpu to exit.  The iothread lock is released
 * around qemu_thread_join() and re-acquired afterwards; the vCPU thread
 * functions in this file take that lock, so joining while holding it
 * could block them.
 * NOTE(review): presumably the unplug request is raised before the
 * join -- confirm.
 */
1947 void cpu_remove_sync(CPUState *cpu)
1952 qemu_mutex_unlock_iothread();
1953 qemu_thread_join(cpu->thread);
1954 qemu_mutex_lock_iothread();
1957 /* Size in bytes of the stack buffers used to format vCPU thread names */
1958 #define VCPU_THREAD_NAME_SIZE 16
/*
 * Set up the host thread that will run @cpu under TCG.
 *
 * Function-local statics remember the thread and halt condition shared
 * by all vCPUs in the single-threaded case, and whether the TCG regions
 * have been initialised (done once, on the first call).
 *
 * With MTTCG, or when no shared thread exists yet, a QemuThread and a
 * QemuCond are allocated for @cpu and a joinable thread is created:
 *  - MTTCG: named "CPU %d/TCG", running qemu_tcg_cpu_thread_fn, with
 *    parallel_cpus set to true;
 *  - otherwise: named "ALL CPUs/TCG", running qemu_tcg_rr_cpu_thread_fn;
 *    its thread and halt condition are saved as the shared ones.
 *
 * A vCPU that reuses the shared thread gets cpu->thread and
 * cpu->halt_cond from the saved values and cpu->thread_id from
 * first_cpu.  cpu->created is then set here rather than by a thread
 * function.
 */
1960 static void qemu_tcg_init_vcpu(CPUState *cpu)
1962 char thread_name[VCPU_THREAD_NAME_SIZE];
1963 static QemuCond *single_tcg_halt_cond;
1964 static QemuThread *single_tcg_cpu_thread;
1965 static int tcg_region_inited;
1967 assert(tcg_enabled());
1969 * Initialize TCG regions--once. Now is a good time, because:
1970 * (1) TCG's init context, prologue and target globals have been set up.
1971 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1972 * -accel flag is processed, so the check doesn't work then).
1974 if (!tcg_region_inited) {
1975 tcg_region_inited = 1;
1979 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1980 cpu->thread = g_malloc0(sizeof(QemuThread));
1981 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1982 qemu_cond_init(cpu->halt_cond);
1984 if (qemu_tcg_mttcg_enabled()) {
1985 /* create a thread per vCPU with TCG (MTTCG) */
1986 parallel_cpus = true;
1987 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1990 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1991 cpu, QEMU_THREAD_JOINABLE);
1994 /* share a single thread for all cpus with TCG */
1995 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1996 qemu_thread_create(cpu->thread, thread_name,
1997 qemu_tcg_rr_cpu_thread_fn,
1998 cpu, QEMU_THREAD_JOINABLE);
/* Remember the shared objects so later vCPUs can attach to them. */
2000 single_tcg_halt_cond = cpu->halt_cond;
2001 single_tcg_cpu_thread = cpu->thread;
2004 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2007 /* For non-MTTCG cases we share the thread */
2008 cpu->thread = single_tcg_cpu_thread;
2009 cpu->halt_cond = single_tcg_halt_cond;
2010 cpu->thread_id = first_cpu->thread_id;
2012 cpu->created = true;
/*
 * Allocate and initialise cpu->thread and cpu->halt_cond, then create a
 * joinable thread named "CPU %d/HAX" that runs qemu_hax_cpu_thread_fn
 * with @cpu as its argument.  cpu->hThread receives the handle returned
 * by qemu_thread_get_handle().
 */
2016 static void qemu_hax_start_vcpu(CPUState *cpu)
2018 char thread_name[VCPU_THREAD_NAME_SIZE];
2020 cpu->thread = g_malloc0(sizeof(QemuThread));
2021 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2022 qemu_cond_init(cpu->halt_cond);
2024 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2026 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2027 cpu, QEMU_THREAD_JOINABLE);
2029 cpu->hThread = qemu_thread_get_handle(cpu->thread);
/*
 * Allocate and initialise cpu->thread and cpu->halt_cond, then create a
 * joinable thread named "CPU %d/KVM" that runs qemu_kvm_cpu_thread_fn
 * with @cpu as its argument.
 */
2033 static void qemu_kvm_start_vcpu(CPUState *cpu)
2035 char thread_name[VCPU_THREAD_NAME_SIZE];
2037 cpu->thread = g_malloc0(sizeof(QemuThread));
2038 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2039 qemu_cond_init(cpu->halt_cond);
2040 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2042 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2043 cpu, QEMU_THREAD_JOINABLE);
/*
 * Requires HVF to be enabled (asserted).  Allocates and initialises
 * cpu->thread and cpu->halt_cond, then creates a joinable thread named
 * "CPU %d/HVF" that runs qemu_hvf_cpu_thread_fn with @cpu as its
 * argument.
 */
2046 static void qemu_hvf_start_vcpu(CPUState *cpu)
2048 char thread_name[VCPU_THREAD_NAME_SIZE];
2050 /* HVF currently does not support TCG, and only runs in
2051 * unrestricted-guest mode. */
2052 assert(hvf_enabled());
2054 cpu->thread = g_malloc0(sizeof(QemuThread));
2055 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2056 qemu_cond_init(cpu->halt_cond);
2058 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2060 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2061 cpu, QEMU_THREAD_JOINABLE);
/*
 * Allocate and initialise cpu->thread and cpu->halt_cond, then create a
 * joinable thread named "CPU %d/WHPX" that runs qemu_whpx_cpu_thread_fn
 * with @cpu as its argument.  cpu->hThread receives the handle returned
 * by qemu_thread_get_handle().
 */
2064 static void qemu_whpx_start_vcpu(CPUState *cpu)
2066 char thread_name[VCPU_THREAD_NAME_SIZE];
2068 cpu->thread = g_malloc0(sizeof(QemuThread));
2069 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2070 qemu_cond_init(cpu->halt_cond);
2071 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2073 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2074 cpu, QEMU_THREAD_JOINABLE);
2076 cpu->hThread = qemu_thread_get_handle(cpu->thread);
/*
 * Allocate and initialise cpu->thread and cpu->halt_cond, then create a
 * joinable thread named "CPU %d/DUMMY" that runs
 * qemu_dummy_cpu_thread_fn with @cpu as its argument.
 */
2080 static void qemu_dummy_start_vcpu(CPUState *cpu)
2082 char thread_name[VCPU_THREAD_NAME_SIZE];
2084 cpu->thread = g_malloc0(sizeof(QemuThread));
2085 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2086 qemu_cond_init(cpu->halt_cond);
2087 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2089 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2090 QEMU_THREAD_JOINABLE);
/*
 * Initialise the generic state of @cpu and start its host thread.
 *
 * Copies the core and thread counts from the machine's smp settings,
 * marks the vCPU as stopped and obtains its random seed from
 * qemu_guest_random_seed_thread_part1().  Address space 0 is initialised
 * as "cpu-memory" on cpu->memory (per the existing comment, when the
 * target has not set up any address space itself).
 *
 * The start routine is chosen by the first enabled accelerator, tested
 * in this order: KVM, HAX, HVF, TCG, WHPX; the dummy start routine is
 * the fallback.  The function then waits on qemu_cpu_cond (with
 * qemu_global_mutex) until cpu->created is set.
 */
2093 void qemu_init_vcpu(CPUState *cpu)
2095 MachineState *ms = MACHINE(qdev_get_machine());
2097 cpu->nr_cores = ms->smp.cores;
2098 cpu->nr_threads = ms->smp.threads;
2099 cpu->stopped = true;
2100 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2103 /* If the target cpu hasn't set up any address spaces itself,
2104 * give it the default one.
2107 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2110 if (kvm_enabled()) {
2111 qemu_kvm_start_vcpu(cpu);
2112 } else if (hax_enabled()) {
2113 qemu_hax_start_vcpu(cpu);
2114 } else if (hvf_enabled()) {
2115 qemu_hvf_start_vcpu(cpu);
2116 } else if (tcg_enabled()) {
2117 qemu_tcg_init_vcpu(cpu);
2118 } else if (whpx_enabled()) {
2119 qemu_whpx_start_vcpu(cpu);
2121 qemu_dummy_start_vcpu(cpu);
2124 while (!cpu->created) {
2125 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
/*
 * Request a stop of the vCPU bound to the calling thread: sets
 * current_cpu->stop and calls cpu_exit() on it.
 * NOTE(review): presumably guarded by a check that current_cpu is
 * non-NULL -- confirm.
 */
2129 void cpu_stop_current(void)
2132 current_cpu->stop = true;
2133 cpu_exit(current_cpu);
/*
 * Stop the VM, moving it to run state @state.
 *
 * From a vCPU thread the stop is only requested, through
 * qemu_system_vmstop_request_prepare() and qemu_system_vmstop_request().
 * From any other thread the stop is performed by do_vm_stop(state, true),
 * whose result is returned.
 * NOTE(review): the return value of the vCPU-thread path cannot be
 * determined from the visible code -- confirm.
 */
2137 int vm_stop(RunState state)
2139 if (qemu_in_vcpu_thread()) {
2140 qemu_system_vmstop_request_prepare();
2141 qemu_system_vmstop_request(state);
2143 * FIXME: should not return to device code in case
2144 * vm_stop() has been requested.
2150 return do_vm_stop(state, true);
/*
 * Steps visible in vm_prepare_start():
 *  - fetch any pending vmstop request with qemu_vmstop_requested();
 *  - test for "already running and nothing requested"
 *    (requested == RUN_STATE__MAX);
 *  - if the VM is running but a vmstop request was pending, emit a STOP
 *    event followed by a RESUME event;
 *  - otherwise emit RESUME, switch to RUN_STATE_RUNNING and notify the
 *    VM-state listeners.
 * NOTE(review): presumably the first two "running" cases return -1 and
 * the last returns 0, as the description below says -- confirm.
 */
2154 * Prepare for (re)starting the VM.
2155 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2156 * running or in case of an error condition), 0 otherwise.
2158 int vm_prepare_start(void)
2162 qemu_vmstop_requested(&requested);
2163 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2167 /* Ensure that a STOP/RESUME pair of events is emitted if a
2168 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2169 * example, according to documentation is always followed by
2172 if (runstate_is_running()) {
2173 qapi_event_send_stop();
2174 qapi_event_send_resume();
2178 /* We are sending this now, but the CPUs will be resumed shortly later */
2179 qapi_event_send_resume();
2182 runstate_set(RUN_STATE_RUNNING);
2183 vm_state_notify(1, RUN_STATE_RUNNING);
2189 if (!vm_prepare_start()) {
/*
 * Force the run state to @state.
 *
 * If the VM is running this is a plain vm_stop(state), whose result is
 * returned.  Otherwise the run state is overwritten with runstate_set()
 * and the result of bdrv_flush_all() is returned.
 */
2194 /* does a state transition even if the VM is already stopped,
2195 current state is forgotten forever */
2196 int vm_stop_force_state(RunState state)
2198 if (runstate_is_running()) {
2199 return vm_stop(state);
2201 runstate_set(state);
2204 /* Make sure to return an error if the flush in a previous vm_stop()
2206 return bdrv_flush_all();
/*
 * List CPU models.  The work is compiled in only when the target defines
 * the cpu_list macro.
 * NOTE(review): presumably expands to a call of cpu_list(), and the use
 * of @optarg is not determinable from the visible code -- confirm.
 */
2210 void list_cpus(const char *optarg)
2212 /* XXX: implement xxx_cpu_list for targets that still miss it */
2213 #if defined(cpu_list)
/*
 * QMP handler: dump guest memory, read through a vCPU's debug view, to
 * a file.
 *
 * @addr:      start address of the region.
 * @size:      length of the region in bytes.
 * @filename:  output file, opened with mode "wb".
 * @has_cpu:   whether @cpu_index was supplied.
 * @cpu_index: index passed to qemu_get_cpu() to select the vCPU.
 * @errp:      receives the error on failure.
 *
 * Data is read with cpu_memory_rw_debug() (last argument 0, i.e. a
 * read) into a buffer and written out with fwrite(), l bytes at a time.
 *
 * Errors reported through @errp:
 *  - QERR_INVALID_PARAMETER_VALUE for "cpu-index";
 *  - a file-open error carrying errno;
 *  - "Invalid addr .../size ..." built from the original @addr and @size
 *    when a read fails;
 *  - QERR_IO_ERROR on a short write.
 * NOTE(review): presumably the cpu-index error is raised when
 * qemu_get_cpu() finds no vCPU -- confirm.
 */
2218 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2219 bool has_cpu, int64_t cpu_index, Error **errp)
/* Kept so the error message can quote the caller's original request. */
2225 int64_t orig_addr = addr, orig_size = size;
2231 cpu = qemu_get_cpu(cpu_index);
2233 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2238 f = fopen(filename, "wb");
2240 error_setg_file_open(errp, errno, filename);
2248 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2249 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2250 " specified", orig_addr, orig_size);
2253 if (fwrite(buf, 1, l, f) != l) {
2254 error_setg(errp, QERR_IO_ERROR);
/*
 * QMP handler: dump guest physical memory to a file.
 *
 * @addr:     start address of the region.
 * @size:     length of the region in bytes.
 * @filename: output file, opened with mode "wb".
 *
 * Data is read with cpu_physical_memory_read() into a buffer and written
 * out with fwrite(), l bytes at a time.  A failure to open the file is
 * reported with error_setg_file_open(); a short write is reported as
 * QERR_IO_ERROR.
 */
2265 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2272 f = fopen(filename, "wb");
2274 error_setg_file_open(errp, errno, filename);
2282 cpu_physical_memory_read(addr, buf, l);
2283 if (fwrite(buf, 1, l, f) != l) {
2284 error_setg(errp, QERR_IO_ERROR);
/*
 * QMP handler: forwards to nmi_monitor_handle() with the CPU index
 * returned by monitor_get_cpu_index().  Errors are reported via @errp.
 */
2295 void qmp_inject_nmi(Error **errp)
2297 nmi_monitor_handle(monitor_get_cpu_index(), errp);
/*
 * Print clock drift statistics with qemu_printf().
 *
 * "Host - Guest clock" is (cpu_get_clock() - cpu_get_icount()) divided
 * by SCALE_MS, so it is shown in milliseconds.  When icount_align_option
 * is set, the maximum guest delay (-max_delay) and advance (max_advance)
 * are printed in milliseconds as well; the "NA" lines are the
 * alternative output.
 * NOTE(review): presumably nothing is printed unless icount is in
 * use -- confirm.
 */
2300 void dump_drift_info(void)
2306 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2307 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2308 if (icount_align_option) {
2309 qemu_printf("Max guest delay %"PRIi64" ms\n",
2310 -max_delay / SCALE_MS);
2311 qemu_printf("Max guest advance %"PRIi64" ms\n",
2312 max_advance / SCALE_MS);
2314 qemu_printf("Max guest delay NA\n");
2315 qemu_printf("Max guest advance NA\n");