4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
59 #include <sys/prctl.h>
62 #define PR_MCE_KILL 33
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
73 #endif /* CONFIG_LINUX */
78 /* vcpu throttling controls */
79 static QEMUTimer *throttle_timer;
80 static unsigned int throttle_percentage;
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86 bool cpu_is_stopped(CPUState *cpu)
88 return cpu->stopped || !runstate_is_running();
91 static bool cpu_thread_is_idle(CPUState *cpu)
93 if (cpu->stop || cpu->queued_work_first) {
96 if (cpu_is_stopped(cpu)) {
99 if (!cpu->halted || cpu_has_work(cpu) ||
100 kvm_halt_in_kernel()) {
106 static bool all_cpu_threads_idle(void)
111 if (!cpu_thread_is_idle(cpu)) {
118 /***********************************************************/
119 /* guest cycle counter */
121 /* Protected by TimersState seqlock */
123 static bool icount_sleep = true;
124 /* Arbitrarily pick 1 MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
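/* Illustrative arithmetic based on cpu_icount_to_ns() below: at the maximum
 * shift of 10, one guest instruction accounts for 2^10 = 1024 ns of virtual
 * time, i.e. a floor of roughly one million instructions per second -- the
 * 1 MIPS mentioned above.
 */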
127 typedef struct TimersState {
128 /* Protected by BQL. */
129 int64_t cpu_ticks_prev;
130 int64_t cpu_ticks_offset;
132 /* Protect fields that can be read outside the BQL and written
133 * from multiple threads.
135 QemuSeqLock vm_clock_seqlock;
136 QemuSpin vm_clock_lock;
138 int16_t cpu_ticks_enabled;
140 /* Conversion factor from emulated instructions to virtual clock ticks. */
141 int16_t icount_time_shift;
143 /* Compensate for varying guest execution speed. */
144 int64_t qemu_icount_bias;
146 int64_t vm_clock_warp_start;
147 int64_t cpu_clock_offset;
149 /* Only written by TCG thread */
152 /* for adjusting icount */
153 QEMUTimer *icount_rt_timer;
154 QEMUTimer *icount_vm_timer;
155 QEMUTimer *icount_warp_timer;
158 static TimersState timers_state;
162 * We default to false if we know other options have been enabled
163 * which are currently incompatible with MTTCG. Otherwise when each
164 * guest (target) has been updated to support:
165 * - atomic instructions
166 * - memory ordering primitives (barriers)
167 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
169 * Once a guest architecture has been converted to the new primitives
170 * there are two remaining limitations to check.
172 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
173 * - The host must have a stronger memory order than the guest
175 * It may be possible in future to support strong guests on weak hosts
176 * but that will require tagging all load/stores in a guest with their
177 * implicit memory order requirements, which would likely slow things down.
181 static bool check_tcg_memory_orders_compatible(void)
183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
184 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
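/* For example, a strongly ordered guest (such as x86) on a weakly ordered
 * host (such as Arm) leaves guest ordering bits the host does not provide,
 * so the check above fails and MTTCG stays disabled by default.
 */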
190 static bool default_mttcg_enabled(void)
192 if (use_icount || TCG_OVERSIZED_GUEST) {
195 #ifdef TARGET_SUPPORTS_MTTCG
196 return check_tcg_memory_orders_compatible();
203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
205 const char *t = qemu_opt_get(opts, "thread");
207 if (strcmp(t, "multi") == 0) {
208 if (TCG_OVERSIZED_GUEST) {
209 error_setg(errp, "No MTTCG when guest word size > host's");
210 } else if (use_icount) {
211 error_setg(errp, "No MTTCG when icount is enabled");
213 #ifndef TARGET_SUPPORTS_MTTCG
214 error_report("Guest not yet converted to MTTCG - "
215 "you may get unexpected results");
217 if (!check_tcg_memory_orders_compatible()) {
218 error_report("Guest expects a stronger memory ordering "
219 "than the host provides");
220 error_printf("This may cause strange/hard to debug errors\n");
222 mttcg_enabled = true;
224 } else if (strcmp(t, "single") == 0) {
225 mttcg_enabled = false;
227 error_setg(errp, "Invalid 'thread' setting %s", t);
230 mttcg_enabled = default_mttcg_enabled();
234 /* The current number of executed instructions is based on what we
235 * originally budgeted minus the current state of the decrementing
236 * icount counters in extra/u16.low.
238 static int64_t cpu_get_icount_executed(CPUState *cpu)
240 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
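/* Illustrative numbers: with a budget of 100000 instructions,
 * prepare_icount_for_run() below puts 0xffff (65535) into icount_decr.u16.low
 * and the remaining 34465 into icount_extra. After 1000 instructions have
 * executed, u16.low has counted down to 64535, so
 * 100000 - (64535 + 34465) == 1000 executed instructions.
 */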
244 * Update the global shared timers_state.qemu_icount to take into
245 * account executed instructions. This is done by the TCG vCPU
246 * thread so the main-loop can see time has moved forward.
248 void cpu_update_icount(CPUState *cpu)
250 int64_t executed = cpu_get_icount_executed(cpu);
251 cpu->icount_budget -= executed;
253 #ifndef CONFIG_ATOMIC64
254 seqlock_write_lock(&timers_state.vm_clock_seqlock,
255 &timers_state.vm_clock_lock);
257 atomic_set__nocheck(&timers_state.qemu_icount,
258 timers_state.qemu_icount + executed);
259 #ifndef CONFIG_ATOMIC64
260 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
261 &timers_state.vm_clock_lock);
265 static int64_t cpu_get_icount_raw_locked(void)
267 CPUState *cpu = current_cpu;
269 if (cpu && cpu->running) {
270 if (!cpu->can_do_io) {
271 error_report("Bad icount read");
274 /* Take into account what has run */
275 cpu_update_icount(cpu);
277 /* The read is protected by the seqlock, so __nocheck is okay. */
278 return atomic_read__nocheck(&timers_state.qemu_icount);
281 static int64_t cpu_get_icount_locked(void)
283 int64_t icount = cpu_get_icount_raw_locked();
284 return atomic_read__nocheck(&timers_state.qemu_icount_bias) + cpu_icount_to_ns(icount);
287 int64_t cpu_get_icount_raw(void)
293 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
294 icount = cpu_get_icount_raw_locked();
295 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
300 /* Return the virtual CPU time, based on the instruction counter. */
301 int64_t cpu_get_icount(void)
307 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
308 icount = cpu_get_icount_locked();
309 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
314 int64_t cpu_icount_to_ns(int64_t icount)
316 return icount << atomic_read(&timers_state.icount_time_shift);
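/* For example, with the "auto" default of shift == 3 chosen in
 * configure_icount() below, one instruction maps to 1 << 3 = 8 ns of virtual
 * time, which is the 125 MIPS initial guess: 10^9 / 8 = 125 million
 * instructions per second.
 */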
319 /* return the time elapsed in VM between vm_start and vm_stop. Unless
320 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
323 * Caller must hold the BQL
325 int64_t cpu_get_ticks(void)
330 return cpu_get_icount();
333 ticks = timers_state.cpu_ticks_offset;
334 if (timers_state.cpu_ticks_enabled) {
335 ticks += cpu_get_host_ticks();
338 if (timers_state.cpu_ticks_prev > ticks) {
339 /* Note: non-increasing ticks may happen if the host uses
341 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
342 ticks = timers_state.cpu_ticks_prev;
345 timers_state.cpu_ticks_prev = ticks;
349 static int64_t cpu_get_clock_locked(void)
353 time = timers_state.cpu_clock_offset;
354 if (timers_state.cpu_ticks_enabled) {
361 /* Return the monotonic time elapsed in VM, i.e.,
362 * the time between vm_start and vm_stop
364 int64_t cpu_get_clock(void)
370 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
371 ti = cpu_get_clock_locked();
372 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
377 /* enable cpu_get_ticks()
378 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
380 void cpu_enable_ticks(void)
382 seqlock_write_lock(&timers_state.vm_clock_seqlock,
383 &timers_state.vm_clock_lock);
384 if (!timers_state.cpu_ticks_enabled) {
385 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
386 timers_state.cpu_clock_offset -= get_clock();
387 timers_state.cpu_ticks_enabled = 1;
389 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
390 &timers_state.vm_clock_lock);
393 /* disable cpu_get_ticks(): the clock is stopped. You must not call
394 * cpu_get_ticks() after that.
395 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
397 void cpu_disable_ticks(void)
399 seqlock_write_lock(&timers_state.vm_clock_seqlock,
400 &timers_state.vm_clock_lock);
401 if (timers_state.cpu_ticks_enabled) {
402 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
403 timers_state.cpu_clock_offset = cpu_get_clock_locked();
404 timers_state.cpu_ticks_enabled = 0;
406 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
407 &timers_state.vm_clock_lock);
410 /* Correlation between real and virtual time is always going to be
411 fairly approximate, so ignore small variation.
412 When the guest is idle, real and virtual time will be aligned in
414 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
416 static void icount_adjust(void)
422 /* Protected by TimersState mutex. */
423 static int64_t last_delta;
425 /* If the VM is not running, then do nothing. */
426 if (!runstate_is_running()) {
430 seqlock_write_lock(&timers_state.vm_clock_seqlock,
431 &timers_state.vm_clock_lock);
432 cur_time = cpu_get_clock_locked();
433 cur_icount = cpu_get_icount_locked();
435 delta = cur_icount - cur_time;
436 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
438 && last_delta + ICOUNT_WOBBLE < delta * 2
439 && timers_state.icount_time_shift > 0) {
440 /* The guest is getting too far ahead. Slow time down. */
441 atomic_set(&timers_state.icount_time_shift,
442 timers_state.icount_time_shift - 1);
445 && last_delta - ICOUNT_WOBBLE > delta * 2
446 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
447 /* The guest is getting too far behind. Speed time up. */
448 atomic_set(&timers_state.icount_time_shift,
449 timers_state.icount_time_shift + 1);
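/* Recompute the bias against the (possibly updated) shift so that
 * cpu_get_icount_locked() stays continuous across a speed change instead
 * of jumping.
 */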
452 atomic_set__nocheck(&timers_state.qemu_icount_bias,
453 cur_icount - (timers_state.qemu_icount
454 << timers_state.icount_time_shift));
455 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
456 &timers_state.vm_clock_lock);
459 static void icount_adjust_rt(void *opaque)
461 timer_mod(timers_state.icount_rt_timer,
462 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
466 static void icount_adjust_vm(void *opaque)
468 timer_mod(timers_state.icount_vm_timer,
469 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
470 NANOSECONDS_PER_SECOND / 10);
474 static int64_t qemu_icount_round(int64_t count)
476 int shift = atomic_read(&timers_state.icount_time_shift);
477 return (count + (1 << shift) - 1) >> shift;
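/* qemu_icount_round() rounds up: with shift == 3, a 20 ns deadline becomes
 * (20 + 7) >> 3 == 3 instructions (24 ns), so the vCPU never stops short of
 * the next timer deadline.
 */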
480 static void icount_warp_rt(void)
485 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
486 * changes from -1 to another value, so the race here is okay.
489 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
490 warp_start = timers_state.vm_clock_warp_start;
491 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
493 if (warp_start == -1) {
497 seqlock_write_lock(&timers_state.vm_clock_seqlock,
498 &timers_state.vm_clock_lock);
499 if (runstate_is_running()) {
500 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
501 cpu_get_clock_locked());
504 warp_delta = clock - timers_state.vm_clock_warp_start;
505 if (use_icount == 2) {
507 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
508 * far ahead of real time.
510 int64_t cur_icount = cpu_get_icount_locked();
511 int64_t delta = clock - cur_icount;
512 warp_delta = MIN(warp_delta, delta);
514 atomic_set__nocheck(&timers_state.qemu_icount_bias,
515 timers_state.qemu_icount_bias + warp_delta);
517 timers_state.vm_clock_warp_start = -1;
518 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
519 &timers_state.vm_clock_lock);
521 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
522 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
526 static void icount_timer_cb(void *opaque)
528 /* No need for a checkpoint because the timer already synchronizes
529 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
534 void qtest_clock_warp(int64_t dest)
536 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
537 AioContext *aio_context;
538 assert(qtest_enabled());
539 aio_context = qemu_get_aio_context();
540 while (clock < dest) {
541 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
542 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
544 seqlock_write_lock(&timers_state.vm_clock_seqlock,
545 &timers_state.vm_clock_lock);
546 atomic_set__nocheck(&timers_state.qemu_icount_bias,
547 timers_state.qemu_icount_bias + warp);
548 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
549 &timers_state.vm_clock_lock);
551 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
552 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
553 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
555 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
558 void qemu_start_warp_timer(void)
567 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
568 * do not fire, so computing the deadline does not make sense.
570 if (!runstate_is_running()) {
574 /* warp clock deterministically in record/replay mode */
575 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
579 if (!all_cpu_threads_idle()) {
583 if (qtest_enabled()) {
584 /* When testing, qtest commands advance icount. */
588 /* We want to use the earliest deadline from ALL vm_clocks */
589 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
590 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
592 static bool notified;
593 if (!icount_sleep && !notified) {
594 warn_report("icount sleep disabled and no active timers");
602 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
603 * sleep. Otherwise, the CPU might be waiting for a future timer
604 * interrupt to wake it up, but the interrupt never comes because
605 * the vCPU isn't running any insns and thus doesn't advance the
606 * QEMU_CLOCK_VIRTUAL.
610 * We never let VCPUs sleep in no-sleep icount mode.
611 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
612 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
613 * It is useful when we want a deterministic execution time,
614 * isolated from host latencies.
616 seqlock_write_lock(&timers_state.vm_clock_seqlock,
617 &timers_state.vm_clock_lock);
618 atomic_set__nocheck(&timers_state.qemu_icount_bias,
619 timers_state.qemu_icount_bias + deadline);
620 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
621 &timers_state.vm_clock_lock);
622 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
625 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
626 * "real" time, (related to the time left until the next event) has
627 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
628 * This avoids that the warps are visible externally; for example,
629 * you will not be sending network packets continuously instead of
632 seqlock_write_lock(&timers_state.vm_clock_seqlock,
633 &timers_state.vm_clock_lock);
634 if (timers_state.vm_clock_warp_start == -1
635 || timers_state.vm_clock_warp_start > clock) {
636 timers_state.vm_clock_warp_start = clock;
638 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
639 &timers_state.vm_clock_lock);
640 timer_mod_anticipate(timers_state.icount_warp_timer,
643 } else if (deadline == 0) {
644 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
648 static void qemu_account_warp_timer(void)
650 if (!use_icount || !icount_sleep) {
654 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
655 * do not fire, so computing the deadline does not make sense.
657 if (!runstate_is_running()) {
661 /* warp clock deterministically in record/replay mode */
662 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
666 timer_del(timers_state.icount_warp_timer);
670 static bool icount_state_needed(void *opaque)
675 static bool warp_timer_state_needed(void *opaque)
677 TimersState *s = opaque;
678 return s->icount_warp_timer != NULL;
681 static bool adjust_timers_state_needed(void *opaque)
683 TimersState *s = opaque;
684 return s->icount_rt_timer != NULL;
688 * Subsection for warp timer migration is optional, because the timer may not be created
690 static const VMStateDescription icount_vmstate_warp_timer = {
691 .name = "timer/icount/warp_timer",
693 .minimum_version_id = 1,
694 .needed = warp_timer_state_needed,
695 .fields = (VMStateField[]) {
696 VMSTATE_INT64(vm_clock_warp_start, TimersState),
697 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
698 VMSTATE_END_OF_LIST()
702 static const VMStateDescription icount_vmstate_adjust_timers = {
703 .name = "timer/icount/timers",
705 .minimum_version_id = 1,
706 .needed = adjust_timers_state_needed,
707 .fields = (VMStateField[]) {
708 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
709 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
710 VMSTATE_END_OF_LIST()
715 * This is a subsection for icount migration.
717 static const VMStateDescription icount_vmstate_timers = {
718 .name = "timer/icount",
720 .minimum_version_id = 1,
721 .needed = icount_state_needed,
722 .fields = (VMStateField[]) {
723 VMSTATE_INT64(qemu_icount_bias, TimersState),
724 VMSTATE_INT64(qemu_icount, TimersState),
725 VMSTATE_END_OF_LIST()
727 .subsections = (const VMStateDescription*[]) {
728 &icount_vmstate_warp_timer,
729 &icount_vmstate_adjust_timers,
734 static const VMStateDescription vmstate_timers = {
737 .minimum_version_id = 1,
738 .fields = (VMStateField[]) {
739 VMSTATE_INT64(cpu_ticks_offset, TimersState),
741 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
742 VMSTATE_END_OF_LIST()
744 .subsections = (const VMStateDescription*[]) {
745 &icount_vmstate_timers,
750 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
753 double throttle_ratio;
756 if (!cpu_throttle_get_percentage()) {
760 pct = (double)cpu_throttle_get_percentage()/100;
761 throttle_ratio = pct / (1 - pct);
762 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
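    /* For example, at 50% throttle pct == 0.5 and throttle_ratio == 1.0, so the
     * sleep computed above is one full 10 ms timeslice per timeslice of
     * execution; at 99% it grows to roughly 990 ms per tick.
     */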
764 qemu_mutex_unlock_iothread();
765 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
766 qemu_mutex_lock_iothread();
767 atomic_set(&cpu->throttle_thread_scheduled, 0);
770 static void cpu_throttle_timer_tick(void *opaque)
775 /* Stop the timer if needed */
776 if (!cpu_throttle_get_percentage()) {
780 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
781 async_run_on_cpu(cpu, cpu_throttle_thread,
786 pct = (double)cpu_throttle_get_percentage()/100;
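    /* Re-arm so one run-timeslice plus the matching sleep fits in the period:
     * at 50% throttle the timer fires every 10 ms / 0.5 == 20 ms, i.e. roughly
     * 10 ms of execution and 10 ms of sleep per period.
     */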
787 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
788 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
791 void cpu_throttle_set(int new_throttle_pct)
793 /* Ensure throttle percentage is within valid range */
794 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
795 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
797 atomic_set(&throttle_percentage, new_throttle_pct);
799 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
800 CPU_THROTTLE_TIMESLICE_NS);
803 void cpu_throttle_stop(void)
805 atomic_set(&throttle_percentage, 0);
808 bool cpu_throttle_active(void)
810 return (cpu_throttle_get_percentage() != 0);
813 int cpu_throttle_get_percentage(void)
815 return atomic_read(&throttle_percentage);
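/* Illustrative use of the throttle helpers above (a hypothetical caller,
 * not code from this file):
 *
 *     cpu_throttle_set(30);          // vCPUs now sleep ~30% of each period
 *     ...
 *     if (cpu_throttle_active()) {
 *         cpu_throttle_stop();       // percentage back to 0, timer stops re-arming
 *     }
 */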
818 void cpu_ticks_init(void)
820 seqlock_init(&timers_state.vm_clock_seqlock);
821 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
822 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
823 cpu_throttle_timer_tick, NULL);
826 void configure_icount(QemuOpts *opts, Error **errp)
829 char *rem_str = NULL;
831 option = qemu_opt_get(opts, "shift");
833 if (qemu_opt_get(opts, "align") != NULL) {
834 error_setg(errp, "Please specify shift option when using align");
839 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
841 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
842 icount_timer_cb, NULL);
845 icount_align_option = qemu_opt_get_bool(opts, "align", false);
847 if (icount_align_option && !icount_sleep) {
848 error_setg(errp, "align=on and sleep=off are incompatible");
850 if (strcmp(option, "auto") != 0) {
852 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
853 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
854 error_setg(errp, "icount: Invalid shift value");
858 } else if (icount_align_option) {
859 error_setg(errp, "shift=auto and align=on are incompatible");
860 } else if (!icount_sleep) {
861 error_setg(errp, "shift=auto and sleep=off are incompatible");
866 /* 125 MIPS seems a reasonable initial guess at the guest speed.
867 It will be corrected fairly quickly anyway. */
868 timers_state.icount_time_shift = 3;
870 /* Have both realtime and virtual time triggers for speed adjustment.
871 The realtime trigger catches emulated time passing too slowly,
872 the virtual time trigger catches emulated time passing too fast.
873 Realtime triggers occur even when idle, so use them less frequently
875 timers_state.vm_clock_warp_start = -1;
876 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
877 icount_adjust_rt, NULL);
878 timer_mod(timers_state.icount_rt_timer,
879 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
880 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
881 icount_adjust_vm, NULL);
882 timer_mod(timers_state.icount_vm_timer,
883 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
884 NANOSECONDS_PER_SECOND / 10);
887 /***********************************************************/
888 /* TCG vCPU kick timer
890 * The kick timer is responsible for moving single threaded vCPU
891 * emulation on to the next vCPU. If more than one vCPU is running a
892 * timer event will force a cpu->exit so the next vCPU can get scheduled.
895 * The timer is removed if all vCPUs are idle and restarted again once
896 * idleness is complete.
899 static QEMUTimer *tcg_kick_vcpu_timer;
900 static CPUState *tcg_current_rr_cpu;
902 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
904 static inline int64_t qemu_tcg_next_kick(void)
906 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
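/* TCG_KICK_PERIOD is 100 ms of QEMU_CLOCK_VIRTUAL time, so in single-threaded
 * round-robin mode a vCPU runs for at most about 100 ms before the kick timer
 * moves execution on to the next vCPU.
 */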
909 /* Kick the currently round-robin scheduled vCPU */
910 static void qemu_cpu_kick_rr_cpu(void)
914 cpu = atomic_mb_read(&tcg_current_rr_cpu);
918 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
921 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
925 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
927 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
932 if (qemu_in_vcpu_thread()) {
933 /* A CPU is currently running; kick it back out to the
934 * tcg_cpu_exec() loop so it will recalculate its
935 * icount deadline immediately.
937 qemu_cpu_kick(current_cpu);
938 } else if (first_cpu) {
939 /* qemu_cpu_kick is not enough to kick a halted CPU out of
940 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
941 * causes cpu_thread_is_idle to return false. This way,
942 * handle_icount_deadline can run.
943 * If we have no CPUs at all for some reason, we don't
944 * need to do anything.
946 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
950 static void kick_tcg_thread(void *opaque)
952 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
953 qemu_cpu_kick_rr_cpu();
956 static void start_tcg_kick_timer(void)
958 assert(!mttcg_enabled);
959 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
960 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
961 kick_tcg_thread, NULL);
962 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
966 static void stop_tcg_kick_timer(void)
968 assert(!mttcg_enabled);
969 if (tcg_kick_vcpu_timer) {
970 timer_del(tcg_kick_vcpu_timer);
971 tcg_kick_vcpu_timer = NULL;
975 /***********************************************************/
976 void hw_error(const char *fmt, ...)
982 fprintf(stderr, "qemu: hardware error: ");
983 vfprintf(stderr, fmt, ap);
984 fprintf(stderr, "\n");
986 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
987 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
993 void cpu_synchronize_all_states(void)
998 cpu_synchronize_state(cpu);
999 /* TODO: move to cpu_synchronize_state() */
1000 if (hvf_enabled()) {
1001 hvf_cpu_synchronize_state(cpu);
1006 void cpu_synchronize_all_post_reset(void)
1011 cpu_synchronize_post_reset(cpu);
1012 /* TODO: move to cpu_synchronize_post_reset() */
1013 if (hvf_enabled()) {
1014 hvf_cpu_synchronize_post_reset(cpu);
1019 void cpu_synchronize_all_post_init(void)
1024 cpu_synchronize_post_init(cpu);
1025 /* TODO: move to cpu_synchronize_post_init() */
1026 if (hvf_enabled()) {
1027 hvf_cpu_synchronize_post_init(cpu);
1032 void cpu_synchronize_all_pre_loadvm(void)
1037 cpu_synchronize_pre_loadvm(cpu);
1041 static int do_vm_stop(RunState state, bool send_stop)
1045 if (runstate_is_running()) {
1046 cpu_disable_ticks();
1048 runstate_set(state);
1049 vm_state_notify(0, state);
1051 qapi_event_send_stop(&error_abort);
1056 replay_disable_events();
1057 ret = bdrv_flush_all();
1062 /* Special vm_stop() variant for terminating the process. Historically clients
1063 * did not expect a QMP STOP event and so we need to retain compatibility.
1065 int vm_shutdown(void)
1067 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1070 static bool cpu_can_run(CPUState *cpu)
1075 if (cpu_is_stopped(cpu)) {
1081 static void cpu_handle_guest_debug(CPUState *cpu)
1083 gdb_set_stop_cpu(cpu);
1084 qemu_system_debug_request();
1085 cpu->stopped = true;
1089 static void sigbus_reraise(void)
1092 struct sigaction action;
1094 memset(&action, 0, sizeof(action));
1095 action.sa_handler = SIG_DFL;
1096 if (!sigaction(SIGBUS, &action, NULL)) {
1099 sigaddset(&set, SIGBUS);
1100 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1102 perror("Failed to re-raise SIGBUS!\n");
1106 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1108 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1113 /* Called asynchronously in VCPU thread. */
1114 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1118 /* Called synchronously (via signalfd) in main thread. */
1119 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1125 static void qemu_init_sigbus(void)
1127 struct sigaction action;
1129 memset(&action, 0, sizeof(action));
1130 action.sa_flags = SA_SIGINFO;
1131 action.sa_sigaction = sigbus_handler;
1132 sigaction(SIGBUS, &action, NULL);
1134 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1136 #else /* !CONFIG_LINUX */
1137 static void qemu_init_sigbus(void)
1140 #endif /* !CONFIG_LINUX */
1142 static QemuMutex qemu_global_mutex;
1144 static QemuThread io_thread;
1147 static QemuCond qemu_cpu_cond;
1149 static QemuCond qemu_pause_cond;
1151 void qemu_init_cpu_loop(void)
1154 qemu_cond_init(&qemu_cpu_cond);
1155 qemu_cond_init(&qemu_pause_cond);
1156 qemu_mutex_init(&qemu_global_mutex);
1158 qemu_thread_get_self(&io_thread);
1161 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1163 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1166 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1168 if (kvm_destroy_vcpu(cpu) < 0) {
1169 error_report("kvm_destroy_vcpu failed");
1174 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1178 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1180 g_assert(qemu_cpu_is_self(cpu));
1182 cpu->stopped = true;
1186 qemu_cond_broadcast(&qemu_pause_cond);
1189 static void qemu_wait_io_event_common(CPUState *cpu)
1191 atomic_mb_set(&cpu->thread_kicked, false);
1193 qemu_cpu_stop(cpu, false);
1195 process_queued_cpu_work(cpu);
1198 static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
1200 while (all_cpu_threads_idle()) {
1201 stop_tcg_kick_timer();
1202 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1205 start_tcg_kick_timer();
1207 qemu_wait_io_event_common(cpu);
1210 static void qemu_wait_io_event(CPUState *cpu)
1212 while (cpu_thread_is_idle(cpu)) {
1213 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1217 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1218 if (!tcg_enabled()) {
1222 qemu_wait_io_event_common(cpu);
1225 static void *qemu_kvm_cpu_thread_fn(void *arg)
1227 CPUState *cpu = arg;
1230 rcu_register_thread();
1232 qemu_mutex_lock_iothread();
1233 qemu_thread_get_self(cpu->thread);
1234 cpu->thread_id = qemu_get_thread_id();
1238 r = kvm_init_vcpu(cpu);
1240 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1244 kvm_init_cpu_signals(cpu);
1246 /* signal CPU creation */
1247 cpu->created = true;
1248 qemu_cond_signal(&qemu_cpu_cond);
1251 if (cpu_can_run(cpu)) {
1252 r = kvm_cpu_exec(cpu);
1253 if (r == EXCP_DEBUG) {
1254 cpu_handle_guest_debug(cpu);
1257 qemu_wait_io_event(cpu);
1258 } while (!cpu->unplug || cpu_can_run(cpu));
1260 qemu_kvm_destroy_vcpu(cpu);
1261 cpu->created = false;
1262 qemu_cond_signal(&qemu_cpu_cond);
1263 qemu_mutex_unlock_iothread();
1264 rcu_unregister_thread();
1268 static void *qemu_dummy_cpu_thread_fn(void *arg)
1271 error_report("qtest is not supported under Windows");
1274 CPUState *cpu = arg;
1278 rcu_register_thread();
1280 qemu_mutex_lock_iothread();
1281 qemu_thread_get_self(cpu->thread);
1282 cpu->thread_id = qemu_get_thread_id();
1286 sigemptyset(&waitset);
1287 sigaddset(&waitset, SIG_IPI);
1289 /* signal CPU creation */
1290 cpu->created = true;
1291 qemu_cond_signal(&qemu_cpu_cond);
1294 qemu_mutex_unlock_iothread();
1297 r = sigwait(&waitset, &sig);
1298 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1303 qemu_mutex_lock_iothread();
1304 qemu_wait_io_event(cpu);
1305 } while (!cpu->unplug);
1307 rcu_unregister_thread();
1312 static int64_t tcg_get_icount_limit(void)
1316 if (replay_mode != REPLAY_MODE_PLAY) {
1317 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1319 /* Maintain prior (possibly buggy) behaviour where if no deadline
1320 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1321 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1324 if ((deadline < 0) || (deadline > INT32_MAX)) {
1325 deadline = INT32_MAX;
1328 return qemu_icount_round(deadline);
1330 return replay_get_instructions();
1334 static void handle_icount_deadline(void)
1336 assert(qemu_in_vcpu_thread());
1339 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1341 if (deadline == 0) {
1342 /* Wake up other AioContexts. */
1343 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1344 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1349 static void prepare_icount_for_run(CPUState *cpu)
1354 /* These should always be cleared by process_icount_data after
1355 * each vCPU execution. However u16.high can be raised
1356 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1358 g_assert(cpu->icount_decr.u16.low == 0);
1359 g_assert(cpu->icount_extra == 0);
1361 cpu->icount_budget = tcg_get_icount_limit();
1362 insns_left = MIN(0xffff, cpu->icount_budget);
1363 cpu->icount_decr.u16.low = insns_left;
1364 cpu->icount_extra = cpu->icount_budget - insns_left;
1366 replay_mutex_lock();
1370 static void process_icount_data(CPUState *cpu)
1373 /* Account for executed instructions */
1374 cpu_update_icount(cpu);
1376 /* Reset the counters */
1377 cpu->icount_decr.u16.low = 0;
1378 cpu->icount_extra = 0;
1379 cpu->icount_budget = 0;
1381 replay_account_executed_instructions();
1383 replay_mutex_unlock();
1388 static int tcg_cpu_exec(CPUState *cpu)
1391 #ifdef CONFIG_PROFILER
1395 assert(tcg_enabled());
1396 #ifdef CONFIG_PROFILER
1397 ti = profile_getclock();
1399 cpu_exec_start(cpu);
1400 ret = cpu_exec(cpu);
1402 #ifdef CONFIG_PROFILER
1403 tcg_time += profile_getclock() - ti;
1408 /* Destroy any remaining vCPUs which have been unplugged and have
1411 static void deal_with_unplugged_cpus(void)
1416 if (cpu->unplug && !cpu_can_run(cpu)) {
1417 qemu_tcg_destroy_vcpu(cpu);
1418 cpu->created = false;
1419 qemu_cond_signal(&qemu_cpu_cond);
1425 /* Single-threaded TCG
1427 * In the single-threaded case each vCPU is simulated in turn. If
1428 * there is more than a single vCPU we create a simple timer to kick
1429 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1430 * This is done explicitly rather than relying on side-effects
1434 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1436 CPUState *cpu = arg;
1438 assert(tcg_enabled());
1439 rcu_register_thread();
1440 tcg_register_thread();
1442 qemu_mutex_lock_iothread();
1443 qemu_thread_get_self(cpu->thread);
1445 cpu->thread_id = qemu_get_thread_id();
1446 cpu->created = true;
1448 qemu_cond_signal(&qemu_cpu_cond);
1450 /* wait for initial kick-off after machine start */
1451 while (first_cpu->stopped) {
1452 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1454 /* process any pending work */
1457 qemu_wait_io_event_common(cpu);
1461 start_tcg_kick_timer();
1465 /* process any pending work */
1466 cpu->exit_request = 1;
1469 qemu_mutex_unlock_iothread();
1470 replay_mutex_lock();
1471 qemu_mutex_lock_iothread();
1472 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1473 qemu_account_warp_timer();
1475 /* Run the timers here. This is much more efficient than
1476 * waking up the I/O thread and waiting for completion.
1478 handle_icount_deadline();
1480 replay_mutex_unlock();
1486 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1488 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1491 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1492 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1494 if (cpu_can_run(cpu)) {
1497 qemu_mutex_unlock_iothread();
1498 prepare_icount_for_run(cpu);
1500 r = tcg_cpu_exec(cpu);
1502 process_icount_data(cpu);
1503 qemu_mutex_lock_iothread();
1505 if (r == EXCP_DEBUG) {
1506 cpu_handle_guest_debug(cpu);
1508 } else if (r == EXCP_ATOMIC) {
1509 qemu_mutex_unlock_iothread();
1510 cpu_exec_step_atomic(cpu);
1511 qemu_mutex_lock_iothread();
1514 } else if (cpu->stop) {
1516 cpu = CPU_NEXT(cpu);
1521 cpu = CPU_NEXT(cpu);
1522 } /* while (cpu && !cpu->exit_request).. */
1524 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1525 atomic_set(&tcg_current_rr_cpu, NULL);
1527 if (cpu && cpu->exit_request) {
1528 atomic_mb_set(&cpu->exit_request, 0);
1531 qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
1532 deal_with_unplugged_cpus();
1535 rcu_unregister_thread();
1539 static void *qemu_hax_cpu_thread_fn(void *arg)
1541 CPUState *cpu = arg;
1544 rcu_register_thread();
1545 qemu_mutex_lock_iothread();
1546 qemu_thread_get_self(cpu->thread);
1548 cpu->thread_id = qemu_get_thread_id();
1549 cpu->created = true;
1554 qemu_cond_signal(&qemu_cpu_cond);
1557 if (cpu_can_run(cpu)) {
1558 r = hax_smp_cpu_exec(cpu);
1559 if (r == EXCP_DEBUG) {
1560 cpu_handle_guest_debug(cpu);
1564 qemu_wait_io_event(cpu);
1565 } while (!cpu->unplug || cpu_can_run(cpu));
1566 rcu_unregister_thread();
1570 /* The HVF-specific vCPU thread function. This one should only run when the host
1571 * CPU supports the VMX "unrestricted guest" feature. */
1572 static void *qemu_hvf_cpu_thread_fn(void *arg)
1574 CPUState *cpu = arg;
1578 assert(hvf_enabled());
1580 rcu_register_thread();
1582 qemu_mutex_lock_iothread();
1583 qemu_thread_get_self(cpu->thread);
1585 cpu->thread_id = qemu_get_thread_id();
1591 /* signal CPU creation */
1592 cpu->created = true;
1593 qemu_cond_signal(&qemu_cpu_cond);
1596 if (cpu_can_run(cpu)) {
1597 r = hvf_vcpu_exec(cpu);
1598 if (r == EXCP_DEBUG) {
1599 cpu_handle_guest_debug(cpu);
1602 qemu_wait_io_event(cpu);
1603 } while (!cpu->unplug || cpu_can_run(cpu));
1605 hvf_vcpu_destroy(cpu);
1606 cpu->created = false;
1607 qemu_cond_signal(&qemu_cpu_cond);
1608 qemu_mutex_unlock_iothread();
1609 rcu_unregister_thread();
1613 static void *qemu_whpx_cpu_thread_fn(void *arg)
1615 CPUState *cpu = arg;
1618 rcu_register_thread();
1620 qemu_mutex_lock_iothread();
1621 qemu_thread_get_self(cpu->thread);
1622 cpu->thread_id = qemu_get_thread_id();
1625 r = whpx_init_vcpu(cpu);
1627 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1631 /* signal CPU creation */
1632 cpu->created = true;
1633 qemu_cond_signal(&qemu_cpu_cond);
1636 if (cpu_can_run(cpu)) {
1637 r = whpx_vcpu_exec(cpu);
1638 if (r == EXCP_DEBUG) {
1639 cpu_handle_guest_debug(cpu);
1642 while (cpu_thread_is_idle(cpu)) {
1643 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1645 qemu_wait_io_event_common(cpu);
1646 } while (!cpu->unplug || cpu_can_run(cpu));
1648 whpx_destroy_vcpu(cpu);
1649 cpu->created = false;
1650 qemu_cond_signal(&qemu_cpu_cond);
1651 qemu_mutex_unlock_iothread();
1652 rcu_unregister_thread();
1657 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1662 /* Multi-threaded TCG
1664 * In the multi-threaded case each vCPU has its own thread. The TLS
1665 * variable current_cpu can be used deep in the code to find the
1666 * current CPUState for a given thread.
1669 static void *qemu_tcg_cpu_thread_fn(void *arg)
1671 CPUState *cpu = arg;
1673 assert(tcg_enabled());
1674 g_assert(!use_icount);
1676 rcu_register_thread();
1677 tcg_register_thread();
1679 qemu_mutex_lock_iothread();
1680 qemu_thread_get_self(cpu->thread);
1682 cpu->thread_id = qemu_get_thread_id();
1683 cpu->created = true;
1686 qemu_cond_signal(&qemu_cpu_cond);
1688 /* process any pending work */
1689 cpu->exit_request = 1;
1692 if (cpu_can_run(cpu)) {
1694 qemu_mutex_unlock_iothread();
1695 r = tcg_cpu_exec(cpu);
1696 qemu_mutex_lock_iothread();
1699 cpu_handle_guest_debug(cpu);
1702 /* during start-up the vCPU is reset and the thread is
1703 * kicked several times. If we don't ensure we go back
1704 * to sleep in the halted state we won't cleanly
1705 * start up when the vCPU is enabled.
1707 * cpu->halted should ensure we sleep in wait_io_event
1709 g_assert(cpu->halted);
1712 qemu_mutex_unlock_iothread();
1713 cpu_exec_step_atomic(cpu);
1714 qemu_mutex_lock_iothread();
1716 /* Ignore everything else? */
1721 atomic_mb_set(&cpu->exit_request, 0);
1722 qemu_wait_io_event(cpu);
1723 } while (!cpu->unplug || cpu_can_run(cpu));
1725 qemu_tcg_destroy_vcpu(cpu);
1726 cpu->created = false;
1727 qemu_cond_signal(&qemu_cpu_cond);
1728 qemu_mutex_unlock_iothread();
1729 rcu_unregister_thread();
1733 static void qemu_cpu_kick_thread(CPUState *cpu)
1738 if (cpu->thread_kicked) {
1741 cpu->thread_kicked = true;
1742 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1744 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1748 if (!qemu_cpu_is_self(cpu)) {
1749 if (whpx_enabled()) {
1750 whpx_vcpu_kick(cpu);
1751 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1752 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1753 __func__, GetLastError());
1760 void qemu_cpu_kick(CPUState *cpu)
1762 qemu_cond_broadcast(cpu->halt_cond);
1763 if (tcg_enabled()) {
1765 /* NOP unless doing single-thread RR */
1766 qemu_cpu_kick_rr_cpu();
1768 if (hax_enabled()) {
1770 * FIXME: race condition with the exit_request check in
1773 cpu->exit_request = 1;
1775 qemu_cpu_kick_thread(cpu);
1779 void qemu_cpu_kick_self(void)
1781 assert(current_cpu);
1782 qemu_cpu_kick_thread(current_cpu);
1785 bool qemu_cpu_is_self(CPUState *cpu)
1787 return qemu_thread_is_self(cpu->thread);
1790 bool qemu_in_vcpu_thread(void)
1792 return current_cpu && qemu_cpu_is_self(current_cpu);
1795 static __thread bool iothread_locked = false;
1797 bool qemu_mutex_iothread_locked(void)
1799 return iothread_locked;
1803 * The BQL is taken from so many places that it is worth profiling the
1804 * callers directly, instead of funneling them all through a single function.
1806 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1808 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1810 g_assert(!qemu_mutex_iothread_locked());
1811 bql_lock(&qemu_global_mutex, file, line);
1812 iothread_locked = true;
1815 void qemu_mutex_unlock_iothread(void)
1817 g_assert(qemu_mutex_iothread_locked());
1818 iothread_locked = false;
1819 qemu_mutex_unlock(&qemu_global_mutex);
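/* Typical locking pattern (a sketch mirroring the vCPU loops below): drop the
 * BQL around long-running work so the main loop keeps making progress, then
 * retake it before touching shared state:
 *
 *     qemu_mutex_unlock_iothread();
 *     r = tcg_cpu_exec(cpu);      // or any other slow work that needs no BQL
 *     qemu_mutex_lock_iothread();
 */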
1822 static bool all_vcpus_paused(void)
1827 if (!cpu->stopped) {
1835 void pause_all_vcpus(void)
1839 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1841 if (qemu_cpu_is_self(cpu)) {
1842 qemu_cpu_stop(cpu, true);
1849 /* We need to drop the replay_lock so any vCPU threads woken up
1850 * can finish their replay tasks
1852 replay_mutex_unlock();
1854 while (!all_vcpus_paused()) {
1855 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1861 qemu_mutex_unlock_iothread();
1862 replay_mutex_lock();
1863 qemu_mutex_lock_iothread();
1866 void cpu_resume(CPUState *cpu)
1869 cpu->stopped = false;
1873 void resume_all_vcpus(void)
1877 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1883 void cpu_remove_sync(CPUState *cpu)
1888 qemu_mutex_unlock_iothread();
1889 qemu_thread_join(cpu->thread);
1890 qemu_mutex_lock_iothread();
1893 /* Size of temporary buffers used for forming a vCPU thread name */
1894 #define VCPU_THREAD_NAME_SIZE 16
1896 static void qemu_tcg_init_vcpu(CPUState *cpu)
1898 char thread_name[VCPU_THREAD_NAME_SIZE];
1899 static QemuCond *single_tcg_halt_cond;
1900 static QemuThread *single_tcg_cpu_thread;
1901 static int tcg_region_inited;
1903 assert(tcg_enabled());
1905 * Initialize TCG regions--once. Now is a good time, because:
1906 * (1) TCG's init context, prologue and target globals have been set up.
1907 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1908 * -accel flag is processed, so the check doesn't work then).
1910 if (!tcg_region_inited) {
1911 tcg_region_inited = 1;
1915 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1916 cpu->thread = g_malloc0(sizeof(QemuThread));
1917 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1918 qemu_cond_init(cpu->halt_cond);
1920 if (qemu_tcg_mttcg_enabled()) {
1921 /* create a thread per vCPU with TCG (MTTCG) */
1922 parallel_cpus = true;
1923 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1926 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1927 cpu, QEMU_THREAD_JOINABLE);
1930 /* share a single thread for all cpus with TCG */
1931 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1932 qemu_thread_create(cpu->thread, thread_name,
1933 qemu_tcg_rr_cpu_thread_fn,
1934 cpu, QEMU_THREAD_JOINABLE);
1936 single_tcg_halt_cond = cpu->halt_cond;
1937 single_tcg_cpu_thread = cpu->thread;
1940 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1943 /* For non-MTTCG cases we share the thread */
1944 cpu->thread = single_tcg_cpu_thread;
1945 cpu->halt_cond = single_tcg_halt_cond;
1946 cpu->thread_id = first_cpu->thread_id;
1948 cpu->created = true;
1952 static void qemu_hax_start_vcpu(CPUState *cpu)
1954 char thread_name[VCPU_THREAD_NAME_SIZE];
1956 cpu->thread = g_malloc0(sizeof(QemuThread));
1957 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1958 qemu_cond_init(cpu->halt_cond);
1960 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1962 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1963 cpu, QEMU_THREAD_JOINABLE);
1965 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1969 static void qemu_kvm_start_vcpu(CPUState *cpu)
1971 char thread_name[VCPU_THREAD_NAME_SIZE];
1973 cpu->thread = g_malloc0(sizeof(QemuThread));
1974 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1975 qemu_cond_init(cpu->halt_cond);
1976 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1978 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1979 cpu, QEMU_THREAD_JOINABLE);
1982 static void qemu_hvf_start_vcpu(CPUState *cpu)
1984 char thread_name[VCPU_THREAD_NAME_SIZE];
1986 /* HVF currently does not support TCG, and only runs in
1987 * unrestricted-guest mode. */
1988 assert(hvf_enabled());
1990 cpu->thread = g_malloc0(sizeof(QemuThread));
1991 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1992 qemu_cond_init(cpu->halt_cond);
1994 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
1996 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
1997 cpu, QEMU_THREAD_JOINABLE);
2000 static void qemu_whpx_start_vcpu(CPUState *cpu)
2002 char thread_name[VCPU_THREAD_NAME_SIZE];
2004 cpu->thread = g_malloc0(sizeof(QemuThread));
2005 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2006 qemu_cond_init(cpu->halt_cond);
2007 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2009 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2010 cpu, QEMU_THREAD_JOINABLE);
2012 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2016 static void qemu_dummy_start_vcpu(CPUState *cpu)
2018 char thread_name[VCPU_THREAD_NAME_SIZE];
2020 cpu->thread = g_malloc0(sizeof(QemuThread));
2021 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2022 qemu_cond_init(cpu->halt_cond);
2023 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2025 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2026 QEMU_THREAD_JOINABLE);
2029 void qemu_init_vcpu(CPUState *cpu)
2031 cpu->nr_cores = smp_cores;
2032 cpu->nr_threads = smp_threads;
2033 cpu->stopped = true;
2036 /* If the target cpu hasn't set up any address spaces itself,
2037 * give it the default one.
2040 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2043 if (kvm_enabled()) {
2044 qemu_kvm_start_vcpu(cpu);
2045 } else if (hax_enabled()) {
2046 qemu_hax_start_vcpu(cpu);
2047 } else if (hvf_enabled()) {
2048 qemu_hvf_start_vcpu(cpu);
2049 } else if (tcg_enabled()) {
2050 qemu_tcg_init_vcpu(cpu);
2051 } else if (whpx_enabled()) {
2052 qemu_whpx_start_vcpu(cpu);
2054 qemu_dummy_start_vcpu(cpu);
2057 while (!cpu->created) {
2058 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2062 void cpu_stop_current(void)
2065 qemu_cpu_stop(current_cpu, true);
2069 int vm_stop(RunState state)
2071 if (qemu_in_vcpu_thread()) {
2072 qemu_system_vmstop_request_prepare();
2073 qemu_system_vmstop_request(state);
2075 * FIXME: should not return to device code in case
2076 * vm_stop() has been requested.
2082 return do_vm_stop(state, true);
2086 * Prepare for (re)starting the VM.
2087 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2088 * running or in case of an error condition), 0 otherwise.
2090 int vm_prepare_start(void)
2094 qemu_vmstop_requested(&requested);
2095 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2099 /* Ensure that a STOP/RESUME pair of events is emitted if a
2100 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2101 * example, according to the documentation is always followed by
2104 if (runstate_is_running()) {
2105 qapi_event_send_stop(&error_abort);
2106 qapi_event_send_resume(&error_abort);
2110 /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2111 qapi_event_send_resume(&error_abort);
2113 replay_enable_events();
2115 runstate_set(RUN_STATE_RUNNING);
2116 vm_state_notify(1, RUN_STATE_RUNNING);
2122 if (!vm_prepare_start()) {
2127 /* does a state transition even if the VM is already stopped;
2128 the current state is forgotten forever */
2129 int vm_stop_force_state(RunState state)
2131 if (runstate_is_running()) {
2132 return vm_stop(state);
2134 runstate_set(state);
2137 /* Make sure to return an error if the flush in a previous vm_stop()
2139 return bdrv_flush_all();
2143 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2145 /* XXX: implement xxx_cpu_list for targets that still miss it */
2146 #if defined(cpu_list)
2147 cpu_list(f, cpu_fprintf);
2151 CpuInfoList *qmp_query_cpus(Error **errp)
2153 MachineState *ms = MACHINE(qdev_get_machine());
2154 MachineClass *mc = MACHINE_GET_CLASS(ms);
2155 CpuInfoList *head = NULL, *cur_item = NULL;
2160 #if defined(TARGET_I386)
2161 X86CPU *x86_cpu = X86_CPU(cpu);
2162 CPUX86State *env = &x86_cpu->env;
2163 #elif defined(TARGET_PPC)
2164 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2165 CPUPPCState *env = &ppc_cpu->env;
2166 #elif defined(TARGET_SPARC)
2167 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2168 CPUSPARCState *env = &sparc_cpu->env;
2169 #elif defined(TARGET_RISCV)
2170 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2171 CPURISCVState *env = &riscv_cpu->env;
2172 #elif defined(TARGET_MIPS)
2173 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2174 CPUMIPSState *env = &mips_cpu->env;
2175 #elif defined(TARGET_TRICORE)
2176 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2177 CPUTriCoreState *env = &tricore_cpu->env;
2178 #elif defined(TARGET_S390X)
2179 S390CPU *s390_cpu = S390_CPU(cpu);
2180 CPUS390XState *env = &s390_cpu->env;
2183 cpu_synchronize_state(cpu);
2185 info = g_malloc0(sizeof(*info));
2186 info->value = g_malloc0(sizeof(*info->value));
2187 info->value->CPU = cpu->cpu_index;
2188 info->value->current = (cpu == first_cpu);
2189 info->value->halted = cpu->halted;
2190 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2191 info->value->thread_id = cpu->thread_id;
2192 #if defined(TARGET_I386)
2193 info->value->arch = CPU_INFO_ARCH_X86;
2194 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2195 #elif defined(TARGET_PPC)
2196 info->value->arch = CPU_INFO_ARCH_PPC;
2197 info->value->u.ppc.nip = env->nip;
2198 #elif defined(TARGET_SPARC)
2199 info->value->arch = CPU_INFO_ARCH_SPARC;
2200 info->value->u.q_sparc.pc = env->pc;
2201 info->value->u.q_sparc.npc = env->npc;
2202 #elif defined(TARGET_MIPS)
2203 info->value->arch = CPU_INFO_ARCH_MIPS;
2204 info->value->u.q_mips.PC = env->active_tc.PC;
2205 #elif defined(TARGET_TRICORE)
2206 info->value->arch = CPU_INFO_ARCH_TRICORE;
2207 info->value->u.tricore.PC = env->PC;
2208 #elif defined(TARGET_S390X)
2209 info->value->arch = CPU_INFO_ARCH_S390;
2210 info->value->u.s390.cpu_state = env->cpu_state;
2211 #elif defined(TARGET_RISCV)
2212 info->value->arch = CPU_INFO_ARCH_RISCV;
2213 info->value->u.riscv.pc = env->pc;
2215 info->value->arch = CPU_INFO_ARCH_OTHER;
2217 info->value->has_props = !!mc->cpu_index_to_instance_props;
2218 if (info->value->has_props) {
2219 CpuInstanceProperties *props;
2220 props = g_malloc0(sizeof(*props));
2221 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2222 info->value->props = props;
2225 /* XXX: waiting for the qapi to support GSList */
2227 head = cur_item = info;
2229 cur_item->next = info;
2237 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2240 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2241 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2244 case SYS_EMU_TARGET_I386:
2245 case SYS_EMU_TARGET_X86_64:
2246 return CPU_INFO_ARCH_X86;
2248 case SYS_EMU_TARGET_PPC:
2249 case SYS_EMU_TARGET_PPCEMB:
2250 case SYS_EMU_TARGET_PPC64:
2251 return CPU_INFO_ARCH_PPC;
2253 case SYS_EMU_TARGET_SPARC:
2254 case SYS_EMU_TARGET_SPARC64:
2255 return CPU_INFO_ARCH_SPARC;
2257 case SYS_EMU_TARGET_MIPS:
2258 case SYS_EMU_TARGET_MIPSEL:
2259 case SYS_EMU_TARGET_MIPS64:
2260 case SYS_EMU_TARGET_MIPS64EL:
2261 return CPU_INFO_ARCH_MIPS;
2263 case SYS_EMU_TARGET_TRICORE:
2264 return CPU_INFO_ARCH_TRICORE;
2266 case SYS_EMU_TARGET_S390X:
2267 return CPU_INFO_ARCH_S390;
2269 case SYS_EMU_TARGET_RISCV32:
2270 case SYS_EMU_TARGET_RISCV64:
2271 return CPU_INFO_ARCH_RISCV;
2274 return CPU_INFO_ARCH_OTHER;
2278 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2281 S390CPU *s390_cpu = S390_CPU(cpu);
2282 CPUS390XState *env = &s390_cpu->env;
2284 info->cpu_state = env->cpu_state;
2291 * fast means: we NEVER interrupt vCPU threads to retrieve
2292 * information from KVM.
2294 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2296 MachineState *ms = MACHINE(qdev_get_machine());
2297 MachineClass *mc = MACHINE_GET_CLASS(ms);
2298 CpuInfoFastList *head = NULL, *cur_item = NULL;
2299 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2304 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2305 info->value = g_malloc0(sizeof(*info->value));
2307 info->value->cpu_index = cpu->cpu_index;
2308 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2309 info->value->thread_id = cpu->thread_id;
2311 info->value->has_props = !!mc->cpu_index_to_instance_props;
2312 if (info->value->has_props) {
2313 CpuInstanceProperties *props;
2314 props = g_malloc0(sizeof(*props));
2315 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2316 info->value->props = props;
2319 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2320 info->value->target = target;
2321 if (target == SYS_EMU_TARGET_S390X) {
2322 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2326 head = cur_item = info;
2328 cur_item->next = info;
2336 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2337 bool has_cpu, int64_t cpu_index, Error **errp)
2343 int64_t orig_addr = addr, orig_size = size;
2349 cpu = qemu_get_cpu(cpu_index);
2351 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2356 f = fopen(filename, "wb");
2358 error_setg_file_open(errp, errno, filename);
2366 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2367 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2368 " specified", orig_addr, orig_size);
2371 if (fwrite(buf, 1, l, f) != l) {
2372 error_setg(errp, QERR_IO_ERROR);
2383 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2390 f = fopen(filename, "wb");
2392 error_setg_file_open(errp, errno, filename);
2400 cpu_physical_memory_read(addr, buf, l);
2401 if (fwrite(buf, 1, l, f) != l) {
2402 error_setg(errp, QERR_IO_ERROR);
2413 void qmp_inject_nmi(Error **errp)
2415 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2418 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2424 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2425 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2426 if (icount_align_option) {
2427 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2428 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2430 cpu_fprintf(f, "Max guest delay NA\n");
2431 cpu_fprintf(f, "Max guest advance NA\n");