4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
59 #include <sys/prctl.h>
62 #define PR_MCE_KILL 33
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
73 #endif /* CONFIG_LINUX */
78 /* vcpu throttling controls */
79 static QEMUTimer *throttle_timer;
80 static unsigned int throttle_percentage;
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86 bool cpu_is_stopped(CPUState *cpu)
88 return cpu->stopped || !runstate_is_running();
91 static bool cpu_thread_is_idle(CPUState *cpu)
93 if (cpu->stop || cpu->queued_work_first) {
96 if (cpu_is_stopped(cpu)) {
99 if (!cpu->halted || cpu_has_work(cpu) ||
100 kvm_halt_in_kernel()) {
106 static bool all_cpu_threads_idle(void)
111 if (!cpu_thread_is_idle(cpu)) {
118 /***********************************************************/
119 /* guest cycle counter */
121 /* Protected by TimersState seqlock */
123 static bool icount_sleep = true;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
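/*
 * Each executed guest instruction accounts for 2^icount_time_shift ns of
 * virtual time (see cpu_icount_to_ns()): a shift of 10 is roughly 1 MIPS,
 * while the initial shift of 3 used for -icount shift=auto is roughly
 * 125 MIPS.
 */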
127 typedef struct TimersState {
128 /* Protected by BQL. */
129 int64_t cpu_ticks_prev;
130 int64_t cpu_ticks_offset;
132     /* Protect fields that can be read outside the
133      * BQL and written from multiple threads.
135 QemuSeqLock vm_clock_seqlock;
136 QemuSpin vm_clock_lock;
138 int16_t cpu_ticks_enabled;
140 /* Conversion factor from emulated instructions to virtual clock ticks. */
141 int16_t icount_time_shift;
143 /* Compensate for varying guest execution speed. */
144 int64_t qemu_icount_bias;
146 int64_t vm_clock_warp_start;
147 int64_t cpu_clock_offset;
149 /* Only written by TCG thread */
152 /* for adjusting icount */
153 QEMUTimer *icount_rt_timer;
154 QEMUTimer *icount_vm_timer;
155 QEMUTimer *icount_warp_timer;
158 static TimersState timers_state;
162 * We default to false if we know other options have been enabled
163 * which are currently incompatible with MTTCG. Otherwise when each
164 * guest (target) has been updated to support:
165 * - atomic instructions
166 * - memory ordering primitives (barriers)
167 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
169 * Once a guest architecture has been converted to the new primitives
170 * there are two remaining limitations to check.
172 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
173 * - The host must have a stronger memory order than the guest
175  * It may be possible in the future to support strong guests on weak hosts
176  * but that will require tagging all loads/stores in a guest with their
177 * implicit memory order requirements which would likely slow things
181 static bool check_tcg_memory_orders_compatible(void)
183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
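    /* True when every ordering the guest requires is provided by the host backend. */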
184 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
190 static bool default_mttcg_enabled(void)
192 if (use_icount || TCG_OVERSIZED_GUEST) {
195 #ifdef TARGET_SUPPORTS_MTTCG
196 return check_tcg_memory_orders_compatible();
203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
205 const char *t = qemu_opt_get(opts, "thread");
207 if (strcmp(t, "multi") == 0) {
208 if (TCG_OVERSIZED_GUEST) {
209 error_setg(errp, "No MTTCG when guest word size > hosts");
210 } else if (use_icount) {
211 error_setg(errp, "No MTTCG when icount is enabled");
213 #ifndef TARGET_SUPPORTS_MTTCG
214 error_report("Guest not yet converted to MTTCG - "
215 "you may get unexpected results");
217 if (!check_tcg_memory_orders_compatible()) {
218 error_report("Guest expects a stronger memory ordering "
219 "than the host provides");
220 error_printf("This may cause strange/hard to debug errors\n");
222 mttcg_enabled = true;
224 } else if (strcmp(t, "single") == 0) {
225 mttcg_enabled = false;
227 error_setg(errp, "Invalid 'thread' setting %s", t);
230 mttcg_enabled = default_mttcg_enabled();
234 /* The current number of executed instructions is based on what we
235 * originally budgeted minus the current state of the decrementing
236 * icount counters in extra/u16.low.
238 static int64_t cpu_get_icount_executed(CPUState *cpu)
240 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
244 * Update the global shared timer_state.qemu_icount to take into
245 * account executed instructions. This is done by the TCG vCPU
246 * thread so the main-loop can see time has moved forward.
248 void cpu_update_icount(CPUState *cpu)
250 int64_t executed = cpu_get_icount_executed(cpu);
251 cpu->icount_budget -= executed;
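    /*
     * Without 64-bit atomics a reader could see a torn qemu_icount, so the
     * update below is wrapped in the seqlock write section; with them,
     * atomic_set__nocheck() alone is sufficient.
     */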
253 #ifndef CONFIG_ATOMIC64
254 seqlock_write_lock(&timers_state.vm_clock_seqlock,
255 &timers_state.vm_clock_lock);
257 atomic_set__nocheck(&timers_state.qemu_icount,
258 timers_state.qemu_icount + executed);
259 #ifndef CONFIG_ATOMIC64
260 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
261 &timers_state.vm_clock_lock);
265 static int64_t cpu_get_icount_raw_locked(void)
267 CPUState *cpu = current_cpu;
269 if (cpu && cpu->running) {
270 if (!cpu->can_do_io) {
271 error_report("Bad icount read");
274 /* Take into account what has run */
275 cpu_update_icount(cpu);
277 /* The read is protected by the seqlock, so __nocheck is okay. */
278 return atomic_read__nocheck(&timers_state.qemu_icount);
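/* As cpu_get_icount_raw_locked(), but scaled to ns and with the icount bias applied. */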
281 static int64_t cpu_get_icount_locked(void)
283 int64_t icount = cpu_get_icount_raw_locked();
284 return atomic_read__nocheck(&timers_state.qemu_icount_bias) + cpu_icount_to_ns(icount);
287 int64_t cpu_get_icount_raw(void)
293 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
294 icount = cpu_get_icount_raw_locked();
295 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
300 /* Return the virtual CPU time, based on the instruction counter. */
301 int64_t cpu_get_icount(void)
307 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
308 icount = cpu_get_icount_locked();
309 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
314 int64_t cpu_icount_to_ns(int64_t icount)
316 return icount << atomic_read(&timers_state.icount_time_shift);
319 static int64_t cpu_get_ticks_locked(void)
321 int64_t ticks = timers_state.cpu_ticks_offset;
322 if (timers_state.cpu_ticks_enabled) {
323 ticks += cpu_get_host_ticks();
326 if (timers_state.cpu_ticks_prev > ticks) {
327     /* Non-increasing ticks may happen if the host uses software suspend. */
328 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
329 ticks = timers_state.cpu_ticks_prev;
332 timers_state.cpu_ticks_prev = ticks;
336 /* return the time elapsed in VM between vm_start and vm_stop. Unless
337 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
340 int64_t cpu_get_ticks(void)
345 return cpu_get_icount();
348 qemu_spin_lock(&timers_state.vm_clock_lock);
349 ticks = cpu_get_ticks_locked();
350 qemu_spin_unlock(&timers_state.vm_clock_lock);
354 static int64_t cpu_get_clock_locked(void)
358 time = timers_state.cpu_clock_offset;
359 if (timers_state.cpu_ticks_enabled) {
366 /* Return the monotonic time elapsed in VM, i.e.,
367 * the time between vm_start and vm_stop
369 int64_t cpu_get_clock(void)
375 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
376 ti = cpu_get_clock_locked();
377 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
382 /* enable cpu_get_ticks()
383 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
385 void cpu_enable_ticks(void)
387 seqlock_write_lock(&timers_state.vm_clock_seqlock,
388 &timers_state.vm_clock_lock);
389 if (!timers_state.cpu_ticks_enabled) {
390 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
391 timers_state.cpu_clock_offset -= get_clock();
392 timers_state.cpu_ticks_enabled = 1;
394 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
395 &timers_state.vm_clock_lock);
398 /* disable cpu_get_ticks() : the clock is stopped. You must not call
399 * cpu_get_ticks() after that.
400 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
402 void cpu_disable_ticks(void)
404 seqlock_write_lock(&timers_state.vm_clock_seqlock,
405 &timers_state.vm_clock_lock);
406 if (timers_state.cpu_ticks_enabled) {
407 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
408 timers_state.cpu_clock_offset = cpu_get_clock_locked();
409 timers_state.cpu_ticks_enabled = 0;
411 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
412 &timers_state.vm_clock_lock);
415 /* Correlation between real and virtual time is always going to be
416 fairly approximate, so ignore small variation.
417    When the guest is idle, real and virtual time will be aligned in
419 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
421 static void icount_adjust(void)
427 /* Protected by TimersState mutex. */
428 static int64_t last_delta;
430 /* If the VM is not running, then do nothing. */
431 if (!runstate_is_running()) {
435 seqlock_write_lock(&timers_state.vm_clock_seqlock,
436 &timers_state.vm_clock_lock);
437 cur_time = cpu_get_clock_locked();
438 cur_icount = cpu_get_icount_locked();
440 delta = cur_icount - cur_time;
441 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
443 && last_delta + ICOUNT_WOBBLE < delta * 2
444 && timers_state.icount_time_shift > 0) {
445 /* The guest is getting too far ahead. Slow time down. */
446 atomic_set(&timers_state.icount_time_shift,
447 timers_state.icount_time_shift - 1);
450 && last_delta - ICOUNT_WOBBLE > delta * 2
451 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
452 /* The guest is getting too far behind. Speed time up. */
453 atomic_set(&timers_state.icount_time_shift,
454 timers_state.icount_time_shift + 1);
457 atomic_set__nocheck(&timers_state.qemu_icount_bias,
458 cur_icount - (timers_state.qemu_icount
459 << timers_state.icount_time_shift));
460 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
461 &timers_state.vm_clock_lock);
464 static void icount_adjust_rt(void *opaque)
466 timer_mod(timers_state.icount_rt_timer,
467 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
471 static void icount_adjust_vm(void *opaque)
473 timer_mod(timers_state.icount_vm_timer,
474 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
475 NANOSECONDS_PER_SECOND / 10);
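/* Convert an ns deadline into a number of instructions, rounding up at the current shift. */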
479 static int64_t qemu_icount_round(int64_t count)
481 int shift = atomic_read(&timers_state.icount_time_shift);
482 return (count + (1 << shift) - 1) >> shift;
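/*
 * Fold the real time that passed since vm_clock_warp_start into
 * qemu_icount_bias, so QEMU_CLOCK_VIRTUAL catches up with the time spent
 * sleeping.  In adaptive mode the warp is clamped so that virtual time
 * does not run ahead of real time.
 */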
485 static void icount_warp_rt(void)
490 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
491 * changes from -1 to another value, so the race here is okay.
494 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
495 warp_start = timers_state.vm_clock_warp_start;
496 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
498 if (warp_start == -1) {
502 seqlock_write_lock(&timers_state.vm_clock_seqlock,
503 &timers_state.vm_clock_lock);
504 if (runstate_is_running()) {
505 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
506 cpu_get_clock_locked());
509 warp_delta = clock - timers_state.vm_clock_warp_start;
510 if (use_icount == 2) {
512 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
513 * far ahead of real time.
515 int64_t cur_icount = cpu_get_icount_locked();
516 int64_t delta = clock - cur_icount;
517 warp_delta = MIN(warp_delta, delta);
519 atomic_set__nocheck(&timers_state.qemu_icount_bias,
520 timers_state.qemu_icount_bias + warp_delta);
522 timers_state.vm_clock_warp_start = -1;
523 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
524 &timers_state.vm_clock_lock);
526 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
527 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
531 static void icount_timer_cb(void *opaque)
533 /* No need for a checkpoint because the timer already synchronizes
534 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
539 void qtest_clock_warp(int64_t dest)
541 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
542 AioContext *aio_context;
543 assert(qtest_enabled());
544 aio_context = qemu_get_aio_context();
545 while (clock < dest) {
546 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
547 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
549 seqlock_write_lock(&timers_state.vm_clock_seqlock,
550 &timers_state.vm_clock_lock);
551 atomic_set__nocheck(&timers_state.qemu_icount_bias,
552 timers_state.qemu_icount_bias + warp);
553 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
554 &timers_state.vm_clock_lock);
556 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
557 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
558 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
560 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
563 void qemu_start_warp_timer(void)
572 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
573 * do not fire, so computing the deadline does not make sense.
575 if (!runstate_is_running()) {
579 /* warp clock deterministically in record/replay mode */
580 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
584 if (!all_cpu_threads_idle()) {
588 if (qtest_enabled()) {
589 /* When testing, qtest commands advance icount. */
593 /* We want to use the earliest deadline from ALL vm_clocks */
594 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
595 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
597 static bool notified;
598 if (!icount_sleep && !notified) {
599 warn_report("icount sleep disabled and no active timers");
607 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
608 * sleep. Otherwise, the CPU might be waiting for a future timer
609 * interrupt to wake it up, but the interrupt never comes because
610 * the vCPU isn't running any insns and thus doesn't advance the
611 * QEMU_CLOCK_VIRTUAL.
615          * We never let VCPUs sleep in no-sleep icount mode.
616 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
617 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
618 * It is useful when we want a deterministic execution time,
619 * isolated from host latencies.
621 seqlock_write_lock(&timers_state.vm_clock_seqlock,
622 &timers_state.vm_clock_lock);
623 atomic_set__nocheck(&timers_state.qemu_icount_bias,
624 timers_state.qemu_icount_bias + deadline);
625 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
626 &timers_state.vm_clock_lock);
627 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
630 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
631 * "real" time, (related to the time left until the next event) has
632 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
633          * This prevents the warps from being visible externally; for example,
634 * you will not be sending network packets continuously instead of
637 seqlock_write_lock(&timers_state.vm_clock_seqlock,
638 &timers_state.vm_clock_lock);
639 if (timers_state.vm_clock_warp_start == -1
640 || timers_state.vm_clock_warp_start > clock) {
641 timers_state.vm_clock_warp_start = clock;
643 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
644 &timers_state.vm_clock_lock);
645 timer_mod_anticipate(timers_state.icount_warp_timer,
648 } else if (deadline == 0) {
649 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
653 static void qemu_account_warp_timer(void)
655 if (!use_icount || !icount_sleep) {
659 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
660 * do not fire, so computing the deadline does not make sense.
662 if (!runstate_is_running()) {
666 /* warp clock deterministically in record/replay mode */
667 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
671 timer_del(timers_state.icount_warp_timer);
675 static bool icount_state_needed(void *opaque)
680 static bool warp_timer_state_needed(void *opaque)
682 TimersState *s = opaque;
683 return s->icount_warp_timer != NULL;
686 static bool adjust_timers_state_needed(void *opaque)
688 TimersState *s = opaque;
689 return s->icount_rt_timer != NULL;
693 * Subsection for warp timer migration is optional, because it may not be created
695 static const VMStateDescription icount_vmstate_warp_timer = {
696 .name = "timer/icount/warp_timer",
698 .minimum_version_id = 1,
699 .needed = warp_timer_state_needed,
700 .fields = (VMStateField[]) {
701 VMSTATE_INT64(vm_clock_warp_start, TimersState),
702 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
703 VMSTATE_END_OF_LIST()
707 static const VMStateDescription icount_vmstate_adjust_timers = {
708 .name = "timer/icount/timers",
710 .minimum_version_id = 1,
711 .needed = adjust_timers_state_needed,
712 .fields = (VMStateField[]) {
713 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
714 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
715 VMSTATE_END_OF_LIST()
720 * This is a subsection for icount migration.
722 static const VMStateDescription icount_vmstate_timers = {
723 .name = "timer/icount",
725 .minimum_version_id = 1,
726 .needed = icount_state_needed,
727 .fields = (VMStateField[]) {
728 VMSTATE_INT64(qemu_icount_bias, TimersState),
729 VMSTATE_INT64(qemu_icount, TimersState),
730 VMSTATE_END_OF_LIST()
732 .subsections = (const VMStateDescription*[]) {
733 &icount_vmstate_warp_timer,
734 &icount_vmstate_adjust_timers,
739 static const VMStateDescription vmstate_timers = {
742 .minimum_version_id = 1,
743 .fields = (VMStateField[]) {
744 VMSTATE_INT64(cpu_ticks_offset, TimersState),
746 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
747 VMSTATE_END_OF_LIST()
749 .subsections = (const VMStateDescription*[]) {
750 &icount_vmstate_timers,
755 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
758 double throttle_ratio;
761 if (!cpu_throttle_get_percentage()) {
765 pct = (double)cpu_throttle_get_percentage()/100;
766 throttle_ratio = pct / (1 - pct);
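    /*
     * Sleep throttle_ratio timeslices for every timeslice of execution,
     * e.g. pct = 0.5 gives throttle_ratio = 1: 10 ms of sleep for every
     * 10 ms the vCPU runs, so the vCPU gets 50% of wall-clock time.
     */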
767 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
769 qemu_mutex_unlock_iothread();
770 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
771 qemu_mutex_lock_iothread();
772 atomic_set(&cpu->throttle_thread_scheduled, 0);
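/*
 * Periodic QEMU_CLOCK_VIRTUAL_RT callback: queue cpu_throttle_thread on
 * each vCPU that does not already have one pending, then re-arm so that
 * every period holds one timeslice of execution plus the matching sleep.
 */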
775 static void cpu_throttle_timer_tick(void *opaque)
780 /* Stop the timer if needed */
781 if (!cpu_throttle_get_percentage()) {
785 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
786 async_run_on_cpu(cpu, cpu_throttle_thread,
791 pct = (double)cpu_throttle_get_percentage()/100;
792 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
793 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
796 void cpu_throttle_set(int new_throttle_pct)
798 /* Ensure throttle percentage is within valid range */
799 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
800 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
802 atomic_set(&throttle_percentage, new_throttle_pct);
804 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
805 CPU_THROTTLE_TIMESLICE_NS);
808 void cpu_throttle_stop(void)
810 atomic_set(&throttle_percentage, 0);
813 bool cpu_throttle_active(void)
815 return (cpu_throttle_get_percentage() != 0);
818 int cpu_throttle_get_percentage(void)
820 return atomic_read(&throttle_percentage);
823 void cpu_ticks_init(void)
825 seqlock_init(&timers_state.vm_clock_seqlock);
826 qemu_spin_init(&timers_state.vm_clock_lock);
827 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
828 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
829 cpu_throttle_timer_tick, NULL);
832 void configure_icount(QemuOpts *opts, Error **errp)
835 char *rem_str = NULL;
837 option = qemu_opt_get(opts, "shift");
839 if (qemu_opt_get(opts, "align") != NULL) {
840 error_setg(errp, "Please specify shift option when using align");
845 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
847 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
848 icount_timer_cb, NULL);
851 icount_align_option = qemu_opt_get_bool(opts, "align", false);
853 if (icount_align_option && !icount_sleep) {
854 error_setg(errp, "align=on and sleep=off are incompatible");
856 if (strcmp(option, "auto") != 0) {
858 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
859 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
860 error_setg(errp, "icount: Invalid shift value");
864 } else if (icount_align_option) {
865 error_setg(errp, "shift=auto and align=on are incompatible");
866 } else if (!icount_sleep) {
867 error_setg(errp, "shift=auto and sleep=off are incompatible");
872 /* 125MIPS seems a reasonable initial guess at the guest speed.
873 It will be corrected fairly quickly anyway. */
874 timers_state.icount_time_shift = 3;
876 /* Have both realtime and virtual time triggers for speed adjustment.
877 The realtime trigger catches emulated time passing too slowly,
878 the virtual time trigger catches emulated time passing too fast.
879 Realtime triggers occur even when idle, so use them less frequently
881 timers_state.vm_clock_warp_start = -1;
882 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
883 icount_adjust_rt, NULL);
884 timer_mod(timers_state.icount_rt_timer,
885 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
886 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
887 icount_adjust_vm, NULL);
888 timer_mod(timers_state.icount_vm_timer,
889 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
890 NANOSECONDS_PER_SECOND / 10);
893 /***********************************************************/
894 /* TCG vCPU kick timer
896  * The kick timer is responsible for moving single-threaded vCPU
897  * emulation on to the next vCPU. If more than one vCPU is running, a
898  * timer event will force a cpu->exit so the next vCPU can get
901  * The timer is removed if all vCPUs are idle and restarted once they
902  * are no longer all idle.
905 static QEMUTimer *tcg_kick_vcpu_timer;
906 static CPUState *tcg_current_rr_cpu;
908 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
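/* Kick the currently executing vCPU every 100 ms so the other vCPUs get a turn. */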
910 static inline int64_t qemu_tcg_next_kick(void)
912 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
915 /* Kick the currently round-robin scheduled vCPU */
916 static void qemu_cpu_kick_rr_cpu(void)
920 cpu = atomic_mb_read(&tcg_current_rr_cpu);
924 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
927 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
931 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
933 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
938 if (qemu_in_vcpu_thread()) {
939 /* A CPU is currently running; kick it back out to the
940 * tcg_cpu_exec() loop so it will recalculate its
941 * icount deadline immediately.
943 qemu_cpu_kick(current_cpu);
944 } else if (first_cpu) {
945 /* qemu_cpu_kick is not enough to kick a halted CPU out of
946 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
947 * causes cpu_thread_is_idle to return false. This way,
948 * handle_icount_deadline can run.
949 * If we have no CPUs at all for some reason, we don't
950 * need to do anything.
952 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
956 static void kick_tcg_thread(void *opaque)
958 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
959 qemu_cpu_kick_rr_cpu();
962 static void start_tcg_kick_timer(void)
964 assert(!mttcg_enabled);
965 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
966 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
967 kick_tcg_thread, NULL);
968 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
972 static void stop_tcg_kick_timer(void)
974 assert(!mttcg_enabled);
975 if (tcg_kick_vcpu_timer) {
976 timer_del(tcg_kick_vcpu_timer);
977 tcg_kick_vcpu_timer = NULL;
981 /***********************************************************/
982 void hw_error(const char *fmt, ...)
988 fprintf(stderr, "qemu: hardware error: ");
989 vfprintf(stderr, fmt, ap);
990 fprintf(stderr, "\n");
992 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
993 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
999 void cpu_synchronize_all_states(void)
1004 cpu_synchronize_state(cpu);
1005 /* TODO: move to cpu_synchronize_state() */
1006 if (hvf_enabled()) {
1007 hvf_cpu_synchronize_state(cpu);
1012 void cpu_synchronize_all_post_reset(void)
1017 cpu_synchronize_post_reset(cpu);
1018 /* TODO: move to cpu_synchronize_post_reset() */
1019 if (hvf_enabled()) {
1020 hvf_cpu_synchronize_post_reset(cpu);
1025 void cpu_synchronize_all_post_init(void)
1030 cpu_synchronize_post_init(cpu);
1031 /* TODO: move to cpu_synchronize_post_init() */
1032 if (hvf_enabled()) {
1033 hvf_cpu_synchronize_post_init(cpu);
1038 void cpu_synchronize_all_pre_loadvm(void)
1043 cpu_synchronize_pre_loadvm(cpu);
1047 static int do_vm_stop(RunState state, bool send_stop)
1051 if (runstate_is_running()) {
1052 cpu_disable_ticks();
1054 runstate_set(state);
1055 vm_state_notify(0, state);
1057 qapi_event_send_stop();
1062 replay_disable_events();
1063 ret = bdrv_flush_all();
1068 /* Special vm_stop() variant for terminating the process. Historically clients
1069 * did not expect a QMP STOP event and so we need to retain compatibility.
1071 int vm_shutdown(void)
1073 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
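/* Whether this vCPU is currently allowed to execute guest code. */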
1076 static bool cpu_can_run(CPUState *cpu)
1081 if (cpu_is_stopped(cpu)) {
1087 static void cpu_handle_guest_debug(CPUState *cpu)
1089 gdb_set_stop_cpu(cpu);
1090 qemu_system_debug_request();
1091 cpu->stopped = true;
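/*
 * Restore the default SIGBUS handler, unblock the signal and re-raise it,
 * so the process terminates with the original bus error.
 */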
1095 static void sigbus_reraise(void)
1098 struct sigaction action;
1100 memset(&action, 0, sizeof(action));
1101 action.sa_handler = SIG_DFL;
1102 if (!sigaction(SIGBUS, &action, NULL)) {
1105 sigaddset(&set, SIGBUS);
1106 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1108     perror("Failed to re-raise SIGBUS!");
1112 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1114 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1119 /* Called asynchronously in VCPU thread. */
1120 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1124 /* Called synchronously (via signalfd) in main thread. */
1125 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1131 static void qemu_init_sigbus(void)
1133 struct sigaction action;
1135 memset(&action, 0, sizeof(action));
1136 action.sa_flags = SA_SIGINFO;
1137 action.sa_sigaction = sigbus_handler;
1138 sigaction(SIGBUS, &action, NULL);
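    /*
     * "Early kill": ask the kernel to deliver SIGBUS with BUS_MCEERR_AO as
     * soon as a hardware memory error affecting our pages is detected,
     * rather than only when the poisoned page is actually accessed.
     */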
1140 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1142 #else /* !CONFIG_LINUX */
1143 static void qemu_init_sigbus(void)
1146 #endif /* !CONFIG_LINUX */
1148 static QemuMutex qemu_global_mutex;
1150 static QemuThread io_thread;
1153 static QemuCond qemu_cpu_cond;
1155 static QemuCond qemu_pause_cond;
1157 void qemu_init_cpu_loop(void)
1160 qemu_cond_init(&qemu_cpu_cond);
1161 qemu_cond_init(&qemu_pause_cond);
1162 qemu_mutex_init(&qemu_global_mutex);
1164 qemu_thread_get_self(&io_thread);
1167 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1169 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1172 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1174 if (kvm_destroy_vcpu(cpu) < 0) {
1175 error_report("kvm_destroy_vcpu failed");
1180 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1184 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1186 g_assert(qemu_cpu_is_self(cpu));
1188 cpu->stopped = true;
1192 qemu_cond_broadcast(&qemu_pause_cond);
1195 static void qemu_wait_io_event_common(CPUState *cpu)
1197 atomic_mb_set(&cpu->thread_kicked, false);
1199 qemu_cpu_stop(cpu, false);
1201 process_queued_cpu_work(cpu);
1204 static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
1206 while (all_cpu_threads_idle()) {
1207 stop_tcg_kick_timer();
1208 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1211 start_tcg_kick_timer();
1213 qemu_wait_io_event_common(cpu);
1216 static void qemu_wait_io_event(CPUState *cpu)
1218 while (cpu_thread_is_idle(cpu)) {
1219 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1223 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1224 if (!tcg_enabled()) {
1228 qemu_wait_io_event_common(cpu);
1231 static void *qemu_kvm_cpu_thread_fn(void *arg)
1233 CPUState *cpu = arg;
1236 rcu_register_thread();
1238 qemu_mutex_lock_iothread();
1239 qemu_thread_get_self(cpu->thread);
1240 cpu->thread_id = qemu_get_thread_id();
1244 r = kvm_init_vcpu(cpu);
1246 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1250 kvm_init_cpu_signals(cpu);
1252 /* signal CPU creation */
1253 cpu->created = true;
1254 qemu_cond_signal(&qemu_cpu_cond);
1257 if (cpu_can_run(cpu)) {
1258 r = kvm_cpu_exec(cpu);
1259 if (r == EXCP_DEBUG) {
1260 cpu_handle_guest_debug(cpu);
1263 qemu_wait_io_event(cpu);
1264 } while (!cpu->unplug || cpu_can_run(cpu));
1266 qemu_kvm_destroy_vcpu(cpu);
1267 cpu->created = false;
1268 qemu_cond_signal(&qemu_cpu_cond);
1269 qemu_mutex_unlock_iothread();
1270 rcu_unregister_thread();
1274 static void *qemu_dummy_cpu_thread_fn(void *arg)
1277 error_report("qtest is not supported under Windows");
1280 CPUState *cpu = arg;
1284 rcu_register_thread();
1286 qemu_mutex_lock_iothread();
1287 qemu_thread_get_self(cpu->thread);
1288 cpu->thread_id = qemu_get_thread_id();
1292 sigemptyset(&waitset);
1293 sigaddset(&waitset, SIG_IPI);
1295 /* signal CPU creation */
1296 cpu->created = true;
1297 qemu_cond_signal(&qemu_cpu_cond);
1300 qemu_mutex_unlock_iothread();
1303 r = sigwait(&waitset, &sig);
1304 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1309 qemu_mutex_lock_iothread();
1310 qemu_wait_io_event(cpu);
1311 } while (!cpu->unplug);
1313 rcu_unregister_thread();
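/*
 * Instruction budget for the next execution slice: bounded by the next
 * QEMU_CLOCK_VIRTUAL deadline (capped at INT32_MAX ns), or, when replaying,
 * by whatever replay_get_instructions() allows.
 */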
1318 static int64_t tcg_get_icount_limit(void)
1322 if (replay_mode != REPLAY_MODE_PLAY) {
1323 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1325 /* Maintain prior (possibly buggy) behaviour where if no deadline
1326 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1327 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1330 if ((deadline < 0) || (deadline > INT32_MAX)) {
1331 deadline = INT32_MAX;
1334 return qemu_icount_round(deadline);
1336 return replay_get_instructions();
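/*
 * If the QEMU_CLOCK_VIRTUAL deadline has expired, run its timers directly
 * from the vCPU thread instead of waiting for the I/O thread to do it.
 */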
1340 static void handle_icount_deadline(void)
1342 assert(qemu_in_vcpu_thread());
1345 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1347 if (deadline == 0) {
1348 /* Wake up other AioContexts. */
1349 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1350 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1355 static void prepare_icount_for_run(CPUState *cpu)
1360 /* These should always be cleared by process_icount_data after
1361      * each vCPU execution. However, u16.high can be raised
1362 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1364 g_assert(cpu->icount_decr.u16.low == 0);
1365 g_assert(cpu->icount_extra == 0);
1367 cpu->icount_budget = tcg_get_icount_limit();
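    /*
     * Only the low 16 bits of the budget fit in the fast decrementer used
     * by generated code; anything beyond that is parked in icount_extra.
     */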
1368 insns_left = MIN(0xffff, cpu->icount_budget);
1369 cpu->icount_decr.u16.low = insns_left;
1370 cpu->icount_extra = cpu->icount_budget - insns_left;
1372 replay_mutex_lock();
1376 static void process_icount_data(CPUState *cpu)
1379 /* Account for executed instructions */
1380 cpu_update_icount(cpu);
1382 /* Reset the counters */
1383 cpu->icount_decr.u16.low = 0;
1384 cpu->icount_extra = 0;
1385 cpu->icount_budget = 0;
1387 replay_account_executed_instructions();
1389 replay_mutex_unlock();
1394 static int tcg_cpu_exec(CPUState *cpu)
1397 #ifdef CONFIG_PROFILER
1401 assert(tcg_enabled());
1402 #ifdef CONFIG_PROFILER
1403 ti = profile_getclock();
1405 cpu_exec_start(cpu);
1406 ret = cpu_exec(cpu);
1408 #ifdef CONFIG_PROFILER
1409 tcg_time += profile_getclock() - ti;
1414 /* Destroy any remaining vCPUs which have been unplugged and have
1417 static void deal_with_unplugged_cpus(void)
1422 if (cpu->unplug && !cpu_can_run(cpu)) {
1423 qemu_tcg_destroy_vcpu(cpu);
1424 cpu->created = false;
1425 qemu_cond_signal(&qemu_cpu_cond);
1431 /* Single-threaded TCG
1433 * In the single-threaded case each vCPU is simulated in turn. If
1434 * there is more than a single vCPU we create a simple timer to kick
1435 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1436 * This is done explicitly rather than relying on side-effects
1440 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1442 CPUState *cpu = arg;
1444 assert(tcg_enabled());
1445 rcu_register_thread();
1446 tcg_register_thread();
1448 qemu_mutex_lock_iothread();
1449 qemu_thread_get_self(cpu->thread);
1451 cpu->thread_id = qemu_get_thread_id();
1452 cpu->created = true;
1454 qemu_cond_signal(&qemu_cpu_cond);
1456 /* wait for initial kick-off after machine start */
1457 while (first_cpu->stopped) {
1458 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1460 /* process any pending work */
1463 qemu_wait_io_event_common(cpu);
1467 start_tcg_kick_timer();
1471 /* process any pending work */
1472 cpu->exit_request = 1;
1475 qemu_mutex_unlock_iothread();
1476 replay_mutex_lock();
1477 qemu_mutex_lock_iothread();
1478 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1479 qemu_account_warp_timer();
1481 /* Run the timers here. This is much more efficient than
1482 * waking up the I/O thread and waiting for completion.
1484 handle_icount_deadline();
1486 replay_mutex_unlock();
1492 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1494 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1497 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1498 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1500 if (cpu_can_run(cpu)) {
1503 qemu_mutex_unlock_iothread();
1504 prepare_icount_for_run(cpu);
1506 r = tcg_cpu_exec(cpu);
1508 process_icount_data(cpu);
1509 qemu_mutex_lock_iothread();
1511 if (r == EXCP_DEBUG) {
1512 cpu_handle_guest_debug(cpu);
1514 } else if (r == EXCP_ATOMIC) {
1515 qemu_mutex_unlock_iothread();
1516 cpu_exec_step_atomic(cpu);
1517 qemu_mutex_lock_iothread();
1520 } else if (cpu->stop) {
1522 cpu = CPU_NEXT(cpu);
1527 cpu = CPU_NEXT(cpu);
1528 } /* while (cpu && !cpu->exit_request).. */
1530 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1531 atomic_set(&tcg_current_rr_cpu, NULL);
1533 if (cpu && cpu->exit_request) {
1534 atomic_mb_set(&cpu->exit_request, 0);
1537 qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
1538 deal_with_unplugged_cpus();
1541 rcu_unregister_thread();
1545 static void *qemu_hax_cpu_thread_fn(void *arg)
1547 CPUState *cpu = arg;
1550 rcu_register_thread();
1551 qemu_mutex_lock_iothread();
1552 qemu_thread_get_self(cpu->thread);
1554 cpu->thread_id = qemu_get_thread_id();
1555 cpu->created = true;
1560 qemu_cond_signal(&qemu_cpu_cond);
1563 if (cpu_can_run(cpu)) {
1564 r = hax_smp_cpu_exec(cpu);
1565 if (r == EXCP_DEBUG) {
1566 cpu_handle_guest_debug(cpu);
1570 qemu_wait_io_event(cpu);
1571 } while (!cpu->unplug || cpu_can_run(cpu));
1572 rcu_unregister_thread();
1576 /* The HVF-specific vCPU thread function. This one should only run when the host
1577 * CPU supports the VMX "unrestricted guest" feature. */
1578 static void *qemu_hvf_cpu_thread_fn(void *arg)
1580 CPUState *cpu = arg;
1584 assert(hvf_enabled());
1586 rcu_register_thread();
1588 qemu_mutex_lock_iothread();
1589 qemu_thread_get_self(cpu->thread);
1591 cpu->thread_id = qemu_get_thread_id();
1597 /* signal CPU creation */
1598 cpu->created = true;
1599 qemu_cond_signal(&qemu_cpu_cond);
1602 if (cpu_can_run(cpu)) {
1603 r = hvf_vcpu_exec(cpu);
1604 if (r == EXCP_DEBUG) {
1605 cpu_handle_guest_debug(cpu);
1608 qemu_wait_io_event(cpu);
1609 } while (!cpu->unplug || cpu_can_run(cpu));
1611 hvf_vcpu_destroy(cpu);
1612 cpu->created = false;
1613 qemu_cond_signal(&qemu_cpu_cond);
1614 qemu_mutex_unlock_iothread();
1615 rcu_unregister_thread();
1619 static void *qemu_whpx_cpu_thread_fn(void *arg)
1621 CPUState *cpu = arg;
1624 rcu_register_thread();
1626 qemu_mutex_lock_iothread();
1627 qemu_thread_get_self(cpu->thread);
1628 cpu->thread_id = qemu_get_thread_id();
1631 r = whpx_init_vcpu(cpu);
1633 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1637 /* signal CPU creation */
1638 cpu->created = true;
1639 qemu_cond_signal(&qemu_cpu_cond);
1642 if (cpu_can_run(cpu)) {
1643 r = whpx_vcpu_exec(cpu);
1644 if (r == EXCP_DEBUG) {
1645 cpu_handle_guest_debug(cpu);
1648 while (cpu_thread_is_idle(cpu)) {
1649 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1651 qemu_wait_io_event_common(cpu);
1652 } while (!cpu->unplug || cpu_can_run(cpu));
1654 whpx_destroy_vcpu(cpu);
1655 cpu->created = false;
1656 qemu_cond_signal(&qemu_cpu_cond);
1657 qemu_mutex_unlock_iothread();
1658 rcu_unregister_thread();
1663 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1668 /* Multi-threaded TCG
1670 * In the multi-threaded case each vCPU has its own thread. The TLS
1671 * variable current_cpu can be used deep in the code to find the
1672 * current CPUState for a given thread.
1675 static void *qemu_tcg_cpu_thread_fn(void *arg)
1677 CPUState *cpu = arg;
1679 assert(tcg_enabled());
1680 g_assert(!use_icount);
1682 rcu_register_thread();
1683 tcg_register_thread();
1685 qemu_mutex_lock_iothread();
1686 qemu_thread_get_self(cpu->thread);
1688 cpu->thread_id = qemu_get_thread_id();
1689 cpu->created = true;
1692 qemu_cond_signal(&qemu_cpu_cond);
1694 /* process any pending work */
1695 cpu->exit_request = 1;
1698 if (cpu_can_run(cpu)) {
1700 qemu_mutex_unlock_iothread();
1701 r = tcg_cpu_exec(cpu);
1702 qemu_mutex_lock_iothread();
1705 cpu_handle_guest_debug(cpu);
1708 /* during start-up the vCPU is reset and the thread is
1709 * kicked several times. If we don't ensure we go back
1710 * to sleep in the halted state we won't cleanly
1711              * start up when the vCPU is enabled.
1713 * cpu->halted should ensure we sleep in wait_io_event
1715 g_assert(cpu->halted);
1718 qemu_mutex_unlock_iothread();
1719 cpu_exec_step_atomic(cpu);
1720 qemu_mutex_lock_iothread();
1722 /* Ignore everything else? */
1727 atomic_mb_set(&cpu->exit_request, 0);
1728 qemu_wait_io_event(cpu);
1729 } while (!cpu->unplug || cpu_can_run(cpu));
1731 qemu_tcg_destroy_vcpu(cpu);
1732 cpu->created = false;
1733 qemu_cond_signal(&qemu_cpu_cond);
1734 qemu_mutex_unlock_iothread();
1735 rcu_unregister_thread();
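/*
 * Interrupt the thread that owns @cpu: POSIX hosts send SIG_IPI; on
 * Windows, WHPX provides its own kick, otherwise a dummy APC is queued to
 * wake the thread from its alertable wait.
 */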
1739 static void qemu_cpu_kick_thread(CPUState *cpu)
1744 if (cpu->thread_kicked) {
1747 cpu->thread_kicked = true;
1748 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1750         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1754 if (!qemu_cpu_is_self(cpu)) {
1755 if (whpx_enabled()) {
1756 whpx_vcpu_kick(cpu);
1757 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1758 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1759 __func__, GetLastError());
1766 void qemu_cpu_kick(CPUState *cpu)
1768 qemu_cond_broadcast(cpu->halt_cond);
1769 if (tcg_enabled()) {
1771 /* NOP unless doing single-thread RR */
1772 qemu_cpu_kick_rr_cpu();
1774 if (hax_enabled()) {
1776 * FIXME: race condition with the exit_request check in
1779 cpu->exit_request = 1;
1781 qemu_cpu_kick_thread(cpu);
1785 void qemu_cpu_kick_self(void)
1787 assert(current_cpu);
1788 qemu_cpu_kick_thread(current_cpu);
1791 bool qemu_cpu_is_self(CPUState *cpu)
1793 return qemu_thread_is_self(cpu->thread);
1796 bool qemu_in_vcpu_thread(void)
1798 return current_cpu && qemu_cpu_is_self(current_cpu);
1801 static __thread bool iothread_locked = false;
1803 bool qemu_mutex_iothread_locked(void)
1805 return iothread_locked;
1809 * The BQL is taken from so many places that it is worth profiling the
1810 * callers directly, instead of funneling them all through a single function.
1812 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1814 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1816 g_assert(!qemu_mutex_iothread_locked());
1817 bql_lock(&qemu_global_mutex, file, line);
1818 iothread_locked = true;
1821 void qemu_mutex_unlock_iothread(void)
1823 g_assert(qemu_mutex_iothread_locked());
1824 iothread_locked = false;
1825 qemu_mutex_unlock(&qemu_global_mutex);
1828 static bool all_vcpus_paused(void)
1833 if (!cpu->stopped) {
1841 void pause_all_vcpus(void)
1845 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1847 if (qemu_cpu_is_self(cpu)) {
1848 qemu_cpu_stop(cpu, true);
1855 /* We need to drop the replay_lock so any vCPU threads woken up
1856 * can finish their replay tasks
1858 replay_mutex_unlock();
1860 while (!all_vcpus_paused()) {
1861 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1867 qemu_mutex_unlock_iothread();
1868 replay_mutex_lock();
1869 qemu_mutex_lock_iothread();
1872 void cpu_resume(CPUState *cpu)
1875 cpu->stopped = false;
1879 void resume_all_vcpus(void)
1883 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1889 void cpu_remove_sync(CPUState *cpu)
1894 qemu_mutex_unlock_iothread();
1895 qemu_thread_join(cpu->thread);
1896 qemu_mutex_lock_iothread();
1899 /* Size of the temporary buffers used for forming a vCPU thread name */
1900 #define VCPU_THREAD_NAME_SIZE 16
1902 static void qemu_tcg_init_vcpu(CPUState *cpu)
1904 char thread_name[VCPU_THREAD_NAME_SIZE];
1905 static QemuCond *single_tcg_halt_cond;
1906 static QemuThread *single_tcg_cpu_thread;
1907 static int tcg_region_inited;
1909 assert(tcg_enabled());
1911 * Initialize TCG regions--once. Now is a good time, because:
1912 * (1) TCG's init context, prologue and target globals have been set up.
1913 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1914 * -accel flag is processed, so the check doesn't work then).
1916 if (!tcg_region_inited) {
1917 tcg_region_inited = 1;
1921 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1922 cpu->thread = g_malloc0(sizeof(QemuThread));
1923 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1924 qemu_cond_init(cpu->halt_cond);
1926 if (qemu_tcg_mttcg_enabled()) {
1927 /* create a thread per vCPU with TCG (MTTCG) */
1928 parallel_cpus = true;
1929 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1932 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1933 cpu, QEMU_THREAD_JOINABLE);
1936 /* share a single thread for all cpus with TCG */
1937 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1938 qemu_thread_create(cpu->thread, thread_name,
1939 qemu_tcg_rr_cpu_thread_fn,
1940 cpu, QEMU_THREAD_JOINABLE);
1942 single_tcg_halt_cond = cpu->halt_cond;
1943 single_tcg_cpu_thread = cpu->thread;
1946 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1949 /* For non-MTTCG cases we share the thread */
1950 cpu->thread = single_tcg_cpu_thread;
1951 cpu->halt_cond = single_tcg_halt_cond;
1952 cpu->thread_id = first_cpu->thread_id;
1954 cpu->created = true;
1958 static void qemu_hax_start_vcpu(CPUState *cpu)
1960 char thread_name[VCPU_THREAD_NAME_SIZE];
1962 cpu->thread = g_malloc0(sizeof(QemuThread));
1963 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1964 qemu_cond_init(cpu->halt_cond);
1966 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1968 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1969 cpu, QEMU_THREAD_JOINABLE);
1971 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1975 static void qemu_kvm_start_vcpu(CPUState *cpu)
1977 char thread_name[VCPU_THREAD_NAME_SIZE];
1979 cpu->thread = g_malloc0(sizeof(QemuThread));
1980 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1981 qemu_cond_init(cpu->halt_cond);
1982 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1984 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1985 cpu, QEMU_THREAD_JOINABLE);
1988 static void qemu_hvf_start_vcpu(CPUState *cpu)
1990 char thread_name[VCPU_THREAD_NAME_SIZE];
1992 /* HVF currently does not support TCG, and only runs in
1993 * unrestricted-guest mode. */
1994 assert(hvf_enabled());
1996 cpu->thread = g_malloc0(sizeof(QemuThread));
1997 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1998 qemu_cond_init(cpu->halt_cond);
2000 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2002 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2003 cpu, QEMU_THREAD_JOINABLE);
2006 static void qemu_whpx_start_vcpu(CPUState *cpu)
2008 char thread_name[VCPU_THREAD_NAME_SIZE];
2010 cpu->thread = g_malloc0(sizeof(QemuThread));
2011 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2012 qemu_cond_init(cpu->halt_cond);
2013 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2015 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2016 cpu, QEMU_THREAD_JOINABLE);
2018 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2022 static void qemu_dummy_start_vcpu(CPUState *cpu)
2024 char thread_name[VCPU_THREAD_NAME_SIZE];
2026 cpu->thread = g_malloc0(sizeof(QemuThread));
2027 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2028 qemu_cond_init(cpu->halt_cond);
2029 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2031 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2032 QEMU_THREAD_JOINABLE);
2035 void qemu_init_vcpu(CPUState *cpu)
2037 cpu->nr_cores = smp_cores;
2038 cpu->nr_threads = smp_threads;
2039 cpu->stopped = true;
2042 /* If the target cpu hasn't set up any address spaces itself,
2043 * give it the default one.
2046 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2049 if (kvm_enabled()) {
2050 qemu_kvm_start_vcpu(cpu);
2051 } else if (hax_enabled()) {
2052 qemu_hax_start_vcpu(cpu);
2053 } else if (hvf_enabled()) {
2054 qemu_hvf_start_vcpu(cpu);
2055 } else if (tcg_enabled()) {
2056 qemu_tcg_init_vcpu(cpu);
2057 } else if (whpx_enabled()) {
2058 qemu_whpx_start_vcpu(cpu);
2060 qemu_dummy_start_vcpu(cpu);
2063 while (!cpu->created) {
2064 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2068 void cpu_stop_current(void)
2071 qemu_cpu_stop(current_cpu, true);
2075 int vm_stop(RunState state)
2077 if (qemu_in_vcpu_thread()) {
2078 qemu_system_vmstop_request_prepare();
2079 qemu_system_vmstop_request(state);
2081 * FIXME: should not return to device code in case
2082 * vm_stop() has been requested.
2088 return do_vm_stop(state, true);
2092 * Prepare for (re)starting the VM.
2093 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2094 * running or in case of an error condition), 0 otherwise.
2096 int vm_prepare_start(void)
2100 qemu_vmstop_requested(&requested);
2101 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2105 /* Ensure that a STOP/RESUME pair of events is emitted if a
2106 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2107 * example, according to documentation is always followed by
2110 if (runstate_is_running()) {
2111 qapi_event_send_stop();
2112 qapi_event_send_resume();
2116 /* We are sending this now, but the CPUs will be resumed shortly later */
2117 qapi_event_send_resume();
2119 replay_enable_events();
2121 runstate_set(RUN_STATE_RUNNING);
2122 vm_state_notify(1, RUN_STATE_RUNNING);
2128 if (!vm_prepare_start()) {
2133 /* does a state transition even if the VM is already stopped; the
2134    current state is forgotten forever */
2135 int vm_stop_force_state(RunState state)
2137 if (runstate_is_running()) {
2138 return vm_stop(state);
2140 runstate_set(state);
2143 /* Make sure to return an error if the flush in a previous vm_stop()
2145 return bdrv_flush_all();
2149 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2151 /* XXX: implement xxx_cpu_list for targets that still miss it */
2152 #if defined(cpu_list)
2153 cpu_list(f, cpu_fprintf);
2157 CpuInfoList *qmp_query_cpus(Error **errp)
2159 MachineState *ms = MACHINE(qdev_get_machine());
2160 MachineClass *mc = MACHINE_GET_CLASS(ms);
2161 CpuInfoList *head = NULL, *cur_item = NULL;
2166 #if defined(TARGET_I386)
2167 X86CPU *x86_cpu = X86_CPU(cpu);
2168 CPUX86State *env = &x86_cpu->env;
2169 #elif defined(TARGET_PPC)
2170 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2171 CPUPPCState *env = &ppc_cpu->env;
2172 #elif defined(TARGET_SPARC)
2173 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2174 CPUSPARCState *env = &sparc_cpu->env;
2175 #elif defined(TARGET_RISCV)
2176 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2177 CPURISCVState *env = &riscv_cpu->env;
2178 #elif defined(TARGET_MIPS)
2179 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2180 CPUMIPSState *env = &mips_cpu->env;
2181 #elif defined(TARGET_TRICORE)
2182 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2183 CPUTriCoreState *env = &tricore_cpu->env;
2184 #elif defined(TARGET_S390X)
2185 S390CPU *s390_cpu = S390_CPU(cpu);
2186 CPUS390XState *env = &s390_cpu->env;
2189 cpu_synchronize_state(cpu);
2191 info = g_malloc0(sizeof(*info));
2192 info->value = g_malloc0(sizeof(*info->value));
2193 info->value->CPU = cpu->cpu_index;
2194 info->value->current = (cpu == first_cpu);
2195 info->value->halted = cpu->halted;
2196 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2197 info->value->thread_id = cpu->thread_id;
2198 #if defined(TARGET_I386)
2199 info->value->arch = CPU_INFO_ARCH_X86;
2200 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2201 #elif defined(TARGET_PPC)
2202 info->value->arch = CPU_INFO_ARCH_PPC;
2203 info->value->u.ppc.nip = env->nip;
2204 #elif defined(TARGET_SPARC)
2205 info->value->arch = CPU_INFO_ARCH_SPARC;
2206 info->value->u.q_sparc.pc = env->pc;
2207 info->value->u.q_sparc.npc = env->npc;
2208 #elif defined(TARGET_MIPS)
2209 info->value->arch = CPU_INFO_ARCH_MIPS;
2210 info->value->u.q_mips.PC = env->active_tc.PC;
2211 #elif defined(TARGET_TRICORE)
2212 info->value->arch = CPU_INFO_ARCH_TRICORE;
2213 info->value->u.tricore.PC = env->PC;
2214 #elif defined(TARGET_S390X)
2215 info->value->arch = CPU_INFO_ARCH_S390;
2216 info->value->u.s390.cpu_state = env->cpu_state;
2217 #elif defined(TARGET_RISCV)
2218 info->value->arch = CPU_INFO_ARCH_RISCV;
2219 info->value->u.riscv.pc = env->pc;
2221 info->value->arch = CPU_INFO_ARCH_OTHER;
2223 info->value->has_props = !!mc->cpu_index_to_instance_props;
2224 if (info->value->has_props) {
2225 CpuInstanceProperties *props;
2226 props = g_malloc0(sizeof(*props));
2227 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2228 info->value->props = props;
2231 /* XXX: waiting for the qapi to support GSList */
2233 head = cur_item = info;
2235 cur_item->next = info;
2243 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2246 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2247 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2250 case SYS_EMU_TARGET_I386:
2251 case SYS_EMU_TARGET_X86_64:
2252 return CPU_INFO_ARCH_X86;
2254 case SYS_EMU_TARGET_PPC:
2255 case SYS_EMU_TARGET_PPC64:
2256 return CPU_INFO_ARCH_PPC;
2258 case SYS_EMU_TARGET_SPARC:
2259 case SYS_EMU_TARGET_SPARC64:
2260 return CPU_INFO_ARCH_SPARC;
2262 case SYS_EMU_TARGET_MIPS:
2263 case SYS_EMU_TARGET_MIPSEL:
2264 case SYS_EMU_TARGET_MIPS64:
2265 case SYS_EMU_TARGET_MIPS64EL:
2266 return CPU_INFO_ARCH_MIPS;
2268 case SYS_EMU_TARGET_TRICORE:
2269 return CPU_INFO_ARCH_TRICORE;
2271 case SYS_EMU_TARGET_S390X:
2272 return CPU_INFO_ARCH_S390;
2274 case SYS_EMU_TARGET_RISCV32:
2275 case SYS_EMU_TARGET_RISCV64:
2276 return CPU_INFO_ARCH_RISCV;
2279 return CPU_INFO_ARCH_OTHER;
2283 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2286 S390CPU *s390_cpu = S390_CPU(cpu);
2287 CPUS390XState *env = &s390_cpu->env;
2289 info->cpu_state = env->cpu_state;
2296 * fast means: we NEVER interrupt vCPU threads to retrieve
2297 * information from KVM.
2299 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2301 MachineState *ms = MACHINE(qdev_get_machine());
2302 MachineClass *mc = MACHINE_GET_CLASS(ms);
2303 CpuInfoFastList *head = NULL, *cur_item = NULL;
2304 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2309 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2310 info->value = g_malloc0(sizeof(*info->value));
2312 info->value->cpu_index = cpu->cpu_index;
2313 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2314 info->value->thread_id = cpu->thread_id;
2316 info->value->has_props = !!mc->cpu_index_to_instance_props;
2317 if (info->value->has_props) {
2318 CpuInstanceProperties *props;
2319 props = g_malloc0(sizeof(*props));
2320 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2321 info->value->props = props;
2324 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2325 info->value->target = target;
2326 if (target == SYS_EMU_TARGET_S390X) {
2327 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2331 head = cur_item = info;
2333 cur_item->next = info;
2341 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2342 bool has_cpu, int64_t cpu_index, Error **errp)
2348 int64_t orig_addr = addr, orig_size = size;
2354 cpu = qemu_get_cpu(cpu_index);
2356 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2361 f = fopen(filename, "wb");
2363 error_setg_file_open(errp, errno, filename);
2371 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2372 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2373 " specified", orig_addr, orig_size);
2376 if (fwrite(buf, 1, l, f) != l) {
2377 error_setg(errp, QERR_IO_ERROR);
2388 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2395 f = fopen(filename, "wb");
2397 error_setg_file_open(errp, errno, filename);
2405 cpu_physical_memory_read(addr, buf, l);
2406 if (fwrite(buf, 1, l, f) != l) {
2407 error_setg(errp, QERR_IO_ERROR);
2418 void qmp_inject_nmi(Error **errp)
2420 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2423 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2429 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2430 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2431 if (icount_align_option) {
2432 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2433 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2435 cpu_fprintf(f, "Max guest delay NA\n");
2436 cpu_fprintf(f, "Max guest advance NA\n");