Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* QEMU System Emulator
	3	*
	4	* Copyright (c) 2003-2008 Fabrice Bellard
	5	*
	6	* Permission is hereby granted, free of charge, to any person obtaining a copy
	7	* of this software and associated documentation files (the "Software"), to deal
	8	* in the Software without restriction, including without limitation the rights
	9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	10	* copies of the Software, and to permit persons to whom the Software is
	11	* furnished to do so, subject to the following conditions:
	12	*
	13	* The above copyright notice and this permission notice shall be included in
	14	* all copies or substantial portions of the Software.
	15	*
	16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	22	* THE SOFTWARE.
	23	*/
	24
	25	#include "qemu/osdep.h"
	26	#include "qemu/config-file.h"
	27	#include "cpu.h"
	28	#include "monitor/monitor.h"
	29	#include "qapi/error.h"
	30	#include "qapi/qapi-commands-misc.h"
	31	#include "qapi/qapi-events-run-state.h"
	32	#include "qapi/qmp/qerror.h"
	33	#include "qemu/error-report.h"
	34	#include "qemu/qemu-print.h"
	35	#include "sysemu/sysemu.h"
	36	#include "sysemu/block-backend.h"
	37	#include "exec/gdbstub.h"
	38	#include "sysemu/dma.h"
	39	#include "sysemu/hw_accel.h"
	40	#include "sysemu/kvm.h"
	41	#include "sysemu/hax.h"
	42	#include "sysemu/hvf.h"
	43	#include "sysemu/whpx.h"
	44	#include "exec/exec-all.h"
	45
	46	#include "qemu/thread.h"
	47	#include "sysemu/cpus.h"
	48	#include "sysemu/qtest.h"
	49	#include "qemu/main-loop.h"
	50	#include "qemu/option.h"
	51	#include "qemu/bitmap.h"
	52	#include "qemu/seqlock.h"
	53	#include "qemu/guest-random.h"
	54	#include "tcg.h"
	55	#include "hw/nmi.h"
	56	#include "sysemu/replay.h"
	57	#include "hw/boards.h"
	58
	59	#ifdef CONFIG_LINUX
	60
	61	#include <sys/prctl.h>
	62
	/*
	 * Fallback values for the PR_MCE_KILL* prctl constants.  Each one is
	 * defined here only when <sys/prctl.h> did not already provide it.
	 */
	63	#ifndef PR_MCE_KILL
	64	#define PR_MCE_KILL 33
	65	#endif
	66
	67	#ifndef PR_MCE_KILL_SET
	68	#define PR_MCE_KILL_SET 1
	69	#endif
	70
	71	#ifndef PR_MCE_KILL_EARLY
	72	#define PR_MCE_KILL_EARLY 1
	73	#endif
	74
	75	#endif /* CONFIG_LINUX */
	76
	/* NOTE(review): max_delay/max_advance are not used in this part of the file; their consumers are elsewhere. */
	77	int64_t max_delay;
	78	int64_t max_advance;
	79
	80	/* vcpu throttling controls */
	/* Timer that periodically runs cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
	81	static QEMUTimer *throttle_timer;
	/* Current throttle percentage; 0 means throttling is off. Accessed with atomic_read()/atomic_set(). */
	82	static unsigned int throttle_percentage;
	83
	/* Range that cpu_throttle_set() clamps the requested percentage to. */
	84	#define CPU_THROTTLE_PCT_MIN 1
	85	#define CPU_THROTTLE_PCT_MAX 99
	/* Base timeslice in nanoseconds (10 ms) from which sleep time and timer period are derived. */
	86	#define CPU_THROTTLE_TIMESLICE_NS 10000000
	87
	88	bool cpu_is_stopped(CPUState *cpu)
	89	{
	90	return cpu->stopped \|\| !runstate_is_running();
	91	}
	92
	93	static bool cpu_thread_is_idle(CPUState *cpu)
	94	{
	95	if (cpu->stop \|\| cpu->queued_work_first) {
	96	return false;
	97	}
	98	if (cpu_is_stopped(cpu)) {
	99	return true;
	100	}
	101	if (!cpu->halted \|\| cpu_has_work(cpu) \|\|
	102	kvm_halt_in_kernel()) {
	103	return false;
	104	}
	105	return true;
	106	}
	107
	108	static bool all_cpu_threads_idle(void)
	109	{
	110	CPUState *cpu;
	111
	112	CPU_FOREACH(cpu) {
	113	if (!cpu_thread_is_idle(cpu)) {
	114	return false;
	115	}
	116	}
	117	return true;
	118	}
	119
	120	/***********************************************************/
	121	/* guest cycle counter */
	122
	123	/* Protected by TimersState seqlock */
	124
	/* Value of the icount "sleep" option (default true); assigned in configure_icount(). */
	125	static bool icount_sleep = true;
	126	/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
	/* Upper bound that icount_adjust() applies to icount_time_shift. */
	127	#define MAX_ICOUNT_SHIFT 10
	128
	/*
	 * State behind the VM tick counter, the VM clock and the instruction
	 * counter (icount).  A single instance, timers_state, is used by this file.
	 */
	129	typedef struct TimersState {
	130	/* Protected by BQL. */
	131	int64_t cpu_ticks_prev;
	132	int64_t cpu_ticks_offset;
	133
	134	/* Protect fields that can be respectively read outside the
	135	* BQL, and written from multiple threads.
	136	*/
	137	QemuSeqLock vm_clock_seqlock;
	138	QemuSpin vm_clock_lock;
	139
	/* Non-zero while ticks are enabled: set by cpu_enable_ticks(), cleared by cpu_disable_ticks(). */
	140	int16_t cpu_ticks_enabled;
	141
	142	/* Conversion factor from emulated instructions to virtual clock ticks. */
	/* Applied as a left shift of the instruction count (see cpu_icount_to_ns()). */
	143	int16_t icount_time_shift;
	144
	145	/* Compensate for varying guest execution speed. */
	146	int64_t qemu_icount_bias;
	147
	/* -1 while no warp is pending; icount_warp_rt() resets it to -1 after accounting. */
	148	int64_t vm_clock_warp_start;
	/* Offset added to get_clock() by cpu_get_clock_locked(); holds the frozen clock while ticks are disabled. */
	149	int64_t cpu_clock_offset;
	150
	151	/* Only written by TCG thread */
	152	int64_t qemu_icount;
	153
	154	/* for adjusting icount */
	155	QEMUTimer *icount_rt_timer;
	156	QEMUTimer *icount_vm_timer;
	157	QEMUTimer *icount_warp_timer;
	158	} TimersState;
	159
	/* The single TimersState instance used by every clock/icount helper in this file. */
	160	static TimersState timers_state;
	/* True when multi-threaded TCG is selected; assigned by qemu_tcg_configure(). */
	161	bool mttcg_enabled;
	162
	163	/*
	164	* We default to false if we know other options have been enabled
	165	* which are currently incompatible with MTTCG. Otherwise when each
	166	* guest (target) has been updated to support:
	167	* - atomic instructions
	168	* - memory ordering primitives (barriers)
	169	* they can set the appropriate CONFIG flags in ${target}-softmmu.mak
	170	*
	171	* Once a guest architecture has been converted to the new primitives
	172	* there are two remaining limitations to check.
	173	*
	174	* - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
	175	* - The host must have a stronger memory order than the guest
	176	*
	177	* It may be possible in future to support strong guests on weak hosts
	178	* but that will require tagging all load/stores in a guest with their
	179	* implicit memory order requirements which would likely slow things
	180	* down a lot.
	181	*/
	182
	/*
	 * Return true when every ordering bit required by the guest
	 * (TCG_GUEST_DEFAULT_MO) is also provided by the host
	 * (TCG_TARGET_DEFAULT_MO).  Returns false when either macro is undefined.
	 */
	static bool check_tcg_memory_orders_compatible(void)
	{
	#if !defined(TCG_GUEST_DEFAULT_MO) || !defined(TCG_TARGET_DEFAULT_MO)
	    return false;
	#else
	    return !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
	#endif
	}
	191
	/*
	 * Default for mttcg_enabled when the user gave no "thread" option:
	 * false when icount is in use, the guest is oversized, or the target
	 * does not define TARGET_SUPPORTS_MTTCG; otherwise the result of the
	 * memory-ordering compatibility check.
	 */
	static bool default_mttcg_enabled(void)
	{
	#ifdef TARGET_SUPPORTS_MTTCG
	    if (!use_icount && !TCG_OVERSIZED_GUEST) {
	        return check_tcg_memory_orders_compatible();
	    }
	#endif
	    return false;
	}
	204
	205	void qemu_tcg_configure(QemuOpts opts, Error *errp)
	206	{
	207	const char *t = qemu_opt_get(opts, "thread");
	208	if (t) {
	209	if (strcmp(t, "multi") == 0) {
	210	if (TCG_OVERSIZED_GUEST) {
	211	error_setg(errp, "No MTTCG when guest word size > hosts");
	212	} else if (use_icount) {
	213	error_setg(errp, "No MTTCG when icount is enabled");
	214	} else {
	215	#ifndef TARGET_SUPPORTS_MTTCG
	216	warn_report("Guest not yet converted to MTTCG - "
	217	"you may get unexpected results");
	218	#endif
	219	if (!check_tcg_memory_orders_compatible()) {
	220	warn_report("Guest expects a stronger memory ordering "
	221	"than the host provides");
	222	error_printf("This may cause strange/hard to debug errors\n");
	223	}
	224	mttcg_enabled = true;
	225	}
	226	} else if (strcmp(t, "single") == 0) {
	227	mttcg_enabled = false;
	228	} else {
	229	error_setg(errp, "Invalid 'thread' setting %s", t);
	230	}
	231	} else {
	232	mttcg_enabled = default_mttcg_enabled();
	233	}
	234	}
	235
	236	/* The current number of executed instructions is based on what we
	237	* originally budgeted minus the current state of the decrementing
	238	* icount counters in extra/u16.low.
	239	*/
	240	static int64_t cpu_get_icount_executed(CPUState *cpu)
	241	{
	242	return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
	243	}
	244
	245	/*
	246	* Update the global shared timer_state.qemu_icount to take into
	247	* account executed instructions. This is done by the TCG vCPU
	248	* thread so the main-loop can see time has moved forward.
	249	*/
	250	static void cpu_update_icount_locked(CPUState *cpu)
	251	{
	252	int64_t executed = cpu_get_icount_executed(cpu);
	253	cpu->icount_budget -= executed;
	254
	255	atomic_set_i64(&timers_state.qemu_icount,
	256	timers_state.qemu_icount + executed);
	257	}
	258
	259	/*
	260	* Update the global shared timer_state.qemu_icount to take into
	261	* account executed instructions. This is done by the TCG vCPU
	262	* thread so the main-loop can see time has moved forward.
	263	*/
	264	void cpu_update_icount(CPUState *cpu)
	265	{
	266	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	267	&timers_state.vm_clock_lock);
	268	cpu_update_icount_locked(cpu);
	269	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	270	&timers_state.vm_clock_lock);
	271	}
	272
	273	static int64_t cpu_get_icount_raw_locked(void)
	274	{
	275	CPUState *cpu = current_cpu;
	276
	277	if (cpu && cpu->running) {
	278	if (!cpu->can_do_io) {
	279	error_report("Bad icount read");
	280	exit(1);
	281	}
	282	/* Take into account what has run */
	283	cpu_update_icount_locked(cpu);
	284	}
	285	/* The read is protected by the seqlock, but needs atomic64 to avoid UB */
	286	return atomic_read_i64(&timers_state.qemu_icount);
	287	}
	288
	289	static int64_t cpu_get_icount_locked(void)
	290	{
	291	int64_t icount = cpu_get_icount_raw_locked();
	292	return atomic_read_i64(&timers_state.qemu_icount_bias) +
	293	cpu_icount_to_ns(icount);
	294	}
	295
	296	int64_t cpu_get_icount_raw(void)
	297	{
	298	int64_t icount;
	299	unsigned start;
	300
	301	do {
	302	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	303	icount = cpu_get_icount_raw_locked();
	304	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	305
	306	return icount;
	307	}
	308
	309	/* Return the virtual CPU time, based on the instruction counter. */
	310	int64_t cpu_get_icount(void)
	311	{
	312	int64_t icount;
	313	unsigned start;
	314
	315	do {
	316	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	317	icount = cpu_get_icount_locked();
	318	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	319
	320	return icount;
	321	}
	322
	323	int64_t cpu_icount_to_ns(int64_t icount)
	324	{
	325	return icount << atomic_read(&timers_state.icount_time_shift);
	326	}
	327
	328	static int64_t cpu_get_ticks_locked(void)
	329	{
	330	int64_t ticks = timers_state.cpu_ticks_offset;
	331	if (timers_state.cpu_ticks_enabled) {
	332	ticks += cpu_get_host_ticks();
	333	}
	334
	335	if (timers_state.cpu_ticks_prev > ticks) {
	336	/* Non increasing ticks may happen if the host uses software suspend. */
	337	timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
	338	ticks = timers_state.cpu_ticks_prev;
	339	}
	340
	341	timers_state.cpu_ticks_prev = ticks;
	342	return ticks;
	343	}
	344
	345	/* return the time elapsed in VM between vm_start and vm_stop. Unless
	346	* icount is active, cpu_get_ticks() uses units of the host CPU cycle
	347	* counter.
	348	*/
	349	int64_t cpu_get_ticks(void)
	350	{
	351	int64_t ticks;
	352
	353	if (use_icount) {
	354	return cpu_get_icount();
	355	}
	356
	357	qemu_spin_lock(&timers_state.vm_clock_lock);
	358	ticks = cpu_get_ticks_locked();
	359	qemu_spin_unlock(&timers_state.vm_clock_lock);
	360	return ticks;
	361	}
	362
	363	static int64_t cpu_get_clock_locked(void)
	364	{
	365	int64_t time;
	366
	367	time = timers_state.cpu_clock_offset;
	368	if (timers_state.cpu_ticks_enabled) {
	369	time += get_clock();
	370	}
	371
	372	return time;
	373	}
	374
	375	/* Return the monotonic time elapsed in VM, i.e.,
	376	* the time between vm_start and vm_stop
	377	*/
	378	int64_t cpu_get_clock(void)
	379	{
	380	int64_t ti;
	381	unsigned start;
	382
	383	do {
	384	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	385	ti = cpu_get_clock_locked();
	386	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	387
	388	return ti;
	389	}
	390
	391	/* enable cpu_get_ticks()
	392	* Caller must hold BQL which serves as mutex for vm_clock_seqlock.
	393	*/
	394	void cpu_enable_ticks(void)
	395	{
	396	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	397	&timers_state.vm_clock_lock);
	398	if (!timers_state.cpu_ticks_enabled) {
	399	timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
	400	timers_state.cpu_clock_offset -= get_clock();
	401	timers_state.cpu_ticks_enabled = 1;
	402	}
	403	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	404	&timers_state.vm_clock_lock);
	405	}
	406
	407	/* disable cpu_get_ticks() : the clock is stopped. You must not call
	408	* cpu_get_ticks() after that.
	409	* Caller must hold BQL which serves as mutex for vm_clock_seqlock.
	410	*/
	411	void cpu_disable_ticks(void)
	412	{
	413	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	414	&timers_state.vm_clock_lock);
	415	if (timers_state.cpu_ticks_enabled) {
	416	timers_state.cpu_ticks_offset += cpu_get_host_ticks();
	417	timers_state.cpu_clock_offset = cpu_get_clock_locked();
	418	timers_state.cpu_ticks_enabled = 0;
	419	}
	420	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	421	&timers_state.vm_clock_lock);
	422	}
	423
	424	/* Correlation between real and virtual time is always going to be
	425	fairly approximate, so ignore small variation.
	426	When the guest is idle real and virtual time will be aligned in
	427	the IO wait loop. */
	428	#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
	429
	430	static void icount_adjust(void)
	431	{
	432	int64_t cur_time;
	433	int64_t cur_icount;
	434	int64_t delta;
	435
	436	/* Protected by TimersState mutex. */
	437	static int64_t last_delta;
	438
	439	/* If the VM is not running, then do nothing. */
	440	if (!runstate_is_running()) {
	441	return;
	442	}
	443
	444	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	445	&timers_state.vm_clock_lock);
	446	cur_time = cpu_get_clock_locked();
	447	cur_icount = cpu_get_icount_locked();
	448
	449	delta = cur_icount - cur_time;
	450	/* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
	451	if (delta > 0
	452	&& last_delta + ICOUNT_WOBBLE < delta * 2
	453	&& timers_state.icount_time_shift > 0) {
	454	/* The guest is getting too far ahead. Slow time down. */
	455	atomic_set(&timers_state.icount_time_shift,
	456	timers_state.icount_time_shift - 1);
	457	}
	458	if (delta < 0
	459	&& last_delta - ICOUNT_WOBBLE > delta * 2
	460	&& timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
	461	/* The guest is getting too far behind. Speed time up. */
	462	atomic_set(&timers_state.icount_time_shift,
	463	timers_state.icount_time_shift + 1);
	464	}
	465	last_delta = delta;
	466	atomic_set_i64(&timers_state.qemu_icount_bias,
	467	cur_icount - (timers_state.qemu_icount
	468	<< timers_state.icount_time_shift));
	469	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	470	&timers_state.vm_clock_lock);
	471	}
	472
	473	static void icount_adjust_rt(void *opaque)
	474	{
	475	timer_mod(timers_state.icount_rt_timer,
	476	qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
	477	icount_adjust();
	478	}
	479
	480	static void icount_adjust_vm(void *opaque)
	481	{
	482	timer_mod(timers_state.icount_vm_timer,
	483	qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
	484	NANOSECONDS_PER_SECOND / 10);
	485	icount_adjust();
	486	}
	487
	488	static int64_t qemu_icount_round(int64_t count)
	489	{
	490	int shift = atomic_read(&timers_state.icount_time_shift);
	491	return (count + (1 << shift) - 1) >> shift;
	492	}
	493
	494	static void icount_warp_rt(void)
	495	{
	496	unsigned seq;
	497	int64_t warp_start;
	498
	499	/* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
	500	* changes from -1 to another value, so the race here is okay.
	501	*/
	502	do {
	503	seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	504	warp_start = timers_state.vm_clock_warp_start;
	505	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
	506
	507	if (warp_start == -1) {
	508	return;
	509	}
	510
	511	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	512	&timers_state.vm_clock_lock);
	513	if (runstate_is_running()) {
	514	int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
	515	cpu_get_clock_locked());
	516	int64_t warp_delta;
	517
	518	warp_delta = clock - timers_state.vm_clock_warp_start;
	519	if (use_icount == 2) {
	520	/*
	521	* In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
	522	* far ahead of real time.
	523	*/
	524	int64_t cur_icount = cpu_get_icount_locked();
	525	int64_t delta = clock - cur_icount;
	526	warp_delta = MIN(warp_delta, delta);
	527	}
	528	atomic_set_i64(&timers_state.qemu_icount_bias,
	529	timers_state.qemu_icount_bias + warp_delta);
	530	}
	531	timers_state.vm_clock_warp_start = -1;
	532	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	533	&timers_state.vm_clock_lock);
	534
	535	if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
	536	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	537	}
	538	}
	539
	/*
	 * Callback of icount_warp_timer; simply accounts the pending warp.
	 * @opaque is unused.
	 */
	static void icount_timer_cb(void *opaque)
	{
	    /* No need for a checkpoint because the timer already synchronizes
	     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
	     */
	    icount_warp_rt();
	}
	547
	548	void qtest_clock_warp(int64_t dest)
	549	{
	550	int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
	551	AioContext *aio_context;
	552	assert(qtest_enabled());
	553	aio_context = qemu_get_aio_context();
	554	while (clock < dest) {
	555	int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	556	int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
	557
	558	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	559	&timers_state.vm_clock_lock);
	560	atomic_set_i64(&timers_state.qemu_icount_bias,
	561	timers_state.qemu_icount_bias + warp);
	562	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	563	&timers_state.vm_clock_lock);
	564
	565	qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
	566	timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
	567	clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
	568	}
	569	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	570	}
	571
	/*
	 * Start a clock warp when icount is in use and the VM is running.
	 *
	 * Outside replay-play mode the warp starts only if all vCPU threads are
	 * idle and qtest is not enabled.  With a QEMU_CLOCK_VIRTUAL deadline in
	 * the future, either the icount bias is advanced straight to the
	 * deadline (sleep disabled) or the warp start is recorded and
	 * icount_warp_timer is armed (sleep enabled).  A deadline of zero just
	 * notifies QEMU_CLOCK_VIRTUAL; a negative deadline does nothing beyond a
	 * one-time warning when sleep is disabled.
	 */
	572	void qemu_start_warp_timer(void)
	573	{
	574	int64_t clock;
	575	int64_t deadline;
	576
	577	if (!use_icount) {
	578	return;
	579	}
	580
	581	/* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
	582	* do not fire, so computing the deadline does not make sense.
	583	*/
	584	if (!runstate_is_running()) {
	585	return;
	586	}
	587
	588	if (replay_mode != REPLAY_MODE_PLAY) {
	589	if (!all_cpu_threads_idle()) {
	590	return;
	591	}
	592
	593	if (qtest_enabled()) {
	594	/* When testing, qtest commands advance icount. */
	595	return;
	596	}
	597
	598	replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
	599	} else {
	600	/* warp clock deterministically in record/replay mode */
	601	if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
	602	/* vCPU is sleeping and warp can't be started.
	603	It is probably a race condition: notification sent
	604	to vCPU was processed in advance and vCPU went to sleep.
	605	Therefore we have to wake it up for doing something. */
	606	if (replay_has_checkpoint()) {
	607	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	608	}
	609	return;
	610	}
	611	}
	612
	613	/* We want to use the earliest deadline from ALL vm_clocks */
	614	clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
	615	deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	616	if (deadline < 0) {
	/* Warn only once per process about the "no sleep, no timers" situation. */
	617	static bool notified;
	618	if (!icount_sleep && !notified) {
	619	warn_report("icount sleep disabled and no active timers");
	620	notified = true;
	621	}
	622	return;
	623	}
	624
	625	if (deadline > 0) {
	626	/*
	627	* Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
	628	* sleep. Otherwise, the CPU might be waiting for a future timer
	629	* interrupt to wake it up, but the interrupt never comes because
	630	* the vCPU isn't running any insns and thus doesn't advance the
	631	* QEMU_CLOCK_VIRTUAL.
	632	*/
	633	if (!icount_sleep) {
	634	/*
	635	* We never let VCPUs sleep in no sleep icount mode.
	636	* If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
	637	* to the next QEMU_CLOCK_VIRTUAL event and notify it.
	638	* It is useful when we want a deterministic execution time,
	639	* isolated from host latencies.
	640	*/
	641	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	642	&timers_state.vm_clock_lock);
	643	atomic_set_i64(&timers_state.qemu_icount_bias,
	644	timers_state.qemu_icount_bias + deadline);
	645	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	646	&timers_state.vm_clock_lock);
	647	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	648	} else {
	649	/*
	650	* We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
	651	* "real" time, (related to the time left until the next event) has
	652	* passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
	653	* This avoids that the warps are visible externally; for example,
	654	* you will not be sending network packets continuously instead of
	655	* every 100ms.
	656	*/
	657	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	658	&timers_state.vm_clock_lock);
	/* Keep the earliest warp start: only overwrite when unset (-1) or later than now. */
	659	if (timers_state.vm_clock_warp_start == -1
	660	\|\| timers_state.vm_clock_warp_start > clock) {
	661	timers_state.vm_clock_warp_start = clock;
	662	}
	663	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	664	&timers_state.vm_clock_lock);
	665	timer_mod_anticipate(timers_state.icount_warp_timer,
	666	clock + deadline);
	667	}
	668	} else if (deadline == 0) {
	669	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	670	}
	671	}
	672
	673	static void qemu_account_warp_timer(void)
	674	{
	675	if (!use_icount \|\| !icount_sleep) {
	676	return;
	677	}
	678
	679	/* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
	680	* do not fire, so computing the deadline does not make sense.
	681	*/
	682	if (!runstate_is_running()) {
	683	return;
	684	}
	685
	686	/* warp clock deterministically in record/replay mode */
	687	if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
	688	return;
	689	}
	690
	691	timer_del(timers_state.icount_warp_timer);
	692	icount_warp_rt();
	693	}
	694
	695	static bool icount_state_needed(void *opaque)
	696	{
	697	return use_icount;
	698	}
	699
	700	static bool warp_timer_state_needed(void *opaque)
	701	{
	702	TimersState *s = opaque;
	703	return s->icount_warp_timer != NULL;
	704	}
	705
	706	static bool adjust_timers_state_needed(void *opaque)
	707	{
	708	TimersState *s = opaque;
	709	return s->icount_rt_timer != NULL;
	710	}
	711
	712	/*
	713	* Subsection for warp timer migration. It is optional because the
	714	* warp timer may not have been created (see warp_timer_state_needed). */
	715	static const VMStateDescription icount_vmstate_warp_timer = {
	716	.name = "timer/icount/warp_timer",
	717	.version_id = 1,
	718	.minimum_version_id = 1,
	719	.needed = warp_timer_state_needed,
	720	.fields = (VMStateField[]) {
	721	VMSTATE_INT64(vm_clock_warp_start, TimersState),
	722	VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
	723	VMSTATE_END_OF_LIST()
	724	}
	725	};
	726
	/*
	 * Subsection carrying the two icount adjustment timers.  Sent only when
	 * adjust_timers_state_needed() reports that icount_rt_timer exists.
	 */
	727	static const VMStateDescription icount_vmstate_adjust_timers = {
	728	.name = "timer/icount/timers",
	729	.version_id = 1,
	730	.minimum_version_id = 1,
	731	.needed = adjust_timers_state_needed,
	732	.fields = (VMStateField[]) {
	733	VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
	734	VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
	735	VMSTATE_END_OF_LIST()
	736	}
	737	};
	738
	739	/*
	740	* This is a subsection for icount migration. It carries the icount bias
	741	* and counter, and is sent only when icount_state_needed() is true. */
	742	static const VMStateDescription icount_vmstate_timers = {
	743	.name = "timer/icount",
	744	.version_id = 1,
	745	.minimum_version_id = 1,
	746	.needed = icount_state_needed,
	747	.fields = (VMStateField[]) {
	748	VMSTATE_INT64(qemu_icount_bias, TimersState),
	749	VMSTATE_INT64(qemu_icount, TimersState),
	750	VMSTATE_END_OF_LIST()
	751	},
	/* Nested optional subsections: the warp timer and the adjustment timers. */
	752	.subsections = (const VMStateDescription*[]) {
	753	&icount_vmstate_warp_timer,
	754	&icount_vmstate_adjust_timers,
	755	NULL
	756	}
	757	};
	758
	/*
	 * Top-level migration description for timers_state; registered by
	 * cpu_ticks_init().  The icount state travels in an optional subsection.
	 */
	759	static const VMStateDescription vmstate_timers = {
	760	.name = "timer",
	761	.version_id = 2,
	762	.minimum_version_id = 1,
	763	.fields = (VMStateField[]) {
	764	VMSTATE_INT64(cpu_ticks_offset, TimersState),
	/* NOTE(review): 8 unused bytes in the stream; presumably a field that was removed - confirm against history. */
	765	VMSTATE_UNUSED(8),
	/* cpu_clock_offset is only present from version 2 onwards. */
	766	VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
	767	VMSTATE_END_OF_LIST()
	768	},
	769	.subsections = (const VMStateDescription*[]) {
	770	&icount_vmstate_timers,
	771	NULL
	772	}
	773	};
	774
	775	static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
	776	{
	777	double pct;
	778	double throttle_ratio;
	779	long sleeptime_ns;
	780
	781	if (!cpu_throttle_get_percentage()) {
	782	return;
	783	}
	784
	785	pct = (double)cpu_throttle_get_percentage()/100;
	786	throttle_ratio = pct / (1 - pct);
	787	sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
	788
	789	qemu_mutex_unlock_iothread();
	790	g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
	791	qemu_mutex_lock_iothread();
	792	atomic_set(&cpu->throttle_thread_scheduled, 0);
	793	}
	794
	795	static void cpu_throttle_timer_tick(void *opaque)
	796	{
	797	CPUState *cpu;
	798	double pct;
	799
	800	/* Stop the timer if needed */
	801	if (!cpu_throttle_get_percentage()) {
	802	return;
	803	}
	804	CPU_FOREACH(cpu) {
	805	if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
	806	async_run_on_cpu(cpu, cpu_throttle_thread,
	807	RUN_ON_CPU_NULL);
	808	}
	809	}
	810
	811	pct = (double)cpu_throttle_get_percentage()/100;
	812	timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
	813	CPU_THROTTLE_TIMESLICE_NS / (1-pct));
	814	}
	815
	816	void cpu_throttle_set(int new_throttle_pct)
	817	{
	818	/* Ensure throttle percentage is within valid range */
	819	new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
	820	new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
	821
	822	atomic_set(&throttle_percentage, new_throttle_pct);
	823
	824	timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
	825	CPU_THROTTLE_TIMESLICE_NS);
	826	}
	827
	828	void cpu_throttle_stop(void)
	829	{
	830	atomic_set(&throttle_percentage, 0);
	831	}
	832
	/* Return true while the throttle percentage is non-zero. */
	bool cpu_throttle_active(void)
	{
	    return cpu_throttle_get_percentage() ? true : false;
	}
	837
	838	int cpu_throttle_get_percentage(void)
	839	{
	840	return atomic_read(&throttle_percentage);
	841	}
	842
	843	void cpu_ticks_init(void)
	844	{
	845	seqlock_init(&timers_state.vm_clock_seqlock);
	846	qemu_spin_init(&timers_state.vm_clock_lock);
	847	vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
	848	throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
	849	cpu_throttle_timer_tick, NULL);
	850	}
	851
	852	void configure_icount(QemuOpts opts, Error *errp)
	853	{
	854	const char *option;
	855	char *rem_str = NULL;
	856
	857	option = qemu_opt_get(opts, "shift");
	858	if (!option) {
	859	if (qemu_opt_get(opts, "align") != NULL) {
	860	error_setg(errp, "Please specify shift option when using align");
	861	}
	862	return;
	863	}
	864
	865	icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
	866	if (icount_sleep) {
	867	timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
	868	icount_timer_cb, NULL);
	869	}
	870
	871	icount_align_option = qemu_opt_get_bool(opts, "align", false);
	872
	873	if (icount_align_option && !icount_sleep) {
	874	error_setg(errp, "align=on and sleep=off are incompatible");
	875	}
	876	if (strcmp(option, "auto") != 0) {
	877	errno = 0;
	878	timers_state.icount_time_shift = strtol(option, &rem_str, 0);
	879	if (errno != 0 \|\| *rem_str != '\0' \|\| !strlen(option)) {
	880	error_setg(errp, "icount: Invalid shift value");
	881	}
	882	use_icount = 1;
	883	return;
	884	} else if (icount_align_option) {
	885	error_setg(errp, "shift=auto and align=on are incompatible");
	886	} else if (!icount_sleep) {
	887	error_setg(errp, "shift=auto and sleep=off are incompatible");
	888	}
	889
	890	use_icount = 2;
	891
	892	/* 125MIPS seems a reasonable initial guess at the guest speed.
	893	It will be corrected fairly quickly anyway. */
	894	timers_state.icount_time_shift = 3;
	895
	896	/* Have both realtime and virtual time triggers for speed adjustment.
	897	The realtime trigger catches emulated time passing too slowly,
	898	the virtual time trigger catches emulated time passing too fast.
	899	Realtime triggers occur even when idle, so use them less frequently
	900	than VM triggers. */
	901	timers_state.vm_clock_warp_start = -1;
	902	timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
	903	icount_adjust_rt, NULL);
	904	timer_mod(timers_state.icount_rt_timer,
	905	qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
	906	timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
	907	icount_adjust_vm, NULL);
	908	timer_mod(timers_state.icount_vm_timer,
	909	qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
	910	NANOSECONDS_PER_SECOND / 10);
	911	}
	912
	913	/***********************************************************/
	914	/* TCG vCPU kick timer
	915	*
	916	* The kick timer is responsible for moving single threaded vCPU
	917	* emulation on to the next vCPU. If more than one vCPU is running a
	918	* timer event will force a cpu->exit so the next vCPU can get
	919	* scheduled.
	920	*
	921	* The timer is removed if all vCPUs are idle and restarted again once
	922	* idleness is complete.
	923	*/
	924
	/* Kick timer; created lazily by start_tcg_kick_timer() once there is more than one vCPU. */
	925	static QEMUTimer *tcg_kick_vcpu_timer;
	/* vCPU targeted by qemu_cpu_kick_rr_cpu(); read with atomic_mb_read() there. */
	926	static CPUState *tcg_current_rr_cpu;
	927
	/* Interval between kicks: a tenth of a second, in nanoseconds. */
	928	#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
	929
	930	static inline int64_t qemu_tcg_next_kick(void)
	931	{
	932	return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
	933	}
	934
	935	/* Kick the currently round-robin scheduled vCPU */
	936	static void qemu_cpu_kick_rr_cpu(void)
	937	{
	938	CPUState *cpu;
	939	do {
	940	cpu = atomic_mb_read(&tcg_current_rr_cpu);
	941	if (cpu) {
	942	cpu_exit(cpu);
	943	}
	944	} while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
	945	}
	946
	947	static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
	948	{
	949	}
	950
	951	void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
	952	{
	953	if (!use_icount \|\| type != QEMU_CLOCK_VIRTUAL) {
	954	qemu_notify_event();
	955	return;
	956	}
	957
	958	if (qemu_in_vcpu_thread()) {
	959	/* A CPU is currently running; kick it back out to the
	960	* tcg_cpu_exec() loop so it will recalculate its
	961	* icount deadline immediately.
	962	*/
	963	qemu_cpu_kick(current_cpu);
	964	} else if (first_cpu) {
	965	/* qemu_cpu_kick is not enough to kick a halted CPU out of
	966	* qemu_tcg_wait_io_event. async_run_on_cpu, instead,
	967	* causes cpu_thread_is_idle to return false. This way,
	968	* handle_icount_deadline can run.
	969	* If we have no CPUs at all for some reason, we don't
	970	* need to do anything.
	971	*/
	972	async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
	973	}
	974	}
	975
	976	static void kick_tcg_thread(void *opaque)
	977	{
	978	timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
	979	qemu_cpu_kick_rr_cpu();
	980	}
	981
	982	static void start_tcg_kick_timer(void)
	983	{
	984	assert(!mttcg_enabled);
	985	if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
	986	tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
	987	kick_tcg_thread, NULL);
	988	}
	989	if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
	990	timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
	991	}
	992	}
	993
	994	static void stop_tcg_kick_timer(void)
	995	{
	996	assert(!mttcg_enabled);
	997	if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
	998	timer_del(tcg_kick_vcpu_timer);
	999	}
	1000	}
	1001
	1002	/***********************************************************/
	1003	void hw_error(const char *fmt, ...)
	1004	{
	1005	va_list ap;
	1006	CPUState *cpu;
	1007
	1008	va_start(ap, fmt);
	1009	fprintf(stderr, "qemu: hardware error: ");
	1010	vfprintf(stderr, fmt, ap);
	1011	fprintf(stderr, "\n");
	1012	CPU_FOREACH(cpu) {
	1013	fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
	1014	cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
	1015	}
	1016	va_end(ap);
	1017	abort();
	1018	}
	1019
	1020	void cpu_synchronize_all_states(void)
	1021	{
	1022	CPUState *cpu;
	1023
	1024	CPU_FOREACH(cpu) {
	1025	cpu_synchronize_state(cpu);
	1026	/* TODO: move to cpu_synchronize_state() */
	1027	if (hvf_enabled()) {
	1028	hvf_cpu_synchronize_state(cpu);
	1029	}
	1030	}
	1031	}
	1032
	1033	void cpu_synchronize_all_post_reset(void)
	1034	{
	1035	CPUState *cpu;
	1036
	1037	CPU_FOREACH(cpu) {
	1038	cpu_synchronize_post_reset(cpu);
	1039	/* TODO: move to cpu_synchronize_post_reset() */
	1040	if (hvf_enabled()) {
	1041	hvf_cpu_synchronize_post_reset(cpu);
	1042	}
	1043	}
	1044	}
	1045
	1046	void cpu_synchronize_all_post_init(void)
	1047	{
	1048	CPUState *cpu;
	1049
	1050	CPU_FOREACH(cpu) {
	1051	cpu_synchronize_post_init(cpu);
	1052	/* TODO: move to cpu_synchronize_post_init() */
	1053	if (hvf_enabled()) {
	1054	hvf_cpu_synchronize_post_init(cpu);
	1055	}
	1056	}
	1057	}
	1058
	1059	void cpu_synchronize_all_pre_loadvm(void)
	1060	{
	1061	CPUState *cpu;
	1062
	1063	CPU_FOREACH(cpu) {
	1064	cpu_synchronize_pre_loadvm(cpu);
	1065	}
	1066	}
	1067
	1068	static int do_vm_stop(RunState state, bool send_stop)
	1069	{
	1070	int ret = 0;
	1071
	1072	if (runstate_is_running()) {
	1073	cpu_disable_ticks();
	1074	pause_all_vcpus();
	1075	runstate_set(state);
	1076	vm_state_notify(0, state);
	1077	if (send_stop) {
	1078	qapi_event_send_stop();
	1079	}
	1080	}
	1081
	1082	bdrv_drain_all();
	1083	replay_disable_events();
	1084	ret = bdrv_flush_all();
	1085
	1086	return ret;
	1087	}
	1088
	1089	/* Special vm_stop() variant for terminating the process. Historically clients
	1090	* did not expect a QMP STOP event and so we need to retain compatibility.
	1091	*/
	1092	int vm_shutdown(void)
	1093	{
	1094	return do_vm_stop(RUN_STATE_SHUTDOWN, false);
	1095	}
	1096
	1097	static bool cpu_can_run(CPUState *cpu)
	1098	{
	1099	if (cpu->stop) {
	1100	return false;
	1101	}
	1102	if (cpu_is_stopped(cpu)) {
	1103	return false;
	1104	}
	1105	return true;
	1106	}
	1107
	1108	static void cpu_handle_guest_debug(CPUState *cpu)
	1109	{
	1110	gdb_set_stop_cpu(cpu);
	1111	qemu_system_debug_request();
	1112	cpu->stopped = true;
	1113	}
	1114
	1115	#ifdef CONFIG_LINUX
	1116	static void sigbus_reraise(void)
	1117	{
	1118	sigset_t set;
	1119	struct sigaction action;
	1120
	1121	memset(&action, 0, sizeof(action));
	1122	action.sa_handler = SIG_DFL;
	1123	if (!sigaction(SIGBUS, &action, NULL)) {
	1124	raise(SIGBUS);
	1125	sigemptyset(&set);
	1126	sigaddset(&set, SIGBUS);
	1127	pthread_sigmask(SIG_UNBLOCK, &set, NULL);
	1128	}
	1129	perror("Failed to re-raise SIGBUS!\n");
	1130	abort();
	1131	}
	1132
	1133	static void sigbus_handler(int n, siginfo_t siginfo, void ctx)
	1134	{
	1135	if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
	1136	sigbus_reraise();
	1137	}
	1138
	1139	if (current_cpu) {
	1140	/* Called asynchronously in VCPU thread. */
	1141	if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
	1142	sigbus_reraise();
	1143	}
	1144	} else {
	1145	/* Called synchronously (via signalfd) in main thread. */
	1146	if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
	1147	sigbus_reraise();
	1148	}
	1149	}
	1150	}
	1151
	1152	static void qemu_init_sigbus(void)
	1153	{
	1154	struct sigaction action;
	1155
	1156	memset(&action, 0, sizeof(action));
	1157	action.sa_flags = SA_SIGINFO;
	1158	action.sa_sigaction = sigbus_handler;
	1159	sigaction(SIGBUS, &action, NULL);
	1160
	1161	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
	1162	}
	1163	#else /* !CONFIG_LINUX */
	1164	static void qemu_init_sigbus(void)
	1165	{
	1166	}
	1167	#endif /* !CONFIG_LINUX */
	1168
	1169	static QemuMutex qemu_global_mutex;
	1170
	1171	static QemuThread io_thread;
	1172
	1173	/* cpu creation */
	1174	static QemuCond qemu_cpu_cond;
	1175	/* system init */
	1176	static QemuCond qemu_pause_cond;
	1177
	1178	void qemu_init_cpu_loop(void)
	1179	{
	1180	qemu_init_sigbus();
	1181	qemu_cond_init(&qemu_cpu_cond);
	1182	qemu_cond_init(&qemu_pause_cond);
	1183	qemu_mutex_init(&qemu_global_mutex);
	1184
	1185	qemu_thread_get_self(&io_thread);
	1186	}
	1187
	1188	void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
	1189	{
	1190	do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
	1191	}
	1192
	1193	static void qemu_kvm_destroy_vcpu(CPUState *cpu)
	1194	{
	1195	if (kvm_destroy_vcpu(cpu) < 0) {
	1196	error_report("kvm_destroy_vcpu failed");
	1197	exit(EXIT_FAILURE);
	1198	}
	1199	}
	1200
	1201	static void qemu_tcg_destroy_vcpu(CPUState *cpu)
	1202	{
	1203	}
	1204
	1205	static void qemu_cpu_stop(CPUState *cpu, bool exit)
	1206	{
	1207	g_assert(qemu_cpu_is_self(cpu));
	1208	cpu->stop = false;
	1209	cpu->stopped = true;
	1210	if (exit) {
	1211	cpu_exit(cpu);
	1212	}
	1213	qemu_cond_broadcast(&qemu_pause_cond);
	1214	}
	1215
	1216	static void qemu_wait_io_event_common(CPUState *cpu)
	1217	{
	1218	atomic_mb_set(&cpu->thread_kicked, false);
	1219	if (cpu->stop) {
	1220	qemu_cpu_stop(cpu, false);
	1221	}
	1222	process_queued_cpu_work(cpu);
	1223	}
	1224
	1225	static void qemu_tcg_rr_wait_io_event(void)
	1226	{
	1227	CPUState *cpu;
	1228
	1229	while (all_cpu_threads_idle()) {
	1230	stop_tcg_kick_timer();
	1231	qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
	1232	}
	1233
	1234	start_tcg_kick_timer();
	1235
	1236	CPU_FOREACH(cpu) {
	1237	qemu_wait_io_event_common(cpu);
	1238	}
	1239	}
	1240
	1241	static void qemu_wait_io_event(CPUState *cpu)
	1242	{
	1243	while (cpu_thread_is_idle(cpu)) {
	1244	qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
	1245	}
	1246
	1247	#ifdef _WIN32
	1248	/* Eat dummy APC queued by qemu_cpu_kick_thread. */
	1249	if (!tcg_enabled()) {
	1250	SleepEx(0, TRUE);
	1251	}
	1252	#endif
	1253	qemu_wait_io_event_common(cpu);
	1254	}
	1255
	1256	static void qemu_kvm_cpu_thread_fn(void arg)
	1257	{
	1258	CPUState *cpu = arg;
	1259	int r;
	1260
	1261	rcu_register_thread();
	1262
	1263	qemu_mutex_lock_iothread();
	1264	qemu_thread_get_self(cpu->thread);
	1265	cpu->thread_id = qemu_get_thread_id();
	1266	cpu->can_do_io = 1;
	1267	current_cpu = cpu;
	1268
	1269	r = kvm_init_vcpu(cpu);
	1270	if (r < 0) {
	1271	error_report("kvm_init_vcpu failed: %s", strerror(-r));
	1272	exit(1);
	1273	}
	1274
	1275	kvm_init_cpu_signals(cpu);
	1276
	1277	/* signal CPU creation */
	1278	cpu->created = true;
	1279	qemu_cond_signal(&qemu_cpu_cond);
	1280	qemu_guest_random_seed_thread_part2(cpu->random_seed);
	1281
	1282	do {
	1283	if (cpu_can_run(cpu)) {
	1284	r = kvm_cpu_exec(cpu);
	1285	if (r == EXCP_DEBUG) {
	1286	cpu_handle_guest_debug(cpu);
	1287	}
	1288	}
	1289	qemu_wait_io_event(cpu);
	1290	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1291
	1292	qemu_kvm_destroy_vcpu(cpu);
	1293	cpu->created = false;
	1294	qemu_cond_signal(&qemu_cpu_cond);
	1295	qemu_mutex_unlock_iothread();
	1296	rcu_unregister_thread();
	1297	return NULL;
	1298	}
	1299
	1300	static void qemu_dummy_cpu_thread_fn(void arg)
	1301	{
	1302	#ifdef _WIN32
	1303	error_report("qtest is not supported under Windows");
	1304	exit(1);
	1305	#else
	1306	CPUState *cpu = arg;
	1307	sigset_t waitset;
	1308	int r;
	1309
	1310	rcu_register_thread();
	1311
	1312	qemu_mutex_lock_iothread();
	1313	qemu_thread_get_self(cpu->thread);
	1314	cpu->thread_id = qemu_get_thread_id();
	1315	cpu->can_do_io = 1;
	1316	current_cpu = cpu;
	1317
	1318	sigemptyset(&waitset);
	1319	sigaddset(&waitset, SIG_IPI);
	1320
	1321	/* signal CPU creation */
	1322	cpu->created = true;
	1323	qemu_cond_signal(&qemu_cpu_cond);
	1324	qemu_guest_random_seed_thread_part2(cpu->random_seed);
	1325
	1326	do {
	1327	qemu_mutex_unlock_iothread();
	1328	do {
	1329	int sig;
	1330	r = sigwait(&waitset, &sig);
	1331	} while (r == -1 && (errno == EAGAIN \|\| errno == EINTR));
	1332	if (r == -1) {
	1333	perror("sigwait");
	1334	exit(1);
	1335	}
	1336	qemu_mutex_lock_iothread();
	1337	qemu_wait_io_event(cpu);
	1338	} while (!cpu->unplug);
	1339
	1340	qemu_mutex_unlock_iothread();
	1341	rcu_unregister_thread();
	1342	return NULL;
	1343	#endif
	1344	}
	1345
	1346	static int64_t tcg_get_icount_limit(void)
	1347	{
	1348	int64_t deadline;
	1349
	1350	if (replay_mode != REPLAY_MODE_PLAY) {
	1351	deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	1352
	1353	/* Maintain prior (possibly buggy) behaviour where if no deadline
	1354	* was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
	1355	* INT32_MAX nanoseconds ahead, we still use INT32_MAX
	1356	* nanoseconds.
	1357	*/
	1358	if ((deadline < 0) \|\| (deadline > INT32_MAX)) {
	1359	deadline = INT32_MAX;
	1360	}
	1361
	1362	return qemu_icount_round(deadline);
	1363	} else {
	1364	return replay_get_instructions();
	1365	}
	1366	}
	1367
	1368	static void handle_icount_deadline(void)
	1369	{
	1370	assert(qemu_in_vcpu_thread());
	1371	if (use_icount) {
	1372	int64_t deadline =
	1373	qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	1374
	1375	if (deadline == 0) {
	1376	/* Wake up other AioContexts. */
	1377	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	1378	qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
	1379	}
	1380	}
	1381	}
	1382
	1383	static void prepare_icount_for_run(CPUState *cpu)
	1384	{
	1385	if (use_icount) {
	1386	int insns_left;
	1387
	1388	/* These should always be cleared by process_icount_data after
	1389	* each vCPU execution. However u16.high can be raised
	1390	* asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
	1391	*/
	1392	g_assert(cpu->icount_decr.u16.low == 0);
	1393	g_assert(cpu->icount_extra == 0);
	1394
	1395	cpu->icount_budget = tcg_get_icount_limit();
	1396	insns_left = MIN(0xffff, cpu->icount_budget);
	1397	cpu->icount_decr.u16.low = insns_left;
	1398	cpu->icount_extra = cpu->icount_budget - insns_left;
	1399
	1400	replay_mutex_lock();
	1401	}
	1402	}
	1403
	1404	static void process_icount_data(CPUState *cpu)
	1405	{
	1406	if (use_icount) {
	1407	/* Account for executed instructions */
	1408	cpu_update_icount(cpu);
	1409
	1410	/* Reset the counters */
	1411	cpu->icount_decr.u16.low = 0;
	1412	cpu->icount_extra = 0;
	1413	cpu->icount_budget = 0;
	1414
	1415	replay_account_executed_instructions();
	1416
	1417	replay_mutex_unlock();
	1418	}
	1419	}
	1420
	1421
	1422	static int tcg_cpu_exec(CPUState *cpu)
	1423	{
	1424	int ret;
	1425	#ifdef CONFIG_PROFILER
	1426	int64_t ti;
	1427	#endif
	1428
	1429	assert(tcg_enabled());
	1430	#ifdef CONFIG_PROFILER
	1431	ti = profile_getclock();
	1432	#endif
	1433	cpu_exec_start(cpu);
	1434	ret = cpu_exec(cpu);
	1435	cpu_exec_end(cpu);
	1436	#ifdef CONFIG_PROFILER
	1437	atomic_set(&tcg_ctx->prof.cpu_exec_time,
	1438	tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
	1439	#endif
	1440	return ret;
	1441	}
	1442
	1443	/* Destroy any remaining vCPUs which have been unplugged and have
	1444	* finished running
	1445	*/
	1446	static void deal_with_unplugged_cpus(void)
	1447	{
	1448	CPUState *cpu;
	1449
	1450	CPU_FOREACH(cpu) {
	1451	if (cpu->unplug && !cpu_can_run(cpu)) {
	1452	qemu_tcg_destroy_vcpu(cpu);
	1453	cpu->created = false;
	1454	qemu_cond_signal(&qemu_cpu_cond);
	1455	break;
	1456	}
	1457	}
	1458	}
	1459
	1460	/* Single-threaded TCG
	1461	*
	1462	* In the single-threaded case each vCPU is simulated in turn. If
	1463	* there is more than a single vCPU we create a simple timer to kick
	1464	* the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
	1465	* This is done explicitly rather than relying on side-effects
	1466	* elsewhere.
	1467	*/
	1468
	1469	static void qemu_tcg_rr_cpu_thread_fn(void arg)
	1470	{
	1471	CPUState *cpu = arg;
	1472
	1473	assert(tcg_enabled());
	1474	rcu_register_thread();
	1475	tcg_register_thread();
	1476
	1477	qemu_mutex_lock_iothread();
	1478	qemu_thread_get_self(cpu->thread);
	1479
	1480	cpu->thread_id = qemu_get_thread_id();
	1481	cpu->created = true;
	1482	cpu->can_do_io = 1;
	1483	qemu_cond_signal(&qemu_cpu_cond);
	1484	qemu_guest_random_seed_thread_part2(cpu->random_seed);
	1485
	1486	/* wait for initial kick-off after machine start */
	1487	while (first_cpu->stopped) {
	1488	qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
	1489
	1490	/* process any pending work */
	1491	CPU_FOREACH(cpu) {
	1492	current_cpu = cpu;
	1493	qemu_wait_io_event_common(cpu);
	1494	}
	1495	}
	1496
	1497	start_tcg_kick_timer();
	1498
	1499	cpu = first_cpu;
	1500
	1501	/* process any pending work */
	1502	cpu->exit_request = 1;
	1503
	1504	while (1) {
	1505	qemu_mutex_unlock_iothread();
	1506	replay_mutex_lock();
	1507	qemu_mutex_lock_iothread();
	1508	/* Account partial waits to QEMU_CLOCK_VIRTUAL. */
	1509	qemu_account_warp_timer();
	1510
	1511	/* Run the timers here. This is much more efficient than
	1512	* waking up the I/O thread and waiting for completion.
	1513	*/
	1514	handle_icount_deadline();
	1515
	1516	replay_mutex_unlock();
	1517
	1518	if (!cpu) {
	1519	cpu = first_cpu;
	1520	}
	1521
	1522	while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
	1523
	1524	atomic_mb_set(&tcg_current_rr_cpu, cpu);
	1525	current_cpu = cpu;
	1526
	1527	qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
	1528	(cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
	1529
	1530	if (cpu_can_run(cpu)) {
	1531	int r;
	1532
	1533	qemu_mutex_unlock_iothread();
	1534	prepare_icount_for_run(cpu);
	1535
	1536	r = tcg_cpu_exec(cpu);
	1537
	1538	process_icount_data(cpu);
	1539	qemu_mutex_lock_iothread();
	1540
	1541	if (r == EXCP_DEBUG) {
	1542	cpu_handle_guest_debug(cpu);
	1543	break;
	1544	} else if (r == EXCP_ATOMIC) {
	1545	qemu_mutex_unlock_iothread();
	1546	cpu_exec_step_atomic(cpu);
	1547	qemu_mutex_lock_iothread();
	1548	break;
	1549	}
	1550	} else if (cpu->stop) {
	1551	if (cpu->unplug) {
	1552	cpu = CPU_NEXT(cpu);
	1553	}
	1554	break;
	1555	}
	1556
	1557	cpu = CPU_NEXT(cpu);
	1558	} /* while (cpu && !cpu->exit_request).. */
	1559
	1560	/* Does not need atomic_mb_set because a spurious wakeup is okay. */
	1561	atomic_set(&tcg_current_rr_cpu, NULL);
	1562
	1563	if (cpu && cpu->exit_request) {
	1564	atomic_mb_set(&cpu->exit_request, 0);
	1565	}
	1566
	1567	if (use_icount && all_cpu_threads_idle()) {
	1568	/*
	1569	* When all cpus are sleeping (e.g in WFI), to avoid a deadlock
	1570	* in the main_loop, wake it up in order to start the warp timer.
	1571	*/
	1572	qemu_notify_event();
	1573	}
	1574
	1575	qemu_tcg_rr_wait_io_event();
	1576	deal_with_unplugged_cpus();
	1577	}
	1578
	1579	rcu_unregister_thread();
	1580	return NULL;
	1581	}
	1582
	1583	static void qemu_hax_cpu_thread_fn(void arg)
	1584	{
	1585	CPUState *cpu = arg;
	1586	int r;
	1587
	1588	rcu_register_thread();
	1589	qemu_mutex_lock_iothread();
	1590	qemu_thread_get_self(cpu->thread);
	1591
	1592	cpu->thread_id = qemu_get_thread_id();
	1593	cpu->created = true;
	1594	cpu->halted = 0;
	1595	current_cpu = cpu;
	1596
	1597	hax_init_vcpu(cpu);
	1598	qemu_cond_signal(&qemu_cpu_cond);
	1599	qemu_guest_random_seed_thread_part2(cpu->random_seed);
	1600
	1601	do {
	1602	if (cpu_can_run(cpu)) {
	1603	r = hax_smp_cpu_exec(cpu);
	1604	if (r == EXCP_DEBUG) {
	1605	cpu_handle_guest_debug(cpu);
	1606	}
	1607	}
	1608
	1609	qemu_wait_io_event(cpu);
	1610	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1611	rcu_unregister_thread();
	1612	return NULL;
	1613	}
	1614
	1615	/* The HVF-specific vCPU thread function. This one should only run when the host
	1616	* CPU supports the VMX "unrestricted guest" feature. */
	1617	static void qemu_hvf_cpu_thread_fn(void arg)
	1618	{
	1619	CPUState *cpu = arg;
	1620
	1621	int r;
	1622
	1623	assert(hvf_enabled());
	1624
	1625	rcu_register_thread();
	1626
	1627	qemu_mutex_lock_iothread();
	1628	qemu_thread_get_self(cpu->thread);
	1629
	1630	cpu->thread_id = qemu_get_thread_id();
	1631	cpu->can_do_io = 1;
	1632	current_cpu = cpu;
	1633
	1634	hvf_init_vcpu(cpu);
	1635
	1636	/* signal CPU creation */
	1637	cpu->created = true;
	1638	qemu_cond_signal(&qemu_cpu_cond);
	1639	qemu_guest_random_seed_thread_part2(cpu->random_seed);
	1640
	1641	do {
	1642	if (cpu_can_run(cpu)) {
	1643	r = hvf_vcpu_exec(cpu);
	1644	if (r == EXCP_DEBUG) {
	1645	cpu_handle_guest_debug(cpu);
	1646	}
	1647	}
	1648	qemu_wait_io_event(cpu);
	1649	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1650
	1651	hvf_vcpu_destroy(cpu);
	1652	cpu->created = false;
	1653	qemu_cond_signal(&qemu_cpu_cond);
	1654	qemu_mutex_unlock_iothread();
	1655	rcu_unregister_thread();
	1656	return NULL;
	1657	}
	1658
	1659	static void qemu_whpx_cpu_thread_fn(void arg)
	1660	{
	1661	CPUState *cpu = arg;
	1662	int r;
	1663
	1664	rcu_register_thread();
	1665
	1666	qemu_mutex_lock_iothread();
	1667	qemu_thread_get_self(cpu->thread);
	1668	cpu->thread_id = qemu_get_thread_id();
	1669	current_cpu = cpu;
	1670
	1671	r = whpx_init_vcpu(cpu);
	1672	if (r < 0) {
	1673	fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
	1674	exit(1);
	1675	}
	1676
	1677	/* signal CPU creation */
	1678	cpu->created = true;
	1679	qemu_cond_signal(&qemu_cpu_cond);
	1680	qemu_guest_random_seed_thread_part2(cpu->random_seed);
	1681
	1682	do {
	1683	if (cpu_can_run(cpu)) {
	1684	r = whpx_vcpu_exec(cpu);
	1685	if (r == EXCP_DEBUG) {
	1686	cpu_handle_guest_debug(cpu);
	1687	}
	1688	}
	1689	while (cpu_thread_is_idle(cpu)) {
	1690	qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
	1691	}
	1692	qemu_wait_io_event_common(cpu);
	1693	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1694
	1695	whpx_destroy_vcpu(cpu);
	1696	cpu->created = false;
	1697	qemu_cond_signal(&qemu_cpu_cond);
	1698	qemu_mutex_unlock_iothread();
	1699	rcu_unregister_thread();
	1700	return NULL;
	1701	}
	1702
	1703	#ifdef _WIN32
	1704	static void CALLBACK dummy_apc_func(ULONG_PTR unused)
	1705	{
	1706	}
	1707	#endif
	1708
	1709	/* Multi-threaded TCG
	1710	*
	1711	* In the multi-threaded case each vCPU has its own thread. The TLS
	1712	* variable current_cpu can be used deep in the code to find the
	1713	* current CPUState for a given thread.
	1714	*/
	1715
	1716	static void qemu_tcg_cpu_thread_fn(void arg)
	1717	{
	1718	CPUState *cpu = arg;
	1719
	1720	assert(tcg_enabled());
	1721	g_assert(!use_icount);
	1722
	1723	rcu_register_thread();
	1724	tcg_register_thread();
	1725
	1726	qemu_mutex_lock_iothread();
	1727	qemu_thread_get_self(cpu->thread);
	1728
	1729	cpu->thread_id = qemu_get_thread_id();
	1730	cpu->created = true;
	1731	cpu->can_do_io = 1;
	1732	current_cpu = cpu;
	1733	qemu_cond_signal(&qemu_cpu_cond);
	1734	qemu_guest_random_seed_thread_part2(cpu->random_seed);
	1735
	1736	/* process any pending work */
	1737	cpu->exit_request = 1;
	1738
	1739	do {
	1740	if (cpu_can_run(cpu)) {
	1741	int r;
	1742	qemu_mutex_unlock_iothread();
	1743	r = tcg_cpu_exec(cpu);
	1744	qemu_mutex_lock_iothread();
	1745	switch (r) {
	1746	case EXCP_DEBUG:
	1747	cpu_handle_guest_debug(cpu);
	1748	break;
	1749	case EXCP_HALTED:
	1750	/* during start-up the vCPU is reset and the thread is
	1751	* kicked several times. If we don't ensure we go back
	1752	* to sleep in the halted state we won't cleanly
	1753	* start-up when the vCPU is enabled.
	1754	*
	1755	* cpu->halted should ensure we sleep in wait_io_event
	1756	*/
	1757	g_assert(cpu->halted);
	1758	break;
	1759	case EXCP_ATOMIC:
	1760	qemu_mutex_unlock_iothread();
	1761	cpu_exec_step_atomic(cpu);
	1762	qemu_mutex_lock_iothread();
	1763	default:
	1764	/* Ignore everything else? */
	1765	break;
	1766	}
	1767	}
	1768
	1769	atomic_mb_set(&cpu->exit_request, 0);
	1770	qemu_wait_io_event(cpu);
	1771	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1772
	1773	qemu_tcg_destroy_vcpu(cpu);
	1774	cpu->created = false;
	1775	qemu_cond_signal(&qemu_cpu_cond);
	1776	qemu_mutex_unlock_iothread();
	1777	rcu_unregister_thread();
	1778	return NULL;
	1779	}
	1780
	1781	static void qemu_cpu_kick_thread(CPUState *cpu)
	1782	{
	1783	#ifndef _WIN32
	1784	int err;
	1785
	1786	if (cpu->thread_kicked) {
	1787	return;
	1788	}
	1789	cpu->thread_kicked = true;
	1790	err = pthread_kill(cpu->thread->thread, SIG_IPI);
	1791	if (err && err != ESRCH) {
	1792	fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
	1793	exit(1);
	1794	}
	1795	#else /* _WIN32 */
	1796	if (!qemu_cpu_is_self(cpu)) {
	1797	if (whpx_enabled()) {
	1798	whpx_vcpu_kick(cpu);
	1799	} else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
	1800	fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
	1801	__func__, GetLastError());
	1802	exit(1);
	1803	}
	1804	}
	1805	#endif
	1806	}
	1807
	1808	void qemu_cpu_kick(CPUState *cpu)
	1809	{
	1810	qemu_cond_broadcast(cpu->halt_cond);
	1811	if (tcg_enabled()) {
	1812	cpu_exit(cpu);
	1813	/* NOP unless doing single-thread RR */
	1814	qemu_cpu_kick_rr_cpu();
	1815	} else {
	1816	if (hax_enabled()) {
	1817	/*
	1818	* FIXME: race condition with the exit_request check in
	1819	* hax_vcpu_hax_exec
	1820	*/
	1821	cpu->exit_request = 1;
	1822	}
	1823	qemu_cpu_kick_thread(cpu);
	1824	}
	1825	}
	1826
	1827	void qemu_cpu_kick_self(void)
	1828	{
	1829	assert(current_cpu);
	1830	qemu_cpu_kick_thread(current_cpu);
	1831	}
	1832
	1833	bool qemu_cpu_is_self(CPUState *cpu)
	1834	{
	1835	return qemu_thread_is_self(cpu->thread);
	1836	}
	1837
	1838	bool qemu_in_vcpu_thread(void)
	1839	{
	1840	return current_cpu && qemu_cpu_is_self(current_cpu);
	1841	}
	1842
	1843	static __thread bool iothread_locked = false;
	1844
	1845	bool qemu_mutex_iothread_locked(void)
	1846	{
	1847	return iothread_locked;
	1848	}
	1849
	1850	/*
	1851	* The BQL is taken from so many places that it is worth profiling the
	1852	* callers directly, instead of funneling them all through a single function.
	1853	*/
	1854	void qemu_mutex_lock_iothread_impl(const char *file, int line)
	1855	{
	1856	QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
	1857
	1858	g_assert(!qemu_mutex_iothread_locked());
	1859	bql_lock(&qemu_global_mutex, file, line);
	1860	iothread_locked = true;
	1861	}
	1862
	1863	void qemu_mutex_unlock_iothread(void)
	1864	{
	1865	g_assert(qemu_mutex_iothread_locked());
	1866	iothread_locked = false;
	1867	qemu_mutex_unlock(&qemu_global_mutex);
	1868	}
	1869
	1870	static bool all_vcpus_paused(void)
	1871	{
	1872	CPUState *cpu;
	1873
	1874	CPU_FOREACH(cpu) {
	1875	if (!cpu->stopped) {
	1876	return false;
	1877	}
	1878	}
	1879
	1880	return true;
	1881	}
	1882
	1883	void pause_all_vcpus(void)
	1884	{
	1885	CPUState *cpu;
	1886
	1887	qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
	1888	CPU_FOREACH(cpu) {
	1889	if (qemu_cpu_is_self(cpu)) {
	1890	qemu_cpu_stop(cpu, true);
	1891	} else {
	1892	cpu->stop = true;
	1893	qemu_cpu_kick(cpu);
	1894	}
	1895	}
	1896
	1897	/* We need to drop the replay_lock so any vCPU threads woken up
	1898	* can finish their replay tasks
	1899	*/
	1900	replay_mutex_unlock();
	1901
	1902	while (!all_vcpus_paused()) {
	1903	qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
	1904	CPU_FOREACH(cpu) {
	1905	qemu_cpu_kick(cpu);
	1906	}
	1907	}
	1908
	1909	qemu_mutex_unlock_iothread();
	1910	replay_mutex_lock();
	1911	qemu_mutex_lock_iothread();
	1912	}
	1913
	1914	void cpu_resume(CPUState *cpu)
	1915	{
	1916	cpu->stop = false;
	1917	cpu->stopped = false;
	1918	qemu_cpu_kick(cpu);
	1919	}
	1920
	1921	void resume_all_vcpus(void)
	1922	{
	1923	CPUState *cpu;
	1924
	1925	qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
	1926	CPU_FOREACH(cpu) {
	1927	cpu_resume(cpu);
	1928	}
	1929	}
	1930
	1931	void cpu_remove_sync(CPUState *cpu)
	1932	{
	1933	cpu->stop = true;
	1934	cpu->unplug = true;
	1935	qemu_cpu_kick(cpu);
	1936	qemu_mutex_unlock_iothread();
	1937	qemu_thread_join(cpu->thread);
	1938	qemu_mutex_lock_iothread();
	1939	}
	1940
	1941	/* For temporary buffers for forming a name */
	1942	#define VCPU_THREAD_NAME_SIZE 16
	1943
	1944	static void qemu_tcg_init_vcpu(CPUState *cpu)
	1945	{
	1946	char thread_name[VCPU_THREAD_NAME_SIZE];
	1947	static QemuCond *single_tcg_halt_cond;
	1948	static QemuThread *single_tcg_cpu_thread;
	1949	static int tcg_region_inited;
	1950
	1951	assert(tcg_enabled());
	1952	/*
	1953	* Initialize TCG regions--once. Now is a good time, because:
	1954	* (1) TCG's init context, prologue and target globals have been set up.
	1955	* (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
	1956	* -accel flag is processed, so the check doesn't work then).
	1957	*/
	1958	if (!tcg_region_inited) {
	1959	tcg_region_inited = 1;
	1960	tcg_region_init();
	1961	}
	1962
	1963	if (qemu_tcg_mttcg_enabled() \|\| !single_tcg_cpu_thread) {
	1964	cpu->thread = g_malloc0(sizeof(QemuThread));
	1965	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	1966	qemu_cond_init(cpu->halt_cond);
	1967
	1968	if (qemu_tcg_mttcg_enabled()) {
	1969	/* create a thread per vCPU with TCG (MTTCG) */
	1970	parallel_cpus = true;
	1971	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
	1972	cpu->cpu_index);
	1973
	1974	qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
	1975	cpu, QEMU_THREAD_JOINABLE);
	1976
	1977	} else {
	1978	/* share a single thread for all cpus with TCG */
	1979	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
	1980	qemu_thread_create(cpu->thread, thread_name,
	1981	qemu_tcg_rr_cpu_thread_fn,
	1982	cpu, QEMU_THREAD_JOINABLE);
	1983
	1984	single_tcg_halt_cond = cpu->halt_cond;
	1985	single_tcg_cpu_thread = cpu->thread;
	1986	}
	1987	#ifdef _WIN32
	1988	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	1989	#endif
	1990	} else {
	1991	/* For non-MTTCG cases we share the thread */
	1992	cpu->thread = single_tcg_cpu_thread;
	1993	cpu->halt_cond = single_tcg_halt_cond;
	1994	cpu->thread_id = first_cpu->thread_id;
	1995	cpu->can_do_io = 1;
	1996	cpu->created = true;
	1997	}
	1998	}
	1999
	2000	static void qemu_hax_start_vcpu(CPUState *cpu)
	2001	{
	2002	char thread_name[VCPU_THREAD_NAME_SIZE];
	2003
	2004	cpu->thread = g_malloc0(sizeof(QemuThread));
	2005	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2006	qemu_cond_init(cpu->halt_cond);
	2007
	2008	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
	2009	cpu->cpu_index);
	2010	qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
	2011	cpu, QEMU_THREAD_JOINABLE);
	2012	#ifdef _WIN32
	2013	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	2014	#endif
	2015	}
	2016
	2017	static void qemu_kvm_start_vcpu(CPUState *cpu)
	2018	{
	2019	char thread_name[VCPU_THREAD_NAME_SIZE];
	2020
	2021	cpu->thread = g_malloc0(sizeof(QemuThread));
	2022	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2023	qemu_cond_init(cpu->halt_cond);
	2024	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
	2025	cpu->cpu_index);
	2026	qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
	2027	cpu, QEMU_THREAD_JOINABLE);
	2028	}
	2029
	2030	static void qemu_hvf_start_vcpu(CPUState *cpu)
	2031	{
	2032	char thread_name[VCPU_THREAD_NAME_SIZE];
	2033
	2034	/* HVF currently does not support TCG, and only runs in
	2035	* unrestricted-guest mode. */
	2036	assert(hvf_enabled());
	2037
	2038	cpu->thread = g_malloc0(sizeof(QemuThread));
	2039	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2040	qemu_cond_init(cpu->halt_cond);
	2041
	2042	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
	2043	cpu->cpu_index);
	2044	qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
	2045	cpu, QEMU_THREAD_JOINABLE);
	2046	}
	2047
	2048	static void qemu_whpx_start_vcpu(CPUState *cpu)
	2049	{
	2050	char thread_name[VCPU_THREAD_NAME_SIZE];
	2051
	2052	cpu->thread = g_malloc0(sizeof(QemuThread));
	2053	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2054	qemu_cond_init(cpu->halt_cond);
	2055	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
	2056	cpu->cpu_index);
	2057	qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
	2058	cpu, QEMU_THREAD_JOINABLE);
	2059	#ifdef _WIN32
	2060	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	2061	#endif
	2062	}
	2063
	2064	static void qemu_dummy_start_vcpu(CPUState *cpu)
	2065	{
	2066	char thread_name[VCPU_THREAD_NAME_SIZE];
	2067
	2068	cpu->thread = g_malloc0(sizeof(QemuThread));
	2069	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2070	qemu_cond_init(cpu->halt_cond);
	2071	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
	2072	cpu->cpu_index);
	2073	qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
	2074	QEMU_THREAD_JOINABLE);
	2075	}
	2076
	2077	void qemu_init_vcpu(CPUState *cpu)
	2078	{
	2079	cpu->nr_cores = smp_cores;
	2080	cpu->nr_threads = smp_threads;
	2081	cpu->stopped = true;
	2082	cpu->random_seed = qemu_guest_random_seed_thread_part1();
	2083
	2084	if (!cpu->as) {
	2085	/* If the target cpu hasn't set up any address spaces itself,
	2086	* give it the default one.
	2087	*/
	2088	cpu->num_ases = 1;
	2089	cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
	2090	}
	2091
	2092	if (kvm_enabled()) {
	2093	qemu_kvm_start_vcpu(cpu);
	2094	} else if (hax_enabled()) {
	2095	qemu_hax_start_vcpu(cpu);
	2096	} else if (hvf_enabled()) {
	2097	qemu_hvf_start_vcpu(cpu);
	2098	} else if (tcg_enabled()) {
	2099	qemu_tcg_init_vcpu(cpu);
	2100	} else if (whpx_enabled()) {
	2101	qemu_whpx_start_vcpu(cpu);
	2102	} else {
	2103	qemu_dummy_start_vcpu(cpu);
	2104	}
	2105
	2106	while (!cpu->created) {
	2107	qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
	2108	}
	2109	}
	2110
	2111	void cpu_stop_current(void)
	2112	{
	2113	if (current_cpu) {
	2114	current_cpu->stop = true;
	2115	cpu_exit(current_cpu);
	2116	}
	2117	}
	2118
	2119	int vm_stop(RunState state)
	2120	{
	2121	if (qemu_in_vcpu_thread()) {
	2122	qemu_system_vmstop_request_prepare();
	2123	qemu_system_vmstop_request(state);
	2124	/*
	2125	* FIXME: should not return to device code in case
	2126	* vm_stop() has been requested.
	2127	*/
	2128	cpu_stop_current();
	2129	return 0;
	2130	}
	2131
	2132	return do_vm_stop(state, true);
	2133	}
	2134
	2135	/**
	2136	* Prepare for (re)starting the VM.
	2137	* Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
	2138	* running or in case of an error condition), 0 otherwise.
	2139	*/
	2140	int vm_prepare_start(void)
	2141	{
	2142	RunState requested;
	2143
	2144	qemu_vmstop_requested(&requested);
	2145	if (runstate_is_running() && requested == RUN_STATE__MAX) {
	2146	return -1;
	2147	}
	2148
	2149	/* Ensure that a STOP/RESUME pair of events is emitted if a
	2150	* vmstop request was pending. The BLOCK_IO_ERROR event, for
	2151	* example, according to documentation is always followed by
	2152	* the STOP event.
	2153	*/
	2154	if (runstate_is_running()) {
	2155	qapi_event_send_stop();
	2156	qapi_event_send_resume();
	2157	return -1;
	2158	}
	2159
	2160	/* We are sending this now, but the CPUs will be resumed shortly later */
	2161	qapi_event_send_resume();
	2162
	2163	replay_enable_events();
	2164	cpu_enable_ticks();
	2165	runstate_set(RUN_STATE_RUNNING);
	2166	vm_state_notify(1, RUN_STATE_RUNNING);
	2167	return 0;
	2168	}
	2169
	2170	void vm_start(void)
	2171	{
	2172	if (!vm_prepare_start()) {
	2173	resume_all_vcpus();
	2174	}
	2175	}
	2176
	2177	/* does a state transition even if the VM is already stopped,
	2178	current state is forgotten forever */
	2179	int vm_stop_force_state(RunState state)
	2180	{
	2181	if (runstate_is_running()) {
	2182	return vm_stop(state);
	2183	} else {
	2184	runstate_set(state);
	2185
	2186	bdrv_drain_all();
	2187	/* Make sure to return an error if the flush in a previous vm_stop()
	2188	* failed. */
	2189	return bdrv_flush_all();
	2190	}
	2191	}
	2192
	2193	void list_cpus(const char *optarg)
	2194	{
	2195	/* XXX: implement xxx_cpu_list for targets that still miss it */
	2196	#if defined(cpu_list)
	2197	cpu_list();
	2198	#endif
	2199	}
	2200
	2201	CpuInfoList qmp_query_cpus(Error *errp)
	2202	{
	2203	MachineState *ms = MACHINE(qdev_get_machine());
	2204	MachineClass *mc = MACHINE_GET_CLASS(ms);
	2205	CpuInfoList head = NULL, cur_item = NULL;
	2206	CPUState *cpu;
	2207
	2208	CPU_FOREACH(cpu) {
	2209	CpuInfoList *info;
	2210	#if defined(TARGET_I386)
	2211	X86CPU *x86_cpu = X86_CPU(cpu);
	2212	CPUX86State *env = &x86_cpu->env;
	2213	#elif defined(TARGET_PPC)
	2214	PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
	2215	CPUPPCState *env = &ppc_cpu->env;
	2216	#elif defined(TARGET_SPARC)
	2217	SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
	2218	CPUSPARCState *env = &sparc_cpu->env;
	2219	#elif defined(TARGET_RISCV)
	2220	RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
	2221	CPURISCVState *env = &riscv_cpu->env;
	2222	#elif defined(TARGET_MIPS)
	2223	MIPSCPU *mips_cpu = MIPS_CPU(cpu);
	2224	CPUMIPSState *env = &mips_cpu->env;
	2225	#elif defined(TARGET_TRICORE)
	2226	TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
	2227	CPUTriCoreState *env = &tricore_cpu->env;
	2228	#elif defined(TARGET_S390X)
	2229	S390CPU *s390_cpu = S390_CPU(cpu);
	2230	CPUS390XState *env = &s390_cpu->env;
	2231	#endif
	2232
	2233	cpu_synchronize_state(cpu);
	2234
	2235	info = g_malloc0(sizeof(*info));
	2236	info->value = g_malloc0(sizeof(*info->value));
	2237	info->value->CPU = cpu->cpu_index;
	2238	info->value->current = (cpu == first_cpu);
	2239	info->value->halted = cpu->halted;
	2240	info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
	2241	info->value->thread_id = cpu->thread_id;
	2242	#if defined(TARGET_I386)
	2243	info->value->arch = CPU_INFO_ARCH_X86;
	2244	info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
	2245	#elif defined(TARGET_PPC)
	2246	info->value->arch = CPU_INFO_ARCH_PPC;
	2247	info->value->u.ppc.nip = env->nip;
	2248	#elif defined(TARGET_SPARC)
	2249	info->value->arch = CPU_INFO_ARCH_SPARC;
	2250	info->value->u.q_sparc.pc = env->pc;
	2251	info->value->u.q_sparc.npc = env->npc;
	2252	#elif defined(TARGET_MIPS)
	2253	info->value->arch = CPU_INFO_ARCH_MIPS;
	2254	info->value->u.q_mips.PC = env->active_tc.PC;
	2255	#elif defined(TARGET_TRICORE)
	2256	info->value->arch = CPU_INFO_ARCH_TRICORE;
	2257	info->value->u.tricore.PC = env->PC;
	2258	#elif defined(TARGET_S390X)
	2259	info->value->arch = CPU_INFO_ARCH_S390;
	2260	info->value->u.s390.cpu_state = env->cpu_state;
	2261	#elif defined(TARGET_RISCV)
	2262	info->value->arch = CPU_INFO_ARCH_RISCV;
	2263	info->value->u.riscv.pc = env->pc;
	2264	#else
	2265	info->value->arch = CPU_INFO_ARCH_OTHER;
	2266	#endif
	2267	info->value->has_props = !!mc->cpu_index_to_instance_props;
	2268	if (info->value->has_props) {
	2269	CpuInstanceProperties *props;
	2270	props = g_malloc0(sizeof(*props));
	2271	*props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
	2272	info->value->props = props;
	2273	}
	2274
	2275	/* XXX: waiting for the qapi to support GSList */
	2276	if (!cur_item) {
	2277	head = cur_item = info;
	2278	} else {
	2279	cur_item->next = info;
	2280	cur_item = info;
	2281	}
	2282	}
	2283
	2284	return head;
	2285	}
	2286
	2287	static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
	2288	{
	2289	/*
	2290	* The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
	2291	* TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
	2292	*/
	2293	switch (target) {
	2294	case SYS_EMU_TARGET_I386:
	2295	case SYS_EMU_TARGET_X86_64:
	2296	return CPU_INFO_ARCH_X86;
	2297
	2298	case SYS_EMU_TARGET_PPC:
	2299	case SYS_EMU_TARGET_PPC64:
	2300	return CPU_INFO_ARCH_PPC;
	2301
	2302	case SYS_EMU_TARGET_SPARC:
	2303	case SYS_EMU_TARGET_SPARC64:
	2304	return CPU_INFO_ARCH_SPARC;
	2305
	2306	case SYS_EMU_TARGET_MIPS:
	2307	case SYS_EMU_TARGET_MIPSEL:
	2308	case SYS_EMU_TARGET_MIPS64:
	2309	case SYS_EMU_TARGET_MIPS64EL:
	2310	return CPU_INFO_ARCH_MIPS;
	2311
	2312	case SYS_EMU_TARGET_TRICORE:
	2313	return CPU_INFO_ARCH_TRICORE;
	2314
	2315	case SYS_EMU_TARGET_S390X:
	2316	return CPU_INFO_ARCH_S390;
	2317
	2318	case SYS_EMU_TARGET_RISCV32:
	2319	case SYS_EMU_TARGET_RISCV64:
	2320	return CPU_INFO_ARCH_RISCV;
	2321
	2322	default:
	2323	return CPU_INFO_ARCH_OTHER;
	2324	}
	2325	}
	2326
	2327	static void cpustate_to_cpuinfo_s390(CpuInfoS390 info, const CPUState cpu)
	2328	{
	2329	#ifdef TARGET_S390X
	2330	S390CPU *s390_cpu = S390_CPU(cpu);
	2331	CPUS390XState *env = &s390_cpu->env;
	2332
	2333	info->cpu_state = env->cpu_state;
	2334	#else
	2335	abort();
	2336	#endif
	2337	}
	2338
	2339	/*
	2340	* fast means: we NEVER interrupt vCPU threads to retrieve
	2341	* information from KVM.
	2342	*/
	2343	CpuInfoFastList qmp_query_cpus_fast(Error *errp)
	2344	{
	2345	MachineState *ms = MACHINE(qdev_get_machine());
	2346	MachineClass *mc = MACHINE_GET_CLASS(ms);
	2347	CpuInfoFastList head = NULL, cur_item = NULL;
	2348	SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
	2349	-1, &error_abort);
	2350	CPUState *cpu;
	2351
	2352	CPU_FOREACH(cpu) {
	2353	CpuInfoFastList info = g_malloc0(sizeof(info));
	2354	info->value = g_malloc0(sizeof(*info->value));
	2355
	2356	info->value->cpu_index = cpu->cpu_index;
	2357	info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
	2358	info->value->thread_id = cpu->thread_id;
	2359
	2360	info->value->has_props = !!mc->cpu_index_to_instance_props;
	2361	if (info->value->has_props) {
	2362	CpuInstanceProperties *props;
	2363	props = g_malloc0(sizeof(*props));
	2364	*props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
	2365	info->value->props = props;
	2366	}
	2367
	2368	info->value->arch = sysemu_target_to_cpuinfo_arch(target);
	2369	info->value->target = target;
	2370	if (target == SYS_EMU_TARGET_S390X) {
	2371	cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
	2372	}
	2373
	2374	if (!cur_item) {
	2375	head = cur_item = info;
	2376	} else {
	2377	cur_item->next = info;
	2378	cur_item = info;
	2379	}
	2380	}
	2381
	2382	return head;
	2383	}
	2384
	2385	void qmp_memsave(int64_t addr, int64_t size, const char *filename,
	2386	bool has_cpu, int64_t cpu_index, Error **errp)
	2387	{
	2388	FILE *f;
	2389	uint32_t l;
	2390	CPUState *cpu;
	2391	uint8_t buf[1024];
	2392	int64_t orig_addr = addr, orig_size = size;
	2393
	2394	if (!has_cpu) {
	2395	cpu_index = 0;
	2396	}
	2397
	2398	cpu = qemu_get_cpu(cpu_index);
	2399	if (cpu == NULL) {
	2400	error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
	2401	"a CPU number");
	2402	return;
	2403	}
	2404
	2405	f = fopen(filename, "wb");
	2406	if (!f) {
	2407	error_setg_file_open(errp, errno, filename);
	2408	return;
	2409	}
	2410
	2411	while (size != 0) {
	2412	l = sizeof(buf);
	2413	if (l > size)
	2414	l = size;
	2415	if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
	2416	error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
	2417	" specified", orig_addr, orig_size);
	2418	goto exit;
	2419	}
	2420	if (fwrite(buf, 1, l, f) != l) {
	2421	error_setg(errp, QERR_IO_ERROR);
	2422	goto exit;
	2423	}
	2424	addr += l;
	2425	size -= l;
	2426	}
	2427
	2428	exit:
	2429	fclose(f);
	2430	}
	2431
	2432	void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
	2433	Error **errp)
	2434	{
	2435	FILE *f;
	2436	uint32_t l;
	2437	uint8_t buf[1024];
	2438
	2439	f = fopen(filename, "wb");
	2440	if (!f) {
	2441	error_setg_file_open(errp, errno, filename);
	2442	return;
	2443	}
	2444
	2445	while (size != 0) {
	2446	l = sizeof(buf);
	2447	if (l > size)
	2448	l = size;
	2449	cpu_physical_memory_read(addr, buf, l);
	2450	if (fwrite(buf, 1, l, f) != l) {
	2451	error_setg(errp, QERR_IO_ERROR);
	2452	goto exit;
	2453	}
	2454	addr += l;
	2455	size -= l;
	2456	}
	2457
	2458	exit:
	2459	fclose(f);
	2460	}
	2461
	2462	void qmp_inject_nmi(Error **errp)
	2463	{
	2464	nmi_monitor_handle(monitor_get_cpu_index(), errp);
	2465	}
	2466
	2467	void dump_drift_info(void)
	2468	{
	2469	if (!use_icount) {
	2470	return;
	2471	}
	2472
	2473	qemu_printf("Host - Guest clock %"PRIi64" ms\n",
	2474	(cpu_get_clock() - cpu_get_icount())/SCALE_MS);
	2475	if (icount_align_option) {
	2476	qemu_printf("Max guest delay %"PRIi64" ms\n",
	2477	-max_delay / SCALE_MS);
	2478	qemu_printf("Max guest advance %"PRIi64" ms\n",
	2479	max_advance / SCALE_MS);
	2480	} else {
	2481	qemu_printf("Max guest delay NA\n");
	2482	qemu_printf("Max guest advance NA\n");
	2483	}
	2484	}