Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* QEMU System Emulator
	3	*
	4	* Copyright (c) 2003-2008 Fabrice Bellard
	5	*
	6	* Permission is hereby granted, free of charge, to any person obtaining a copy
	7	* of this software and associated documentation files (the "Software"), to deal
	8	* in the Software without restriction, including without limitation the rights
	9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	10	* copies of the Software, and to permit persons to whom the Software is
	11	* furnished to do so, subject to the following conditions:
	12	*
	13	* The above copyright notice and this permission notice shall be included in
	14	* all copies or substantial portions of the Software.
	15	*
	16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	22	* THE SOFTWARE.
	23	*/
	24
	25	#include "qemu/osdep.h"
	26	#include "qemu/config-file.h"
	27	#include "cpu.h"
	28	#include "monitor/monitor.h"
	29	#include "qapi/error.h"
	30	#include "qapi/qapi-commands-misc.h"
	31	#include "qapi/qapi-events-run-state.h"
	32	#include "qapi/qmp/qerror.h"
	33	#include "qemu/error-report.h"
	34	#include "qemu/qemu-print.h"
	35	#include "sysemu/sysemu.h"
	36	#include "sysemu/block-backend.h"
	37	#include "exec/gdbstub.h"
	38	#include "sysemu/dma.h"
	39	#include "sysemu/hw_accel.h"
	40	#include "sysemu/kvm.h"
	41	#include "sysemu/hax.h"
	42	#include "sysemu/hvf.h"
	43	#include "sysemu/whpx.h"
	44	#include "exec/exec-all.h"
	45
	46	#include "qemu/thread.h"
	47	#include "sysemu/cpus.h"
	48	#include "sysemu/qtest.h"
	49	#include "qemu/main-loop.h"
	50	#include "qemu/option.h"
	51	#include "qemu/bitmap.h"
	52	#include "qemu/seqlock.h"
	53	#include "tcg.h"
	54	#include "hw/nmi.h"
	55	#include "sysemu/replay.h"
	56	#include "hw/boards.h"
	57
	#ifdef CONFIG_LINUX

	#include <sys/prctl.h>

	/*
	 * Fallback definitions for the PR_MCE_* prctl constants, used only when
	 * <sys/prctl.h> does not already define them.
	 */
	#if !defined(PR_MCE_KILL)
	#define PR_MCE_KILL 33
	#endif

	#if !defined(PR_MCE_KILL_SET)
	#define PR_MCE_KILL_SET 1
	#endif

	#if !defined(PR_MCE_KILL_EARLY)
	#define PR_MCE_KILL_EARLY 1
	#endif

	#endif /* CONFIG_LINUX */
	75
	/*
	 * Externally visible 64-bit globals; zero at program start.
	 * NOTE(review): they are not referenced in the visible part of this
	 * file - their meaning should be confirmed against their users.
	 */
	int64_t max_delay, max_advance;
	78
	79	/* vcpu throttling controls */
	80	static QEMUTimer *throttle_timer;
	81	static unsigned int throttle_percentage;
	82
	83	#define CPU_THROTTLE_PCT_MIN 1
	84	#define CPU_THROTTLE_PCT_MAX 99
	85	#define CPU_THROTTLE_TIMESLICE_NS 10000000
	86
	87	bool cpu_is_stopped(CPUState *cpu)
	88	{
	89	return cpu->stopped \|\| !runstate_is_running();
	90	}
	91
	92	static bool cpu_thread_is_idle(CPUState *cpu)
	93	{
	94	if (cpu->stop \|\| cpu->queued_work_first) {
	95	return false;
	96	}
	97	if (cpu_is_stopped(cpu)) {
	98	return true;
	99	}
	100	if (!cpu->halted \|\| cpu_has_work(cpu) \|\|
	101	kvm_halt_in_kernel()) {
	102	return false;
	103	}
	104	return true;
	105	}
	106
	107	static bool all_cpu_threads_idle(void)
	108	{
	109	CPUState *cpu;
	110
	111	CPU_FOREACH(cpu) {
	112	if (!cpu_thread_is_idle(cpu)) {
	113	return false;
	114	}
	115	}
	116	return true;
	117	}
	118
	119	/***********************************************************/
	120	/* guest cycle counter */
	121
	122	/* Protected by TimersState seqlock */
	123
	/* Set from the "sleep" icount option in configure_icount(); on by default. */
	static bool icount_sleep = true;

	/*
	 * Upper bound for icount_time_shift.
	 * Arbitrarily pick 1MIPS as the minimum allowable speed.
	 */
	#define MAX_ICOUNT_SHIFT 10
	127
	128	typedef struct TimersState {
	129	/* Protected by BQL. */
	130	int64_t cpu_ticks_prev;
	131	int64_t cpu_ticks_offset;
	132
	133	/* Protect fields that can be respectively read outside the
	134	* BQL, and written from multiple threads.
	135	*/
	136	QemuSeqLock vm_clock_seqlock;
	137	QemuSpin vm_clock_lock;
	138
	139	int16_t cpu_ticks_enabled;
	140
	141	/* Conversion factor from emulated instructions to virtual clock ticks. */
	142	int16_t icount_time_shift;
	143
	144	/* Compensate for varying guest execution speed. */
	145	int64_t qemu_icount_bias;
	146
	147	int64_t vm_clock_warp_start;
	148	int64_t cpu_clock_offset;
	149
	150	/* Only written by TCG thread */
	151	int64_t qemu_icount;
	152
	153	/* for adjusting icount */
	154	QEMUTimer *icount_rt_timer;
	155	QEMUTimer *icount_vm_timer;
	156	QEMUTimer *icount_warp_timer;
	157	} TimersState;
	158
	159	static TimersState timers_state;
	160	bool mttcg_enabled;
	161
	162	/*
	163	* We default to false if we know other options have been enabled
	164	* which are currently incompatible with MTTCG. Otherwise when each
	165	* guest (target) has been updated to support:
	166	* - atomic instructions
	167	* - memory ordering primitives (barriers)
	168	* they can set the appropriate CONFIG flags in ${target}-softmmu.mak
	169	*
	170	* Once a guest architecture has been converted to the new primitives
	171	* there are two remaining limitations to check.
	172	*
	173	* - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
	174	* - The host must have a stronger memory order than the guest
	175	*
	176	* It may be possible in future to support strong guests on weak hosts
	177	* but that will require tagging all load/stores in a guest with their
	178	* implicit memory order requirements which would likely slow things
	179	* down a lot.
	180	*/
	181
	/*
	 * Return true when every memory-ordering bit in TCG_GUEST_DEFAULT_MO is
	 * also set in TCG_TARGET_DEFAULT_MO.  If either macro is undefined the
	 * answer is false.
	 */
	static bool check_tcg_memory_orders_compatible(void)
	{
	    bool compatible = false;

	#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
	    compatible = (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
	#endif
	    return compatible;
	}
	190
	191	static bool default_mttcg_enabled(void)
	192	{
	193	if (use_icount \|\| TCG_OVERSIZED_GUEST) {
	194	return false;
	195	} else {
	196	#ifdef TARGET_SUPPORTS_MTTCG
	197	return check_tcg_memory_orders_compatible();
	198	#else
	199	return false;
	200	#endif
	201	}
	202	}
	203
	204	void qemu_tcg_configure(QemuOpts opts, Error *errp)
	205	{
	206	const char *t = qemu_opt_get(opts, "thread");
	207	if (t) {
	208	if (strcmp(t, "multi") == 0) {
	209	if (TCG_OVERSIZED_GUEST) {
	210	error_setg(errp, "No MTTCG when guest word size > hosts");
	211	} else if (use_icount) {
	212	error_setg(errp, "No MTTCG when icount is enabled");
	213	} else {
	214	#ifndef TARGET_SUPPORTS_MTTCG
	215	warn_report("Guest not yet converted to MTTCG - "
	216	"you may get unexpected results");
	217	#endif
	218	if (!check_tcg_memory_orders_compatible()) {
	219	warn_report("Guest expects a stronger memory ordering "
	220	"than the host provides");
	221	error_printf("This may cause strange/hard to debug errors\n");
	222	}
	223	mttcg_enabled = true;
	224	}
	225	} else if (strcmp(t, "single") == 0) {
	226	mttcg_enabled = false;
	227	} else {
	228	error_setg(errp, "Invalid 'thread' setting %s", t);
	229	}
	230	} else {
	231	mttcg_enabled = default_mttcg_enabled();
	232	}
	233	}
	234
	235	/* The current number of executed instructions is based on what we
	236	* originally budgeted minus the current state of the decrementing
	237	* icount counters in extra/u16.low.
	238	*/
	239	static int64_t cpu_get_icount_executed(CPUState *cpu)
	240	{
	241	return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
	242	}
	243
	244	/*
	245	* Update the global shared timer_state.qemu_icount to take into
	246	* account executed instructions. This is done by the TCG vCPU
	247	* thread so the main-loop can see time has moved forward.
	248	*/
	249	static void cpu_update_icount_locked(CPUState *cpu)
	250	{
	251	int64_t executed = cpu_get_icount_executed(cpu);
	252	cpu->icount_budget -= executed;
	253
	254	atomic_set_i64(&timers_state.qemu_icount,
	255	timers_state.qemu_icount + executed);
	256	}
	257
	258	/*
	259	* Update the global shared timer_state.qemu_icount to take into
	260	* account executed instructions. This is done by the TCG vCPU
	261	* thread so the main-loop can see time has moved forward.
	262	*/
	263	void cpu_update_icount(CPUState *cpu)
	264	{
	265	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	266	&timers_state.vm_clock_lock);
	267	cpu_update_icount_locked(cpu);
	268	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	269	&timers_state.vm_clock_lock);
	270	}
	271
	272	static int64_t cpu_get_icount_raw_locked(void)
	273	{
	274	CPUState *cpu = current_cpu;
	275
	276	if (cpu && cpu->running) {
	277	if (!cpu->can_do_io) {
	278	error_report("Bad icount read");
	279	exit(1);
	280	}
	281	/* Take into account what has run */
	282	cpu_update_icount_locked(cpu);
	283	}
	284	/* The read is protected by the seqlock, but needs atomic64 to avoid UB */
	285	return atomic_read_i64(&timers_state.qemu_icount);
	286	}
	287
	288	static int64_t cpu_get_icount_locked(void)
	289	{
	290	int64_t icount = cpu_get_icount_raw_locked();
	291	return atomic_read_i64(&timers_state.qemu_icount_bias) +
	292	cpu_icount_to_ns(icount);
	293	}
	294
	295	int64_t cpu_get_icount_raw(void)
	296	{
	297	int64_t icount;
	298	unsigned start;
	299
	300	do {
	301	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	302	icount = cpu_get_icount_raw_locked();
	303	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	304
	305	return icount;
	306	}
	307
	308	/* Return the virtual CPU time, based on the instruction counter. */
	309	int64_t cpu_get_icount(void)
	310	{
	311	int64_t icount;
	312	unsigned start;
	313
	314	do {
	315	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	316	icount = cpu_get_icount_locked();
	317	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	318
	319	return icount;
	320	}
	321
	322	int64_t cpu_icount_to_ns(int64_t icount)
	323	{
	324	return icount << atomic_read(&timers_state.icount_time_shift);
	325	}
	326
	327	static int64_t cpu_get_ticks_locked(void)
	328	{
	329	int64_t ticks = timers_state.cpu_ticks_offset;
	330	if (timers_state.cpu_ticks_enabled) {
	331	ticks += cpu_get_host_ticks();
	332	}
	333
	334	if (timers_state.cpu_ticks_prev > ticks) {
	335	/* Non increasing ticks may happen if the host uses software suspend. */
	336	timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
	337	ticks = timers_state.cpu_ticks_prev;
	338	}
	339
	340	timers_state.cpu_ticks_prev = ticks;
	341	return ticks;
	342	}
	343
	344	/* return the time elapsed in VM between vm_start and vm_stop. Unless
	345	* icount is active, cpu_get_ticks() uses units of the host CPU cycle
	346	* counter.
	347	*/
	348	int64_t cpu_get_ticks(void)
	349	{
	350	int64_t ticks;
	351
	352	if (use_icount) {
	353	return cpu_get_icount();
	354	}
	355
	356	qemu_spin_lock(&timers_state.vm_clock_lock);
	357	ticks = cpu_get_ticks_locked();
	358	qemu_spin_unlock(&timers_state.vm_clock_lock);
	359	return ticks;
	360	}
	361
	362	static int64_t cpu_get_clock_locked(void)
	363	{
	364	int64_t time;
	365
	366	time = timers_state.cpu_clock_offset;
	367	if (timers_state.cpu_ticks_enabled) {
	368	time += get_clock();
	369	}
	370
	371	return time;
	372	}
	373
	374	/* Return the monotonic time elapsed in VM, i.e.,
	375	* the time between vm_start and vm_stop
	376	*/
	377	int64_t cpu_get_clock(void)
	378	{
	379	int64_t ti;
	380	unsigned start;
	381
	382	do {
	383	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	384	ti = cpu_get_clock_locked();
	385	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	386
	387	return ti;
	388	}
	389
	390	/* enable cpu_get_ticks()
	391	* Caller must hold BQL which serves as mutex for vm_clock_seqlock.
	392	*/
	393	void cpu_enable_ticks(void)
	394	{
	395	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	396	&timers_state.vm_clock_lock);
	397	if (!timers_state.cpu_ticks_enabled) {
	398	timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
	399	timers_state.cpu_clock_offset -= get_clock();
	400	timers_state.cpu_ticks_enabled = 1;
	401	}
	402	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	403	&timers_state.vm_clock_lock);
	404	}
	405
	406	/* disable cpu_get_ticks() : the clock is stopped. You must not call
	407	* cpu_get_ticks() after that.
	408	* Caller must hold BQL which serves as mutex for vm_clock_seqlock.
	409	*/
	410	void cpu_disable_ticks(void)
	411	{
	412	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	413	&timers_state.vm_clock_lock);
	414	if (timers_state.cpu_ticks_enabled) {
	415	timers_state.cpu_ticks_offset += cpu_get_host_ticks();
	416	timers_state.cpu_clock_offset = cpu_get_clock_locked();
	417	timers_state.cpu_ticks_enabled = 0;
	418	}
	419	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	420	&timers_state.vm_clock_lock);
	421	}
	422
	/*
	 * Tolerance used by icount_adjust().  Correlation between real and
	 * virtual time is always going to be fairly approximate, so small
	 * variations are ignored.  When the guest is idle, real and virtual
	 * time will be aligned in the IO wait loop.
	 */
	#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
	428
	429	static void icount_adjust(void)
	430	{
	431	int64_t cur_time;
	432	int64_t cur_icount;
	433	int64_t delta;
	434
	435	/* Protected by TimersState mutex. */
	436	static int64_t last_delta;
	437
	438	/* If the VM is not running, then do nothing. */
	439	if (!runstate_is_running()) {
	440	return;
	441	}
	442
	443	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	444	&timers_state.vm_clock_lock);
	445	cur_time = cpu_get_clock_locked();
	446	cur_icount = cpu_get_icount_locked();
	447
	448	delta = cur_icount - cur_time;
	449	/* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
	450	if (delta > 0
	451	&& last_delta + ICOUNT_WOBBLE < delta * 2
	452	&& timers_state.icount_time_shift > 0) {
	453	/* The guest is getting too far ahead. Slow time down. */
	454	atomic_set(&timers_state.icount_time_shift,
	455	timers_state.icount_time_shift - 1);
	456	}
	457	if (delta < 0
	458	&& last_delta - ICOUNT_WOBBLE > delta * 2
	459	&& timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
	460	/* The guest is getting too far behind. Speed time up. */
	461	atomic_set(&timers_state.icount_time_shift,
	462	timers_state.icount_time_shift + 1);
	463	}
	464	last_delta = delta;
	465	atomic_set_i64(&timers_state.qemu_icount_bias,
	466	cur_icount - (timers_state.qemu_icount
	467	<< timers_state.icount_time_shift));
	468	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	469	&timers_state.vm_clock_lock);
	470	}
	471
	472	static void icount_adjust_rt(void *opaque)
	473	{
	474	timer_mod(timers_state.icount_rt_timer,
	475	qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
	476	icount_adjust();
	477	}
	478
	479	static void icount_adjust_vm(void *opaque)
	480	{
	481	timer_mod(timers_state.icount_vm_timer,
	482	qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
	483	NANOSECONDS_PER_SECOND / 10);
	484	icount_adjust();
	485	}
	486
	487	static int64_t qemu_icount_round(int64_t count)
	488	{
	489	int shift = atomic_read(&timers_state.icount_time_shift);
	490	return (count + (1 << shift) - 1) >> shift;
	491	}
	492
	493	static void icount_warp_rt(void)
	494	{
	495	unsigned seq;
	496	int64_t warp_start;
	497
	498	/* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
	499	* changes from -1 to another value, so the race here is okay.
	500	*/
	501	do {
	502	seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	503	warp_start = timers_state.vm_clock_warp_start;
	504	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
	505
	506	if (warp_start == -1) {
	507	return;
	508	}
	509
	510	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	511	&timers_state.vm_clock_lock);
	512	if (runstate_is_running()) {
	513	int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
	514	cpu_get_clock_locked());
	515	int64_t warp_delta;
	516
	517	warp_delta = clock - timers_state.vm_clock_warp_start;
	518	if (use_icount == 2) {
	519	/*
	520	* In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
	521	* far ahead of real time.
	522	*/
	523	int64_t cur_icount = cpu_get_icount_locked();
	524	int64_t delta = clock - cur_icount;
	525	warp_delta = MIN(warp_delta, delta);
	526	}
	527	atomic_set_i64(&timers_state.qemu_icount_bias,
	528	timers_state.qemu_icount_bias + warp_delta);
	529	}
	530	timers_state.vm_clock_warp_start = -1;
	531	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	532	&timers_state.vm_clock_lock);
	533
	534	if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
	535	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	536	}
	537	}
	538
	/*
	 * Callback of icount_warp_timer; simply accounts the warp.  @opaque is
	 * unused.
	 */
	static void icount_timer_cb(void *opaque)
	{
	    /*
	     * No need for a checkpoint because the timer already synchronizes
	     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
	     */
	    icount_warp_rt();
	}
	546
	547	void qtest_clock_warp(int64_t dest)
	548	{
	549	int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
	550	AioContext *aio_context;
	551	assert(qtest_enabled());
	552	aio_context = qemu_get_aio_context();
	553	while (clock < dest) {
	554	int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	555	int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
	556
	557	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	558	&timers_state.vm_clock_lock);
	559	atomic_set_i64(&timers_state.qemu_icount_bias,
	560	timers_state.qemu_icount_bias + warp);
	561	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	562	&timers_state.vm_clock_lock);
	563
	564	qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
	565	timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
	566	clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
	567	}
	568	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	569	}
	570
	571	void qemu_start_warp_timer(void)
	572	{
	573	int64_t clock;
	574	int64_t deadline;
	575
	576	if (!use_icount) {
	577	return;
	578	}
	579
	580	/* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
	581	* do not fire, so computing the deadline does not make sense.
	582	*/
	583	if (!runstate_is_running()) {
	584	return;
	585	}
	586
	587	if (replay_mode != REPLAY_MODE_PLAY) {
	588	if (!all_cpu_threads_idle()) {
	589	return;
	590	}
	591
	592	if (qtest_enabled()) {
	593	/* When testing, qtest commands advance icount. */
	594	return;
	595	}
	596
	597	replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
	598	} else {
	599	/* warp clock deterministically in record/replay mode */
	600	if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
	601	/* vCPU is sleeping and warp can't be started.
	602	It is probably a race condition: notification sent
	603	to vCPU was processed in advance and vCPU went to sleep.
	604	Therefore we have to wake it up for doing someting. */
	605	if (replay_has_checkpoint()) {
	606	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	607	}
	608	return;
	609	}
	610	}
	611
	612	/* We want to use the earliest deadline from ALL vm_clocks */
	613	clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
	614	deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	615	if (deadline < 0) {
	616	static bool notified;
	617	if (!icount_sleep && !notified) {
	618	warn_report("icount sleep disabled and no active timers");
	619	notified = true;
	620	}
	621	return;
	622	}
	623
	624	if (deadline > 0) {
	625	/*
	626	* Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
	627	* sleep. Otherwise, the CPU might be waiting for a future timer
	628	* interrupt to wake it up, but the interrupt never comes because
	629	* the vCPU isn't running any insns and thus doesn't advance the
	630	* QEMU_CLOCK_VIRTUAL.
	631	*/
	632	if (!icount_sleep) {
	633	/*
	634	* We never let VCPUs sleep in no sleep icount mode.
	635	* If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
	636	* to the next QEMU_CLOCK_VIRTUAL event and notify it.
	637	* It is useful when we want a deterministic execution time,
	638	* isolated from host latencies.
	639	*/
	640	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	641	&timers_state.vm_clock_lock);
	642	atomic_set_i64(&timers_state.qemu_icount_bias,
	643	timers_state.qemu_icount_bias + deadline);
	644	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	645	&timers_state.vm_clock_lock);
	646	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	647	} else {
	648	/*
	649	* We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
	650	* "real" time, (related to the time left until the next event) has
	651	* passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
	652	* This avoids that the warps are visible externally; for example,
	653	* you will not be sending network packets continuously instead of
	654	* every 100ms.
	655	*/
	656	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	657	&timers_state.vm_clock_lock);
	658	if (timers_state.vm_clock_warp_start == -1
	659	\|\| timers_state.vm_clock_warp_start > clock) {
	660	timers_state.vm_clock_warp_start = clock;
	661	}
	662	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	663	&timers_state.vm_clock_lock);
	664	timer_mod_anticipate(timers_state.icount_warp_timer,
	665	clock + deadline);
	666	}
	667	} else if (deadline == 0) {
	668	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	669	}
	670	}
	671
	672	static void qemu_account_warp_timer(void)
	673	{
	674	if (!use_icount \|\| !icount_sleep) {
	675	return;
	676	}
	677
	678	/* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
	679	* do not fire, so computing the deadline does not make sense.
	680	*/
	681	if (!runstate_is_running()) {
	682	return;
	683	}
	684
	685	/* warp clock deterministically in record/replay mode */
	686	if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
	687	return;
	688	}
	689
	690	timer_del(timers_state.icount_warp_timer);
	691	icount_warp_rt();
	692	}
	693
	694	static bool icount_state_needed(void *opaque)
	695	{
	696	return use_icount;
	697	}
	698
	699	static bool warp_timer_state_needed(void *opaque)
	700	{
	701	TimersState *s = opaque;
	702	return s->icount_warp_timer != NULL;
	703	}
	704
	705	static bool adjust_timers_state_needed(void *opaque)
	706	{
	707	TimersState *s = opaque;
	708	return s->icount_rt_timer != NULL;
	709	}
	710
	711	/*
	712	* Subsection for warp timer migration is optional, because may not be created
	713	*/
	714	static const VMStateDescription icount_vmstate_warp_timer = {
	715	.name = "timer/icount/warp_timer",
	716	.version_id = 1,
	717	.minimum_version_id = 1,
	718	.needed = warp_timer_state_needed,
	719	.fields = (VMStateField[]) {
	720	VMSTATE_INT64(vm_clock_warp_start, TimersState),
	721	VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
	722	VMSTATE_END_OF_LIST()
	723	}
	724	};
	725
	726	static const VMStateDescription icount_vmstate_adjust_timers = {
	727	.name = "timer/icount/timers",
	728	.version_id = 1,
	729	.minimum_version_id = 1,
	730	.needed = adjust_timers_state_needed,
	731	.fields = (VMStateField[]) {
	732	VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
	733	VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
	734	VMSTATE_END_OF_LIST()
	735	}
	736	};
	737
	738	/*
	739	* This is a subsection for icount migration.
	740	*/
	741	static const VMStateDescription icount_vmstate_timers = {
	742	.name = "timer/icount",
	743	.version_id = 1,
	744	.minimum_version_id = 1,
	745	.needed = icount_state_needed,
	746	.fields = (VMStateField[]) {
	747	VMSTATE_INT64(qemu_icount_bias, TimersState),
	748	VMSTATE_INT64(qemu_icount, TimersState),
	749	VMSTATE_END_OF_LIST()
	750	},
	751	.subsections = (const VMStateDescription*[]) {
	752	&icount_vmstate_warp_timer,
	753	&icount_vmstate_adjust_timers,
	754	NULL
	755	}
	756	};
	757
	758	static const VMStateDescription vmstate_timers = {
	759	.name = "timer",
	760	.version_id = 2,
	761	.minimum_version_id = 1,
	762	.fields = (VMStateField[]) {
	763	VMSTATE_INT64(cpu_ticks_offset, TimersState),
	764	VMSTATE_UNUSED(8),
	765	VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
	766	VMSTATE_END_OF_LIST()
	767	},
	768	.subsections = (const VMStateDescription*[]) {
	769	&icount_vmstate_timers,
	770	NULL
	771	}
	772	};
	773
	774	static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
	775	{
	776	double pct;
	777	double throttle_ratio;
	778	long sleeptime_ns;
	779
	780	if (!cpu_throttle_get_percentage()) {
	781	return;
	782	}
	783
	784	pct = (double)cpu_throttle_get_percentage()/100;
	785	throttle_ratio = pct / (1 - pct);
	786	sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
	787
	788	qemu_mutex_unlock_iothread();
	789	g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
	790	qemu_mutex_lock_iothread();
	791	atomic_set(&cpu->throttle_thread_scheduled, 0);
	792	}
	793
	794	static void cpu_throttle_timer_tick(void *opaque)
	795	{
	796	CPUState *cpu;
	797	double pct;
	798
	799	/* Stop the timer if needed */
	800	if (!cpu_throttle_get_percentage()) {
	801	return;
	802	}
	803	CPU_FOREACH(cpu) {
	804	if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
	805	async_run_on_cpu(cpu, cpu_throttle_thread,
	806	RUN_ON_CPU_NULL);
	807	}
	808	}
	809
	810	pct = (double)cpu_throttle_get_percentage()/100;
	811	timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
	812	CPU_THROTTLE_TIMESLICE_NS / (1-pct));
	813	}
	814
	815	void cpu_throttle_set(int new_throttle_pct)
	816	{
	817	/* Ensure throttle percentage is within valid range */
	818	new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
	819	new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
	820
	821	atomic_set(&throttle_percentage, new_throttle_pct);
	822
	823	timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
	824	CPU_THROTTLE_TIMESLICE_NS);
	825	}
	826
	827	void cpu_throttle_stop(void)
	828	{
	829	atomic_set(&throttle_percentage, 0);
	830	}
	831
	/* Return true while a non-zero throttle percentage is set. */
	bool cpu_throttle_active(void)
	{
	    return cpu_throttle_get_percentage() ? true : false;
	}
	836
	837	int cpu_throttle_get_percentage(void)
	838	{
	839	return atomic_read(&throttle_percentage);
	840	}
	841
	842	void cpu_ticks_init(void)
	843	{
	844	seqlock_init(&timers_state.vm_clock_seqlock);
	845	qemu_spin_init(&timers_state.vm_clock_lock);
	846	vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
	847	throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
	848	cpu_throttle_timer_tick, NULL);
	849	}
	850
	851	void configure_icount(QemuOpts opts, Error *errp)
	852	{
	853	const char *option;
	854	char *rem_str = NULL;
	855
	856	option = qemu_opt_get(opts, "shift");
	857	if (!option) {
	858	if (qemu_opt_get(opts, "align") != NULL) {
	859	error_setg(errp, "Please specify shift option when using align");
	860	}
	861	return;
	862	}
	863
	864	icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
	865	if (icount_sleep) {
	866	timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
	867	icount_timer_cb, NULL);
	868	}
	869
	870	icount_align_option = qemu_opt_get_bool(opts, "align", false);
	871
	872	if (icount_align_option && !icount_sleep) {
	873	error_setg(errp, "align=on and sleep=off are incompatible");
	874	}
	875	if (strcmp(option, "auto") != 0) {
	876	errno = 0;
	877	timers_state.icount_time_shift = strtol(option, &rem_str, 0);
	878	if (errno != 0 \|\| *rem_str != '\0' \|\| !strlen(option)) {
	879	error_setg(errp, "icount: Invalid shift value");
	880	}
	881	use_icount = 1;
	882	return;
	883	} else if (icount_align_option) {
	884	error_setg(errp, "shift=auto and align=on are incompatible");
	885	} else if (!icount_sleep) {
	886	error_setg(errp, "shift=auto and sleep=off are incompatible");
	887	}
	888
	889	use_icount = 2;
	890
	891	/* 125MIPS seems a reasonable initial guess at the guest speed.
	892	It will be corrected fairly quickly anyway. */
	893	timers_state.icount_time_shift = 3;
	894
	895	/* Have both realtime and virtual time triggers for speed adjustment.
	896	The realtime trigger catches emulated time passing too slowly,
	897	the virtual time trigger catches emulated time passing too fast.
	898	Realtime triggers occur even when idle, so use them less frequently
	899	than VM triggers. */
	900	timers_state.vm_clock_warp_start = -1;
	901	timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
	902	icount_adjust_rt, NULL);
	903	timer_mod(timers_state.icount_rt_timer,
	904	qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
	905	timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
	906	icount_adjust_vm, NULL);
	907	timer_mod(timers_state.icount_vm_timer,
	908	qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
	909	NANOSECONDS_PER_SECOND / 10);
	910	}
	911
	912	/***********************************************************/
	913	/* TCG vCPU kick timer
	914	*
	915	* The kick timer is responsible for moving single threaded vCPU
	916	* emulation on to the next vCPU. If more than one vCPU is running a
	917	* timer event will force a cpu->exit so the next vCPU can get
	918	* scheduled.
	919	*
	920	* The timer is removed if all vCPUs are idle and restarted again once
	921	* idleness is complete.
	922	*/
	923
	924	static QEMUTimer *tcg_kick_vcpu_timer;
	925	static CPUState *tcg_current_rr_cpu;
	926
	927	#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
	928
	929	static inline int64_t qemu_tcg_next_kick(void)
	930	{
	931	return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
	932	}
	933
	934	/* Kick the currently round-robin scheduled vCPU */
	935	static void qemu_cpu_kick_rr_cpu(void)
	936	{
	937	CPUState *cpu;
	938	do {
	939	cpu = atomic_mb_read(&tcg_current_rr_cpu);
	940	if (cpu) {
	941	cpu_exit(cpu);
	942	}
	943	} while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
	944	}
	945
	946	static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
	947	{
	948	}
	949
	950	void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
	951	{
	952	if (!use_icount \|\| type != QEMU_CLOCK_VIRTUAL) {
	953	qemu_notify_event();
	954	return;
	955	}
	956
	957	if (qemu_in_vcpu_thread()) {
	958	/* A CPU is currently running; kick it back out to the
	959	* tcg_cpu_exec() loop so it will recalculate its
	960	* icount deadline immediately.
	961	*/
	962	qemu_cpu_kick(current_cpu);
	963	} else if (first_cpu) {
	964	/* qemu_cpu_kick is not enough to kick a halted CPU out of
	965	* qemu_tcg_wait_io_event. async_run_on_cpu, instead,
	966	* causes cpu_thread_is_idle to return false. This way,
	967	* handle_icount_deadline can run.
	968	* If we have no CPUs at all for some reason, we don't
	969	* need to do anything.
	970	*/
	971	async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
	972	}
	973	}
	974
	975	static void kick_tcg_thread(void *opaque)
	976	{
	977	timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
	978	qemu_cpu_kick_rr_cpu();
	979	}
	980
	981	static void start_tcg_kick_timer(void)
	982	{
	983	assert(!mttcg_enabled);
	984	if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
	985	tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
	986	kick_tcg_thread, NULL);
	987	}
	988	if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
	989	timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
	990	}
	991	}
	992
	993	static void stop_tcg_kick_timer(void)
	994	{
	995	assert(!mttcg_enabled);
	996	if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
	997	timer_del(tcg_kick_vcpu_timer);
	998	}
	999	}
	1000
	1001	/***********************************************************/
	1002	void hw_error(const char *fmt, ...)
	1003	{
	1004	va_list ap;
	1005	CPUState *cpu;
	1006
	1007	va_start(ap, fmt);
	1008	fprintf(stderr, "qemu: hardware error: ");
	1009	vfprintf(stderr, fmt, ap);
	1010	fprintf(stderr, "\n");
	1011	CPU_FOREACH(cpu) {
	1012	fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
	1013	cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
	1014	}
	1015	va_end(ap);
	1016	abort();
	1017	}
	1018
	1019	void cpu_synchronize_all_states(void)
	1020	{
	1021	CPUState *cpu;
	1022
	1023	CPU_FOREACH(cpu) {
	1024	cpu_synchronize_state(cpu);
	1025	/* TODO: move to cpu_synchronize_state() */
	1026	if (hvf_enabled()) {
	1027	hvf_cpu_synchronize_state(cpu);
	1028	}
	1029	}
	1030	}
	1031
	1032	void cpu_synchronize_all_post_reset(void)
	1033	{
	1034	CPUState *cpu;
	1035
	1036	CPU_FOREACH(cpu) {
	1037	cpu_synchronize_post_reset(cpu);
	1038	/* TODO: move to cpu_synchronize_post_reset() */
	1039	if (hvf_enabled()) {
	1040	hvf_cpu_synchronize_post_reset(cpu);
	1041	}
	1042	}
	1043	}
	1044
	1045	void cpu_synchronize_all_post_init(void)
	1046	{
	1047	CPUState *cpu;
	1048
	1049	CPU_FOREACH(cpu) {
	1050	cpu_synchronize_post_init(cpu);
	1051	/* TODO: move to cpu_synchronize_post_init() */
	1052	if (hvf_enabled()) {
	1053	hvf_cpu_synchronize_post_init(cpu);
	1054	}
	1055	}
	1056	}
	1057
	1058	void cpu_synchronize_all_pre_loadvm(void)
	1059	{
	1060	CPUState *cpu;
	1061
	1062	CPU_FOREACH(cpu) {
	1063	cpu_synchronize_pre_loadvm(cpu);
	1064	}
	1065	}
	1066
	1067	static int do_vm_stop(RunState state, bool send_stop)
	1068	{
	1069	int ret = 0;
	1070
	1071	if (runstate_is_running()) {
	1072	cpu_disable_ticks();
	1073	pause_all_vcpus();
	1074	runstate_set(state);
	1075	vm_state_notify(0, state);
	1076	if (send_stop) {
	1077	qapi_event_send_stop();
	1078	}
	1079	}
	1080
	1081	bdrv_drain_all();
	1082	replay_disable_events();
	1083	ret = bdrv_flush_all();
	1084
	1085	return ret;
	1086	}
	1087
	1088	/* Special vm_stop() variant for terminating the process. Historically clients
	1089	* did not expect a QMP STOP event and so we need to retain compatibility.
	1090	*/
	1091	int vm_shutdown(void)
	1092	{
	1093	return do_vm_stop(RUN_STATE_SHUTDOWN, false);
	1094	}
	1095
	1096	static bool cpu_can_run(CPUState *cpu)
	1097	{
	1098	if (cpu->stop) {
	1099	return false;
	1100	}
	1101	if (cpu_is_stopped(cpu)) {
	1102	return false;
	1103	}
	1104	return true;
	1105	}
	1106
	1107	static void cpu_handle_guest_debug(CPUState *cpu)
	1108	{
	1109	gdb_set_stop_cpu(cpu);
	1110	qemu_system_debug_request();
	1111	cpu->stopped = true;
	1112	}
	1113
	1114	#ifdef CONFIG_LINUX
	1115	static void sigbus_reraise(void)
	1116	{
	1117	sigset_t set;
	1118	struct sigaction action;
	1119
	1120	memset(&action, 0, sizeof(action));
	1121	action.sa_handler = SIG_DFL;
	1122	if (!sigaction(SIGBUS, &action, NULL)) {
	1123	raise(SIGBUS);
	1124	sigemptyset(&set);
	1125	sigaddset(&set, SIGBUS);
	1126	pthread_sigmask(SIG_UNBLOCK, &set, NULL);
	1127	}
	1128	perror("Failed to re-raise SIGBUS!\n");
	1129	abort();
	1130	}
	1131
	1132	static void sigbus_handler(int n, siginfo_t siginfo, void ctx)
	1133	{
	1134	if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
	1135	sigbus_reraise();
	1136	}
	1137
	1138	if (current_cpu) {
	1139	/* Called asynchronously in VCPU thread. */
	1140	if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
	1141	sigbus_reraise();
	1142	}
	1143	} else {
	1144	/* Called synchronously (via signalfd) in main thread. */
	1145	if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
	1146	sigbus_reraise();
	1147	}
	1148	}
	1149	}
	1150
	1151	static void qemu_init_sigbus(void)
	1152	{
	1153	struct sigaction action;
	1154
	1155	memset(&action, 0, sizeof(action));
	1156	action.sa_flags = SA_SIGINFO;
	1157	action.sa_sigaction = sigbus_handler;
	1158	sigaction(SIGBUS, &action, NULL);
	1159
	1160	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
	1161	}
	1162	#else /* !CONFIG_LINUX */
	1163	static void qemu_init_sigbus(void)
	1164	{
	1165	}
	1166	#endif /* !CONFIG_LINUX */
	1167
	1168	static QemuMutex qemu_global_mutex;
	1169
	1170	static QemuThread io_thread;
	1171
	1172	/* cpu creation */
	1173	static QemuCond qemu_cpu_cond;
	1174	/* vCPU pause handshake: broadcast by qemu_cpu_stop(), awaited by pause_all_vcpus() */
	1175	static QemuCond qemu_pause_cond;
	1176
	1177	void qemu_init_cpu_loop(void)
	1178	{
	1179	qemu_init_sigbus();
	1180	qemu_cond_init(&qemu_cpu_cond);
	1181	qemu_cond_init(&qemu_pause_cond);
	1182	qemu_mutex_init(&qemu_global_mutex);
	1183
	1184	qemu_thread_get_self(&io_thread);
	1185	}
	1186
	1187	void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
	1188	{
	1189	do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
	1190	}
	1191
	1192	static void qemu_kvm_destroy_vcpu(CPUState *cpu)
	1193	{
	1194	if (kvm_destroy_vcpu(cpu) < 0) {
	1195	error_report("kvm_destroy_vcpu failed");
	1196	exit(EXIT_FAILURE);
	1197	}
	1198	}
	1199
	1200	static void qemu_tcg_destroy_vcpu(CPUState *cpu)
	1201	{
	1202	}
	1203
	1204	static void qemu_cpu_stop(CPUState *cpu, bool exit)
	1205	{
	1206	g_assert(qemu_cpu_is_self(cpu));
	1207	cpu->stop = false;
	1208	cpu->stopped = true;
	1209	if (exit) {
	1210	cpu_exit(cpu);
	1211	}
	1212	qemu_cond_broadcast(&qemu_pause_cond);
	1213	}
	1214
	1215	static void qemu_wait_io_event_common(CPUState *cpu)
	1216	{
	1217	atomic_mb_set(&cpu->thread_kicked, false);
	1218	if (cpu->stop) {
	1219	qemu_cpu_stop(cpu, false);
	1220	}
	1221	process_queued_cpu_work(cpu);
	1222	}
	1223
	1224	static void qemu_tcg_rr_wait_io_event(void)
	1225	{
	1226	CPUState *cpu;
	1227
	1228	while (all_cpu_threads_idle()) {
	1229	stop_tcg_kick_timer();
	1230	qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
	1231	}
	1232
	1233	start_tcg_kick_timer();
	1234
	1235	CPU_FOREACH(cpu) {
	1236	qemu_wait_io_event_common(cpu);
	1237	}
	1238	}
	1239
	1240	static void qemu_wait_io_event(CPUState *cpu)
	1241	{
	1242	while (cpu_thread_is_idle(cpu)) {
	1243	qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
	1244	}
	1245
	1246	#ifdef _WIN32
	1247	/* Eat dummy APC queued by qemu_cpu_kick_thread. */
	1248	if (!tcg_enabled()) {
	1249	SleepEx(0, TRUE);
	1250	}
	1251	#endif
	1252	qemu_wait_io_event_common(cpu);
	1253	}
	1254
	1255	static void qemu_kvm_cpu_thread_fn(void arg)
	1256	{
	1257	CPUState *cpu = arg;
	1258	int r;
	1259
	1260	rcu_register_thread();
	1261
	1262	qemu_mutex_lock_iothread();
	1263	qemu_thread_get_self(cpu->thread);
	1264	cpu->thread_id = qemu_get_thread_id();
	1265	cpu->can_do_io = 1;
	1266	current_cpu = cpu;
	1267
	1268	r = kvm_init_vcpu(cpu);
	1269	if (r < 0) {
	1270	error_report("kvm_init_vcpu failed: %s", strerror(-r));
	1271	exit(1);
	1272	}
	1273
	1274	kvm_init_cpu_signals(cpu);
	1275
	1276	/* signal CPU creation */
	1277	cpu->created = true;
	1278	qemu_cond_signal(&qemu_cpu_cond);
	1279
	1280	do {
	1281	if (cpu_can_run(cpu)) {
	1282	r = kvm_cpu_exec(cpu);
	1283	if (r == EXCP_DEBUG) {
	1284	cpu_handle_guest_debug(cpu);
	1285	}
	1286	}
	1287	qemu_wait_io_event(cpu);
	1288	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1289
	1290	qemu_kvm_destroy_vcpu(cpu);
	1291	cpu->created = false;
	1292	qemu_cond_signal(&qemu_cpu_cond);
	1293	qemu_mutex_unlock_iothread();
	1294	rcu_unregister_thread();
	1295	return NULL;
	1296	}
	1297
	1298	static void qemu_dummy_cpu_thread_fn(void arg)
	1299	{
	1300	#ifdef _WIN32
	1301	error_report("qtest is not supported under Windows");
	1302	exit(1);
	1303	#else
	1304	CPUState *cpu = arg;
	1305	sigset_t waitset;
	1306	int r;
	1307
	1308	rcu_register_thread();
	1309
	1310	qemu_mutex_lock_iothread();
	1311	qemu_thread_get_self(cpu->thread);
	1312	cpu->thread_id = qemu_get_thread_id();
	1313	cpu->can_do_io = 1;
	1314	current_cpu = cpu;
	1315
	1316	sigemptyset(&waitset);
	1317	sigaddset(&waitset, SIG_IPI);
	1318
	1319	/* signal CPU creation */
	1320	cpu->created = true;
	1321	qemu_cond_signal(&qemu_cpu_cond);
	1322
	1323	do {
	1324	qemu_mutex_unlock_iothread();
	1325	do {
	1326	int sig;
	1327	r = sigwait(&waitset, &sig);
	1328	} while (r == -1 && (errno == EAGAIN \|\| errno == EINTR));
	1329	if (r == -1) {
	1330	perror("sigwait");
	1331	exit(1);
	1332	}
	1333	qemu_mutex_lock_iothread();
	1334	qemu_wait_io_event(cpu);
	1335	} while (!cpu->unplug);
	1336
	1337	qemu_mutex_unlock_iothread();
	1338	rcu_unregister_thread();
	1339	return NULL;
	1340	#endif
	1341	}
	1342
	1343	static int64_t tcg_get_icount_limit(void)
	1344	{
	1345	int64_t deadline;
	1346
	1347	if (replay_mode != REPLAY_MODE_PLAY) {
	1348	deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	1349
	1350	/* Maintain prior (possibly buggy) behaviour where if no deadline
	1351	* was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
	1352	* INT32_MAX nanoseconds ahead, we still use INT32_MAX
	1353	* nanoseconds.
	1354	*/
	1355	if ((deadline < 0) \|\| (deadline > INT32_MAX)) {
	1356	deadline = INT32_MAX;
	1357	}
	1358
	1359	return qemu_icount_round(deadline);
	1360	} else {
	1361	return replay_get_instructions();
	1362	}
	1363	}
	1364
	1365	static void handle_icount_deadline(void)
	1366	{
	1367	assert(qemu_in_vcpu_thread());
	1368	if (use_icount) {
	1369	int64_t deadline =
	1370	qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	1371
	1372	if (deadline == 0) {
	1373	/* Wake up other AioContexts. */
	1374	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	1375	qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
	1376	}
	1377	}
	1378	}
	1379
	1380	static void prepare_icount_for_run(CPUState *cpu)
	1381	{
	1382	if (use_icount) {
	1383	int insns_left;
	1384
	1385	/* These should always be cleared by process_icount_data after
	1386	* each vCPU execution. However u16.high can be raised
	1387	* asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
	1388	*/
	1389	g_assert(cpu->icount_decr.u16.low == 0);
	1390	g_assert(cpu->icount_extra == 0);
	1391
	1392	cpu->icount_budget = tcg_get_icount_limit();
	1393	insns_left = MIN(0xffff, cpu->icount_budget);
	1394	cpu->icount_decr.u16.low = insns_left;
	1395	cpu->icount_extra = cpu->icount_budget - insns_left;
	1396
	1397	replay_mutex_lock();
	1398	}
	1399	}
	1400
	1401	static void process_icount_data(CPUState *cpu)
	1402	{
	1403	if (use_icount) {
	1404	/* Account for executed instructions */
	1405	cpu_update_icount(cpu);
	1406
	1407	/* Reset the counters */
	1408	cpu->icount_decr.u16.low = 0;
	1409	cpu->icount_extra = 0;
	1410	cpu->icount_budget = 0;
	1411
	1412	replay_account_executed_instructions();
	1413
	1414	replay_mutex_unlock();
	1415	}
	1416	}
	1417
	1418
	1419	static int tcg_cpu_exec(CPUState *cpu)
	1420	{
	1421	int ret;
	1422	#ifdef CONFIG_PROFILER
	1423	int64_t ti;
	1424	#endif
	1425
	1426	assert(tcg_enabled());
	1427	#ifdef CONFIG_PROFILER
	1428	ti = profile_getclock();
	1429	#endif
	1430	cpu_exec_start(cpu);
	1431	ret = cpu_exec(cpu);
	1432	cpu_exec_end(cpu);
	1433	#ifdef CONFIG_PROFILER
	1434	atomic_set(&tcg_ctx->prof.cpu_exec_time,
	1435	tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
	1436	#endif
	1437	return ret;
	1438	}
	1439
	1440	/* Destroy any remaining vCPUs which have been unplugged and have
	1441	* finished running
	1442	*/
	1443	static void deal_with_unplugged_cpus(void)
	1444	{
	1445	CPUState *cpu;
	1446
	1447	CPU_FOREACH(cpu) {
	1448	if (cpu->unplug && !cpu_can_run(cpu)) {
	1449	qemu_tcg_destroy_vcpu(cpu);
	1450	cpu->created = false;
	1451	qemu_cond_signal(&qemu_cpu_cond);
	1452	break;
	1453	}
	1454	}
	1455	}
	1456
	1457	/* Single-threaded TCG
	1458	*
	1459	* In the single-threaded case each vCPU is simulated in turn. If
	1460	* there is more than a single vCPU we create a simple timer to kick
	1461	* the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
	1462	* This is done explicitly rather than relying on side-effects
	1463	* elsewhere.
	1464	*/
	1465
	1466	static void qemu_tcg_rr_cpu_thread_fn(void arg)
	1467	{
	1468	CPUState *cpu = arg;
	1469
	1470	assert(tcg_enabled());
	1471	rcu_register_thread();
	1472	tcg_register_thread();
	1473
	1474	qemu_mutex_lock_iothread();
	1475	qemu_thread_get_self(cpu->thread);
	1476
	1477	cpu->thread_id = qemu_get_thread_id();
	1478	cpu->created = true;
	1479	cpu->can_do_io = 1;
	1480	qemu_cond_signal(&qemu_cpu_cond);
	1481
	1482	/* wait for initial kick-off after machine start */
	1483	while (first_cpu->stopped) {
	1484	qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
	1485
	1486	/* process any pending work */
	1487	CPU_FOREACH(cpu) {
	1488	current_cpu = cpu;
	1489	qemu_wait_io_event_common(cpu);
	1490	}
	1491	}
	1492
	1493	start_tcg_kick_timer();
	1494
	1495	cpu = first_cpu;
	1496
	1497	/* process any pending work */
	1498	cpu->exit_request = 1;
	1499
	1500	while (1) {
	1501	qemu_mutex_unlock_iothread();
	1502	replay_mutex_lock();
	1503	qemu_mutex_lock_iothread();
	1504	/* Account partial waits to QEMU_CLOCK_VIRTUAL. */
	1505	qemu_account_warp_timer();
	1506
	1507	/* Run the timers here. This is much more efficient than
	1508	* waking up the I/O thread and waiting for completion.
	1509	*/
	1510	handle_icount_deadline();
	1511
	1512	replay_mutex_unlock();
	1513
	1514	if (!cpu) {
	1515	cpu = first_cpu;
	1516	}
	1517
	1518	while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
	1519
	1520	atomic_mb_set(&tcg_current_rr_cpu, cpu);
	1521	current_cpu = cpu;
	1522
	1523	qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
	1524	(cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
	1525
	1526	if (cpu_can_run(cpu)) {
	1527	int r;
	1528
	1529	qemu_mutex_unlock_iothread();
	1530	prepare_icount_for_run(cpu);
	1531
	1532	r = tcg_cpu_exec(cpu);
	1533
	1534	process_icount_data(cpu);
	1535	qemu_mutex_lock_iothread();
	1536
	1537	if (r == EXCP_DEBUG) {
	1538	cpu_handle_guest_debug(cpu);
	1539	break;
	1540	} else if (r == EXCP_ATOMIC) {
	1541	qemu_mutex_unlock_iothread();
	1542	cpu_exec_step_atomic(cpu);
	1543	qemu_mutex_lock_iothread();
	1544	break;
	1545	}
	1546	} else if (cpu->stop) {
	1547	if (cpu->unplug) {
	1548	cpu = CPU_NEXT(cpu);
	1549	}
	1550	break;
	1551	}
	1552
	1553	cpu = CPU_NEXT(cpu);
	1554	} /* while (cpu && !cpu->exit_request).. */
	1555
	1556	/* Does not need atomic_mb_set because a spurious wakeup is okay. */
	1557	atomic_set(&tcg_current_rr_cpu, NULL);
	1558
	1559	if (cpu && cpu->exit_request) {
	1560	atomic_mb_set(&cpu->exit_request, 0);
	1561	}
	1562
	1563	if (use_icount && all_cpu_threads_idle()) {
	1564	/*
	1565	* When all cpus are sleeping (e.g in WFI), to avoid a deadlock
	1566	* in the main_loop, wake it up in order to start the warp timer.
	1567	*/
	1568	qemu_notify_event();
	1569	}
	1570
	1571	qemu_tcg_rr_wait_io_event();
	1572	deal_with_unplugged_cpus();
	1573	}
	1574
	1575	rcu_unregister_thread();
	1576	return NULL;
	1577	}
	1578
	1579	static void qemu_hax_cpu_thread_fn(void arg)
	1580	{
	1581	CPUState *cpu = arg;
	1582	int r;
	1583
	1584	rcu_register_thread();
	1585	qemu_mutex_lock_iothread();
	1586	qemu_thread_get_self(cpu->thread);
	1587
	1588	cpu->thread_id = qemu_get_thread_id();
	1589	cpu->created = true;
	1590	cpu->halted = 0;
	1591	current_cpu = cpu;
	1592
	1593	hax_init_vcpu(cpu);
	1594	qemu_cond_signal(&qemu_cpu_cond);
	1595
	1596	do {
	1597	if (cpu_can_run(cpu)) {
	1598	r = hax_smp_cpu_exec(cpu);
	1599	if (r == EXCP_DEBUG) {
	1600	cpu_handle_guest_debug(cpu);
	1601	}
	1602	}
	1603
	1604	qemu_wait_io_event(cpu);
	1605	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1606	rcu_unregister_thread();
	1607	return NULL;
	1608	}
	1609
	1610	/* The HVF-specific vCPU thread function. This one should only run when the host
	1611	* CPU supports the VMX "unrestricted guest" feature. */
	1612	static void qemu_hvf_cpu_thread_fn(void arg)
	1613	{
	1614	CPUState *cpu = arg;
	1615
	1616	int r;
	1617
	1618	assert(hvf_enabled());
	1619
	1620	rcu_register_thread();
	1621
	1622	qemu_mutex_lock_iothread();
	1623	qemu_thread_get_self(cpu->thread);
	1624
	1625	cpu->thread_id = qemu_get_thread_id();
	1626	cpu->can_do_io = 1;
	1627	current_cpu = cpu;
	1628
	1629	hvf_init_vcpu(cpu);
	1630
	1631	/* signal CPU creation */
	1632	cpu->created = true;
	1633	qemu_cond_signal(&qemu_cpu_cond);
	1634
	1635	do {
	1636	if (cpu_can_run(cpu)) {
	1637	r = hvf_vcpu_exec(cpu);
	1638	if (r == EXCP_DEBUG) {
	1639	cpu_handle_guest_debug(cpu);
	1640	}
	1641	}
	1642	qemu_wait_io_event(cpu);
	1643	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1644
	1645	hvf_vcpu_destroy(cpu);
	1646	cpu->created = false;
	1647	qemu_cond_signal(&qemu_cpu_cond);
	1648	qemu_mutex_unlock_iothread();
	1649	rcu_unregister_thread();
	1650	return NULL;
	1651	}
	1652
	1653	static void qemu_whpx_cpu_thread_fn(void arg)
	1654	{
	1655	CPUState *cpu = arg;
	1656	int r;
	1657
	1658	rcu_register_thread();
	1659
	1660	qemu_mutex_lock_iothread();
	1661	qemu_thread_get_self(cpu->thread);
	1662	cpu->thread_id = qemu_get_thread_id();
	1663	current_cpu = cpu;
	1664
	1665	r = whpx_init_vcpu(cpu);
	1666	if (r < 0) {
	1667	fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
	1668	exit(1);
	1669	}
	1670
	1671	/* signal CPU creation */
	1672	cpu->created = true;
	1673	qemu_cond_signal(&qemu_cpu_cond);
	1674
	1675	do {
	1676	if (cpu_can_run(cpu)) {
	1677	r = whpx_vcpu_exec(cpu);
	1678	if (r == EXCP_DEBUG) {
	1679	cpu_handle_guest_debug(cpu);
	1680	}
	1681	}
	1682	while (cpu_thread_is_idle(cpu)) {
	1683	qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
	1684	}
	1685	qemu_wait_io_event_common(cpu);
	1686	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1687
	1688	whpx_destroy_vcpu(cpu);
	1689	cpu->created = false;
	1690	qemu_cond_signal(&qemu_cpu_cond);
	1691	qemu_mutex_unlock_iothread();
	1692	rcu_unregister_thread();
	1693	return NULL;
	1694	}
	1695
	1696	#ifdef _WIN32
	1697	static void CALLBACK dummy_apc_func(ULONG_PTR unused)
	1698	{
	1699	}
	1700	#endif
	1701
	1702	/* Multi-threaded TCG
	1703	*
	1704	* In the multi-threaded case each vCPU has its own thread. The TLS
	1705	* variable current_cpu can be used deep in the code to find the
	1706	* current CPUState for a given thread.
	1707	*/
	1708
	1709	static void qemu_tcg_cpu_thread_fn(void arg)
	1710	{
	1711	CPUState *cpu = arg;
	1712
	1713	assert(tcg_enabled());
	1714	g_assert(!use_icount);
	1715
	1716	rcu_register_thread();
	1717	tcg_register_thread();
	1718
	1719	qemu_mutex_lock_iothread();
	1720	qemu_thread_get_self(cpu->thread);
	1721
	1722	cpu->thread_id = qemu_get_thread_id();
	1723	cpu->created = true;
	1724	cpu->can_do_io = 1;
	1725	current_cpu = cpu;
	1726	qemu_cond_signal(&qemu_cpu_cond);
	1727
	1728	/* process any pending work */
	1729	cpu->exit_request = 1;
	1730
	1731	do {
	1732	if (cpu_can_run(cpu)) {
	1733	int r;
	1734	qemu_mutex_unlock_iothread();
	1735	r = tcg_cpu_exec(cpu);
	1736	qemu_mutex_lock_iothread();
	1737	switch (r) {
	1738	case EXCP_DEBUG:
	1739	cpu_handle_guest_debug(cpu);
	1740	break;
	1741	case EXCP_HALTED:
	1742	/* during start-up the vCPU is reset and the thread is
	1743	* kicked several times. If we don't ensure we go back
	1744	* to sleep in the halted state we won't cleanly
	1745	* start-up when the vCPU is enabled.
	1746	*
	1747	* cpu->halted should ensure we sleep in wait_io_event
	1748	*/
	1749	g_assert(cpu->halted);
	1750	break;
	1751	case EXCP_ATOMIC:
	1752	qemu_mutex_unlock_iothread();
	1753	cpu_exec_step_atomic(cpu);
	1754	qemu_mutex_lock_iothread();
	1755	default:
	1756	/* Ignore everything else? */
	1757	break;
	1758	}
	1759	}
	1760
	1761	atomic_mb_set(&cpu->exit_request, 0);
	1762	qemu_wait_io_event(cpu);
	1763	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1764
	1765	qemu_tcg_destroy_vcpu(cpu);
	1766	cpu->created = false;
	1767	qemu_cond_signal(&qemu_cpu_cond);
	1768	qemu_mutex_unlock_iothread();
	1769	rcu_unregister_thread();
	1770	return NULL;
	1771	}
	1772
	1773	static void qemu_cpu_kick_thread(CPUState *cpu)
	1774	{
	1775	#ifndef _WIN32
	1776	int err;
	1777
	1778	if (cpu->thread_kicked) {
	1779	return;
	1780	}
	1781	cpu->thread_kicked = true;
	1782	err = pthread_kill(cpu->thread->thread, SIG_IPI);
	1783	if (err && err != ESRCH) {
	1784	fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
	1785	exit(1);
	1786	}
	1787	#else /* _WIN32 */
	1788	if (!qemu_cpu_is_self(cpu)) {
	1789	if (whpx_enabled()) {
	1790	whpx_vcpu_kick(cpu);
	1791	} else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
	1792	fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
	1793	__func__, GetLastError());
	1794	exit(1);
	1795	}
	1796	}
	1797	#endif
	1798	}
	1799
	1800	void qemu_cpu_kick(CPUState *cpu)
	1801	{
	1802	qemu_cond_broadcast(cpu->halt_cond);
	1803	if (tcg_enabled()) {
	1804	cpu_exit(cpu);
	1805	/* NOP unless doing single-thread RR */
	1806	qemu_cpu_kick_rr_cpu();
	1807	} else {
	1808	if (hax_enabled()) {
	1809	/*
	1810	* FIXME: race condition with the exit_request check in
	1811	* hax_vcpu_hax_exec
	1812	*/
	1813	cpu->exit_request = 1;
	1814	}
	1815	qemu_cpu_kick_thread(cpu);
	1816	}
	1817	}
	1818
	1819	void qemu_cpu_kick_self(void)
	1820	{
	1821	assert(current_cpu);
	1822	qemu_cpu_kick_thread(current_cpu);
	1823	}
	1824
	1825	bool qemu_cpu_is_self(CPUState *cpu)
	1826	{
	1827	return qemu_thread_is_self(cpu->thread);
	1828	}
	1829
	1830	bool qemu_in_vcpu_thread(void)
	1831	{
	1832	return current_cpu && qemu_cpu_is_self(current_cpu);
	1833	}
	1834
	1835	static __thread bool iothread_locked = false;
	1836
	1837	bool qemu_mutex_iothread_locked(void)
	1838	{
	1839	return iothread_locked;
	1840	}
	1841
	1842	/*
	1843	* The BQL is taken from so many places that it is worth profiling the
	1844	* callers directly, instead of funneling them all through a single function.
	1845	*/
	1846	void qemu_mutex_lock_iothread_impl(const char *file, int line)
	1847	{
	1848	QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
	1849
	1850	g_assert(!qemu_mutex_iothread_locked());
	1851	bql_lock(&qemu_global_mutex, file, line);
	1852	iothread_locked = true;
	1853	}
	1854
	1855	void qemu_mutex_unlock_iothread(void)
	1856	{
	1857	g_assert(qemu_mutex_iothread_locked());
	1858	iothread_locked = false;
	1859	qemu_mutex_unlock(&qemu_global_mutex);
	1860	}
	1861
	1862	static bool all_vcpus_paused(void)
	1863	{
	1864	CPUState *cpu;
	1865
	1866	CPU_FOREACH(cpu) {
	1867	if (!cpu->stopped) {
	1868	return false;
	1869	}
	1870	}
	1871
	1872	return true;
	1873	}
	1874
	1875	void pause_all_vcpus(void)
	1876	{
	1877	CPUState *cpu;
	1878
	1879	qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
	1880	CPU_FOREACH(cpu) {
	1881	if (qemu_cpu_is_self(cpu)) {
	1882	qemu_cpu_stop(cpu, true);
	1883	} else {
	1884	cpu->stop = true;
	1885	qemu_cpu_kick(cpu);
	1886	}
	1887	}
	1888
	1889	/* We need to drop the replay_lock so any vCPU threads woken up
	1890	* can finish their replay tasks
	1891	*/
	1892	replay_mutex_unlock();
	1893
	1894	while (!all_vcpus_paused()) {
	1895	qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
	1896	CPU_FOREACH(cpu) {
	1897	qemu_cpu_kick(cpu);
	1898	}
	1899	}
	1900
	1901	qemu_mutex_unlock_iothread();
	1902	replay_mutex_lock();
	1903	qemu_mutex_lock_iothread();
	1904	}
	1905
	1906	void cpu_resume(CPUState *cpu)
	1907	{
	1908	cpu->stop = false;
	1909	cpu->stopped = false;
	1910	qemu_cpu_kick(cpu);
	1911	}
	1912
	1913	void resume_all_vcpus(void)
	1914	{
	1915	CPUState *cpu;
	1916
	1917	qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
	1918	CPU_FOREACH(cpu) {
	1919	cpu_resume(cpu);
	1920	}
	1921	}
	1922
	1923	void cpu_remove_sync(CPUState *cpu)
	1924	{
	1925	cpu->stop = true;
	1926	cpu->unplug = true;
	1927	qemu_cpu_kick(cpu);
	1928	qemu_mutex_unlock_iothread();
	1929	qemu_thread_join(cpu->thread);
	1930	qemu_mutex_lock_iothread();
	1931	}
	1932
	1933	/* For temporary buffers for forming a name */
	1934	#define VCPU_THREAD_NAME_SIZE 16
	1935
	1936	static void qemu_tcg_init_vcpu(CPUState *cpu)
	1937	{
	1938	char thread_name[VCPU_THREAD_NAME_SIZE];
	1939	static QemuCond *single_tcg_halt_cond;
	1940	static QemuThread *single_tcg_cpu_thread;
	1941	static int tcg_region_inited;
	1942
	1943	assert(tcg_enabled());
	1944	/*
	1945	* Initialize TCG regions--once. Now is a good time, because:
	1946	* (1) TCG's init context, prologue and target globals have been set up.
	1947	* (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
	1948	* -accel flag is processed, so the check doesn't work then).
	1949	*/
	1950	if (!tcg_region_inited) {
	1951	tcg_region_inited = 1;
	1952	tcg_region_init();
	1953	}
	1954
	1955	if (qemu_tcg_mttcg_enabled() \|\| !single_tcg_cpu_thread) {
	1956	cpu->thread = g_malloc0(sizeof(QemuThread));
	1957	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	1958	qemu_cond_init(cpu->halt_cond);
	1959
	1960	if (qemu_tcg_mttcg_enabled()) {
	1961	/* create a thread per vCPU with TCG (MTTCG) */
	1962	parallel_cpus = true;
	1963	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
	1964	cpu->cpu_index);
	1965
	1966	qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
	1967	cpu, QEMU_THREAD_JOINABLE);
	1968
	1969	} else {
	1970	/* share a single thread for all cpus with TCG */
	1971	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
	1972	qemu_thread_create(cpu->thread, thread_name,
	1973	qemu_tcg_rr_cpu_thread_fn,
	1974	cpu, QEMU_THREAD_JOINABLE);
	1975
	1976	single_tcg_halt_cond = cpu->halt_cond;
	1977	single_tcg_cpu_thread = cpu->thread;
	1978	}
	1979	#ifdef _WIN32
	1980	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	1981	#endif
	1982	} else {
	1983	/* For non-MTTCG cases we share the thread */
	1984	cpu->thread = single_tcg_cpu_thread;
	1985	cpu->halt_cond = single_tcg_halt_cond;
	1986	cpu->thread_id = first_cpu->thread_id;
	1987	cpu->can_do_io = 1;
	1988	cpu->created = true;
	1989	}
	1990	}
	1991
	1992	static void qemu_hax_start_vcpu(CPUState *cpu)
	1993	{
	1994	char thread_name[VCPU_THREAD_NAME_SIZE];
	1995
	1996	cpu->thread = g_malloc0(sizeof(QemuThread));
	1997	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	1998	qemu_cond_init(cpu->halt_cond);
	1999
	2000	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
	2001	cpu->cpu_index);
	2002	qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
	2003	cpu, QEMU_THREAD_JOINABLE);
	2004	#ifdef _WIN32
	2005	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	2006	#endif
	2007	}
	2008
	2009	static void qemu_kvm_start_vcpu(CPUState *cpu)
	2010	{
	2011	char thread_name[VCPU_THREAD_NAME_SIZE];
	2012
	2013	cpu->thread = g_malloc0(sizeof(QemuThread));
	2014	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2015	qemu_cond_init(cpu->halt_cond);
	2016	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
	2017	cpu->cpu_index);
	2018	qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
	2019	cpu, QEMU_THREAD_JOINABLE);
	2020	}
	2021
	2022	static void qemu_hvf_start_vcpu(CPUState *cpu)
	2023	{
	2024	char thread_name[VCPU_THREAD_NAME_SIZE];
	2025
	2026	/* HVF currently does not support TCG, and only runs in
	2027	* unrestricted-guest mode. */
	2028	assert(hvf_enabled());
	2029
	2030	cpu->thread = g_malloc0(sizeof(QemuThread));
	2031	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2032	qemu_cond_init(cpu->halt_cond);
	2033
	2034	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
	2035	cpu->cpu_index);
	2036	qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
	2037	cpu, QEMU_THREAD_JOINABLE);
	2038	}
	2039
	2040	static void qemu_whpx_start_vcpu(CPUState *cpu)
	2041	{
	2042	char thread_name[VCPU_THREAD_NAME_SIZE];
	2043
	2044	cpu->thread = g_malloc0(sizeof(QemuThread));
	2045	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2046	qemu_cond_init(cpu->halt_cond);
	2047	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
	2048	cpu->cpu_index);
	2049	qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
	2050	cpu, QEMU_THREAD_JOINABLE);
	2051	#ifdef _WIN32
	2052	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	2053	#endif
	2054	}
	2055
	2056	static void qemu_dummy_start_vcpu(CPUState *cpu)
	2057	{
	2058	char thread_name[VCPU_THREAD_NAME_SIZE];
	2059
	2060	cpu->thread = g_malloc0(sizeof(QemuThread));
	2061	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2062	qemu_cond_init(cpu->halt_cond);
	2063	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
	2064	cpu->cpu_index);
	2065	qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
	2066	QEMU_THREAD_JOINABLE);
	2067	}
	2068
	2069	void qemu_init_vcpu(CPUState *cpu)
	2070	{
	2071	cpu->nr_cores = smp_cores;
	2072	cpu->nr_threads = smp_threads;
	2073	cpu->stopped = true;
	2074
	2075	if (!cpu->as) {
	2076	/* If the target cpu hasn't set up any address spaces itself,
	2077	* give it the default one.
	2078	*/
	2079	cpu->num_ases = 1;
	2080	cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
	2081	}
	2082
	2083	if (kvm_enabled()) {
	2084	qemu_kvm_start_vcpu(cpu);
	2085	} else if (hax_enabled()) {
	2086	qemu_hax_start_vcpu(cpu);
	2087	} else if (hvf_enabled()) {
	2088	qemu_hvf_start_vcpu(cpu);
	2089	} else if (tcg_enabled()) {
	2090	qemu_tcg_init_vcpu(cpu);
	2091	} else if (whpx_enabled()) {
	2092	qemu_whpx_start_vcpu(cpu);
	2093	} else {
	2094	qemu_dummy_start_vcpu(cpu);
	2095	}
	2096
	2097	while (!cpu->created) {
	2098	qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
	2099	}
	2100	}
	2101
	2102	void cpu_stop_current(void)
	2103	{
	2104	if (current_cpu) {
	2105	current_cpu->stop = true;
	2106	cpu_exit(current_cpu);
	2107	}
	2108	}
	2109
	2110	int vm_stop(RunState state)
	2111	{
	2112	if (qemu_in_vcpu_thread()) {
	2113	qemu_system_vmstop_request_prepare();
	2114	qemu_system_vmstop_request(state);
	2115	/*
	2116	* FIXME: should not return to device code in case
	2117	* vm_stop() has been requested.
	2118	*/
	2119	cpu_stop_current();
	2120	return 0;
	2121	}
	2122
	2123	return do_vm_stop(state, true);
	2124	}
	2125
	2126	/**
	2127	* Prepare for (re)starting the VM.
	2128	* Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
	2129	* running or in case of an error condition), 0 otherwise.
	2130	*/
	2131	int vm_prepare_start(void)
	2132	{
	2133	RunState requested;
	2134
	2135	qemu_vmstop_requested(&requested);
	2136	if (runstate_is_running() && requested == RUN_STATE__MAX) {
	2137	return -1;
	2138	}
	2139
	2140	/* Ensure that a STOP/RESUME pair of events is emitted if a
	2141	* vmstop request was pending. The BLOCK_IO_ERROR event, for
	2142	* example, according to documentation is always followed by
	2143	* the STOP event.
	2144	*/
	2145	if (runstate_is_running()) {
	2146	qapi_event_send_stop();
	2147	qapi_event_send_resume();
	2148	return -1;
	2149	}
	2150
	2151	/* We are sending this now, but the CPUs will be resumed shortly later */
	2152	qapi_event_send_resume();
	2153
	2154	replay_enable_events();
	2155	cpu_enable_ticks();
	2156	runstate_set(RUN_STATE_RUNNING);
	2157	vm_state_notify(1, RUN_STATE_RUNNING);
	2158	return 0;
	2159	}
	2160
	2161	void vm_start(void)
	2162	{
	2163	if (!vm_prepare_start()) {
	2164	resume_all_vcpus();
	2165	}
	2166	}
	2167
	2168	/* does a state transition even if the VM is already stopped,
	2169	current state is forgotten forever */
	2170	int vm_stop_force_state(RunState state)
	2171	{
	2172	if (runstate_is_running()) {
	2173	return vm_stop(state);
	2174	} else {
	2175	runstate_set(state);
	2176
	2177	bdrv_drain_all();
	2178	/* Make sure to return an error if the flush in a previous vm_stop()
	2179	* failed. */
	2180	return bdrv_flush_all();
	2181	}
	2182	}
	2183
	2184	void list_cpus(const char *optarg)
	2185	{
	2186	/* XXX: implement xxx_cpu_list for targets that still miss it */
	2187	#if defined(cpu_list)
	2188	cpu_list();
	2189	#endif
	2190	}
	2191
	2192	CpuInfoList qmp_query_cpus(Error *errp)
	2193	{
	2194	MachineState *ms = MACHINE(qdev_get_machine());
	2195	MachineClass *mc = MACHINE_GET_CLASS(ms);
	2196	CpuInfoList head = NULL, cur_item = NULL;
	2197	CPUState *cpu;
	2198
	2199	CPU_FOREACH(cpu) {
	2200	CpuInfoList *info;
	2201	#if defined(TARGET_I386)
	2202	X86CPU *x86_cpu = X86_CPU(cpu);
	2203	CPUX86State *env = &x86_cpu->env;
	2204	#elif defined(TARGET_PPC)
	2205	PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
	2206	CPUPPCState *env = &ppc_cpu->env;
	2207	#elif defined(TARGET_SPARC)
	2208	SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
	2209	CPUSPARCState *env = &sparc_cpu->env;
	2210	#elif defined(TARGET_RISCV)
	2211	RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
	2212	CPURISCVState *env = &riscv_cpu->env;
	2213	#elif defined(TARGET_MIPS)
	2214	MIPSCPU *mips_cpu = MIPS_CPU(cpu);
	2215	CPUMIPSState *env = &mips_cpu->env;
	2216	#elif defined(TARGET_TRICORE)
	2217	TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
	2218	CPUTriCoreState *env = &tricore_cpu->env;
	2219	#elif defined(TARGET_S390X)
	2220	S390CPU *s390_cpu = S390_CPU(cpu);
	2221	CPUS390XState *env = &s390_cpu->env;
	2222	#endif
	2223
	2224	cpu_synchronize_state(cpu);
	2225
	2226	info = g_malloc0(sizeof(*info));
	2227	info->value = g_malloc0(sizeof(*info->value));
	2228	info->value->CPU = cpu->cpu_index;
	2229	info->value->current = (cpu == first_cpu);
	2230	info->value->halted = cpu->halted;
	2231	info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
	2232	info->value->thread_id = cpu->thread_id;
	2233	#if defined(TARGET_I386)
	2234	info->value->arch = CPU_INFO_ARCH_X86;
	2235	info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
	2236	#elif defined(TARGET_PPC)
	2237	info->value->arch = CPU_INFO_ARCH_PPC;
	2238	info->value->u.ppc.nip = env->nip;
	2239	#elif defined(TARGET_SPARC)
	2240	info->value->arch = CPU_INFO_ARCH_SPARC;
	2241	info->value->u.q_sparc.pc = env->pc;
	2242	info->value->u.q_sparc.npc = env->npc;
	2243	#elif defined(TARGET_MIPS)
	2244	info->value->arch = CPU_INFO_ARCH_MIPS;
	2245	info->value->u.q_mips.PC = env->active_tc.PC;
	2246	#elif defined(TARGET_TRICORE)
	2247	info->value->arch = CPU_INFO_ARCH_TRICORE;
	2248	info->value->u.tricore.PC = env->PC;
	2249	#elif defined(TARGET_S390X)
	2250	info->value->arch = CPU_INFO_ARCH_S390;
	2251	info->value->u.s390.cpu_state = env->cpu_state;
	2252	#elif defined(TARGET_RISCV)
	2253	info->value->arch = CPU_INFO_ARCH_RISCV;
	2254	info->value->u.riscv.pc = env->pc;
	2255	#else
	2256	info->value->arch = CPU_INFO_ARCH_OTHER;
	2257	#endif
	2258	info->value->has_props = !!mc->cpu_index_to_instance_props;
	2259	if (info->value->has_props) {
	2260	CpuInstanceProperties *props;
	2261	props = g_malloc0(sizeof(*props));
	2262	*props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
	2263	info->value->props = props;
	2264	}
	2265
	2266	/* XXX: waiting for the qapi to support GSList */
	2267	if (!cur_item) {
	2268	head = cur_item = info;
	2269	} else {
	2270	cur_item->next = info;
	2271	cur_item = info;
	2272	}
	2273	}
	2274
	2275	return head;
	2276	}
	2277
	2278	static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
	2279	{
	2280	/*
	2281	* The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
	2282	* TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
	2283	*/
	2284	switch (target) {
	2285	case SYS_EMU_TARGET_I386:
	2286	case SYS_EMU_TARGET_X86_64:
	2287	return CPU_INFO_ARCH_X86;
	2288
	2289	case SYS_EMU_TARGET_PPC:
	2290	case SYS_EMU_TARGET_PPC64:
	2291	return CPU_INFO_ARCH_PPC;
	2292
	2293	case SYS_EMU_TARGET_SPARC:
	2294	case SYS_EMU_TARGET_SPARC64:
	2295	return CPU_INFO_ARCH_SPARC;
	2296
	2297	case SYS_EMU_TARGET_MIPS:
	2298	case SYS_EMU_TARGET_MIPSEL:
	2299	case SYS_EMU_TARGET_MIPS64:
	2300	case SYS_EMU_TARGET_MIPS64EL:
	2301	return CPU_INFO_ARCH_MIPS;
	2302
	2303	case SYS_EMU_TARGET_TRICORE:
	2304	return CPU_INFO_ARCH_TRICORE;
	2305
	2306	case SYS_EMU_TARGET_S390X:
	2307	return CPU_INFO_ARCH_S390;
	2308
	2309	case SYS_EMU_TARGET_RISCV32:
	2310	case SYS_EMU_TARGET_RISCV64:
	2311	return CPU_INFO_ARCH_RISCV;
	2312
	2313	default:
	2314	return CPU_INFO_ARCH_OTHER;
	2315	}
	2316	}
	2317
	2318	static void cpustate_to_cpuinfo_s390(CpuInfoS390 info, const CPUState cpu)
	2319	{
	2320	#ifdef TARGET_S390X
	2321	S390CPU *s390_cpu = S390_CPU(cpu);
	2322	CPUS390XState *env = &s390_cpu->env;
	2323
	2324	info->cpu_state = env->cpu_state;
	2325	#else
	2326	abort();
	2327	#endif
	2328	}
	2329
	2330	/*
	2331	* fast means: we NEVER interrupt vCPU threads to retrieve
	2332	* information from KVM.
	2333	*/
	2334	CpuInfoFastList qmp_query_cpus_fast(Error *errp)
	2335	{
	2336	MachineState *ms = MACHINE(qdev_get_machine());
	2337	MachineClass *mc = MACHINE_GET_CLASS(ms);
	2338	CpuInfoFastList head = NULL, cur_item = NULL;
	2339	SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
	2340	-1, &error_abort);
	2341	CPUState *cpu;
	2342
	2343	CPU_FOREACH(cpu) {
	2344	CpuInfoFastList info = g_malloc0(sizeof(info));
	2345	info->value = g_malloc0(sizeof(*info->value));
	2346
	2347	info->value->cpu_index = cpu->cpu_index;
	2348	info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
	2349	info->value->thread_id = cpu->thread_id;
	2350
	2351	info->value->has_props = !!mc->cpu_index_to_instance_props;
	2352	if (info->value->has_props) {
	2353	CpuInstanceProperties *props;
	2354	props = g_malloc0(sizeof(*props));
	2355	*props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
	2356	info->value->props = props;
	2357	}
	2358
	2359	info->value->arch = sysemu_target_to_cpuinfo_arch(target);
	2360	info->value->target = target;
	2361	if (target == SYS_EMU_TARGET_S390X) {
	2362	cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
	2363	}
	2364
	2365	if (!cur_item) {
	2366	head = cur_item = info;
	2367	} else {
	2368	cur_item->next = info;
	2369	cur_item = info;
	2370	}
	2371	}
	2372
	2373	return head;
	2374	}
	2375
	2376	void qmp_memsave(int64_t addr, int64_t size, const char *filename,
	2377	bool has_cpu, int64_t cpu_index, Error **errp)
	2378	{
	2379	FILE *f;
	2380	uint32_t l;
	2381	CPUState *cpu;
	2382	uint8_t buf[1024];
	2383	int64_t orig_addr = addr, orig_size = size;
	2384
	2385	if (!has_cpu) {
	2386	cpu_index = 0;
	2387	}
	2388
	2389	cpu = qemu_get_cpu(cpu_index);
	2390	if (cpu == NULL) {
	2391	error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
	2392	"a CPU number");
	2393	return;
	2394	}
	2395
	2396	f = fopen(filename, "wb");
	2397	if (!f) {
	2398	error_setg_file_open(errp, errno, filename);
	2399	return;
	2400	}
	2401
	2402	while (size != 0) {
	2403	l = sizeof(buf);
	2404	if (l > size)
	2405	l = size;
	2406	if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
	2407	error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
	2408	" specified", orig_addr, orig_size);
	2409	goto exit;
	2410	}
	2411	if (fwrite(buf, 1, l, f) != l) {
	2412	error_setg(errp, QERR_IO_ERROR);
	2413	goto exit;
	2414	}
	2415	addr += l;
	2416	size -= l;
	2417	}
	2418
	2419	exit:
	2420	fclose(f);
	2421	}
	2422
	2423	void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
	2424	Error **errp)
	2425	{
	2426	FILE *f;
	2427	uint32_t l;
	2428	uint8_t buf[1024];
	2429
	2430	f = fopen(filename, "wb");
	2431	if (!f) {
	2432	error_setg_file_open(errp, errno, filename);
	2433	return;
	2434	}
	2435
	2436	while (size != 0) {
	2437	l = sizeof(buf);
	2438	if (l > size)
	2439	l = size;
	2440	cpu_physical_memory_read(addr, buf, l);
	2441	if (fwrite(buf, 1, l, f) != l) {
	2442	error_setg(errp, QERR_IO_ERROR);
	2443	goto exit;
	2444	}
	2445	addr += l;
	2446	size -= l;
	2447	}
	2448
	2449	exit:
	2450	fclose(f);
	2451	}
	2452
	2453	void qmp_inject_nmi(Error **errp)
	2454	{
	2455	nmi_monitor_handle(monitor_get_cpu_index(), errp);
	2456	}
	2457
	2458	void dump_drift_info(void)
	2459	{
	2460	if (!use_icount) {
	2461	return;
	2462	}
	2463
	2464	qemu_printf("Host - Guest clock %"PRIi64" ms\n",
	2465	(cpu_get_clock() - cpu_get_icount())/SCALE_MS);
	2466	if (icount_align_option) {
	2467	qemu_printf("Max guest delay %"PRIi64" ms\n",
	2468	-max_delay / SCALE_MS);
	2469	qemu_printf("Max guest advance %"PRIi64" ms\n",
	2470	max_advance / SCALE_MS);
	2471	} else {
	2472	qemu_printf("Max guest delay NA\n");
	2473	qemu_printf("Max guest advance NA\n");
	2474	}
	2475	}