Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* QEMU System Emulator
	3	*
	4	* Copyright (c) 2003-2008 Fabrice Bellard
	5	*
	6	* Permission is hereby granted, free of charge, to any person obtaining a copy
	7	* of this software and associated documentation files (the "Software"), to deal
	8	* in the Software without restriction, including without limitation the rights
	9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	10	* copies of the Software, and to permit persons to whom the Software is
	11	* furnished to do so, subject to the following conditions:
	12	*
	13	* The above copyright notice and this permission notice shall be included in
	14	* all copies or substantial portions of the Software.
	15	*
	16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	22	* THE SOFTWARE.
	23	*/
	24
	25	#include "qemu/osdep.h"
	26	#include "qemu/config-file.h"
	27	#include "cpu.h"
	28	#include "monitor/monitor.h"
	29	#include "qapi/error.h"
	30	#include "qapi/qapi-commands-misc.h"
	31	#include "qapi/qapi-events-run-state.h"
	32	#include "qapi/qmp/qerror.h"
	33	#include "qemu/error-report.h"
	34	#include "sysemu/sysemu.h"
	35	#include "sysemu/block-backend.h"
	36	#include "exec/gdbstub.h"
	37	#include "sysemu/dma.h"
	38	#include "sysemu/hw_accel.h"
	39	#include "sysemu/kvm.h"
	40	#include "sysemu/hax.h"
	41	#include "sysemu/hvf.h"
	42	#include "sysemu/whpx.h"
	43	#include "exec/exec-all.h"
	44
	45	#include "qemu/thread.h"
	46	#include "sysemu/cpus.h"
	47	#include "sysemu/qtest.h"
	48	#include "qemu/main-loop.h"
	49	#include "qemu/option.h"
	50	#include "qemu/bitmap.h"
	51	#include "qemu/seqlock.h"
	52	#include "tcg.h"
	53	#include "hw/nmi.h"
	54	#include "sysemu/replay.h"
	55	#include "hw/boards.h"
	56
	57	#ifdef CONFIG_LINUX
	58
	59	#include <sys/prctl.h>
	60
	61	#ifndef PR_MCE_KILL
	62	#define PR_MCE_KILL 33
	63	#endif
	64
	65	#ifndef PR_MCE_KILL_SET
	66	#define PR_MCE_KILL_SET 1
	67	#endif
	68
	69	#ifndef PR_MCE_KILL_EARLY
	70	#define PR_MCE_KILL_EARLY 1
	71	#endif
	72
	73	#endif /* CONFIG_LINUX */
	74
	75	int64_t max_delay;
	76	int64_t max_advance;
	77
	78	/* vcpu throttling controls */
	79	static QEMUTimer *throttle_timer;
	80	static unsigned int throttle_percentage;
	81
	82	#define CPU_THROTTLE_PCT_MIN 1
	83	#define CPU_THROTTLE_PCT_MAX 99
	84	#define CPU_THROTTLE_TIMESLICE_NS 10000000
	85
	86	bool cpu_is_stopped(CPUState *cpu)
	87	{
	88	return cpu->stopped \|\| !runstate_is_running();
	89	}
	90
	91	static bool cpu_thread_is_idle(CPUState *cpu)
	92	{
	93	if (cpu->stop \|\| cpu->queued_work_first) {
	94	return false;
	95	}
	96	if (cpu_is_stopped(cpu)) {
	97	return true;
	98	}
	99	if (!cpu->halted \|\| cpu_has_work(cpu) \|\|
	100	kvm_halt_in_kernel()) {
	101	return false;
	102	}
	103	return true;
	104	}
	105
	106	static bool all_cpu_threads_idle(void)
	107	{
	108	CPUState *cpu;
	109
	110	CPU_FOREACH(cpu) {
	111	if (!cpu_thread_is_idle(cpu)) {
	112	return false;
	113	}
	114	}
	115	return true;
	116	}
	117
	118	/***********************************************************/
	119	/* guest cycle counter */
	120
	121	/* Protected by TimersState seqlock */
	122
	123	static bool icount_sleep = true;
	124	/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
	125	#define MAX_ICOUNT_SHIFT 10
	126
/*
 * Bookkeeping for the host-tick counter, the VM clock and the
 * instruction counter (icount) together with the timers that adjust it.
 * A single instance, timers_state, is used by this file.
 */
typedef struct TimersState {
    /* Protected by BQL. */
    /* Last value handed out by cpu_get_ticks_locked(). */
    int64_t cpu_ticks_prev;
    /* Added to the host tick counter to form the guest-visible ticks. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero while ticks and the VM clock are advancing. */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks. */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed. */
    int64_t qemu_icount_bias;

    /* Start of a pending clock warp; -1 when no warp is pending. */
    int64_t vm_clock_warp_start;
    /* Added to get_clock() to form the VM clock while ticks are enabled. */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
	157
	158	static TimersState timers_state;
	159	bool mttcg_enabled;
	160
	161	/*
	162	* We default to false if we know other options have been enabled
	163	* which are currently incompatible with MTTCG. Otherwise when each
	164	* guest (target) has been updated to support:
	165	* - atomic instructions
	166	* - memory ordering primitives (barriers)
	167	* they can set the appropriate CONFIG flags in ${target}-softmmu.mak
	168	*
	169	* Once a guest architecture has been converted to the new primitives
	170	* there are two remaining limitations to check.
	171	*
	172	* - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
	173	* - The host must have a stronger memory order than the guest
	174	*
	175	* It may be possible in future to support strong guests on weak hosts
	176	* but that will require tagging all load/stores in a guest with their
	177	* implicit memory order requirements which would likely slow things
	178	* down a lot.
	179	*/
	180
	181	static bool check_tcg_memory_orders_compatible(void)
	182	{
	183	#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
	184	return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
	185	#else
	186	return false;
	187	#endif
	188	}
	189
	190	static bool default_mttcg_enabled(void)
	191	{
	192	if (use_icount \|\| TCG_OVERSIZED_GUEST) {
	193	return false;
	194	} else {
	195	#ifdef TARGET_SUPPORTS_MTTCG
	196	return check_tcg_memory_orders_compatible();
	197	#else
	198	return false;
	199	#endif
	200	}
	201	}
	202
	203	void qemu_tcg_configure(QemuOpts opts, Error *errp)
	204	{
	205	const char *t = qemu_opt_get(opts, "thread");
	206	if (t) {
	207	if (strcmp(t, "multi") == 0) {
	208	if (TCG_OVERSIZED_GUEST) {
	209	error_setg(errp, "No MTTCG when guest word size > hosts");
	210	} else if (use_icount) {
	211	error_setg(errp, "No MTTCG when icount is enabled");
	212	} else {
	213	#ifndef TARGET_SUPPORTS_MTTCG
	214	error_report("Guest not yet converted to MTTCG - "
	215	"you may get unexpected results");
	216	#endif
	217	if (!check_tcg_memory_orders_compatible()) {
	218	error_report("Guest expects a stronger memory ordering "
	219	"than the host provides");
	220	error_printf("This may cause strange/hard to debug errors\n");
	221	}
	222	mttcg_enabled = true;
	223	}
	224	} else if (strcmp(t, "single") == 0) {
	225	mttcg_enabled = false;
	226	} else {
	227	error_setg(errp, "Invalid 'thread' setting %s", t);
	228	}
	229	} else {
	230	mttcg_enabled = default_mttcg_enabled();
	231	}
	232	}
	233
	234	/* The current number of executed instructions is based on what we
	235	* originally budgeted minus the current state of the decrementing
	236	* icount counters in extra/u16.low.
	237	*/
	238	static int64_t cpu_get_icount_executed(CPUState *cpu)
	239	{
	240	return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
	241	}
	242
	243	/*
	244	* Update the global shared timer_state.qemu_icount to take into
	245	* account executed instructions. This is done by the TCG vCPU
	246	* thread so the main-loop can see time has moved forward.
	247	*/
	248	void cpu_update_icount(CPUState *cpu)
	249	{
	250	int64_t executed = cpu_get_icount_executed(cpu);
	251	cpu->icount_budget -= executed;
	252
	253	#ifndef CONFIG_ATOMIC64
	254	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	255	&timers_state.vm_clock_lock);
	256	#endif
	257	atomic_set__nocheck(&timers_state.qemu_icount,
	258	timers_state.qemu_icount + executed);
	259	#ifndef CONFIG_ATOMIC64
	260	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	261	&timers_state.vm_clock_lock);
	262	#endif
	263	}
	264
	265	static int64_t cpu_get_icount_raw_locked(void)
	266	{
	267	CPUState *cpu = current_cpu;
	268
	269	if (cpu && cpu->running) {
	270	if (!cpu->can_do_io) {
	271	error_report("Bad icount read");
	272	exit(1);
	273	}
	274	/* Take into account what has run */
	275	cpu_update_icount(cpu);
	276	}
	277	/* The read is protected by the seqlock, so __nocheck is okay. */
	278	return atomic_read__nocheck(&timers_state.qemu_icount);
	279	}
	280
	281	static int64_t cpu_get_icount_locked(void)
	282	{
	283	int64_t icount = cpu_get_icount_raw_locked();
	284	return atomic_read__nocheck(&timers_state.qemu_icount_bias) + cpu_icount_to_ns(icount);
	285	}
	286
	287	int64_t cpu_get_icount_raw(void)
	288	{
	289	int64_t icount;
	290	unsigned start;
	291
	292	do {
	293	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	294	icount = cpu_get_icount_raw_locked();
	295	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	296
	297	return icount;
	298	}
	299
	300	/* Return the virtual CPU time, based on the instruction counter. */
	301	int64_t cpu_get_icount(void)
	302	{
	303	int64_t icount;
	304	unsigned start;
	305
	306	do {
	307	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	308	icount = cpu_get_icount_locked();
	309	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	310
	311	return icount;
	312	}
	313
	314	int64_t cpu_icount_to_ns(int64_t icount)
	315	{
	316	return icount << atomic_read(&timers_state.icount_time_shift);
	317	}
	318
	319	static int64_t cpu_get_ticks_locked(void)
	320	{
	321	int64_t ticks = timers_state.cpu_ticks_offset;
	322	if (timers_state.cpu_ticks_enabled) {
	323	ticks += cpu_get_host_ticks();
	324	}
	325
	326	if (timers_state.cpu_ticks_prev > ticks) {
	327	/* Non increasing ticks may happen if the host uses software suspend. */
	328	timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
	329	ticks = timers_state.cpu_ticks_prev;
	330	}
	331
	332	timers_state.cpu_ticks_prev = ticks;
	333	return ticks;
	334	}
	335
	336	/* return the time elapsed in VM between vm_start and vm_stop. Unless
	337	* icount is active, cpu_get_ticks() uses units of the host CPU cycle
	338	* counter.
	339	*/
	340	int64_t cpu_get_ticks(void)
	341	{
	342	int64_t ticks;
	343
	344	if (use_icount) {
	345	return cpu_get_icount();
	346	}
	347
	348	qemu_spin_lock(&timers_state.vm_clock_lock);
	349	ticks = cpu_get_ticks_locked();
	350	qemu_spin_unlock(&timers_state.vm_clock_lock);
	351	return ticks;
	352	}
	353
	354	static int64_t cpu_get_clock_locked(void)
	355	{
	356	int64_t time;
	357
	358	time = timers_state.cpu_clock_offset;
	359	if (timers_state.cpu_ticks_enabled) {
	360	time += get_clock();
	361	}
	362
	363	return time;
	364	}
	365
	366	/* Return the monotonic time elapsed in VM, i.e.,
	367	* the time between vm_start and vm_stop
	368	*/
	369	int64_t cpu_get_clock(void)
	370	{
	371	int64_t ti;
	372	unsigned start;
	373
	374	do {
	375	start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	376	ti = cpu_get_clock_locked();
	377	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
	378
	379	return ti;
	380	}
	381
	382	/* enable cpu_get_ticks()
	383	* Caller must hold BQL which serves as mutex for vm_clock_seqlock.
	384	*/
	385	void cpu_enable_ticks(void)
	386	{
	387	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	388	&timers_state.vm_clock_lock);
	389	if (!timers_state.cpu_ticks_enabled) {
	390	timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
	391	timers_state.cpu_clock_offset -= get_clock();
	392	timers_state.cpu_ticks_enabled = 1;
	393	}
	394	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	395	&timers_state.vm_clock_lock);
	396	}
	397
	398	/* disable cpu_get_ticks() : the clock is stopped. You must not call
	399	* cpu_get_ticks() after that.
	400	* Caller must hold BQL which serves as mutex for vm_clock_seqlock.
	401	*/
	402	void cpu_disable_ticks(void)
	403	{
	404	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	405	&timers_state.vm_clock_lock);
	406	if (timers_state.cpu_ticks_enabled) {
	407	timers_state.cpu_ticks_offset += cpu_get_host_ticks();
	408	timers_state.cpu_clock_offset = cpu_get_clock_locked();
	409	timers_state.cpu_ticks_enabled = 0;
	410	}
	411	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	412	&timers_state.vm_clock_lock);
	413	}
	414
	415	/* Correlation between real and virtual time is always going to be
	416	fairly approximate, so ignore small variation.
	417	When the guest is idle real and virtual time will be aligned in
	418	the IO wait loop. */
	419	#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
	420
	421	static void icount_adjust(void)
	422	{
	423	int64_t cur_time;
	424	int64_t cur_icount;
	425	int64_t delta;
	426
	427	/* Protected by TimersState mutex. */
	428	static int64_t last_delta;
	429
	430	/* If the VM is not running, then do nothing. */
	431	if (!runstate_is_running()) {
	432	return;
	433	}
	434
	435	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	436	&timers_state.vm_clock_lock);
	437	cur_time = cpu_get_clock_locked();
	438	cur_icount = cpu_get_icount_locked();
	439
	440	delta = cur_icount - cur_time;
	441	/* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
	442	if (delta > 0
	443	&& last_delta + ICOUNT_WOBBLE < delta * 2
	444	&& timers_state.icount_time_shift > 0) {
	445	/* The guest is getting too far ahead. Slow time down. */
	446	atomic_set(&timers_state.icount_time_shift,
	447	timers_state.icount_time_shift - 1);
	448	}
	449	if (delta < 0
	450	&& last_delta - ICOUNT_WOBBLE > delta * 2
	451	&& timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
	452	/* The guest is getting too far behind. Speed time up. */
	453	atomic_set(&timers_state.icount_time_shift,
	454	timers_state.icount_time_shift + 1);
	455	}
	456	last_delta = delta;
	457	atomic_set__nocheck(&timers_state.qemu_icount_bias,
	458	cur_icount - (timers_state.qemu_icount
	459	<< timers_state.icount_time_shift));
	460	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	461	&timers_state.vm_clock_lock);
	462	}
	463
	464	static void icount_adjust_rt(void *opaque)
	465	{
	466	timer_mod(timers_state.icount_rt_timer,
	467	qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
	468	icount_adjust();
	469	}
	470
	471	static void icount_adjust_vm(void *opaque)
	472	{
	473	timer_mod(timers_state.icount_vm_timer,
	474	qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
	475	NANOSECONDS_PER_SECOND / 10);
	476	icount_adjust();
	477	}
	478
	479	static int64_t qemu_icount_round(int64_t count)
	480	{
	481	int shift = atomic_read(&timers_state.icount_time_shift);
	482	return (count + (1 << shift) - 1) >> shift;
	483	}
	484
	485	static void icount_warp_rt(void)
	486	{
	487	unsigned seq;
	488	int64_t warp_start;
	489
	490	/* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
	491	* changes from -1 to another value, so the race here is okay.
	492	*/
	493	do {
	494	seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
	495	warp_start = timers_state.vm_clock_warp_start;
	496	} while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
	497
	498	if (warp_start == -1) {
	499	return;
	500	}
	501
	502	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	503	&timers_state.vm_clock_lock);
	504	if (runstate_is_running()) {
	505	int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
	506	cpu_get_clock_locked());
	507	int64_t warp_delta;
	508
	509	warp_delta = clock - timers_state.vm_clock_warp_start;
	510	if (use_icount == 2) {
	511	/*
	512	* In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
	513	* far ahead of real time.
	514	*/
	515	int64_t cur_icount = cpu_get_icount_locked();
	516	int64_t delta = clock - cur_icount;
	517	warp_delta = MIN(warp_delta, delta);
	518	}
	519	atomic_set__nocheck(&timers_state.qemu_icount_bias,
	520	timers_state.qemu_icount_bias + warp_delta);
	521	}
	522	timers_state.vm_clock_warp_start = -1;
	523	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	524	&timers_state.vm_clock_lock);
	525
	526	if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
	527	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	528	}
	529	}
	530
	531	static void icount_timer_cb(void *opaque)
	532	{
	533	/* No need for a checkpoint because the timer already synchronizes
	534	* with CHECKPOINT_CLOCK_VIRTUAL_RT.
	535	*/
	536	icount_warp_rt();
	537	}
	538
	539	void qtest_clock_warp(int64_t dest)
	540	{
	541	int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
	542	AioContext *aio_context;
	543	assert(qtest_enabled());
	544	aio_context = qemu_get_aio_context();
	545	while (clock < dest) {
	546	int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	547	int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
	548
	549	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	550	&timers_state.vm_clock_lock);
	551	atomic_set__nocheck(&timers_state.qemu_icount_bias,
	552	timers_state.qemu_icount_bias + warp);
	553	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	554	&timers_state.vm_clock_lock);
	555
	556	qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
	557	timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
	558	clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
	559	}
	560	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	561	}
	562
	563	void qemu_start_warp_timer(void)
	564	{
	565	int64_t clock;
	566	int64_t deadline;
	567
	568	if (!use_icount) {
	569	return;
	570	}
	571
	572	/* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
	573	* do not fire, so computing the deadline does not make sense.
	574	*/
	575	if (!runstate_is_running()) {
	576	return;
	577	}
	578
	579	/* warp clock deterministically in record/replay mode */
	580	if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
	581	return;
	582	}
	583
	584	if (!all_cpu_threads_idle()) {
	585	return;
	586	}
	587
	588	if (qtest_enabled()) {
	589	/* When testing, qtest commands advance icount. */
	590	return;
	591	}
	592
	593	/* We want to use the earliest deadline from ALL vm_clocks */
	594	clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
	595	deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	596	if (deadline < 0) {
	597	static bool notified;
	598	if (!icount_sleep && !notified) {
	599	warn_report("icount sleep disabled and no active timers");
	600	notified = true;
	601	}
	602	return;
	603	}
	604
	605	if (deadline > 0) {
	606	/*
	607	* Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
	608	* sleep. Otherwise, the CPU might be waiting for a future timer
	609	* interrupt to wake it up, but the interrupt never comes because
	610	* the vCPU isn't running any insns and thus doesn't advance the
	611	* QEMU_CLOCK_VIRTUAL.
	612	*/
	613	if (!icount_sleep) {
	614	/*
	615	* We never let VCPUs sleep in no sleep icount mode.
	616	* If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
	617	* to the next QEMU_CLOCK_VIRTUAL event and notify it.
	618	* It is useful when we want a deterministic execution time,
	619	* isolated from host latencies.
	620	*/
	621	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	622	&timers_state.vm_clock_lock);
	623	atomic_set__nocheck(&timers_state.qemu_icount_bias,
	624	timers_state.qemu_icount_bias + deadline);
	625	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	626	&timers_state.vm_clock_lock);
	627	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	628	} else {
	629	/*
	630	* We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
	631	* "real" time, (related to the time left until the next event) has
	632	* passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
	633	* This avoids that the warps are visible externally; for example,
	634	* you will not be sending network packets continuously instead of
	635	* every 100ms.
	636	*/
	637	seqlock_write_lock(&timers_state.vm_clock_seqlock,
	638	&timers_state.vm_clock_lock);
	639	if (timers_state.vm_clock_warp_start == -1
	640	\|\| timers_state.vm_clock_warp_start > clock) {
	641	timers_state.vm_clock_warp_start = clock;
	642	}
	643	seqlock_write_unlock(&timers_state.vm_clock_seqlock,
	644	&timers_state.vm_clock_lock);
	645	timer_mod_anticipate(timers_state.icount_warp_timer,
	646	clock + deadline);
	647	}
	648	} else if (deadline == 0) {
	649	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	650	}
	651	}
	652
	653	static void qemu_account_warp_timer(void)
	654	{
	655	if (!use_icount \|\| !icount_sleep) {
	656	return;
	657	}
	658
	659	/* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
	660	* do not fire, so computing the deadline does not make sense.
	661	*/
	662	if (!runstate_is_running()) {
	663	return;
	664	}
	665
	666	/* warp clock deterministically in record/replay mode */
	667	if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
	668	return;
	669	}
	670
	671	timer_del(timers_state.icount_warp_timer);
	672	icount_warp_rt();
	673	}
	674
	675	static bool icount_state_needed(void *opaque)
	676	{
	677	return use_icount;
	678	}
	679
	680	static bool warp_timer_state_needed(void *opaque)
	681	{
	682	TimersState *s = opaque;
	683	return s->icount_warp_timer != NULL;
	684	}
	685
	686	static bool adjust_timers_state_needed(void *opaque)
	687	{
	688	TimersState *s = opaque;
	689	return s->icount_rt_timer != NULL;
	690	}
	691
	692	/*
	693	* Subsection for warp timer migration is optional, because the warp timer may not be created
	694	*/
/*
 * Migrated only when warp_timer_state_needed() reports that the warp
 * timer exists; carries the pending warp start and the timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
	706
/*
 * Migrated only when adjust_timers_state_needed() reports that the
 * icount adjustment timers exist; carries both timers.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
	718
	719	/*
	720	* This is a subsection for icount migration.
	721	*/
/*
 * icount state: the bias and the instruction counter, plus the optional
 * warp-timer and adjust-timer subsections.  Sent only when
 * icount_state_needed() is true.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
	738
/*
 * Top-level "timer" migration section registered by cpu_ticks_init():
 * the tick offset, 8 unused bytes, and (from version 2) the clock
 * offset, followed by the icount subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
	754
	755	static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
	756	{
	757	double pct;
	758	double throttle_ratio;
	759	long sleeptime_ns;
	760
	761	if (!cpu_throttle_get_percentage()) {
	762	return;
	763	}
	764
	765	pct = (double)cpu_throttle_get_percentage()/100;
	766	throttle_ratio = pct / (1 - pct);
	767	sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
	768
	769	qemu_mutex_unlock_iothread();
	770	g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
	771	qemu_mutex_lock_iothread();
	772	atomic_set(&cpu->throttle_thread_scheduled, 0);
	773	}
	774
	775	static void cpu_throttle_timer_tick(void *opaque)
	776	{
	777	CPUState *cpu;
	778	double pct;
	779
	780	/* Stop the timer if needed */
	781	if (!cpu_throttle_get_percentage()) {
	782	return;
	783	}
	784	CPU_FOREACH(cpu) {
	785	if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
	786	async_run_on_cpu(cpu, cpu_throttle_thread,
	787	RUN_ON_CPU_NULL);
	788	}
	789	}
	790
	791	pct = (double)cpu_throttle_get_percentage()/100;
	792	timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
	793	CPU_THROTTLE_TIMESLICE_NS / (1-pct));
	794	}
	795
	796	void cpu_throttle_set(int new_throttle_pct)
	797	{
	798	/* Ensure throttle percentage is within valid range */
	799	new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
	800	new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
	801
	802	atomic_set(&throttle_percentage, new_throttle_pct);
	803
	804	timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
	805	CPU_THROTTLE_TIMESLICE_NS);
	806	}
	807
	808	void cpu_throttle_stop(void)
	809	{
	810	atomic_set(&throttle_percentage, 0);
	811	}
	812
	813	bool cpu_throttle_active(void)
	814	{
	815	return (cpu_throttle_get_percentage() != 0);
	816	}
	817
	818	int cpu_throttle_get_percentage(void)
	819	{
	820	return atomic_read(&throttle_percentage);
	821	}
	822
	823	void cpu_ticks_init(void)
	824	{
	825	seqlock_init(&timers_state.vm_clock_seqlock);
	826	vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
	827	throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
	828	cpu_throttle_timer_tick, NULL);
	829	}
	830
	831	void configure_icount(QemuOpts opts, Error *errp)
	832	{
	833	const char *option;
	834	char *rem_str = NULL;
	835
	836	option = qemu_opt_get(opts, "shift");
	837	if (!option) {
	838	if (qemu_opt_get(opts, "align") != NULL) {
	839	error_setg(errp, "Please specify shift option when using align");
	840	}
	841	return;
	842	}
	843
	844	icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
	845	if (icount_sleep) {
	846	timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
	847	icount_timer_cb, NULL);
	848	}
	849
	850	icount_align_option = qemu_opt_get_bool(opts, "align", false);
	851
	852	if (icount_align_option && !icount_sleep) {
	853	error_setg(errp, "align=on and sleep=off are incompatible");
	854	}
	855	if (strcmp(option, "auto") != 0) {
	856	errno = 0;
	857	timers_state.icount_time_shift = strtol(option, &rem_str, 0);
	858	if (errno != 0 \|\| *rem_str != '\0' \|\| !strlen(option)) {
	859	error_setg(errp, "icount: Invalid shift value");
	860	}
	861	use_icount = 1;
	862	return;
	863	} else if (icount_align_option) {
	864	error_setg(errp, "shift=auto and align=on are incompatible");
	865	} else if (!icount_sleep) {
	866	error_setg(errp, "shift=auto and sleep=off are incompatible");
	867	}
	868
	869	use_icount = 2;
	870
	871	/* 125MIPS seems a reasonable initial guess at the guest speed.
	872	It will be corrected fairly quickly anyway. */
	873	timers_state.icount_time_shift = 3;
	874
	875	/* Have both realtime and virtual time triggers for speed adjustment.
	876	The realtime trigger catches emulated time passing too slowly,
	877	the virtual time trigger catches emulated time passing too fast.
	878	Realtime triggers occur even when idle, so use them less frequently
	879	than VM triggers. */
	880	timers_state.vm_clock_warp_start = -1;
	881	timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
	882	icount_adjust_rt, NULL);
	883	timer_mod(timers_state.icount_rt_timer,
	884	qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
	885	timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
	886	icount_adjust_vm, NULL);
	887	timer_mod(timers_state.icount_vm_timer,
	888	qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
	889	NANOSECONDS_PER_SECOND / 10);
	890	}
	891
	892	/***********************************************************/
	893	/* TCG vCPU kick timer
	894	*
	895	* The kick timer is responsible for moving single threaded vCPU
	896	* emulation on to the next vCPU. If more than one vCPU is running a
	897	* timer event will force a cpu->exit so the next vCPU can get
	898	* scheduled.
	899	*
	900	* The timer is removed if all vCPUs are idle and restarted again once
	901	* idleness is complete.
	902	*/
	903
	904	static QEMUTimer *tcg_kick_vcpu_timer;
	905	static CPUState *tcg_current_rr_cpu;
	906
	907	#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
	908
	909	static inline int64_t qemu_tcg_next_kick(void)
	910	{
	911	return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
	912	}
	913
	914	/* Kick the currently round-robin scheduled vCPU */
	915	static void qemu_cpu_kick_rr_cpu(void)
	916	{
	917	CPUState *cpu;
	918	do {
	919	cpu = atomic_mb_read(&tcg_current_rr_cpu);
	920	if (cpu) {
	921	cpu_exit(cpu);
	922	}
	923	} while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
	924	}
	925
	926	static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
	927	{
	928	}
	929
	930	void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
	931	{
	932	if (!use_icount \|\| type != QEMU_CLOCK_VIRTUAL) {
	933	qemu_notify_event();
	934	return;
	935	}
	936
	937	if (qemu_in_vcpu_thread()) {
	938	/* A CPU is currently running; kick it back out to the
	939	* tcg_cpu_exec() loop so it will recalculate its
	940	* icount deadline immediately.
	941	*/
	942	qemu_cpu_kick(current_cpu);
	943	} else if (first_cpu) {
	944	/* qemu_cpu_kick is not enough to kick a halted CPU out of
	945	* qemu_tcg_wait_io_event. async_run_on_cpu, instead,
	946	* causes cpu_thread_is_idle to return false. This way,
	947	* handle_icount_deadline can run.
	948	* If we have no CPUs at all for some reason, we don't
	949	* need to do anything.
	950	*/
	951	async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
	952	}
	953	}
	954
	955	static void kick_tcg_thread(void *opaque)
	956	{
	957	timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
	958	qemu_cpu_kick_rr_cpu();
	959	}
	960
	961	static void start_tcg_kick_timer(void)
	962	{
	963	assert(!mttcg_enabled);
	964	if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
	965	tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
	966	kick_tcg_thread, NULL);
	967	timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
	968	}
	969	}
	970
	971	static void stop_tcg_kick_timer(void)
	972	{
	973	assert(!mttcg_enabled);
	974	if (tcg_kick_vcpu_timer) {
	975	timer_del(tcg_kick_vcpu_timer);
	976	tcg_kick_vcpu_timer = NULL;
	977	}
	978	}
	979
	980	/***********************************************************/
	981	void hw_error(const char *fmt, ...)
	982	{
	983	va_list ap;
	984	CPUState *cpu;
	985
	986	va_start(ap, fmt);
	987	fprintf(stderr, "qemu: hardware error: ");
	988	vfprintf(stderr, fmt, ap);
	989	fprintf(stderr, "\n");
	990	CPU_FOREACH(cpu) {
	991	fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
	992	cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
	993	}
	994	va_end(ap);
	995	abort();
	996	}
	997
	998	void cpu_synchronize_all_states(void)
	999	{
	1000	CPUState *cpu;
	1001
	1002	CPU_FOREACH(cpu) {
	1003	cpu_synchronize_state(cpu);
	1004	/* TODO: move to cpu_synchronize_state() */
	1005	if (hvf_enabled()) {
	1006	hvf_cpu_synchronize_state(cpu);
	1007	}
	1008	}
	1009	}
	1010
	1011	void cpu_synchronize_all_post_reset(void)
	1012	{
	1013	CPUState *cpu;
	1014
	1015	CPU_FOREACH(cpu) {
	1016	cpu_synchronize_post_reset(cpu);
	1017	/* TODO: move to cpu_synchronize_post_reset() */
	1018	if (hvf_enabled()) {
	1019	hvf_cpu_synchronize_post_reset(cpu);
	1020	}
	1021	}
	1022	}
	1023
	1024	void cpu_synchronize_all_post_init(void)
	1025	{
	1026	CPUState *cpu;
	1027
	1028	CPU_FOREACH(cpu) {
	1029	cpu_synchronize_post_init(cpu);
	1030	/* TODO: move to cpu_synchronize_post_init() */
	1031	if (hvf_enabled()) {
	1032	hvf_cpu_synchronize_post_init(cpu);
	1033	}
	1034	}
	1035	}
	1036
	1037	void cpu_synchronize_all_pre_loadvm(void)
	1038	{
	1039	CPUState *cpu;
	1040
	1041	CPU_FOREACH(cpu) {
	1042	cpu_synchronize_pre_loadvm(cpu);
	1043	}
	1044	}
	1045
	1046	static int do_vm_stop(RunState state, bool send_stop)
	1047	{
	1048	int ret = 0;
	1049
	1050	if (runstate_is_running()) {
	1051	cpu_disable_ticks();
	1052	pause_all_vcpus();
	1053	runstate_set(state);
	1054	vm_state_notify(0, state);
	1055	if (send_stop) {
	1056	qapi_event_send_stop();
	1057	}
	1058	}
	1059
	1060	bdrv_drain_all();
	1061	replay_disable_events();
	1062	ret = bdrv_flush_all();
	1063
	1064	return ret;
	1065	}
	1066
	1067	/* Special vm_stop() variant for terminating the process. Historically clients
	1068	* did not expect a QMP STOP event and so we need to retain compatibility.
	1069	*/
	1070	int vm_shutdown(void)
	1071	{
	1072	return do_vm_stop(RUN_STATE_SHUTDOWN, false);
	1073	}
	1074
	1075	static bool cpu_can_run(CPUState *cpu)
	1076	{
	1077	if (cpu->stop) {
	1078	return false;
	1079	}
	1080	if (cpu_is_stopped(cpu)) {
	1081	return false;
	1082	}
	1083	return true;
	1084	}
	1085
	1086	static void cpu_handle_guest_debug(CPUState *cpu)
	1087	{
	1088	gdb_set_stop_cpu(cpu);
	1089	qemu_system_debug_request();
	1090	cpu->stopped = true;
	1091	}
	1092
	1093	#ifdef CONFIG_LINUX
	1094	static void sigbus_reraise(void)
	1095	{
	1096	sigset_t set;
	1097	struct sigaction action;
	1098
	1099	memset(&action, 0, sizeof(action));
	1100	action.sa_handler = SIG_DFL;
	1101	if (!sigaction(SIGBUS, &action, NULL)) {
	1102	raise(SIGBUS);
	1103	sigemptyset(&set);
	1104	sigaddset(&set, SIGBUS);
	1105	pthread_sigmask(SIG_UNBLOCK, &set, NULL);
	1106	}
	1107	perror("Failed to re-raise SIGBUS!\n");
	1108	abort();
	1109	}
	1110
	1111	static void sigbus_handler(int n, siginfo_t siginfo, void ctx)
	1112	{
	1113	if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
	1114	sigbus_reraise();
	1115	}
	1116
	1117	if (current_cpu) {
	1118	/* Called asynchronously in VCPU thread. */
	1119	if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
	1120	sigbus_reraise();
	1121	}
	1122	} else {
	1123	/* Called synchronously (via signalfd) in main thread. */
	1124	if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
	1125	sigbus_reraise();
	1126	}
	1127	}
	1128	}
	1129
	1130	static void qemu_init_sigbus(void)
	1131	{
	1132	struct sigaction action;
	1133
	1134	memset(&action, 0, sizeof(action));
	1135	action.sa_flags = SA_SIGINFO;
	1136	action.sa_sigaction = sigbus_handler;
	1137	sigaction(SIGBUS, &action, NULL);
	1138
	1139	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
	1140	}
	1141	#else /* !CONFIG_LINUX */
	/* No SIGBUS setup is done when CONFIG_LINUX is not defined. */
	static void qemu_init_sigbus(void)
	{
	    /* intentionally empty */
	}
	1145	#endif /* !CONFIG_LINUX */
	1146
	1147	static QemuMutex qemu_global_mutex;
	1148
	1149	static QemuThread io_thread;
	1150
	1151	/* cpu creation */
	1152	static QemuCond qemu_cpu_cond;
	1153	/* system init */
	1154	static QemuCond qemu_pause_cond;
	1155
	1156	void qemu_init_cpu_loop(void)
	1157	{
	1158	qemu_init_sigbus();
	1159	qemu_cond_init(&qemu_cpu_cond);
	1160	qemu_cond_init(&qemu_pause_cond);
	1161	qemu_mutex_init(&qemu_global_mutex);
	1162
	1163	qemu_thread_get_self(&io_thread);
	1164	}
	1165
	1166	void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
	1167	{
	1168	do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
	1169	}
	1170
	1171	static void qemu_kvm_destroy_vcpu(CPUState *cpu)
	1172	{
	1173	if (kvm_destroy_vcpu(cpu) < 0) {
	1174	error_report("kvm_destroy_vcpu failed");
	1175	exit(EXIT_FAILURE);
	1176	}
	1177	}
	1178
	1179	static void qemu_tcg_destroy_vcpu(CPUState *cpu)
	1180	{
	1181	}
	1182
	1183	static void qemu_cpu_stop(CPUState *cpu, bool exit)
	1184	{
	1185	g_assert(qemu_cpu_is_self(cpu));
	1186	cpu->stop = false;
	1187	cpu->stopped = true;
	1188	if (exit) {
	1189	cpu_exit(cpu);
	1190	}
	1191	qemu_cond_broadcast(&qemu_pause_cond);
	1192	}
	1193
	1194	static void qemu_wait_io_event_common(CPUState *cpu)
	1195	{
	1196	atomic_mb_set(&cpu->thread_kicked, false);
	1197	if (cpu->stop) {
	1198	qemu_cpu_stop(cpu, false);
	1199	}
	1200	process_queued_cpu_work(cpu);
	1201	}
	1202
	1203	static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
	1204	{
	1205	while (all_cpu_threads_idle()) {
	1206	stop_tcg_kick_timer();
	1207	qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
	1208	}
	1209
	1210	start_tcg_kick_timer();
	1211
	1212	qemu_wait_io_event_common(cpu);
	1213	}
	1214
	1215	static void qemu_wait_io_event(CPUState *cpu)
	1216	{
	1217	while (cpu_thread_is_idle(cpu)) {
	1218	qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
	1219	}
	1220
	1221	#ifdef _WIN32
	1222	/* Eat dummy APC queued by qemu_cpu_kick_thread. */
	1223	if (!tcg_enabled()) {
	1224	SleepEx(0, TRUE);
	1225	}
	1226	#endif
	1227	qemu_wait_io_event_common(cpu);
	1228	}
	1229
	1230	static void qemu_kvm_cpu_thread_fn(void arg)
	1231	{
	1232	CPUState *cpu = arg;
	1233	int r;
	1234
	1235	rcu_register_thread();
	1236
	1237	qemu_mutex_lock_iothread();
	1238	qemu_thread_get_self(cpu->thread);
	1239	cpu->thread_id = qemu_get_thread_id();
	1240	cpu->can_do_io = 1;
	1241	current_cpu = cpu;
	1242
	1243	r = kvm_init_vcpu(cpu);
	1244	if (r < 0) {
	1245	error_report("kvm_init_vcpu failed: %s", strerror(-r));
	1246	exit(1);
	1247	}
	1248
	1249	kvm_init_cpu_signals(cpu);
	1250
	1251	/* signal CPU creation */
	1252	cpu->created = true;
	1253	qemu_cond_signal(&qemu_cpu_cond);
	1254
	1255	do {
	1256	if (cpu_can_run(cpu)) {
	1257	r = kvm_cpu_exec(cpu);
	1258	if (r == EXCP_DEBUG) {
	1259	cpu_handle_guest_debug(cpu);
	1260	}
	1261	}
	1262	qemu_wait_io_event(cpu);
	1263	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1264
	1265	qemu_kvm_destroy_vcpu(cpu);
	1266	cpu->created = false;
	1267	qemu_cond_signal(&qemu_cpu_cond);
	1268	qemu_mutex_unlock_iothread();
	1269	rcu_unregister_thread();
	1270	return NULL;
	1271	}
	1272
	1273	static void qemu_dummy_cpu_thread_fn(void arg)
	1274	{
	1275	#ifdef _WIN32
	1276	error_report("qtest is not supported under Windows");
	1277	exit(1);
	1278	#else
	1279	CPUState *cpu = arg;
	1280	sigset_t waitset;
	1281	int r;
	1282
	1283	rcu_register_thread();
	1284
	1285	qemu_mutex_lock_iothread();
	1286	qemu_thread_get_self(cpu->thread);
	1287	cpu->thread_id = qemu_get_thread_id();
	1288	cpu->can_do_io = 1;
	1289	current_cpu = cpu;
	1290
	1291	sigemptyset(&waitset);
	1292	sigaddset(&waitset, SIG_IPI);
	1293
	1294	/* signal CPU creation */
	1295	cpu->created = true;
	1296	qemu_cond_signal(&qemu_cpu_cond);
	1297
	1298	do {
	1299	qemu_mutex_unlock_iothread();
	1300	do {
	1301	int sig;
	1302	r = sigwait(&waitset, &sig);
	1303	} while (r == -1 && (errno == EAGAIN \|\| errno == EINTR));
	1304	if (r == -1) {
	1305	perror("sigwait");
	1306	exit(1);
	1307	}
	1308	qemu_mutex_lock_iothread();
	1309	qemu_wait_io_event(cpu);
	1310	} while (!cpu->unplug);
	1311
	1312	rcu_unregister_thread();
	1313	return NULL;
	1314	#endif
	1315	}
	1316
	1317	static int64_t tcg_get_icount_limit(void)
	1318	{
	1319	int64_t deadline;
	1320
	1321	if (replay_mode != REPLAY_MODE_PLAY) {
	1322	deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	1323
	1324	/* Maintain prior (possibly buggy) behaviour where if no deadline
	1325	* was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
	1326	* INT32_MAX nanoseconds ahead, we still use INT32_MAX
	1327	* nanoseconds.
	1328	*/
	1329	if ((deadline < 0) \|\| (deadline > INT32_MAX)) {
	1330	deadline = INT32_MAX;
	1331	}
	1332
	1333	return qemu_icount_round(deadline);
	1334	} else {
	1335	return replay_get_instructions();
	1336	}
	1337	}
	1338
	1339	static void handle_icount_deadline(void)
	1340	{
	1341	assert(qemu_in_vcpu_thread());
	1342	if (use_icount) {
	1343	int64_t deadline =
	1344	qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
	1345
	1346	if (deadline == 0) {
	1347	/* Wake up other AioContexts. */
	1348	qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
	1349	qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
	1350	}
	1351	}
	1352	}
	1353
	1354	static void prepare_icount_for_run(CPUState *cpu)
	1355	{
	1356	if (use_icount) {
	1357	int insns_left;
	1358
	1359	/* These should always be cleared by process_icount_data after
	1360	* each vCPU execution. However u16.high can be raised
	1361	* asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
	1362	*/
	1363	g_assert(cpu->icount_decr.u16.low == 0);
	1364	g_assert(cpu->icount_extra == 0);
	1365
	1366	cpu->icount_budget = tcg_get_icount_limit();
	1367	insns_left = MIN(0xffff, cpu->icount_budget);
	1368	cpu->icount_decr.u16.low = insns_left;
	1369	cpu->icount_extra = cpu->icount_budget - insns_left;
	1370
	1371	replay_mutex_lock();
	1372	}
	1373	}
	1374
	1375	static void process_icount_data(CPUState *cpu)
	1376	{
	1377	if (use_icount) {
	1378	/* Account for executed instructions */
	1379	cpu_update_icount(cpu);
	1380
	1381	/* Reset the counters */
	1382	cpu->icount_decr.u16.low = 0;
	1383	cpu->icount_extra = 0;
	1384	cpu->icount_budget = 0;
	1385
	1386	replay_account_executed_instructions();
	1387
	1388	replay_mutex_unlock();
	1389	}
	1390	}
	1391
	1392
	1393	static int tcg_cpu_exec(CPUState *cpu)
	1394	{
	1395	int ret;
	1396	#ifdef CONFIG_PROFILER
	1397	int64_t ti;
	1398	#endif
	1399
	1400	assert(tcg_enabled());
	1401	#ifdef CONFIG_PROFILER
	1402	ti = profile_getclock();
	1403	#endif
	1404	cpu_exec_start(cpu);
	1405	ret = cpu_exec(cpu);
	1406	cpu_exec_end(cpu);
	1407	#ifdef CONFIG_PROFILER
	1408	tcg_time += profile_getclock() - ti;
	1409	#endif
	1410	return ret;
	1411	}
	1412
	1413	/* Destroy any remaining vCPUs which have been unplugged and have
	1414	* finished running
	1415	*/
	1416	static void deal_with_unplugged_cpus(void)
	1417	{
	1418	CPUState *cpu;
	1419
	1420	CPU_FOREACH(cpu) {
	1421	if (cpu->unplug && !cpu_can_run(cpu)) {
	1422	qemu_tcg_destroy_vcpu(cpu);
	1423	cpu->created = false;
	1424	qemu_cond_signal(&qemu_cpu_cond);
	1425	break;
	1426	}
	1427	}
	1428	}
	1429
	1430	/* Single-threaded TCG
	1431	*
	1432	* In the single-threaded case each vCPU is simulated in turn. If
	1433	* there is more than a single vCPU we create a simple timer to kick
	1434	* the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
	1435	* This is done explicitly rather than relying on side-effects
	1436	* elsewhere.
	1437	*/
	1438
	1439	static void qemu_tcg_rr_cpu_thread_fn(void arg)
	1440	{
	1441	CPUState *cpu = arg;
	1442
	1443	assert(tcg_enabled());
	1444	rcu_register_thread();
	1445	tcg_register_thread();
	1446
	1447	qemu_mutex_lock_iothread();
	1448	qemu_thread_get_self(cpu->thread);
	1449
	1450	cpu->thread_id = qemu_get_thread_id();
	1451	cpu->created = true;
	1452	cpu->can_do_io = 1;
	1453	qemu_cond_signal(&qemu_cpu_cond);
	1454
	1455	/* wait for initial kick-off after machine start */
	1456	while (first_cpu->stopped) {
	1457	qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
	1458
	1459	/* process any pending work */
	1460	CPU_FOREACH(cpu) {
	1461	current_cpu = cpu;
	1462	qemu_wait_io_event_common(cpu);
	1463	}
	1464	}
	1465
	1466	start_tcg_kick_timer();
	1467
	1468	cpu = first_cpu;
	1469
	1470	/* process any pending work */
	1471	cpu->exit_request = 1;
	1472
	1473	while (1) {
	1474	qemu_mutex_unlock_iothread();
	1475	replay_mutex_lock();
	1476	qemu_mutex_lock_iothread();
	1477	/* Account partial waits to QEMU_CLOCK_VIRTUAL. */
	1478	qemu_account_warp_timer();
	1479
	1480	/* Run the timers here. This is much more efficient than
	1481	* waking up the I/O thread and waiting for completion.
	1482	*/
	1483	handle_icount_deadline();
	1484
	1485	replay_mutex_unlock();
	1486
	1487	if (!cpu) {
	1488	cpu = first_cpu;
	1489	}
	1490
	1491	while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
	1492
	1493	atomic_mb_set(&tcg_current_rr_cpu, cpu);
	1494	current_cpu = cpu;
	1495
	1496	qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
	1497	(cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
	1498
	1499	if (cpu_can_run(cpu)) {
	1500	int r;
	1501
	1502	qemu_mutex_unlock_iothread();
	1503	prepare_icount_for_run(cpu);
	1504
	1505	r = tcg_cpu_exec(cpu);
	1506
	1507	process_icount_data(cpu);
	1508	qemu_mutex_lock_iothread();
	1509
	1510	if (r == EXCP_DEBUG) {
	1511	cpu_handle_guest_debug(cpu);
	1512	break;
	1513	} else if (r == EXCP_ATOMIC) {
	1514	qemu_mutex_unlock_iothread();
	1515	cpu_exec_step_atomic(cpu);
	1516	qemu_mutex_lock_iothread();
	1517	break;
	1518	}
	1519	} else if (cpu->stop) {
	1520	if (cpu->unplug) {
	1521	cpu = CPU_NEXT(cpu);
	1522	}
	1523	break;
	1524	}
	1525
	1526	cpu = CPU_NEXT(cpu);
	1527	} /* while (cpu && !cpu->exit_request).. */
	1528
	1529	/* Does not need atomic_mb_set because a spurious wakeup is okay. */
	1530	atomic_set(&tcg_current_rr_cpu, NULL);
	1531
	1532	if (cpu && cpu->exit_request) {
	1533	atomic_mb_set(&cpu->exit_request, 0);
	1534	}
	1535
	1536	qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
	1537	deal_with_unplugged_cpus();
	1538	}
	1539
	1540	rcu_unregister_thread();
	1541	return NULL;
	1542	}
	1543
	1544	static void qemu_hax_cpu_thread_fn(void arg)
	1545	{
	1546	CPUState *cpu = arg;
	1547	int r;
	1548
	1549	rcu_register_thread();
	1550	qemu_mutex_lock_iothread();
	1551	qemu_thread_get_self(cpu->thread);
	1552
	1553	cpu->thread_id = qemu_get_thread_id();
	1554	cpu->created = true;
	1555	cpu->halted = 0;
	1556	current_cpu = cpu;
	1557
	1558	hax_init_vcpu(cpu);
	1559	qemu_cond_signal(&qemu_cpu_cond);
	1560
	1561	do {
	1562	if (cpu_can_run(cpu)) {
	1563	r = hax_smp_cpu_exec(cpu);
	1564	if (r == EXCP_DEBUG) {
	1565	cpu_handle_guest_debug(cpu);
	1566	}
	1567	}
	1568
	1569	qemu_wait_io_event(cpu);
	1570	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1571	rcu_unregister_thread();
	1572	return NULL;
	1573	}
	1574
	1575	/* The HVF-specific vCPU thread function. This one should only run when the host
	1576	* CPU supports the VMX "unrestricted guest" feature. */
	1577	static void qemu_hvf_cpu_thread_fn(void arg)
	1578	{
	1579	CPUState *cpu = arg;
	1580
	1581	int r;
	1582
	1583	assert(hvf_enabled());
	1584
	1585	rcu_register_thread();
	1586
	1587	qemu_mutex_lock_iothread();
	1588	qemu_thread_get_self(cpu->thread);
	1589
	1590	cpu->thread_id = qemu_get_thread_id();
	1591	cpu->can_do_io = 1;
	1592	current_cpu = cpu;
	1593
	1594	hvf_init_vcpu(cpu);
	1595
	1596	/* signal CPU creation */
	1597	cpu->created = true;
	1598	qemu_cond_signal(&qemu_cpu_cond);
	1599
	1600	do {
	1601	if (cpu_can_run(cpu)) {
	1602	r = hvf_vcpu_exec(cpu);
	1603	if (r == EXCP_DEBUG) {
	1604	cpu_handle_guest_debug(cpu);
	1605	}
	1606	}
	1607	qemu_wait_io_event(cpu);
	1608	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1609
	1610	hvf_vcpu_destroy(cpu);
	1611	cpu->created = false;
	1612	qemu_cond_signal(&qemu_cpu_cond);
	1613	qemu_mutex_unlock_iothread();
	1614	rcu_unregister_thread();
	1615	return NULL;
	1616	}
	1617
	1618	static void qemu_whpx_cpu_thread_fn(void arg)
	1619	{
	1620	CPUState *cpu = arg;
	1621	int r;
	1622
	1623	rcu_register_thread();
	1624
	1625	qemu_mutex_lock_iothread();
	1626	qemu_thread_get_self(cpu->thread);
	1627	cpu->thread_id = qemu_get_thread_id();
	1628	current_cpu = cpu;
	1629
	1630	r = whpx_init_vcpu(cpu);
	1631	if (r < 0) {
	1632	fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
	1633	exit(1);
	1634	}
	1635
	1636	/* signal CPU creation */
	1637	cpu->created = true;
	1638	qemu_cond_signal(&qemu_cpu_cond);
	1639
	1640	do {
	1641	if (cpu_can_run(cpu)) {
	1642	r = whpx_vcpu_exec(cpu);
	1643	if (r == EXCP_DEBUG) {
	1644	cpu_handle_guest_debug(cpu);
	1645	}
	1646	}
	1647	while (cpu_thread_is_idle(cpu)) {
	1648	qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
	1649	}
	1650	qemu_wait_io_event_common(cpu);
	1651	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1652
	1653	whpx_destroy_vcpu(cpu);
	1654	cpu->created = false;
	1655	qemu_cond_signal(&qemu_cpu_cond);
	1656	qemu_mutex_unlock_iothread();
	1657	rcu_unregister_thread();
	1658	return NULL;
	1659	}
	1660
	#ifdef _WIN32
	/*
	 * Empty APC routine; qemu_cpu_kick_thread() queues it with
	 * QueueUserAPC() to kick a vCPU thread.
	 */
	static void CALLBACK dummy_apc_func(ULONG_PTR unused)
	{
	    /* intentionally empty */
	}
	#endif
	1666
	1667	/* Multi-threaded TCG
	1668	*
	1669	* In the multi-threaded case each vCPU has its own thread. The TLS
	1670	* variable current_cpu can be used deep in the code to find the
	1671	* current CPUState for a given thread.
	1672	*/
	1673
	1674	static void qemu_tcg_cpu_thread_fn(void arg)
	1675	{
	1676	CPUState *cpu = arg;
	1677
	1678	assert(tcg_enabled());
	1679	g_assert(!use_icount);
	1680
	1681	rcu_register_thread();
	1682	tcg_register_thread();
	1683
	1684	qemu_mutex_lock_iothread();
	1685	qemu_thread_get_self(cpu->thread);
	1686
	1687	cpu->thread_id = qemu_get_thread_id();
	1688	cpu->created = true;
	1689	cpu->can_do_io = 1;
	1690	current_cpu = cpu;
	1691	qemu_cond_signal(&qemu_cpu_cond);
	1692
	1693	/* process any pending work */
	1694	cpu->exit_request = 1;
	1695
	1696	do {
	1697	if (cpu_can_run(cpu)) {
	1698	int r;
	1699	qemu_mutex_unlock_iothread();
	1700	r = tcg_cpu_exec(cpu);
	1701	qemu_mutex_lock_iothread();
	1702	switch (r) {
	1703	case EXCP_DEBUG:
	1704	cpu_handle_guest_debug(cpu);
	1705	break;
	1706	case EXCP_HALTED:
	1707	/* during start-up the vCPU is reset and the thread is
	1708	* kicked several times. If we don't ensure we go back
	1709	* to sleep in the halted state we won't cleanly
	1710	* start-up when the vCPU is enabled.
	1711	*
	1712	* cpu->halted should ensure we sleep in wait_io_event
	1713	*/
	1714	g_assert(cpu->halted);
	1715	break;
	1716	case EXCP_ATOMIC:
	1717	qemu_mutex_unlock_iothread();
	1718	cpu_exec_step_atomic(cpu);
	1719	qemu_mutex_lock_iothread();
	1720	default:
	1721	/* Ignore everything else? */
	1722	break;
	1723	}
	1724	}
	1725
	1726	atomic_mb_set(&cpu->exit_request, 0);
	1727	qemu_wait_io_event(cpu);
	1728	} while (!cpu->unplug \|\| cpu_can_run(cpu));
	1729
	1730	qemu_tcg_destroy_vcpu(cpu);
	1731	cpu->created = false;
	1732	qemu_cond_signal(&qemu_cpu_cond);
	1733	qemu_mutex_unlock_iothread();
	1734	rcu_unregister_thread();
	1735	return NULL;
	1736	}
	1737
	1738	static void qemu_cpu_kick_thread(CPUState *cpu)
	1739	{
	1740	#ifndef _WIN32
	1741	int err;
	1742
	1743	if (cpu->thread_kicked) {
	1744	return;
	1745	}
	1746	cpu->thread_kicked = true;
	1747	err = pthread_kill(cpu->thread->thread, SIG_IPI);
	1748	if (err) {
	1749	fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
	1750	exit(1);
	1751	}
	1752	#else /* _WIN32 */
	1753	if (!qemu_cpu_is_self(cpu)) {
	1754	if (whpx_enabled()) {
	1755	whpx_vcpu_kick(cpu);
	1756	} else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
	1757	fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
	1758	__func__, GetLastError());
	1759	exit(1);
	1760	}
	1761	}
	1762	#endif
	1763	}
	1764
	1765	void qemu_cpu_kick(CPUState *cpu)
	1766	{
	1767	qemu_cond_broadcast(cpu->halt_cond);
	1768	if (tcg_enabled()) {
	1769	cpu_exit(cpu);
	1770	/* NOP unless doing single-thread RR */
	1771	qemu_cpu_kick_rr_cpu();
	1772	} else {
	1773	if (hax_enabled()) {
	1774	/*
	1775	* FIXME: race condition with the exit_request check in
	1776	* hax_vcpu_hax_exec
	1777	*/
	1778	cpu->exit_request = 1;
	1779	}
	1780	qemu_cpu_kick_thread(cpu);
	1781	}
	1782	}
	1783
	1784	void qemu_cpu_kick_self(void)
	1785	{
	1786	assert(current_cpu);
	1787	qemu_cpu_kick_thread(current_cpu);
	1788	}
	1789
	1790	bool qemu_cpu_is_self(CPUState *cpu)
	1791	{
	1792	return qemu_thread_is_self(cpu->thread);
	1793	}
	1794
	1795	bool qemu_in_vcpu_thread(void)
	1796	{
	1797	return current_cpu && qemu_cpu_is_self(current_cpu);
	1798	}
	1799
	/*
	 * Per-thread flag: set after this thread takes the global mutex in
	 * qemu_mutex_lock_iothread_impl(), cleared in
	 * qemu_mutex_unlock_iothread().
	 */
	static __thread bool iothread_locked = false;

	/* Return the calling thread's iothread_locked flag. */
	bool qemu_mutex_iothread_locked(void)
	{
	    return iothread_locked;
	}
	1806
	1807	/*
	1808	* The BQL is taken from so many places that it is worth profiling the
	1809	* callers directly, instead of funneling them all through a single function.
	1810	*/
	1811	void qemu_mutex_lock_iothread_impl(const char *file, int line)
	1812	{
	1813	QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
	1814
	1815	g_assert(!qemu_mutex_iothread_locked());
	1816	bql_lock(&qemu_global_mutex, file, line);
	1817	iothread_locked = true;
	1818	}
	1819
	1820	void qemu_mutex_unlock_iothread(void)
	1821	{
	1822	g_assert(qemu_mutex_iothread_locked());
	1823	iothread_locked = false;
	1824	qemu_mutex_unlock(&qemu_global_mutex);
	1825	}
	1826
	1827	static bool all_vcpus_paused(void)
	1828	{
	1829	CPUState *cpu;
	1830
	1831	CPU_FOREACH(cpu) {
	1832	if (!cpu->stopped) {
	1833	return false;
	1834	}
	1835	}
	1836
	1837	return true;
	1838	}
	1839
	1840	void pause_all_vcpus(void)
	1841	{
	1842	CPUState *cpu;
	1843
	1844	qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
	1845	CPU_FOREACH(cpu) {
	1846	if (qemu_cpu_is_self(cpu)) {
	1847	qemu_cpu_stop(cpu, true);
	1848	} else {
	1849	cpu->stop = true;
	1850	qemu_cpu_kick(cpu);
	1851	}
	1852	}
	1853
	1854	/* We need to drop the replay_lock so any vCPU threads woken up
	1855	* can finish their replay tasks
	1856	*/
	1857	replay_mutex_unlock();
	1858
	1859	while (!all_vcpus_paused()) {
	1860	qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
	1861	CPU_FOREACH(cpu) {
	1862	qemu_cpu_kick(cpu);
	1863	}
	1864	}
	1865
	1866	qemu_mutex_unlock_iothread();
	1867	replay_mutex_lock();
	1868	qemu_mutex_lock_iothread();
	1869	}
	1870
	1871	void cpu_resume(CPUState *cpu)
	1872	{
	1873	cpu->stop = false;
	1874	cpu->stopped = false;
	1875	qemu_cpu_kick(cpu);
	1876	}
	1877
	1878	void resume_all_vcpus(void)
	1879	{
	1880	CPUState *cpu;
	1881
	1882	qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
	1883	CPU_FOREACH(cpu) {
	1884	cpu_resume(cpu);
	1885	}
	1886	}
	1887
	1888	void cpu_remove_sync(CPUState *cpu)
	1889	{
	1890	cpu->stop = true;
	1891	cpu->unplug = true;
	1892	qemu_cpu_kick(cpu);
	1893	qemu_mutex_unlock_iothread();
	1894	qemu_thread_join(cpu->thread);
	1895	qemu_mutex_lock_iothread();
	1896	}
	1897
	1898	/* For temporary buffers for forming a name */
	1899	#define VCPU_THREAD_NAME_SIZE 16
	1900
	1901	static void qemu_tcg_init_vcpu(CPUState *cpu)
	1902	{
	1903	char thread_name[VCPU_THREAD_NAME_SIZE];
	1904	static QemuCond *single_tcg_halt_cond;
	1905	static QemuThread *single_tcg_cpu_thread;
	1906	static int tcg_region_inited;
	1907
	1908	assert(tcg_enabled());
	1909	/*
	1910	* Initialize TCG regions--once. Now is a good time, because:
	1911	* (1) TCG's init context, prologue and target globals have been set up.
	1912	* (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
	1913	* -accel flag is processed, so the check doesn't work then).
	1914	*/
	1915	if (!tcg_region_inited) {
	1916	tcg_region_inited = 1;
	1917	tcg_region_init();
	1918	}
	1919
	1920	if (qemu_tcg_mttcg_enabled() \|\| !single_tcg_cpu_thread) {
	1921	cpu->thread = g_malloc0(sizeof(QemuThread));
	1922	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	1923	qemu_cond_init(cpu->halt_cond);
	1924
	1925	if (qemu_tcg_mttcg_enabled()) {
	1926	/* create a thread per vCPU with TCG (MTTCG) */
	1927	parallel_cpus = true;
	1928	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
	1929	cpu->cpu_index);
	1930
	1931	qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
	1932	cpu, QEMU_THREAD_JOINABLE);
	1933
	1934	} else {
	1935	/* share a single thread for all cpus with TCG */
	1936	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
	1937	qemu_thread_create(cpu->thread, thread_name,
	1938	qemu_tcg_rr_cpu_thread_fn,
	1939	cpu, QEMU_THREAD_JOINABLE);
	1940
	1941	single_tcg_halt_cond = cpu->halt_cond;
	1942	single_tcg_cpu_thread = cpu->thread;
	1943	}
	1944	#ifdef _WIN32
	1945	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	1946	#endif
	1947	} else {
	1948	/* For non-MTTCG cases we share the thread */
	1949	cpu->thread = single_tcg_cpu_thread;
	1950	cpu->halt_cond = single_tcg_halt_cond;
	1951	cpu->thread_id = first_cpu->thread_id;
	1952	cpu->can_do_io = 1;
	1953	cpu->created = true;
	1954	}
	1955	}
	1956
	1957	static void qemu_hax_start_vcpu(CPUState *cpu)
	1958	{
	1959	char thread_name[VCPU_THREAD_NAME_SIZE];
	1960
	1961	cpu->thread = g_malloc0(sizeof(QemuThread));
	1962	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	1963	qemu_cond_init(cpu->halt_cond);
	1964
	1965	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
	1966	cpu->cpu_index);
	1967	qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
	1968	cpu, QEMU_THREAD_JOINABLE);
	1969	#ifdef _WIN32
	1970	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	1971	#endif
	1972	}
	1973
	1974	static void qemu_kvm_start_vcpu(CPUState *cpu)
	1975	{
	1976	char thread_name[VCPU_THREAD_NAME_SIZE];
	1977
	1978	cpu->thread = g_malloc0(sizeof(QemuThread));
	1979	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	1980	qemu_cond_init(cpu->halt_cond);
	1981	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
	1982	cpu->cpu_index);
	1983	qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
	1984	cpu, QEMU_THREAD_JOINABLE);
	1985	}
	1986
	1987	static void qemu_hvf_start_vcpu(CPUState *cpu)
	1988	{
	1989	char thread_name[VCPU_THREAD_NAME_SIZE];
	1990
	1991	/* HVF currently does not support TCG, and only runs in
	1992	* unrestricted-guest mode. */
	1993	assert(hvf_enabled());
	1994
	1995	cpu->thread = g_malloc0(sizeof(QemuThread));
	1996	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	1997	qemu_cond_init(cpu->halt_cond);
	1998
	1999	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
	2000	cpu->cpu_index);
	2001	qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
	2002	cpu, QEMU_THREAD_JOINABLE);
	2003	}
	2004
	2005	static void qemu_whpx_start_vcpu(CPUState *cpu)
	2006	{
	2007	char thread_name[VCPU_THREAD_NAME_SIZE];
	2008
	2009	cpu->thread = g_malloc0(sizeof(QemuThread));
	2010	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2011	qemu_cond_init(cpu->halt_cond);
	2012	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
	2013	cpu->cpu_index);
	2014	qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
	2015	cpu, QEMU_THREAD_JOINABLE);
	2016	#ifdef _WIN32
	2017	cpu->hThread = qemu_thread_get_handle(cpu->thread);
	2018	#endif
	2019	}
	2020
	2021	static void qemu_dummy_start_vcpu(CPUState *cpu)
	2022	{
	2023	char thread_name[VCPU_THREAD_NAME_SIZE];
	2024
	2025	cpu->thread = g_malloc0(sizeof(QemuThread));
	2026	cpu->halt_cond = g_malloc0(sizeof(QemuCond));
	2027	qemu_cond_init(cpu->halt_cond);
	2028	snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
	2029	cpu->cpu_index);
	2030	qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
	2031	QEMU_THREAD_JOINABLE);
	2032	}
	2033
	2034	void qemu_init_vcpu(CPUState *cpu)
	2035	{
	2036	cpu->nr_cores = smp_cores;
	2037	cpu->nr_threads = smp_threads;
	2038	cpu->stopped = true;
	2039
	2040	if (!cpu->as) {
	2041	/* If the target cpu hasn't set up any address spaces itself,
	2042	* give it the default one.
	2043	*/
	2044	cpu->num_ases = 1;
	2045	cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
	2046	}
	2047
	2048	if (kvm_enabled()) {
	2049	qemu_kvm_start_vcpu(cpu);
	2050	} else if (hax_enabled()) {
	2051	qemu_hax_start_vcpu(cpu);
	2052	} else if (hvf_enabled()) {
	2053	qemu_hvf_start_vcpu(cpu);
	2054	} else if (tcg_enabled()) {
	2055	qemu_tcg_init_vcpu(cpu);
	2056	} else if (whpx_enabled()) {
	2057	qemu_whpx_start_vcpu(cpu);
	2058	} else {
	2059	qemu_dummy_start_vcpu(cpu);
	2060	}
	2061
	2062	while (!cpu->created) {
	2063	qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
	2064	}
	2065	}
	2066
	2067	void cpu_stop_current(void)
	2068	{
	2069	if (current_cpu) {
	2070	qemu_cpu_stop(current_cpu, true);
	2071	}
	2072	}
	2073
	2074	int vm_stop(RunState state)
	2075	{
	2076	if (qemu_in_vcpu_thread()) {
	2077	qemu_system_vmstop_request_prepare();
	2078	qemu_system_vmstop_request(state);
	2079	/*
	2080	* FIXME: should not return to device code in case
	2081	* vm_stop() has been requested.
	2082	*/
	2083	cpu_stop_current();
	2084	return 0;
	2085	}
	2086
	2087	return do_vm_stop(state, true);
	2088	}
	2089
	2090	/**
	2091	* Prepare for (re)starting the VM.
	2092	* Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
	2093	* running or in case of an error condition), 0 otherwise.
	2094	*/
	2095	int vm_prepare_start(void)
	2096	{
	2097	RunState requested;
	2098
	2099	qemu_vmstop_requested(&requested);
	2100	if (runstate_is_running() && requested == RUN_STATE__MAX) {
	2101	return -1;
	2102	}
	2103
	2104	/* Ensure that a STOP/RESUME pair of events is emitted if a
	2105	* vmstop request was pending. The BLOCK_IO_ERROR event, for
	2106	* example, according to documentation is always followed by
	2107	* the STOP event.
	2108	*/
	2109	if (runstate_is_running()) {
	2110	qapi_event_send_stop();
	2111	qapi_event_send_resume();
	2112	return -1;
	2113	}
	2114
	2115	/* We are sending this now, but the CPUs will be resumed shortly later */
	2116	qapi_event_send_resume();
	2117
	2118	replay_enable_events();
	2119	cpu_enable_ticks();
	2120	runstate_set(RUN_STATE_RUNNING);
	2121	vm_state_notify(1, RUN_STATE_RUNNING);
	2122	return 0;
	2123	}
	2124
	/*
	 * Start (or restart) the VM: resume all vCPUs when
	 * vm_prepare_start() returns 0.
	 */
	void vm_start(void)
	{
	    if (vm_prepare_start() != 0) {
	        return;
	    }

	    resume_all_vcpus();
	}
	2131
	2132	/* does a state transition even if the VM is already stopped,
	2133	current state is forgotten forever */
	2134	int vm_stop_force_state(RunState state)
	2135	{
	2136	if (runstate_is_running()) {
	2137	return vm_stop(state);
	2138	} else {
	2139	runstate_set(state);
	2140
	2141	bdrv_drain_all();
	2142	/* Make sure to return an error if the flush in a previous vm_stop()
	2143	* failed. */
	2144	return bdrv_flush_all();
	2145	}
	2146	}
	2147
	2148	void list_cpus(FILE f, fprintf_function cpu_fprintf, const char optarg)
	2149	{
	2150	/* XXX: implement xxx_cpu_list for targets that still miss it */
	2151	#if defined(cpu_list)
	2152	cpu_list(f, cpu_fprintf);
	2153	#endif
	2154	}
	2155
	2156	CpuInfoList qmp_query_cpus(Error *errp)
	2157	{
	2158	MachineState *ms = MACHINE(qdev_get_machine());
	2159	MachineClass *mc = MACHINE_GET_CLASS(ms);
	2160	CpuInfoList head = NULL, cur_item = NULL;
	2161	CPUState *cpu;
	2162
	2163	CPU_FOREACH(cpu) {
	2164	CpuInfoList *info;
	2165	#if defined(TARGET_I386)
	2166	X86CPU *x86_cpu = X86_CPU(cpu);
	2167	CPUX86State *env = &x86_cpu->env;
	2168	#elif defined(TARGET_PPC)
	2169	PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
	2170	CPUPPCState *env = &ppc_cpu->env;
	2171	#elif defined(TARGET_SPARC)
	2172	SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
	2173	CPUSPARCState *env = &sparc_cpu->env;
	2174	#elif defined(TARGET_RISCV)
	2175	RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
	2176	CPURISCVState *env = &riscv_cpu->env;
	2177	#elif defined(TARGET_MIPS)
	2178	MIPSCPU *mips_cpu = MIPS_CPU(cpu);
	2179	CPUMIPSState *env = &mips_cpu->env;
	2180	#elif defined(TARGET_TRICORE)
	2181	TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
	2182	CPUTriCoreState *env = &tricore_cpu->env;
	2183	#elif defined(TARGET_S390X)
	2184	S390CPU *s390_cpu = S390_CPU(cpu);
	2185	CPUS390XState *env = &s390_cpu->env;
	2186	#endif
	2187
	2188	cpu_synchronize_state(cpu);
	2189
	2190	info = g_malloc0(sizeof(*info));
	2191	info->value = g_malloc0(sizeof(*info->value));
	2192	info->value->CPU = cpu->cpu_index;
	2193	info->value->current = (cpu == first_cpu);
	2194	info->value->halted = cpu->halted;
	2195	info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
	2196	info->value->thread_id = cpu->thread_id;
	2197	#if defined(TARGET_I386)
	2198	info->value->arch = CPU_INFO_ARCH_X86;
	2199	info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
	2200	#elif defined(TARGET_PPC)
	2201	info->value->arch = CPU_INFO_ARCH_PPC;
	2202	info->value->u.ppc.nip = env->nip;
	2203	#elif defined(TARGET_SPARC)
	2204	info->value->arch = CPU_INFO_ARCH_SPARC;
	2205	info->value->u.q_sparc.pc = env->pc;
	2206	info->value->u.q_sparc.npc = env->npc;
	2207	#elif defined(TARGET_MIPS)
	2208	info->value->arch = CPU_INFO_ARCH_MIPS;
	2209	info->value->u.q_mips.PC = env->active_tc.PC;
	2210	#elif defined(TARGET_TRICORE)
	2211	info->value->arch = CPU_INFO_ARCH_TRICORE;
	2212	info->value->u.tricore.PC = env->PC;
	2213	#elif defined(TARGET_S390X)
	2214	info->value->arch = CPU_INFO_ARCH_S390;
	2215	info->value->u.s390.cpu_state = env->cpu_state;
	2216	#elif defined(TARGET_RISCV)
	2217	info->value->arch = CPU_INFO_ARCH_RISCV;
	2218	info->value->u.riscv.pc = env->pc;
	2219	#else
	2220	info->value->arch = CPU_INFO_ARCH_OTHER;
	2221	#endif
	2222	info->value->has_props = !!mc->cpu_index_to_instance_props;
	2223	if (info->value->has_props) {
	2224	CpuInstanceProperties *props;
	2225	props = g_malloc0(sizeof(*props));
	2226	*props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
	2227	info->value->props = props;
	2228	}
	2229
	2230	/* XXX: waiting for the qapi to support GSList */
	2231	if (!cur_item) {
	2232	head = cur_item = info;
	2233	} else {
	2234	cur_item->next = info;
	2235	cur_item = info;
	2236	}
	2237	}
	2238
	2239	return head;
	2240	}
	2241
	2242	static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
	2243	{
	2244	/*
	2245	* The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
	2246	* TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
	2247	*/
	2248	switch (target) {
	2249	case SYS_EMU_TARGET_I386:
	2250	case SYS_EMU_TARGET_X86_64:
	2251	return CPU_INFO_ARCH_X86;
	2252
	2253	case SYS_EMU_TARGET_PPC:
	2254	case SYS_EMU_TARGET_PPC64:
	2255	return CPU_INFO_ARCH_PPC;
	2256
	2257	case SYS_EMU_TARGET_SPARC:
	2258	case SYS_EMU_TARGET_SPARC64:
	2259	return CPU_INFO_ARCH_SPARC;
	2260
	2261	case SYS_EMU_TARGET_MIPS:
	2262	case SYS_EMU_TARGET_MIPSEL:
	2263	case SYS_EMU_TARGET_MIPS64:
	2264	case SYS_EMU_TARGET_MIPS64EL:
	2265	return CPU_INFO_ARCH_MIPS;
	2266
	2267	case SYS_EMU_TARGET_TRICORE:
	2268	return CPU_INFO_ARCH_TRICORE;
	2269
	2270	case SYS_EMU_TARGET_S390X:
	2271	return CPU_INFO_ARCH_S390;
	2272
	2273	case SYS_EMU_TARGET_RISCV32:
	2274	case SYS_EMU_TARGET_RISCV64:
	2275	return CPU_INFO_ARCH_RISCV;
	2276
	2277	default:
	2278	return CPU_INFO_ARCH_OTHER;
	2279	}
	2280	}
	2281
	2282	static void cpustate_to_cpuinfo_s390(CpuInfoS390 info, const CPUState cpu)
	2283	{
	2284	#ifdef TARGET_S390X
	2285	S390CPU *s390_cpu = S390_CPU(cpu);
	2286	CPUS390XState *env = &s390_cpu->env;
	2287
	2288	info->cpu_state = env->cpu_state;
	2289	#else
	2290	abort();
	2291	#endif
	2292	}
	2293
	2294	/*
	2295	* fast means: we NEVER interrupt vCPU threads to retrieve
	2296	* information from KVM.
	2297	*/
	2298	CpuInfoFastList qmp_query_cpus_fast(Error *errp)
	2299	{
	2300	MachineState *ms = MACHINE(qdev_get_machine());
	2301	MachineClass *mc = MACHINE_GET_CLASS(ms);
	2302	CpuInfoFastList head = NULL, cur_item = NULL;
	2303	SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
	2304	-1, &error_abort);
	2305	CPUState *cpu;
	2306
	2307	CPU_FOREACH(cpu) {
	2308	CpuInfoFastList info = g_malloc0(sizeof(info));
	2309	info->value = g_malloc0(sizeof(*info->value));
	2310
	2311	info->value->cpu_index = cpu->cpu_index;
	2312	info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
	2313	info->value->thread_id = cpu->thread_id;
	2314
	2315	info->value->has_props = !!mc->cpu_index_to_instance_props;
	2316	if (info->value->has_props) {
	2317	CpuInstanceProperties *props;
	2318	props = g_malloc0(sizeof(*props));
	2319	*props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
	2320	info->value->props = props;
	2321	}
	2322
	2323	info->value->arch = sysemu_target_to_cpuinfo_arch(target);
	2324	info->value->target = target;
	2325	if (target == SYS_EMU_TARGET_S390X) {
	2326	cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
	2327	}
	2328
	2329	if (!cur_item) {
	2330	head = cur_item = info;
	2331	} else {
	2332	cur_item->next = info;
	2333	cur_item = info;
	2334	}
	2335	}
	2336
	2337	return head;
	2338	}
	2339
	2340	void qmp_memsave(int64_t addr, int64_t size, const char *filename,
	2341	bool has_cpu, int64_t cpu_index, Error **errp)
	2342	{
	2343	FILE *f;
	2344	uint32_t l;
	2345	CPUState *cpu;
	2346	uint8_t buf[1024];
	2347	int64_t orig_addr = addr, orig_size = size;
	2348
	2349	if (!has_cpu) {
	2350	cpu_index = 0;
	2351	}
	2352
	2353	cpu = qemu_get_cpu(cpu_index);
	2354	if (cpu == NULL) {
	2355	error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
	2356	"a CPU number");
	2357	return;
	2358	}
	2359
	2360	f = fopen(filename, "wb");
	2361	if (!f) {
	2362	error_setg_file_open(errp, errno, filename);
	2363	return;
	2364	}
	2365
	2366	while (size != 0) {
	2367	l = sizeof(buf);
	2368	if (l > size)
	2369	l = size;
	2370	if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
	2371	error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
	2372	" specified", orig_addr, orig_size);
	2373	goto exit;
	2374	}
	2375	if (fwrite(buf, 1, l, f) != l) {
	2376	error_setg(errp, QERR_IO_ERROR);
	2377	goto exit;
	2378	}
	2379	addr += l;
	2380	size -= l;
	2381	}
	2382
	2383	exit:
	2384	fclose(f);
	2385	}
	2386
	2387	void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
	2388	Error **errp)
	2389	{
	2390	FILE *f;
	2391	uint32_t l;
	2392	uint8_t buf[1024];
	2393
	2394	f = fopen(filename, "wb");
	2395	if (!f) {
	2396	error_setg_file_open(errp, errno, filename);
	2397	return;
	2398	}
	2399
	2400	while (size != 0) {
	2401	l = sizeof(buf);
	2402	if (l > size)
	2403	l = size;
	2404	cpu_physical_memory_read(addr, buf, l);
	2405	if (fwrite(buf, 1, l, f) != l) {
	2406	error_setg(errp, QERR_IO_ERROR);
	2407	goto exit;
	2408	}
	2409	addr += l;
	2410	size -= l;
	2411	}
	2412
	2413	exit:
	2414	fclose(f);
	2415	}
	2416
	2417	void qmp_inject_nmi(Error **errp)
	2418	{
	2419	nmi_monitor_handle(monitor_get_cpu_index(), errp);
	2420	}
	2421
	2422	void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
	2423	{
	2424	if (!use_icount) {
	2425	return;
	2426	}
	2427
	2428	cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
	2429	(cpu_get_clock() - cpu_get_icount())/SCALE_MS);
	2430	if (icount_align_option) {
	2431	cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
	2432	cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
	2433	} else {
	2434	cpu_fprintf(f, "Max guest delay NA\n");
	2435	cpu_fprintf(f, "Max guest advance NA\n");
	2436	}
	2437	}