softmmu/cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "qemu/cutils.h"
  29 #include "migration/vmstate.h"
  30 #include "monitor/monitor.h"
  31 #include "qapi/error.h"
  32 #include "qapi/qapi-commands-misc.h"
  33 #include "qapi/qapi-events-run-state.h"
  34 #include "qapi/qmp/qerror.h"
  35 #include "qemu/error-report.h"
  36 #include "qemu/qemu-print.h"
  37 #include "sysemu/tcg.h"
  38 #include "sysemu/block-backend.h"
  39 #include "exec/gdbstub.h"
  40 #include "sysemu/dma.h"
  41 #include "sysemu/hw_accel.h"
  42 #include "sysemu/kvm.h"
  43 #include "sysemu/hax.h"
  44 #include "sysemu/hvf.h"
  45 #include "sysemu/whpx.h"
  46 #include "exec/exec-all.h"
  47
  48 #include "qemu/thread.h"
  49 #include "qemu/plugin.h"
  50 #include "sysemu/cpus.h"
  51 #include "sysemu/qtest.h"
  52 #include "qemu/main-loop.h"
  53 #include "qemu/option.h"
  54 #include "qemu/bitmap.h"
  55 #include "qemu/seqlock.h"
  56 #include "qemu/guest-random.h"
  57 #include "tcg/tcg.h"
  58 #include "hw/nmi.h"
  59 #include "sysemu/replay.h"
  60 #include "sysemu/runstate.h"
  61 #include "hw/boards.h"
  62 #include "hw/hw.h"
  63
#ifdef CONFIG_LINUX

#include <sys/prctl.h>

/*
 * Fallback values for the PR_MCE_KILL* constants.  Each one is defined
 * here only when the headers included above did not already provide it.
 */
#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */
  81
/*
 * Global mutex of this file.  cpu_throttle_thread() hands it to
 * qemu_cond_timedwait() while a throttled vCPU sleeps.
 */
static QemuMutex qemu_global_mutex;

/*
 * NOTE(review): neither variable is referenced in this part of the file;
 * presumably they belong to the icount "align" feature - confirm.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer whose callback is cpu_throttle_timer_tick(); see cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage; 0 means throttling is off. */
static unsigned int throttle_percentage;

/* cpu_throttle_set() clamps the requested percentage to [MIN, MAX]. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base run time per throttling period: 10,000,000 ns (10 ms). */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  94
  95 bool cpu_is_stopped(CPUState *cpu)
  96 {
  97     return cpu->stopped || !runstate_is_running();
  98 }
  99
 100 static inline bool cpu_work_list_empty(CPUState *cpu)
 101 {
 102     bool ret;
 103
 104     qemu_mutex_lock(&cpu->work_mutex);
 105     ret = QSIMPLEQ_EMPTY(&cpu->work_list);
 106     qemu_mutex_unlock(&cpu->work_mutex);
 107     return ret;
 108 }
 109
 110 static bool cpu_thread_is_idle(CPUState *cpu)
 111 {
 112     if (cpu->stop || !cpu_work_list_empty(cpu)) {
 113         return false;
 114     }
 115     if (cpu_is_stopped(cpu)) {
 116         return true;
 117     }
 118     if (!cpu->halted || cpu_has_work(cpu) ||
 119         kvm_halt_in_kernel()) {
 120         return false;
 121     }
 122     return true;
 123 }
 124
 125 static bool all_cpu_threads_idle(void)
 126 {
 127     CPUState *cpu;
 128
 129     CPU_FOREACH(cpu) {
 130         if (!cpu_thread_is_idle(cpu)) {
 131             return false;
 132         }
 133     }
 134     return true;
 135 }
 136
/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

/*
 * Copy of the icount "sleep" option (default true), assigned by
 * configure_icount().  When false, qemu_start_warp_timer() advances the
 * virtual clock straight to the next deadline instead of arming the
 * warp timer.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/*
 * cpu_icount_to_ns() converts instructions to ns with a left shift by
 * icount_time_shift, so the maximum shift of 10 charges 1024 ns per
 * instruction.
 */
#define MAX_ICOUNT_SHIFT 10
 145
/*
 * State behind the guest-visible tick counter, the VM clock and the
 * instruction counter (icount).  A single instance, timers_state, exists.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value handed out by cpu_get_ticks_locked(). */
    int64_t cpu_ticks_prev;
    /* Added to the host tick counter to form the guest tick value. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Set to 1 by cpu_enable_ticks() and to 0 by cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /*
     * QEMU_CLOCK_VIRTUAL_RT time at which the pending clock warp started;
     * -1 when no warp is pending (see icount_warp_rt()).
     */
    int64_t vm_clock_warp_start;
    /* Added to get_clock() to form the VM clock in cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    /* rt/vm timers are created by configure_icount() in adaptive mode. */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    /* Created by configure_icount() only when icount sleep is enabled. */
    QEMUTimer *icount_warp_timer;
} TimersState;

static TimersState timers_state;
/* NOTE(review): not used in this part of the file; presumably "multi-threaded TCG enabled" - confirm. */
bool mttcg_enabled;
 179
 180
 181 /* The current number of executed instructions is based on what we
 182  * originally budgeted minus the current state of the decrementing
 183  * icount counters in extra/u16.low.
 184  */
 185 static int64_t cpu_get_icount_executed(CPUState *cpu)
 186 {
 187     return (cpu->icount_budget -
 188             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 189 }
 190
 191 /*
 192  * Update the global shared timer_state.qemu_icount to take into
 193  * account executed instructions. This is done by the TCG vCPU
 194  * thread so the main-loop can see time has moved forward.
 195  */
 196 static void cpu_update_icount_locked(CPUState *cpu)
 197 {
 198     int64_t executed = cpu_get_icount_executed(cpu);
 199     cpu->icount_budget -= executed;
 200
 201     atomic_set_i64(&timers_state.qemu_icount,
 202                    timers_state.qemu_icount + executed);
 203 }
 204
 205 /*
 206  * Update the global shared timer_state.qemu_icount to take into
 207  * account executed instructions. This is done by the TCG vCPU
 208  * thread so the main-loop can see time has moved forward.
 209  */
 210 void cpu_update_icount(CPUState *cpu)
 211 {
 212     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 213                        &timers_state.vm_clock_lock);
 214     cpu_update_icount_locked(cpu);
 215     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 216                          &timers_state.vm_clock_lock);
 217 }
 218
 219 static int64_t cpu_get_icount_raw_locked(void)
 220 {
 221     CPUState *cpu = current_cpu;
 222
 223     if (cpu && cpu->running) {
 224         if (!cpu->can_do_io) {
 225             error_report("Bad icount read");
 226             exit(1);
 227         }
 228         /* Take into account what has run */
 229         cpu_update_icount_locked(cpu);
 230     }
 231     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 232     return atomic_read_i64(&timers_state.qemu_icount);
 233 }
 234
 235 static int64_t cpu_get_icount_locked(void)
 236 {
 237     int64_t icount = cpu_get_icount_raw_locked();
 238     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 239         cpu_icount_to_ns(icount);
 240 }
 241
 242 int64_t cpu_get_icount_raw(void)
 243 {
 244     int64_t icount;
 245     unsigned start;
 246
 247     do {
 248         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 249         icount = cpu_get_icount_raw_locked();
 250     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 251
 252     return icount;
 253 }
 254
 255 /* Return the virtual CPU time, based on the instruction counter.  */
 256 int64_t cpu_get_icount(void)
 257 {
 258     int64_t icount;
 259     unsigned start;
 260
 261     do {
 262         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 263         icount = cpu_get_icount_locked();
 264     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 265
 266     return icount;
 267 }
 268
 269 int64_t cpu_icount_to_ns(int64_t icount)
 270 {
 271     return icount << atomic_read(&timers_state.icount_time_shift);
 272 }
 273
 274 static int64_t cpu_get_ticks_locked(void)
 275 {
 276     int64_t ticks = timers_state.cpu_ticks_offset;
 277     if (timers_state.cpu_ticks_enabled) {
 278         ticks += cpu_get_host_ticks();
 279     }
 280
 281     if (timers_state.cpu_ticks_prev > ticks) {
 282         /* Non increasing ticks may happen if the host uses software suspend.  */
 283         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 284         ticks = timers_state.cpu_ticks_prev;
 285     }
 286
 287     timers_state.cpu_ticks_prev = ticks;
 288     return ticks;
 289 }
 290
 291 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 292  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 293  * counter.
 294  */
 295 int64_t cpu_get_ticks(void)
 296 {
 297     int64_t ticks;
 298
 299     if (use_icount) {
 300         return cpu_get_icount();
 301     }
 302
 303     qemu_spin_lock(&timers_state.vm_clock_lock);
 304     ticks = cpu_get_ticks_locked();
 305     qemu_spin_unlock(&timers_state.vm_clock_lock);
 306     return ticks;
 307 }
 308
 309 static int64_t cpu_get_clock_locked(void)
 310 {
 311     int64_t time;
 312
 313     time = timers_state.cpu_clock_offset;
 314     if (timers_state.cpu_ticks_enabled) {
 315         time += get_clock();
 316     }
 317
 318     return time;
 319 }
 320
 321 /* Return the monotonic time elapsed in VM, i.e.,
 322  * the time between vm_start and vm_stop
 323  */
 324 int64_t cpu_get_clock(void)
 325 {
 326     int64_t ti;
 327     unsigned start;
 328
 329     do {
 330         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 331         ti = cpu_get_clock_locked();
 332     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 333
 334     return ti;
 335 }
 336
 337 /* enable cpu_get_ticks()
 338  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 339  */
 340 void cpu_enable_ticks(void)
 341 {
 342     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 343                        &timers_state.vm_clock_lock);
 344     if (!timers_state.cpu_ticks_enabled) {
 345         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 346         timers_state.cpu_clock_offset -= get_clock();
 347         timers_state.cpu_ticks_enabled = 1;
 348     }
 349     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 350                        &timers_state.vm_clock_lock);
 351 }
 352
 353 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 354  * cpu_get_ticks() after that.
 355  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 356  */
 357 void cpu_disable_ticks(void)
 358 {
 359     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 360                        &timers_state.vm_clock_lock);
 361     if (timers_state.cpu_ticks_enabled) {
 362         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 363         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 364         timers_state.cpu_ticks_enabled = 0;
 365     }
 366     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 367                          &timers_state.vm_clock_lock);
 368 }
 369
 370 /* Correlation between real and virtual time is always going to be
 371    fairly approximate, so ignore small variation.
 372    When the guest is idle real and virtual time will be aligned in
 373    the IO wait loop.  */
 374 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 375
 376 static void icount_adjust(void)
 377 {
 378     int64_t cur_time;
 379     int64_t cur_icount;
 380     int64_t delta;
 381
 382     /* Protected by TimersState mutex.  */
 383     static int64_t last_delta;
 384
 385     /* If the VM is not running, then do nothing.  */
 386     if (!runstate_is_running()) {
 387         return;
 388     }
 389
 390     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 391                        &timers_state.vm_clock_lock);
 392     cur_time = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 393                                    cpu_get_clock_locked());
 394     cur_icount = cpu_get_icount_locked();
 395
 396     delta = cur_icount - cur_time;
 397     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 398     if (delta > 0
 399         && last_delta + ICOUNT_WOBBLE < delta * 2
 400         && timers_state.icount_time_shift > 0) {
 401         /* The guest is getting too far ahead.  Slow time down.  */
 402         atomic_set(&timers_state.icount_time_shift,
 403                    timers_state.icount_time_shift - 1);
 404     }
 405     if (delta < 0
 406         && last_delta - ICOUNT_WOBBLE > delta * 2
 407         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 408         /* The guest is getting too far behind.  Speed time up.  */
 409         atomic_set(&timers_state.icount_time_shift,
 410                    timers_state.icount_time_shift + 1);
 411     }
 412     last_delta = delta;
 413     atomic_set_i64(&timers_state.qemu_icount_bias,
 414                    cur_icount - (timers_state.qemu_icount
 415                                  << timers_state.icount_time_shift));
 416     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 417                          &timers_state.vm_clock_lock);
 418 }
 419
 420 static void icount_adjust_rt(void *opaque)
 421 {
 422     timer_mod(timers_state.icount_rt_timer,
 423               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 424     icount_adjust();
 425 }
 426
 427 static void icount_adjust_vm(void *opaque)
 428 {
 429     timer_mod(timers_state.icount_vm_timer,
 430                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 431                    NANOSECONDS_PER_SECOND / 10);
 432     icount_adjust();
 433 }
 434
 435 static int64_t qemu_icount_round(int64_t count)
 436 {
 437     int shift = atomic_read(&timers_state.icount_time_shift);
 438     return (count + (1 << shift) - 1) >> shift;
 439 }
 440
 441 static void icount_warp_rt(void)
 442 {
 443     unsigned seq;
 444     int64_t warp_start;
 445
 446     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 447      * changes from -1 to another value, so the race here is okay.
 448      */
 449     do {
 450         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 451         warp_start = timers_state.vm_clock_warp_start;
 452     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 453
 454     if (warp_start == -1) {
 455         return;
 456     }
 457
 458     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 459                        &timers_state.vm_clock_lock);
 460     if (runstate_is_running()) {
 461         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 462                                             cpu_get_clock_locked());
 463         int64_t warp_delta;
 464
 465         warp_delta = clock - timers_state.vm_clock_warp_start;
 466         if (use_icount == 2) {
 467             /*
 468              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 469              * far ahead of real time.
 470              */
 471             int64_t cur_icount = cpu_get_icount_locked();
 472             int64_t delta = clock - cur_icount;
 473             warp_delta = MIN(warp_delta, delta);
 474         }
 475         atomic_set_i64(&timers_state.qemu_icount_bias,
 476                        timers_state.qemu_icount_bias + warp_delta);
 477     }
 478     timers_state.vm_clock_warp_start = -1;
 479     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 480                        &timers_state.vm_clock_lock);
 481
 482     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 483         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 484     }
 485 }
 486
 487 static void icount_timer_cb(void *opaque)
 488 {
 489     /* No need for a checkpoint because the timer already synchronizes
 490      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 491      */
 492     icount_warp_rt();
 493 }
 494
 495 void qtest_clock_warp(int64_t dest)
 496 {
 497     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 498     AioContext *aio_context;
 499     assert(qtest_enabled());
 500     aio_context = qemu_get_aio_context();
 501     while (clock < dest) {
 502         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 503                                                       QEMU_TIMER_ATTR_ALL);
 504         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 505
 506         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 507                            &timers_state.vm_clock_lock);
 508         atomic_set_i64(&timers_state.qemu_icount_bias,
 509                        timers_state.qemu_icount_bias + warp);
 510         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 511                              &timers_state.vm_clock_lock);
 512
 513         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 514         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 515         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 516     }
 517     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 518 }
 519
 520 void qemu_start_warp_timer(void)
 521 {
 522     int64_t clock;
 523     int64_t deadline;
 524
 525     if (!use_icount) {
 526         return;
 527     }
 528
 529     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 530      * do not fire, so computing the deadline does not make sense.
 531      */
 532     if (!runstate_is_running()) {
 533         return;
 534     }
 535
 536     if (replay_mode != REPLAY_MODE_PLAY) {
 537         if (!all_cpu_threads_idle()) {
 538             return;
 539         }
 540
 541         if (qtest_enabled()) {
 542             /* When testing, qtest commands advance icount.  */
 543             return;
 544         }
 545
 546         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 547     } else {
 548         /* warp clock deterministically in record/replay mode */
 549         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 550             /* vCPU is sleeping and warp can't be started.
 551                It is probably a race condition: notification sent
 552                to vCPU was processed in advance and vCPU went to sleep.
 553                Therefore we have to wake it up for doing someting. */
 554             if (replay_has_checkpoint()) {
 555                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 556             }
 557             return;
 558         }
 559     }
 560
 561     /* We want to use the earliest deadline from ALL vm_clocks */
 562     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 563     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 564                                           ~QEMU_TIMER_ATTR_EXTERNAL);
 565     if (deadline < 0) {
 566         static bool notified;
 567         if (!icount_sleep && !notified) {
 568             warn_report("icount sleep disabled and no active timers");
 569             notified = true;
 570         }
 571         return;
 572     }
 573
 574     if (deadline > 0) {
 575         /*
 576          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 577          * sleep.  Otherwise, the CPU might be waiting for a future timer
 578          * interrupt to wake it up, but the interrupt never comes because
 579          * the vCPU isn't running any insns and thus doesn't advance the
 580          * QEMU_CLOCK_VIRTUAL.
 581          */
 582         if (!icount_sleep) {
 583             /*
 584              * We never let VCPUs sleep in no sleep icount mode.
 585              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 586              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 587              * It is useful when we want a deterministic execution time,
 588              * isolated from host latencies.
 589              */
 590             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 591                                &timers_state.vm_clock_lock);
 592             atomic_set_i64(&timers_state.qemu_icount_bias,
 593                            timers_state.qemu_icount_bias + deadline);
 594             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 595                                  &timers_state.vm_clock_lock);
 596             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 597         } else {
 598             /*
 599              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 600              * "real" time, (related to the time left until the next event) has
 601              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 602              * This avoids that the warps are visible externally; for example,
 603              * you will not be sending network packets continuously instead of
 604              * every 100ms.
 605              */
 606             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 607                                &timers_state.vm_clock_lock);
 608             if (timers_state.vm_clock_warp_start == -1
 609                 || timers_state.vm_clock_warp_start > clock) {
 610                 timers_state.vm_clock_warp_start = clock;
 611             }
 612             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 613                                  &timers_state.vm_clock_lock);
 614             timer_mod_anticipate(timers_state.icount_warp_timer,
 615                                  clock + deadline);
 616         }
 617     } else if (deadline == 0) {
 618         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 619     }
 620 }
 621
 622 static void qemu_account_warp_timer(void)
 623 {
 624     if (!use_icount || !icount_sleep) {
 625         return;
 626     }
 627
 628     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 629      * do not fire, so computing the deadline does not make sense.
 630      */
 631     if (!runstate_is_running()) {
 632         return;
 633     }
 634
 635     /* warp clock deterministically in record/replay mode */
 636     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 637         return;
 638     }
 639
 640     timer_del(timers_state.icount_warp_timer);
 641     icount_warp_rt();
 642 }
 643
 644 static bool icount_state_needed(void *opaque)
 645 {
 646     return use_icount;
 647 }
 648
 649 static bool warp_timer_state_needed(void *opaque)
 650 {
 651     TimersState *s = opaque;
 652     return s->icount_warp_timer != NULL;
 653 }
 654
 655 static bool adjust_timers_state_needed(void *opaque)
 656 {
 657     TimersState *s = opaque;
 658     return s->icount_rt_timer != NULL;
 659 }
 660
 661 static bool shift_state_needed(void *opaque)
 662 {
 663     return use_icount == 2;
 664 }
 665
/*
 * Subsection for warp timer migration.  It is optional because the warp
 * timer may not have been created: warp_timer_state_needed() enables it
 * only when icount_warp_timer is non-NULL.  Carries the warp start time
 * and the warp timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 680
/*
 * Subsection carrying the two icount adjustment timers.
 * adjust_timers_state_needed() enables it only when icount_rt_timer is
 * non-NULL.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 692
/*
 * Subsection carrying icount_time_shift.  shift_state_needed() enables
 * it only when use_icount is 2.
 */
static const VMStateDescription icount_vmstate_shift = {
    .name = "timer/icount/shift",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = shift_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT16(icount_time_shift, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 703
/*
 * This is a subsection for icount migration.  icount_state_needed()
 * enables it whenever use_icount is non-zero.  It carries the icount
 * bias and the instruction counter, and nests the warp timer, adjust
 * timers and shift subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        &icount_vmstate_shift,
        NULL
    }
};
 724
/*
 * Top-level migration description for TimersState; cpu_ticks_init()
 * registers it for timers_state.  The icount state travels in the
 * "timer/icount" subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* NOTE(review): presumably an 8-byte placeholder for a removed field - confirm. */
        VMSTATE_UNUSED(8),
        /* Tagged with version 2, unlike the other fields. */
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 740
 741 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 742 {
 743     double pct;
 744     double throttle_ratio;
 745     int64_t sleeptime_ns, endtime_ns;
 746
 747     if (!cpu_throttle_get_percentage()) {
 748         return;
 749     }
 750
 751     pct = (double)cpu_throttle_get_percentage()/100;
 752     throttle_ratio = pct / (1 - pct);
 753     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 754     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 755     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 756     while (sleeptime_ns > 0 && !cpu->stop) {
 757         if (sleeptime_ns > SCALE_MS) {
 758             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 759                                 sleeptime_ns / SCALE_MS);
 760         } else {
 761             qemu_mutex_unlock_iothread();
 762             g_usleep(sleeptime_ns / SCALE_US);
 763             qemu_mutex_lock_iothread();
 764         }
 765         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 766     }
 767     atomic_set(&cpu->throttle_thread_scheduled, 0);
 768 }
 769
 770 static void cpu_throttle_timer_tick(void *opaque)
 771 {
 772     CPUState *cpu;
 773     double pct;
 774
 775     /* Stop the timer if needed */
 776     if (!cpu_throttle_get_percentage()) {
 777         return;
 778     }
 779     CPU_FOREACH(cpu) {
 780         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 781             async_run_on_cpu(cpu, cpu_throttle_thread,
 782                              RUN_ON_CPU_NULL);
 783         }
 784     }
 785
 786     pct = (double)cpu_throttle_get_percentage()/100;
 787     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 788                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 789 }
 790
 791 void cpu_throttle_set(int new_throttle_pct)
 792 {
 793     /* Ensure throttle percentage is within valid range */
 794     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 795     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 796
 797     atomic_set(&throttle_percentage, new_throttle_pct);
 798
 799     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 800                                        CPU_THROTTLE_TIMESLICE_NS);
 801 }
 802
 803 void cpu_throttle_stop(void)
 804 {
 805     atomic_set(&throttle_percentage, 0);
 806 }
 807
 808 bool cpu_throttle_active(void)
 809 {
 810     return (cpu_throttle_get_percentage() != 0);
 811 }
 812
 813 int cpu_throttle_get_percentage(void)
 814 {
 815     return atomic_read(&throttle_percentage);
 816 }
 817
 818 void cpu_ticks_init(void)
 819 {
 820     seqlock_init(&timers_state.vm_clock_seqlock);
 821     qemu_spin_init(&timers_state.vm_clock_lock);
 822     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 823     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 824                                            cpu_throttle_timer_tick, NULL);
 825 }
 826
 827 void configure_icount(QemuOpts *opts, Error **errp)
 828 {
 829     const char *option = qemu_opt_get(opts, "shift");
 830     bool sleep = qemu_opt_get_bool(opts, "sleep", true);
 831     bool align = qemu_opt_get_bool(opts, "align", false);
 832     long time_shift = -1;
 833
 834     if (!option) {
 835         if (qemu_opt_get(opts, "align") != NULL) {
 836             error_setg(errp, "Please specify shift option when using align");
 837         }
 838         return;
 839     }
 840
 841     if (align && !sleep) {
 842         error_setg(errp, "align=on and sleep=off are incompatible");
 843         return;
 844     }
 845
 846     if (strcmp(option, "auto") != 0) {
 847         if (qemu_strtol(option, NULL, 0, &time_shift) < 0
 848             || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
 849             error_setg(errp, "icount: Invalid shift value");
 850             return;
 851         }
 852     } else if (icount_align_option) {
 853         error_setg(errp, "shift=auto and align=on are incompatible");
 854         return;
 855     } else if (!icount_sleep) {
 856         error_setg(errp, "shift=auto and sleep=off are incompatible");
 857         return;
 858     }
 859
 860     icount_sleep = sleep;
 861     if (icount_sleep) {
 862         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 863                                          icount_timer_cb, NULL);
 864     }
 865
 866     icount_align_option = align;
 867
 868     if (time_shift >= 0) {
 869         timers_state.icount_time_shift = time_shift;
 870         use_icount = 1;
 871         return;
 872     }
 873
 874     use_icount = 2;
 875
 876     /* 125MIPS seems a reasonable initial guess at the guest speed.
 877        It will be corrected fairly quickly anyway.  */
 878     timers_state.icount_time_shift = 3;
 879
 880     /* Have both realtime and virtual time triggers for speed adjustment.
 881        The realtime trigger catches emulated time passing too slowly,
 882        the virtual time trigger catches emulated time passing too fast.
 883        Realtime triggers occur even when idle, so use them less frequently
 884        than VM triggers.  */
 885     timers_state.vm_clock_warp_start = -1;
 886     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 887                                    icount_adjust_rt, NULL);
 888     timer_mod(timers_state.icount_rt_timer,
 889                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 890     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 891                                         icount_adjust_vm, NULL);
 892     timer_mod(timers_state.icount_vm_timer,
 893                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 894                    NANOSECONDS_PER_SECOND / 10);
 895 }
 896
 897 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
 908
/* Kick timer; created on demand by start_tcg_kick_timer(). */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU the round-robin thread is currently executing, or NULL.
 * Written by qemu_tcg_rr_cpu_thread_fn(), read by
 * qemu_cpu_kick_rr_next_cpu(). */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: one tenth of NANOSECONDS_PER_SECOND. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 913
 914 static inline int64_t qemu_tcg_next_kick(void)
 915 {
 916     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 917 }
 918
 919 /* Kick the currently round-robin scheduled vCPU to next */
 920 static void qemu_cpu_kick_rr_next_cpu(void)
 921 {
 922     CPUState *cpu;
 923     do {
 924         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 925         if (cpu) {
 926             cpu_exit(cpu);
 927         }
 928     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 929 }
 930
 931 /* Kick all RR vCPUs */
 932 static void qemu_cpu_kick_rr_cpus(void)
 933 {
 934     CPUState *cpu;
 935
 936     CPU_FOREACH(cpu) {
 937         cpu_exit(cpu);
 938     };
 939 }
 940
/* No-op work item.  qemu_timer_notify_cb() queues it with
 * async_run_on_cpu() purely for the side effect of having work pending. */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 944
 945 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 946 {
 947     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 948         qemu_notify_event();
 949         return;
 950     }
 951
 952     if (qemu_in_vcpu_thread()) {
 953         /* A CPU is currently running; kick it back out to the
 954          * tcg_cpu_exec() loop so it will recalculate its
 955          * icount deadline immediately.
 956          */
 957         qemu_cpu_kick(current_cpu);
 958     } else if (first_cpu) {
 959         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 960          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 961          * causes cpu_thread_is_idle to return false.  This way,
 962          * handle_icount_deadline can run.
 963          * If we have no CPUs at all for some reason, we don't
 964          * need to do anything.
 965          */
 966         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 967     }
 968 }
 969
 970 static void kick_tcg_thread(void *opaque)
 971 {
 972     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 973     qemu_cpu_kick_rr_next_cpu();
 974 }
 975
 976 static void start_tcg_kick_timer(void)
 977 {
 978     assert(!mttcg_enabled);
 979     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 980         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 981                                            kick_tcg_thread, NULL);
 982     }
 983     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 984         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 985     }
 986 }
 987
 988 static void stop_tcg_kick_timer(void)
 989 {
 990     assert(!mttcg_enabled);
 991     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 992         timer_del(tcg_kick_vcpu_timer);
 993     }
 994 }
 995
 996 /***********************************************************/
 997 void hw_error(const char *fmt, ...)
 998 {
 999     va_list ap;
1000     CPUState *cpu;
1001
1002     va_start(ap, fmt);
1003     fprintf(stderr, "qemu: hardware error: ");
1004     vfprintf(stderr, fmt, ap);
1005     fprintf(stderr, "\n");
1006     CPU_FOREACH(cpu) {
1007         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1008         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1009     }
1010     va_end(ap);
1011     abort();
1012 }
1013
1014 void cpu_synchronize_all_states(void)
1015 {
1016     CPUState *cpu;
1017
1018     CPU_FOREACH(cpu) {
1019         cpu_synchronize_state(cpu);
1020     }
1021 }
1022
1023 void cpu_synchronize_all_post_reset(void)
1024 {
1025     CPUState *cpu;
1026
1027     CPU_FOREACH(cpu) {
1028         cpu_synchronize_post_reset(cpu);
1029     }
1030 }
1031
1032 void cpu_synchronize_all_post_init(void)
1033 {
1034     CPUState *cpu;
1035
1036     CPU_FOREACH(cpu) {
1037         cpu_synchronize_post_init(cpu);
1038     }
1039 }
1040
1041 void cpu_synchronize_all_pre_loadvm(void)
1042 {
1043     CPUState *cpu;
1044
1045     CPU_FOREACH(cpu) {
1046         cpu_synchronize_pre_loadvm(cpu);
1047     }
1048 }
1049
1050 static int do_vm_stop(RunState state, bool send_stop)
1051 {
1052     int ret = 0;
1053
1054     if (runstate_is_running()) {
1055         runstate_set(state);
1056         cpu_disable_ticks();
1057         pause_all_vcpus();
1058         vm_state_notify(0, state);
1059         if (send_stop) {
1060             qapi_event_send_stop();
1061         }
1062     }
1063
1064     bdrv_drain_all();
1065     ret = bdrv_flush_all();
1066
1067     return ret;
1068 }
1069
1070 /* Special vm_stop() variant for terminating the process.  Historically clients
1071  * did not expect a QMP STOP event and so we need to retain compatibility.
1072  */
1073 int vm_shutdown(void)
1074 {
1075     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1076 }
1077
1078 static bool cpu_can_run(CPUState *cpu)
1079 {
1080     if (cpu->stop) {
1081         return false;
1082     }
1083     if (cpu_is_stopped(cpu)) {
1084         return false;
1085     }
1086     return true;
1087 }
1088
/* Handle an EXCP_DEBUG exit from @cpu: record it as the CPU the gdbstub
 * stopped on, raise a system debug request, and mark the vCPU stopped. */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1095
1096 #ifdef CONFIG_LINUX
1097 static void sigbus_reraise(void)
1098 {
1099     sigset_t set;
1100     struct sigaction action;
1101
1102     memset(&action, 0, sizeof(action));
1103     action.sa_handler = SIG_DFL;
1104     if (!sigaction(SIGBUS, &action, NULL)) {
1105         raise(SIGBUS);
1106         sigemptyset(&set);
1107         sigaddset(&set, SIGBUS);
1108         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1109     }
1110     perror("Failed to re-raise SIGBUS!\n");
1111     abort();
1112 }
1113
1114 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1115 {
1116     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1117         sigbus_reraise();
1118     }
1119
1120     if (current_cpu) {
1121         /* Called asynchronously in VCPU thread.  */
1122         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1123             sigbus_reraise();
1124         }
1125     } else {
1126         /* Called synchronously (via signalfd) in main thread.  */
1127         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1128             sigbus_reraise();
1129         }
1130     }
1131 }
1132
1133 static void qemu_init_sigbus(void)
1134 {
1135     struct sigaction action;
1136
1137     memset(&action, 0, sizeof(action));
1138     action.sa_flags = SA_SIGINFO;
1139     action.sa_sigaction = sigbus_handler;
1140     sigaction(SIGBUS, &action, NULL);
1141
1142     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1143 }
1144 #else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handling is set up: intentionally empty. */
static void qemu_init_sigbus(void)
{
}
1148 #endif /* !CONFIG_LINUX */
1149
/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by the vCPU threads when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* vCPU pausing: broadcast by qemu_cpu_stop(), waited on by
 * pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1156
1157 void qemu_init_cpu_loop(void)
1158 {
1159     qemu_init_sigbus();
1160     qemu_cond_init(&qemu_cpu_cond);
1161     qemu_cond_init(&qemu_pause_cond);
1162     qemu_mutex_init(&qemu_global_mutex);
1163
1164     qemu_thread_get_self(&io_thread);
1165 }
1166
1167 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1168 {
1169     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1170 }
1171
1172 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1173 {
1174     if (kvm_destroy_vcpu(cpu) < 0) {
1175         error_report("kvm_destroy_vcpu failed");
1176         exit(EXIT_FAILURE);
1177     }
1178 }
1179
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently has nothing to do. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1183
/*
 * Mark @cpu as stopped.  Must be called from @cpu's own thread (asserted).
 *
 * @exit: when true, also call cpu_exit() on the vCPU.
 *
 * Clears the pending stop request, sets cpu->stopped and wakes every waiter
 * on qemu_pause_cond (pause_all_vcpus() waits there).
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1194
/* Common tail of the wait-for-event paths: clear the thread_kicked flag,
 * honour a pending stop request, then run the work queued for @cpu. */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1203
1204 static void qemu_tcg_rr_wait_io_event(void)
1205 {
1206     CPUState *cpu;
1207
1208     while (all_cpu_threads_idle()) {
1209         stop_tcg_kick_timer();
1210         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1211     }
1212
1213     start_tcg_kick_timer();
1214
1215     CPU_FOREACH(cpu) {
1216         qemu_wait_io_event_common(cpu);
1217     }
1218 }
1219
/*
 * Block @cpu's thread on its halt condition for as long as
 * cpu_thread_is_idle() holds, then run the common wait-event processing.
 *
 * The plugin idle callback fires once, before the first wait; the resume
 * callback fires once afterwards, and only if the thread actually waited.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    /* true once the idle callback has been issued */
    bool slept = false;

    while (cpu_thread_is_idle(cpu)) {
        if (!slept) {
            slept = true;
            qemu_plugin_vcpu_idle_cb(cpu);
        }
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }
    if (slept) {
        qemu_plugin_vcpu_resume_cb(cpu);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1243
/*
 * Thread entry point for a KVM vCPU.
 *
 * @arg: the CPUState this thread runs.
 *
 * Takes the iothread lock, initialises the vCPU (exiting the process if
 * kvm_init_vcpu() fails), announces creation on qemu_cpu_cond, then
 * alternates kvm_cpu_exec() with qemu_wait_io_event() until the vCPU has
 * been flagged for unplug and can no longer run.  On the way out the vCPU is
 * destroyed, cpu->created is cleared and qemu_cpu_cond is signalled again.
 *
 * Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* teardown: the iothread lock is still held here */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1287
1288 static void *qemu_dummy_cpu_thread_fn(void *arg)
1289 {
1290 #ifdef _WIN32
1291     error_report("qtest is not supported under Windows");
1292     exit(1);
1293 #else
1294     CPUState *cpu = arg;
1295     sigset_t waitset;
1296     int r;
1297
1298     rcu_register_thread();
1299
1300     qemu_mutex_lock_iothread();
1301     qemu_thread_get_self(cpu->thread);
1302     cpu->thread_id = qemu_get_thread_id();
1303     cpu->can_do_io = 1;
1304     current_cpu = cpu;
1305
1306     sigemptyset(&waitset);
1307     sigaddset(&waitset, SIG_IPI);
1308
1309     /* signal CPU creation */
1310     cpu->created = true;
1311     qemu_cond_signal(&qemu_cpu_cond);
1312     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1313
1314     do {
1315         qemu_mutex_unlock_iothread();
1316         do {
1317             int sig;
1318             r = sigwait(&waitset, &sig);
1319         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1320         if (r == -1) {
1321             perror("sigwait");
1322             exit(1);
1323         }
1324         qemu_mutex_lock_iothread();
1325         qemu_wait_io_event(cpu);
1326     } while (!cpu->unplug);
1327
1328     qemu_mutex_unlock_iothread();
1329     rcu_unregister_thread();
1330     return NULL;
1331 #endif
1332 }
1333
1334 static int64_t tcg_get_icount_limit(void)
1335 {
1336     int64_t deadline;
1337
1338     if (replay_mode != REPLAY_MODE_PLAY) {
1339         /*
1340          * Include all the timers, because they may need an attention.
1341          * Too long CPU execution may create unnecessary delay in UI.
1342          */
1343         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1344                                               QEMU_TIMER_ATTR_ALL);
1345         /* Check realtime timers, because they help with input processing */
1346         deadline = qemu_soonest_timeout(deadline,
1347                 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1348                                            QEMU_TIMER_ATTR_ALL));
1349
1350         /* Maintain prior (possibly buggy) behaviour where if no deadline
1351          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1352          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1353          * nanoseconds.
1354          */
1355         if ((deadline < 0) || (deadline > INT32_MAX)) {
1356             deadline = INT32_MAX;
1357         }
1358
1359         return qemu_icount_round(deadline);
1360     } else {
1361         return replay_get_instructions();
1362     }
1363 }
1364
/* Notify users of QEMU_CLOCK_VIRTUAL and then run that clock's timers from
 * the calling thread. */
static void notify_aio_contexts(void)
{
    /* Wake up other AioContexts.  */
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
}
1371
1372 static void handle_icount_deadline(void)
1373 {
1374     assert(qemu_in_vcpu_thread());
1375     if (use_icount) {
1376         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1377                                                       QEMU_TIMER_ATTR_ALL);
1378
1379         if (deadline == 0) {
1380             notify_aio_contexts();
1381         }
1382     }
1383 }
1384
1385 static void prepare_icount_for_run(CPUState *cpu)
1386 {
1387     if (use_icount) {
1388         int insns_left;
1389
1390         /* These should always be cleared by process_icount_data after
1391          * each vCPU execution. However u16.high can be raised
1392          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1393          */
1394         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1395         g_assert(cpu->icount_extra == 0);
1396
1397         cpu->icount_budget = tcg_get_icount_limit();
1398         insns_left = MIN(0xffff, cpu->icount_budget);
1399         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1400         cpu->icount_extra = cpu->icount_budget - insns_left;
1401
1402         replay_mutex_lock();
1403
1404         if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
1405             notify_aio_contexts();
1406         }
1407     }
1408 }
1409
1410 static void process_icount_data(CPUState *cpu)
1411 {
1412     if (use_icount) {
1413         /* Account for executed instructions */
1414         cpu_update_icount(cpu);
1415
1416         /* Reset the counters */
1417         cpu_neg(cpu)->icount_decr.u16.low = 0;
1418         cpu->icount_extra = 0;
1419         cpu->icount_budget = 0;
1420
1421         replay_account_executed_instructions();
1422
1423         replay_mutex_unlock();
1424     }
1425 }
1426
1427
/*
 * Execute guest code on @cpu with TCG, bracketed by cpu_exec_start() and
 * cpu_exec_end().  With CONFIG_PROFILER the time spent is added to
 * tcg_ctx->prof.cpu_exec_time.
 *
 * Returns the value returned by cpu_exec().
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    atomic_set(&tcg_ctx->prof.cpu_exec_time,
               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}
1448
1449 /* Destroy any remaining vCPUs which have been unplugged and have
1450  * finished running
1451  */
1452 static void deal_with_unplugged_cpus(void)
1453 {
1454     CPUState *cpu;
1455
1456     CPU_FOREACH(cpu) {
1457         if (cpu->unplug && !cpu_can_run(cpu)) {
1458             qemu_tcg_destroy_vcpu(cpu);
1459             cpu->created = false;
1460             qemu_cond_signal(&qemu_cpu_cond);
1461             break;
1462         }
1463     }
1464 }
1465
1466 /* Single-threaded TCG
1467  *
1468  * In the single-threaded case each vCPU is simulated in turn. If
1469  * there is more than a single vCPU we create a simple timer to kick
1470  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1471  * This is done explicitly rather than relying on side-effects
1472  * elsewhere.
1473  */
1474
/*
 * Thread entry point for single-threaded (round-robin) TCG.
 *
 * @arg: the CPUState whose creation started this thread.
 *
 * One thread executes every vCPU in turn.  After the start-up handshake the
 * local variable `cpu` no longer refers to @arg; it is the cursor into the
 * CPU list.  Each outer iteration handles the icount deadline, runs vCPUs
 * until one has queued work, an exit request, a debug/atomic exception or a
 * stop request, and then waits for events and reaps unplugged vCPUs.
 *
 * The outer loop has no exit, so the statements after it are not reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* The iothread lock is dropped and re-taken around
         * replay_mutex_lock(), so the replay mutex is acquired first. */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* wrap around to the head of the CPU list */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* guest code runs without the iothread lock held */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* a vCPU being unplugged is skipped before leaving the
                 * inner loop */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    /* not reached: the loop above never terminates */
    rcu_unregister_thread();
    return NULL;
}
1588
1589 static void *qemu_hax_cpu_thread_fn(void *arg)
1590 {
1591     CPUState *cpu = arg;
1592     int r;
1593
1594     rcu_register_thread();
1595     qemu_mutex_lock_iothread();
1596     qemu_thread_get_self(cpu->thread);
1597
1598     cpu->thread_id = qemu_get_thread_id();
1599     cpu->created = true;
1600     current_cpu = cpu;
1601
1602     hax_init_vcpu(cpu);
1603     qemu_cond_signal(&qemu_cpu_cond);
1604     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1605
1606     do {
1607         if (cpu_can_run(cpu)) {
1608             r = hax_smp_cpu_exec(cpu);
1609             if (r == EXCP_DEBUG) {
1610                 cpu_handle_guest_debug(cpu);
1611             }
1612         }
1613
1614         qemu_wait_io_event(cpu);
1615     } while (!cpu->unplug || cpu_can_run(cpu));
1616     rcu_unregister_thread();
1617     return NULL;
1618 }
1619
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg is the CPUState this thread runs.  The thread initialises the vCPU,
 * announces creation on qemu_cpu_cond, then alternates hvf_vcpu_exec() with
 * qemu_wait_io_event() until the vCPU has been flagged for unplug and can no
 * longer run; it then destroys the vCPU and signals qemu_cpu_cond again.
 * Always returns NULL. */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* teardown: the iothread lock is still held here */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1663
/*
 * Thread entry point for a WHPX vCPU.
 *
 * @arg: the CPUState this thread runs.
 *
 * Initialises the vCPU (exiting the process if whpx_init_vcpu() fails),
 * announces creation on qemu_cpu_cond, then alternates whpx_vcpu_exec() with
 * waiting for events until the vCPU has been flagged for unplug and can no
 * longer run.  Unlike the other accelerators this loop waits on the halt
 * condition itself and calls qemu_wait_io_event_common() directly rather
 * than going through qemu_wait_io_event().
 *
 * Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* teardown: the iothread lock is still held here */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1707
#ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it with QueueUserAPC()
 * to kick a vCPU thread on Windows. */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif
1713
1714 /* Multi-threaded TCG
1715  *
1716  * In the multi-threaded case each vCPU has its own thread. The TLS
1717  * variable current_cpu can be used deep in the code to find the
1718  * current CPUState for a given thread.
1719  */
1720
/*
 * Thread entry point for one vCPU under multi-threaded TCG.
 *
 * @arg: the CPUState this thread runs.
 *
 * icount must be disabled (asserted).  After announcing creation the thread
 * alternates tcg_cpu_exec(), run without the iothread lock, with
 * qemu_wait_io_event() until the vCPU has been flagged for unplug and can no
 * longer run; it then tears the vCPU down and signals qemu_cpu_cond.
 *
 * Always returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* teardown: the iothread lock is still held here */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1785
1786 static void qemu_cpu_kick_thread(CPUState *cpu)
1787 {
1788 #ifndef _WIN32
1789     int err;
1790
1791     if (cpu->thread_kicked) {
1792         return;
1793     }
1794     cpu->thread_kicked = true;
1795     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1796     if (err && err != ESRCH) {
1797         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1798         exit(1);
1799     }
1800 #else /* _WIN32 */
1801     if (!qemu_cpu_is_self(cpu)) {
1802         if (whpx_enabled()) {
1803             whpx_vcpu_kick(cpu);
1804         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1805             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1806                     __func__, GetLastError());
1807             exit(1);
1808         }
1809     }
1810 #endif
1811 }
1812
1813 void qemu_cpu_kick(CPUState *cpu)
1814 {
1815     qemu_cond_broadcast(cpu->halt_cond);
1816     if (tcg_enabled()) {
1817         if (qemu_tcg_mttcg_enabled()) {
1818             cpu_exit(cpu);
1819         } else {
1820             qemu_cpu_kick_rr_cpus();
1821         }
1822     } else {
1823         if (hax_enabled()) {
1824             /*
1825              * FIXME: race condition with the exit_request check in
1826              * hax_vcpu_hax_exec
1827              */
1828             cpu->exit_request = 1;
1829         }
1830         qemu_cpu_kick_thread(cpu);
1831     }
1832 }
1833
1834 void qemu_cpu_kick_self(void)
1835 {
1836     assert(current_cpu);
1837     qemu_cpu_kick_thread(current_cpu);
1838 }
1839
1840 bool qemu_cpu_is_self(CPUState *cpu)
1841 {
1842     return qemu_thread_is_self(cpu->thread);
1843 }
1844
1845 bool qemu_in_vcpu_thread(void)
1846 {
1847     return current_cpu && qemu_cpu_is_self(current_cpu);
1848 }
1849
/* Per-thread flag: set by qemu_mutex_lock_iothread_impl(), cleared by
 * qemu_mutex_unlock_iothread(). */
static __thread bool iothread_locked = false;
1851
1852 bool qemu_mutex_iothread_locked(void)
1853 {
1854     return iothread_locked;
1855 }
1856
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * @file, @line: call site, forwarded to the lock function.
 *
 * The calling thread must not already hold the lock (asserted).  The lock
 * function is read from qemu_bql_mutex_lock_func on every call.
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1869
/* Release the iothread lock.  The calling thread must hold it (asserted);
 * the per-thread flag is cleared before the mutex is released. */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1876
1877 void qemu_cond_wait_iothread(QemuCond *cond)
1878 {
1879     qemu_cond_wait(cond, &qemu_global_mutex);
1880 }
1881
1882 static bool all_vcpus_paused(void)
1883 {
1884     CPUState *cpu;
1885
1886     CPU_FOREACH(cpu) {
1887         if (!cpu->stopped) {
1888             return false;
1889         }
1890     }
1891
1892     return true;
1893 }
1894
/*
 * Stop every vCPU and wait until all of them report being stopped.
 *
 * The virtual clock is disabled first.  A vCPU whose thread is the caller
 * is stopped directly; every other vCPU gets a stop request and a kick.
 * The function then sleeps on qemu_pause_cond, re-kicking all vCPUs after
 * each wakeup, until all_vcpus_paused() holds.
 *
 * The replay mutex is released while waiting and re-acquired before
 * returning; the iothread lock is dropped and re-taken around that
 * re-acquisition.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* re-take the replay mutex before the iothread lock */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1925
/* Clear @cpu's stop request and stopped state, then kick it. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1932
1933 void resume_all_vcpus(void)
1934 {
1935     CPUState *cpu;
1936
1937     if (!runstate_is_running()) {
1938         return;
1939     }
1940
1941     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1942     CPU_FOREACH(cpu) {
1943         cpu_resume(cpu);
1944     }
1945 }
1946
/*
 * Request removal of @cpu and wait for its thread to finish.
 *
 * Sets the stop and unplug flags, kicks the vCPU, then joins its thread.
 * The iothread lock is released for the duration of the join and re-taken
 * afterwards.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1956
/* Size, in bytes, of the temporary buffers used for forming a vCPU thread
 * name (including the terminating NUL written by snprintf()). */
#define VCPU_THREAD_NAME_SIZE 16
1959
/*
 * Set up the host thread for a TCG vCPU.
 *
 * With MTTCG every vCPU gets its own thread and halt condition.  Without
 * it, only the first call creates a thread ("ALL CPUs/TCG"); later calls
 * attach the vCPU to that shared thread and halt condition and mark it
 * created straight away.  TCG regions are initialised on the first call.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    /* state shared by all vCPUs in the single-threaded case */
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* remember them so later vCPUs can share them */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        cpu->created = true;
    }
}
2015
2016 static void qemu_hax_start_vcpu(CPUState *cpu)
2017 {
2018     char thread_name[VCPU_THREAD_NAME_SIZE];
2019
2020     cpu->thread = g_malloc0(sizeof(QemuThread));
2021     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2022     qemu_cond_init(cpu->halt_cond);
2023
2024     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2025              cpu->cpu_index);
2026     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2027                        cpu, QEMU_THREAD_JOINABLE);
2028 #ifdef _WIN32
2029     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2030 #endif
2031 }
2032
2033 static void qemu_kvm_start_vcpu(CPUState *cpu)
2034 {
2035     char thread_name[VCPU_THREAD_NAME_SIZE];
2036
2037     cpu->thread = g_malloc0(sizeof(QemuThread));
2038     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2039     qemu_cond_init(cpu->halt_cond);
2040     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2041              cpu->cpu_index);
2042     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2043                        cpu, QEMU_THREAD_JOINABLE);
2044 }
2045
2046 static void qemu_hvf_start_vcpu(CPUState *cpu)
2047 {
2048     char thread_name[VCPU_THREAD_NAME_SIZE];
2049
2050     /* HVF currently does not support TCG, and only runs in
2051      * unrestricted-guest mode. */
2052     assert(hvf_enabled());
2053
2054     cpu->thread = g_malloc0(sizeof(QemuThread));
2055     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2056     qemu_cond_init(cpu->halt_cond);
2057
2058     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2059              cpu->cpu_index);
2060     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2061                        cpu, QEMU_THREAD_JOINABLE);
2062 }
2063
2064 static void qemu_whpx_start_vcpu(CPUState *cpu)
2065 {
2066     char thread_name[VCPU_THREAD_NAME_SIZE];
2067
2068     cpu->thread = g_malloc0(sizeof(QemuThread));
2069     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2070     qemu_cond_init(cpu->halt_cond);
2071     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2072              cpu->cpu_index);
2073     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2074                        cpu, QEMU_THREAD_JOINABLE);
2075 #ifdef _WIN32
2076     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2077 #endif
2078 }
2079
2080 static void qemu_dummy_start_vcpu(CPUState *cpu)
2081 {
2082     char thread_name[VCPU_THREAD_NAME_SIZE];
2083
2084     cpu->thread = g_malloc0(sizeof(QemuThread));
2085     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2086     qemu_cond_init(cpu->halt_cond);
2087     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2088              cpu->cpu_index);
2089     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2090                        QEMU_THREAD_JOINABLE);
2091 }
2092
2093 void qemu_init_vcpu(CPUState *cpu)
2094 {
2095     MachineState *ms = MACHINE(qdev_get_machine());
2096
2097     cpu->nr_cores = ms->smp.cores;
2098     cpu->nr_threads =  ms->smp.threads;
2099     cpu->stopped = true;
2100     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2101
2102     if (!cpu->as) {
2103         /* If the target cpu hasn't set up any address spaces itself,
2104          * give it the default one.
2105          */
2106         cpu->num_ases = 1;
2107         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2108     }
2109
2110     if (kvm_enabled()) {
2111         qemu_kvm_start_vcpu(cpu);
2112     } else if (hax_enabled()) {
2113         qemu_hax_start_vcpu(cpu);
2114     } else if (hvf_enabled()) {
2115         qemu_hvf_start_vcpu(cpu);
2116     } else if (tcg_enabled()) {
2117         qemu_tcg_init_vcpu(cpu);
2118     } else if (whpx_enabled()) {
2119         qemu_whpx_start_vcpu(cpu);
2120     } else {
2121         qemu_dummy_start_vcpu(cpu);
2122     }
2123
2124     while (!cpu->created) {
2125         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2126     }
2127 }
2128
2129 void cpu_stop_current(void)
2130 {
2131     if (current_cpu) {
2132         current_cpu->stop = true;
2133         cpu_exit(current_cpu);
2134     }
2135 }
2136
2137 int vm_stop(RunState state)
2138 {
2139     if (qemu_in_vcpu_thread()) {
2140         qemu_system_vmstop_request_prepare();
2141         qemu_system_vmstop_request(state);
2142         /*
2143          * FIXME: should not return to device code in case
2144          * vm_stop() has been requested.
2145          */
2146         cpu_stop_current();
2147         return 0;
2148     }
2149
2150     return do_vm_stop(state, true);
2151 }
2152
2153 /**
2154  * Prepare for (re)starting the VM.
2155  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2156  * running or in case of an error condition), 0 otherwise.
2157  */
2158 int vm_prepare_start(void)
2159 {
2160     RunState requested;
2161
2162     qemu_vmstop_requested(&requested);
2163     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2164         return -1;
2165     }
2166
2167     /* Ensure that a STOP/RESUME pair of events is emitted if a
2168      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2169      * example, according to documentation is always followed by
2170      * the STOP event.
2171      */
2172     if (runstate_is_running()) {
2173         qapi_event_send_stop();
2174         qapi_event_send_resume();
2175         return -1;
2176     }
2177
2178     /* We are sending this now, but the CPUs will be resumed shortly later */
2179     qapi_event_send_resume();
2180
2181     cpu_enable_ticks();
2182     runstate_set(RUN_STATE_RUNNING);
2183     vm_state_notify(1, RUN_STATE_RUNNING);
2184     return 0;
2185 }
2186
/*
 * Start (or restart) the VM: the vCPUs are resumed only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2193
2194 /* does a state transition even if the VM is already stopped,
2195    current state is forgotten forever */
2196 int vm_stop_force_state(RunState state)
2197 {
2198     if (runstate_is_running()) {
2199         return vm_stop(state);
2200     } else {
2201         runstate_set(state);
2202
2203         bdrv_drain_all();
2204         /* Make sure to return an error if the flush in a previous vm_stop()
2205          * failed. */
2206         return bdrv_flush_all();
2207     }
2208 }
2209
/*
 * List the available CPU models by invoking the target's cpu_list hook.
 * Targets that do not define cpu_list print nothing.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
#ifdef cpu_list
    cpu_list();
#endif
    /* XXX: implement xxx_cpu_list for targets that still miss it */
}
2217
2218 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2219                  bool has_cpu, int64_t cpu_index, Error **errp)
2220 {
2221     FILE *f;
2222     uint32_t l;
2223     CPUState *cpu;
2224     uint8_t buf[1024];
2225     int64_t orig_addr = addr, orig_size = size;
2226
2227     if (!has_cpu) {
2228         cpu_index = 0;
2229     }
2230
2231     cpu = qemu_get_cpu(cpu_index);
2232     if (cpu == NULL) {
2233         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2234                    "a CPU number");
2235         return;
2236     }
2237
2238     f = fopen(filename, "wb");
2239     if (!f) {
2240         error_setg_file_open(errp, errno, filename);
2241         return;
2242     }
2243
2244     while (size != 0) {
2245         l = sizeof(buf);
2246         if (l > size)
2247             l = size;
2248         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2249             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2250                              " specified", orig_addr, orig_size);
2251             goto exit;
2252         }
2253         if (fwrite(buf, 1, l, f) != l) {
2254             error_setg(errp, QERR_IO_ERROR);
2255             goto exit;
2256         }
2257         addr += l;
2258         size -= l;
2259     }
2260
2261 exit:
2262     fclose(f);
2263 }
2264
2265 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2266                   Error **errp)
2267 {
2268     FILE *f;
2269     uint32_t l;
2270     uint8_t buf[1024];
2271
2272     f = fopen(filename, "wb");
2273     if (!f) {
2274         error_setg_file_open(errp, errno, filename);
2275         return;
2276     }
2277
2278     while (size != 0) {
2279         l = sizeof(buf);
2280         if (l > size)
2281             l = size;
2282         cpu_physical_memory_read(addr, buf, l);
2283         if (fwrite(buf, 1, l, f) != l) {
2284             error_setg(errp, QERR_IO_ERROR);
2285             goto exit;
2286         }
2287         addr += l;
2288         size -= l;
2289     }
2290
2291 exit:
2292     fclose(f);
2293 }
2294
2295 void qmp_inject_nmi(Error **errp)
2296 {
2297     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2298 }
2299
2300 void dump_drift_info(void)
2301 {
2302     if (!use_icount) {
2303         return;
2304     }
2305
2306     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2307                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2308     if (icount_align_option) {
2309         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2310                     -max_delay / SCALE_MS);
2311         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2312                     max_advance / SCALE_MS);
2313     } else {
2314         qemu_printf("Max guest delay     NA\n");
2315         qemu_printf("Max guest advance   NA\n");
2316     }
2317 }