[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
55 #include "tcg.h"
56 #include "hw/nmi.h"
57 #include "sysemu/replay.h"
58 #include "sysemu/runstate.h"
59 #include "hw/boards.h"
60 #include "hw/hw.h"
61
62 #ifdef CONFIG_LINUX
63
64 #include <sys/prctl.h>
65
66 #ifndef PR_MCE_KILL
67 #define PR_MCE_KILL 33
68 #endif
69
70 #ifndef PR_MCE_KILL_SET
71 #define PR_MCE_KILL_SET 1
72 #endif
73
74 #ifndef PR_MCE_KILL_EARLY
75 #define PR_MCE_KILL_EARLY 1
76 #endif
77
78 #endif /* CONFIG_LINUX */
79
80 int64_t max_delay;
81 int64_t max_advance;
82
83 /* vcpu throttling controls */
84 static QEMUTimer *throttle_timer;
85 static unsigned int throttle_percentage;
86
87 #define CPU_THROTTLE_PCT_MIN 1
88 #define CPU_THROTTLE_PCT_MAX 99
89 #define CPU_THROTTLE_TIMESLICE_NS 10000000
90
91 bool cpu_is_stopped(CPUState *cpu)
92 {
93     return cpu->stopped || !runstate_is_running();
94 }
95
96 static bool cpu_thread_is_idle(CPUState *cpu)
97 {
98     if (cpu->stop || cpu->queued_work_first) {
99         return false;
100     }
101     if (cpu_is_stopped(cpu)) {
102         return true;
103     }
104     if (!cpu->halted || cpu_has_work(cpu) ||
105         kvm_halt_in_kernel()) {
106         return false;
107     }
108     return true;
109 }
110
111 static bool all_cpu_threads_idle(void)
112 {
113     CPUState *cpu;
114
115     CPU_FOREACH(cpu) {
116         if (!cpu_thread_is_idle(cpu)) {
117             return false;
118         }
119     }
120     return true;
121 }
122
123 /***********************************************************/
124 /* guest cycle counter */
125
126 /* Protected by TimersState seqlock */
127
128 static bool icount_sleep = true;
129 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
130 #define MAX_ICOUNT_SHIFT 10
131
132 typedef struct TimersState {
133     /* Protected by BQL.  */
134     int64_t cpu_ticks_prev;
135     int64_t cpu_ticks_offset;
136
137     /* Protect fields that can be read outside the BQL and written
138      * from multiple threads.
139      */
140     QemuSeqLock vm_clock_seqlock;
141     QemuSpin vm_clock_lock;
142
143     int16_t cpu_ticks_enabled;
144
145     /* Conversion factor from emulated instructions to virtual clock ticks.  */
146     int16_t icount_time_shift;
147
148     /* Compensate for varying guest execution speed.  */
149     int64_t qemu_icount_bias;
150
151     int64_t vm_clock_warp_start;
152     int64_t cpu_clock_offset;
153
154     /* Only written by TCG thread */
155     int64_t qemu_icount;
156
157     /* for adjusting icount */
158     QEMUTimer *icount_rt_timer;
159     QEMUTimer *icount_vm_timer;
160     QEMUTimer *icount_warp_timer;
161 } TimersState;
162
163 static TimersState timers_state;
164 bool mttcg_enabled;
165
166 /*
167  * We default to false if we know other options have been enabled
168  * which are currently incompatible with MTTCG. Otherwise, once a
169  * guest (target) has been updated to support:
170  *   - atomic instructions
171  *   - memory ordering primitives (barriers)
172  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak.
173  *
174  * Once a guest architecture has been converted to the new primitives
175  * there are two remaining limitations to check:
176  *
177  * - the guest can't be oversized (e.g. a 64-bit guest on a 32-bit host)
178  * - the host must have a memory order at least as strong as the guest's
179  *
180  * It may be possible in future to support strong guests on weak hosts,
181  * but that would require tagging every guest load/store with its
182  * implicit memory ordering requirements, which would likely slow things
183  * down a lot.
184  */
185
186 static bool check_tcg_memory_orders_compatible(void)
187 {
188 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
189     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
190 #else
191     return false;
192 #endif
193 }
194
195 static bool default_mttcg_enabled(void)
196 {
197     if (use_icount || TCG_OVERSIZED_GUEST) {
198         return false;
199     } else {
200 #ifdef TARGET_SUPPORTS_MTTCG
201         return check_tcg_memory_orders_compatible();
202 #else
203         return false;
204 #endif
205     }
206 }
207
208 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
209 {
210     const char *t = qemu_opt_get(opts, "thread");
211     if (t) {
212         if (strcmp(t, "multi") == 0) {
213             if (TCG_OVERSIZED_GUEST) {
214                 error_setg(errp, "No MTTCG when guest word size > host's");
215             } else if (use_icount) {
216                 error_setg(errp, "No MTTCG when icount is enabled");
217             } else {
218 #ifndef TARGET_SUPPORTS_MTTCG
219                 warn_report("Guest not yet converted to MTTCG - "
220                             "you may get unexpected results");
221 #endif
222                 if (!check_tcg_memory_orders_compatible()) {
223                     warn_report("Guest expects a stronger memory ordering "
224                                 "than the host provides");
225                     error_printf("This may cause strange/hard to debug errors\n");
226                 }
227                 mttcg_enabled = true;
228             }
229         } else if (strcmp(t, "single") == 0) {
230             mttcg_enabled = false;
231         } else {
232             error_setg(errp, "Invalid 'thread' setting %s", t);
233         }
234     } else {
235         mttcg_enabled = default_mttcg_enabled();
236     }
237 }
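
/*
 * Illustrative examples: the "thread" option parsed above comes from the
 * TCG accelerator properties on the command line, e.g.
 *
 *     qemu-system-x86_64 -accel tcg,thread=multi    # force MTTCG
 *     qemu-system-x86_64 -accel tcg,thread=single   # force round-robin TCG
 *
 * When the option is omitted, default_mttcg_enabled() picks the default.
 */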
238
239 /* The current number of executed instructions is based on what we
240  * originally budgeted minus the current state of the decrementing
241  * icount counters in extra/u16.low.
242  */
243 static int64_t cpu_get_icount_executed(CPUState *cpu)
244 {
245     return (cpu->icount_budget -
246             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
247 }
248
249 /*
250  * Update the global shared timer_state.qemu_icount to take into
251  * account executed instructions. This is done by the TCG vCPU
252  * thread so the main-loop can see time has moved forward.
253  */
254 static void cpu_update_icount_locked(CPUState *cpu)
255 {
256     int64_t executed = cpu_get_icount_executed(cpu);
257     cpu->icount_budget -= executed;
258
259     atomic_set_i64(&timers_state.qemu_icount,
260                    timers_state.qemu_icount + executed);
261 }
262
263 /*
264  * Update the global shared timer_state.qemu_icount to take into
265  * account executed instructions. This is done by the TCG vCPU
266  * thread so the main-loop can see time has moved forward.
267  */
268 void cpu_update_icount(CPUState *cpu)
269 {
270     seqlock_write_lock(&timers_state.vm_clock_seqlock,
271                        &timers_state.vm_clock_lock);
272     cpu_update_icount_locked(cpu);
273     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
274                          &timers_state.vm_clock_lock);
275 }
276
277 static int64_t cpu_get_icount_raw_locked(void)
278 {
279     CPUState *cpu = current_cpu;
280
281     if (cpu && cpu->running) {
282         if (!cpu->can_do_io) {
283             error_report("Bad icount read");
284             exit(1);
285         }
286         /* Take into account what has run */
287         cpu_update_icount_locked(cpu);
288     }
289     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
290     return atomic_read_i64(&timers_state.qemu_icount);
291 }
292
293 static int64_t cpu_get_icount_locked(void)
294 {
295     int64_t icount = cpu_get_icount_raw_locked();
296     return atomic_read_i64(&timers_state.qemu_icount_bias) +
297         cpu_icount_to_ns(icount);
298 }
299
300 int64_t cpu_get_icount_raw(void)
301 {
302     int64_t icount;
303     unsigned start;
304
305     do {
306         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
307         icount = cpu_get_icount_raw_locked();
308     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
309
310     return icount;
311 }
312
313 /* Return the virtual CPU time, based on the instruction counter.  */
314 int64_t cpu_get_icount(void)
315 {
316     int64_t icount;
317     unsigned start;
318
319     do {
320         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
321         icount = cpu_get_icount_locked();
322     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
323
324     return icount;
325 }
326
327 int64_t cpu_icount_to_ns(int64_t icount)
328 {
329     return icount << atomic_read(&timers_state.icount_time_shift);
330 }
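
/*
 * Illustrative example: with the default icount_time_shift of 3 chosen in
 * configure_icount() below, each executed instruction accounts for
 * 1 << 3 = 8 ns of virtual time, i.e. a nominal guest speed of 125 MIPS.
 */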
331
332 static int64_t cpu_get_ticks_locked(void)
333 {
334     int64_t ticks = timers_state.cpu_ticks_offset;
335     if (timers_state.cpu_ticks_enabled) {
336         ticks += cpu_get_host_ticks();
337     }
338
339     if (timers_state.cpu_ticks_prev > ticks) {
340         /* Non-increasing ticks may happen if the host uses software suspend.  */
341         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
342         ticks = timers_state.cpu_ticks_prev;
343     }
344
345     timers_state.cpu_ticks_prev = ticks;
346     return ticks;
347 }
348
349 /* Return the time elapsed in the VM between vm_start and vm_stop.  Unless
350  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
351  * counter.
352  */
353 int64_t cpu_get_ticks(void)
354 {
355     int64_t ticks;
356
357     if (use_icount) {
358         return cpu_get_icount();
359     }
360
361     qemu_spin_lock(&timers_state.vm_clock_lock);
362     ticks = cpu_get_ticks_locked();
363     qemu_spin_unlock(&timers_state.vm_clock_lock);
364     return ticks;
365 }
366
367 static int64_t cpu_get_clock_locked(void)
368 {
369     int64_t time;
370
371     time = timers_state.cpu_clock_offset;
372     if (timers_state.cpu_ticks_enabled) {
373         time += get_clock();
374     }
375
376     return time;
377 }
378
379 /* Return the monotonic time elapsed in the VM, i.e.,
380  * the time between vm_start and vm_stop.
381  */
382 int64_t cpu_get_clock(void)
383 {
384     int64_t ti;
385     unsigned start;
386
387     do {
388         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
389         ti = cpu_get_clock_locked();
390     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
391
392     return ti;
393 }
394
395 /* enable cpu_get_ticks()
396  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
397  */
398 void cpu_enable_ticks(void)
399 {
400     seqlock_write_lock(&timers_state.vm_clock_seqlock,
401                        &timers_state.vm_clock_lock);
402     if (!timers_state.cpu_ticks_enabled) {
403         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
404         timers_state.cpu_clock_offset -= get_clock();
405         timers_state.cpu_ticks_enabled = 1;
406     }
407     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
408                        &timers_state.vm_clock_lock);
409 }
410
411 /* disable cpu_get_ticks(): the clock is stopped.  You must not call
412  * cpu_get_ticks() afterwards.
413  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
414  */
415 void cpu_disable_ticks(void)
416 {
417     seqlock_write_lock(&timers_state.vm_clock_seqlock,
418                        &timers_state.vm_clock_lock);
419     if (timers_state.cpu_ticks_enabled) {
420         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
421         timers_state.cpu_clock_offset = cpu_get_clock_locked();
422         timers_state.cpu_ticks_enabled = 0;
423     }
424     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
425                          &timers_state.vm_clock_lock);
426 }
427
428 /* Correlation between real and virtual time is always going to be
429    fairly approximate, so ignore small variation.
430    When the guest is idle, real and virtual time will be aligned in
431    the IO wait loop.  */
432 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
433
434 static void icount_adjust(void)
435 {
436     int64_t cur_time;
437     int64_t cur_icount;
438     int64_t delta;
439
440     /* Protected by TimersState mutex.  */
441     static int64_t last_delta;
442
443     /* If the VM is not running, then do nothing.  */
444     if (!runstate_is_running()) {
445         return;
446     }
447
448     seqlock_write_lock(&timers_state.vm_clock_seqlock,
449                        &timers_state.vm_clock_lock);
450     cur_time = cpu_get_clock_locked();
451     cur_icount = cpu_get_icount_locked();
452
453     delta = cur_icount - cur_time;
454     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
455     if (delta > 0
456         && last_delta + ICOUNT_WOBBLE < delta * 2
457         && timers_state.icount_time_shift > 0) {
458         /* The guest is getting too far ahead.  Slow time down.  */
459         atomic_set(&timers_state.icount_time_shift,
460                    timers_state.icount_time_shift - 1);
461     }
462     if (delta < 0
463         && last_delta - ICOUNT_WOBBLE > delta * 2
464         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
465         /* The guest is getting too far behind.  Speed time up.  */
466         atomic_set(&timers_state.icount_time_shift,
467                    timers_state.icount_time_shift + 1);
468     }
469     last_delta = delta;
470     atomic_set_i64(&timers_state.qemu_icount_bias,
471                    cur_icount - (timers_state.qemu_icount
472                                  << timers_state.icount_time_shift));
473     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
474                          &timers_state.vm_clock_lock);
475 }
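
/*
 * Illustrative numbers: if the guest's virtual clock keeps running ahead of
 * real time (delta > 0 and growing), lowering icount_time_shift from, say,
 * 3 to 2 halves the virtual nanoseconds charged per instruction (8 ns to
 * 4 ns), so virtual time advances more slowly until the two realign.
 */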
476
477 static void icount_adjust_rt(void *opaque)
478 {
479     timer_mod(timers_state.icount_rt_timer,
480               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
481     icount_adjust();
482 }
483
484 static void icount_adjust_vm(void *opaque)
485 {
486     timer_mod(timers_state.icount_vm_timer,
487                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
488                    NANOSECONDS_PER_SECOND / 10);
489     icount_adjust();
490 }
491
492 static int64_t qemu_icount_round(int64_t count)
493 {
494     int shift = atomic_read(&timers_state.icount_time_shift);
495     return (count + (1 << shift) - 1) >> shift;
496 }
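
/*
 * Illustrative example: a 20 ns deadline with icount_time_shift == 3 rounds
 * up to (20 + 7) >> 3 = 3 instructions, so the instruction budget always
 * covers at least the time until the next pending QEMU_CLOCK_VIRTUAL timer.
 */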
497
498 static void icount_warp_rt(void)
499 {
500     unsigned seq;
501     int64_t warp_start;
502
503     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
504      * changes from -1 to another value, so the race here is okay.
505      */
506     do {
507         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
508         warp_start = timers_state.vm_clock_warp_start;
509     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
510
511     if (warp_start == -1) {
512         return;
513     }
514
515     seqlock_write_lock(&timers_state.vm_clock_seqlock,
516                        &timers_state.vm_clock_lock);
517     if (runstate_is_running()) {
518         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
519                                             cpu_get_clock_locked());
520         int64_t warp_delta;
521
522         warp_delta = clock - timers_state.vm_clock_warp_start;
523         if (use_icount == 2) {
524             /*
525              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
526              * far ahead of real time.
527              */
528             int64_t cur_icount = cpu_get_icount_locked();
529             int64_t delta = clock - cur_icount;
530             warp_delta = MIN(warp_delta, delta);
531         }
532         atomic_set_i64(&timers_state.qemu_icount_bias,
533                        timers_state.qemu_icount_bias + warp_delta);
534     }
535     timers_state.vm_clock_warp_start = -1;
536     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
537                        &timers_state.vm_clock_lock);
538
539     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
540         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
541     }
542 }
543
544 static void icount_timer_cb(void *opaque)
545 {
546     /* No need for a checkpoint because the timer already synchronizes
547      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
548      */
549     icount_warp_rt();
550 }
551
552 void qtest_clock_warp(int64_t dest)
553 {
554     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
555     AioContext *aio_context;
556     assert(qtest_enabled());
557     aio_context = qemu_get_aio_context();
558     while (clock < dest) {
559         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
560                                                       QEMU_TIMER_ATTR_ALL);
561         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
562
563         seqlock_write_lock(&timers_state.vm_clock_seqlock,
564                            &timers_state.vm_clock_lock);
565         atomic_set_i64(&timers_state.qemu_icount_bias,
566                        timers_state.qemu_icount_bias + warp);
567         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
568                              &timers_state.vm_clock_lock);
569
570         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
571         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
572         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
573     }
574     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
575 }
576
577 void qemu_start_warp_timer(void)
578 {
579     int64_t clock;
580     int64_t deadline;
581
582     if (!use_icount) {
583         return;
584     }
585
586     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
587      * do not fire, so computing the deadline does not make sense.
588      */
589     if (!runstate_is_running()) {
590         return;
591     }
592
593     if (replay_mode != REPLAY_MODE_PLAY) {
594         if (!all_cpu_threads_idle()) {
595             return;
596         }
597
598         if (qtest_enabled()) {
599             /* When testing, qtest commands advance icount.  */
600             return;
601         }
602
603         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
604     } else {
605         /* warp clock deterministically in record/replay mode */
606         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
607             /* The vCPU is sleeping and the warp can't be started.
608                This is probably a race condition: the notification sent
609                to the vCPU was processed early and the vCPU went to sleep.
610                Therefore we have to wake it up so it can do something. */
611             if (replay_has_checkpoint()) {
612                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
613             }
614             return;
615         }
616     }
617
618     /* We want to use the earliest deadline from ALL vm_clocks */
619     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
620     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
621                                           ~QEMU_TIMER_ATTR_EXTERNAL);
622     if (deadline < 0) {
623         static bool notified;
624         if (!icount_sleep && !notified) {
625             warn_report("icount sleep disabled and no active timers");
626             notified = true;
627         }
628         return;
629     }
630
631     if (deadline > 0) {
632         /*
633          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
634          * sleep.  Otherwise, the CPU might be waiting for a future timer
635          * interrupt to wake it up, but the interrupt never comes because
636          * the vCPU isn't running any insns and thus doesn't advance the
637          * QEMU_CLOCK_VIRTUAL.
638          */
639         if (!icount_sleep) {
640             /*
641              * We never let VCPUs sleep in no sleep icount mode.
642              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
643              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
644              * It is useful when we want a deterministic execution time,
645              * isolated from host latencies.
646              */
647             seqlock_write_lock(&timers_state.vm_clock_seqlock,
648                                &timers_state.vm_clock_lock);
649             atomic_set_i64(&timers_state.qemu_icount_bias,
650                            timers_state.qemu_icount_bias + deadline);
651             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
652                                  &timers_state.vm_clock_lock);
653             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
654         } else {
655             /*
656              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
657              * "real" time (related to the time left until the next event) has
658              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
659              * This keeps the warps from being visible externally; for example,
660              * you will not be sending network packets continuously instead of
661              * every 100ms.
662              */
663             seqlock_write_lock(&timers_state.vm_clock_seqlock,
664                                &timers_state.vm_clock_lock);
665             if (timers_state.vm_clock_warp_start == -1
666                 || timers_state.vm_clock_warp_start > clock) {
667                 timers_state.vm_clock_warp_start = clock;
668             }
669             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
670                                  &timers_state.vm_clock_lock);
671             timer_mod_anticipate(timers_state.icount_warp_timer,
672                                  clock + deadline);
673         }
674     } else if (deadline == 0) {
675         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
676     }
677 }
678
679 static void qemu_account_warp_timer(void)
680 {
681     if (!use_icount || !icount_sleep) {
682         return;
683     }
684
685     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
686      * do not fire, so computing the deadline does not make sense.
687      */
688     if (!runstate_is_running()) {
689         return;
690     }
691
692     /* warp clock deterministically in record/replay mode */
693     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
694         return;
695     }
696
697     timer_del(timers_state.icount_warp_timer);
698     icount_warp_rt();
699 }
700
701 static bool icount_state_needed(void *opaque)
702 {
703     return use_icount;
704 }
705
706 static bool warp_timer_state_needed(void *opaque)
707 {
708     TimersState *s = opaque;
709     return s->icount_warp_timer != NULL;
710 }
711
712 static bool adjust_timers_state_needed(void *opaque)
713 {
714     TimersState *s = opaque;
715     return s->icount_rt_timer != NULL;
716 }
717
718 /*
719  * The warp timer migration subsection is optional: the timer may not exist.
720  */
721 static const VMStateDescription icount_vmstate_warp_timer = {
722     .name = "timer/icount/warp_timer",
723     .version_id = 1,
724     .minimum_version_id = 1,
725     .needed = warp_timer_state_needed,
726     .fields = (VMStateField[]) {
727         VMSTATE_INT64(vm_clock_warp_start, TimersState),
728         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
729         VMSTATE_END_OF_LIST()
730     }
731 };
732
733 static const VMStateDescription icount_vmstate_adjust_timers = {
734     .name = "timer/icount/timers",
735     .version_id = 1,
736     .minimum_version_id = 1,
737     .needed = adjust_timers_state_needed,
738     .fields = (VMStateField[]) {
739         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
740         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
741         VMSTATE_END_OF_LIST()
742     }
743 };
744
745 /*
746  * This is a subsection for icount migration.
747  */
748 static const VMStateDescription icount_vmstate_timers = {
749     .name = "timer/icount",
750     .version_id = 1,
751     .minimum_version_id = 1,
752     .needed = icount_state_needed,
753     .fields = (VMStateField[]) {
754         VMSTATE_INT64(qemu_icount_bias, TimersState),
755         VMSTATE_INT64(qemu_icount, TimersState),
756         VMSTATE_END_OF_LIST()
757     },
758     .subsections = (const VMStateDescription*[]) {
759         &icount_vmstate_warp_timer,
760         &icount_vmstate_adjust_timers,
761         NULL
762     }
763 };
764
765 static const VMStateDescription vmstate_timers = {
766     .name = "timer",
767     .version_id = 2,
768     .minimum_version_id = 1,
769     .fields = (VMStateField[]) {
770         VMSTATE_INT64(cpu_ticks_offset, TimersState),
771         VMSTATE_UNUSED(8),
772         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
773         VMSTATE_END_OF_LIST()
774     },
775     .subsections = (const VMStateDescription*[]) {
776         &icount_vmstate_timers,
777         NULL
778     }
779 };
780
781 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
782 {
783     double pct;
784     double throttle_ratio;
785     long sleeptime_ns;
786
787     if (!cpu_throttle_get_percentage()) {
788         return;
789     }
790
791     pct = (double)cpu_throttle_get_percentage()/100;
792     throttle_ratio = pct / (1 - pct);
793     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
794
795     qemu_mutex_unlock_iothread();
796     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
797     qemu_mutex_lock_iothread();
798     atomic_set(&cpu->throttle_thread_scheduled, 0);
799 }
800
801 static void cpu_throttle_timer_tick(void *opaque)
802 {
803     CPUState *cpu;
804     double pct;
805
806     /* Stop the timer if needed */
807     if (!cpu_throttle_get_percentage()) {
808         return;
809     }
810     CPU_FOREACH(cpu) {
811         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
812             async_run_on_cpu(cpu, cpu_throttle_thread,
813                              RUN_ON_CPU_NULL);
814         }
815     }
816
817     pct = (double)cpu_throttle_get_percentage()/100;
818     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
819                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
820 }
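
/*
 * Worked example (illustrative): at a throttle setting of 75%, pct = 0.75,
 * so the timer above re-arms every CPU_THROTTLE_TIMESLICE_NS / (1 - 0.75)
 * = 40 ms, and each tick queues cpu_throttle_thread(), which sleeps for
 * (0.75 / 0.25) * CPU_THROTTLE_TIMESLICE_NS = 30 ms.  Each vCPU therefore
 * sleeps roughly 30 ms out of every 40 ms, i.e. it is throttled by ~75%.
 */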
821
822 void cpu_throttle_set(int new_throttle_pct)
823 {
824     /* Ensure throttle percentage is within valid range */
825     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
826     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
827
828     atomic_set(&throttle_percentage, new_throttle_pct);
829
830     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
831                                        CPU_THROTTLE_TIMESLICE_NS);
832 }
833
834 void cpu_throttle_stop(void)
835 {
836     atomic_set(&throttle_percentage, 0);
837 }
838
839 bool cpu_throttle_active(void)
840 {
841     return (cpu_throttle_get_percentage() != 0);
842 }
843
844 int cpu_throttle_get_percentage(void)
845 {
846     return atomic_read(&throttle_percentage);
847 }
848
849 void cpu_ticks_init(void)
850 {
851     seqlock_init(&timers_state.vm_clock_seqlock);
852     qemu_spin_init(&timers_state.vm_clock_lock);
853     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
854     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
855                                            cpu_throttle_timer_tick, NULL);
856 }
857
858 void configure_icount(QemuOpts *opts, Error **errp)
859 {
860     const char *option;
861     char *rem_str = NULL;
862
863     option = qemu_opt_get(opts, "shift");
864     if (!option) {
865         if (qemu_opt_get(opts, "align") != NULL) {
866             error_setg(errp, "Please specify shift option when using align");
867         }
868         return;
869     }
870
871     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
872     if (icount_sleep) {
873         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
874                                          icount_timer_cb, NULL);
875     }
876
877     icount_align_option = qemu_opt_get_bool(opts, "align", false);
878
879     if (icount_align_option && !icount_sleep) {
880         error_setg(errp, "align=on and sleep=off are incompatible");
881     }
882     if (strcmp(option, "auto") != 0) {
883         errno = 0;
884         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
885         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
886             error_setg(errp, "icount: Invalid shift value");
887         }
888         use_icount = 1;
889         return;
890     } else if (icount_align_option) {
891         error_setg(errp, "shift=auto and align=on are incompatible");
892     } else if (!icount_sleep) {
893         error_setg(errp, "shift=auto and sleep=off are incompatible");
894     }
895
896     use_icount = 2;
897
898     /* 125MIPS seems a reasonable initial guess at the guest speed.
899        It will be corrected fairly quickly anyway.  */
900     timers_state.icount_time_shift = 3;
901
902     /* Have both realtime and virtual time triggers for speed adjustment.
903        The realtime trigger catches emulated time passing too slowly,
904        the virtual time trigger catches emulated time passing too fast.
905        Realtime triggers occur even when idle, so use them less frequently
906        than VM triggers.  */
907     timers_state.vm_clock_warp_start = -1;
908     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
909                                    icount_adjust_rt, NULL);
910     timer_mod(timers_state.icount_rt_timer,
911                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
912     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
913                                         icount_adjust_vm, NULL);
914     timer_mod(timers_state.icount_vm_timer,
915                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
916                    NANOSECONDS_PER_SECOND / 10);
917 }
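
/*
 * Illustrative invocations: the options parsed above come from the -icount
 * command-line switch, e.g.
 *
 *     -icount shift=7               # fixed rate: 1 insn counts as 128 ns
 *     -icount shift=7,align=on      # fixed rate, align host and guest time
 *     -icount shift=auto,sleep=on   # adaptive rate (use_icount == 2)
 *
 * As the checks above enforce, align=on needs sleep=on and a fixed shift.
 */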
918
919 /***********************************************************/
920 /* TCG vCPU kick timer
921  *
922  * The kick timer is responsible for moving single-threaded vCPU
923  * emulation on to the next vCPU. If more than one vCPU is running, a
924  * timer event will force a cpu->exit so the next vCPU can get
925  * scheduled.
926  *
927  * The timer is removed while all vCPUs are idle and restarted again
928  * once a vCPU becomes runnable.
929  */
930
931 static QEMUTimer *tcg_kick_vcpu_timer;
932 static CPUState *tcg_current_rr_cpu;
933
934 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
935
936 static inline int64_t qemu_tcg_next_kick(void)
937 {
938     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
939 }
940
941 /* Kick the currently round-robin scheduled vCPU */
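/*
 * The kick may race with the round-robin scheduler switching to another
 * vCPU, so re-read tcg_current_rr_cpu after cpu_exit() and kick again
 * until the value is stable; a spurious extra cpu_exit() is harmless.
 */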
942 static void qemu_cpu_kick_rr_cpu(void)
943 {
944     CPUState *cpu;
945     do {
946         cpu = atomic_mb_read(&tcg_current_rr_cpu);
947         if (cpu) {
948             cpu_exit(cpu);
949         }
950     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
951 }
952
953 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
954 {
955 }
956
957 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
958 {
959     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
960         qemu_notify_event();
961         return;
962     }
963
964     if (qemu_in_vcpu_thread()) {
965         /* A CPU is currently running; kick it back out to the
966          * tcg_cpu_exec() loop so it will recalculate its
967          * icount deadline immediately.
968          */
969         qemu_cpu_kick(current_cpu);
970     } else if (first_cpu) {
971         /* qemu_cpu_kick is not enough to kick a halted CPU out of
972          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
973          * causes cpu_thread_is_idle to return false.  This way,
974          * handle_icount_deadline can run.
975          * If we have no CPUs at all for some reason, we don't
976          * need to do anything.
977          */
978         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
979     }
980 }
981
982 static void kick_tcg_thread(void *opaque)
983 {
984     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
985     qemu_cpu_kick_rr_cpu();
986 }
987
988 static void start_tcg_kick_timer(void)
989 {
990     assert(!mttcg_enabled);
991     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
992         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
993                                            kick_tcg_thread, NULL);
994     }
995     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
996         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
997     }
998 }
999
1000 static void stop_tcg_kick_timer(void)
1001 {
1002     assert(!mttcg_enabled);
1003     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1004         timer_del(tcg_kick_vcpu_timer);
1005     }
1006 }
1007
1008 /***********************************************************/
1009 void hw_error(const char *fmt, ...)
1010 {
1011     va_list ap;
1012     CPUState *cpu;
1013
1014     va_start(ap, fmt);
1015     fprintf(stderr, "qemu: hardware error: ");
1016     vfprintf(stderr, fmt, ap);
1017     fprintf(stderr, "\n");
1018     CPU_FOREACH(cpu) {
1019         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1020         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1021     }
1022     va_end(ap);
1023     abort();
1024 }
1025
1026 void cpu_synchronize_all_states(void)
1027 {
1028     CPUState *cpu;
1029
1030     CPU_FOREACH(cpu) {
1031         cpu_synchronize_state(cpu);
1032         /* TODO: move to cpu_synchronize_state() */
1033         if (hvf_enabled()) {
1034             hvf_cpu_synchronize_state(cpu);
1035         }
1036     }
1037 }
1038
1039 void cpu_synchronize_all_post_reset(void)
1040 {
1041     CPUState *cpu;
1042
1043     CPU_FOREACH(cpu) {
1044         cpu_synchronize_post_reset(cpu);
1045         /* TODO: move to cpu_synchronize_post_reset() */
1046         if (hvf_enabled()) {
1047             hvf_cpu_synchronize_post_reset(cpu);
1048         }
1049     }
1050 }
1051
1052 void cpu_synchronize_all_post_init(void)
1053 {
1054     CPUState *cpu;
1055
1056     CPU_FOREACH(cpu) {
1057         cpu_synchronize_post_init(cpu);
1058         /* TODO: move to cpu_synchronize_post_init() */
1059         if (hvf_enabled()) {
1060             hvf_cpu_synchronize_post_init(cpu);
1061         }
1062     }
1063 }
1064
1065 void cpu_synchronize_all_pre_loadvm(void)
1066 {
1067     CPUState *cpu;
1068
1069     CPU_FOREACH(cpu) {
1070         cpu_synchronize_pre_loadvm(cpu);
1071     }
1072 }
1073
1074 static int do_vm_stop(RunState state, bool send_stop)
1075 {
1076     int ret = 0;
1077
1078     if (runstate_is_running()) {
1079         cpu_disable_ticks();
1080         pause_all_vcpus();
1081         runstate_set(state);
1082         vm_state_notify(0, state);
1083         if (send_stop) {
1084             qapi_event_send_stop();
1085         }
1086     }
1087
1088     bdrv_drain_all();
1089     replay_disable_events();
1090     ret = bdrv_flush_all();
1091
1092     return ret;
1093 }
1094
1095 /* Special vm_stop() variant for terminating the process.  Historically clients
1096  * did not expect a QMP STOP event and so we need to retain compatibility.
1097  */
1098 int vm_shutdown(void)
1099 {
1100     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1101 }
1102
1103 static bool cpu_can_run(CPUState *cpu)
1104 {
1105     if (cpu->stop) {
1106         return false;
1107     }
1108     if (cpu_is_stopped(cpu)) {
1109         return false;
1110     }
1111     return true;
1112 }
1113
1114 static void cpu_handle_guest_debug(CPUState *cpu)
1115 {
1116     gdb_set_stop_cpu(cpu);
1117     qemu_system_debug_request();
1118     cpu->stopped = true;
1119 }
1120
1121 #ifdef CONFIG_LINUX
1122 static void sigbus_reraise(void)
1123 {
1124     sigset_t set;
1125     struct sigaction action;
1126
1127     memset(&action, 0, sizeof(action));
1128     action.sa_handler = SIG_DFL;
1129     if (!sigaction(SIGBUS, &action, NULL)) {
1130         raise(SIGBUS);
1131         sigemptyset(&set);
1132         sigaddset(&set, SIGBUS);
1133         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1134     }
1135     perror("Failed to re-raise SIGBUS!\n");
1136     abort();
1137 }
1138
1139 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1140 {
1141     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1142         sigbus_reraise();
1143     }
1144
1145     if (current_cpu) {
1146         /* Called asynchronously in VCPU thread.  */
1147         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1148             sigbus_reraise();
1149         }
1150     } else {
1151         /* Called synchronously (via signalfd) in main thread.  */
1152         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1153             sigbus_reraise();
1154         }
1155     }
1156 }
1157
1158 static void qemu_init_sigbus(void)
1159 {
1160     struct sigaction action;
1161
1162     memset(&action, 0, sizeof(action));
1163     action.sa_flags = SA_SIGINFO;
1164     action.sa_sigaction = sigbus_handler;
1165     sigaction(SIGBUS, &action, NULL);
1166
1167     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1168 }
1169 #else /* !CONFIG_LINUX */
1170 static void qemu_init_sigbus(void)
1171 {
1172 }
1173 #endif /* !CONFIG_LINUX */
1174
1175 static QemuMutex qemu_global_mutex;
1176
1177 static QemuThread io_thread;
1178
1179 /* cpu creation */
1180 static QemuCond qemu_cpu_cond;
1181 /* system init */
1182 static QemuCond qemu_pause_cond;
1183
1184 void qemu_init_cpu_loop(void)
1185 {
1186     qemu_init_sigbus();
1187     qemu_cond_init(&qemu_cpu_cond);
1188     qemu_cond_init(&qemu_pause_cond);
1189     qemu_mutex_init(&qemu_global_mutex);
1190
1191     qemu_thread_get_self(&io_thread);
1192 }
1193
1194 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1195 {
1196     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1197 }
1198
1199 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1200 {
1201     if (kvm_destroy_vcpu(cpu) < 0) {
1202         error_report("kvm_destroy_vcpu failed");
1203         exit(EXIT_FAILURE);
1204     }
1205 }
1206
1207 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1208 {
1209 }
1210
1211 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1212 {
1213     g_assert(qemu_cpu_is_self(cpu));
1214     cpu->stop = false;
1215     cpu->stopped = true;
1216     if (exit) {
1217         cpu_exit(cpu);
1218     }
1219     qemu_cond_broadcast(&qemu_pause_cond);
1220 }
1221
1222 static void qemu_wait_io_event_common(CPUState *cpu)
1223 {
1224     atomic_mb_set(&cpu->thread_kicked, false);
1225     if (cpu->stop) {
1226         qemu_cpu_stop(cpu, false);
1227     }
1228     process_queued_cpu_work(cpu);
1229 }
1230
1231 static void qemu_tcg_rr_wait_io_event(void)
1232 {
1233     CPUState *cpu;
1234
1235     while (all_cpu_threads_idle()) {
1236         stop_tcg_kick_timer();
1237         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1238     }
1239
1240     start_tcg_kick_timer();
1241
1242     CPU_FOREACH(cpu) {
1243         qemu_wait_io_event_common(cpu);
1244     }
1245 }
1246
1247 static void qemu_wait_io_event(CPUState *cpu)
1248 {
1249     while (cpu_thread_is_idle(cpu)) {
1250         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1251     }
1252
1253 #ifdef _WIN32
1254     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1255     if (!tcg_enabled()) {
1256         SleepEx(0, TRUE);
1257     }
1258 #endif
1259     qemu_wait_io_event_common(cpu);
1260 }
1261
1262 static void *qemu_kvm_cpu_thread_fn(void *arg)
1263 {
1264     CPUState *cpu = arg;
1265     int r;
1266
1267     rcu_register_thread();
1268
1269     qemu_mutex_lock_iothread();
1270     qemu_thread_get_self(cpu->thread);
1271     cpu->thread_id = qemu_get_thread_id();
1272     cpu->can_do_io = 1;
1273     current_cpu = cpu;
1274
1275     r = kvm_init_vcpu(cpu);
1276     if (r < 0) {
1277         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1278         exit(1);
1279     }
1280
1281     kvm_init_cpu_signals(cpu);
1282
1283     /* signal CPU creation */
1284     cpu->created = true;
1285     qemu_cond_signal(&qemu_cpu_cond);
1286     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1287
1288     do {
1289         if (cpu_can_run(cpu)) {
1290             r = kvm_cpu_exec(cpu);
1291             if (r == EXCP_DEBUG) {
1292                 cpu_handle_guest_debug(cpu);
1293             }
1294         }
1295         qemu_wait_io_event(cpu);
1296     } while (!cpu->unplug || cpu_can_run(cpu));
1297
1298     qemu_kvm_destroy_vcpu(cpu);
1299     cpu->created = false;
1300     qemu_cond_signal(&qemu_cpu_cond);
1301     qemu_mutex_unlock_iothread();
1302     rcu_unregister_thread();
1303     return NULL;
1304 }
1305
1306 static void *qemu_dummy_cpu_thread_fn(void *arg)
1307 {
1308 #ifdef _WIN32
1309     error_report("qtest is not supported under Windows");
1310     exit(1);
1311 #else
1312     CPUState *cpu = arg;
1313     sigset_t waitset;
1314     int r;
1315
1316     rcu_register_thread();
1317
1318     qemu_mutex_lock_iothread();
1319     qemu_thread_get_self(cpu->thread);
1320     cpu->thread_id = qemu_get_thread_id();
1321     cpu->can_do_io = 1;
1322     current_cpu = cpu;
1323
1324     sigemptyset(&waitset);
1325     sigaddset(&waitset, SIG_IPI);
1326
1327     /* signal CPU creation */
1328     cpu->created = true;
1329     qemu_cond_signal(&qemu_cpu_cond);
1330     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1331
1332     do {
1333         qemu_mutex_unlock_iothread();
1334         do {
1335             int sig;
1336             r = sigwait(&waitset, &sig);
1337         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1338         if (r == -1) {
1339             perror("sigwait");
1340             exit(1);
1341         }
1342         qemu_mutex_lock_iothread();
1343         qemu_wait_io_event(cpu);
1344     } while (!cpu->unplug);
1345
1346     qemu_mutex_unlock_iothread();
1347     rcu_unregister_thread();
1348     return NULL;
1349 #endif
1350 }
1351
1352 static int64_t tcg_get_icount_limit(void)
1353 {
1354     int64_t deadline;
1355
1356     if (replay_mode != REPLAY_MODE_PLAY) {
1357         /*
1358          * Include all the timers, because they may need attention.
1359          * Overly long CPU execution would otherwise cause noticeable UI delays.
1360          */
1361         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1362                                               QEMU_TIMER_ATTR_ALL);
1363
1364         /* Maintain prior (possibly buggy) behaviour where if no deadline
1365          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1366          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1367          * nanoseconds.
1368          */
1369         if ((deadline < 0) || (deadline > INT32_MAX)) {
1370             deadline = INT32_MAX;
1371         }
1372
1373         return qemu_icount_round(deadline);
1374     } else {
1375         return replay_get_instructions();
1376     }
1377 }
1378
1379 static void handle_icount_deadline(void)
1380 {
1381     assert(qemu_in_vcpu_thread());
1382     if (use_icount) {
1383         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1384                                                       QEMU_TIMER_ATTR_ALL);
1385
1386         if (deadline == 0) {
1387             /* Wake up other AioContexts.  */
1388             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1389             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1390         }
1391     }
1392 }
1393
1394 static void prepare_icount_for_run(CPUState *cpu)
1395 {
1396     if (use_icount) {
1397         int insns_left;
1398
1399         /* These should always be cleared by process_icount_data after
1400          * each vCPU execution. However u16.high can be raised
1401          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1402          */
1403         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1404         g_assert(cpu->icount_extra == 0);
1405
1406         cpu->icount_budget = tcg_get_icount_limit();
1407         insns_left = MIN(0xffff, cpu->icount_budget);
1408         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1409         cpu->icount_extra = cpu->icount_budget - insns_left;
1410
1411         replay_mutex_lock();
1412     }
1413 }
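
/*
 * Worked example (illustrative): with a budget of 100000 instructions,
 * insns_left = MIN(0xffff, 100000) = 65535 goes into the 16-bit
 * icount_decr.u16.low counter that the generated code decrements, and the
 * remaining 34465 instructions are parked in cpu->icount_extra to be
 * handed out once the low counter reaches zero.
 */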
1414
1415 static void process_icount_data(CPUState *cpu)
1416 {
1417     if (use_icount) {
1418         /* Account for executed instructions */
1419         cpu_update_icount(cpu);
1420
1421         /* Reset the counters */
1422         cpu_neg(cpu)->icount_decr.u16.low = 0;
1423         cpu->icount_extra = 0;
1424         cpu->icount_budget = 0;
1425
1426         replay_account_executed_instructions();
1427
1428         replay_mutex_unlock();
1429     }
1430 }
1431
1432
1433 static int tcg_cpu_exec(CPUState *cpu)
1434 {
1435     int ret;
1436 #ifdef CONFIG_PROFILER
1437     int64_t ti;
1438 #endif
1439
1440     assert(tcg_enabled());
1441 #ifdef CONFIG_PROFILER
1442     ti = profile_getclock();
1443 #endif
1444     cpu_exec_start(cpu);
1445     ret = cpu_exec(cpu);
1446     cpu_exec_end(cpu);
1447 #ifdef CONFIG_PROFILER
1448     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1449                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1450 #endif
1451     return ret;
1452 }
1453
1454 /* Destroy any remaining vCPUs which have been unplugged and have
1455  * finished running
1456  */
1457 static void deal_with_unplugged_cpus(void)
1458 {
1459     CPUState *cpu;
1460
1461     CPU_FOREACH(cpu) {
1462         if (cpu->unplug && !cpu_can_run(cpu)) {
1463             qemu_tcg_destroy_vcpu(cpu);
1464             cpu->created = false;
1465             qemu_cond_signal(&qemu_cpu_cond);
1466             break;
1467         }
1468     }
1469 }
1470
1471 /* Single-threaded TCG
1472  *
1473  * In the single-threaded case each vCPU is simulated in turn. If
1474  * there is more than a single vCPU we create a simple timer to kick
1475  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1476  * This is done explicitly rather than relying on side-effects
1477  * elsewhere.
1478  */
1479
1480 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1481 {
1482     CPUState *cpu = arg;
1483
1484     assert(tcg_enabled());
1485     rcu_register_thread();
1486     tcg_register_thread();
1487
1488     qemu_mutex_lock_iothread();
1489     qemu_thread_get_self(cpu->thread);
1490
1491     cpu->thread_id = qemu_get_thread_id();
1492     cpu->created = true;
1493     cpu->can_do_io = 1;
1494     qemu_cond_signal(&qemu_cpu_cond);
1495     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1496
1497     /* wait for initial kick-off after machine start */
1498     while (first_cpu->stopped) {
1499         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1500
1501         /* process any pending work */
1502         CPU_FOREACH(cpu) {
1503             current_cpu = cpu;
1504             qemu_wait_io_event_common(cpu);
1505         }
1506     }
1507
1508     start_tcg_kick_timer();
1509
1510     cpu = first_cpu;
1511
1512     /* process any pending work */
1513     cpu->exit_request = 1;
1514
1515     while (1) {
1516         qemu_mutex_unlock_iothread();
1517         replay_mutex_lock();
1518         qemu_mutex_lock_iothread();
1519         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1520         qemu_account_warp_timer();
1521
1522         /* Run the timers here.  This is much more efficient than
1523          * waking up the I/O thread and waiting for completion.
1524          */
1525         handle_icount_deadline();
1526
1527         replay_mutex_unlock();
1528
1529         if (!cpu) {
1530             cpu = first_cpu;
1531         }
1532
1533         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1534
1535             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1536             current_cpu = cpu;
1537
1538             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1539                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1540
1541             if (cpu_can_run(cpu)) {
1542                 int r;
1543
1544                 qemu_mutex_unlock_iothread();
1545                 prepare_icount_for_run(cpu);
1546
1547                 r = tcg_cpu_exec(cpu);
1548
1549                 process_icount_data(cpu);
1550                 qemu_mutex_lock_iothread();
1551
1552                 if (r == EXCP_DEBUG) {
1553                     cpu_handle_guest_debug(cpu);
1554                     break;
1555                 } else if (r == EXCP_ATOMIC) {
1556                     qemu_mutex_unlock_iothread();
1557                     cpu_exec_step_atomic(cpu);
1558                     qemu_mutex_lock_iothread();
1559                     break;
1560                 }
1561             } else if (cpu->stop) {
1562                 if (cpu->unplug) {
1563                     cpu = CPU_NEXT(cpu);
1564                 }
1565                 break;
1566             }
1567
1568             cpu = CPU_NEXT(cpu);
1569         } /* while (cpu && !cpu->exit_request).. */
1570
1571         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1572         atomic_set(&tcg_current_rr_cpu, NULL);
1573
1574         if (cpu && cpu->exit_request) {
1575             atomic_mb_set(&cpu->exit_request, 0);
1576         }
1577
1578         if (use_icount && all_cpu_threads_idle()) {
1579             /*
1580              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1581              * in the main_loop, wake it up in order to start the warp timer.
1582              */
1583             qemu_notify_event();
1584         }
1585
1586         qemu_tcg_rr_wait_io_event();
1587         deal_with_unplugged_cpus();
1588     }
1589
1590     rcu_unregister_thread();
1591     return NULL;
1592 }
1593
1594 static void *qemu_hax_cpu_thread_fn(void *arg)
1595 {
1596     CPUState *cpu = arg;
1597     int r;
1598
1599     rcu_register_thread();
1600     qemu_mutex_lock_iothread();
1601     qemu_thread_get_self(cpu->thread);
1602
1603     cpu->thread_id = qemu_get_thread_id();
1604     cpu->created = true;
1605     current_cpu = cpu;
1606
1607     hax_init_vcpu(cpu);
1608     qemu_cond_signal(&qemu_cpu_cond);
1609     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1610
1611     do {
1612         if (cpu_can_run(cpu)) {
1613             r = hax_smp_cpu_exec(cpu);
1614             if (r == EXCP_DEBUG) {
1615                 cpu_handle_guest_debug(cpu);
1616             }
1617         }
1618
1619         qemu_wait_io_event(cpu);
1620     } while (!cpu->unplug || cpu_can_run(cpu));
1621     rcu_unregister_thread();
1622     return NULL;
1623 }
1624
1625 /* The HVF-specific vCPU thread function. This one should only run when the host
1626  * CPU supports the VMX "unrestricted guest" feature. */
1627 static void *qemu_hvf_cpu_thread_fn(void *arg)
1628 {
1629     CPUState *cpu = arg;
1630
1631     int r;
1632
1633     assert(hvf_enabled());
1634
1635     rcu_register_thread();
1636
1637     qemu_mutex_lock_iothread();
1638     qemu_thread_get_self(cpu->thread);
1639
1640     cpu->thread_id = qemu_get_thread_id();
1641     cpu->can_do_io = 1;
1642     current_cpu = cpu;
1643
1644     hvf_init_vcpu(cpu);
1645
1646     /* signal CPU creation */
1647     cpu->created = true;
1648     qemu_cond_signal(&qemu_cpu_cond);
1649     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1650
1651     do {
1652         if (cpu_can_run(cpu)) {
1653             r = hvf_vcpu_exec(cpu);
1654             if (r == EXCP_DEBUG) {
1655                 cpu_handle_guest_debug(cpu);
1656             }
1657         }
1658         qemu_wait_io_event(cpu);
1659     } while (!cpu->unplug || cpu_can_run(cpu));
1660
1661     hvf_vcpu_destroy(cpu);
1662     cpu->created = false;
1663     qemu_cond_signal(&qemu_cpu_cond);
1664     qemu_mutex_unlock_iothread();
1665     rcu_unregister_thread();
1666     return NULL;
1667 }
1668
1669 static void *qemu_whpx_cpu_thread_fn(void *arg)
1670 {
1671     CPUState *cpu = arg;
1672     int r;
1673
1674     rcu_register_thread();
1675
1676     qemu_mutex_lock_iothread();
1677     qemu_thread_get_self(cpu->thread);
1678     cpu->thread_id = qemu_get_thread_id();
1679     current_cpu = cpu;
1680
1681     r = whpx_init_vcpu(cpu);
1682     if (r < 0) {
1683         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1684         exit(1);
1685     }
1686
1687     /* signal CPU creation */
1688     cpu->created = true;
1689     qemu_cond_signal(&qemu_cpu_cond);
1690     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1691
1692     do {
1693         if (cpu_can_run(cpu)) {
1694             r = whpx_vcpu_exec(cpu);
1695             if (r == EXCP_DEBUG) {
1696                 cpu_handle_guest_debug(cpu);
1697             }
1698         }
1699         while (cpu_thread_is_idle(cpu)) {
1700             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1701         }
1702         qemu_wait_io_event_common(cpu);
1703     } while (!cpu->unplug || cpu_can_run(cpu));
1704
1705     whpx_destroy_vcpu(cpu);
1706     cpu->created = false;
1707     qemu_cond_signal(&qemu_cpu_cond);
1708     qemu_mutex_unlock_iothread();
1709     rcu_unregister_thread();
1710     return NULL;
1711 }
1712
1713 #ifdef _WIN32
1714 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1715 {
1716 }
1717 #endif
1718
1719 /* Multi-threaded TCG
1720  *
1721  * In the multi-threaded case each vCPU has its own thread. The TLS
1722  * variable current_cpu can be used deep in the code to find the
1723  * current CPUState for a given thread.
1724  */
1725
1726 static void *qemu_tcg_cpu_thread_fn(void *arg)
1727 {
1728     CPUState *cpu = arg;
1729
1730     assert(tcg_enabled());
1731     g_assert(!use_icount);
1732
1733     rcu_register_thread();
1734     tcg_register_thread();
1735
1736     qemu_mutex_lock_iothread();
1737     qemu_thread_get_self(cpu->thread);
1738
1739     cpu->thread_id = qemu_get_thread_id();
1740     cpu->created = true;
1741     cpu->can_do_io = 1;
1742     current_cpu = cpu;
1743     qemu_cond_signal(&qemu_cpu_cond);
1744     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1745
1746     /* process any pending work */
1747     cpu->exit_request = 1;
1748
1749     do {
1750         if (cpu_can_run(cpu)) {
1751             int r;
1752             qemu_mutex_unlock_iothread();
1753             r = tcg_cpu_exec(cpu);
1754             qemu_mutex_lock_iothread();
1755             switch (r) {
1756             case EXCP_DEBUG:
1757                 cpu_handle_guest_debug(cpu);
1758                 break;
1759             case EXCP_HALTED:
1760                 /* During start-up the vCPU is reset and the thread is
1761                  * kicked several times. If we don't ensure we go back
1762                  * to sleep in the halted state we won't cleanly
1763                  * start up when the vCPU is enabled.
1764                  *
1765                  * cpu->halted should ensure we sleep in wait_io_event
1766                  */
1767                 g_assert(cpu->halted);
1768                 break;
1769             case EXCP_ATOMIC:
1770                 qemu_mutex_unlock_iothread();
1771                 cpu_exec_step_atomic(cpu);
1772                 qemu_mutex_lock_iothread();
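                /* fall through */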
1773             default:
1774                 /* Ignore everything else? */
1775                 break;
1776             }
1777         }
1778
1779         atomic_mb_set(&cpu->exit_request, 0);
1780         qemu_wait_io_event(cpu);
1781     } while (!cpu->unplug || cpu_can_run(cpu));
1782
1783     qemu_tcg_destroy_vcpu(cpu);
1784     cpu->created = false;
1785     qemu_cond_signal(&qemu_cpu_cond);
1786     qemu_mutex_unlock_iothread();
1787     rcu_unregister_thread();
1788     return NULL;
1789 }
1790
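/*
 * Wake the vCPU's host thread: on POSIX hosts by sending SIG_IPI with
 * pthread_kill(), on Windows via whpx_vcpu_kick() or by queuing a dummy APC.
 */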
1791 static void qemu_cpu_kick_thread(CPUState *cpu)
1792 {
1793 #ifndef _WIN32
1794     int err;
1795
1796     if (cpu->thread_kicked) {
1797         return;
1798     }
1799     cpu->thread_kicked = true;
1800     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1801     if (err && err != ESRCH) {
1802         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1803         exit(1);
1804     }
1805 #else /* _WIN32 */
1806     if (!qemu_cpu_is_self(cpu)) {
1807         if (whpx_enabled()) {
1808             whpx_vcpu_kick(cpu);
1809         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1810             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1811                     __func__, GetLastError());
1812             exit(1);
1813         }
1814     }
1815 #endif
1816 }
1817
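/*
 * Wake a vCPU: broadcast on its halt condition, then either force an exit
 * from the TCG execution loop or kick the host thread for hardware
 * accelerators.
 */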
1818 void qemu_cpu_kick(CPUState *cpu)
1819 {
1820     qemu_cond_broadcast(cpu->halt_cond);
1821     if (tcg_enabled()) {
1822         cpu_exit(cpu);
1823         /* NOP unless doing single-thread RR */
1824         qemu_cpu_kick_rr_cpu();
1825     } else {
1826         if (hax_enabled()) {
1827             /*
1828              * FIXME: race condition with the exit_request check in
1829              * hax_vcpu_hax_exec
1830              */
1831             cpu->exit_request = 1;
1832         }
1833         qemu_cpu_kick_thread(cpu);
1834     }
1835 }
1836
1837 void qemu_cpu_kick_self(void)
1838 {
1839     assert(current_cpu);
1840     qemu_cpu_kick_thread(current_cpu);
1841 }
1842
1843 bool qemu_cpu_is_self(CPUState *cpu)
1844 {
1845     return qemu_thread_is_self(cpu->thread);
1846 }
1847
1848 bool qemu_in_vcpu_thread(void)
1849 {
1850     return current_cpu && qemu_cpu_is_self(current_cpu);
1851 }
1852
1853 static __thread bool iothread_locked = false;
1854
1855 bool qemu_mutex_iothread_locked(void)
1856 {
1857     return iothread_locked;
1858 }
1859
1860 /*
1861  * The BQL is taken from so many places that it is worth profiling the
1862  * callers directly, instead of funneling them all through a single function.
1863  */
1864 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1865 {
1866     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1867
1868     g_assert(!qemu_mutex_iothread_locked());
1869     bql_lock(&qemu_global_mutex, file, line);
1870     iothread_locked = true;
1871 }
1872
1873 void qemu_mutex_unlock_iothread(void)
1874 {
1875     g_assert(qemu_mutex_iothread_locked());
1876     iothread_locked = false;
1877     qemu_mutex_unlock(&qemu_global_mutex);
1878 }
1879
1880 static bool all_vcpus_paused(void)
1881 {
1882     CPUState *cpu;
1883
1884     CPU_FOREACH(cpu) {
1885         if (!cpu->stopped) {
1886             return false;
1887         }
1888     }
1889
1890     return true;
1891 }
1892
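/*
 * Stop the virtual clock, request every vCPU to stop, and wait until all of
 * them have actually stopped.  The replay lock is dropped while waiting so
 * that woken vCPU threads can finish their replay work.
 */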
1893 void pause_all_vcpus(void)
1894 {
1895     CPUState *cpu;
1896
1897     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1898     CPU_FOREACH(cpu) {
1899         if (qemu_cpu_is_self(cpu)) {
1900             qemu_cpu_stop(cpu, true);
1901         } else {
1902             cpu->stop = true;
1903             qemu_cpu_kick(cpu);
1904         }
1905     }
1906
1907     /* We need to drop the replay_lock so any vCPU threads woken up
1908      * can finish their replay tasks
1909      */
1910     replay_mutex_unlock();
1911
1912     while (!all_vcpus_paused()) {
1913         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1914         CPU_FOREACH(cpu) {
1915             qemu_cpu_kick(cpu);
1916         }
1917     }
1918
1919     qemu_mutex_unlock_iothread();
1920     replay_mutex_lock();
1921     qemu_mutex_lock_iothread();
1922 }
1923
1924 void cpu_resume(CPUState *cpu)
1925 {
1926     cpu->stop = false;
1927     cpu->stopped = false;
1928     qemu_cpu_kick(cpu);
1929 }
1930
1931 void resume_all_vcpus(void)
1932 {
1933     CPUState *cpu;
1934
1935     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1936     CPU_FOREACH(cpu) {
1937         cpu_resume(cpu);
1938     }
1939 }
1940
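/*
 * Synchronously unplug a vCPU: request its thread to stop and exit, then
 * drop the BQL while joining the thread.
 */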
1941 void cpu_remove_sync(CPUState *cpu)
1942 {
1943     cpu->stop = true;
1944     cpu->unplug = true;
1945     qemu_cpu_kick(cpu);
1946     qemu_mutex_unlock_iothread();
1947     qemu_thread_join(cpu->thread);
1948     qemu_mutex_lock_iothread();
1949 }
1950
1951 /* For temporary buffers for forming a name */
1952 #define VCPU_THREAD_NAME_SIZE 16
1953
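/*
 * Start TCG execution for @cpu.  With MTTCG every vCPU gets its own thread;
 * otherwise a single round-robin thread, created for the first vCPU, is
 * shared by all of them.
 */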
1954 static void qemu_tcg_init_vcpu(CPUState *cpu)
1955 {
1956     char thread_name[VCPU_THREAD_NAME_SIZE];
1957     static QemuCond *single_tcg_halt_cond;
1958     static QemuThread *single_tcg_cpu_thread;
1959     static int tcg_region_inited;
1960
1961     assert(tcg_enabled());
1962     /*
1963      * Initialize TCG regions--once. Now is a good time, because:
1964      * (1) TCG's init context, prologue and target globals have been set up.
1965      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1966      *     -accel flag is processed, so the check doesn't work then).
1967      */
1968     if (!tcg_region_inited) {
1969         tcg_region_inited = 1;
1970         tcg_region_init();
1971     }
1972
1973     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1974         cpu->thread = g_malloc0(sizeof(QemuThread));
1975         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1976         qemu_cond_init(cpu->halt_cond);
1977
1978         if (qemu_tcg_mttcg_enabled()) {
1979             /* create a thread per vCPU with TCG (MTTCG) */
1980             parallel_cpus = true;
1981             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1982                      cpu->cpu_index);
1983
1984             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1985                                cpu, QEMU_THREAD_JOINABLE);
1986
1987         } else {
1988             /* share a single thread for all cpus with TCG */
1989             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1990             qemu_thread_create(cpu->thread, thread_name,
1991                                qemu_tcg_rr_cpu_thread_fn,
1992                                cpu, QEMU_THREAD_JOINABLE);
1993
1994             single_tcg_halt_cond = cpu->halt_cond;
1995             single_tcg_cpu_thread = cpu->thread;
1996         }
1997 #ifdef _WIN32
1998         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1999 #endif
2000     } else {
2001         /* For non-MTTCG cases we share the thread */
2002         cpu->thread = single_tcg_cpu_thread;
2003         cpu->halt_cond = single_tcg_halt_cond;
2004         cpu->thread_id = first_cpu->thread_id;
2005         cpu->can_do_io = 1;
2006         cpu->created = true;
2007     }
2008 }
2009
2010 static void qemu_hax_start_vcpu(CPUState *cpu)
2011 {
2012     char thread_name[VCPU_THREAD_NAME_SIZE];
2013
2014     cpu->thread = g_malloc0(sizeof(QemuThread));
2015     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2016     qemu_cond_init(cpu->halt_cond);
2017
2018     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2019              cpu->cpu_index);
2020     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2021                        cpu, QEMU_THREAD_JOINABLE);
2022 #ifdef _WIN32
2023     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2024 #endif
2025 }
2026
2027 static void qemu_kvm_start_vcpu(CPUState *cpu)
2028 {
2029     char thread_name[VCPU_THREAD_NAME_SIZE];
2030
2031     cpu->thread = g_malloc0(sizeof(QemuThread));
2032     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2033     qemu_cond_init(cpu->halt_cond);
2034     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2035              cpu->cpu_index);
2036     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2037                        cpu, QEMU_THREAD_JOINABLE);
2038 }
2039
2040 static void qemu_hvf_start_vcpu(CPUState *cpu)
2041 {
2042     char thread_name[VCPU_THREAD_NAME_SIZE];
2043
2044     /* HVF currently does not support TCG, and only runs in
2045      * unrestricted-guest mode. */
2046     assert(hvf_enabled());
2047
2048     cpu->thread = g_malloc0(sizeof(QemuThread));
2049     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2050     qemu_cond_init(cpu->halt_cond);
2051
2052     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2053              cpu->cpu_index);
2054     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2055                        cpu, QEMU_THREAD_JOINABLE);
2056 }
2057
2058 static void qemu_whpx_start_vcpu(CPUState *cpu)
2059 {
2060     char thread_name[VCPU_THREAD_NAME_SIZE];
2061
2062     cpu->thread = g_malloc0(sizeof(QemuThread));
2063     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2064     qemu_cond_init(cpu->halt_cond);
2065     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2066              cpu->cpu_index);
2067     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2068                        cpu, QEMU_THREAD_JOINABLE);
2069 #ifdef _WIN32
2070     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2071 #endif
2072 }
2073
2074 static void qemu_dummy_start_vcpu(CPUState *cpu)
2075 {
2076     char thread_name[VCPU_THREAD_NAME_SIZE];
2077
2078     cpu->thread = g_malloc0(sizeof(QemuThread));
2079     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2080     qemu_cond_init(cpu->halt_cond);
2081     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2082              cpu->cpu_index);
2083     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2084                        QEMU_THREAD_JOINABLE);
2085 }
2086
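/*
 * Common vCPU start-up: set the topology and default address space, start
 * the accelerator-specific vCPU thread, and wait until that thread has
 * signalled cpu->created.
 */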
2087 void qemu_init_vcpu(CPUState *cpu)
2088 {
2089     MachineState *ms = MACHINE(qdev_get_machine());
2090
2091     cpu->nr_cores = ms->smp.cores;
2092     cpu->nr_threads = ms->smp.threads;
2093     cpu->stopped = true;
2094     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2095
2096     if (!cpu->as) {
2097         /* If the target cpu hasn't set up any address spaces itself,
2098          * give it the default one.
2099          */
2100         cpu->num_ases = 1;
2101         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2102     }
2103
2104     if (kvm_enabled()) {
2105         qemu_kvm_start_vcpu(cpu);
2106     } else if (hax_enabled()) {
2107         qemu_hax_start_vcpu(cpu);
2108     } else if (hvf_enabled()) {
2109         qemu_hvf_start_vcpu(cpu);
2110     } else if (tcg_enabled()) {
2111         qemu_tcg_init_vcpu(cpu);
2112     } else if (whpx_enabled()) {
2113         qemu_whpx_start_vcpu(cpu);
2114     } else {
2115         qemu_dummy_start_vcpu(cpu);
2116     }
2117
2118     while (!cpu->created) {
2119         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2120     }
2121 }
2122
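/*
 * Request the currently executing vCPU, if any, to stop and break out of
 * its execution loop.
 */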
2123 void cpu_stop_current(void)
2124 {
2125     if (current_cpu) {
2126         current_cpu->stop = true;
2127         cpu_exit(current_cpu);
2128     }
2129 }
2130
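/*
 * Stop the VM in state @state.  When called from a vCPU thread the stop is
 * only requested here; the current CPU stops itself and the main loop
 * completes the transition later.
 */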
2131 int vm_stop(RunState state)
2132 {
2133     if (qemu_in_vcpu_thread()) {
2134         qemu_system_vmstop_request_prepare();
2135         qemu_system_vmstop_request(state);
2136         /*
2137          * FIXME: should not return to device code in case
2138          * vm_stop() has been requested.
2139          */
2140         cpu_stop_current();
2141         return 0;
2142     }
2143
2144     return do_vm_stop(state, true);
2145 }
2146
2147 /**
2148  * Prepare for (re)starting the VM.
2149  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2150  * running or in case of an error condition), 0 otherwise.
2151  */
2152 int vm_prepare_start(void)
2153 {
2154     RunState requested;
2155
2156     qemu_vmstop_requested(&requested);
2157     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2158         return -1;
2159     }
2160
2161     /* Ensure that a STOP/RESUME pair of events is emitted if a
2162      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2163      * example, is documented as always being followed by the STOP
2164      * event.
2165      */
2166     if (runstate_is_running()) {
2167         qapi_event_send_stop();
2168         qapi_event_send_resume();
2169         return -1;
2170     }
2171
2172     /* We send this now, but the CPUs will be resumed shortly afterwards */
2173     qapi_event_send_resume();
2174
2175     replay_enable_events();
2176     cpu_enable_ticks();
2177     runstate_set(RUN_STATE_RUNNING);
2178     vm_state_notify(1, RUN_STATE_RUNNING);
2179     return 0;
2180 }
2181
2182 void vm_start(void)
2183 {
2184     if (!vm_prepare_start()) {
2185         resume_all_vcpus();
2186     }
2187 }
2188
2189 /* Does a state transition even if the VM is already stopped;
2190    the current state is forgotten forever. */
2191 int vm_stop_force_state(RunState state)
2192 {
2193     if (runstate_is_running()) {
2194         return vm_stop(state);
2195     } else {
2196         runstate_set(state);
2197
2198         bdrv_drain_all();
2199         /* Make sure to return an error if the flush in a previous vm_stop()
2200          * failed. */
2201         return bdrv_flush_all();
2202     }
2203 }
2204
2205 void list_cpus(const char *optarg)
2206 {
2207     /* XXX: implement xxx_cpu_list for targets that still lack it */
2208 #if defined(cpu_list)
2209     cpu_list();
2210 #endif
2211 }
2212
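/*
 * QMP 'memsave': write @size bytes of guest virtual memory starting at
 * @addr, as seen by the selected vCPU, to @filename.
 */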
2213 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2214                  bool has_cpu, int64_t cpu_index, Error **errp)
2215 {
2216     FILE *f;
2217     uint32_t l;
2218     CPUState *cpu;
2219     uint8_t buf[1024];
2220     int64_t orig_addr = addr, orig_size = size;
2221
2222     if (!has_cpu) {
2223         cpu_index = 0;
2224     }
2225
2226     cpu = qemu_get_cpu(cpu_index);
2227     if (cpu == NULL) {
2228         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2229                    "a CPU number");
2230         return;
2231     }
2232
2233     f = fopen(filename, "wb");
2234     if (!f) {
2235         error_setg_file_open(errp, errno, filename);
2236         return;
2237     }
2238
2239     while (size != 0) {
2240         l = sizeof(buf);
2241         if (l > size)
2242             l = size;
2243         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2244             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2245                              " specified", orig_addr, orig_size);
2246             goto exit;
2247         }
2248         if (fwrite(buf, 1, l, f) != l) {
2249             error_setg(errp, QERR_IO_ERROR);
2250             goto exit;
2251         }
2252         addr += l;
2253         size -= l;
2254     }
2255
2256 exit:
2257     fclose(f);
2258 }
2259
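/*
 * QMP 'pmemsave': write @size bytes of guest physical memory starting at
 * @addr to @filename.
 */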
2260 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2261                   Error **errp)
2262 {
2263     FILE *f;
2264     uint32_t l;
2265     uint8_t buf[1024];
2266
2267     f = fopen(filename, "wb");
2268     if (!f) {
2269         error_setg_file_open(errp, errno, filename);
2270         return;
2271     }
2272
2273     while (size != 0) {
2274         l = sizeof(buf);
2275         if (l > size)
2276             l = size;
2277         cpu_physical_memory_read(addr, buf, l);
2278         if (fwrite(buf, 1, l, f) != l) {
2279             error_setg(errp, QERR_IO_ERROR);
2280             goto exit;
2281         }
2282         addr += l;
2283         size -= l;
2284     }
2285
2286 exit:
2287     fclose(f);
2288 }
2289
2290 void qmp_inject_nmi(Error **errp)
2291 {
2292     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2293 }
2294
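/*
 * Print how far the guest icount clock has drifted from the host clock,
 * plus the maximum observed delay/advance when -icount align is in use.
 */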
2295 void dump_drift_info(void)
2296 {
2297     if (!use_icount) {
2298         return;
2299     }
2300
2301     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2302                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2303     if (icount_align_option) {
2304         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2305                     -max_delay / SCALE_MS);
2306         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2307                     max_advance / SCALE_MS);
2308     } else {
2309         qemu_printf("Max guest delay     NA\n");
2310         qemu_printf("Max guest advance   NA\n");
2311     }
2312 }