[qemu.git] / cpus.c (as of commit "spapr: Don't request to unplug the same core twice")
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
55 #include "tcg.h"
56 #include "hw/nmi.h"
57 #include "sysemu/replay.h"
58 #include "sysemu/runstate.h"
59 #include "hw/boards.h"
60 #include "hw/hw.h"
61
62 #ifdef CONFIG_LINUX
63
64 #include <sys/prctl.h>
65
66 #ifndef PR_MCE_KILL
67 #define PR_MCE_KILL 33
68 #endif
69
70 #ifndef PR_MCE_KILL_SET
71 #define PR_MCE_KILL_SET 1
72 #endif
73
74 #ifndef PR_MCE_KILL_EARLY
75 #define PR_MCE_KILL_EARLY 1
76 #endif
77
78 #endif /* CONFIG_LINUX */
79
80 static QemuMutex qemu_global_mutex;
81
82 int64_t max_delay;
83 int64_t max_advance;
84
85 /* vcpu throttling controls */
86 static QEMUTimer *throttle_timer;
87 static unsigned int throttle_percentage;
88
89 #define CPU_THROTTLE_PCT_MIN 1
90 #define CPU_THROTTLE_PCT_MAX 99
91 #define CPU_THROTTLE_TIMESLICE_NS 10000000
92
93 bool cpu_is_stopped(CPUState *cpu)
94 {
95     return cpu->stopped || !runstate_is_running();
96 }
97
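/* A vCPU thread counts as idle when no stop or queued-work request is
 * pending and the vCPU is either fully stopped or halted with no pending
 * work; a halted vCPU is not idle here when KVM handles halts in-kernel.
 */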
98 static bool cpu_thread_is_idle(CPUState *cpu)
99 {
100     if (cpu->stop || cpu->queued_work_first) {
101         return false;
102     }
103     if (cpu_is_stopped(cpu)) {
104         return true;
105     }
106     if (!cpu->halted || cpu_has_work(cpu) ||
107         kvm_halt_in_kernel()) {
108         return false;
109     }
110     return true;
111 }
112
113 static bool all_cpu_threads_idle(void)
114 {
115     CPUState *cpu;
116
117     CPU_FOREACH(cpu) {
118         if (!cpu_thread_is_idle(cpu)) {
119             return false;
120         }
121     }
122     return true;
123 }
124
125 /***********************************************************/
126 /* guest cycle counter */
127
128 /* Protected by TimersState seqlock */
129
130 static bool icount_sleep = true;
131 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
132 #define MAX_ICOUNT_SHIFT 10
133
134 typedef struct TimersState {
135     /* Protected by BQL.  */
136     int64_t cpu_ticks_prev;
137     int64_t cpu_ticks_offset;
138
139     /* Protect fields that can be read outside the BQL and that are
140      * written from multiple threads.
141      */
142     QemuSeqLock vm_clock_seqlock;
143     QemuSpin vm_clock_lock;
144
145     int16_t cpu_ticks_enabled;
146
147     /* Conversion factor from emulated instructions to virtual clock ticks.  */
148     int16_t icount_time_shift;
149
150     /* Compensate for varying guest execution speed.  */
151     int64_t qemu_icount_bias;
152
153     int64_t vm_clock_warp_start;
154     int64_t cpu_clock_offset;
155
156     /* Only written by TCG thread */
157     int64_t qemu_icount;
158
159     /* for adjusting icount */
160     QEMUTimer *icount_rt_timer;
161     QEMUTimer *icount_vm_timer;
162     QEMUTimer *icount_warp_timer;
163 } TimersState;
164
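/* Reader/writer pattern used for timers_state throughout this file (see
 * cpu_get_icount() and cpu_update_icount() below): readers spin on the
 * seqlock,
 *
 *     do {
 *         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 *         ... read the protected fields ...
 *     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 *
 * while writers serialize on vm_clock_lock via seqlock_write_lock()/
 * seqlock_write_unlock().
 */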
165 static TimersState timers_state;
166 bool mttcg_enabled;
167
168 /*
169  * We default to false if we know other options have been enabled
170  * which are currently incompatible with MTTCG. Otherwise, once a
171  * guest (target) has been updated to support:
172  *   - atomic instructions
173  *   - memory ordering primitives (barriers)
174  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
175  *
176  * Once a guest architecture has been converted to the new primitives
177  * there are two remaining limitations to check.
178  *
179  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
180  * - The host must have a stronger memory order than the guest
181  *
182  * It may be possible in future to support strong guests on weak hosts
183  * but that will require tagging all load/stores in a guest with their
184  * implicit memory order requirements which would likely slow things
185  * down a lot.
186  */
187
188 static bool check_tcg_memory_orders_compatible(void)
189 {
190 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
191     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
192 #else
193     return false;
194 #endif
195 }
196
197 static bool default_mttcg_enabled(void)
198 {
199     if (use_icount || TCG_OVERSIZED_GUEST) {
200         return false;
201     } else {
202 #ifdef TARGET_SUPPORTS_MTTCG
203         return check_tcg_memory_orders_compatible();
204 #else
205         return false;
206 #endif
207     }
208 }
209
210 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
211 {
212     const char *t = qemu_opt_get(opts, "thread");
213     if (t) {
214         if (strcmp(t, "multi") == 0) {
215             if (TCG_OVERSIZED_GUEST) {
216                 error_setg(errp, "No MTTCG when guest word size > hosts");
217             } else if (use_icount) {
218                 error_setg(errp, "No MTTCG when icount is enabled");
219             } else {
220 #ifndef TARGET_SUPPORTS_MTTCG
221                 warn_report("Guest not yet converted to MTTCG - "
222                             "you may get unexpected results");
223 #endif
224                 if (!check_tcg_memory_orders_compatible()) {
225                     warn_report("Guest expects a stronger memory ordering "
226                                 "than the host provides");
227                     error_printf("This may cause strange/hard to debug errors\n");
228                 }
229                 mttcg_enabled = true;
230             }
231         } else if (strcmp(t, "single") == 0) {
232             mttcg_enabled = false;
233         } else {
234             error_setg(errp, "Invalid 'thread' setting %s", t);
235         }
236     } else {
237         mttcg_enabled = default_mttcg_enabled();
238     }
239 }
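/* The "thread" option handled above is typically spelled
 * "-accel tcg,thread=single|multi" on the command line; when it is absent,
 * the MTTCG default computed by default_mttcg_enabled() is used.
 */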
240
241 /* The current number of executed instructions is based on what we
242  * originally budgeted minus the current state of the decrementing
243  * icount counters in extra/u16.low.
244  */
245 static int64_t cpu_get_icount_executed(CPUState *cpu)
246 {
247     return (cpu->icount_budget -
248             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
249 }
250
251 /*
252  * Update the global shared timer_state.qemu_icount to take into
253  * account executed instructions. This is done by the TCG vCPU
254  * thread so the main-loop can see time has moved forward.
255  */
256 static void cpu_update_icount_locked(CPUState *cpu)
257 {
258     int64_t executed = cpu_get_icount_executed(cpu);
259     cpu->icount_budget -= executed;
260
261     atomic_set_i64(&timers_state.qemu_icount,
262                    timers_state.qemu_icount + executed);
263 }
264
265 /*
266  * Update the global shared timer_state.qemu_icount to take into
267  * account executed instructions. This is done by the TCG vCPU
268  * thread so the main-loop can see time has moved forward.
269  */
270 void cpu_update_icount(CPUState *cpu)
271 {
272     seqlock_write_lock(&timers_state.vm_clock_seqlock,
273                        &timers_state.vm_clock_lock);
274     cpu_update_icount_locked(cpu);
275     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
276                          &timers_state.vm_clock_lock);
277 }
278
279 static int64_t cpu_get_icount_raw_locked(void)
280 {
281     CPUState *cpu = current_cpu;
282
283     if (cpu && cpu->running) {
284         if (!cpu->can_do_io) {
285             error_report("Bad icount read");
286             exit(1);
287         }
288         /* Take into account what has run */
289         cpu_update_icount_locked(cpu);
290     }
291     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
292     return atomic_read_i64(&timers_state.qemu_icount);
293 }
294
295 static int64_t cpu_get_icount_locked(void)
296 {
297     int64_t icount = cpu_get_icount_raw_locked();
298     return atomic_read_i64(&timers_state.qemu_icount_bias) +
299         cpu_icount_to_ns(icount);
300 }
301
302 int64_t cpu_get_icount_raw(void)
303 {
304     int64_t icount;
305     unsigned start;
306
307     do {
308         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
309         icount = cpu_get_icount_raw_locked();
310     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
311
312     return icount;
313 }
314
315 /* Return the virtual CPU time, based on the instruction counter.  */
316 int64_t cpu_get_icount(void)
317 {
318     int64_t icount;
319     unsigned start;
320
321     do {
322         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
323         icount = cpu_get_icount_locked();
324     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
325
326     return icount;
327 }
328
329 int64_t cpu_icount_to_ns(int64_t icount)
330 {
331     return icount << atomic_read(&timers_state.icount_time_shift);
332 }
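/* Each instruction therefore accounts for 2^icount_time_shift ns of virtual
 * time: with the default shift of 3 set in configure_icount() below, one
 * instruction is 8 ns, i.e. the "125MIPS" initial guess mentioned there.
 */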
333
334 static int64_t cpu_get_ticks_locked(void)
335 {
336     int64_t ticks = timers_state.cpu_ticks_offset;
337     if (timers_state.cpu_ticks_enabled) {
338         ticks += cpu_get_host_ticks();
339     }
340
341     if (timers_state.cpu_ticks_prev > ticks) {
342         /* Non-increasing ticks may happen if the host uses software suspend.  */
343         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
344         ticks = timers_state.cpu_ticks_prev;
345     }
346
347     timers_state.cpu_ticks_prev = ticks;
348     return ticks;
349 }
350
351 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
352  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
353  * counter.
354  */
355 int64_t cpu_get_ticks(void)
356 {
357     int64_t ticks;
358
359     if (use_icount) {
360         return cpu_get_icount();
361     }
362
363     qemu_spin_lock(&timers_state.vm_clock_lock);
364     ticks = cpu_get_ticks_locked();
365     qemu_spin_unlock(&timers_state.vm_clock_lock);
366     return ticks;
367 }
368
369 static int64_t cpu_get_clock_locked(void)
370 {
371     int64_t time;
372
373     time = timers_state.cpu_clock_offset;
374     if (timers_state.cpu_ticks_enabled) {
375         time += get_clock();
376     }
377
378     return time;
379 }
380
381 /* Return the monotonic time elapsed in VM, i.e.,
382  * the time between vm_start and vm_stop
383  */
384 int64_t cpu_get_clock(void)
385 {
386     int64_t ti;
387     unsigned start;
388
389     do {
390         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
391         ti = cpu_get_clock_locked();
392     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
393
394     return ti;
395 }
396
397 /* enable cpu_get_ticks()
398  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
399  */
400 void cpu_enable_ticks(void)
401 {
402     seqlock_write_lock(&timers_state.vm_clock_seqlock,
403                        &timers_state.vm_clock_lock);
404     if (!timers_state.cpu_ticks_enabled) {
405         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
406         timers_state.cpu_clock_offset -= get_clock();
407         timers_state.cpu_ticks_enabled = 1;
408     }
409     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
410                        &timers_state.vm_clock_lock);
411 }
412
413 /* disable cpu_get_ticks() : the clock is stopped. You must not call
414  * cpu_get_ticks() after that.
415  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
416  */
417 void cpu_disable_ticks(void)
418 {
419     seqlock_write_lock(&timers_state.vm_clock_seqlock,
420                        &timers_state.vm_clock_lock);
421     if (timers_state.cpu_ticks_enabled) {
422         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
423         timers_state.cpu_clock_offset = cpu_get_clock_locked();
424         timers_state.cpu_ticks_enabled = 0;
425     }
426     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
427                          &timers_state.vm_clock_lock);
428 }
429
430 /* Correlation between real and virtual time is always going to be
431    fairly approximate, so ignore small variation.
432    When the guest is idle real and virtual time will be aligned in
433    the IO wait loop.  */
434 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
435
436 static void icount_adjust(void)
437 {
438     int64_t cur_time;
439     int64_t cur_icount;
440     int64_t delta;
441
442     /* Protected by TimersState mutex.  */
443     static int64_t last_delta;
444
445     /* If the VM is not running, then do nothing.  */
446     if (!runstate_is_running()) {
447         return;
448     }
449
450     seqlock_write_lock(&timers_state.vm_clock_seqlock,
451                        &timers_state.vm_clock_lock);
452     cur_time = cpu_get_clock_locked();
453     cur_icount = cpu_get_icount_locked();
454
455     delta = cur_icount - cur_time;
456     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
457     if (delta > 0
458         && last_delta + ICOUNT_WOBBLE < delta * 2
459         && timers_state.icount_time_shift > 0) {
460         /* The guest is getting too far ahead.  Slow time down.  */
461         atomic_set(&timers_state.icount_time_shift,
462                    timers_state.icount_time_shift - 1);
463     }
464     if (delta < 0
465         && last_delta - ICOUNT_WOBBLE > delta * 2
466         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
467         /* The guest is getting too far behind.  Speed time up.  */
468         atomic_set(&timers_state.icount_time_shift,
469                    timers_state.icount_time_shift + 1);
470     }
471     last_delta = delta;
472     atomic_set_i64(&timers_state.qemu_icount_bias,
473                    cur_icount - (timers_state.qemu_icount
474                                  << timers_state.icount_time_shift));
475     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
476                          &timers_state.vm_clock_lock);
477 }
478
479 static void icount_adjust_rt(void *opaque)
480 {
481     timer_mod(timers_state.icount_rt_timer,
482               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
483     icount_adjust();
484 }
485
486 static void icount_adjust_vm(void *opaque)
487 {
488     timer_mod(timers_state.icount_vm_timer,
489                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
490                    NANOSECONDS_PER_SECOND / 10);
491     icount_adjust();
492 }
493
494 static int64_t qemu_icount_round(int64_t count)
495 {
496     int shift = atomic_read(&timers_state.icount_time_shift);
497     return (count + (1 << shift) - 1) >> shift;
498 }
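/* qemu_icount_round() converts a nanosecond deadline into a whole number of
 * instructions, rounding up: with shift == 3, a 20 ns deadline becomes
 * (20 + 7) >> 3 = 3 instructions.
 */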
499
500 static void icount_warp_rt(void)
501 {
502     unsigned seq;
503     int64_t warp_start;
504
505     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
506      * changes from -1 to another value, so the race here is okay.
507      */
508     do {
509         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
510         warp_start = timers_state.vm_clock_warp_start;
511     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
512
513     if (warp_start == -1) {
514         return;
515     }
516
517     seqlock_write_lock(&timers_state.vm_clock_seqlock,
518                        &timers_state.vm_clock_lock);
519     if (runstate_is_running()) {
520         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
521                                             cpu_get_clock_locked());
522         int64_t warp_delta;
523
524         warp_delta = clock - timers_state.vm_clock_warp_start;
525         if (use_icount == 2) {
526             /*
527              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
528              * far ahead of real time.
529              */
530             int64_t cur_icount = cpu_get_icount_locked();
531             int64_t delta = clock - cur_icount;
532             warp_delta = MIN(warp_delta, delta);
533         }
534         atomic_set_i64(&timers_state.qemu_icount_bias,
535                        timers_state.qemu_icount_bias + warp_delta);
536     }
537     timers_state.vm_clock_warp_start = -1;
538     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
539                        &timers_state.vm_clock_lock);
540
541     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
542         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
543     }
544 }
545
546 static void icount_timer_cb(void *opaque)
547 {
548     /* No need for a checkpoint because the timer already synchronizes
549      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
550      */
551     icount_warp_rt();
552 }
553
554 void qtest_clock_warp(int64_t dest)
555 {
556     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
557     AioContext *aio_context;
558     assert(qtest_enabled());
559     aio_context = qemu_get_aio_context();
560     while (clock < dest) {
561         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
562                                                       QEMU_TIMER_ATTR_ALL);
563         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
564
565         seqlock_write_lock(&timers_state.vm_clock_seqlock,
566                            &timers_state.vm_clock_lock);
567         atomic_set_i64(&timers_state.qemu_icount_bias,
568                        timers_state.qemu_icount_bias + warp);
569         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
570                              &timers_state.vm_clock_lock);
571
572         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
573         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
574         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
575     }
576     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
577 }
578
579 void qemu_start_warp_timer(void)
580 {
581     int64_t clock;
582     int64_t deadline;
583
584     if (!use_icount) {
585         return;
586     }
587
588     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
589      * do not fire, so computing the deadline does not make sense.
590      */
591     if (!runstate_is_running()) {
592         return;
593     }
594
595     if (replay_mode != REPLAY_MODE_PLAY) {
596         if (!all_cpu_threads_idle()) {
597             return;
598         }
599
600         if (qtest_enabled()) {
601             /* When testing, qtest commands advance icount.  */
602             return;
603         }
604
605         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
606     } else {
607         /* warp clock deterministically in record/replay mode */
608         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
609             /* vCPU is sleeping and warp can't be started.
610                It is probably a race condition: the notification sent
611                to the vCPU was processed in advance and the vCPU went
612                to sleep. Therefore we have to wake it up to do something. */
613             if (replay_has_checkpoint()) {
614                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
615             }
616             return;
617         }
618     }
619
620     /* We want to use the earliest deadline from ALL vm_clocks */
621     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
622     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
623                                           ~QEMU_TIMER_ATTR_EXTERNAL);
624     if (deadline < 0) {
625         static bool notified;
626         if (!icount_sleep && !notified) {
627             warn_report("icount sleep disabled and no active timers");
628             notified = true;
629         }
630         return;
631     }
632
633     if (deadline > 0) {
634         /*
635          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
636          * sleep.  Otherwise, the CPU might be waiting for a future timer
637          * interrupt to wake it up, but the interrupt never comes because
638          * the vCPU isn't running any insns and thus doesn't advance the
639          * QEMU_CLOCK_VIRTUAL.
640          */
641         if (!icount_sleep) {
642             /*
643              * We never let VCPUs sleep in no sleep icount mode.
644              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
645              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
646              * It is useful when we want a deterministic execution time,
647              * isolated from host latencies.
648              */
649             seqlock_write_lock(&timers_state.vm_clock_seqlock,
650                                &timers_state.vm_clock_lock);
651             atomic_set_i64(&timers_state.qemu_icount_bias,
652                            timers_state.qemu_icount_bias + deadline);
653             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
654                                  &timers_state.vm_clock_lock);
655             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
656         } else {
657             /*
658              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
659              * "real" time, (related to the time left until the next event) has
660              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
661              * This prevents the warps from being visible externally; for
662              * example, you will not be sending network packets continuously
663              * instead of every 100ms.
664              */
665             seqlock_write_lock(&timers_state.vm_clock_seqlock,
666                                &timers_state.vm_clock_lock);
667             if (timers_state.vm_clock_warp_start == -1
668                 || timers_state.vm_clock_warp_start > clock) {
669                 timers_state.vm_clock_warp_start = clock;
670             }
671             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
672                                  &timers_state.vm_clock_lock);
673             timer_mod_anticipate(timers_state.icount_warp_timer,
674                                  clock + deadline);
675         }
676     } else if (deadline == 0) {
677         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
678     }
679 }
680
681 static void qemu_account_warp_timer(void)
682 {
683     if (!use_icount || !icount_sleep) {
684         return;
685     }
686
687     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
688      * do not fire, so computing the deadline does not make sense.
689      */
690     if (!runstate_is_running()) {
691         return;
692     }
693
694     /* warp clock deterministically in record/replay mode */
695     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
696         return;
697     }
698
699     timer_del(timers_state.icount_warp_timer);
700     icount_warp_rt();
701 }
702
703 static bool icount_state_needed(void *opaque)
704 {
705     return use_icount;
706 }
707
708 static bool warp_timer_state_needed(void *opaque)
709 {
710     TimersState *s = opaque;
711     return s->icount_warp_timer != NULL;
712 }
713
714 static bool adjust_timers_state_needed(void *opaque)
715 {
716     TimersState *s = opaque;
717     return s->icount_rt_timer != NULL;
718 }
719
720 /*
721  * Subsection for warp timer migration is optional, because it may not be created
722  */
723 static const VMStateDescription icount_vmstate_warp_timer = {
724     .name = "timer/icount/warp_timer",
725     .version_id = 1,
726     .minimum_version_id = 1,
727     .needed = warp_timer_state_needed,
728     .fields = (VMStateField[]) {
729         VMSTATE_INT64(vm_clock_warp_start, TimersState),
730         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
731         VMSTATE_END_OF_LIST()
732     }
733 };
734
735 static const VMStateDescription icount_vmstate_adjust_timers = {
736     .name = "timer/icount/timers",
737     .version_id = 1,
738     .minimum_version_id = 1,
739     .needed = adjust_timers_state_needed,
740     .fields = (VMStateField[]) {
741         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
742         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
743         VMSTATE_END_OF_LIST()
744     }
745 };
746
747 /*
748  * This is a subsection for icount migration.
749  */
750 static const VMStateDescription icount_vmstate_timers = {
751     .name = "timer/icount",
752     .version_id = 1,
753     .minimum_version_id = 1,
754     .needed = icount_state_needed,
755     .fields = (VMStateField[]) {
756         VMSTATE_INT64(qemu_icount_bias, TimersState),
757         VMSTATE_INT64(qemu_icount, TimersState),
758         VMSTATE_END_OF_LIST()
759     },
760     .subsections = (const VMStateDescription*[]) {
761         &icount_vmstate_warp_timer,
762         &icount_vmstate_adjust_timers,
763         NULL
764     }
765 };
766
767 static const VMStateDescription vmstate_timers = {
768     .name = "timer",
769     .version_id = 2,
770     .minimum_version_id = 1,
771     .fields = (VMStateField[]) {
772         VMSTATE_INT64(cpu_ticks_offset, TimersState),
773         VMSTATE_UNUSED(8),
774         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
775         VMSTATE_END_OF_LIST()
776     },
777     .subsections = (const VMStateDescription*[]) {
778         &icount_vmstate_timers,
779         NULL
780     }
781 };
782
783 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
784 {
785     double pct;
786     double throttle_ratio;
787     int64_t sleeptime_ns, endtime_ns;
788
789     if (!cpu_throttle_get_percentage()) {
790         return;
791     }
792
793     pct = (double)cpu_throttle_get_percentage()/100;
794     throttle_ratio = pct / (1 - pct);
795     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
796     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
797     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
798     while (sleeptime_ns > 0 && !cpu->stop) {
799         if (sleeptime_ns > SCALE_MS) {
800             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
801                                 sleeptime_ns / SCALE_MS);
802         } else {
803             qemu_mutex_unlock_iothread();
804             g_usleep(sleeptime_ns / SCALE_US);
805             qemu_mutex_lock_iothread();
806         }
807         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
808     }
809     atomic_set(&cpu->throttle_thread_scheduled, 0);
810 }
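/* Worked example for the sleep computation above: at a 50% throttle,
 * pct = 0.5 and throttle_ratio = 0.5 / (1 - 0.5) = 1.0, so every 10 ms
 * timeslice of execution is paired with roughly 10 ms of sleep; at the
 * 99% maximum the ratio is 99 and the vCPU sleeps about 990 ms per slice.
 */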
811
812 static void cpu_throttle_timer_tick(void *opaque)
813 {
814     CPUState *cpu;
815     double pct;
816
817     /* Stop the timer if needed */
818     if (!cpu_throttle_get_percentage()) {
819         return;
820     }
821     CPU_FOREACH(cpu) {
822         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
823             async_run_on_cpu(cpu, cpu_throttle_thread,
824                              RUN_ON_CPU_NULL);
825         }
826     }
827
828     pct = (double)cpu_throttle_get_percentage()/100;
829     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
830                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
831 }
832
833 void cpu_throttle_set(int new_throttle_pct)
834 {
835     /* Ensure throttle percentage is within valid range */
836     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
837     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
838
839     atomic_set(&throttle_percentage, new_throttle_pct);
840
841     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
842                                        CPU_THROTTLE_TIMESLICE_NS);
843 }
844
845 void cpu_throttle_stop(void)
846 {
847     atomic_set(&throttle_percentage, 0);
848 }
849
850 bool cpu_throttle_active(void)
851 {
852     return (cpu_throttle_get_percentage() != 0);
853 }
854
855 int cpu_throttle_get_percentage(void)
856 {
857     return atomic_read(&throttle_percentage);
858 }
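/* cpu_throttle_set()/cpu_throttle_stop() are the knobs used by callers such
 * as migration auto-converge; the percentage is the fraction of wall-clock
 * time each vCPU spends sleeping rather than executing.
 */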
859
860 void cpu_ticks_init(void)
861 {
862     seqlock_init(&timers_state.vm_clock_seqlock);
863     qemu_spin_init(&timers_state.vm_clock_lock);
864     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
865     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
866                                            cpu_throttle_timer_tick, NULL);
867 }
868
869 void configure_icount(QemuOpts *opts, Error **errp)
870 {
871     const char *option;
872     char *rem_str = NULL;
873
874     option = qemu_opt_get(opts, "shift");
875     if (!option) {
876         if (qemu_opt_get(opts, "align") != NULL) {
877             error_setg(errp, "Please specify shift option when using align");
878         }
879         return;
880     }
881
882     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
883     if (icount_sleep) {
884         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
885                                          icount_timer_cb, NULL);
886     }
887
888     icount_align_option = qemu_opt_get_bool(opts, "align", false);
889
890     if (icount_align_option && !icount_sleep) {
891         error_setg(errp, "align=on and sleep=off are incompatible");
892     }
893     if (strcmp(option, "auto") != 0) {
894         errno = 0;
895         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
896         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
897             error_setg(errp, "icount: Invalid shift value");
898         }
899         use_icount = 1;
900         return;
901     } else if (icount_align_option) {
902         error_setg(errp, "shift=auto and align=on are incompatible");
903     } else if (!icount_sleep) {
904         error_setg(errp, "shift=auto and sleep=off are incompatible");
905     }
906
907     use_icount = 2;
908
909     /* 125MIPS seems a reasonable initial guess at the guest speed.
910        It will be corrected fairly quickly anyway.  */
911     timers_state.icount_time_shift = 3;
912
913     /* Have both realtime and virtual time triggers for speed adjustment.
914        The realtime trigger catches emulated time passing too slowly,
915        the virtual time trigger catches emulated time passing too fast.
916        Realtime triggers occur even when idle, so use them less frequently
917        than VM triggers.  */
918     timers_state.vm_clock_warp_start = -1;
919     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
920                                    icount_adjust_rt, NULL);
921     timer_mod(timers_state.icount_rt_timer,
922                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
923     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
924                                         icount_adjust_vm, NULL);
925     timer_mod(timers_state.icount_vm_timer,
926                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
927                    NANOSECONDS_PER_SECOND / 10);
928 }
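/* configure_icount() is driven by the -icount command-line option, e.g.
 * "-icount shift=7,align=on" or "-icount shift=auto"; shift=auto selects
 * the adaptive mode (use_icount == 2) set up above, while a fixed shift
 * sets use_icount == 1.
 */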
929
930 /***********************************************************/
931 /* TCG vCPU kick timer
932  *
933  * The kick timer is responsible for moving single-threaded vCPU
934  * emulation on to the next vCPU. If more than one vCPU is running, a
935  * timer event will force a cpu->exit so the next vCPU can get
936  * scheduled.
937  *
938  * The timer is removed while all vCPUs are idle and restarted again
939  * once any vCPU stops being idle.
940  */
941
942 static QEMUTimer *tcg_kick_vcpu_timer;
943 static CPUState *tcg_current_rr_cpu;
944
945 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
946
947 static inline int64_t qemu_tcg_next_kick(void)
948 {
949     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
950 }
951
952 /* Kick the currently round-robin scheduled vCPU to next */
953 static void qemu_cpu_kick_rr_next_cpu(void)
954 {
955     CPUState *cpu;
956     do {
957         cpu = atomic_mb_read(&tcg_current_rr_cpu);
958         if (cpu) {
959             cpu_exit(cpu);
960         }
961     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
962 }
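/* The retry loop above copes with the scheduler racing with us: if the
 * round-robin thread moves tcg_current_rr_cpu to another vCPU while we are
 * kicking the old one, the new one gets kicked as well.
 */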
963
964 /* Kick all RR vCPUs */
965 static void qemu_cpu_kick_rr_cpus(void)
966 {
967     CPUState *cpu;
968
969     CPU_FOREACH(cpu) {
970         cpu_exit(cpu);
971     };
972 }
973
974 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
975 {
976 }
977
978 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
979 {
980     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
981         qemu_notify_event();
982         return;
983     }
984
985     if (qemu_in_vcpu_thread()) {
986         /* A CPU is currently running; kick it back out to the
987          * tcg_cpu_exec() loop so it will recalculate its
988          * icount deadline immediately.
989          */
990         qemu_cpu_kick(current_cpu);
991     } else if (first_cpu) {
992         /* qemu_cpu_kick is not enough to kick a halted CPU out of
993          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
994          * causes cpu_thread_is_idle to return false.  This way,
995          * handle_icount_deadline can run.
996          * If we have no CPUs at all for some reason, we don't
997          * need to do anything.
998          */
999         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
1000     }
1001 }
1002
1003 static void kick_tcg_thread(void *opaque)
1004 {
1005     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1006     qemu_cpu_kick_rr_next_cpu();
1007 }
1008
1009 static void start_tcg_kick_timer(void)
1010 {
1011     assert(!mttcg_enabled);
1012     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1013         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1014                                            kick_tcg_thread, NULL);
1015     }
1016     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1017         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1018     }
1019 }
1020
1021 static void stop_tcg_kick_timer(void)
1022 {
1023     assert(!mttcg_enabled);
1024     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1025         timer_del(tcg_kick_vcpu_timer);
1026     }
1027 }
1028
1029 /***********************************************************/
1030 void hw_error(const char *fmt, ...)
1031 {
1032     va_list ap;
1033     CPUState *cpu;
1034
1035     va_start(ap, fmt);
1036     fprintf(stderr, "qemu: hardware error: ");
1037     vfprintf(stderr, fmt, ap);
1038     fprintf(stderr, "\n");
1039     CPU_FOREACH(cpu) {
1040         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1041         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1042     }
1043     va_end(ap);
1044     abort();
1045 }
1046
1047 void cpu_synchronize_all_states(void)
1048 {
1049     CPUState *cpu;
1050
1051     CPU_FOREACH(cpu) {
1052         cpu_synchronize_state(cpu);
1053         /* TODO: move to cpu_synchronize_state() */
1054         if (hvf_enabled()) {
1055             hvf_cpu_synchronize_state(cpu);
1056         }
1057     }
1058 }
1059
1060 void cpu_synchronize_all_post_reset(void)
1061 {
1062     CPUState *cpu;
1063
1064     CPU_FOREACH(cpu) {
1065         cpu_synchronize_post_reset(cpu);
1066         /* TODO: move to cpu_synchronize_post_reset() */
1067         if (hvf_enabled()) {
1068             hvf_cpu_synchronize_post_reset(cpu);
1069         }
1070     }
1071 }
1072
1073 void cpu_synchronize_all_post_init(void)
1074 {
1075     CPUState *cpu;
1076
1077     CPU_FOREACH(cpu) {
1078         cpu_synchronize_post_init(cpu);
1079         /* TODO: move to cpu_synchronize_post_init() */
1080         if (hvf_enabled()) {
1081             hvf_cpu_synchronize_post_init(cpu);
1082         }
1083     }
1084 }
1085
1086 void cpu_synchronize_all_pre_loadvm(void)
1087 {
1088     CPUState *cpu;
1089
1090     CPU_FOREACH(cpu) {
1091         cpu_synchronize_pre_loadvm(cpu);
1092     }
1093 }
1094
1095 static int do_vm_stop(RunState state, bool send_stop)
1096 {
1097     int ret = 0;
1098
1099     if (runstate_is_running()) {
1100         cpu_disable_ticks();
1101         pause_all_vcpus();
1102         runstate_set(state);
1103         vm_state_notify(0, state);
1104         if (send_stop) {
1105             qapi_event_send_stop();
1106         }
1107     }
1108
1109     bdrv_drain_all();
1110     ret = bdrv_flush_all();
1111
1112     return ret;
1113 }
1114
1115 /* Special vm_stop() variant for terminating the process.  Historically clients
1116  * did not expect a QMP STOP event and so we need to retain compatibility.
1117  */
1118 int vm_shutdown(void)
1119 {
1120     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1121 }
1122
1123 static bool cpu_can_run(CPUState *cpu)
1124 {
1125     if (cpu->stop) {
1126         return false;
1127     }
1128     if (cpu_is_stopped(cpu)) {
1129         return false;
1130     }
1131     return true;
1132 }
1133
1134 static void cpu_handle_guest_debug(CPUState *cpu)
1135 {
1136     gdb_set_stop_cpu(cpu);
1137     qemu_system_debug_request();
1138     cpu->stopped = true;
1139 }
1140
1141 #ifdef CONFIG_LINUX
1142 static void sigbus_reraise(void)
1143 {
1144     sigset_t set;
1145     struct sigaction action;
1146
1147     memset(&action, 0, sizeof(action));
1148     action.sa_handler = SIG_DFL;
1149     if (!sigaction(SIGBUS, &action, NULL)) {
1150         raise(SIGBUS);
1151         sigemptyset(&set);
1152         sigaddset(&set, SIGBUS);
1153         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1154     }
1155     perror("Failed to re-raise SIGBUS!\n");
1156     abort();
1157 }
1158
1159 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1160 {
1161     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1162         sigbus_reraise();
1163     }
1164
1165     if (current_cpu) {
1166         /* Called asynchronously in VCPU thread.  */
1167         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1168             sigbus_reraise();
1169         }
1170     } else {
1171         /* Called synchronously (via signalfd) in main thread.  */
1172         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1173             sigbus_reraise();
1174         }
1175     }
1176 }
1177
1178 static void qemu_init_sigbus(void)
1179 {
1180     struct sigaction action;
1181
1182     memset(&action, 0, sizeof(action));
1183     action.sa_flags = SA_SIGINFO;
1184     action.sa_sigaction = sigbus_handler;
1185     sigaction(SIGBUS, &action, NULL);
1186
1187     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1188 }
1189 #else /* !CONFIG_LINUX */
1190 static void qemu_init_sigbus(void)
1191 {
1192 }
1193 #endif /* !CONFIG_LINUX */
1194
1195 static QemuThread io_thread;
1196
1197 /* cpu creation */
1198 static QemuCond qemu_cpu_cond;
1199 /* system init */
1200 static QemuCond qemu_pause_cond;
1201
1202 void qemu_init_cpu_loop(void)
1203 {
1204     qemu_init_sigbus();
1205     qemu_cond_init(&qemu_cpu_cond);
1206     qemu_cond_init(&qemu_pause_cond);
1207     qemu_mutex_init(&qemu_global_mutex);
1208
1209     qemu_thread_get_self(&io_thread);
1210 }
1211
1212 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1213 {
1214     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1215 }
1216
1217 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1218 {
1219     if (kvm_destroy_vcpu(cpu) < 0) {
1220         error_report("kvm_destroy_vcpu failed");
1221         exit(EXIT_FAILURE);
1222     }
1223 }
1224
1225 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1226 {
1227 }
1228
1229 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1230 {
1231     g_assert(qemu_cpu_is_self(cpu));
1232     cpu->stop = false;
1233     cpu->stopped = true;
1234     if (exit) {
1235         cpu_exit(cpu);
1236     }
1237     qemu_cond_broadcast(&qemu_pause_cond);
1238 }
1239
1240 static void qemu_wait_io_event_common(CPUState *cpu)
1241 {
1242     atomic_mb_set(&cpu->thread_kicked, false);
1243     if (cpu->stop) {
1244         qemu_cpu_stop(cpu, false);
1245     }
1246     process_queued_cpu_work(cpu);
1247 }
1248
1249 static void qemu_tcg_rr_wait_io_event(void)
1250 {
1251     CPUState *cpu;
1252
1253     while (all_cpu_threads_idle()) {
1254         stop_tcg_kick_timer();
1255         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1256     }
1257
1258     start_tcg_kick_timer();
1259
1260     CPU_FOREACH(cpu) {
1261         qemu_wait_io_event_common(cpu);
1262     }
1263 }
1264
1265 static void qemu_wait_io_event(CPUState *cpu)
1266 {
1267     while (cpu_thread_is_idle(cpu)) {
1268         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1269     }
1270
1271 #ifdef _WIN32
1272     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1273     if (!tcg_enabled()) {
1274         SleepEx(0, TRUE);
1275     }
1276 #endif
1277     qemu_wait_io_event_common(cpu);
1278 }
1279
1280 static void *qemu_kvm_cpu_thread_fn(void *arg)
1281 {
1282     CPUState *cpu = arg;
1283     int r;
1284
1285     rcu_register_thread();
1286
1287     qemu_mutex_lock_iothread();
1288     qemu_thread_get_self(cpu->thread);
1289     cpu->thread_id = qemu_get_thread_id();
1290     cpu->can_do_io = 1;
1291     current_cpu = cpu;
1292
1293     r = kvm_init_vcpu(cpu);
1294     if (r < 0) {
1295         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1296         exit(1);
1297     }
1298
1299     kvm_init_cpu_signals(cpu);
1300
1301     /* signal CPU creation */
1302     cpu->created = true;
1303     qemu_cond_signal(&qemu_cpu_cond);
1304     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1305
1306     do {
1307         if (cpu_can_run(cpu)) {
1308             r = kvm_cpu_exec(cpu);
1309             if (r == EXCP_DEBUG) {
1310                 cpu_handle_guest_debug(cpu);
1311             }
1312         }
1313         qemu_wait_io_event(cpu);
1314     } while (!cpu->unplug || cpu_can_run(cpu));
1315
1316     qemu_kvm_destroy_vcpu(cpu);
1317     cpu->created = false;
1318     qemu_cond_signal(&qemu_cpu_cond);
1319     qemu_mutex_unlock_iothread();
1320     rcu_unregister_thread();
1321     return NULL;
1322 }
1323
1324 static void *qemu_dummy_cpu_thread_fn(void *arg)
1325 {
1326 #ifdef _WIN32
1327     error_report("qtest is not supported under Windows");
1328     exit(1);
1329 #else
1330     CPUState *cpu = arg;
1331     sigset_t waitset;
1332     int r;
1333
1334     rcu_register_thread();
1335
1336     qemu_mutex_lock_iothread();
1337     qemu_thread_get_self(cpu->thread);
1338     cpu->thread_id = qemu_get_thread_id();
1339     cpu->can_do_io = 1;
1340     current_cpu = cpu;
1341
1342     sigemptyset(&waitset);
1343     sigaddset(&waitset, SIG_IPI);
1344
1345     /* signal CPU creation */
1346     cpu->created = true;
1347     qemu_cond_signal(&qemu_cpu_cond);
1348     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1349
1350     do {
1351         qemu_mutex_unlock_iothread();
1352         do {
1353             int sig;
1354             r = sigwait(&waitset, &sig);
1355         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1356         if (r == -1) {
1357             perror("sigwait");
1358             exit(1);
1359         }
1360         qemu_mutex_lock_iothread();
1361         qemu_wait_io_event(cpu);
1362     } while (!cpu->unplug);
1363
1364     qemu_mutex_unlock_iothread();
1365     rcu_unregister_thread();
1366     return NULL;
1367 #endif
1368 }
1369
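/* Return the instruction budget for the next TCG execution slice: the
 * nearest QEMU_CLOCK_VIRTUAL deadline converted to instructions (capped at
 * INT32_MAX ns), or the number of replayed instructions in replay mode.
 */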
1370 static int64_t tcg_get_icount_limit(void)
1371 {
1372     int64_t deadline;
1373
1374     if (replay_mode != REPLAY_MODE_PLAY) {
1375         /*
1376          * Include all the timers, because they may need attention.
1377          * Overly long CPU execution may create unnecessary delays in the UI.
1378          */
1379         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1380                                               QEMU_TIMER_ATTR_ALL);
1381
1382         /* Maintain prior (possibly buggy) behaviour where if no deadline
1383          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1384          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1385          * nanoseconds.
1386          */
1387         if ((deadline < 0) || (deadline > INT32_MAX)) {
1388             deadline = INT32_MAX;
1389         }
1390
1391         return qemu_icount_round(deadline);
1392     } else {
1393         return replay_get_instructions();
1394     }
1395 }
1396
1397 static void handle_icount_deadline(void)
1398 {
1399     assert(qemu_in_vcpu_thread());
1400     if (use_icount) {
1401         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1402                                                       QEMU_TIMER_ATTR_ALL);
1403
1404         if (deadline == 0) {
1405             /* Wake up other AioContexts.  */
1406             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1407             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1408         }
1409     }
1410 }
1411
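/* The budget computed by tcg_get_icount_limit() is split between the 16-bit
 * icount_decr.u16.low counter that translated code decrements and
 * icount_extra: e.g. a budget of 100000 instructions becomes
 * u16.low = 0xffff (65535) and icount_extra = 34465.
 */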
1412 static void prepare_icount_for_run(CPUState *cpu)
1413 {
1414     if (use_icount) {
1415         int insns_left;
1416
1417         /* These should always be cleared by process_icount_data after
1418          * each vCPU execution. However, u16.high can be raised
1419          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1420          */
1421         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1422         g_assert(cpu->icount_extra == 0);
1423
1424         cpu->icount_budget = tcg_get_icount_limit();
1425         insns_left = MIN(0xffff, cpu->icount_budget);
1426         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1427         cpu->icount_extra = cpu->icount_budget - insns_left;
1428
1429         replay_mutex_lock();
1430     }
1431 }
1432
1433 static void process_icount_data(CPUState *cpu)
1434 {
1435     if (use_icount) {
1436         /* Account for executed instructions */
1437         cpu_update_icount(cpu);
1438
1439         /* Reset the counters */
1440         cpu_neg(cpu)->icount_decr.u16.low = 0;
1441         cpu->icount_extra = 0;
1442         cpu->icount_budget = 0;
1443
1444         replay_account_executed_instructions();
1445
1446         replay_mutex_unlock();
1447     }
1448 }
1449
1450
1451 static int tcg_cpu_exec(CPUState *cpu)
1452 {
1453     int ret;
1454 #ifdef CONFIG_PROFILER
1455     int64_t ti;
1456 #endif
1457
1458     assert(tcg_enabled());
1459 #ifdef CONFIG_PROFILER
1460     ti = profile_getclock();
1461 #endif
1462     cpu_exec_start(cpu);
1463     ret = cpu_exec(cpu);
1464     cpu_exec_end(cpu);
1465 #ifdef CONFIG_PROFILER
1466     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1467                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1468 #endif
1469     return ret;
1470 }
1471
1472 /* Destroy any remaining vCPUs which have been unplugged and have
1473  * finished running
1474  */
1475 static void deal_with_unplugged_cpus(void)
1476 {
1477     CPUState *cpu;
1478
1479     CPU_FOREACH(cpu) {
1480         if (cpu->unplug && !cpu_can_run(cpu)) {
1481             qemu_tcg_destroy_vcpu(cpu);
1482             cpu->created = false;
1483             qemu_cond_signal(&qemu_cpu_cond);
1484             break;
1485         }
1486     }
1487 }
1488
1489 /* Single-threaded TCG
1490  *
1491  * In the single-threaded case each vCPU is simulated in turn. If
1492  * there is more than a single vCPU we create a simple timer to kick
1493  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1494  * This is done explicitly rather than relying on side-effects
1495  * elsewhere.
1496  */
1497
1498 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1499 {
1500     CPUState *cpu = arg;
1501
1502     assert(tcg_enabled());
1503     rcu_register_thread();
1504     tcg_register_thread();
1505
1506     qemu_mutex_lock_iothread();
1507     qemu_thread_get_self(cpu->thread);
1508
1509     cpu->thread_id = qemu_get_thread_id();
1510     cpu->created = true;
1511     cpu->can_do_io = 1;
1512     qemu_cond_signal(&qemu_cpu_cond);
1513     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1514
1515     /* wait for initial kick-off after machine start */
1516     while (first_cpu->stopped) {
1517         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1518
1519         /* process any pending work */
1520         CPU_FOREACH(cpu) {
1521             current_cpu = cpu;
1522             qemu_wait_io_event_common(cpu);
1523         }
1524     }
1525
1526     start_tcg_kick_timer();
1527
1528     cpu = first_cpu;
1529
1530     /* process any pending work */
1531     cpu->exit_request = 1;
1532
1533     while (1) {
1534         qemu_mutex_unlock_iothread();
1535         replay_mutex_lock();
1536         qemu_mutex_lock_iothread();
1537         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1538         qemu_account_warp_timer();
1539
1540         /* Run the timers here.  This is much more efficient than
1541          * waking up the I/O thread and waiting for completion.
1542          */
1543         handle_icount_deadline();
1544
1545         replay_mutex_unlock();
1546
1547         if (!cpu) {
1548             cpu = first_cpu;
1549         }
1550
1551         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1552
1553             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1554             current_cpu = cpu;
1555
1556             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1557                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1558
1559             if (cpu_can_run(cpu)) {
1560                 int r;
1561
1562                 qemu_mutex_unlock_iothread();
1563                 prepare_icount_for_run(cpu);
1564
1565                 r = tcg_cpu_exec(cpu);
1566
1567                 process_icount_data(cpu);
1568                 qemu_mutex_lock_iothread();
1569
1570                 if (r == EXCP_DEBUG) {
1571                     cpu_handle_guest_debug(cpu);
1572                     break;
1573                 } else if (r == EXCP_ATOMIC) {
1574                     qemu_mutex_unlock_iothread();
1575                     cpu_exec_step_atomic(cpu);
1576                     qemu_mutex_lock_iothread();
1577                     break;
1578                 }
1579             } else if (cpu->stop) {
1580                 if (cpu->unplug) {
1581                     cpu = CPU_NEXT(cpu);
1582                 }
1583                 break;
1584             }
1585
1586             cpu = CPU_NEXT(cpu);
1587         } /* while (cpu && !cpu->exit_request).. */
1588
1589         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1590         atomic_set(&tcg_current_rr_cpu, NULL);
1591
1592         if (cpu && cpu->exit_request) {
1593             atomic_mb_set(&cpu->exit_request, 0);
1594         }
1595
1596         if (use_icount && all_cpu_threads_idle()) {
1597             /*
1598              * When all CPUs are sleeping (e.g. in WFI), wake up the
1599              * main_loop in order to start the warp timer and avoid a deadlock.
1600              */
1601             qemu_notify_event();
1602         }
1603
1604         qemu_tcg_rr_wait_io_event();
1605         deal_with_unplugged_cpus();
1606     }
1607
1608     rcu_unregister_thread();
1609     return NULL;
1610 }
1611
1612 static void *qemu_hax_cpu_thread_fn(void *arg)
1613 {
1614     CPUState *cpu = arg;
1615     int r;
1616
1617     rcu_register_thread();
1618     qemu_mutex_lock_iothread();
1619     qemu_thread_get_self(cpu->thread);
1620
1621     cpu->thread_id = qemu_get_thread_id();
1622     cpu->created = true;
1623     current_cpu = cpu;
1624
1625     hax_init_vcpu(cpu);
1626     qemu_cond_signal(&qemu_cpu_cond);
1627     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1628
1629     do {
1630         if (cpu_can_run(cpu)) {
1631             r = hax_smp_cpu_exec(cpu);
1632             if (r == EXCP_DEBUG) {
1633                 cpu_handle_guest_debug(cpu);
1634             }
1635         }
1636
1637         qemu_wait_io_event(cpu);
1638     } while (!cpu->unplug || cpu_can_run(cpu));
1639     rcu_unregister_thread();
1640     return NULL;
1641 }
1642
1643 /* The HVF-specific vCPU thread function. This one should only run when the host
1644  * CPU supports the VMX "unrestricted guest" feature. */
1645 static void *qemu_hvf_cpu_thread_fn(void *arg)
1646 {
1647     CPUState *cpu = arg;
1648
1649     int r;
1650
1651     assert(hvf_enabled());
1652
1653     rcu_register_thread();
1654
1655     qemu_mutex_lock_iothread();
1656     qemu_thread_get_self(cpu->thread);
1657
1658     cpu->thread_id = qemu_get_thread_id();
1659     cpu->can_do_io = 1;
1660     current_cpu = cpu;
1661
1662     hvf_init_vcpu(cpu);
1663
1664     /* signal CPU creation */
1665     cpu->created = true;
1666     qemu_cond_signal(&qemu_cpu_cond);
1667     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1668
1669     do {
1670         if (cpu_can_run(cpu)) {
1671             r = hvf_vcpu_exec(cpu);
1672             if (r == EXCP_DEBUG) {
1673                 cpu_handle_guest_debug(cpu);
1674             }
1675         }
1676         qemu_wait_io_event(cpu);
1677     } while (!cpu->unplug || cpu_can_run(cpu));
1678
1679     hvf_vcpu_destroy(cpu);
1680     cpu->created = false;
1681     qemu_cond_signal(&qemu_cpu_cond);
1682     qemu_mutex_unlock_iothread();
1683     rcu_unregister_thread();
1684     return NULL;
1685 }
1686
1687 static void *qemu_whpx_cpu_thread_fn(void *arg)
1688 {
1689     CPUState *cpu = arg;
1690     int r;
1691
1692     rcu_register_thread();
1693
1694     qemu_mutex_lock_iothread();
1695     qemu_thread_get_self(cpu->thread);
1696     cpu->thread_id = qemu_get_thread_id();
1697     current_cpu = cpu;
1698
1699     r = whpx_init_vcpu(cpu);
1700     if (r < 0) {
1701         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1702         exit(1);
1703     }
1704
1705     /* signal CPU creation */
1706     cpu->created = true;
1707     qemu_cond_signal(&qemu_cpu_cond);
1708     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1709
1710     do {
1711         if (cpu_can_run(cpu)) {
1712             r = whpx_vcpu_exec(cpu);
1713             if (r == EXCP_DEBUG) {
1714                 cpu_handle_guest_debug(cpu);
1715             }
1716         }
1717         while (cpu_thread_is_idle(cpu)) {
1718             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1719         }
1720         qemu_wait_io_event_common(cpu);
1721     } while (!cpu->unplug || cpu_can_run(cpu));
1722
1723     whpx_destroy_vcpu(cpu);
1724     cpu->created = false;
1725     qemu_cond_signal(&qemu_cpu_cond);
1726     qemu_mutex_unlock_iothread();
1727     rcu_unregister_thread();
1728     return NULL;
1729 }
1730
1731 #ifdef _WIN32
1732 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1733 {
1734 }
1735 #endif
1736
1737 /* Multi-threaded TCG
1738  *
1739  * In the multi-threaded case each vCPU has its own thread. The TLS
1740  * variable current_cpu can be used deep in the code to find the
1741  * current CPUState for a given thread.
1742  */
1743
1744 static void *qemu_tcg_cpu_thread_fn(void *arg)
1745 {
1746     CPUState *cpu = arg;
1747
1748     assert(tcg_enabled());
1749     g_assert(!use_icount);
1750
1751     rcu_register_thread();
1752     tcg_register_thread();
1753
1754     qemu_mutex_lock_iothread();
1755     qemu_thread_get_self(cpu->thread);
1756
1757     cpu->thread_id = qemu_get_thread_id();
1758     cpu->created = true;
1759     cpu->can_do_io = 1;
1760     current_cpu = cpu;
1761     qemu_cond_signal(&qemu_cpu_cond);
1762     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1763
1764     /* process any pending work */
1765     cpu->exit_request = 1;
1766
1767     do {
1768         if (cpu_can_run(cpu)) {
1769             int r;
1770             qemu_mutex_unlock_iothread();
1771             r = tcg_cpu_exec(cpu);
1772             qemu_mutex_lock_iothread();
1773             switch (r) {
1774             case EXCP_DEBUG:
1775                 cpu_handle_guest_debug(cpu);
1776                 break;
1777             case EXCP_HALTED:
1778                 /* During start-up the vCPU is reset and the thread is
1779                  * kicked several times. If we don't ensure we go back
1780                  * to sleep in the halted state, we won't start up
1781                  * cleanly when the vCPU is enabled.
1782                  *
1783                  * cpu->halted should ensure we sleep in wait_io_event.
1784                  */
1785                 g_assert(cpu->halted);
1786                 break;
1787             case EXCP_ATOMIC:
1788                 qemu_mutex_unlock_iothread();
1789                 cpu_exec_step_atomic(cpu);
1790                 qemu_mutex_lock_iothread();
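                     /* fall through */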
1791             default:
1792                 /* Ignore everything else? */
1793                 break;
1794             }
1795         }
1796
1797         atomic_mb_set(&cpu->exit_request, 0);
1798         qemu_wait_io_event(cpu);
1799     } while (!cpu->unplug || cpu_can_run(cpu));
1800
1801     qemu_tcg_destroy_vcpu(cpu);
1802     cpu->created = false;
1803     qemu_cond_signal(&qemu_cpu_cond);
1804     qemu_mutex_unlock_iothread();
1805     rcu_unregister_thread();
1806     return NULL;
1807 }
1808
1809 static void qemu_cpu_kick_thread(CPUState *cpu)
1810 {
1811 #ifndef _WIN32
1812     int err;
1813
1814     if (cpu->thread_kicked) {
1815         return;
1816     }
1817     cpu->thread_kicked = true;
1818     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1819     if (err && err != ESRCH) {
1820         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1821         exit(1);
1822     }
1823 #else /* _WIN32 */
1824     if (!qemu_cpu_is_self(cpu)) {
1825         if (whpx_enabled()) {
1826             whpx_vcpu_kick(cpu);
1827         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1828             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1829                     __func__, GetLastError());
1830             exit(1);
1831         }
1832     }
1833 #endif
1834 }
1835
1836 void qemu_cpu_kick(CPUState *cpu)
1837 {
1838     qemu_cond_broadcast(cpu->halt_cond);
1839     if (tcg_enabled()) {
1840         if (qemu_tcg_mttcg_enabled()) {
1841             cpu_exit(cpu);
1842         } else {
1843             qemu_cpu_kick_rr_cpus();
1844         }
1845     } else {
1846         if (hax_enabled()) {
1847             /*
1848              * FIXME: race condition with the exit_request check in
1849              * hax_vcpu_hax_exec
1850              */
1851             cpu->exit_request = 1;
1852         }
1853         qemu_cpu_kick_thread(cpu);
1854     }
1855 }
1856
1857 void qemu_cpu_kick_self(void)
1858 {
1859     assert(current_cpu);
1860     qemu_cpu_kick_thread(current_cpu);
1861 }
1862
1863 bool qemu_cpu_is_self(CPUState *cpu)
1864 {
1865     return qemu_thread_is_self(cpu->thread);
1866 }
1867
1868 bool qemu_in_vcpu_thread(void)
1869 {
1870     return current_cpu && qemu_cpu_is_self(current_cpu);
1871 }
1872
1873 static __thread bool iothread_locked = false;
1874
1875 bool qemu_mutex_iothread_locked(void)
1876 {
1877     return iothread_locked;
1878 }
1879
1880 /*
1881  * The BQL is taken from so many places that it is worth profiling the
1882  * callers directly, instead of funneling them all through a single function.
1883  */
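     /*
      * Callers normally use the qemu_mutex_lock_iothread() macro, which (in
      * the main-loop headers) expands to roughly
      *
      *     qemu_mutex_lock_iothread_impl(__FILE__, __LINE__)
      *
      * so that lock tracing/profiling can attribute contention to the real
      * call site rather than to this wrapper.
      */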
1884 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1885 {
1886     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1887
1888     g_assert(!qemu_mutex_iothread_locked());
1889     bql_lock(&qemu_global_mutex, file, line);
1890     iothread_locked = true;
1891 }
1892
1893 void qemu_mutex_unlock_iothread(void)
1894 {
1895     g_assert(qemu_mutex_iothread_locked());
1896     iothread_locked = false;
1897     qemu_mutex_unlock(&qemu_global_mutex);
1898 }
1899
1900 static bool all_vcpus_paused(void)
1901 {
1902     CPUState *cpu;
1903
1904     CPU_FOREACH(cpu) {
1905         if (!cpu->stopped) {
1906             return false;
1907         }
1908     }
1909
1910     return true;
1911 }
1912
1913 void pause_all_vcpus(void)
1914 {
1915     CPUState *cpu;
1916
1917     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1918     CPU_FOREACH(cpu) {
1919         if (qemu_cpu_is_self(cpu)) {
1920             qemu_cpu_stop(cpu, true);
1921         } else {
1922             cpu->stop = true;
1923             qemu_cpu_kick(cpu);
1924         }
1925     }
1926
1927     /* We need to drop the replay_lock so any vCPU threads woken up
1928      * can finish their replay tasks.
1929      */
1930     replay_mutex_unlock();
1931
1932     while (!all_vcpus_paused()) {
1933         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1934         CPU_FOREACH(cpu) {
1935             qemu_cpu_kick(cpu);
1936         }
1937     }
1938
1939     qemu_mutex_unlock_iothread();
1940     replay_mutex_lock();
1941     qemu_mutex_lock_iothread();
1942 }
1943
1944 void cpu_resume(CPUState *cpu)
1945 {
1946     cpu->stop = false;
1947     cpu->stopped = false;
1948     qemu_cpu_kick(cpu);
1949 }
1950
1951 void resume_all_vcpus(void)
1952 {
1953     CPUState *cpu;
1954
1955     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1956     CPU_FOREACH(cpu) {
1957         cpu_resume(cpu);
1958     }
1959 }
1960
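     /*
      * Note: the BQL is dropped around qemu_thread_join() below because the
      * exiting vCPU thread still needs to take it on its teardown path (see
      * the end of the thread functions above); joining while holding the
      * lock would deadlock.
      */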
1961 void cpu_remove_sync(CPUState *cpu)
1962 {
1963     cpu->stop = true;
1964     cpu->unplug = true;
1965     qemu_cpu_kick(cpu);
1966     qemu_mutex_unlock_iothread();
1967     qemu_thread_join(cpu->thread);
1968     qemu_mutex_lock_iothread();
1969 }
1970
1971 /* Size of temporary buffers used for forming a thread name */
1972 #define VCPU_THREAD_NAME_SIZE 16
1973
1974 static void qemu_tcg_init_vcpu(CPUState *cpu)
1975 {
1976     char thread_name[VCPU_THREAD_NAME_SIZE];
1977     static QemuCond *single_tcg_halt_cond;
1978     static QemuThread *single_tcg_cpu_thread;
1979     static int tcg_region_inited;
1980
1981     assert(tcg_enabled());
1982     /*
1983      * Initialize TCG regions--once. Now is a good time, because:
1984      * (1) TCG's init context, prologue and target globals have been set up.
1985      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1986      *     -accel flag is processed, so the check doesn't work then).
1987      */
1988     if (!tcg_region_inited) {
1989         tcg_region_inited = 1;
1990         tcg_region_init();
1991     }
1992
1993     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1994         cpu->thread = g_malloc0(sizeof(QemuThread));
1995         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1996         qemu_cond_init(cpu->halt_cond);
1997
1998         if (qemu_tcg_mttcg_enabled()) {
1999             /* create a thread per vCPU with TCG (MTTCG) */
2000             parallel_cpus = true;
2001             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
2002                      cpu->cpu_index);
2003
2004             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
2005                                cpu, QEMU_THREAD_JOINABLE);
2006
2007         } else {
2008             /* share a single thread for all cpus with TCG */
2009             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
2010             qemu_thread_create(cpu->thread, thread_name,
2011                                qemu_tcg_rr_cpu_thread_fn,
2012                                cpu, QEMU_THREAD_JOINABLE);
2013
2014             single_tcg_halt_cond = cpu->halt_cond;
2015             single_tcg_cpu_thread = cpu->thread;
2016         }
2017 #ifdef _WIN32
2018         cpu->hThread = qemu_thread_get_handle(cpu->thread);
2019 #endif
2020     } else {
2021         /* For non-MTTCG cases we share the thread */
2022         cpu->thread = single_tcg_cpu_thread;
2023         cpu->halt_cond = single_tcg_halt_cond;
2024         cpu->thread_id = first_cpu->thread_id;
2025         cpu->can_do_io = 1;
2026         cpu->created = true;
2027     }
2028 }
2029
2030 static void qemu_hax_start_vcpu(CPUState *cpu)
2031 {
2032     char thread_name[VCPU_THREAD_NAME_SIZE];
2033
2034     cpu->thread = g_malloc0(sizeof(QemuThread));
2035     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2036     qemu_cond_init(cpu->halt_cond);
2037
2038     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2039              cpu->cpu_index);
2040     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2041                        cpu, QEMU_THREAD_JOINABLE);
2042 #ifdef _WIN32
2043     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2044 #endif
2045 }
2046
2047 static void qemu_kvm_start_vcpu(CPUState *cpu)
2048 {
2049     char thread_name[VCPU_THREAD_NAME_SIZE];
2050
2051     cpu->thread = g_malloc0(sizeof(QemuThread));
2052     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2053     qemu_cond_init(cpu->halt_cond);
2054     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2055              cpu->cpu_index);
2056     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2057                        cpu, QEMU_THREAD_JOINABLE);
2058 }
2059
2060 static void qemu_hvf_start_vcpu(CPUState *cpu)
2061 {
2062     char thread_name[VCPU_THREAD_NAME_SIZE];
2063
2064     /* HVF currently does not support TCG, and only runs in
2065      * unrestricted-guest mode. */
2066     assert(hvf_enabled());
2067
2068     cpu->thread = g_malloc0(sizeof(QemuThread));
2069     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2070     qemu_cond_init(cpu->halt_cond);
2071
2072     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2073              cpu->cpu_index);
2074     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2075                        cpu, QEMU_THREAD_JOINABLE);
2076 }
2077
2078 static void qemu_whpx_start_vcpu(CPUState *cpu)
2079 {
2080     char thread_name[VCPU_THREAD_NAME_SIZE];
2081
2082     cpu->thread = g_malloc0(sizeof(QemuThread));
2083     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2084     qemu_cond_init(cpu->halt_cond);
2085     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2086              cpu->cpu_index);
2087     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2088                        cpu, QEMU_THREAD_JOINABLE);
2089 #ifdef _WIN32
2090     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2091 #endif
2092 }
2093
2094 static void qemu_dummy_start_vcpu(CPUState *cpu)
2095 {
2096     char thread_name[VCPU_THREAD_NAME_SIZE];
2097
2098     cpu->thread = g_malloc0(sizeof(QemuThread));
2099     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2100     qemu_cond_init(cpu->halt_cond);
2101     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2102              cpu->cpu_index);
2103     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2104                        QEMU_THREAD_JOINABLE);
2105 }
2106
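     /*
      * Each of the accelerator-specific start functions above arranges for
      * cpu->created to be set (and qemu_cpu_cond signalled) once the vCPU is
      * ready; qemu_init_vcpu() below waits on that condition, under the BQL,
      * before returning to its caller.
      */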
2107 void qemu_init_vcpu(CPUState *cpu)
2108 {
2109     MachineState *ms = MACHINE(qdev_get_machine());
2110
2111     cpu->nr_cores = ms->smp.cores;
2112     cpu->nr_threads = ms->smp.threads;
2113     cpu->stopped = true;
2114     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2115
2116     if (!cpu->as) {
2117         /* If the target cpu hasn't set up any address spaces itself,
2118          * give it the default one.
2119          */
2120         cpu->num_ases = 1;
2121         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2122     }
2123
2124     if (kvm_enabled()) {
2125         qemu_kvm_start_vcpu(cpu);
2126     } else if (hax_enabled()) {
2127         qemu_hax_start_vcpu(cpu);
2128     } else if (hvf_enabled()) {
2129         qemu_hvf_start_vcpu(cpu);
2130     } else if (tcg_enabled()) {
2131         qemu_tcg_init_vcpu(cpu);
2132     } else if (whpx_enabled()) {
2133         qemu_whpx_start_vcpu(cpu);
2134     } else {
2135         qemu_dummy_start_vcpu(cpu);
2136     }
2137
2138     while (!cpu->created) {
2139         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2140     }
2141 }
2142
2143 void cpu_stop_current(void)
2144 {
2145     if (current_cpu) {
2146         current_cpu->stop = true;
2147         cpu_exit(current_cpu);
2148     }
2149 }
2150
2151 int vm_stop(RunState state)
2152 {
2153     if (qemu_in_vcpu_thread()) {
2154         qemu_system_vmstop_request_prepare();
2155         qemu_system_vmstop_request(state);
2156         /*
2157          * FIXME: should not return to device code in case
2158          * vm_stop() has been requested.
2159          */
2160         cpu_stop_current();
2161         return 0;
2162     }
2163
2164     return do_vm_stop(state, true);
2165 }
2166
2167 /**
2168  * Prepare for (re)starting the VM.
2169  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2170  * running or in case of an error condition), 0 otherwise.
2171  */
2172 int vm_prepare_start(void)
2173 {
2174     RunState requested;
2175
2176     qemu_vmstop_requested(&requested);
2177     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2178         return -1;
2179     }
2180
2181     /* Ensure that a STOP/RESUME pair of events is emitted if a
2182      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2183      * example, is documented as always being followed by the STOP
2184      * event.
2185      */
2186     if (runstate_is_running()) {
2187         qapi_event_send_stop();
2188         qapi_event_send_resume();
2189         return -1;
2190     }
2191
2192     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2193     qapi_event_send_resume();
2194
2195     cpu_enable_ticks();
2196     runstate_set(RUN_STATE_RUNNING);
2197     vm_state_notify(1, RUN_STATE_RUNNING);
2198     return 0;
2199 }
2200
2201 void vm_start(void)
2202 {
2203     if (!vm_prepare_start()) {
2204         resume_all_vcpus();
2205     }
2206 }
2207
2208 /* Does a state transition even if the VM is already stopped;
2209    the current state is forgotten forever. */
2210 int vm_stop_force_state(RunState state)
2211 {
2212     if (runstate_is_running()) {
2213         return vm_stop(state);
2214     } else {
2215         runstate_set(state);
2216
2217         bdrv_drain_all();
2218         /* Make sure to return an error if the flush in a previous vm_stop()
2219          * failed. */
2220         return bdrv_flush_all();
2221     }
2222 }
2223
2224 void list_cpus(const char *optarg)
2225 {
2226     /* XXX: implement xxx_cpu_list for targets that still miss it */
2227 #if defined(cpu_list)
2228     cpu_list();
2229 #endif
2230 }
2231
2232 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2233                  bool has_cpu, int64_t cpu_index, Error **errp)
2234 {
2235     FILE *f;
2236     uint32_t l;
2237     CPUState *cpu;
2238     uint8_t buf[1024];
2239     int64_t orig_addr = addr, orig_size = size;
2240
2241     if (!has_cpu) {
2242         cpu_index = 0;
2243     }
2244
2245     cpu = qemu_get_cpu(cpu_index);
2246     if (cpu == NULL) {
2247         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2248                    "a CPU number");
2249         return;
2250     }
2251
2252     f = fopen(filename, "wb");
2253     if (!f) {
2254         error_setg_file_open(errp, errno, filename);
2255         return;
2256     }
2257
2258     while (size != 0) {
2259         l = sizeof(buf);
2260         if (l > size)
2261             l = size;
2262         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2263             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2264                              " specified", orig_addr, orig_size);
2265             goto exit;
2266         }
2267         if (fwrite(buf, 1, l, f) != l) {
2268             error_setg(errp, QERR_IO_ERROR);
2269             goto exit;
2270         }
2271         addr += l;
2272         size -= l;
2273     }
2274
2275 exit:
2276     fclose(f);
2277 }
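
     /*
      * Illustrative QMP usage of the command implemented above (the
      * authoritative contract is the QAPI schema, not this file):
      *
      *     -> { "execute": "memsave",
      *          "arguments": { "val": 10, "size": 100,
      *                         "filename": "/tmp/virtual-mem-dump" } }
      *     <- { "return": {} }
      */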
2278
2279 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2280                   Error **errp)
2281 {
2282     FILE *f;
2283     uint32_t l;
2284     uint8_t buf[1024];
2285
2286     f = fopen(filename, "wb");
2287     if (!f) {
2288         error_setg_file_open(errp, errno, filename);
2289         return;
2290     }
2291
2292     while (size != 0) {
2293         l = sizeof(buf);
2294         if (l > size)
2295             l = size;
2296         cpu_physical_memory_read(addr, buf, l);
2297         if (fwrite(buf, 1, l, f) != l) {
2298             error_setg(errp, QERR_IO_ERROR);
2299             goto exit;
2300         }
2301         addr += l;
2302         size -= l;
2303     }
2304
2305 exit:
2306     fclose(f);
2307 }
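
     /*
      * Illustrative QMP usage (physical-address variant of the above):
      *
      *     -> { "execute": "pmemsave",
      *          "arguments": { "val": 10, "size": 100,
      *                         "filename": "/tmp/physical-mem-dump" } }
      *     <- { "return": {} }
      */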
2308
2309 void qmp_inject_nmi(Error **errp)
2310 {
2311     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2312 }
2313
2314 void dump_drift_info(void)
2315 {
2316     if (!use_icount) {
2317         return;
2318     }
2319
2320     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2321                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2322     if (icount_align_option) {
2323         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2324                     -max_delay / SCALE_MS);
2325         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2326                     max_advance / SCALE_MS);
2327     } else {
2328         qemu_printf("Max guest delay     NA\n");
2329         qemu_printf("Max guest advance   NA\n");
2330     }
2331 }