[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
55 #include "tcg.h"
56 #include "hw/nmi.h"
57 #include "sysemu/replay.h"
58 #include "sysemu/runstate.h"
59 #include "hw/boards.h"
60 #include "hw/hw.h"
61
62 #ifdef CONFIG_LINUX
63
64 #include <sys/prctl.h>
65
66 #ifndef PR_MCE_KILL
67 #define PR_MCE_KILL 33
68 #endif
69
70 #ifndef PR_MCE_KILL_SET
71 #define PR_MCE_KILL_SET 1
72 #endif
73
74 #ifndef PR_MCE_KILL_EARLY
75 #define PR_MCE_KILL_EARLY 1
76 #endif
77
78 #endif /* CONFIG_LINUX */
79
80 static QemuMutex qemu_global_mutex;
81
82 int64_t max_delay;
83 int64_t max_advance;
84
85 /* vcpu throttling controls */
86 static QEMUTimer *throttle_timer;
87 static unsigned int throttle_percentage;
88
89 #define CPU_THROTTLE_PCT_MIN 1
90 #define CPU_THROTTLE_PCT_MAX 99
91 #define CPU_THROTTLE_TIMESLICE_NS 10000000
92
93 bool cpu_is_stopped(CPUState *cpu)
94 {
95     return cpu->stopped || !runstate_is_running();
96 }
97
98 static bool cpu_thread_is_idle(CPUState *cpu)
99 {
100     if (cpu->stop || cpu->queued_work_first) {
101         return false;
102     }
103     if (cpu_is_stopped(cpu)) {
104         return true;
105     }
106     if (!cpu->halted || cpu_has_work(cpu) ||
107         kvm_halt_in_kernel()) {
108         return false;
109     }
110     return true;
111 }
112
113 static bool all_cpu_threads_idle(void)
114 {
115     CPUState *cpu;
116
117     CPU_FOREACH(cpu) {
118         if (!cpu_thread_is_idle(cpu)) {
119             return false;
120         }
121     }
122     return true;
123 }
124
125 /***********************************************************/
126 /* guest cycle counter */
127
128 /* Protected by TimersState seqlock */
129
130 static bool icount_sleep = true;
131 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
132 #define MAX_ICOUNT_SHIFT 10
133
134 typedef struct TimersState {
135     /* Protected by BQL.  */
136     int64_t cpu_ticks_prev;
137     int64_t cpu_ticks_offset;
138
139     /* Protect fields that can be read outside the
140      * BQL, and written from multiple threads.
141      */
142     QemuSeqLock vm_clock_seqlock;
143     QemuSpin vm_clock_lock;
144
145     int16_t cpu_ticks_enabled;
146
147     /* Conversion factor from emulated instructions to virtual clock ticks.  */
148     int16_t icount_time_shift;
149
150     /* Compensate for varying guest execution speed.  */
151     int64_t qemu_icount_bias;
152
153     int64_t vm_clock_warp_start;
154     int64_t cpu_clock_offset;
155
156     /* Only written by TCG thread */
157     int64_t qemu_icount;
158
159     /* for adjusting icount */
160     QEMUTimer *icount_rt_timer;
161     QEMUTimer *icount_vm_timer;
162     QEMUTimer *icount_warp_timer;
163 } TimersState;
164
165 static TimersState timers_state;
166 bool mttcg_enabled;
167
168 /*
169  * We default to false if we know other options have been enabled
170  * which are currently incompatible with MTTCG. Otherwise, once each
171  * guest (target) has been updated to support:
172  *   - atomic instructions
173  *   - memory ordering primitives (barriers)
174  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
175  *
176  * Once a guest architecture has been converted to the new primitives
177  * there are two remaining limitations to check.
178  *
179  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
180  * - The host must have a stronger memory order than the guest
181  *
182  * It may be possible in future to support strong guests on weak hosts
183  * but that will require tagging all load/stores in a guest with their
184  * implicit memory order requirements which would likely slow things
185  * down a lot.
186  */
187
188 static bool check_tcg_memory_orders_compatible(void)
189 {
190 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
191     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
192 #else
193     return false;
194 #endif
195 }
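
/*
 * Illustrative sketch of the check above (made-up flag values): both
 * *_DEFAULT_MO constants are bitmasks of memory-ordering guarantees
 * (QEMU's TCG_MO_* flags).  The guest is compatible only when every
 * ordering bit it relies on is also provided by the host TCG backend:
 *
 *     guest_mo   = LD_LD | LD_ST;             // guest needs loads ordered
 *     host_mo    = LD_LD | LD_ST | ST_ST;     // host guarantees a superset
 *     compatible = (guest_mo & ~host_mo) == 0;   // -> true
 */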
196
197 static bool default_mttcg_enabled(void)
198 {
199     if (use_icount || TCG_OVERSIZED_GUEST) {
200         return false;
201     } else {
202 #ifdef TARGET_SUPPORTS_MTTCG
203         return check_tcg_memory_orders_compatible();
204 #else
205         return false;
206 #endif
207     }
208 }
209
210 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
211 {
212     const char *t = qemu_opt_get(opts, "thread");
213     if (t) {
214         if (strcmp(t, "multi") == 0) {
215             if (TCG_OVERSIZED_GUEST) {
216                 error_setg(errp, "No MTTCG when guest word size > hosts");
217             } else if (use_icount) {
218                 error_setg(errp, "No MTTCG when icount is enabled");
219             } else {
220 #ifndef TARGET_SUPPORTS_MTTCG
221                 warn_report("Guest not yet converted to MTTCG - "
222                             "you may get unexpected results");
223 #endif
224                 if (!check_tcg_memory_orders_compatible()) {
225                     warn_report("Guest expects a stronger memory ordering "
226                                 "than the host provides");
227                     error_printf("This may cause strange/hard to debug errors\n");
228                 }
229                 mttcg_enabled = true;
230             }
231         } else if (strcmp(t, "single") == 0) {
232             mttcg_enabled = false;
233         } else {
234             error_setg(errp, "Invalid 'thread' setting %s", t);
235         }
236     } else {
237         mttcg_enabled = default_mttcg_enabled();
238     }
239 }
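
/*
 * Usage example (illustrative): the "thread" option parsed above is
 * typically supplied via the accelerator option on the command line, e.g.
 *
 *     qemu-system-x86_64 -accel tcg,thread=multi    one host thread per vCPU
 *     qemu-system-x86_64 -accel tcg,thread=single   round-robin on one thread
 *
 * When the option is omitted, default_mttcg_enabled() picks the mode.
 */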
240
241 /* The current number of executed instructions is based on what we
242  * originally budgeted minus the current state of the decrementing
243  * icount counters in extra/u16.low.
244  */
245 static int64_t cpu_get_icount_executed(CPUState *cpu)
246 {
247     return (cpu->icount_budget -
248             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
249 }
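
/*
 * Worked example (illustrative numbers): suppose the vCPU was handed a
 * budget of 100000 instructions, icount_decr.u16.low currently reads 200
 * and icount_extra is 34465.  Then
 *
 *     executed = 100000 - (200 + 34465) = 65335
 *
 * i.e. everything consumed from the 16-bit decrementer so far, and none
 * of the still-outstanding "extra" budget.
 */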
250
251 /*
252  * Update the global shared timer_state.qemu_icount to take into
253  * account executed instructions. This is done by the TCG vCPU
254  * thread so the main-loop can see time has moved forward.
255  */
256 static void cpu_update_icount_locked(CPUState *cpu)
257 {
258     int64_t executed = cpu_get_icount_executed(cpu);
259     cpu->icount_budget -= executed;
260
261     atomic_set_i64(&timers_state.qemu_icount,
262                    timers_state.qemu_icount + executed);
263 }
264
265 /*
266  * Update the global shared timer_state.qemu_icount to take into
267  * account executed instructions. This is done by the TCG vCPU
268  * thread so the main-loop can see time has moved forward.
269  */
270 void cpu_update_icount(CPUState *cpu)
271 {
272     seqlock_write_lock(&timers_state.vm_clock_seqlock,
273                        &timers_state.vm_clock_lock);
274     cpu_update_icount_locked(cpu);
275     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
276                          &timers_state.vm_clock_lock);
277 }
278
279 static int64_t cpu_get_icount_raw_locked(void)
280 {
281     CPUState *cpu = current_cpu;
282
283     if (cpu && cpu->running) {
284         if (!cpu->can_do_io) {
285             error_report("Bad icount read");
286             exit(1);
287         }
288         /* Take into account what has run */
289         cpu_update_icount_locked(cpu);
290     }
291     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
292     return atomic_read_i64(&timers_state.qemu_icount);
293 }
294
295 static int64_t cpu_get_icount_locked(void)
296 {
297     int64_t icount = cpu_get_icount_raw_locked();
298     return atomic_read_i64(&timers_state.qemu_icount_bias) +
299         cpu_icount_to_ns(icount);
300 }
301
302 int64_t cpu_get_icount_raw(void)
303 {
304     int64_t icount;
305     unsigned start;
306
307     do {
308         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
309         icount = cpu_get_icount_raw_locked();
310     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
311
312     return icount;
313 }
314
315 /* Return the virtual CPU time, based on the instruction counter.  */
316 int64_t cpu_get_icount(void)
317 {
318     int64_t icount;
319     unsigned start;
320
321     do {
322         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
323         icount = cpu_get_icount_locked();
324     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
325
326     return icount;
327 }
328
329 int64_t cpu_icount_to_ns(int64_t icount)
330 {
331     return icount << atomic_read(&timers_state.icount_time_shift);
332 }
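
/*
 * Illustrative numbers: with icount_time_shift == 3 every emulated
 * instruction accounts for 1 << 3 = 8 ns of virtual time, i.e. the guest
 * appears to run at 125 MIPS.  At MAX_ICOUNT_SHIFT (10) each instruction
 * is worth 1024 ns, roughly the 1 MIPS floor mentioned above.
 *
 *     cpu_icount_to_ns(1000) == 1000 << 3 == 8000   // ns, for shift 3
 */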
333
334 static int64_t cpu_get_ticks_locked(void)
335 {
336     int64_t ticks = timers_state.cpu_ticks_offset;
337     if (timers_state.cpu_ticks_enabled) {
338         ticks += cpu_get_host_ticks();
339     }
340
341     if (timers_state.cpu_ticks_prev > ticks) {
342         /* Non increasing ticks may happen if the host uses software suspend.  */
343         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
344         ticks = timers_state.cpu_ticks_prev;
345     }
346
347     timers_state.cpu_ticks_prev = ticks;
348     return ticks;
349 }
350
351 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
352  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
353  * counter.
354  */
355 int64_t cpu_get_ticks(void)
356 {
357     int64_t ticks;
358
359     if (use_icount) {
360         return cpu_get_icount();
361     }
362
363     qemu_spin_lock(&timers_state.vm_clock_lock);
364     ticks = cpu_get_ticks_locked();
365     qemu_spin_unlock(&timers_state.vm_clock_lock);
366     return ticks;
367 }
368
369 static int64_t cpu_get_clock_locked(void)
370 {
371     int64_t time;
372
373     time = timers_state.cpu_clock_offset;
374     if (timers_state.cpu_ticks_enabled) {
375         time += get_clock();
376     }
377
378     return time;
379 }
380
381 /* Return the monotonic time elapsed in VM, i.e.,
382  * the time between vm_start and vm_stop
383  */
384 int64_t cpu_get_clock(void)
385 {
386     int64_t ti;
387     unsigned start;
388
389     do {
390         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
391         ti = cpu_get_clock_locked();
392     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
393
394     return ti;
395 }
396
397 /* enable cpu_get_ticks()
398  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
399  */
400 void cpu_enable_ticks(void)
401 {
402     seqlock_write_lock(&timers_state.vm_clock_seqlock,
403                        &timers_state.vm_clock_lock);
404     if (!timers_state.cpu_ticks_enabled) {
405         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
406         timers_state.cpu_clock_offset -= get_clock();
407         timers_state.cpu_ticks_enabled = 1;
408     }
409     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
410                        &timers_state.vm_clock_lock);
411 }
412
413 /* disable cpu_get_ticks() : the clock is stopped. You must not call
414  * cpu_get_ticks() after that.
415  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
416  */
417 void cpu_disable_ticks(void)
418 {
419     seqlock_write_lock(&timers_state.vm_clock_seqlock,
420                        &timers_state.vm_clock_lock);
421     if (timers_state.cpu_ticks_enabled) {
422         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
423         timers_state.cpu_clock_offset = cpu_get_clock_locked();
424         timers_state.cpu_ticks_enabled = 0;
425     }
426     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
427                          &timers_state.vm_clock_lock);
428 }
429
430 /* Correlation between real and virtual time is always going to be
431    fairly approximate, so ignore small variation.
432    When the guest is idle real and virtual time will be aligned in
433    the IO wait loop.  */
434 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
435
436 static void icount_adjust(void)
437 {
438     int64_t cur_time;
439     int64_t cur_icount;
440     int64_t delta;
441
442     /* Protected by TimersState mutex.  */
443     static int64_t last_delta;
444
445     /* If the VM is not running, then do nothing.  */
446     if (!runstate_is_running()) {
447         return;
448     }
449
450     seqlock_write_lock(&timers_state.vm_clock_seqlock,
451                        &timers_state.vm_clock_lock);
452     cur_time = cpu_get_clock_locked();
453     cur_icount = cpu_get_icount_locked();
454
455     delta = cur_icount - cur_time;
456     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
457     if (delta > 0
458         && last_delta + ICOUNT_WOBBLE < delta * 2
459         && timers_state.icount_time_shift > 0) {
460         /* The guest is getting too far ahead.  Slow time down.  */
461         atomic_set(&timers_state.icount_time_shift,
462                    timers_state.icount_time_shift - 1);
463     }
464     if (delta < 0
465         && last_delta - ICOUNT_WOBBLE > delta * 2
466         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
467         /* The guest is getting too far behind.  Speed time up.  */
468         atomic_set(&timers_state.icount_time_shift,
469                    timers_state.icount_time_shift + 1);
470     }
471     last_delta = delta;
472     atomic_set_i64(&timers_state.qemu_icount_bias,
473                    cur_icount - (timers_state.qemu_icount
474                                  << timers_state.icount_time_shift));
475     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
476                          &timers_state.vm_clock_lock);
477 }
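
/*
 * Sketch of the feedback loop above (made-up numbers): if virtual time
 * (cur_icount) reads 300 ms while real time (cur_time) is only 100 ms,
 * delta is +200 ms and keeps growing, so icount_time_shift is decremented
 * and each instruction is charged fewer nanoseconds, slowing virtual time
 * down.  qemu_icount_bias is then rewritten so that the current virtual
 * time is unchanged at the moment the shift changes.
 */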
478
479 static void icount_adjust_rt(void *opaque)
480 {
481     timer_mod(timers_state.icount_rt_timer,
482               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
483     icount_adjust();
484 }
485
486 static void icount_adjust_vm(void *opaque)
487 {
488     timer_mod(timers_state.icount_vm_timer,
489                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
490                    NANOSECONDS_PER_SECOND / 10);
491     icount_adjust();
492 }
493
494 static int64_t qemu_icount_round(int64_t count)
495 {
496     int shift = atomic_read(&timers_state.icount_time_shift);
497     return (count + (1 << shift) - 1) >> shift;
498 }
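
/*
 * qemu_icount_round() is a ceiling division by 2^shift, converting a
 * nanosecond deadline into an instruction budget.  Illustrative numbers
 * with shift == 3 (8 ns per instruction):
 *
 *     qemu_icount_round(100) == (100 + 7) >> 3 == 13   // 13 insns >= 100 ns
 */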
499
500 static void icount_warp_rt(void)
501 {
502     unsigned seq;
503     int64_t warp_start;
504
505     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
506      * changes from -1 to another value, so the race here is okay.
507      */
508     do {
509         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
510         warp_start = timers_state.vm_clock_warp_start;
511     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
512
513     if (warp_start == -1) {
514         return;
515     }
516
517     seqlock_write_lock(&timers_state.vm_clock_seqlock,
518                        &timers_state.vm_clock_lock);
519     if (runstate_is_running()) {
520         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
521                                             cpu_get_clock_locked());
522         int64_t warp_delta;
523
524         warp_delta = clock - timers_state.vm_clock_warp_start;
525         if (use_icount == 2) {
526             /*
527              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
528              * far ahead of real time.
529              */
530             int64_t cur_icount = cpu_get_icount_locked();
531             int64_t delta = clock - cur_icount;
532             warp_delta = MIN(warp_delta, delta);
533         }
534         atomic_set_i64(&timers_state.qemu_icount_bias,
535                        timers_state.qemu_icount_bias + warp_delta);
536     }
537     timers_state.vm_clock_warp_start = -1;
538     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
539                        &timers_state.vm_clock_lock);
540
541     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
542         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
543     }
544 }
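
/*
 * Illustrative warp (made-up numbers): if the warp timer was armed at
 * VIRTUAL_RT time 500 ms and fires at 600 ms, warp_delta is 100 ms.  In
 * adaptive mode (use_icount == 2) the delta is additionally clamped so
 * that QEMU_CLOCK_VIRTUAL is never pushed ahead of QEMU_CLOCK_VIRTUAL_RT;
 * whatever remains is added to qemu_icount_bias, which is how "sleeping"
 * time gets credited to the virtual clock.
 */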
545
546 static void icount_timer_cb(void *opaque)
547 {
548     /* No need for a checkpoint because the timer already synchronizes
549      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
550      */
551     icount_warp_rt();
552 }
553
554 void qtest_clock_warp(int64_t dest)
555 {
556     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
557     AioContext *aio_context;
558     assert(qtest_enabled());
559     aio_context = qemu_get_aio_context();
560     while (clock < dest) {
561         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
562                                                       QEMU_TIMER_ATTR_ALL);
563         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
564
565         seqlock_write_lock(&timers_state.vm_clock_seqlock,
566                            &timers_state.vm_clock_lock);
567         atomic_set_i64(&timers_state.qemu_icount_bias,
568                        timers_state.qemu_icount_bias + warp);
569         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
570                              &timers_state.vm_clock_lock);
571
572         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
573         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
574         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
575     }
576     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
577 }
578
579 void qemu_start_warp_timer(void)
580 {
581     int64_t clock;
582     int64_t deadline;
583
584     if (!use_icount) {
585         return;
586     }
587
588     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
589      * do not fire, so computing the deadline does not make sense.
590      */
591     if (!runstate_is_running()) {
592         return;
593     }
594
595     if (replay_mode != REPLAY_MODE_PLAY) {
596         if (!all_cpu_threads_idle()) {
597             return;
598         }
599
600         if (qtest_enabled()) {
601             /* When testing, qtest commands advance icount.  */
602             return;
603         }
604
605         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
606     } else {
607         /* warp clock deterministically in record/replay mode */
608         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
609             /* The vCPU is sleeping and the warp can't be started.
610                It is probably a race condition: the notification sent
611                to the vCPU was processed in advance and the vCPU went to sleep.
612                Therefore we have to wake it up to do something. */
613             if (replay_has_checkpoint()) {
614                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
615             }
616             return;
617         }
618     }
619
620     /* We want to use the earliest deadline from ALL vm_clocks */
621     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
622     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
623                                           ~QEMU_TIMER_ATTR_EXTERNAL);
624     if (deadline < 0) {
625         static bool notified;
626         if (!icount_sleep && !notified) {
627             warn_report("icount sleep disabled and no active timers");
628             notified = true;
629         }
630         return;
631     }
632
633     if (deadline > 0) {
634         /*
635          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
636          * sleep.  Otherwise, the CPU might be waiting for a future timer
637          * interrupt to wake it up, but the interrupt never comes because
638          * the vCPU isn't running any insns and thus doesn't advance the
639          * QEMU_CLOCK_VIRTUAL.
640          */
641         if (!icount_sleep) {
642             /*
643              * We never let VCPUs sleep in no-sleep icount mode.
644              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
645              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
646              * It is useful when we want a deterministic execution time,
647              * isolated from host latencies.
648              */
649             seqlock_write_lock(&timers_state.vm_clock_seqlock,
650                                &timers_state.vm_clock_lock);
651             atomic_set_i64(&timers_state.qemu_icount_bias,
652                            timers_state.qemu_icount_bias + deadline);
653             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
654                                  &timers_state.vm_clock_lock);
655             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
656         } else {
657             /*
658              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
659              * "real" time, (related to the time left until the next event) has
660              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
661              * This avoids that the warps are visible externally; for example,
662              * you will not be sending network packets continuously instead of
663              * every 100ms.
664              */
665             seqlock_write_lock(&timers_state.vm_clock_seqlock,
666                                &timers_state.vm_clock_lock);
667             if (timers_state.vm_clock_warp_start == -1
668                 || timers_state.vm_clock_warp_start > clock) {
669                 timers_state.vm_clock_warp_start = clock;
670             }
671             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
672                                  &timers_state.vm_clock_lock);
673             timer_mod_anticipate(timers_state.icount_warp_timer,
674                                  clock + deadline);
675         }
676     } else if (deadline == 0) {
677         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
678     }
679 }
680
681 static void qemu_account_warp_timer(void)
682 {
683     if (!use_icount || !icount_sleep) {
684         return;
685     }
686
687     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
688      * do not fire, so computing the deadline does not make sense.
689      */
690     if (!runstate_is_running()) {
691         return;
692     }
693
694     /* warp clock deterministically in record/replay mode */
695     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
696         return;
697     }
698
699     timer_del(timers_state.icount_warp_timer);
700     icount_warp_rt();
701 }
702
703 static bool icount_state_needed(void *opaque)
704 {
705     return use_icount;
706 }
707
708 static bool warp_timer_state_needed(void *opaque)
709 {
710     TimersState *s = opaque;
711     return s->icount_warp_timer != NULL;
712 }
713
714 static bool adjust_timers_state_needed(void *opaque)
715 {
716     TimersState *s = opaque;
717     return s->icount_rt_timer != NULL;
718 }
719
720 /*
721  * Subsection for warp timer migration is optional, because the timer may not be created
722  */
723 static const VMStateDescription icount_vmstate_warp_timer = {
724     .name = "timer/icount/warp_timer",
725     .version_id = 1,
726     .minimum_version_id = 1,
727     .needed = warp_timer_state_needed,
728     .fields = (VMStateField[]) {
729         VMSTATE_INT64(vm_clock_warp_start, TimersState),
730         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
731         VMSTATE_END_OF_LIST()
732     }
733 };
734
735 static const VMStateDescription icount_vmstate_adjust_timers = {
736     .name = "timer/icount/timers",
737     .version_id = 1,
738     .minimum_version_id = 1,
739     .needed = adjust_timers_state_needed,
740     .fields = (VMStateField[]) {
741         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
742         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
743         VMSTATE_END_OF_LIST()
744     }
745 };
746
747 /*
748  * This is a subsection for icount migration.
749  */
750 static const VMStateDescription icount_vmstate_timers = {
751     .name = "timer/icount",
752     .version_id = 1,
753     .minimum_version_id = 1,
754     .needed = icount_state_needed,
755     .fields = (VMStateField[]) {
756         VMSTATE_INT64(qemu_icount_bias, TimersState),
757         VMSTATE_INT64(qemu_icount, TimersState),
758         VMSTATE_END_OF_LIST()
759     },
760     .subsections = (const VMStateDescription*[]) {
761         &icount_vmstate_warp_timer,
762         &icount_vmstate_adjust_timers,
763         NULL
764     }
765 };
766
767 static const VMStateDescription vmstate_timers = {
768     .name = "timer",
769     .version_id = 2,
770     .minimum_version_id = 1,
771     .fields = (VMStateField[]) {
772         VMSTATE_INT64(cpu_ticks_offset, TimersState),
773         VMSTATE_UNUSED(8),
774         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
775         VMSTATE_END_OF_LIST()
776     },
777     .subsections = (const VMStateDescription*[]) {
778         &icount_vmstate_timers,
779         NULL
780     }
781 };
782
783 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
784 {
785     double pct;
786     double throttle_ratio;
787     int64_t sleeptime_ns, endtime_ns;
788
789     if (!cpu_throttle_get_percentage()) {
790         return;
791     }
792
793     pct = (double)cpu_throttle_get_percentage()/100;
794     throttle_ratio = pct / (1 - pct);
795     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
796     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
797     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
798     while (sleeptime_ns > 0 && !cpu->stop) {
799         if (sleeptime_ns > SCALE_MS) {
800             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
801                                 sleeptime_ns / SCALE_MS);
802         } else {
803             qemu_mutex_unlock_iothread();
804             g_usleep(sleeptime_ns / SCALE_US);
805             qemu_mutex_lock_iothread();
806         }
807         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
808     }
809     atomic_set(&cpu->throttle_thread_scheduled, 0);
810 }
811
812 static void cpu_throttle_timer_tick(void *opaque)
813 {
814     CPUState *cpu;
815     double pct;
816
817     /* Stop the timer if needed */
818     if (!cpu_throttle_get_percentage()) {
819         return;
820     }
821     CPU_FOREACH(cpu) {
822         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
823             async_run_on_cpu(cpu, cpu_throttle_thread,
824                              RUN_ON_CPU_NULL);
825         }
826     }
827
828     pct = (double)cpu_throttle_get_percentage()/100;
829     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
830                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
831 }
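
/*
 * Throttle arithmetic, illustrative numbers: with a 75% throttle,
 * pct = 0.75 and throttle_ratio = 0.75 / 0.25 = 3, so each vCPU sleeps
 * roughly 3 * CPU_THROTTLE_TIMESLICE_NS = 30 ms per tick.  The tick timer
 * itself re-arms every CPU_THROTTLE_TIMESLICE_NS / (1 - pct) = 40 ms, so
 * in each 40 ms window the vCPU runs ~10 ms and sleeps ~30 ms, i.e. it
 * gets ~25% of real time.
 */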
832
833 void cpu_throttle_set(int new_throttle_pct)
834 {
835     /* Ensure throttle percentage is within valid range */
836     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
837     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
838
839     atomic_set(&throttle_percentage, new_throttle_pct);
840
841     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
842                                        CPU_THROTTLE_TIMESLICE_NS);
843 }
844
845 void cpu_throttle_stop(void)
846 {
847     atomic_set(&throttle_percentage, 0);
848 }
849
850 bool cpu_throttle_active(void)
851 {
852     return (cpu_throttle_get_percentage() != 0);
853 }
854
855 int cpu_throttle_get_percentage(void)
856 {
857     return atomic_read(&throttle_percentage);
858 }
859
860 void cpu_ticks_init(void)
861 {
862     seqlock_init(&timers_state.vm_clock_seqlock);
863     qemu_spin_init(&timers_state.vm_clock_lock);
864     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
865     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
866                                            cpu_throttle_timer_tick, NULL);
867 }
868
869 void configure_icount(QemuOpts *opts, Error **errp)
870 {
871     const char *option;
872     char *rem_str = NULL;
873
874     option = qemu_opt_get(opts, "shift");
875     if (!option) {
876         if (qemu_opt_get(opts, "align") != NULL) {
877             error_setg(errp, "Please specify shift option when using align");
878         }
879         return;
880     }
881
882     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
883     if (icount_sleep) {
884         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
885                                          icount_timer_cb, NULL);
886     }
887
888     icount_align_option = qemu_opt_get_bool(opts, "align", false);
889
890     if (icount_align_option && !icount_sleep) {
891         error_setg(errp, "align=on and sleep=off are incompatible");
892     }
893     if (strcmp(option, "auto") != 0) {
894         errno = 0;
895         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
896         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
897             error_setg(errp, "icount: Invalid shift value");
898         }
899         use_icount = 1;
900         return;
901     } else if (icount_align_option) {
902         error_setg(errp, "shift=auto and align=on are incompatible");
903     } else if (!icount_sleep) {
904         error_setg(errp, "shift=auto and sleep=off are incompatible");
905     }
906
907     use_icount = 2;
908
909     /* 125MIPS seems a reasonable initial guess at the guest speed.
910        It will be corrected fairly quickly anyway.  */
911     timers_state.icount_time_shift = 3;
912
913     /* Have both realtime and virtual time triggers for speed adjustment.
914        The realtime trigger catches emulated time passing too slowly,
915        the virtual time trigger catches emulated time passing too fast.
916        Realtime triggers occur even when idle, so use them less frequently
917        than VM triggers.  */
918     timers_state.vm_clock_warp_start = -1;
919     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
920                                    icount_adjust_rt, NULL);
921     timer_mod(timers_state.icount_rt_timer,
922                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
923     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
924                                         icount_adjust_vm, NULL);
925     timer_mod(timers_state.icount_vm_timer,
926                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
927                    NANOSECONDS_PER_SECOND / 10);
928 }
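
/*
 * Typical command lines that drive this function (illustrative):
 *
 *     -icount shift=7              fixed rate, 1 insn per 2^7 = 128 ns
 *     -icount shift=7,align=on     fixed rate plus host/guest alignment
 *     -icount shift=auto           adaptive rate (use_icount == 2), tuned
 *                                  by the rt/vm adjustment timers created above
 *
 * shift=auto is rejected in combination with align=on or sleep=off, as the
 * error paths above show.
 */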
929
930 /***********************************************************/
931 /* TCG vCPU kick timer
932  *
933  * The kick timer is responsible for moving single threaded vCPU
934  * emulation on to the next vCPU. If more than one vCPU is running, a
935  * timer event will force a cpu->exit so the next vCPU can get
936  * scheduled.
937  *
938  * The timer is removed while all vCPUs are idle and restarted once
939  * any of them becomes runnable again.
940  */
941
942 static QEMUTimer *tcg_kick_vcpu_timer;
943 static CPUState *tcg_current_rr_cpu;
944
945 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
946
947 static inline int64_t qemu_tcg_next_kick(void)
948 {
949     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
950 }
951
952 /* Kick the currently round-robin scheduled vCPU */
953 static void qemu_cpu_kick_rr_cpu(void)
954 {
955     CPUState *cpu;
956     do {
957         cpu = atomic_mb_read(&tcg_current_rr_cpu);
958         if (cpu) {
959             cpu_exit(cpu);
960         }
961     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
962 }
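
/*
 * Note on the retry loop above: after cpu_exit() the current round-robin
 * vCPU is re-read; if the scheduler has already moved on to another vCPU
 * in the meantime, that one is kicked as well, so the kick cannot be lost
 * during the hand-over.
 */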
963
964 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
965 {
966 }
967
968 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
969 {
970     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
971         qemu_notify_event();
972         return;
973     }
974
975     if (qemu_in_vcpu_thread()) {
976         /* A CPU is currently running; kick it back out to the
977          * tcg_cpu_exec() loop so it will recalculate its
978          * icount deadline immediately.
979          */
980         qemu_cpu_kick(current_cpu);
981     } else if (first_cpu) {
982         /* qemu_cpu_kick is not enough to kick a halted CPU out of
983          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
984          * causes cpu_thread_is_idle to return false.  This way,
985          * handle_icount_deadline can run.
986          * If we have no CPUs at all for some reason, we don't
987          * need to do anything.
988          */
989         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
990     }
991 }
992
993 static void kick_tcg_thread(void *opaque)
994 {
995     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
996     qemu_cpu_kick_rr_cpu();
997 }
998
999 static void start_tcg_kick_timer(void)
1000 {
1001     assert(!mttcg_enabled);
1002     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1003         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1004                                            kick_tcg_thread, NULL);
1005     }
1006     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1007         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1008     }
1009 }
1010
1011 static void stop_tcg_kick_timer(void)
1012 {
1013     assert(!mttcg_enabled);
1014     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1015         timer_del(tcg_kick_vcpu_timer);
1016     }
1017 }
1018
1019 /***********************************************************/
1020 void hw_error(const char *fmt, ...)
1021 {
1022     va_list ap;
1023     CPUState *cpu;
1024
1025     va_start(ap, fmt);
1026     fprintf(stderr, "qemu: hardware error: ");
1027     vfprintf(stderr, fmt, ap);
1028     fprintf(stderr, "\n");
1029     CPU_FOREACH(cpu) {
1030         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1031         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1032     }
1033     va_end(ap);
1034     abort();
1035 }
1036
1037 void cpu_synchronize_all_states(void)
1038 {
1039     CPUState *cpu;
1040
1041     CPU_FOREACH(cpu) {
1042         cpu_synchronize_state(cpu);
1043         /* TODO: move to cpu_synchronize_state() */
1044         if (hvf_enabled()) {
1045             hvf_cpu_synchronize_state(cpu);
1046         }
1047     }
1048 }
1049
1050 void cpu_synchronize_all_post_reset(void)
1051 {
1052     CPUState *cpu;
1053
1054     CPU_FOREACH(cpu) {
1055         cpu_synchronize_post_reset(cpu);
1056         /* TODO: move to cpu_synchronize_post_reset() */
1057         if (hvf_enabled()) {
1058             hvf_cpu_synchronize_post_reset(cpu);
1059         }
1060     }
1061 }
1062
1063 void cpu_synchronize_all_post_init(void)
1064 {
1065     CPUState *cpu;
1066
1067     CPU_FOREACH(cpu) {
1068         cpu_synchronize_post_init(cpu);
1069         /* TODO: move to cpu_synchronize_post_init() */
1070         if (hvf_enabled()) {
1071             hvf_cpu_synchronize_post_init(cpu);
1072         }
1073     }
1074 }
1075
1076 void cpu_synchronize_all_pre_loadvm(void)
1077 {
1078     CPUState *cpu;
1079
1080     CPU_FOREACH(cpu) {
1081         cpu_synchronize_pre_loadvm(cpu);
1082     }
1083 }
1084
1085 static int do_vm_stop(RunState state, bool send_stop)
1086 {
1087     int ret = 0;
1088
1089     if (runstate_is_running()) {
1090         cpu_disable_ticks();
1091         pause_all_vcpus();
1092         runstate_set(state);
1093         vm_state_notify(0, state);
1094         if (send_stop) {
1095             qapi_event_send_stop();
1096         }
1097     }
1098
1099     bdrv_drain_all();
1100     replay_disable_events();
1101     ret = bdrv_flush_all();
1102
1103     return ret;
1104 }
1105
1106 /* Special vm_stop() variant for terminating the process.  Historically clients
1107  * did not expect a QMP STOP event and so we need to retain compatibility.
1108  */
1109 int vm_shutdown(void)
1110 {
1111     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1112 }
1113
1114 static bool cpu_can_run(CPUState *cpu)
1115 {
1116     if (cpu->stop) {
1117         return false;
1118     }
1119     if (cpu_is_stopped(cpu)) {
1120         return false;
1121     }
1122     return true;
1123 }
1124
1125 static void cpu_handle_guest_debug(CPUState *cpu)
1126 {
1127     gdb_set_stop_cpu(cpu);
1128     qemu_system_debug_request();
1129     cpu->stopped = true;
1130 }
1131
1132 #ifdef CONFIG_LINUX
1133 static void sigbus_reraise(void)
1134 {
1135     sigset_t set;
1136     struct sigaction action;
1137
1138     memset(&action, 0, sizeof(action));
1139     action.sa_handler = SIG_DFL;
1140     if (!sigaction(SIGBUS, &action, NULL)) {
1141         raise(SIGBUS);
1142         sigemptyset(&set);
1143         sigaddset(&set, SIGBUS);
1144         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1145     }
1146     perror("Failed to re-raise SIGBUS!\n");
1147     abort();
1148 }
1149
1150 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1151 {
1152     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1153         sigbus_reraise();
1154     }
1155
1156     if (current_cpu) {
1157         /* Called asynchronously in VCPU thread.  */
1158         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1159             sigbus_reraise();
1160         }
1161     } else {
1162         /* Called synchronously (via signalfd) in main thread.  */
1163         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1164             sigbus_reraise();
1165         }
1166     }
1167 }
1168
1169 static void qemu_init_sigbus(void)
1170 {
1171     struct sigaction action;
1172
1173     memset(&action, 0, sizeof(action));
1174     action.sa_flags = SA_SIGINFO;
1175     action.sa_sigaction = sigbus_handler;
1176     sigaction(SIGBUS, &action, NULL);
1177
1178     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1179 }
1180 #else /* !CONFIG_LINUX */
1181 static void qemu_init_sigbus(void)
1182 {
1183 }
1184 #endif /* !CONFIG_LINUX */
1185
1186 static QemuThread io_thread;
1187
1188 /* cpu creation */
1189 static QemuCond qemu_cpu_cond;
1190 /* system init */
1191 static QemuCond qemu_pause_cond;
1192
1193 void qemu_init_cpu_loop(void)
1194 {
1195     qemu_init_sigbus();
1196     qemu_cond_init(&qemu_cpu_cond);
1197     qemu_cond_init(&qemu_pause_cond);
1198     qemu_mutex_init(&qemu_global_mutex);
1199
1200     qemu_thread_get_self(&io_thread);
1201 }
1202
1203 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1204 {
1205     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1206 }
1207
1208 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1209 {
1210     if (kvm_destroy_vcpu(cpu) < 0) {
1211         error_report("kvm_destroy_vcpu failed");
1212         exit(EXIT_FAILURE);
1213     }
1214 }
1215
1216 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1217 {
1218 }
1219
1220 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1221 {
1222     g_assert(qemu_cpu_is_self(cpu));
1223     cpu->stop = false;
1224     cpu->stopped = true;
1225     if (exit) {
1226         cpu_exit(cpu);
1227     }
1228     qemu_cond_broadcast(&qemu_pause_cond);
1229 }
1230
1231 static void qemu_wait_io_event_common(CPUState *cpu)
1232 {
1233     atomic_mb_set(&cpu->thread_kicked, false);
1234     if (cpu->stop) {
1235         qemu_cpu_stop(cpu, false);
1236     }
1237     process_queued_cpu_work(cpu);
1238 }
1239
1240 static void qemu_tcg_rr_wait_io_event(void)
1241 {
1242     CPUState *cpu;
1243
1244     while (all_cpu_threads_idle()) {
1245         stop_tcg_kick_timer();
1246         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1247     }
1248
1249     start_tcg_kick_timer();
1250
1251     CPU_FOREACH(cpu) {
1252         qemu_wait_io_event_common(cpu);
1253     }
1254 }
1255
1256 static void qemu_wait_io_event(CPUState *cpu)
1257 {
1258     while (cpu_thread_is_idle(cpu)) {
1259         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1260     }
1261
1262 #ifdef _WIN32
1263     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1264     if (!tcg_enabled()) {
1265         SleepEx(0, TRUE);
1266     }
1267 #endif
1268     qemu_wait_io_event_common(cpu);
1269 }
1270
1271 static void *qemu_kvm_cpu_thread_fn(void *arg)
1272 {
1273     CPUState *cpu = arg;
1274     int r;
1275
1276     rcu_register_thread();
1277
1278     qemu_mutex_lock_iothread();
1279     qemu_thread_get_self(cpu->thread);
1280     cpu->thread_id = qemu_get_thread_id();
1281     cpu->can_do_io = 1;
1282     current_cpu = cpu;
1283
1284     r = kvm_init_vcpu(cpu);
1285     if (r < 0) {
1286         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1287         exit(1);
1288     }
1289
1290     kvm_init_cpu_signals(cpu);
1291
1292     /* signal CPU creation */
1293     cpu->created = true;
1294     qemu_cond_signal(&qemu_cpu_cond);
1295     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1296
1297     do {
1298         if (cpu_can_run(cpu)) {
1299             r = kvm_cpu_exec(cpu);
1300             if (r == EXCP_DEBUG) {
1301                 cpu_handle_guest_debug(cpu);
1302             }
1303         }
1304         qemu_wait_io_event(cpu);
1305     } while (!cpu->unplug || cpu_can_run(cpu));
1306
1307     qemu_kvm_destroy_vcpu(cpu);
1308     cpu->created = false;
1309     qemu_cond_signal(&qemu_cpu_cond);
1310     qemu_mutex_unlock_iothread();
1311     rcu_unregister_thread();
1312     return NULL;
1313 }
1314
1315 static void *qemu_dummy_cpu_thread_fn(void *arg)
1316 {
1317 #ifdef _WIN32
1318     error_report("qtest is not supported under Windows");
1319     exit(1);
1320 #else
1321     CPUState *cpu = arg;
1322     sigset_t waitset;
1323     int r;
1324
1325     rcu_register_thread();
1326
1327     qemu_mutex_lock_iothread();
1328     qemu_thread_get_self(cpu->thread);
1329     cpu->thread_id = qemu_get_thread_id();
1330     cpu->can_do_io = 1;
1331     current_cpu = cpu;
1332
1333     sigemptyset(&waitset);
1334     sigaddset(&waitset, SIG_IPI);
1335
1336     /* signal CPU creation */
1337     cpu->created = true;
1338     qemu_cond_signal(&qemu_cpu_cond);
1339     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1340
1341     do {
1342         qemu_mutex_unlock_iothread();
1343         do {
1344             int sig;
1345             r = sigwait(&waitset, &sig);
1346         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1347         if (r == -1) {
1348             perror("sigwait");
1349             exit(1);
1350         }
1351         qemu_mutex_lock_iothread();
1352         qemu_wait_io_event(cpu);
1353     } while (!cpu->unplug);
1354
1355     qemu_mutex_unlock_iothread();
1356     rcu_unregister_thread();
1357     return NULL;
1358 #endif
1359 }
1360
1361 static int64_t tcg_get_icount_limit(void)
1362 {
1363     int64_t deadline;
1364
1365     if (replay_mode != REPLAY_MODE_PLAY) {
1366         /*
1367          * Include all the timers, because they may need attention.
1368          * Overly long CPU execution may create unnecessary delays in the UI.
1369          */
1370         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1371                                               QEMU_TIMER_ATTR_ALL);
1372
1373         /* Maintain prior (possibly buggy) behaviour where if no deadline
1374          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1375          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1376          * nanoseconds.
1377          */
1378         if ((deadline < 0) || (deadline > INT32_MAX)) {
1379             deadline = INT32_MAX;
1380         }
1381
1382         return qemu_icount_round(deadline);
1383     } else {
1384         return replay_get_instructions();
1385     }
1386 }
1387
1388 static void handle_icount_deadline(void)
1389 {
1390     assert(qemu_in_vcpu_thread());
1391     if (use_icount) {
1392         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1393                                                       QEMU_TIMER_ATTR_ALL);
1394
1395         if (deadline == 0) {
1396             /* Wake up other AioContexts.  */
1397             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1398             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1399         }
1400     }
1401 }
1402
1403 static void prepare_icount_for_run(CPUState *cpu)
1404 {
1405     if (use_icount) {
1406         int insns_left;
1407
1408         /* These should always be cleared by process_icount_data after
1409          * each vCPU execution. However, u16.high can be raised
1410          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1411          */
1412         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1413         g_assert(cpu->icount_extra == 0);
1414
1415         cpu->icount_budget = tcg_get_icount_limit();
1416         insns_left = MIN(0xffff, cpu->icount_budget);
1417         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1418         cpu->icount_extra = cpu->icount_budget - insns_left;
1419
1420         replay_mutex_lock();
1421     }
1422 }
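
/*
 * Budget split, illustrative numbers: with a deadline worth 100000
 * instructions the 16-bit decrementer can only hold 0xffff, so
 *
 *     insns_left          = MIN(0xffff, 100000) = 65535
 *     icount_decr.u16.low = 65535
 *     icount_extra        = 100000 - 65535     = 34465
 *
 * The translated code decrements u16.low; once it reaches zero the
 * execution loop refills it from icount_extra until the whole budget has
 * been consumed.
 */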
1423
1424 static void process_icount_data(CPUState *cpu)
1425 {
1426     if (use_icount) {
1427         /* Account for executed instructions */
1428         cpu_update_icount(cpu);
1429
1430         /* Reset the counters */
1431         cpu_neg(cpu)->icount_decr.u16.low = 0;
1432         cpu->icount_extra = 0;
1433         cpu->icount_budget = 0;
1434
1435         replay_account_executed_instructions();
1436
1437         replay_mutex_unlock();
1438     }
1439 }
1440
1441
1442 static int tcg_cpu_exec(CPUState *cpu)
1443 {
1444     int ret;
1445 #ifdef CONFIG_PROFILER
1446     int64_t ti;
1447 #endif
1448
1449     assert(tcg_enabled());
1450 #ifdef CONFIG_PROFILER
1451     ti = profile_getclock();
1452 #endif
1453     cpu_exec_start(cpu);
1454     ret = cpu_exec(cpu);
1455     cpu_exec_end(cpu);
1456 #ifdef CONFIG_PROFILER
1457     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1458                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1459 #endif
1460     return ret;
1461 }
1462
1463 /* Destroy any remaining vCPUs which have been unplugged and have
1464  * finished running
1465  */
1466 static void deal_with_unplugged_cpus(void)
1467 {
1468     CPUState *cpu;
1469
1470     CPU_FOREACH(cpu) {
1471         if (cpu->unplug && !cpu_can_run(cpu)) {
1472             qemu_tcg_destroy_vcpu(cpu);
1473             cpu->created = false;
1474             qemu_cond_signal(&qemu_cpu_cond);
1475             break;
1476         }
1477     }
1478 }
1479
1480 /* Single-threaded TCG
1481  *
1482  * In the single-threaded case each vCPU is simulated in turn. If
1483  * there is more than a single vCPU we create a simple timer to kick
1484  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1485  * This is done explicitly rather than relying on side-effects
1486  * elsewhere.
1487  */
1488
1489 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1490 {
1491     CPUState *cpu = arg;
1492
1493     assert(tcg_enabled());
1494     rcu_register_thread();
1495     tcg_register_thread();
1496
1497     qemu_mutex_lock_iothread();
1498     qemu_thread_get_self(cpu->thread);
1499
1500     cpu->thread_id = qemu_get_thread_id();
1501     cpu->created = true;
1502     cpu->can_do_io = 1;
1503     qemu_cond_signal(&qemu_cpu_cond);
1504     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1505
1506     /* wait for initial kick-off after machine start */
1507     while (first_cpu->stopped) {
1508         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1509
1510         /* process any pending work */
1511         CPU_FOREACH(cpu) {
1512             current_cpu = cpu;
1513             qemu_wait_io_event_common(cpu);
1514         }
1515     }
1516
1517     start_tcg_kick_timer();
1518
1519     cpu = first_cpu;
1520
1521     /* process any pending work */
1522     cpu->exit_request = 1;
1523
1524     while (1) {
1525         qemu_mutex_unlock_iothread();
1526         replay_mutex_lock();
1527         qemu_mutex_lock_iothread();
1528         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1529         qemu_account_warp_timer();
1530
1531         /* Run the timers here.  This is much more efficient than
1532          * waking up the I/O thread and waiting for completion.
1533          */
1534         handle_icount_deadline();
1535
1536         replay_mutex_unlock();
1537
1538         if (!cpu) {
1539             cpu = first_cpu;
1540         }
1541
1542         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1543
1544             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1545             current_cpu = cpu;
1546
1547             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1548                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1549
1550             if (cpu_can_run(cpu)) {
1551                 int r;
1552
1553                 qemu_mutex_unlock_iothread();
1554                 prepare_icount_for_run(cpu);
1555
1556                 r = tcg_cpu_exec(cpu);
1557
1558                 process_icount_data(cpu);
1559                 qemu_mutex_lock_iothread();
1560
1561                 if (r == EXCP_DEBUG) {
1562                     cpu_handle_guest_debug(cpu);
1563                     break;
1564                 } else if (r == EXCP_ATOMIC) {
1565                     qemu_mutex_unlock_iothread();
1566                     cpu_exec_step_atomic(cpu);
1567                     qemu_mutex_lock_iothread();
1568                     break;
1569                 }
1570             } else if (cpu->stop) {
1571                 if (cpu->unplug) {
1572                     cpu = CPU_NEXT(cpu);
1573                 }
1574                 break;
1575             }
1576
1577             cpu = CPU_NEXT(cpu);
1578         } /* while (cpu && !cpu->exit_request).. */
1579
1580         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1581         atomic_set(&tcg_current_rr_cpu, NULL);
1582
1583         if (cpu && cpu->exit_request) {
1584             atomic_mb_set(&cpu->exit_request, 0);
1585         }
1586
1587         if (use_icount && all_cpu_threads_idle()) {
1588             /*
1589              * When all cpus are sleeping (e.g. in WFI), to avoid a deadlock
1590              * in the main_loop, wake it up in order to start the warp timer.
1591              */
1592             qemu_notify_event();
1593         }
1594
1595         qemu_tcg_rr_wait_io_event();
1596         deal_with_unplugged_cpus();
1597     }
1598
1599     rcu_unregister_thread();
1600     return NULL;
1601 }
1602
1603 static void *qemu_hax_cpu_thread_fn(void *arg)
1604 {
1605     CPUState *cpu = arg;
1606     int r;
1607
1608     rcu_register_thread();
1609     qemu_mutex_lock_iothread();
1610     qemu_thread_get_self(cpu->thread);
1611
1612     cpu->thread_id = qemu_get_thread_id();
1613     cpu->created = true;
1614     current_cpu = cpu;
1615
1616     hax_init_vcpu(cpu);
1617     qemu_cond_signal(&qemu_cpu_cond);
1618     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1619
1620     do {
1621         if (cpu_can_run(cpu)) {
1622             r = hax_smp_cpu_exec(cpu);
1623             if (r == EXCP_DEBUG) {
1624                 cpu_handle_guest_debug(cpu);
1625             }
1626         }
1627
1628         qemu_wait_io_event(cpu);
1629     } while (!cpu->unplug || cpu_can_run(cpu));
1630     rcu_unregister_thread();
1631     return NULL;
1632 }
1633
1634 /* The HVF-specific vCPU thread function. This one should only run when the host
1635  * CPU supports the VMX "unrestricted guest" feature. */
1636 static void *qemu_hvf_cpu_thread_fn(void *arg)
1637 {
1638     CPUState *cpu = arg;
1639
1640     int r;
1641
1642     assert(hvf_enabled());
1643
1644     rcu_register_thread();
1645
1646     qemu_mutex_lock_iothread();
1647     qemu_thread_get_self(cpu->thread);
1648
1649     cpu->thread_id = qemu_get_thread_id();
1650     cpu->can_do_io = 1;
1651     current_cpu = cpu;
1652
1653     hvf_init_vcpu(cpu);
1654
1655     /* signal CPU creation */
1656     cpu->created = true;
1657     qemu_cond_signal(&qemu_cpu_cond);
1658     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1659
1660     do {
1661         if (cpu_can_run(cpu)) {
1662             r = hvf_vcpu_exec(cpu);
1663             if (r == EXCP_DEBUG) {
1664                 cpu_handle_guest_debug(cpu);
1665             }
1666         }
1667         qemu_wait_io_event(cpu);
1668     } while (!cpu->unplug || cpu_can_run(cpu));
1669
1670     hvf_vcpu_destroy(cpu);
1671     cpu->created = false;
1672     qemu_cond_signal(&qemu_cpu_cond);
1673     qemu_mutex_unlock_iothread();
1674     rcu_unregister_thread();
1675     return NULL;
1676 }
1677
1678 static void *qemu_whpx_cpu_thread_fn(void *arg)
1679 {
1680     CPUState *cpu = arg;
1681     int r;
1682
1683     rcu_register_thread();
1684
1685     qemu_mutex_lock_iothread();
1686     qemu_thread_get_self(cpu->thread);
1687     cpu->thread_id = qemu_get_thread_id();
1688     current_cpu = cpu;
1689
1690     r = whpx_init_vcpu(cpu);
1691     if (r < 0) {
1692         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1693         exit(1);
1694     }
1695
1696     /* signal CPU creation */
1697     cpu->created = true;
1698     qemu_cond_signal(&qemu_cpu_cond);
1699     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1700
1701     do {
1702         if (cpu_can_run(cpu)) {
1703             r = whpx_vcpu_exec(cpu);
1704             if (r == EXCP_DEBUG) {
1705                 cpu_handle_guest_debug(cpu);
1706             }
1707         }
1708         while (cpu_thread_is_idle(cpu)) {
1709             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1710         }
1711         qemu_wait_io_event_common(cpu);
1712     } while (!cpu->unplug || cpu_can_run(cpu));
1713
1714     whpx_destroy_vcpu(cpu);
1715     cpu->created = false;
1716     qemu_cond_signal(&qemu_cpu_cond);
1717     qemu_mutex_unlock_iothread();
1718     rcu_unregister_thread();
1719     return NULL;
1720 }
1721
1722 #ifdef _WIN32
1723 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1724 {
1725 }
1726 #endif
1727
1728 /* Multi-threaded TCG
1729  *
1730  * In the multi-threaded case each vCPU has its own thread. The TLS
1731  * variable current_cpu can be used deep in the code to find the
1732  * current CPUState for a given thread.
1733  */
1734
1735 static void *qemu_tcg_cpu_thread_fn(void *arg)
1736 {
1737     CPUState *cpu = arg;
1738
1739     assert(tcg_enabled());
1740     g_assert(!use_icount);
1741
1742     rcu_register_thread();
1743     tcg_register_thread();
1744
1745     qemu_mutex_lock_iothread();
1746     qemu_thread_get_self(cpu->thread);
1747
1748     cpu->thread_id = qemu_get_thread_id();
1749     cpu->created = true;
1750     cpu->can_do_io = 1;
1751     current_cpu = cpu;
1752     qemu_cond_signal(&qemu_cpu_cond);
1753     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1754
1755     /* process any pending work */
1756     cpu->exit_request = 1;
1757
1758     do {
1759         if (cpu_can_run(cpu)) {
1760             int r;
1761             qemu_mutex_unlock_iothread();
1762             r = tcg_cpu_exec(cpu);
1763             qemu_mutex_lock_iothread();
1764             switch (r) {
1765             case EXCP_DEBUG:
1766                 cpu_handle_guest_debug(cpu);
1767                 break;
1768             case EXCP_HALTED:
1769                 /* during start-up the vCPU is reset and the thread is
1770                  * kicked several times. If we don't ensure we go back
1771                  * to sleep in the halted state we won't cleanly
1772                  * start-up when the vCPU is enabled.
1773                  * start up when the vCPU is enabled.
1774                  * cpu->halted should ensure we sleep in wait_io_event
1775                  */
1776                 g_assert(cpu->halted);
1777                 break;
1778             case EXCP_ATOMIC:
1779                 qemu_mutex_unlock_iothread();
1780                 cpu_exec_step_atomic(cpu);
1781                 qemu_mutex_lock_iothread();
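                /* fall through to default, which just breaks */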
1782             default:
1783                 /* Ignore everything else? */
1784                 break;
1785             }
1786         }
1787
1788         atomic_mb_set(&cpu->exit_request, 0);
1789         qemu_wait_io_event(cpu);
1790     } while (!cpu->unplug || cpu_can_run(cpu));
1791
1792     qemu_tcg_destroy_vcpu(cpu);
1793     cpu->created = false;
1794     qemu_cond_signal(&qemu_cpu_cond);
1795     qemu_mutex_unlock_iothread();
1796     rcu_unregister_thread();
1797     return NULL;
1798 }
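
/*
 * Editor's sketch (hypothetical helper, not part of the original file): the
 * MTTCG comment above notes that the thread-local current_cpu can be used
 * deep in the call stack.  Assuming current_cpu is the per-thread variable
 * assigned in qemu_tcg_cpu_thread_fn() above, a minimal illustration of that
 * idea could look like this:
 */
static inline void assert_on_own_vcpu_thread(CPUState *cpu)
{
    /* current_cpu was set once for this thread by its vCPU thread function */
    assert(current_cpu == cpu && qemu_cpu_is_self(cpu));
}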
1799
1800 static void qemu_cpu_kick_thread(CPUState *cpu)
1801 {
1802 #ifndef _WIN32
1803     int err;
1804
1805     if (cpu->thread_kicked) {
1806         return;
1807     }
1808     cpu->thread_kicked = true;
1809     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1810     if (err && err != ESRCH) {
1811     fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1812         exit(1);
1813     }
1814 #else /* _WIN32 */
1815     if (!qemu_cpu_is_self(cpu)) {
1816         if (whpx_enabled()) {
1817             whpx_vcpu_kick(cpu);
1818         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1819             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1820                     __func__, GetLastError());
1821             exit(1);
1822         }
1823     }
1824 #endif
1825 }
1826
1827 void qemu_cpu_kick(CPUState *cpu)
1828 {
1829     qemu_cond_broadcast(cpu->halt_cond);
1830     if (tcg_enabled()) {
1831         cpu_exit(cpu);
1832         /* NOP unless doing single-thread RR */
1833         qemu_cpu_kick_rr_cpu();
1834     } else {
1835         if (hax_enabled()) {
1836             /*
1837              * FIXME: race condition with the exit_request check in
1838              * hax_vcpu_hax_exec
1839              */
1840             cpu->exit_request = 1;
1841         }
1842         qemu_cpu_kick_thread(cpu);
1843     }
1844 }
1845
1846 void qemu_cpu_kick_self(void)
1847 {
1848     assert(current_cpu);
1849     qemu_cpu_kick_thread(current_cpu);
1850 }
1851
1852 bool qemu_cpu_is_self(CPUState *cpu)
1853 {
1854     return qemu_thread_is_self(cpu->thread);
1855 }
1856
1857 bool qemu_in_vcpu_thread(void)
1858 {
1859     return current_cpu && qemu_cpu_is_self(current_cpu);
1860 }
1861
1862 static __thread bool iothread_locked = false;
1863
1864 bool qemu_mutex_iothread_locked(void)
1865 {
1866     return iothread_locked;
1867 }
1868
1869 /*
1870  * The BQL is taken from so many places that it is worth profiling the
1871  * callers directly, instead of funneling them all through a single function.
1872  */
1873 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1874 {
1875     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1876
1877     g_assert(!qemu_mutex_iothread_locked());
1878     bql_lock(&qemu_global_mutex, file, line);
1879     iothread_locked = true;
1880 }
1881
1882 void qemu_mutex_unlock_iothread(void)
1883 {
1884     g_assert(qemu_mutex_iothread_locked());
1885     iothread_locked = false;
1886     qemu_mutex_unlock(&qemu_global_mutex);
1887 }
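
/*
 * Editor's note (sketch, not from this file): callers normally do not invoke
 * qemu_mutex_lock_iothread_impl() directly.  A convenience macro along these
 * lines, typically provided by include/qemu/main-loop.h, routes every call
 * site through the profiling hook above together with its file and line:
 *
 *     #define qemu_mutex_lock_iothread() \
 *         qemu_mutex_lock_iothread_impl(__FILE__, __LINE__)
 */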
1888
1889 static bool all_vcpus_paused(void)
1890 {
1891     CPUState *cpu;
1892
1893     CPU_FOREACH(cpu) {
1894         if (!cpu->stopped) {
1895             return false;
1896         }
1897     }
1898
1899     return true;
1900 }
1901
1902 void pause_all_vcpus(void)
1903 {
1904     CPUState *cpu;
1905
1906     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1907     CPU_FOREACH(cpu) {
1908         if (qemu_cpu_is_self(cpu)) {
1909             qemu_cpu_stop(cpu, true);
1910         } else {
1911             cpu->stop = true;
1912             qemu_cpu_kick(cpu);
1913         }
1914     }
1915
1916     /* We need to drop the replay_lock so any vCPU threads woken up
1917      * can finish their replay tasks
1918      */
1919     replay_mutex_unlock();
1920
1921     while (!all_vcpus_paused()) {
1922         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1923         CPU_FOREACH(cpu) {
1924             qemu_cpu_kick(cpu);
1925         }
1926     }
1927
1928     qemu_mutex_unlock_iothread();
1929     replay_mutex_lock();
1930     qemu_mutex_lock_iothread();
1931 }
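
/*
 * Editor's sketch (hypothetical helper, not part of the original file): the
 * usual calling pattern for pause_all_vcpus()/resume_all_vcpus().  Callers
 * already hold the iothread lock (BQL); pause_all_vcpus() returns only once
 * every vCPU has reported itself stopped.
 */
static inline void example_run_with_vcpus_paused(void (*fn)(void *opaque),
                                                 void *opaque)
{
    pause_all_vcpus();
    fn(opaque);             /* safe to touch global machine state here */
    resume_all_vcpus();     /* kick all vCPUs back into their run loops */
}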
1932
1933 void cpu_resume(CPUState *cpu)
1934 {
1935     cpu->stop = false;
1936     cpu->stopped = false;
1937     qemu_cpu_kick(cpu);
1938 }
1939
1940 void resume_all_vcpus(void)
1941 {
1942     CPUState *cpu;
1943
1944     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1945     CPU_FOREACH(cpu) {
1946         cpu_resume(cpu);
1947     }
1948 }
1949
1950 void cpu_remove_sync(CPUState *cpu)
1951 {
1952     cpu->stop = true;
1953     cpu->unplug = true;
1954     qemu_cpu_kick(cpu);
1955     qemu_mutex_unlock_iothread();
1956     qemu_thread_join(cpu->thread);
1957     qemu_mutex_lock_iothread();
1958 }
1959
1960 /* Size of the temporary buffer used to form a vCPU thread name */
1961 #define VCPU_THREAD_NAME_SIZE 16
1962
1963 static void qemu_tcg_init_vcpu(CPUState *cpu)
1964 {
1965     char thread_name[VCPU_THREAD_NAME_SIZE];
1966     static QemuCond *single_tcg_halt_cond;
1967     static QemuThread *single_tcg_cpu_thread;
1968     static int tcg_region_inited;
1969
1970     assert(tcg_enabled());
1971     /*
1972      * Initialize TCG regions--once. Now is a good time, because:
1973      * (1) TCG's init context, prologue and target globals have been set up.
1974      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1975      *     -accel flag is processed, so the check doesn't work then).
1976      */
1977     if (!tcg_region_inited) {
1978         tcg_region_inited = 1;
1979         tcg_region_init();
1980     }
1981
1982     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1983         cpu->thread = g_malloc0(sizeof(QemuThread));
1984         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1985         qemu_cond_init(cpu->halt_cond);
1986
1987         if (qemu_tcg_mttcg_enabled()) {
1988             /* create a thread per vCPU with TCG (MTTCG) */
1989             parallel_cpus = true;
1990             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1991                  cpu->cpu_index);
1992
1993             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1994                                cpu, QEMU_THREAD_JOINABLE);
1995
1996         } else {
1997             /* share a single thread for all cpus with TCG */
1998             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1999             qemu_thread_create(cpu->thread, thread_name,
2000                                qemu_tcg_rr_cpu_thread_fn,
2001                                cpu, QEMU_THREAD_JOINABLE);
2002
2003             single_tcg_halt_cond = cpu->halt_cond;
2004             single_tcg_cpu_thread = cpu->thread;
2005         }
2006 #ifdef _WIN32
2007         cpu->hThread = qemu_thread_get_handle(cpu->thread);
2008 #endif
2009     } else {
2010         /* For non-MTTCG cases we share the thread */
2011         cpu->thread = single_tcg_cpu_thread;
2012         cpu->halt_cond = single_tcg_halt_cond;
2013         cpu->thread_id = first_cpu->thread_id;
2014         cpu->can_do_io = 1;
2015         cpu->created = true;
2016     }
2017 }
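
/*
 * Editor's note (usage sketch, assumed command-line syntax): whether the
 * MTTCG branch above is taken is normally selected on the command line,
 * e.g. "-accel tcg,thread=multi" for one thread per vCPU or
 * "-accel tcg,thread=single" for the shared round-robin thread, subject to
 * the target/host combination actually supporting MTTCG.
 */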
2018
2019 static void qemu_hax_start_vcpu(CPUState *cpu)
2020 {
2021     char thread_name[VCPU_THREAD_NAME_SIZE];
2022
2023     cpu->thread = g_malloc0(sizeof(QemuThread));
2024     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2025     qemu_cond_init(cpu->halt_cond);
2026
2027     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2028              cpu->cpu_index);
2029     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2030                        cpu, QEMU_THREAD_JOINABLE);
2031 #ifdef _WIN32
2032     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2033 #endif
2034 }
2035
2036 static void qemu_kvm_start_vcpu(CPUState *cpu)
2037 {
2038     char thread_name[VCPU_THREAD_NAME_SIZE];
2039
2040     cpu->thread = g_malloc0(sizeof(QemuThread));
2041     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2042     qemu_cond_init(cpu->halt_cond);
2043     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2044              cpu->cpu_index);
2045     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2046                        cpu, QEMU_THREAD_JOINABLE);
2047 }
2048
2049 static void qemu_hvf_start_vcpu(CPUState *cpu)
2050 {
2051     char thread_name[VCPU_THREAD_NAME_SIZE];
2052
2053     /* HVF currently does not support TCG, and only runs in
2054      * unrestricted-guest mode. */
2055     assert(hvf_enabled());
2056
2057     cpu->thread = g_malloc0(sizeof(QemuThread));
2058     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2059     qemu_cond_init(cpu->halt_cond);
2060
2061     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2062              cpu->cpu_index);
2063     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2064                        cpu, QEMU_THREAD_JOINABLE);
2065 }
2066
2067 static void qemu_whpx_start_vcpu(CPUState *cpu)
2068 {
2069     char thread_name[VCPU_THREAD_NAME_SIZE];
2070
2071     cpu->thread = g_malloc0(sizeof(QemuThread));
2072     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2073     qemu_cond_init(cpu->halt_cond);
2074     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2075              cpu->cpu_index);
2076     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2077                        cpu, QEMU_THREAD_JOINABLE);
2078 #ifdef _WIN32
2079     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2080 #endif
2081 }
2082
2083 static void qemu_dummy_start_vcpu(CPUState *cpu)
2084 {
2085     char thread_name[VCPU_THREAD_NAME_SIZE];
2086
2087     cpu->thread = g_malloc0(sizeof(QemuThread));
2088     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2089     qemu_cond_init(cpu->halt_cond);
2090     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2091              cpu->cpu_index);
2092     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2093                        QEMU_THREAD_JOINABLE);
2094 }
2095
2096 void qemu_init_vcpu(CPUState *cpu)
2097 {
2098     MachineState *ms = MACHINE(qdev_get_machine());
2099
2100     cpu->nr_cores = ms->smp.cores;
2101     cpu->nr_threads = ms->smp.threads;
2102     cpu->stopped = true;
2103     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2104
2105     if (!cpu->as) {
2106         /* If the target cpu hasn't set up any address spaces itself,
2107          * give it the default one.
2108          */
2109         cpu->num_ases = 1;
2110         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2111     }
2112
2113     if (kvm_enabled()) {
2114         qemu_kvm_start_vcpu(cpu);
2115     } else if (hax_enabled()) {
2116         qemu_hax_start_vcpu(cpu);
2117     } else if (hvf_enabled()) {
2118         qemu_hvf_start_vcpu(cpu);
2119     } else if (tcg_enabled()) {
2120         qemu_tcg_init_vcpu(cpu);
2121     } else if (whpx_enabled()) {
2122         qemu_whpx_start_vcpu(cpu);
2123     } else {
2124         qemu_dummy_start_vcpu(cpu);
2125     }
2126
2127     while (!cpu->created) {
2128         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2129     }
2130 }
2131
2132 void cpu_stop_current(void)
2133 {
2134     if (current_cpu) {
2135         current_cpu->stop = true;
2136         cpu_exit(current_cpu);
2137     }
2138 }
2139
2140 int vm_stop(RunState state)
2141 {
2142     if (qemu_in_vcpu_thread()) {
2143         qemu_system_vmstop_request_prepare();
2144         qemu_system_vmstop_request(state);
2145         /*
2146          * FIXME: should not return to device code in case
2147          * vm_stop() has been requested.
2148          */
2149         cpu_stop_current();
2150         return 0;
2151     }
2152
2153     return do_vm_stop(state, true);
2154 }
2155
2156 /**
2157  * Prepare for (re)starting the VM.
2158  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2159  * running or in case of an error condition), 0 otherwise.
2160  */
2161 int vm_prepare_start(void)
2162 {
2163     RunState requested;
2164
2165     qemu_vmstop_requested(&requested);
2166     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2167         return -1;
2168     }
2169
2170     /* Ensure that a STOP/RESUME pair of events is emitted if a
2171      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2172      * example, is documented as always being followed by the STOP
2173      * event.
2174      */
2175     if (runstate_is_running()) {
2176         qapi_event_send_stop();
2177         qapi_event_send_resume();
2178         return -1;
2179     }
2180
2181     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2182     qapi_event_send_resume();
2183
2184     replay_enable_events();
2185     cpu_enable_ticks();
2186     runstate_set(RUN_STATE_RUNNING);
2187     vm_state_notify(1, RUN_STATE_RUNNING);
2188     return 0;
2189 }
2190
2191 void vm_start(void)
2192 {
2193     if (!vm_prepare_start()) {
2194         resume_all_vcpus();
2195     }
2196 }
2197
2198 /* Does a state transition even if the VM is already stopped;
2199    the current state is forgotten forever. */
2200 int vm_stop_force_state(RunState state)
2201 {
2202     if (runstate_is_running()) {
2203         return vm_stop(state);
2204     } else {
2205         runstate_set(state);
2206
2207         bdrv_drain_all();
2208         /* Make sure to return an error if the flush in a previous vm_stop()
2209          * failed. */
2210         return bdrv_flush_all();
2211     }
2212 }
2213
2214 void list_cpus(const char *optarg)
2215 {
2216     /* XXX: implement xxx_cpu_list for targets that still lack it */
2217 #if defined(cpu_list)
2218     cpu_list();
2219 #endif
2220 }
2221
2222 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2223                  bool has_cpu, int64_t cpu_index, Error **errp)
2224 {
2225     FILE *f;
2226     uint32_t l;
2227     CPUState *cpu;
2228     uint8_t buf[1024];
2229     int64_t orig_addr = addr, orig_size = size;
2230
2231     if (!has_cpu) {
2232         cpu_index = 0;
2233     }
2234
2235     cpu = qemu_get_cpu(cpu_index);
2236     if (cpu == NULL) {
2237         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2238                    "a CPU number");
2239         return;
2240     }
2241
2242     f = fopen(filename, "wb");
2243     if (!f) {
2244         error_setg_file_open(errp, errno, filename);
2245         return;
2246     }
2247
2248     while (size != 0) {
2249         l = sizeof(buf);
2250         if (l > size)
2251             l = size;
2252         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2253             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2254                              " specified", orig_addr, orig_size);
2255             goto exit;
2256         }
2257         if (fwrite(buf, 1, l, f) != l) {
2258             error_setg(errp, QERR_IO_ERROR);
2259             goto exit;
2260         }
2261         addr += l;
2262         size -= l;
2263     }
2264
2265 exit:
2266     fclose(f);
2267 }
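
/*
 * Editor's note (illustrative QMP usage; the values are made up): the handler
 * above backs the "memsave" QMP command, which is typically issued as
 * something like:
 *
 *     { "execute": "memsave",
 *       "arguments": { "val": 4194304, "size": 1024,
 *                      "filename": "/tmp/virtual-mem-dump",
 *                      "cpu-index": 0 } }
 *
 * "val" is the starting guest virtual address (the addr parameter here) and
 * "cpu-index" is optional, defaulting to CPU 0 as in the code above.
 */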
2268
2269 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2270                   Error **errp)
2271 {
2272     FILE *f;
2273     uint32_t l;
2274     uint8_t buf[1024];
2275
2276     f = fopen(filename, "wb");
2277     if (!f) {
2278         error_setg_file_open(errp, errno, filename);
2279         return;
2280     }
2281
2282     while (size != 0) {
2283         l = sizeof(buf);
2284         if (l > size)
2285             l = size;
2286         cpu_physical_memory_read(addr, buf, l);
2287         if (fwrite(buf, 1, l, f) != l) {
2288             error_setg(errp, QERR_IO_ERROR);
2289             goto exit;
2290         }
2291         addr += l;
2292         size -= l;
2293     }
2294
2295 exit:
2296     fclose(f);
2297 }
2298
2299 void qmp_inject_nmi(Error **errp)
2300 {
2301     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2302 }
2303
2304 void dump_drift_info(void)
2305 {
2306     if (!use_icount) {
2307         return;
2308     }
2309
2310     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2311                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2312     if (icount_align_option) {
2313         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2314                     -max_delay / SCALE_MS);
2315         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2316                     max_advance / SCALE_MS);
2317     } else {
2318         qemu_printf("Max guest delay     NA\n");
2319         qemu_printf("Max guest advance   NA\n");
2320     }
2321 }