[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "qemu/qemu-print.h"
35 #include "sysemu/tcg.h"
36 #include "sysemu/block-backend.h"
37 #include "exec/gdbstub.h"
38 #include "sysemu/dma.h"
39 #include "sysemu/hw_accel.h"
40 #include "sysemu/kvm.h"
41 #include "sysemu/hax.h"
42 #include "sysemu/hvf.h"
43 #include "sysemu/whpx.h"
44 #include "exec/exec-all.h"
45
46 #include "qemu/thread.h"
47 #include "sysemu/cpus.h"
48 #include "sysemu/qtest.h"
49 #include "qemu/main-loop.h"
50 #include "qemu/option.h"
51 #include "qemu/bitmap.h"
52 #include "qemu/seqlock.h"
53 #include "qemu/guest-random.h"
54 #include "tcg.h"
55 #include "hw/nmi.h"
56 #include "sysemu/replay.h"
57 #include "hw/boards.h"
58
59 #ifdef CONFIG_LINUX
60
61 #include <sys/prctl.h>
62
63 #ifndef PR_MCE_KILL
64 #define PR_MCE_KILL 33
65 #endif
66
67 #ifndef PR_MCE_KILL_SET
68 #define PR_MCE_KILL_SET 1
69 #endif
70
71 #ifndef PR_MCE_KILL_EARLY
72 #define PR_MCE_KILL_EARLY 1
73 #endif
74
75 #endif /* CONFIG_LINUX */
76
77 int64_t max_delay;
78 int64_t max_advance;
79
80 /* vcpu throttling controls */
81 static QEMUTimer *throttle_timer;
82 static unsigned int throttle_percentage;
83
84 #define CPU_THROTTLE_PCT_MIN 1
85 #define CPU_THROTTLE_PCT_MAX 99
86 #define CPU_THROTTLE_TIMESLICE_NS 10000000
87
88 bool cpu_is_stopped(CPUState *cpu)
89 {
90     return cpu->stopped || !runstate_is_running();
91 }
92
93 static bool cpu_thread_is_idle(CPUState *cpu)
94 {
95     if (cpu->stop || cpu->queued_work_first) {
96         return false;
97     }
98     if (cpu_is_stopped(cpu)) {
99         return true;
100     }
101     if (!cpu->halted || cpu_has_work(cpu) ||
102         kvm_halt_in_kernel()) {
103         return false;
104     }
105     return true;
106 }
107
108 static bool all_cpu_threads_idle(void)
109 {
110     CPUState *cpu;
111
112     CPU_FOREACH(cpu) {
113         if (!cpu_thread_is_idle(cpu)) {
114             return false;
115         }
116     }
117     return true;
118 }
119
120 /***********************************************************/
121 /* guest cycle counter */
122
123 /* Protected by TimersState seqlock */
124
125 static bool icount_sleep = true;
126 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
127 #define MAX_ICOUNT_SHIFT 10
128
129 typedef struct TimersState {
130     /* Protected by BQL.  */
131     int64_t cpu_ticks_prev;
132     int64_t cpu_ticks_offset;
133
134     /* Protect fields that can be read outside the BQL and, respectively,
135      * written from multiple threads.
136      */
137     QemuSeqLock vm_clock_seqlock;
138     QemuSpin vm_clock_lock;
139
140     int16_t cpu_ticks_enabled;
141
142     /* Conversion factor from emulated instructions to virtual clock ticks.  */
143     int16_t icount_time_shift;
144
145     /* Compensate for varying guest execution speed.  */
146     int64_t qemu_icount_bias;
147
148     int64_t vm_clock_warp_start;
149     int64_t cpu_clock_offset;
150
151     /* Only written by TCG thread */
152     int64_t qemu_icount;
153
154     /* for adjusting icount */
155     QEMUTimer *icount_rt_timer;
156     QEMUTimer *icount_vm_timer;
157     QEMUTimer *icount_warp_timer;
158 } TimersState;
159
160 static TimersState timers_state;
161 bool mttcg_enabled;
162
163 /*
164  * We default to false if we know other options have been enabled
165  * which are currently incompatible with MTTCG. Otherwise when each
166  * guest (target) has been updated to support:
167  *   - atomic instructions
168  *   - memory ordering primitives (barriers)
169  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
170  *
171  * Once a guest architecture has been converted to the new primitives
172  * there are two remaining limitations to check.
173  *
174  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
175  * - The host must have a stronger memory order than the guest
176  *
177  * It may be possible in future to support strong guests on weak hosts
178  * but that will require tagging all load/stores in a guest with their
179  * implicit memory order requirements which would likely slow things
180  * down a lot.
181  */
182
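/*
 * [Editorial note, illustrative only] The check below requires that every
 * memory-ordering guarantee the guest relies on (TCG_GUEST_DEFAULT_MO) is
 * also provided by the host TCG backend (TCG_TARGET_DEFAULT_MO): any bit
 * set in the guest mask but clear in the host mask fails the test.  As a
 * rough example, a strongly-ordered guest such as x86 emulated on a
 * weakly-ordered host backend such as aarch64 leaves bits set and is
 * rejected, while the reverse combination passes.
 */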
183 static bool check_tcg_memory_orders_compatible(void)
184 {
185 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
186     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
187 #else
188     return false;
189 #endif
190 }
191
192 static bool default_mttcg_enabled(void)
193 {
194     if (use_icount || TCG_OVERSIZED_GUEST) {
195         return false;
196     } else {
197 #ifdef TARGET_SUPPORTS_MTTCG
198         return check_tcg_memory_orders_compatible();
199 #else
200         return false;
201 #endif
202     }
203 }
204
205 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
206 {
207     const char *t = qemu_opt_get(opts, "thread");
208     if (t) {
209         if (strcmp(t, "multi") == 0) {
210             if (TCG_OVERSIZED_GUEST) {
211                 error_setg(errp, "No MTTCG when guest word size > hosts");
212             } else if (use_icount) {
213                 error_setg(errp, "No MTTCG when icount is enabled");
214             } else {
215 #ifndef TARGET_SUPPORTS_MTTCG
216                 warn_report("Guest not yet converted to MTTCG - "
217                             "you may get unexpected results");
218 #endif
219                 if (!check_tcg_memory_orders_compatible()) {
220                     warn_report("Guest expects a stronger memory ordering "
221                                 "than the host provides");
222                     error_printf("This may cause strange/hard to debug errors\n");
223                 }
224                 mttcg_enabled = true;
225             }
226         } else if (strcmp(t, "single") == 0) {
227             mttcg_enabled = false;
228         } else {
229             error_setg(errp, "Invalid 'thread' setting %s", t);
230         }
231     } else {
232         mttcg_enabled = default_mttcg_enabled();
233     }
234 }
235
236 /* The current number of executed instructions is based on what we
237  * originally budgeted minus the current state of the decrementing
238  * icount counters in extra/u16.low.
239  */
240 static int64_t cpu_get_icount_executed(CPUState *cpu)
241 {
242     return (cpu->icount_budget -
243             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
244 }
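/*
 * [Editorial note, illustrative only] Worked example of the calculation
 * above: if a vCPU entered the execution loop with icount_budget = 10000
 * and, at the point of the query, icount_decr.u16.low = 200 and
 * icount_extra = 3000 instructions remain unexecuted, then
 * executed = 10000 - (200 + 3000) = 6800 instructions.
 */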
245
246 /*
247  * Update the global shared timers_state.qemu_icount to take into
248  * account executed instructions. This is done by the TCG vCPU
249  * thread so the main-loop can see time has moved forward.
250  */
251 static void cpu_update_icount_locked(CPUState *cpu)
252 {
253     int64_t executed = cpu_get_icount_executed(cpu);
254     cpu->icount_budget -= executed;
255
256     atomic_set_i64(&timers_state.qemu_icount,
257                    timers_state.qemu_icount + executed);
258 }
259
260 /*
261  * Update the global shared timers_state.qemu_icount to take into
262  * account executed instructions. This is done by the TCG vCPU
263  * thread so the main-loop can see time has moved forward.
264  */
265 void cpu_update_icount(CPUState *cpu)
266 {
267     seqlock_write_lock(&timers_state.vm_clock_seqlock,
268                        &timers_state.vm_clock_lock);
269     cpu_update_icount_locked(cpu);
270     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
271                          &timers_state.vm_clock_lock);
272 }
273
274 static int64_t cpu_get_icount_raw_locked(void)
275 {
276     CPUState *cpu = current_cpu;
277
278     if (cpu && cpu->running) {
279         if (!cpu->can_do_io) {
280             error_report("Bad icount read");
281             exit(1);
282         }
283         /* Take into account what has run */
284         cpu_update_icount_locked(cpu);
285     }
286     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
287     return atomic_read_i64(&timers_state.qemu_icount);
288 }
289
290 static int64_t cpu_get_icount_locked(void)
291 {
292     int64_t icount = cpu_get_icount_raw_locked();
293     return atomic_read_i64(&timers_state.qemu_icount_bias) +
294         cpu_icount_to_ns(icount);
295 }
296
297 int64_t cpu_get_icount_raw(void)
298 {
299     int64_t icount;
300     unsigned start;
301
302     do {
303         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
304         icount = cpu_get_icount_raw_locked();
305     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
306
307     return icount;
308 }
309
310 /* Return the virtual CPU time, based on the instruction counter.  */
311 int64_t cpu_get_icount(void)
312 {
313     int64_t icount;
314     unsigned start;
315
316     do {
317         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
318         icount = cpu_get_icount_locked();
319     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
320
321     return icount;
322 }
323
324 int64_t cpu_icount_to_ns(int64_t icount)
325 {
326     return icount << atomic_read(&timers_state.icount_time_shift);
327 }
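/*
 * [Editorial note, illustrative only] icount_time_shift maps instructions
 * to nanoseconds as ns = icount << shift.  With shift = 3 each instruction
 * accounts for 8 ns, i.e. roughly the 125 MIPS initial guess used by
 * configure_icount(); with the maximum shift of 10 (MAX_ICOUNT_SHIFT) each
 * instruction accounts for 1024 ns, i.e. roughly 1 MIPS.
 */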
328
329 static int64_t cpu_get_ticks_locked(void)
330 {
331     int64_t ticks = timers_state.cpu_ticks_offset;
332     if (timers_state.cpu_ticks_enabled) {
333         ticks += cpu_get_host_ticks();
334     }
335
336     if (timers_state.cpu_ticks_prev > ticks) {
337         /* Non-increasing ticks may happen if the host uses software suspend.  */
338         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
339         ticks = timers_state.cpu_ticks_prev;
340     }
341
342     timers_state.cpu_ticks_prev = ticks;
343     return ticks;
344 }
345
346 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
347  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
348  * counter.
349  */
350 int64_t cpu_get_ticks(void)
351 {
352     int64_t ticks;
353
354     if (use_icount) {
355         return cpu_get_icount();
356     }
357
358     qemu_spin_lock(&timers_state.vm_clock_lock);
359     ticks = cpu_get_ticks_locked();
360     qemu_spin_unlock(&timers_state.vm_clock_lock);
361     return ticks;
362 }
363
364 static int64_t cpu_get_clock_locked(void)
365 {
366     int64_t time;
367
368     time = timers_state.cpu_clock_offset;
369     if (timers_state.cpu_ticks_enabled) {
370         time += get_clock();
371     }
372
373     return time;
374 }
375
376 /* Return the monotonic time elapsed in VM, i.e.,
377  * the time between vm_start and vm_stop
378  */
379 int64_t cpu_get_clock(void)
380 {
381     int64_t ti;
382     unsigned start;
383
384     do {
385         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
386         ti = cpu_get_clock_locked();
387     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
388
389     return ti;
390 }
391
392 /* enable cpu_get_ticks()
393  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
394  */
395 void cpu_enable_ticks(void)
396 {
397     seqlock_write_lock(&timers_state.vm_clock_seqlock,
398                        &timers_state.vm_clock_lock);
399     if (!timers_state.cpu_ticks_enabled) {
400         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
401         timers_state.cpu_clock_offset -= get_clock();
402         timers_state.cpu_ticks_enabled = 1;
403     }
404     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
405                        &timers_state.vm_clock_lock);
406 }
407
408 /* disable cpu_get_ticks() : the clock is stopped. You must not call
409  * cpu_get_ticks() after that.
410  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
411  */
412 void cpu_disable_ticks(void)
413 {
414     seqlock_write_lock(&timers_state.vm_clock_seqlock,
415                        &timers_state.vm_clock_lock);
416     if (timers_state.cpu_ticks_enabled) {
417         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
418         timers_state.cpu_clock_offset = cpu_get_clock_locked();
419         timers_state.cpu_ticks_enabled = 0;
420     }
421     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
422                          &timers_state.vm_clock_lock);
423 }
424
425 /* Correlation between real and virtual time is always going to be
426    fairly approximate, so ignore small variation.
427    When the guest is idle real and virtual time will be aligned in
428    the IO wait loop.  */
429 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
430
431 static void icount_adjust(void)
432 {
433     int64_t cur_time;
434     int64_t cur_icount;
435     int64_t delta;
436
437     /* Protected by TimersState mutex.  */
438     static int64_t last_delta;
439
440     /* If the VM is not running, then do nothing.  */
441     if (!runstate_is_running()) {
442         return;
443     }
444
445     seqlock_write_lock(&timers_state.vm_clock_seqlock,
446                        &timers_state.vm_clock_lock);
447     cur_time = cpu_get_clock_locked();
448     cur_icount = cpu_get_icount_locked();
449
450     delta = cur_icount - cur_time;
451     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
452     if (delta > 0
453         && last_delta + ICOUNT_WOBBLE < delta * 2
454         && timers_state.icount_time_shift > 0) {
455         /* The guest is getting too far ahead.  Slow time down.  */
456         atomic_set(&timers_state.icount_time_shift,
457                    timers_state.icount_time_shift - 1);
458     }
459     if (delta < 0
460         && last_delta - ICOUNT_WOBBLE > delta * 2
461         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
462         /* The guest is getting too far behind.  Speed time up.  */
463         atomic_set(&timers_state.icount_time_shift,
464                    timers_state.icount_time_shift + 1);
465     }
466     last_delta = delta;
467     atomic_set_i64(&timers_state.qemu_icount_bias,
468                    cur_icount - (timers_state.qemu_icount
469                                  << timers_state.icount_time_shift));
470     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
471                          &timers_state.vm_clock_lock);
472 }
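/*
 * [Editorial note, illustrative only] icount_adjust() above changes the
 * rate of the virtual clock by nudging icount_time_shift: if virtual time
 * (cur_icount) has run ahead of real time, the shift is decremented so each
 * instruction accounts for fewer nanoseconds; if it lags, the shift is
 * incremented (up to MAX_ICOUNT_SHIFT).  Recomputing qemu_icount_bias as
 * cur_icount - (qemu_icount << shift) keeps the virtual clock continuous
 * at the moment the rate changes.
 */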
473
474 static void icount_adjust_rt(void *opaque)
475 {
476     timer_mod(timers_state.icount_rt_timer,
477               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
478     icount_adjust();
479 }
480
481 static void icount_adjust_vm(void *opaque)
482 {
483     timer_mod(timers_state.icount_vm_timer,
484                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
485                    NANOSECONDS_PER_SECOND / 10);
486     icount_adjust();
487 }
488
489 static int64_t qemu_icount_round(int64_t count)
490 {
491     int shift = atomic_read(&timers_state.icount_time_shift);
492     return (count + (1 << shift) - 1) >> shift;
493 }
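/*
 * [Editorial note, illustrative only] qemu_icount_round() converts a
 * nanosecond deadline into an instruction budget, rounding up.  With
 * shift = 3 (8 ns per instruction) a 100 ns deadline becomes
 * (100 + 7) >> 3 = 13 instructions, which covers at least the full 100 ns.
 */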
494
495 static void icount_warp_rt(void)
496 {
497     unsigned seq;
498     int64_t warp_start;
499
500     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
501      * changes from -1 to another value, so the race here is okay.
502      */
503     do {
504         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
505         warp_start = timers_state.vm_clock_warp_start;
506     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
507
508     if (warp_start == -1) {
509         return;
510     }
511
512     seqlock_write_lock(&timers_state.vm_clock_seqlock,
513                        &timers_state.vm_clock_lock);
514     if (runstate_is_running()) {
515         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
516                                             cpu_get_clock_locked());
517         int64_t warp_delta;
518
519         warp_delta = clock - timers_state.vm_clock_warp_start;
520         if (use_icount == 2) {
521             /*
522              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
523              * far ahead of real time.
524              */
525             int64_t cur_icount = cpu_get_icount_locked();
526             int64_t delta = clock - cur_icount;
527             warp_delta = MIN(warp_delta, delta);
528         }
529         atomic_set_i64(&timers_state.qemu_icount_bias,
530                        timers_state.qemu_icount_bias + warp_delta);
531     }
532     timers_state.vm_clock_warp_start = -1;
533     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
534                        &timers_state.vm_clock_lock);
535
536     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
537         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
538     }
539 }
540
541 static void icount_timer_cb(void *opaque)
542 {
543     /* No need for a checkpoint because the timer already synchronizes
544      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
545      */
546     icount_warp_rt();
547 }
548
549 void qtest_clock_warp(int64_t dest)
550 {
551     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
552     AioContext *aio_context;
553     assert(qtest_enabled());
554     aio_context = qemu_get_aio_context();
555     while (clock < dest) {
556         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
557         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
558
559         seqlock_write_lock(&timers_state.vm_clock_seqlock,
560                            &timers_state.vm_clock_lock);
561         atomic_set_i64(&timers_state.qemu_icount_bias,
562                        timers_state.qemu_icount_bias + warp);
563         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
564                              &timers_state.vm_clock_lock);
565
566         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
567         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
568         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
569     }
570     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
571 }
572
573 void qemu_start_warp_timer(void)
574 {
575     int64_t clock;
576     int64_t deadline;
577
578     if (!use_icount) {
579         return;
580     }
581
582     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
583      * do not fire, so computing the deadline does not make sense.
584      */
585     if (!runstate_is_running()) {
586         return;
587     }
588
589     if (replay_mode != REPLAY_MODE_PLAY) {
590         if (!all_cpu_threads_idle()) {
591             return;
592         }
593
594         if (qtest_enabled()) {
595             /* When testing, qtest commands advance icount.  */
596             return;
597         }
598
599         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
600     } else {
601         /* warp clock deterministically in record/replay mode */
602         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
603             /* vCPU is sleeping and warp can't be started.
604                It is probably a race condition: notification sent
605                to vCPU was processed in advance and vCPU went to sleep.
606                Therefore we have to wake it up for doing someting. */
607             if (replay_has_checkpoint()) {
608                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
609             }
610             return;
611         }
612     }
613
614     /* We want to use the earliest deadline from ALL vm_clocks */
615     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
616     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
617     if (deadline < 0) {
618         static bool notified;
619         if (!icount_sleep && !notified) {
620             warn_report("icount sleep disabled and no active timers");
621             notified = true;
622         }
623         return;
624     }
625
626     if (deadline > 0) {
627         /*
628          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
629          * sleep.  Otherwise, the CPU might be waiting for a future timer
630          * interrupt to wake it up, but the interrupt never comes because
631          * the vCPU isn't running any insns and thus doesn't advance the
632          * QEMU_CLOCK_VIRTUAL.
633          */
634         if (!icount_sleep) {
635             /*
636              * We never let VCPUs sleep in no-sleep icount mode.
637              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
638              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
639              * It is useful when we want a deterministic execution time,
640              * isolated from host latencies.
641              */
642             seqlock_write_lock(&timers_state.vm_clock_seqlock,
643                                &timers_state.vm_clock_lock);
644             atomic_set_i64(&timers_state.qemu_icount_bias,
645                            timers_state.qemu_icount_bias + deadline);
646             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
647                                  &timers_state.vm_clock_lock);
648             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
649         } else {
650             /*
651              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
652              * "real" time, (related to the time left until the next event) has
653              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
654              * This keeps the warps from being visible externally; for example,
655              * the guest will not be sending network packets continuously instead
656              * of every 100ms.
657              */
658             seqlock_write_lock(&timers_state.vm_clock_seqlock,
659                                &timers_state.vm_clock_lock);
660             if (timers_state.vm_clock_warp_start == -1
661                 || timers_state.vm_clock_warp_start > clock) {
662                 timers_state.vm_clock_warp_start = clock;
663             }
664             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
665                                  &timers_state.vm_clock_lock);
666             timer_mod_anticipate(timers_state.icount_warp_timer,
667                                  clock + deadline);
668         }
669     } else if (deadline == 0) {
670         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
671     }
672 }
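/*
 * [Editorial note, illustrative only] Sketch of the two warp strategies
 * above: with icount_sleep disabled and, say, a 5 ms deadline pending,
 * qemu_icount_bias is bumped by 5 ms immediately so QEMU_CLOCK_VIRTUAL
 * reaches the next event without executing guest instructions.  With
 * icount_sleep enabled, icount_warp_timer is armed at clock + deadline on
 * QEMU_CLOCK_VIRTUAL_RT instead, and the warp is accounted later in
 * icount_warp_rt(), so externally visible timing stays realistic.
 */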
673
674 static void qemu_account_warp_timer(void)
675 {
676     if (!use_icount || !icount_sleep) {
677         return;
678     }
679
680     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
681      * do not fire, so computing the deadline does not make sense.
682      */
683     if (!runstate_is_running()) {
684         return;
685     }
686
687     /* warp clock deterministically in record/replay mode */
688     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
689         return;
690     }
691
692     timer_del(timers_state.icount_warp_timer);
693     icount_warp_rt();
694 }
695
696 static bool icount_state_needed(void *opaque)
697 {
698     return use_icount;
699 }
700
701 static bool warp_timer_state_needed(void *opaque)
702 {
703     TimersState *s = opaque;
704     return s->icount_warp_timer != NULL;
705 }
706
707 static bool adjust_timers_state_needed(void *opaque)
708 {
709     TimersState *s = opaque;
710     return s->icount_rt_timer != NULL;
711 }
712
713 /*
714  * The subsection for warp timer migration is optional because the timer may not be created.
715  */
716 static const VMStateDescription icount_vmstate_warp_timer = {
717     .name = "timer/icount/warp_timer",
718     .version_id = 1,
719     .minimum_version_id = 1,
720     .needed = warp_timer_state_needed,
721     .fields = (VMStateField[]) {
722         VMSTATE_INT64(vm_clock_warp_start, TimersState),
723         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
724         VMSTATE_END_OF_LIST()
725     }
726 };
727
728 static const VMStateDescription icount_vmstate_adjust_timers = {
729     .name = "timer/icount/timers",
730     .version_id = 1,
731     .minimum_version_id = 1,
732     .needed = adjust_timers_state_needed,
733     .fields = (VMStateField[]) {
734         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
735         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
736         VMSTATE_END_OF_LIST()
737     }
738 };
739
740 /*
741  * This is a subsection for icount migration.
742  */
743 static const VMStateDescription icount_vmstate_timers = {
744     .name = "timer/icount",
745     .version_id = 1,
746     .minimum_version_id = 1,
747     .needed = icount_state_needed,
748     .fields = (VMStateField[]) {
749         VMSTATE_INT64(qemu_icount_bias, TimersState),
750         VMSTATE_INT64(qemu_icount, TimersState),
751         VMSTATE_END_OF_LIST()
752     },
753     .subsections = (const VMStateDescription*[]) {
754         &icount_vmstate_warp_timer,
755         &icount_vmstate_adjust_timers,
756         NULL
757     }
758 };
759
760 static const VMStateDescription vmstate_timers = {
761     .name = "timer",
762     .version_id = 2,
763     .minimum_version_id = 1,
764     .fields = (VMStateField[]) {
765         VMSTATE_INT64(cpu_ticks_offset, TimersState),
766         VMSTATE_UNUSED(8),
767         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
768         VMSTATE_END_OF_LIST()
769     },
770     .subsections = (const VMStateDescription*[]) {
771         &icount_vmstate_timers,
772         NULL
773     }
774 };
775
776 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
777 {
778     double pct;
779     double throttle_ratio;
780     long sleeptime_ns;
781
782     if (!cpu_throttle_get_percentage()) {
783         return;
784     }
785
786     pct = (double)cpu_throttle_get_percentage()/100;
787     throttle_ratio = pct / (1 - pct);
788     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
789
790     qemu_mutex_unlock_iothread();
791     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
792     qemu_mutex_lock_iothread();
793     atomic_set(&cpu->throttle_thread_scheduled, 0);
794 }
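/*
 * [Editorial note, illustrative only] Worked example of the sleep time
 * computed above: at a 50% throttle, pct = 0.5, throttle_ratio = 1 and the
 * vCPU sleeps 10 ms for every 10 ms timeslice it runs; at the 99% maximum,
 * throttle_ratio = 99 and the vCPU sleeps roughly 990 ms per 10 ms of run
 * time.
 */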
795
796 static void cpu_throttle_timer_tick(void *opaque)
797 {
798     CPUState *cpu;
799     double pct;
800
801     /* Stop the timer if needed */
802     if (!cpu_throttle_get_percentage()) {
803         return;
804     }
805     CPU_FOREACH(cpu) {
806         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
807             async_run_on_cpu(cpu, cpu_throttle_thread,
808                              RUN_ON_CPU_NULL);
809         }
810     }
811
812     pct = (double)cpu_throttle_get_percentage()/100;
813     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
814                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
815 }
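/*
 * [Editorial note, illustrative only] The rescheduling above stretches the
 * tick period to CPU_THROTTLE_TIMESLICE_NS / (1 - pct): 20 ms at a 50%
 * throttle and about 1 s at 99%, so the 10 ms run slice plus the sleep
 * injected by cpu_throttle_thread() add up to one full period.
 */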
816
817 void cpu_throttle_set(int new_throttle_pct)
818 {
819     /* Ensure throttle percentage is within valid range */
820     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
821     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
822
823     atomic_set(&throttle_percentage, new_throttle_pct);
824
825     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
826                                        CPU_THROTTLE_TIMESLICE_NS);
827 }
828
829 void cpu_throttle_stop(void)
830 {
831     atomic_set(&throttle_percentage, 0);
832 }
833
834 bool cpu_throttle_active(void)
835 {
836     return (cpu_throttle_get_percentage() != 0);
837 }
838
839 int cpu_throttle_get_percentage(void)
840 {
841     return atomic_read(&throttle_percentage);
842 }
843
844 void cpu_ticks_init(void)
845 {
846     seqlock_init(&timers_state.vm_clock_seqlock);
847     qemu_spin_init(&timers_state.vm_clock_lock);
848     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
849     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
850                                            cpu_throttle_timer_tick, NULL);
851 }
852
853 void configure_icount(QemuOpts *opts, Error **errp)
854 {
855     const char *option;
856     char *rem_str = NULL;
857
858     option = qemu_opt_get(opts, "shift");
859     if (!option) {
860         if (qemu_opt_get(opts, "align") != NULL) {
861             error_setg(errp, "Please specify shift option when using align");
862         }
863         return;
864     }
865
866     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
867     if (icount_sleep) {
868         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
869                                          icount_timer_cb, NULL);
870     }
871
872     icount_align_option = qemu_opt_get_bool(opts, "align", false);
873
874     if (icount_align_option && !icount_sleep) {
875         error_setg(errp, "align=on and sleep=off are incompatible");
876     }
877     if (strcmp(option, "auto") != 0) {
878         errno = 0;
879         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
880         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
881             error_setg(errp, "icount: Invalid shift value");
882         }
883         use_icount = 1;
884         return;
885     } else if (icount_align_option) {
886         error_setg(errp, "shift=auto and align=on are incompatible");
887     } else if (!icount_sleep) {
888         error_setg(errp, "shift=auto and sleep=off are incompatible");
889     }
890
891     use_icount = 2;
892
893     /* 125MIPS seems a reasonable initial guess at the guest speed.
894        It will be corrected fairly quickly anyway.  */
895     timers_state.icount_time_shift = 3;
896
897     /* Have both realtime and virtual time triggers for speed adjustment.
898        The realtime trigger catches emulated time passing too slowly,
899        the virtual time trigger catches emulated time passing too fast.
900        Realtime triggers occur even when idle, so use them less frequently
901        than VM triggers.  */
902     timers_state.vm_clock_warp_start = -1;
903     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
904                                    icount_adjust_rt, NULL);
905     timer_mod(timers_state.icount_rt_timer,
906                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
907     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
908                                         icount_adjust_vm, NULL);
909     timer_mod(timers_state.icount_vm_timer,
910                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
911                    NANOSECONDS_PER_SECOND / 10);
912 }
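/*
 * [Editorial note, illustrative only] Typical command lines that reach the
 * paths above (syntax of the -icount option; double-check against your
 * QEMU version): "-icount shift=7" selects a fixed rate of 128 ns per
 * instruction (use_icount == 1), while "-icount shift=auto,sleep=on"
 * selects the adaptive mode (use_icount == 2) with both adjustment timers.
 * align=on additionally requires a fixed shift and sleep=on.
 */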
913
914 /***********************************************************/
915 /* TCG vCPU kick timer
916  *
917  * The kick timer is responsible for moving single threaded vCPU
918  * emulation on to the next vCPU. If more than one vCPU is running a
919  * timer event with force a cpu->exit so the next vCPU can get
920  * scheduled.
921  *
922  * The timer is removed if all vCPUs are idle and restarted once they
923  * have work to do again.
924  */
925
926 static QEMUTimer *tcg_kick_vcpu_timer;
927 static CPUState *tcg_current_rr_cpu;
928
929 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
930
931 static inline int64_t qemu_tcg_next_kick(void)
932 {
933     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
934 }
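/*
 * [Editorial note, illustrative only] With TCG_KICK_PERIOD of 100 ms the
 * kick timer fires ten times per second; each expiry calls
 * qemu_cpu_kick_rr_cpu(), which cpu_exit()s whichever vCPU is currently
 * recorded in tcg_current_rr_cpu so the round-robin loop can move on.
 */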
935
936 /* Kick the currently round-robin scheduled vCPU */
937 static void qemu_cpu_kick_rr_cpu(void)
938 {
939     CPUState *cpu;
940     do {
941         cpu = atomic_mb_read(&tcg_current_rr_cpu);
942         if (cpu) {
943             cpu_exit(cpu);
944         }
945     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
946 }
947
948 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
949 {
950 }
951
952 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
953 {
954     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
955         qemu_notify_event();
956         return;
957     }
958
959     if (qemu_in_vcpu_thread()) {
960         /* A CPU is currently running; kick it back out to the
961          * tcg_cpu_exec() loop so it will recalculate its
962          * icount deadline immediately.
963          */
964         qemu_cpu_kick(current_cpu);
965     } else if (first_cpu) {
966         /* qemu_cpu_kick is not enough to kick a halted CPU out of
967          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
968          * causes cpu_thread_is_idle to return false.  This way,
969          * handle_icount_deadline can run.
970          * If we have no CPUs at all for some reason, we don't
971          * need to do anything.
972          */
973         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
974     }
975 }
976
977 static void kick_tcg_thread(void *opaque)
978 {
979     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
980     qemu_cpu_kick_rr_cpu();
981 }
982
983 static void start_tcg_kick_timer(void)
984 {
985     assert(!mttcg_enabled);
986     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
987         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
988                                            kick_tcg_thread, NULL);
989     }
990     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
991         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
992     }
993 }
994
995 static void stop_tcg_kick_timer(void)
996 {
997     assert(!mttcg_enabled);
998     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
999         timer_del(tcg_kick_vcpu_timer);
1000     }
1001 }
1002
1003 /***********************************************************/
1004 void hw_error(const char *fmt, ...)
1005 {
1006     va_list ap;
1007     CPUState *cpu;
1008
1009     va_start(ap, fmt);
1010     fprintf(stderr, "qemu: hardware error: ");
1011     vfprintf(stderr, fmt, ap);
1012     fprintf(stderr, "\n");
1013     CPU_FOREACH(cpu) {
1014         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1015         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1016     }
1017     va_end(ap);
1018     abort();
1019 }
1020
1021 void cpu_synchronize_all_states(void)
1022 {
1023     CPUState *cpu;
1024
1025     CPU_FOREACH(cpu) {
1026         cpu_synchronize_state(cpu);
1027         /* TODO: move to cpu_synchronize_state() */
1028         if (hvf_enabled()) {
1029             hvf_cpu_synchronize_state(cpu);
1030         }
1031     }
1032 }
1033
1034 void cpu_synchronize_all_post_reset(void)
1035 {
1036     CPUState *cpu;
1037
1038     CPU_FOREACH(cpu) {
1039         cpu_synchronize_post_reset(cpu);
1040         /* TODO: move to cpu_synchronize_post_reset() */
1041         if (hvf_enabled()) {
1042             hvf_cpu_synchronize_post_reset(cpu);
1043         }
1044     }
1045 }
1046
1047 void cpu_synchronize_all_post_init(void)
1048 {
1049     CPUState *cpu;
1050
1051     CPU_FOREACH(cpu) {
1052         cpu_synchronize_post_init(cpu);
1053         /* TODO: move to cpu_synchronize_post_init() */
1054         if (hvf_enabled()) {
1055             hvf_cpu_synchronize_post_init(cpu);
1056         }
1057     }
1058 }
1059
1060 void cpu_synchronize_all_pre_loadvm(void)
1061 {
1062     CPUState *cpu;
1063
1064     CPU_FOREACH(cpu) {
1065         cpu_synchronize_pre_loadvm(cpu);
1066     }
1067 }
1068
1069 static int do_vm_stop(RunState state, bool send_stop)
1070 {
1071     int ret = 0;
1072
1073     if (runstate_is_running()) {
1074         cpu_disable_ticks();
1075         pause_all_vcpus();
1076         runstate_set(state);
1077         vm_state_notify(0, state);
1078         if (send_stop) {
1079             qapi_event_send_stop();
1080         }
1081     }
1082
1083     bdrv_drain_all();
1084     replay_disable_events();
1085     ret = bdrv_flush_all();
1086
1087     return ret;
1088 }
1089
1090 /* Special vm_stop() variant for terminating the process.  Historically clients
1091  * did not expect a QMP STOP event and so we need to retain compatibility.
1092  */
1093 int vm_shutdown(void)
1094 {
1095     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1096 }
1097
1098 static bool cpu_can_run(CPUState *cpu)
1099 {
1100     if (cpu->stop) {
1101         return false;
1102     }
1103     if (cpu_is_stopped(cpu)) {
1104         return false;
1105     }
1106     return true;
1107 }
1108
1109 static void cpu_handle_guest_debug(CPUState *cpu)
1110 {
1111     gdb_set_stop_cpu(cpu);
1112     qemu_system_debug_request();
1113     cpu->stopped = true;
1114 }
1115
1116 #ifdef CONFIG_LINUX
1117 static void sigbus_reraise(void)
1118 {
1119     sigset_t set;
1120     struct sigaction action;
1121
1122     memset(&action, 0, sizeof(action));
1123     action.sa_handler = SIG_DFL;
1124     if (!sigaction(SIGBUS, &action, NULL)) {
1125         raise(SIGBUS);
1126         sigemptyset(&set);
1127         sigaddset(&set, SIGBUS);
1128         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1129     }
1130     perror("Failed to re-raise SIGBUS!\n");
1131     abort();
1132 }
1133
1134 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1135 {
1136     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1137         sigbus_reraise();
1138     }
1139
1140     if (current_cpu) {
1141         /* Called asynchronously in VCPU thread.  */
1142         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1143             sigbus_reraise();
1144         }
1145     } else {
1146         /* Called synchronously (via signalfd) in main thread.  */
1147         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1148             sigbus_reraise();
1149         }
1150     }
1151 }
1152
1153 static void qemu_init_sigbus(void)
1154 {
1155     struct sigaction action;
1156
1157     memset(&action, 0, sizeof(action));
1158     action.sa_flags = SA_SIGINFO;
1159     action.sa_sigaction = sigbus_handler;
1160     sigaction(SIGBUS, &action, NULL);
1161
1162     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1163 }
1164 #else /* !CONFIG_LINUX */
1165 static void qemu_init_sigbus(void)
1166 {
1167 }
1168 #endif /* !CONFIG_LINUX */
1169
1170 static QemuMutex qemu_global_mutex;
1171
1172 static QemuThread io_thread;
1173
1174 /* cpu creation */
1175 static QemuCond qemu_cpu_cond;
1176 /* system init */
1177 static QemuCond qemu_pause_cond;
1178
1179 void qemu_init_cpu_loop(void)
1180 {
1181     qemu_init_sigbus();
1182     qemu_cond_init(&qemu_cpu_cond);
1183     qemu_cond_init(&qemu_pause_cond);
1184     qemu_mutex_init(&qemu_global_mutex);
1185
1186     qemu_thread_get_self(&io_thread);
1187 }
1188
1189 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1190 {
1191     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1192 }
1193
1194 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1195 {
1196     if (kvm_destroy_vcpu(cpu) < 0) {
1197         error_report("kvm_destroy_vcpu failed");
1198         exit(EXIT_FAILURE);
1199     }
1200 }
1201
1202 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1203 {
1204 }
1205
1206 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1207 {
1208     g_assert(qemu_cpu_is_self(cpu));
1209     cpu->stop = false;
1210     cpu->stopped = true;
1211     if (exit) {
1212         cpu_exit(cpu);
1213     }
1214     qemu_cond_broadcast(&qemu_pause_cond);
1215 }
1216
1217 static void qemu_wait_io_event_common(CPUState *cpu)
1218 {
1219     atomic_mb_set(&cpu->thread_kicked, false);
1220     if (cpu->stop) {
1221         qemu_cpu_stop(cpu, false);
1222     }
1223     process_queued_cpu_work(cpu);
1224 }
1225
1226 static void qemu_tcg_rr_wait_io_event(void)
1227 {
1228     CPUState *cpu;
1229
1230     while (all_cpu_threads_idle()) {
1231         stop_tcg_kick_timer();
1232         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1233     }
1234
1235     start_tcg_kick_timer();
1236
1237     CPU_FOREACH(cpu) {
1238         qemu_wait_io_event_common(cpu);
1239     }
1240 }
1241
1242 static void qemu_wait_io_event(CPUState *cpu)
1243 {
1244     while (cpu_thread_is_idle(cpu)) {
1245         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1246     }
1247
1248 #ifdef _WIN32
1249     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1250     if (!tcg_enabled()) {
1251         SleepEx(0, TRUE);
1252     }
1253 #endif
1254     qemu_wait_io_event_common(cpu);
1255 }
1256
1257 static void *qemu_kvm_cpu_thread_fn(void *arg)
1258 {
1259     CPUState *cpu = arg;
1260     int r;
1261
1262     rcu_register_thread();
1263
1264     qemu_mutex_lock_iothread();
1265     qemu_thread_get_self(cpu->thread);
1266     cpu->thread_id = qemu_get_thread_id();
1267     cpu->can_do_io = 1;
1268     current_cpu = cpu;
1269
1270     r = kvm_init_vcpu(cpu);
1271     if (r < 0) {
1272         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1273         exit(1);
1274     }
1275
1276     kvm_init_cpu_signals(cpu);
1277
1278     /* signal CPU creation */
1279     cpu->created = true;
1280     qemu_cond_signal(&qemu_cpu_cond);
1281     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1282
1283     do {
1284         if (cpu_can_run(cpu)) {
1285             r = kvm_cpu_exec(cpu);
1286             if (r == EXCP_DEBUG) {
1287                 cpu_handle_guest_debug(cpu);
1288             }
1289         }
1290         qemu_wait_io_event(cpu);
1291     } while (!cpu->unplug || cpu_can_run(cpu));
1292
1293     qemu_kvm_destroy_vcpu(cpu);
1294     cpu->created = false;
1295     qemu_cond_signal(&qemu_cpu_cond);
1296     qemu_mutex_unlock_iothread();
1297     rcu_unregister_thread();
1298     return NULL;
1299 }
1300
1301 static void *qemu_dummy_cpu_thread_fn(void *arg)
1302 {
1303 #ifdef _WIN32
1304     error_report("qtest is not supported under Windows");
1305     exit(1);
1306 #else
1307     CPUState *cpu = arg;
1308     sigset_t waitset;
1309     int r;
1310
1311     rcu_register_thread();
1312
1313     qemu_mutex_lock_iothread();
1314     qemu_thread_get_self(cpu->thread);
1315     cpu->thread_id = qemu_get_thread_id();
1316     cpu->can_do_io = 1;
1317     current_cpu = cpu;
1318
1319     sigemptyset(&waitset);
1320     sigaddset(&waitset, SIG_IPI);
1321
1322     /* signal CPU creation */
1323     cpu->created = true;
1324     qemu_cond_signal(&qemu_cpu_cond);
1325     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1326
1327     do {
1328         qemu_mutex_unlock_iothread();
1329         do {
1330             int sig;
1331             r = sigwait(&waitset, &sig);
1332         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1333         if (r == -1) {
1334             perror("sigwait");
1335             exit(1);
1336         }
1337         qemu_mutex_lock_iothread();
1338         qemu_wait_io_event(cpu);
1339     } while (!cpu->unplug);
1340
1341     qemu_mutex_unlock_iothread();
1342     rcu_unregister_thread();
1343     return NULL;
1344 #endif
1345 }
1346
1347 static int64_t tcg_get_icount_limit(void)
1348 {
1349     int64_t deadline;
1350
1351     if (replay_mode != REPLAY_MODE_PLAY) {
1352         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1353
1354         /* Maintain prior (possibly buggy) behaviour where if no deadline
1355          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1356          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1357          * nanoseconds.
1358          */
1359         if ((deadline < 0) || (deadline > INT32_MAX)) {
1360             deadline = INT32_MAX;
1361         }
1362
1363         return qemu_icount_round(deadline);
1364     } else {
1365         return replay_get_instructions();
1366     }
1367 }
1368
1369 static void handle_icount_deadline(void)
1370 {
1371     assert(qemu_in_vcpu_thread());
1372     if (use_icount) {
1373         int64_t deadline =
1374             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1375
1376         if (deadline == 0) {
1377             /* Wake up other AioContexts.  */
1378             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1379             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1380         }
1381     }
1382 }
1383
1384 static void prepare_icount_for_run(CPUState *cpu)
1385 {
1386     if (use_icount) {
1387         int insns_left;
1388
1389         /* These should always be cleared by process_icount_data after
1390          * each vCPU execution. However u16.high can be raised
1391          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1392          */
1393         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1394         g_assert(cpu->icount_extra == 0);
1395
1396         cpu->icount_budget = tcg_get_icount_limit();
1397         insns_left = MIN(0xffff, cpu->icount_budget);
1398         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1399         cpu->icount_extra = cpu->icount_budget - insns_left;
1400
1401         replay_mutex_lock();
1402     }
1403 }
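/*
 * [Editorial note, illustrative only] Example of the budget split above:
 * with icount_budget = 200000, insns_left = MIN(0xffff, 200000) = 65535
 * goes into the 16-bit decrementer (icount_decr.u16.low) and the remaining
 * 134465 instructions are held in icount_extra, from which the decrementer
 * is refilled as execution proceeds.
 */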
1404
1405 static void process_icount_data(CPUState *cpu)
1406 {
1407     if (use_icount) {
1408         /* Account for executed instructions */
1409         cpu_update_icount(cpu);
1410
1411         /* Reset the counters */
1412         cpu_neg(cpu)->icount_decr.u16.low = 0;
1413         cpu->icount_extra = 0;
1414         cpu->icount_budget = 0;
1415
1416         replay_account_executed_instructions();
1417
1418         replay_mutex_unlock();
1419     }
1420 }
1421
1422
1423 static int tcg_cpu_exec(CPUState *cpu)
1424 {
1425     int ret;
1426 #ifdef CONFIG_PROFILER
1427     int64_t ti;
1428 #endif
1429
1430     assert(tcg_enabled());
1431 #ifdef CONFIG_PROFILER
1432     ti = profile_getclock();
1433 #endif
1434     cpu_exec_start(cpu);
1435     ret = cpu_exec(cpu);
1436     cpu_exec_end(cpu);
1437 #ifdef CONFIG_PROFILER
1438     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1439                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1440 #endif
1441     return ret;
1442 }
1443
1444 /* Destroy any remaining vCPUs which have been unplugged and have
1445  * finished running
1446  */
1447 static void deal_with_unplugged_cpus(void)
1448 {
1449     CPUState *cpu;
1450
1451     CPU_FOREACH(cpu) {
1452         if (cpu->unplug && !cpu_can_run(cpu)) {
1453             qemu_tcg_destroy_vcpu(cpu);
1454             cpu->created = false;
1455             qemu_cond_signal(&qemu_cpu_cond);
1456             break;
1457         }
1458     }
1459 }
1460
1461 /* Single-threaded TCG
1462  *
1463  * In the single-threaded case each vCPU is simulated in turn. If
1464  * there is more than a single vCPU we create a simple timer to kick
1465  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1466  * This is done explicitly rather than relying on side-effects
1467  * elsewhere.
1468  */
1469
1470 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1471 {
1472     CPUState *cpu = arg;
1473
1474     assert(tcg_enabled());
1475     rcu_register_thread();
1476     tcg_register_thread();
1477
1478     qemu_mutex_lock_iothread();
1479     qemu_thread_get_self(cpu->thread);
1480
1481     cpu->thread_id = qemu_get_thread_id();
1482     cpu->created = true;
1483     cpu->can_do_io = 1;
1484     qemu_cond_signal(&qemu_cpu_cond);
1485     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1486
1487     /* wait for initial kick-off after machine start */
1488     while (first_cpu->stopped) {
1489         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1490
1491         /* process any pending work */
1492         CPU_FOREACH(cpu) {
1493             current_cpu = cpu;
1494             qemu_wait_io_event_common(cpu);
1495         }
1496     }
1497
1498     start_tcg_kick_timer();
1499
1500     cpu = first_cpu;
1501
1502     /* process any pending work */
1503     cpu->exit_request = 1;
1504
1505     while (1) {
1506         qemu_mutex_unlock_iothread();
1507         replay_mutex_lock();
1508         qemu_mutex_lock_iothread();
1509         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1510         qemu_account_warp_timer();
1511
1512         /* Run the timers here.  This is much more efficient than
1513          * waking up the I/O thread and waiting for completion.
1514          */
1515         handle_icount_deadline();
1516
1517         replay_mutex_unlock();
1518
1519         if (!cpu) {
1520             cpu = first_cpu;
1521         }
1522
1523         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1524
1525             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1526             current_cpu = cpu;
1527
1528             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1529                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1530
1531             if (cpu_can_run(cpu)) {
1532                 int r;
1533
1534                 qemu_mutex_unlock_iothread();
1535                 prepare_icount_for_run(cpu);
1536
1537                 r = tcg_cpu_exec(cpu);
1538
1539                 process_icount_data(cpu);
1540                 qemu_mutex_lock_iothread();
1541
1542                 if (r == EXCP_DEBUG) {
1543                     cpu_handle_guest_debug(cpu);
1544                     break;
1545                 } else if (r == EXCP_ATOMIC) {
1546                     qemu_mutex_unlock_iothread();
1547                     cpu_exec_step_atomic(cpu);
1548                     qemu_mutex_lock_iothread();
1549                     break;
1550                 }
1551             } else if (cpu->stop) {
1552                 if (cpu->unplug) {
1553                     cpu = CPU_NEXT(cpu);
1554                 }
1555                 break;
1556             }
1557
1558             cpu = CPU_NEXT(cpu);
1559         } /* while (cpu && !cpu->exit_request).. */
1560
1561         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1562         atomic_set(&tcg_current_rr_cpu, NULL);
1563
1564         if (cpu && cpu->exit_request) {
1565             atomic_mb_set(&cpu->exit_request, 0);
1566         }
1567
1568         if (use_icount && all_cpu_threads_idle()) {
1569             /*
1570              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1571              * in the main_loop, wake it up in order to start the warp timer.
1572              */
1573             qemu_notify_event();
1574         }
1575
1576         qemu_tcg_rr_wait_io_event();
1577         deal_with_unplugged_cpus();
1578     }
1579
1580     rcu_unregister_thread();
1581     return NULL;
1582 }
1583
1584 static void *qemu_hax_cpu_thread_fn(void *arg)
1585 {
1586     CPUState *cpu = arg;
1587     int r;
1588
1589     rcu_register_thread();
1590     qemu_mutex_lock_iothread();
1591     qemu_thread_get_self(cpu->thread);
1592
1593     cpu->thread_id = qemu_get_thread_id();
1594     cpu->created = true;
1595     current_cpu = cpu;
1596
1597     hax_init_vcpu(cpu);
1598     qemu_cond_signal(&qemu_cpu_cond);
1599     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1600
1601     do {
1602         if (cpu_can_run(cpu)) {
1603             r = hax_smp_cpu_exec(cpu);
1604             if (r == EXCP_DEBUG) {
1605                 cpu_handle_guest_debug(cpu);
1606             }
1607         }
1608
1609         qemu_wait_io_event(cpu);
1610     } while (!cpu->unplug || cpu_can_run(cpu));
1611     rcu_unregister_thread();
1612     return NULL;
1613 }
1614
1615 /* The HVF-specific vCPU thread function. This one should only run when the host
1616  * CPU supports the VMX "unrestricted guest" feature. */
1617 static void *qemu_hvf_cpu_thread_fn(void *arg)
1618 {
1619     CPUState *cpu = arg;
1620
1621     int r;
1622
1623     assert(hvf_enabled());
1624
1625     rcu_register_thread();
1626
1627     qemu_mutex_lock_iothread();
1628     qemu_thread_get_self(cpu->thread);
1629
1630     cpu->thread_id = qemu_get_thread_id();
1631     cpu->can_do_io = 1;
1632     current_cpu = cpu;
1633
1634     hvf_init_vcpu(cpu);
1635
1636     /* signal CPU creation */
1637     cpu->created = true;
1638     qemu_cond_signal(&qemu_cpu_cond);
1639     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1640
1641     do {
1642         if (cpu_can_run(cpu)) {
1643             r = hvf_vcpu_exec(cpu);
1644             if (r == EXCP_DEBUG) {
1645                 cpu_handle_guest_debug(cpu);
1646             }
1647         }
1648         qemu_wait_io_event(cpu);
1649     } while (!cpu->unplug || cpu_can_run(cpu));
1650
1651     hvf_vcpu_destroy(cpu);
1652     cpu->created = false;
1653     qemu_cond_signal(&qemu_cpu_cond);
1654     qemu_mutex_unlock_iothread();
1655     rcu_unregister_thread();
1656     return NULL;
1657 }
1658
1659 static void *qemu_whpx_cpu_thread_fn(void *arg)
1660 {
1661     CPUState *cpu = arg;
1662     int r;
1663
1664     rcu_register_thread();
1665
1666     qemu_mutex_lock_iothread();
1667     qemu_thread_get_self(cpu->thread);
1668     cpu->thread_id = qemu_get_thread_id();
1669     current_cpu = cpu;
1670
1671     r = whpx_init_vcpu(cpu);
1672     if (r < 0) {
1673         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1674         exit(1);
1675     }
1676
1677     /* signal CPU creation */
1678     cpu->created = true;
1679     qemu_cond_signal(&qemu_cpu_cond);
1680     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1681
1682     do {
1683         if (cpu_can_run(cpu)) {
1684             r = whpx_vcpu_exec(cpu);
1685             if (r == EXCP_DEBUG) {
1686                 cpu_handle_guest_debug(cpu);
1687             }
1688         }
1689         while (cpu_thread_is_idle(cpu)) {
1690             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1691         }
1692         qemu_wait_io_event_common(cpu);
1693     } while (!cpu->unplug || cpu_can_run(cpu));
1694
1695     whpx_destroy_vcpu(cpu);
1696     cpu->created = false;
1697     qemu_cond_signal(&qemu_cpu_cond);
1698     qemu_mutex_unlock_iothread();
1699     rcu_unregister_thread();
1700     return NULL;
1701 }
1702
1703 #ifdef _WIN32
1704 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1705 {
1706 }
1707 #endif
1708
1709 /* Multi-threaded TCG
1710  *
1711  * In the multi-threaded case each vCPU has its own thread. The TLS
1712  * variable current_cpu can be used deep in the code to find the
1713  * current CPUState for a given thread.
1714  */
1715
1716 static void *qemu_tcg_cpu_thread_fn(void *arg)
1717 {
1718     CPUState *cpu = arg;
1719
1720     assert(tcg_enabled());
1721     g_assert(!use_icount);
1722
1723     rcu_register_thread();
1724     tcg_register_thread();
1725
1726     qemu_mutex_lock_iothread();
1727     qemu_thread_get_self(cpu->thread);
1728
1729     cpu->thread_id = qemu_get_thread_id();
1730     cpu->created = true;
1731     cpu->can_do_io = 1;
1732     current_cpu = cpu;
1733     qemu_cond_signal(&qemu_cpu_cond);
1734     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1735
1736     /* process any pending work */
1737     cpu->exit_request = 1;
1738
1739     do {
1740         if (cpu_can_run(cpu)) {
1741             int r;
1742             qemu_mutex_unlock_iothread();
1743             r = tcg_cpu_exec(cpu);
1744             qemu_mutex_lock_iothread();
1745             switch (r) {
1746             case EXCP_DEBUG:
1747                 cpu_handle_guest_debug(cpu);
1748                 break;
1749             case EXCP_HALTED:
1750                 /* during start-up the vCPU is reset and the thread is
1751                  * kicked several times. If we don't ensure we go back
1752                  * to sleep in the halted state we won't cleanly
1753                  * start up when the vCPU is enabled.
1754                  *
1755                  * cpu->halted should ensure we sleep in wait_io_event
1756                  */
1757                 g_assert(cpu->halted);
1758                 break;
1759             case EXCP_ATOMIC:
1760                 qemu_mutex_unlock_iothread();
1761                 cpu_exec_step_atomic(cpu);
1762                 qemu_mutex_lock_iothread(); /* fall through */
1763             default:
1764                 /* Ignore everything else? */
1765                 break;
1766             }
1767         }
1768
1769         atomic_mb_set(&cpu->exit_request, 0);
1770         qemu_wait_io_event(cpu);
1771     } while (!cpu->unplug || cpu_can_run(cpu));
1772
1773     qemu_tcg_destroy_vcpu(cpu);
1774     cpu->created = false;
1775     qemu_cond_signal(&qemu_cpu_cond);
1776     qemu_mutex_unlock_iothread();
1777     rcu_unregister_thread();
1778     return NULL;
1779 }
1780
1781 static void qemu_cpu_kick_thread(CPUState *cpu)
1782 {
1783 #ifndef _WIN32
1784     int err;
1785
1786     if (cpu->thread_kicked) {
1787         return;
1788     }
1789     cpu->thread_kicked = true;
1790     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1791     if (err && err != ESRCH) {
1792         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1793         exit(1);
1794     }
1795 #else /* _WIN32 */
1796     if (!qemu_cpu_is_self(cpu)) {
1797         if (whpx_enabled()) {
1798             whpx_vcpu_kick(cpu);
1799         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1800             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1801                     __func__, GetLastError());
1802             exit(1);
1803         }
1804     }
1805 #endif
1806 }
1807
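/*
 * A sketch of the usual caller pattern (pause_all_vcpus() below is a real
 * instance): first set the condition the vCPU must notice, then kick it so
 * a vCPU blocked in the hypervisor or sleeping on halt_cond re-evaluates it:
 *
 *     cpu->stop = true;
 *     qemu_cpu_kick(cpu);
 */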
1808 void qemu_cpu_kick(CPUState *cpu)
1809 {
1810     qemu_cond_broadcast(cpu->halt_cond);
1811     if (tcg_enabled()) {
1812         cpu_exit(cpu);
1813         /* NOP unless doing single-thread RR */
1814         qemu_cpu_kick_rr_cpu();
1815     } else {
1816         if (hax_enabled()) {
1817             /*
1818              * FIXME: race condition with the exit_request check in
1819              * hax_vcpu_hax_exec
1820              */
1821             cpu->exit_request = 1;
1822         }
1823         qemu_cpu_kick_thread(cpu);
1824     }
1825 }
1826
1827 void qemu_cpu_kick_self(void)
1828 {
1829     assert(current_cpu);
1830     qemu_cpu_kick_thread(current_cpu);
1831 }
1832
1833 bool qemu_cpu_is_self(CPUState *cpu)
1834 {
1835     return qemu_thread_is_self(cpu->thread);
1836 }
1837
1838 bool qemu_in_vcpu_thread(void)
1839 {
1840     return current_cpu && qemu_cpu_is_self(current_cpu);
1841 }
1842
1843 static __thread bool iothread_locked = false;
1844
1845 bool qemu_mutex_iothread_locked(void)
1846 {
1847     return iothread_locked;
1848 }
1849
1850 /*
1851  * The BQL is taken from so many places that it is worth profiling the
1852  * callers directly, instead of funneling them all through a single function.
1853  */
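/*
 * Callers normally do not reach this function by name: the public
 * qemu_mutex_lock_iothread() entry point is expected to be a thin wrapper
 * macro, along these lines, so every call site is profiled with its own
 * file and line:
 *
 *     #define qemu_mutex_lock_iothread() \
 *         qemu_mutex_lock_iothread_impl(__FILE__, __LINE__)
 */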
1854 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1855 {
1856     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1857
1858     g_assert(!qemu_mutex_iothread_locked());
1859     bql_lock(&qemu_global_mutex, file, line);
1860     iothread_locked = true;
1861 }
1862
1863 void qemu_mutex_unlock_iothread(void)
1864 {
1865     g_assert(qemu_mutex_iothread_locked());
1866     iothread_locked = false;
1867     qemu_mutex_unlock(&qemu_global_mutex);
1868 }
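/*
 * Sketch of the usual pairing of the two functions above around long-running
 * or blocking work; the TCG vCPU loop earlier in this file does exactly this
 * around tcg_cpu_exec():
 *
 *     qemu_mutex_unlock_iothread();
 *     ... run guest code or block in the hypervisor ...
 *     qemu_mutex_lock_iothread();
 */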
1869
1870 static bool all_vcpus_paused(void)
1871 {
1872     CPUState *cpu;
1873
1874     CPU_FOREACH(cpu) {
1875         if (!cpu->stopped) {
1876             return false;
1877         }
1878     }
1879
1880     return true;
1881 }
1882
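/*
 * Ask every vCPU to stop and wait until all of them have parked themselves.
 * The qemu_cond_wait() below relies on the caller holding the BQL
 * (qemu_global_mutex); the lock is dropped only briefly at the end so the
 * replay lock can be reacquired in the right order.
 */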
1883 void pause_all_vcpus(void)
1884 {
1885     CPUState *cpu;
1886
1887     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1888     CPU_FOREACH(cpu) {
1889         if (qemu_cpu_is_self(cpu)) {
1890             qemu_cpu_stop(cpu, true);
1891         } else {
1892             cpu->stop = true;
1893             qemu_cpu_kick(cpu);
1894         }
1895     }
1896
1897     /* We need to drop the replay_lock so any vCPU threads woken up
1898      * can finish their replay tasks
1899      */
1900     replay_mutex_unlock();
1901
1902     while (!all_vcpus_paused()) {
1903         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1904         CPU_FOREACH(cpu) {
1905             qemu_cpu_kick(cpu);
1906         }
1907     }
1908
1909     qemu_mutex_unlock_iothread();
1910     replay_mutex_lock();
1911     qemu_mutex_lock_iothread();
1912 }
1913
1914 void cpu_resume(CPUState *cpu)
1915 {
1916     cpu->stop = false;
1917     cpu->stopped = false;
1918     qemu_cpu_kick(cpu);
1919 }
1920
1921 void resume_all_vcpus(void)
1922 {
1923     CPUState *cpu;
1924
1925     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1926     CPU_FOREACH(cpu) {
1927         cpu_resume(cpu);
1928     }
1929 }
1930
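/*
 * Synchronous vCPU removal for hot-unplug: the unplug flag makes the
 * per-accelerator thread loops above exit once the vCPU can no longer run,
 * and the BQL is dropped around the join so the exiting thread is free to
 * take it while it tears itself down.
 */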
1931 void cpu_remove_sync(CPUState *cpu)
1932 {
1933     cpu->stop = true;
1934     cpu->unplug = true;
1935     qemu_cpu_kick(cpu);
1936     qemu_mutex_unlock_iothread();
1937     qemu_thread_join(cpu->thread);
1938     qemu_mutex_lock_iothread();
1939 }
1940
1941 /* Size of the temporary buffers used to form a vCPU thread name */
1942 #define VCPU_THREAD_NAME_SIZE 16
1943
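/*
 * Whether each vCPU gets its own thread (MTTCG) or all vCPUs share one
 * round-robin thread is reported by qemu_tcg_mttcg_enabled(); it is usually
 * chosen on the command line, e.g. (illustrative invocations):
 *
 *     qemu-system-x86_64 -accel tcg,thread=multi ...
 *     qemu-system-x86_64 -accel tcg,thread=single ...
 */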
1944 static void qemu_tcg_init_vcpu(CPUState *cpu)
1945 {
1946     char thread_name[VCPU_THREAD_NAME_SIZE];
1947     static QemuCond *single_tcg_halt_cond;
1948     static QemuThread *single_tcg_cpu_thread;
1949     static int tcg_region_inited;
1950
1951     assert(tcg_enabled());
1952     /*
1953      * Initialize TCG regions--once. Now is a good time, because:
1954      * (1) TCG's init context, prologue and target globals have been set up.
1955      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1956      *     -accel flag is processed, so the check doesn't work then).
1957      */
1958     if (!tcg_region_inited) {
1959         tcg_region_inited = 1;
1960         tcg_region_init();
1961     }
1962
1963     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1964         cpu->thread = g_malloc0(sizeof(QemuThread));
1965         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1966         qemu_cond_init(cpu->halt_cond);
1967
1968         if (qemu_tcg_mttcg_enabled()) {
1969             /* create a thread per vCPU with TCG (MTTCG) */
1970             parallel_cpus = true;
1971             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1972                  cpu->cpu_index);
1973
1974             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1975                                cpu, QEMU_THREAD_JOINABLE);
1976
1977         } else {
1978             /* share a single thread for all cpus with TCG */
1979             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1980             qemu_thread_create(cpu->thread, thread_name,
1981                                qemu_tcg_rr_cpu_thread_fn,
1982                                cpu, QEMU_THREAD_JOINABLE);
1983
1984             single_tcg_halt_cond = cpu->halt_cond;
1985             single_tcg_cpu_thread = cpu->thread;
1986         }
1987 #ifdef _WIN32
1988         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1989 #endif
1990     } else {
1991         /* For non-MTTCG cases we share the thread */
1992         cpu->thread = single_tcg_cpu_thread;
1993         cpu->halt_cond = single_tcg_halt_cond;
1994         cpu->thread_id = first_cpu->thread_id;
1995         cpu->can_do_io = 1;
1996         cpu->created = true;
1997     }
1998 }
1999
2000 static void qemu_hax_start_vcpu(CPUState *cpu)
2001 {
2002     char thread_name[VCPU_THREAD_NAME_SIZE];
2003
2004     cpu->thread = g_malloc0(sizeof(QemuThread));
2005     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2006     qemu_cond_init(cpu->halt_cond);
2007
2008     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2009              cpu->cpu_index);
2010     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2011                        cpu, QEMU_THREAD_JOINABLE);
2012 #ifdef _WIN32
2013     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2014 #endif
2015 }
2016
2017 static void qemu_kvm_start_vcpu(CPUState *cpu)
2018 {
2019     char thread_name[VCPU_THREAD_NAME_SIZE];
2020
2021     cpu->thread = g_malloc0(sizeof(QemuThread));
2022     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2023     qemu_cond_init(cpu->halt_cond);
2024     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2025              cpu->cpu_index);
2026     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2027                        cpu, QEMU_THREAD_JOINABLE);
2028 }
2029
2030 static void qemu_hvf_start_vcpu(CPUState *cpu)
2031 {
2032     char thread_name[VCPU_THREAD_NAME_SIZE];
2033
2034     /* HVF currently does not support TCG, and only runs in
2035      * unrestricted-guest mode. */
2036     assert(hvf_enabled());
2037
2038     cpu->thread = g_malloc0(sizeof(QemuThread));
2039     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2040     qemu_cond_init(cpu->halt_cond);
2041
2042     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2043              cpu->cpu_index);
2044     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2045                        cpu, QEMU_THREAD_JOINABLE);
2046 }
2047
2048 static void qemu_whpx_start_vcpu(CPUState *cpu)
2049 {
2050     char thread_name[VCPU_THREAD_NAME_SIZE];
2051
2052     cpu->thread = g_malloc0(sizeof(QemuThread));
2053     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2054     qemu_cond_init(cpu->halt_cond);
2055     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2056              cpu->cpu_index);
2057     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2058                        cpu, QEMU_THREAD_JOINABLE);
2059 #ifdef _WIN32
2060     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2061 #endif
2062 }
2063
2064 static void qemu_dummy_start_vcpu(CPUState *cpu)
2065 {
2066     char thread_name[VCPU_THREAD_NAME_SIZE];
2067
2068     cpu->thread = g_malloc0(sizeof(QemuThread));
2069     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2070     qemu_cond_init(cpu->halt_cond);
2071     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2072              cpu->cpu_index);
2073     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2074                        QEMU_THREAD_JOINABLE);
2075 }
2076
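/*
 * Common vCPU bring-up: pick the accelerator-specific thread starter and
 * then block on qemu_cpu_cond until that thread has set cpu->created,
 * mirroring the "signal CPU creation" handshake in the thread functions
 * above.
 */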
2077 void qemu_init_vcpu(CPUState *cpu)
2078 {
2079     MachineState *ms = MACHINE(qdev_get_machine());
2080
2081     cpu->nr_cores = ms->smp.cores;
2082     cpu->nr_threads = ms->smp.threads;
2083     cpu->stopped = true;
2084     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2085
2086     if (!cpu->as) {
2087         /* If the target cpu hasn't set up any address spaces itself,
2088          * give it the default one.
2089          */
2090         cpu->num_ases = 1;
2091         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2092     }
2093
2094     if (kvm_enabled()) {
2095         qemu_kvm_start_vcpu(cpu);
2096     } else if (hax_enabled()) {
2097         qemu_hax_start_vcpu(cpu);
2098     } else if (hvf_enabled()) {
2099         qemu_hvf_start_vcpu(cpu);
2100     } else if (tcg_enabled()) {
2101         qemu_tcg_init_vcpu(cpu);
2102     } else if (whpx_enabled()) {
2103         qemu_whpx_start_vcpu(cpu);
2104     } else {
2105         qemu_dummy_start_vcpu(cpu);
2106     }
2107
2108     while (!cpu->created) {
2109         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2110     }
2111 }
2112
2113 void cpu_stop_current(void)
2114 {
2115     if (current_cpu) {
2116         current_cpu->stop = true;
2117         cpu_exit(current_cpu);
2118     }
2119 }
2120
2121 int vm_stop(RunState state)
2122 {
2123     if (qemu_in_vcpu_thread()) {
2124         qemu_system_vmstop_request_prepare();
2125         qemu_system_vmstop_request(state);
2126         /*
2127          * FIXME: should not return to device code once
2128          * vm_stop() has been requested.
2129          */
2130         cpu_stop_current();
2131         return 0;
2132     }
2133
2134     return do_vm_stop(state, true);
2135 }
2136
2137 /**
2138  * Prepare for (re)starting the VM.
2139  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2140  * running or in case of an error condition), 0 otherwise.
2141  */
2142 int vm_prepare_start(void)
2143 {
2144     RunState requested;
2145
2146     qemu_vmstop_requested(&requested);
2147     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2148         return -1;
2149     }
2150
2151     /* Ensure that a STOP/RESUME pair of events is emitted if a
2152      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2153      * example, is documented as always being followed by the
2154      * STOP event.
2155      */
2156     if (runstate_is_running()) {
2157         qapi_event_send_stop();
2158         qapi_event_send_resume();
2159         return -1;
2160     }
2161
2162     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2163     qapi_event_send_resume();
2164
2165     replay_enable_events();
2166     cpu_enable_ticks();
2167     runstate_set(RUN_STATE_RUNNING);
2168     vm_state_notify(1, RUN_STATE_RUNNING);
2169     return 0;
2170 }
2171
2172 void vm_start(void)
2173 {
2174     if (!vm_prepare_start()) {
2175         resume_all_vcpus();
2176     }
2177 }
2178
2179 /* Does a state transition even if the VM is already stopped;
2180    the current state is forgotten forever. */
2181 int vm_stop_force_state(RunState state)
2182 {
2183     if (runstate_is_running()) {
2184         return vm_stop(state);
2185     } else {
2186         runstate_set(state);
2187
2188         bdrv_drain_all();
2189         /* Make sure to return an error if the flush in a previous vm_stop()
2190          * failed. */
2191         return bdrv_flush_all();
2192     }
2193 }
2194
2195 void list_cpus(const char *optarg)
2196 {
2197     /* XXX: implement xxx_cpu_list for targets that still lack it */
2198 #if defined(cpu_list)
2199     cpu_list();
2200 #endif
2201 }
2202
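/*
 * QMP handler for the "memsave" command: dump a range of guest virtual
 * memory, as seen by one vCPU, to a host file.  A sketch of a QMP
 * invocation, with made-up argument values:
 *
 *     { "execute": "memsave",
 *       "arguments": { "val": 4096, "size": 512,
 *                      "filename": "/tmp/vmem.dump", "cpu-index": 0 } }
 */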
2203 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2204                  bool has_cpu, int64_t cpu_index, Error **errp)
2205 {
2206     FILE *f;
2207     uint32_t l;
2208     CPUState *cpu;
2209     uint8_t buf[1024];
2210     int64_t orig_addr = addr, orig_size = size;
2211
2212     if (!has_cpu) {
2213         cpu_index = 0;
2214     }
2215
2216     cpu = qemu_get_cpu(cpu_index);
2217     if (cpu == NULL) {
2218         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2219                    "a CPU number");
2220         return;
2221     }
2222
2223     f = fopen(filename, "wb");
2224     if (!f) {
2225         error_setg_file_open(errp, errno, filename);
2226         return;
2227     }
2228
2229     while (size != 0) {
2230         l = sizeof(buf);
2231         if (l > size)
2232             l = size;
2233         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2234             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2235                              " specified", orig_addr, orig_size);
2236             goto exit;
2237         }
2238         if (fwrite(buf, 1, l, f) != l) {
2239             error_setg(errp, QERR_IO_ERROR);
2240             goto exit;
2241         }
2242         addr += l;
2243         size -= l;
2244     }
2245
2246 exit:
2247     fclose(f);
2248 }
2249
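/*
 * QMP handler for the "pmemsave" command: like memsave above, but reads
 * guest physical memory directly, so no cpu-index argument is involved.
 */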
2250 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2251                   Error **errp)
2252 {
2253     FILE *f;
2254     uint32_t l;
2255     uint8_t buf[1024];
2256
2257     f = fopen(filename, "wb");
2258     if (!f) {
2259         error_setg_file_open(errp, errno, filename);
2260         return;
2261     }
2262
2263     while (size != 0) {
2264         l = sizeof(buf);
2265         if (l > size)
2266             l = size;
2267         cpu_physical_memory_read(addr, buf, l);
2268         if (fwrite(buf, 1, l, f) != l) {
2269             error_setg(errp, QERR_IO_ERROR);
2270             goto exit;
2271         }
2272         addr += l;
2273         size -= l;
2274     }
2275
2276 exit:
2277     fclose(f);
2278 }
2279
2280 void qmp_inject_nmi(Error **errp)
2281 {
2282     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2283 }
2284
2285 void dump_drift_info(void)
2286 {
2287     if (!use_icount) {
2288         return;
2289     }
2290
2291     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2292                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2293     if (icount_align_option) {
2294         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2295                     -max_delay / SCALE_MS);
2296         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2297                     max_advance / SCALE_MS);
2298     } else {
2299         qemu_printf("Max guest delay     NA\n");
2300         qemu_printf("Max guest advance   NA\n");
2301     }
2302 }