[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "cpu.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/sysemu.h"
37 #include "sysemu/tcg.h"
38 #include "sysemu/block-backend.h"
39 #include "exec/gdbstub.h"
40 #include "sysemu/dma.h"
41 #include "sysemu/hw_accel.h"
42 #include "sysemu/kvm.h"
43 #include "sysemu/hax.h"
44 #include "sysemu/hvf.h"
45 #include "sysemu/whpx.h"
46 #include "exec/exec-all.h"
47
48 #include "qemu/thread.h"
49 #include "sysemu/cpus.h"
50 #include "sysemu/qtest.h"
51 #include "qemu/main-loop.h"
52 #include "qemu/option.h"
53 #include "qemu/bitmap.h"
54 #include "qemu/seqlock.h"
55 #include "qemu/guest-random.h"
56 #include "tcg.h"
57 #include "hw/nmi.h"
58 #include "sysemu/replay.h"
59 #include "hw/boards.h"
60
61 #ifdef CONFIG_LINUX
62
63 #include <sys/prctl.h>
64
65 #ifndef PR_MCE_KILL
66 #define PR_MCE_KILL 33
67 #endif
68
69 #ifndef PR_MCE_KILL_SET
70 #define PR_MCE_KILL_SET 1
71 #endif
72
73 #ifndef PR_MCE_KILL_EARLY
74 #define PR_MCE_KILL_EARLY 1
75 #endif
76
77 #endif /* CONFIG_LINUX */
78
79 int64_t max_delay;
80 int64_t max_advance;
81
82 /* vcpu throttling controls */
83 static QEMUTimer *throttle_timer;
84 static unsigned int throttle_percentage;
85
86 #define CPU_THROTTLE_PCT_MIN 1
87 #define CPU_THROTTLE_PCT_MAX 99
88 #define CPU_THROTTLE_TIMESLICE_NS 10000000
89
90 bool cpu_is_stopped(CPUState *cpu)
91 {
92     return cpu->stopped || !runstate_is_running();
93 }
94
95 static bool cpu_thread_is_idle(CPUState *cpu)
96 {
97     if (cpu->stop || cpu->queued_work_first) {
98         return false;
99     }
100     if (cpu_is_stopped(cpu)) {
101         return true;
102     }
103     if (!cpu->halted || cpu_has_work(cpu) ||
104         kvm_halt_in_kernel()) {
105         return false;
106     }
107     return true;
108 }
109
110 static bool all_cpu_threads_idle(void)
111 {
112     CPUState *cpu;
113
114     CPU_FOREACH(cpu) {
115         if (!cpu_thread_is_idle(cpu)) {
116             return false;
117         }
118     }
119     return true;
120 }
121
122 /***********************************************************/
123 /* guest cycle counter */
124
125 /* Protected by TimersState seqlock */
126
127 static bool icount_sleep = true;
128 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
129 #define MAX_ICOUNT_SHIFT 10
130
131 typedef struct TimersState {
132     /* Protected by BQL.  */
133     int64_t cpu_ticks_prev;
134     int64_t cpu_ticks_offset;
135
136     /* Protect fields that can be read outside the BQL and are
137      * written from multiple threads.
138      */
139     QemuSeqLock vm_clock_seqlock;
140     QemuSpin vm_clock_lock;
141
142     int16_t cpu_ticks_enabled;
143
144     /* Conversion factor from emulated instructions to virtual clock ticks.  */
145     int16_t icount_time_shift;
146
147     /* Compensate for varying guest execution speed.  */
148     int64_t qemu_icount_bias;
149
150     int64_t vm_clock_warp_start;
151     int64_t cpu_clock_offset;
152
153     /* Only written by TCG thread */
154     int64_t qemu_icount;
155
156     /* for adjusting icount */
157     QEMUTimer *icount_rt_timer;
158     QEMUTimer *icount_vm_timer;
159     QEMUTimer *icount_warp_timer;
160 } TimersState;
161
162 static TimersState timers_state;
163 bool mttcg_enabled;
164
165 /*
166  * We default to false if we know other options have been enabled
167  * which are currently incompatible with MTTCG. Otherwise, once a
168  * guest (target) has been updated to support:
169  *   - atomic instructions
170  *   - memory ordering primitives (barriers)
171  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak.
172  *
173  * Once a guest architecture has been converted to the new primitives
174  * there are two remaining limitations to check.
175  *
176  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
177  * - The host must have a stronger memory order than the guest
178  *
179  * It may be possible in future to support strong guests on weak hosts
180  * but that will require tagging all load/stores in a guest with their
181  * implicit memory order requirements which would likely slow things
182  * down a lot.
183  */
184
185 static bool check_tcg_memory_orders_compatible(void)
186 {
187 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
188     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
189 #else
190     return false;
191 #endif
192 }
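/*
 * Illustrative reading of the check above (not spelled out in the original
 * comment): every memory-ordering bit the guest requires by default must
 * also be guaranteed by the host.  A weakly-ordered guest on a
 * strongly-ordered host therefore passes, while a strongly-ordered guest
 * on a weakly-ordered host does not.
 */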
193
194 static bool default_mttcg_enabled(void)
195 {
196     if (use_icount || TCG_OVERSIZED_GUEST) {
197         return false;
198     } else {
199 #ifdef TARGET_SUPPORTS_MTTCG
200         return check_tcg_memory_orders_compatible();
201 #else
202         return false;
203 #endif
204     }
205 }
206
207 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
208 {
209     const char *t = qemu_opt_get(opts, "thread");
210     if (t) {
211         if (strcmp(t, "multi") == 0) {
212             if (TCG_OVERSIZED_GUEST) {
213                 error_setg(errp, "No MTTCG when guest word size > host's");
214             } else if (use_icount) {
215                 error_setg(errp, "No MTTCG when icount is enabled");
216             } else {
217 #ifndef TARGET_SUPPORTS_MTTCG
218                 warn_report("Guest not yet converted to MTTCG - "
219                             "you may get unexpected results");
220 #endif
221                 if (!check_tcg_memory_orders_compatible()) {
222                     warn_report("Guest expects a stronger memory ordering "
223                                 "than the host provides");
224                     error_printf("This may cause strange/hard to debug errors\n");
225                 }
226                 mttcg_enabled = true;
227             }
228         } else if (strcmp(t, "single") == 0) {
229             mttcg_enabled = false;
230         } else {
231             error_setg(errp, "Invalid 'thread' setting %s", t);
232         }
233     } else {
234         mttcg_enabled = default_mttcg_enabled();
235     }
236 }
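/*
 * For reference, the "thread" option handled above is normally driven by
 * command lines such as "-accel tcg,thread=single" or
 * "-accel tcg,thread=multi"; the exact option spelling follows the
 * qemu-options documentation rather than this file.
 */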
237
238 /* The current number of executed instructions is based on what we
239  * originally budgeted minus the current state of the decrementing
240  * icount counters in extra/u16.low.
241  */
242 static int64_t cpu_get_icount_executed(CPUState *cpu)
243 {
244     return (cpu->icount_budget -
245             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
246 }
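/*
 * Illustrative values for the computation above: with an icount_budget of
 * 100000, icount_decr.u16.low counted down from 65535 to 57835 and
 * icount_extra still at 34465, the vCPU has executed
 * 100000 - (57835 + 34465) = 7700 instructions so far.
 */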
247
248 /*
249  * Update the global shared timers_state.qemu_icount to take into
250  * account executed instructions. This is done by the TCG vCPU
251  * thread so the main-loop can see time has moved forward.
252  */
253 static void cpu_update_icount_locked(CPUState *cpu)
254 {
255     int64_t executed = cpu_get_icount_executed(cpu);
256     cpu->icount_budget -= executed;
257
258     atomic_set_i64(&timers_state.qemu_icount,
259                    timers_state.qemu_icount + executed);
260 }
261
262 /*
263  * Update the global shared timers_state.qemu_icount to take into
264  * account executed instructions. This is done by the TCG vCPU
265  * thread so the main-loop can see time has moved forward.
266  */
267 void cpu_update_icount(CPUState *cpu)
268 {
269     seqlock_write_lock(&timers_state.vm_clock_seqlock,
270                        &timers_state.vm_clock_lock);
271     cpu_update_icount_locked(cpu);
272     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
273                          &timers_state.vm_clock_lock);
274 }
275
276 static int64_t cpu_get_icount_raw_locked(void)
277 {
278     CPUState *cpu = current_cpu;
279
280     if (cpu && cpu->running) {
281         if (!cpu->can_do_io) {
282             error_report("Bad icount read");
283             exit(1);
284         }
285         /* Take into account what has run */
286         cpu_update_icount_locked(cpu);
287     }
288     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
289     return atomic_read_i64(&timers_state.qemu_icount);
290 }
291
292 static int64_t cpu_get_icount_locked(void)
293 {
294     int64_t icount = cpu_get_icount_raw_locked();
295     return atomic_read_i64(&timers_state.qemu_icount_bias) +
296         cpu_icount_to_ns(icount);
297 }
298
299 int64_t cpu_get_icount_raw(void)
300 {
301     int64_t icount;
302     unsigned start;
303
304     do {
305         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
306         icount = cpu_get_icount_raw_locked();
307     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
308
309     return icount;
310 }
311
312 /* Return the virtual CPU time, based on the instruction counter.  */
313 int64_t cpu_get_icount(void)
314 {
315     int64_t icount;
316     unsigned start;
317
318     do {
319         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
320         icount = cpu_get_icount_locked();
321     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
322
323     return icount;
324 }
325
326 int64_t cpu_icount_to_ns(int64_t icount)
327 {
328     return icount << atomic_read(&timers_state.icount_time_shift);
329 }
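/*
 * With icount_time_shift == 3 (the initial guess used by configure_icount()
 * below), each instruction accounts for 1 << 3 = 8 ns of virtual time,
 * i.e. roughly 125 million instructions per emulated second.
 */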
330
331 static int64_t cpu_get_ticks_locked(void)
332 {
333     int64_t ticks = timers_state.cpu_ticks_offset;
334     if (timers_state.cpu_ticks_enabled) {
335         ticks += cpu_get_host_ticks();
336     }
337
338     if (timers_state.cpu_ticks_prev > ticks) {
339         /* Non-increasing ticks may happen if the host uses software suspend.  */
340         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
341         ticks = timers_state.cpu_ticks_prev;
342     }
343
344     timers_state.cpu_ticks_prev = ticks;
345     return ticks;
346 }
347
348 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
349  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
350  * counter.
351  */
352 int64_t cpu_get_ticks(void)
353 {
354     int64_t ticks;
355
356     if (use_icount) {
357         return cpu_get_icount();
358     }
359
360     qemu_spin_lock(&timers_state.vm_clock_lock);
361     ticks = cpu_get_ticks_locked();
362     qemu_spin_unlock(&timers_state.vm_clock_lock);
363     return ticks;
364 }
365
366 static int64_t cpu_get_clock_locked(void)
367 {
368     int64_t time;
369
370     time = timers_state.cpu_clock_offset;
371     if (timers_state.cpu_ticks_enabled) {
372         time += get_clock();
373     }
374
375     return time;
376 }
377
378 /* Return the monotonic time elapsed in VM, i.e.,
379  * the time between vm_start and vm_stop
380  */
381 int64_t cpu_get_clock(void)
382 {
383     int64_t ti;
384     unsigned start;
385
386     do {
387         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
388         ti = cpu_get_clock_locked();
389     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
390
391     return ti;
392 }
393
394 /* enable cpu_get_ticks()
395  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
396  */
397 void cpu_enable_ticks(void)
398 {
399     seqlock_write_lock(&timers_state.vm_clock_seqlock,
400                        &timers_state.vm_clock_lock);
401     if (!timers_state.cpu_ticks_enabled) {
402         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
403         timers_state.cpu_clock_offset -= get_clock();
404         timers_state.cpu_ticks_enabled = 1;
405     }
406     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
407                        &timers_state.vm_clock_lock);
408 }
409
410 /* disable cpu_get_ticks(): the clock is stopped. You must not call
411  * cpu_get_ticks() after that.
412  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
413  */
414 void cpu_disable_ticks(void)
415 {
416     seqlock_write_lock(&timers_state.vm_clock_seqlock,
417                        &timers_state.vm_clock_lock);
418     if (timers_state.cpu_ticks_enabled) {
419         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
420         timers_state.cpu_clock_offset = cpu_get_clock_locked();
421         timers_state.cpu_ticks_enabled = 0;
422     }
423     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
424                          &timers_state.vm_clock_lock);
425 }
426
427 /* Correlation between real and virtual time is always going to be
428    fairly approximate, so ignore small variation.
429    When the guest is idle real and virtual time will be aligned in
430    the IO wait loop.  */
431 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
432
433 static void icount_adjust(void)
434 {
435     int64_t cur_time;
436     int64_t cur_icount;
437     int64_t delta;
438
439     /* Protected by TimersState mutex.  */
440     static int64_t last_delta;
441
442     /* If the VM is not running, then do nothing.  */
443     if (!runstate_is_running()) {
444         return;
445     }
446
447     seqlock_write_lock(&timers_state.vm_clock_seqlock,
448                        &timers_state.vm_clock_lock);
449     cur_time = cpu_get_clock_locked();
450     cur_icount = cpu_get_icount_locked();
451
452     delta = cur_icount - cur_time;
453     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
454     if (delta > 0
455         && last_delta + ICOUNT_WOBBLE < delta * 2
456         && timers_state.icount_time_shift > 0) {
457         /* The guest is getting too far ahead.  Slow time down.  */
458         atomic_set(&timers_state.icount_time_shift,
459                    timers_state.icount_time_shift - 1);
460     }
461     if (delta < 0
462         && last_delta - ICOUNT_WOBBLE > delta * 2
463         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
464         /* The guest is getting too far behind.  Speed time up.  */
465         atomic_set(&timers_state.icount_time_shift,
466                    timers_state.icount_time_shift + 1);
467     }
468     last_delta = delta;
469     atomic_set_i64(&timers_state.qemu_icount_bias,
470                    cur_icount - (timers_state.qemu_icount
471                                  << timers_state.icount_time_shift));
472     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
473                          &timers_state.vm_clock_lock);
474 }
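/*
 * In other words: delta > 0 means the virtual clock has run ahead of real
 * time, so the shift is decreased and each instruction is charged fewer
 * nanoseconds; delta < 0 speeds virtual time up again.  The final bias
 * update rewrites qemu_icount_bias so that bias + (qemu_icount << shift)
 * still equals cur_icount, keeping QEMU_CLOCK_VIRTUAL continuous across
 * the shift change.
 */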
475
476 static void icount_adjust_rt(void *opaque)
477 {
478     timer_mod(timers_state.icount_rt_timer,
479               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
480     icount_adjust();
481 }
482
483 static void icount_adjust_vm(void *opaque)
484 {
485     timer_mod(timers_state.icount_vm_timer,
486                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
487                    NANOSECONDS_PER_SECOND / 10);
488     icount_adjust();
489 }
490
491 static int64_t qemu_icount_round(int64_t count)
492 {
493     int shift = atomic_read(&timers_state.icount_time_shift);
494     return (count + (1 << shift) - 1) >> shift;
495 }
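/*
 * This rounds a nanosecond deadline up to a whole number of instructions.
 * For example (illustrative values), with icount_time_shift == 3 a 20 ns
 * deadline becomes (20 + 7) >> 3 = 3 instructions.
 */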
496
497 static void icount_warp_rt(void)
498 {
499     unsigned seq;
500     int64_t warp_start;
501
502     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
503      * changes from -1 to another value, so the race here is okay.
504      */
505     do {
506         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
507         warp_start = timers_state.vm_clock_warp_start;
508     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
509
510     if (warp_start == -1) {
511         return;
512     }
513
514     seqlock_write_lock(&timers_state.vm_clock_seqlock,
515                        &timers_state.vm_clock_lock);
516     if (runstate_is_running()) {
517         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
518                                             cpu_get_clock_locked());
519         int64_t warp_delta;
520
521         warp_delta = clock - timers_state.vm_clock_warp_start;
522         if (use_icount == 2) {
523             /*
524              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
525              * far ahead of real time.
526              */
527             int64_t cur_icount = cpu_get_icount_locked();
528             int64_t delta = clock - cur_icount;
529             warp_delta = MIN(warp_delta, delta);
530         }
531         atomic_set_i64(&timers_state.qemu_icount_bias,
532                        timers_state.qemu_icount_bias + warp_delta);
533     }
534     timers_state.vm_clock_warp_start = -1;
535     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
536                        &timers_state.vm_clock_lock);
537
538     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
539         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
540     }
541 }
542
543 static void icount_timer_cb(void *opaque)
544 {
545     /* No need for a checkpoint because the timer already synchronizes
546      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
547      */
548     icount_warp_rt();
549 }
550
551 void qtest_clock_warp(int64_t dest)
552 {
553     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
554     AioContext *aio_context;
555     assert(qtest_enabled());
556     aio_context = qemu_get_aio_context();
557     while (clock < dest) {
558         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
559         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
560
561         seqlock_write_lock(&timers_state.vm_clock_seqlock,
562                            &timers_state.vm_clock_lock);
563         atomic_set_i64(&timers_state.qemu_icount_bias,
564                        timers_state.qemu_icount_bias + warp);
565         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
566                              &timers_state.vm_clock_lock);
567
568         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
569         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
570         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
571     }
572     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
573 }
574
575 void qemu_start_warp_timer(void)
576 {
577     int64_t clock;
578     int64_t deadline;
579
580     if (!use_icount) {
581         return;
582     }
583
584     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
585      * do not fire, so computing the deadline does not make sense.
586      */
587     if (!runstate_is_running()) {
588         return;
589     }
590
591     if (replay_mode != REPLAY_MODE_PLAY) {
592         if (!all_cpu_threads_idle()) {
593             return;
594         }
595
596         if (qtest_enabled()) {
597             /* When testing, qtest commands advance icount.  */
598             return;
599         }
600
601         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
602     } else {
603         /* warp clock deterministically in record/replay mode */
604         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
605             /* The vCPU is sleeping and the warp can't be started.
606                It is probably a race condition: the notification sent to
607                the vCPU was processed early and the vCPU went to sleep.
608                Therefore we have to wake it up to do something. */
609             if (replay_has_checkpoint()) {
610                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
611             }
612             return;
613         }
614     }
615
616     /* We want to use the earliest deadline from ALL vm_clocks */
617     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
618     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
619     if (deadline < 0) {
620         static bool notified;
621         if (!icount_sleep && !notified) {
622             warn_report("icount sleep disabled and no active timers");
623             notified = true;
624         }
625         return;
626     }
627
628     if (deadline > 0) {
629         /*
630          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
631          * sleep.  Otherwise, the CPU might be waiting for a future timer
632          * interrupt to wake it up, but the interrupt never comes because
633          * the vCPU isn't running any insns and thus doesn't advance the
634          * QEMU_CLOCK_VIRTUAL.
635          */
636         if (!icount_sleep) {
637             /*
638              * We never let vCPUs sleep in no-sleep icount mode.
639              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
640              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
641              * It is useful when we want a deterministic execution time,
642              * isolated from host latencies.
643              */
644             seqlock_write_lock(&timers_state.vm_clock_seqlock,
645                                &timers_state.vm_clock_lock);
646             atomic_set_i64(&timers_state.qemu_icount_bias,
647                            timers_state.qemu_icount_bias + deadline);
648             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
649                                  &timers_state.vm_clock_lock);
650             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
651         } else {
652             /*
653              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
654              * "real" time, (related to the time left until the next event) has
655              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
656              * This keeps the warps from being visible externally; for example,
657              * you will not be sending network packets continuously instead of
658              * every 100ms.
659              */
660             seqlock_write_lock(&timers_state.vm_clock_seqlock,
661                                &timers_state.vm_clock_lock);
662             if (timers_state.vm_clock_warp_start == -1
663                 || timers_state.vm_clock_warp_start > clock) {
664                 timers_state.vm_clock_warp_start = clock;
665             }
666             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
667                                  &timers_state.vm_clock_lock);
668             timer_mod_anticipate(timers_state.icount_warp_timer,
669                                  clock + deadline);
670         }
671     } else if (deadline == 0) {
672         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
673     }
674 }
675
676 static void qemu_account_warp_timer(void)
677 {
678     if (!use_icount || !icount_sleep) {
679         return;
680     }
681
682     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
683      * do not fire, so computing the deadline does not make sense.
684      */
685     if (!runstate_is_running()) {
686         return;
687     }
688
689     /* warp clock deterministically in record/replay mode */
690     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
691         return;
692     }
693
694     timer_del(timers_state.icount_warp_timer);
695     icount_warp_rt();
696 }
697
698 static bool icount_state_needed(void *opaque)
699 {
700     return use_icount;
701 }
702
703 static bool warp_timer_state_needed(void *opaque)
704 {
705     TimersState *s = opaque;
706     return s->icount_warp_timer != NULL;
707 }
708
709 static bool adjust_timers_state_needed(void *opaque)
710 {
711     TimersState *s = opaque;
712     return s->icount_rt_timer != NULL;
713 }
714
715 /*
716  * The subsection for warp timer migration is optional, because the timer may not be created
717  */
718 static const VMStateDescription icount_vmstate_warp_timer = {
719     .name = "timer/icount/warp_timer",
720     .version_id = 1,
721     .minimum_version_id = 1,
722     .needed = warp_timer_state_needed,
723     .fields = (VMStateField[]) {
724         VMSTATE_INT64(vm_clock_warp_start, TimersState),
725         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
726         VMSTATE_END_OF_LIST()
727     }
728 };
729
730 static const VMStateDescription icount_vmstate_adjust_timers = {
731     .name = "timer/icount/timers",
732     .version_id = 1,
733     .minimum_version_id = 1,
734     .needed = adjust_timers_state_needed,
735     .fields = (VMStateField[]) {
736         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
737         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
738         VMSTATE_END_OF_LIST()
739     }
740 };
741
742 /*
743  * This is a subsection for icount migration.
744  */
745 static const VMStateDescription icount_vmstate_timers = {
746     .name = "timer/icount",
747     .version_id = 1,
748     .minimum_version_id = 1,
749     .needed = icount_state_needed,
750     .fields = (VMStateField[]) {
751         VMSTATE_INT64(qemu_icount_bias, TimersState),
752         VMSTATE_INT64(qemu_icount, TimersState),
753         VMSTATE_END_OF_LIST()
754     },
755     .subsections = (const VMStateDescription*[]) {
756         &icount_vmstate_warp_timer,
757         &icount_vmstate_adjust_timers,
758         NULL
759     }
760 };
761
762 static const VMStateDescription vmstate_timers = {
763     .name = "timer",
764     .version_id = 2,
765     .minimum_version_id = 1,
766     .fields = (VMStateField[]) {
767         VMSTATE_INT64(cpu_ticks_offset, TimersState),
768         VMSTATE_UNUSED(8),
769         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
770         VMSTATE_END_OF_LIST()
771     },
772     .subsections = (const VMStateDescription*[]) {
773         &icount_vmstate_timers,
774         NULL
775     }
776 };
777
778 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
779 {
780     double pct;
781     double throttle_ratio;
782     long sleeptime_ns;
783
784     if (!cpu_throttle_get_percentage()) {
785         return;
786     }
787
788     pct = (double)cpu_throttle_get_percentage()/100;
789     throttle_ratio = pct / (1 - pct);
790     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
791
792     qemu_mutex_unlock_iothread();
793     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
794     qemu_mutex_lock_iothread();
795     atomic_set(&cpu->throttle_thread_scheduled, 0);
796 }
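/*
 * Worked example (illustrative values): at a 75% throttle, pct = 0.75 and
 * throttle_ratio = 3, so the vCPU sleeps 30 ms for every 10 ms timeslice.
 * Combined with the timer period of CPU_THROTTLE_TIMESLICE_NS / (1 - pct)
 * used by cpu_throttle_timer_tick() below, the vCPU ends up running for
 * roughly (1 - pct) of wall-clock time.
 */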
797
798 static void cpu_throttle_timer_tick(void *opaque)
799 {
800     CPUState *cpu;
801     double pct;
802
803     /* Stop the timer if needed */
804     if (!cpu_throttle_get_percentage()) {
805         return;
806     }
807     CPU_FOREACH(cpu) {
808         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
809             async_run_on_cpu(cpu, cpu_throttle_thread,
810                              RUN_ON_CPU_NULL);
811         }
812     }
813
814     pct = (double)cpu_throttle_get_percentage()/100;
815     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
816                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
817 }
818
819 void cpu_throttle_set(int new_throttle_pct)
820 {
821     /* Ensure throttle percentage is within valid range */
822     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
823     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
824
825     atomic_set(&throttle_percentage, new_throttle_pct);
826
827     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
828                                        CPU_THROTTLE_TIMESLICE_NS);
829 }
830
831 void cpu_throttle_stop(void)
832 {
833     atomic_set(&throttle_percentage, 0);
834 }
835
836 bool cpu_throttle_active(void)
837 {
838     return (cpu_throttle_get_percentage() != 0);
839 }
840
841 int cpu_throttle_get_percentage(void)
842 {
843     return atomic_read(&throttle_percentage);
844 }
845
846 void cpu_ticks_init(void)
847 {
848     seqlock_init(&timers_state.vm_clock_seqlock);
849     qemu_spin_init(&timers_state.vm_clock_lock);
850     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
851     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
852                                            cpu_throttle_timer_tick, NULL);
853 }
854
855 void configure_icount(QemuOpts *opts, Error **errp)
856 {
857     const char *option;
858     char *rem_str = NULL;
859
860     option = qemu_opt_get(opts, "shift");
861     if (!option) {
862         if (qemu_opt_get(opts, "align") != NULL) {
863             error_setg(errp, "Please specify shift option when using align");
864         }
865         return;
866     }
867
868     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
869     if (icount_sleep) {
870         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
871                                          icount_timer_cb, NULL);
872     }
873
874     icount_align_option = qemu_opt_get_bool(opts, "align", false);
875
876     if (icount_align_option && !icount_sleep) {
877         error_setg(errp, "align=on and sleep=off are incompatible");
878     }
879     if (strcmp(option, "auto") != 0) {
880         errno = 0;
881         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
882         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
883             error_setg(errp, "icount: Invalid shift value");
884         }
885         use_icount = 1;
886         return;
887     } else if (icount_align_option) {
888         error_setg(errp, "shift=auto and align=on are incompatible");
889     } else if (!icount_sleep) {
890         error_setg(errp, "shift=auto and sleep=off are incompatible");
891     }
892
893     use_icount = 2;
894
895     /* 125MIPS seems a reasonable initial guess at the guest speed.
896        It will be corrected fairly quickly anyway.  */
897     timers_state.icount_time_shift = 3;
898
899     /* Have both realtime and virtual time triggers for speed adjustment.
900        The realtime trigger catches emulated time passing too slowly,
901        the virtual time trigger catches emulated time passing too fast.
902        Realtime triggers occur even when idle, so use them less frequently
903        than VM triggers.  */
904     timers_state.vm_clock_warp_start = -1;
905     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
906                                    icount_adjust_rt, NULL);
907     timer_mod(timers_state.icount_rt_timer,
908                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
909     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
910                                         icount_adjust_vm, NULL);
911     timer_mod(timers_state.icount_vm_timer,
912                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
913                    NANOSECONDS_PER_SECOND / 10);
914 }
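/*
 * For reference, this is typically driven by command lines such as
 * "-icount shift=3" (fixed 2^3 = 8 ns per instruction, use_icount == 1) or
 * "-icount shift=auto" (adaptive mode, use_icount == 2, with the two
 * adjustment timers set up above); exact option spelling follows the
 * qemu-options documentation.
 */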
915
916 /***********************************************************/
917 /* TCG vCPU kick timer
918  *
919  * The kick timer is responsible for moving single-threaded vCPU
920  * emulation on to the next vCPU. If more than one vCPU is running, a
921  * timer event will force a cpu->exit so the next vCPU can get
922  * scheduled.
923  *
924  * The timer is removed while all vCPUs are idle and restarted once
925  * a vCPU becomes runnable again.
926  */
927
928 static QEMUTimer *tcg_kick_vcpu_timer;
929 static CPUState *tcg_current_rr_cpu;
930
931 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
932
933 static inline int64_t qemu_tcg_next_kick(void)
934 {
935     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
936 }
937
938 /* Kick the currently round-robin scheduled vCPU */
939 static void qemu_cpu_kick_rr_cpu(void)
940 {
941     CPUState *cpu;
942     do {
943         cpu = atomic_mb_read(&tcg_current_rr_cpu);
944         if (cpu) {
945             cpu_exit(cpu);
946         }
947     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
948 }
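/*
 * The loop above re-reads tcg_current_rr_cpu after the kick: if the
 * round-robin thread has meanwhile switched to a different vCPU, that new
 * vCPU is kicked as well, so the request to move on is never lost.
 */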
949
950 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
951 {
952 }
953
954 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
955 {
956     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
957         qemu_notify_event();
958         return;
959     }
960
961     if (qemu_in_vcpu_thread()) {
962         /* A CPU is currently running; kick it back out to the
963          * tcg_cpu_exec() loop so it will recalculate its
964          * icount deadline immediately.
965          */
966         qemu_cpu_kick(current_cpu);
967     } else if (first_cpu) {
968         /* qemu_cpu_kick is not enough to kick a halted CPU out of
969          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
970          * causes cpu_thread_is_idle to return false.  This way,
971          * handle_icount_deadline can run.
972          * If we have no CPUs at all for some reason, we don't
973          * need to do anything.
974          */
975         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
976     }
977 }
978
979 static void kick_tcg_thread(void *opaque)
980 {
981     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
982     qemu_cpu_kick_rr_cpu();
983 }
984
985 static void start_tcg_kick_timer(void)
986 {
987     assert(!mttcg_enabled);
988     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
989         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
990                                            kick_tcg_thread, NULL);
991     }
992     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
993         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
994     }
995 }
996
997 static void stop_tcg_kick_timer(void)
998 {
999     assert(!mttcg_enabled);
1000     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1001         timer_del(tcg_kick_vcpu_timer);
1002     }
1003 }
1004
1005 /***********************************************************/
1006 void hw_error(const char *fmt, ...)
1007 {
1008     va_list ap;
1009     CPUState *cpu;
1010
1011     va_start(ap, fmt);
1012     fprintf(stderr, "qemu: hardware error: ");
1013     vfprintf(stderr, fmt, ap);
1014     fprintf(stderr, "\n");
1015     CPU_FOREACH(cpu) {
1016         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1017         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1018     }
1019     va_end(ap);
1020     abort();
1021 }
1022
1023 void cpu_synchronize_all_states(void)
1024 {
1025     CPUState *cpu;
1026
1027     CPU_FOREACH(cpu) {
1028         cpu_synchronize_state(cpu);
1029         /* TODO: move to cpu_synchronize_state() */
1030         if (hvf_enabled()) {
1031             hvf_cpu_synchronize_state(cpu);
1032         }
1033     }
1034 }
1035
1036 void cpu_synchronize_all_post_reset(void)
1037 {
1038     CPUState *cpu;
1039
1040     CPU_FOREACH(cpu) {
1041         cpu_synchronize_post_reset(cpu);
1042         /* TODO: move to cpu_synchronize_post_reset() */
1043         if (hvf_enabled()) {
1044             hvf_cpu_synchronize_post_reset(cpu);
1045         }
1046     }
1047 }
1048
1049 void cpu_synchronize_all_post_init(void)
1050 {
1051     CPUState *cpu;
1052
1053     CPU_FOREACH(cpu) {
1054         cpu_synchronize_post_init(cpu);
1055         /* TODO: move to cpu_synchronize_post_init() */
1056         if (hvf_enabled()) {
1057             hvf_cpu_synchronize_post_init(cpu);
1058         }
1059     }
1060 }
1061
1062 void cpu_synchronize_all_pre_loadvm(void)
1063 {
1064     CPUState *cpu;
1065
1066     CPU_FOREACH(cpu) {
1067         cpu_synchronize_pre_loadvm(cpu);
1068     }
1069 }
1070
1071 static int do_vm_stop(RunState state, bool send_stop)
1072 {
1073     int ret = 0;
1074
1075     if (runstate_is_running()) {
1076         cpu_disable_ticks();
1077         pause_all_vcpus();
1078         runstate_set(state);
1079         vm_state_notify(0, state);
1080         if (send_stop) {
1081             qapi_event_send_stop();
1082         }
1083     }
1084
1085     bdrv_drain_all();
1086     replay_disable_events();
1087     ret = bdrv_flush_all();
1088
1089     return ret;
1090 }
1091
1092 /* Special vm_stop() variant for terminating the process.  Historically clients
1093  * did not expect a QMP STOP event and so we need to retain compatibility.
1094  */
1095 int vm_shutdown(void)
1096 {
1097     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1098 }
1099
1100 static bool cpu_can_run(CPUState *cpu)
1101 {
1102     if (cpu->stop) {
1103         return false;
1104     }
1105     if (cpu_is_stopped(cpu)) {
1106         return false;
1107     }
1108     return true;
1109 }
1110
1111 static void cpu_handle_guest_debug(CPUState *cpu)
1112 {
1113     gdb_set_stop_cpu(cpu);
1114     qemu_system_debug_request();
1115     cpu->stopped = true;
1116 }
1117
1118 #ifdef CONFIG_LINUX
1119 static void sigbus_reraise(void)
1120 {
1121     sigset_t set;
1122     struct sigaction action;
1123
1124     memset(&action, 0, sizeof(action));
1125     action.sa_handler = SIG_DFL;
1126     if (!sigaction(SIGBUS, &action, NULL)) {
1127         raise(SIGBUS);
1128         sigemptyset(&set);
1129         sigaddset(&set, SIGBUS);
1130         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1131     }
1132     perror("Failed to re-raise SIGBUS!\n");
1133     abort();
1134 }
1135
1136 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1137 {
1138     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1139         sigbus_reraise();
1140     }
1141
1142     if (current_cpu) {
1143         /* Called asynchronously in VCPU thread.  */
1144         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1145             sigbus_reraise();
1146         }
1147     } else {
1148         /* Called synchronously (via signalfd) in main thread.  */
1149         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1150             sigbus_reraise();
1151         }
1152     }
1153 }
1154
1155 static void qemu_init_sigbus(void)
1156 {
1157     struct sigaction action;
1158
1159     memset(&action, 0, sizeof(action));
1160     action.sa_flags = SA_SIGINFO;
1161     action.sa_sigaction = sigbus_handler;
1162     sigaction(SIGBUS, &action, NULL);
1163
1164     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1165 }
1166 #else /* !CONFIG_LINUX */
1167 static void qemu_init_sigbus(void)
1168 {
1169 }
1170 #endif /* !CONFIG_LINUX */
1171
1172 static QemuMutex qemu_global_mutex;
1173
1174 static QemuThread io_thread;
1175
1176 /* cpu creation */
1177 static QemuCond qemu_cpu_cond;
1178 /* system init */
1179 static QemuCond qemu_pause_cond;
1180
1181 void qemu_init_cpu_loop(void)
1182 {
1183     qemu_init_sigbus();
1184     qemu_cond_init(&qemu_cpu_cond);
1185     qemu_cond_init(&qemu_pause_cond);
1186     qemu_mutex_init(&qemu_global_mutex);
1187
1188     qemu_thread_get_self(&io_thread);
1189 }
1190
1191 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1192 {
1193     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1194 }
1195
1196 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1197 {
1198     if (kvm_destroy_vcpu(cpu) < 0) {
1199         error_report("kvm_destroy_vcpu failed");
1200         exit(EXIT_FAILURE);
1201     }
1202 }
1203
1204 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1205 {
1206 }
1207
1208 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1209 {
1210     g_assert(qemu_cpu_is_self(cpu));
1211     cpu->stop = false;
1212     cpu->stopped = true;
1213     if (exit) {
1214         cpu_exit(cpu);
1215     }
1216     qemu_cond_broadcast(&qemu_pause_cond);
1217 }
1218
1219 static void qemu_wait_io_event_common(CPUState *cpu)
1220 {
1221     atomic_mb_set(&cpu->thread_kicked, false);
1222     if (cpu->stop) {
1223         qemu_cpu_stop(cpu, false);
1224     }
1225     process_queued_cpu_work(cpu);
1226 }
1227
1228 static void qemu_tcg_rr_wait_io_event(void)
1229 {
1230     CPUState *cpu;
1231
1232     while (all_cpu_threads_idle()) {
1233         stop_tcg_kick_timer();
1234         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1235     }
1236
1237     start_tcg_kick_timer();
1238
1239     CPU_FOREACH(cpu) {
1240         qemu_wait_io_event_common(cpu);
1241     }
1242 }
1243
1244 static void qemu_wait_io_event(CPUState *cpu)
1245 {
1246     while (cpu_thread_is_idle(cpu)) {
1247         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1248     }
1249
1250 #ifdef _WIN32
1251     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1252     if (!tcg_enabled()) {
1253         SleepEx(0, TRUE);
1254     }
1255 #endif
1256     qemu_wait_io_event_common(cpu);
1257 }
1258
1259 static void *qemu_kvm_cpu_thread_fn(void *arg)
1260 {
1261     CPUState *cpu = arg;
1262     int r;
1263
1264     rcu_register_thread();
1265
1266     qemu_mutex_lock_iothread();
1267     qemu_thread_get_self(cpu->thread);
1268     cpu->thread_id = qemu_get_thread_id();
1269     cpu->can_do_io = 1;
1270     current_cpu = cpu;
1271
1272     r = kvm_init_vcpu(cpu);
1273     if (r < 0) {
1274         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1275         exit(1);
1276     }
1277
1278     kvm_init_cpu_signals(cpu);
1279
1280     /* signal CPU creation */
1281     cpu->created = true;
1282     qemu_cond_signal(&qemu_cpu_cond);
1283     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1284
1285     do {
1286         if (cpu_can_run(cpu)) {
1287             r = kvm_cpu_exec(cpu);
1288             if (r == EXCP_DEBUG) {
1289                 cpu_handle_guest_debug(cpu);
1290             }
1291         }
1292         qemu_wait_io_event(cpu);
1293     } while (!cpu->unplug || cpu_can_run(cpu));
1294
1295     qemu_kvm_destroy_vcpu(cpu);
1296     cpu->created = false;
1297     qemu_cond_signal(&qemu_cpu_cond);
1298     qemu_mutex_unlock_iothread();
1299     rcu_unregister_thread();
1300     return NULL;
1301 }
1302
1303 static void *qemu_dummy_cpu_thread_fn(void *arg)
1304 {
1305 #ifdef _WIN32
1306     error_report("qtest is not supported under Windows");
1307     exit(1);
1308 #else
1309     CPUState *cpu = arg;
1310     sigset_t waitset;
1311     int r;
1312
1313     rcu_register_thread();
1314
1315     qemu_mutex_lock_iothread();
1316     qemu_thread_get_self(cpu->thread);
1317     cpu->thread_id = qemu_get_thread_id();
1318     cpu->can_do_io = 1;
1319     current_cpu = cpu;
1320
1321     sigemptyset(&waitset);
1322     sigaddset(&waitset, SIG_IPI);
1323
1324     /* signal CPU creation */
1325     cpu->created = true;
1326     qemu_cond_signal(&qemu_cpu_cond);
1327     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1328
1329     do {
1330         qemu_mutex_unlock_iothread();
1331         do {
1332             int sig;
1333             r = sigwait(&waitset, &sig);
1334         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1335         if (r == -1) {
1336             perror("sigwait");
1337             exit(1);
1338         }
1339         qemu_mutex_lock_iothread();
1340         qemu_wait_io_event(cpu);
1341     } while (!cpu->unplug);
1342
1343     qemu_mutex_unlock_iothread();
1344     rcu_unregister_thread();
1345     return NULL;
1346 #endif
1347 }
1348
1349 static int64_t tcg_get_icount_limit(void)
1350 {
1351     int64_t deadline;
1352
1353     if (replay_mode != REPLAY_MODE_PLAY) {
1354         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1355
1356         /* Maintain prior (possibly buggy) behaviour where if no deadline
1357          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1358          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1359          * nanoseconds.
1360          */
1361         if ((deadline < 0) || (deadline > INT32_MAX)) {
1362             deadline = INT32_MAX;
1363         }
1364
1365         return qemu_icount_round(deadline);
1366     } else {
1367         return replay_get_instructions();
1368     }
1369 }
1370
1371 static void handle_icount_deadline(void)
1372 {
1373     assert(qemu_in_vcpu_thread());
1374     if (use_icount) {
1375         int64_t deadline =
1376             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1377
1378         if (deadline == 0) {
1379             /* Wake up other AioContexts.  */
1380             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1381             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1382         }
1383     }
1384 }
1385
1386 static void prepare_icount_for_run(CPUState *cpu)
1387 {
1388     if (use_icount) {
1389         int insns_left;
1390
1391         /* These should always be cleared by process_icount_data after
1392          * each vCPU execution. However, u16.high can be raised
1393          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1394          */
1395         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1396         g_assert(cpu->icount_extra == 0);
1397
1398         cpu->icount_budget = tcg_get_icount_limit();
1399         insns_left = MIN(0xffff, cpu->icount_budget);
1400         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1401         cpu->icount_extra = cpu->icount_budget - insns_left;
1402
1403         replay_mutex_lock();
1404     }
1405 }
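/*
 * For example (illustrative values): a budget of 100000 instructions is
 * split into icount_decr.u16.low = 65535 (the 16-bit down-counter
 * decremented by generated code) and icount_extra = 34465, which refills
 * the low counter later.
 */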
1406
1407 static void process_icount_data(CPUState *cpu)
1408 {
1409     if (use_icount) {
1410         /* Account for executed instructions */
1411         cpu_update_icount(cpu);
1412
1413         /* Reset the counters */
1414         cpu_neg(cpu)->icount_decr.u16.low = 0;
1415         cpu->icount_extra = 0;
1416         cpu->icount_budget = 0;
1417
1418         replay_account_executed_instructions();
1419
1420         replay_mutex_unlock();
1421     }
1422 }
1423
1424
1425 static int tcg_cpu_exec(CPUState *cpu)
1426 {
1427     int ret;
1428 #ifdef CONFIG_PROFILER
1429     int64_t ti;
1430 #endif
1431
1432     assert(tcg_enabled());
1433 #ifdef CONFIG_PROFILER
1434     ti = profile_getclock();
1435 #endif
1436     cpu_exec_start(cpu);
1437     ret = cpu_exec(cpu);
1438     cpu_exec_end(cpu);
1439 #ifdef CONFIG_PROFILER
1440     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1441                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1442 #endif
1443     return ret;
1444 }
1445
1446 /* Destroy any remaining vCPUs which have been unplugged and have
1447  * finished running
1448  */
1449 static void deal_with_unplugged_cpus(void)
1450 {
1451     CPUState *cpu;
1452
1453     CPU_FOREACH(cpu) {
1454         if (cpu->unplug && !cpu_can_run(cpu)) {
1455             qemu_tcg_destroy_vcpu(cpu);
1456             cpu->created = false;
1457             qemu_cond_signal(&qemu_cpu_cond);
1458             break;
1459         }
1460     }
1461 }
1462
1463 /* Single-threaded TCG
1464  *
1465  * In the single-threaded case each vCPU is simulated in turn. If
1466  * there is more than a single vCPU we create a simple timer to kick
1467  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1468  * This is done explicitly rather than relying on side-effects
1469  * elsewhere.
1470  */
1471
1472 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1473 {
1474     CPUState *cpu = arg;
1475
1476     assert(tcg_enabled());
1477     rcu_register_thread();
1478     tcg_register_thread();
1479
1480     qemu_mutex_lock_iothread();
1481     qemu_thread_get_self(cpu->thread);
1482
1483     cpu->thread_id = qemu_get_thread_id();
1484     cpu->created = true;
1485     cpu->can_do_io = 1;
1486     qemu_cond_signal(&qemu_cpu_cond);
1487     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1488
1489     /* wait for initial kick-off after machine start */
1490     while (first_cpu->stopped) {
1491         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1492
1493         /* process any pending work */
1494         CPU_FOREACH(cpu) {
1495             current_cpu = cpu;
1496             qemu_wait_io_event_common(cpu);
1497         }
1498     }
1499
1500     start_tcg_kick_timer();
1501
1502     cpu = first_cpu;
1503
1504     /* process any pending work */
1505     cpu->exit_request = 1;
1506
1507     while (1) {
1508         qemu_mutex_unlock_iothread();
1509         replay_mutex_lock();
1510         qemu_mutex_lock_iothread();
1511         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1512         qemu_account_warp_timer();
1513
1514         /* Run the timers here.  This is much more efficient than
1515          * waking up the I/O thread and waiting for completion.
1516          */
1517         handle_icount_deadline();
1518
1519         replay_mutex_unlock();
1520
1521         if (!cpu) {
1522             cpu = first_cpu;
1523         }
1524
1525         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1526
1527             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1528             current_cpu = cpu;
1529
1530             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1531                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1532
1533             if (cpu_can_run(cpu)) {
1534                 int r;
1535
1536                 qemu_mutex_unlock_iothread();
1537                 prepare_icount_for_run(cpu);
1538
1539                 r = tcg_cpu_exec(cpu);
1540
1541                 process_icount_data(cpu);
1542                 qemu_mutex_lock_iothread();
1543
1544                 if (r == EXCP_DEBUG) {
1545                     cpu_handle_guest_debug(cpu);
1546                     break;
1547                 } else if (r == EXCP_ATOMIC) {
1548                     qemu_mutex_unlock_iothread();
1549                     cpu_exec_step_atomic(cpu);
1550                     qemu_mutex_lock_iothread();
1551                     break;
1552                 }
1553             } else if (cpu->stop) {
1554                 if (cpu->unplug) {
1555                     cpu = CPU_NEXT(cpu);
1556                 }
1557                 break;
1558             }
1559
1560             cpu = CPU_NEXT(cpu);
1561         } /* while (cpu && !cpu->exit_request).. */
1562
1563         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1564         atomic_set(&tcg_current_rr_cpu, NULL);
1565
1566         if (cpu && cpu->exit_request) {
1567             atomic_mb_set(&cpu->exit_request, 0);
1568         }
1569
1570         if (use_icount && all_cpu_threads_idle()) {
1571             /*
1572              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1573              * in the main_loop, wake it up in order to start the warp timer.
1574              */
1575             qemu_notify_event();
1576         }
1577
1578         qemu_tcg_rr_wait_io_event();
1579         deal_with_unplugged_cpus();
1580     }
1581
1582     rcu_unregister_thread();
1583     return NULL;
1584 }
1585
1586 static void *qemu_hax_cpu_thread_fn(void *arg)
1587 {
1588     CPUState *cpu = arg;
1589     int r;
1590
1591     rcu_register_thread();
1592     qemu_mutex_lock_iothread();
1593     qemu_thread_get_self(cpu->thread);
1594
1595     cpu->thread_id = qemu_get_thread_id();
1596     cpu->created = true;
1597     cpu->halted = 0;
1598     current_cpu = cpu;
1599
1600     hax_init_vcpu(cpu);
1601     qemu_cond_signal(&qemu_cpu_cond);
1602     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1603
1604     do {
1605         if (cpu_can_run(cpu)) {
1606             r = hax_smp_cpu_exec(cpu);
1607             if (r == EXCP_DEBUG) {
1608                 cpu_handle_guest_debug(cpu);
1609             }
1610         }
1611
1612         qemu_wait_io_event(cpu);
1613     } while (!cpu->unplug || cpu_can_run(cpu));
1614     rcu_unregister_thread();
1615     return NULL;
1616 }
1617
1618 /* The HVF-specific vCPU thread function. This one should only run when the host
1619  * CPU supports the VMX "unrestricted guest" feature. */
1620 static void *qemu_hvf_cpu_thread_fn(void *arg)
1621 {
1622     CPUState *cpu = arg;
1623
1624     int r;
1625
1626     assert(hvf_enabled());
1627
1628     rcu_register_thread();
1629
1630     qemu_mutex_lock_iothread();
1631     qemu_thread_get_self(cpu->thread);
1632
1633     cpu->thread_id = qemu_get_thread_id();
1634     cpu->can_do_io = 1;
1635     current_cpu = cpu;
1636
1637     hvf_init_vcpu(cpu);
1638
1639     /* signal CPU creation */
1640     cpu->created = true;
1641     qemu_cond_signal(&qemu_cpu_cond);
1642     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1643
1644     do {
1645         if (cpu_can_run(cpu)) {
1646             r = hvf_vcpu_exec(cpu);
1647             if (r == EXCP_DEBUG) {
1648                 cpu_handle_guest_debug(cpu);
1649             }
1650         }
1651         qemu_wait_io_event(cpu);
1652     } while (!cpu->unplug || cpu_can_run(cpu));
1653
1654     hvf_vcpu_destroy(cpu);
1655     cpu->created = false;
1656     qemu_cond_signal(&qemu_cpu_cond);
1657     qemu_mutex_unlock_iothread();
1658     rcu_unregister_thread();
1659     return NULL;
1660 }
1661
1662 static void *qemu_whpx_cpu_thread_fn(void *arg)
1663 {
1664     CPUState *cpu = arg;
1665     int r;
1666
1667     rcu_register_thread();
1668
1669     qemu_mutex_lock_iothread();
1670     qemu_thread_get_self(cpu->thread);
1671     cpu->thread_id = qemu_get_thread_id();
1672     current_cpu = cpu;
1673
1674     r = whpx_init_vcpu(cpu);
1675     if (r < 0) {
1676         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1677         exit(1);
1678     }
1679
1680     /* signal CPU creation */
1681     cpu->created = true;
1682     qemu_cond_signal(&qemu_cpu_cond);
1683     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1684
1685     do {
1686         if (cpu_can_run(cpu)) {
1687             r = whpx_vcpu_exec(cpu);
1688             if (r == EXCP_DEBUG) {
1689                 cpu_handle_guest_debug(cpu);
1690             }
1691         }
1692         while (cpu_thread_is_idle(cpu)) {
1693             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1694         }
1695         qemu_wait_io_event_common(cpu);
1696     } while (!cpu->unplug || cpu_can_run(cpu));
1697
1698     whpx_destroy_vcpu(cpu);
1699     cpu->created = false;
1700     qemu_cond_signal(&qemu_cpu_cond);
1701     qemu_mutex_unlock_iothread();
1702     rcu_unregister_thread();
1703     return NULL;
1704 }
1705
1706 #ifdef _WIN32
1707 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1708 {
1709 }
1710 #endif
1711
1712 /* Multi-threaded TCG
1713  *
1714  * In the multi-threaded case each vCPU has its own thread. The TLS
1715  * variable current_cpu can be used deep in the code to find the
1716  * current CPUState for a given thread.
1717  */
1718
1719 static void *qemu_tcg_cpu_thread_fn(void *arg)
1720 {
1721     CPUState *cpu = arg;
1722
1723     assert(tcg_enabled());
1724     g_assert(!use_icount);
1725
1726     rcu_register_thread();
1727     tcg_register_thread();
1728
1729     qemu_mutex_lock_iothread();
1730     qemu_thread_get_self(cpu->thread);
1731
1732     cpu->thread_id = qemu_get_thread_id();
1733     cpu->created = true;
1734     cpu->can_do_io = 1;
1735     current_cpu = cpu;
1736     qemu_cond_signal(&qemu_cpu_cond);
1737     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1738
1739     /* process any pending work */
1740     cpu->exit_request = 1;
1741
1742     do {
1743         if (cpu_can_run(cpu)) {
1744             int r;
1745             qemu_mutex_unlock_iothread();
1746             r = tcg_cpu_exec(cpu);
1747             qemu_mutex_lock_iothread();
1748             switch (r) {
1749             case EXCP_DEBUG:
1750                 cpu_handle_guest_debug(cpu);
1751                 break;
1752             case EXCP_HALTED:
1753                 /* during start-up the vCPU is reset and the thread is
1754                  * kicked several times. If we don't ensure we go back
1755                  * to sleep in the halted state we won't cleanly
1756                  * start up when the vCPU is enabled.
1757                  *
1758                  * cpu->halted should ensure we sleep in wait_io_event
1759                  */
1760                 g_assert(cpu->halted);
1761                 break;
1762             case EXCP_ATOMIC:
1763                 qemu_mutex_unlock_iothread();
1764                 cpu_exec_step_atomic(cpu);
1765                 qemu_mutex_lock_iothread();
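                /* fall through to the default case */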
1766             default:
1767                 /* Ignore everything else? */
1768                 break;
1769             }
1770         }
1771
1772         atomic_mb_set(&cpu->exit_request, 0);
1773         qemu_wait_io_event(cpu);
1774     } while (!cpu->unplug || cpu_can_run(cpu));
1775
1776     qemu_tcg_destroy_vcpu(cpu);
1777     cpu->created = false;
1778     qemu_cond_signal(&qemu_cpu_cond);
1779     qemu_mutex_unlock_iothread();
1780     rcu_unregister_thread();
1781     return NULL;
1782 }
1783
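/*
 * Wake the vCPU thread so it notices pending work: on POSIX hosts by
 * sending it SIG_IPI; on Windows via a WHPX kick or by queueing a dummy
 * APC so that an alertable wait in the thread returns.
 */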
1784 static void qemu_cpu_kick_thread(CPUState *cpu)
1785 {
1786 #ifndef _WIN32
1787     int err;
1788
1789     if (cpu->thread_kicked) {
1790         return;
1791     }
1792     cpu->thread_kicked = true;
1793     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1794     if (err && err != ESRCH) {
1795         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1796         exit(1);
1797     }
1798 #else /* _WIN32 */
1799     if (!qemu_cpu_is_self(cpu)) {
1800         if (whpx_enabled()) {
1801             whpx_vcpu_kick(cpu);
1802         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1803             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1804                     __func__, GetLastError());
1805             exit(1);
1806         }
1807     }
1808 #endif
1809 }
1810
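/*
 * Kick a vCPU out of its current wait or guest execution.  The halt
 * condition is broadcast first; for TCG the CPU is asked to exit the
 * execution loop (and the round-robin thread is kicked), for hardware
 * accelerators the host thread itself is signalled.
 */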
1811 void qemu_cpu_kick(CPUState *cpu)
1812 {
1813     qemu_cond_broadcast(cpu->halt_cond);
1814     if (tcg_enabled()) {
1815         cpu_exit(cpu);
1816         /* NOP unless doing single-thread RR */
1817         qemu_cpu_kick_rr_cpu();
1818     } else {
1819         if (hax_enabled()) {
1820             /*
1821              * FIXME: race condition with the exit_request check in
1822              * hax_vcpu_hax_exec
1823              */
1824             cpu->exit_request = 1;
1825         }
1826         qemu_cpu_kick_thread(cpu);
1827     }
1828 }
1829
1830 void qemu_cpu_kick_self(void)
1831 {
1832     assert(current_cpu);
1833     qemu_cpu_kick_thread(current_cpu);
1834 }
1835
1836 bool qemu_cpu_is_self(CPUState *cpu)
1837 {
1838     return qemu_thread_is_self(cpu->thread);
1839 }
1840
1841 bool qemu_in_vcpu_thread(void)
1842 {
1843     return current_cpu && qemu_cpu_is_self(current_cpu);
1844 }
1845
1846 static __thread bool iothread_locked = false;
1847
1848 bool qemu_mutex_iothread_locked(void)
1849 {
1850     return iothread_locked;
1851 }
1852
1853 /*
1854  * The BQL is taken from so many places that it is worth profiling the
1855  * callers directly, instead of funneling them all through a single function.
1856  */
1857 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1858 {
1859     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1860
1861     g_assert(!qemu_mutex_iothread_locked());
1862     bql_lock(&qemu_global_mutex, file, line);
1863     iothread_locked = true;
1864 }
1865
1866 void qemu_mutex_unlock_iothread(void)
1867 {
1868     g_assert(qemu_mutex_iothread_locked());
1869     iothread_locked = false;
1870     qemu_mutex_unlock(&qemu_global_mutex);
1871 }
1872
1873 static bool all_vcpus_paused(void)
1874 {
1875     CPUState *cpu;
1876
1877     CPU_FOREACH(cpu) {
1878         if (!cpu->stopped) {
1879             return false;
1880         }
1881     }
1882
1883     return true;
1884 }
1885
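/*
 * Stop every vCPU and wait, with the BQL held, until all of them report
 * themselves as stopped.  The replay lock is dropped while waiting so that
 * woken vCPU threads can finish their pending replay work.
 */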
1886 void pause_all_vcpus(void)
1887 {
1888     CPUState *cpu;
1889
1890     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1891     CPU_FOREACH(cpu) {
1892         if (qemu_cpu_is_self(cpu)) {
1893             qemu_cpu_stop(cpu, true);
1894         } else {
1895             cpu->stop = true;
1896             qemu_cpu_kick(cpu);
1897         }
1898     }
1899
1900     /* We need to drop the replay_lock so any vCPU threads woken up
1901      * can finish their replay tasks
1902      */
1903     replay_mutex_unlock();
1904
1905     while (!all_vcpus_paused()) {
1906         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1907         CPU_FOREACH(cpu) {
1908             qemu_cpu_kick(cpu);
1909         }
1910     }
1911
1912     qemu_mutex_unlock_iothread();
1913     replay_mutex_lock();
1914     qemu_mutex_lock_iothread();
1915 }
1916
1917 void cpu_resume(CPUState *cpu)
1918 {
1919     cpu->stop = false;
1920     cpu->stopped = false;
1921     qemu_cpu_kick(cpu);
1922 }
1923
1924 void resume_all_vcpus(void)
1925 {
1926     CPUState *cpu;
1927
1928     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1929     CPU_FOREACH(cpu) {
1930         cpu_resume(cpu);
1931     }
1932 }
1933
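/* Request unplug of a vCPU and wait for its thread to finish. */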
1934 void cpu_remove_sync(CPUState *cpu)
1935 {
1936     cpu->stop = true;
1937     cpu->unplug = true;
1938     qemu_cpu_kick(cpu);
1939     qemu_mutex_unlock_iothread();
1940     qemu_thread_join(cpu->thread);
1941     qemu_mutex_lock_iothread();
1942 }
1943
1944 /* Size of temporary buffers used to form a vCPU thread name */
1945 #define VCPU_THREAD_NAME_SIZE 16
1946
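/*
 * Start TCG execution for @cpu: one dedicated thread per vCPU under MTTCG,
 * otherwise all vCPUs share the single round-robin TCG thread.
 */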
1947 static void qemu_tcg_init_vcpu(CPUState *cpu)
1948 {
1949     char thread_name[VCPU_THREAD_NAME_SIZE];
1950     static QemuCond *single_tcg_halt_cond;
1951     static QemuThread *single_tcg_cpu_thread;
1952     static int tcg_region_inited;
1953
1954     assert(tcg_enabled());
1955     /*
1956      * Initialize TCG regions--once. Now is a good time, because:
1957      * (1) TCG's init context, prologue and target globals have been set up.
1958      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1959      *     -accel flag is processed, so the check doesn't work then).
1960      */
1961     if (!tcg_region_inited) {
1962         tcg_region_inited = 1;
1963         tcg_region_init();
1964     }
1965
1966     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1967         cpu->thread = g_malloc0(sizeof(QemuThread));
1968         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1969         qemu_cond_init(cpu->halt_cond);
1970
1971         if (qemu_tcg_mttcg_enabled()) {
1972             /* create a thread per vCPU with TCG (MTTCG) */
1973             parallel_cpus = true;
1974             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1975                  cpu->cpu_index);
1976
1977             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1978                                cpu, QEMU_THREAD_JOINABLE);
1979
1980         } else {
1981             /* share a single round-robin thread for all vCPUs with TCG */
1982             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1983             qemu_thread_create(cpu->thread, thread_name,
1984                                qemu_tcg_rr_cpu_thread_fn,
1985                                cpu, QEMU_THREAD_JOINABLE);
1986
1987             single_tcg_halt_cond = cpu->halt_cond;
1988             single_tcg_cpu_thread = cpu->thread;
1989         }
1990 #ifdef _WIN32
1991         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1992 #endif
1993     } else {
1994         /* For non-MTTCG cases we share the thread */
1995         cpu->thread = single_tcg_cpu_thread;
1996         cpu->halt_cond = single_tcg_halt_cond;
1997         cpu->thread_id = first_cpu->thread_id;
1998         cpu->can_do_io = 1;
1999         cpu->created = true;
2000     }
2001 }
2002
2003 static void qemu_hax_start_vcpu(CPUState *cpu)
2004 {
2005     char thread_name[VCPU_THREAD_NAME_SIZE];
2006
2007     cpu->thread = g_malloc0(sizeof(QemuThread));
2008     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2009     qemu_cond_init(cpu->halt_cond);
2010
2011     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2012              cpu->cpu_index);
2013     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2014                        cpu, QEMU_THREAD_JOINABLE);
2015 #ifdef _WIN32
2016     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2017 #endif
2018 }
2019
2020 static void qemu_kvm_start_vcpu(CPUState *cpu)
2021 {
2022     char thread_name[VCPU_THREAD_NAME_SIZE];
2023
2024     cpu->thread = g_malloc0(sizeof(QemuThread));
2025     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2026     qemu_cond_init(cpu->halt_cond);
2027     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2028              cpu->cpu_index);
2029     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2030                        cpu, QEMU_THREAD_JOINABLE);
2031 }
2032
2033 static void qemu_hvf_start_vcpu(CPUState *cpu)
2034 {
2035     char thread_name[VCPU_THREAD_NAME_SIZE];
2036
2037     /* HVF currently does not support TCG, and only runs in
2038      * unrestricted-guest mode. */
2039     assert(hvf_enabled());
2040
2041     cpu->thread = g_malloc0(sizeof(QemuThread));
2042     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2043     qemu_cond_init(cpu->halt_cond);
2044
2045     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2046              cpu->cpu_index);
2047     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2048                        cpu, QEMU_THREAD_JOINABLE);
2049 }
2050
2051 static void qemu_whpx_start_vcpu(CPUState *cpu)
2052 {
2053     char thread_name[VCPU_THREAD_NAME_SIZE];
2054
2055     cpu->thread = g_malloc0(sizeof(QemuThread));
2056     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2057     qemu_cond_init(cpu->halt_cond);
2058     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2059              cpu->cpu_index);
2060     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2061                        cpu, QEMU_THREAD_JOINABLE);
2062 #ifdef _WIN32
2063     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2064 #endif
2065 }
2066
2067 static void qemu_dummy_start_vcpu(CPUState *cpu)
2068 {
2069     char thread_name[VCPU_THREAD_NAME_SIZE];
2070
2071     cpu->thread = g_malloc0(sizeof(QemuThread));
2072     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2073     qemu_cond_init(cpu->halt_cond);
2074     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2075              cpu->cpu_index);
2076     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2077                        QEMU_THREAD_JOINABLE);
2078 }
2079
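/*
 * Create the execution thread for @cpu using whichever accelerator is
 * enabled, then wait until that thread has signalled that the vCPU has
 * been created.
 */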
2080 void qemu_init_vcpu(CPUState *cpu)
2081 {
2082     cpu->nr_cores = smp_cores;
2083     cpu->nr_threads = smp_threads;
2084     cpu->stopped = true;
2085     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2086
2087     if (!cpu->as) {
2088         /* If the target cpu hasn't set up any address spaces itself,
2089          * give it the default one.
2090          */
2091         cpu->num_ases = 1;
2092         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2093     }
2094
2095     if (kvm_enabled()) {
2096         qemu_kvm_start_vcpu(cpu);
2097     } else if (hax_enabled()) {
2098         qemu_hax_start_vcpu(cpu);
2099     } else if (hvf_enabled()) {
2100         qemu_hvf_start_vcpu(cpu);
2101     } else if (tcg_enabled()) {
2102         qemu_tcg_init_vcpu(cpu);
2103     } else if (whpx_enabled()) {
2104         qemu_whpx_start_vcpu(cpu);
2105     } else {
2106         qemu_dummy_start_vcpu(cpu);
2107     }
2108
2109     while (!cpu->created) {
2110         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2111     }
2112 }
2113
2114 void cpu_stop_current(void)
2115 {
2116     if (current_cpu) {
2117         current_cpu->stop = true;
2118         cpu_exit(current_cpu);
2119     }
2120 }
2121
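/*
 * Stop the VM.  When called from a vCPU thread the stop is only requested
 * (see the FIXME below) and is completed outside the vCPU thread; otherwise
 * the VM is stopped synchronously.
 */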
2122 int vm_stop(RunState state)
2123 {
2124     if (qemu_in_vcpu_thread()) {
2125         qemu_system_vmstop_request_prepare();
2126         qemu_system_vmstop_request(state);
2127         /*
2128          * FIXME: should not return to device code in case
2129          * vm_stop() has been requested.
2130          */
2131         cpu_stop_current();
2132         return 0;
2133     }
2134
2135     return do_vm_stop(state, true);
2136 }
2137
2138 /**
2139  * Prepare for (re)starting the VM.
2140  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2141  * running or in case of an error condition), 0 otherwise.
2142  */
2143 int vm_prepare_start(void)
2144 {
2145     RunState requested;
2146
2147     qemu_vmstop_requested(&requested);
2148     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2149         return -1;
2150     }
2151
2152     /* Ensure that a STOP/RESUME pair of events is emitted if a
2153      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2154      * example, is documented to always be followed by the STOP
2155      * event.
2156      */
2157     if (runstate_is_running()) {
2158         qapi_event_send_stop();
2159         qapi_event_send_resume();
2160         return -1;
2161     }
2162
2163     /* We send this now, but the CPUs will actually be resumed shortly afterwards */
2164     qapi_event_send_resume();
2165
2166     replay_enable_events();
2167     cpu_enable_ticks();
2168     runstate_set(RUN_STATE_RUNNING);
2169     vm_state_notify(1, RUN_STATE_RUNNING);
2170     return 0;
2171 }
2172
2173 void vm_start(void)
2174 {
2175     if (!vm_prepare_start()) {
2176         resume_all_vcpus();
2177     }
2178 }
2179
2180 /* Does a state transition even if the VM is already stopped;
2181    the current state is forgotten forever. */
2182 int vm_stop_force_state(RunState state)
2183 {
2184     if (runstate_is_running()) {
2185         return vm_stop(state);
2186     } else {
2187         runstate_set(state);
2188
2189         bdrv_drain_all();
2190         /* Make sure to return an error if the flush in a previous vm_stop()
2191          * failed. */
2192         return bdrv_flush_all();
2193     }
2194 }
2195
2196 void list_cpus(const char *optarg)
2197 {
2198     /* XXX: implement xxx_cpu_list for targets that still lack it */
2199 #if defined(cpu_list)
2200     cpu_list();
2201 #endif
2202 }
2203
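/*
 * QMP 'query-cpus': unlike the "fast" variant below, this synchronizes
 * each vCPU's register state first, which may interrupt running vCPU
 * threads.
 */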
2204 CpuInfoList *qmp_query_cpus(Error **errp)
2205 {
2206     MachineState *ms = MACHINE(qdev_get_machine());
2207     MachineClass *mc = MACHINE_GET_CLASS(ms);
2208     CpuInfoList *head = NULL, *cur_item = NULL;
2209     CPUState *cpu;
2210
2211     CPU_FOREACH(cpu) {
2212         CpuInfoList *info;
2213 #if defined(TARGET_I386)
2214         X86CPU *x86_cpu = X86_CPU(cpu);
2215         CPUX86State *env = &x86_cpu->env;
2216 #elif defined(TARGET_PPC)
2217         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2218         CPUPPCState *env = &ppc_cpu->env;
2219 #elif defined(TARGET_SPARC)
2220         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2221         CPUSPARCState *env = &sparc_cpu->env;
2222 #elif defined(TARGET_RISCV)
2223         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2224         CPURISCVState *env = &riscv_cpu->env;
2225 #elif defined(TARGET_MIPS)
2226         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2227         CPUMIPSState *env = &mips_cpu->env;
2228 #elif defined(TARGET_TRICORE)
2229         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2230         CPUTriCoreState *env = &tricore_cpu->env;
2231 #elif defined(TARGET_S390X)
2232         S390CPU *s390_cpu = S390_CPU(cpu);
2233         CPUS390XState *env = &s390_cpu->env;
2234 #endif
2235
2236         cpu_synchronize_state(cpu);
2237
2238         info = g_malloc0(sizeof(*info));
2239         info->value = g_malloc0(sizeof(*info->value));
2240         info->value->CPU = cpu->cpu_index;
2241         info->value->current = (cpu == first_cpu);
2242         info->value->halted = cpu->halted;
2243         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2244         info->value->thread_id = cpu->thread_id;
2245 #if defined(TARGET_I386)
2246         info->value->arch = CPU_INFO_ARCH_X86;
2247         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2248 #elif defined(TARGET_PPC)
2249         info->value->arch = CPU_INFO_ARCH_PPC;
2250         info->value->u.ppc.nip = env->nip;
2251 #elif defined(TARGET_SPARC)
2252         info->value->arch = CPU_INFO_ARCH_SPARC;
2253         info->value->u.q_sparc.pc = env->pc;
2254         info->value->u.q_sparc.npc = env->npc;
2255 #elif defined(TARGET_MIPS)
2256         info->value->arch = CPU_INFO_ARCH_MIPS;
2257         info->value->u.q_mips.PC = env->active_tc.PC;
2258 #elif defined(TARGET_TRICORE)
2259         info->value->arch = CPU_INFO_ARCH_TRICORE;
2260         info->value->u.tricore.PC = env->PC;
2261 #elif defined(TARGET_S390X)
2262         info->value->arch = CPU_INFO_ARCH_S390;
2263         info->value->u.s390.cpu_state = env->cpu_state;
2264 #elif defined(TARGET_RISCV)
2265         info->value->arch = CPU_INFO_ARCH_RISCV;
2266         info->value->u.riscv.pc = env->pc;
2267 #else
2268         info->value->arch = CPU_INFO_ARCH_OTHER;
2269 #endif
2270         info->value->has_props = !!mc->cpu_index_to_instance_props;
2271         if (info->value->has_props) {
2272             CpuInstanceProperties *props;
2273             props = g_malloc0(sizeof(*props));
2274             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2275             info->value->props = props;
2276         }
2277
2278         /* XXX: waiting for the qapi to support GSList */
2279         if (!cur_item) {
2280             head = cur_item = info;
2281         } else {
2282             cur_item->next = info;
2283             cur_item = info;
2284         }
2285     }
2286
2287     return head;
2288 }
2289
2290 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2291 {
2292     /*
2293      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2294      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2295      */
2296     switch (target) {
2297     case SYS_EMU_TARGET_I386:
2298     case SYS_EMU_TARGET_X86_64:
2299         return CPU_INFO_ARCH_X86;
2300
2301     case SYS_EMU_TARGET_PPC:
2302     case SYS_EMU_TARGET_PPC64:
2303         return CPU_INFO_ARCH_PPC;
2304
2305     case SYS_EMU_TARGET_SPARC:
2306     case SYS_EMU_TARGET_SPARC64:
2307         return CPU_INFO_ARCH_SPARC;
2308
2309     case SYS_EMU_TARGET_MIPS:
2310     case SYS_EMU_TARGET_MIPSEL:
2311     case SYS_EMU_TARGET_MIPS64:
2312     case SYS_EMU_TARGET_MIPS64EL:
2313         return CPU_INFO_ARCH_MIPS;
2314
2315     case SYS_EMU_TARGET_TRICORE:
2316         return CPU_INFO_ARCH_TRICORE;
2317
2318     case SYS_EMU_TARGET_S390X:
2319         return CPU_INFO_ARCH_S390;
2320
2321     case SYS_EMU_TARGET_RISCV32:
2322     case SYS_EMU_TARGET_RISCV64:
2323         return CPU_INFO_ARCH_RISCV;
2324
2325     default:
2326         return CPU_INFO_ARCH_OTHER;
2327     }
2328 }
2329
2330 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2331 {
2332 #ifdef TARGET_S390X
2333     S390CPU *s390_cpu = S390_CPU(cpu);
2334     CPUS390XState *env = &s390_cpu->env;
2335
2336     info->cpu_state = env->cpu_state;
2337 #else
2338     abort();
2339 #endif
2340 }
2341
2342 /*
2343  * "fast" means: we NEVER interrupt vCPU threads to retrieve
2344  * information from KVM.
2345  */
2346 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2347 {
2348     MachineState *ms = MACHINE(qdev_get_machine());
2349     MachineClass *mc = MACHINE_GET_CLASS(ms);
2350     CpuInfoFastList *head = NULL, *cur_item = NULL;
2351     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2352                                           -1, &error_abort);
2353     CPUState *cpu;
2354
2355     CPU_FOREACH(cpu) {
2356         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2357         info->value = g_malloc0(sizeof(*info->value));
2358
2359         info->value->cpu_index = cpu->cpu_index;
2360         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2361         info->value->thread_id = cpu->thread_id;
2362
2363         info->value->has_props = !!mc->cpu_index_to_instance_props;
2364         if (info->value->has_props) {
2365             CpuInstanceProperties *props;
2366             props = g_malloc0(sizeof(*props));
2367             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2368             info->value->props = props;
2369         }
2370
2371         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2372         info->value->target = target;
2373         if (target == SYS_EMU_TARGET_S390X) {
2374             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2375         }
2376
2377         if (!cur_item) {
2378             head = cur_item = info;
2379         } else {
2380             cur_item->next = info;
2381             cur_item = info;
2382         }
2383     }
2384
2385     return head;
2386 }
2387
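/*
 * QMP 'memsave': write @size bytes of guest virtual memory starting at
 * @addr, read in the context of the selected vCPU, to @filename.
 */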
2388 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2389                  bool has_cpu, int64_t cpu_index, Error **errp)
2390 {
2391     FILE *f;
2392     uint32_t l;
2393     CPUState *cpu;
2394     uint8_t buf[1024];
2395     int64_t orig_addr = addr, orig_size = size;
2396
2397     if (!has_cpu) {
2398         cpu_index = 0;
2399     }
2400
2401     cpu = qemu_get_cpu(cpu_index);
2402     if (cpu == NULL) {
2403         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2404                    "a CPU number");
2405         return;
2406     }
2407
2408     f = fopen(filename, "wb");
2409     if (!f) {
2410         error_setg_file_open(errp, errno, filename);
2411         return;
2412     }
2413
2414     while (size != 0) {
2415         l = sizeof(buf);
2416         if (l > size)
2417             l = size;
2418         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2419             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2420                              " specified", orig_addr, orig_size);
2421             goto exit;
2422         }
2423         if (fwrite(buf, 1, l, f) != l) {
2424             error_setg(errp, QERR_IO_ERROR);
2425             goto exit;
2426         }
2427         addr += l;
2428         size -= l;
2429     }
2430
2431 exit:
2432     fclose(f);
2433 }
2434
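/*
 * QMP 'pmemsave': like memsave, but reads from guest physical memory
 * rather than a vCPU's virtual address space.
 */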
2435 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2436                   Error **errp)
2437 {
2438     FILE *f;
2439     uint32_t l;
2440     uint8_t buf[1024];
2441
2442     f = fopen(filename, "wb");
2443     if (!f) {
2444         error_setg_file_open(errp, errno, filename);
2445         return;
2446     }
2447
2448     while (size != 0) {
2449         l = sizeof(buf);
2450         if (l > size)
2451             l = size;
2452         cpu_physical_memory_read(addr, buf, l);
2453         if (fwrite(buf, 1, l, f) != l) {
2454             error_setg(errp, QERR_IO_ERROR);
2455             goto exit;
2456         }
2457         addr += l;
2458         size -= l;
2459     }
2460
2461 exit:
2462     fclose(f);
2463 }
2464
2465 void qmp_inject_nmi(Error **errp)
2466 {
2467     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2468 }
2469
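/* Print icount clock drift statistics; a no-op unless icount is in use. */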
2470 void dump_drift_info(void)
2471 {
2472     if (!use_icount) {
2473         return;
2474     }
2475
2476     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2477                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2478     if (icount_align_option) {
2479         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2480                     -max_delay / SCALE_MS);
2481         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2482                     max_advance / SCALE_MS);
2483     } else {
2484         qemu_printf("Max guest delay     NA\n");
2485         qemu_printf("Max guest advance   NA\n");
2486     }
2487 }