4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
59 #include <sys/prctl.h>
62 #define PR_MCE_KILL 33
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
73 #endif /* CONFIG_LINUX */
78 /* vcpu throttling controls */
79 static QEMUTimer *throttle_timer;
80 static unsigned int throttle_percentage;
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86 bool cpu_is_stopped(CPUState *cpu)
88 return cpu->stopped || !runstate_is_running();
91 static bool cpu_thread_is_idle(CPUState *cpu)
93 if (cpu->stop || cpu->queued_work_first) {
96 if (cpu_is_stopped(cpu)) {
99 if (!cpu->halted || cpu_has_work(cpu) ||
100 kvm_halt_in_kernel()) {
106 static bool all_cpu_threads_idle(void)
111 if (!cpu_thread_is_idle(cpu)) {
118 /***********************************************************/
119 /* guest cycle counter */
121 /* Protected by TimersState seqlock */
123 static bool icount_sleep = true;
124 /* Arbitrarily pick 1 MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
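/* Illustrative arithmetic based on cpu_icount_to_ns() below: at the maximum
 * shift of 10, one guest instruction accounts for 2^10 = 1024 ns of virtual
 * time, i.e. a floor of roughly one million instructions per second -- the
 * 1 MIPS mentioned above.
 */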
127 typedef struct TimersState {
128 /* Protected by BQL. */
129 int64_t cpu_ticks_prev;
130 int64_t cpu_ticks_offset;
132 /* Protect fields that can be read outside the BQL and written
133 * from multiple threads.
135 QemuSeqLock vm_clock_seqlock;
136 QemuSpin vm_clock_lock;
138 int16_t cpu_ticks_enabled;
140 /* Conversion factor from emulated instructions to virtual clock ticks. */
141 int16_t icount_time_shift;
143 /* Compensate for varying guest execution speed. */
144 int64_t qemu_icount_bias;
146 int64_t vm_clock_warp_start;
147 int64_t cpu_clock_offset;
149 /* Only written by TCG thread */
152 /* for adjusting icount */
153 QEMUTimer *icount_rt_timer;
154 QEMUTimer *icount_vm_timer;
155 QEMUTimer *icount_warp_timer;
158 static TimersState timers_state;
162 * We default to false if we know other options have been enabled
163 * which are currently incompatible with MTTCG. Otherwise when each
164 * guest (target) has been updated to support:
165 * - atomic instructions
166 * - memory ordering primitives (barriers)
167 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
169 * Once a guest architecture has been converted to the new primitives
170 * there are two remaining limitations to check.
172 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
173 * - The host must have a stronger memory order than the guest
175 * It may be possible in future to support strong guests on weak hosts
176 * but that will require tagging all load/stores in a guest with their
177 * implicit memory order requirements, which would likely slow things down.
181 static bool check_tcg_memory_orders_compatible(void)
183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
184 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
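/* For example, a strongly ordered guest (such as x86) on a weakly ordered
 * host (such as Arm) leaves guest ordering bits the host does not provide,
 * so the check above fails and MTTCG stays disabled by default.
 */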
190 static bool default_mttcg_enabled(void)
192 if (use_icount || TCG_OVERSIZED_GUEST) {
195 #ifdef TARGET_SUPPORTS_MTTCG
196 return check_tcg_memory_orders_compatible();
203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
205 const char *t = qemu_opt_get(opts, "thread");
207 if (strcmp(t, "multi") == 0) {
208 if (TCG_OVERSIZED_GUEST) {
209 error_setg(errp, "No MTTCG when guest word size > host's");
210 } else if (use_icount) {
211 error_setg(errp, "No MTTCG when icount is enabled");
213 #ifndef TARGET_SUPPORTS_MTTCG
214 error_report("Guest not yet converted to MTTCG - "
215 "you may get unexpected results");
217 if (!check_tcg_memory_orders_compatible()) {
218 error_report("Guest expects a stronger memory ordering "
219 "than the host provides");
220 error_printf("This may cause strange/hard to debug errors\n");
222 mttcg_enabled = true;
224 } else if (strcmp(t, "single") == 0) {
225 mttcg_enabled = false;
227 error_setg(errp, "Invalid 'thread' setting %s", t);
230 mttcg_enabled = default_mttcg_enabled();
234 /* The current number of executed instructions is based on what we
235 * originally budgeted minus the current state of the decrementing
236 * icount counters in extra/u16.low.
238 static int64_t cpu_get_icount_executed(CPUState *cpu)
240 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
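/* Illustrative numbers: with a budget of 100000 instructions,
 * prepare_icount_for_run() below puts 0xffff (65535) into icount_decr.u16.low
 * and the remaining 34465 into icount_extra. After 1000 instructions have
 * executed, u16.low has counted down to 64535, so
 * 100000 - (64535 + 34465) == 1000 executed instructions.
 */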
244 * Update the global shared timers_state.qemu_icount to take into
245 * account executed instructions. This is done by the TCG vCPU
246 * thread so the main-loop can see time has moved forward.
248 void cpu_update_icount(CPUState *cpu)
250 int64_t executed = cpu_get_icount_executed(cpu);
251 cpu->icount_budget -= executed;
253 #ifndef CONFIG_ATOMIC64
254 seqlock_write_lock(&timers_state.vm_clock_seqlock,
255 &timers_state.vm_clock_lock);
257 atomic_set__nocheck(&timers_state.qemu_icount,
258 timers_state.qemu_icount + executed);
259 #ifndef CONFIG_ATOMIC64
260 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
261 &timers_state.vm_clock_lock);
265 static int64_t cpu_get_icount_raw_locked(void)
267 CPUState *cpu = current_cpu;
269 if (cpu && cpu->running) {
270 if (!cpu->can_do_io) {
271 error_report("Bad icount read");
274 /* Take into account what has run */
275 cpu_update_icount(cpu);
277 /* The read is protected by the seqlock, so __nocheck is okay. */
278 return atomic_read__nocheck(&timers_state.qemu_icount);
281 static int64_t cpu_get_icount_locked(void)
283 int64_t icount = cpu_get_icount_raw_locked();
284 return atomic_read__nocheck(&timers_state.qemu_icount_bias) + cpu_icount_to_ns(icount);
287 int64_t cpu_get_icount_raw(void)
293 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
294 icount = cpu_get_icount_raw_locked();
295 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
300 /* Return the virtual CPU time, based on the instruction counter. */
301 int64_t cpu_get_icount(void)
307 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
308 icount = cpu_get_icount_locked();
309 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
314 int64_t cpu_icount_to_ns(int64_t icount)
316 return icount << atomic_read(&timers_state.icount_time_shift);
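/* For example, with the "auto" default of shift == 3 chosen in
 * configure_icount() below, one instruction maps to 1 << 3 = 8 ns of virtual
 * time, which is the 125 MIPS initial guess: 10^9 / 8 = 125 million
 * instructions per second.
 */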
319 /* return the time elapsed in VM between vm_start and vm_stop. Unless
320 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
323 * Caller must hold the BQL
325 int64_t cpu_get_ticks(void)
330 return cpu_get_icount();
333 ticks = timers_state.cpu_ticks_offset;
334 if (timers_state.cpu_ticks_enabled) {
335 ticks += cpu_get_host_ticks();
338 if (timers_state.cpu_ticks_prev > ticks) {
339 /* Note: non-increasing ticks may happen if the host uses
341 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
342 ticks = timers_state.cpu_ticks_prev;
345 timers_state.cpu_ticks_prev = ticks;
349 static int64_t cpu_get_clock_locked(void)
353 time = timers_state.cpu_clock_offset;
354 if (timers_state.cpu_ticks_enabled) {
361 /* Return the monotonic time elapsed in VM, i.e.,
362 * the time between vm_start and vm_stop
364 int64_t cpu_get_clock(void)
370 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
371 ti = cpu_get_clock_locked();
372 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
377 /* enable cpu_get_ticks()
378 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
380 void cpu_enable_ticks(void)
382 seqlock_write_lock(&timers_state.vm_clock_seqlock,
383 &timers_state.vm_clock_lock);
384 if (!timers_state.cpu_ticks_enabled) {
385 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
386 timers_state.cpu_clock_offset -= get_clock();
387 timers_state.cpu_ticks_enabled = 1;
389 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
390 &timers_state.vm_clock_lock);
393 /* disable cpu_get_ticks(): the clock is stopped. You must not call
394 * cpu_get_ticks() after that.
395 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
397 void cpu_disable_ticks(void)
399 seqlock_write_lock(&timers_state.vm_clock_seqlock,
400 &timers_state.vm_clock_lock);
401 if (timers_state.cpu_ticks_enabled) {
402 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
403 timers_state.cpu_clock_offset = cpu_get_clock_locked();
404 timers_state.cpu_ticks_enabled = 0;
406 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
407 &timers_state.vm_clock_lock);
410 /* Correlation between real and virtual time is always going to be
411 fairly approximate, so ignore small variation.
412 When the guest is idle, real and virtual time will be aligned in
414 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
416 static void icount_adjust(void)
422 /* Protected by TimersState mutex. */
423 static int64_t last_delta;
425 /* If the VM is not running, then do nothing. */
426 if (!runstate_is_running()) {
430 seqlock_write_lock(&timers_state.vm_clock_seqlock,
431 &timers_state.vm_clock_lock);
432 cur_time = cpu_get_clock_locked();
433 cur_icount = cpu_get_icount_locked();
435 delta = cur_icount - cur_time;
436 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
438 && last_delta + ICOUNT_WOBBLE < delta * 2
439 && timers_state.icount_time_shift > 0) {
440 /* The guest is getting too far ahead. Slow time down. */
441 atomic_set(&timers_state.icount_time_shift,
442 timers_state.icount_time_shift - 1);
445 && last_delta - ICOUNT_WOBBLE > delta * 2
446 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
447 /* The guest is getting too far behind. Speed time up. */
448 atomic_set(&timers_state.icount_time_shift,
449 timers_state.icount_time_shift + 1);
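/* Recompute the bias against the (possibly updated) shift so that
 * cpu_get_icount_locked() stays continuous across a speed change instead
 * of jumping.
 */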
452 atomic_set__nocheck(&timers_state.qemu_icount_bias,
453 cur_icount - (timers_state.qemu_icount
454 << timers_state.icount_time_shift));
455 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
456 &timers_state.vm_clock_lock);
459 static void icount_adjust_rt(void *opaque)
461 timer_mod(timers_state.icount_rt_timer,
462 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
466 static void icount_adjust_vm(void *opaque)
468 timer_mod(timers_state.icount_vm_timer,
469 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
470 NANOSECONDS_PER_SECOND / 10);
474 static int64_t qemu_icount_round(int64_t count)
476 int shift = atomic_read(&timers_state.icount_time_shift);
477 return (count + (1 << shift) - 1) >> shift;
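/* qemu_icount_round() rounds up: with shift == 3, a 20 ns deadline becomes
 * (20 + 7) >> 3 == 3 instructions (24 ns), so the vCPU never stops short of
 * the next timer deadline.
 */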
480 static void icount_warp_rt(void)
485 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
486 * changes from -1 to another value, so the race here is okay.
489 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
490 warp_start = timers_state.vm_clock_warp_start;
491 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
493 if (warp_start == -1) {
497 seqlock_write_lock(&timers_state.vm_clock_seqlock,
498 &timers_state.vm_clock_lock);
499 if (runstate_is_running()) {
500 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
501 cpu_get_clock_locked());
504 warp_delta = clock - timers_state.vm_clock_warp_start;
505 if (use_icount == 2) {
507 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
508 * far ahead of real time.
510 int64_t cur_icount = cpu_get_icount_locked();
511 int64_t delta = clock - cur_icount;
512 warp_delta = MIN(warp_delta, delta);
514 atomic_set__nocheck(&timers_state.qemu_icount_bias,
515 timers_state.qemu_icount_bias + warp_delta);
517 timers_state.vm_clock_warp_start = -1;
518 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
519 &timers_state.vm_clock_lock);
521 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
522 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
526 static void icount_timer_cb(void *opaque)
528 /* No need for a checkpoint because the timer already synchronizes
529 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
534 void qtest_clock_warp(int64_t dest)
536 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
537 AioContext *aio_context;
538 assert(qtest_enabled());
539 aio_context = qemu_get_aio_context();
540 while (clock < dest) {
541 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
542 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
544 seqlock_write_lock(&timers_state.vm_clock_seqlock,
545 &timers_state.vm_clock_lock);
546 atomic_set__nocheck(&timers_state.qemu_icount_bias,
547 timers_state.qemu_icount_bias + warp);
548 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
549 &timers_state.vm_clock_lock);
551 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
552 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
553 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
555 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
558 void qemu_start_warp_timer(void)
567 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
568 * do not fire, so computing the deadline does not make sense.
570 if (!runstate_is_running()) {
574 /* warp clock deterministically in record/replay mode */
575 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
579 if (!all_cpu_threads_idle()) {
583 if (qtest_enabled()) {
584 /* When testing, qtest commands advance icount. */
588 /* We want to use the earliest deadline from ALL vm_clocks */
589 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
590 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
592 static bool notified;
593 if (!icount_sleep && !notified) {
594 warn_report("icount sleep disabled and no active timers");
602 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
603 * sleep. Otherwise, the CPU might be waiting for a future timer
604 * interrupt to wake it up, but the interrupt never comes because
605 * the vCPU isn't running any insns and thus doesn't advance the
606 * QEMU_CLOCK_VIRTUAL.
610 * We never let VCPUs sleep in no-sleep icount mode.
611 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
612 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
613 * It is useful when we want a deterministic execution time,
614 * isolated from host latencies.
616 seqlock_write_lock(&timers_state.vm_clock_seqlock,
617 &timers_state.vm_clock_lock);
618 atomic_set__nocheck(&timers_state.qemu_icount_bias,
619 timers_state.qemu_icount_bias + deadline);
620 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
621 &timers_state.vm_clock_lock);
622 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
625 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
626 * "real" time, (related to the time left until the next event) has
627 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
628 * This avoids that the warps are visible externally; for example,
629 * you will not be sending network packets continuously instead of
632 seqlock_write_lock(&timers_state.vm_clock_seqlock,
633 &timers_state.vm_clock_lock);
634 if (timers_state.vm_clock_warp_start == -1
635 || timers_state.vm_clock_warp_start > clock) {
636 timers_state.vm_clock_warp_start = clock;
638 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
639 &timers_state.vm_clock_lock);
640 timer_mod_anticipate(timers_state.icount_warp_timer,
643 } else if (deadline == 0) {
644 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
648 static void qemu_account_warp_timer(void)
650 if (!use_icount || !icount_sleep) {
654 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
655 * do not fire, so computing the deadline does not make sense.
657 if (!runstate_is_running()) {
661 /* warp clock deterministically in record/replay mode */
662 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
666 timer_del(timers_state.icount_warp_timer);
670 static bool icount_state_needed(void *opaque)
675 static bool warp_timer_state_needed(void *opaque)
677 TimersState *s = opaque;
678 return s->icount_warp_timer != NULL;
681 static bool adjust_timers_state_needed(void *opaque)
683 TimersState *s = opaque;
684 return s->icount_rt_timer != NULL;
688 * Subsection for warp timer migration is optional, because the timer may not be created
690 static const VMStateDescription icount_vmstate_warp_timer = {
691 .name = "timer/icount/warp_timer",
693 .minimum_version_id = 1,
694 .needed = warp_timer_state_needed,
695 .fields = (VMStateField[]) {
696 VMSTATE_INT64(vm_clock_warp_start, TimersState),
697 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
698 VMSTATE_END_OF_LIST()
702 static const VMStateDescription icount_vmstate_adjust_timers = {
703 .name = "timer/icount/timers",
705 .minimum_version_id = 1,
706 .needed = adjust_timers_state_needed,
707 .fields = (VMStateField[]) {
708 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
709 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
710 VMSTATE_END_OF_LIST()
715 * This is a subsection for icount migration.
717 static const VMStateDescription icount_vmstate_timers = {
718 .name = "timer/icount",
720 .minimum_version_id = 1,
721 .needed = icount_state_needed,
722 .fields = (VMStateField[]) {
723 VMSTATE_INT64(qemu_icount_bias, TimersState),
724 VMSTATE_INT64(qemu_icount, TimersState),
725 VMSTATE_END_OF_LIST()
727 .subsections = (const VMStateDescription*[]) {
728 &icount_vmstate_warp_timer,
729 &icount_vmstate_adjust_timers,
734 static const VMStateDescription vmstate_timers = {
737 .minimum_version_id = 1,
738 .fields = (VMStateField[]) {
739 VMSTATE_INT64(cpu_ticks_offset, TimersState),
741 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
742 VMSTATE_END_OF_LIST()
744 .subsections = (const VMStateDescription*[]) {
745 &icount_vmstate_timers,
750 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
753 double throttle_ratio;
756 if (!cpu_throttle_get_percentage()) {
760 pct = (double)cpu_throttle_get_percentage()/100;
761 throttle_ratio = pct / (1 - pct);
762 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
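    /* For example, at 50% throttle pct == 0.5 and throttle_ratio == 1.0, so the
     * sleep computed above is one full 10 ms timeslice per timeslice of
     * execution; at 99% it grows to roughly 990 ms per tick.
     */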
764 qemu_mutex_unlock_iothread();
765 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
766 qemu_mutex_lock_iothread();
767 atomic_set(&cpu->throttle_thread_scheduled, 0);
770 static void cpu_throttle_timer_tick(void *opaque)
775 /* Stop the timer if needed */
776 if (!cpu_throttle_get_percentage()) {
780 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
781 async_run_on_cpu(cpu, cpu_throttle_thread,
786 pct = (double)cpu_throttle_get_percentage()/100;
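    /* Re-arm so one run-timeslice plus the matching sleep fits in the period:
     * at 50% throttle the timer fires every 10 ms / 0.5 == 20 ms, i.e. roughly
     * 10 ms of execution and 10 ms of sleep per period.
     */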
787 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
788 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
791 void cpu_throttle_set(int new_throttle_pct)
793 /* Ensure throttle percentage is within valid range */
794 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
795 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
797 atomic_set(&throttle_percentage, new_throttle_pct);
799 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
800 CPU_THROTTLE_TIMESLICE_NS);
803 void cpu_throttle_stop(void)
805 atomic_set(&throttle_percentage, 0);
808 bool cpu_throttle_active(void)
810 return (cpu_throttle_get_percentage() != 0);
813 int cpu_throttle_get_percentage(void)
815 return atomic_read(&throttle_percentage);
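/* Illustrative use of the throttle helpers above (a hypothetical caller,
 * not code from this file):
 *
 *     cpu_throttle_set(30);          // vCPUs now sleep ~30% of each period
 *     ...
 *     if (cpu_throttle_active()) {
 *         cpu_throttle_stop();       // percentage back to 0, timer stops re-arming
 *     }
 */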
818 void cpu_ticks_init(void)
820 seqlock_init(&timers_state.vm_clock_seqlock);
821 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
822 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
823 cpu_throttle_timer_tick, NULL);
826 void configure_icount(QemuOpts *opts, Error **errp)
829 char *rem_str = NULL;
831 option = qemu_opt_get(opts, "shift");
833 if (qemu_opt_get(opts, "align") != NULL) {
834 error_setg(errp, "Please specify shift option when using align");
839 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
841 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
842 icount_timer_cb, NULL);
845 icount_align_option = qemu_opt_get_bool(opts, "align", false);
847 if (icount_align_option && !icount_sleep) {
848 error_setg(errp, "align=on and sleep=off are incompatible");
850 if (strcmp(option, "auto") != 0) {
852 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
853 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
854 error_setg(errp, "icount: Invalid shift value");
858 } else if (icount_align_option) {
859 error_setg(errp, "shift=auto and align=on are incompatible");
860 } else if (!icount_sleep) {
861 error_setg(errp, "shift=auto and sleep=off are incompatible");
866 /* 125 MIPS seems a reasonable initial guess at the guest speed.
867 It will be corrected fairly quickly anyway. */
868 timers_state.icount_time_shift = 3;
870 /* Have both realtime and virtual time triggers for speed adjustment.
871 The realtime trigger catches emulated time passing too slowly,
872 the virtual time trigger catches emulated time passing too fast.
873 Realtime triggers occur even when idle, so use them less frequently
875 timers_state.vm_clock_warp_start = -1;
876 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
877 icount_adjust_rt, NULL);
878 timer_mod(timers_state.icount_rt_timer,
879 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
880 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
881 icount_adjust_vm, NULL);
882 timer_mod(timers_state.icount_vm_timer,
883 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
884 NANOSECONDS_PER_SECOND / 10);
887 /***********************************************************/
888 /* TCG vCPU kick timer
890 * The kick timer is responsible for moving single threaded vCPU
891 * emulation on to the next vCPU. If more than one vCPU is running a
892 * timer event will force a cpu->exit so the next vCPU can get scheduled.
895 * The timer is removed if all vCPUs are idle and restarted again once
896 * idleness is complete.
899 static QEMUTimer *tcg_kick_vcpu_timer;
900 static CPUState *tcg_current_rr_cpu;
902 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
904 static inline int64_t qemu_tcg_next_kick(void)
906 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
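/* TCG_KICK_PERIOD is 100 ms of QEMU_CLOCK_VIRTUAL time, so in single-threaded
 * round-robin mode a vCPU runs for at most about 100 ms before the kick timer
 * moves execution on to the next vCPU.
 */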
909 /* Kick the currently round-robin scheduled vCPU */
910 static void qemu_cpu_kick_rr_cpu(void)
914 cpu = atomic_mb_read(&tcg_current_rr_cpu);
918 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
921 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
925 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
927 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
932 if (qemu_in_vcpu_thread()) {
933 /* A CPU is currently running; kick it back out to the
934 * tcg_cpu_exec() loop so it will recalculate its
935 * icount deadline immediately.
937 qemu_cpu_kick(current_cpu);
938 } else if (first_cpu) {
939 /* qemu_cpu_kick is not enough to kick a halted CPU out of
940 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
941 * causes cpu_thread_is_idle to return false. This way,
942 * handle_icount_deadline can run.
943 * If we have no CPUs at all for some reason, we don't
944 * need to do anything.
946 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
950 static void kick_tcg_thread(void *opaque)
952 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
953 qemu_cpu_kick_rr_cpu();
956 static void start_tcg_kick_timer(void)
958 assert(!mttcg_enabled);
959 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
960 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
961 kick_tcg_thread, NULL);
962 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
966 static void stop_tcg_kick_timer(void)
968 assert(!mttcg_enabled);
969 if (tcg_kick_vcpu_timer) {
970 timer_del(tcg_kick_vcpu_timer);
971 tcg_kick_vcpu_timer = NULL;
975 /***********************************************************/
976 void hw_error(const char *fmt, ...)
982 fprintf(stderr, "qemu: hardware error: ");
983 vfprintf(stderr, fmt, ap);
984 fprintf(stderr, "\n");
986 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
987 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
993 void cpu_synchronize_all_states(void)
998 cpu_synchronize_state(cpu);
999 /* TODO: move to cpu_synchronize_state() */
1000 if (hvf_enabled()) {
1001 hvf_cpu_synchronize_state(cpu);
1006 void cpu_synchronize_all_post_reset(void)
1011 cpu_synchronize_post_reset(cpu);
1012 /* TODO: move to cpu_synchronize_post_reset() */
1013 if (hvf_enabled()) {
1014 hvf_cpu_synchronize_post_reset(cpu);
1019 void cpu_synchronize_all_post_init(void)
1024 cpu_synchronize_post_init(cpu);
1025 /* TODO: move to cpu_synchronize_post_init() */
1026 if (hvf_enabled()) {
1027 hvf_cpu_synchronize_post_init(cpu);
1032 void cpu_synchronize_all_pre_loadvm(void)
1037 cpu_synchronize_pre_loadvm(cpu);
1041 static int do_vm_stop(RunState state, bool send_stop)
1045 if (runstate_is_running()) {
1046 cpu_disable_ticks();
1048 runstate_set(state);
1049 vm_state_notify(0, state);
1051 qapi_event_send_stop(&error_abort);
1056 replay_disable_events();
1057 ret = bdrv_flush_all();
1062 /* Special vm_stop() variant for terminating the process. Historically clients
1063 * did not expect a QMP STOP event and so we need to retain compatibility.
1065 int vm_shutdown(void)
1067 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1070 static bool cpu_can_run(CPUState *cpu)
1075 if (cpu_is_stopped(cpu)) {
1081 static void cpu_handle_guest_debug(CPUState *cpu)
1083 gdb_set_stop_cpu(cpu);
1084 qemu_system_debug_request();
1085 cpu->stopped = true;
1089 static void sigbus_reraise(void)
1092 struct sigaction action;
1094 memset(&action, 0, sizeof(action));
1095 action.sa_handler = SIG_DFL;
1096 if (!sigaction(SIGBUS, &action, NULL)) {
1099 sigaddset(&set, SIGBUS);
1100 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1102 perror("Failed to re-raise SIGBUS!\n");
1106 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1108 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1113 /* Called asynchronously in VCPU thread. */
1114 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1118 /* Called synchronously (via signalfd) in main thread. */
1119 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1125 static void qemu_init_sigbus(void)
1127 struct sigaction action;
1129 memset(&action, 0, sizeof(action));
1130 action.sa_flags = SA_SIGINFO;
1131 action.sa_sigaction = sigbus_handler;
1132 sigaction(SIGBUS, &action, NULL);
1134 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1136 #else /* !CONFIG_LINUX */
1137 static void qemu_init_sigbus(void)
1140 #endif /* !CONFIG_LINUX */
1142 static QemuMutex qemu_global_mutex;
1144 static QemuThread io_thread;
1147 static QemuCond qemu_cpu_cond;
1149 static QemuCond qemu_pause_cond;
1151 void qemu_init_cpu_loop(void)
1154 qemu_cond_init(&qemu_cpu_cond);
1155 qemu_cond_init(&qemu_pause_cond);
1156 qemu_mutex_init(&qemu_global_mutex);
1158 qemu_thread_get_self(&io_thread);
1161 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1163 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1166 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1168 if (kvm_destroy_vcpu(cpu) < 0) {
1169 error_report("kvm_destroy_vcpu failed");
1174 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1178 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1180 g_assert(qemu_cpu_is_self(cpu));
1182 cpu->stopped = true;
1186 qemu_cond_broadcast(&qemu_pause_cond);
1189 static void qemu_wait_io_event_common(CPUState *cpu)
1191 atomic_mb_set(&cpu->thread_kicked, false);
1193 qemu_cpu_stop(cpu, false);
1195 process_queued_cpu_work(cpu);
1198 static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
1200 while (all_cpu_threads_idle()) {
1201 stop_tcg_kick_timer();
1202 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1205 start_tcg_kick_timer();
1207 qemu_wait_io_event_common(cpu);
1210 static void qemu_wait_io_event(CPUState *cpu)
1212 while (cpu_thread_is_idle(cpu)) {
1213 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1217 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1218 if (!tcg_enabled()) {
1222 qemu_wait_io_event_common(cpu);
1225 static void *qemu_kvm_cpu_thread_fn(void *arg)
1227 CPUState *cpu = arg;
1230 rcu_register_thread();
1232 qemu_mutex_lock_iothread();
1233 qemu_thread_get_self(cpu->thread);
1234 cpu->thread_id = qemu_get_thread_id();
1238 r = kvm_init_vcpu(cpu);
1240 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1244 kvm_init_cpu_signals(cpu);
1246 /* signal CPU creation */
1247 cpu->created = true;
1248 qemu_cond_signal(&qemu_cpu_cond);
1251 if (cpu_can_run(cpu)) {
1252 r = kvm_cpu_exec(cpu);
1253 if (r == EXCP_DEBUG) {
1254 cpu_handle_guest_debug(cpu);
1257 qemu_wait_io_event(cpu);
1258 } while (!cpu->unplug || cpu_can_run(cpu));
1260 qemu_kvm_destroy_vcpu(cpu);
1261 cpu->created = false;
1262 qemu_cond_signal(&qemu_cpu_cond);
1263 qemu_mutex_unlock_iothread();
1264 rcu_unregister_thread();
1268 static void *qemu_dummy_cpu_thread_fn(void *arg)
1271 error_report("qtest is not supported under Windows");
1274 CPUState *cpu = arg;
1278 rcu_register_thread();
1280 qemu_mutex_lock_iothread();
1281 qemu_thread_get_self(cpu->thread);
1282 cpu->thread_id = qemu_get_thread_id();
1286 sigemptyset(&waitset);
1287 sigaddset(&waitset, SIG_IPI);
1289 /* signal CPU creation */
1290 cpu->created = true;
1291 qemu_cond_signal(&qemu_cpu_cond);
1294 qemu_mutex_unlock_iothread();
1297 r = sigwait(&waitset, &sig);
1298 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1303 qemu_mutex_lock_iothread();
1304 qemu_wait_io_event(cpu);
1305 } while (!cpu->unplug);
1307 rcu_unregister_thread();
1312 static int64_t tcg_get_icount_limit(void)
1316 if (replay_mode != REPLAY_MODE_PLAY) {
1317 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1319 /* Maintain prior (possibly buggy) behaviour where if no deadline
1320 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1321 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1324 if ((deadline < 0) || (deadline > INT32_MAX)) {
1325 deadline = INT32_MAX;
1328 return qemu_icount_round(deadline);
1330 return replay_get_instructions();
1334 static void handle_icount_deadline(void)
1336 assert(qemu_in_vcpu_thread());
1339 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1341 if (deadline == 0) {
1342 /* Wake up other AioContexts. */
1343 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1344 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1349 static void prepare_icount_for_run(CPUState *cpu)
1354 /* These should always be cleared by process_icount_data after
1355 * each vCPU execution. However u16.high can be raised
1356 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1358 g_assert(cpu->icount_decr.u16.low == 0);
1359 g_assert(cpu->icount_extra == 0);
1361 cpu->icount_budget = tcg_get_icount_limit();
1362 insns_left = MIN(0xffff, cpu->icount_budget);
1363 cpu->icount_decr.u16.low = insns_left;
1364 cpu->icount_extra = cpu->icount_budget - insns_left;
1366 replay_mutex_lock();
1370 static void process_icount_data(CPUState *cpu)
1373 /* Account for executed instructions */
1374 cpu_update_icount(cpu);
1376 /* Reset the counters */
1377 cpu->icount_decr.u16.low = 0;
1378 cpu->icount_extra = 0;
1379 cpu->icount_budget = 0;
1381 replay_account_executed_instructions();
1383 replay_mutex_unlock();
1388 static int tcg_cpu_exec(CPUState *cpu)
1391 #ifdef CONFIG_PROFILER
1395 assert(tcg_enabled());
1396 #ifdef CONFIG_PROFILER
1397 ti = profile_getclock();
1399 cpu_exec_start(cpu);
1400 ret = cpu_exec(cpu);
1402 #ifdef CONFIG_PROFILER
1403 tcg_time += profile_getclock() - ti;
1408 /* Destroy any remaining vCPUs which have been unplugged and have
1411 static void deal_with_unplugged_cpus(void)
1416 if (cpu->unplug && !cpu_can_run(cpu)) {
1417 qemu_tcg_destroy_vcpu(cpu);
1418 cpu->created = false;
1419 qemu_cond_signal(&qemu_cpu_cond);
1425 /* Single-threaded TCG
1427 * In the single-threaded case each vCPU is simulated in turn. If
1428 * there is more than a single vCPU we create a simple timer to kick
1429 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1430 * This is done explicitly rather than relying on side-effects
1434 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1436 CPUState *cpu = arg;
1438 assert(tcg_enabled());
1439 rcu_register_thread();
1440 tcg_register_thread();
1442 qemu_mutex_lock_iothread();
1443 qemu_thread_get_self(cpu->thread);
1445 cpu->thread_id = qemu_get_thread_id();
1446 cpu->created = true;
1448 qemu_cond_signal(&qemu_cpu_cond);
1450 /* wait for initial kick-off after machine start */
1451 while (first_cpu->stopped) {
1452 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1454 /* process any pending work */
1457 qemu_wait_io_event_common(cpu);
1461 start_tcg_kick_timer();
1465 /* process any pending work */
1466 cpu->exit_request = 1;
1469 qemu_mutex_unlock_iothread();
1470 replay_mutex_lock();
1471 qemu_mutex_lock_iothread();
1472 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1473 qemu_account_warp_timer();
1475 /* Run the timers here. This is much more efficient than
1476 * waking up the I/O thread and waiting for completion.
1478 handle_icount_deadline();
1480 replay_mutex_unlock();
1486 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1488 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1491 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1492 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1494 if (cpu_can_run(cpu)) {
1497 qemu_mutex_unlock_iothread();
1498 prepare_icount_for_run(cpu);
1500 r = tcg_cpu_exec(cpu);
1502 process_icount_data(cpu);
1503 qemu_mutex_lock_iothread();
1505 if (r == EXCP_DEBUG) {
1506 cpu_handle_guest_debug(cpu);
1508 } else if (r == EXCP_ATOMIC) {
1509 qemu_mutex_unlock_iothread();
1510 cpu_exec_step_atomic(cpu);
1511 qemu_mutex_lock_iothread();
1514 } else if (cpu->stop) {
1516 cpu = CPU_NEXT(cpu);
1521 cpu = CPU_NEXT(cpu);
1522 } /* while (cpu && !cpu->exit_request).. */
1524 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1525 atomic_set(&tcg_current_rr_cpu, NULL);
1527 if (cpu && cpu->exit_request) {
1528 atomic_mb_set(&cpu->exit_request, 0);
1531 qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
1532 deal_with_unplugged_cpus();
1535 rcu_unregister_thread();
1539 static void *qemu_hax_cpu_thread_fn(void *arg)
1541 CPUState *cpu = arg;
1544 rcu_register_thread();
1545 qemu_mutex_lock_iothread();
1546 qemu_thread_get_self(cpu->thread);
1548 cpu->thread_id = qemu_get_thread_id();
1549 cpu->created = true;
1554 qemu_cond_signal(&qemu_cpu_cond);
1557 if (cpu_can_run(cpu)) {
1558 r = hax_smp_cpu_exec(cpu);
1559 if (r == EXCP_DEBUG) {
1560 cpu_handle_guest_debug(cpu);
1564 qemu_wait_io_event(cpu);
1565 } while (!cpu->unplug || cpu_can_run(cpu));
1566 rcu_unregister_thread();
1570 /* The HVF-specific vCPU thread function. This one should only run when the host
1571 * CPU supports the VMX "unrestricted guest" feature. */
1572 static void *qemu_hvf_cpu_thread_fn(void *arg)
1574 CPUState *cpu = arg;
1578 assert(hvf_enabled());
1580 rcu_register_thread();
1582 qemu_mutex_lock_iothread();
1583 qemu_thread_get_self(cpu->thread);
1585 cpu->thread_id = qemu_get_thread_id();
1591 /* signal CPU creation */
1592 cpu->created = true;
1593 qemu_cond_signal(&qemu_cpu_cond);
1596 if (cpu_can_run(cpu)) {
1597 r = hvf_vcpu_exec(cpu);
1598 if (r == EXCP_DEBUG) {
1599 cpu_handle_guest_debug(cpu);
1602 qemu_wait_io_event(cpu);
1603 } while (!cpu->unplug || cpu_can_run(cpu));
1605 hvf_vcpu_destroy(cpu);
1606 cpu->created = false;
1607 qemu_cond_signal(&qemu_cpu_cond);
1608 qemu_mutex_unlock_iothread();
1609 rcu_unregister_thread();
1613 static void *qemu_whpx_cpu_thread_fn(void *arg)
1615 CPUState *cpu = arg;
1618 rcu_register_thread();
1620 qemu_mutex_lock_iothread();
1621 qemu_thread_get_self(cpu->thread);
1622 cpu->thread_id = qemu_get_thread_id();
1625 r = whpx_init_vcpu(cpu);
1627 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1631 /* signal CPU creation */
1632 cpu->created = true;
1633 qemu_cond_signal(&qemu_cpu_cond);
1636 if (cpu_can_run(cpu)) {
1637 r = whpx_vcpu_exec(cpu);
1638 if (r == EXCP_DEBUG) {
1639 cpu_handle_guest_debug(cpu);
1642 while (cpu_thread_is_idle(cpu)) {
1643 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1645 qemu_wait_io_event_common(cpu);
1646 } while (!cpu->unplug || cpu_can_run(cpu));
1648 whpx_destroy_vcpu(cpu);
1649 cpu->created = false;
1650 qemu_cond_signal(&qemu_cpu_cond);
1651 qemu_mutex_unlock_iothread();
1652 rcu_unregister_thread();
1657 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1662 /* Multi-threaded TCG
1664 * In the multi-threaded case each vCPU has its own thread. The TLS
1665 * variable current_cpu can be used deep in the code to find the
1666 * current CPUState for a given thread.
1669 static void *qemu_tcg_cpu_thread_fn(void *arg)
1671 CPUState *cpu = arg;
1673 assert(tcg_enabled());
1674 g_assert(!use_icount);
1676 rcu_register_thread();
1677 tcg_register_thread();
1679 qemu_mutex_lock_iothread();
1680 qemu_thread_get_self(cpu->thread);
1682 cpu->thread_id = qemu_get_thread_id();
1683 cpu->created = true;
1686 qemu_cond_signal(&qemu_cpu_cond);
1688 /* process any pending work */
1689 cpu->exit_request = 1;
1692 if (cpu_can_run(cpu)) {
1694 qemu_mutex_unlock_iothread();
1695 r = tcg_cpu_exec(cpu);
1696 qemu_mutex_lock_iothread();
1699 cpu_handle_guest_debug(cpu);
1702 /* during start-up the vCPU is reset and the thread is
1703 * kicked several times. If we don't ensure we go back
1704 * to sleep in the halted state we won't cleanly
1705 * start up when the vCPU is enabled.
1707 * cpu->halted should ensure we sleep in wait_io_event
1709 g_assert(cpu->halted);
1712 qemu_mutex_unlock_iothread();
1713 cpu_exec_step_atomic(cpu);
1714 qemu_mutex_lock_iothread();
1716 /* Ignore everything else? */
1721 atomic_mb_set(&cpu->exit_request, 0);
1722 qemu_wait_io_event(cpu);
1723 } while (!cpu->unplug || cpu_can_run(cpu));
1725 qemu_tcg_destroy_vcpu(cpu);
1726 cpu->created = false;
1727 qemu_cond_signal(&qemu_cpu_cond);
1728 qemu_mutex_unlock_iothread();
1729 rcu_unregister_thread();
1733 static void qemu_cpu_kick_thread(CPUState *cpu)
1738 if (cpu->thread_kicked) {
1741 cpu->thread_kicked = true;
1742 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1744 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1748 if (!qemu_cpu_is_self(cpu)) {
1749 if (whpx_enabled()) {
1750 whpx_vcpu_kick(cpu);
1751 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1752 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1753 __func__, GetLastError());
1760 void qemu_cpu_kick(CPUState *cpu)
1762 qemu_cond_broadcast(cpu->halt_cond);
1763 if (tcg_enabled()) {
1765 /* NOP unless doing single-thread RR */
1766 qemu_cpu_kick_rr_cpu();
1768 if (hax_enabled()) {
1770 * FIXME: race condition with the exit_request check in
1773 cpu->exit_request = 1;
1775 qemu_cpu_kick_thread(cpu);
1779 void qemu_cpu_kick_self(void)
1781 assert(current_cpu);
1782 qemu_cpu_kick_thread(current_cpu);
1785 bool qemu_cpu_is_self(CPUState *cpu)
1787 return qemu_thread_is_self(cpu->thread);
1790 bool qemu_in_vcpu_thread(void)
1792 return current_cpu && qemu_cpu_is_self(current_cpu);
1795 static __thread bool iothread_locked = false;
1797 bool qemu_mutex_iothread_locked(void)
1799 return iothread_locked;
1803 * The BQL is taken from so many places that it is worth profiling the
1804 * callers directly, instead of funneling them all through a single function.
1806 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1808 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1810 g_assert(!qemu_mutex_iothread_locked());
1811 bql_lock(&qemu_global_mutex, file, line);
1812 iothread_locked = true;
1815 void qemu_mutex_unlock_iothread(void)
1817 g_assert(qemu_mutex_iothread_locked());
1818 iothread_locked = false;
1819 qemu_mutex_unlock(&qemu_global_mutex);
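/* Typical locking pattern (a sketch mirroring the vCPU loops below): drop the
 * BQL around long-running work so the main loop keeps making progress, then
 * retake it before touching shared state:
 *
 *     qemu_mutex_unlock_iothread();
 *     r = tcg_cpu_exec(cpu);      // or any other slow work that needs no BQL
 *     qemu_mutex_lock_iothread();
 */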
1822 static bool all_vcpus_paused(void)
1827 if (!cpu->stopped) {
1835 void pause_all_vcpus(void)
1839 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1841 if (qemu_cpu_is_self(cpu)) {
1842 qemu_cpu_stop(cpu, true);
1849 /* We need to drop the replay_lock so any vCPU threads woken up
1850 * can finish their replay tasks
1852 replay_mutex_unlock();
1854 while (!all_vcpus_paused()) {
1855 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1861 qemu_mutex_unlock_iothread();
1862 replay_mutex_lock();
1863 qemu_mutex_lock_iothread();
1866 void cpu_resume(CPUState *cpu)
1869 cpu->stopped = false;
1873 void resume_all_vcpus(void)
1877 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1883 void cpu_remove_sync(CPUState *cpu)
1888 qemu_mutex_unlock_iothread();
1889 qemu_thread_join(cpu->thread);
1890 qemu_mutex_lock_iothread();
1893 /* Size of temporary buffers used for forming a vCPU thread name */
1894 #define VCPU_THREAD_NAME_SIZE 16
1896 static void qemu_tcg_init_vcpu(CPUState *cpu)
1898 char thread_name[VCPU_THREAD_NAME_SIZE];
1899 static QemuCond *single_tcg_halt_cond;
1900 static QemuThread *single_tcg_cpu_thread;
1901 static int tcg_region_inited;
1903 assert(tcg_enabled());
1905 * Initialize TCG regions--once. Now is a good time, because:
1906 * (1) TCG's init context, prologue and target globals have been set up.
1907 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1908 * -accel flag is processed, so the check doesn't work then).
1910 if (!tcg_region_inited) {
1911 tcg_region_inited = 1;
1915 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1916 cpu->thread = g_malloc0(sizeof(QemuThread));
1917 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1918 qemu_cond_init(cpu->halt_cond);
1920 if (qemu_tcg_mttcg_enabled()) {
1921 /* create a thread per vCPU with TCG (MTTCG) */
1922 parallel_cpus = true;
1923 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1926 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1927 cpu, QEMU_THREAD_JOINABLE);
1930 /* share a single thread for all cpus with TCG */
1931 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1932 qemu_thread_create(cpu->thread, thread_name,
1933 qemu_tcg_rr_cpu_thread_fn,
1934 cpu, QEMU_THREAD_JOINABLE);
1936 single_tcg_halt_cond = cpu->halt_cond;
1937 single_tcg_cpu_thread = cpu->thread;
1940 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1943 /* For non-MTTCG cases we share the thread */
1944 cpu->thread = single_tcg_cpu_thread;
1945 cpu->halt_cond = single_tcg_halt_cond;
1946 cpu->thread_id = first_cpu->thread_id;
1948 cpu->created = true;
1952 static void qemu_hax_start_vcpu(CPUState *cpu)
1954 char thread_name[VCPU_THREAD_NAME_SIZE];
1956 cpu->thread = g_malloc0(sizeof(QemuThread));
1957 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1958 qemu_cond_init(cpu->halt_cond);
1960 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1962 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1963 cpu, QEMU_THREAD_JOINABLE);
1965 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1969 static void qemu_kvm_start_vcpu(CPUState *cpu)
1971 char thread_name[VCPU_THREAD_NAME_SIZE];
1973 cpu->thread = g_malloc0(sizeof(QemuThread));
1974 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1975 qemu_cond_init(cpu->halt_cond);
1976 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1978 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1979 cpu, QEMU_THREAD_JOINABLE);
1982 static void qemu_hvf_start_vcpu(CPUState *cpu)
1984 char thread_name[VCPU_THREAD_NAME_SIZE];
1986 /* HVF currently does not support TCG, and only runs in
1987 * unrestricted-guest mode. */
1988 assert(hvf_enabled());
1990 cpu->thread = g_malloc0(sizeof(QemuThread));
1991 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1992 qemu_cond_init(cpu->halt_cond);
1994 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
1996 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
1997 cpu, QEMU_THREAD_JOINABLE);
2000 static void qemu_whpx_start_vcpu(CPUState *cpu)
2002 char thread_name[VCPU_THREAD_NAME_SIZE];
2004 cpu->thread = g_malloc0(sizeof(QemuThread));
2005 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2006 qemu_cond_init(cpu->halt_cond);
2007 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2009 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2010 cpu, QEMU_THREAD_JOINABLE);
2012 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2016 static void qemu_dummy_start_vcpu(CPUState *cpu)
2018 char thread_name[VCPU_THREAD_NAME_SIZE];
2020 cpu->thread = g_malloc0(sizeof(QemuThread));
2021 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2022 qemu_cond_init(cpu->halt_cond);
2023 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2025 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2026 QEMU_THREAD_JOINABLE);
2029 void qemu_init_vcpu(CPUState *cpu)
2031 cpu->nr_cores = smp_cores;
2032 cpu->nr_threads = smp_threads;
2033 cpu->stopped = true;
2036 /* If the target cpu hasn't set up any address spaces itself,
2037 * give it the default one.
2040 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2043 if (kvm_enabled()) {
2044 qemu_kvm_start_vcpu(cpu);
2045 } else if (hax_enabled()) {
2046 qemu_hax_start_vcpu(cpu);
2047 } else if (hvf_enabled()) {
2048 qemu_hvf_start_vcpu(cpu);
2049 } else if (tcg_enabled()) {
2050 qemu_tcg_init_vcpu(cpu);
2051 } else if (whpx_enabled()) {
2052 qemu_whpx_start_vcpu(cpu);
2054 qemu_dummy_start_vcpu(cpu);
2057 while (!cpu->created) {
2058 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2062 void cpu_stop_current(void)
2065 qemu_cpu_stop(current_cpu, true);
2069 int vm_stop(RunState state)
2071 if (qemu_in_vcpu_thread()) {
2072 qemu_system_vmstop_request_prepare();
2073 qemu_system_vmstop_request(state);
2075 * FIXME: should not return to device code in case
2076 * vm_stop() has been requested.
2082 return do_vm_stop(state, true);
2086 * Prepare for (re)starting the VM.
2087 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2088 * running or in case of an error condition), 0 otherwise.
2090 int vm_prepare_start(void)
2094 qemu_vmstop_requested(&requested);
2095 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2099 /* Ensure that a STOP/RESUME pair of events is emitted if a
2100 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2101 * example, according to the documentation is always followed by
2104 if (runstate_is_running()) {
2105 qapi_event_send_stop(&error_abort);
2106 qapi_event_send_resume(&error_abort);
2110 /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2111 qapi_event_send_resume(&error_abort);
2113 replay_enable_events();
2115 runstate_set(RUN_STATE_RUNNING);
2116 vm_state_notify(1, RUN_STATE_RUNNING);
2122 if (!vm_prepare_start()) {
2127 /* does a state transition even if the VM is already stopped;
2128 the current state is forgotten forever */
2129 int vm_stop_force_state(RunState state)
2131 if (runstate_is_running()) {
2132 return vm_stop(state);
2134 runstate_set(state);
2137 /* Make sure to return an error if the flush in a previous vm_stop()
2139 return bdrv_flush_all();
2143 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2145 /* XXX: implement xxx_cpu_list for targets that still miss it */
2146 #if defined(cpu_list)
2147 cpu_list(f, cpu_fprintf);
2151 CpuInfoList *qmp_query_cpus(Error **errp)
2153 MachineState *ms = MACHINE(qdev_get_machine());
2154 MachineClass *mc = MACHINE_GET_CLASS(ms);
2155 CpuInfoList *head = NULL, *cur_item = NULL;
2160 #if defined(TARGET_I386)
2161 X86CPU *x86_cpu = X86_CPU(cpu);
2162 CPUX86State *env = &x86_cpu->env;
2163 #elif defined(TARGET_PPC)
2164 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2165 CPUPPCState *env = &ppc_cpu->env;
2166 #elif defined(TARGET_SPARC)
2167 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2168 CPUSPARCState *env = &sparc_cpu->env;
2169 #elif defined(TARGET_RISCV)
2170 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2171 CPURISCVState *env = &riscv_cpu->env;
2172 #elif defined(TARGET_MIPS)
2173 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2174 CPUMIPSState *env = &mips_cpu->env;
2175 #elif defined(TARGET_TRICORE)
2176 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2177 CPUTriCoreState *env = &tricore_cpu->env;
2178 #elif defined(TARGET_S390X)
2179 S390CPU *s390_cpu = S390_CPU(cpu);
2180 CPUS390XState *env = &s390_cpu->env;
2183 cpu_synchronize_state(cpu);
2185 info = g_malloc0(sizeof(*info));
2186 info->value = g_malloc0(sizeof(*info->value));
2187 info->value->CPU = cpu->cpu_index;
2188 info->value->current = (cpu == first_cpu);
2189 info->value->halted = cpu->halted;
2190 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2191 info->value->thread_id = cpu->thread_id;
2192 #if defined(TARGET_I386)
2193 info->value->arch = CPU_INFO_ARCH_X86;
2194 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2195 #elif defined(TARGET_PPC)
2196 info->value->arch = CPU_INFO_ARCH_PPC;
2197 info->value->u.ppc.nip = env->nip;
2198 #elif defined(TARGET_SPARC)
2199 info->value->arch = CPU_INFO_ARCH_SPARC;
2200 info->value->u.q_sparc.pc = env->pc;
2201 info->value->u.q_sparc.npc = env->npc;
2202 #elif defined(TARGET_MIPS)
2203 info->value->arch = CPU_INFO_ARCH_MIPS;
2204 info->value->u.q_mips.PC = env->active_tc.PC;
2205 #elif defined(TARGET_TRICORE)
2206 info->value->arch = CPU_INFO_ARCH_TRICORE;
2207 info->value->u.tricore.PC = env->PC;
2208 #elif defined(TARGET_S390X)
2209 info->value->arch = CPU_INFO_ARCH_S390;
2210 info->value->u.s390.cpu_state = env->cpu_state;
2211 #elif defined(TARGET_RISCV)
2212 info->value->arch = CPU_INFO_ARCH_RISCV;
2213 info->value->u.riscv.pc = env->pc;
2215 info->value->arch = CPU_INFO_ARCH_OTHER;
2217 info->value->has_props = !!mc->cpu_index_to_instance_props;
2218 if (info->value->has_props) {
2219 CpuInstanceProperties *props;
2220 props = g_malloc0(sizeof(*props));
2221 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2222 info->value->props = props;
2225 /* XXX: waiting for the qapi to support GSList */
2227 head = cur_item = info;
2229 cur_item->next = info;
2237 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2240 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2241 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2244 case SYS_EMU_TARGET_I386:
2245 case SYS_EMU_TARGET_X86_64:
2246 return CPU_INFO_ARCH_X86;
2248 case SYS_EMU_TARGET_PPC:
2249 case SYS_EMU_TARGET_PPCEMB:
2250 case SYS_EMU_TARGET_PPC64:
2251 return CPU_INFO_ARCH_PPC;
2253 case SYS_EMU_TARGET_SPARC:
2254 case SYS_EMU_TARGET_SPARC64:
2255 return CPU_INFO_ARCH_SPARC;
2257 case SYS_EMU_TARGET_MIPS:
2258 case SYS_EMU_TARGET_MIPSEL:
2259 case SYS_EMU_TARGET_MIPS64:
2260 case SYS_EMU_TARGET_MIPS64EL:
2261 return CPU_INFO_ARCH_MIPS;
2263 case SYS_EMU_TARGET_TRICORE:
2264 return CPU_INFO_ARCH_TRICORE;
2266 case SYS_EMU_TARGET_S390X:
2267 return CPU_INFO_ARCH_S390;
2269 case SYS_EMU_TARGET_RISCV32:
2270 case SYS_EMU_TARGET_RISCV64:
2271 return CPU_INFO_ARCH_RISCV;
2274 return CPU_INFO_ARCH_OTHER;
2278 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2281 S390CPU *s390_cpu = S390_CPU(cpu);
2282 CPUS390XState *env = &s390_cpu->env;
2284 info->cpu_state = env->cpu_state;
2291 * fast means: we NEVER interrupt vCPU threads to retrieve
2292 * information from KVM.
2294 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2296 MachineState *ms = MACHINE(qdev_get_machine());
2297 MachineClass *mc = MACHINE_GET_CLASS(ms);
2298 CpuInfoFastList *head = NULL, *cur_item = NULL;
2299 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2304 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2305 info->value = g_malloc0(sizeof(*info->value));
2307 info->value->cpu_index = cpu->cpu_index;
2308 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2309 info->value->thread_id = cpu->thread_id;
2311 info->value->has_props = !!mc->cpu_index_to_instance_props;
2312 if (info->value->has_props) {
2313 CpuInstanceProperties *props;
2314 props = g_malloc0(sizeof(*props));
2315 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2316 info->value->props = props;
2319 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2320 info->value->target = target;
2321 if (target == SYS_EMU_TARGET_S390X) {
2322 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2326 head = cur_item = info;
2328 cur_item->next = info;
2336 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2337 bool has_cpu, int64_t cpu_index, Error **errp)
2343 int64_t orig_addr = addr, orig_size = size;
2349 cpu = qemu_get_cpu(cpu_index);
2351 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2356 f = fopen(filename, "wb");
2358 error_setg_file_open(errp, errno, filename);
2366 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2367 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2368 " specified", orig_addr, orig_size);
2371 if (fwrite(buf, 1, l, f) != l) {
2372 error_setg(errp, QERR_IO_ERROR);
2383 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2390 f = fopen(filename, "wb");
2392 error_setg_file_open(errp, errno, filename);
2400 cpu_physical_memory_read(addr, buf, l);
2401 if (fwrite(buf, 1, l, f) != l) {
2402 error_setg(errp, QERR_IO_ERROR);
2413 void qmp_inject_nmi(Error **errp)
2415 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2418 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2424 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2425 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2426 if (icount_align_option) {
2427 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2428 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2430 cpu_fprintf(f, "Max guest delay NA\n");
2431 cpu_fprintf(f, "Max guest advance NA\n");