qemu.git / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "qemu/qemu-print.h"
35 #include "sysemu/sysemu.h"
36 #include "sysemu/block-backend.h"
37 #include "exec/gdbstub.h"
38 #include "sysemu/dma.h"
39 #include "sysemu/hw_accel.h"
40 #include "sysemu/kvm.h"
41 #include "sysemu/hax.h"
42 #include "sysemu/hvf.h"
43 #include "sysemu/whpx.h"
44 #include "exec/exec-all.h"
45
46 #include "qemu/thread.h"
47 #include "sysemu/cpus.h"
48 #include "sysemu/qtest.h"
49 #include "qemu/main-loop.h"
50 #include "qemu/option.h"
51 #include "qemu/bitmap.h"
52 #include "qemu/seqlock.h"
53 #include "tcg.h"
54 #include "hw/nmi.h"
55 #include "sysemu/replay.h"
56 #include "hw/boards.h"
57
58 #ifdef CONFIG_LINUX
59
60 #include <sys/prctl.h>
61
62 #ifndef PR_MCE_KILL
63 #define PR_MCE_KILL 33
64 #endif
65
66 #ifndef PR_MCE_KILL_SET
67 #define PR_MCE_KILL_SET 1
68 #endif
69
70 #ifndef PR_MCE_KILL_EARLY
71 #define PR_MCE_KILL_EARLY 1
72 #endif
73
74 #endif /* CONFIG_LINUX */
75
76 int64_t max_delay;
77 int64_t max_advance;
78
79 /* vcpu throttling controls */
80 static QEMUTimer *throttle_timer;
81 static unsigned int throttle_percentage;
82
83 #define CPU_THROTTLE_PCT_MIN 1
84 #define CPU_THROTTLE_PCT_MAX 99
85 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86
87 bool cpu_is_stopped(CPUState *cpu)
88 {
89     return cpu->stopped || !runstate_is_running();
90 }
91
92 static bool cpu_thread_is_idle(CPUState *cpu)
93 {
94     if (cpu->stop || cpu->queued_work_first) {
95         return false;
96     }
97     if (cpu_is_stopped(cpu)) {
98         return true;
99     }
100     if (!cpu->halted || cpu_has_work(cpu) ||
101         kvm_halt_in_kernel()) {
102         return false;
103     }
104     return true;
105 }
106
107 static bool all_cpu_threads_idle(void)
108 {
109     CPUState *cpu;
110
111     CPU_FOREACH(cpu) {
112         if (!cpu_thread_is_idle(cpu)) {
113             return false;
114         }
115     }
116     return true;
117 }
118
119 /***********************************************************/
120 /* guest cycle counter */
121
122 /* Protected by TimersState seqlock */
123
124 static bool icount_sleep = true;
125 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
126 #define MAX_ICOUNT_SHIFT 10
127
128 typedef struct TimersState {
129     /* Protected by BQL.  */
130     int64_t cpu_ticks_prev;
131     int64_t cpu_ticks_offset;
132
133     /* Protect fields that can be read outside the BQL and that are
134      * written from multiple threads.
135      */
136     QemuSeqLock vm_clock_seqlock;
137     QemuSpin vm_clock_lock;
138
139     int16_t cpu_ticks_enabled;
140
141     /* Conversion factor from emulated instructions to virtual clock ticks.  */
142     int16_t icount_time_shift;
143
144     /* Compensate for varying guest execution speed.  */
145     int64_t qemu_icount_bias;
146
147     int64_t vm_clock_warp_start;
148     int64_t cpu_clock_offset;
149
150     /* Only written by TCG thread */
151     int64_t qemu_icount;
152
153     /* for adjusting icount */
154     QEMUTimer *icount_rt_timer;
155     QEMUTimer *icount_vm_timer;
156     QEMUTimer *icount_warp_timer;
157 } TimersState;
158
159 static TimersState timers_state;
160 bool mttcg_enabled;
161
162 /*
163  * We default to false if we know other options have been enabled
164  * which are currently incompatible with MTTCG. Otherwise when each
165  * guest (target) has been updated to support:
166  *   - atomic instructions
167  *   - memory ordering primitives (barriers)
168  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
169  *
170  * Once a guest architecture has been converted to the new primitives
171  * there are two remaining limitations to check.
172  *
173  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
174  * - The host must have a stronger memory order than the guest
175  *
176  * It may be possible in future to support strong guests on weak hosts
177  * but that will require tagging all load/stores in a guest with their
178  * implicit memory order requirements which would likely slow things
179  * down a lot.
180  */
181
182 static bool check_tcg_memory_orders_compatible(void)
183 {
184 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
185     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
186 #else
187     return false;
188 #endif
189 }
190
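/*
 * Illustration of the check above: TCG_GUEST_DEFAULT_MO and
 * TCG_TARGET_DEFAULT_MO are bitmasks of TCG_MO_* ordering requirements.
 * (guest & ~host) == 0 means every ordering guarantee the guest relies on
 * is also provided by the host backend.  A weakly-ordered guest (mask 0)
 * is therefore compatible with any host, while a strongly-ordered guest
 * on a weakly-ordered host leaves bits set and the check fails, so MTTCG
 * stays disabled by default.
 */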
191 static bool default_mttcg_enabled(void)
192 {
193     if (use_icount || TCG_OVERSIZED_GUEST) {
194         return false;
195     } else {
196 #ifdef TARGET_SUPPORTS_MTTCG
197         return check_tcg_memory_orders_compatible();
198 #else
199         return false;
200 #endif
201     }
202 }
203
204 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
205 {
206     const char *t = qemu_opt_get(opts, "thread");
207     if (t) {
208         if (strcmp(t, "multi") == 0) {
209             if (TCG_OVERSIZED_GUEST) {
210                 error_setg(errp, "No MTTCG when guest word size > host's");
211             } else if (use_icount) {
212                 error_setg(errp, "No MTTCG when icount is enabled");
213             } else {
214 #ifndef TARGET_SUPPORTS_MTTCG
215                 warn_report("Guest not yet converted to MTTCG - "
216                             "you may get unexpected results");
217 #endif
218                 if (!check_tcg_memory_orders_compatible()) {
219                     warn_report("Guest expects a stronger memory ordering "
220                                 "than the host provides");
221                     error_printf("This may cause strange/hard to debug errors\n");
222                 }
223                 mttcg_enabled = true;
224             }
225         } else if (strcmp(t, "single") == 0) {
226             mttcg_enabled = false;
227         } else {
228             error_setg(errp, "Invalid 'thread' setting %s", t);
229         }
230     } else {
231         mttcg_enabled = default_mttcg_enabled();
232     }
233 }
234
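/*
 * For illustration, the "thread" suboption parsed above corresponds to the
 * -accel command line, e.g.:
 *
 *   qemu-system-x86_64 -accel tcg,thread=multi  ...   # one thread per vCPU
 *   qemu-system-x86_64 -accel tcg,thread=single ...   # round-robin TCG
 *
 * When the suboption is absent, default_mttcg_enabled() chooses the mode.
 */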
235 /* The current number of executed instructions is based on what we
236  * originally budgeted minus the current state of the decrementing
237  * icount counters in extra/u16.low.
238  */
239 static int64_t cpu_get_icount_executed(CPUState *cpu)
240 {
241     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
242 }
243
244 /*
245  * Update the global shared timer_state.qemu_icount to take into
246  * account executed instructions. This is done by the TCG vCPU
247  * thread so the main-loop can see time has moved forward.
248  */
249 static void cpu_update_icount_locked(CPUState *cpu)
250 {
251     int64_t executed = cpu_get_icount_executed(cpu);
252     cpu->icount_budget -= executed;
253
254     atomic_set_i64(&timers_state.qemu_icount,
255                    timers_state.qemu_icount + executed);
256 }
257
258 /*
259  * Update the global shared timer_state.qemu_icount to take into
260  * account executed instructions. This is done by the TCG vCPU
261  * thread so the main-loop can see time has moved forward.
262  */
263 void cpu_update_icount(CPUState *cpu)
264 {
265     seqlock_write_lock(&timers_state.vm_clock_seqlock,
266                        &timers_state.vm_clock_lock);
267     cpu_update_icount_locked(cpu);
268     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
269                          &timers_state.vm_clock_lock);
270 }
271
272 static int64_t cpu_get_icount_raw_locked(void)
273 {
274     CPUState *cpu = current_cpu;
275
276     if (cpu && cpu->running) {
277         if (!cpu->can_do_io) {
278             error_report("Bad icount read");
279             exit(1);
280         }
281         /* Take into account what has run */
282         cpu_update_icount_locked(cpu);
283     }
284     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
285     return atomic_read_i64(&timers_state.qemu_icount);
286 }
287
288 static int64_t cpu_get_icount_locked(void)
289 {
290     int64_t icount = cpu_get_icount_raw_locked();
291     return atomic_read_i64(&timers_state.qemu_icount_bias) +
292         cpu_icount_to_ns(icount);
293 }
294
295 int64_t cpu_get_icount_raw(void)
296 {
297     int64_t icount;
298     unsigned start;
299
300     do {
301         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
302         icount = cpu_get_icount_raw_locked();
303     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
304
305     return icount;
306 }
307
308 /* Return the virtual CPU time, based on the instruction counter.  */
309 int64_t cpu_get_icount(void)
310 {
311     int64_t icount;
312     unsigned start;
313
314     do {
315         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
316         icount = cpu_get_icount_locked();
317     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
318
319     return icount;
320 }
321
322 int64_t cpu_icount_to_ns(int64_t icount)
323 {
324     return icount << atomic_read(&timers_state.icount_time_shift);
325 }
326
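/*
 * Worked example: with icount_time_shift == 3 every instruction accounts
 * for 1 << 3 = 8 ns of virtual time, i.e. roughly 125 MIPS (the initial
 * guess used by configure_icount()).  At MAX_ICOUNT_SHIFT == 10 each
 * instruction is worth 1024 ns, i.e. about 1 MIPS, the minimum allowable
 * speed mentioned above.
 */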
327 static int64_t cpu_get_ticks_locked(void)
328 {
329     int64_t ticks = timers_state.cpu_ticks_offset;
330     if (timers_state.cpu_ticks_enabled) {
331         ticks += cpu_get_host_ticks();
332     }
333
334     if (timers_state.cpu_ticks_prev > ticks) {
335         /* Non-increasing ticks may happen if the host uses software suspend.  */
336         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
337         ticks = timers_state.cpu_ticks_prev;
338     }
339
340     timers_state.cpu_ticks_prev = ticks;
341     return ticks;
342 }
343
344 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
345  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
346  * counter.
347  */
348 int64_t cpu_get_ticks(void)
349 {
350     int64_t ticks;
351
352     if (use_icount) {
353         return cpu_get_icount();
354     }
355
356     qemu_spin_lock(&timers_state.vm_clock_lock);
357     ticks = cpu_get_ticks_locked();
358     qemu_spin_unlock(&timers_state.vm_clock_lock);
359     return ticks;
360 }
361
362 static int64_t cpu_get_clock_locked(void)
363 {
364     int64_t time;
365
366     time = timers_state.cpu_clock_offset;
367     if (timers_state.cpu_ticks_enabled) {
368         time += get_clock();
369     }
370
371     return time;
372 }
373
374 /* Return the monotonic time elapsed in VM, i.e.,
375  * the time between vm_start and vm_stop
376  */
377 int64_t cpu_get_clock(void)
378 {
379     int64_t ti;
380     unsigned start;
381
382     do {
383         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
384         ti = cpu_get_clock_locked();
385     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
386
387     return ti;
388 }
389
390 /* enable cpu_get_ticks()
391  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
392  */
393 void cpu_enable_ticks(void)
394 {
395     seqlock_write_lock(&timers_state.vm_clock_seqlock,
396                        &timers_state.vm_clock_lock);
397     if (!timers_state.cpu_ticks_enabled) {
398         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
399         timers_state.cpu_clock_offset -= get_clock();
400         timers_state.cpu_ticks_enabled = 1;
401     }
402     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
403                        &timers_state.vm_clock_lock);
404 }
405
406 /* disable cpu_get_ticks() : the clock is stopped. You must not call
407  * cpu_get_ticks() after that.
408  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
409  */
410 void cpu_disable_ticks(void)
411 {
412     seqlock_write_lock(&timers_state.vm_clock_seqlock,
413                        &timers_state.vm_clock_lock);
414     if (timers_state.cpu_ticks_enabled) {
415         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
416         timers_state.cpu_clock_offset = cpu_get_clock_locked();
417         timers_state.cpu_ticks_enabled = 0;
418     }
419     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
420                          &timers_state.vm_clock_lock);
421 }
422
423 /* Correlation between real and virtual time is always going to be
424    fairly approximate, so ignore small variation.
425    When the guest is idle real and virtual time will be aligned in
426    the IO wait loop.  */
427 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
428
429 static void icount_adjust(void)
430 {
431     int64_t cur_time;
432     int64_t cur_icount;
433     int64_t delta;
434
435     /* Protected by TimersState mutex.  */
436     static int64_t last_delta;
437
438     /* If the VM is not running, then do nothing.  */
439     if (!runstate_is_running()) {
440         return;
441     }
442
443     seqlock_write_lock(&timers_state.vm_clock_seqlock,
444                        &timers_state.vm_clock_lock);
445     cur_time = cpu_get_clock_locked();
446     cur_icount = cpu_get_icount_locked();
447
448     delta = cur_icount - cur_time;
449     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
450     if (delta > 0
451         && last_delta + ICOUNT_WOBBLE < delta * 2
452         && timers_state.icount_time_shift > 0) {
453         /* The guest is getting too far ahead.  Slow time down.  */
454         atomic_set(&timers_state.icount_time_shift,
455                    timers_state.icount_time_shift - 1);
456     }
457     if (delta < 0
458         && last_delta - ICOUNT_WOBBLE > delta * 2
459         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
460         /* The guest is getting too far behind.  Speed time up.  */
461         atomic_set(&timers_state.icount_time_shift,
462                    timers_state.icount_time_shift + 1);
463     }
464     last_delta = delta;
465     atomic_set_i64(&timers_state.qemu_icount_bias,
466                    cur_icount - (timers_state.qemu_icount
467                                  << timers_state.icount_time_shift));
468     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
469                          &timers_state.vm_clock_lock);
470 }
471
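/*
 * Illustration of the adjustment above: delta = cur_icount - cur_time is
 * how far virtual time has drifted from real time.  If the guest runs too
 * far ahead, the shift is decremented so each instruction accounts for
 * fewer nanoseconds; if it falls behind, the shift is incremented.  The
 * bias is then recomputed so that (qemu_icount << shift) + bias still
 * equals cur_icount, keeping virtual time continuous across the change.
 */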
472 static void icount_adjust_rt(void *opaque)
473 {
474     timer_mod(timers_state.icount_rt_timer,
475               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
476     icount_adjust();
477 }
478
479 static void icount_adjust_vm(void *opaque)
480 {
481     timer_mod(timers_state.icount_vm_timer,
482                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
483                    NANOSECONDS_PER_SECOND / 10);
484     icount_adjust();
485 }
486
487 static int64_t qemu_icount_round(int64_t count)
488 {
489     int shift = atomic_read(&timers_state.icount_time_shift);
490     return (count + (1 << shift) - 1) >> shift;
491 }
492
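/*
 * Worked example: with shift == 3 a 20 ns deadline rounds up to
 * (20 + 7) >> 3 = 3 instructions, so the budget never covers less
 * virtual time than the deadline requires.
 */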
493 static void icount_warp_rt(void)
494 {
495     unsigned seq;
496     int64_t warp_start;
497
498     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
499      * changes from -1 to another value, so the race here is okay.
500      */
501     do {
502         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
503         warp_start = timers_state.vm_clock_warp_start;
504     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
505
506     if (warp_start == -1) {
507         return;
508     }
509
510     seqlock_write_lock(&timers_state.vm_clock_seqlock,
511                        &timers_state.vm_clock_lock);
512     if (runstate_is_running()) {
513         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
514                                             cpu_get_clock_locked());
515         int64_t warp_delta;
516
517         warp_delta = clock - timers_state.vm_clock_warp_start;
518         if (use_icount == 2) {
519             /*
520              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
521              * far ahead of real time.
522              */
523             int64_t cur_icount = cpu_get_icount_locked();
524             int64_t delta = clock - cur_icount;
525             warp_delta = MIN(warp_delta, delta);
526         }
527         atomic_set_i64(&timers_state.qemu_icount_bias,
528                        timers_state.qemu_icount_bias + warp_delta);
529     }
530     timers_state.vm_clock_warp_start = -1;
531     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
532                        &timers_state.vm_clock_lock);
533
534     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
535         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
536     }
537 }
538
539 static void icount_timer_cb(void *opaque)
540 {
541     /* No need for a checkpoint because the timer already synchronizes
542      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
543      */
544     icount_warp_rt();
545 }
546
547 void qtest_clock_warp(int64_t dest)
548 {
549     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
550     AioContext *aio_context;
551     assert(qtest_enabled());
552     aio_context = qemu_get_aio_context();
553     while (clock < dest) {
554         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
555         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
556
557         seqlock_write_lock(&timers_state.vm_clock_seqlock,
558                            &timers_state.vm_clock_lock);
559         atomic_set_i64(&timers_state.qemu_icount_bias,
560                        timers_state.qemu_icount_bias + warp);
561         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
562                              &timers_state.vm_clock_lock);
563
564         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
565         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
566         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
567     }
568     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
569 }
570
571 void qemu_start_warp_timer(void)
572 {
573     int64_t clock;
574     int64_t deadline;
575
576     if (!use_icount) {
577         return;
578     }
579
580     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
581      * do not fire, so computing the deadline does not make sense.
582      */
583     if (!runstate_is_running()) {
584         return;
585     }
586
587     if (replay_mode != REPLAY_MODE_PLAY) {
588         if (!all_cpu_threads_idle()) {
589             return;
590         }
591
592         if (qtest_enabled()) {
593             /* When testing, qtest commands advance icount.  */
594             return;
595         }
596
597         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
598     } else {
599         /* warp clock deterministically in record/replay mode */
600         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
601             /* vCPU is sleeping and warp can't be started.
602                This is probably a race condition: a notification sent
603                to the vCPU was processed early and the vCPU went to sleep.
604                Therefore we have to wake it up so it can do something. */
605             if (replay_has_checkpoint()) {
606                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
607             }
608             return;
609         }
610     }
611
612     /* We want to use the earliest deadline from ALL vm_clocks */
613     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
614     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
615     if (deadline < 0) {
616         static bool notified;
617         if (!icount_sleep && !notified) {
618             warn_report("icount sleep disabled and no active timers");
619             notified = true;
620         }
621         return;
622     }
623
624     if (deadline > 0) {
625         /*
626          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
627          * sleep.  Otherwise, the CPU might be waiting for a future timer
628          * interrupt to wake it up, but the interrupt never comes because
629          * the vCPU isn't running any insns and thus doesn't advance the
630          * QEMU_CLOCK_VIRTUAL.
631          */
632         if (!icount_sleep) {
633             /*
634              * We never let VCPUs sleep in no sleep icount mode.
635              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
636              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
637              * It is useful when we want a deterministic execution time,
638              * isolated from host latencies.
639              */
640             seqlock_write_lock(&timers_state.vm_clock_seqlock,
641                                &timers_state.vm_clock_lock);
642             atomic_set_i64(&timers_state.qemu_icount_bias,
643                            timers_state.qemu_icount_bias + deadline);
644             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
645                                  &timers_state.vm_clock_lock);
646             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
647         } else {
648             /*
649              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
650              * "real" time, (related to the time left until the next event) has
651              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
652              * This keeps the warps from being visible externally; for example,
653              * you will not send network packets continuously instead of
654              * every 100ms.
655              */
656             seqlock_write_lock(&timers_state.vm_clock_seqlock,
657                                &timers_state.vm_clock_lock);
658             if (timers_state.vm_clock_warp_start == -1
659                 || timers_state.vm_clock_warp_start > clock) {
660                 timers_state.vm_clock_warp_start = clock;
661             }
662             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
663                                  &timers_state.vm_clock_lock);
664             timer_mod_anticipate(timers_state.icount_warp_timer,
665                                  clock + deadline);
666         }
667     } else if (deadline == 0) {
668         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
669     }
670 }
671
672 static void qemu_account_warp_timer(void)
673 {
674     if (!use_icount || !icount_sleep) {
675         return;
676     }
677
678     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
679      * do not fire, so computing the deadline does not make sense.
680      */
681     if (!runstate_is_running()) {
682         return;
683     }
684
685     /* warp clock deterministically in record/replay mode */
686     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
687         return;
688     }
689
690     timer_del(timers_state.icount_warp_timer);
691     icount_warp_rt();
692 }
693
694 static bool icount_state_needed(void *opaque)
695 {
696     return use_icount;
697 }
698
699 static bool warp_timer_state_needed(void *opaque)
700 {
701     TimersState *s = opaque;
702     return s->icount_warp_timer != NULL;
703 }
704
705 static bool adjust_timers_state_needed(void *opaque)
706 {
707     TimersState *s = opaque;
708     return s->icount_rt_timer != NULL;
709 }
710
711 /*
712  * Subsection for warp timer migration is optional, because it may not be created
713  */
714 static const VMStateDescription icount_vmstate_warp_timer = {
715     .name = "timer/icount/warp_timer",
716     .version_id = 1,
717     .minimum_version_id = 1,
718     .needed = warp_timer_state_needed,
719     .fields = (VMStateField[]) {
720         VMSTATE_INT64(vm_clock_warp_start, TimersState),
721         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
722         VMSTATE_END_OF_LIST()
723     }
724 };
725
726 static const VMStateDescription icount_vmstate_adjust_timers = {
727     .name = "timer/icount/timers",
728     .version_id = 1,
729     .minimum_version_id = 1,
730     .needed = adjust_timers_state_needed,
731     .fields = (VMStateField[]) {
732         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
733         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
734         VMSTATE_END_OF_LIST()
735     }
736 };
737
738 /*
739  * This is a subsection for icount migration.
740  */
741 static const VMStateDescription icount_vmstate_timers = {
742     .name = "timer/icount",
743     .version_id = 1,
744     .minimum_version_id = 1,
745     .needed = icount_state_needed,
746     .fields = (VMStateField[]) {
747         VMSTATE_INT64(qemu_icount_bias, TimersState),
748         VMSTATE_INT64(qemu_icount, TimersState),
749         VMSTATE_END_OF_LIST()
750     },
751     .subsections = (const VMStateDescription*[]) {
752         &icount_vmstate_warp_timer,
753         &icount_vmstate_adjust_timers,
754         NULL
755     }
756 };
757
758 static const VMStateDescription vmstate_timers = {
759     .name = "timer",
760     .version_id = 2,
761     .minimum_version_id = 1,
762     .fields = (VMStateField[]) {
763         VMSTATE_INT64(cpu_ticks_offset, TimersState),
764         VMSTATE_UNUSED(8),
765         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
766         VMSTATE_END_OF_LIST()
767     },
768     .subsections = (const VMStateDescription*[]) {
769         &icount_vmstate_timers,
770         NULL
771     }
772 };
773
774 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
775 {
776     double pct;
777     double throttle_ratio;
778     long sleeptime_ns;
779
780     if (!cpu_throttle_get_percentage()) {
781         return;
782     }
783
784     pct = (double)cpu_throttle_get_percentage()/100;
785     throttle_ratio = pct / (1 - pct);
786     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
787
788     qemu_mutex_unlock_iothread();
789     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
790     qemu_mutex_lock_iothread();
791     atomic_set(&cpu->throttle_thread_scheduled, 0);
792 }
793
794 static void cpu_throttle_timer_tick(void *opaque)
795 {
796     CPUState *cpu;
797     double pct;
798
799     /* Stop the timer if needed */
800     if (!cpu_throttle_get_percentage()) {
801         return;
802     }
803     CPU_FOREACH(cpu) {
804         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
805             async_run_on_cpu(cpu, cpu_throttle_thread,
806                              RUN_ON_CPU_NULL);
807         }
808     }
809
810     pct = (double)cpu_throttle_get_percentage()/100;
811     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
812                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
813 }
814
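/*
 * Worked example of the throttling arithmetic above: at 75% throttle,
 * pct = 0.75 and throttle_ratio = 0.75 / 0.25 = 3, so each vCPU sleeps
 * 3 * 10 ms = 30 ms per tick while the tick itself fires every
 * 10 ms / (1 - 0.75) = 40 ms.  The vCPU therefore runs for roughly 10 ms
 * out of every 40 ms of real time, i.e. about 25% of the time.
 */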
815 void cpu_throttle_set(int new_throttle_pct)
816 {
817     /* Ensure throttle percentage is within valid range */
818     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
819     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
820
821     atomic_set(&throttle_percentage, new_throttle_pct);
822
823     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
824                                        CPU_THROTTLE_TIMESLICE_NS);
825 }
826
827 void cpu_throttle_stop(void)
828 {
829     atomic_set(&throttle_percentage, 0);
830 }
831
832 bool cpu_throttle_active(void)
833 {
834     return (cpu_throttle_get_percentage() != 0);
835 }
836
837 int cpu_throttle_get_percentage(void)
838 {
839     return atomic_read(&throttle_percentage);
840 }
841
842 void cpu_ticks_init(void)
843 {
844     seqlock_init(&timers_state.vm_clock_seqlock);
845     qemu_spin_init(&timers_state.vm_clock_lock);
846     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
847     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
848                                            cpu_throttle_timer_tick, NULL);
849 }
850
851 void configure_icount(QemuOpts *opts, Error **errp)
852 {
853     const char *option;
854     char *rem_str = NULL;
855
856     option = qemu_opt_get(opts, "shift");
857     if (!option) {
858         if (qemu_opt_get(opts, "align") != NULL) {
859             error_setg(errp, "Please specify shift option when using align");
860         }
861         return;
862     }
863
864     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
865     if (icount_sleep) {
866         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
867                                          icount_timer_cb, NULL);
868     }
869
870     icount_align_option = qemu_opt_get_bool(opts, "align", false);
871
872     if (icount_align_option && !icount_sleep) {
873         error_setg(errp, "align=on and sleep=off are incompatible");
874     }
875     if (strcmp(option, "auto") != 0) {
876         errno = 0;
877         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
878         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
879             error_setg(errp, "icount: Invalid shift value");
880         }
881         use_icount = 1;
882         return;
883     } else if (icount_align_option) {
884         error_setg(errp, "shift=auto and align=on are incompatible");
885     } else if (!icount_sleep) {
886         error_setg(errp, "shift=auto and sleep=off are incompatible");
887     }
888
889     use_icount = 2;
890
891     /* 125MIPS seems a reasonable initial guess at the guest speed.
892        It will be corrected fairly quickly anyway.  */
893     timers_state.icount_time_shift = 3;
894
895     /* Have both realtime and virtual time triggers for speed adjustment.
896        The realtime trigger catches emulated time passing too slowly,
897        the virtual time trigger catches emulated time passing too fast.
898        Realtime triggers occur even when idle, so use them less frequently
899        than VM triggers.  */
900     timers_state.vm_clock_warp_start = -1;
901     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
902                                    icount_adjust_rt, NULL);
903     timer_mod(timers_state.icount_rt_timer,
904                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
905     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
906                                         icount_adjust_vm, NULL);
907     timer_mod(timers_state.icount_vm_timer,
908                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
909                    NANOSECONDS_PER_SECOND / 10);
910 }
911
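/*
 * For illustration, the options parsed above correspond to command lines
 * such as:
 *
 *   -icount shift=7                fixed rate: one insn per 128 ns
 *   -icount shift=auto             adaptive rate (use_icount == 2)
 *   -icount shift=auto,sleep=off   deterministic elapsed time, no sleeping
 *   -icount shift=7,align=on       host/guest clock alignment (needs sleep=on)
 */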
912 /***********************************************************/
913 /* TCG vCPU kick timer
914  *
915  * The kick timer is responsible for moving single-threaded vCPU
916  * emulation on to the next vCPU. If more than one vCPU is running, a
917  * timer event will force a cpu->exit so the next vCPU can get
918  * scheduled.
919  *
920  * The timer is removed if all vCPUs are idle and restarted again once
921  * the idle period ends.
922  */
923
924 static QEMUTimer *tcg_kick_vcpu_timer;
925 static CPUState *tcg_current_rr_cpu;
926
927 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
928
929 static inline int64_t qemu_tcg_next_kick(void)
930 {
931     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
932 }
933
934 /* Kick the currently round-robin scheduled vCPU */
935 static void qemu_cpu_kick_rr_cpu(void)
936 {
937     CPUState *cpu;
938     do {
939         cpu = atomic_mb_read(&tcg_current_rr_cpu);
940         if (cpu) {
941             cpu_exit(cpu);
942         }
943     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
944 }
945
946 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
947 {
948 }
949
950 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
951 {
952     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
953         qemu_notify_event();
954         return;
955     }
956
957     if (qemu_in_vcpu_thread()) {
958         /* A CPU is currently running; kick it back out to the
959          * tcg_cpu_exec() loop so it will recalculate its
960          * icount deadline immediately.
961          */
962         qemu_cpu_kick(current_cpu);
963     } else if (first_cpu) {
964         /* qemu_cpu_kick is not enough to kick a halted CPU out of
965          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
966          * causes cpu_thread_is_idle to return false.  This way,
967          * handle_icount_deadline can run.
968          * If we have no CPUs at all for some reason, we don't
969          * need to do anything.
970          */
971         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
972     }
973 }
974
975 static void kick_tcg_thread(void *opaque)
976 {
977     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
978     qemu_cpu_kick_rr_cpu();
979 }
980
981 static void start_tcg_kick_timer(void)
982 {
983     assert(!mttcg_enabled);
984     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
985         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
986                                            kick_tcg_thread, NULL);
987     }
988     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
989         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
990     }
991 }
992
993 static void stop_tcg_kick_timer(void)
994 {
995     assert(!mttcg_enabled);
996     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
997         timer_del(tcg_kick_vcpu_timer);
998     }
999 }
1000
1001 /***********************************************************/
1002 void hw_error(const char *fmt, ...)
1003 {
1004     va_list ap;
1005     CPUState *cpu;
1006
1007     va_start(ap, fmt);
1008     fprintf(stderr, "qemu: hardware error: ");
1009     vfprintf(stderr, fmt, ap);
1010     fprintf(stderr, "\n");
1011     CPU_FOREACH(cpu) {
1012         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1013         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1014     }
1015     va_end(ap);
1016     abort();
1017 }
1018
1019 void cpu_synchronize_all_states(void)
1020 {
1021     CPUState *cpu;
1022
1023     CPU_FOREACH(cpu) {
1024         cpu_synchronize_state(cpu);
1025         /* TODO: move to cpu_synchronize_state() */
1026         if (hvf_enabled()) {
1027             hvf_cpu_synchronize_state(cpu);
1028         }
1029     }
1030 }
1031
1032 void cpu_synchronize_all_post_reset(void)
1033 {
1034     CPUState *cpu;
1035
1036     CPU_FOREACH(cpu) {
1037         cpu_synchronize_post_reset(cpu);
1038         /* TODO: move to cpu_synchronize_post_reset() */
1039         if (hvf_enabled()) {
1040             hvf_cpu_synchronize_post_reset(cpu);
1041         }
1042     }
1043 }
1044
1045 void cpu_synchronize_all_post_init(void)
1046 {
1047     CPUState *cpu;
1048
1049     CPU_FOREACH(cpu) {
1050         cpu_synchronize_post_init(cpu);
1051         /* TODO: move to cpu_synchronize_post_init() */
1052         if (hvf_enabled()) {
1053             hvf_cpu_synchronize_post_init(cpu);
1054         }
1055     }
1056 }
1057
1058 void cpu_synchronize_all_pre_loadvm(void)
1059 {
1060     CPUState *cpu;
1061
1062     CPU_FOREACH(cpu) {
1063         cpu_synchronize_pre_loadvm(cpu);
1064     }
1065 }
1066
1067 static int do_vm_stop(RunState state, bool send_stop)
1068 {
1069     int ret = 0;
1070
1071     if (runstate_is_running()) {
1072         cpu_disable_ticks();
1073         pause_all_vcpus();
1074         runstate_set(state);
1075         vm_state_notify(0, state);
1076         if (send_stop) {
1077             qapi_event_send_stop();
1078         }
1079     }
1080
1081     bdrv_drain_all();
1082     replay_disable_events();
1083     ret = bdrv_flush_all();
1084
1085     return ret;
1086 }
1087
1088 /* Special vm_stop() variant for terminating the process.  Historically clients
1089  * did not expect a QMP STOP event and so we need to retain compatibility.
1090  */
1091 int vm_shutdown(void)
1092 {
1093     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1094 }
1095
1096 static bool cpu_can_run(CPUState *cpu)
1097 {
1098     if (cpu->stop) {
1099         return false;
1100     }
1101     if (cpu_is_stopped(cpu)) {
1102         return false;
1103     }
1104     return true;
1105 }
1106
1107 static void cpu_handle_guest_debug(CPUState *cpu)
1108 {
1109     gdb_set_stop_cpu(cpu);
1110     qemu_system_debug_request();
1111     cpu->stopped = true;
1112 }
1113
1114 #ifdef CONFIG_LINUX
1115 static void sigbus_reraise(void)
1116 {
1117     sigset_t set;
1118     struct sigaction action;
1119
1120     memset(&action, 0, sizeof(action));
1121     action.sa_handler = SIG_DFL;
1122     if (!sigaction(SIGBUS, &action, NULL)) {
1123         raise(SIGBUS);
1124         sigemptyset(&set);
1125         sigaddset(&set, SIGBUS);
1126         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1127     }
1128     perror("Failed to re-raise SIGBUS!\n");
1129     abort();
1130 }
1131
1132 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1133 {
1134     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1135         sigbus_reraise();
1136     }
1137
1138     if (current_cpu) {
1139         /* Called asynchronously in VCPU thread.  */
1140         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1141             sigbus_reraise();
1142         }
1143     } else {
1144         /* Called synchronously (via signalfd) in main thread.  */
1145         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1146             sigbus_reraise();
1147         }
1148     }
1149 }
1150
1151 static void qemu_init_sigbus(void)
1152 {
1153     struct sigaction action;
1154
1155     memset(&action, 0, sizeof(action));
1156     action.sa_flags = SA_SIGINFO;
1157     action.sa_sigaction = sigbus_handler;
1158     sigaction(SIGBUS, &action, NULL);
1159
1160     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1161 }
1162 #else /* !CONFIG_LINUX */
1163 static void qemu_init_sigbus(void)
1164 {
1165 }
1166 #endif /* !CONFIG_LINUX */
1167
1168 static QemuMutex qemu_global_mutex;
1169
1170 static QemuThread io_thread;
1171
1172 /* cpu creation */
1173 static QemuCond qemu_cpu_cond;
1174 /* system init */
1175 static QemuCond qemu_pause_cond;
1176
1177 void qemu_init_cpu_loop(void)
1178 {
1179     qemu_init_sigbus();
1180     qemu_cond_init(&qemu_cpu_cond);
1181     qemu_cond_init(&qemu_pause_cond);
1182     qemu_mutex_init(&qemu_global_mutex);
1183
1184     qemu_thread_get_self(&io_thread);
1185 }
1186
1187 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1188 {
1189     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1190 }
1191
1192 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1193 {
1194     if (kvm_destroy_vcpu(cpu) < 0) {
1195         error_report("kvm_destroy_vcpu failed");
1196         exit(EXIT_FAILURE);
1197     }
1198 }
1199
1200 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1201 {
1202 }
1203
1204 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1205 {
1206     g_assert(qemu_cpu_is_self(cpu));
1207     cpu->stop = false;
1208     cpu->stopped = true;
1209     if (exit) {
1210         cpu_exit(cpu);
1211     }
1212     qemu_cond_broadcast(&qemu_pause_cond);
1213 }
1214
1215 static void qemu_wait_io_event_common(CPUState *cpu)
1216 {
1217     atomic_mb_set(&cpu->thread_kicked, false);
1218     if (cpu->stop) {
1219         qemu_cpu_stop(cpu, false);
1220     }
1221     process_queued_cpu_work(cpu);
1222 }
1223
1224 static void qemu_tcg_rr_wait_io_event(void)
1225 {
1226     CPUState *cpu;
1227
1228     while (all_cpu_threads_idle()) {
1229         stop_tcg_kick_timer();
1230         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1231     }
1232
1233     start_tcg_kick_timer();
1234
1235     CPU_FOREACH(cpu) {
1236         qemu_wait_io_event_common(cpu);
1237     }
1238 }
1239
1240 static void qemu_wait_io_event(CPUState *cpu)
1241 {
1242     while (cpu_thread_is_idle(cpu)) {
1243         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1244     }
1245
1246 #ifdef _WIN32
1247     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1248     if (!tcg_enabled()) {
1249         SleepEx(0, TRUE);
1250     }
1251 #endif
1252     qemu_wait_io_event_common(cpu);
1253 }
1254
1255 static void *qemu_kvm_cpu_thread_fn(void *arg)
1256 {
1257     CPUState *cpu = arg;
1258     int r;
1259
1260     rcu_register_thread();
1261
1262     qemu_mutex_lock_iothread();
1263     qemu_thread_get_self(cpu->thread);
1264     cpu->thread_id = qemu_get_thread_id();
1265     cpu->can_do_io = 1;
1266     current_cpu = cpu;
1267
1268     r = kvm_init_vcpu(cpu);
1269     if (r < 0) {
1270         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1271         exit(1);
1272     }
1273
1274     kvm_init_cpu_signals(cpu);
1275
1276     /* signal CPU creation */
1277     cpu->created = true;
1278     qemu_cond_signal(&qemu_cpu_cond);
1279
1280     do {
1281         if (cpu_can_run(cpu)) {
1282             r = kvm_cpu_exec(cpu);
1283             if (r == EXCP_DEBUG) {
1284                 cpu_handle_guest_debug(cpu);
1285             }
1286         }
1287         qemu_wait_io_event(cpu);
1288     } while (!cpu->unplug || cpu_can_run(cpu));
1289
1290     qemu_kvm_destroy_vcpu(cpu);
1291     cpu->created = false;
1292     qemu_cond_signal(&qemu_cpu_cond);
1293     qemu_mutex_unlock_iothread();
1294     rcu_unregister_thread();
1295     return NULL;
1296 }
1297
1298 static void *qemu_dummy_cpu_thread_fn(void *arg)
1299 {
1300 #ifdef _WIN32
1301     error_report("qtest is not supported under Windows");
1302     exit(1);
1303 #else
1304     CPUState *cpu = arg;
1305     sigset_t waitset;
1306     int r;
1307
1308     rcu_register_thread();
1309
1310     qemu_mutex_lock_iothread();
1311     qemu_thread_get_self(cpu->thread);
1312     cpu->thread_id = qemu_get_thread_id();
1313     cpu->can_do_io = 1;
1314     current_cpu = cpu;
1315
1316     sigemptyset(&waitset);
1317     sigaddset(&waitset, SIG_IPI);
1318
1319     /* signal CPU creation */
1320     cpu->created = true;
1321     qemu_cond_signal(&qemu_cpu_cond);
1322
1323     do {
1324         qemu_mutex_unlock_iothread();
1325         do {
1326             int sig;
1327             r = sigwait(&waitset, &sig);
1328         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1329         if (r == -1) {
1330             perror("sigwait");
1331             exit(1);
1332         }
1333         qemu_mutex_lock_iothread();
1334         qemu_wait_io_event(cpu);
1335     } while (!cpu->unplug);
1336
1337     qemu_mutex_unlock_iothread();
1338     rcu_unregister_thread();
1339     return NULL;
1340 #endif
1341 }
1342
1343 static int64_t tcg_get_icount_limit(void)
1344 {
1345     int64_t deadline;
1346
1347     if (replay_mode != REPLAY_MODE_PLAY) {
1348         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1349
1350         /* Maintain prior (possibly buggy) behaviour where if no deadline
1351          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1352          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1353          * nanoseconds.
1354          */
1355         if ((deadline < 0) || (deadline > INT32_MAX)) {
1356             deadline = INT32_MAX;
1357         }
1358
1359         return qemu_icount_round(deadline);
1360     } else {
1361         return replay_get_instructions();
1362     }
1363 }
1364
1365 static void handle_icount_deadline(void)
1366 {
1367     assert(qemu_in_vcpu_thread());
1368     if (use_icount) {
1369         int64_t deadline =
1370             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1371
1372         if (deadline == 0) {
1373             /* Wake up other AioContexts.  */
1374             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1375             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1376         }
1377     }
1378 }
1379
1380 static void prepare_icount_for_run(CPUState *cpu)
1381 {
1382     if (use_icount) {
1383         int insns_left;
1384
1385         /* These should always be cleared by process_icount_data after
1386          * each vCPU execution. However u16.high can be raised
1387          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1388          */
1389         g_assert(cpu->icount_decr.u16.low == 0);
1390         g_assert(cpu->icount_extra == 0);
1391
1392         cpu->icount_budget = tcg_get_icount_limit();
1393         insns_left = MIN(0xffff, cpu->icount_budget);
1394         cpu->icount_decr.u16.low = insns_left;
1395         cpu->icount_extra = cpu->icount_budget - insns_left;
1396
1397         replay_mutex_lock();
1398     }
1399 }
1400
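/*
 * Worked example: a budget of 200000 instructions is split into
 * icount_decr.u16.low = 0xffff (65535) and icount_extra = 134465.  The
 * generated code only decrements u16.low; when it reaches zero the
 * execution loop refills it from icount_extra until the budget is used up.
 */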
1401 static void process_icount_data(CPUState *cpu)
1402 {
1403     if (use_icount) {
1404         /* Account for executed instructions */
1405         cpu_update_icount(cpu);
1406
1407         /* Reset the counters */
1408         cpu->icount_decr.u16.low = 0;
1409         cpu->icount_extra = 0;
1410         cpu->icount_budget = 0;
1411
1412         replay_account_executed_instructions();
1413
1414         replay_mutex_unlock();
1415     }
1416 }
1417
1418
1419 static int tcg_cpu_exec(CPUState *cpu)
1420 {
1421     int ret;
1422 #ifdef CONFIG_PROFILER
1423     int64_t ti;
1424 #endif
1425
1426     assert(tcg_enabled());
1427 #ifdef CONFIG_PROFILER
1428     ti = profile_getclock();
1429 #endif
1430     cpu_exec_start(cpu);
1431     ret = cpu_exec(cpu);
1432     cpu_exec_end(cpu);
1433 #ifdef CONFIG_PROFILER
1434     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1435                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1436 #endif
1437     return ret;
1438 }
1439
1440 /* Destroy any remaining vCPUs which have been unplugged and have
1441  * finished running
1442  */
1443 static void deal_with_unplugged_cpus(void)
1444 {
1445     CPUState *cpu;
1446
1447     CPU_FOREACH(cpu) {
1448         if (cpu->unplug && !cpu_can_run(cpu)) {
1449             qemu_tcg_destroy_vcpu(cpu);
1450             cpu->created = false;
1451             qemu_cond_signal(&qemu_cpu_cond);
1452             break;
1453         }
1454     }
1455 }
1456
1457 /* Single-threaded TCG
1458  *
1459  * In the single-threaded case each vCPU is simulated in turn. If
1460  * there is more than a single vCPU we create a simple timer to kick
1461  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1462  * This is done explicitly rather than relying on side-effects
1463  * elsewhere.
1464  */
1465
1466 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1467 {
1468     CPUState *cpu = arg;
1469
1470     assert(tcg_enabled());
1471     rcu_register_thread();
1472     tcg_register_thread();
1473
1474     qemu_mutex_lock_iothread();
1475     qemu_thread_get_self(cpu->thread);
1476
1477     cpu->thread_id = qemu_get_thread_id();
1478     cpu->created = true;
1479     cpu->can_do_io = 1;
1480     qemu_cond_signal(&qemu_cpu_cond);
1481
1482     /* wait for initial kick-off after machine start */
1483     while (first_cpu->stopped) {
1484         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1485
1486         /* process any pending work */
1487         CPU_FOREACH(cpu) {
1488             current_cpu = cpu;
1489             qemu_wait_io_event_common(cpu);
1490         }
1491     }
1492
1493     start_tcg_kick_timer();
1494
1495     cpu = first_cpu;
1496
1497     /* process any pending work */
1498     cpu->exit_request = 1;
1499
1500     while (1) {
1501         qemu_mutex_unlock_iothread();
1502         replay_mutex_lock();
1503         qemu_mutex_lock_iothread();
1504         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1505         qemu_account_warp_timer();
1506
1507         /* Run the timers here.  This is much more efficient than
1508          * waking up the I/O thread and waiting for completion.
1509          */
1510         handle_icount_deadline();
1511
1512         replay_mutex_unlock();
1513
1514         if (!cpu) {
1515             cpu = first_cpu;
1516         }
1517
1518         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1519
1520             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1521             current_cpu = cpu;
1522
1523             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1524                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1525
1526             if (cpu_can_run(cpu)) {
1527                 int r;
1528
1529                 qemu_mutex_unlock_iothread();
1530                 prepare_icount_for_run(cpu);
1531
1532                 r = tcg_cpu_exec(cpu);
1533
1534                 process_icount_data(cpu);
1535                 qemu_mutex_lock_iothread();
1536
1537                 if (r == EXCP_DEBUG) {
1538                     cpu_handle_guest_debug(cpu);
1539                     break;
1540                 } else if (r == EXCP_ATOMIC) {
1541                     qemu_mutex_unlock_iothread();
1542                     cpu_exec_step_atomic(cpu);
1543                     qemu_mutex_lock_iothread();
1544                     break;
1545                 }
1546             } else if (cpu->stop) {
1547                 if (cpu->unplug) {
1548                     cpu = CPU_NEXT(cpu);
1549                 }
1550                 break;
1551             }
1552
1553             cpu = CPU_NEXT(cpu);
1554         } /* while (cpu && !cpu->exit_request).. */
1555
1556         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1557         atomic_set(&tcg_current_rr_cpu, NULL);
1558
1559         if (cpu && cpu->exit_request) {
1560             atomic_mb_set(&cpu->exit_request, 0);
1561         }
1562
1563         if (use_icount && all_cpu_threads_idle()) {
1564             /*
1565              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1566              * in the main_loop, wake it up in order to start the warp timer.
1567              */
1568             qemu_notify_event();
1569         }
1570
1571         qemu_tcg_rr_wait_io_event();
1572         deal_with_unplugged_cpus();
1573     }
1574
1575     rcu_unregister_thread();
1576     return NULL;
1577 }
1578
1579 static void *qemu_hax_cpu_thread_fn(void *arg)
1580 {
1581     CPUState *cpu = arg;
1582     int r;
1583
1584     rcu_register_thread();
1585     qemu_mutex_lock_iothread();
1586     qemu_thread_get_self(cpu->thread);
1587
1588     cpu->thread_id = qemu_get_thread_id();
1589     cpu->created = true;
1590     cpu->halted = 0;
1591     current_cpu = cpu;
1592
1593     hax_init_vcpu(cpu);
1594     qemu_cond_signal(&qemu_cpu_cond);
1595
1596     do {
1597         if (cpu_can_run(cpu)) {
1598             r = hax_smp_cpu_exec(cpu);
1599             if (r == EXCP_DEBUG) {
1600                 cpu_handle_guest_debug(cpu);
1601             }
1602         }
1603
1604         qemu_wait_io_event(cpu);
1605     } while (!cpu->unplug || cpu_can_run(cpu));
1606     rcu_unregister_thread();
1607     return NULL;
1608 }
1609
1610 /* The HVF-specific vCPU thread function. This one should only run when the host
1611  * CPU supports the VMX "unrestricted guest" feature. */
1612 static void *qemu_hvf_cpu_thread_fn(void *arg)
1613 {
1614     CPUState *cpu = arg;
1615
1616     int r;
1617
1618     assert(hvf_enabled());
1619
1620     rcu_register_thread();
1621
1622     qemu_mutex_lock_iothread();
1623     qemu_thread_get_self(cpu->thread);
1624
1625     cpu->thread_id = qemu_get_thread_id();
1626     cpu->can_do_io = 1;
1627     current_cpu = cpu;
1628
1629     hvf_init_vcpu(cpu);
1630
1631     /* signal CPU creation */
1632     cpu->created = true;
1633     qemu_cond_signal(&qemu_cpu_cond);
1634
1635     do {
1636         if (cpu_can_run(cpu)) {
1637             r = hvf_vcpu_exec(cpu);
1638             if (r == EXCP_DEBUG) {
1639                 cpu_handle_guest_debug(cpu);
1640             }
1641         }
1642         qemu_wait_io_event(cpu);
1643     } while (!cpu->unplug || cpu_can_run(cpu));
1644
1645     hvf_vcpu_destroy(cpu);
1646     cpu->created = false;
1647     qemu_cond_signal(&qemu_cpu_cond);
1648     qemu_mutex_unlock_iothread();
1649     rcu_unregister_thread();
1650     return NULL;
1651 }
1652
1653 static void *qemu_whpx_cpu_thread_fn(void *arg)
1654 {
1655     CPUState *cpu = arg;
1656     int r;
1657
1658     rcu_register_thread();
1659
1660     qemu_mutex_lock_iothread();
1661     qemu_thread_get_self(cpu->thread);
1662     cpu->thread_id = qemu_get_thread_id();
1663     current_cpu = cpu;
1664
1665     r = whpx_init_vcpu(cpu);
1666     if (r < 0) {
1667         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1668         exit(1);
1669     }
1670
1671     /* signal CPU creation */
1672     cpu->created = true;
1673     qemu_cond_signal(&qemu_cpu_cond);
1674
1675     do {
1676         if (cpu_can_run(cpu)) {
1677             r = whpx_vcpu_exec(cpu);
1678             if (r == EXCP_DEBUG) {
1679                 cpu_handle_guest_debug(cpu);
1680             }
1681         }
1682         while (cpu_thread_is_idle(cpu)) {
1683             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1684         }
1685         qemu_wait_io_event_common(cpu);
1686     } while (!cpu->unplug || cpu_can_run(cpu));
1687
1688     whpx_destroy_vcpu(cpu);
1689     cpu->created = false;
1690     qemu_cond_signal(&qemu_cpu_cond);
1691     qemu_mutex_unlock_iothread();
1692     rcu_unregister_thread();
1693     return NULL;
1694 }
1695
1696 #ifdef _WIN32
1697 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1698 {
1699 }
1700 #endif
1701
1702 /* Multi-threaded TCG
1703  *
1704  * In the multi-threaded case each vCPU has its own thread. The TLS
1705  * variable current_cpu can be used deep in the code to find the
1706  * current CPUState for a given thread.
1707  */
1708
1709 static void *qemu_tcg_cpu_thread_fn(void *arg)
1710 {
1711     CPUState *cpu = arg;
1712
1713     assert(tcg_enabled());
1714     g_assert(!use_icount);
1715
1716     rcu_register_thread();
1717     tcg_register_thread();
1718
1719     qemu_mutex_lock_iothread();
1720     qemu_thread_get_self(cpu->thread);
1721
1722     cpu->thread_id = qemu_get_thread_id();
1723     cpu->created = true;
1724     cpu->can_do_io = 1;
1725     current_cpu = cpu;
1726     qemu_cond_signal(&qemu_cpu_cond);
1727
1728     /* process any pending work */
1729     cpu->exit_request = 1;
1730
1731     do {
1732         if (cpu_can_run(cpu)) {
1733             int r;
1734             qemu_mutex_unlock_iothread();
1735             r = tcg_cpu_exec(cpu);
1736             qemu_mutex_lock_iothread();
1737             switch (r) {
1738             case EXCP_DEBUG:
1739                 cpu_handle_guest_debug(cpu);
1740                 break;
1741             case EXCP_HALTED:
1742                 /* during start-up the vCPU is reset and the thread is
1743                  * kicked several times. If we don't ensure we go back
1744                  * to sleep in the halted state we won't cleanly
1745                  * start up when the vCPU is enabled.
1746                  *
1747                  * cpu->halted should ensure we sleep in wait_io_event
1748                  */
1749                 g_assert(cpu->halted);
1750                 break;
1751             case EXCP_ATOMIC:
1752                 qemu_mutex_unlock_iothread();
1753                 cpu_exec_step_atomic(cpu);
1754                 qemu_mutex_lock_iothread();
1755             default:
1756                 /* Ignore everything else? */
1757                 break;
1758             }
1759         }
1760
1761         atomic_mb_set(&cpu->exit_request, 0);
1762         qemu_wait_io_event(cpu);
1763     } while (!cpu->unplug || cpu_can_run(cpu));
1764
1765     qemu_tcg_destroy_vcpu(cpu);
1766     cpu->created = false;
1767     qemu_cond_signal(&qemu_cpu_cond);
1768     qemu_mutex_unlock_iothread();
1769     rcu_unregister_thread();
1770     return NULL;
1771 }
1772
1773 static void qemu_cpu_kick_thread(CPUState *cpu)
1774 {
1775 #ifndef _WIN32
1776     int err;
1777
1778     if (cpu->thread_kicked) {
1779         return;
1780     }
1781     cpu->thread_kicked = true;
1782     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1783     if (err && err != ESRCH) {
1784         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1785         exit(1);
1786     }
1787 #else /* _WIN32 */
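         /*
          * Windows has no signal to interrupt a vCPU thread.  WHPX provides
          * its own kick; for the others we queue a no-op APC, which is
          * enough to wake the thread out of an alertable wait so it can
          * re-check its exit conditions.
          */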
1788     if (!qemu_cpu_is_self(cpu)) {
1789         if (whpx_enabled()) {
1790             whpx_vcpu_kick(cpu);
1791         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1792             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1793                     __func__, GetLastError());
1794             exit(1);
1795         }
1796     }
1797 #endif
1798 }
1799
1800 void qemu_cpu_kick(CPUState *cpu)
1801 {
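         /*
          * Wake any thread sleeping on cpu->halt_cond, then make sure a
          * thread that is actually executing guest code drops back out to
          * its main loop.
          */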
1802     qemu_cond_broadcast(cpu->halt_cond);
1803     if (tcg_enabled()) {
1804         cpu_exit(cpu);
1805         /* NOP unless doing single-thread RR */
1806         qemu_cpu_kick_rr_cpu();
1807     } else {
1808         if (hax_enabled()) {
1809             /*
1810              * FIXME: race condition with the exit_request check in
1811              * hax_vcpu_hax_exec
1812              */
1813             cpu->exit_request = 1;
1814         }
1815         qemu_cpu_kick_thread(cpu);
1816     }
1817 }
1818
1819 void qemu_cpu_kick_self(void)
1820 {
1821     assert(current_cpu);
1822     qemu_cpu_kick_thread(current_cpu);
1823 }
1824
1825 bool qemu_cpu_is_self(CPUState *cpu)
1826 {
1827     return qemu_thread_is_self(cpu->thread);
1828 }
1829
1830 bool qemu_in_vcpu_thread(void)
1831 {
1832     return current_cpu && qemu_cpu_is_self(current_cpu);
1833 }
1834
1835 static __thread bool iothread_locked = false;
1836
1837 bool qemu_mutex_iothread_locked(void)
1838 {
1839     return iothread_locked;
1840 }
1841
1842 /*
1843  * The BQL is taken from so many places that it is worth profiling the
1844  * callers directly, instead of funneling them all through a single function.
1845  */
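     /*
      * The indirection through qemu_bql_mutex_lock_func is what makes such
      * profiling possible: a profiler (e.g. the sync profiler) can swap in
      * an instrumented lock function at run time.
      */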
1846 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1847 {
1848     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1849
1850     g_assert(!qemu_mutex_iothread_locked());
1851     bql_lock(&qemu_global_mutex, file, line);
1852     iothread_locked = true;
1853 }
1854
1855 void qemu_mutex_unlock_iothread(void)
1856 {
1857     g_assert(qemu_mutex_iothread_locked());
1858     iothread_locked = false;
1859     qemu_mutex_unlock(&qemu_global_mutex);
1860 }
1861
1862 static bool all_vcpus_paused(void)
1863 {
1864     CPUState *cpu;
1865
1866     CPU_FOREACH(cpu) {
1867         if (!cpu->stopped) {
1868             return false;
1869         }
1870     }
1871
1872     return true;
1873 }
1874
1875 void pause_all_vcpus(void)
1876 {
1877     CPUState *cpu;
1878
1879     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1880     CPU_FOREACH(cpu) {
1881         if (qemu_cpu_is_self(cpu)) {
1882             qemu_cpu_stop(cpu, true);
1883         } else {
1884             cpu->stop = true;
1885             qemu_cpu_kick(cpu);
1886         }
1887     }
1888
1889     /* We need to drop the replay_lock so any vCPU threads woken up
1890      * can finish their replay tasks
1891      */
1892     replay_mutex_unlock();
1893
1894     while (!all_vcpus_paused()) {
1895         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1896         CPU_FOREACH(cpu) {
1897             qemu_cpu_kick(cpu);
1898         }
1899     }
1900
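         /*
          * Re-take the locks in the right order: the replay mutex nests
          * outside the BQL, so drop the BQL before re-acquiring
          * replay_mutex and then take the BQL again.
          */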
1901     qemu_mutex_unlock_iothread();
1902     replay_mutex_lock();
1903     qemu_mutex_lock_iothread();
1904 }
1905
1906 void cpu_resume(CPUState *cpu)
1907 {
1908     cpu->stop = false;
1909     cpu->stopped = false;
1910     qemu_cpu_kick(cpu);
1911 }
1912
1913 void resume_all_vcpus(void)
1914 {
1915     CPUState *cpu;
1916
1917     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1918     CPU_FOREACH(cpu) {
1919         cpu_resume(cpu);
1920     }
1921 }
1922
1923 void cpu_remove_sync(CPUState *cpu)
1924 {
1925     cpu->stop = true;
1926     cpu->unplug = true;
1927     qemu_cpu_kick(cpu);
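         /*
          * Drop the BQL while joining: the exiting vCPU thread still needs
          * it to run its own teardown path before it can be joined.
          */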
1928     qemu_mutex_unlock_iothread();
1929     qemu_thread_join(cpu->thread);
1930     qemu_mutex_lock_iothread();
1931 }
1932
1933 /* Size of the temporary buffers used to form a vCPU thread name */
1934 #define VCPU_THREAD_NAME_SIZE 16
1935
1936 static void qemu_tcg_init_vcpu(CPUState *cpu)
1937 {
1938     char thread_name[VCPU_THREAD_NAME_SIZE];
1939     static QemuCond *single_tcg_halt_cond;
1940     static QemuThread *single_tcg_cpu_thread;
1941     static int tcg_region_inited;
1942
1943     assert(tcg_enabled());
1944     /*
1945      * Initialize TCG regions (only once). Now is a good time, because:
1946      * (1) TCG's init context, prologue and target globals have been set up.
1947      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1948      *     -accel flag is processed, so the check doesn't work then).
1949      */
1950     if (!tcg_region_inited) {
1951         tcg_region_inited = 1;
1952         tcg_region_init();
1953     }
1954
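         /*
          * MTTCG: every vCPU gets its own thread and halt condition.
          * Single-threaded (round-robin) TCG: only the first vCPU creates a
          * thread; later vCPUs attach to it through the statics above.
          */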
1955     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1956         cpu->thread = g_malloc0(sizeof(QemuThread));
1957         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1958         qemu_cond_init(cpu->halt_cond);
1959
1960         if (qemu_tcg_mttcg_enabled()) {
1961             /* create a thread per vCPU with TCG (MTTCG) */
1962             parallel_cpus = true;
1963             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1964                      cpu->cpu_index);
1965
1966             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1967                                cpu, QEMU_THREAD_JOINABLE);
1968
1969         } else {
1970             /* share a single thread for all cpus with TCG */
1971             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1972             qemu_thread_create(cpu->thread, thread_name,
1973                                qemu_tcg_rr_cpu_thread_fn,
1974                                cpu, QEMU_THREAD_JOINABLE);
1975
1976             single_tcg_halt_cond = cpu->halt_cond;
1977             single_tcg_cpu_thread = cpu->thread;
1978         }
1979 #ifdef _WIN32
1980         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1981 #endif
1982     } else {
1983         /* For non-MTTCG cases we share the thread */
1984         cpu->thread = single_tcg_cpu_thread;
1985         cpu->halt_cond = single_tcg_halt_cond;
1986         cpu->thread_id = first_cpu->thread_id;
1987         cpu->can_do_io = 1;
1988         cpu->created = true;
1989     }
1990 }
1991
1992 static void qemu_hax_start_vcpu(CPUState *cpu)
1993 {
1994     char thread_name[VCPU_THREAD_NAME_SIZE];
1995
1996     cpu->thread = g_malloc0(sizeof(QemuThread));
1997     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1998     qemu_cond_init(cpu->halt_cond);
1999
2000     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2001              cpu->cpu_index);
2002     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2003                        cpu, QEMU_THREAD_JOINABLE);
2004 #ifdef _WIN32
2005     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2006 #endif
2007 }
2008
2009 static void qemu_kvm_start_vcpu(CPUState *cpu)
2010 {
2011     char thread_name[VCPU_THREAD_NAME_SIZE];
2012
2013     cpu->thread = g_malloc0(sizeof(QemuThread));
2014     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2015     qemu_cond_init(cpu->halt_cond);
2016     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2017              cpu->cpu_index);
2018     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2019                        cpu, QEMU_THREAD_JOINABLE);
2020 }
2021
2022 static void qemu_hvf_start_vcpu(CPUState *cpu)
2023 {
2024     char thread_name[VCPU_THREAD_NAME_SIZE];
2025
2026     /* HVF currently does not support TCG, and only runs in
2027      * unrestricted-guest mode. */
2028     assert(hvf_enabled());
2029
2030     cpu->thread = g_malloc0(sizeof(QemuThread));
2031     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2032     qemu_cond_init(cpu->halt_cond);
2033
2034     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2035              cpu->cpu_index);
2036     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2037                        cpu, QEMU_THREAD_JOINABLE);
2038 }
2039
2040 static void qemu_whpx_start_vcpu(CPUState *cpu)
2041 {
2042     char thread_name[VCPU_THREAD_NAME_SIZE];
2043
2044     cpu->thread = g_malloc0(sizeof(QemuThread));
2045     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2046     qemu_cond_init(cpu->halt_cond);
2047     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2048              cpu->cpu_index);
2049     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2050                        cpu, QEMU_THREAD_JOINABLE);
2051 #ifdef _WIN32
2052     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2053 #endif
2054 }
2055
2056 static void qemu_dummy_start_vcpu(CPUState *cpu)
2057 {
2058     char thread_name[VCPU_THREAD_NAME_SIZE];
2059
2060     cpu->thread = g_malloc0(sizeof(QemuThread));
2061     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2062     qemu_cond_init(cpu->halt_cond);
2063     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2064              cpu->cpu_index);
2065     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2066                        QEMU_THREAD_JOINABLE);
2067 }
2068
2069 void qemu_init_vcpu(CPUState *cpu)
2070 {
2071     cpu->nr_cores = smp_cores;
2072     cpu->nr_threads = smp_threads;
2073     cpu->stopped = true;
2074
2075     if (!cpu->as) {
2076         /* If the target cpu hasn't set up any address spaces itself,
2077          * give it the default one.
2078          */
2079         cpu->num_ases = 1;
2080         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2081     }
2082
2083     if (kvm_enabled()) {
2084         qemu_kvm_start_vcpu(cpu);
2085     } else if (hax_enabled()) {
2086         qemu_hax_start_vcpu(cpu);
2087     } else if (hvf_enabled()) {
2088         qemu_hvf_start_vcpu(cpu);
2089     } else if (tcg_enabled()) {
2090         qemu_tcg_init_vcpu(cpu);
2091     } else if (whpx_enabled()) {
2092         qemu_whpx_start_vcpu(cpu);
2093     } else {
2094         qemu_dummy_start_vcpu(cpu);
2095     }
2096
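         /*
          * The *_start_vcpu helpers return as soon as the thread is created;
          * wait here until that thread has flagged cpu->created so callers
          * see a fully initialised vCPU.
          */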
2097     while (!cpu->created) {
2098         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2099     }
2100 }
2101
2102 void cpu_stop_current(void)
2103 {
2104     if (current_cpu) {
2105         current_cpu->stop = true;
2106         cpu_exit(current_cpu);
2107     }
2108 }
2109
2110 int vm_stop(RunState state)
2111 {
2112     if (qemu_in_vcpu_thread()) {
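             /*
              * A vCPU thread cannot synchronously stop the whole machine;
              * hand the request over to the main loop and just park this
              * vCPU for now.
              */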
2113         qemu_system_vmstop_request_prepare();
2114         qemu_system_vmstop_request(state);
2115         /*
2116          * FIXME: should not return to device code in case
2117          * vm_stop() has been requested.
2118          */
2119         cpu_stop_current();
2120         return 0;
2121     }
2122
2123     return do_vm_stop(state, true);
2124 }
2125
2126 /**
2127  * Prepare for (re)starting the VM.
2128  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2129  * running or in case of an error condition), 0 otherwise.
2130  */
2131 int vm_prepare_start(void)
2132 {
2133     RunState requested;
2134
2135     qemu_vmstop_requested(&requested);
2136     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2137         return -1;
2138     }
2139
2140     /* Ensure that a STOP/RESUME pair of events is emitted if a
2141      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2142      * example, is documented as always being followed by the STOP
2143      * event.
2144      */
2145     if (runstate_is_running()) {
2146         qapi_event_send_stop();
2147         qapi_event_send_resume();
2148         return -1;
2149     }
2150
2151     /* We send RESUME now, but the CPUs are only resumed shortly afterwards */
2152     qapi_event_send_resume();
2153
2154     replay_enable_events();
2155     cpu_enable_ticks();
2156     runstate_set(RUN_STATE_RUNNING);
2157     vm_state_notify(1, RUN_STATE_RUNNING);
2158     return 0;
2159 }
2160
2161 void vm_start(void)
2162 {
2163     if (!vm_prepare_start()) {
2164         resume_all_vcpus();
2165     }
2166 }
2167
2168 /* Does a state transition even if the VM is already stopped; the
2169    current state is forgotten forever. */
2170 int vm_stop_force_state(RunState state)
2171 {
2172     if (runstate_is_running()) {
2173         return vm_stop(state);
2174     } else {
2175         runstate_set(state);
2176
2177         bdrv_drain_all();
2178         /* Make sure to return an error if the flush in a previous vm_stop()
2179          * failed. */
2180         return bdrv_flush_all();
2181     }
2182 }
2183
2184 void list_cpus(const char *optarg)
2185 {
2186     /* XXX: implement xxx_cpu_list for targets that still miss it */
2187 #if defined(cpu_list)
2188     cpu_list();
2189 #endif
2190 }
2191
2192 CpuInfoList *qmp_query_cpus(Error **errp)
2193 {
2194     MachineState *ms = MACHINE(qdev_get_machine());
2195     MachineClass *mc = MACHINE_GET_CLASS(ms);
2196     CpuInfoList *head = NULL, *cur_item = NULL;
2197     CPUState *cpu;
2198
2199     CPU_FOREACH(cpu) {
2200         CpuInfoList *info;
2201 #if defined(TARGET_I386)
2202         X86CPU *x86_cpu = X86_CPU(cpu);
2203         CPUX86State *env = &x86_cpu->env;
2204 #elif defined(TARGET_PPC)
2205         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2206         CPUPPCState *env = &ppc_cpu->env;
2207 #elif defined(TARGET_SPARC)
2208         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2209         CPUSPARCState *env = &sparc_cpu->env;
2210 #elif defined(TARGET_RISCV)
2211         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2212         CPURISCVState *env = &riscv_cpu->env;
2213 #elif defined(TARGET_MIPS)
2214         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2215         CPUMIPSState *env = &mips_cpu->env;
2216 #elif defined(TARGET_TRICORE)
2217         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2218         CPUTriCoreState *env = &tricore_cpu->env;
2219 #elif defined(TARGET_S390X)
2220         S390CPU *s390_cpu = S390_CPU(cpu);
2221         CPUS390XState *env = &s390_cpu->env;
2222 #endif
2223
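             /*
              * Pull the current register state out of the accelerator
              * (e.g. KVM) so the PC values reported below are up to date.
              */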
2224         cpu_synchronize_state(cpu);
2225
2226         info = g_malloc0(sizeof(*info));
2227         info->value = g_malloc0(sizeof(*info->value));
2228         info->value->CPU = cpu->cpu_index;
2229         info->value->current = (cpu == first_cpu);
2230         info->value->halted = cpu->halted;
2231         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2232         info->value->thread_id = cpu->thread_id;
2233 #if defined(TARGET_I386)
2234         info->value->arch = CPU_INFO_ARCH_X86;
2235         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2236 #elif defined(TARGET_PPC)
2237         info->value->arch = CPU_INFO_ARCH_PPC;
2238         info->value->u.ppc.nip = env->nip;
2239 #elif defined(TARGET_SPARC)
2240         info->value->arch = CPU_INFO_ARCH_SPARC;
2241         info->value->u.q_sparc.pc = env->pc;
2242         info->value->u.q_sparc.npc = env->npc;
2243 #elif defined(TARGET_MIPS)
2244         info->value->arch = CPU_INFO_ARCH_MIPS;
2245         info->value->u.q_mips.PC = env->active_tc.PC;
2246 #elif defined(TARGET_TRICORE)
2247         info->value->arch = CPU_INFO_ARCH_TRICORE;
2248         info->value->u.tricore.PC = env->PC;
2249 #elif defined(TARGET_S390X)
2250         info->value->arch = CPU_INFO_ARCH_S390;
2251         info->value->u.s390.cpu_state = env->cpu_state;
2252 #elif defined(TARGET_RISCV)
2253         info->value->arch = CPU_INFO_ARCH_RISCV;
2254         info->value->u.riscv.pc = env->pc;
2255 #else
2256         info->value->arch = CPU_INFO_ARCH_OTHER;
2257 #endif
2258         info->value->has_props = !!mc->cpu_index_to_instance_props;
2259         if (info->value->has_props) {
2260             CpuInstanceProperties *props;
2261             props = g_malloc0(sizeof(*props));
2262             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2263             info->value->props = props;
2264         }
2265
2266         /* XXX: waiting for the qapi to support GSList */
2267         if (!cur_item) {
2268             head = cur_item = info;
2269         } else {
2270             cur_item->next = info;
2271             cur_item = info;
2272         }
2273     }
2274
2275     return head;
2276 }
2277
2278 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2279 {
2280     /*
2281      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2282      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2283      */
2284     switch (target) {
2285     case SYS_EMU_TARGET_I386:
2286     case SYS_EMU_TARGET_X86_64:
2287         return CPU_INFO_ARCH_X86;
2288
2289     case SYS_EMU_TARGET_PPC:
2290     case SYS_EMU_TARGET_PPC64:
2291         return CPU_INFO_ARCH_PPC;
2292
2293     case SYS_EMU_TARGET_SPARC:
2294     case SYS_EMU_TARGET_SPARC64:
2295         return CPU_INFO_ARCH_SPARC;
2296
2297     case SYS_EMU_TARGET_MIPS:
2298     case SYS_EMU_TARGET_MIPSEL:
2299     case SYS_EMU_TARGET_MIPS64:
2300     case SYS_EMU_TARGET_MIPS64EL:
2301         return CPU_INFO_ARCH_MIPS;
2302
2303     case SYS_EMU_TARGET_TRICORE:
2304         return CPU_INFO_ARCH_TRICORE;
2305
2306     case SYS_EMU_TARGET_S390X:
2307         return CPU_INFO_ARCH_S390;
2308
2309     case SYS_EMU_TARGET_RISCV32:
2310     case SYS_EMU_TARGET_RISCV64:
2311         return CPU_INFO_ARCH_RISCV;
2312
2313     default:
2314         return CPU_INFO_ARCH_OTHER;
2315     }
2316 }
2317
2318 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2319 {
2320 #ifdef TARGET_S390X
2321     S390CPU *s390_cpu = S390_CPU(cpu);
2322     CPUS390XState *env = &s390_cpu->env;
2323
2324     info->cpu_state = env->cpu_state;
2325 #else
2326     abort();
2327 #endif
2328 }
2329
2330 /*
2331  * fast means: we NEVER interrupt vCPU threads to retrieve
2332  * information from KVM.
2333  */
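     /*
      * In particular, unlike qmp_query_cpus() above, this never calls
      * cpu_synchronize_state(), so no vCPU thread has to be interrupted.
      */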
2334 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2335 {
2336     MachineState *ms = MACHINE(qdev_get_machine());
2337     MachineClass *mc = MACHINE_GET_CLASS(ms);
2338     CpuInfoFastList *head = NULL, *cur_item = NULL;
2339     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2340                                           -1, &error_abort);
2341     CPUState *cpu;
2342
2343     CPU_FOREACH(cpu) {
2344         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2345         info->value = g_malloc0(sizeof(*info->value));
2346
2347         info->value->cpu_index = cpu->cpu_index;
2348         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2349         info->value->thread_id = cpu->thread_id;
2350
2351         info->value->has_props = !!mc->cpu_index_to_instance_props;
2352         if (info->value->has_props) {
2353             CpuInstanceProperties *props;
2354             props = g_malloc0(sizeof(*props));
2355             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2356             info->value->props = props;
2357         }
2358
2359         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2360         info->value->target = target;
2361         if (target == SYS_EMU_TARGET_S390X) {
2362             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2363         }
2364
2365         if (!cur_item) {
2366             head = cur_item = info;
2367         } else {
2368             cur_item->next = info;
2369             cur_item = info;
2370         }
2371     }
2372
2373     return head;
2374 }
2375
2376 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2377                  bool has_cpu, int64_t cpu_index, Error **errp)
2378 {
2379     FILE *f;
2380     uint32_t l;
2381     CPUState *cpu;
2382     uint8_t buf[1024];
2383     int64_t orig_addr = addr, orig_size = size;
2384
2385     if (!has_cpu) {
2386         cpu_index = 0;
2387     }
2388
2389     cpu = qemu_get_cpu(cpu_index);
2390     if (cpu == NULL) {
2391         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2392                    "a CPU number");
2393         return;
2394     }
2395
2396     f = fopen(filename, "wb");
2397     if (!f) {
2398         error_setg_file_open(errp, errno, filename);
2399         return;
2400     }
2401
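         /*
          * Copy in buf-sized (1 KiB) chunks through the debug accessor,
          * which translates guest-virtual addresses for the selected vCPU.
          */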
2402     while (size != 0) {
2403         l = sizeof(buf);
2404         if (l > size) {
2405             l = size;
             }
2406         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2407             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2408                              " specified", orig_addr, orig_size);
2409             goto exit;
2410         }
2411         if (fwrite(buf, 1, l, f) != l) {
2412             error_setg(errp, QERR_IO_ERROR);
2413             goto exit;
2414         }
2415         addr += l;
2416         size -= l;
2417     }
2418
2419 exit:
2420     fclose(f);
2421 }
2422
2423 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2424                   Error **errp)
2425 {
2426     FILE *f;
2427     uint32_t l;
2428     uint8_t buf[1024];
2429
2430     f = fopen(filename, "wb");
2431     if (!f) {
2432         error_setg_file_open(errp, errno, filename);
2433         return;
2434     }
2435
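         /*
          * Same chunked loop as qmp_memsave(), but the addresses are
          * guest-physical, so no per-vCPU translation is involved.
          */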
2436     while (size != 0) {
2437         l = sizeof(buf);
2438         if (l > size) {
2439             l = size;
             }
2440         cpu_physical_memory_read(addr, buf, l);
2441         if (fwrite(buf, 1, l, f) != l) {
2442             error_setg(errp, QERR_IO_ERROR);
2443             goto exit;
2444         }
2445         addr += l;
2446         size -= l;
2447     }
2448
2449 exit:
2450     fclose(f);
2451 }
2452
2453 void qmp_inject_nmi(Error **errp)
2454 {
2455     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2456 }
2457
2458 void dump_drift_info(void)
2459 {
2460     if (!use_icount) {
2461         return;
2462     }
2463
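         /* A positive value means the guest (icount) clock lags the host. */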
2464     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2465                 (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
2466     if (icount_align_option) {
2467         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2468                     -max_delay / SCALE_MS);
2469         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2470                     max_advance / SCALE_MS);
2471     } else {
2472         qemu_printf("Max guest delay     NA\n");
2473         qemu_printf("Max guest advance   NA\n");
2474     }
2475 }