[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
55 #include "tcg.h"
56 #include "hw/nmi.h"
57 #include "sysemu/replay.h"
58 #include "sysemu/runstate.h"
59 #include "hw/boards.h"
60 #include "hw/hw.h"
61
62 #ifdef CONFIG_LINUX
63
64 #include <sys/prctl.h>
65
66 #ifndef PR_MCE_KILL
67 #define PR_MCE_KILL 33
68 #endif
69
70 #ifndef PR_MCE_KILL_SET
71 #define PR_MCE_KILL_SET 1
72 #endif
73
74 #ifndef PR_MCE_KILL_EARLY
75 #define PR_MCE_KILL_EARLY 1
76 #endif
77
78 #endif /* CONFIG_LINUX */
79
80 int64_t max_delay;
81 int64_t max_advance;
82
83 /* vcpu throttling controls */
84 static QEMUTimer *throttle_timer;
85 static unsigned int throttle_percentage;
86
87 #define CPU_THROTTLE_PCT_MIN 1
88 #define CPU_THROTTLE_PCT_MAX 99
89 #define CPU_THROTTLE_TIMESLICE_NS 10000000
90
91 bool cpu_is_stopped(CPUState *cpu)
92 {
93     return cpu->stopped || !runstate_is_running();
94 }
95
96 static bool cpu_thread_is_idle(CPUState *cpu)
97 {
98     if (cpu->stop || cpu->queued_work_first) {
99         return false;
100     }
101     if (cpu_is_stopped(cpu)) {
102         return true;
103     }
104     if (!cpu->halted || cpu_has_work(cpu) ||
105         kvm_halt_in_kernel()) {
106         return false;
107     }
108     return true;
109 }
110
111 static bool all_cpu_threads_idle(void)
112 {
113     CPUState *cpu;
114
115     CPU_FOREACH(cpu) {
116         if (!cpu_thread_is_idle(cpu)) {
117             return false;
118         }
119     }
120     return true;
121 }
122
123 /***********************************************************/
124 /* guest cycle counter */
125
126 /* Protected by TimersState seqlock */
127
128 static bool icount_sleep = true;
129 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
130 #define MAX_ICOUNT_SHIFT 10
131
132 typedef struct TimersState {
133     /* Protected by BQL.  */
134     int64_t cpu_ticks_prev;
135     int64_t cpu_ticks_offset;
136
137     /* Protect fields that can be read outside the BQL and are
138      * written from multiple threads.
139      */
140     QemuSeqLock vm_clock_seqlock;
141     QemuSpin vm_clock_lock;
142
143     int16_t cpu_ticks_enabled;
144
145     /* Conversion factor from emulated instructions to virtual clock ticks.  */
146     int16_t icount_time_shift;
147
148     /* Compensate for varying guest execution speed.  */
149     int64_t qemu_icount_bias;
150
151     int64_t vm_clock_warp_start;
152     int64_t cpu_clock_offset;
153
154     /* Only written by TCG thread */
155     int64_t qemu_icount;
156
157     /* for adjusting icount */
158     QEMUTimer *icount_rt_timer;
159     QEMUTimer *icount_vm_timer;
160     QEMUTimer *icount_warp_timer;
161 } TimersState;
162
163 static TimersState timers_state;
164 bool mttcg_enabled;
165
166 /*
167  * We default to false if we know other options have been enabled
168  * which are currently incompatible with MTTCG. Otherwise, once a
169  * guest (target) has been updated to support:
170  *   - atomic instructions
171  *   - memory ordering primitives (barriers)
172  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak.
173  *
174  * Once a guest architecture has been converted to the new primitives
175  * there are two remaining limitations to check.
176  *
177  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
178  * - The host must have a memory model at least as strong as the guest's
179  *
180  * It may be possible in future to support strong guests on weak hosts
181  * but that will require tagging all load/stores in a guest with their
182  * implicit memory order requirements which would likely slow things
183  * down a lot.
184  */
185
186 static bool check_tcg_memory_orders_compatible(void)
187 {
188 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
189     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
190 #else
191     return false;
192 #endif
193 }
194
195 static bool default_mttcg_enabled(void)
196 {
197     if (use_icount || TCG_OVERSIZED_GUEST) {
198         return false;
199     } else {
200 #ifdef TARGET_SUPPORTS_MTTCG
201         return check_tcg_memory_orders_compatible();
202 #else
203         return false;
204 #endif
205     }
206 }
207
208 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
209 {
210     const char *t = qemu_opt_get(opts, "thread");
211     if (t) {
212         if (strcmp(t, "multi") == 0) {
213             if (TCG_OVERSIZED_GUEST) {
214                 error_setg(errp, "No MTTCG when guest word size > host's");
215             } else if (use_icount) {
216                 error_setg(errp, "No MTTCG when icount is enabled");
217             } else {
218 #ifndef TARGET_SUPPORTS_MTTCG
219                 warn_report("Guest not yet converted to MTTCG - "
220                             "you may get unexpected results");
221 #endif
222                 if (!check_tcg_memory_orders_compatible()) {
223                     warn_report("Guest expects a stronger memory ordering "
224                                 "than the host provides");
225                     error_printf("This may cause strange/hard to debug errors\n");
226                 }
227                 mttcg_enabled = true;
228             }
229         } else if (strcmp(t, "single") == 0) {
230             mttcg_enabled = false;
231         } else {
232             error_setg(errp, "Invalid 'thread' setting %s", t);
233         }
234     } else {
235         mttcg_enabled = default_mttcg_enabled();
236     }
237 }
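/*
 * Usage sketch: an illustrative example only, assuming the standard -accel
 * command-line syntax that feeds the "thread" suboption parsed above:
 *
 *   qemu-system-x86_64 -accel tcg,thread=multi    (request MTTCG)
 *   qemu-system-x86_64 -accel tcg,thread=single   (force round-robin TCG)
 *
 * Any other value is rejected via the "Invalid 'thread' setting" error above.
 */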
238
239 /* The current number of executed instructions is based on what we
240  * originally budgeted minus the current state of the decrementing
241  * icount counters in extra/u16.low.
242  */
243 static int64_t cpu_get_icount_executed(CPUState *cpu)
244 {
245     return (cpu->icount_budget -
246             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
247 }
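/*
 * Worked example (illustrative numbers): with an icount_budget of 10000
 * instructions, icount_extra == 0 and icount_decr.u16.low counted down to
 * 1500, the vCPU has executed 10000 - (1500 + 0) = 8500 instructions so far.
 */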
248
249 /*
250  * Update the global shared timer_state.qemu_icount to take into
251  * account executed instructions. This is done by the TCG vCPU
252  * thread so the main-loop can see time has moved forward.
253  */
254 static void cpu_update_icount_locked(CPUState *cpu)
255 {
256     int64_t executed = cpu_get_icount_executed(cpu);
257     cpu->icount_budget -= executed;
258
259     atomic_set_i64(&timers_state.qemu_icount,
260                    timers_state.qemu_icount + executed);
261 }
262
263 /*
264  * Update the global shared timer_state.qemu_icount to take into
265  * account executed instructions. This is done by the TCG vCPU
266  * thread so the main-loop can see time has moved forward.
267  */
268 void cpu_update_icount(CPUState *cpu)
269 {
270     seqlock_write_lock(&timers_state.vm_clock_seqlock,
271                        &timers_state.vm_clock_lock);
272     cpu_update_icount_locked(cpu);
273     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
274                          &timers_state.vm_clock_lock);
275 }
276
277 static int64_t cpu_get_icount_raw_locked(void)
278 {
279     CPUState *cpu = current_cpu;
280
281     if (cpu && cpu->running) {
282         if (!cpu->can_do_io) {
283             error_report("Bad icount read");
284             exit(1);
285         }
286         /* Take into account what has run */
287         cpu_update_icount_locked(cpu);
288     }
289     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
290     return atomic_read_i64(&timers_state.qemu_icount);
291 }
292
293 static int64_t cpu_get_icount_locked(void)
294 {
295     int64_t icount = cpu_get_icount_raw_locked();
296     return atomic_read_i64(&timers_state.qemu_icount_bias) +
297         cpu_icount_to_ns(icount);
298 }
299
300 int64_t cpu_get_icount_raw(void)
301 {
302     int64_t icount;
303     unsigned start;
304
305     do {
306         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
307         icount = cpu_get_icount_raw_locked();
308     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
309
310     return icount;
311 }
312
313 /* Return the virtual CPU time, based on the instruction counter.  */
314 int64_t cpu_get_icount(void)
315 {
316     int64_t icount;
317     unsigned start;
318
319     do {
320         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
321         icount = cpu_get_icount_locked();
322     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
323
324     return icount;
325 }
326
327 int64_t cpu_icount_to_ns(int64_t icount)
328 {
329     return icount << atomic_read(&timers_state.icount_time_shift);
330 }
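/*
 * Worked example: with the default icount_time_shift of 3 (see
 * configure_icount() below), each instruction accounts for 1 << 3 = 8 ns
 * of virtual time, i.e. the guest nominally runs at 125 MIPS.
 */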
331
332 static int64_t cpu_get_ticks_locked(void)
333 {
334     int64_t ticks = timers_state.cpu_ticks_offset;
335     if (timers_state.cpu_ticks_enabled) {
336         ticks += cpu_get_host_ticks();
337     }
338
339     if (timers_state.cpu_ticks_prev > ticks) {
340         /* Non-increasing ticks may happen if the host uses software suspend.  */
341         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
342         ticks = timers_state.cpu_ticks_prev;
343     }
344
345     timers_state.cpu_ticks_prev = ticks;
346     return ticks;
347 }
348
349 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
350  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
351  * counter.
352  */
353 int64_t cpu_get_ticks(void)
354 {
355     int64_t ticks;
356
357     if (use_icount) {
358         return cpu_get_icount();
359     }
360
361     qemu_spin_lock(&timers_state.vm_clock_lock);
362     ticks = cpu_get_ticks_locked();
363     qemu_spin_unlock(&timers_state.vm_clock_lock);
364     return ticks;
365 }
366
367 static int64_t cpu_get_clock_locked(void)
368 {
369     int64_t time;
370
371     time = timers_state.cpu_clock_offset;
372     if (timers_state.cpu_ticks_enabled) {
373         time += get_clock();
374     }
375
376     return time;
377 }
378
379 /* Return the monotonic time elapsed in VM, i.e.,
380  * the time between vm_start and vm_stop
381  */
382 int64_t cpu_get_clock(void)
383 {
384     int64_t ti;
385     unsigned start;
386
387     do {
388         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
389         ti = cpu_get_clock_locked();
390     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
391
392     return ti;
393 }
394
395 /* enable cpu_get_ticks()
396  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
397  */
398 void cpu_enable_ticks(void)
399 {
400     seqlock_write_lock(&timers_state.vm_clock_seqlock,
401                        &timers_state.vm_clock_lock);
402     if (!timers_state.cpu_ticks_enabled) {
403         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
404         timers_state.cpu_clock_offset -= get_clock();
405         timers_state.cpu_ticks_enabled = 1;
406     }
407     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
408                        &timers_state.vm_clock_lock);
409 }
410
411 /* disable cpu_get_ticks() : the clock is stopped. You must not call
412  * cpu_get_ticks() after that.
413  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
414  */
415 void cpu_disable_ticks(void)
416 {
417     seqlock_write_lock(&timers_state.vm_clock_seqlock,
418                        &timers_state.vm_clock_lock);
419     if (timers_state.cpu_ticks_enabled) {
420         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
421         timers_state.cpu_clock_offset = cpu_get_clock_locked();
422         timers_state.cpu_ticks_enabled = 0;
423     }
424     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
425                          &timers_state.vm_clock_lock);
426 }
427
428 /* Correlation between real and virtual time is always going to be
429    fairly approximate, so ignore small variation.
430    When the guest is idle real and virtual time will be aligned in
431    the IO wait loop.  */
432 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
433
434 static void icount_adjust(void)
435 {
436     int64_t cur_time;
437     int64_t cur_icount;
438     int64_t delta;
439
440     /* Protected by TimersState mutex.  */
441     static int64_t last_delta;
442
443     /* If the VM is not running, then do nothing.  */
444     if (!runstate_is_running()) {
445         return;
446     }
447
448     seqlock_write_lock(&timers_state.vm_clock_seqlock,
449                        &timers_state.vm_clock_lock);
450     cur_time = cpu_get_clock_locked();
451     cur_icount = cpu_get_icount_locked();
452
453     delta = cur_icount - cur_time;
454     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
455     if (delta > 0
456         && last_delta + ICOUNT_WOBBLE < delta * 2
457         && timers_state.icount_time_shift > 0) {
458         /* The guest is getting too far ahead.  Slow time down.  */
459         atomic_set(&timers_state.icount_time_shift,
460                    timers_state.icount_time_shift - 1);
461     }
462     if (delta < 0
463         && last_delta - ICOUNT_WOBBLE > delta * 2
464         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
465         /* The guest is getting too far behind.  Speed time up.  */
466         atomic_set(&timers_state.icount_time_shift,
467                    timers_state.icount_time_shift + 1);
468     }
469     last_delta = delta;
470     atomic_set_i64(&timers_state.qemu_icount_bias,
471                    cur_icount - (timers_state.qemu_icount
472                                  << timers_state.icount_time_shift));
473     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
474                          &timers_state.vm_clock_lock);
475 }
476
477 static void icount_adjust_rt(void *opaque)
478 {
479     timer_mod(timers_state.icount_rt_timer,
480               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
481     icount_adjust();
482 }
483
484 static void icount_adjust_vm(void *opaque)
485 {
486     timer_mod(timers_state.icount_vm_timer,
487                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
488                    NANOSECONDS_PER_SECOND / 10);
489     icount_adjust();
490 }
491
492 static int64_t qemu_icount_round(int64_t count)
493 {
494     int shift = atomic_read(&timers_state.icount_time_shift);
495     return (count + (1 << shift) - 1) >> shift;
496 }
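/*
 * Worked example: this rounds a nanosecond deadline up to a whole number
 * of instructions.  With shift == 3, count == 21 gives (21 + 7) >> 3 == 3,
 * i.e. three 8 ns instruction slots cover the 21 ns deadline.
 */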
497
498 static void icount_warp_rt(void)
499 {
500     unsigned seq;
501     int64_t warp_start;
502
503     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
504      * changes from -1 to another value, so the race here is okay.
505      */
506     do {
507         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
508         warp_start = timers_state.vm_clock_warp_start;
509     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
510
511     if (warp_start == -1) {
512         return;
513     }
514
515     seqlock_write_lock(&timers_state.vm_clock_seqlock,
516                        &timers_state.vm_clock_lock);
517     if (runstate_is_running()) {
518         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
519                                             cpu_get_clock_locked());
520         int64_t warp_delta;
521
522         warp_delta = clock - timers_state.vm_clock_warp_start;
523         if (use_icount == 2) {
524             /*
525              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
526              * far ahead of real time.
527              */
528             int64_t cur_icount = cpu_get_icount_locked();
529             int64_t delta = clock - cur_icount;
530             warp_delta = MIN(warp_delta, delta);
531         }
532         atomic_set_i64(&timers_state.qemu_icount_bias,
533                        timers_state.qemu_icount_bias + warp_delta);
534     }
535     timers_state.vm_clock_warp_start = -1;
536     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
537                        &timers_state.vm_clock_lock);
538
539     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
540         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
541     }
542 }
543
544 static void icount_timer_cb(void *opaque)
545 {
546     /* No need for a checkpoint because the timer already synchronizes
547      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
548      */
549     icount_warp_rt();
550 }
551
552 void qtest_clock_warp(int64_t dest)
553 {
554     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
555     AioContext *aio_context;
556     assert(qtest_enabled());
557     aio_context = qemu_get_aio_context();
558     while (clock < dest) {
559         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
560         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
561
562         seqlock_write_lock(&timers_state.vm_clock_seqlock,
563                            &timers_state.vm_clock_lock);
564         atomic_set_i64(&timers_state.qemu_icount_bias,
565                        timers_state.qemu_icount_bias + warp);
566         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
567                              &timers_state.vm_clock_lock);
568
569         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
570         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
571         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
572     }
573     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
574 }
575
576 void qemu_start_warp_timer(void)
577 {
578     int64_t clock;
579     int64_t deadline;
580
581     if (!use_icount) {
582         return;
583     }
584
585     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
586      * do not fire, so computing the deadline does not make sense.
587      */
588     if (!runstate_is_running()) {
589         return;
590     }
591
592     if (replay_mode != REPLAY_MODE_PLAY) {
593         if (!all_cpu_threads_idle()) {
594             return;
595         }
596
597         if (qtest_enabled()) {
598             /* When testing, qtest commands advance icount.  */
599             return;
600         }
601
602         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
603     } else {
604         /* warp clock deterministically in record/replay mode */
605         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
606             /* The vCPU is sleeping and the warp can't be started.
607                This is probably a race condition: the notification sent
608                to the vCPU was processed in advance and the vCPU went to sleep.
609                Therefore we have to wake it up so it can do something. */
610             if (replay_has_checkpoint()) {
611                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
612             }
613             return;
614         }
615     }
616
617     /* We want to use the earliest deadline from ALL vm_clocks */
618     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
619     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
620     if (deadline < 0) {
621         static bool notified;
622         if (!icount_sleep && !notified) {
623             warn_report("icount sleep disabled and no active timers");
624             notified = true;
625         }
626         return;
627     }
628
629     if (deadline > 0) {
630         /*
631          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
632          * sleep.  Otherwise, the CPU might be waiting for a future timer
633          * interrupt to wake it up, but the interrupt never comes because
634          * the vCPU isn't running any insns and thus doesn't advance the
635          * QEMU_CLOCK_VIRTUAL.
636          */
637         if (!icount_sleep) {
638             /*
639              * We never let VCPUs sleep in no sleep icount mode.
640              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
641              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
642              * It is useful when we want a deterministic execution time,
643              * isolated from host latencies.
644              */
645             seqlock_write_lock(&timers_state.vm_clock_seqlock,
646                                &timers_state.vm_clock_lock);
647             atomic_set_i64(&timers_state.qemu_icount_bias,
648                            timers_state.qemu_icount_bias + deadline);
649             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
650                                  &timers_state.vm_clock_lock);
651             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
652         } else {
653             /*
654              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
655              * "real" time (related to the time left until the next event) has
656              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
657              * This keeps the warps from being visible externally; for example,
658              * you will not be sending network packets continuously instead of
659              * every 100 ms.
660              */
661             seqlock_write_lock(&timers_state.vm_clock_seqlock,
662                                &timers_state.vm_clock_lock);
663             if (timers_state.vm_clock_warp_start == -1
664                 || timers_state.vm_clock_warp_start > clock) {
665                 timers_state.vm_clock_warp_start = clock;
666             }
667             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
668                                  &timers_state.vm_clock_lock);
669             timer_mod_anticipate(timers_state.icount_warp_timer,
670                                  clock + deadline);
671         }
672     } else if (deadline == 0) {
673         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
674     }
675 }
676
677 static void qemu_account_warp_timer(void)
678 {
679     if (!use_icount || !icount_sleep) {
680         return;
681     }
682
683     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
684      * do not fire, so computing the deadline does not make sense.
685      */
686     if (!runstate_is_running()) {
687         return;
688     }
689
690     /* warp clock deterministically in record/replay mode */
691     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
692         return;
693     }
694
695     timer_del(timers_state.icount_warp_timer);
696     icount_warp_rt();
697 }
698
699 static bool icount_state_needed(void *opaque)
700 {
701     return use_icount;
702 }
703
704 static bool warp_timer_state_needed(void *opaque)
705 {
706     TimersState *s = opaque;
707     return s->icount_warp_timer != NULL;
708 }
709
710 static bool adjust_timers_state_needed(void *opaque)
711 {
712     TimersState *s = opaque;
713     return s->icount_rt_timer != NULL;
714 }
715
716 /*
717  * Subsection for warp timer migration is optional, because the timer may not be created
718  */
719 static const VMStateDescription icount_vmstate_warp_timer = {
720     .name = "timer/icount/warp_timer",
721     .version_id = 1,
722     .minimum_version_id = 1,
723     .needed = warp_timer_state_needed,
724     .fields = (VMStateField[]) {
725         VMSTATE_INT64(vm_clock_warp_start, TimersState),
726         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
727         VMSTATE_END_OF_LIST()
728     }
729 };
730
731 static const VMStateDescription icount_vmstate_adjust_timers = {
732     .name = "timer/icount/timers",
733     .version_id = 1,
734     .minimum_version_id = 1,
735     .needed = adjust_timers_state_needed,
736     .fields = (VMStateField[]) {
737         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
738         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
739         VMSTATE_END_OF_LIST()
740     }
741 };
742
743 /*
744  * This is a subsection for icount migration.
745  */
746 static const VMStateDescription icount_vmstate_timers = {
747     .name = "timer/icount",
748     .version_id = 1,
749     .minimum_version_id = 1,
750     .needed = icount_state_needed,
751     .fields = (VMStateField[]) {
752         VMSTATE_INT64(qemu_icount_bias, TimersState),
753         VMSTATE_INT64(qemu_icount, TimersState),
754         VMSTATE_END_OF_LIST()
755     },
756     .subsections = (const VMStateDescription*[]) {
757         &icount_vmstate_warp_timer,
758         &icount_vmstate_adjust_timers,
759         NULL
760     }
761 };
762
763 static const VMStateDescription vmstate_timers = {
764     .name = "timer",
765     .version_id = 2,
766     .minimum_version_id = 1,
767     .fields = (VMStateField[]) {
768         VMSTATE_INT64(cpu_ticks_offset, TimersState),
769         VMSTATE_UNUSED(8),
770         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
771         VMSTATE_END_OF_LIST()
772     },
773     .subsections = (const VMStateDescription*[]) {
774         &icount_vmstate_timers,
775         NULL
776     }
777 };
778
779 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
780 {
781     double pct;
782     double throttle_ratio;
783     long sleeptime_ns;
784
785     if (!cpu_throttle_get_percentage()) {
786         return;
787     }
788
789     pct = (double)cpu_throttle_get_percentage()/100;
790     throttle_ratio = pct / (1 - pct);
791     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
792
793     qemu_mutex_unlock_iothread();
794     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
795     qemu_mutex_lock_iothread();
796     atomic_set(&cpu->throttle_thread_scheduled, 0);
797 }
798
799 static void cpu_throttle_timer_tick(void *opaque)
800 {
801     CPUState *cpu;
802     double pct;
803
804     /* Stop the timer if needed */
805     if (!cpu_throttle_get_percentage()) {
806         return;
807     }
808     CPU_FOREACH(cpu) {
809         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
810             async_run_on_cpu(cpu, cpu_throttle_thread,
811                              RUN_ON_CPU_NULL);
812         }
813     }
814
815     pct = (double)cpu_throttle_get_percentage()/100;
816     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
817                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
818 }
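/*
 * Worked example of the throttling arithmetic (illustrative percentage):
 * at 50% throttle, pct = 0.5 and throttle_ratio = 1, so cpu_throttle_thread()
 * sleeps for 1 * CPU_THROTTLE_TIMESLICE_NS = 10 ms, while the timer re-arms
 * after CPU_THROTTLE_TIMESLICE_NS / (1 - pct) = 20 ms: the vCPU sleeps for
 * 10 ms out of every 20 ms, i.e. 50% of wall-clock time.
 */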
819
820 void cpu_throttle_set(int new_throttle_pct)
821 {
822     /* Ensure throttle percentage is within valid range */
823     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
824     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
825
826     atomic_set(&throttle_percentage, new_throttle_pct);
827
828     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
829                                        CPU_THROTTLE_TIMESLICE_NS);
830 }
831
832 void cpu_throttle_stop(void)
833 {
834     atomic_set(&throttle_percentage, 0);
835 }
836
837 bool cpu_throttle_active(void)
838 {
839     return (cpu_throttle_get_percentage() != 0);
840 }
841
842 int cpu_throttle_get_percentage(void)
843 {
844     return atomic_read(&throttle_percentage);
845 }
846
847 void cpu_ticks_init(void)
848 {
849     seqlock_init(&timers_state.vm_clock_seqlock);
850     qemu_spin_init(&timers_state.vm_clock_lock);
851     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
852     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
853                                            cpu_throttle_timer_tick, NULL);
854 }
855
856 void configure_icount(QemuOpts *opts, Error **errp)
857 {
858     const char *option;
859     char *rem_str = NULL;
860
861     option = qemu_opt_get(opts, "shift");
862     if (!option) {
863         if (qemu_opt_get(opts, "align") != NULL) {
864             error_setg(errp, "Please specify shift option when using align");
865         }
866         return;
867     }
868
869     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
870     if (icount_sleep) {
871         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
872                                          icount_timer_cb, NULL);
873     }
874
875     icount_align_option = qemu_opt_get_bool(opts, "align", false);
876
877     if (icount_align_option && !icount_sleep) {
878         error_setg(errp, "align=on and sleep=off are incompatible");
879     }
880     if (strcmp(option, "auto") != 0) {
881         errno = 0;
882         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
883         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
884             error_setg(errp, "icount: Invalid shift value");
885         }
886         use_icount = 1;
887         return;
888     } else if (icount_align_option) {
889         error_setg(errp, "shift=auto and align=on are incompatible");
890     } else if (!icount_sleep) {
891         error_setg(errp, "shift=auto and sleep=off are incompatible");
892     }
893
894     use_icount = 2;
895
896     /* 125MIPS seems a reasonable initial guess at the guest speed.
897        It will be corrected fairly quickly anyway.  */
898     timers_state.icount_time_shift = 3;
899
900     /* Have both realtime and virtual time triggers for speed adjustment.
901        The realtime trigger catches emulated time passing too slowly,
902        the virtual time trigger catches emulated time passing too fast.
903        Realtime triggers occur even when idle, so use them less frequently
904        than VM triggers.  */
905     timers_state.vm_clock_warp_start = -1;
906     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
907                                    icount_adjust_rt, NULL);
908     timer_mod(timers_state.icount_rt_timer,
909                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
910     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
911                                         icount_adjust_vm, NULL);
912     timer_mod(timers_state.icount_vm_timer,
913                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
914                    NANOSECONDS_PER_SECOND / 10);
915 }
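/*
 * Usage sketch: illustrative examples only, assuming the standard -icount
 * command-line syntax that feeds the "shift", "align" and "sleep"
 * suboptions parsed above:
 *
 *   -icount shift=7               fixed rate: one instruction per 128 ns
 *   -icount shift=auto,sleep=on   adaptive rate (use_icount == 2) with the
 *                                 warp and adjustment timers set up here
 */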
916
917 /***********************************************************/
918 /* TCG vCPU kick timer
919  *
920  * The kick timer is responsible for moving single threaded vCPU
921  * emulation on to the next vCPU. If more than one vCPU is running, a
922  * timer event will force a cpu->exit so the next vCPU can get
923  * scheduled.
924  *
925  * The timer is removed while all vCPUs are idle and restarted again
926  * once any of them has work to do.
927  */
928
929 static QEMUTimer *tcg_kick_vcpu_timer;
930 static CPUState *tcg_current_rr_cpu;
931
932 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
933
934 static inline int64_t qemu_tcg_next_kick(void)
935 {
936     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
937 }
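/*
 * With TCG_KICK_PERIOD = NANOSECONDS_PER_SECOND / 10, the round-robin loop
 * is kicked at least every 100 ms of virtual time, so no single vCPU can
 * monopolise the TCG thread indefinitely.
 */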
938
939 /* Kick the currently round-robin scheduled vCPU */
940 static void qemu_cpu_kick_rr_cpu(void)
941 {
942     CPUState *cpu;
943     do {
944         cpu = atomic_mb_read(&tcg_current_rr_cpu);
945         if (cpu) {
946             cpu_exit(cpu);
947         }
948     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
949 }
950
951 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
952 {
953 }
954
955 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
956 {
957     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
958         qemu_notify_event();
959         return;
960     }
961
962     if (qemu_in_vcpu_thread()) {
963         /* A CPU is currently running; kick it back out to the
964          * tcg_cpu_exec() loop so it will recalculate its
965          * icount deadline immediately.
966          */
967         qemu_cpu_kick(current_cpu);
968     } else if (first_cpu) {
969         /* qemu_cpu_kick is not enough to kick a halted CPU out of
970          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
971          * causes cpu_thread_is_idle to return false.  This way,
972          * handle_icount_deadline can run.
973          * If we have no CPUs at all for some reason, we don't
974          * need to do anything.
975          */
976         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
977     }
978 }
979
980 static void kick_tcg_thread(void *opaque)
981 {
982     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
983     qemu_cpu_kick_rr_cpu();
984 }
985
986 static void start_tcg_kick_timer(void)
987 {
988     assert(!mttcg_enabled);
989     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
990         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
991                                            kick_tcg_thread, NULL);
992     }
993     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
994         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
995     }
996 }
997
998 static void stop_tcg_kick_timer(void)
999 {
1000     assert(!mttcg_enabled);
1001     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1002         timer_del(tcg_kick_vcpu_timer);
1003     }
1004 }
1005
1006 /***********************************************************/
1007 void hw_error(const char *fmt, ...)
1008 {
1009     va_list ap;
1010     CPUState *cpu;
1011
1012     va_start(ap, fmt);
1013     fprintf(stderr, "qemu: hardware error: ");
1014     vfprintf(stderr, fmt, ap);
1015     fprintf(stderr, "\n");
1016     CPU_FOREACH(cpu) {
1017         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1018         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1019     }
1020     va_end(ap);
1021     abort();
1022 }
1023
1024 void cpu_synchronize_all_states(void)
1025 {
1026     CPUState *cpu;
1027
1028     CPU_FOREACH(cpu) {
1029         cpu_synchronize_state(cpu);
1030         /* TODO: move to cpu_synchronize_state() */
1031         if (hvf_enabled()) {
1032             hvf_cpu_synchronize_state(cpu);
1033         }
1034     }
1035 }
1036
1037 void cpu_synchronize_all_post_reset(void)
1038 {
1039     CPUState *cpu;
1040
1041     CPU_FOREACH(cpu) {
1042         cpu_synchronize_post_reset(cpu);
1043         /* TODO: move to cpu_synchronize_post_reset() */
1044         if (hvf_enabled()) {
1045             hvf_cpu_synchronize_post_reset(cpu);
1046         }
1047     }
1048 }
1049
1050 void cpu_synchronize_all_post_init(void)
1051 {
1052     CPUState *cpu;
1053
1054     CPU_FOREACH(cpu) {
1055         cpu_synchronize_post_init(cpu);
1056         /* TODO: move to cpu_synchronize_post_init() */
1057         if (hvf_enabled()) {
1058             hvf_cpu_synchronize_post_init(cpu);
1059         }
1060     }
1061 }
1062
1063 void cpu_synchronize_all_pre_loadvm(void)
1064 {
1065     CPUState *cpu;
1066
1067     CPU_FOREACH(cpu) {
1068         cpu_synchronize_pre_loadvm(cpu);
1069     }
1070 }
1071
1072 static int do_vm_stop(RunState state, bool send_stop)
1073 {
1074     int ret = 0;
1075
1076     if (runstate_is_running()) {
1077         cpu_disable_ticks();
1078         pause_all_vcpus();
1079         runstate_set(state);
1080         vm_state_notify(0, state);
1081         if (send_stop) {
1082             qapi_event_send_stop();
1083         }
1084     }
1085
1086     bdrv_drain_all();
1087     replay_disable_events();
1088     ret = bdrv_flush_all();
1089
1090     return ret;
1091 }
1092
1093 /* Special vm_stop() variant for terminating the process.  Historically clients
1094  * did not expect a QMP STOP event and so we need to retain compatibility.
1095  */
1096 int vm_shutdown(void)
1097 {
1098     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1099 }
1100
1101 static bool cpu_can_run(CPUState *cpu)
1102 {
1103     if (cpu->stop) {
1104         return false;
1105     }
1106     if (cpu_is_stopped(cpu)) {
1107         return false;
1108     }
1109     return true;
1110 }
1111
1112 static void cpu_handle_guest_debug(CPUState *cpu)
1113 {
1114     gdb_set_stop_cpu(cpu);
1115     qemu_system_debug_request();
1116     cpu->stopped = true;
1117 }
1118
1119 #ifdef CONFIG_LINUX
1120 static void sigbus_reraise(void)
1121 {
1122     sigset_t set;
1123     struct sigaction action;
1124
1125     memset(&action, 0, sizeof(action));
1126     action.sa_handler = SIG_DFL;
1127     if (!sigaction(SIGBUS, &action, NULL)) {
1128         raise(SIGBUS);
1129         sigemptyset(&set);
1130         sigaddset(&set, SIGBUS);
1131         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1132     }
1133     perror("Failed to re-raise SIGBUS!\n");
1134     abort();
1135 }
1136
1137 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1138 {
1139     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1140         sigbus_reraise();
1141     }
1142
1143     if (current_cpu) {
1144         /* Called asynchronously in VCPU thread.  */
1145         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1146             sigbus_reraise();
1147         }
1148     } else {
1149         /* Called synchronously (via signalfd) in main thread.  */
1150         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1151             sigbus_reraise();
1152         }
1153     }
1154 }
1155
1156 static void qemu_init_sigbus(void)
1157 {
1158     struct sigaction action;
1159
1160     memset(&action, 0, sizeof(action));
1161     action.sa_flags = SA_SIGINFO;
1162     action.sa_sigaction = sigbus_handler;
1163     sigaction(SIGBUS, &action, NULL);
1164
1165     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1166 }
1167 #else /* !CONFIG_LINUX */
1168 static void qemu_init_sigbus(void)
1169 {
1170 }
1171 #endif /* !CONFIG_LINUX */
1172
1173 static QemuMutex qemu_global_mutex;
1174
1175 static QemuThread io_thread;
1176
1177 /* cpu creation */
1178 static QemuCond qemu_cpu_cond;
1179 /* system init */
1180 static QemuCond qemu_pause_cond;
1181
1182 void qemu_init_cpu_loop(void)
1183 {
1184     qemu_init_sigbus();
1185     qemu_cond_init(&qemu_cpu_cond);
1186     qemu_cond_init(&qemu_pause_cond);
1187     qemu_mutex_init(&qemu_global_mutex);
1188
1189     qemu_thread_get_self(&io_thread);
1190 }
1191
1192 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1193 {
1194     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1195 }
1196
1197 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1198 {
1199     if (kvm_destroy_vcpu(cpu) < 0) {
1200         error_report("kvm_destroy_vcpu failed");
1201         exit(EXIT_FAILURE);
1202     }
1203 }
1204
1205 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1206 {
1207 }
1208
1209 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1210 {
1211     g_assert(qemu_cpu_is_self(cpu));
1212     cpu->stop = false;
1213     cpu->stopped = true;
1214     if (exit) {
1215         cpu_exit(cpu);
1216     }
1217     qemu_cond_broadcast(&qemu_pause_cond);
1218 }
1219
1220 static void qemu_wait_io_event_common(CPUState *cpu)
1221 {
1222     atomic_mb_set(&cpu->thread_kicked, false);
1223     if (cpu->stop) {
1224         qemu_cpu_stop(cpu, false);
1225     }
1226     process_queued_cpu_work(cpu);
1227 }
1228
1229 static void qemu_tcg_rr_wait_io_event(void)
1230 {
1231     CPUState *cpu;
1232
1233     while (all_cpu_threads_idle()) {
1234         stop_tcg_kick_timer();
1235         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1236     }
1237
1238     start_tcg_kick_timer();
1239
1240     CPU_FOREACH(cpu) {
1241         qemu_wait_io_event_common(cpu);
1242     }
1243 }
1244
1245 static void qemu_wait_io_event(CPUState *cpu)
1246 {
1247     while (cpu_thread_is_idle(cpu)) {
1248         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1249     }
1250
1251 #ifdef _WIN32
1252     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1253     if (!tcg_enabled()) {
1254         SleepEx(0, TRUE);
1255     }
1256 #endif
1257     qemu_wait_io_event_common(cpu);
1258 }
1259
1260 static void *qemu_kvm_cpu_thread_fn(void *arg)
1261 {
1262     CPUState *cpu = arg;
1263     int r;
1264
1265     rcu_register_thread();
1266
1267     qemu_mutex_lock_iothread();
1268     qemu_thread_get_self(cpu->thread);
1269     cpu->thread_id = qemu_get_thread_id();
1270     cpu->can_do_io = 1;
1271     current_cpu = cpu;
1272
1273     r = kvm_init_vcpu(cpu);
1274     if (r < 0) {
1275         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1276         exit(1);
1277     }
1278
1279     kvm_init_cpu_signals(cpu);
1280
1281     /* signal CPU creation */
1282     cpu->created = true;
1283     qemu_cond_signal(&qemu_cpu_cond);
1284     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1285
1286     do {
1287         if (cpu_can_run(cpu)) {
1288             r = kvm_cpu_exec(cpu);
1289             if (r == EXCP_DEBUG) {
1290                 cpu_handle_guest_debug(cpu);
1291             }
1292         }
1293         qemu_wait_io_event(cpu);
1294     } while (!cpu->unplug || cpu_can_run(cpu));
1295
1296     qemu_kvm_destroy_vcpu(cpu);
1297     cpu->created = false;
1298     qemu_cond_signal(&qemu_cpu_cond);
1299     qemu_mutex_unlock_iothread();
1300     rcu_unregister_thread();
1301     return NULL;
1302 }
1303
1304 static void *qemu_dummy_cpu_thread_fn(void *arg)
1305 {
1306 #ifdef _WIN32
1307     error_report("qtest is not supported under Windows");
1308     exit(1);
1309 #else
1310     CPUState *cpu = arg;
1311     sigset_t waitset;
1312     int r;
1313
1314     rcu_register_thread();
1315
1316     qemu_mutex_lock_iothread();
1317     qemu_thread_get_self(cpu->thread);
1318     cpu->thread_id = qemu_get_thread_id();
1319     cpu->can_do_io = 1;
1320     current_cpu = cpu;
1321
1322     sigemptyset(&waitset);
1323     sigaddset(&waitset, SIG_IPI);
1324
1325     /* signal CPU creation */
1326     cpu->created = true;
1327     qemu_cond_signal(&qemu_cpu_cond);
1328     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1329
1330     do {
1331         qemu_mutex_unlock_iothread();
1332         do {
1333             int sig;
1334             r = sigwait(&waitset, &sig);
1335         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1336         if (r == -1) {
1337             perror("sigwait");
1338             exit(1);
1339         }
1340         qemu_mutex_lock_iothread();
1341         qemu_wait_io_event(cpu);
1342     } while (!cpu->unplug);
1343
1344     qemu_mutex_unlock_iothread();
1345     rcu_unregister_thread();
1346     return NULL;
1347 #endif
1348 }
1349
1350 static int64_t tcg_get_icount_limit(void)
1351 {
1352     int64_t deadline;
1353
1354     if (replay_mode != REPLAY_MODE_PLAY) {
1355         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1356
1357         /* Maintain prior (possibly buggy) behaviour where if no deadline
1358          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1359          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1360          * nanoseconds.
1361          */
1362         if ((deadline < 0) || (deadline > INT32_MAX)) {
1363             deadline = INT32_MAX;
1364         }
1365
1366         return qemu_icount_round(deadline);
1367     } else {
1368         return replay_get_instructions();
1369     }
1370 }
1371
1372 static void handle_icount_deadline(void)
1373 {
1374     assert(qemu_in_vcpu_thread());
1375     if (use_icount) {
1376         int64_t deadline =
1377             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1378
1379         if (deadline == 0) {
1380             /* Wake up other AioContexts.  */
1381             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1382             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1383         }
1384     }
1385 }
1386
1387 static void prepare_icount_for_run(CPUState *cpu)
1388 {
1389     if (use_icount) {
1390         int insns_left;
1391
1392         /* These should always be cleared by process_icount_data after
1393          * each vCPU execution. However u16.high can be raised
1394          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1395          */
1396         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1397         g_assert(cpu->icount_extra == 0);
1398
1399         cpu->icount_budget = tcg_get_icount_limit();
1400         insns_left = MIN(0xffff, cpu->icount_budget);
1401         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1402         cpu->icount_extra = cpu->icount_budget - insns_left;
1403
1404         replay_mutex_lock();
1405     }
1406 }
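/*
 * Worked example of the budget split above (illustrative numbers): a budget
 * of 100000 instructions becomes icount_decr.u16.low = MIN(0xffff, 100000)
 * = 65535 and icount_extra = 100000 - 65535 = 34465, because the in-TCG
 * decrementer is only 16 bits wide.
 */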
1407
1408 static void process_icount_data(CPUState *cpu)
1409 {
1410     if (use_icount) {
1411         /* Account for executed instructions */
1412         cpu_update_icount(cpu);
1413
1414         /* Reset the counters */
1415         cpu_neg(cpu)->icount_decr.u16.low = 0;
1416         cpu->icount_extra = 0;
1417         cpu->icount_budget = 0;
1418
1419         replay_account_executed_instructions();
1420
1421         replay_mutex_unlock();
1422     }
1423 }
1424
1425
1426 static int tcg_cpu_exec(CPUState *cpu)
1427 {
1428     int ret;
1429 #ifdef CONFIG_PROFILER
1430     int64_t ti;
1431 #endif
1432
1433     assert(tcg_enabled());
1434 #ifdef CONFIG_PROFILER
1435     ti = profile_getclock();
1436 #endif
1437     cpu_exec_start(cpu);
1438     ret = cpu_exec(cpu);
1439     cpu_exec_end(cpu);
1440 #ifdef CONFIG_PROFILER
1441     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1442                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1443 #endif
1444     return ret;
1445 }
1446
1447 /* Destroy any remaining vCPUs which have been unplugged and have
1448  * finished running
1449  */
1450 static void deal_with_unplugged_cpus(void)
1451 {
1452     CPUState *cpu;
1453
1454     CPU_FOREACH(cpu) {
1455         if (cpu->unplug && !cpu_can_run(cpu)) {
1456             qemu_tcg_destroy_vcpu(cpu);
1457             cpu->created = false;
1458             qemu_cond_signal(&qemu_cpu_cond);
1459             break;
1460         }
1461     }
1462 }
1463
1464 /* Single-threaded TCG
1465  *
1466  * In the single-threaded case each vCPU is simulated in turn. If
1467  * there is more than a single vCPU we create a simple timer to kick
1468  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1469  * This is done explicitly rather than relying on side-effects
1470  * elsewhere.
1471  */
1472
1473 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1474 {
1475     CPUState *cpu = arg;
1476
1477     assert(tcg_enabled());
1478     rcu_register_thread();
1479     tcg_register_thread();
1480
1481     qemu_mutex_lock_iothread();
1482     qemu_thread_get_self(cpu->thread);
1483
1484     cpu->thread_id = qemu_get_thread_id();
1485     cpu->created = true;
1486     cpu->can_do_io = 1;
1487     qemu_cond_signal(&qemu_cpu_cond);
1488     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1489
1490     /* wait for initial kick-off after machine start */
1491     while (first_cpu->stopped) {
1492         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1493
1494         /* process any pending work */
1495         CPU_FOREACH(cpu) {
1496             current_cpu = cpu;
1497             qemu_wait_io_event_common(cpu);
1498         }
1499     }
1500
1501     start_tcg_kick_timer();
1502
1503     cpu = first_cpu;
1504
1505     /* process any pending work */
1506     cpu->exit_request = 1;
1507
1508     while (1) {
1509         qemu_mutex_unlock_iothread();
1510         replay_mutex_lock();
1511         qemu_mutex_lock_iothread();
1512         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1513         qemu_account_warp_timer();
1514
1515         /* Run the timers here.  This is much more efficient than
1516          * waking up the I/O thread and waiting for completion.
1517          */
1518         handle_icount_deadline();
1519
1520         replay_mutex_unlock();
1521
1522         if (!cpu) {
1523             cpu = first_cpu;
1524         }
1525
1526         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1527
1528             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1529             current_cpu = cpu;
1530
1531             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1532                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1533
1534             if (cpu_can_run(cpu)) {
1535                 int r;
1536
1537                 qemu_mutex_unlock_iothread();
1538                 prepare_icount_for_run(cpu);
1539
1540                 r = tcg_cpu_exec(cpu);
1541
1542                 process_icount_data(cpu);
1543                 qemu_mutex_lock_iothread();
1544
1545                 if (r == EXCP_DEBUG) {
1546                     cpu_handle_guest_debug(cpu);
1547                     break;
1548                 } else if (r == EXCP_ATOMIC) {
1549                     qemu_mutex_unlock_iothread();
1550                     cpu_exec_step_atomic(cpu);
1551                     qemu_mutex_lock_iothread();
1552                     break;
1553                 }
1554             } else if (cpu->stop) {
1555                 if (cpu->unplug) {
1556                     cpu = CPU_NEXT(cpu);
1557                 }
1558                 break;
1559             }
1560
1561             cpu = CPU_NEXT(cpu);
1562         } /* while (cpu && !cpu->exit_request).. */
1563
1564         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1565         atomic_set(&tcg_current_rr_cpu, NULL);
1566
1567         if (cpu && cpu->exit_request) {
1568             atomic_mb_set(&cpu->exit_request, 0);
1569         }
1570
1571         if (use_icount && all_cpu_threads_idle()) {
1572             /*
1573              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1574              * in the main_loop, wake it up in order to start the warp timer.
1575              */
1576             qemu_notify_event();
1577         }
1578
1579         qemu_tcg_rr_wait_io_event();
1580         deal_with_unplugged_cpus();
1581     }
1582
1583     rcu_unregister_thread();
1584     return NULL;
1585 }
1586
1587 static void *qemu_hax_cpu_thread_fn(void *arg)
1588 {
1589     CPUState *cpu = arg;
1590     int r;
1591
1592     rcu_register_thread();
1593     qemu_mutex_lock_iothread();
1594     qemu_thread_get_self(cpu->thread);
1595
1596     cpu->thread_id = qemu_get_thread_id();
1597     cpu->created = true;
1598     current_cpu = cpu;
1599
1600     hax_init_vcpu(cpu);
1601     qemu_cond_signal(&qemu_cpu_cond);
1602     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1603
1604     do {
1605         if (cpu_can_run(cpu)) {
1606             r = hax_smp_cpu_exec(cpu);
1607             if (r == EXCP_DEBUG) {
1608                 cpu_handle_guest_debug(cpu);
1609             }
1610         }
1611
1612         qemu_wait_io_event(cpu);
1613     } while (!cpu->unplug || cpu_can_run(cpu));
1614     rcu_unregister_thread();
1615     return NULL;
1616 }
1617
1618 /* The HVF-specific vCPU thread function. This one should only run when the host
1619  * CPU supports the VMX "unrestricted guest" feature. */
1620 static void *qemu_hvf_cpu_thread_fn(void *arg)
1621 {
1622     CPUState *cpu = arg;
1623
1624     int r;
1625
1626     assert(hvf_enabled());
1627
1628     rcu_register_thread();
1629
1630     qemu_mutex_lock_iothread();
1631     qemu_thread_get_self(cpu->thread);
1632
1633     cpu->thread_id = qemu_get_thread_id();
1634     cpu->can_do_io = 1;
1635     current_cpu = cpu;
1636
1637     hvf_init_vcpu(cpu);
1638
1639     /* signal CPU creation */
1640     cpu->created = true;
1641     qemu_cond_signal(&qemu_cpu_cond);
1642     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1643
1644     do {
1645         if (cpu_can_run(cpu)) {
1646             r = hvf_vcpu_exec(cpu);
1647             if (r == EXCP_DEBUG) {
1648                 cpu_handle_guest_debug(cpu);
1649             }
1650         }
1651         qemu_wait_io_event(cpu);
1652     } while (!cpu->unplug || cpu_can_run(cpu));
1653
1654     hvf_vcpu_destroy(cpu);
1655     cpu->created = false;
1656     qemu_cond_signal(&qemu_cpu_cond);
1657     qemu_mutex_unlock_iothread();
1658     rcu_unregister_thread();
1659     return NULL;
1660 }
1661
1662 static void *qemu_whpx_cpu_thread_fn(void *arg)
1663 {
1664     CPUState *cpu = arg;
1665     int r;
1666
1667     rcu_register_thread();
1668
1669     qemu_mutex_lock_iothread();
1670     qemu_thread_get_self(cpu->thread);
1671     cpu->thread_id = qemu_get_thread_id();
1672     current_cpu = cpu;
1673
1674     r = whpx_init_vcpu(cpu);
1675     if (r < 0) {
1676         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1677         exit(1);
1678     }
1679
1680     /* signal CPU creation */
1681     cpu->created = true;
1682     qemu_cond_signal(&qemu_cpu_cond);
1683     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1684
1685     do {
1686         if (cpu_can_run(cpu)) {
1687             r = whpx_vcpu_exec(cpu);
1688             if (r == EXCP_DEBUG) {
1689                 cpu_handle_guest_debug(cpu);
1690             }
1691         }
1692         while (cpu_thread_is_idle(cpu)) {
1693             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1694         }
1695         qemu_wait_io_event_common(cpu);
1696     } while (!cpu->unplug || cpu_can_run(cpu));
1697
1698     whpx_destroy_vcpu(cpu);
1699     cpu->created = false;
1700     qemu_cond_signal(&qemu_cpu_cond);
1701     qemu_mutex_unlock_iothread();
1702     rcu_unregister_thread();
1703     return NULL;
1704 }
1705
1706 #ifdef _WIN32
1707 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1708 {
1709 }
1710 #endif
1711
1712 /* Multi-threaded TCG
1713  *
1714  * In the multi-threaded case each vCPU has its own thread. The TLS
1715  * variable current_cpu can be used deep in the code to find the
1716  * current CPUState for a given thread.
1717  */
1718
1719 static void *qemu_tcg_cpu_thread_fn(void *arg)
1720 {
1721     CPUState *cpu = arg;
1722
1723     assert(tcg_enabled());
1724     g_assert(!use_icount);
1725
1726     rcu_register_thread();
1727     tcg_register_thread();
1728
1729     qemu_mutex_lock_iothread();
1730     qemu_thread_get_self(cpu->thread);
1731
1732     cpu->thread_id = qemu_get_thread_id();
1733     cpu->created = true;
1734     cpu->can_do_io = 1;
1735     current_cpu = cpu;
1736     qemu_cond_signal(&qemu_cpu_cond);
1737     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1738
1739     /* process any pending work */
1740     cpu->exit_request = 1;
1741
1742     do {
1743         if (cpu_can_run(cpu)) {
1744             int r;
1745             qemu_mutex_unlock_iothread();
1746             r = tcg_cpu_exec(cpu);
1747             qemu_mutex_lock_iothread();
1748             switch (r) {
1749             case EXCP_DEBUG:
1750                 cpu_handle_guest_debug(cpu);
1751                 break;
1752             case EXCP_HALTED:
1753                 /* During start-up the vCPU is reset and the thread is
1754                  * kicked several times. If we don't ensure we go back
1755                  * to sleep in the halted state we won't cleanly
1756                  * start up when the vCPU is enabled.
1757                  *
1758                  * cpu->halted should ensure we sleep in wait_io_event
1759                  */
1760                 g_assert(cpu->halted);
1761                 break;
1762             case EXCP_ATOMIC:
1763                 qemu_mutex_unlock_iothread();
1764                 cpu_exec_step_atomic(cpu);
1765                 qemu_mutex_lock_iothread();
1766             default:
1767                 /* Ignore everything else? */
1768                 break;
1769             }
1770         }
1771
1772         atomic_mb_set(&cpu->exit_request, 0);
1773         qemu_wait_io_event(cpu);
1774     } while (!cpu->unplug || cpu_can_run(cpu));
1775
1776     qemu_tcg_destroy_vcpu(cpu);
1777     cpu->created = false;
1778     qemu_cond_signal(&qemu_cpu_cond);
1779     qemu_mutex_unlock_iothread();
1780     rcu_unregister_thread();
1781     return NULL;
1782 }
1783
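/*
 * Force a vCPU thread out of whatever it is blocked in: on POSIX hosts
 * this delivers SIG_IPI with pthread_kill() (which, for example, kicks a
 * KVM vCPU out of KVM_RUN); on Windows it uses the WHPX-specific kick or
 * queues the no-op APC defined above to break the thread out of its wait.
 */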
1784 static void qemu_cpu_kick_thread(CPUState *cpu)
1785 {
1786 #ifndef _WIN32
1787     int err;
1788
1789     if (cpu->thread_kicked) {
1790         return;
1791     }
1792     cpu->thread_kicked = true;
1793     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1794     if (err && err != ESRCH) {
1795         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1796         exit(1);
1797     }
1798 #else /* _WIN32 */
1799     if (!qemu_cpu_is_self(cpu)) {
1800         if (whpx_enabled()) {
1801             whpx_vcpu_kick(cpu);
1802         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1803             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1804                     __func__, GetLastError());
1805             exit(1);
1806         }
1807     }
1808 #endif
1809 }
1810
1811 void qemu_cpu_kick(CPUState *cpu)
1812 {
1813     qemu_cond_broadcast(cpu->halt_cond);
1814     if (tcg_enabled()) {
1815         cpu_exit(cpu);
1816         /* NOP unless doing single-thread RR */
1817         qemu_cpu_kick_rr_cpu();
1818     } else {
1819         if (hax_enabled()) {
1820             /*
1821              * FIXME: race condition with the exit_request check in
1822              * hax_vcpu_hax_exec
1823              */
1824             cpu->exit_request = 1;
1825         }
1826         qemu_cpu_kick_thread(cpu);
1827     }
1828 }
1829
1830 void qemu_cpu_kick_self(void)
1831 {
1832     assert(current_cpu);
1833     qemu_cpu_kick_thread(current_cpu);
1834 }
1835
1836 bool qemu_cpu_is_self(CPUState *cpu)
1837 {
1838     return qemu_thread_is_self(cpu->thread);
1839 }
1840
1841 bool qemu_in_vcpu_thread(void)
1842 {
1843     return current_cpu && qemu_cpu_is_self(current_cpu);
1844 }
1845
1846 static __thread bool iothread_locked = false;
1847
1848 bool qemu_mutex_iothread_locked(void)
1849 {
1850     return iothread_locked;
1851 }
1852
1853 /*
1854  * The BQL is taken from so many places that it is worth profiling the
1855  * callers directly, instead of funneling them all through a single function.
1856  */
1857 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1858 {
1859     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1860
1861     g_assert(!qemu_mutex_iothread_locked());
1862     bql_lock(&qemu_global_mutex, file, line);
1863     iothread_locked = true;
1864 }
1865
1866 void qemu_mutex_unlock_iothread(void)
1867 {
1868     g_assert(qemu_mutex_iothread_locked());
1869     iothread_locked = false;
1870     qemu_mutex_unlock(&qemu_global_mutex);
1871 }
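/*
 * A minimal usage sketch of the BQL helpers, following the pattern the
 * vCPU threads above already use: drop the lock around long-running work
 * (such as executing guest code) and re-take it before touching shared
 * machine state.
 *
 *     qemu_mutex_unlock_iothread();
 *     r = tcg_cpu_exec(cpu);           // long-running work, no BQL held
 *     qemu_mutex_lock_iothread();      // re-take the BQL for the result
 */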
1872
1873 static bool all_vcpus_paused(void)
1874 {
1875     CPUState *cpu;
1876
1877     CPU_FOREACH(cpu) {
1878         if (!cpu->stopped) {
1879             return false;
1880         }
1881     }
1882
1883     return true;
1884 }
1885
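/*
 * Stop every vCPU and wait until all of them have actually reached the
 * stopped state.  The virtual clock is disabled first so no more guest
 * time elapses, and the replay lock is dropped while waiting so that
 * woken vCPU threads can finish their outstanding replay work.
 */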
1886 void pause_all_vcpus(void)
1887 {
1888     CPUState *cpu;
1889
1890     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1891     CPU_FOREACH(cpu) {
1892         if (qemu_cpu_is_self(cpu)) {
1893             qemu_cpu_stop(cpu, true);
1894         } else {
1895             cpu->stop = true;
1896             qemu_cpu_kick(cpu);
1897         }
1898     }
1899
1900     /* We need to drop the replay_lock so any vCPU threads woken up
1901      * can finish their replay tasks
1902      */
1903     replay_mutex_unlock();
1904
1905     while (!all_vcpus_paused()) {
1906         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1907         CPU_FOREACH(cpu) {
1908             qemu_cpu_kick(cpu);
1909         }
1910     }
1911
1912     qemu_mutex_unlock_iothread();
1913     replay_mutex_lock();
1914     qemu_mutex_lock_iothread();
1915 }
1916
1917 void cpu_resume(CPUState *cpu)
1918 {
1919     cpu->stop = false;
1920     cpu->stopped = false;
1921     qemu_cpu_kick(cpu);
1922 }
1923
1924 void resume_all_vcpus(void)
1925 {
1926     CPUState *cpu;
1927
1928     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1929     CPU_FOREACH(cpu) {
1930         cpu_resume(cpu);
1931     }
1932 }
1933
1934 void cpu_remove_sync(CPUState *cpu)
1935 {
1936     cpu->stop = true;
1937     cpu->unplug = true;
1938     qemu_cpu_kick(cpu);
1939     qemu_mutex_unlock_iothread();
1940     qemu_thread_join(cpu->thread);
1941     qemu_mutex_lock_iothread();
1942 }
1943
1944 /* Size of the temporary buffers used when forming a vCPU thread name */
1945 #define VCPU_THREAD_NAME_SIZE 16
1946
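/*
 * TCG has two threading models: with MTTCG each vCPU gets its own
 * "CPU n/TCG" thread, while the single-threaded round-robin model shares
 * one "ALL CPUs/TCG" thread (and its halt condition) among all vCPUs.
 */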
1947 static void qemu_tcg_init_vcpu(CPUState *cpu)
1948 {
1949     char thread_name[VCPU_THREAD_NAME_SIZE];
1950     static QemuCond *single_tcg_halt_cond;
1951     static QemuThread *single_tcg_cpu_thread;
1952     static int tcg_region_inited;
1953
1954     assert(tcg_enabled());
1955     /*
1956      * Initialize TCG regions--once. Now is a good time, because:
1957      * (1) TCG's init context, prologue and target globals have been set up.
1958      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1959      *     -accel flag is processed, so the check doesn't work then).
1960      */
1961     if (!tcg_region_inited) {
1962         tcg_region_inited = 1;
1963         tcg_region_init();
1964     }
1965
1966     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1967         cpu->thread = g_malloc0(sizeof(QemuThread));
1968         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1969         qemu_cond_init(cpu->halt_cond);
1970
1971         if (qemu_tcg_mttcg_enabled()) {
1972             /* create a thread per vCPU with TCG (MTTCG) */
1973             parallel_cpus = true;
1974             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1975                  cpu->cpu_index);
1976
1977             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1978                                cpu, QEMU_THREAD_JOINABLE);
1979
1980         } else {
1981             /* share a single thread for all cpus with TCG */
1982             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1983             qemu_thread_create(cpu->thread, thread_name,
1984                                qemu_tcg_rr_cpu_thread_fn,
1985                                cpu, QEMU_THREAD_JOINABLE);
1986
1987             single_tcg_halt_cond = cpu->halt_cond;
1988             single_tcg_cpu_thread = cpu->thread;
1989         }
1990 #ifdef _WIN32
1991         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1992 #endif
1993     } else {
1994         /* For non-MTTCG cases we share the thread */
1995         cpu->thread = single_tcg_cpu_thread;
1996         cpu->halt_cond = single_tcg_halt_cond;
1997         cpu->thread_id = first_cpu->thread_id;
1998         cpu->can_do_io = 1;
1999         cpu->created = true;
2000     }
2001 }
2002
2003 static void qemu_hax_start_vcpu(CPUState *cpu)
2004 {
2005     char thread_name[VCPU_THREAD_NAME_SIZE];
2006
2007     cpu->thread = g_malloc0(sizeof(QemuThread));
2008     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2009     qemu_cond_init(cpu->halt_cond);
2010
2011     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2012              cpu->cpu_index);
2013     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2014                        cpu, QEMU_THREAD_JOINABLE);
2015 #ifdef _WIN32
2016     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2017 #endif
2018 }
2019
2020 static void qemu_kvm_start_vcpu(CPUState *cpu)
2021 {
2022     char thread_name[VCPU_THREAD_NAME_SIZE];
2023
2024     cpu->thread = g_malloc0(sizeof(QemuThread));
2025     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2026     qemu_cond_init(cpu->halt_cond);
2027     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2028              cpu->cpu_index);
2029     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2030                        cpu, QEMU_THREAD_JOINABLE);
2031 }
2032
2033 static void qemu_hvf_start_vcpu(CPUState *cpu)
2034 {
2035     char thread_name[VCPU_THREAD_NAME_SIZE];
2036
2037     /* HVF currently does not support TCG, and only runs in
2038      * unrestricted-guest mode. */
2039     assert(hvf_enabled());
2040
2041     cpu->thread = g_malloc0(sizeof(QemuThread));
2042     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2043     qemu_cond_init(cpu->halt_cond);
2044
2045     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2046              cpu->cpu_index);
2047     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2048                        cpu, QEMU_THREAD_JOINABLE);
2049 }
2050
2051 static void qemu_whpx_start_vcpu(CPUState *cpu)
2052 {
2053     char thread_name[VCPU_THREAD_NAME_SIZE];
2054
2055     cpu->thread = g_malloc0(sizeof(QemuThread));
2056     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2057     qemu_cond_init(cpu->halt_cond);
2058     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2059              cpu->cpu_index);
2060     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2061                        cpu, QEMU_THREAD_JOINABLE);
2062 #ifdef _WIN32
2063     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2064 #endif
2065 }
2066
2067 static void qemu_dummy_start_vcpu(CPUState *cpu)
2068 {
2069     char thread_name[VCPU_THREAD_NAME_SIZE];
2070
2071     cpu->thread = g_malloc0(sizeof(QemuThread));
2072     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2073     qemu_cond_init(cpu->halt_cond);
2074     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2075              cpu->cpu_index);
2076     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2077                        QEMU_THREAD_JOINABLE);
2078 }
2079
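/*
 * Common vCPU start path (typically reached when a CPU device is
 * realized): dispatch to the accelerator-specific thread starter and
 * block until the new thread has signalled cpu->created on
 * qemu_cpu_cond.
 */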
2080 void qemu_init_vcpu(CPUState *cpu)
2081 {
2082     MachineState *ms = MACHINE(qdev_get_machine());
2083
2084     cpu->nr_cores = ms->smp.cores;
2085     cpu->nr_threads = ms->smp.threads;
2086     cpu->stopped = true;
2087     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2088
2089     if (!cpu->as) {
2090         /* If the target cpu hasn't set up any address spaces itself,
2091          * give it the default one.
2092          */
2093         cpu->num_ases = 1;
2094         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2095     }
2096
2097     if (kvm_enabled()) {
2098         qemu_kvm_start_vcpu(cpu);
2099     } else if (hax_enabled()) {
2100         qemu_hax_start_vcpu(cpu);
2101     } else if (hvf_enabled()) {
2102         qemu_hvf_start_vcpu(cpu);
2103     } else if (tcg_enabled()) {
2104         qemu_tcg_init_vcpu(cpu);
2105     } else if (whpx_enabled()) {
2106         qemu_whpx_start_vcpu(cpu);
2107     } else {
2108         qemu_dummy_start_vcpu(cpu);
2109     }
2110
2111     while (!cpu->created) {
2112         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2113     }
2114 }
2115
2116 void cpu_stop_current(void)
2117 {
2118     if (current_cpu) {
2119         current_cpu->stop = true;
2120         cpu_exit(current_cpu);
2121     }
2122 }
2123
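/*
 * Request a VM stop.  When called from a vCPU thread the full stop cannot
 * be completed here, so the request is handed to the main loop via
 * qemu_system_vmstop_request() and only the current vCPU is stopped
 * immediately; otherwise the stop is performed synchronously.
 */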
2124 int vm_stop(RunState state)
2125 {
2126     if (qemu_in_vcpu_thread()) {
2127         qemu_system_vmstop_request_prepare();
2128         qemu_system_vmstop_request(state);
2129         /*
2130          * FIXME: should not return to device code in case
2131          * vm_stop() has been requested.
2132          */
2133         cpu_stop_current();
2134         return 0;
2135     }
2136
2137     return do_vm_stop(state, true);
2138 }
2139
2140 /**
2141  * Prepare for (re)starting the VM.
2142  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2143  * running or in case of an error condition), 0 otherwise.
2144  */
2145 int vm_prepare_start(void)
2146 {
2147     RunState requested;
2148
2149     qemu_vmstop_requested(&requested);
2150     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2151         return -1;
2152     }
2153
2154     /* Ensure that a STOP/RESUME pair of events is emitted if a
2155      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2156      * example, is documented as always being followed by the STOP
2157      * event.
2158      */
2159     if (runstate_is_running()) {
2160         qapi_event_send_stop();
2161         qapi_event_send_resume();
2162         return -1;
2163     }
2164
2165     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2166     qapi_event_send_resume();
2167
2168     replay_enable_events();
2169     cpu_enable_ticks();
2170     runstate_set(RUN_STATE_RUNNING);
2171     vm_state_notify(1, RUN_STATE_RUNNING);
2172     return 0;
2173 }
2174
2175 void vm_start(void)
2176 {
2177     if (!vm_prepare_start()) {
2178         resume_all_vcpus();
2179     }
2180 }
2181
2182 /* Does a state transition even if the VM is already stopped;
2183    the current state is forgotten forever. */
2184 int vm_stop_force_state(RunState state)
2185 {
2186     if (runstate_is_running()) {
2187         return vm_stop(state);
2188     } else {
2189         runstate_set(state);
2190
2191         bdrv_drain_all();
2192         /* Make sure to return an error if the flush in a previous vm_stop()
2193          * failed. */
2194         return bdrv_flush_all();
2195     }
2196 }
2197
2198 void list_cpus(const char *optarg)
2199 {
2200     /* XXX: implement xxx_cpu_list for targets that still lack it */
2201 #if defined(cpu_list)
2202     cpu_list();
2203 #endif
2204 }
2205
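/*
 * QMP 'memsave': dump a range of guest *virtual* memory, as seen by the
 * selected vCPU, to a host file in 1 KiB chunks via cpu_memory_rw_debug().
 * qmp_pmemsave() below is the physical-memory counterpart.
 */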
2206 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2207                  bool has_cpu, int64_t cpu_index, Error **errp)
2208 {
2209     FILE *f;
2210     uint32_t l;
2211     CPUState *cpu;
2212     uint8_t buf[1024];
2213     int64_t orig_addr = addr, orig_size = size;
2214
2215     if (!has_cpu) {
2216         cpu_index = 0;
2217     }
2218
2219     cpu = qemu_get_cpu(cpu_index);
2220     if (cpu == NULL) {
2221         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2222                    "a CPU number");
2223         return;
2224     }
2225
2226     f = fopen(filename, "wb");
2227     if (!f) {
2228         error_setg_file_open(errp, errno, filename);
2229         return;
2230     }
2231
2232     while (size != 0) {
2233         l = sizeof(buf);
2234         if (l > size)
2235             l = size;
2236         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2237             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2238                              " specified", orig_addr, orig_size);
2239             goto exit;
2240         }
2241         if (fwrite(buf, 1, l, f) != l) {
2242             error_setg(errp, QERR_IO_ERROR);
2243             goto exit;
2244         }
2245         addr += l;
2246         size -= l;
2247     }
2248
2249 exit:
2250     fclose(f);
2251 }
2252
2253 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2254                   Error **errp)
2255 {
2256     FILE *f;
2257     uint32_t l;
2258     uint8_t buf[1024];
2259
2260     f = fopen(filename, "wb");
2261     if (!f) {
2262         error_setg_file_open(errp, errno, filename);
2263         return;
2264     }
2265
2266     while (size != 0) {
2267         l = sizeof(buf);
2268         if (l > size)
2269             l = size;
2270         cpu_physical_memory_read(addr, buf, l);
2271         if (fwrite(buf, 1, l, f) != l) {
2272             error_setg(errp, QERR_IO_ERROR);
2273             goto exit;
2274         }
2275         addr += l;
2276         size -= l;
2277     }
2278
2279 exit:
2280     fclose(f);
2281 }
2282
2283 void qmp_inject_nmi(Error **errp)
2284 {
2285     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2286 }
2287
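/*
 * Print icount drift statistics: how far the host clock and the guest
 * instruction counter have diverged, plus the recorded worst-case delay
 * and advance when -icount align is in use.
 */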
2288 void dump_drift_info(void)
2289 {
2290     if (!use_icount) {
2291         return;
2292     }
2293
2294     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2295                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2296     if (icount_align_option) {
2297         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2298                     -max_delay / SCALE_MS);
2299         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2300                     max_advance / SCALE_MS);
2301     } else {
2302         qemu_printf("Max guest delay     NA\n");
2303         qemu_printf("Max guest advance   NA\n");
2304     }
2305 }