[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "qemu/cutils.h"
29 #include "migration/vmstate.h"
30 #include "monitor/monitor.h"
31 #include "qapi/error.h"
32 #include "qapi/qapi-commands-misc.h"
33 #include "qapi/qapi-events-run-state.h"
34 #include "qapi/qmp/qerror.h"
35 #include "qemu/error-report.h"
36 #include "qemu/qemu-print.h"
37 #include "sysemu/tcg.h"
38 #include "sysemu/block-backend.h"
39 #include "exec/gdbstub.h"
40 #include "sysemu/dma.h"
41 #include "sysemu/hw_accel.h"
42 #include "sysemu/kvm.h"
43 #include "sysemu/hax.h"
44 #include "sysemu/hvf.h"
45 #include "sysemu/whpx.h"
46 #include "exec/exec-all.h"
47
48 #include "qemu/thread.h"
49 #include "qemu/plugin.h"
50 #include "sysemu/cpus.h"
51 #include "sysemu/qtest.h"
52 #include "qemu/main-loop.h"
53 #include "qemu/option.h"
54 #include "qemu/bitmap.h"
55 #include "qemu/seqlock.h"
56 #include "qemu/guest-random.h"
57 #include "tcg/tcg.h"
58 #include "hw/nmi.h"
59 #include "sysemu/replay.h"
60 #include "sysemu/runstate.h"
61 #include "hw/boards.h"
62 #include "hw/hw.h"
63
64 #ifdef CONFIG_LINUX
65
66 #include <sys/prctl.h>
67
68 #ifndef PR_MCE_KILL
69 #define PR_MCE_KILL 33
70 #endif
71
72 #ifndef PR_MCE_KILL_SET
73 #define PR_MCE_KILL_SET 1
74 #endif
75
76 #ifndef PR_MCE_KILL_EARLY
77 #define PR_MCE_KILL_EARLY 1
78 #endif
79
80 #endif /* CONFIG_LINUX */
81
82 static QemuMutex qemu_global_mutex;
83
84 int64_t max_delay;
85 int64_t max_advance;
86
87 /* vcpu throttling controls */
88 static QEMUTimer *throttle_timer;
89 static unsigned int throttle_percentage;
90
91 #define CPU_THROTTLE_PCT_MIN 1
92 #define CPU_THROTTLE_PCT_MAX 99
93 #define CPU_THROTTLE_TIMESLICE_NS 10000000
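/* i.e. throttling works on 10 ms slices of guest run time */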
94
95 bool cpu_is_stopped(CPUState *cpu)
96 {
97     return cpu->stopped || !runstate_is_running();
98 }
99
100 static bool cpu_thread_is_idle(CPUState *cpu)
101 {
102     if (cpu->stop || cpu->queued_work_first) {
103         return false;
104     }
105     if (cpu_is_stopped(cpu)) {
106         return true;
107     }
108     if (!cpu->halted || cpu_has_work(cpu) ||
109         kvm_halt_in_kernel()) {
110         return false;
111     }
112     return true;
113 }
114
115 static bool all_cpu_threads_idle(void)
116 {
117     CPUState *cpu;
118
119     CPU_FOREACH(cpu) {
120         if (!cpu_thread_is_idle(cpu)) {
121             return false;
122         }
123     }
124     return true;
125 }
126
127 /***********************************************************/
128 /* guest cycle counter */
129
130 /* Protected by TimersState seqlock */
131
132 static bool icount_sleep = true;
133 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
134 #define MAX_ICOUNT_SHIFT 10
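/*
 * For example, cpu_icount_to_ns() below computes "icount << icount_time_shift",
 * so a shift of 10 charges 1024 ns per instruction (roughly 1 MIPS), while a
 * shift of 0 charges 1 ns per instruction (roughly 1 GIPS).
 */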
135
136 typedef struct TimersState {
137     /* Protected by BQL.  */
138     int64_t cpu_ticks_prev;
139     int64_t cpu_ticks_offset;
140
141     /* Protect fields that can be read outside the
142      * BQL and written from multiple threads.
143      */
144     QemuSeqLock vm_clock_seqlock;
145     QemuSpin vm_clock_lock;
146
147     int16_t cpu_ticks_enabled;
148
149     /* Conversion factor from emulated instructions to virtual clock ticks.  */
150     int16_t icount_time_shift;
151
152     /* Compensate for varying guest execution speed.  */
153     int64_t qemu_icount_bias;
154
155     int64_t vm_clock_warp_start;
156     int64_t cpu_clock_offset;
157
158     /* Only written by TCG thread */
159     int64_t qemu_icount;
160
161     /* for adjusting icount */
162     QEMUTimer *icount_rt_timer;
163     QEMUTimer *icount_vm_timer;
164     QEMUTimer *icount_warp_timer;
165 } TimersState;
166
167 static TimersState timers_state;
168 bool mttcg_enabled;
169
170
171 /* The current number of executed instructions is based on what we
172  * originally budgeted minus the current state of the decrementing
173  * icount counters in extra/u16.low.
174  */
175 static int64_t cpu_get_icount_executed(CPUState *cpu)
176 {
177     return (cpu->icount_budget -
178             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
179 }
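/*
 * Worked example: with an icount_budget of 100000, prepare_icount_for_run()
 * loads u16.low with 0xffff (65535) and icount_extra with 34465.  If the
 * translated code has since counted u16.low down to 20000, then
 * 100000 - (20000 + 34465) = 45535 instructions have been executed so far.
 */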
180
181 /*
182  * Update the global shared timer_state.qemu_icount to take into
183  * account executed instructions. This is done by the TCG vCPU
184  * thread so the main-loop can see time has moved forward.
185  */
186 static void cpu_update_icount_locked(CPUState *cpu)
187 {
188     int64_t executed = cpu_get_icount_executed(cpu);
189     cpu->icount_budget -= executed;
190
191     atomic_set_i64(&timers_state.qemu_icount,
192                    timers_state.qemu_icount + executed);
193 }
194
195 /*
196  * Update the global shared timer_state.qemu_icount to take into
197  * account executed instructions. This is done by the TCG vCPU
198  * thread so the main-loop can see time has moved forward.
199  */
200 void cpu_update_icount(CPUState *cpu)
201 {
202     seqlock_write_lock(&timers_state.vm_clock_seqlock,
203                        &timers_state.vm_clock_lock);
204     cpu_update_icount_locked(cpu);
205     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
206                          &timers_state.vm_clock_lock);
207 }
208
209 static int64_t cpu_get_icount_raw_locked(void)
210 {
211     CPUState *cpu = current_cpu;
212
213     if (cpu && cpu->running) {
214         if (!cpu->can_do_io) {
215             error_report("Bad icount read");
216             exit(1);
217         }
218         /* Take into account what has run */
219         cpu_update_icount_locked(cpu);
220     }
221     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
222     return atomic_read_i64(&timers_state.qemu_icount);
223 }
224
225 static int64_t cpu_get_icount_locked(void)
226 {
227     int64_t icount = cpu_get_icount_raw_locked();
228     return atomic_read_i64(&timers_state.qemu_icount_bias) +
229         cpu_icount_to_ns(icount);
230 }
231
232 int64_t cpu_get_icount_raw(void)
233 {
234     int64_t icount;
235     unsigned start;
236
237     do {
238         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
239         icount = cpu_get_icount_raw_locked();
240     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
241
242     return icount;
243 }
244
245 /* Return the virtual CPU time, based on the instruction counter.  */
246 int64_t cpu_get_icount(void)
247 {
248     int64_t icount;
249     unsigned start;
250
251     do {
252         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
253         icount = cpu_get_icount_locked();
254     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
255
256     return icount;
257 }
258
259 int64_t cpu_icount_to_ns(int64_t icount)
260 {
261     return icount << atomic_read(&timers_state.icount_time_shift);
262 }
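/*
 * With the initial adaptive shift of 3 chosen in configure_icount(), each
 * instruction accounts for 8 ns of virtual time, i.e. the "125MIPS" initial
 * guess mentioned there.
 */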
263
264 static int64_t cpu_get_ticks_locked(void)
265 {
266     int64_t ticks = timers_state.cpu_ticks_offset;
267     if (timers_state.cpu_ticks_enabled) {
268         ticks += cpu_get_host_ticks();
269     }
270
271     if (timers_state.cpu_ticks_prev > ticks) {
272         /* Non increasing ticks may happen if the host uses software suspend.  */
273         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
274         ticks = timers_state.cpu_ticks_prev;
275     }
276
277     timers_state.cpu_ticks_prev = ticks;
278     return ticks;
279 }
280
281 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
282  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
283  * counter.
284  */
285 int64_t cpu_get_ticks(void)
286 {
287     int64_t ticks;
288
289     if (use_icount) {
290         return cpu_get_icount();
291     }
292
293     qemu_spin_lock(&timers_state.vm_clock_lock);
294     ticks = cpu_get_ticks_locked();
295     qemu_spin_unlock(&timers_state.vm_clock_lock);
296     return ticks;
297 }
298
299 static int64_t cpu_get_clock_locked(void)
300 {
301     int64_t time;
302
303     time = timers_state.cpu_clock_offset;
304     if (timers_state.cpu_ticks_enabled) {
305         time += get_clock();
306     }
307
308     return time;
309 }
310
311 /* Return the monotonic time elapsed in VM, i.e.,
312  * the time between vm_start and vm_stop
313  */
314 int64_t cpu_get_clock(void)
315 {
316     int64_t ti;
317     unsigned start;
318
319     do {
320         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
321         ti = cpu_get_clock_locked();
322     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
323
324     return ti;
325 }
326
327 /* enable cpu_get_ticks()
328  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
329  */
330 void cpu_enable_ticks(void)
331 {
332     seqlock_write_lock(&timers_state.vm_clock_seqlock,
333                        &timers_state.vm_clock_lock);
334     if (!timers_state.cpu_ticks_enabled) {
335         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
336         timers_state.cpu_clock_offset -= get_clock();
337         timers_state.cpu_ticks_enabled = 1;
338     }
339     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
340                        &timers_state.vm_clock_lock);
341 }
342
343 /* disable cpu_get_ticks(): the clock is stopped.  You must not call
344  * cpu_get_ticks() after that.
345  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
346  */
347 void cpu_disable_ticks(void)
348 {
349     seqlock_write_lock(&timers_state.vm_clock_seqlock,
350                        &timers_state.vm_clock_lock);
351     if (timers_state.cpu_ticks_enabled) {
352         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
353         timers_state.cpu_clock_offset = cpu_get_clock_locked();
354         timers_state.cpu_ticks_enabled = 0;
355     }
356     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
357                          &timers_state.vm_clock_lock);
358 }
359
360 /* Correlation between real and virtual time is always going to be
361    fairly approximate, so ignore small variation.
362    When the guest is idle real and virtual time will be aligned in
363    the IO wait loop.  */
364 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
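/* i.e. allow roughly 100 ms of slack between real and virtual time */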
365
366 static void icount_adjust(void)
367 {
368     int64_t cur_time;
369     int64_t cur_icount;
370     int64_t delta;
371
372     /* Protected by TimersState mutex.  */
373     static int64_t last_delta;
374
375     /* If the VM is not running, then do nothing.  */
376     if (!runstate_is_running()) {
377         return;
378     }
379
380     seqlock_write_lock(&timers_state.vm_clock_seqlock,
381                        &timers_state.vm_clock_lock);
382     cur_time = cpu_get_clock_locked();
383     cur_icount = cpu_get_icount_locked();
384
385     delta = cur_icount - cur_time;
386     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
387     if (delta > 0
388         && last_delta + ICOUNT_WOBBLE < delta * 2
389         && timers_state.icount_time_shift > 0) {
390         /* The guest is getting too far ahead.  Slow time down.  */
391         atomic_set(&timers_state.icount_time_shift,
392                    timers_state.icount_time_shift - 1);
393     }
394     if (delta < 0
395         && last_delta - ICOUNT_WOBBLE > delta * 2
396         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
397         /* The guest is getting too far behind.  Speed time up.  */
398         atomic_set(&timers_state.icount_time_shift,
399                    timers_state.icount_time_shift + 1);
400     }
401     last_delta = delta;
402     atomic_set_i64(&timers_state.qemu_icount_bias,
403                    cur_icount - (timers_state.qemu_icount
404                                  << timers_state.icount_time_shift));
405     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
406                          &timers_state.vm_clock_lock);
407 }
408
409 static void icount_adjust_rt(void *opaque)
410 {
411     timer_mod(timers_state.icount_rt_timer,
412               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
413     icount_adjust();
414 }
415
416 static void icount_adjust_vm(void *opaque)
417 {
418     timer_mod(timers_state.icount_vm_timer,
419                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
420                    NANOSECONDS_PER_SECOND / 10);
421     icount_adjust();
422 }
423
424 static int64_t qemu_icount_round(int64_t count)
425 {
426     int shift = atomic_read(&timers_state.icount_time_shift);
427     return (count + (1 << shift) - 1) >> shift;
428 }
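/*
 * Worked example: with a shift of 3, a 20 ns deadline rounds up to
 * (20 + 7) >> 3 = 3 instructions, the smallest instruction count whose
 * virtual time (24 ns) covers the deadline.
 */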
429
430 static void icount_warp_rt(void)
431 {
432     unsigned seq;
433     int64_t warp_start;
434
435     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
436      * changes from -1 to another value, so the race here is okay.
437      */
438     do {
439         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
440         warp_start = timers_state.vm_clock_warp_start;
441     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
442
443     if (warp_start == -1) {
444         return;
445     }
446
447     seqlock_write_lock(&timers_state.vm_clock_seqlock,
448                        &timers_state.vm_clock_lock);
449     if (runstate_is_running()) {
450         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
451                                             cpu_get_clock_locked());
452         int64_t warp_delta;
453
454         warp_delta = clock - timers_state.vm_clock_warp_start;
455         if (use_icount == 2) {
456             /*
457              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
458              * far ahead of real time.
459              */
460             int64_t cur_icount = cpu_get_icount_locked();
461             int64_t delta = clock - cur_icount;
462             warp_delta = MIN(warp_delta, delta);
463         }
464         atomic_set_i64(&timers_state.qemu_icount_bias,
465                        timers_state.qemu_icount_bias + warp_delta);
466     }
467     timers_state.vm_clock_warp_start = -1;
468     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
469                        &timers_state.vm_clock_lock);
470
471     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
472         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
473     }
474 }
475
476 static void icount_timer_cb(void *opaque)
477 {
478     /* No need for a checkpoint because the timer already synchronizes
479      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
480      */
481     icount_warp_rt();
482 }
483
484 void qtest_clock_warp(int64_t dest)
485 {
486     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
487     AioContext *aio_context;
488     assert(qtest_enabled());
489     aio_context = qemu_get_aio_context();
490     while (clock < dest) {
491         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
492                                                       QEMU_TIMER_ATTR_ALL);
493         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
494
495         seqlock_write_lock(&timers_state.vm_clock_seqlock,
496                            &timers_state.vm_clock_lock);
497         atomic_set_i64(&timers_state.qemu_icount_bias,
498                        timers_state.qemu_icount_bias + warp);
499         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
500                              &timers_state.vm_clock_lock);
501
502         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
503         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
504         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
505     }
506     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
507 }
508
509 void qemu_start_warp_timer(void)
510 {
511     int64_t clock;
512     int64_t deadline;
513
514     if (!use_icount) {
515         return;
516     }
517
518     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
519      * do not fire, so computing the deadline does not make sense.
520      */
521     if (!runstate_is_running()) {
522         return;
523     }
524
525     if (replay_mode != REPLAY_MODE_PLAY) {
526         if (!all_cpu_threads_idle()) {
527             return;
528         }
529
530         if (qtest_enabled()) {
531             /* When testing, qtest commands advance icount.  */
532             return;
533         }
534
535         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
536     } else {
537         /* warp clock deterministically in record/replay mode */
538         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
539             /* The vCPU is sleeping and the warp can't be started.
540                It is probably a race condition: the notification sent
541                to the vCPU was processed in advance and the vCPU went to
542                sleep.  Therefore we have to wake it up to do something. */
543             if (replay_has_checkpoint()) {
544                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
545             }
546             return;
547         }
548     }
549
550     /* We want to use the earliest deadline from ALL vm_clocks */
551     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
552     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
553                                           ~QEMU_TIMER_ATTR_EXTERNAL);
554     if (deadline < 0) {
555         static bool notified;
556         if (!icount_sleep && !notified) {
557             warn_report("icount sleep disabled and no active timers");
558             notified = true;
559         }
560         return;
561     }
562
563     if (deadline > 0) {
564         /*
565          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
566          * sleep.  Otherwise, the CPU might be waiting for a future timer
567          * interrupt to wake it up, but the interrupt never comes because
568          * the vCPU isn't running any insns and thus doesn't advance the
569          * QEMU_CLOCK_VIRTUAL.
570          */
571         if (!icount_sleep) {
572             /*
573              * We never let VCPUs sleep in no-sleep icount mode.
574              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
575              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
576              * It is useful when we want a deterministic execution time,
577              * isolated from host latencies.
578              */
579             seqlock_write_lock(&timers_state.vm_clock_seqlock,
580                                &timers_state.vm_clock_lock);
581             atomic_set_i64(&timers_state.qemu_icount_bias,
582                            timers_state.qemu_icount_bias + deadline);
583             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
584                                  &timers_state.vm_clock_lock);
585             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
586         } else {
587             /*
588              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
589              * "real" time (related to the time left until the next event) has
590              * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
591              * This keeps the warps from being visible externally; for example,
592              * you will not be sending network packets continuously instead of
593              * every 100ms.
594              */
595             seqlock_write_lock(&timers_state.vm_clock_seqlock,
596                                &timers_state.vm_clock_lock);
597             if (timers_state.vm_clock_warp_start == -1
598                 || timers_state.vm_clock_warp_start > clock) {
599                 timers_state.vm_clock_warp_start = clock;
600             }
601             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
602                                  &timers_state.vm_clock_lock);
603             timer_mod_anticipate(timers_state.icount_warp_timer,
604                                  clock + deadline);
605         }
606     } else if (deadline == 0) {
607         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
608     }
609 }
610
611 static void qemu_account_warp_timer(void)
612 {
613     if (!use_icount || !icount_sleep) {
614         return;
615     }
616
617     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
618      * do not fire, so computing the deadline does not make sense.
619      */
620     if (!runstate_is_running()) {
621         return;
622     }
623
624     /* warp clock deterministically in record/replay mode */
625     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
626         return;
627     }
628
629     timer_del(timers_state.icount_warp_timer);
630     icount_warp_rt();
631 }
632
633 static bool icount_state_needed(void *opaque)
634 {
635     return use_icount;
636 }
637
638 static bool warp_timer_state_needed(void *opaque)
639 {
640     TimersState *s = opaque;
641     return s->icount_warp_timer != NULL;
642 }
643
644 static bool adjust_timers_state_needed(void *opaque)
645 {
646     TimersState *s = opaque;
647     return s->icount_rt_timer != NULL;
648 }
649
650 /*
651  * The subsection for warp timer migration is optional because the timer may not have been created.
652  */
653 static const VMStateDescription icount_vmstate_warp_timer = {
654     .name = "timer/icount/warp_timer",
655     .version_id = 1,
656     .minimum_version_id = 1,
657     .needed = warp_timer_state_needed,
658     .fields = (VMStateField[]) {
659         VMSTATE_INT64(vm_clock_warp_start, TimersState),
660         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
661         VMSTATE_END_OF_LIST()
662     }
663 };
664
665 static const VMStateDescription icount_vmstate_adjust_timers = {
666     .name = "timer/icount/timers",
667     .version_id = 1,
668     .minimum_version_id = 1,
669     .needed = adjust_timers_state_needed,
670     .fields = (VMStateField[]) {
671         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
672         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
673         VMSTATE_END_OF_LIST()
674     }
675 };
676
677 /*
678  * This is a subsection for icount migration.
679  */
680 static const VMStateDescription icount_vmstate_timers = {
681     .name = "timer/icount",
682     .version_id = 1,
683     .minimum_version_id = 1,
684     .needed = icount_state_needed,
685     .fields = (VMStateField[]) {
686         VMSTATE_INT64(qemu_icount_bias, TimersState),
687         VMSTATE_INT64(qemu_icount, TimersState),
688         VMSTATE_END_OF_LIST()
689     },
690     .subsections = (const VMStateDescription*[]) {
691         &icount_vmstate_warp_timer,
692         &icount_vmstate_adjust_timers,
693         NULL
694     }
695 };
696
697 static const VMStateDescription vmstate_timers = {
698     .name = "timer",
699     .version_id = 2,
700     .minimum_version_id = 1,
701     .fields = (VMStateField[]) {
702         VMSTATE_INT64(cpu_ticks_offset, TimersState),
703         VMSTATE_UNUSED(8),
704         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
705         VMSTATE_END_OF_LIST()
706     },
707     .subsections = (const VMStateDescription*[]) {
708         &icount_vmstate_timers,
709         NULL
710     }
711 };
712
713 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
714 {
715     double pct;
716     double throttle_ratio;
717     int64_t sleeptime_ns, endtime_ns;
718
719     if (!cpu_throttle_get_percentage()) {
720         return;
721     }
722
723     pct = (double)cpu_throttle_get_percentage()/100;
724     throttle_ratio = pct / (1 - pct);
725     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
726     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
727     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
728     while (sleeptime_ns > 0 && !cpu->stop) {
729         if (sleeptime_ns > SCALE_MS) {
730             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
731                                 sleeptime_ns / SCALE_MS);
732         } else {
733             qemu_mutex_unlock_iothread();
734             g_usleep(sleeptime_ns / SCALE_US);
735             qemu_mutex_lock_iothread();
736         }
737         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
738     }
739     atomic_set(&cpu->throttle_thread_scheduled, 0);
740 }
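/*
 * Worked example: at 75% throttle, pct = 0.75 and throttle_ratio = 3, so the
 * vCPU sleeps roughly 30 ms for every 10 ms timeslice of run time, leaving it
 * about 25% of wall-clock time to execute.
 */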
741
742 static void cpu_throttle_timer_tick(void *opaque)
743 {
744     CPUState *cpu;
745     double pct;
746
747     /* Stop the timer if needed */
748     if (!cpu_throttle_get_percentage()) {
749         return;
750     }
751     CPU_FOREACH(cpu) {
752         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
753             async_run_on_cpu(cpu, cpu_throttle_thread,
754                              RUN_ON_CPU_NULL);
755         }
756     }
757
758     pct = (double)cpu_throttle_get_percentage()/100;
759     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
760                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
761 }
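/*
 * Continuing the example above: at 75% throttle the timer re-arms every
 * CPU_THROTTLE_TIMESLICE_NS / (1 - 0.75) = 40 ms, so each period contains
 * about 10 ms of run time and 30 ms of enforced sleep.
 */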
762
763 void cpu_throttle_set(int new_throttle_pct)
764 {
765     /* Ensure throttle percentage is within valid range */
766     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
767     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
768
769     atomic_set(&throttle_percentage, new_throttle_pct);
770
771     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
772                                        CPU_THROTTLE_TIMESLICE_NS);
773 }
774
775 void cpu_throttle_stop(void)
776 {
777     atomic_set(&throttle_percentage, 0);
778 }
779
780 bool cpu_throttle_active(void)
781 {
782     return (cpu_throttle_get_percentage() != 0);
783 }
784
785 int cpu_throttle_get_percentage(void)
786 {
787     return atomic_read(&throttle_percentage);
788 }
789
790 void cpu_ticks_init(void)
791 {
792     seqlock_init(&timers_state.vm_clock_seqlock);
793     qemu_spin_init(&timers_state.vm_clock_lock);
794     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
795     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
796                                            cpu_throttle_timer_tick, NULL);
797 }
798
799 void configure_icount(QemuOpts *opts, Error **errp)
800 {
801     const char *option = qemu_opt_get(opts, "shift");
802     bool sleep = qemu_opt_get_bool(opts, "sleep", true);
803     bool align = qemu_opt_get_bool(opts, "align", false);
804     long time_shift = -1;
805
806     if (!option && qemu_opt_get(opts, "align")) {
807         error_setg(errp, "Please specify shift option when using align");
808         return;
809     }
810
811     if (align && !sleep) {
812         error_setg(errp, "align=on and sleep=off are incompatible");
813         return;
814     }
815
816     if (strcmp(option, "auto") != 0) {
817         if (qemu_strtol(option, NULL, 0, &time_shift) < 0
818             || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
819             error_setg(errp, "icount: Invalid shift value");
820             return;
821         }
822     } else if (icount_align_option) {
823         error_setg(errp, "shift=auto and align=on are incompatible");
824         return;
825     } else if (!icount_sleep) {
826         error_setg(errp, "shift=auto and sleep=off are incompatible");
827         return;
828     }
829
830     icount_sleep = sleep;
831     if (icount_sleep) {
832         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
833                                          icount_timer_cb, NULL);
834     }
835
836     icount_align_option = align;
837
838     if (time_shift >= 0) {
839         timers_state.icount_time_shift = time_shift;
840         use_icount = 1;
841         return;
842     }
843
844     use_icount = 2;
845
846     /* 125MIPS seems a reasonable initial guess at the guest speed.
847        It will be corrected fairly quickly anyway.  */
848     timers_state.icount_time_shift = 3;
849
850     /* Have both realtime and virtual time triggers for speed adjustment.
851        The realtime trigger catches emulated time passing too slowly,
852        the virtual time trigger catches emulated time passing too fast.
853        Realtime triggers occur even when idle, so use them less frequently
854        than VM triggers.  */
855     timers_state.vm_clock_warp_start = -1;
856     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
857                                    icount_adjust_rt, NULL);
858     timer_mod(timers_state.icount_rt_timer,
859                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
860     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
861                                         icount_adjust_vm, NULL);
862     timer_mod(timers_state.icount_vm_timer,
863                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
864                    NANOSECONDS_PER_SECOND / 10);
865 }
866
867 /***********************************************************/
868 /* TCG vCPU kick timer
869  *
870  * The kick timer is responsible for moving single-threaded vCPU
871  * emulation on to the next vCPU. If more than one vCPU is running, a
872  * timer event will force a cpu->exit so the next vCPU can get
873  * scheduled.
874  *
875  * The timer is removed while all vCPUs are idle and restarted again
876  * once a vCPU becomes runnable.
877  */
878
879 static QEMUTimer *tcg_kick_vcpu_timer;
880 static CPUState *tcg_current_rr_cpu;
881
882 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
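/* i.e. in round-robin mode the currently running vCPU is kicked every 100 ms */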
883
884 static inline int64_t qemu_tcg_next_kick(void)
885 {
886     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
887 }
888
889 /* Kick the currently scheduled round-robin vCPU on to the next one */
890 static void qemu_cpu_kick_rr_next_cpu(void)
891 {
892     CPUState *cpu;
893     do {
894         cpu = atomic_mb_read(&tcg_current_rr_cpu);
895         if (cpu) {
896             cpu_exit(cpu);
897         }
898     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
899 }
900
901 /* Kick all RR vCPUs */
902 static void qemu_cpu_kick_rr_cpus(void)
903 {
904     CPUState *cpu;
905
906     CPU_FOREACH(cpu) {
907         cpu_exit(cpu);
908     };
909 }
910
911 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
912 {
913 }
914
915 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
916 {
917     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
918         qemu_notify_event();
919         return;
920     }
921
922     if (qemu_in_vcpu_thread()) {
923         /* A CPU is currently running; kick it back out to the
924          * tcg_cpu_exec() loop so it will recalculate its
925          * icount deadline immediately.
926          */
927         qemu_cpu_kick(current_cpu);
928     } else if (first_cpu) {
929         /* qemu_cpu_kick is not enough to kick a halted CPU out of
930          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
931          * causes cpu_thread_is_idle to return false.  This way,
932          * handle_icount_deadline can run.
933          * If we have no CPUs at all for some reason, we don't
934          * need to do anything.
935          */
936         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
937     }
938 }
939
940 static void kick_tcg_thread(void *opaque)
941 {
942     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
943     qemu_cpu_kick_rr_next_cpu();
944 }
945
946 static void start_tcg_kick_timer(void)
947 {
948     assert(!mttcg_enabled);
949     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
950         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
951                                            kick_tcg_thread, NULL);
952     }
953     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
954         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
955     }
956 }
957
958 static void stop_tcg_kick_timer(void)
959 {
960     assert(!mttcg_enabled);
961     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
962         timer_del(tcg_kick_vcpu_timer);
963     }
964 }
965
966 /***********************************************************/
967 void hw_error(const char *fmt, ...)
968 {
969     va_list ap;
970     CPUState *cpu;
971
972     va_start(ap, fmt);
973     fprintf(stderr, "qemu: hardware error: ");
974     vfprintf(stderr, fmt, ap);
975     fprintf(stderr, "\n");
976     CPU_FOREACH(cpu) {
977         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
978         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
979     }
980     va_end(ap);
981     abort();
982 }
983
984 void cpu_synchronize_all_states(void)
985 {
986     CPUState *cpu;
987
988     CPU_FOREACH(cpu) {
989         cpu_synchronize_state(cpu);
990         /* TODO: move to cpu_synchronize_state() */
991         if (hvf_enabled()) {
992             hvf_cpu_synchronize_state(cpu);
993         }
994     }
995 }
996
997 void cpu_synchronize_all_post_reset(void)
998 {
999     CPUState *cpu;
1000
1001     CPU_FOREACH(cpu) {
1002         cpu_synchronize_post_reset(cpu);
1003         /* TODO: move to cpu_synchronize_post_reset() */
1004         if (hvf_enabled()) {
1005             hvf_cpu_synchronize_post_reset(cpu);
1006         }
1007     }
1008 }
1009
1010 void cpu_synchronize_all_post_init(void)
1011 {
1012     CPUState *cpu;
1013
1014     CPU_FOREACH(cpu) {
1015         cpu_synchronize_post_init(cpu);
1016         /* TODO: move to cpu_synchronize_post_init() */
1017         if (hvf_enabled()) {
1018             hvf_cpu_synchronize_post_init(cpu);
1019         }
1020     }
1021 }
1022
1023 void cpu_synchronize_all_pre_loadvm(void)
1024 {
1025     CPUState *cpu;
1026
1027     CPU_FOREACH(cpu) {
1028         cpu_synchronize_pre_loadvm(cpu);
1029     }
1030 }
1031
1032 static int do_vm_stop(RunState state, bool send_stop)
1033 {
1034     int ret = 0;
1035
1036     if (runstate_is_running()) {
1037         runstate_set(state);
1038         cpu_disable_ticks();
1039         pause_all_vcpus();
1040         vm_state_notify(0, state);
1041         if (send_stop) {
1042             qapi_event_send_stop();
1043         }
1044     }
1045
1046     bdrv_drain_all();
1047     ret = bdrv_flush_all();
1048
1049     return ret;
1050 }
1051
1052 /* Special vm_stop() variant for terminating the process.  Historically clients
1053  * did not expect a QMP STOP event and so we need to retain compatibility.
1054  */
1055 int vm_shutdown(void)
1056 {
1057     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1058 }
1059
1060 static bool cpu_can_run(CPUState *cpu)
1061 {
1062     if (cpu->stop) {
1063         return false;
1064     }
1065     if (cpu_is_stopped(cpu)) {
1066         return false;
1067     }
1068     return true;
1069 }
1070
1071 static void cpu_handle_guest_debug(CPUState *cpu)
1072 {
1073     gdb_set_stop_cpu(cpu);
1074     qemu_system_debug_request();
1075     cpu->stopped = true;
1076 }
1077
1078 #ifdef CONFIG_LINUX
1079 static void sigbus_reraise(void)
1080 {
1081     sigset_t set;
1082     struct sigaction action;
1083
1084     memset(&action, 0, sizeof(action));
1085     action.sa_handler = SIG_DFL;
1086     if (!sigaction(SIGBUS, &action, NULL)) {
1087         raise(SIGBUS);
1088         sigemptyset(&set);
1089         sigaddset(&set, SIGBUS);
1090         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1091     }
1092     perror("Failed to re-raise SIGBUS!\n");
1093     abort();
1094 }
1095
1096 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1097 {
1098     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1099         sigbus_reraise();
1100     }
1101
1102     if (current_cpu) {
1103         /* Called asynchronously in VCPU thread.  */
1104         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1105             sigbus_reraise();
1106         }
1107     } else {
1108         /* Called synchronously (via signalfd) in main thread.  */
1109         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1110             sigbus_reraise();
1111         }
1112     }
1113 }
1114
1115 static void qemu_init_sigbus(void)
1116 {
1117     struct sigaction action;
1118
1119     memset(&action, 0, sizeof(action));
1120     action.sa_flags = SA_SIGINFO;
1121     action.sa_sigaction = sigbus_handler;
1122     sigaction(SIGBUS, &action, NULL);
1123
1124     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1125 }
1126 #else /* !CONFIG_LINUX */
1127 static void qemu_init_sigbus(void)
1128 {
1129 }
1130 #endif /* !CONFIG_LINUX */
1131
1132 static QemuThread io_thread;
1133
1134 /* cpu creation */
1135 static QemuCond qemu_cpu_cond;
1136 /* system init */
1137 static QemuCond qemu_pause_cond;
1138
1139 void qemu_init_cpu_loop(void)
1140 {
1141     qemu_init_sigbus();
1142     qemu_cond_init(&qemu_cpu_cond);
1143     qemu_cond_init(&qemu_pause_cond);
1144     qemu_mutex_init(&qemu_global_mutex);
1145
1146     qemu_thread_get_self(&io_thread);
1147 }
1148
1149 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1150 {
1151     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1152 }
1153
1154 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1155 {
1156     if (kvm_destroy_vcpu(cpu) < 0) {
1157         error_report("kvm_destroy_vcpu failed");
1158         exit(EXIT_FAILURE);
1159     }
1160 }
1161
1162 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1163 {
1164 }
1165
1166 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1167 {
1168     g_assert(qemu_cpu_is_self(cpu));
1169     cpu->stop = false;
1170     cpu->stopped = true;
1171     if (exit) {
1172         cpu_exit(cpu);
1173     }
1174     qemu_cond_broadcast(&qemu_pause_cond);
1175 }
1176
1177 static void qemu_wait_io_event_common(CPUState *cpu)
1178 {
1179     atomic_mb_set(&cpu->thread_kicked, false);
1180     if (cpu->stop) {
1181         qemu_cpu_stop(cpu, false);
1182     }
1183     process_queued_cpu_work(cpu);
1184 }
1185
1186 static void qemu_tcg_rr_wait_io_event(void)
1187 {
1188     CPUState *cpu;
1189
1190     while (all_cpu_threads_idle()) {
1191         stop_tcg_kick_timer();
1192         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1193     }
1194
1195     start_tcg_kick_timer();
1196
1197     CPU_FOREACH(cpu) {
1198         qemu_wait_io_event_common(cpu);
1199     }
1200 }
1201
1202 static void qemu_wait_io_event(CPUState *cpu)
1203 {
1204     bool slept = false;
1205
1206     while (cpu_thread_is_idle(cpu)) {
1207         if (!slept) {
1208             slept = true;
1209             qemu_plugin_vcpu_idle_cb(cpu);
1210         }
1211         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1212     }
1213     if (slept) {
1214         qemu_plugin_vcpu_resume_cb(cpu);
1215     }
1216
1217 #ifdef _WIN32
1218     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1219     if (!tcg_enabled()) {
1220         SleepEx(0, TRUE);
1221     }
1222 #endif
1223     qemu_wait_io_event_common(cpu);
1224 }
1225
1226 static void *qemu_kvm_cpu_thread_fn(void *arg)
1227 {
1228     CPUState *cpu = arg;
1229     int r;
1230
1231     rcu_register_thread();
1232
1233     qemu_mutex_lock_iothread();
1234     qemu_thread_get_self(cpu->thread);
1235     cpu->thread_id = qemu_get_thread_id();
1236     cpu->can_do_io = 1;
1237     current_cpu = cpu;
1238
1239     r = kvm_init_vcpu(cpu);
1240     if (r < 0) {
1241         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1242         exit(1);
1243     }
1244
1245     kvm_init_cpu_signals(cpu);
1246
1247     /* signal CPU creation */
1248     cpu->created = true;
1249     qemu_cond_signal(&qemu_cpu_cond);
1250     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1251
1252     do {
1253         if (cpu_can_run(cpu)) {
1254             r = kvm_cpu_exec(cpu);
1255             if (r == EXCP_DEBUG) {
1256                 cpu_handle_guest_debug(cpu);
1257             }
1258         }
1259         qemu_wait_io_event(cpu);
1260     } while (!cpu->unplug || cpu_can_run(cpu));
1261
1262     qemu_kvm_destroy_vcpu(cpu);
1263     cpu->created = false;
1264     qemu_cond_signal(&qemu_cpu_cond);
1265     qemu_mutex_unlock_iothread();
1266     rcu_unregister_thread();
1267     return NULL;
1268 }
1269
1270 static void *qemu_dummy_cpu_thread_fn(void *arg)
1271 {
1272 #ifdef _WIN32
1273     error_report("qtest is not supported under Windows");
1274     exit(1);
1275 #else
1276     CPUState *cpu = arg;
1277     sigset_t waitset;
1278     int r;
1279
1280     rcu_register_thread();
1281
1282     qemu_mutex_lock_iothread();
1283     qemu_thread_get_self(cpu->thread);
1284     cpu->thread_id = qemu_get_thread_id();
1285     cpu->can_do_io = 1;
1286     current_cpu = cpu;
1287
1288     sigemptyset(&waitset);
1289     sigaddset(&waitset, SIG_IPI);
1290
1291     /* signal CPU creation */
1292     cpu->created = true;
1293     qemu_cond_signal(&qemu_cpu_cond);
1294     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1295
1296     do {
1297         qemu_mutex_unlock_iothread();
1298         do {
1299             int sig;
1300             r = sigwait(&waitset, &sig);
1301         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1302         if (r == -1) {
1303             perror("sigwait");
1304             exit(1);
1305         }
1306         qemu_mutex_lock_iothread();
1307         qemu_wait_io_event(cpu);
1308     } while (!cpu->unplug);
1309
1310     qemu_mutex_unlock_iothread();
1311     rcu_unregister_thread();
1312     return NULL;
1313 #endif
1314 }
1315
1316 static int64_t tcg_get_icount_limit(void)
1317 {
1318     int64_t deadline;
1319
1320     if (replay_mode != REPLAY_MODE_PLAY) {
1321         /*
1322          * Include all the timers, because they may need attention.
1323          * Overly long CPU execution may create unnecessary delays in the UI.
1324          */
1325         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1326                                               QEMU_TIMER_ATTR_ALL);
1327         /* Check realtime timers, because they help with input processing */
1328         deadline = qemu_soonest_timeout(deadline,
1329                 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1330                                            QEMU_TIMER_ATTR_ALL));
1331
1332         /* Maintain prior (possibly buggy) behaviour where if no deadline
1333          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1334          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1335          * nanoseconds.
1336          */
1337         if ((deadline < 0) || (deadline > INT32_MAX)) {
1338             deadline = INT32_MAX;
1339         }
1340
1341         return qemu_icount_round(deadline);
1342     } else {
1343         return replay_get_instructions();
1344     }
1345 }
1346
1347 static void handle_icount_deadline(void)
1348 {
1349     assert(qemu_in_vcpu_thread());
1350     if (use_icount) {
1351         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1352                                                       QEMU_TIMER_ATTR_ALL);
1353
1354         if (deadline == 0) {
1355             /* Wake up other AioContexts.  */
1356             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1357             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1358         }
1359     }
1360 }
1361
1362 static void prepare_icount_for_run(CPUState *cpu)
1363 {
1364     if (use_icount) {
1365         int insns_left;
1366
1367         /* These should always be cleared by process_icount_data after
1368          * each vCPU execution. However u16.high can be raised
1369          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1370          */
1371         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1372         g_assert(cpu->icount_extra == 0);
1373
1374         cpu->icount_budget = tcg_get_icount_limit();
1375         insns_left = MIN(0xffff, cpu->icount_budget);
1376         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1377         cpu->icount_extra = cpu->icount_budget - insns_left;
1378
1379         replay_mutex_lock();
1380     }
1381 }
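/*
 * The 16-bit u16.low field is the decrementer embedded in TCG-generated code,
 * so only up to 0xffff instructions can be scheduled per pass; any remaining
 * budget is parked in icount_extra for the execution loop to consume.
 */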
1382
1383 static void process_icount_data(CPUState *cpu)
1384 {
1385     if (use_icount) {
1386         /* Account for executed instructions */
1387         cpu_update_icount(cpu);
1388
1389         /* Reset the counters */
1390         cpu_neg(cpu)->icount_decr.u16.low = 0;
1391         cpu->icount_extra = 0;
1392         cpu->icount_budget = 0;
1393
1394         replay_account_executed_instructions();
1395
1396         replay_mutex_unlock();
1397     }
1398 }
1399
1400
1401 static int tcg_cpu_exec(CPUState *cpu)
1402 {
1403     int ret;
1404 #ifdef CONFIG_PROFILER
1405     int64_t ti;
1406 #endif
1407
1408     assert(tcg_enabled());
1409 #ifdef CONFIG_PROFILER
1410     ti = profile_getclock();
1411 #endif
1412     cpu_exec_start(cpu);
1413     ret = cpu_exec(cpu);
1414     cpu_exec_end(cpu);
1415 #ifdef CONFIG_PROFILER
1416     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1417                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1418 #endif
1419     return ret;
1420 }
1421
1422 /* Destroy any remaining vCPUs which have been unplugged and have
1423  * finished running
1424  */
1425 static void deal_with_unplugged_cpus(void)
1426 {
1427     CPUState *cpu;
1428
1429     CPU_FOREACH(cpu) {
1430         if (cpu->unplug && !cpu_can_run(cpu)) {
1431             qemu_tcg_destroy_vcpu(cpu);
1432             cpu->created = false;
1433             qemu_cond_signal(&qemu_cpu_cond);
1434             break;
1435         }
1436     }
1437 }
1438
1439 /* Single-threaded TCG
1440  *
1441  * In the single-threaded case each vCPU is simulated in turn. If
1442  * there is more than a single vCPU we create a simple timer to kick
1443  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1444  * This is done explicitly rather than relying on side-effects
1445  * elsewhere.
1446  */
1447
1448 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1449 {
1450     CPUState *cpu = arg;
1451
1452     assert(tcg_enabled());
1453     rcu_register_thread();
1454     tcg_register_thread();
1455
1456     qemu_mutex_lock_iothread();
1457     qemu_thread_get_self(cpu->thread);
1458
1459     cpu->thread_id = qemu_get_thread_id();
1460     cpu->created = true;
1461     cpu->can_do_io = 1;
1462     qemu_cond_signal(&qemu_cpu_cond);
1463     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1464
1465     /* wait for initial kick-off after machine start */
1466     while (first_cpu->stopped) {
1467         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1468
1469         /* process any pending work */
1470         CPU_FOREACH(cpu) {
1471             current_cpu = cpu;
1472             qemu_wait_io_event_common(cpu);
1473         }
1474     }
1475
1476     start_tcg_kick_timer();
1477
1478     cpu = first_cpu;
1479
1480     /* process any pending work */
1481     cpu->exit_request = 1;
1482
1483     while (1) {
1484         qemu_mutex_unlock_iothread();
1485         replay_mutex_lock();
1486         qemu_mutex_lock_iothread();
1487         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1488         qemu_account_warp_timer();
1489
1490         /* Run the timers here.  This is much more efficient than
1491          * waking up the I/O thread and waiting for completion.
1492          */
1493         handle_icount_deadline();
1494
1495         replay_mutex_unlock();
1496
1497         if (!cpu) {
1498             cpu = first_cpu;
1499         }
1500
1501         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1502
1503             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1504             current_cpu = cpu;
1505
1506             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1507                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1508
1509             if (cpu_can_run(cpu)) {
1510                 int r;
1511
1512                 qemu_mutex_unlock_iothread();
1513                 prepare_icount_for_run(cpu);
1514
1515                 r = tcg_cpu_exec(cpu);
1516
1517                 process_icount_data(cpu);
1518                 qemu_mutex_lock_iothread();
1519
1520                 if (r == EXCP_DEBUG) {
1521                     cpu_handle_guest_debug(cpu);
1522                     break;
1523                 } else if (r == EXCP_ATOMIC) {
1524                     qemu_mutex_unlock_iothread();
1525                     cpu_exec_step_atomic(cpu);
1526                     qemu_mutex_lock_iothread();
1527                     break;
1528                 }
1529             } else if (cpu->stop) {
1530                 if (cpu->unplug) {
1531                     cpu = CPU_NEXT(cpu);
1532                 }
1533                 break;
1534             }
1535
1536             cpu = CPU_NEXT(cpu);
1537         } /* while (cpu && !cpu->exit_request).. */
1538
1539         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1540         atomic_set(&tcg_current_rr_cpu, NULL);
1541
1542         if (cpu && cpu->exit_request) {
1543             atomic_mb_set(&cpu->exit_request, 0);
1544         }
1545
1546         if (use_icount && all_cpu_threads_idle()) {
1547             /*
1548              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1549              * in the main_loop, wake it up in order to start the warp timer.
1550              */
1551             qemu_notify_event();
1552         }
1553
1554         qemu_tcg_rr_wait_io_event();
1555         deal_with_unplugged_cpus();
1556     }
1557
1558     rcu_unregister_thread();
1559     return NULL;
1560 }
1561
1562 static void *qemu_hax_cpu_thread_fn(void *arg)
1563 {
1564     CPUState *cpu = arg;
1565     int r;
1566
1567     rcu_register_thread();
1568     qemu_mutex_lock_iothread();
1569     qemu_thread_get_self(cpu->thread);
1570
1571     cpu->thread_id = qemu_get_thread_id();
1572     cpu->created = true;
1573     current_cpu = cpu;
1574
1575     hax_init_vcpu(cpu);
1576     qemu_cond_signal(&qemu_cpu_cond);
1577     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1578
1579     do {
1580         if (cpu_can_run(cpu)) {
1581             r = hax_smp_cpu_exec(cpu);
1582             if (r == EXCP_DEBUG) {
1583                 cpu_handle_guest_debug(cpu);
1584             }
1585         }
1586
1587         qemu_wait_io_event(cpu);
1588     } while (!cpu->unplug || cpu_can_run(cpu));
1589     rcu_unregister_thread();
1590     return NULL;
1591 }
1592
1593 /* The HVF-specific vCPU thread function. This one should only run when the host
1594  * CPU supports the VMX "unrestricted guest" feature. */
1595 static void *qemu_hvf_cpu_thread_fn(void *arg)
1596 {
1597     CPUState *cpu = arg;
1598
1599     int r;
1600
1601     assert(hvf_enabled());
1602
1603     rcu_register_thread();
1604
1605     qemu_mutex_lock_iothread();
1606     qemu_thread_get_self(cpu->thread);
1607
1608     cpu->thread_id = qemu_get_thread_id();
1609     cpu->can_do_io = 1;
1610     current_cpu = cpu;
1611
1612     hvf_init_vcpu(cpu);
1613
1614     /* signal CPU creation */
1615     cpu->created = true;
1616     qemu_cond_signal(&qemu_cpu_cond);
1617     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1618
1619     do {
1620         if (cpu_can_run(cpu)) {
1621             r = hvf_vcpu_exec(cpu);
1622             if (r == EXCP_DEBUG) {
1623                 cpu_handle_guest_debug(cpu);
1624             }
1625         }
1626         qemu_wait_io_event(cpu);
1627     } while (!cpu->unplug || cpu_can_run(cpu));
1628
1629     hvf_vcpu_destroy(cpu);
1630     cpu->created = false;
1631     qemu_cond_signal(&qemu_cpu_cond);
1632     qemu_mutex_unlock_iothread();
1633     rcu_unregister_thread();
1634     return NULL;
1635 }
1636
1637 static void *qemu_whpx_cpu_thread_fn(void *arg)
1638 {
1639     CPUState *cpu = arg;
1640     int r;
1641
1642     rcu_register_thread();
1643
1644     qemu_mutex_lock_iothread();
1645     qemu_thread_get_self(cpu->thread);
1646     cpu->thread_id = qemu_get_thread_id();
1647     current_cpu = cpu;
1648
1649     r = whpx_init_vcpu(cpu);
1650     if (r < 0) {
1651         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1652         exit(1);
1653     }
1654
1655     /* signal CPU creation */
1656     cpu->created = true;
1657     qemu_cond_signal(&qemu_cpu_cond);
1658     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1659
1660     do {
1661         if (cpu_can_run(cpu)) {
1662             r = whpx_vcpu_exec(cpu);
1663             if (r == EXCP_DEBUG) {
1664                 cpu_handle_guest_debug(cpu);
1665             }
1666         }
1667         while (cpu_thread_is_idle(cpu)) {
1668             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1669         }
1670         qemu_wait_io_event_common(cpu);
1671     } while (!cpu->unplug || cpu_can_run(cpu));
1672
1673     whpx_destroy_vcpu(cpu);
1674     cpu->created = false;
1675     qemu_cond_signal(&qemu_cpu_cond);
1676     qemu_mutex_unlock_iothread();
1677     rcu_unregister_thread();
1678     return NULL;
1679 }
1680
1681 #ifdef _WIN32
1682 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1683 {
1684 }
1685 #endif
1686
1687 /* Multi-threaded TCG
1688  *
1689  * In the multi-threaded case each vCPU has its own thread. The TLS
1690  * variable current_cpu can be used deep in the code to find the
1691  * current CPUState for a given thread.
1692  */
1693
1694 static void *qemu_tcg_cpu_thread_fn(void *arg)
1695 {
1696     CPUState *cpu = arg;
1697
1698     assert(tcg_enabled());
1699     g_assert(!use_icount);
1700
1701     rcu_register_thread();
1702     tcg_register_thread();
1703
1704     qemu_mutex_lock_iothread();
1705     qemu_thread_get_self(cpu->thread);
1706
1707     cpu->thread_id = qemu_get_thread_id();
1708     cpu->created = true;
1709     cpu->can_do_io = 1;
1710     current_cpu = cpu;
1711     qemu_cond_signal(&qemu_cpu_cond);
1712     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1713
1714     /* process any pending work */
1715     cpu->exit_request = 1;
1716
1717     do {
1718         if (cpu_can_run(cpu)) {
1719             int r;
1720             qemu_mutex_unlock_iothread();
1721             r = tcg_cpu_exec(cpu);
1722             qemu_mutex_lock_iothread();
1723             switch (r) {
1724             case EXCP_DEBUG:
1725                 cpu_handle_guest_debug(cpu);
1726                 break;
1727             case EXCP_HALTED:
1728                 /* During start-up the vCPU is reset and the thread is
1729                  * kicked several times. If we don't ensure we go back
1730                  * to sleep in the halted state we won't cleanly
1731                  * start up when the vCPU is enabled.
1732                  *
1733                  * cpu->halted should ensure we sleep in wait_io_event
1734                  */
1735                 g_assert(cpu->halted);
1736                 break;
1737             case EXCP_ATOMIC:
1738                 qemu_mutex_unlock_iothread();
1739                 cpu_exec_step_atomic(cpu);
1740                 qemu_mutex_lock_iothread();
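                /* fall through */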
1741             default:
1742                 /* Ignore everything else? */
1743                 break;
1744             }
1745         }
1746
1747         atomic_mb_set(&cpu->exit_request, 0);
1748         qemu_wait_io_event(cpu);
1749     } while (!cpu->unplug || cpu_can_run(cpu));
1750
1751     qemu_tcg_destroy_vcpu(cpu);
1752     cpu->created = false;
1753     qemu_cond_signal(&qemu_cpu_cond);
1754     qemu_mutex_unlock_iothread();
1755     rcu_unregister_thread();
1756     return NULL;
1757 }
1758
1759 static void qemu_cpu_kick_thread(CPUState *cpu)
1760 {
1761 #ifndef _WIN32
1762     int err;
1763
1764     if (cpu->thread_kicked) {
1765         return;
1766     }
1767     cpu->thread_kicked = true;
1768     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1769     if (err && err != ESRCH) {
1770         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1771         exit(1);
1772     }
1773 #else /* _WIN32 */
1774     if (!qemu_cpu_is_self(cpu)) {
1775         if (whpx_enabled()) {
1776             whpx_vcpu_kick(cpu);
1777         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1778             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1779                     __func__, GetLastError());
1780             exit(1);
1781         }
1782     }
1783 #endif
1784 }
1785
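/*
 * Kick a vCPU: wake it if it is sleeping on halt_cond and request that
 * it drop out of guest execution so pending work can be handled.
 */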
1786 void qemu_cpu_kick(CPUState *cpu)
1787 {
1788     qemu_cond_broadcast(cpu->halt_cond);
1789     if (tcg_enabled()) {
1790         if (qemu_tcg_mttcg_enabled()) {
1791             cpu_exit(cpu);
1792         } else {
1793             qemu_cpu_kick_rr_cpus();
1794         }
1795     } else {
1796         if (hax_enabled()) {
1797             /*
1798              * FIXME: race condition with the exit_request check in
1799              * hax_vcpu_hax_exec
1800              */
1801             cpu->exit_request = 1;
1802         }
1803         qemu_cpu_kick_thread(cpu);
1804     }
1805 }
1806
1807 void qemu_cpu_kick_self(void)
1808 {
1809     assert(current_cpu);
1810     qemu_cpu_kick_thread(current_cpu);
1811 }
1812
1813 bool qemu_cpu_is_self(CPUState *cpu)
1814 {
1815     return qemu_thread_is_self(cpu->thread);
1816 }
1817
1818 bool qemu_in_vcpu_thread(void)
1819 {
1820     return current_cpu && qemu_cpu_is_self(current_cpu);
1821 }
1822
1823 static __thread bool iothread_locked = false;
1824
1825 bool qemu_mutex_iothread_locked(void)
1826 {
1827     return iothread_locked;
1828 }
1829
1830 /*
1831  * The BQL is taken from so many places that it is worth profiling the
1832  * callers directly, instead of funneling them all through a single function.
1833  */
1834 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1835 {
1836     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1837
1838     g_assert(!qemu_mutex_iothread_locked());
1839     bql_lock(&qemu_global_mutex, file, line);
1840     iothread_locked = true;
1841 }
1842
1843 void qemu_mutex_unlock_iothread(void)
1844 {
1845     g_assert(qemu_mutex_iothread_locked());
1846     iothread_locked = false;
1847     qemu_mutex_unlock(&qemu_global_mutex);
1848 }
1849
1850 void qemu_cond_wait_iothread(QemuCond *cond)
1851 {
1852     qemu_cond_wait(cond, &qemu_global_mutex);
1853 }
1854
1855 static bool all_vcpus_paused(void)
1856 {
1857     CPUState *cpu;
1858
1859     CPU_FOREACH(cpu) {
1860         if (!cpu->stopped) {
1861             return false;
1862         }
1863     }
1864
1865     return true;
1866 }
1867
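/*
 * Stop all vCPUs and wait until every thread has acknowledged by
 * setting cpu->stopped.  The replay lock is dropped while waiting so
 * that woken vCPU threads can finish their replay work.
 */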
1868 void pause_all_vcpus(void)
1869 {
1870     CPUState *cpu;
1871
1872     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1873     CPU_FOREACH(cpu) {
1874         if (qemu_cpu_is_self(cpu)) {
1875             qemu_cpu_stop(cpu, true);
1876         } else {
1877             cpu->stop = true;
1878             qemu_cpu_kick(cpu);
1879         }
1880     }
1881
1882     /* We need to drop the replay_lock so any vCPU threads woken up
1883      * can finish their replay tasks
1884      */
1885     replay_mutex_unlock();
1886
1887     while (!all_vcpus_paused()) {
1888         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1889         CPU_FOREACH(cpu) {
1890             qemu_cpu_kick(cpu);
1891         }
1892     }
1893
1894     qemu_mutex_unlock_iothread();
1895     replay_mutex_lock();
1896     qemu_mutex_lock_iothread();
1897 }
1898
1899 void cpu_resume(CPUState *cpu)
1900 {
1901     cpu->stop = false;
1902     cpu->stopped = false;
1903     qemu_cpu_kick(cpu);
1904 }
1905
1906 void resume_all_vcpus(void)
1907 {
1908     CPUState *cpu;
1909
1910     if (!runstate_is_running()) {
1911         return;
1912     }
1913
1914     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1915     CPU_FOREACH(cpu) {
1916         cpu_resume(cpu);
1917     }
1918 }
1919
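/*
 * Synchronously unplug a vCPU: mark it for removal, kick it, and join
 * its thread (temporarily dropping the BQL so the thread can exit).
 */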
1920 void cpu_remove_sync(CPUState *cpu)
1921 {
1922     cpu->stop = true;
1923     cpu->unplug = true;
1924     qemu_cpu_kick(cpu);
1925     qemu_mutex_unlock_iothread();
1926     qemu_thread_join(cpu->thread);
1927     qemu_mutex_lock_iothread();
1928 }
1929
1930 /* Size of the temporary buffer used to form a vCPU thread name */
1931 #define VCPU_THREAD_NAME_SIZE 16
1932
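/*
 * Create the TCG vCPU thread(s): one thread per vCPU when MTTCG is
 * enabled, otherwise a single round-robin thread shared by all vCPUs.
 */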
1933 static void qemu_tcg_init_vcpu(CPUState *cpu)
1934 {
1935     char thread_name[VCPU_THREAD_NAME_SIZE];
1936     static QemuCond *single_tcg_halt_cond;
1937     static QemuThread *single_tcg_cpu_thread;
1938     static int tcg_region_inited;
1939
1940     assert(tcg_enabled());
1941     /*
1942      * Initialize TCG regions--once. Now is a good time, because:
1943      * (1) TCG's init context, prologue and target globals have been set up.
1944      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1945      *     -accel flag is processed, so the check doesn't work then).
1946      */
1947     if (!tcg_region_inited) {
1948         tcg_region_inited = 1;
1949         tcg_region_init();
1950     }
1951
1952     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1953         cpu->thread = g_malloc0(sizeof(QemuThread));
1954         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1955         qemu_cond_init(cpu->halt_cond);
1956
1957         if (qemu_tcg_mttcg_enabled()) {
1958             /* create a thread per vCPU with TCG (MTTCG) */
1959             parallel_cpus = true;
1960             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1961                  cpu->cpu_index);
1962
1963             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1964                                cpu, QEMU_THREAD_JOINABLE);
1965
1966         } else {
1967             /* share a single thread for all cpus with TCG */
1968             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1969             qemu_thread_create(cpu->thread, thread_name,
1970                                qemu_tcg_rr_cpu_thread_fn,
1971                                cpu, QEMU_THREAD_JOINABLE);
1972
1973             single_tcg_halt_cond = cpu->halt_cond;
1974             single_tcg_cpu_thread = cpu->thread;
1975         }
1976 #ifdef _WIN32
1977         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1978 #endif
1979     } else {
1980         /* For non-MTTCG cases we share the thread */
1981         cpu->thread = single_tcg_cpu_thread;
1982         cpu->halt_cond = single_tcg_halt_cond;
1983         cpu->thread_id = first_cpu->thread_id;
1984         cpu->can_do_io = 1;
1985         cpu->created = true;
1986     }
1987 }
1988
1989 static void qemu_hax_start_vcpu(CPUState *cpu)
1990 {
1991     char thread_name[VCPU_THREAD_NAME_SIZE];
1992
1993     cpu->thread = g_malloc0(sizeof(QemuThread));
1994     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1995     qemu_cond_init(cpu->halt_cond);
1996
1997     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1998              cpu->cpu_index);
1999     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2000                        cpu, QEMU_THREAD_JOINABLE);
2001 #ifdef _WIN32
2002     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2003 #endif
2004 }
2005
2006 static void qemu_kvm_start_vcpu(CPUState *cpu)
2007 {
2008     char thread_name[VCPU_THREAD_NAME_SIZE];
2009
2010     cpu->thread = g_malloc0(sizeof(QemuThread));
2011     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2012     qemu_cond_init(cpu->halt_cond);
2013     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2014              cpu->cpu_index);
2015     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2016                        cpu, QEMU_THREAD_JOINABLE);
2017 }
2018
2019 static void qemu_hvf_start_vcpu(CPUState *cpu)
2020 {
2021     char thread_name[VCPU_THREAD_NAME_SIZE];
2022
2023     /* HVF currently does not support TCG, and only runs in
2024      * unrestricted-guest mode. */
2025     assert(hvf_enabled());
2026
2027     cpu->thread = g_malloc0(sizeof(QemuThread));
2028     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2029     qemu_cond_init(cpu->halt_cond);
2030
2031     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2032              cpu->cpu_index);
2033     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2034                        cpu, QEMU_THREAD_JOINABLE);
2035 }
2036
2037 static void qemu_whpx_start_vcpu(CPUState *cpu)
2038 {
2039     char thread_name[VCPU_THREAD_NAME_SIZE];
2040
2041     cpu->thread = g_malloc0(sizeof(QemuThread));
2042     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2043     qemu_cond_init(cpu->halt_cond);
2044     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2045              cpu->cpu_index);
2046     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2047                        cpu, QEMU_THREAD_JOINABLE);
2048 #ifdef _WIN32
2049     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2050 #endif
2051 }
2052
2053 static void qemu_dummy_start_vcpu(CPUState *cpu)
2054 {
2055     char thread_name[VCPU_THREAD_NAME_SIZE];
2056
2057     cpu->thread = g_malloc0(sizeof(QemuThread));
2058     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2059     qemu_cond_init(cpu->halt_cond);
2060     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2061              cpu->cpu_index);
2062     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2063                        QEMU_THREAD_JOINABLE);
2064 }
2065
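/*
 * Create the vCPU thread for the configured accelerator (KVM, HAX, HVF,
 * TCG, WHPX or the dummy loop) and wait until it has signalled
 * cpu->created.
 */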
2066 void qemu_init_vcpu(CPUState *cpu)
2067 {
2068     MachineState *ms = MACHINE(qdev_get_machine());
2069
2070     cpu->nr_cores = ms->smp.cores;
2071     cpu->nr_threads =  ms->smp.threads;
2072     cpu->stopped = true;
2073     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2074
2075     if (!cpu->as) {
2076         /* If the target cpu hasn't set up any address spaces itself,
2077          * give it the default one.
2078          */
2079         cpu->num_ases = 1;
2080         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2081     }
2082
2083     if (kvm_enabled()) {
2084         qemu_kvm_start_vcpu(cpu);
2085     } else if (hax_enabled()) {
2086         qemu_hax_start_vcpu(cpu);
2087     } else if (hvf_enabled()) {
2088         qemu_hvf_start_vcpu(cpu);
2089     } else if (tcg_enabled()) {
2090         qemu_tcg_init_vcpu(cpu);
2091     } else if (whpx_enabled()) {
2092         qemu_whpx_start_vcpu(cpu);
2093     } else {
2094         qemu_dummy_start_vcpu(cpu);
2095     }
2096
2097     while (!cpu->created) {
2098         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2099     }
2100 }
2101
2102 void cpu_stop_current(void)
2103 {
2104     if (current_cpu) {
2105         current_cpu->stop = true;
2106         cpu_exit(current_cpu);
2107     }
2108 }
2109
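/*
 * Stop the VM.  When called from a vCPU thread the stop is merely
 * requested and completed later by the main loop; otherwise it is
 * performed synchronously through do_vm_stop().
 */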
2110 int vm_stop(RunState state)
2111 {
2112     if (qemu_in_vcpu_thread()) {
2113         qemu_system_vmstop_request_prepare();
2114         qemu_system_vmstop_request(state);
2115         /*
2116          * FIXME: should not return to device code in case
2117          * vm_stop() has been requested.
2118          */
2119         cpu_stop_current();
2120         return 0;
2121     }
2122
2123     return do_vm_stop(state, true);
2124 }
2125
2126 /**
2127  * Prepare for (re)starting the VM.
2128  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2129  * running or in case of an error condition), 0 otherwise.
2130  */
2131 int vm_prepare_start(void)
2132 {
2133     RunState requested;
2134
2135     qemu_vmstop_requested(&requested);
2136     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2137         return -1;
2138     }
2139
2140     /* Ensure that a STOP/RESUME pair of events is emitted if a
2141      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2142      * example, is documented to always be followed by the STOP
2143      * event.
2144      */
2145     if (runstate_is_running()) {
2146         qapi_event_send_stop();
2147         qapi_event_send_resume();
2148         return -1;
2149     }
2150
2151     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2152     qapi_event_send_resume();
2153
2154     cpu_enable_ticks();
2155     runstate_set(RUN_STATE_RUNNING);
2156     vm_state_notify(1, RUN_STATE_RUNNING);
2157     return 0;
2158 }
2159
2160 void vm_start(void)
2161 {
2162     if (!vm_prepare_start()) {
2163         resume_all_vcpus();
2164     }
2165 }
2166
2167 /* Does a state transition even if the VM is already stopped;
2168    the current state is forgotten forever. */
2169 int vm_stop_force_state(RunState state)
2170 {
2171     if (runstate_is_running()) {
2172         return vm_stop(state);
2173     } else {
2174         runstate_set(state);
2175
2176         bdrv_drain_all();
2177         /* Make sure to return an error if the flush in a previous vm_stop()
2178          * failed. */
2179         return bdrv_flush_all();
2180     }
2181 }
2182
2183 void list_cpus(const char *optarg)
2184 {
2185     /* XXX: implement xxx_cpu_list for targets that still lack it */
2186 #if defined(cpu_list)
2187     cpu_list();
2188 #endif
2189 }
2190
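/* QMP 'memsave': write a range of guest virtual memory, as seen by the
 * selected vCPU, to a host file. */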
2191 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2192                  bool has_cpu, int64_t cpu_index, Error **errp)
2193 {
2194     FILE *f;
2195     uint32_t l;
2196     CPUState *cpu;
2197     uint8_t buf[1024];
2198     int64_t orig_addr = addr, orig_size = size;
2199
2200     if (!has_cpu) {
2201         cpu_index = 0;
2202     }
2203
2204     cpu = qemu_get_cpu(cpu_index);
2205     if (cpu == NULL) {
2206         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2207                    "a CPU number");
2208         return;
2209     }
2210
2211     f = fopen(filename, "wb");
2212     if (!f) {
2213         error_setg_file_open(errp, errno, filename);
2214         return;
2215     }
2216
2217     while (size != 0) {
2218         l = sizeof(buf);
2219         if (l > size)
2220             l = size;
2221         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2222             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2223                              " specified", orig_addr, orig_size);
2224             goto exit;
2225         }
2226         if (fwrite(buf, 1, l, f) != l) {
2227             error_setg(errp, QERR_IO_ERROR);
2228             goto exit;
2229         }
2230         addr += l;
2231         size -= l;
2232     }
2233
2234 exit:
2235     fclose(f);
2236 }
2237
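/* QMP 'pmemsave': write a range of guest physical memory to a host file. */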
2238 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2239                   Error **errp)
2240 {
2241     FILE *f;
2242     uint32_t l;
2243     uint8_t buf[1024];
2244
2245     f = fopen(filename, "wb");
2246     if (!f) {
2247         error_setg_file_open(errp, errno, filename);
2248         return;
2249     }
2250
2251     while (size != 0) {
2252         l = sizeof(buf);
2253         if (l > size)
2254             l = size;
2255         cpu_physical_memory_read(addr, buf, l);
2256         if (fwrite(buf, 1, l, f) != l) {
2257             error_setg(errp, QERR_IO_ERROR);
2258             goto exit;
2259         }
2260         addr += l;
2261         size -= l;
2262     }
2263
2264 exit:
2265     fclose(f);
2266 }
2267
2268 void qmp_inject_nmi(Error **errp)
2269 {
2270     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2271 }
2272
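/* Print host/guest clock drift statistics; only meaningful with -icount. */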
2273 void dump_drift_info(void)
2274 {
2275     if (!use_icount) {
2276         return;
2277     }
2278
2279     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2280                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2281     if (icount_align_option) {
2282         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2283                     -max_delay / SCALE_MS);
2284         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2285                     max_advance / SCALE_MS);
2286     } else {
2287         qemu_printf("Max guest delay     NA\n");
2288         qemu_printf("Max guest advance   NA\n");
2289     }
2290 }