4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
47 #include "qemu/thread.h"
48 #include "qemu/plugin.h"
49 #include "sysemu/cpus.h"
50 #include "sysemu/qtest.h"
51 #include "qemu/main-loop.h"
52 #include "qemu/option.h"
53 #include "qemu/bitmap.h"
54 #include "qemu/seqlock.h"
55 #include "qemu/guest-random.h"
58 #include "sysemu/replay.h"
59 #include "sysemu/runstate.h"
60 #include "hw/boards.h"
65 #include <sys/prctl.h>
68 #define PR_MCE_KILL 33
71 #ifndef PR_MCE_KILL_SET
72 #define PR_MCE_KILL_SET 1
75 #ifndef PR_MCE_KILL_EARLY
76 #define PR_MCE_KILL_EARLY 1
79 #endif /* CONFIG_LINUX */
81 static QemuMutex qemu_global_mutex;
86 /* vcpu throttling controls */
87 static QEMUTimer *throttle_timer;
88 static unsigned int throttle_percentage;
90 #define CPU_THROTTLE_PCT_MIN 1
91 #define CPU_THROTTLE_PCT_MAX 99
92 #define CPU_THROTTLE_TIMESLICE_NS 10000000
94 bool cpu_is_stopped(CPUState *cpu)
96 return cpu->stopped || !runstate_is_running();
99 static bool cpu_thread_is_idle(CPUState *cpu)
101 if (cpu->stop || cpu->queued_work_first) {
104 if (cpu_is_stopped(cpu)) {
107 if (!cpu->halted || cpu_has_work(cpu) ||
108 kvm_halt_in_kernel()) {
114 static bool all_cpu_threads_idle(void)
119 if (!cpu_thread_is_idle(cpu)) {
126 /***********************************************************/
127 /* guest cycle counter */
129 /* Protected by TimersState seqlock */
131 static bool icount_sleep = true;
132 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
133 #define MAX_ICOUNT_SHIFT 10
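/*
 * With icount, virtual time advances by (1 << icount_time_shift) ns per
 * executed instruction (see cpu_icount_to_ns() below), so the shift bounds
 * the emulated speed.  For example:
 *   shift = 10  ->  1024 ns/insn  ->  ~0.98 MIPS (the 1MIPS floor above)
 *   shift =  3  ->     8 ns/insn  ->   125  MIPS (the default in configure_icount())
 */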
135 typedef struct TimersState {
136 /* Protected by BQL. */
137 int64_t cpu_ticks_prev;
138 int64_t cpu_ticks_offset;
140 /* Protect fields that can be read outside the
141 * BQL and written from multiple threads.
143 QemuSeqLock vm_clock_seqlock;
144 QemuSpin vm_clock_lock;
146 int16_t cpu_ticks_enabled;
148 /* Conversion factor from emulated instructions to virtual clock ticks. */
149 int16_t icount_time_shift;
151 /* Compensate for varying guest execution speed. */
152 int64_t qemu_icount_bias;
154 int64_t vm_clock_warp_start;
155 int64_t cpu_clock_offset;
157 /* Only written by TCG thread */
158 int64_t qemu_icount;
160 /* for adjusting icount */
161 QEMUTimer *icount_rt_timer;
162 QEMUTimer *icount_vm_timer;
163 QEMUTimer *icount_warp_timer;
166 static TimersState timers_state;
170 /* The current number of executed instructions is based on what we
171 * originally budgeted minus the current state of the decrementing
172 * icount counters in extra/u16.low.
174 static int64_t cpu_get_icount_executed(CPUState *cpu)
176 return (cpu->icount_budget -
177 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
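/*
 * Worked example: with icount_budget = 10000, icount_extra = 4000 not yet
 * handed to the decrementer and icount_decr.u16.low = 100 still pending in
 * the current block, the vCPU has executed 10000 - (100 + 4000) = 5900
 * instructions so far.
 */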
181 * Update the global shared timers_state.qemu_icount to take into
182 * account executed instructions. This is done by the TCG vCPU
183 * thread so the main-loop can see time has moved forward.
185 static void cpu_update_icount_locked(CPUState *cpu)
187 int64_t executed = cpu_get_icount_executed(cpu);
188 cpu->icount_budget -= executed;
190 atomic_set_i64(&timers_state.qemu_icount,
191 timers_state.qemu_icount + executed);
195 * Update the global shared timers_state.qemu_icount to take into
196 * account executed instructions. This is done by the TCG vCPU
197 * thread so the main-loop can see time has moved forward.
199 void cpu_update_icount(CPUState *cpu)
201 seqlock_write_lock(&timers_state.vm_clock_seqlock,
202 &timers_state.vm_clock_lock);
203 cpu_update_icount_locked(cpu);
204 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
205 &timers_state.vm_clock_lock);
208 static int64_t cpu_get_icount_raw_locked(void)
210 CPUState *cpu = current_cpu;
212 if (cpu && cpu->running) {
213 if (!cpu->can_do_io) {
214 error_report("Bad icount read");
217 /* Take into account what has run */
218 cpu_update_icount_locked(cpu);
220 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
221 return atomic_read_i64(&timers_state.qemu_icount);
224 static int64_t cpu_get_icount_locked(void)
226 int64_t icount = cpu_get_icount_raw_locked();
227 return atomic_read_i64(&timers_state.qemu_icount_bias) +
228 cpu_icount_to_ns(icount);
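/*
 * i.e. virtual time in ns is reconstructed as
 *   qemu_icount_bias + (qemu_icount << icount_time_shift)
 * where the bias absorbs clock warps and the speed adjustments made by
 * icount_adjust() below.
 */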
231 int64_t cpu_get_icount_raw(void)
237 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
238 icount = cpu_get_icount_raw_locked();
239 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
244 /* Return the virtual CPU time, based on the instruction counter. */
245 int64_t cpu_get_icount(void)
251 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
252 icount = cpu_get_icount_locked();
253 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
258 int64_t cpu_icount_to_ns(int64_t icount)
260 return icount << atomic_read(&timers_state.icount_time_shift);
263 static int64_t cpu_get_ticks_locked(void)
265 int64_t ticks = timers_state.cpu_ticks_offset;
266 if (timers_state.cpu_ticks_enabled) {
267 ticks += cpu_get_host_ticks();
270 if (timers_state.cpu_ticks_prev > ticks) {
271 /* Non-increasing ticks may happen if the host uses software suspend. */
272 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
273 ticks = timers_state.cpu_ticks_prev;
276 timers_state.cpu_ticks_prev = ticks;
280 /* return the time elapsed in VM between vm_start and vm_stop. Unless
281 * icount is active, cpu_get_ticks() uses units of the host CPU cycle counter.
284 int64_t cpu_get_ticks(void)
289 return cpu_get_icount();
292 qemu_spin_lock(&timers_state.vm_clock_lock);
293 ticks = cpu_get_ticks_locked();
294 qemu_spin_unlock(&timers_state.vm_clock_lock);
298 static int64_t cpu_get_clock_locked(void)
302 time = timers_state.cpu_clock_offset;
303 if (timers_state.cpu_ticks_enabled) {
310 /* Return the monotonic time elapsed in VM, i.e.,
311 * the time between vm_start and vm_stop
313 int64_t cpu_get_clock(void)
319 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
320 ti = cpu_get_clock_locked();
321 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
326 /* enable cpu_get_ticks()
327 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
329 void cpu_enable_ticks(void)
331 seqlock_write_lock(&timers_state.vm_clock_seqlock,
332 &timers_state.vm_clock_lock);
333 if (!timers_state.cpu_ticks_enabled) {
334 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
335 timers_state.cpu_clock_offset -= get_clock();
336 timers_state.cpu_ticks_enabled = 1;
338 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
339 &timers_state.vm_clock_lock);
342 /* disable cpu_get_ticks() : the clock is stopped. You must not call
343 * cpu_get_ticks() after that.
344 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
346 void cpu_disable_ticks(void)
348 seqlock_write_lock(&timers_state.vm_clock_seqlock,
349 &timers_state.vm_clock_lock);
350 if (timers_state.cpu_ticks_enabled) {
351 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
352 timers_state.cpu_clock_offset = cpu_get_clock_locked();
353 timers_state.cpu_ticks_enabled = 0;
355 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
356 &timers_state.vm_clock_lock);
359 /* Correlation between real and virtual time is always going to be
360 fairly approximate, so ignore small variation.
361 When the guest is idle real and virtual time will be aligned in the IO wait loop.
363 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
365 static void icount_adjust(void)
371 /* Protected by TimersState mutex. */
372 static int64_t last_delta;
374 /* If the VM is not running, then do nothing. */
375 if (!runstate_is_running()) {
379 seqlock_write_lock(&timers_state.vm_clock_seqlock,
380 &timers_state.vm_clock_lock);
381 cur_time = cpu_get_clock_locked();
382 cur_icount = cpu_get_icount_locked();
384 delta = cur_icount - cur_time;
385 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
386 if (delta > 0
387 && last_delta + ICOUNT_WOBBLE < delta * 2
388 && timers_state.icount_time_shift > 0) {
389 /* The guest is getting too far ahead. Slow time down. */
390 atomic_set(&timers_state.icount_time_shift,
391 timers_state.icount_time_shift - 1);
393 if (delta < 0
394 && last_delta - ICOUNT_WOBBLE > delta * 2
395 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
396 /* The guest is getting too far behind. Speed time up. */
397 atomic_set(&timers_state.icount_time_shift,
398 timers_state.icount_time_shift + 1);
401 atomic_set_i64(&timers_state.qemu_icount_bias,
402 cur_icount - (timers_state.qemu_icount
403 << timers_state.icount_time_shift));
404 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
405 &timers_state.vm_clock_lock);
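/*
 * Example of a single adjustment step: with ICOUNT_WOBBLE = 100 ms, if the
 * previous period ended with the guest 20 ms ahead (last_delta) and this one
 * ends 70 ms ahead (delta), then last_delta + ICOUNT_WOBBLE = 120 ms is less
 * than delta * 2 = 140 ms, so icount_time_shift is decremented (provided it
 * is still positive) and each instruction now accounts for half as many
 * virtual nanoseconds.
 */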
408 static void icount_adjust_rt(void *opaque)
410 timer_mod(timers_state.icount_rt_timer,
411 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
415 static void icount_adjust_vm(void *opaque)
417 timer_mod(timers_state.icount_vm_timer,
418 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
419 NANOSECONDS_PER_SECOND / 10);
423 static int64_t qemu_icount_round(int64_t count)
425 int shift = atomic_read(&timers_state.icount_time_shift);
426 return (count + (1 << shift) - 1) >> shift;
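/*
 * e.g. with icount_time_shift = 3, a 100 ns deadline rounds up to
 * (100 + 7) >> 3 = 13 instructions of budget.
 */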
429 static void icount_warp_rt(void)
434 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
435 * changes from -1 to another value, so the race here is okay.
438 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
439 warp_start = timers_state.vm_clock_warp_start;
440 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
442 if (warp_start == -1) {
446 seqlock_write_lock(&timers_state.vm_clock_seqlock,
447 &timers_state.vm_clock_lock);
448 if (runstate_is_running()) {
449 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
450 cpu_get_clock_locked());
453 warp_delta = clock - timers_state.vm_clock_warp_start;
454 if (use_icount == 2) {
456 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
457 * far ahead of real time.
459 int64_t cur_icount = cpu_get_icount_locked();
460 int64_t delta = clock - cur_icount;
461 warp_delta = MIN(warp_delta, delta);
463 atomic_set_i64(&timers_state.qemu_icount_bias,
464 timers_state.qemu_icount_bias + warp_delta);
466 timers_state.vm_clock_warp_start = -1;
467 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
468 &timers_state.vm_clock_lock);
470 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
471 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
475 static void icount_timer_cb(void *opaque)
477 /* No need for a checkpoint because the timer already synchronizes
478 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
483 void qtest_clock_warp(int64_t dest)
485 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
486 AioContext *aio_context;
487 assert(qtest_enabled());
488 aio_context = qemu_get_aio_context();
489 while (clock < dest) {
490 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
491 QEMU_TIMER_ATTR_ALL);
492 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
494 seqlock_write_lock(&timers_state.vm_clock_seqlock,
495 &timers_state.vm_clock_lock);
496 atomic_set_i64(&timers_state.qemu_icount_bias,
497 timers_state.qemu_icount_bias + warp);
498 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
499 &timers_state.vm_clock_lock);
501 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
502 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
503 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
505 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
508 void qemu_start_warp_timer(void)
517 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
518 * do not fire, so computing the deadline does not make sense.
520 if (!runstate_is_running()) {
524 if (replay_mode != REPLAY_MODE_PLAY) {
525 if (!all_cpu_threads_idle()) {
529 if (qtest_enabled()) {
530 /* When testing, qtest commands advance icount. */
534 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
536 /* warp clock deterministically in record/replay mode */
537 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
538 /* vCPU is sleeping and warp can't be started.
539 It is probably a race condition: the notification sent
540 to the vCPU was processed in advance and the vCPU went to sleep.
541 Therefore we have to wake it up so it can do something. */
542 if (replay_has_checkpoint()) {
543 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
549 /* We want to use the earliest deadline from ALL vm_clocks */
550 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
551 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
552 ~QEMU_TIMER_ATTR_EXTERNAL);
554 static bool notified;
555 if (!icount_sleep && !notified) {
556 warn_report("icount sleep disabled and no active timers");
564 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
565 * sleep. Otherwise, the CPU might be waiting for a future timer
566 * interrupt to wake it up, but the interrupt never comes because
567 * the vCPU isn't running any insns and thus doesn't advance the
568 * QEMU_CLOCK_VIRTUAL.
572 * We never let vCPUs sleep in no-sleep icount mode.
573 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
574 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
575 * It is useful when we want a deterministic execution time,
576 * isolated from host latencies.
578 seqlock_write_lock(&timers_state.vm_clock_seqlock,
579 &timers_state.vm_clock_lock);
580 atomic_set_i64(&timers_state.qemu_icount_bias,
581 timers_state.qemu_icount_bias + deadline);
582 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
583 &timers_state.vm_clock_lock);
584 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
587 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
588 * "real" time (related to the time left until the next event) has
589 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
590 * This keeps the warps from being visible externally; for example,
591 * you will not be sending network packets continuously instead of every 100ms.
594 seqlock_write_lock(&timers_state.vm_clock_seqlock,
595 &timers_state.vm_clock_lock);
596 if (timers_state.vm_clock_warp_start == -1
597 || timers_state.vm_clock_warp_start > clock) {
598 timers_state.vm_clock_warp_start = clock;
600 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
601 &timers_state.vm_clock_lock);
602 timer_mod_anticipate(timers_state.icount_warp_timer,
605 } else if (deadline == 0) {
606 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
610 static void qemu_account_warp_timer(void)
612 if (!use_icount || !icount_sleep) {
616 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
617 * do not fire, so computing the deadline does not make sense.
619 if (!runstate_is_running()) {
623 /* warp clock deterministically in record/replay mode */
624 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
628 timer_del(timers_state.icount_warp_timer);
632 static bool icount_state_needed(void *opaque)
637 static bool warp_timer_state_needed(void *opaque)
639 TimersState *s = opaque;
640 return s->icount_warp_timer != NULL;
643 static bool adjust_timers_state_needed(void *opaque)
645 TimersState *s = opaque;
646 return s->icount_rt_timer != NULL;
650 * The subsection for warp timer migration is optional, because the timer may not be created
652 static const VMStateDescription icount_vmstate_warp_timer = {
653 .name = "timer/icount/warp_timer",
655 .minimum_version_id = 1,
656 .needed = warp_timer_state_needed,
657 .fields = (VMStateField[]) {
658 VMSTATE_INT64(vm_clock_warp_start, TimersState),
659 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
660 VMSTATE_END_OF_LIST()
664 static const VMStateDescription icount_vmstate_adjust_timers = {
665 .name = "timer/icount/timers",
667 .minimum_version_id = 1,
668 .needed = adjust_timers_state_needed,
669 .fields = (VMStateField[]) {
670 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
671 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
672 VMSTATE_END_OF_LIST()
677 * This is a subsection for icount migration.
679 static const VMStateDescription icount_vmstate_timers = {
680 .name = "timer/icount",
682 .minimum_version_id = 1,
683 .needed = icount_state_needed,
684 .fields = (VMStateField[]) {
685 VMSTATE_INT64(qemu_icount_bias, TimersState),
686 VMSTATE_INT64(qemu_icount, TimersState),
687 VMSTATE_END_OF_LIST()
689 .subsections = (const VMStateDescription*[]) {
690 &icount_vmstate_warp_timer,
691 &icount_vmstate_adjust_timers,
696 static const VMStateDescription vmstate_timers = {
699 .minimum_version_id = 1,
700 .fields = (VMStateField[]) {
701 VMSTATE_INT64(cpu_ticks_offset, TimersState),
703 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
704 VMSTATE_END_OF_LIST()
706 .subsections = (const VMStateDescription*[]) {
707 &icount_vmstate_timers,
712 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
715 double throttle_ratio;
716 int64_t sleeptime_ns, endtime_ns;
718 if (!cpu_throttle_get_percentage()) {
722 pct = (double)cpu_throttle_get_percentage()/100;
723 throttle_ratio = pct / (1 - pct);
724 /* Add 1ns to fix double's rounding error (like 0.9999999...) */
725 sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
726 endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
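/*
 * For example, at 50% throttle pct = 0.5, throttle_ratio = 1 and the vCPU
 * sleeps roughly 10 ms for every 10 ms timeslice it runs; at the 99%
 * maximum, throttle_ratio = 99 and each timeslice is followed by ~990 ms
 * of sleep.
 */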
727 while (sleeptime_ns > 0 && !cpu->stop) {
728 if (sleeptime_ns > SCALE_MS) {
729 qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
730 sleeptime_ns / SCALE_MS);
732 qemu_mutex_unlock_iothread();
733 g_usleep(sleeptime_ns / SCALE_US);
734 qemu_mutex_lock_iothread();
736 sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
738 atomic_set(&cpu->throttle_thread_scheduled, 0);
741 static void cpu_throttle_timer_tick(void *opaque)
746 /* Stop the timer if needed */
747 if (!cpu_throttle_get_percentage()) {
751 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
752 async_run_on_cpu(cpu, cpu_throttle_thread,
757 pct = (double)cpu_throttle_get_percentage()/100;
758 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
759 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
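/*
 * The rearm interval above stretches with the throttle: at 50% the timer
 * re-fires every 10 ms / 0.5 = 20 ms (one timeslice of work plus the sleep
 * computed in cpu_throttle_thread()), while at 99% it re-fires only once
 * per second.
 */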
762 void cpu_throttle_set(int new_throttle_pct)
764 /* Ensure throttle percentage is within valid range */
765 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
766 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
768 atomic_set(&throttle_percentage, new_throttle_pct);
770 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
771 CPU_THROTTLE_TIMESLICE_NS);
774 void cpu_throttle_stop(void)
776 atomic_set(&throttle_percentage, 0);
779 bool cpu_throttle_active(void)
781 return (cpu_throttle_get_percentage() != 0);
784 int cpu_throttle_get_percentage(void)
786 return atomic_read(&throttle_percentage);
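/*
 * Illustrative sketch (not part of this file) of how the throttling helpers
 * above are meant to be combined by a caller such as migration auto-converge:
 *
 *     cpu_throttle_set(50);                     // clamped to [1, 99], timer armed
 *     assert(cpu_throttle_active());            // true while percentage != 0
 *     int pct = cpu_throttle_get_percentage();  // returns 50
 *     cpu_throttle_stop();                      // back to 0, timer lapses
 */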
789 void cpu_ticks_init(void)
791 seqlock_init(&timers_state.vm_clock_seqlock);
792 qemu_spin_init(&timers_state.vm_clock_lock);
793 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
794 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
795 cpu_throttle_timer_tick, NULL);
798 void configure_icount(QemuOpts *opts, Error **errp)
800 const char *option = qemu_opt_get(opts, "shift");
801 bool sleep = qemu_opt_get_bool(opts, "sleep", true);
802 bool align = qemu_opt_get_bool(opts, "align", false);
803 long time_shift = -1;
804 char *rem_str = NULL;
806 if (!option && qemu_opt_get(opts, "align")) {
807 error_setg(errp, "Please specify shift option when using align");
811 if (align && !sleep) {
812 error_setg(errp, "align=on and sleep=off are incompatible");
816 if (strcmp(option, "auto") != 0) {
818 time_shift = strtol(option, &rem_str, 0);
819 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
820 error_setg(errp, "icount: Invalid shift value");
823 } else if (icount_align_option) {
824 error_setg(errp, "shift=auto and align=on are incompatible");
826 } else if (!icount_sleep) {
827 error_setg(errp, "shift=auto and sleep=off are incompatible");
831 icount_sleep = sleep;
833 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
834 icount_timer_cb, NULL);
837 icount_align_option = align;
839 if (time_shift >= 0) {
840 timers_state.icount_time_shift = time_shift;
847 /* 125MIPS seems a reasonable initial guess at the guest speed.
848 It will be corrected fairly quickly anyway. */
849 timers_state.icount_time_shift = 3;
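/* i.e. 2^3 = 8 ns of virtual time per instruction, or 10^9 / 8 = 125 million
 * guest instructions per emulated second. */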
851 /* Have both realtime and virtual time triggers for speed adjustment.
852 The realtime trigger catches emulated time passing too slowly,
853 the virtual time trigger catches emulated time passing too fast.
854 Realtime triggers occur even when idle, so use them less frequently than VM triggers.
856 timers_state.vm_clock_warp_start = -1;
857 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
858 icount_adjust_rt, NULL);
859 timer_mod(timers_state.icount_rt_timer,
860 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
861 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
862 icount_adjust_vm, NULL);
863 timer_mod(timers_state.icount_vm_timer,
864 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
865 NANOSECONDS_PER_SECOND / 10);
868 /***********************************************************/
869 /* TCG vCPU kick timer
871 * The kick timer is responsible for moving single-threaded vCPU
872 * emulation on to the next vCPU. If more than one vCPU is running, a
873 * timer event will force a cpu->exit so the next vCPU can get a chance to run.
876 * The timer is removed when all vCPUs are idle and restarted once
877 * any vCPU becomes runnable again.
880 static QEMUTimer *tcg_kick_vcpu_timer;
881 static CPUState *tcg_current_rr_cpu;
883 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
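/* i.e. in round-robin mode a vCPU that never exits on its own is forced out
 * after at most ~100 ms so the remaining vCPUs can make progress. */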
885 static inline int64_t qemu_tcg_next_kick(void)
887 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
890 /* Kick the currently round-robin scheduled vCPU on to the next one */
891 static void qemu_cpu_kick_rr_next_cpu(void)
895 cpu = atomic_mb_read(&tcg_current_rr_cpu);
899 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
902 /* Kick all RR vCPUs */
903 static void qemu_cpu_kick_rr_cpus(void)
912 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
916 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
918 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
923 if (qemu_in_vcpu_thread()) {
924 /* A CPU is currently running; kick it back out to the
925 * tcg_cpu_exec() loop so it will recalculate its
926 * icount deadline immediately.
928 qemu_cpu_kick(current_cpu);
929 } else if (first_cpu) {
930 /* qemu_cpu_kick is not enough to kick a halted CPU out of
931 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
932 * causes cpu_thread_is_idle to return false. This way,
933 * handle_icount_deadline can run.
934 * If we have no CPUs at all for some reason, we don't
935 * need to do anything.
937 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
941 static void kick_tcg_thread(void *opaque)
943 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
944 qemu_cpu_kick_rr_next_cpu();
947 static void start_tcg_kick_timer(void)
949 assert(!mttcg_enabled);
950 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
951 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
952 kick_tcg_thread, NULL);
954 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
955 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
959 static void stop_tcg_kick_timer(void)
961 assert(!mttcg_enabled);
962 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
963 timer_del(tcg_kick_vcpu_timer);
967 /***********************************************************/
968 void hw_error(const char *fmt, ...)
974 fprintf(stderr, "qemu: hardware error: ");
975 vfprintf(stderr, fmt, ap);
976 fprintf(stderr, "\n");
978 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
979 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
985 void cpu_synchronize_all_states(void)
990 cpu_synchronize_state(cpu);
991 /* TODO: move to cpu_synchronize_state() */
993 hvf_cpu_synchronize_state(cpu);
998 void cpu_synchronize_all_post_reset(void)
1003 cpu_synchronize_post_reset(cpu);
1004 /* TODO: move to cpu_synchronize_post_reset() */
1005 if (hvf_enabled()) {
1006 hvf_cpu_synchronize_post_reset(cpu);
1011 void cpu_synchronize_all_post_init(void)
1016 cpu_synchronize_post_init(cpu);
1017 /* TODO: move to cpu_synchronize_post_init() */
1018 if (hvf_enabled()) {
1019 hvf_cpu_synchronize_post_init(cpu);
1024 void cpu_synchronize_all_pre_loadvm(void)
1029 cpu_synchronize_pre_loadvm(cpu);
1033 static int do_vm_stop(RunState state, bool send_stop)
1037 if (runstate_is_running()) {
1038 runstate_set(state);
1039 cpu_disable_ticks();
1041 vm_state_notify(0, state);
1043 qapi_event_send_stop();
1048 ret = bdrv_flush_all();
1053 /* Special vm_stop() variant for terminating the process. Historically clients
1054 * did not expect a QMP STOP event and so we need to retain compatibility.
1056 int vm_shutdown(void)
1058 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1061 static bool cpu_can_run(CPUState *cpu)
1066 if (cpu_is_stopped(cpu)) {
1072 static void cpu_handle_guest_debug(CPUState *cpu)
1074 gdb_set_stop_cpu(cpu);
1075 qemu_system_debug_request();
1076 cpu->stopped = true;
1080 static void sigbus_reraise(void)
1083 struct sigaction action;
1085 memset(&action, 0, sizeof(action));
1086 action.sa_handler = SIG_DFL;
1087 if (!sigaction(SIGBUS, &action, NULL)) {
1090 sigaddset(&set, SIGBUS);
1091 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1093 perror("Failed to re-raise SIGBUS!\n");
1097 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1099 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1104 /* Called asynchronously in VCPU thread. */
1105 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1109 /* Called synchronously (via signalfd) in main thread. */
1110 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1116 static void qemu_init_sigbus(void)
1118 struct sigaction action;
1120 memset(&action, 0, sizeof(action));
1121 action.sa_flags = SA_SIGINFO;
1122 action.sa_sigaction = sigbus_handler;
1123 sigaction(SIGBUS, &action, NULL);
1125 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1127 #else /* !CONFIG_LINUX */
1128 static void qemu_init_sigbus(void)
1131 #endif /* !CONFIG_LINUX */
1133 static QemuThread io_thread;
1136 static QemuCond qemu_cpu_cond;
1138 static QemuCond qemu_pause_cond;
1140 void qemu_init_cpu_loop(void)
1143 qemu_cond_init(&qemu_cpu_cond);
1144 qemu_cond_init(&qemu_pause_cond);
1145 qemu_mutex_init(&qemu_global_mutex);
1147 qemu_thread_get_self(&io_thread);
1150 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1152 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1155 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1157 if (kvm_destroy_vcpu(cpu) < 0) {
1158 error_report("kvm_destroy_vcpu failed");
1163 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1167 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1169 g_assert(qemu_cpu_is_self(cpu));
1171 cpu->stopped = true;
1175 qemu_cond_broadcast(&qemu_pause_cond);
1178 static void qemu_wait_io_event_common(CPUState *cpu)
1180 atomic_mb_set(&cpu->thread_kicked, false);
1182 qemu_cpu_stop(cpu, false);
1184 process_queued_cpu_work(cpu);
1187 static void qemu_tcg_rr_wait_io_event(void)
1191 while (all_cpu_threads_idle()) {
1192 stop_tcg_kick_timer();
1193 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1196 start_tcg_kick_timer();
1199 qemu_wait_io_event_common(cpu);
1203 static void qemu_wait_io_event(CPUState *cpu)
1207 while (cpu_thread_is_idle(cpu)) {
1210 qemu_plugin_vcpu_idle_cb(cpu);
1212 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1215 qemu_plugin_vcpu_resume_cb(cpu);
1219 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1220 if (!tcg_enabled()) {
1224 qemu_wait_io_event_common(cpu);
1227 static void *qemu_kvm_cpu_thread_fn(void *arg)
1229 CPUState *cpu = arg;
1232 rcu_register_thread();
1234 qemu_mutex_lock_iothread();
1235 qemu_thread_get_self(cpu->thread);
1236 cpu->thread_id = qemu_get_thread_id();
1240 r = kvm_init_vcpu(cpu);
1242 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1246 kvm_init_cpu_signals(cpu);
1248 /* signal CPU creation */
1249 cpu->created = true;
1250 qemu_cond_signal(&qemu_cpu_cond);
1251 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1254 if (cpu_can_run(cpu)) {
1255 r = kvm_cpu_exec(cpu);
1256 if (r == EXCP_DEBUG) {
1257 cpu_handle_guest_debug(cpu);
1260 qemu_wait_io_event(cpu);
1261 } while (!cpu->unplug || cpu_can_run(cpu));
1263 qemu_kvm_destroy_vcpu(cpu);
1264 cpu->created = false;
1265 qemu_cond_signal(&qemu_cpu_cond);
1266 qemu_mutex_unlock_iothread();
1267 rcu_unregister_thread();
1271 static void *qemu_dummy_cpu_thread_fn(void *arg)
1274 error_report("qtest is not supported under Windows");
1277 CPUState *cpu = arg;
1281 rcu_register_thread();
1283 qemu_mutex_lock_iothread();
1284 qemu_thread_get_self(cpu->thread);
1285 cpu->thread_id = qemu_get_thread_id();
1289 sigemptyset(&waitset);
1290 sigaddset(&waitset, SIG_IPI);
1292 /* signal CPU creation */
1293 cpu->created = true;
1294 qemu_cond_signal(&qemu_cpu_cond);
1295 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1298 qemu_mutex_unlock_iothread();
1301 r = sigwait(&waitset, &sig);
1302 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1307 qemu_mutex_lock_iothread();
1308 qemu_wait_io_event(cpu);
1309 } while (!cpu->unplug);
1311 qemu_mutex_unlock_iothread();
1312 rcu_unregister_thread();
1317 static int64_t tcg_get_icount_limit(void)
1321 if (replay_mode != REPLAY_MODE_PLAY) {
1323 * Include all the timers, because they may need attention.
1324 * Overly long CPU execution may create unnecessary delays in the UI.
1326 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1327 QEMU_TIMER_ATTR_ALL);
1328 /* Check realtime timers, because they help with input processing */
1329 deadline = qemu_soonest_timeout(deadline,
1330 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1331 QEMU_TIMER_ATTR_ALL));
1333 /* Maintain prior (possibly buggy) behaviour where if no deadline
1334 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1335 * INT32_MAX nanoseconds ahead, we still use INT32_MAX nanoseconds.
1338 if ((deadline < 0) || (deadline > INT32_MAX)) {
1339 deadline = INT32_MAX;
1342 return qemu_icount_round(deadline);
1344 return replay_get_instructions();
1348 static void handle_icount_deadline(void)
1350 assert(qemu_in_vcpu_thread());
1352 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1353 QEMU_TIMER_ATTR_ALL);
1355 if (deadline == 0) {
1356 /* Wake up other AioContexts. */
1357 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1358 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1363 static void prepare_icount_for_run(CPUState *cpu)
1368 /* These should always be cleared by process_icount_data after
1369 * each vCPU execution. However u16.high can be raised
1370 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1372 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1373 g_assert(cpu->icount_extra == 0);
1375 cpu->icount_budget = tcg_get_icount_limit();
1376 insns_left = MIN(0xffff, cpu->icount_budget);
1377 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1378 cpu->icount_extra = cpu->icount_budget - insns_left;
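/*
 * For example, a budget of 200000 instructions is split into
 * icount_decr.u16.low = 0xffff (65535) for the fast path inside translated
 * code and icount_extra = 134465 to be handed out on later refills, while a
 * budget of 5000 fits entirely in the 16-bit decrementer and leaves
 * icount_extra = 0.
 */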
1380 replay_mutex_lock();
1384 static void process_icount_data(CPUState *cpu)
1387 /* Account for executed instructions */
1388 cpu_update_icount(cpu);
1390 /* Reset the counters */
1391 cpu_neg(cpu)->icount_decr.u16.low = 0;
1392 cpu->icount_extra = 0;
1393 cpu->icount_budget = 0;
1395 replay_account_executed_instructions();
1397 replay_mutex_unlock();
1402 static int tcg_cpu_exec(CPUState *cpu)
1405 #ifdef CONFIG_PROFILER
1409 assert(tcg_enabled());
1410 #ifdef CONFIG_PROFILER
1411 ti = profile_getclock();
1413 cpu_exec_start(cpu);
1414 ret = cpu_exec(cpu);
1416 #ifdef CONFIG_PROFILER
1417 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1418 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1423 /* Destroy any remaining vCPUs which have been unplugged and have finished running
1426 static void deal_with_unplugged_cpus(void)
1431 if (cpu->unplug && !cpu_can_run(cpu)) {
1432 qemu_tcg_destroy_vcpu(cpu);
1433 cpu->created = false;
1434 qemu_cond_signal(&qemu_cpu_cond);
1440 /* Single-threaded TCG
1442 * In the single-threaded case each vCPU is simulated in turn. If
1443 * there is more than a single vCPU we create a simple timer to kick
1444 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1445 * This is done explicitly rather than relying on side-effects elsewhere.
1449 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1451 CPUState *cpu = arg;
1453 assert(tcg_enabled());
1454 rcu_register_thread();
1455 tcg_register_thread();
1457 qemu_mutex_lock_iothread();
1458 qemu_thread_get_self(cpu->thread);
1460 cpu->thread_id = qemu_get_thread_id();
1461 cpu->created = true;
1463 qemu_cond_signal(&qemu_cpu_cond);
1464 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1466 /* wait for initial kick-off after machine start */
1467 while (first_cpu->stopped) {
1468 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1470 /* process any pending work */
1473 qemu_wait_io_event_common(cpu);
1477 start_tcg_kick_timer();
1481 /* process any pending work */
1482 cpu->exit_request = 1;
1485 qemu_mutex_unlock_iothread();
1486 replay_mutex_lock();
1487 qemu_mutex_lock_iothread();
1488 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1489 qemu_account_warp_timer();
1491 /* Run the timers here. This is much more efficient than
1492 * waking up the I/O thread and waiting for completion.
1494 handle_icount_deadline();
1496 replay_mutex_unlock();
1502 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1504 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1507 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1508 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1510 if (cpu_can_run(cpu)) {
1513 qemu_mutex_unlock_iothread();
1514 prepare_icount_for_run(cpu);
1516 r = tcg_cpu_exec(cpu);
1518 process_icount_data(cpu);
1519 qemu_mutex_lock_iothread();
1521 if (r == EXCP_DEBUG) {
1522 cpu_handle_guest_debug(cpu);
1524 } else if (r == EXCP_ATOMIC) {
1525 qemu_mutex_unlock_iothread();
1526 cpu_exec_step_atomic(cpu);
1527 qemu_mutex_lock_iothread();
1530 } else if (cpu->stop) {
1532 cpu = CPU_NEXT(cpu);
1537 cpu = CPU_NEXT(cpu);
1538 } /* while (cpu && !cpu->exit_request).. */
1540 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1541 atomic_set(&tcg_current_rr_cpu, NULL);
1543 if (cpu && cpu->exit_request) {
1544 atomic_mb_set(&cpu->exit_request, 0);
1547 if (use_icount && all_cpu_threads_idle()) {
1549 * When all cpus are sleeping (e.g. in WFI), to avoid a deadlock
1550 * in the main_loop, wake it up in order to start the warp timer.
1552 qemu_notify_event();
1555 qemu_tcg_rr_wait_io_event();
1556 deal_with_unplugged_cpus();
1559 rcu_unregister_thread();
1563 static void *qemu_hax_cpu_thread_fn(void *arg)
1565 CPUState *cpu = arg;
1568 rcu_register_thread();
1569 qemu_mutex_lock_iothread();
1570 qemu_thread_get_self(cpu->thread);
1572 cpu->thread_id = qemu_get_thread_id();
1573 cpu->created = true;
1577 qemu_cond_signal(&qemu_cpu_cond);
1578 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1581 if (cpu_can_run(cpu)) {
1582 r = hax_smp_cpu_exec(cpu);
1583 if (r == EXCP_DEBUG) {
1584 cpu_handle_guest_debug(cpu);
1588 qemu_wait_io_event(cpu);
1589 } while (!cpu->unplug || cpu_can_run(cpu));
1590 rcu_unregister_thread();
1594 /* The HVF-specific vCPU thread function. This one should only run when the host
1595 * CPU supports the VMX "unrestricted guest" feature. */
1596 static void *qemu_hvf_cpu_thread_fn(void *arg)
1598 CPUState *cpu = arg;
1602 assert(hvf_enabled());
1604 rcu_register_thread();
1606 qemu_mutex_lock_iothread();
1607 qemu_thread_get_self(cpu->thread);
1609 cpu->thread_id = qemu_get_thread_id();
1615 /* signal CPU creation */
1616 cpu->created = true;
1617 qemu_cond_signal(&qemu_cpu_cond);
1618 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1621 if (cpu_can_run(cpu)) {
1622 r = hvf_vcpu_exec(cpu);
1623 if (r == EXCP_DEBUG) {
1624 cpu_handle_guest_debug(cpu);
1627 qemu_wait_io_event(cpu);
1628 } while (!cpu->unplug || cpu_can_run(cpu));
1630 hvf_vcpu_destroy(cpu);
1631 cpu->created = false;
1632 qemu_cond_signal(&qemu_cpu_cond);
1633 qemu_mutex_unlock_iothread();
1634 rcu_unregister_thread();
1638 static void *qemu_whpx_cpu_thread_fn(void *arg)
1640 CPUState *cpu = arg;
1643 rcu_register_thread();
1645 qemu_mutex_lock_iothread();
1646 qemu_thread_get_self(cpu->thread);
1647 cpu->thread_id = qemu_get_thread_id();
1650 r = whpx_init_vcpu(cpu);
1652 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1656 /* signal CPU creation */
1657 cpu->created = true;
1658 qemu_cond_signal(&qemu_cpu_cond);
1659 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1662 if (cpu_can_run(cpu)) {
1663 r = whpx_vcpu_exec(cpu);
1664 if (r == EXCP_DEBUG) {
1665 cpu_handle_guest_debug(cpu);
1668 while (cpu_thread_is_idle(cpu)) {
1669 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1671 qemu_wait_io_event_common(cpu);
1672 } while (!cpu->unplug || cpu_can_run(cpu));
1674 whpx_destroy_vcpu(cpu);
1675 cpu->created = false;
1676 qemu_cond_signal(&qemu_cpu_cond);
1677 qemu_mutex_unlock_iothread();
1678 rcu_unregister_thread();
1683 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1688 /* Multi-threaded TCG
1690 * In the multi-threaded case each vCPU has its own thread. The TLS
1691 * variable current_cpu can be used deep in the code to find the
1692 * current CPUState for a given thread.
1695 static void *qemu_tcg_cpu_thread_fn(void *arg)
1697 CPUState *cpu = arg;
1699 assert(tcg_enabled());
1700 g_assert(!use_icount);
1702 rcu_register_thread();
1703 tcg_register_thread();
1705 qemu_mutex_lock_iothread();
1706 qemu_thread_get_self(cpu->thread);
1708 cpu->thread_id = qemu_get_thread_id();
1709 cpu->created = true;
1712 qemu_cond_signal(&qemu_cpu_cond);
1713 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1715 /* process any pending work */
1716 cpu->exit_request = 1;
1719 if (cpu_can_run(cpu)) {
1721 qemu_mutex_unlock_iothread();
1722 r = tcg_cpu_exec(cpu);
1723 qemu_mutex_lock_iothread();
1726 cpu_handle_guest_debug(cpu);
1729 /* during start-up the vCPU is reset and the thread is
1730 * kicked several times. If we don't ensure we go back
1731 * to sleep in the halted state we won't cleanly
1732 * start up when the vCPU is enabled.
1734 * cpu->halted should ensure we sleep in wait_io_event
1736 g_assert(cpu->halted);
1739 qemu_mutex_unlock_iothread();
1740 cpu_exec_step_atomic(cpu);
1741 qemu_mutex_lock_iothread();
1743 /* Ignore everything else? */
1748 atomic_mb_set(&cpu->exit_request, 0);
1749 qemu_wait_io_event(cpu);
1750 } while (!cpu->unplug || cpu_can_run(cpu));
1752 qemu_tcg_destroy_vcpu(cpu);
1753 cpu->created = false;
1754 qemu_cond_signal(&qemu_cpu_cond);
1755 qemu_mutex_unlock_iothread();
1756 rcu_unregister_thread();
1760 static void qemu_cpu_kick_thread(CPUState *cpu)
1765 if (cpu->thread_kicked) {
1768 cpu->thread_kicked = true;
1769 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1770 if (err && err != ESRCH) {
1771 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1775 if (!qemu_cpu_is_self(cpu)) {
1776 if (whpx_enabled()) {
1777 whpx_vcpu_kick(cpu);
1778 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1779 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1780 __func__, GetLastError());
1787 void qemu_cpu_kick(CPUState *cpu)
1789 qemu_cond_broadcast(cpu->halt_cond);
1790 if (tcg_enabled()) {
1791 if (qemu_tcg_mttcg_enabled()) {
1794 qemu_cpu_kick_rr_cpus();
1797 if (hax_enabled()) {
1799 * FIXME: race condition with the exit_request check in
1802 cpu->exit_request = 1;
1804 qemu_cpu_kick_thread(cpu);
1808 void qemu_cpu_kick_self(void)
1810 assert(current_cpu);
1811 qemu_cpu_kick_thread(current_cpu);
1814 bool qemu_cpu_is_self(CPUState *cpu)
1816 return qemu_thread_is_self(cpu->thread);
1819 bool qemu_in_vcpu_thread(void)
1821 return current_cpu && qemu_cpu_is_self(current_cpu);
1824 static __thread bool iothread_locked = false;
1826 bool qemu_mutex_iothread_locked(void)
1828 return iothread_locked;
1832 * The BQL is taken from so many places that it is worth profiling the
1833 * callers directly, instead of funneling them all through a single function.
1835 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1837 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1839 g_assert(!qemu_mutex_iothread_locked());
1840 bql_lock(&qemu_global_mutex, file, line);
1841 iothread_locked = true;
1844 void qemu_mutex_unlock_iothread(void)
1846 g_assert(qemu_mutex_iothread_locked());
1847 iothread_locked = false;
1848 qemu_mutex_unlock(&qemu_global_mutex);
1851 void qemu_cond_wait_iothread(QemuCond *cond)
1853 qemu_cond_wait(cond, &qemu_global_mutex);
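/*
 * A minimal sketch (not taken from a real caller) of how code running outside
 * a vCPU thread is expected to use the BQL helpers above:
 *
 *     if (!qemu_mutex_iothread_locked()) {
 *         qemu_mutex_lock_iothread();
 *         // ... touch state shared with the vCPU threads ...
 *         qemu_mutex_unlock_iothread();
 *     }
 */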
1856 static bool all_vcpus_paused(void)
1861 if (!cpu->stopped) {
1869 void pause_all_vcpus(void)
1873 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1875 if (qemu_cpu_is_self(cpu)) {
1876 qemu_cpu_stop(cpu, true);
1883 /* We need to drop the replay_lock so any vCPU threads woken up
1884 * can finish their replay tasks
1886 replay_mutex_unlock();
1888 while (!all_vcpus_paused()) {
1889 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1895 qemu_mutex_unlock_iothread();
1896 replay_mutex_lock();
1897 qemu_mutex_lock_iothread();
1900 void cpu_resume(CPUState *cpu)
1903 cpu->stopped = false;
1907 void resume_all_vcpus(void)
1911 if (!runstate_is_running()) {
1915 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1921 void cpu_remove_sync(CPUState *cpu)
1926 qemu_mutex_unlock_iothread();
1927 qemu_thread_join(cpu->thread);
1928 qemu_mutex_lock_iothread();
1931 /* Size of temporary buffers used for forming a vCPU thread name */
1932 #define VCPU_THREAD_NAME_SIZE 16
1934 static void qemu_tcg_init_vcpu(CPUState *cpu)
1936 char thread_name[VCPU_THREAD_NAME_SIZE];
1937 static QemuCond *single_tcg_halt_cond;
1938 static QemuThread *single_tcg_cpu_thread;
1939 static int tcg_region_inited;
1941 assert(tcg_enabled());
1943 * Initialize TCG regions--once. Now is a good time, because:
1944 * (1) TCG's init context, prologue and target globals have been set up.
1945 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1946 * -accel flag is processed, so the check doesn't work then).
1948 if (!tcg_region_inited) {
1949 tcg_region_inited = 1;
1953 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1954 cpu->thread = g_malloc0(sizeof(QemuThread));
1955 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1956 qemu_cond_init(cpu->halt_cond);
1958 if (qemu_tcg_mttcg_enabled()) {
1959 /* create a thread per vCPU with TCG (MTTCG) */
1960 parallel_cpus = true;
1961 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1964 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1965 cpu, QEMU_THREAD_JOINABLE);
1968 /* share a single thread for all cpus with TCG */
1969 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1970 qemu_thread_create(cpu->thread, thread_name,
1971 qemu_tcg_rr_cpu_thread_fn,
1972 cpu, QEMU_THREAD_JOINABLE);
1974 single_tcg_halt_cond = cpu->halt_cond;
1975 single_tcg_cpu_thread = cpu->thread;
1978 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1981 /* For non-MTTCG cases we share the thread */
1982 cpu->thread = single_tcg_cpu_thread;
1983 cpu->halt_cond = single_tcg_halt_cond;
1984 cpu->thread_id = first_cpu->thread_id;
1986 cpu->created = true;
1990 static void qemu_hax_start_vcpu(CPUState *cpu)
1992 char thread_name[VCPU_THREAD_NAME_SIZE];
1994 cpu->thread = g_malloc0(sizeof(QemuThread));
1995 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1996 qemu_cond_init(cpu->halt_cond);
1998 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2000 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2001 cpu, QEMU_THREAD_JOINABLE);
2003 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2007 static void qemu_kvm_start_vcpu(CPUState *cpu)
2009 char thread_name[VCPU_THREAD_NAME_SIZE];
2011 cpu->thread = g_malloc0(sizeof(QemuThread));
2012 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2013 qemu_cond_init(cpu->halt_cond);
2014 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2016 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2017 cpu, QEMU_THREAD_JOINABLE);
2020 static void qemu_hvf_start_vcpu(CPUState *cpu)
2022 char thread_name[VCPU_THREAD_NAME_SIZE];
2024 /* HVF currently does not support TCG, and only runs in
2025 * unrestricted-guest mode. */
2026 assert(hvf_enabled());
2028 cpu->thread = g_malloc0(sizeof(QemuThread));
2029 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2030 qemu_cond_init(cpu->halt_cond);
2032 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2034 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2035 cpu, QEMU_THREAD_JOINABLE);
2038 static void qemu_whpx_start_vcpu(CPUState *cpu)
2040 char thread_name[VCPU_THREAD_NAME_SIZE];
2042 cpu->thread = g_malloc0(sizeof(QemuThread));
2043 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2044 qemu_cond_init(cpu->halt_cond);
2045 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2047 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2048 cpu, QEMU_THREAD_JOINABLE);
2050 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2054 static void qemu_dummy_start_vcpu(CPUState *cpu)
2056 char thread_name[VCPU_THREAD_NAME_SIZE];
2058 cpu->thread = g_malloc0(sizeof(QemuThread));
2059 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2060 qemu_cond_init(cpu->halt_cond);
2061 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2063 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2064 QEMU_THREAD_JOINABLE);
2067 void qemu_init_vcpu(CPUState *cpu)
2069 MachineState *ms = MACHINE(qdev_get_machine());
2071 cpu->nr_cores = ms->smp.cores;
2072 cpu->nr_threads = ms->smp.threads;
2073 cpu->stopped = true;
2074 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2077 /* If the target cpu hasn't set up any address spaces itself,
2078 * give it the default one.
2081 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2084 if (kvm_enabled()) {
2085 qemu_kvm_start_vcpu(cpu);
2086 } else if (hax_enabled()) {
2087 qemu_hax_start_vcpu(cpu);
2088 } else if (hvf_enabled()) {
2089 qemu_hvf_start_vcpu(cpu);
2090 } else if (tcg_enabled()) {
2091 qemu_tcg_init_vcpu(cpu);
2092 } else if (whpx_enabled()) {
2093 qemu_whpx_start_vcpu(cpu);
2095 qemu_dummy_start_vcpu(cpu);
2098 while (!cpu->created) {
2099 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2103 void cpu_stop_current(void)
2106 current_cpu->stop = true;
2107 cpu_exit(current_cpu);
2111 int vm_stop(RunState state)
2113 if (qemu_in_vcpu_thread()) {
2114 qemu_system_vmstop_request_prepare();
2115 qemu_system_vmstop_request(state);
2117 * FIXME: should not return to device code in case
2118 * vm_stop() has been requested.
2124 return do_vm_stop(state, true);
2128 * Prepare for (re)starting the VM.
2129 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2130 * running or in case of an error condition), 0 otherwise.
2132 int vm_prepare_start(void)
2136 qemu_vmstop_requested(&requested);
2137 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2141 /* Ensure that a STOP/RESUME pair of events is emitted if a
2142 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2143 * example, is documented to always be followed by the STOP event.
2146 if (runstate_is_running()) {
2147 qapi_event_send_stop();
2148 qapi_event_send_resume();
2152 /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2153 qapi_event_send_resume();
2156 runstate_set(RUN_STATE_RUNNING);
2157 vm_state_notify(1, RUN_STATE_RUNNING);
2163 if (!vm_prepare_start()) {
2168 /* Does a state transition even if the VM is already stopped;
2169 the current state is forgotten forever */
2170 int vm_stop_force_state(RunState state)
2172 if (runstate_is_running()) {
2173 return vm_stop(state);
2175 runstate_set(state);
2178 /* Make sure to return an error if the flush in a previous vm_stop() failed.
2180 return bdrv_flush_all();
2184 void list_cpus(const char *optarg)
2186 /* XXX: implement xxx_cpu_list for targets that still miss it */
2187 #if defined(cpu_list)
2192 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2193 bool has_cpu, int64_t cpu_index, Error **errp)
2199 int64_t orig_addr = addr, orig_size = size;
2205 cpu = qemu_get_cpu(cpu_index);
2207 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2212 f = fopen(filename, "wb");
2214 error_setg_file_open(errp, errno, filename);
2222 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2223 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2224 " specified", orig_addr, orig_size);
2227 if (fwrite(buf, 1, l, f) != l) {
2228 error_setg(errp, QERR_IO_ERROR);
2239 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2246 f = fopen(filename, "wb");
2248 error_setg_file_open(errp, errno, filename);
2256 cpu_physical_memory_read(addr, buf, l);
2257 if (fwrite(buf, 1, l, f) != l) {
2258 error_setg(errp, QERR_IO_ERROR);
2269 void qmp_inject_nmi(Error **errp)
2271 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2274 void dump_drift_info(void)
2280 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2281 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2282 if (icount_align_option) {
2283 qemu_printf("Max guest delay %"PRIi64" ms\n",
2284 -max_delay / SCALE_MS);
2285 qemu_printf("Max guest advance %"PRIi64" ms\n",
2286 max_advance / SCALE_MS);
2288 qemu_printf("Max guest delay NA\n");
2289 qemu_printf("Max guest advance NA\n");