[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "qemu/plugin.h"
49 #include "sysemu/cpus.h"
50 #include "sysemu/qtest.h"
51 #include "qemu/main-loop.h"
52 #include "qemu/option.h"
53 #include "qemu/bitmap.h"
54 #include "qemu/seqlock.h"
55 #include "qemu/guest-random.h"
56 #include "tcg.h"
57 #include "hw/nmi.h"
58 #include "sysemu/replay.h"
59 #include "sysemu/runstate.h"
60 #include "hw/boards.h"
61 #include "hw/hw.h"
62
63 #ifdef CONFIG_LINUX
64
65 #include <sys/prctl.h>
66
67 #ifndef PR_MCE_KILL
68 #define PR_MCE_KILL 33
69 #endif
70
71 #ifndef PR_MCE_KILL_SET
72 #define PR_MCE_KILL_SET 1
73 #endif
74
75 #ifndef PR_MCE_KILL_EARLY
76 #define PR_MCE_KILL_EARLY 1
77 #endif
78
79 #endif /* CONFIG_LINUX */
80
81 static QemuMutex qemu_global_mutex;
82
83 int64_t max_delay;
84 int64_t max_advance;
85
86 /* vcpu throttling controls */
87 static QEMUTimer *throttle_timer;
88 static unsigned int throttle_percentage;
89
90 #define CPU_THROTTLE_PCT_MIN 1
91 #define CPU_THROTTLE_PCT_MAX 99
92 #define CPU_THROTTLE_TIMESLICE_NS 10000000
93
94 bool cpu_is_stopped(CPUState *cpu)
95 {
96     return cpu->stopped || !runstate_is_running();
97 }
98
99 static bool cpu_thread_is_idle(CPUState *cpu)
100 {
101     if (cpu->stop || cpu->queued_work_first) {
102         return false;
103     }
104     if (cpu_is_stopped(cpu)) {
105         return true;
106     }
107     if (!cpu->halted || cpu_has_work(cpu) ||
108         kvm_halt_in_kernel()) {
109         return false;
110     }
111     return true;
112 }
113
114 static bool all_cpu_threads_idle(void)
115 {
116     CPUState *cpu;
117
118     CPU_FOREACH(cpu) {
119         if (!cpu_thread_is_idle(cpu)) {
120             return false;
121         }
122     }
123     return true;
124 }
125
126 /***********************************************************/
127 /* guest cycle counter */
128
129 /* Protected by TimersState seqlock */
130
131 static bool icount_sleep = true;
132 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
133 #define MAX_ICOUNT_SHIFT 10
134
135 typedef struct TimersState {
136     /* Protected by BQL.  */
137     int64_t cpu_ticks_prev;
138     int64_t cpu_ticks_offset;
139
140     /* Protect fields that can be read outside the BQL and/or written
141      * from multiple threads.
142      */
143     QemuSeqLock vm_clock_seqlock;
144     QemuSpin vm_clock_lock;
145
146     int16_t cpu_ticks_enabled;
147
148     /* Conversion factor from emulated instructions to virtual clock ticks.  */
149     int16_t icount_time_shift;
150
151     /* Compensate for varying guest execution speed.  */
152     int64_t qemu_icount_bias;
153
154     int64_t vm_clock_warp_start;
155     int64_t cpu_clock_offset;
156
157     /* Only written by TCG thread */
158     int64_t qemu_icount;
159
160     /* for adjusting icount */
161     QEMUTimer *icount_rt_timer;
162     QEMUTimer *icount_vm_timer;
163     QEMUTimer *icount_warp_timer;
164 } TimersState;
165
166 static TimersState timers_state;
167 bool mttcg_enabled;
168
169
170 /* The current number of executed instructions is based on what we
171  * originally budgeted minus the current state of the decrementing
172  * icount counters in extra/u16.low.
173  */
174 static int64_t cpu_get_icount_executed(CPUState *cpu)
175 {
176     return (cpu->icount_budget -
177             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
178 }
179
180 /*
181  * Update the global shared timer_state.qemu_icount to take into
182  * account executed instructions. This is done by the TCG vCPU
183  * thread so the main-loop can see time has moved forward.
184  */
185 static void cpu_update_icount_locked(CPUState *cpu)
186 {
187     int64_t executed = cpu_get_icount_executed(cpu);
188     cpu->icount_budget -= executed;
189
190     atomic_set_i64(&timers_state.qemu_icount,
191                    timers_state.qemu_icount + executed);
192 }
193
194 /*
195  * Update the global shared timer_state.qemu_icount to take into
196  * account executed instructions. This is done by the TCG vCPU
197  * thread so the main-loop can see time has moved forward.
198  */
199 void cpu_update_icount(CPUState *cpu)
200 {
201     seqlock_write_lock(&timers_state.vm_clock_seqlock,
202                        &timers_state.vm_clock_lock);
203     cpu_update_icount_locked(cpu);
204     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
205                          &timers_state.vm_clock_lock);
206 }
207
208 static int64_t cpu_get_icount_raw_locked(void)
209 {
210     CPUState *cpu = current_cpu;
211
212     if (cpu && cpu->running) {
213         if (!cpu->can_do_io) {
214             error_report("Bad icount read");
215             exit(1);
216         }
217         /* Take into account what has run */
218         cpu_update_icount_locked(cpu);
219     }
220     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
221     return atomic_read_i64(&timers_state.qemu_icount);
222 }
223
224 static int64_t cpu_get_icount_locked(void)
225 {
226     int64_t icount = cpu_get_icount_raw_locked();
227     return atomic_read_i64(&timers_state.qemu_icount_bias) +
228         cpu_icount_to_ns(icount);
229 }
230
231 int64_t cpu_get_icount_raw(void)
232 {
233     int64_t icount;
234     unsigned start;
235
236     do {
237         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
238         icount = cpu_get_icount_raw_locked();
239     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
240
241     return icount;
242 }
243
244 /* Return the virtual CPU time, based on the instruction counter.  */
245 int64_t cpu_get_icount(void)
246 {
247     int64_t icount;
248     unsigned start;
249
250     do {
251         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
252         icount = cpu_get_icount_locked();
253     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
254
255     return icount;
256 }
257
258 int64_t cpu_icount_to_ns(int64_t icount)
259 {
260     return icount << atomic_read(&timers_state.icount_time_shift);
261 }
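
/*
 * For example: with icount_time_shift == 3, each emulated instruction
 * accounts for 1 << 3 = 8 ns of QEMU_CLOCK_VIRTUAL time, i.e. roughly
 * 125 million instructions per emulated second, which matches the
 * "125MIPS" initial guess that configure_icount() installs below.
 */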
262
263 static int64_t cpu_get_ticks_locked(void)
264 {
265     int64_t ticks = timers_state.cpu_ticks_offset;
266     if (timers_state.cpu_ticks_enabled) {
267         ticks += cpu_get_host_ticks();
268     }
269
270     if (timers_state.cpu_ticks_prev > ticks) {
271         /* Non-increasing ticks may happen if the host uses software suspend.  */
272         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
273         ticks = timers_state.cpu_ticks_prev;
274     }
275
276     timers_state.cpu_ticks_prev = ticks;
277     return ticks;
278 }
279
280 /* Return the time elapsed in the VM between vm_start and vm_stop.  Unless
281  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
282  * counter.
283  */
284 int64_t cpu_get_ticks(void)
285 {
286     int64_t ticks;
287
288     if (use_icount) {
289         return cpu_get_icount();
290     }
291
292     qemu_spin_lock(&timers_state.vm_clock_lock);
293     ticks = cpu_get_ticks_locked();
294     qemu_spin_unlock(&timers_state.vm_clock_lock);
295     return ticks;
296 }
297
298 static int64_t cpu_get_clock_locked(void)
299 {
300     int64_t time;
301
302     time = timers_state.cpu_clock_offset;
303     if (timers_state.cpu_ticks_enabled) {
304         time += get_clock();
305     }
306
307     return time;
308 }
309
310 /* Return the monotonic time elapsed in VM, i.e.,
311  * the time between vm_start and vm_stop
312  */
313 int64_t cpu_get_clock(void)
314 {
315     int64_t ti;
316     unsigned start;
317
318     do {
319         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
320         ti = cpu_get_clock_locked();
321     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
322
323     return ti;
324 }
325
326 /* enable cpu_get_ticks()
327  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
328  */
329 void cpu_enable_ticks(void)
330 {
331     seqlock_write_lock(&timers_state.vm_clock_seqlock,
332                        &timers_state.vm_clock_lock);
333     if (!timers_state.cpu_ticks_enabled) {
334         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
335         timers_state.cpu_clock_offset -= get_clock();
336         timers_state.cpu_ticks_enabled = 1;
337     }
338     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
339                        &timers_state.vm_clock_lock);
340 }
341
342 /* disable cpu_get_ticks() : the clock is stopped. You must not call
343  * cpu_get_ticks() after that.
344  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
345  */
346 void cpu_disable_ticks(void)
347 {
348     seqlock_write_lock(&timers_state.vm_clock_seqlock,
349                        &timers_state.vm_clock_lock);
350     if (timers_state.cpu_ticks_enabled) {
351         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
352         timers_state.cpu_clock_offset = cpu_get_clock_locked();
353         timers_state.cpu_ticks_enabled = 0;
354     }
355     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
356                          &timers_state.vm_clock_lock);
357 }
358
359 /* Correlation between real and virtual time is always going to be
360    fairly approximate, so ignore small variation.
361    When the guest is idle real and virtual time will be aligned in
362    the IO wait loop.  */
363 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
364
365 static void icount_adjust(void)
366 {
367     int64_t cur_time;
368     int64_t cur_icount;
369     int64_t delta;
370
371     /* Protected by TimersState mutex.  */
372     static int64_t last_delta;
373
374     /* If the VM is not running, then do nothing.  */
375     if (!runstate_is_running()) {
376         return;
377     }
378
379     seqlock_write_lock(&timers_state.vm_clock_seqlock,
380                        &timers_state.vm_clock_lock);
381     cur_time = cpu_get_clock_locked();
382     cur_icount = cpu_get_icount_locked();
383
384     delta = cur_icount - cur_time;
385     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
386     if (delta > 0
387         && last_delta + ICOUNT_WOBBLE < delta * 2
388         && timers_state.icount_time_shift > 0) {
389         /* The guest is getting too far ahead.  Slow time down.  */
390         atomic_set(&timers_state.icount_time_shift,
391                    timers_state.icount_time_shift - 1);
392     }
393     if (delta < 0
394         && last_delta - ICOUNT_WOBBLE > delta * 2
395         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
396         /* The guest is getting too far behind.  Speed time up.  */
397         atomic_set(&timers_state.icount_time_shift,
398                    timers_state.icount_time_shift + 1);
399     }
400     last_delta = delta;
401     atomic_set_i64(&timers_state.qemu_icount_bias,
402                    cur_icount - (timers_state.qemu_icount
403                                  << timers_state.icount_time_shift));
404     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
405                          &timers_state.vm_clock_lock);
406 }
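
/*
 * Net effect of icount_adjust(): when the virtual clock (cur_icount) runs
 * ahead of real time, icount_time_shift is decremented so each instruction
 * accounts for fewer nanoseconds and virtual time advances more slowly;
 * when it lags behind, the shift is incremented (up to MAX_ICOUNT_SHIFT).
 * qemu_icount_bias is then recomputed so that the virtual clock value seen
 * by cpu_get_icount_locked() does not jump when the shift changes.
 */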
407
408 static void icount_adjust_rt(void *opaque)
409 {
410     timer_mod(timers_state.icount_rt_timer,
411               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
412     icount_adjust();
413 }
414
415 static void icount_adjust_vm(void *opaque)
416 {
417     timer_mod(timers_state.icount_vm_timer,
418                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
419                    NANOSECONDS_PER_SECOND / 10);
420     icount_adjust();
421 }
422
423 static int64_t qemu_icount_round(int64_t count)
424 {
425     int shift = atomic_read(&timers_state.icount_time_shift);
426     return (count + (1 << shift) - 1) >> shift;
427 }
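
/*
 * qemu_icount_round() is a ceiling division by 2^shift: it converts a
 * deadline in nanoseconds into an instruction budget.  For example, with
 * shift == 3 a 20 ns deadline rounds up to (20 + 7) >> 3 = 3 instructions.
 */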
428
429 static void icount_warp_rt(void)
430 {
431     unsigned seq;
432     int64_t warp_start;
433
434     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
435      * changes from -1 to another value, so the race here is okay.
436      */
437     do {
438         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
439         warp_start = timers_state.vm_clock_warp_start;
440     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
441
442     if (warp_start == -1) {
443         return;
444     }
445
446     seqlock_write_lock(&timers_state.vm_clock_seqlock,
447                        &timers_state.vm_clock_lock);
448     if (runstate_is_running()) {
449         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
450                                             cpu_get_clock_locked());
451         int64_t warp_delta;
452
453         warp_delta = clock - timers_state.vm_clock_warp_start;
454         if (use_icount == 2) {
455             /*
456              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
457              * far ahead of real time.
458              */
459             int64_t cur_icount = cpu_get_icount_locked();
460             int64_t delta = clock - cur_icount;
461             warp_delta = MIN(warp_delta, delta);
462         }
463         atomic_set_i64(&timers_state.qemu_icount_bias,
464                        timers_state.qemu_icount_bias + warp_delta);
465     }
466     timers_state.vm_clock_warp_start = -1;
467     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
468                        &timers_state.vm_clock_lock);
469
470     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
471         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
472     }
473 }
474
475 static void icount_timer_cb(void *opaque)
476 {
477     /* No need for a checkpoint because the timer already synchronizes
478      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
479      */
480     icount_warp_rt();
481 }
482
483 void qtest_clock_warp(int64_t dest)
484 {
485     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
486     AioContext *aio_context;
487     assert(qtest_enabled());
488     aio_context = qemu_get_aio_context();
489     while (clock < dest) {
490         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
491                                                       QEMU_TIMER_ATTR_ALL);
492         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
493
494         seqlock_write_lock(&timers_state.vm_clock_seqlock,
495                            &timers_state.vm_clock_lock);
496         atomic_set_i64(&timers_state.qemu_icount_bias,
497                        timers_state.qemu_icount_bias + warp);
498         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
499                              &timers_state.vm_clock_lock);
500
501         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
502         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
503         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
504     }
505     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
506 }
507
508 void qemu_start_warp_timer(void)
509 {
510     int64_t clock;
511     int64_t deadline;
512
513     if (!use_icount) {
514         return;
515     }
516
517     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
518      * do not fire, so computing the deadline does not make sense.
519      */
520     if (!runstate_is_running()) {
521         return;
522     }
523
524     if (replay_mode != REPLAY_MODE_PLAY) {
525         if (!all_cpu_threads_idle()) {
526             return;
527         }
528
529         if (qtest_enabled()) {
530             /* When testing, qtest commands advance icount.  */
531             return;
532         }
533
534         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
535     } else {
536         /* warp clock deterministically in record/replay mode */
537         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
538             /* The vCPU is sleeping and the warp can't be started.
539                This is probably a race condition: the notification sent
540                to the vCPU was processed in advance and the vCPU went to sleep.
541                Therefore we have to wake it up to do something. */
542             if (replay_has_checkpoint()) {
543                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
544             }
545             return;
546         }
547     }
548
549     /* We want to use the earliest deadline from ALL vm_clocks */
550     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
551     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
552                                           ~QEMU_TIMER_ATTR_EXTERNAL);
553     if (deadline < 0) {
554         static bool notified;
555         if (!icount_sleep && !notified) {
556             warn_report("icount sleep disabled and no active timers");
557             notified = true;
558         }
559         return;
560     }
561
562     if (deadline > 0) {
563         /*
564          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
565          * sleep.  Otherwise, the CPU might be waiting for a future timer
566          * interrupt to wake it up, but the interrupt never comes because
567          * the vCPU isn't running any insns and thus doesn't advance the
568          * QEMU_CLOCK_VIRTUAL.
569          */
570         if (!icount_sleep) {
571             /*
572              * We never let VCPUs sleep in no-sleep icount mode.
573              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
574              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
575              * It is useful when we want a deterministic execution time,
576              * isolated from host latencies.
577              */
578             seqlock_write_lock(&timers_state.vm_clock_seqlock,
579                                &timers_state.vm_clock_lock);
580             atomic_set_i64(&timers_state.qemu_icount_bias,
581                            timers_state.qemu_icount_bias + deadline);
582             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
583                                  &timers_state.vm_clock_lock);
584             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
585         } else {
586             /*
587              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
588              * "real" time, (related to the time left until the next event) has
589              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
590              * This prevents the warps from being visible externally; for example,
591              * you will not be sending network packets continuously instead of
592              * every 100ms.
593              */
594             seqlock_write_lock(&timers_state.vm_clock_seqlock,
595                                &timers_state.vm_clock_lock);
596             if (timers_state.vm_clock_warp_start == -1
597                 || timers_state.vm_clock_warp_start > clock) {
598                 timers_state.vm_clock_warp_start = clock;
599             }
600             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
601                                  &timers_state.vm_clock_lock);
602             timer_mod_anticipate(timers_state.icount_warp_timer,
603                                  clock + deadline);
604         }
605     } else if (deadline == 0) {
606         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
607     }
608 }
609
610 static void qemu_account_warp_timer(void)
611 {
612     if (!use_icount || !icount_sleep) {
613         return;
614     }
615
616     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
617      * do not fire, so computing the deadline does not make sense.
618      */
619     if (!runstate_is_running()) {
620         return;
621     }
622
623     /* warp clock deterministically in record/replay mode */
624     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
625         return;
626     }
627
628     timer_del(timers_state.icount_warp_timer);
629     icount_warp_rt();
630 }
631
632 static bool icount_state_needed(void *opaque)
633 {
634     return use_icount;
635 }
636
637 static bool warp_timer_state_needed(void *opaque)
638 {
639     TimersState *s = opaque;
640     return s->icount_warp_timer != NULL;
641 }
642
643 static bool adjust_timers_state_needed(void *opaque)
644 {
645     TimersState *s = opaque;
646     return s->icount_rt_timer != NULL;
647 }
648
649 /*
650  * The subsection for warp timer migration is optional, because the timer may not be created
651  */
652 static const VMStateDescription icount_vmstate_warp_timer = {
653     .name = "timer/icount/warp_timer",
654     .version_id = 1,
655     .minimum_version_id = 1,
656     .needed = warp_timer_state_needed,
657     .fields = (VMStateField[]) {
658         VMSTATE_INT64(vm_clock_warp_start, TimersState),
659         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
660         VMSTATE_END_OF_LIST()
661     }
662 };
663
664 static const VMStateDescription icount_vmstate_adjust_timers = {
665     .name = "timer/icount/timers",
666     .version_id = 1,
667     .minimum_version_id = 1,
668     .needed = adjust_timers_state_needed,
669     .fields = (VMStateField[]) {
670         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
671         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
672         VMSTATE_END_OF_LIST()
673     }
674 };
675
676 /*
677  * This is a subsection for icount migration.
678  */
679 static const VMStateDescription icount_vmstate_timers = {
680     .name = "timer/icount",
681     .version_id = 1,
682     .minimum_version_id = 1,
683     .needed = icount_state_needed,
684     .fields = (VMStateField[]) {
685         VMSTATE_INT64(qemu_icount_bias, TimersState),
686         VMSTATE_INT64(qemu_icount, TimersState),
687         VMSTATE_END_OF_LIST()
688     },
689     .subsections = (const VMStateDescription*[]) {
690         &icount_vmstate_warp_timer,
691         &icount_vmstate_adjust_timers,
692         NULL
693     }
694 };
695
696 static const VMStateDescription vmstate_timers = {
697     .name = "timer",
698     .version_id = 2,
699     .minimum_version_id = 1,
700     .fields = (VMStateField[]) {
701         VMSTATE_INT64(cpu_ticks_offset, TimersState),
702         VMSTATE_UNUSED(8),
703         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
704         VMSTATE_END_OF_LIST()
705     },
706     .subsections = (const VMStateDescription*[]) {
707         &icount_vmstate_timers,
708         NULL
709     }
710 };
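
/*
 * The icount state above is migrated as optional subsections: each .needed
 * callback is evaluated on the source at save time, so migration still
 * works against configurations where icount (or the warp/adjust timers)
 * was never set up.
 */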
711
712 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
713 {
714     double pct;
715     double throttle_ratio;
716     int64_t sleeptime_ns, endtime_ns;
717
718     if (!cpu_throttle_get_percentage()) {
719         return;
720     }
721
722     pct = (double)cpu_throttle_get_percentage()/100;
723     throttle_ratio = pct / (1 - pct);
724     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
725     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
726     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
727     while (sleeptime_ns > 0 && !cpu->stop) {
728         if (sleeptime_ns > SCALE_MS) {
729             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
730                                 sleeptime_ns / SCALE_MS);
731         } else {
732             qemu_mutex_unlock_iothread();
733             g_usleep(sleeptime_ns / SCALE_US);
734             qemu_mutex_lock_iothread();
735         }
736         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
737     }
738     atomic_set(&cpu->throttle_thread_scheduled, 0);
739 }
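
/*
 * Worked example of the throttling arithmetic: at a 75% throttle, pct is
 * 0.75 and throttle_ratio is 0.75 / 0.25 = 3, so the vCPU sleeps roughly
 * 3 * CPU_THROTTLE_TIMESLICE_NS = 30 ms for every 10 ms timeslice of
 * execution, i.e. it runs for about 25% of wall-clock time.
 */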
740
741 static void cpu_throttle_timer_tick(void *opaque)
742 {
743     CPUState *cpu;
744     double pct;
745
746     /* Stop the timer if needed */
747     if (!cpu_throttle_get_percentage()) {
748         return;
749     }
750     CPU_FOREACH(cpu) {
751         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
752             async_run_on_cpu(cpu, cpu_throttle_thread,
753                              RUN_ON_CPU_NULL);
754         }
755     }
756
757     pct = (double)cpu_throttle_get_percentage()/100;
758     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
759                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
760 }
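
/*
 * The tick period scales with the throttle percentage: the timer fires
 * every CPU_THROTTLE_TIMESLICE_NS / (1 - pct) ns, so at a 75% throttle the
 * throttle work is rescheduled every 40 ms: one 10 ms timeslice of
 * execution plus the 30 ms sleep computed in cpu_throttle_thread().
 */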
761
762 void cpu_throttle_set(int new_throttle_pct)
763 {
764     /* Ensure throttle percentage is within valid range */
765     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
766     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
767
768     atomic_set(&throttle_percentage, new_throttle_pct);
769
770     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
771                                        CPU_THROTTLE_TIMESLICE_NS);
772 }
773
774 void cpu_throttle_stop(void)
775 {
776     atomic_set(&throttle_percentage, 0);
777 }
778
779 bool cpu_throttle_active(void)
780 {
781     return (cpu_throttle_get_percentage() != 0);
782 }
783
784 int cpu_throttle_get_percentage(void)
785 {
786     return atomic_read(&throttle_percentage);
787 }
788
789 void cpu_ticks_init(void)
790 {
791     seqlock_init(&timers_state.vm_clock_seqlock);
792     qemu_spin_init(&timers_state.vm_clock_lock);
793     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
794     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
795                                            cpu_throttle_timer_tick, NULL);
796 }
797
798 void configure_icount(QemuOpts *opts, Error **errp)
799 {
800     const char *option;
801     char *rem_str = NULL;
802
803     option = qemu_opt_get(opts, "shift");
804     if (!option) {
805         if (qemu_opt_get(opts, "align") != NULL) {
806             error_setg(errp, "Please specify shift option when using align");
807         }
808         return;
809     }
810
811     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
812     if (icount_sleep) {
813         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
814                                          icount_timer_cb, NULL);
815     }
816
817     icount_align_option = qemu_opt_get_bool(opts, "align", false);
818
819     if (icount_align_option && !icount_sleep) {
820         error_setg(errp, "align=on and sleep=off are incompatible");
821     }
822     if (strcmp(option, "auto") != 0) {
823         errno = 0;
824         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
825         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
826             error_setg(errp, "icount: Invalid shift value");
827         }
828         use_icount = 1;
829         return;
830     } else if (icount_align_option) {
831         error_setg(errp, "shift=auto and align=on are incompatible");
832     } else if (!icount_sleep) {
833         error_setg(errp, "shift=auto and sleep=off are incompatible");
834     }
835
836     use_icount = 2;
837
838     /* 125MIPS seems a reasonable initial guess at the guest speed.
839        It will be corrected fairly quickly anyway.  */
840     timers_state.icount_time_shift = 3;
841
842     /* Have both realtime and virtual time triggers for speed adjustment.
843        The realtime trigger catches emulated time passing too slowly,
844        the virtual time trigger catches emulated time passing too fast.
845        Realtime triggers occur even when idle, so use them less frequently
846        than VM triggers.  */
847     timers_state.vm_clock_warp_start = -1;
848     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
849                                    icount_adjust_rt, NULL);
850     timer_mod(timers_state.icount_rt_timer,
851                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
852     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
853                                         icount_adjust_vm, NULL);
854     timer_mod(timers_state.icount_vm_timer,
855                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
856                    NANOSECONDS_PER_SECOND / 10);
857 }
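
/*
 * For reference, these options are normally supplied on the command line
 * via -icount, e.g. "-icount shift=7,sleep=on" for a fixed shift or
 * "-icount shift=auto" for the adaptive (use_icount == 2) mode configured
 * above; this mapping is illustrative, see the -icount documentation for
 * the authoritative syntax.
 */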
858
859 /***********************************************************/
860 /* TCG vCPU kick timer
861  *
862  * The kick timer is responsible for moving single-threaded vCPU
863  * emulation on to the next vCPU. If more than one vCPU is running, a
864  * timer event will force a cpu->exit so the next vCPU can get
865  * scheduled.
866  *
867  * The timer is removed while all vCPUs are idle and restarted again
868  * once they stop being idle.
869  */
870
871 static QEMUTimer *tcg_kick_vcpu_timer;
872 static CPUState *tcg_current_rr_cpu;
873
874 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
875
876 static inline int64_t qemu_tcg_next_kick(void)
877 {
878     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
879 }
880
881 /* Kick the currently round-robin scheduled vCPU on to the next one */
882 static void qemu_cpu_kick_rr_next_cpu(void)
883 {
884     CPUState *cpu;
885     do {
886         cpu = atomic_mb_read(&tcg_current_rr_cpu);
887         if (cpu) {
888             cpu_exit(cpu);
889         }
890     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
891 }
892
893 /* Kick all RR vCPUs */
894 static void qemu_cpu_kick_rr_cpus(void)
895 {
896     CPUState *cpu;
897
898     CPU_FOREACH(cpu) {
899         cpu_exit(cpu);
900     }
901 }
902
903 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
904 {
905 }
906
907 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
908 {
909     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
910         qemu_notify_event();
911         return;
912     }
913
914     if (qemu_in_vcpu_thread()) {
915         /* A CPU is currently running; kick it back out to the
916          * tcg_cpu_exec() loop so it will recalculate its
917          * icount deadline immediately.
918          */
919         qemu_cpu_kick(current_cpu);
920     } else if (first_cpu) {
921         /* qemu_cpu_kick is not enough to kick a halted CPU out of
922          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
923          * causes cpu_thread_is_idle to return false.  This way,
924          * handle_icount_deadline can run.
925          * If we have no CPUs at all for some reason, we don't
926          * need to do anything.
927          */
928         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
929     }
930 }
931
932 static void kick_tcg_thread(void *opaque)
933 {
934     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
935     qemu_cpu_kick_rr_next_cpu();
936 }
937
938 static void start_tcg_kick_timer(void)
939 {
940     assert(!mttcg_enabled);
941     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
942         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
943                                            kick_tcg_thread, NULL);
944     }
945     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
946         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
947     }
948 }
949
950 static void stop_tcg_kick_timer(void)
951 {
952     assert(!mttcg_enabled);
953     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
954         timer_del(tcg_kick_vcpu_timer);
955     }
956 }
957
958 /***********************************************************/
959 void hw_error(const char *fmt, ...)
960 {
961     va_list ap;
962     CPUState *cpu;
963
964     va_start(ap, fmt);
965     fprintf(stderr, "qemu: hardware error: ");
966     vfprintf(stderr, fmt, ap);
967     fprintf(stderr, "\n");
968     CPU_FOREACH(cpu) {
969         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
970         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
971     }
972     va_end(ap);
973     abort();
974 }
975
976 void cpu_synchronize_all_states(void)
977 {
978     CPUState *cpu;
979
980     CPU_FOREACH(cpu) {
981         cpu_synchronize_state(cpu);
982         /* TODO: move to cpu_synchronize_state() */
983         if (hvf_enabled()) {
984             hvf_cpu_synchronize_state(cpu);
985         }
986     }
987 }
988
989 void cpu_synchronize_all_post_reset(void)
990 {
991     CPUState *cpu;
992
993     CPU_FOREACH(cpu) {
994         cpu_synchronize_post_reset(cpu);
995         /* TODO: move to cpu_synchronize_post_reset() */
996         if (hvf_enabled()) {
997             hvf_cpu_synchronize_post_reset(cpu);
998         }
999     }
1000 }
1001
1002 void cpu_synchronize_all_post_init(void)
1003 {
1004     CPUState *cpu;
1005
1006     CPU_FOREACH(cpu) {
1007         cpu_synchronize_post_init(cpu);
1008         /* TODO: move to cpu_synchronize_post_init() */
1009         if (hvf_enabled()) {
1010             hvf_cpu_synchronize_post_init(cpu);
1011         }
1012     }
1013 }
1014
1015 void cpu_synchronize_all_pre_loadvm(void)
1016 {
1017     CPUState *cpu;
1018
1019     CPU_FOREACH(cpu) {
1020         cpu_synchronize_pre_loadvm(cpu);
1021     }
1022 }
1023
1024 static int do_vm_stop(RunState state, bool send_stop)
1025 {
1026     int ret = 0;
1027
1028     if (runstate_is_running()) {
1029         cpu_disable_ticks();
1030         pause_all_vcpus();
1031         runstate_set(state);
1032         vm_state_notify(0, state);
1033         if (send_stop) {
1034             qapi_event_send_stop();
1035         }
1036     }
1037
1038     bdrv_drain_all();
1039     ret = bdrv_flush_all();
1040
1041     return ret;
1042 }
1043
1044 /* Special vm_stop() variant for terminating the process.  Historically clients
1045  * did not expect a QMP STOP event and so we need to retain compatibility.
1046  */
1047 int vm_shutdown(void)
1048 {
1049     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1050 }
1051
1052 static bool cpu_can_run(CPUState *cpu)
1053 {
1054     if (cpu->stop) {
1055         return false;
1056     }
1057     if (cpu_is_stopped(cpu)) {
1058         return false;
1059     }
1060     return true;
1061 }
1062
1063 static void cpu_handle_guest_debug(CPUState *cpu)
1064 {
1065     gdb_set_stop_cpu(cpu);
1066     qemu_system_debug_request();
1067     cpu->stopped = true;
1068 }
1069
1070 #ifdef CONFIG_LINUX
1071 static void sigbus_reraise(void)
1072 {
1073     sigset_t set;
1074     struct sigaction action;
1075
1076     memset(&action, 0, sizeof(action));
1077     action.sa_handler = SIG_DFL;
1078     if (!sigaction(SIGBUS, &action, NULL)) {
1079         raise(SIGBUS);
1080         sigemptyset(&set);
1081         sigaddset(&set, SIGBUS);
1082         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1083     }
1084     perror("Failed to re-raise SIGBUS!\n");
1085     abort();
1086 }
1087
1088 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1089 {
1090     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1091         sigbus_reraise();
1092     }
1093
1094     if (current_cpu) {
1095         /* Called asynchronously in VCPU thread.  */
1096         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1097             sigbus_reraise();
1098         }
1099     } else {
1100         /* Called synchronously (via signalfd) in main thread.  */
1101         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1102             sigbus_reraise();
1103         }
1104     }
1105 }
1106
1107 static void qemu_init_sigbus(void)
1108 {
1109     struct sigaction action;
1110
1111     memset(&action, 0, sizeof(action));
1112     action.sa_flags = SA_SIGINFO;
1113     action.sa_sigaction = sigbus_handler;
1114     sigaction(SIGBUS, &action, NULL);
1115
1116     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1117 }
1118 #else /* !CONFIG_LINUX */
1119 static void qemu_init_sigbus(void)
1120 {
1121 }
1122 #endif /* !CONFIG_LINUX */
1123
1124 static QemuThread io_thread;
1125
1126 /* cpu creation */
1127 static QemuCond qemu_cpu_cond;
1128 /* system init */
1129 static QemuCond qemu_pause_cond;
1130
1131 void qemu_init_cpu_loop(void)
1132 {
1133     qemu_init_sigbus();
1134     qemu_cond_init(&qemu_cpu_cond);
1135     qemu_cond_init(&qemu_pause_cond);
1136     qemu_mutex_init(&qemu_global_mutex);
1137
1138     qemu_thread_get_self(&io_thread);
1139 }
1140
1141 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1142 {
1143     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1144 }
1145
1146 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1147 {
1148     if (kvm_destroy_vcpu(cpu) < 0) {
1149         error_report("kvm_destroy_vcpu failed");
1150         exit(EXIT_FAILURE);
1151     }
1152 }
1153
1154 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1155 {
1156 }
1157
1158 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1159 {
1160     g_assert(qemu_cpu_is_self(cpu));
1161     cpu->stop = false;
1162     cpu->stopped = true;
1163     if (exit) {
1164         cpu_exit(cpu);
1165     }
1166     qemu_cond_broadcast(&qemu_pause_cond);
1167 }
1168
1169 static void qemu_wait_io_event_common(CPUState *cpu)
1170 {
1171     atomic_mb_set(&cpu->thread_kicked, false);
1172     if (cpu->stop) {
1173         qemu_cpu_stop(cpu, false);
1174     }
1175     process_queued_cpu_work(cpu);
1176 }
1177
1178 static void qemu_tcg_rr_wait_io_event(void)
1179 {
1180     CPUState *cpu;
1181
1182     while (all_cpu_threads_idle()) {
1183         stop_tcg_kick_timer();
1184         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1185     }
1186
1187     start_tcg_kick_timer();
1188
1189     CPU_FOREACH(cpu) {
1190         qemu_wait_io_event_common(cpu);
1191     }
1192 }
1193
1194 static void qemu_wait_io_event(CPUState *cpu)
1195 {
1196     bool slept = false;
1197
1198     while (cpu_thread_is_idle(cpu)) {
1199         if (!slept) {
1200             slept = true;
1201             qemu_plugin_vcpu_idle_cb(cpu);
1202         }
1203         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1204     }
1205     if (slept) {
1206         qemu_plugin_vcpu_resume_cb(cpu);
1207     }
1208
1209 #ifdef _WIN32
1210     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1211     if (!tcg_enabled()) {
1212         SleepEx(0, TRUE);
1213     }
1214 #endif
1215     qemu_wait_io_event_common(cpu);
1216 }
1217
1218 static void *qemu_kvm_cpu_thread_fn(void *arg)
1219 {
1220     CPUState *cpu = arg;
1221     int r;
1222
1223     rcu_register_thread();
1224
1225     qemu_mutex_lock_iothread();
1226     qemu_thread_get_self(cpu->thread);
1227     cpu->thread_id = qemu_get_thread_id();
1228     cpu->can_do_io = 1;
1229     current_cpu = cpu;
1230
1231     r = kvm_init_vcpu(cpu);
1232     if (r < 0) {
1233         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1234         exit(1);
1235     }
1236
1237     kvm_init_cpu_signals(cpu);
1238
1239     /* signal CPU creation */
1240     cpu->created = true;
1241     qemu_cond_signal(&qemu_cpu_cond);
1242     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1243
1244     do {
1245         if (cpu_can_run(cpu)) {
1246             r = kvm_cpu_exec(cpu);
1247             if (r == EXCP_DEBUG) {
1248                 cpu_handle_guest_debug(cpu);
1249             }
1250         }
1251         qemu_wait_io_event(cpu);
1252     } while (!cpu->unplug || cpu_can_run(cpu));
1253
1254     qemu_kvm_destroy_vcpu(cpu);
1255     cpu->created = false;
1256     qemu_cond_signal(&qemu_cpu_cond);
1257     qemu_mutex_unlock_iothread();
1258     rcu_unregister_thread();
1259     return NULL;
1260 }
1261
1262 static void *qemu_dummy_cpu_thread_fn(void *arg)
1263 {
1264 #ifdef _WIN32
1265     error_report("qtest is not supported under Windows");
1266     exit(1);
1267 #else
1268     CPUState *cpu = arg;
1269     sigset_t waitset;
1270     int r;
1271
1272     rcu_register_thread();
1273
1274     qemu_mutex_lock_iothread();
1275     qemu_thread_get_self(cpu->thread);
1276     cpu->thread_id = qemu_get_thread_id();
1277     cpu->can_do_io = 1;
1278     current_cpu = cpu;
1279
1280     sigemptyset(&waitset);
1281     sigaddset(&waitset, SIG_IPI);
1282
1283     /* signal CPU creation */
1284     cpu->created = true;
1285     qemu_cond_signal(&qemu_cpu_cond);
1286     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1287
1288     do {
1289         qemu_mutex_unlock_iothread();
1290         do {
1291             int sig;
1292             r = sigwait(&waitset, &sig);
1293         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1294         if (r == -1) {
1295             perror("sigwait");
1296             exit(1);
1297         }
1298         qemu_mutex_lock_iothread();
1299         qemu_wait_io_event(cpu);
1300     } while (!cpu->unplug);
1301
1302     qemu_mutex_unlock_iothread();
1303     rcu_unregister_thread();
1304     return NULL;
1305 #endif
1306 }
1307
1308 static int64_t tcg_get_icount_limit(void)
1309 {
1310     int64_t deadline;
1311
1312     if (replay_mode != REPLAY_MODE_PLAY) {
1313         /*
1314          * Include all the timers, because they may need attention.
1315          * Overly long CPU execution may create unnecessary delay in the UI.
1316          */
1317         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1318                                               QEMU_TIMER_ATTR_ALL);
1319
1320         /* Maintain prior (possibly buggy) behaviour where if no deadline
1321          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1322          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1323          * nanoseconds.
1324          */
1325         if ((deadline < 0) || (deadline > INT32_MAX)) {
1326             deadline = INT32_MAX;
1327         }
1328
1329         return qemu_icount_round(deadline);
1330     } else {
1331         return replay_get_instructions();
1332     }
1333 }
1334
1335 static void handle_icount_deadline(void)
1336 {
1337     assert(qemu_in_vcpu_thread());
1338     if (use_icount) {
1339         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1340                                                       QEMU_TIMER_ATTR_ALL);
1341
1342         if (deadline == 0) {
1343             /* Wake up other AioContexts.  */
1344             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1345             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1346         }
1347     }
1348 }
1349
1350 static void prepare_icount_for_run(CPUState *cpu)
1351 {
1352     if (use_icount) {
1353         int insns_left;
1354
1355         /* These should always be cleared by process_icount_data after
1356          * each vCPU execution. However, u16.high can be raised
1357          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1358          */
1359         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1360         g_assert(cpu->icount_extra == 0);
1361
1362         cpu->icount_budget = tcg_get_icount_limit();
1363         insns_left = MIN(0xffff, cpu->icount_budget);
1364         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1365         cpu->icount_extra = cpu->icount_budget - insns_left;
1366
1367         replay_mutex_lock();
1368     }
1369 }
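
/*
 * Example of the budget split above: with an icount budget of 100000
 * instructions, the 16-bit decrementer gets MIN(0xffff, 100000) = 65535
 * instructions and icount_extra holds the remaining 34465; the decrementer
 * is refilled from the remaining budget once it runs out.
 */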
1370
1371 static void process_icount_data(CPUState *cpu)
1372 {
1373     if (use_icount) {
1374         /* Account for executed instructions */
1375         cpu_update_icount(cpu);
1376
1377         /* Reset the counters */
1378         cpu_neg(cpu)->icount_decr.u16.low = 0;
1379         cpu->icount_extra = 0;
1380         cpu->icount_budget = 0;
1381
1382         replay_account_executed_instructions();
1383
1384         replay_mutex_unlock();
1385     }
1386 }
1387
1388
1389 static int tcg_cpu_exec(CPUState *cpu)
1390 {
1391     int ret;
1392 #ifdef CONFIG_PROFILER
1393     int64_t ti;
1394 #endif
1395
1396     assert(tcg_enabled());
1397 #ifdef CONFIG_PROFILER
1398     ti = profile_getclock();
1399 #endif
1400     cpu_exec_start(cpu);
1401     ret = cpu_exec(cpu);
1402     cpu_exec_end(cpu);
1403 #ifdef CONFIG_PROFILER
1404     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1405                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1406 #endif
1407     return ret;
1408 }
1409
1410 /* Destroy any remaining vCPUs which have been unplugged and have
1411  * finished running
1412  */
1413 static void deal_with_unplugged_cpus(void)
1414 {
1415     CPUState *cpu;
1416
1417     CPU_FOREACH(cpu) {
1418         if (cpu->unplug && !cpu_can_run(cpu)) {
1419             qemu_tcg_destroy_vcpu(cpu);
1420             cpu->created = false;
1421             qemu_cond_signal(&qemu_cpu_cond);
1422             break;
1423         }
1424     }
1425 }
1426
1427 /* Single-threaded TCG
1428  *
1429  * In the single-threaded case each vCPU is simulated in turn. If
1430  * there is more than a single vCPU we create a simple timer to kick
1431  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1432  * This is done explicitly rather than relying on side-effects
1433  * elsewhere.
1434  */
1435
1436 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1437 {
1438     CPUState *cpu = arg;
1439
1440     assert(tcg_enabled());
1441     rcu_register_thread();
1442     tcg_register_thread();
1443
1444     qemu_mutex_lock_iothread();
1445     qemu_thread_get_self(cpu->thread);
1446
1447     cpu->thread_id = qemu_get_thread_id();
1448     cpu->created = true;
1449     cpu->can_do_io = 1;
1450     qemu_cond_signal(&qemu_cpu_cond);
1451     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1452
1453     /* wait for initial kick-off after machine start */
1454     while (first_cpu->stopped) {
1455         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1456
1457         /* process any pending work */
1458         CPU_FOREACH(cpu) {
1459             current_cpu = cpu;
1460             qemu_wait_io_event_common(cpu);
1461         }
1462     }
1463
1464     start_tcg_kick_timer();
1465
1466     cpu = first_cpu;
1467
1468     /* process any pending work */
1469     cpu->exit_request = 1;
1470
1471     while (1) {
1472         qemu_mutex_unlock_iothread();
1473         replay_mutex_lock();
1474         qemu_mutex_lock_iothread();
1475         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1476         qemu_account_warp_timer();
1477
1478         /* Run the timers here.  This is much more efficient than
1479          * waking up the I/O thread and waiting for completion.
1480          */
1481         handle_icount_deadline();
1482
1483         replay_mutex_unlock();
1484
1485         if (!cpu) {
1486             cpu = first_cpu;
1487         }
1488
1489         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1490
1491             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1492             current_cpu = cpu;
1493
1494             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1495                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1496
1497             if (cpu_can_run(cpu)) {
1498                 int r;
1499
1500                 qemu_mutex_unlock_iothread();
1501                 prepare_icount_for_run(cpu);
1502
1503                 r = tcg_cpu_exec(cpu);
1504
1505                 process_icount_data(cpu);
1506                 qemu_mutex_lock_iothread();
1507
1508                 if (r == EXCP_DEBUG) {
1509                     cpu_handle_guest_debug(cpu);
1510                     break;
1511                 } else if (r == EXCP_ATOMIC) {
1512                     qemu_mutex_unlock_iothread();
1513                     cpu_exec_step_atomic(cpu);
1514                     qemu_mutex_lock_iothread();
1515                     break;
1516                 }
1517             } else if (cpu->stop) {
1518                 if (cpu->unplug) {
1519                     cpu = CPU_NEXT(cpu);
1520                 }
1521                 break;
1522             }
1523
1524             cpu = CPU_NEXT(cpu);
1525         } /* while (cpu && !cpu->exit_request).. */
1526
1527         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1528         atomic_set(&tcg_current_rr_cpu, NULL);
1529
1530         if (cpu && cpu->exit_request) {
1531             atomic_mb_set(&cpu->exit_request, 0);
1532         }
1533
1534         if (use_icount && all_cpu_threads_idle()) {
1535             /*
1536              * When all cpus are sleeping (e.g. in WFI), to avoid a deadlock
1537              * in the main_loop, wake it up in order to start the warp timer.
1538              */
1539             qemu_notify_event();
1540         }
1541
1542         qemu_tcg_rr_wait_io_event();
1543         deal_with_unplugged_cpus();
1544     }
1545
1546     rcu_unregister_thread();
1547     return NULL;
1548 }
1549
1550 static void *qemu_hax_cpu_thread_fn(void *arg)
1551 {
1552     CPUState *cpu = arg;
1553     int r;
1554
1555     rcu_register_thread();
1556     qemu_mutex_lock_iothread();
1557     qemu_thread_get_self(cpu->thread);
1558
1559     cpu->thread_id = qemu_get_thread_id();
1560     cpu->created = true;
1561     current_cpu = cpu;
1562
1563     hax_init_vcpu(cpu);
1564     qemu_cond_signal(&qemu_cpu_cond);
1565     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1566
1567     do {
1568         if (cpu_can_run(cpu)) {
1569             r = hax_smp_cpu_exec(cpu);
1570             if (r == EXCP_DEBUG) {
1571                 cpu_handle_guest_debug(cpu);
1572             }
1573         }
1574
1575         qemu_wait_io_event(cpu);
1576     } while (!cpu->unplug || cpu_can_run(cpu));
1577     rcu_unregister_thread();
1578     return NULL;
1579 }
1580
1581 /* The HVF-specific vCPU thread function. This one should only run when the host
1582  * CPU supports the VMX "unrestricted guest" feature. */
1583 static void *qemu_hvf_cpu_thread_fn(void *arg)
1584 {
1585     CPUState *cpu = arg;
1586
1587     int r;
1588
1589     assert(hvf_enabled());
1590
1591     rcu_register_thread();
1592
1593     qemu_mutex_lock_iothread();
1594     qemu_thread_get_self(cpu->thread);
1595
1596     cpu->thread_id = qemu_get_thread_id();
1597     cpu->can_do_io = 1;
1598     current_cpu = cpu;
1599
1600     hvf_init_vcpu(cpu);
1601
1602     /* signal CPU creation */
1603     cpu->created = true;
1604     qemu_cond_signal(&qemu_cpu_cond);
1605     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1606
1607     do {
1608         if (cpu_can_run(cpu)) {
1609             r = hvf_vcpu_exec(cpu);
1610             if (r == EXCP_DEBUG) {
1611                 cpu_handle_guest_debug(cpu);
1612             }
1613         }
1614         qemu_wait_io_event(cpu);
1615     } while (!cpu->unplug || cpu_can_run(cpu));
1616
1617     hvf_vcpu_destroy(cpu);
1618     cpu->created = false;
1619     qemu_cond_signal(&qemu_cpu_cond);
1620     qemu_mutex_unlock_iothread();
1621     rcu_unregister_thread();
1622     return NULL;
1623 }
1624
1625 static void *qemu_whpx_cpu_thread_fn(void *arg)
1626 {
1627     CPUState *cpu = arg;
1628     int r;
1629
1630     rcu_register_thread();
1631
1632     qemu_mutex_lock_iothread();
1633     qemu_thread_get_self(cpu->thread);
1634     cpu->thread_id = qemu_get_thread_id();
1635     current_cpu = cpu;
1636
1637     r = whpx_init_vcpu(cpu);
1638     if (r < 0) {
1639         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1640         exit(1);
1641     }
1642
1643     /* signal CPU creation */
1644     cpu->created = true;
1645     qemu_cond_signal(&qemu_cpu_cond);
1646     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1647
1648     do {
1649         if (cpu_can_run(cpu)) {
1650             r = whpx_vcpu_exec(cpu);
1651             if (r == EXCP_DEBUG) {
1652                 cpu_handle_guest_debug(cpu);
1653             }
1654         }
1655         while (cpu_thread_is_idle(cpu)) {
1656             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1657         }
1658         qemu_wait_io_event_common(cpu);
1659     } while (!cpu->unplug || cpu_can_run(cpu));
1660
1661     whpx_destroy_vcpu(cpu);
1662     cpu->created = false;
1663     qemu_cond_signal(&qemu_cpu_cond);
1664     qemu_mutex_unlock_iothread();
1665     rcu_unregister_thread();
1666     return NULL;
1667 }
1668
1669 #ifdef _WIN32
1670 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1671 {
1672 }
1673 #endif
1674
1675 /* Multi-threaded TCG
1676  *
1677  * In the multi-threaded case each vCPU has its own thread. The TLS
1678  * variable current_cpu can be used deep in the code to find the
1679  * current CPUState for a given thread.
1680  */
1681
1682 static void *qemu_tcg_cpu_thread_fn(void *arg)
1683 {
1684     CPUState *cpu = arg;
1685
1686     assert(tcg_enabled());
1687     g_assert(!use_icount);
1688
1689     rcu_register_thread();
1690     tcg_register_thread();
1691
1692     qemu_mutex_lock_iothread();
1693     qemu_thread_get_self(cpu->thread);
1694
1695     cpu->thread_id = qemu_get_thread_id();
1696     cpu->created = true;
1697     cpu->can_do_io = 1;
1698     current_cpu = cpu;
1699     qemu_cond_signal(&qemu_cpu_cond);
1700     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1701
1702     /* process any pending work */
1703     cpu->exit_request = 1;
1704
1705     do {
1706         if (cpu_can_run(cpu)) {
1707             int r;
1708             qemu_mutex_unlock_iothread();
1709             r = tcg_cpu_exec(cpu);
1710             qemu_mutex_lock_iothread();
1711             switch (r) {
1712             case EXCP_DEBUG:
1713                 cpu_handle_guest_debug(cpu);
1714                 break;
1715             case EXCP_HALTED:
1716                 /* During start-up the vCPU is reset and the thread is
1717                  * kicked several times. If we don't ensure we go back
1718                  * to sleep in the halted state we won't cleanly
1719                  * start up when the vCPU is enabled.
1720                  *
1721                  * cpu->halted should ensure we sleep in wait_io_event
1722                  */
1723                 g_assert(cpu->halted);
1724                 break;
1725             case EXCP_ATOMIC:
1726                 qemu_mutex_unlock_iothread();
1727                 cpu_exec_step_atomic(cpu);
1728                 qemu_mutex_lock_iothread();
1729             default:
1730                 /* Ignore everything else? */
1731                 break;
1732             }
1733         }
1734
1735         atomic_mb_set(&cpu->exit_request, 0);
1736         qemu_wait_io_event(cpu);
1737     } while (!cpu->unplug || cpu_can_run(cpu));
1738
1739     qemu_tcg_destroy_vcpu(cpu);
1740     cpu->created = false;
1741     qemu_cond_signal(&qemu_cpu_cond);
1742     qemu_mutex_unlock_iothread();
1743     rcu_unregister_thread();
1744     return NULL;
1745 }
1746
1747 static void qemu_cpu_kick_thread(CPUState *cpu)
1748 {
1749 #ifndef _WIN32
1750     int err;
1751
1752     if (cpu->thread_kicked) {
1753         return;
1754     }
1755     cpu->thread_kicked = true;
1756     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1757     if (err && err != ESRCH) {
1758         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1759         exit(1);
1760     }
1761 #else /* _WIN32 */
1762     if (!qemu_cpu_is_self(cpu)) {
1763         if (whpx_enabled()) {
1764             whpx_vcpu_kick(cpu);
1765         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1766             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1767                     __func__, GetLastError());
1768             exit(1);
1769         }
1770     }
1771 #endif
1772 }
1773
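/*
 * Kick a vCPU out of its current wait or execution state.  The halt
 * condition is always broadcast; beyond that, MTTCG vCPUs are asked to exit
 * the TCG execution loop, round-robin TCG uses the shared kick mechanism,
 * and hardware accelerators have their host thread signalled directly.
 */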
1774 void qemu_cpu_kick(CPUState *cpu)
1775 {
1776     qemu_cond_broadcast(cpu->halt_cond);
1777     if (tcg_enabled()) {
1778         if (qemu_tcg_mttcg_enabled()) {
1779             cpu_exit(cpu);
1780         } else {
1781             qemu_cpu_kick_rr_cpus();
1782         }
1783     } else {
1784         if (hax_enabled()) {
1785             /*
1786              * FIXME: race condition with the exit_request check in
1787              * hax_vcpu_hax_exec
1788              */
1789             cpu->exit_request = 1;
1790         }
1791         qemu_cpu_kick_thread(cpu);
1792     }
1793 }
1794
1795 void qemu_cpu_kick_self(void)
1796 {
1797     assert(current_cpu);
1798     qemu_cpu_kick_thread(current_cpu);
1799 }
1800
1801 bool qemu_cpu_is_self(CPUState *cpu)
1802 {
1803     return qemu_thread_is_self(cpu->thread);
1804 }
1805
1806 bool qemu_in_vcpu_thread(void)
1807 {
1808     return current_cpu && qemu_cpu_is_self(current_cpu);
1809 }
1810
1811 static __thread bool iothread_locked = false;
1812
1813 bool qemu_mutex_iothread_locked(void)
1814 {
1815     return iothread_locked;
1816 }
1817
1818 /*
1819  * The BQL is taken from so many places that it is worth profiling the
1820  * callers directly, instead of funneling them all through a single function.
1821  */
1822 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1823 {
1824     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1825
1826     g_assert(!qemu_mutex_iothread_locked());
1827     bql_lock(&qemu_global_mutex, file, line);
1828     iothread_locked = true;
1829 }
1830
1831 void qemu_mutex_unlock_iothread(void)
1832 {
1833     g_assert(qemu_mutex_iothread_locked());
1834     iothread_locked = false;
1835     qemu_mutex_unlock(&qemu_global_mutex);
1836 }
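
/*
 * Typical usage from a thread that does not already hold the BQL (a sketch;
 * callers normally go through the qemu_mutex_lock_iothread() macro, which
 * supplies __FILE__ and __LINE__ to the _impl variant for the profiling
 * mentioned above):
 *
 *     qemu_mutex_lock_iothread();
 *     ... touch device or CPU state protected by the BQL ...
 *     qemu_mutex_unlock_iothread();
 */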
1837
1838 static bool all_vcpus_paused(void)
1839 {
1840     CPUState *cpu;
1841
1842     CPU_FOREACH(cpu) {
1843         if (!cpu->stopped) {
1844             return false;
1845         }
1846     }
1847
1848     return true;
1849 }
1850
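/*
 * Stop every vCPU and wait until all of them report themselves stopped.
 * Expected to be called with the BQL (and, when record/replay is in use,
 * the replay mutex) held; the BQL is dropped implicitly while waiting on
 * qemu_pause_cond, and the replay mutex is dropped explicitly so woken
 * vCPU threads can finish their replay work.
 */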
1851 void pause_all_vcpus(void)
1852 {
1853     CPUState *cpu;
1854
1855     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1856     CPU_FOREACH(cpu) {
1857         if (qemu_cpu_is_self(cpu)) {
1858             qemu_cpu_stop(cpu, true);
1859         } else {
1860             cpu->stop = true;
1861             qemu_cpu_kick(cpu);
1862         }
1863     }
1864
1865     /* We need to drop the replay_lock so any vCPU threads woken up
1866      * can finish their replay tasks
1867      */
1868     replay_mutex_unlock();
1869
1870     while (!all_vcpus_paused()) {
1871         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1872         CPU_FOREACH(cpu) {
1873             qemu_cpu_kick(cpu);
1874         }
1875     }
1876
1877     qemu_mutex_unlock_iothread();
1878     replay_mutex_lock();
1879     qemu_mutex_lock_iothread();
1880 }
1881
1882 void cpu_resume(CPUState *cpu)
1883 {
1884     cpu->stop = false;
1885     cpu->stopped = false;
1886     qemu_cpu_kick(cpu);
1887 }
1888
1889 void resume_all_vcpus(void)
1890 {
1891     CPUState *cpu;
1892
1893     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1894     CPU_FOREACH(cpu) {
1895         cpu_resume(cpu);
1896     }
1897 }
1898
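/*
 * Request unplug of a vCPU and wait for its thread to exit.  The BQL is
 * dropped around the join so the departing vCPU thread can take it while
 * it shuts down.
 */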
1899 void cpu_remove_sync(CPUState *cpu)
1900 {
1901     cpu->stop = true;
1902     cpu->unplug = true;
1903     qemu_cpu_kick(cpu);
1904     qemu_mutex_unlock_iothread();
1905     qemu_thread_join(cpu->thread);
1906     qemu_mutex_lock_iothread();
1907 }
1908
1909 /* Size of the temporary buffers used to build vCPU thread names */
1910 #define VCPU_THREAD_NAME_SIZE 16
1911
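/*
 * TCG vCPU thread setup: with MTTCG every vCPU gets its own thread, while
 * in round-robin mode only the first vCPU creates the single shared thread
 * and subsequent vCPUs simply attach to it.
 */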
1912 static void qemu_tcg_init_vcpu(CPUState *cpu)
1913 {
1914     char thread_name[VCPU_THREAD_NAME_SIZE];
1915     static QemuCond *single_tcg_halt_cond;
1916     static QemuThread *single_tcg_cpu_thread;
1917     static int tcg_region_inited;
1918
1919     assert(tcg_enabled());
1920     /*
1921      * Initialize TCG regions--once. Now is a good time, because:
1922      * (1) TCG's init context, prologue and target globals have been set up.
1923      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1924      *     -accel flag is processed, so the check doesn't work then).
1925      */
1926     if (!tcg_region_inited) {
1927         tcg_region_inited = 1;
1928         tcg_region_init();
1929     }
1930
1931     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1932         cpu->thread = g_malloc0(sizeof(QemuThread));
1933         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1934         qemu_cond_init(cpu->halt_cond);
1935
1936         if (qemu_tcg_mttcg_enabled()) {
1937             /* create a thread per vCPU with TCG (MTTCG) */
1938             parallel_cpus = true;
1939             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1940                      cpu->cpu_index);
1941
1942             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1943                                cpu, QEMU_THREAD_JOINABLE);
1944
1945         } else {
1946             /* share a single thread for all cpus with TCG */
1947             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1948             qemu_thread_create(cpu->thread, thread_name,
1949                                qemu_tcg_rr_cpu_thread_fn,
1950                                cpu, QEMU_THREAD_JOINABLE);
1951
1952             single_tcg_halt_cond = cpu->halt_cond;
1953             single_tcg_cpu_thread = cpu->thread;
1954         }
1955 #ifdef _WIN32
1956         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1957 #endif
1958     } else {
1959         /* For non-MTTCG cases we share the thread */
1960         cpu->thread = single_tcg_cpu_thread;
1961         cpu->halt_cond = single_tcg_halt_cond;
1962         cpu->thread_id = first_cpu->thread_id;
1963         cpu->can_do_io = 1;
1964         cpu->created = true;
1965     }
1966 }
1967
1968 static void qemu_hax_start_vcpu(CPUState *cpu)
1969 {
1970     char thread_name[VCPU_THREAD_NAME_SIZE];
1971
1972     cpu->thread = g_malloc0(sizeof(QemuThread));
1973     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1974     qemu_cond_init(cpu->halt_cond);
1975
1976     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1977              cpu->cpu_index);
1978     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1979                        cpu, QEMU_THREAD_JOINABLE);
1980 #ifdef _WIN32
1981     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1982 #endif
1983 }
1984
1985 static void qemu_kvm_start_vcpu(CPUState *cpu)
1986 {
1987     char thread_name[VCPU_THREAD_NAME_SIZE];
1988
1989     cpu->thread = g_malloc0(sizeof(QemuThread));
1990     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1991     qemu_cond_init(cpu->halt_cond);
1992     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1993              cpu->cpu_index);
1994     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1995                        cpu, QEMU_THREAD_JOINABLE);
1996 }
1997
1998 static void qemu_hvf_start_vcpu(CPUState *cpu)
1999 {
2000     char thread_name[VCPU_THREAD_NAME_SIZE];
2001
2002     /* HVF currently does not support TCG, and only runs in
2003      * unrestricted-guest mode. */
2004     assert(hvf_enabled());
2005
2006     cpu->thread = g_malloc0(sizeof(QemuThread));
2007     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2008     qemu_cond_init(cpu->halt_cond);
2009
2010     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2011              cpu->cpu_index);
2012     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2013                        cpu, QEMU_THREAD_JOINABLE);
2014 }
2015
2016 static void qemu_whpx_start_vcpu(CPUState *cpu)
2017 {
2018     char thread_name[VCPU_THREAD_NAME_SIZE];
2019
2020     cpu->thread = g_malloc0(sizeof(QemuThread));
2021     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2022     qemu_cond_init(cpu->halt_cond);
2023     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2024              cpu->cpu_index);
2025     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2026                        cpu, QEMU_THREAD_JOINABLE);
2027 #ifdef _WIN32
2028     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2029 #endif
2030 }
2031
2032 static void qemu_dummy_start_vcpu(CPUState *cpu)
2033 {
2034     char thread_name[VCPU_THREAD_NAME_SIZE];
2035
2036     cpu->thread = g_malloc0(sizeof(QemuThread));
2037     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2038     qemu_cond_init(cpu->halt_cond);
2039     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2040              cpu->cpu_index);
2041     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2042                        QEMU_THREAD_JOINABLE);
2043 }
2044
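/*
 * Create (or attach to) the host thread that will run this vCPU, according
 * to the configured accelerator, and wait on qemu_cpu_cond (which drops the
 * BQL) until the new thread has signalled that it is fully created.
 */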
2045 void qemu_init_vcpu(CPUState *cpu)
2046 {
2047     MachineState *ms = MACHINE(qdev_get_machine());
2048
2049     cpu->nr_cores = ms->smp.cores;
2050     cpu->nr_threads = ms->smp.threads;
2051     cpu->stopped = true;
2052     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2053
2054     if (!cpu->as) {
2055         /* If the target cpu hasn't set up any address spaces itself,
2056          * give it the default one.
2057          */
2058         cpu->num_ases = 1;
2059         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2060     }
2061
2062     if (kvm_enabled()) {
2063         qemu_kvm_start_vcpu(cpu);
2064     } else if (hax_enabled()) {
2065         qemu_hax_start_vcpu(cpu);
2066     } else if (hvf_enabled()) {
2067         qemu_hvf_start_vcpu(cpu);
2068     } else if (tcg_enabled()) {
2069         qemu_tcg_init_vcpu(cpu);
2070     } else if (whpx_enabled()) {
2071         qemu_whpx_start_vcpu(cpu);
2072     } else {
2073         qemu_dummy_start_vcpu(cpu);
2074     }
2075
2076     while (!cpu->created) {
2077         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2078     }
2079 }
2080
2081 void cpu_stop_current(void)
2082 {
2083     if (current_cpu) {
2084         current_cpu->stop = true;
2085         cpu_exit(current_cpu);
2086     }
2087 }
2088
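/*
 * Stop the VM and transition to the given run state.  From a vCPU thread
 * the stop is only requested here and completed later by the main loop;
 * from other threads the vCPUs are stopped synchronously via do_vm_stop().
 */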
2089 int vm_stop(RunState state)
2090 {
2091     if (qemu_in_vcpu_thread()) {
2092         qemu_system_vmstop_request_prepare();
2093         qemu_system_vmstop_request(state);
2094         /*
2095          * FIXME: should not return to device code in case
2096          * vm_stop() has been requested.
2097          */
2098         cpu_stop_current();
2099         return 0;
2100     }
2101
2102     return do_vm_stop(state, true);
2103 }
2104
2105 /**
2106  * Prepare for (re)starting the VM.
2107  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2108  * running or in case of an error condition), 0 otherwise.
2109  */
2110 int vm_prepare_start(void)
2111 {
2112     RunState requested;
2113
2114     qemu_vmstop_requested(&requested);
2115     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2116         return -1;
2117     }
2118
2119     /* Ensure that a STOP/RESUME pair of events is emitted if a
2120      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2121      * example, is documented to always be followed by the STOP
2122      * event.
2123      */
2124     if (runstate_is_running()) {
2125         qapi_event_send_stop();
2126         qapi_event_send_resume();
2127         return -1;
2128     }
2129
2130     /* We are sending this now, but the CPUs will be resumed shortly */
2131     qapi_event_send_resume();
2132
2133     cpu_enable_ticks();
2134     runstate_set(RUN_STATE_RUNNING);
2135     vm_state_notify(1, RUN_STATE_RUNNING);
2136     return 0;
2137 }
2138
2139 void vm_start(void)
2140 {
2141     if (!vm_prepare_start()) {
2142         resume_all_vcpus();
2143     }
2144 }
2145
2146 /* Does a state transition even if the VM is already stopped;
2147    the current state is forgotten forever */
2148 int vm_stop_force_state(RunState state)
2149 {
2150     if (runstate_is_running()) {
2151         return vm_stop(state);
2152     } else {
2153         runstate_set(state);
2154
2155         bdrv_drain_all();
2156         /* Make sure to return an error if the flush in a previous vm_stop()
2157          * failed. */
2158         return bdrv_flush_all();
2159     }
2160 }
2161
2162 void list_cpus(const char *optarg)
2163 {
2164     /* XXX: implement xxx_cpu_list for targets that still lack it */
2165 #if defined(cpu_list)
2166     cpu_list();
2167 #endif
2168 }
2169
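/*
 * QMP 'memsave': write 'size' bytes of the selected vCPU's virtual address
 * space, starting at 'addr', to 'filename' in 1 KiB chunks.  Defaults to
 * CPU 0 when no cpu-index is given.
 */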
2170 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2171                  bool has_cpu, int64_t cpu_index, Error **errp)
2172 {
2173     FILE *f;
2174     uint32_t l;
2175     CPUState *cpu;
2176     uint8_t buf[1024];
2177     int64_t orig_addr = addr, orig_size = size;
2178
2179     if (!has_cpu) {
2180         cpu_index = 0;
2181     }
2182
2183     cpu = qemu_get_cpu(cpu_index);
2184     if (cpu == NULL) {
2185         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2186                    "a CPU number");
2187         return;
2188     }
2189
2190     f = fopen(filename, "wb");
2191     if (!f) {
2192         error_setg_file_open(errp, errno, filename);
2193         return;
2194     }
2195
2196     while (size != 0) {
2197         l = sizeof(buf);
2198         if (l > size)
2199             l = size;
2200         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2201             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2202                              " specified", orig_addr, orig_size);
2203             goto exit;
2204         }
2205         if (fwrite(buf, 1, l, f) != l) {
2206             error_setg(errp, QERR_IO_ERROR);
2207             goto exit;
2208         }
2209         addr += l;
2210         size -= l;
2211     }
2212
2213 exit:
2214     fclose(f);
2215 }
2216
2217 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2218                   Error **errp)
2219 {
2220     FILE *f;
2221     uint32_t l;
2222     uint8_t buf[1024];
2223
2224     f = fopen(filename, "wb");
2225     if (!f) {
2226         error_setg_file_open(errp, errno, filename);
2227         return;
2228     }
2229
2230     while (size != 0) {
2231         l = sizeof(buf);
2232         if (l > size)
2233             l = size;
2234         cpu_physical_memory_read(addr, buf, l);
2235         if (fwrite(buf, 1, l, f) != l) {
2236             error_setg(errp, QERR_IO_ERROR);
2237             goto exit;
2238         }
2239         addr += l;
2240         size -= l;
2241     }
2242
2243 exit:
2244     fclose(f);
2245 }
2246
2247 void qmp_inject_nmi(Error **errp)
2248 {
2249     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2250 }
2251
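/*
 * Print host/guest clock drift statistics.  Only meaningful when icount is
 * in use; the per-direction limits are reported only if icount alignment
 * (icount_align_option) was enabled.
 */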
2252 void dump_drift_info(void)
2253 {
2254     if (!use_icount) {
2255         return;
2256     }
2257
2258     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2259                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2260     if (icount_align_option) {
2261         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2262                     -max_delay / SCALE_MS);
2263         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2264                     max_advance / SCALE_MS);
2265     } else {
2266         qemu_printf("Max guest delay     NA\n");
2267         qemu_printf("Max guest advance   NA\n");
2268     }
2269 }