[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53
54 #ifdef CONFIG_LINUX
55
56 #include <sys/prctl.h>
57
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
61
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
65
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
69
70 #endif /* CONFIG_LINUX */
71
72 int64_t max_delay;
73 int64_t max_advance;
74
75 /* vcpu throttling controls */
76 static QEMUTimer *throttle_timer;
77 static unsigned int throttle_percentage;
78
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
82
83 bool cpu_is_stopped(CPUState *cpu)
84 {
85     return cpu->stopped || !runstate_is_running();
86 }
87
88 static bool cpu_thread_is_idle(CPUState *cpu)
89 {
90     if (cpu->stop || cpu->queued_work_first) {
91         return false;
92     }
93     if (cpu_is_stopped(cpu)) {
94         return true;
95     }
96     if (!cpu->halted || cpu_has_work(cpu) ||
97         kvm_halt_in_kernel()) {
98         return false;
99     }
100     return true;
101 }
102
103 static bool all_cpu_threads_idle(void)
104 {
105     CPUState *cpu;
106
107     CPU_FOREACH(cpu) {
108         if (!cpu_thread_is_idle(cpu)) {
109             return false;
110         }
111     }
112     return true;
113 }
114
115 /***********************************************************/
116 /* guest cycle counter */
117
118 /* Protected by TimersState seqlock */
119
120 static bool icount_sleep = true;
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks.  */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
125 #define MAX_ICOUNT_SHIFT 10
126
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
130
131 typedef struct TimersState {
132     /* Protected by BQL.  */
133     int64_t cpu_ticks_prev;
134     int64_t cpu_ticks_offset;
135
136     /* cpu_clock_offset can be read out of BQL, so protect it with
137      * this lock.
138      */
139     QemuSeqLock vm_clock_seqlock;
140     int64_t cpu_clock_offset;
141     int32_t cpu_ticks_enabled;
142     int64_t dummy;
143
144     /* Compensate for varying guest execution speed.  */
145     int64_t qemu_icount_bias;
146     /* Only written by TCG thread */
147     int64_t qemu_icount;
148 } TimersState;
149
150 static TimersState timers_state;
151 bool mttcg_enabled;
152
153 /*
154  * We default to false if we know other options have been enabled
155  * which are currently incompatible with MTTCG. Otherwise, once a
156  * guest (target) has been updated to support:
157  *   - atomic instructions
158  *   - memory ordering primitives (barriers)
159  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
160  *
161  * Once a guest architecture has been converted to the new primitives
162  * there are two remaining limitations to check.
163  *
164  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165  * - The host must have a stronger memory order than the guest
166  *
167  * It may be possible in future to support strong guests on weak hosts
168  * but that will require tagging all load/stores in a guest with their
169  * implicit memory order requirements which would likely slow things
170  * down a lot.
171  */
172
173 static bool check_tcg_memory_orders_compatible(void)
174 {
175 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
176     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
177 #else
178     return false;
179 #endif
180 }
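/* Illustrative note, not from the original source: the check above
 * treats TCG_GUEST_DEFAULT_MO and TCG_TARGET_DEFAULT_MO as bitmasks of
 * memory-ordering guarantees (TCG_MO_LD_LD, TCG_MO_ST_ST, ...) and
 * accepts the pairing only if the guest's required bits are a subset
 * of what the host provides:
 *
 *   guest needs LD_LD|ST_ST, host provides TCG_MO_ALL   -> compatible
 *   guest needs LD_LD|ST_ST, host provides only ST_ST   -> LD_LD
 *     survives the & ~mask, so the function returns false
 *
 * This is why a strongly-ordered guest such as x86 cannot default to
 * MTTCG on a weakly-ordered host such as ARM.
 */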
181
182 static bool default_mttcg_enabled(void)
183 {
184     QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
185     const char *rr = qemu_opt_get(icount_opts, "rr");
186
187     if (rr || TCG_OVERSIZED_GUEST) {
188         return false;
189     } else {
190 #ifdef TARGET_SUPPORTS_MTTCG
191         return check_tcg_memory_orders_compatible();
192 #else
193         return false;
194 #endif
195     }
196 }
197
198 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
199 {
200     const char *t = qemu_opt_get(opts, "thread");
201     if (t) {
202         if (strcmp(t, "multi") == 0) {
203             if (TCG_OVERSIZED_GUEST) {
204                 error_setg(errp, "No MTTCG when guest word size > host's");
205             } else {
206                 if (!check_tcg_memory_orders_compatible()) {
207                     error_report("Guest expects a stronger memory ordering "
208                                  "than the host provides");
209                     error_printf("This may cause strange/hard to debug errors\n");
210                 }
211                 mttcg_enabled = true;
212             }
213         } else if (strcmp(t, "single") == 0) {
214             mttcg_enabled = false;
215         } else {
216             error_setg(errp, "Invalid 'thread' setting %s", t);
217         }
218     } else {
219         mttcg_enabled = default_mttcg_enabled();
220     }
221 }
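/* Usage sketch, not from the original source: the "thread" property
 * parsed above is normally supplied via the accelerator option, e.g.
 *
 *   qemu-system-arm -accel tcg,thread=multi ...    # request MTTCG
 *   qemu-system-arm -accel tcg,thread=single ...   # force round-robin
 *
 * Without an explicit setting, default_mttcg_enabled() decides based
 * on record/replay, guest word size and memory-order compatibility.
 */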
222
223 int64_t cpu_get_icount_raw(void)
224 {
225     int64_t icount;
226     CPUState *cpu = current_cpu;
227
228     icount = timers_state.qemu_icount;
229     if (cpu) {
230         if (!cpu->can_do_io) {
231             fprintf(stderr, "Bad icount read\n");
232             exit(1);
233         }
234         icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
235     }
236     return icount;
237 }
238
239 /* Return the virtual CPU time, based on the instruction counter.  */
240 static int64_t cpu_get_icount_locked(void)
241 {
242     int64_t icount = cpu_get_icount_raw();
243     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
244 }
245
246 int64_t cpu_get_icount(void)
247 {
248     int64_t icount;
249     unsigned start;
250
251     do {
252         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
253         icount = cpu_get_icount_locked();
254     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
255
256     return icount;
257 }
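/* Reader-side pattern, illustrative note: seqlock_read_begin() samples
 * the sequence counter and seqlock_read_retry() detects whether a
 * writer ran in between, in which case the loop simply re-reads.
 * Writers are already serialized by the BQL, so only readers running
 * outside the BQL need this retry loop.
 */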
258
259 int64_t cpu_icount_to_ns(int64_t icount)
260 {
261     return icount << icount_time_shift;
262 }
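/* Worked example, illustrative only: with icount_time_shift == 3 each
 * emulated instruction accounts for 1 << 3 == 8 ns of virtual time,
 * i.e. a nominal 125 MIPS guest. A shift of 0 corresponds to 1000
 * MIPS, and MAX_ICOUNT_SHIFT (10) to 1024 ns per instruction, roughly
 * the 1 MIPS floor mentioned above.
 */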
263
264 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
265  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
266  * counter.
267  *
268  * Caller must hold the BQL
269  */
270 int64_t cpu_get_ticks(void)
271 {
272     int64_t ticks;
273
274     if (use_icount) {
275         return cpu_get_icount();
276     }
277
278     ticks = timers_state.cpu_ticks_offset;
279     if (timers_state.cpu_ticks_enabled) {
280         ticks += cpu_get_host_ticks();
281     }
282
283     if (timers_state.cpu_ticks_prev > ticks) {
284         /* Note: non-increasing ticks may happen if the host uses
285            software suspend */
286         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
287         ticks = timers_state.cpu_ticks_prev;
288     }
289
290     timers_state.cpu_ticks_prev = ticks;
291     return ticks;
292 }
293
294 static int64_t cpu_get_clock_locked(void)
295 {
296     int64_t time;
297
298     time = timers_state.cpu_clock_offset;
299     if (timers_state.cpu_ticks_enabled) {
300         time += get_clock();
301     }
302
303     return time;
304 }
305
306 /* Return the monotonic time elapsed in VM, i.e.,
307  * the time between vm_start and vm_stop
308  */
309 int64_t cpu_get_clock(void)
310 {
311     int64_t ti;
312     unsigned start;
313
314     do {
315         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
316         ti = cpu_get_clock_locked();
317     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
318
319     return ti;
320 }
321
322 /* enable cpu_get_ticks()
323  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
324  */
325 void cpu_enable_ticks(void)
326 {
327     /* Here, the real thing protected by seqlock is cpu_clock_offset. */
328     seqlock_write_begin(&timers_state.vm_clock_seqlock);
329     if (!timers_state.cpu_ticks_enabled) {
330         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
331         timers_state.cpu_clock_offset -= get_clock();
332         timers_state.cpu_ticks_enabled = 1;
333     }
334     seqlock_write_end(&timers_state.vm_clock_seqlock);
335 }
336
337 /* disable cpu_get_ticks(): the clock is stopped. You must not call
338  * cpu_get_ticks() after that.
339  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
340  */
341 void cpu_disable_ticks(void)
342 {
343     /* Here, the real thing protected by seqlock is cpu_clock_offset. */
344     seqlock_write_begin(&timers_state.vm_clock_seqlock);
345     if (timers_state.cpu_ticks_enabled) {
346         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
347         timers_state.cpu_clock_offset = cpu_get_clock_locked();
348         timers_state.cpu_ticks_enabled = 0;
349     }
350     seqlock_write_end(&timers_state.vm_clock_seqlock);
351 }
352
353 /* Correlation between real and virtual time is always going to be
354    fairly approximate, so ignore small variation.
355    When the guest is idle, real and virtual time will be aligned in
356    the IO wait loop.  */
357 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
358
359 static void icount_adjust(void)
360 {
361     int64_t cur_time;
362     int64_t cur_icount;
363     int64_t delta;
364
365     /* Protected by TimersState mutex.  */
366     static int64_t last_delta;
367
368     /* If the VM is not running, then do nothing.  */
369     if (!runstate_is_running()) {
370         return;
371     }
372
373     seqlock_write_begin(&timers_state.vm_clock_seqlock);
374     cur_time = cpu_get_clock_locked();
375     cur_icount = cpu_get_icount_locked();
376
377     delta = cur_icount - cur_time;
378     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
379     if (delta > 0
380         && last_delta + ICOUNT_WOBBLE < delta * 2
381         && icount_time_shift > 0) {
382         /* The guest is getting too far ahead.  Slow time down.  */
383         icount_time_shift--;
384     }
385     if (delta < 0
386         && last_delta - ICOUNT_WOBBLE > delta * 2
387         && icount_time_shift < MAX_ICOUNT_SHIFT) {
388         /* The guest is getting too far behind.  Speed time up.  */
389         icount_time_shift++;
390     }
391     last_delta = delta;
392     timers_state.qemu_icount_bias = cur_icount
393                               - (timers_state.qemu_icount << icount_time_shift);
394     seqlock_write_end(&timers_state.vm_clock_seqlock);
395 }
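/* Illustrative walk-through, not from the original source: suppose the
 * virtual clock is ahead of real time (delta > 0) and was already
 * ahead on the previous tick (last_delta + ICOUNT_WOBBLE < delta * 2).
 * icount_time_shift is then decremented, so each instruction adds
 * fewer virtual nanoseconds and the guest clock slows down relative to
 * the host. qemu_icount_bias is recomputed so that the current virtual
 * time is unchanged under the new shift; only the future rate changes.
 */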
396
397 static void icount_adjust_rt(void *opaque)
398 {
399     timer_mod(icount_rt_timer,
400               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
401     icount_adjust();
402 }
403
404 static void icount_adjust_vm(void *opaque)
405 {
406     timer_mod(icount_vm_timer,
407                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
408                    NANOSECONDS_PER_SECOND / 10);
409     icount_adjust();
410 }
411
412 static int64_t qemu_icount_round(int64_t count)
413 {
414     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
415 }
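/* Worked example, illustrative only: with icount_time_shift == 3 a
 * 100 ns deadline rounds up to (100 + 7) >> 3 == 13 instructions,
 * which at 8 ns per instruction covers 104 ns, so the instruction
 * budget never undershoots the timer deadline.
 */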
416
417 static void icount_warp_rt(void)
418 {
419     unsigned seq;
420     int64_t warp_start;
421
422     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
423      * changes from -1 to another value, so the race here is okay.
424      */
425     do {
426         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
427         warp_start = vm_clock_warp_start;
428     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
429
430     if (warp_start == -1) {
431         return;
432     }
433
434     seqlock_write_begin(&timers_state.vm_clock_seqlock);
435     if (runstate_is_running()) {
436         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
437                                      cpu_get_clock_locked());
438         int64_t warp_delta;
439
440         warp_delta = clock - vm_clock_warp_start;
441         if (use_icount == 2) {
442             /*
443              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
444              * far ahead of real time.
445              */
446             int64_t cur_icount = cpu_get_icount_locked();
447             int64_t delta = clock - cur_icount;
448             warp_delta = MIN(warp_delta, delta);
449         }
450         timers_state.qemu_icount_bias += warp_delta;
451     }
452     vm_clock_warp_start = -1;
453     seqlock_write_end(&timers_state.vm_clock_seqlock);
454
455     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
456         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
457     }
458 }
459
460 static void icount_timer_cb(void *opaque)
461 {
462     /* No need for a checkpoint because the timer already synchronizes
463      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
464      */
465     icount_warp_rt();
466 }
467
468 void qtest_clock_warp(int64_t dest)
469 {
470     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
471     AioContext *aio_context;
472     assert(qtest_enabled());
473     aio_context = qemu_get_aio_context();
474     while (clock < dest) {
475         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
476         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
477
478         seqlock_write_begin(&timers_state.vm_clock_seqlock);
479         timers_state.qemu_icount_bias += warp;
480         seqlock_write_end(&timers_state.vm_clock_seqlock);
481
482         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
483         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
484         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
485     }
486     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
487 }
488
489 void qemu_start_warp_timer(void)
490 {
491     int64_t clock;
492     int64_t deadline;
493
494     if (!use_icount) {
495         return;
496     }
497
498     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
499      * do not fire, so computing the deadline does not make sense.
500      */
501     if (!runstate_is_running()) {
502         return;
503     }
504
505     /* warp clock deterministically in record/replay mode */
506     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
507         return;
508     }
509
510     if (!all_cpu_threads_idle()) {
511         return;
512     }
513
514     if (qtest_enabled()) {
515         /* When testing, qtest commands advance icount.  */
516         return;
517     }
518
519     /* We want to use the earliest deadline from ALL vm_clocks */
520     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
521     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
522     if (deadline < 0) {
523         static bool notified;
524         if (!icount_sleep && !notified) {
525             error_report("WARNING: icount sleep disabled and no active timers");
526             notified = true;
527         }
528         return;
529     }
530
531     if (deadline > 0) {
532         /*
533          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
534          * sleep.  Otherwise, the CPU might be waiting for a future timer
535          * interrupt to wake it up, but the interrupt never comes because
536          * the vCPU isn't running any insns and thus doesn't advance the
537          * QEMU_CLOCK_VIRTUAL.
538          */
539         if (!icount_sleep) {
540             /*
541              * We never let VCPUs sleep in no-sleep icount mode.
542              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
543              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
544              * It is useful when we want a deterministic execution time,
545              * isolated from host latencies.
546              */
547             seqlock_write_begin(&timers_state.vm_clock_seqlock);
548             timers_state.qemu_icount_bias += deadline;
549             seqlock_write_end(&timers_state.vm_clock_seqlock);
550             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
551         } else {
552             /*
553              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
554              * "real" time (related to the time left until the next event) has
555              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
556              * This keeps the warps from being visible externally; for example,
557              * you will not be sending network packets continuously instead of
558              * every 100ms.
559              */
560             seqlock_write_begin(&timers_state.vm_clock_seqlock);
561             if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
562                 vm_clock_warp_start = clock;
563             }
564             seqlock_write_end(&timers_state.vm_clock_seqlock);
565             timer_mod_anticipate(icount_warp_timer, clock + deadline);
566         }
567     } else if (deadline == 0) {
568         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
569     }
570 }
571
572 static void qemu_account_warp_timer(void)
573 {
574     if (!use_icount || !icount_sleep) {
575         return;
576     }
577
578     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
579      * do not fire, so computing the deadline does not make sense.
580      */
581     if (!runstate_is_running()) {
582         return;
583     }
584
585     /* warp clock deterministically in record/replay mode */
586     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
587         return;
588     }
589
590     timer_del(icount_warp_timer);
591     icount_warp_rt();
592 }
593
594 static bool icount_state_needed(void *opaque)
595 {
596     return use_icount;
597 }
598
599 /*
600  * This is a subsection for icount migration.
601  */
602 static const VMStateDescription icount_vmstate_timers = {
603     .name = "timer/icount",
604     .version_id = 1,
605     .minimum_version_id = 1,
606     .needed = icount_state_needed,
607     .fields = (VMStateField[]) {
608         VMSTATE_INT64(qemu_icount_bias, TimersState),
609         VMSTATE_INT64(qemu_icount, TimersState),
610         VMSTATE_END_OF_LIST()
611     }
612 };
613
614 static const VMStateDescription vmstate_timers = {
615     .name = "timer",
616     .version_id = 2,
617     .minimum_version_id = 1,
618     .fields = (VMStateField[]) {
619         VMSTATE_INT64(cpu_ticks_offset, TimersState),
620         VMSTATE_INT64(dummy, TimersState),
621         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
622         VMSTATE_END_OF_LIST()
623     },
624     .subsections = (const VMStateDescription*[]) {
625         &icount_vmstate_timers,
626         NULL
627     }
628 };
629
630 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
631 {
632     double pct;
633     double throttle_ratio;
634     long sleeptime_ns;
635
636     if (!cpu_throttle_get_percentage()) {
637         return;
638     }
639
640     pct = (double)cpu_throttle_get_percentage()/100;
641     throttle_ratio = pct / (1 - pct);
642     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
643
644     qemu_mutex_unlock_iothread();
645     atomic_set(&cpu->throttle_thread_scheduled, 0);
646     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
647     qemu_mutex_lock_iothread();
648 }
649
650 static void cpu_throttle_timer_tick(void *opaque)
651 {
652     CPUState *cpu;
653     double pct;
654
655     /* Stop the timer if needed */
656     if (!cpu_throttle_get_percentage()) {
657         return;
658     }
659     CPU_FOREACH(cpu) {
660         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
661             async_run_on_cpu(cpu, cpu_throttle_thread,
662                              RUN_ON_CPU_NULL);
663         }
664     }
665
666     pct = (double)cpu_throttle_get_percentage()/100;
667     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
668                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
669 }
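/* Throttle arithmetic, illustrative only: at pct == 0.50 the ratio is
 * 0.5 / 0.5 == 1, so each vCPU sleeps one CPU_THROTTLE_TIMESLICE_NS
 * (10 ms) per 10 ms of execution, and the timer re-arms 20 ms out
 * (10 ms / (1 - 0.5)). At CPU_THROTTLE_PCT_MAX (99%) the ratio is 99:
 * 990 ms of sleep per 10 ms slice, with the timer re-armed 1 s out.
 */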
670
671 void cpu_throttle_set(int new_throttle_pct)
672 {
673     /* Ensure throttle percentage is within valid range */
674     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
675     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
676
677     atomic_set(&throttle_percentage, new_throttle_pct);
678
679     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
680                                        CPU_THROTTLE_TIMESLICE_NS);
681 }
682
683 void cpu_throttle_stop(void)
684 {
685     atomic_set(&throttle_percentage, 0);
686 }
687
688 bool cpu_throttle_active(void)
689 {
690     return (cpu_throttle_get_percentage() != 0);
691 }
692
693 int cpu_throttle_get_percentage(void)
694 {
695     return atomic_read(&throttle_percentage);
696 }
697
698 void cpu_ticks_init(void)
699 {
700     seqlock_init(&timers_state.vm_clock_seqlock);
701     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
702     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
703                                            cpu_throttle_timer_tick, NULL);
704 }
705
706 void configure_icount(QemuOpts *opts, Error **errp)
707 {
708     const char *option;
709     char *rem_str = NULL;
710
711     option = qemu_opt_get(opts, "shift");
712     if (!option) {
713         if (qemu_opt_get(opts, "align") != NULL) {
714             error_setg(errp, "Please specify shift option when using align");
715         }
716         return;
717     }
718
719     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
720     if (icount_sleep) {
721         icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
722                                          icount_timer_cb, NULL);
723     }
724
725     icount_align_option = qemu_opt_get_bool(opts, "align", false);
726
727     if (icount_align_option && !icount_sleep) {
728         error_setg(errp, "align=on and sleep=off are incompatible");
729     }
730     if (strcmp(option, "auto") != 0) {
731         errno = 0;
732         icount_time_shift = strtol(option, &rem_str, 0);
733         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
734             error_setg(errp, "icount: Invalid shift value");
735         }
736         use_icount = 1;
737         return;
738     } else if (icount_align_option) {
739         error_setg(errp, "shift=auto and align=on are incompatible");
740     } else if (!icount_sleep) {
741         error_setg(errp, "shift=auto and sleep=off are incompatible");
742     }
743
744     use_icount = 2;
745
746     /* 125MIPS seems a reasonable initial guess at the guest speed.
747        It will be corrected fairly quickly anyway.  */
748     icount_time_shift = 3;
749
750     /* Have both realtime and virtual time triggers for speed adjustment.
751        The realtime trigger catches emulated time passing too slowly,
752        the virtual time trigger catches emulated time passing too fast.
753        Realtime triggers occur even when idle, so use them less frequently
754        than VM triggers.  */
755     icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
756                                    icount_adjust_rt, NULL);
757     timer_mod(icount_rt_timer,
758                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
759     icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
760                                         icount_adjust_vm, NULL);
761     timer_mod(icount_vm_timer,
762                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
763                    NANOSECONDS_PER_SECOND / 10);
764 }
765
766 /***********************************************************/
767 /* TCG vCPU kick timer
768  *
769  * The kick timer is responsible for moving single-threaded vCPU
770  * emulation on to the next vCPU. If more than one vCPU is running a
771  * timer event will force a cpu->exit so the next vCPU can get
772  * scheduled.
773  *
774  * The timer is removed while all vCPUs are idle and restarted
775  * once any vCPU has work to do again.
776  */
777
778 static QEMUTimer *tcg_kick_vcpu_timer;
779 static CPUState *tcg_current_rr_cpu;
780
781 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
782
783 static inline int64_t qemu_tcg_next_kick(void)
784 {
785     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
786 }
787
788 /* Kick the currently round-robin scheduled vCPU */
789 static void qemu_cpu_kick_rr_cpu(void)
790 {
791     CPUState *cpu;
792     do {
793         cpu = atomic_mb_read(&tcg_current_rr_cpu);
794         if (cpu) {
795             cpu_exit(cpu);
796         }
797     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
798 }
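/* Illustrative note, not from the original source: the loop above
 * re-reads tcg_current_rr_cpu after the kick. If the round-robin
 * scheduler switched to another vCPU in the meantime, that vCPU is
 * kicked as well, so the exit request cannot be lost to the race.
 */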
799
800 static void kick_tcg_thread(void *opaque)
801 {
802     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
803     qemu_cpu_kick_rr_cpu();
804 }
805
806 static void start_tcg_kick_timer(void)
807 {
808     if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
809         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
810                                            kick_tcg_thread, NULL);
811         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
812     }
813 }
814
815 static void stop_tcg_kick_timer(void)
816 {
817     if (tcg_kick_vcpu_timer) {
818         timer_del(tcg_kick_vcpu_timer);
819         tcg_kick_vcpu_timer = NULL;
820     }
821 }
822
823 /***********************************************************/
824 void hw_error(const char *fmt, ...)
825 {
826     va_list ap;
827     CPUState *cpu;
828
829     va_start(ap, fmt);
830     fprintf(stderr, "qemu: hardware error: ");
831     vfprintf(stderr, fmt, ap);
832     fprintf(stderr, "\n");
833     CPU_FOREACH(cpu) {
834         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
835         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
836     }
837     va_end(ap);
838     abort();
839 }
840
841 void cpu_synchronize_all_states(void)
842 {
843     CPUState *cpu;
844
845     CPU_FOREACH(cpu) {
846         cpu_synchronize_state(cpu);
847     }
848 }
849
850 void cpu_synchronize_all_post_reset(void)
851 {
852     CPUState *cpu;
853
854     CPU_FOREACH(cpu) {
855         cpu_synchronize_post_reset(cpu);
856     }
857 }
858
859 void cpu_synchronize_all_post_init(void)
860 {
861     CPUState *cpu;
862
863     CPU_FOREACH(cpu) {
864         cpu_synchronize_post_init(cpu);
865     }
866 }
867
868 static int do_vm_stop(RunState state)
869 {
870     int ret = 0;
871
872     if (runstate_is_running()) {
873         cpu_disable_ticks();
874         pause_all_vcpus();
875         runstate_set(state);
876         vm_state_notify(0, state);
877         qapi_event_send_stop(&error_abort);
878     }
879
880     bdrv_drain_all();
881     replay_disable_events();
882     ret = bdrv_flush_all();
883
884     return ret;
885 }
886
887 static bool cpu_can_run(CPUState *cpu)
888 {
889     if (cpu->stop) {
890         return false;
891     }
892     if (cpu_is_stopped(cpu)) {
893         return false;
894     }
895     return true;
896 }
897
898 static void cpu_handle_guest_debug(CPUState *cpu)
899 {
900     gdb_set_stop_cpu(cpu);
901     qemu_system_debug_request();
902     cpu->stopped = true;
903 }
904
905 #ifdef CONFIG_LINUX
906 static void sigbus_reraise(void)
907 {
908     sigset_t set;
909     struct sigaction action;
910
911     memset(&action, 0, sizeof(action));
912     action.sa_handler = SIG_DFL;
913     if (!sigaction(SIGBUS, &action, NULL)) {
914         raise(SIGBUS);
915         sigemptyset(&set);
916         sigaddset(&set, SIGBUS);
917         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
918     }
919     perror("Failed to re-raise SIGBUS!");
920     abort();
921 }
922
923 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
924 {
925     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
926         sigbus_reraise();
927     }
928
929     if (current_cpu) {
930         /* Called asynchronously in VCPU thread.  */
931         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
932             sigbus_reraise();
933         }
934     } else {
935         /* Called synchronously (via signalfd) in main thread.  */
936         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
937             sigbus_reraise();
938         }
939     }
940 }
941
942 static void qemu_init_sigbus(void)
943 {
944     struct sigaction action;
945
946     memset(&action, 0, sizeof(action));
947     action.sa_flags = SA_SIGINFO;
948     action.sa_sigaction = sigbus_handler;
949     sigaction(SIGBUS, &action, NULL);
950
951     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
952 }
953 #else /* !CONFIG_LINUX */
954 static void qemu_init_sigbus(void)
955 {
956 }
957 #endif /* !CONFIG_LINUX */
958
959 static QemuMutex qemu_global_mutex;
960
961 static QemuThread io_thread;
962
963 /* cpu creation */
964 static QemuCond qemu_cpu_cond;
965 /* system init */
966 static QemuCond qemu_pause_cond;
967
968 void qemu_init_cpu_loop(void)
969 {
970     qemu_init_sigbus();
971     qemu_cond_init(&qemu_cpu_cond);
972     qemu_cond_init(&qemu_pause_cond);
973     qemu_mutex_init(&qemu_global_mutex);
974
975     qemu_thread_get_self(&io_thread);
976 }
977
978 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
979 {
980     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
981 }
982
983 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
984 {
985     if (kvm_destroy_vcpu(cpu) < 0) {
986         error_report("kvm_destroy_vcpu failed");
987         exit(EXIT_FAILURE);
988     }
989 }
990
991 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
992 {
993 }
994
995 static void qemu_wait_io_event_common(CPUState *cpu)
996 {
997     atomic_mb_set(&cpu->thread_kicked, false);
998     if (cpu->stop) {
999         cpu->stop = false;
1000         cpu->stopped = true;
1001         qemu_cond_broadcast(&qemu_pause_cond);
1002     }
1003     process_queued_cpu_work(cpu);
1004 }
1005
1006 static bool qemu_tcg_should_sleep(CPUState *cpu)
1007 {
1008     if (mttcg_enabled) {
1009         return cpu_thread_is_idle(cpu);
1010     } else {
1011         return all_cpu_threads_idle();
1012     }
1013 }
1014
1015 static void qemu_tcg_wait_io_event(CPUState *cpu)
1016 {
1017     while (qemu_tcg_should_sleep(cpu)) {
1018         stop_tcg_kick_timer();
1019         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1020     }
1021
1022     start_tcg_kick_timer();
1023
1024     qemu_wait_io_event_common(cpu);
1025 }
1026
1027 static void qemu_kvm_wait_io_event(CPUState *cpu)
1028 {
1029     while (cpu_thread_is_idle(cpu)) {
1030         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1031     }
1032
1033     qemu_wait_io_event_common(cpu);
1034 }
1035
1036 static void *qemu_kvm_cpu_thread_fn(void *arg)
1037 {
1038     CPUState *cpu = arg;
1039     int r;
1040
1041     rcu_register_thread();
1042
1043     qemu_mutex_lock_iothread();
1044     qemu_thread_get_self(cpu->thread);
1045     cpu->thread_id = qemu_get_thread_id();
1046     cpu->can_do_io = 1;
1047     current_cpu = cpu;
1048
1049     r = kvm_init_vcpu(cpu);
1050     if (r < 0) {
1051         fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1052         exit(1);
1053     }
1054
1055     kvm_init_cpu_signals(cpu);
1056
1057     /* signal CPU creation */
1058     cpu->created = true;
1059     qemu_cond_signal(&qemu_cpu_cond);
1060
1061     do {
1062         if (cpu_can_run(cpu)) {
1063             r = kvm_cpu_exec(cpu);
1064             if (r == EXCP_DEBUG) {
1065                 cpu_handle_guest_debug(cpu);
1066             }
1067         }
1068         qemu_kvm_wait_io_event(cpu);
1069     } while (!cpu->unplug || cpu_can_run(cpu));
1070
1071     qemu_kvm_destroy_vcpu(cpu);
1072     cpu->created = false;
1073     qemu_cond_signal(&qemu_cpu_cond);
1074     qemu_mutex_unlock_iothread();
1075     return NULL;
1076 }
1077
1078 static void *qemu_dummy_cpu_thread_fn(void *arg)
1079 {
1080 #ifdef _WIN32
1081     fprintf(stderr, "qtest is not supported under Windows\n");
1082     exit(1);
1083 #else
1084     CPUState *cpu = arg;
1085     sigset_t waitset;
1086     int r;
1087
1088     rcu_register_thread();
1089
1090     qemu_mutex_lock_iothread();
1091     qemu_thread_get_self(cpu->thread);
1092     cpu->thread_id = qemu_get_thread_id();
1093     cpu->can_do_io = 1;
1094     current_cpu = cpu;
1095
1096     sigemptyset(&waitset);
1097     sigaddset(&waitset, SIG_IPI);
1098
1099     /* signal CPU creation */
1100     cpu->created = true;
1101     qemu_cond_signal(&qemu_cpu_cond);
1102
1103     while (1) {
1104         qemu_mutex_unlock_iothread();
1105         do {
1106             int sig;
1107             r = sigwait(&waitset, &sig);
1108         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1109         if (r == -1) {
1110             perror("sigwait");
1111             exit(1);
1112         }
1113         qemu_mutex_lock_iothread();
1114         qemu_wait_io_event_common(cpu);
1115     }
1116
1117     return NULL;
1118 #endif
1119 }
1120
1121 static int64_t tcg_get_icount_limit(void)
1122 {
1123     int64_t deadline;
1124
1125     if (replay_mode != REPLAY_MODE_PLAY) {
1126         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1127
1128         /* Maintain prior (possibly buggy) behaviour where if no deadline
1129          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1130          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1131          * nanoseconds.
1132          */
1133         if ((deadline < 0) || (deadline > INT32_MAX)) {
1134             deadline = INT32_MAX;
1135         }
1136
1137         return qemu_icount_round(deadline);
1138     } else {
1139         return replay_get_instructions();
1140     }
1141 }
1142
1143 static void handle_icount_deadline(void)
1144 {
1145     if (use_icount) {
1146         int64_t deadline =
1147             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1148
1149         if (deadline == 0) {
1150             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1151         }
1152     }
1153 }
1154
1155 static int tcg_cpu_exec(CPUState *cpu)
1156 {
1157     int ret;
1158 #ifdef CONFIG_PROFILER
1159     int64_t ti;
1160 #endif
1161
1162 #ifdef CONFIG_PROFILER
1163     ti = profile_getclock();
1164 #endif
1165     if (use_icount) {
1166         int64_t count;
1167         int decr;
1168         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1169                                     + cpu->icount_extra);
1170         cpu->icount_decr.u16.low = 0;
1171         cpu->icount_extra = 0;
1172         count = tcg_get_icount_limit();
1173         timers_state.qemu_icount += count;
1174         decr = (count > 0xffff) ? 0xffff : count;
1175         count -= decr;
1176         cpu->icount_decr.u16.low = decr;
1177         cpu->icount_extra = count;
1178     }
1179     qemu_mutex_unlock_iothread();
1180     cpu_exec_start(cpu);
1181     ret = cpu_exec(cpu);
1182     cpu_exec_end(cpu);
1183     qemu_mutex_lock_iothread();
1184 #ifdef CONFIG_PROFILER
1185     tcg_time += profile_getclock() - ti;
1186 #endif
1187     if (use_icount) {
1188         /* Fold pending instructions back into the
1189            instruction counter, and clear the interrupt flag.  */
1190         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1191                         + cpu->icount_extra);
1192         cpu->icount_decr.u32 = 0;
1193         cpu->icount_extra = 0;
1194         replay_account_executed_instructions();
1195     }
1196     return ret;
1197 }
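/* Budget split, illustrative only: tcg_get_icount_limit() might return
 * 70000 instructions. The low 16-bit counter holds at most 0xffff, so
 * decr == 65535 goes into icount_decr.u16.low and the remaining 4465
 * into icount_extra; the execution loop refills u16.low from
 * icount_extra as it drains until the whole budget is consumed.
 */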
1198
1199 /* Destroy any remaining vCPUs which have been unplugged and have
1200  * finished running
1201  */
1202 static void deal_with_unplugged_cpus(void)
1203 {
1204     CPUState *cpu;
1205
1206     CPU_FOREACH(cpu) {
1207         if (cpu->unplug && !cpu_can_run(cpu)) {
1208             qemu_tcg_destroy_vcpu(cpu);
1209             cpu->created = false;
1210             qemu_cond_signal(&qemu_cpu_cond);
1211             break;
1212         }
1213     }
1214 }
1215
1216 /* Single-threaded TCG
1217  *
1218  * In the single-threaded case each vCPU is simulated in turn. If
1219  * there is more than a single vCPU we create a simple timer to kick
1220  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1221  * This is done explicitly rather than relying on side-effects
1222  * elsewhere.
1223  */
1224
1225 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1226 {
1227     CPUState *cpu = arg;
1228
1229     rcu_register_thread();
1230
1231     qemu_mutex_lock_iothread();
1232     qemu_thread_get_self(cpu->thread);
1233
1234     CPU_FOREACH(cpu) {
1235         cpu->thread_id = qemu_get_thread_id();
1236         cpu->created = true;
1237         cpu->can_do_io = 1;
1238     }
1239     qemu_cond_signal(&qemu_cpu_cond);
1240
1241     /* wait for initial kick-off after machine start */
1242     while (first_cpu->stopped) {
1243         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1244
1245         /* process any pending work */
1246         CPU_FOREACH(cpu) {
1247             current_cpu = cpu;
1248             qemu_wait_io_event_common(cpu);
1249         }
1250     }
1251
1252     start_tcg_kick_timer();
1253
1254     cpu = first_cpu;
1255
1256     /* process any pending work */
1257     cpu->exit_request = 1;
1258
1259     while (1) {
1260         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1261         qemu_account_warp_timer();
1262
1263         if (!cpu) {
1264             cpu = first_cpu;
1265         }
1266
1267         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1268
1269             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1270             current_cpu = cpu;
1271
1272             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1273                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1274
1275             if (cpu_can_run(cpu)) {
1276                 int r;
1277                 r = tcg_cpu_exec(cpu);
1278                 if (r == EXCP_DEBUG) {
1279                     cpu_handle_guest_debug(cpu);
1280                     break;
1281                 } else if (r == EXCP_ATOMIC) {
1282                     qemu_mutex_unlock_iothread();
1283                     cpu_exec_step_atomic(cpu);
1284                     qemu_mutex_lock_iothread();
1285                     break;
1286                 }
1287             } else if (cpu->stop) {
1288                 if (cpu->unplug) {
1289                     cpu = CPU_NEXT(cpu);
1290                 }
1291                 break;
1292             }
1293
1294             cpu = CPU_NEXT(cpu);
1295         } /* while (cpu && !cpu->exit_request) */
1296
1297         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1298         atomic_set(&tcg_current_rr_cpu, NULL);
1299
1300         if (cpu && cpu->exit_request) {
1301             atomic_mb_set(&cpu->exit_request, 0);
1302         }
1303
1304         handle_icount_deadline();
1305
1306         qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1307         deal_with_unplugged_cpus();
1308     }
1309
1310     return NULL;
1311 }
1312
1313 static void *qemu_hax_cpu_thread_fn(void *arg)
1314 {
1315     CPUState *cpu = arg;
1316     int r;
1317     qemu_thread_get_self(cpu->thread);
1318     qemu_mutex_lock(&qemu_global_mutex);
1319
1320     cpu->thread_id = qemu_get_thread_id();
1321     cpu->created = true;
1322     cpu->halted = 0;
1323     current_cpu = cpu;
1324
1325     hax_init_vcpu(cpu);
1326     qemu_cond_signal(&qemu_cpu_cond);
1327
1328     while (1) {
1329         if (cpu_can_run(cpu)) {
1330             r = hax_smp_cpu_exec(cpu);
1331             if (r == EXCP_DEBUG) {
1332                 cpu_handle_guest_debug(cpu);
1333             }
1334         }
1335
1336         while (cpu_thread_is_idle(cpu)) {
1337             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1338         }
1339 #ifdef _WIN32
1340         SleepEx(0, TRUE);
1341 #endif
1342         qemu_wait_io_event_common(cpu);
1343     }
1344     return NULL;
1345 }
1346
1347 #ifdef _WIN32
1348 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1349 {
1350 }
1351 #endif
1352
1353 /* Multi-threaded TCG
1354  *
1355  * In the multi-threaded case each vCPU has its own thread. The TLS
1356  * variable current_cpu can be used deep in the code to find the
1357  * current CPUState for a given thread.
1358  */
1359
1360 static void *qemu_tcg_cpu_thread_fn(void *arg)
1361 {
1362     CPUState *cpu = arg;
1363
1364     rcu_register_thread();
1365
1366     qemu_mutex_lock_iothread();
1367     qemu_thread_get_self(cpu->thread);
1368
1369     cpu->thread_id = qemu_get_thread_id();
1370     cpu->created = true;
1371     cpu->can_do_io = 1;
1372     current_cpu = cpu;
1373     qemu_cond_signal(&qemu_cpu_cond);
1374
1375     /* process any pending work */
1376     cpu->exit_request = 1;
1377
1378     while (1) {
1379         if (cpu_can_run(cpu)) {
1380             int r;
1381             r = tcg_cpu_exec(cpu);
1382             switch (r) {
1383             case EXCP_DEBUG:
1384                 cpu_handle_guest_debug(cpu);
1385                 break;
1386             case EXCP_HALTED:
1387                 /* during start-up the vCPU is reset and the thread is
1388                  * kicked several times. If we don't ensure we go back
1389                  * to sleep in the halted state we won't cleanly
1390                  * start up when the vCPU is enabled.
1391                  *
1392                  * cpu->halted should ensure we sleep in wait_io_event
1393                  */
1394                 g_assert(cpu->halted);
1395                 break;
1396             case EXCP_ATOMIC:
1397                 qemu_mutex_unlock_iothread();
1398                 cpu_exec_step_atomic(cpu);
1399                 qemu_mutex_lock_iothread();
1400             default:
1401                 /* Ignore everything else? */
1402                 break;
1403             }
1404         }
1405
1406         handle_icount_deadline();
1407
1408         atomic_mb_set(&cpu->exit_request, 0);
1409         qemu_tcg_wait_io_event(cpu);
1410     }
1411
1412     return NULL;
1413 }
1414
1415 static void qemu_cpu_kick_thread(CPUState *cpu)
1416 {
1417 #ifndef _WIN32
1418     int err;
1419
1420     if (cpu->thread_kicked) {
1421         return;
1422     }
1423     cpu->thread_kicked = true;
1424     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1425     if (err) {
1426         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1427         exit(1);
1428     }
1429 #else /* _WIN32 */
1430     if (!qemu_cpu_is_self(cpu)) {
1431         if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1432             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1433                     __func__, GetLastError());
1434             exit(1);
1435         }
1436     }
1437 #endif
1438 }
1439
1440 void qemu_cpu_kick(CPUState *cpu)
1441 {
1442     qemu_cond_broadcast(cpu->halt_cond);
1443     if (tcg_enabled()) {
1444         cpu_exit(cpu);
1445         /* NOP unless doing single-thread RR */
1446         qemu_cpu_kick_rr_cpu();
1447     } else {
1448         if (hax_enabled()) {
1449             /*
1450              * FIXME: race condition with the exit_request check in
1451              * hax_vcpu_hax_exec
1452              */
1453             cpu->exit_request = 1;
1454         }
1455         qemu_cpu_kick_thread(cpu);
1456     }
1457 }
1458
1459 void qemu_cpu_kick_self(void)
1460 {
1461     assert(current_cpu);
1462     qemu_cpu_kick_thread(current_cpu);
1463 }
1464
1465 bool qemu_cpu_is_self(CPUState *cpu)
1466 {
1467     return qemu_thread_is_self(cpu->thread);
1468 }
1469
1470 bool qemu_in_vcpu_thread(void)
1471 {
1472     return current_cpu && qemu_cpu_is_self(current_cpu);
1473 }
1474
1475 static __thread bool iothread_locked = false;
1476
1477 bool qemu_mutex_iothread_locked(void)
1478 {
1479     return iothread_locked;
1480 }
1481
1482 void qemu_mutex_lock_iothread(void)
1483 {
1484     g_assert(!qemu_mutex_iothread_locked());
1485     qemu_mutex_lock(&qemu_global_mutex);
1486     iothread_locked = true;
1487 }
1488
1489 void qemu_mutex_unlock_iothread(void)
1490 {
1491     g_assert(qemu_mutex_iothread_locked());
1492     iothread_locked = false;
1493     qemu_mutex_unlock(&qemu_global_mutex);
1494 }
1495
1496 static bool all_vcpus_paused(void)
1497 {
1498     CPUState *cpu;
1499
1500     CPU_FOREACH(cpu) {
1501         if (!cpu->stopped) {
1502             return false;
1503         }
1504     }
1505
1506     return true;
1507 }
1508
1509 void pause_all_vcpus(void)
1510 {
1511     CPUState *cpu;
1512
1513     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1514     CPU_FOREACH(cpu) {
1515         cpu->stop = true;
1516         qemu_cpu_kick(cpu);
1517     }
1518
1519     if (qemu_in_vcpu_thread()) {
1520         cpu_stop_current();
1521     }
1522
1523     while (!all_vcpus_paused()) {
1524         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1525         CPU_FOREACH(cpu) {
1526             qemu_cpu_kick(cpu);
1527         }
1528     }
1529 }
1530
1531 void cpu_resume(CPUState *cpu)
1532 {
1533     cpu->stop = false;
1534     cpu->stopped = false;
1535     qemu_cpu_kick(cpu);
1536 }
1537
1538 void resume_all_vcpus(void)
1539 {
1540     CPUState *cpu;
1541
1542     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1543     CPU_FOREACH(cpu) {
1544         cpu_resume(cpu);
1545     }
1546 }
1547
1548 void cpu_remove(CPUState *cpu)
1549 {
1550     cpu->stop = true;
1551     cpu->unplug = true;
1552     qemu_cpu_kick(cpu);
1553 }
1554
1555 void cpu_remove_sync(CPUState *cpu)
1556 {
1557     cpu_remove(cpu);
1558     while (cpu->created) {
1559         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1560     }
1561 }
1562
1563 /* Size of temporary buffers used for forming a vCPU thread name */
1564 #define VCPU_THREAD_NAME_SIZE 16
1565
1566 static void qemu_tcg_init_vcpu(CPUState *cpu)
1567 {
1568     char thread_name[VCPU_THREAD_NAME_SIZE];
1569     static QemuCond *single_tcg_halt_cond;
1570     static QemuThread *single_tcg_cpu_thread;
1571
1572     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1573         cpu->thread = g_malloc0(sizeof(QemuThread));
1574         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1575         qemu_cond_init(cpu->halt_cond);
1576
1577         if (qemu_tcg_mttcg_enabled()) {
1578             /* create a thread per vCPU with TCG (MTTCG) */
1579             parallel_cpus = true;
1580             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1581                  cpu->cpu_index);
1582
1583             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1584                                cpu, QEMU_THREAD_JOINABLE);
1585
1586         } else {
1587             /* share a single thread for all cpus with TCG */
1588             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1589             qemu_thread_create(cpu->thread, thread_name,
1590                                qemu_tcg_rr_cpu_thread_fn,
1591                                cpu, QEMU_THREAD_JOINABLE);
1592
1593             single_tcg_halt_cond = cpu->halt_cond;
1594             single_tcg_cpu_thread = cpu->thread;
1595         }
1596 #ifdef _WIN32
1597         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1598 #endif
1599         while (!cpu->created) {
1600             qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1601         }
1602     } else {
1603         /* For non-MTTCG cases we share the thread */
1604         cpu->thread = single_tcg_cpu_thread;
1605         cpu->halt_cond = single_tcg_halt_cond;
1606     }
1607 }
1608
1609 static void qemu_hax_start_vcpu(CPUState *cpu)
1610 {
1611     char thread_name[VCPU_THREAD_NAME_SIZE];
1612
1613     cpu->thread = g_malloc0(sizeof(QemuThread));
1614     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1615     qemu_cond_init(cpu->halt_cond);
1616
1617     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1618              cpu->cpu_index);
1619     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1620                        cpu, QEMU_THREAD_JOINABLE);
1621 #ifdef _WIN32
1622     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1623 #endif
1624     while (!cpu->created) {
1625         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1626     }
1627 }
1628
1629 static void qemu_kvm_start_vcpu(CPUState *cpu)
1630 {
1631     char thread_name[VCPU_THREAD_NAME_SIZE];
1632
1633     cpu->thread = g_malloc0(sizeof(QemuThread));
1634     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1635     qemu_cond_init(cpu->halt_cond);
1636     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1637              cpu->cpu_index);
1638     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1639                        cpu, QEMU_THREAD_JOINABLE);
1640     while (!cpu->created) {
1641         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1642     }
1643 }
1644
1645 static void qemu_dummy_start_vcpu(CPUState *cpu)
1646 {
1647     char thread_name[VCPU_THREAD_NAME_SIZE];
1648
1649     cpu->thread = g_malloc0(sizeof(QemuThread));
1650     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1651     qemu_cond_init(cpu->halt_cond);
1652     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1653              cpu->cpu_index);
1654     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1655                        QEMU_THREAD_JOINABLE);
1656     while (!cpu->created) {
1657         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1658     }
1659 }
1660
1661 void qemu_init_vcpu(CPUState *cpu)
1662 {
1663     cpu->nr_cores = smp_cores;
1664     cpu->nr_threads = smp_threads;
1665     cpu->stopped = true;
1666
1667     if (!cpu->as) {
1668         /* If the target cpu hasn't set up any address spaces itself,
1669          * give it the default one.
1670          */
1671         AddressSpace *as = address_space_init_shareable(cpu->memory,
1672                                                         "cpu-memory");
1673         cpu->num_ases = 1;
1674         cpu_address_space_init(cpu, as, 0);
1675     }
1676
1677     if (kvm_enabled()) {
1678         qemu_kvm_start_vcpu(cpu);
1679     } else if (hax_enabled()) {
1680         qemu_hax_start_vcpu(cpu);
1681     } else if (tcg_enabled()) {
1682         qemu_tcg_init_vcpu(cpu);
1683     } else {
1684         qemu_dummy_start_vcpu(cpu);
1685     }
1686 }
1687
1688 void cpu_stop_current(void)
1689 {
1690     if (current_cpu) {
1691         current_cpu->stop = false;
1692         current_cpu->stopped = true;
1693         cpu_exit(current_cpu);
1694         qemu_cond_broadcast(&qemu_pause_cond);
1695     }
1696 }
1697
1698 int vm_stop(RunState state)
1699 {
1700     if (qemu_in_vcpu_thread()) {
1701         qemu_system_vmstop_request_prepare();
1702         qemu_system_vmstop_request(state);
1703         /*
1704          * FIXME: should not return to device code in case
1705          * vm_stop() has been requested.
1706          */
1707         cpu_stop_current();
1708         return 0;
1709     }
1710
1711     return do_vm_stop(state);
1712 }
1713
1714 /**
1715  * Prepare for (re)starting the VM.
1716  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1717  * running or in case of an error condition), 0 otherwise.
1718  */
1719 int vm_prepare_start(void)
1720 {
1721     RunState requested;
1722     int res = 0;
1723
1724     qemu_vmstop_requested(&requested);
1725     if (runstate_is_running() && requested == RUN_STATE__MAX) {
1726         return -1;
1727     }
1728
1729     /* Ensure that a STOP/RESUME pair of events is emitted if a
1730      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
1731      * example, is documented to always be followed by the STOP
1732      * event.
1733      */
1734     if (runstate_is_running()) {
1735         qapi_event_send_stop(&error_abort);
1736         res = -1;
1737     } else {
1738         replay_enable_events();
1739         cpu_enable_ticks();
1740         runstate_set(RUN_STATE_RUNNING);
1741         vm_state_notify(1, RUN_STATE_RUNNING);
1742     }
1743
1744     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
1745     qapi_event_send_resume(&error_abort);
1746     return res;
1747 }
1748
1749 void vm_start(void)
1750 {
1751     if (!vm_prepare_start()) {
1752         resume_all_vcpus();
1753     }
1754 }
1755
1756 /* Does a state transition even if the VM is already stopped;
1757    the current state is forgotten forever */
1758 int vm_stop_force_state(RunState state)
1759 {
1760     if (runstate_is_running()) {
1761         return vm_stop(state);
1762     } else {
1763         runstate_set(state);
1764
1765         bdrv_drain_all();
1766         /* Make sure to return an error if the flush in a previous vm_stop()
1767          * failed. */
1768         return bdrv_flush_all();
1769     }
1770 }
1771
1772 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1773 {
1774     /* XXX: implement xxx_cpu_list for targets that still lack it */
1775 #if defined(cpu_list)
1776     cpu_list(f, cpu_fprintf);
1777 #endif
1778 }
1779
1780 CpuInfoList *qmp_query_cpus(Error **errp)
1781 {
1782     CpuInfoList *head = NULL, *cur_item = NULL;
1783     CPUState *cpu;
1784
1785     CPU_FOREACH(cpu) {
1786         CpuInfoList *info;
1787 #if defined(TARGET_I386)
1788         X86CPU *x86_cpu = X86_CPU(cpu);
1789         CPUX86State *env = &x86_cpu->env;
1790 #elif defined(TARGET_PPC)
1791         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1792         CPUPPCState *env = &ppc_cpu->env;
1793 #elif defined(TARGET_SPARC)
1794         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1795         CPUSPARCState *env = &sparc_cpu->env;
1796 #elif defined(TARGET_MIPS)
1797         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1798         CPUMIPSState *env = &mips_cpu->env;
1799 #elif defined(TARGET_TRICORE)
1800         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1801         CPUTriCoreState *env = &tricore_cpu->env;
1802 #endif
1803
1804         cpu_synchronize_state(cpu);
1805
1806         info = g_malloc0(sizeof(*info));
1807         info->value = g_malloc0(sizeof(*info->value));
1808         info->value->CPU = cpu->cpu_index;
1809         info->value->current = (cpu == first_cpu);
1810         info->value->halted = cpu->halted;
1811         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1812         info->value->thread_id = cpu->thread_id;
1813 #if defined(TARGET_I386)
1814         info->value->arch = CPU_INFO_ARCH_X86;
1815         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1816 #elif defined(TARGET_PPC)
1817         info->value->arch = CPU_INFO_ARCH_PPC;
1818         info->value->u.ppc.nip = env->nip;
1819 #elif defined(TARGET_SPARC)
1820         info->value->arch = CPU_INFO_ARCH_SPARC;
1821         info->value->u.q_sparc.pc = env->pc;
1822         info->value->u.q_sparc.npc = env->npc;
1823 #elif defined(TARGET_MIPS)
1824         info->value->arch = CPU_INFO_ARCH_MIPS;
1825         info->value->u.q_mips.PC = env->active_tc.PC;
1826 #elif defined(TARGET_TRICORE)
1827         info->value->arch = CPU_INFO_ARCH_TRICORE;
1828         info->value->u.tricore.PC = env->PC;
1829 #else
1830         info->value->arch = CPU_INFO_ARCH_OTHER;
1831 #endif
1832
1833         /* XXX: waiting for the qapi to support GSList */
1834         if (!cur_item) {
1835             head = cur_item = info;
1836         } else {
1837             cur_item->next = info;
1838             cur_item = info;
1839         }
1840     }
1841
1842     return head;
1843 }
1844
1845 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1846                  bool has_cpu, int64_t cpu_index, Error **errp)
1847 {
1848     FILE *f;
1849     uint32_t l;
1850     CPUState *cpu;
1851     uint8_t buf[1024];
1852     int64_t orig_addr = addr, orig_size = size;
1853
1854     if (!has_cpu) {
1855         cpu_index = 0;
1856     }
1857
1858     cpu = qemu_get_cpu(cpu_index);
1859     if (cpu == NULL) {
1860         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1861                    "a CPU number");
1862         return;
1863     }
1864
1865     f = fopen(filename, "wb");
1866     if (!f) {
1867         error_setg_file_open(errp, errno, filename);
1868         return;
1869     }
1870
1871     while (size != 0) {
1872         l = sizeof(buf);
1873         if (l > size)
1874             l = size;
1875         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1876             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1877                              " specified", orig_addr, orig_size);
1878             goto exit;
1879         }
1880         if (fwrite(buf, 1, l, f) != l) {
1881             error_setg(errp, QERR_IO_ERROR);
1882             goto exit;
1883         }
1884         addr += l;
1885         size -= l;
1886     }
1887
1888 exit:
1889     fclose(f);
1890 }
1891
1892 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1893                   Error **errp)
1894 {
1895     FILE *f;
1896     uint32_t l;
1897     uint8_t buf[1024];
1898
1899     f = fopen(filename, "wb");
1900     if (!f) {
1901         error_setg_file_open(errp, errno, filename);
1902         return;
1903     }
1904
1905     while (size != 0) {
1906         l = sizeof(buf);
1907         if (l > size)
1908             l = size;
1909         cpu_physical_memory_read(addr, buf, l);
1910         if (fwrite(buf, 1, l, f) != l) {
1911             error_setg(errp, QERR_IO_ERROR);
1912             goto exit;
1913         }
1914         addr += l;
1915         size -= l;
1916     }
1917
1918 exit:
1919     fclose(f);
1920 }
1921
1922 void qmp_inject_nmi(Error **errp)
1923 {
1924     nmi_monitor_handle(monitor_get_cpu_index(), errp);
1925 }
1926
1927 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1928 {
1929     if (!use_icount) {
1930         return;
1931     }
1932
1933     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
1934                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1935     if (icount_align_option) {
1936         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
1937         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
1938     } else {
1939         cpu_fprintf(f, "Max guest delay     NA\n");
1940         cpu_fprintf(f, "Max guest advance   NA\n");
1941     }
1942 }