[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53
54 #ifdef CONFIG_LINUX
55
56 #include <sys/prctl.h>
57
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
61
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
65
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
69
70 #endif /* CONFIG_LINUX */
71
72 int64_t max_delay;
73 int64_t max_advance;
74
75 /* vcpu throttling controls */
76 static QEMUTimer *throttle_timer;
77 static unsigned int throttle_percentage;
78
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
82
83 bool cpu_is_stopped(CPUState *cpu)
84 {
85     return cpu->stopped || !runstate_is_running();
86 }
87
88 static bool cpu_thread_is_idle(CPUState *cpu)
89 {
90     if (cpu->stop || cpu->queued_work_first) {
91         return false;
92     }
93     if (cpu_is_stopped(cpu)) {
94         return true;
95     }
96     if (!cpu->halted || cpu_has_work(cpu) ||
97         kvm_halt_in_kernel()) {
98         return false;
99     }
100     return true;
101 }
102
103 static bool all_cpu_threads_idle(void)
104 {
105     CPUState *cpu;
106
107     CPU_FOREACH(cpu) {
108         if (!cpu_thread_is_idle(cpu)) {
109             return false;
110         }
111     }
112     return true;
113 }
114
115 /***********************************************************/
116 /* guest cycle counter */
117
118 /* Protected by TimersState seqlock */
119
120 static bool icount_sleep = true;
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks.  */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
125 #define MAX_ICOUNT_SHIFT 10
126
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
130
131 typedef struct TimersState {
132     /* Protected by BQL.  */
133     int64_t cpu_ticks_prev;
134     int64_t cpu_ticks_offset;
135
136     /* cpu_clock_offset can be read outside the BQL, so protect it
137      * with this lock.
138      */
139     QemuSeqLock vm_clock_seqlock;
140     int64_t cpu_clock_offset;
141     int32_t cpu_ticks_enabled;
142     int64_t dummy;
143
144     /* Compensate for varying guest execution speed.  */
145     int64_t qemu_icount_bias;
146     /* Only written by TCG thread */
147     int64_t qemu_icount;
148 } TimersState;
149
150 static TimersState timers_state;
151 bool mttcg_enabled;
152
153 /*
154  * We default to false if we know other options have been enabled
155  * which are currently incompatible with MTTCG. Otherwise, once each
156  * guest (target) has been updated to support:
157  *   - atomic instructions
158  *   - memory ordering primitives (barriers)
159  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
160  *
161  * Once a guest architecture has been converted to the new primitives
162  * there are two remaining limitations to check.
163  *
164  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165  * - The host must have a stronger memory order than the guest
166  *
167  * It may be possible in the future to support strong guests on weak
168  * hosts, but that will require tagging all loads/stores in a guest
169  * with their implicit memory ordering requirements, which would
170  * likely slow things down a lot.
171  */
172
173 static bool check_tcg_memory_orders_compatible(void)
174 {
175 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
176     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
177 #else
178     return false;
179 #endif
180 }
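/*
 * A rough way to read the check above: the guest's default memory-ordering
 * requirements must be a subset of what the host TCG backend guarantees.
 * For example, a strongly ordered guest (such as x86) running on a weakly
 * ordered host (such as Arm) leaves bits set in
 * TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO, so the check fails and
 * MTTCG stays off by default; a weakly ordered guest on an x86 host passes.
 */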
181
182 static bool default_mttcg_enabled(void)
183 {
184     if (use_icount || TCG_OVERSIZED_GUEST) {
185         return false;
186     } else {
187 #ifdef TARGET_SUPPORTS_MTTCG
188         return check_tcg_memory_orders_compatible();
189 #else
190         return false;
191 #endif
192     }
193 }
194
195 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
196 {
197     const char *t = qemu_opt_get(opts, "thread");
198     if (t) {
199         if (strcmp(t, "multi") == 0) {
200             if (TCG_OVERSIZED_GUEST) {
201                 error_setg(errp, "No MTTCG when guest word size > hosts");
202             } else if (use_icount) {
203                 error_setg(errp, "No MTTCG when icount is enabled");
204             } else {
205                 if (!check_tcg_memory_orders_compatible()) {
206                     error_report("Guest expects a stronger memory ordering "
207                                  "than the host provides");
208                     error_printf("This may cause strange/hard to debug errors");
209                 }
210                 mttcg_enabled = true;
211             }
212         } else if (strcmp(t, "single") == 0) {
213             mttcg_enabled = false;
214         } else {
215             error_setg(errp, "Invalid 'thread' setting %s", t);
216         }
217     } else {
218         mttcg_enabled = default_mttcg_enabled();
219     }
220 }
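/*
 * The "thread" suboption parsed above is normally selected on the command
 * line, roughly as follows (exact syntax may vary between QEMU versions):
 *
 *   qemu-system-aarch64 -accel tcg,thread=multi  ...   # one thread per vCPU
 *   qemu-system-aarch64 -accel tcg,thread=single ...   # round-robin thread
 *
 * Omitting the suboption falls back to default_mttcg_enabled() above.
 */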
221
222 int64_t cpu_get_icount_raw(void)
223 {
224     int64_t icount;
225     CPUState *cpu = current_cpu;
226
227     icount = timers_state.qemu_icount;
228     if (cpu) {
229         if (!cpu->can_do_io) {
230             fprintf(stderr, "Bad icount read\n");
231             exit(1);
232         }
233         icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
234     }
235     return icount;
236 }
237
238 /* Return the virtual CPU time, based on the instruction counter.  */
239 static int64_t cpu_get_icount_locked(void)
240 {
241     int64_t icount = cpu_get_icount_raw();
242     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
243 }
244
245 int64_t cpu_get_icount(void)
246 {
247     int64_t icount;
248     unsigned start;
249
250     do {
251         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
252         icount = cpu_get_icount_locked();
253     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
254
255     return icount;
256 }
257
258 int64_t cpu_icount_to_ns(int64_t icount)
259 {
260     return icount << icount_time_shift;
261 }
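/*
 * Worked example of the shift above: with icount_time_shift == 3 each
 * emulated instruction accounts for 2^3 = 8 ns of virtual time, i.e. a
 * nominal 125 MIPS guest; at MAX_ICOUNT_SHIFT (10) an instruction costs
 * 1024 ns, which is roughly the 1 MIPS minimum mentioned earlier.
 */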
262
263 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
264  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
265  * counter.
266  *
267  * Caller must hold the BQL
268  */
269 int64_t cpu_get_ticks(void)
270 {
271     int64_t ticks;
272
273     if (use_icount) {
274         return cpu_get_icount();
275     }
276
277     ticks = timers_state.cpu_ticks_offset;
278     if (timers_state.cpu_ticks_enabled) {
279         ticks += cpu_get_host_ticks();
280     }
281
282     if (timers_state.cpu_ticks_prev > ticks) {
283         /* Note: non-increasing ticks may happen if the host uses
284            software suspend.  */
285         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
286         ticks = timers_state.cpu_ticks_prev;
287     }
288
289     timers_state.cpu_ticks_prev = ticks;
290     return ticks;
291 }
292
293 static int64_t cpu_get_clock_locked(void)
294 {
295     int64_t time;
296
297     time = timers_state.cpu_clock_offset;
298     if (timers_state.cpu_ticks_enabled) {
299         time += get_clock();
300     }
301
302     return time;
303 }
304
305 /* Return the monotonic time elapsed in VM, i.e.,
306  * the time between vm_start and vm_stop
307  */
308 int64_t cpu_get_clock(void)
309 {
310     int64_t ti;
311     unsigned start;
312
313     do {
314         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
315         ti = cpu_get_clock_locked();
316     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
317
318     return ti;
319 }
320
321 /* enable cpu_get_ticks()
322  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
323  */
324 void cpu_enable_ticks(void)
325 {
326     /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
327     seqlock_write_begin(&timers_state.vm_clock_seqlock);
328     if (!timers_state.cpu_ticks_enabled) {
329         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
330         timers_state.cpu_clock_offset -= get_clock();
331         timers_state.cpu_ticks_enabled = 1;
332     }
333     seqlock_write_end(&timers_state.vm_clock_seqlock);
334 }
335
336 /* disable cpu_get_ticks(): the clock is stopped. You must not call
337  * cpu_get_ticks() after that.
338  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
339  */
340 void cpu_disable_ticks(void)
341 {
342     /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
343     seqlock_write_begin(&timers_state.vm_clock_seqlock);
344     if (timers_state.cpu_ticks_enabled) {
345         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
346         timers_state.cpu_clock_offset = cpu_get_clock_locked();
347         timers_state.cpu_ticks_enabled = 0;
348     }
349     seqlock_write_end(&timers_state.vm_clock_seqlock);
350 }
351
352 /* Correlation between real and virtual time is always going to be
353    fairly approximate, so ignore small variation.
354    When the guest is idle, real and virtual time will be aligned in
355    the IO wait loop.  */
356 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
357
358 static void icount_adjust(void)
359 {
360     int64_t cur_time;
361     int64_t cur_icount;
362     int64_t delta;
363
364     /* Protected by TimersState mutex.  */
365     static int64_t last_delta;
366
367     /* If the VM is not running, then do nothing.  */
368     if (!runstate_is_running()) {
369         return;
370     }
371
372     seqlock_write_begin(&timers_state.vm_clock_seqlock);
373     cur_time = cpu_get_clock_locked();
374     cur_icount = cpu_get_icount_locked();
375
376     delta = cur_icount - cur_time;
377     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
378     if (delta > 0
379         && last_delta + ICOUNT_WOBBLE < delta * 2
380         && icount_time_shift > 0) {
381         /* The guest is getting too far ahead.  Slow time down.  */
382         icount_time_shift--;
383     }
384     if (delta < 0
385         && last_delta - ICOUNT_WOBBLE > delta * 2
386         && icount_time_shift < MAX_ICOUNT_SHIFT) {
387         /* The guest is getting too far behind.  Speed time up.  */
388         icount_time_shift++;
389     }
390     last_delta = delta;
391     timers_state.qemu_icount_bias = cur_icount
392                               - (timers_state.qemu_icount << icount_time_shift);
393     seqlock_write_end(&timers_state.vm_clock_seqlock);
394 }
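/*
 * Roughly speaking, the adjustment above is a coarse feedback loop: delta
 * measures how far the virtual clock has drifted from real time.  If
 * virtual time has pulled ahead by noticeably more than the ICOUNT_WOBBLE
 * tolerance (100 ms), icount_time_shift is decremented so each instruction
 * accounts for fewer nanoseconds and virtual time slows down; if it is
 * lagging, the shift is incremented, up to MAX_ICOUNT_SHIFT.  The bias is
 * then recomputed from the current instruction count so that
 * QEMU_CLOCK_VIRTUAL stays continuous across the change.
 */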
395
396 static void icount_adjust_rt(void *opaque)
397 {
398     timer_mod(icount_rt_timer,
399               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
400     icount_adjust();
401 }
402
403 static void icount_adjust_vm(void *opaque)
404 {
405     timer_mod(icount_vm_timer,
406                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
407                    NANOSECONDS_PER_SECOND / 10);
408     icount_adjust();
409 }
410
411 static int64_t qemu_icount_round(int64_t count)
412 {
413     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
414 }
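/*
 * qemu_icount_round() converts a nanosecond deadline into a whole number
 * of instructions, rounding up.  For example, with icount_time_shift == 3
 * a 20 ns deadline becomes (20 + 7) >> 3 = 3 instructions, so the vCPU
 * overshoots the deadline by less than one instruction's worth of time.
 */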
415
416 static void icount_warp_rt(void)
417 {
418     unsigned seq;
419     int64_t warp_start;
420
421     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
422      * changes from -1 to another value, so the race here is okay.
423      */
424     do {
425         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
426         warp_start = vm_clock_warp_start;
427     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
428
429     if (warp_start == -1) {
430         return;
431     }
432
433     seqlock_write_begin(&timers_state.vm_clock_seqlock);
434     if (runstate_is_running()) {
435         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
436                                      cpu_get_clock_locked());
437         int64_t warp_delta;
438
439         warp_delta = clock - vm_clock_warp_start;
440         if (use_icount == 2) {
441             /*
442              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
443              * far ahead of real time.
444              */
445             int64_t cur_icount = cpu_get_icount_locked();
446             int64_t delta = clock - cur_icount;
447             warp_delta = MIN(warp_delta, delta);
448         }
449         timers_state.qemu_icount_bias += warp_delta;
450     }
451     vm_clock_warp_start = -1;
452     seqlock_write_end(&timers_state.vm_clock_seqlock);
453
454     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
455         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
456     }
457 }
458
459 static void icount_timer_cb(void *opaque)
460 {
461     /* No need for a checkpoint because the timer already synchronizes
462      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
463      */
464     icount_warp_rt();
465 }
466
467 void qtest_clock_warp(int64_t dest)
468 {
469     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
470     AioContext *aio_context;
471     assert(qtest_enabled());
472     aio_context = qemu_get_aio_context();
473     while (clock < dest) {
474         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
475         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
476
477         seqlock_write_begin(&timers_state.vm_clock_seqlock);
478         timers_state.qemu_icount_bias += warp;
479         seqlock_write_end(&timers_state.vm_clock_seqlock);
480
481         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
482         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
483         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
484     }
485     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
486 }
487
488 void qemu_start_warp_timer(void)
489 {
490     int64_t clock;
491     int64_t deadline;
492
493     if (!use_icount) {
494         return;
495     }
496
497     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
498      * do not fire, so computing the deadline does not make sense.
499      */
500     if (!runstate_is_running()) {
501         return;
502     }
503
504     /* warp clock deterministically in record/replay mode */
505     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
506         return;
507     }
508
509     if (!all_cpu_threads_idle()) {
510         return;
511     }
512
513     if (qtest_enabled()) {
514         /* When testing, qtest commands advance icount.  */
515         return;
516     }
517
518     /* We want to use the earliest deadline from ALL vm_clocks */
519     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
520     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
521     if (deadline < 0) {
522         static bool notified;
523         if (!icount_sleep && !notified) {
524             error_report("WARNING: icount sleep disabled and no active timers");
525             notified = true;
526         }
527         return;
528     }
529
530     if (deadline > 0) {
531         /*
532          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
533          * sleep.  Otherwise, the CPU might be waiting for a future timer
534          * interrupt to wake it up, but the interrupt never comes because
535          * the vCPU isn't running any insns and thus doesn't advance the
536          * QEMU_CLOCK_VIRTUAL.
537          */
538         if (!icount_sleep) {
539             /*
540              * We never let VCPUs sleep in no-sleep icount mode.
541              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
542              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
543              * It is useful when we want a deterministic execution time,
544              * isolated from host latencies.
545              */
546             seqlock_write_begin(&timers_state.vm_clock_seqlock);
547             timers_state.qemu_icount_bias += deadline;
548             seqlock_write_end(&timers_state.vm_clock_seqlock);
549             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
550         } else {
551             /*
552              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
553              * "real" time, (related to the time left until the next event) has
554              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
555              * This avoids that the warps are visible externally; for example,
556              * you will not be sending network packets continuously instead of
557              * every 100ms.
558              */
559             seqlock_write_begin(&timers_state.vm_clock_seqlock);
560             if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
561                 vm_clock_warp_start = clock;
562             }
563             seqlock_write_end(&timers_state.vm_clock_seqlock);
564             timer_mod_anticipate(icount_warp_timer, clock + deadline);
565         }
566     } else if (deadline == 0) {
567         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
568     }
569 }
570
571 static void qemu_account_warp_timer(void)
572 {
573     if (!use_icount || !icount_sleep) {
574         return;
575     }
576
577     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
578      * do not fire, so computing the deadline does not make sense.
579      */
580     if (!runstate_is_running()) {
581         return;
582     }
583
584     /* warp clock deterministically in record/replay mode */
585     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
586         return;
587     }
588
589     timer_del(icount_warp_timer);
590     icount_warp_rt();
591 }
592
593 static bool icount_state_needed(void *opaque)
594 {
595     return use_icount;
596 }
597
598 /*
599  * This is a subsection for icount migration.
600  */
601 static const VMStateDescription icount_vmstate_timers = {
602     .name = "timer/icount",
603     .version_id = 1,
604     .minimum_version_id = 1,
605     .needed = icount_state_needed,
606     .fields = (VMStateField[]) {
607         VMSTATE_INT64(qemu_icount_bias, TimersState),
608         VMSTATE_INT64(qemu_icount, TimersState),
609         VMSTATE_END_OF_LIST()
610     }
611 };
612
613 static const VMStateDescription vmstate_timers = {
614     .name = "timer",
615     .version_id = 2,
616     .minimum_version_id = 1,
617     .fields = (VMStateField[]) {
618         VMSTATE_INT64(cpu_ticks_offset, TimersState),
619         VMSTATE_INT64(dummy, TimersState),
620         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
621         VMSTATE_END_OF_LIST()
622     },
623     .subsections = (const VMStateDescription*[]) {
624         &icount_vmstate_timers,
625         NULL
626     }
627 };
628
629 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
630 {
631     double pct;
632     double throttle_ratio;
633     long sleeptime_ns;
634
635     if (!cpu_throttle_get_percentage()) {
636         return;
637     }
638
639     pct = (double)cpu_throttle_get_percentage()/100;
640     throttle_ratio = pct / (1 - pct);
641     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
642
643     qemu_mutex_unlock_iothread();
644     atomic_set(&cpu->throttle_thread_scheduled, 0);
645     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
646     qemu_mutex_lock_iothread();
647 }
648
649 static void cpu_throttle_timer_tick(void *opaque)
650 {
651     CPUState *cpu;
652     double pct;
653
654     /* Stop the timer if needed */
655     if (!cpu_throttle_get_percentage()) {
656         return;
657     }
658     CPU_FOREACH(cpu) {
659         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
660             async_run_on_cpu(cpu, cpu_throttle_thread,
661                              RUN_ON_CPU_NULL);
662         }
663     }
664
665     pct = (double)cpu_throttle_get_percentage()/100;
666     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
667                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
668 }
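/*
 * Worked example of the throttling arithmetic: at a 50% throttle,
 * throttle_ratio = 0.5 / (1 - 0.5) = 1, so each vCPU sleeps 10 ms for
 * every 10 ms timeslice and the timer re-arms every 10 ms / (1 - 0.5)
 * = 20 ms.  At 99% (CPU_THROTTLE_PCT_MAX) the ratio is 99, giving a
 * 990 ms sleep per 10 ms of run time.
 */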
669
670 void cpu_throttle_set(int new_throttle_pct)
671 {
672     /* Ensure throttle percentage is within valid range */
673     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
674     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
675
676     atomic_set(&throttle_percentage, new_throttle_pct);
677
678     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
679                                        CPU_THROTTLE_TIMESLICE_NS);
680 }
681
682 void cpu_throttle_stop(void)
683 {
684     atomic_set(&throttle_percentage, 0);
685 }
686
687 bool cpu_throttle_active(void)
688 {
689     return (cpu_throttle_get_percentage() != 0);
690 }
691
692 int cpu_throttle_get_percentage(void)
693 {
694     return atomic_read(&throttle_percentage);
695 }
696
697 void cpu_ticks_init(void)
698 {
699     seqlock_init(&timers_state.vm_clock_seqlock);
700     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
701     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
702                                            cpu_throttle_timer_tick, NULL);
703 }
704
705 void configure_icount(QemuOpts *opts, Error **errp)
706 {
707     const char *option;
708     char *rem_str = NULL;
709
710     option = qemu_opt_get(opts, "shift");
711     if (!option) {
712         if (qemu_opt_get(opts, "align") != NULL) {
713             error_setg(errp, "Please specify shift option when using align");
714         }
715         return;
716     }
717
718     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
719     if (icount_sleep) {
720         icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
721                                          icount_timer_cb, NULL);
722     }
723
724     icount_align_option = qemu_opt_get_bool(opts, "align", false);
725
726     if (icount_align_option && !icount_sleep) {
727         error_setg(errp, "align=on and sleep=off are incompatible");
728     }
729     if (strcmp(option, "auto") != 0) {
730         errno = 0;
731         icount_time_shift = strtol(option, &rem_str, 0);
732         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
733             error_setg(errp, "icount: Invalid shift value");
734         }
735         use_icount = 1;
736         return;
737     } else if (icount_align_option) {
738         error_setg(errp, "shift=auto and align=on are incompatible");
739     } else if (!icount_sleep) {
740         error_setg(errp, "shift=auto and sleep=off are incompatible");
741     }
742
743     use_icount = 2;
744
745     /* 125MIPS seems a reasonable initial guess at the guest speed.
746        It will be corrected fairly quickly anyway.  */
747     icount_time_shift = 3;
748
749     /* Have both realtime and virtual time triggers for speed adjustment.
750        The realtime trigger catches emulated time passing too slowly,
751        the virtual time trigger catches emulated time passing too fast.
752        Realtime triggers occur even when idle, so use them less frequently
753        than VM triggers.  */
754     icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
755                                    icount_adjust_rt, NULL);
756     timer_mod(icount_rt_timer,
757                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
758     icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
759                                         icount_adjust_vm, NULL);
760     timer_mod(icount_vm_timer,
761                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
762                    NANOSECONDS_PER_SECOND / 10);
763 }
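/*
 * The options parsed above correspond to the -icount command line switch,
 * roughly as in the examples below (exact syntax may differ between
 * QEMU versions):
 *
 *   -icount shift=7               # fixed rate: 2^7 ns per instruction
 *   -icount shift=auto            # adaptive rate (use_icount == 2)
 *   -icount shift=7,sleep=off     # deterministic timing, no vCPU sleeping
 *   -icount shift=7,align=on      # align needs a fixed shift and sleep=on
 */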
764
765 /***********************************************************/
766 /* TCG vCPU kick timer
767  *
768  * The kick timer is responsible for moving single threaded vCPU
769  * emulation on to the next vCPU. If more than one vCPU is running a
770  * timer event with force a cpu->exit so the next vCPU can get
771  * scheduled.
772  *
773  * The timer is removed if all vCPUs are idle and restarted again once
774  * idleness is complete.
775  */
776
777 static QEMUTimer *tcg_kick_vcpu_timer;
778 static CPUState *tcg_current_rr_cpu;
779
780 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
781
782 static inline int64_t qemu_tcg_next_kick(void)
783 {
784     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
785 }
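/*
 * TCG_KICK_PERIOD is NANOSECONDS_PER_SECOND / 10, i.e. 100 ms, so in
 * round-robin mode (and only when more than one vCPU exists, see
 * start_tcg_kick_timer below) no single vCPU can monopolise the shared
 * thread for much longer than 100 ms of QEMU_CLOCK_VIRTUAL time before
 * being kicked.
 */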
786
787 /* Kick the currently round-robin scheduled vCPU */
788 static void qemu_cpu_kick_rr_cpu(void)
789 {
790     CPUState *cpu;
791     do {
792         cpu = atomic_mb_read(&tcg_current_rr_cpu);
793         if (cpu) {
794             cpu_exit(cpu);
795         }
796     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
797 }
798
799 static void kick_tcg_thread(void *opaque)
800 {
801     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
802     qemu_cpu_kick_rr_cpu();
803 }
804
805 static void start_tcg_kick_timer(void)
806 {
807     if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
808         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
809                                            kick_tcg_thread, NULL);
810         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
811     }
812 }
813
814 static void stop_tcg_kick_timer(void)
815 {
816     if (tcg_kick_vcpu_timer) {
817         timer_del(tcg_kick_vcpu_timer);
818         tcg_kick_vcpu_timer = NULL;
819     }
820 }
821
822 /***********************************************************/
823 void hw_error(const char *fmt, ...)
824 {
825     va_list ap;
826     CPUState *cpu;
827
828     va_start(ap, fmt);
829     fprintf(stderr, "qemu: hardware error: ");
830     vfprintf(stderr, fmt, ap);
831     fprintf(stderr, "\n");
832     CPU_FOREACH(cpu) {
833         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
834         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
835     }
836     va_end(ap);
837     abort();
838 }
839
840 void cpu_synchronize_all_states(void)
841 {
842     CPUState *cpu;
843
844     CPU_FOREACH(cpu) {
845         cpu_synchronize_state(cpu);
846     }
847 }
848
849 void cpu_synchronize_all_post_reset(void)
850 {
851     CPUState *cpu;
852
853     CPU_FOREACH(cpu) {
854         cpu_synchronize_post_reset(cpu);
855     }
856 }
857
858 void cpu_synchronize_all_post_init(void)
859 {
860     CPUState *cpu;
861
862     CPU_FOREACH(cpu) {
863         cpu_synchronize_post_init(cpu);
864     }
865 }
866
867 static int do_vm_stop(RunState state)
868 {
869     int ret = 0;
870
871     if (runstate_is_running()) {
872         cpu_disable_ticks();
873         pause_all_vcpus();
874         runstate_set(state);
875         vm_state_notify(0, state);
876         qapi_event_send_stop(&error_abort);
877     }
878
879     bdrv_drain_all();
880     replay_disable_events();
881     ret = bdrv_flush_all();
882
883     return ret;
884 }
885
886 static bool cpu_can_run(CPUState *cpu)
887 {
888     if (cpu->stop) {
889         return false;
890     }
891     if (cpu_is_stopped(cpu)) {
892         return false;
893     }
894     return true;
895 }
896
897 static void cpu_handle_guest_debug(CPUState *cpu)
898 {
899     gdb_set_stop_cpu(cpu);
900     qemu_system_debug_request();
901     cpu->stopped = true;
902 }
903
904 #ifdef CONFIG_LINUX
905 static void sigbus_reraise(void)
906 {
907     sigset_t set;
908     struct sigaction action;
909
910     memset(&action, 0, sizeof(action));
911     action.sa_handler = SIG_DFL;
912     if (!sigaction(SIGBUS, &action, NULL)) {
913         raise(SIGBUS);
914         sigemptyset(&set);
915         sigaddset(&set, SIGBUS);
916         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
917     }
918     perror("Failed to re-raise SIGBUS!\n");
919     abort();
920 }
921
922 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
923 {
924     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
925         sigbus_reraise();
926     }
927
928     if (current_cpu) {
929         /* Called asynchronously in VCPU thread.  */
930         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
931             sigbus_reraise();
932         }
933     } else {
934         /* Called synchronously (via signalfd) in main thread.  */
935         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
936             sigbus_reraise();
937         }
938     }
939 }
940
941 static void qemu_init_sigbus(void)
942 {
943     struct sigaction action;
944
945     memset(&action, 0, sizeof(action));
946     action.sa_flags = SA_SIGINFO;
947     action.sa_sigaction = sigbus_handler;
948     sigaction(SIGBUS, &action, NULL);
949
950     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
951 }
952 #else /* !CONFIG_LINUX */
953 static void qemu_init_sigbus(void)
954 {
955 }
956 #endif /* !CONFIG_LINUX */
957
958 static QemuMutex qemu_global_mutex;
959
960 static QemuThread io_thread;
961
962 /* cpu creation */
963 static QemuCond qemu_cpu_cond;
964 /* system init */
965 static QemuCond qemu_pause_cond;
966
967 void qemu_init_cpu_loop(void)
968 {
969     qemu_init_sigbus();
970     qemu_cond_init(&qemu_cpu_cond);
971     qemu_cond_init(&qemu_pause_cond);
972     qemu_mutex_init(&qemu_global_mutex);
973
974     qemu_thread_get_self(&io_thread);
975 }
976
977 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
978 {
979     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
980 }
981
982 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
983 {
984     if (kvm_destroy_vcpu(cpu) < 0) {
985         error_report("kvm_destroy_vcpu failed");
986         exit(EXIT_FAILURE);
987     }
988 }
989
990 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
991 {
992 }
993
994 static void qemu_wait_io_event_common(CPUState *cpu)
995 {
996     atomic_mb_set(&cpu->thread_kicked, false);
997     if (cpu->stop) {
998         cpu->stop = false;
999         cpu->stopped = true;
1000         qemu_cond_broadcast(&qemu_pause_cond);
1001     }
1002     process_queued_cpu_work(cpu);
1003 }
1004
1005 static bool qemu_tcg_should_sleep(CPUState *cpu)
1006 {
1007     if (mttcg_enabled) {
1008         return cpu_thread_is_idle(cpu);
1009     } else {
1010         return all_cpu_threads_idle();
1011     }
1012 }
1013
1014 static void qemu_tcg_wait_io_event(CPUState *cpu)
1015 {
1016     while (qemu_tcg_should_sleep(cpu)) {
1017         stop_tcg_kick_timer();
1018         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1019     }
1020
1021     start_tcg_kick_timer();
1022
1023     qemu_wait_io_event_common(cpu);
1024 }
1025
1026 static void qemu_kvm_wait_io_event(CPUState *cpu)
1027 {
1028     while (cpu_thread_is_idle(cpu)) {
1029         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1030     }
1031
1032     qemu_wait_io_event_common(cpu);
1033 }
1034
1035 static void *qemu_kvm_cpu_thread_fn(void *arg)
1036 {
1037     CPUState *cpu = arg;
1038     int r;
1039
1040     rcu_register_thread();
1041
1042     qemu_mutex_lock_iothread();
1043     qemu_thread_get_self(cpu->thread);
1044     cpu->thread_id = qemu_get_thread_id();
1045     cpu->can_do_io = 1;
1046     current_cpu = cpu;
1047
1048     r = kvm_init_vcpu(cpu);
1049     if (r < 0) {
1050         fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1051         exit(1);
1052     }
1053
1054     kvm_init_cpu_signals(cpu);
1055
1056     /* signal CPU creation */
1057     cpu->created = true;
1058     qemu_cond_signal(&qemu_cpu_cond);
1059
1060     do {
1061         if (cpu_can_run(cpu)) {
1062             r = kvm_cpu_exec(cpu);
1063             if (r == EXCP_DEBUG) {
1064                 cpu_handle_guest_debug(cpu);
1065             }
1066         }
1067         qemu_kvm_wait_io_event(cpu);
1068     } while (!cpu->unplug || cpu_can_run(cpu));
1069
1070     qemu_kvm_destroy_vcpu(cpu);
1071     cpu->created = false;
1072     qemu_cond_signal(&qemu_cpu_cond);
1073     qemu_mutex_unlock_iothread();
1074     return NULL;
1075 }
1076
1077 static void *qemu_dummy_cpu_thread_fn(void *arg)
1078 {
1079 #ifdef _WIN32
1080     fprintf(stderr, "qtest is not supported under Windows\n");
1081     exit(1);
1082 #else
1083     CPUState *cpu = arg;
1084     sigset_t waitset;
1085     int r;
1086
1087     rcu_register_thread();
1088
1089     qemu_mutex_lock_iothread();
1090     qemu_thread_get_self(cpu->thread);
1091     cpu->thread_id = qemu_get_thread_id();
1092     cpu->can_do_io = 1;
1093     current_cpu = cpu;
1094
1095     sigemptyset(&waitset);
1096     sigaddset(&waitset, SIG_IPI);
1097
1098     /* signal CPU creation */
1099     cpu->created = true;
1100     qemu_cond_signal(&qemu_cpu_cond);
1101
1102     while (1) {
1103         qemu_mutex_unlock_iothread();
1104         do {
1105             int sig;
1106             r = sigwait(&waitset, &sig);
1107         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1108         if (r == -1) {
1109             perror("sigwait");
1110             exit(1);
1111         }
1112         qemu_mutex_lock_iothread();
1113         qemu_wait_io_event_common(cpu);
1114     }
1115
1116     return NULL;
1117 #endif
1118 }
1119
1120 static int64_t tcg_get_icount_limit(void)
1121 {
1122     int64_t deadline;
1123
1124     if (replay_mode != REPLAY_MODE_PLAY) {
1125         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1126
1127         /* Maintain prior (possibly buggy) behaviour where if no deadline
1128          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1129          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1130          * nanoseconds.
1131          */
1132         if ((deadline < 0) || (deadline > INT32_MAX)) {
1133             deadline = INT32_MAX;
1134         }
1135
1136         return qemu_icount_round(deadline);
1137     } else {
1138         return replay_get_instructions();
1139     }
1140 }
1141
1142 static void handle_icount_deadline(void)
1143 {
1144     if (use_icount) {
1145         int64_t deadline =
1146             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1147
1148         if (deadline == 0) {
1149             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1150         }
1151     }
1152 }
1153
1154 static int tcg_cpu_exec(CPUState *cpu)
1155 {
1156     int ret;
1157 #ifdef CONFIG_PROFILER
1158     int64_t ti;
1159 #endif
1160
1161 #ifdef CONFIG_PROFILER
1162     ti = profile_getclock();
1163 #endif
1164     if (use_icount) {
1165         int64_t count;
1166         int decr;
1167         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1168                                     + cpu->icount_extra);
1169         cpu->icount_decr.u16.low = 0;
1170         cpu->icount_extra = 0;
1171         count = tcg_get_icount_limit();
1172         timers_state.qemu_icount += count;
1173         decr = (count > 0xffff) ? 0xffff : count;
1174         count -= decr;
1175         cpu->icount_decr.u16.low = decr;
1176         cpu->icount_extra = count;
1177     }
1178     qemu_mutex_unlock_iothread();
1179     cpu_exec_start(cpu);
1180     ret = cpu_exec(cpu);
1181     cpu_exec_end(cpu);
1182     qemu_mutex_lock_iothread();
1183 #ifdef CONFIG_PROFILER
1184     tcg_time += profile_getclock() - ti;
1185 #endif
1186     if (use_icount) {
1187         /* Fold pending instructions back into the
1188            instruction counter, and clear the interrupt flag.  */
1189         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1190                         + cpu->icount_extra);
1191         cpu->icount_decr.u32 = 0;
1192         cpu->icount_extra = 0;
1193         replay_account_executed_instructions();
1194     }
1195     return ret;
1196 }
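/*
 * The instruction budget handed to the translated code above is split
 * between the 16-bit icount_decr.u16.low field and icount_extra.  For
 * example, a limit of 100000 instructions becomes decr = 0xffff (65535)
 * and icount_extra = 34465; when the low counter runs out, the execution
 * loop can refill it from icount_extra, so long deadlines still work
 * with a 16-bit in-TB counter.
 */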
1197
1198 /* Destroy any remaining vCPUs which have been unplugged and have
1199  * finished running
1200  */
1201 static void deal_with_unplugged_cpus(void)
1202 {
1203     CPUState *cpu;
1204
1205     CPU_FOREACH(cpu) {
1206         if (cpu->unplug && !cpu_can_run(cpu)) {
1207             qemu_tcg_destroy_vcpu(cpu);
1208             cpu->created = false;
1209             qemu_cond_signal(&qemu_cpu_cond);
1210             break;
1211         }
1212     }
1213 }
1214
1215 /* Single-threaded TCG
1216  *
1217  * In the single-threaded case each vCPU is simulated in turn. If
1218  * there is more than a single vCPU we create a simple timer to kick
1219  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1220  * This is done explicitly rather than relying on side-effects
1221  * elsewhere.
1222  */
1223
1224 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1225 {
1226     CPUState *cpu = arg;
1227
1228     rcu_register_thread();
1229
1230     qemu_mutex_lock_iothread();
1231     qemu_thread_get_self(cpu->thread);
1232
1233     CPU_FOREACH(cpu) {
1234         cpu->thread_id = qemu_get_thread_id();
1235         cpu->created = true;
1236         cpu->can_do_io = 1;
1237     }
1238     qemu_cond_signal(&qemu_cpu_cond);
1239
1240     /* wait for initial kick-off after machine start */
1241     while (first_cpu->stopped) {
1242         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1243
1244         /* process any pending work */
1245         CPU_FOREACH(cpu) {
1246             current_cpu = cpu;
1247             qemu_wait_io_event_common(cpu);
1248         }
1249     }
1250
1251     start_tcg_kick_timer();
1252
1253     cpu = first_cpu;
1254
1255     /* process any pending work */
1256     cpu->exit_request = 1;
1257
1258     while (1) {
1259         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1260         qemu_account_warp_timer();
1261
1262         if (!cpu) {
1263             cpu = first_cpu;
1264         }
1265
1266         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1267
1268             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1269             current_cpu = cpu;
1270
1271             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1272                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1273
1274             if (cpu_can_run(cpu)) {
1275                 int r;
1276                 r = tcg_cpu_exec(cpu);
1277                 if (r == EXCP_DEBUG) {
1278                     cpu_handle_guest_debug(cpu);
1279                     break;
1280                 } else if (r == EXCP_ATOMIC) {
1281                     qemu_mutex_unlock_iothread();
1282                     cpu_exec_step_atomic(cpu);
1283                     qemu_mutex_lock_iothread();
1284                     break;
1285                 }
1286             } else if (cpu->stop) {
1287                 if (cpu->unplug) {
1288                     cpu = CPU_NEXT(cpu);
1289                 }
1290                 break;
1291             }
1292
1293             cpu = CPU_NEXT(cpu);
1294         } /* while (cpu && !cpu->exit_request).. */
1295
1296         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1297         atomic_set(&tcg_current_rr_cpu, NULL);
1298
1299         if (cpu && cpu->exit_request) {
1300             atomic_mb_set(&cpu->exit_request, 0);
1301         }
1302
1303         handle_icount_deadline();
1304
1305         qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1306         deal_with_unplugged_cpus();
1307     }
1308
1309     return NULL;
1310 }
1311
1312 static void *qemu_hax_cpu_thread_fn(void *arg)
1313 {
1314     CPUState *cpu = arg;
1315     int r;
1316     qemu_thread_get_self(cpu->thread);
1317     qemu_mutex_lock(&qemu_global_mutex);
1318
1319     cpu->thread_id = qemu_get_thread_id();
1320     cpu->created = true;
1321     cpu->halted = 0;
1322     current_cpu = cpu;
1323
1324     hax_init_vcpu(cpu);
1325     qemu_cond_signal(&qemu_cpu_cond);
1326
1327     while (1) {
1328         if (cpu_can_run(cpu)) {
1329             r = hax_smp_cpu_exec(cpu);
1330             if (r == EXCP_DEBUG) {
1331                 cpu_handle_guest_debug(cpu);
1332             }
1333         }
1334
1335         while (cpu_thread_is_idle(cpu)) {
1336             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1337         }
1338 #ifdef _WIN32
1339         SleepEx(0, TRUE);
1340 #endif
1341         qemu_wait_io_event_common(cpu);
1342     }
1343     return NULL;
1344 }
1345
1346 #ifdef _WIN32
1347 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1348 {
1349 }
1350 #endif
1351
1352 /* Multi-threaded TCG
1353  *
1354  * In the multi-threaded case each vCPU has its own thread. The TLS
1355  * variable current_cpu can be used deep in the code to find the
1356  * current CPUState for a given thread.
1357  */
1358
1359 static void *qemu_tcg_cpu_thread_fn(void *arg)
1360 {
1361     CPUState *cpu = arg;
1362
1363     rcu_register_thread();
1364
1365     qemu_mutex_lock_iothread();
1366     qemu_thread_get_self(cpu->thread);
1367
1368     cpu->thread_id = qemu_get_thread_id();
1369     cpu->created = true;
1370     cpu->can_do_io = 1;
1371     current_cpu = cpu;
1372     qemu_cond_signal(&qemu_cpu_cond);
1373
1374     /* process any pending work */
1375     cpu->exit_request = 1;
1376
1377     while (1) {
1378         if (cpu_can_run(cpu)) {
1379             int r;
1380             r = tcg_cpu_exec(cpu);
1381             switch (r) {
1382             case EXCP_DEBUG:
1383                 cpu_handle_guest_debug(cpu);
1384                 break;
1385             case EXCP_HALTED:
1386                 /* during start-up the vCPU is reset and the thread is
1387                  * kicked several times. If we don't ensure we go back
1388                  * to sleep in the halted state, we won't cleanly
1389                  * start up when the vCPU is enabled.
1390                  *
1391                  * cpu->halted should ensure we sleep in wait_io_event
1392                  */
1393                 g_assert(cpu->halted);
1394                 break;
1395             case EXCP_ATOMIC:
1396                 qemu_mutex_unlock_iothread();
1397                 cpu_exec_step_atomic(cpu);
1398                 qemu_mutex_lock_iothread();
1399             default:
1400                 /* Ignore everything else? */
1401                 break;
1402             }
1403         }
1404
1405         handle_icount_deadline();
1406
1407         atomic_mb_set(&cpu->exit_request, 0);
1408         qemu_tcg_wait_io_event(cpu);
1409     }
1410
1411     return NULL;
1412 }
1413
1414 static void qemu_cpu_kick_thread(CPUState *cpu)
1415 {
1416 #ifndef _WIN32
1417     int err;
1418
1419     if (cpu->thread_kicked) {
1420         return;
1421     }
1422     cpu->thread_kicked = true;
1423     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1424     if (err) {
1425         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1426         exit(1);
1427     }
1428 #else /* _WIN32 */
1429     if (!qemu_cpu_is_self(cpu)) {
1430         if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1431             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1432                     __func__, GetLastError());
1433             exit(1);
1434         }
1435     }
1436 #endif
1437 }
1438
1439 void qemu_cpu_kick(CPUState *cpu)
1440 {
1441     qemu_cond_broadcast(cpu->halt_cond);
1442     if (tcg_enabled()) {
1443         cpu_exit(cpu);
1444         /* NOP unless doing single-thread RR */
1445         qemu_cpu_kick_rr_cpu();
1446     } else {
1447         if (hax_enabled()) {
1448             /*
1449              * FIXME: race condition with the exit_request check in
1450              * hax_vcpu_hax_exec
1451              */
1452             cpu->exit_request = 1;
1453         }
1454         qemu_cpu_kick_thread(cpu);
1455     }
1456 }
1457
1458 void qemu_cpu_kick_self(void)
1459 {
1460     assert(current_cpu);
1461     qemu_cpu_kick_thread(current_cpu);
1462 }
1463
1464 bool qemu_cpu_is_self(CPUState *cpu)
1465 {
1466     return qemu_thread_is_self(cpu->thread);
1467 }
1468
1469 bool qemu_in_vcpu_thread(void)
1470 {
1471     return current_cpu && qemu_cpu_is_self(current_cpu);
1472 }
1473
1474 static __thread bool iothread_locked = false;
1475
1476 bool qemu_mutex_iothread_locked(void)
1477 {
1478     return iothread_locked;
1479 }
1480
1481 void qemu_mutex_lock_iothread(void)
1482 {
1483     g_assert(!qemu_mutex_iothread_locked());
1484     qemu_mutex_lock(&qemu_global_mutex);
1485     iothread_locked = true;
1486 }
1487
1488 void qemu_mutex_unlock_iothread(void)
1489 {
1490     g_assert(qemu_mutex_iothread_locked());
1491     iothread_locked = false;
1492     qemu_mutex_unlock(&qemu_global_mutex);
1493 }
1494
1495 static bool all_vcpus_paused(void)
1496 {
1497     CPUState *cpu;
1498
1499     CPU_FOREACH(cpu) {
1500         if (!cpu->stopped) {
1501             return false;
1502         }
1503     }
1504
1505     return true;
1506 }
1507
1508 void pause_all_vcpus(void)
1509 {
1510     CPUState *cpu;
1511
1512     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1513     CPU_FOREACH(cpu) {
1514         cpu->stop = true;
1515         qemu_cpu_kick(cpu);
1516     }
1517
1518     if (qemu_in_vcpu_thread()) {
1519         cpu_stop_current();
1520     }
1521
1522     while (!all_vcpus_paused()) {
1523         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1524         CPU_FOREACH(cpu) {
1525             qemu_cpu_kick(cpu);
1526         }
1527     }
1528 }
1529
1530 void cpu_resume(CPUState *cpu)
1531 {
1532     cpu->stop = false;
1533     cpu->stopped = false;
1534     qemu_cpu_kick(cpu);
1535 }
1536
1537 void resume_all_vcpus(void)
1538 {
1539     CPUState *cpu;
1540
1541     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1542     CPU_FOREACH(cpu) {
1543         cpu_resume(cpu);
1544     }
1545 }
1546
1547 void cpu_remove(CPUState *cpu)
1548 {
1549     cpu->stop = true;
1550     cpu->unplug = true;
1551     qemu_cpu_kick(cpu);
1552 }
1553
1554 void cpu_remove_sync(CPUState *cpu)
1555 {
1556     cpu_remove(cpu);
1557     while (cpu->created) {
1558         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1559     }
1560 }
1561
1562 /* Size of temporary buffers used for forming a thread name */
1563 #define VCPU_THREAD_NAME_SIZE 16
1564
1565 static void qemu_tcg_init_vcpu(CPUState *cpu)
1566 {
1567     char thread_name[VCPU_THREAD_NAME_SIZE];
1568     static QemuCond *single_tcg_halt_cond;
1569     static QemuThread *single_tcg_cpu_thread;
1570
1571     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1572         cpu->thread = g_malloc0(sizeof(QemuThread));
1573         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1574         qemu_cond_init(cpu->halt_cond);
1575
1576         if (qemu_tcg_mttcg_enabled()) {
1577             /* create a thread per vCPU with TCG (MTTCG) */
1578             parallel_cpus = true;
1579             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1580                  cpu->cpu_index);
1581
1582             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1583                                cpu, QEMU_THREAD_JOINABLE);
1584
1585         } else {
1586             /* share a single thread for all cpus with TCG */
1587             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1588             qemu_thread_create(cpu->thread, thread_name,
1589                                qemu_tcg_rr_cpu_thread_fn,
1590                                cpu, QEMU_THREAD_JOINABLE);
1591
1592             single_tcg_halt_cond = cpu->halt_cond;
1593             single_tcg_cpu_thread = cpu->thread;
1594         }
1595 #ifdef _WIN32
1596         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1597 #endif
1598         while (!cpu->created) {
1599             qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1600         }
1601     } else {
1602         /* For non-MTTCG cases we share the thread */
1603         cpu->thread = single_tcg_cpu_thread;
1604         cpu->halt_cond = single_tcg_halt_cond;
1605     }
1606 }
1607
1608 static void qemu_hax_start_vcpu(CPUState *cpu)
1609 {
1610     char thread_name[VCPU_THREAD_NAME_SIZE];
1611
1612     cpu->thread = g_malloc0(sizeof(QemuThread));
1613     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1614     qemu_cond_init(cpu->halt_cond);
1615
1616     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1617              cpu->cpu_index);
1618     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1619                        cpu, QEMU_THREAD_JOINABLE);
1620 #ifdef _WIN32
1621     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1622 #endif
1623     while (!cpu->created) {
1624         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1625     }
1626 }
1627
1628 static void qemu_kvm_start_vcpu(CPUState *cpu)
1629 {
1630     char thread_name[VCPU_THREAD_NAME_SIZE];
1631
1632     cpu->thread = g_malloc0(sizeof(QemuThread));
1633     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1634     qemu_cond_init(cpu->halt_cond);
1635     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1636              cpu->cpu_index);
1637     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1638                        cpu, QEMU_THREAD_JOINABLE);
1639     while (!cpu->created) {
1640         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1641     }
1642 }
1643
1644 static void qemu_dummy_start_vcpu(CPUState *cpu)
1645 {
1646     char thread_name[VCPU_THREAD_NAME_SIZE];
1647
1648     cpu->thread = g_malloc0(sizeof(QemuThread));
1649     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1650     qemu_cond_init(cpu->halt_cond);
1651     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1652              cpu->cpu_index);
1653     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1654                        QEMU_THREAD_JOINABLE);
1655     while (!cpu->created) {
1656         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1657     }
1658 }
1659
1660 void qemu_init_vcpu(CPUState *cpu)
1661 {
1662     cpu->nr_cores = smp_cores;
1663     cpu->nr_threads = smp_threads;
1664     cpu->stopped = true;
1665
1666     if (!cpu->as) {
1667         /* If the target cpu hasn't set up any address spaces itself,
1668          * give it the default one.
1669          */
1670         AddressSpace *as = address_space_init_shareable(cpu->memory,
1671                                                         "cpu-memory");
1672         cpu->num_ases = 1;
1673         cpu_address_space_init(cpu, as, 0);
1674     }
1675
1676     if (kvm_enabled()) {
1677         qemu_kvm_start_vcpu(cpu);
1678     } else if (hax_enabled()) {
1679         qemu_hax_start_vcpu(cpu);
1680     } else if (tcg_enabled()) {
1681         qemu_tcg_init_vcpu(cpu);
1682     } else {
1683         qemu_dummy_start_vcpu(cpu);
1684     }
1685 }
1686
1687 void cpu_stop_current(void)
1688 {
1689     if (current_cpu) {
1690         current_cpu->stop = false;
1691         current_cpu->stopped = true;
1692         cpu_exit(current_cpu);
1693         qemu_cond_broadcast(&qemu_pause_cond);
1694     }
1695 }
1696
1697 int vm_stop(RunState state)
1698 {
1699     if (qemu_in_vcpu_thread()) {
1700         qemu_system_vmstop_request_prepare();
1701         qemu_system_vmstop_request(state);
1702         /*
1703          * FIXME: should not return to device code in case
1704          * vm_stop() has been requested.
1705          */
1706         cpu_stop_current();
1707         return 0;
1708     }
1709
1710     return do_vm_stop(state);
1711 }
1712
1713 /**
1714  * Prepare for (re)starting the VM.
1715  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1716  * running or in case of an error condition), 0 otherwise.
1717  */
1718 int vm_prepare_start(void)
1719 {
1720     RunState requested;
1721     int res = 0;
1722
1723     qemu_vmstop_requested(&requested);
1724     if (runstate_is_running() && requested == RUN_STATE__MAX) {
1725         return -1;
1726     }
1727
1728     /* Ensure that a STOP/RESUME pair of events is emitted if a
1729      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
1730      * example, is documented to always be followed by the STOP
1731      * event.
1732      */
1733     if (runstate_is_running()) {
1734         qapi_event_send_stop(&error_abort);
1735         res = -1;
1736     } else {
1737         replay_enable_events();
1738         cpu_enable_ticks();
1739         runstate_set(RUN_STATE_RUNNING);
1740         vm_state_notify(1, RUN_STATE_RUNNING);
1741     }
1742
1743     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
1744     qapi_event_send_resume(&error_abort);
1745     return res;
1746 }
1747
1748 void vm_start(void)
1749 {
1750     if (!vm_prepare_start()) {
1751         resume_all_vcpus();
1752     }
1753 }
1754
1755 /* Does a state transition even if the VM is already stopped;
1756    the current state is forgotten forever. */
1757 int vm_stop_force_state(RunState state)
1758 {
1759     if (runstate_is_running()) {
1760         return vm_stop(state);
1761     } else {
1762         runstate_set(state);
1763
1764         bdrv_drain_all();
1765         /* Make sure to return an error if the flush in a previous vm_stop()
1766          * failed. */
1767         return bdrv_flush_all();
1768     }
1769 }
1770
1771 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1772 {
1773     /* XXX: implement xxx_cpu_list for targets that still lack it */
1774 #if defined(cpu_list)
1775     cpu_list(f, cpu_fprintf);
1776 #endif
1777 }
1778
1779 CpuInfoList *qmp_query_cpus(Error **errp)
1780 {
1781     CpuInfoList *head = NULL, *cur_item = NULL;
1782     CPUState *cpu;
1783
1784     CPU_FOREACH(cpu) {
1785         CpuInfoList *info;
1786 #if defined(TARGET_I386)
1787         X86CPU *x86_cpu = X86_CPU(cpu);
1788         CPUX86State *env = &x86_cpu->env;
1789 #elif defined(TARGET_PPC)
1790         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1791         CPUPPCState *env = &ppc_cpu->env;
1792 #elif defined(TARGET_SPARC)
1793         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1794         CPUSPARCState *env = &sparc_cpu->env;
1795 #elif defined(TARGET_MIPS)
1796         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1797         CPUMIPSState *env = &mips_cpu->env;
1798 #elif defined(TARGET_TRICORE)
1799         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1800         CPUTriCoreState *env = &tricore_cpu->env;
1801 #endif
1802
1803         cpu_synchronize_state(cpu);
1804
1805         info = g_malloc0(sizeof(*info));
1806         info->value = g_malloc0(sizeof(*info->value));
1807         info->value->CPU = cpu->cpu_index;
1808         info->value->current = (cpu == first_cpu);
1809         info->value->halted = cpu->halted;
1810         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1811         info->value->thread_id = cpu->thread_id;
1812 #if defined(TARGET_I386)
1813         info->value->arch = CPU_INFO_ARCH_X86;
1814         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1815 #elif defined(TARGET_PPC)
1816         info->value->arch = CPU_INFO_ARCH_PPC;
1817         info->value->u.ppc.nip = env->nip;
1818 #elif defined(TARGET_SPARC)
1819         info->value->arch = CPU_INFO_ARCH_SPARC;
1820         info->value->u.q_sparc.pc = env->pc;
1821         info->value->u.q_sparc.npc = env->npc;
1822 #elif defined(TARGET_MIPS)
1823         info->value->arch = CPU_INFO_ARCH_MIPS;
1824         info->value->u.q_mips.PC = env->active_tc.PC;
1825 #elif defined(TARGET_TRICORE)
1826         info->value->arch = CPU_INFO_ARCH_TRICORE;
1827         info->value->u.tricore.PC = env->PC;
1828 #else
1829         info->value->arch = CPU_INFO_ARCH_OTHER;
1830 #endif
1831
1832         /* XXX: waiting for the qapi to support GSList */
1833         if (!cur_item) {
1834             head = cur_item = info;
1835         } else {
1836             cur_item->next = info;
1837             cur_item = info;
1838         }
1839     }
1840
1841     return head;
1842 }
1843
1844 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1845                  bool has_cpu, int64_t cpu_index, Error **errp)
1846 {
1847     FILE *f;
1848     uint32_t l;
1849     CPUState *cpu;
1850     uint8_t buf[1024];
1851     int64_t orig_addr = addr, orig_size = size;
1852
1853     if (!has_cpu) {
1854         cpu_index = 0;
1855     }
1856
1857     cpu = qemu_get_cpu(cpu_index);
1858     if (cpu == NULL) {
1859         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1860                    "a CPU number");
1861         return;
1862     }
1863
1864     f = fopen(filename, "wb");
1865     if (!f) {
1866         error_setg_file_open(errp, errno, filename);
1867         return;
1868     }
1869
1870     while (size != 0) {
1871         l = sizeof(buf);
1872         if (l > size)
1873             l = size;
1874         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1875             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1876                              " specified", orig_addr, orig_size);
1877             goto exit;
1878         }
1879         if (fwrite(buf, 1, l, f) != l) {
1880             error_setg(errp, QERR_IO_ERROR);
1881             goto exit;
1882         }
1883         addr += l;
1884         size -= l;
1885     }
1886
1887 exit:
1888     fclose(f);
1889 }
1890
1891 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1892                   Error **errp)
1893 {
1894     FILE *f;
1895     uint32_t l;
1896     uint8_t buf[1024];
1897
1898     f = fopen(filename, "wb");
1899     if (!f) {
1900         error_setg_file_open(errp, errno, filename);
1901         return;
1902     }
1903
1904     while (size != 0) {
1905         l = sizeof(buf);
1906         if (l > size)
1907             l = size;
1908         cpu_physical_memory_read(addr, buf, l);
1909         if (fwrite(buf, 1, l, f) != l) {
1910             error_setg(errp, QERR_IO_ERROR);
1911             goto exit;
1912         }
1913         addr += l;
1914         size -= l;
1915     }
1916
1917 exit:
1918     fclose(f);
1919 }
1920
1921 void qmp_inject_nmi(Error **errp)
1922 {
1923     nmi_monitor_handle(monitor_get_cpu_index(), errp);
1924 }
1925
1926 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1927 {
1928     if (!use_icount) {
1929         return;
1930     }
1931
1932     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
1933                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1934     if (icount_align_option) {
1935         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
1936         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
1937     } else {
1938         cpu_fprintf(f, "Max guest delay     NA\n");
1939         cpu_fprintf(f, "Max guest advance   NA\n");
1940     }
1941 }