[qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53 #include "hw/boards.h"
54
55 #ifdef CONFIG_LINUX
56
57 #include <sys/prctl.h>
58
59 #ifndef PR_MCE_KILL
60 #define PR_MCE_KILL 33
61 #endif
62
63 #ifndef PR_MCE_KILL_SET
64 #define PR_MCE_KILL_SET 1
65 #endif
66
67 #ifndef PR_MCE_KILL_EARLY
68 #define PR_MCE_KILL_EARLY 1
69 #endif
70
71 #endif /* CONFIG_LINUX */
72
73 int64_t max_delay;
74 int64_t max_advance;
75
76 /* vcpu throttling controls */
77 static QEMUTimer *throttle_timer;
78 static unsigned int throttle_percentage;
79
80 #define CPU_THROTTLE_PCT_MIN 1
81 #define CPU_THROTTLE_PCT_MAX 99
82 #define CPU_THROTTLE_TIMESLICE_NS 10000000
83
84 bool cpu_is_stopped(CPUState *cpu)
85 {
86     return cpu->stopped || !runstate_is_running();
87 }
88
89 static bool cpu_thread_is_idle(CPUState *cpu)
90 {
91     if (cpu->stop || cpu->queued_work_first) {
92         return false;
93     }
94     if (cpu_is_stopped(cpu)) {
95         return true;
96     }
97     if (!cpu->halted || cpu_has_work(cpu) ||
98         kvm_halt_in_kernel()) {
99         return false;
100     }
101     return true;
102 }
103
104 static bool all_cpu_threads_idle(void)
105 {
106     CPUState *cpu;
107
108     CPU_FOREACH(cpu) {
109         if (!cpu_thread_is_idle(cpu)) {
110             return false;
111         }
112     }
113     return true;
114 }
115
116 /***********************************************************/
117 /* guest cycle counter */
118
119 /* Protected by TimersState seqlock */
120
121 static bool icount_sleep = true;
122 static int64_t vm_clock_warp_start = -1;
123 /* Conversion factor from emulated instructions to virtual clock ticks.  */
124 static int icount_time_shift;
125 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
126 #define MAX_ICOUNT_SHIFT 10
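/* With the conversion done by cpu_icount_to_ns() below, each emulated
 * instruction accounts for (1 << icount_time_shift) nanoseconds of virtual
 * time.  At MAX_ICOUNT_SHIFT that is 1024 ns per instruction, i.e. roughly
 * one million instructions per virtual second, which is where the "1MIPS
 * minimum" above comes from.
 */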
127
128 static QEMUTimer *icount_rt_timer;
129 static QEMUTimer *icount_vm_timer;
130 static QEMUTimer *icount_warp_timer;
131
132 typedef struct TimersState {
133     /* Protected by BQL.  */
134     int64_t cpu_ticks_prev;
135     int64_t cpu_ticks_offset;
136
137     /* cpu_clock_offset can be read outside the BQL, so protect it with
138      * this lock.
139      */
140     QemuSeqLock vm_clock_seqlock;
141     int64_t cpu_clock_offset;
142     int32_t cpu_ticks_enabled;
143     int64_t dummy;
144
145     /* Compensate for varying guest execution speed.  */
146     int64_t qemu_icount_bias;
147     /* Only written by TCG thread */
148     int64_t qemu_icount;
149 } TimersState;
150
151 static TimersState timers_state;
152 bool mttcg_enabled;
153
154 /*
155  * We default to false if we know other options have been enabled
156  * which are currently incompatible with MTTCG. Otherwise, once a
157  * guest (target) has been updated to support:
158  *   - atomic instructions
159  *   - memory ordering primitives (barriers)
160  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
161  *
162  * Once a guest architecture has been converted to the new primitives
163  * there are two remaining limitations to check.
164  *
165  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
166  * - The host must have a stronger memory order than the guest
167  *
168  * It may be possible in future to support strong guests on weak hosts
169  * but that will require tagging all load/stores in a guest with their
170  * implicit memory order requirements which would likely slow things
171  * down a lot.
172  */
173
174 static bool check_tcg_memory_orders_compatible(void)
175 {
176 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
177     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
178 #else
179     return false;
180 #endif
181 }
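/* The test above passes only when every ordering constraint the guest
 * relies on (TCG_GUEST_DEFAULT_MO) is also guaranteed by the host backend
 * (TCG_TARGET_DEFAULT_MO).  For instance, a strongly-ordered guest such as
 * x86 emulated on a weakly-ordered host such as Arm leaves guest ordering
 * bits that the host does not provide, so the check fails and MTTCG stays
 * off by default.
 */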
182
183 static bool default_mttcg_enabled(void)
184 {
185     if (use_icount || TCG_OVERSIZED_GUEST) {
186         return false;
187     } else {
188 #ifdef TARGET_SUPPORTS_MTTCG
189         return check_tcg_memory_orders_compatible();
190 #else
191         return false;
192 #endif
193     }
194 }
195
196 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
197 {
198     const char *t = qemu_opt_get(opts, "thread");
199     if (t) {
200         if (strcmp(t, "multi") == 0) {
201             if (TCG_OVERSIZED_GUEST) {
202                 error_setg(errp, "No MTTCG when guest word size > host's");
203             } else if (use_icount) {
204                 error_setg(errp, "No MTTCG when icount is enabled");
205             } else {
206 #ifndef TARGET_SUPPORTS_MTTCG
207                 error_report("Guest not yet converted to MTTCG - "
208                              "you may get unexpected results");
209 #endif
210                 if (!check_tcg_memory_orders_compatible()) {
211                     error_report("Guest expects a stronger memory ordering "
212                                  "than the host provides");
213                     error_printf("This may cause strange/hard to debug errors\n");
214                 }
215                 mttcg_enabled = true;
216             }
217         } else if (strcmp(t, "single") == 0) {
218             mttcg_enabled = false;
219         } else {
220             error_setg(errp, "Invalid 'thread' setting %s", t);
221         }
222     } else {
223         mttcg_enabled = default_mttcg_enabled();
224     }
225 }
226
227 /* The current number of executed instructions is based on what we
228  * originally budgeted minus the current state of the decrementing
229  * icount counters in extra/u16.low.
230  */
231 static int64_t cpu_get_icount_executed(CPUState *cpu)
232 {
233     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
234 }
235
236 /*
237  * Update the global shared timers_state.qemu_icount to take into
238  * account executed instructions. This is done by the TCG vCPU
239  * thread so the main-loop can see time has moved forward.
240  */
241 void cpu_update_icount(CPUState *cpu)
242 {
243     int64_t executed = cpu_get_icount_executed(cpu);
244     cpu->icount_budget -= executed;
245
246 #ifdef CONFIG_ATOMIC64
247     atomic_set__nocheck(&timers_state.qemu_icount,
248                         atomic_read__nocheck(&timers_state.qemu_icount) +
249                         executed);
250 #else /* FIXME: we need 64bit atomics to do this safely */
251     timers_state.qemu_icount += executed;
252 #endif
253 }
254
255 int64_t cpu_get_icount_raw(void)
256 {
257     CPUState *cpu = current_cpu;
258
259     if (cpu && cpu->running) {
260         if (!cpu->can_do_io) {
261             fprintf(stderr, "Bad icount read\n");
262             exit(1);
263         }
264         /* Take into account what has run */
265         cpu_update_icount(cpu);
266     }
267 #ifdef CONFIG_ATOMIC64
268     return atomic_read__nocheck(&timers_state.qemu_icount);
269 #else /* FIXME: we need 64bit atomics to do this safely */
270     return timers_state.qemu_icount;
271 #endif
272 }
273
274 /* Return the virtual CPU time, based on the instruction counter.  */
275 static int64_t cpu_get_icount_locked(void)
276 {
277     int64_t icount = cpu_get_icount_raw();
278     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
279 }
280
281 int64_t cpu_get_icount(void)
282 {
283     int64_t icount;
284     unsigned start;
285
286     do {
287         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
288         icount = cpu_get_icount_locked();
289     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
290
291     return icount;
292 }
293
294 int64_t cpu_icount_to_ns(int64_t icount)
295 {
296     return icount << icount_time_shift;
297 }
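/* As an example of this conversion: with the default "auto" shift of 3
 * chosen in configure_icount() below, 1000 executed instructions become
 * 1000 << 3 = 8000 ns of virtual time, i.e. 8 ns per instruction, which is
 * the "125MIPS" initial guess mentioned there.
 */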
298
299 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
300  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
301  * counter.
302  *
303  * Caller must hold the BQL
304  */
305 int64_t cpu_get_ticks(void)
306 {
307     int64_t ticks;
308
309     if (use_icount) {
310         return cpu_get_icount();
311     }
312
313     ticks = timers_state.cpu_ticks_offset;
314     if (timers_state.cpu_ticks_enabled) {
315         ticks += cpu_get_host_ticks();
316     }
317
318     if (timers_state.cpu_ticks_prev > ticks) {
319         /* Note: non-increasing ticks may happen if the host uses
320            software suspend. */
321         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
322         ticks = timers_state.cpu_ticks_prev;
323     }
324
325     timers_state.cpu_ticks_prev = ticks;
326     return ticks;
327 }
328
329 static int64_t cpu_get_clock_locked(void)
330 {
331     int64_t time;
332
333     time = timers_state.cpu_clock_offset;
334     if (timers_state.cpu_ticks_enabled) {
335         time += get_clock();
336     }
337
338     return time;
339 }
340
341 /* Return the monotonic time elapsed in VM, i.e.,
342  * the time between vm_start and vm_stop
343  */
344 int64_t cpu_get_clock(void)
345 {
346     int64_t ti;
347     unsigned start;
348
349     do {
350         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
351         ti = cpu_get_clock_locked();
352     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
353
354     return ti;
355 }
356
357 /* enable cpu_get_ticks()
358  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
359  */
360 void cpu_enable_ticks(void)
361 {
362     /* Here, what is really protected by the seqlock is cpu_clock_offset. */
363     seqlock_write_begin(&timers_state.vm_clock_seqlock);
364     if (!timers_state.cpu_ticks_enabled) {
365         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
366         timers_state.cpu_clock_offset -= get_clock();
367         timers_state.cpu_ticks_enabled = 1;
368     }
369     seqlock_write_end(&timers_state.vm_clock_seqlock);
370 }
371
372 /* disable cpu_get_ticks(): the clock is stopped. You must not call
373  * cpu_get_ticks() after that.
374  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
375  */
376 void cpu_disable_ticks(void)
377 {
378     /* Here, what is really protected by the seqlock is cpu_clock_offset. */
379     seqlock_write_begin(&timers_state.vm_clock_seqlock);
380     if (timers_state.cpu_ticks_enabled) {
381         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
382         timers_state.cpu_clock_offset = cpu_get_clock_locked();
383         timers_state.cpu_ticks_enabled = 0;
384     }
385     seqlock_write_end(&timers_state.vm_clock_seqlock);
386 }
387
388 /* Correlation between real and virtual time is always going to be
389    fairly approximate, so ignore small variation.
390    When the guest is idle real and virtual time will be aligned in
391    the IO wait loop.  */
392 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
393
394 static void icount_adjust(void)
395 {
396     int64_t cur_time;
397     int64_t cur_icount;
398     int64_t delta;
399
400     /* Protected by TimersState mutex.  */
401     static int64_t last_delta;
402
403     /* If the VM is not running, then do nothing.  */
404     if (!runstate_is_running()) {
405         return;
406     }
407
408     seqlock_write_begin(&timers_state.vm_clock_seqlock);
409     cur_time = cpu_get_clock_locked();
410     cur_icount = cpu_get_icount_locked();
411
412     delta = cur_icount - cur_time;
413     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
414     if (delta > 0
415         && last_delta + ICOUNT_WOBBLE < delta * 2
416         && icount_time_shift > 0) {
417         /* The guest is getting too far ahead.  Slow time down.  */
418         icount_time_shift--;
419     }
420     if (delta < 0
421         && last_delta - ICOUNT_WOBBLE > delta * 2
422         && icount_time_shift < MAX_ICOUNT_SHIFT) {
423         /* The guest is getting too far behind.  Speed time up.  */
424         icount_time_shift++;
425     }
426     last_delta = delta;
427     timers_state.qemu_icount_bias = cur_icount
428                               - (timers_state.qemu_icount << icount_time_shift);
429     seqlock_write_end(&timers_state.vm_clock_seqlock);
430 }
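/* A note on the adjustment above: delta measures how far the virtual clock
 * (cur_icount) has drifted ahead of the real elapsed time (cur_time).
 * Decrementing icount_time_shift halves the nanoseconds charged per
 * instruction, so virtual time advances more slowly; incrementing it does
 * the opposite.  ICOUNT_WOBBLE (100 ms) is the tolerance band that keeps
 * small drifts from triggering oscillating corrections, and the bias is
 * recomputed at the end so QEMU_CLOCK_VIRTUAL does not jump when the rate
 * changes.
 */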
431
432 static void icount_adjust_rt(void *opaque)
433 {
434     timer_mod(icount_rt_timer,
435               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
436     icount_adjust();
437 }
438
439 static void icount_adjust_vm(void *opaque)
440 {
441     timer_mod(icount_vm_timer,
442                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
443                    NANOSECONDS_PER_SECOND / 10);
444     icount_adjust();
445 }
446
447 static int64_t qemu_icount_round(int64_t count)
448 {
449     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
450 }
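/* qemu_icount_round() converts a deadline in nanoseconds into the number
 * of instructions needed to cover it, rounding up.  For example, with
 * icount_time_shift == 3 a 20 ns deadline becomes (20 + 7) >> 3 = 3
 * instructions rather than 2, so the instruction budget never undershoots
 * the deadline.
 */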
451
452 static void icount_warp_rt(void)
453 {
454     unsigned seq;
455     int64_t warp_start;
456
457     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
458      * changes from -1 to another value, so the race here is okay.
459      */
460     do {
461         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
462         warp_start = vm_clock_warp_start;
463     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
464
465     if (warp_start == -1) {
466         return;
467     }
468
469     seqlock_write_begin(&timers_state.vm_clock_seqlock);
470     if (runstate_is_running()) {
471         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
472                                      cpu_get_clock_locked());
473         int64_t warp_delta;
474
475         warp_delta = clock - vm_clock_warp_start;
476         if (use_icount == 2) {
477             /*
478              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
479              * far ahead of real time.
480              */
481             int64_t cur_icount = cpu_get_icount_locked();
482             int64_t delta = clock - cur_icount;
483             warp_delta = MIN(warp_delta, delta);
484         }
485         timers_state.qemu_icount_bias += warp_delta;
486     }
487     vm_clock_warp_start = -1;
488     seqlock_write_end(&timers_state.vm_clock_seqlock);
489
490     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
491         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
492     }
493 }
494
495 static void icount_timer_cb(void *opaque)
496 {
497     /* No need for a checkpoint because the timer already synchronizes
498      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
499      */
500     icount_warp_rt();
501 }
502
503 void qtest_clock_warp(int64_t dest)
504 {
505     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
506     AioContext *aio_context;
507     assert(qtest_enabled());
508     aio_context = qemu_get_aio_context();
509     while (clock < dest) {
510         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
511         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
512
513         seqlock_write_begin(&timers_state.vm_clock_seqlock);
514         timers_state.qemu_icount_bias += warp;
515         seqlock_write_end(&timers_state.vm_clock_seqlock);
516
517         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
518         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
519         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
520     }
521     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
522 }
523
524 void qemu_start_warp_timer(void)
525 {
526     int64_t clock;
527     int64_t deadline;
528
529     if (!use_icount) {
530         return;
531     }
532
533     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
534      * do not fire, so computing the deadline does not make sense.
535      */
536     if (!runstate_is_running()) {
537         return;
538     }
539
540     /* warp clock deterministically in record/replay mode */
541     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
542         return;
543     }
544
545     if (!all_cpu_threads_idle()) {
546         return;
547     }
548
549     if (qtest_enabled()) {
550         /* When testing, qtest commands advance icount.  */
551         return;
552     }
553
554     /* We want to use the earliest deadline from ALL vm_clocks */
555     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
556     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
557     if (deadline < 0) {
558         static bool notified;
559         if (!icount_sleep && !notified) {
560             warn_report("icount sleep disabled and no active timers");
561             notified = true;
562         }
563         return;
564     }
565
566     if (deadline > 0) {
567         /*
568          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
569          * sleep.  Otherwise, the CPU might be waiting for a future timer
570          * interrupt to wake it up, but the interrupt never comes because
571          * the vCPU isn't running any insns and thus doesn't advance the
572          * QEMU_CLOCK_VIRTUAL.
573          */
574         if (!icount_sleep) {
575             /*
576              * We never let VCPUs sleep in no-sleep icount mode.
577              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
578              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
579              * It is useful when we want a deterministic execution time,
580              * isolated from host latencies.
581              */
582             seqlock_write_begin(&timers_state.vm_clock_seqlock);
583             timers_state.qemu_icount_bias += deadline;
584             seqlock_write_end(&timers_state.vm_clock_seqlock);
585             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
586         } else {
587             /*
588              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
589              * "real" time, (related to the time left until the next event) has
590              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
591              * This avoids that the warps are visible externally; for example,
592              * you will not be sending network packets continuously instead of
593              * every 100ms.
594              */
595             seqlock_write_begin(&timers_state.vm_clock_seqlock);
596             if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
597                 vm_clock_warp_start = clock;
598             }
599             seqlock_write_end(&timers_state.vm_clock_seqlock);
600             timer_mod_anticipate(icount_warp_timer, clock + deadline);
601         }
602     } else if (deadline == 0) {
603         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
604     }
605 }
606
607 static void qemu_account_warp_timer(void)
608 {
609     if (!use_icount || !icount_sleep) {
610         return;
611     }
612
613     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
614      * do not fire, so computing the deadline does not make sense.
615      */
616     if (!runstate_is_running()) {
617         return;
618     }
619
620     /* warp clock deterministically in record/replay mode */
621     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
622         return;
623     }
624
625     timer_del(icount_warp_timer);
626     icount_warp_rt();
627 }
628
629 static bool icount_state_needed(void *opaque)
630 {
631     return use_icount;
632 }
633
634 /*
635  * This is a subsection for icount migration.
636  */
637 static const VMStateDescription icount_vmstate_timers = {
638     .name = "timer/icount",
639     .version_id = 1,
640     .minimum_version_id = 1,
641     .needed = icount_state_needed,
642     .fields = (VMStateField[]) {
643         VMSTATE_INT64(qemu_icount_bias, TimersState),
644         VMSTATE_INT64(qemu_icount, TimersState),
645         VMSTATE_END_OF_LIST()
646     }
647 };
648
649 static const VMStateDescription vmstate_timers = {
650     .name = "timer",
651     .version_id = 2,
652     .minimum_version_id = 1,
653     .fields = (VMStateField[]) {
654         VMSTATE_INT64(cpu_ticks_offset, TimersState),
655         VMSTATE_INT64(dummy, TimersState),
656         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
657         VMSTATE_END_OF_LIST()
658     },
659     .subsections = (const VMStateDescription*[]) {
660         &icount_vmstate_timers,
661         NULL
662     }
663 };
664
665 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
666 {
667     double pct;
668     double throttle_ratio;
669     long sleeptime_ns;
670
671     if (!cpu_throttle_get_percentage()) {
672         return;
673     }
674
675     pct = (double)cpu_throttle_get_percentage()/100;
676     throttle_ratio = pct / (1 - pct);
677     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
678
679     qemu_mutex_unlock_iothread();
680     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
681     qemu_mutex_lock_iothread();
682     atomic_set(&cpu->throttle_thread_scheduled, 0);
683 }
684
685 static void cpu_throttle_timer_tick(void *opaque)
686 {
687     CPUState *cpu;
688     double pct;
689
690     /* Stop the timer if needed */
691     if (!cpu_throttle_get_percentage()) {
692         return;
693     }
694     CPU_FOREACH(cpu) {
695         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
696             async_run_on_cpu(cpu, cpu_throttle_thread,
697                              RUN_ON_CPU_NULL);
698         }
699     }
700
701     pct = (double)cpu_throttle_get_percentage()/100;
702     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
703                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
704 }
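/* A worked example of the throttle arithmetic above: at 75% throttle,
 * pct = 0.75 and throttle_ratio = 0.75 / 0.25 = 3, so each vCPU sleeps
 * 3 * 10 ms = 30 ms per tick, while the timer re-arms after
 * 10 ms / (1 - 0.75) = 40 ms.  The vCPU therefore sleeps for 30 ms out of
 * every 40 ms and runs for roughly the remaining 25% of the time.
 */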
705
706 void cpu_throttle_set(int new_throttle_pct)
707 {
708     /* Ensure throttle percentage is within valid range */
709     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
710     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
711
712     atomic_set(&throttle_percentage, new_throttle_pct);
713
714     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
715                                        CPU_THROTTLE_TIMESLICE_NS);
716 }
717
718 void cpu_throttle_stop(void)
719 {
720     atomic_set(&throttle_percentage, 0);
721 }
722
723 bool cpu_throttle_active(void)
724 {
725     return (cpu_throttle_get_percentage() != 0);
726 }
727
728 int cpu_throttle_get_percentage(void)
729 {
730     return atomic_read(&throttle_percentage);
731 }
732
733 void cpu_ticks_init(void)
734 {
735     seqlock_init(&timers_state.vm_clock_seqlock);
736     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
737     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
738                                            cpu_throttle_timer_tick, NULL);
739 }
740
741 void configure_icount(QemuOpts *opts, Error **errp)
742 {
743     const char *option;
744     char *rem_str = NULL;
745
746     option = qemu_opt_get(opts, "shift");
747     if (!option) {
748         if (qemu_opt_get(opts, "align") != NULL) {
749             error_setg(errp, "Please specify shift option when using align");
750         }
751         return;
752     }
753
754     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
755     if (icount_sleep) {
756         icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
757                                          icount_timer_cb, NULL);
758     }
759
760     icount_align_option = qemu_opt_get_bool(opts, "align", false);
761
762     if (icount_align_option && !icount_sleep) {
763         error_setg(errp, "align=on and sleep=off are incompatible");
764     }
765     if (strcmp(option, "auto") != 0) {
766         errno = 0;
767         icount_time_shift = strtol(option, &rem_str, 0);
768         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
769             error_setg(errp, "icount: Invalid shift value");
770         }
771         use_icount = 1;
772         return;
773     } else if (icount_align_option) {
774         error_setg(errp, "shift=auto and align=on are incompatible");
775     } else if (!icount_sleep) {
776         error_setg(errp, "shift=auto and sleep=off are incompatible");
777     }
778
779     use_icount = 2;
780
781     /* 125MIPS seems a reasonable initial guess at the guest speed.
782        It will be corrected fairly quickly anyway.  */
783     icount_time_shift = 3;
784
785     /* Have both realtime and virtual time triggers for speed adjustment.
786        The realtime trigger catches emulated time passing too slowly,
787        the virtual time trigger catches emulated time passing too fast.
788        Realtime triggers occur even when idle, so use them less frequently
789        than VM triggers.  */
790     icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
791                                    icount_adjust_rt, NULL);
792     timer_mod(icount_rt_timer,
793                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
794     icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
795                                         icount_adjust_vm, NULL);
796     timer_mod(icount_vm_timer,
797                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
798                    NANOSECONDS_PER_SECOND / 10);
799 }
800
801 /***********************************************************/
802 /* TCG vCPU kick timer
803  *
804  * The kick timer is responsible for moving single threaded vCPU
805  * emulation on to the next vCPU. If more than one vCPU is running, a
806  * timer event will force a cpu->exit so the next vCPU can get
807  * scheduled.
808  *
809  * The timer is removed while all vCPUs are idle and restarted again
810  * once at least one vCPU has work to do.
811  */
812
813 static QEMUTimer *tcg_kick_vcpu_timer;
814 static CPUState *tcg_current_rr_cpu;
815
816 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
817
818 static inline int64_t qemu_tcg_next_kick(void)
819 {
820     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
821 }
822
823 /* Kick the currently round-robin scheduled vCPU */
824 static void qemu_cpu_kick_rr_cpu(void)
825 {
826     CPUState *cpu;
827     do {
828         cpu = atomic_mb_read(&tcg_current_rr_cpu);
829         if (cpu) {
830             cpu_exit(cpu);
831         }
832     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
833 }
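/* The re-read loop above guards against the round-robin thread moving on
 * to a different vCPU between the atomic read and cpu_exit(): if
 * tcg_current_rr_cpu changed while we were kicking, the new current vCPU
 * is kicked as well, so the currently scheduled vCPU is never missed.
 */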
834
835 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
836 {
837 }
838
839 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
840 {
841     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
842         qemu_notify_event();
843         return;
844     }
845
846     if (!qemu_in_vcpu_thread() && first_cpu) {
847         /* qemu_cpu_kick is not enough to kick a halted CPU out of
848          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
849          * causes cpu_thread_is_idle to return false.  This way,
850          * handle_icount_deadline can run.
851          */
852         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
853     }
854 }
855
856 static void kick_tcg_thread(void *opaque)
857 {
858     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
859     qemu_cpu_kick_rr_cpu();
860 }
861
862 static void start_tcg_kick_timer(void)
863 {
864     if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
865         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
866                                            kick_tcg_thread, NULL);
867         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
868     }
869 }
870
871 static void stop_tcg_kick_timer(void)
872 {
873     if (tcg_kick_vcpu_timer) {
874         timer_del(tcg_kick_vcpu_timer);
875         tcg_kick_vcpu_timer = NULL;
876     }
877 }
878
879 /***********************************************************/
880 void hw_error(const char *fmt, ...)
881 {
882     va_list ap;
883     CPUState *cpu;
884
885     va_start(ap, fmt);
886     fprintf(stderr, "qemu: hardware error: ");
887     vfprintf(stderr, fmt, ap);
888     fprintf(stderr, "\n");
889     CPU_FOREACH(cpu) {
890         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
891         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
892     }
893     va_end(ap);
894     abort();
895 }
896
897 void cpu_synchronize_all_states(void)
898 {
899     CPUState *cpu;
900
901     CPU_FOREACH(cpu) {
902         cpu_synchronize_state(cpu);
903     }
904 }
905
906 void cpu_synchronize_all_post_reset(void)
907 {
908     CPUState *cpu;
909
910     CPU_FOREACH(cpu) {
911         cpu_synchronize_post_reset(cpu);
912     }
913 }
914
915 void cpu_synchronize_all_post_init(void)
916 {
917     CPUState *cpu;
918
919     CPU_FOREACH(cpu) {
920         cpu_synchronize_post_init(cpu);
921     }
922 }
923
924 void cpu_synchronize_all_pre_loadvm(void)
925 {
926     CPUState *cpu;
927
928     CPU_FOREACH(cpu) {
929         cpu_synchronize_pre_loadvm(cpu);
930     }
931 }
932
933 static int do_vm_stop(RunState state)
934 {
935     int ret = 0;
936
937     if (runstate_is_running()) {
938         cpu_disable_ticks();
939         pause_all_vcpus();
940         runstate_set(state);
941         vm_state_notify(0, state);
942         qapi_event_send_stop(&error_abort);
943     }
944
945     bdrv_drain_all();
946     replay_disable_events();
947     ret = bdrv_flush_all();
948
949     return ret;
950 }
951
952 static bool cpu_can_run(CPUState *cpu)
953 {
954     if (cpu->stop) {
955         return false;
956     }
957     if (cpu_is_stopped(cpu)) {
958         return false;
959     }
960     return true;
961 }
962
963 static void cpu_handle_guest_debug(CPUState *cpu)
964 {
965     gdb_set_stop_cpu(cpu);
966     qemu_system_debug_request();
967     cpu->stopped = true;
968 }
969
970 #ifdef CONFIG_LINUX
971 static void sigbus_reraise(void)
972 {
973     sigset_t set;
974     struct sigaction action;
975
976     memset(&action, 0, sizeof(action));
977     action.sa_handler = SIG_DFL;
978     if (!sigaction(SIGBUS, &action, NULL)) {
979         raise(SIGBUS);
980         sigemptyset(&set);
981         sigaddset(&set, SIGBUS);
982         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
983     }
984     perror("Failed to re-raise SIGBUS!\n");
985     abort();
986 }
987
988 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
989 {
990     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
991         sigbus_reraise();
992     }
993
994     if (current_cpu) {
995         /* Called asynchronously in VCPU thread.  */
996         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
997             sigbus_reraise();
998         }
999     } else {
1000         /* Called synchronously (via signalfd) in main thread.  */
1001         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1002             sigbus_reraise();
1003         }
1004     }
1005 }
1006
1007 static void qemu_init_sigbus(void)
1008 {
1009     struct sigaction action;
1010
1011     memset(&action, 0, sizeof(action));
1012     action.sa_flags = SA_SIGINFO;
1013     action.sa_sigaction = sigbus_handler;
1014     sigaction(SIGBUS, &action, NULL);
1015
1016     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1017 }
1018 #else /* !CONFIG_LINUX */
1019 static void qemu_init_sigbus(void)
1020 {
1021 }
1022 #endif /* !CONFIG_LINUX */
1023
1024 static QemuMutex qemu_global_mutex;
1025
1026 static QemuThread io_thread;
1027
1028 /* cpu creation */
1029 static QemuCond qemu_cpu_cond;
1030 /* system init */
1031 static QemuCond qemu_pause_cond;
1032
1033 void qemu_init_cpu_loop(void)
1034 {
1035     qemu_init_sigbus();
1036     qemu_cond_init(&qemu_cpu_cond);
1037     qemu_cond_init(&qemu_pause_cond);
1038     qemu_mutex_init(&qemu_global_mutex);
1039
1040     qemu_thread_get_self(&io_thread);
1041 }
1042
1043 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1044 {
1045     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1046 }
1047
1048 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1049 {
1050     if (kvm_destroy_vcpu(cpu) < 0) {
1051         error_report("kvm_destroy_vcpu failed");
1052         exit(EXIT_FAILURE);
1053     }
1054 }
1055
1056 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1057 {
1058 }
1059
1060 static void qemu_wait_io_event_common(CPUState *cpu)
1061 {
1062     atomic_mb_set(&cpu->thread_kicked, false);
1063     if (cpu->stop) {
1064         cpu->stop = false;
1065         cpu->stopped = true;
1066         qemu_cond_broadcast(&qemu_pause_cond);
1067     }
1068     process_queued_cpu_work(cpu);
1069 }
1070
1071 static bool qemu_tcg_should_sleep(CPUState *cpu)
1072 {
1073     if (mttcg_enabled) {
1074         return cpu_thread_is_idle(cpu);
1075     } else {
1076         return all_cpu_threads_idle();
1077     }
1078 }
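/* In MTTCG mode each vCPU thread decides to sleep based only on its own
 * idle state; in the single-threaded round-robin mode the one TCG thread
 * may sleep only once every vCPU it services is idle, otherwise it must
 * keep iterating over them.
 */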
1079
1080 static void qemu_tcg_wait_io_event(CPUState *cpu)
1081 {
1082     while (qemu_tcg_should_sleep(cpu)) {
1083         stop_tcg_kick_timer();
1084         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1085     }
1086
1087     start_tcg_kick_timer();
1088
1089     qemu_wait_io_event_common(cpu);
1090 }
1091
1092 static void qemu_kvm_wait_io_event(CPUState *cpu)
1093 {
1094     while (cpu_thread_is_idle(cpu)) {
1095         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1096     }
1097
1098     qemu_wait_io_event_common(cpu);
1099 }
1100
1101 static void *qemu_kvm_cpu_thread_fn(void *arg)
1102 {
1103     CPUState *cpu = arg;
1104     int r;
1105
1106     rcu_register_thread();
1107
1108     qemu_mutex_lock_iothread();
1109     qemu_thread_get_self(cpu->thread);
1110     cpu->thread_id = qemu_get_thread_id();
1111     cpu->can_do_io = 1;
1112     current_cpu = cpu;
1113
1114     r = kvm_init_vcpu(cpu);
1115     if (r < 0) {
1116         fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1117         exit(1);
1118     }
1119
1120     kvm_init_cpu_signals(cpu);
1121
1122     /* signal CPU creation */
1123     cpu->created = true;
1124     qemu_cond_signal(&qemu_cpu_cond);
1125
1126     do {
1127         if (cpu_can_run(cpu)) {
1128             r = kvm_cpu_exec(cpu);
1129             if (r == EXCP_DEBUG) {
1130                 cpu_handle_guest_debug(cpu);
1131             }
1132         }
1133         qemu_kvm_wait_io_event(cpu);
1134     } while (!cpu->unplug || cpu_can_run(cpu));
1135
1136     qemu_kvm_destroy_vcpu(cpu);
1137     cpu->created = false;
1138     qemu_cond_signal(&qemu_cpu_cond);
1139     qemu_mutex_unlock_iothread();
1140     return NULL;
1141 }
1142
1143 static void *qemu_dummy_cpu_thread_fn(void *arg)
1144 {
1145 #ifdef _WIN32
1146     fprintf(stderr, "qtest is not supported under Windows\n");
1147     exit(1);
1148 #else
1149     CPUState *cpu = arg;
1150     sigset_t waitset;
1151     int r;
1152
1153     rcu_register_thread();
1154
1155     qemu_mutex_lock_iothread();
1156     qemu_thread_get_self(cpu->thread);
1157     cpu->thread_id = qemu_get_thread_id();
1158     cpu->can_do_io = 1;
1159     current_cpu = cpu;
1160
1161     sigemptyset(&waitset);
1162     sigaddset(&waitset, SIG_IPI);
1163
1164     /* signal CPU creation */
1165     cpu->created = true;
1166     qemu_cond_signal(&qemu_cpu_cond);
1167
1168     while (1) {
1169         qemu_mutex_unlock_iothread();
1170         do {
1171             int sig;
1172             r = sigwait(&waitset, &sig);
1173         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1174         if (r == -1) {
1175             perror("sigwait");
1176             exit(1);
1177         }
1178         qemu_mutex_lock_iothread();
1179         qemu_wait_io_event_common(cpu);
1180     }
1181
1182     return NULL;
1183 #endif
1184 }
1185
1186 static int64_t tcg_get_icount_limit(void)
1187 {
1188     int64_t deadline;
1189
1190     if (replay_mode != REPLAY_MODE_PLAY) {
1191         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1192
1193         /* Maintain prior (possibly buggy) behaviour where if no deadline
1194          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1195          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1196          * nanoseconds.
1197          */
1198         if ((deadline < 0) || (deadline > INT32_MAX)) {
1199             deadline = INT32_MAX;
1200         }
1201
1202         return qemu_icount_round(deadline);
1203     } else {
1204         return replay_get_instructions();
1205     }
1206 }
1207
1208 static void handle_icount_deadline(void)
1209 {
1210     assert(qemu_in_vcpu_thread());
1211     if (use_icount) {
1212         int64_t deadline =
1213             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1214
1215         if (deadline == 0) {
1216             /* Wake up other AioContexts.  */
1217             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1218             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1219         }
1220     }
1221 }
1222
1223 static void prepare_icount_for_run(CPUState *cpu)
1224 {
1225     if (use_icount) {
1226         int insns_left;
1227
1228         /* These should always be cleared by process_icount_data after
1229          * each vCPU execution. However u16.high can be raised
1230          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1231          */
1232         g_assert(cpu->icount_decr.u16.low == 0);
1233         g_assert(cpu->icount_extra == 0);
1234
1235         cpu->icount_budget = tcg_get_icount_limit();
1236         insns_left = MIN(0xffff, cpu->icount_budget);
1237         cpu->icount_decr.u16.low = insns_left;
1238         cpu->icount_extra = cpu->icount_budget - insns_left;
1239     }
1240 }
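/* Example of the budget split above: a budget of 70000 instructions is
 * capped by the 16-bit decrementer, so icount_decr.u16.low receives
 * MIN(0xffff, 70000) = 65535 and the remaining 70000 - 65535 = 4465
 * instructions are parked in icount_extra, from where the execution loop
 * refills the decrementer later.
 */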
1241
1242 static void process_icount_data(CPUState *cpu)
1243 {
1244     if (use_icount) {
1245         /* Account for executed instructions */
1246         cpu_update_icount(cpu);
1247
1248         /* Reset the counters */
1249         cpu->icount_decr.u16.low = 0;
1250         cpu->icount_extra = 0;
1251         cpu->icount_budget = 0;
1252
1253         replay_account_executed_instructions();
1254     }
1255 }
1256
1257
1258 static int tcg_cpu_exec(CPUState *cpu)
1259 {
1260     int ret;
1261 #ifdef CONFIG_PROFILER
1262     int64_t ti;
1263 #endif
1264
1265 #ifdef CONFIG_PROFILER
1266     ti = profile_getclock();
1267 #endif
1268     qemu_mutex_unlock_iothread();
1269     cpu_exec_start(cpu);
1270     ret = cpu_exec(cpu);
1271     cpu_exec_end(cpu);
1272     qemu_mutex_lock_iothread();
1273 #ifdef CONFIG_PROFILER
1274     tcg_time += profile_getclock() - ti;
1275 #endif
1276     return ret;
1277 }
1278
1279 /* Destroy any remaining vCPUs which have been unplugged and have
1280  * finished running
1281  */
1282 static void deal_with_unplugged_cpus(void)
1283 {
1284     CPUState *cpu;
1285
1286     CPU_FOREACH(cpu) {
1287         if (cpu->unplug && !cpu_can_run(cpu)) {
1288             qemu_tcg_destroy_vcpu(cpu);
1289             cpu->created = false;
1290             qemu_cond_signal(&qemu_cpu_cond);
1291             break;
1292         }
1293     }
1294 }
1295
1296 /* Single-threaded TCG
1297  *
1298  * In the single-threaded case each vCPU is simulated in turn. If
1299  * there is more than a single vCPU we create a simple timer to kick
1300  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1301  * This is done explicitly rather than relying on side-effects
1302  * elsewhere.
1303  */
1304
1305 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1306 {
1307     CPUState *cpu = arg;
1308
1309     rcu_register_thread();
1310     tcg_register_thread();
1311
1312     qemu_mutex_lock_iothread();
1313     qemu_thread_get_self(cpu->thread);
1314
1315     CPU_FOREACH(cpu) {
1316         cpu->thread_id = qemu_get_thread_id();
1317         cpu->created = true;
1318         cpu->can_do_io = 1;
1319     }
1320     qemu_cond_signal(&qemu_cpu_cond);
1321
1322     /* wait for initial kick-off after machine start */
1323     while (first_cpu->stopped) {
1324         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1325
1326         /* process any pending work */
1327         CPU_FOREACH(cpu) {
1328             current_cpu = cpu;
1329             qemu_wait_io_event_common(cpu);
1330         }
1331     }
1332
1333     start_tcg_kick_timer();
1334
1335     cpu = first_cpu;
1336
1337     /* process any pending work */
1338     cpu->exit_request = 1;
1339
1340     while (1) {
1341         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1342         qemu_account_warp_timer();
1343
1344         /* Run the timers here.  This is much more efficient than
1345          * waking up the I/O thread and waiting for completion.
1346          */
1347         handle_icount_deadline();
1348
1349         if (!cpu) {
1350             cpu = first_cpu;
1351         }
1352
1353         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1354
1355             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1356             current_cpu = cpu;
1357
1358             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1359                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1360
1361             if (cpu_can_run(cpu)) {
1362                 int r;
1363
1364                 prepare_icount_for_run(cpu);
1365
1366                 r = tcg_cpu_exec(cpu);
1367
1368                 process_icount_data(cpu);
1369
1370                 if (r == EXCP_DEBUG) {
1371                     cpu_handle_guest_debug(cpu);
1372                     break;
1373                 } else if (r == EXCP_ATOMIC) {
1374                     qemu_mutex_unlock_iothread();
1375                     cpu_exec_step_atomic(cpu);
1376                     qemu_mutex_lock_iothread();
1377                     break;
1378                 }
1379             } else if (cpu->stop) {
1380                 if (cpu->unplug) {
1381                     cpu = CPU_NEXT(cpu);
1382                 }
1383                 break;
1384             }
1385
1386             cpu = CPU_NEXT(cpu);
1387         } /* while (cpu && !cpu->exit_request).. */
1388
1389         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1390         atomic_set(&tcg_current_rr_cpu, NULL);
1391
1392         if (cpu && cpu->exit_request) {
1393             atomic_mb_set(&cpu->exit_request, 0);
1394         }
1395
1396         qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1397         deal_with_unplugged_cpus();
1398     }
1399
1400     return NULL;
1401 }
1402
1403 static void *qemu_hax_cpu_thread_fn(void *arg)
1404 {
1405     CPUState *cpu = arg;
1406     int r;
1407
1408     qemu_mutex_lock_iothread();
1409     qemu_thread_get_self(cpu->thread);
1410
1411     cpu->thread_id = qemu_get_thread_id();
1412     cpu->created = true;
1413     cpu->halted = 0;
1414     current_cpu = cpu;
1415
1416     hax_init_vcpu(cpu);
1417     qemu_cond_signal(&qemu_cpu_cond);
1418
1419     while (1) {
1420         if (cpu_can_run(cpu)) {
1421             r = hax_smp_cpu_exec(cpu);
1422             if (r == EXCP_DEBUG) {
1423                 cpu_handle_guest_debug(cpu);
1424             }
1425         }
1426
1427         while (cpu_thread_is_idle(cpu)) {
1428             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1429         }
1430 #ifdef _WIN32
1431         SleepEx(0, TRUE);
1432 #endif
1433         qemu_wait_io_event_common(cpu);
1434     }
1435     return NULL;
1436 }
1437
1438 #ifdef _WIN32
1439 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1440 {
1441 }
1442 #endif
1443
1444 /* Multi-threaded TCG
1445  *
1446  * In the multi-threaded case each vCPU has its own thread. The TLS
1447  * variable current_cpu can be used deep in the code to find the
1448  * current CPUState for a given thread.
1449  */
1450
1451 static void *qemu_tcg_cpu_thread_fn(void *arg)
1452 {
1453     CPUState *cpu = arg;
1454
1455     g_assert(!use_icount);
1456
1457     rcu_register_thread();
1458     tcg_register_thread();
1459
1460     qemu_mutex_lock_iothread();
1461     qemu_thread_get_self(cpu->thread);
1462
1463     cpu->thread_id = qemu_get_thread_id();
1464     cpu->created = true;
1465     cpu->can_do_io = 1;
1466     current_cpu = cpu;
1467     qemu_cond_signal(&qemu_cpu_cond);
1468
1469     /* process any pending work */
1470     cpu->exit_request = 1;
1471
1472     while (1) {
1473         if (cpu_can_run(cpu)) {
1474             int r;
1475             r = tcg_cpu_exec(cpu);
1476             switch (r) {
1477             case EXCP_DEBUG:
1478                 cpu_handle_guest_debug(cpu);
1479                 break;
1480             case EXCP_HALTED:
1481                 /* during start-up the vCPU is reset and the thread is
1482                  * kicked several times. If we don't ensure we go back
1483                  * to sleep in the halted state we won't cleanly
1484                  * start up when the vCPU is enabled.
1485                  *
1486                  * cpu->halted should ensure we sleep in wait_io_event
1487                  */
1488                 g_assert(cpu->halted);
1489                 break;
1490             case EXCP_ATOMIC:
1491                 qemu_mutex_unlock_iothread();
1492                 cpu_exec_step_atomic(cpu);
1493                 qemu_mutex_lock_iothread();
1494             default:
1495                 /* Ignore everything else? */
1496                 break;
1497             }
1498         } else if (cpu->unplug) {
1499             qemu_tcg_destroy_vcpu(cpu);
1500             cpu->created = false;
1501             qemu_cond_signal(&qemu_cpu_cond);
1502             qemu_mutex_unlock_iothread();
1503             return NULL;
1504         }
1505
1506         atomic_mb_set(&cpu->exit_request, 0);
1507         qemu_tcg_wait_io_event(cpu);
1508     }
1509
1510     return NULL;
1511 }
1512
1513 static void qemu_cpu_kick_thread(CPUState *cpu)
1514 {
1515 #ifndef _WIN32
1516     int err;
1517
1518     if (cpu->thread_kicked) {
1519         return;
1520     }
1521     cpu->thread_kicked = true;
1522     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1523     if (err) {
1524         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1525         exit(1);
1526     }
1527 #else /* _WIN32 */
1528     if (!qemu_cpu_is_self(cpu)) {
1529         if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1530             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1531                     __func__, GetLastError());
1532             exit(1);
1533         }
1534     }
1535 #endif
1536 }
1537
1538 void qemu_cpu_kick(CPUState *cpu)
1539 {
1540     qemu_cond_broadcast(cpu->halt_cond);
1541     if (tcg_enabled()) {
1542         cpu_exit(cpu);
1543         /* NOP unless doing single-thread RR */
1544         qemu_cpu_kick_rr_cpu();
1545     } else {
1546         if (hax_enabled()) {
1547             /*
1548              * FIXME: race condition with the exit_request check in
1549              * hax_vcpu_hax_exec
1550              */
1551             cpu->exit_request = 1;
1552         }
1553         qemu_cpu_kick_thread(cpu);
1554     }
1555 }
1556
1557 void qemu_cpu_kick_self(void)
1558 {
1559     assert(current_cpu);
1560     qemu_cpu_kick_thread(current_cpu);
1561 }
1562
1563 bool qemu_cpu_is_self(CPUState *cpu)
1564 {
1565     return qemu_thread_is_self(cpu->thread);
1566 }
1567
1568 bool qemu_in_vcpu_thread(void)
1569 {
1570     return current_cpu && qemu_cpu_is_self(current_cpu);
1571 }
1572
1573 static __thread bool iothread_locked = false;
1574
1575 bool qemu_mutex_iothread_locked(void)
1576 {
1577     return iothread_locked;
1578 }
1579
1580 void qemu_mutex_lock_iothread(void)
1581 {
1582     g_assert(!qemu_mutex_iothread_locked());
1583     qemu_mutex_lock(&qemu_global_mutex);
1584     iothread_locked = true;
1585 }
1586
1587 void qemu_mutex_unlock_iothread(void)
1588 {
1589     g_assert(qemu_mutex_iothread_locked());
1590     iothread_locked = false;
1591     qemu_mutex_unlock(&qemu_global_mutex);
1592 }
1593
1594 static bool all_vcpus_paused(void)
1595 {
1596     CPUState *cpu;
1597
1598     CPU_FOREACH(cpu) {
1599         if (!cpu->stopped) {
1600             return false;
1601         }
1602     }
1603
1604     return true;
1605 }
1606
1607 void pause_all_vcpus(void)
1608 {
1609     CPUState *cpu;
1610
1611     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1612     CPU_FOREACH(cpu) {
1613         cpu->stop = true;
1614         qemu_cpu_kick(cpu);
1615     }
1616
1617     if (qemu_in_vcpu_thread()) {
1618         cpu_stop_current();
1619     }
1620
1621     while (!all_vcpus_paused()) {
1622         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1623         CPU_FOREACH(cpu) {
1624             qemu_cpu_kick(cpu);
1625         }
1626     }
1627 }
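/* Note that the wait loop above re-kicks every vCPU on each wakeup of
 * qemu_pause_cond: a vCPU that was still entering the guest when the first
 * kick arrived might otherwise keep running, so the kick is repeated until
 * all_vcpus_paused() observes every vCPU as stopped.
 */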
1628
1629 void cpu_resume(CPUState *cpu)
1630 {
1631     cpu->stop = false;
1632     cpu->stopped = false;
1633     qemu_cpu_kick(cpu);
1634 }
1635
1636 void resume_all_vcpus(void)
1637 {
1638     CPUState *cpu;
1639
1640     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1641     CPU_FOREACH(cpu) {
1642         cpu_resume(cpu);
1643     }
1644 }
1645
1646 void cpu_remove(CPUState *cpu)
1647 {
1648     cpu->stop = true;
1649     cpu->unplug = true;
1650     qemu_cpu_kick(cpu);
1651 }
1652
1653 void cpu_remove_sync(CPUState *cpu)
1654 {
1655     cpu_remove(cpu);
1656     while (cpu->created) {
1657         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1658     }
1659 }
1660
1661 /* For temporary buffers for forming a name */
1662 #define VCPU_THREAD_NAME_SIZE 16
1663
1664 static void qemu_tcg_init_vcpu(CPUState *cpu)
1665 {
1666     char thread_name[VCPU_THREAD_NAME_SIZE];
1667     static QemuCond *single_tcg_halt_cond;
1668     static QemuThread *single_tcg_cpu_thread;
1669     static int tcg_region_inited;
1670
1671     /*
1672      * Initialize TCG regions--once. Now is a good time, because:
1673      * (1) TCG's init context, prologue and target globals have been set up.
1674      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1675      *     -accel flag is processed, so the check doesn't work then).
1676      */
1677     if (!tcg_region_inited) {
1678         tcg_region_inited = 1;
1679         tcg_region_init();
1680     }
1681
1682     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1683         cpu->thread = g_malloc0(sizeof(QemuThread));
1684         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1685         qemu_cond_init(cpu->halt_cond);
1686
1687         if (qemu_tcg_mttcg_enabled()) {
1688             /* create a thread per vCPU with TCG (MTTCG) */
1689             parallel_cpus = true;
1690             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1691                  cpu->cpu_index);
1692
1693             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1694                                cpu, QEMU_THREAD_JOINABLE);
1695
1696         } else {
1697             /* share a single thread for all cpus with TCG */
1698             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1699             qemu_thread_create(cpu->thread, thread_name,
1700                                qemu_tcg_rr_cpu_thread_fn,
1701                                cpu, QEMU_THREAD_JOINABLE);
1702
1703             single_tcg_halt_cond = cpu->halt_cond;
1704             single_tcg_cpu_thread = cpu->thread;
1705         }
1706 #ifdef _WIN32
1707         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1708 #endif
1709         while (!cpu->created) {
1710             qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1711         }
1712     } else {
1713         /* For non-MTTCG cases we share the thread */
1714         cpu->thread = single_tcg_cpu_thread;
1715         cpu->halt_cond = single_tcg_halt_cond;
1716     }
1717 }
1718
1719 static void qemu_hax_start_vcpu(CPUState *cpu)
1720 {
1721     char thread_name[VCPU_THREAD_NAME_SIZE];
1722
1723     cpu->thread = g_malloc0(sizeof(QemuThread));
1724     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1725     qemu_cond_init(cpu->halt_cond);
1726
1727     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1728              cpu->cpu_index);
1729     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1730                        cpu, QEMU_THREAD_JOINABLE);
1731 #ifdef _WIN32
1732     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1733 #endif
1734     while (!cpu->created) {
1735         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1736     }
1737 }
1738
1739 static void qemu_kvm_start_vcpu(CPUState *cpu)
1740 {
1741     char thread_name[VCPU_THREAD_NAME_SIZE];
1742
1743     cpu->thread = g_malloc0(sizeof(QemuThread));
1744     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1745     qemu_cond_init(cpu->halt_cond);
1746     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1747              cpu->cpu_index);
1748     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1749                        cpu, QEMU_THREAD_JOINABLE);
1750     while (!cpu->created) {
1751         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1752     }
1753 }
1754
1755 static void qemu_dummy_start_vcpu(CPUState *cpu)
1756 {
1757     char thread_name[VCPU_THREAD_NAME_SIZE];
1758
1759     cpu->thread = g_malloc0(sizeof(QemuThread));
1760     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1761     qemu_cond_init(cpu->halt_cond);
1762     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1763              cpu->cpu_index);
1764     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1765                        QEMU_THREAD_JOINABLE);
1766     while (!cpu->created) {
1767         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1768     }
1769 }
1770
1771 void qemu_init_vcpu(CPUState *cpu)
1772 {
1773     cpu->nr_cores = smp_cores;
1774     cpu->nr_threads = smp_threads;
1775     cpu->stopped = true;
1776
1777     if (!cpu->as) {
1778         /* If the target cpu hasn't set up any address spaces itself,
1779          * give it the default one.
1780          */
1781         AddressSpace *as = g_new0(AddressSpace, 1);
1782
1783         address_space_init(as, cpu->memory, "cpu-memory");
1784         cpu->num_ases = 1;
1785         cpu_address_space_init(cpu, as, 0);
1786     }
1787
1788     if (kvm_enabled()) {
1789         qemu_kvm_start_vcpu(cpu);
1790     } else if (hax_enabled()) {
1791         qemu_hax_start_vcpu(cpu);
1792     } else if (tcg_enabled()) {
1793         qemu_tcg_init_vcpu(cpu);
1794     } else {
1795         qemu_dummy_start_vcpu(cpu);
1796     }
1797 }
1798
1799 void cpu_stop_current(void)
1800 {
1801     if (current_cpu) {
1802         current_cpu->stop = false;
1803         current_cpu->stopped = true;
1804         cpu_exit(current_cpu);
1805         qemu_cond_broadcast(&qemu_pause_cond);
1806     }
1807 }
1808
1809 int vm_stop(RunState state)
1810 {
1811     if (qemu_in_vcpu_thread()) {
1812         qemu_system_vmstop_request_prepare();
1813         qemu_system_vmstop_request(state);
1814         /*
1815          * FIXME: should not return to device code in case
1816          * vm_stop() has been requested.
1817          */
1818         cpu_stop_current();
1819         return 0;
1820     }
1821
1822     return do_vm_stop(state);
1823 }
1824
1825 /**
1826  * Prepare for (re)starting the VM.
1827  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1828  * running or in case of an error condition), 0 otherwise.
1829  */
1830 int vm_prepare_start(void)
1831 {
1832     RunState requested;
1833     int res = 0;
1834
1835     qemu_vmstop_requested(&requested);
1836     if (runstate_is_running() && requested == RUN_STATE__MAX) {
1837         return -1;
1838     }
1839
1840     /* Ensure that a STOP/RESUME pair of events is emitted if a
1841      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
1842      * example, is documented to always be followed by the STOP
1843      * event.
1844      */
1845     if (runstate_is_running()) {
1846         qapi_event_send_stop(&error_abort);
1847         res = -1;
1848     } else {
1849         replay_enable_events();
1850         cpu_enable_ticks();
1851         runstate_set(RUN_STATE_RUNNING);
1852         vm_state_notify(1, RUN_STATE_RUNNING);
1853     }
1854
1855     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
1856     qapi_event_send_resume(&error_abort);
1857     return res;
1858 }
1859
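/* Transition the VM to RUN_STATE_RUNNING and resume all vcpus. */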
1860 void vm_start(void)
1861 {
1862     if (!vm_prepare_start()) {
1863         resume_all_vcpus();
1864     }
1865 }
1866
1867 /* Does a state transition even if the VM is already stopped;
1868    the current state is forgotten forever. */
1869 int vm_stop_force_state(RunState state)
1870 {
1871     if (runstate_is_running()) {
1872         return vm_stop(state);
1873     } else {
1874         runstate_set(state);
1875
1876         bdrv_drain_all();
1877         /* Make sure to return an error if the flush in a previous vm_stop()
1878          * failed. */
1879         return bdrv_flush_all();
1880     }
1881 }
1882
1883 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1884 {
1885     /* XXX: implement xxx_cpu_list for targets that still lack it */
1886 #if defined(cpu_list)
1887     cpu_list(f, cpu_fprintf);
1888 #endif
1889 }
1890
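/* QMP 'query-cpus': build a CpuInfo entry per vcpu, including the
 * architecture-specific program counter where the target is known. */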
1891 CpuInfoList *qmp_query_cpus(Error **errp)
1892 {
1893     MachineState *ms = MACHINE(qdev_get_machine());
1894     MachineClass *mc = MACHINE_GET_CLASS(ms);
1895     CpuInfoList *head = NULL, *cur_item = NULL;
1896     CPUState *cpu;
1897
1898     CPU_FOREACH(cpu) {
1899         CpuInfoList *info;
1900 #if defined(TARGET_I386)
1901         X86CPU *x86_cpu = X86_CPU(cpu);
1902         CPUX86State *env = &x86_cpu->env;
1903 #elif defined(TARGET_PPC)
1904         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1905         CPUPPCState *env = &ppc_cpu->env;
1906 #elif defined(TARGET_SPARC)
1907         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1908         CPUSPARCState *env = &sparc_cpu->env;
1909 #elif defined(TARGET_MIPS)
1910         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1911         CPUMIPSState *env = &mips_cpu->env;
1912 #elif defined(TARGET_TRICORE)
1913         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1914         CPUTriCoreState *env = &tricore_cpu->env;
1915 #endif
1916
1917         cpu_synchronize_state(cpu);
1918
1919         info = g_malloc0(sizeof(*info));
1920         info->value = g_malloc0(sizeof(*info->value));
1921         info->value->CPU = cpu->cpu_index;
1922         info->value->current = (cpu == first_cpu);
1923         info->value->halted = cpu->halted;
1924         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1925         info->value->thread_id = cpu->thread_id;
1926 #if defined(TARGET_I386)
1927         info->value->arch = CPU_INFO_ARCH_X86;
1928         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1929 #elif defined(TARGET_PPC)
1930         info->value->arch = CPU_INFO_ARCH_PPC;
1931         info->value->u.ppc.nip = env->nip;
1932 #elif defined(TARGET_SPARC)
1933         info->value->arch = CPU_INFO_ARCH_SPARC;
1934         info->value->u.q_sparc.pc = env->pc;
1935         info->value->u.q_sparc.npc = env->npc;
1936 #elif defined(TARGET_MIPS)
1937         info->value->arch = CPU_INFO_ARCH_MIPS;
1938         info->value->u.q_mips.PC = env->active_tc.PC;
1939 #elif defined(TARGET_TRICORE)
1940         info->value->arch = CPU_INFO_ARCH_TRICORE;
1941         info->value->u.tricore.PC = env->PC;
1942 #else
1943         info->value->arch = CPU_INFO_ARCH_OTHER;
1944 #endif
1945         info->value->has_props = !!mc->cpu_index_to_instance_props;
1946         if (info->value->has_props) {
1947             CpuInstanceProperties *props;
1948             props = g_malloc0(sizeof(*props));
1949             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
1950             info->value->props = props;
1951         }
1952
1953         /* XXX: waiting for the qapi to support GSList */
1954         if (!cur_item) {
1955             head = cur_item = info;
1956         } else {
1957             cur_item->next = info;
1958             cur_item = info;
1959         }
1960     }
1961
1962     return head;
1963 }
1964
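/* QMP 'memsave': write a range of guest virtual memory to a file.  The
 * memory is read via cpu_memory_rw_debug() on the selected vcpu
 * (CPU 0 unless a cpu-index argument is given). */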
1965 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1966                  bool has_cpu, int64_t cpu_index, Error **errp)
1967 {
1968     FILE *f;
1969     uint32_t l;
1970     CPUState *cpu;
1971     uint8_t buf[1024];
1972     int64_t orig_addr = addr, orig_size = size;
1973
1974     if (!has_cpu) {
1975         cpu_index = 0;
1976     }
1977
1978     cpu = qemu_get_cpu(cpu_index);
1979     if (cpu == NULL) {
1980         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1981                    "a CPU number");
1982         return;
1983     }
1984
1985     f = fopen(filename, "wb");
1986     if (!f) {
1987         error_setg_file_open(errp, errno, filename);
1988         return;
1989     }
1990
1991     while (size != 0) {
1992         l = sizeof(buf);
1993         if (l > size)
1994             l = size;
1995         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1996             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1997                              " specified", orig_addr, orig_size);
1998             goto exit;
1999         }
2000         if (fwrite(buf, 1, l, f) != l) {
2001             error_setg(errp, QERR_IO_ERROR);
2002             goto exit;
2003         }
2004         addr += l;
2005         size -= l;
2006     }
2007
2008 exit:
2009     fclose(f);
2010 }
2011
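/* QMP 'pmemsave': write a range of guest physical memory to a file,
 * read directly with cpu_physical_memory_read(); no vcpu is involved. */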
2012 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2013                   Error **errp)
2014 {
2015     FILE *f;
2016     uint32_t l;
2017     uint8_t buf[1024];
2018
2019     f = fopen(filename, "wb");
2020     if (!f) {
2021         error_setg_file_open(errp, errno, filename);
2022         return;
2023     }
2024
2025     while (size != 0) {
2026         l = sizeof(buf);
2027         if (l > size)
2028             l = size;
2029         cpu_physical_memory_read(addr, buf, l);
2030         if (fwrite(buf, 1, l, f) != l) {
2031             error_setg(errp, QERR_IO_ERROR);
2032             goto exit;
2033         }
2034         addr += l;
2035         size -= l;
2036     }
2037
2038 exit:
2039     fclose(f);
2040 }
2041
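/* QMP 'inject-nmi': forward an NMI request to the machine's NMI
 * handler, passing the monitor's currently selected CPU index. */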
2042 void qmp_inject_nmi(Error **errp)
2043 {
2044     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2045 }
2046
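/* Print icount statistics: host/guest clock drift and, when the icount
 * 'align' option is enabled, the maximum guest delay and advance. */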
2047 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2048 {
2049     if (!use_icount) {
2050         return;
2051     }
2052
2053     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2054                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2055     if (icount_align_option) {
2056         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2057         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2058     } else {
2059         cpu_fprintf(f, "Max guest delay     NA\n");
2060         cpu_fprintf(f, "Max guest advance   NA\n");
2061     }
2062 }