--- /dev/null
+=======================================
+Silicon Errata and Software Workarounds
+=======================================
+
+
+Date : 27 November 2015
+
+It is an unfortunate fact of life that hardware is often produced with
+so-called "errata", which can cause it to deviate from the architecture
+under specific circumstances. For hardware produced by ARM, these
+errata are broadly classified into the following categories:
+
+ ========== ========================================================
+ Category A A critical error without a viable workaround.
+ Category B A significant or critical error with an acceptable
+ workaround.
+ Category C A minor error that is not expected to occur under normal
+ operation.
+ ========== ========================================================
+
+For more information, consult one of the "Software Developers Errata
+Notice" documents available on infocenter.arm.com (registration
+required).
+
+As far as Linux is concerned, Category B errata may require some special
+treatment in the operating system. For example, avoiding a particular
+sequence of code, or configuring the processor in a particular way. A
+less common situation may require similar actions in order to declassify
+a Category A erratum into a Category C erratum. These are collectively
+known as "software workarounds" and are only required in the minority of
+cases (e.g. those cases that both require a non-secure workaround *and*
+can be triggered by Linux).
+
+For software workarounds that may adversely impact systems unaffected by
+the erratum in question, a Kconfig entry is added under "Kernel
+Features" -> "ARM errata workarounds via the alternatives framework".
+These are enabled by default and patched in at runtime when an affected
+CPU is detected. For less-intrusive workarounds, a Kconfig option is not
+available and the code is structured (preferably with a comment) in such
+a way that the erratum will not be hit.
+
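+As a rough illustration only (the capability name below is a placeholder,
+not a real erratum entry), code gated on such a workaround typically
+checks a CPU capability that the kernel sets when an affected part is
+detected at boot, for example::
+
+	/*
+	 * Sketch only: ARM64_WORKAROUND_EXAMPLE is a hypothetical capability
+	 * used here for illustration; real workarounds use the per-erratum
+	 * capabilities defined in asm/cpucaps.h.
+	 */
+	#include <asm/cpufeature.h>
+	#include <asm/tlbflush.h>
+
+	static void example_tlb_flush(void)
+	{
+		local_flush_tlb_all();
+
+		/* Some errata require the maintenance operation to be repeated. */
+		if (cpus_have_const_cap(ARM64_WORKAROUND_EXAMPLE))
+			local_flush_tlb_all();
+	}
+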
+This approach can make it slightly onerous to determine exactly which
+errata are worked around in an arbitrary kernel source tree, so this
+file acts as a registry of software workarounds in the Linux Kernel and
+will be updated when new workarounds are committed and backported to
+stable kernels.
+
++----------------+-----------------+-----------------+-----------------------------+
+| Implementor    | Component       | Erratum ID      | Kconfig                     |
++================+=================+=================+=============================+
+| Allwinner      | A64/R18         | UNKNOWN1        | SUN50I_ERRATUM_UNKNOWN1     |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #826319         | ARM64_ERRATUM_826319        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #827319         | ARM64_ERRATUM_827319        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #824069         | ARM64_ERRATUM_824069        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #819472         | ARM64_ERRATUM_819472        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #845719         | ARM64_ERRATUM_845719        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A53      | #843419         | ARM64_ERRATUM_843419        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A57      | #832075         | ARM64_ERRATUM_832075        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A57      | #852523         | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A72      | #853709         | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A73      | #858921         | ARM64_ERRATUM_858921        |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A55      | #1024718        | ARM64_ERRATUM_1024718       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A76      | #1188873,1418040| ARM64_ERRATUM_1418040       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A76      | #1165522        | ARM64_ERRATUM_1165522       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A76      | #1286807        | ARM64_ERRATUM_1286807       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A76      | #1463225        | ARM64_ERRATUM_1463225       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Neoverse-N1     | #1188873,1418040| ARM64_ERRATUM_1418040       |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Neoverse-N1     | #1349291        | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | MMU-500         | #841119,826419  | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX ITS    | #22375,24313    | CAVIUM_ERRATUM_22375        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX ITS    | #23144          | CAVIUM_ERRATUM_23144        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX GICv3  | #23154          | CAVIUM_ERRATUM_23154        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX Core   | #27456          | CAVIUM_ERRATUM_27456        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX Core   | #30115          | CAVIUM_ERRATUM_30115        |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX SMMUv2 | #27704          | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX2 SMMUv3| #74             | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| Cavium         | ThunderX2 SMMUv3| #126            | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Freescale/NXP  | LS2080A/LS1043A | A-008585        | FSL_ERRATUM_A008585         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon      | Hip0{5,6,7}     | #161010101      | HISILICON_ERRATUM_161010101 |
++----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon      | Hip0{6,7}       | #161010701      | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon      | Hip07           | #161600802      | HISILICON_ERRATUM_161600802 |
++----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon      | Hip08 SMMU PMCG | #162001800      | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | Kryo/Falkor v1  | E1003           | QCOM_FALKOR_ERRATUM_1003    |
++----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | Falkor v1       | E1009           | QCOM_FALKOR_ERRATUM_1009    |
++----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | QDF2400 ITS     | E0065           | QCOM_QDF2400_ERRATUM_0065   |
++----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | Falkor v{1,2}   | E1041           | QCOM_FALKOR_ERRATUM_1041    |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Fujitsu        | A64FX           | E#010001        | FUJITSU_ERRATUM_010001      |
++----------------+-----------------+-----------------+-----------------------------+
this vcpu, and determines which register slices are visible through
this ioctl interface.
-(See Documentation/arm64/sve.txt for an explanation of the "vq"
+(See Documentation/arm64/sve.rst for an explanation of the "vq"
nomenclature.)
KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT.
See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization
using this ioctl.
+ 4.120 KVM_SET_PMU_EVENT_FILTER
+
+ Capability: KVM_CAP_PMU_EVENT_FILTER
+ Architectures: x86
+ Type: vm ioctl
+ Parameters: struct kvm_pmu_event_filter (in)
+ Returns: 0 on success, -1 on error
+
+ struct kvm_pmu_event_filter {
+ __u32 action;
+ __u32 nevents;
+ __u64 events[0];
+ };
+
+ This ioctl restricts the set of PMU events that the guest can program.
+ The argument holds a list of events which will be allowed or denied.
+ The eventsel+umask of each event the guest attempts to program is compared
+ against the events field to determine whether the guest should have access.
+ This only affects general purpose counters; fixed purpose counters can
+ be disabled by changing the perfmon CPUID leaf.
+
+ Valid values for 'action':
+ #define KVM_PMU_EVENT_ALLOW 0
+ #define KVM_PMU_EVENT_DENY 1
+
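+ As a rough, untested userspace sketch (assuming the usual <linux/kvm.h>
+ and <sys/ioctl.h> includes, vm_fd being a VM file descriptor, and two
+ example Intel architectural event encodings chosen purely for
+ illustration):
+
+   struct {
+           struct kvm_pmu_event_filter filter;
+           __u64 events[2];
+   } f = {};
+
+   f.filter.action  = KVM_PMU_EVENT_ALLOW;
+   f.filter.nevents = 2;
+   f.events[0] = 0x003c;   /* eventsel 0x3c, umask 0x00: unhalted core cycles */
+   f.events[1] = 0x00c0;   /* eventsel 0xc0, umask 0x00: instructions retired */
+
+   if (ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, &f.filter) < 0)
+           perror("KVM_SET_PMU_EVENT_FILTER");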
+
5. The kvm_run structure
------------------------
#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
#define KVM_X86_DISABLE_EXITS_HLT (1 << 1)
+ #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2)
+ #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3)
Enabling this capability on a VM provides userspace with a way to no
longer intercept some instructions for improved latency in some
cpus_have_const_cap(ARM64_HAS_IRQ_PRIO_MASKING);
}
+static inline bool system_has_prio_mask_debugging(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING) &&
+ system_uses_irq_prio_masking();
+}
+
+ #define ARM64_BP_HARDEN_UNKNOWN -1
+ #define ARM64_BP_HARDEN_WA_NEEDED 0
+ #define ARM64_BP_HARDEN_NOT_REQUIRED 1
+
+ int get_spectre_v2_workaround_state(void);
+
#define ARM64_SSBD_UNKNOWN -1
#define ARM64_SSBD_FORCE_DISABLE 0
#define ARM64_SSBD_KERNEL 1
#include <asm/arch_gicv3.h>
#include <asm/barrier.h>
#include <asm/cpufeature.h>
+ #include <asm/cputype.h>
#include <asm/daifflags.h>
#include <asm/fpsimd.h>
#include <asm/kvm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmio.h>
- #include <asm/smp_plat.h>
#include <asm/thread_info.h>
#define __KVM_HAVE_ARCH_INTC_INITIALIZED
DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data);
- static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt,
- int cpu)
+ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
{
/* The host's MPIDR is immutable, so let's set it up at boot time */
- cpu_ctxt->sys_regs[MPIDR_EL1] = cpu_logical_map(cpu);
+ cpu_ctxt->sys_regs[MPIDR_EL1] = read_cpuid_mpidr();
}
void __kvm_enable_ssbs(void);
* will not signal the CPU of interrupts of lower priority, and the
* only way to get out will be via guest exceptions.
* Naturally, we want to avoid this.
+ *
+ * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, so we just need a
+ * dsb to ensure that the redistributor forwards EL2 IRQs to the CPU.
*/
- if (system_uses_irq_prio_masking()) {
- gic_write_pmr(GIC_PRIO_IRQON);
+ if (system_uses_irq_prio_masking())
dsb(sy);
- }
}
static inline void kvm_arm_vhe_guest_exit(void)
isb();
}
- static inline bool kvm_arm_harden_branch_predictor(void)
+ #define KVM_BP_HARDEN_UNKNOWN -1
+ #define KVM_BP_HARDEN_WA_NEEDED 0
+ #define KVM_BP_HARDEN_NOT_REQUIRED 1
+
+ static inline int kvm_arm_harden_branch_predictor(void)
{
- return cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR);
+ switch (get_spectre_v2_workaround_state()) {
+ case ARM64_BP_HARDEN_WA_NEEDED:
+ return KVM_BP_HARDEN_WA_NEEDED;
+ case ARM64_BP_HARDEN_NOT_REQUIRED:
+ return KVM_BP_HARDEN_NOT_REQUIRED;
+ case ARM64_BP_HARDEN_UNKNOWN:
+ default:
+ return KVM_BP_HARDEN_UNKNOWN;
+ }
}
#define KVM_SSBD_UNKNOWN -1
#define SYS_APGAKEYLO_EL1 sys_reg(3, 0, 2, 3, 0)
#define SYS_APGAKEYHI_EL1 sys_reg(3, 0, 2, 3, 1)
+ #define SYS_SPSR_EL1 sys_reg(3, 0, 4, 0, 0)
+ #define SYS_ELR_EL1 sys_reg(3, 0, 4, 0, 1)
+
#define SYS_ICC_PMR_EL1 sys_reg(3, 0, 4, 6, 0)
#define SYS_AFSR0_EL1 sys_reg(3, 0, 5, 1, 0)
#define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1)
#define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2)
+ #define SYS_CNTV_CTL_EL0 sys_reg(3, 3, 14, 3, 1)
+ #define SYS_CNTV_CVAL_EL0 sys_reg(3, 3, 14, 3, 2)
+
#define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0)
#define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1)
#define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0)
#define __TYPER_CRm(n) (0xc | (((n) >> 3) & 0x3))
#define SYS_PMEVTYPERn_EL0(n) sys_reg(3, 3, 14, __TYPER_CRm(n), __PMEV_op2(n))
- #define SYS_PMCCFILTR_EL0 sys_reg (3, 3, 14, 15, 7)
+ #define SYS_PMCCFILTR_EL0 sys_reg(3, 3, 14, 15, 7)
#define SYS_ZCR_EL2 sys_reg(3, 4, 1, 2, 0)
-
#define SYS_DACR32_EL2 sys_reg(3, 4, 3, 0, 0)
+ #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0)
+ #define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1)
#define SYS_IFSR32_EL2 sys_reg(3, 4, 5, 0, 1)
+ #define SYS_ESR_EL2 sys_reg(3, 4, 5, 2, 0)
#define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3)
#define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0)
+ #define SYS_FAR_EL2 sys_reg(3, 4, 6, 0, 0)
#define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1)
#define __SYS__AP0Rx_EL2(x) sys_reg(3, 4, 12, 8, x)
#define SYS_ICH_LR15_EL2 __SYS__LR8_EL2(7)
/* VHE encodings for architectural EL0/1 system registers */
+ #define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0)
+ #define SYS_CPACR_EL12 sys_reg(3, 5, 1, 0, 2)
#define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0)
+ #define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0)
+ #define SYS_TTBR1_EL12 sys_reg(3, 5, 2, 0, 1)
+ #define SYS_TCR_EL12 sys_reg(3, 5, 2, 0, 2)
+ #define SYS_SPSR_EL12 sys_reg(3, 5, 4, 0, 0)
+ #define SYS_ELR_EL12 sys_reg(3, 5, 4, 0, 1)
+ #define SYS_AFSR0_EL12 sys_reg(3, 5, 5, 1, 0)
+ #define SYS_AFSR1_EL12 sys_reg(3, 5, 5, 1, 1)
+ #define SYS_ESR_EL12 sys_reg(3, 5, 5, 2, 0)
+ #define SYS_FAR_EL12 sys_reg(3, 5, 6, 0, 0)
+ #define SYS_MAIR_EL12 sys_reg(3, 5, 10, 2, 0)
+ #define SYS_AMAIR_EL12 sys_reg(3, 5, 10, 3, 0)
+ #define SYS_VBAR_EL12 sys_reg(3, 5, 12, 0, 0)
+ #define SYS_CONTEXTIDR_EL12 sys_reg(3, 5, 13, 0, 1)
+ #define SYS_CNTKCTL_EL12 sys_reg(3, 5, 14, 1, 0)
+ #define SYS_CNTP_TVAL_EL02 sys_reg(3, 5, 14, 2, 0)
+ #define SYS_CNTP_CTL_EL02 sys_reg(3, 5, 14, 2, 1)
+ #define SYS_CNTP_CVAL_EL02 sys_reg(3, 5, 14, 2, 2)
+ #define SYS_CNTV_TVAL_EL02 sys_reg(3, 5, 14, 3, 0)
+ #define SYS_CNTV_CTL_EL02 sys_reg(3, 5, 14, 3, 1)
+ #define SYS_CNTV_CVAL_EL02 sys_reg(3, 5, 14, 3, 2)
/* Common SCTLR_ELx flags. */
#define SCTLR_ELx_DSSBS (_BITUL(44))
/* id_aa64isar1 */
#define ID_AA64ISAR1_SB_SHIFT 36
+#define ID_AA64ISAR1_FRINTTS_SHIFT 32
#define ID_AA64ISAR1_GPI_SHIFT 28
#define ID_AA64ISAR1_GPA_SHIFT 24
#define ID_AA64ISAR1_LRCPC_SHIFT 20
printk(" %pS\n", (void *)where);
}
-static void __dump_instr(const char *lvl, struct pt_regs *regs)
+static void dump_kernel_instr(const char *lvl, struct pt_regs *regs)
{
unsigned long addr = instruction_pointer(regs);
char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str;
int i;
+ if (user_mode(regs))
+ return;
+
for (i = -4; i < 1; i++) {
unsigned int val, bad;
- bad = get_user(val, &((u32 *)addr)[i]);
+ bad = aarch64_insn_read(&((u32 *)addr)[i], &val);
if (!bad)
p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val);
break;
}
}
- printk("%sCode: %s\n", lvl, str);
-}
-static void dump_instr(const char *lvl, struct pt_regs *regs)
-{
- if (!user_mode(regs)) {
- mm_segment_t fs = get_fs();
- set_fs(KERNEL_DS);
- __dump_instr(lvl, regs);
- set_fs(fs);
- } else {
- __dump_instr(lvl, regs);
- }
+ printk("%sCode: %s\n", lvl, str);
}
void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
print_modules();
show_regs(regs);
- if (!user_mode(regs))
- dump_instr(KERN_EMERG, regs);
+ dump_kernel_instr(KERN_EMERG, regs);
return ret;
}
{
arm64_show_signal(signo, str);
if (signo == SIGKILL)
- force_sig(SIGKILL, current);
+ force_sig(SIGKILL);
else
- force_sig_fault(signo, code, addr, current);
+ force_sig_fault(signo, code, addr);
}
void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
const char *str)
{
arm64_show_signal(SIGBUS, str);
- force_sig_mceerr(code, addr, lsb, current);
+ force_sig_mceerr(code, addr, lsb);
}
void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr,
/*
* The CPU can't make progress. The exception may have
* been imprecise.
+ *
+ * Neoverse-N1 #1349291 means a non-KVM SError reported as
+ * Unrecoverable should be treated as Uncontainable. We
+ * call arm64_serror_panic() in both cases.
*/
return true;
if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
return true;
- far = read_sysreg_el2(far);
+ far = read_sysreg_el2(SYS_FAR);
/*
* The HPFAR can be invalid if the stage 2 fault did not
static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
- vcpu->arch.fault.esr_el2 = read_sysreg_el2(esr);
+ vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
/*
* We're using the raw exception code in order to only process
* Naturally, we want to avoid this.
*/
if (system_uses_irq_prio_masking()) {
- gic_write_pmr(GIC_PRIO_IRQON);
+ gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
dsb(sy);
}
asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va));
__hyp_do_panic(str_va,
- spsr, elr,
- read_sysreg(esr_el2), read_sysreg_el2(far),
+ spsr, elr,
+ read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR),
read_sysreg(hpfar_el2), par, vcpu);
}
panic(__hyp_panic_string,
spsr, elr,
- read_sysreg_el2(esr), read_sysreg_el2(far),
+ read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR),
read_sysreg(hpfar_el2), par, vcpu);
}
NOKPROBE_SYMBOL(__hyp_call_panic_vhe);
void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
{
- u64 spsr = read_sysreg_el2(spsr);
- u64 elr = read_sysreg_el2(elr);
+ u64 spsr = read_sysreg_el2(SYS_SPSR);
+ u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
if (!has_vhe())
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/seqlock.h>
+#include <linux/module.h>
#include <asm/debug.h>
#include <asm/cpu.h>
#include <asm/fpu/api.h>
unsigned short ibc;
};
+struct kvm_s390_module_hook {
+ int (*hook)(struct kvm_vcpu *vcpu);
+ struct module *owner;
+};
+
struct kvm_s390_crypto {
struct kvm_s390_crypto_cb *crycb;
+ struct kvm_s390_module_hook *pqap_hook;
__u32 crycbd;
__u8 aes_kw;
__u8 dea_kw;
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
static inline void kvm_arch_hardware_disable(void) {}
- static inline void kvm_arch_check_processor_compat(void *rtn) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
return 0;
}
+ int kvm_arch_check_processor_compat(void)
+ {
+ return 0;
+ }
+
static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
if (!kvm->arch.sca)
goto out_err;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
sca_offset += 16;
if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
sca_offset = 0;
kvm->arch.sca = (struct bsca_block *)
((char *) kvm->arch.sca + sca_offset);
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
sprintf(debug_name, "kvm-%u", current->pid);
set_kvm_facility(kvm->arch.model.fac_list, 147);
}
+ if (css_general_characteristics.aiv && test_facility(65))
+ set_kvm_facility(kvm->arch.model.fac_mask, 65);
+
kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
kvm->arch.model.ibc = sclp.ibc & 0x0fff;
#define X2APIC_BROADCAST 0xFFFFFFFFul
#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
+ #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
apic_test_vector(vector, apic->regs + APIC_IRR);
}
- static inline void apic_clear_vector(int vec, void *bitmap)
- {
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
- }
-
static inline int __apic_test_and_set_vector(int vec, void *bitmap)
{
return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
if (unlikely(vcpu->arch.apicv_active)) {
/* need to update RVI */
- apic_clear_vector(vec, apic->regs + APIC_IRR);
+ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
- apic_clear_vector(vec, apic->regs + APIC_IRR);
+ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
if (apic_search_irr(apic) != -1)
apic->irr_pending = true;
}
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
- kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
+ kvm_lapic_set_vector(vector,
+ apic->regs + APIC_TMR);
else
- apic_clear_vector(vector, apic->regs + APIC_TMR);
+ kvm_lapic_clear_vector(vector,
+ apic->regs + APIC_TMR);
}
if (vcpu->arch.apicv_active)
return container_of(dev, struct kvm_lapic, dev);
}
+ #define APIC_REG_MASK(reg) (1ull << ((reg) >> 4))
+ #define APIC_REGS_MASK(first, count) \
+ (APIC_REG_MASK(first) * ((1ull << (count)) - 1))
+
int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
void *data)
{
unsigned char alignment = offset & 0xf;
u32 result;
/* this bitmask has a bit cleared for each reserved register */
- static const u64 rmask = 0x43ff01ffffffe70cULL;
-
- if ((alignment + len) > 4) {
- apic_debug("KVM_APIC_READ: alignment error %x %d\n",
- offset, len);
- return 1;
- }
+ u64 valid_reg_mask =
+ APIC_REG_MASK(APIC_ID) |
+ APIC_REG_MASK(APIC_LVR) |
+ APIC_REG_MASK(APIC_TASKPRI) |
+ APIC_REG_MASK(APIC_PROCPRI) |
+ APIC_REG_MASK(APIC_LDR) |
+ APIC_REG_MASK(APIC_DFR) |
+ APIC_REG_MASK(APIC_SPIV) |
+ APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
+ APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
+ APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
+ APIC_REG_MASK(APIC_ESR) |
+ APIC_REG_MASK(APIC_ICR) |
+ APIC_REG_MASK(APIC_ICR2) |
+ APIC_REG_MASK(APIC_LVTT) |
+ APIC_REG_MASK(APIC_LVTTHMR) |
+ APIC_REG_MASK(APIC_LVTPC) |
+ APIC_REG_MASK(APIC_LVT0) |
+ APIC_REG_MASK(APIC_LVT1) |
+ APIC_REG_MASK(APIC_LVTERR) |
+ APIC_REG_MASK(APIC_TMICT) |
+ APIC_REG_MASK(APIC_TMCCT) |
+ APIC_REG_MASK(APIC_TDCR);
+
+ /* ARBPRI is not valid on x2APIC */
+ if (!apic_x2apic_mode(apic))
+ valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
- if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+ if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) {
apic_debug("KVM_APIC_READ: read reserved register %x\n",
offset);
return 1;
}
}
- void wait_lapic_expire(struct kvm_vcpu *vcpu)
+ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
+ s64 advance_expire_delta)
{
struct kvm_lapic *apic = vcpu->arch.apic;
u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
- u64 guest_tsc, tsc_deadline, ns;
+ u64 ns;
+
+ /* too early */
+ if (advance_expire_delta < 0) {
+ ns = -advance_expire_delta * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ timer_advance_ns -= min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ } else {
+ /* too late */
+ ns = advance_expire_delta * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ timer_advance_ns += min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ }
+
+ if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ if (unlikely(timer_advance_ns > 5000)) {
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+ apic->lapic_timer.timer_advance_adjust_done = false;
+ }
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+ }
+
+ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+ {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
if (apic->lapic_timer.expired_tscdeadline == 0)
return;
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+ apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
- if (!apic->lapic_timer.timer_advance_adjust_done) {
- /* too early */
- if (guest_tsc < tsc_deadline) {
- ns = (tsc_deadline - guest_tsc) * 1000000ULL;
- do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns -= min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
- } else {
- /* too late */
- ns = (guest_tsc - tsc_deadline) * 1000000ULL;
- do_div(ns, vcpu->arch.virtual_tsc_khz);
- timer_advance_ns += min((u32)ns,
- timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
- }
- if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- apic->lapic_timer.timer_advance_adjust_done = true;
- if (unlikely(timer_advance_ns > 5000)) {
- timer_advance_ns = 0;
- apic->lapic_timer.timer_advance_adjust_done = true;
- }
- apic->lapic_timer.timer_advance_ns = timer_advance_ns;
- }
+ if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+ adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
}
+ EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
static void start_sw_tscdeadline(struct kvm_lapic *apic)
{
apic_debug("%s: offset 0x%x with length 0x%x, and value is "
"0x%x\n", __func__, offset, len, val);
- kvm_lapic_reg_write(apic, offset & 0xff0, val);
+ kvm_lapic_reg_write(apic, offset, val);
return 0;
}
HRTIMER_MODE_ABS_PINNED);
apic->lapic_timer.timer.function = apic_timer_fn;
if (timer_advance_ns == -1) {
- apic->lapic_timer.timer_advance_ns = 1000;
+ apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
apic->lapic_timer.timer_advance_adjust_done = false;
} else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
- * thinking that APIC satet has changed.
+ * thinking that APIC state has changed.
*/
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
return 0;
nomem_free_apic:
kfree(apic);
+ vcpu->arch.apic = NULL;
nomem:
return -ENOMEM;
}
struct kvm_lapic *apic = vcpu->arch.apic;
u32 ppr;
- if (!apic_enabled(apic))
+ if (!kvm_apic_hw_enabled(apic))
return -1;
__apic_update_ppr(apic, &ppr);
#include <trace/events/kvm.h>
- #define CREATE_TRACE_POINTS
- #include "mmutrace.h"
-
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
*/
static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+ /*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+ static u8 __read_mostly shadow_phys_bits;
static void mmu_spte_set(u64 *sptep, u64 spte);
+ static bool is_executable_pte(u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
+ #define CREATE_TRACE_POINTS
+ #include "mmutrace.h"
+
static inline bool kvm_available_flush_tlb_with_range(void)
{
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+ static u8 kvm_get_shadow_phys_bits(void)
+ {
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
+ * in CPU detection code, but MKTME treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore for MKTME
+ * we should still return physical address bits reported by CPUID.
+ */
+ if (!boot_cpu_has(X86_FEATURE_TME) ||
+ WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
+ return boot_cpu_data.x86_phys_bits;
+
+ return cpuid_eax(0x80000008) & 0xff;
+ }
+
static void kvm_mmu_reset_all_pte_masks(void)
{
u8 low_phys_bits;
shadow_present_mask = 0;
shadow_acc_track_mask = 0;
+ shadow_phys_bits = kvm_get_shadow_phys_bits();
+
/*
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is
/*
* The idea using the light way get the spte on x86_32 guest is from
- * gup_get_pte(arch/x86/mm/gup.c).
+ * gup_get_pte (mm/gup.c).
*
* An spte tlb flush may be pending, because kvm_set_pte_rmapp
* coalesces them and we are running out of the MMU lock. Therefore
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
- if (sp->role.direct)
- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
- else
+ if (!sp->role.direct) {
sp->gfns[index] = gfn;
+ return;
+ }
+
+ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+ pr_err_ratelimited("gfn mismatch under direct page %llx "
+ "(expected %llx, got %llx)\n",
+ sp->gfn,
+ kvm_mmu_page_get_gfn(sp, index), gfn);
}
/*
ret = RET_PF_EMULATE;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
- is_large_pte(*sptep)? "2MB" : "4kB",
- *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
- *sptep, sptep);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
}
}
- kvm_release_pfn_clean(pfn);
-
return ret;
}
if (ret <= 0)
return -1;
- for (i = 0; i < ret; i++, gfn++, start++)
+ for (i = 0; i < ret; i++, gfn++, start++) {
mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
+ put_page(pages[i]);
+ }
return 0;
}
__direct_pte_prefetch(vcpu, sp, sptep);
}
- static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
- int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+ int map_writable, int level, kvm_pfn_t pfn,
+ bool prefault)
{
- struct kvm_shadow_walk_iterator iterator;
+ struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int emulate = 0;
- gfn_t pseudo_gfn;
+ int ret;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ gfn_t base_gfn = gfn;
if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return 0;
+ return RET_PF_RETRY;
- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
- if (iterator.level == level) {
- emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
- write, level, gfn, pfn, prefault,
- map_writable);
- direct_pte_prefetch(vcpu, iterator.sptep);
- ++vcpu->stat.pf_fixed;
+ trace_kvm_mmu_spte_requested(gpa, level, pfn);
+ for_each_shadow_entry(vcpu, gpa, it) {
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == level)
break;
- }
- drop_large_spte(vcpu, iterator.sptep);
- if (!is_shadow_present_pte(*iterator.sptep)) {
- u64 base_addr = iterator.addr;
+ drop_large_spte(vcpu, it.sptep);
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+ it.level - 1, true, ACC_ALL);
- base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
- pseudo_gfn = base_addr >> PAGE_SHIFT;
- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1, 1, ACC_ALL);
-
- link_shadow_page(vcpu, iterator.sptep, sp);
+ link_shadow_page(vcpu, it.sptep, sp);
}
}
- return emulate;
+
+ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+ write, level, base_gfn, pfn, prefault,
+ map_writable);
+ direct_pte_prefetch(vcpu, it.sptep);
+ ++vcpu->stat.pf_fixed;
+ return ret;
}
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t *gfnp, kvm_pfn_t *pfnp,
+ gfn_t gfn, kvm_pfn_t *pfnp,
int *levelp)
{
kvm_pfn_t pfn = *pfnp;
- gfn_t gfn = *gfnp;
int level = *levelp;
/*
mask = KVM_PAGES_PER_HPAGE(level) - 1;
VM_BUG_ON((gfn & mask) != (pfn & mask));
if (pfn & mask) {
- gfn &= ~mask;
- *gfnp = gfn;
kvm_release_pfn_clean(pfn);
pfn &= ~mask;
kvm_get_pfn(pfn);
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
return r;
+ r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
if (make_mmu_pages_available(vcpu) < 0)
goto out_unlock;
if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return RET_PF_RETRY;
+ return r;
}
static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
- bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
- {
- if (unlikely(!lapic_in_kernel(vcpu) ||
- kvm_event_needs_reinjection(vcpu) ||
- vcpu->arch.exception.pending))
- return false;
-
- if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
- return false;
-
- return kvm_x86_ops->interrupt_allowed(vcpu);
- }
-
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
{
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
+ r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
if (make_mmu_pages_available(vcpu) < 0)
goto out_unlock;
if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return RET_PF_RETRY;
+ return r;
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
*/
shadow_zero_check = &context->shadow_zero_check;
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- boot_cpu_data.x86_phys_bits,
+ shadow_phys_bits,
context->shadow_root_level, uses_nx,
guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu), true);
if (boot_cpu_is_amd())
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- boot_cpu_data.x86_phys_bits,
+ shadow_phys_bits,
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
true, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
- boot_cpu_data.x86_phys_bits,
+ shadow_phys_bits,
false);
if (!shadow_me_mask)
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- boot_cpu_data.x86_phys_bits, execonly);
+ shadow_phys_bits, execonly);
}
#define BYTE_MASK(access) \
int nr_to_scan = sc->nr_to_scan;
unsigned long freed = 0;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
break;
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
return freed;
}
kmem_cache_destroy(mmu_page_header_cache);
}
+ static void kvm_set_mmio_spte_mask(void)
+ {
+ u64 mask;
+
+ /*
+ * Set the reserved bits and the present bit of a paging-structure
+ * entry to generate page fault with PFER.RSV = 1.
+ */
+
+ /*
+ * Mask the uppermost physical address bit, which would be reserved as
+ * long as the supported physical address width is less than 52.
+ */
+ mask = 1ull << 51;
+
+ /* Set the present bit. */
+ mask |= 1ull;
+
+ /*
+ * If reserved bit is not supported, clear the present bit to disable
+ * mmio page fault.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
+ mask &= ~1ull;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
+ }
+
int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;
kvm_mmu_reset_all_pte_masks();
+ kvm_set_mmio_spte_mask();
+
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
#include "lapic.h"
#include "pmu.h"
+ /* This keeps the total size of the filter under 4k. */
+ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 63
+
/* NOTE:
* - Each perf counter is defined as "struct kvm_pmc";
* - There are two types of perf counters: general purpose (gp) and fixed.
{
unsigned config, type = PERF_TYPE_RAW;
u8 event_select, unit_mask;
+ struct kvm *kvm = pmc->vcpu->kvm;
+ struct kvm_pmu_event_filter *filter;
+ int i;
+ bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
printk_once("kvm pmu: pin control bit is ignored\n");
if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
return;
+ filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
+ if (filter) {
+ for (i = 0; i < filter->nevents; i++)
+ if (filter->events[i] ==
+ (eventsel & AMD64_RAW_EVENT_MASK_NB))
+ break;
+ if (filter->action == KVM_PMU_EVENT_ALLOW &&
+ i == filter->nevents)
+ allow_event = false;
+ if (filter->action == KVM_PMU_EVENT_DENY &&
+ i < filter->nevents)
+ allow_event = false;
+ }
+ if (!allow_event)
+ return;
+
event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
ctr_val = rdtsc();
break;
case VMWARE_BACKDOOR_PMC_REAL_TIME:
- ctr_val = ktime_get_boot_ns();
+ ctr_val = ktime_get_boottime_ns();
break;
case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
- ctr_val = ktime_get_boot_ns() +
+ ctr_val = ktime_get_boottime_ns() +
vcpu->kvm->arch.kvmclock_offset;
break;
default:
{
kvm_pmu_reset(vcpu);
}
+
+ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
+ {
+ struct kvm_pmu_event_filter tmp, *filter;
+ size_t size;
+ int r;
+
+ if (copy_from_user(&tmp, argp, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.action != KVM_PMU_EVENT_ALLOW &&
+ tmp.action != KVM_PMU_EVENT_DENY)
+ return -EINVAL;
+
+ if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
+ return -E2BIG;
+
+ size = struct_size(filter, events, tmp.nevents);
+ filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!filter)
+ return -ENOMEM;
+
+ r = -EFAULT;
+ if (copy_from_user(filter, argp, size))
+ goto cleanup;
+
+ /* Ensure nevents can't be changed between the user copies. */
+ *filter = tmp;
+
+ mutex_lock(&kvm->lock);
+ rcu_swap_protected(kvm->arch.pmu_event_filter, filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
+
+ synchronize_srcu_expedited(&kvm->srcu);
+ r = 0;
+ cleanup:
+ kfree(filter);
+ return r;
+ }
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
- static u16 shadow_read_only_fields[] = {
- #define SHADOW_FIELD_RO(x) x,
+ struct shadow_vmcs_field {
+ u16 encoding;
+ u16 offset;
+ };
+ static struct shadow_vmcs_field shadow_read_only_fields[] = {
+ #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
ARRAY_SIZE(shadow_read_only_fields);
- static u16 shadow_read_write_fields[] = {
- #define SHADOW_FIELD_RW(x) x,
+ static struct shadow_vmcs_field shadow_read_write_fields[] = {
+ #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
for (i = j = 0; i < max_shadow_read_only_fields; i++) {
- u16 field = shadow_read_only_fields[i];
+ struct shadow_vmcs_field entry = shadow_read_only_fields[i];
+ u16 field = entry.encoding;
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
(i + 1 == max_shadow_read_only_fields ||
- shadow_read_only_fields[i + 1] != field + 1))
+ shadow_read_only_fields[i + 1].encoding != field + 1))
pr_err("Missing field from shadow_read_only_field %x\n",
field + 1);
clear_bit(field, vmx_vmread_bitmap);
- #ifdef CONFIG_X86_64
if (field & 1)
+ #ifdef CONFIG_X86_64
continue;
+ #else
+ entry.offset += sizeof(u32);
#endif
- if (j < i)
- shadow_read_only_fields[j] = field;
- j++;
+ shadow_read_only_fields[j++] = entry;
}
max_shadow_read_only_fields = j;
for (i = j = 0; i < max_shadow_read_write_fields; i++) {
- u16 field = shadow_read_write_fields[i];
+ struct shadow_vmcs_field entry = shadow_read_write_fields[i];
+ u16 field = entry.encoding;
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
(i + 1 == max_shadow_read_write_fields ||
- shadow_read_write_fields[i + 1] != field + 1))
+ shadow_read_write_fields[i + 1].encoding != field + 1))
pr_err("Missing field from shadow_read_write_field %x\n",
field + 1);
+ WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
+ field <= GUEST_TR_AR_BYTES,
+ "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
+
/*
* PML and the preemption timer can be emulated, but the
* processor cannot vmwrite to fields that don't exist
clear_bit(field, vmx_vmwrite_bitmap);
clear_bit(field, vmx_vmread_bitmap);
- #ifdef CONFIG_X86_64
if (field & 1)
+ #ifdef CONFIG_X86_64
continue;
+ #else
+ entry.offset += sizeof(u32);
#endif
- if (j < i)
- shadow_read_write_fields[j] = field;
- j++;
+ shadow_read_write_fields[j++] = entry;
}
max_shadow_read_write_fields = j;
}
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
free_loaded_vmcs(&vmx->nested.vmcs02);
}
+ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
+ struct loaded_vmcs *prev)
+ {
+ struct vmcs_host_state *dest, *src;
+
+ if (unlikely(!vmx->guest_state_loaded))
+ return;
+
+ src = &prev->host_state;
+ dest = &vmx->loaded_vmcs->host_state;
+
+ vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ dest->ldt_sel = src->ldt_sel;
+ #ifdef CONFIG_X86_64
+ dest->ds_sel = src->ds_sel;
+ dest->es_sel = src->es_sel;
+ #endif
+ }
+
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *prev;
int cpu;
if (vmx->loaded_vmcs == vmcs)
return;
cpu = get_cpu();
- vmx_vcpu_put(vcpu);
+ prev = vmx->loaded_vmcs;
vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load(vcpu, cpu);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
+ vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vm_entry_controls_reset_shadow(vmx);
- vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
}
* If PAE paging and EPT are both on, CR3 is not used by the CPU and
* must not be dereferenced.
*/
- if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
- !nested_ept) {
+ if (is_pae_paging(vcpu) && !nested_ept) {
if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
*entry_failure_code = ENTRY_FAIL_PDPTE;
return -EINVAL;
vmx->nested.msrs.misc_low = data;
vmx->nested.msrs.misc_high = data >> 32;
- /*
- * If L1 has read-only VM-exit information fields, use the
- * less permissive vmx_vmwrite_bitmap to specify write
- * permissions for the shadow VMCS.
- */
- if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
- vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
-
return 0;
}
case MSR_IA32_VMX_VMCS_ENUM:
vmx->nested.msrs.vmcs_enum = data;
return 0;
+ case MSR_IA32_VMX_VMFUNC:
+ if (data & ~vmx->nested.msrs.vmfunc_controls)
+ return -EINVAL;
+ vmx->nested.msrs.vmfunc_controls = data;
+ return 0;
default:
/*
* The rest of the VMX capability MSRs do not support restore.
}
/*
- * Copy the writable VMCS shadow fields back to the VMCS12, in case
- * they have been modified by the L1 guest. Note that the "read-only"
- * VM-exit information fields are actually writable if the vCPU is
- * configured to support "VMWRITE to any supported field in the VMCS."
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
+ * been modified by the L1 guest. Note, "writable" in this context means
+ * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
+ * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
+ * VM-exit information fields (which are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS").
*/
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
- const u16 *fields[] = {
- shadow_read_write_fields,
- shadow_read_only_fields
- };
- const int max_fields[] = {
- max_shadow_read_write_fields,
- max_shadow_read_only_fields
- };
- int i, q;
- unsigned long field;
- u64 field_value;
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+ struct shadow_vmcs_field field;
+ unsigned long val;
+ int i;
preempt_disable();
vmcs_load(shadow_vmcs);
- for (q = 0; q < ARRAY_SIZE(fields); q++) {
- for (i = 0; i < max_fields[q]; i++) {
- field = fields[q][i];
- field_value = __vmcs_readl(field);
- vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
- }
- /*
- * Skip the VM-exit information fields if they are read-only.
- */
- if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
- break;
+ for (i = 0; i < max_shadow_read_write_fields; i++) {
+ field = shadow_read_write_fields[i];
+ val = __vmcs_readl(field.encoding);
+ vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
}
vmcs_clear(shadow_vmcs);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
- const u16 *fields[] = {
+ const struct shadow_vmcs_field *fields[] = {
shadow_read_write_fields,
shadow_read_only_fields
};
max_shadow_read_write_fields,
max_shadow_read_only_fields
};
- int i, q;
- unsigned long field;
- u64 field_value = 0;
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+ struct shadow_vmcs_field field;
+ unsigned long val;
+ int i, q;
vmcs_load(shadow_vmcs);
for (q = 0; q < ARRAY_SIZE(fields); q++) {
for (i = 0; i < max_fields[q]; i++) {
field = fields[q][i];
- vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
- __vmcs_writel(field, field_value);
+ val = vmcs12_read_any(vmcs12, field.encoding,
+ field.offset);
+ __vmcs_writel(field.encoding, val);
}
}
* evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
* evmcs->host_idtr_base = vmcs12->host_idtr_base;
* evmcs->host_rsp = vmcs12->host_rsp;
- * sync_vmcs12() doesn't read these:
+ * sync_vmcs02_to_vmcs12() doesn't read these:
* evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
* evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
* evmcs->msr_bitmap = vmcs12->msr_bitmap;
bool from_launch)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct hv_vp_assist_page assist_page;
+ bool evmcs_gpa_changed = false;
+ u64 evmcs_gpa;
if (likely(!vmx->nested.enlightened_vmcs_enabled))
return 1;
- if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
- return 1;
-
- if (unlikely(!assist_page.enlighten_vmentry))
+ if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
return 1;
- if (unlikely(assist_page.current_nested_vmcs !=
- vmx->nested.hv_evmcs_vmptr)) {
-
+ if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
if (!vmx->nested.hv_evmcs)
vmx->nested.current_vmptr = -1ull;
nested_release_evmcs(vcpu);
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
&vmx->nested.hv_evmcs_map))
return 0;
}
vmx->nested.dirty_vmcs12 = true;
- /*
- * As we keep L2 state for one guest only 'hv_clean_fields' mask
- * can't be used when we switch between them. Reset it here for
- * simplicity.
- */
- vmx->nested.hv_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
+ vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
+ evmcs_gpa_changed = true;
/*
* Unlike normal vmcs12, enlightened vmcs12 is not fully
* reloaded from guest's memory (read only fields, fields not
}
}
+
+ /*
+ * Clean fields data can't be used on VMLAUNCH and when we switch
+ * between different L2 guests as KVM keeps a single VMCS12 per L1.
+ */
+ if (from_launch || evmcs_gpa_changed)
+ vmx->nested.hv_evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
return 1;
}
- void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu)
+ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
copy_vmcs12_to_shadow(vmx);
}
- vmx->nested.need_vmcs12_sync = false;
+ vmx->nested.need_vmcs12_to_shadow_sync = false;
}
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
- if (enable_pml)
+ /*
+ * The PML address never changes, so it is constant in vmcs02.
+ * Conceptually we want to copy the PML index from vmcs01 here,
+ * and then back to vmcs01 on nested vmexit. But since we flush
+ * the log and reset GUEST_PML_INDEX on each vmexit, the PML
+ * index is also effectively constant in vmcs02.
+ */
+ if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
/*
* Set the MSR load/store lists to match L0's settings. Only the
vmx_set_constant_host_state(vmx);
}
- static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
+ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
struct vmcs12 *vmcs12)
{
prepare_vmcs02_constant_state(vmx);
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
- prepare_vmcs02_early_full(vmx, vmcs12);
+ prepare_vmcs02_early_rare(vmx, vmcs12);
/*
* PIN CONTROLS
*/
- exec_control = vmcs12->pin_based_vm_exec_control;
-
- /* Preemption timer setting is computed directly in vmx_vcpu_run. */
- exec_control |= vmcs_config.pin_based_exec_ctrl;
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
- vmx->loaded_vmcs->hv_timer_armed = false;
+ exec_control = vmx_pin_based_exec_ctrl(vmx);
+ exec_control |= (vmcs12->pin_based_vm_exec_control &
+ ~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
} else {
exec_control &= ~PIN_BASED_POSTED_INTR;
}
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
+ pin_controls_set(vmx, exec_control);
/*
* EXEC CONTROLS
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
- /*
- * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
- * nested_get_vmcs12_pages can't fix it up, the illegal value
- * will result in a VM entry failure.
- */
- if (exec_control & CPU_BASED_TPR_SHADOW) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ if (exec_control & CPU_BASED_TPR_SHADOW)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
- } else {
#ifdef CONFIG_X86_64
+ else
exec_control |= CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING;
#endif
- }
/*
* A vmexit (to either L1 hypervisor or L0 userspace) is always needed
* for I/O port accesses.
*/
- exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
exec_control |= CPU_BASED_UNCOND_IO_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+ exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+
+ /*
+ * This bit will be computed in nested_get_vmcs12_pages, because
+ * we do not have access to L1's MSR bitmap yet. For now, keep
+ * the same bit as before, hoping to avoid multiple VMWRITEs that
+ * only set/clear this bit.
+ */
+ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+ exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
+
+ exec_controls_set(vmx, exec_control);
/*
* SECONDARY EXEC CONTROLS
/* VMCS shadowing for L2 is emulated for now */
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
- vmcs_write16(GUEST_INTR_STATUS,
- vmcs12->guest_intr_status);
-
/*
- * Write an illegal value to APIC_ACCESS_ADDR. Later,
- * nested_get_vmcs12_pages will either fix it up or
- * remove the VM execution control.
+ * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
+ * will not have to rewrite the controls just for this bit.
*/
- if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
- vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+ if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
+ (vmcs12->guest_cr4 & X86_CR4_UMIP))
+ exec_control |= SECONDARY_EXEC_DESC;
- if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ vmcs_write16(GUEST_INTR_STATUS,
+ vmcs12->guest_intr_status);
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ secondary_exec_controls_set(vmx, exec_control);
}
/*
if (guest_efer != host_efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
- vm_entry_controls_init(vmx, exec_control);
+ vm_entry_controls_set(vmx, exec_control);
/*
* EXIT CONTROLS
exec_control = vmx_vmexit_ctrl();
if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
- vm_exit_controls_init(vmx, exec_control);
-
- /*
- * Conceptually we want to copy the PML address and index from
- * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
- * since we always flush the log on each vmexit and never change
- * the PML address (once set), this happens to be equivalent to
- * simply resetting the index in vmcs02.
- */
- if (enable_pml)
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vm_exit_controls_set(vmx, exec_control);
/*
* Interrupt/Exception Fields
}
}
- static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}
+
+ if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
}
if (nested_cpu_has_xsaves(vmcs12))
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
set_cr4_guest_host_mask(vmx);
-
- if (kvm_mpx_supported()) {
- if (vmx->nested.nested_run_pending &&
- (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
- vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
- else
- vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
- }
}
/*
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+ bool load_guest_pdptrs_vmcs12 = false;
- if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
- prepare_vmcs02_full(vmx, vmcs12);
+ if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
+ prepare_vmcs02_rare(vmx, vmcs12);
vmx->nested.dirty_vmcs12 = false;
- }
- /*
- * First, the fields that are shadowed. This must be kept in sync
- * with vmcs_shadow_fields.h.
- */
- if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
- vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
- vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+ load_guest_pdptrs_vmcs12 = !hv_evmcs ||
+ !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
if (vmx->nested.nested_run_pending &&
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
}
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
entry_failure_code))
return -EINVAL;
+ /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
+ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
+ is_pae_paging(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
!kvm_pat_valid(vmcs12->host_ia32_pat))
return -EINVAL;
+ ia32e = (vmcs12->vm_exit_controls &
+ VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+
+ if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_cs_selector == 0 ||
+ vmcs12->host_tr_selector == 0 ||
+ (vmcs12->host_ss_selector == 0 && !ia32e))
+ return -EINVAL;
+
+ #ifdef CONFIG_X86_64
+ if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) ||
+ is_noncanonical_address(vmcs12->host_gs_base, vcpu) ||
+ is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) ||
+ is_noncanonical_address(vmcs12->host_idtr_base, vcpu) ||
+ is_noncanonical_address(vmcs12->host_tr_base, vcpu))
+ return -EINVAL;
+ #endif
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
* the host address-space size VM-exit control.
*/
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
- ia32e = (vmcs12->vm_exit_controls &
- VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
[launched]"i"(offsetof(struct loaded_vmcs, launched)),
[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
[wordsize]"i"(sizeof(ulong))
- : "cc", "memory"
+ : "memory"
);
if (vmx->msr_autoload.host.nr)
hpa = page_to_phys(vmx->nested.apic_access_page);
vmcs_write64(APIC_ACCESS_ADDR, hpa);
} else {
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ secondary_exec_controls_clearbit(vmx,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
map = &vmx->nested.virtual_apic_map;
- /*
- * If translation failed, VM entry will fail because
- * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
- */
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
* _not_ what the processor does but it's basically the
* only possibility we have.
*/
- vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_TPR_SHADOW);
+ exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
} else {
- printk("bad virtual-APIC page address\n");
- dump_vmcs();
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
+ * force VM-Entry to fail.
+ */
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
}
}
}
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
- vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_USE_MSR_BITMAPS);
+ exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
else
- vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_USE_MSR_BITMAPS);
+ exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
}
/*
u32 exit_reason = EXIT_REASON_INVALID_STATE;
u32 exit_qual;
- evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+ /*
+ * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
+ * nested early checks are disabled. In the event of a "late" VM-Fail,
+ * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
+ * software model to the pre-VMEntry host state. When EPT is disabled,
+ * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
+ * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
+ * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
+ * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
+ * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
+ * guaranteed to be overwritten with a shadow CR3 prior to re-entering
+ * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
+ * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
+ * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
+ * path would need to manually save/restore vmcs01.GUEST_CR3.
+ */
+ if (!enable_ept && !nested_early_check)
+ vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
prepare_vmcs02_early(vmx, vmcs12);
vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
vmcs12->exit_qualification = exit_qual;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
- vmx->nested.need_vmcs12_sync = true;
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
return 1;
}
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
+ if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
return 1;
if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}
- /*
- * Update the guest state fields of vmcs12 to reflect changes that
- * occurred while L2 was running. (The "IA-32e mode guest" bit of the
- * VM-entry controls is also updated, since this is really a guest
- * state bit.)
- */
- static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
- {
- vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
- vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
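+ /*
+  * Fields that are only synced from vmcs02 to vmcs12 on demand, via
+  * sync_vmcs02_to_vmcs12_rare().
+  */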
+ static bool is_vmcs12_ext_field(unsigned long field)
+ {
+ switch (field) {
+ case GUEST_ES_SELECTOR:
+ case GUEST_CS_SELECTOR:
+ case GUEST_SS_SELECTOR:
+ case GUEST_DS_SELECTOR:
+ case GUEST_FS_SELECTOR:
+ case GUEST_GS_SELECTOR:
+ case GUEST_LDTR_SELECTOR:
+ case GUEST_TR_SELECTOR:
+ case GUEST_ES_LIMIT:
+ case GUEST_CS_LIMIT:
+ case GUEST_SS_LIMIT:
+ case GUEST_DS_LIMIT:
+ case GUEST_FS_LIMIT:
+ case GUEST_GS_LIMIT:
+ case GUEST_LDTR_LIMIT:
+ case GUEST_TR_LIMIT:
+ case GUEST_GDTR_LIMIT:
+ case GUEST_IDTR_LIMIT:
+ case GUEST_ES_AR_BYTES:
+ case GUEST_DS_AR_BYTES:
+ case GUEST_FS_AR_BYTES:
+ case GUEST_GS_AR_BYTES:
+ case GUEST_LDTR_AR_BYTES:
+ case GUEST_TR_AR_BYTES:
+ case GUEST_ES_BASE:
+ case GUEST_CS_BASE:
+ case GUEST_SS_BASE:
+ case GUEST_DS_BASE:
+ case GUEST_FS_BASE:
+ case GUEST_GS_BASE:
+ case GUEST_LDTR_BASE:
+ case GUEST_TR_BASE:
+ case GUEST_GDTR_BASE:
+ case GUEST_IDTR_BASE:
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ case GUEST_BNDCFGS:
+ return true;
+ default:
+ break;
+ }
- vmcs12->guest_rsp = kvm_rsp_read(vcpu);
- vmcs12->guest_rip = kvm_rip_read(vcpu);
- vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+ return false;
+ }
+
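+ /*
+  * Read the rarely-used guest state fields from the currently loaded
+  * vmcs02 into vmcs12 and clear the pending-sync flag.
+  */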
+ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
- vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
- vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ if (kvm_mpx_supported())
+ vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
+ }
+
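+ /*
+  * If a sync is pending, temporarily switch to vmcs02 so the rare
+  * fields can be read, then switch back to vmcs01.
+  */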
+ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
+ return;
+
+ WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = &vmx->nested.vmcs02;
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+ put_cpu();
+ }
+
+ /*
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
+ */
+ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vmx->nested.hv_evmcs)
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
+
+ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+ vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+ vmcs12->guest_rsp = kvm_rsp_read(vcpu);
+ vmcs12->guest_rip = kvm_rip_read(vcpu);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+
+ vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+ vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
vmcs12->guest_interruptibility_info =
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- vmcs12->guest_pending_dbg_exceptions =
- vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
else
*/
if (enable_ept) {
vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
- vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
- vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
- vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
- vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+ vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
}
vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
- if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
- vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- }
- /* TODO: These cannot have changed unless we have MSR bitmaps and
- * the relevant bit asks not to trap the change */
- if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
- vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
vmcs12->guest_ia32_efer = vcpu->arch.efer;
- vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
- vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
- vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
- if (kvm_mpx_supported())
- vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
}
/*
u32 exit_reason, u32 exit_intr_info,
unsigned long exit_qualification)
{
- /* update guest state fields: */
- sync_vmcs12(vcpu, vmcs12);
-
/* update exit information fields: */
-
vmcs12->vm_exit_reason = exit_reason;
vmcs12->exit_qualification = exit_qualification;
vmcs12->vm_exit_intr_info = exit_intr_info;
vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
nested_ept_uninit_mmu_context(vcpu);
-
- /*
- * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3
- * points to shadow pages! Fortunately we only get here after a WARN_ON
- * if EPT is disabled, so a VMabort is perfectly fine.
- */
- if (enable_ept) {
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
- } else {
- nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED);
- }
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
/*
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
* VMFail, like everything else we just need to ensure our
* software model is up-to-date.
*/
- ept_save_pdptrs(vcpu);
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
kvm_mmu_reset_context(vcpu);
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
if (likely(!vmx->fail)) {
- if (exit_reason == -1)
- sync_vmcs12(vcpu, vmcs12);
- else
+ sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+
+ if (exit_reason != -1)
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
/*
- * Must happen outside of sync_vmcs12() as it will
+ * Must happen outside of sync_vmcs02_to_vmcs12() as it will
* also be used to capture vmcs12 cache as part of
* capturing nVMX state for snapshot (migration).
*
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
- vmx->nested.need_vmcs12_sync = true;
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
* #UD or #GP.
*/
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
- u32 vmx_instruction_info, bool wr, gva_t *ret)
+ u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
{
gva_t off;
bool exn;
*/
if (!(s.base == 0 && s.limit == 0xffffffff &&
((s.type & 8) || !(s.type & 4))))
- exn = exn || (off + sizeof(u64) > s.limit);
+ exn = exn || ((u64)off + len - 1 > s.limit);
}
if (exn) {
kvm_queue_exception_e(vcpu,
struct x86_exception e;
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
+ vmcs_read32(VMX_INSTRUCTION_INFO), false,
+ sizeof(*vmpointer), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
if (vmx->nested.current_vmptr == -1ull)
return;
+ copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+
if (enable_shadow_vmcs) {
/* copy to memory all shadowed fields in case
they were modified */
copy_shadow_to_vmcs12(vmx);
- vmx->nested.need_vmcs12_sync = false;
+ vmx->nested.need_vmcs12_to_shadow_sync = false;
vmx_disable_shadow_vmcs(vmx);
}
vmx->nested.posted_intr_nv = -1;
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 zero = 0;
gpa_t vmptr;
+ u64 evmcs_gpa;
if (!nested_vmx_check_permission(vcpu))
return 1;
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
- if (vmx->nested.hv_evmcs_map.hva) {
- if (vmptr == vmx->nested.hv_evmcs_vmptr)
- nested_release_evmcs(vcpu);
- } else {
+ /*
+ * When Enlightened VMEntry is enabled on the calling CPU we treat
+	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
+ * way to distinguish it from VMCS12) and we must not corrupt it by
+ * writing to the non-existent 'launch_state' field. The area doesn't
+ * have to be the currently active EVMCS on the calling CPU and there's
+ * nothing KVM has to do to transition it from 'active' to 'non-active'
+ * state. It is possible that the area will stay mapped as
+ * vmx->nested.hv_evmcs but this shouldn't be a problem.
+ */
+ if (likely(!vmx->nested.enlightened_vmcs_enabled ||
+ !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
u64 field_value;
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ int len;
gva_t gva = 0;
struct vmcs12 *vmcs12;
+ short offset;
if (!nested_vmx_check_permission(vcpu))
return 1;
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
- /* Read the field, zero-extended to a u64 field_value */
- if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+
+ offset = vmcs_field_to_offset(field);
+ if (offset < 0)
return nested_vmx_failValid(vcpu,
VMXERR_UNSUPPORTED_VMCS_COMPONENT);
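+	/*
+	 * Rarely-synced fields may be stale in vmcs12 while L1 is running;
+	 * refresh them from vmcs02 before reading.
+	 */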
+ if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ /* Read the field, zero-extended to a u64 field_value */
+ field_value = vmcs12_read_any(vmcs12, field, offset);
+
/*
* Now copy part of this value to register or memory, as requested.
* Note that the number of bits actually copied is 32 or 64 depending
kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
field_value);
} else {
+ len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, true, &gva))
+ vmx_instruction_info, true, len, &gva))
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
- kvm_write_guest_virt_system(vcpu, gva, &field_value,
- (is_long_mode(vcpu) ? 8 : 4), NULL);
+ kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
}
return nested_vmx_succeed(vcpu);
}
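+ /*
+  * Helpers to test whether a field is in the read/write or read-only
+  * shadow VMCS field list (vmcs_shadow_fields.h).
+  */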
+ static bool is_shadow_field_rw(unsigned long field)
+ {
+ switch (field) {
+ #define SHADOW_FIELD_RW(x, y) case x:
+ #include "vmcs_shadow_fields.h"
+ return true;
+ default:
+ break;
+ }
+ return false;
+ }
+
+ static bool is_shadow_field_ro(unsigned long field)
+ {
+ switch (field) {
+ #define SHADOW_FIELD_RO(x, y) case x:
+ #include "vmcs_shadow_fields.h"
+ return true;
+ default:
+ break;
+ }
+ return false;
+ }
static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
unsigned long field;
+ int len;
gva_t gva;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u64 field_value = 0;
struct x86_exception e;
struct vmcs12 *vmcs12;
+ short offset;
if (!nested_vmx_check_permission(vcpu))
return 1;
field_value = kvm_register_readl(vcpu,
(((vmx_instruction_info) >> 3) & 0xf));
else {
+ len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, false, &gva))
+ vmx_instruction_info, false, len, &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &field_value,
- (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+ if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
return nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- if (!is_guest_mode(vcpu))
+ if (!is_guest_mode(vcpu)) {
vmcs12 = get_vmcs12(vcpu);
- else {
+
+ /*
+ * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+		 * vmcs12, else we may clobber a field or consume a stale value.
+ */
+ if (!is_shadow_field_rw(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ } else {
/*
* When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
* to shadowed-field sets the ALU flags for VMfailInvalid.
vmcs12 = get_shadow_vmcs12(vcpu);
}
- if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+ offset = vmcs_field_to_offset(field);
+ if (offset < 0)
return nested_vmx_failValid(vcpu,
VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/*
- * Do not track vmcs12 dirty-state if in guest-mode
- * as we actually dirty shadow vmcs12 instead of vmcs12.
+ * Some Intel CPUs intentionally drop the reserved bits of the AR byte
+ * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
+ * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
+ * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
+ * from L1 will return a different value than VMREAD from L2 (L1 sees
+ * the stripped down value, L2 sees the full value as stored by KVM).
*/
- if (!is_guest_mode(vcpu)) {
- switch (field) {
- #define SHADOW_FIELD_RW(x) case x:
- #include "vmcs_shadow_fields.h"
- /*
- * The fields that can be updated by L1 without a vmexit are
- * always updated in the vmcs02, the others go down the slow
- * path of prepare_vmcs02.
- */
- break;
- default:
- vmx->nested.dirty_vmcs12 = true;
- break;
+ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
+ field_value &= 0x1f0ff;
+
+ vmcs12_write_any(vmcs12, field, offset, field_value);
+
+ /*
+ * Do not track vmcs12 dirty-state if in guest-mode as we actually
+ * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
+ * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
+ * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
+ /*
+ * L1 can read these fields without exiting, ensure the
+ * shadow VMCS is up-to-date.
+ */
+ if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
+ preempt_disable();
+ vmcs_load(vmx->vmcs01.shadow_vmcs);
+
+ __vmcs_writel(field, field_value);
+
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ preempt_enable();
}
+ vmx->nested.dirty_vmcs12 = true;
}
return nested_vmx_succeed(vcpu);
{
vmx->nested.current_vmptr = vmptr;
if (enable_shadow_vmcs) {
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_SHADOW_VMCS);
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER,
__pa(vmx->vmcs01.shadow_vmcs));
- vmx->nested.need_vmcs12_sync = true;
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
}
vmx->nested.dirty_vmcs12 = true;
}
if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
return 1;
- if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
+ if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
+ true, sizeof(gpa_t), &gva))
return 1;
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr,
* operand is read even if it isn't needed (e.g., for type==global)
*/
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmx_instruction_info, false, &gva))
+ vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
switch (type) {
case VMX_EPT_EXTENT_GLOBAL:
+ case VMX_EPT_EXTENT_CONTEXT:
/*
- * TODO: track mappings and invalidate
- * single context requests appropriately
+ * TODO: Sync the necessary shadow EPT roots here, rather than
+ * at the next emulated VM-entry.
*/
- case VMX_EPT_EXTENT_CONTEXT:
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
break;
default:
BUG_ON(1);
* operand is read even if it isn't needed (e.g., for type==global)
*/
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmx_instruction_info, false, &gva))
+ vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
vmx = to_vmx(vcpu);
vmcs12 = get_vmcs12(vcpu);
- if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
- kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
-
if (nested_vmx_allowed(vcpu) &&
(vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
if (vmx_has_valid_vmcs12(vcpu)) {
kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
+ if (vmx->nested.hv_evmcs)
+ kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
+
if (is_guest_mode(vcpu) &&
nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != -1ull)
* When running L2, the authoritative vmcs12 state is in the
* vmcs02. When running L1, the authoritative vmcs12 state is
* in the shadow or enlightened vmcs linked to vmcs01, unless
- * need_vmcs12_sync is set, in which case, the authoritative
+ * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
* vmcs12 state is in the vmcs12 already.
*/
if (is_guest_mode(vcpu)) {
- sync_vmcs12(vcpu, vmcs12);
- } else if (!vmx->nested.need_vmcs12_sync) {
+ sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
if (vmx->nested.hv_evmcs)
copy_enlightened_to_vmcs12(vmx);
else if (enable_shadow_vmcs)
if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
return -EINVAL;
+ /*
+		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
+		 * enable the eVMCS capability on the vCPU. However, the code
+		 * has since been changed such that the flag signals that vmcs12
+		 * should be copied into the eVMCS in guest memory.
+		 *
+		 * To preserve backwards compatibility, allow userspace to set
+		 * this flag even when there is no VMXON region.
+ */
if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
return -EINVAL;
} else {
if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
return -EINVAL;
- }
+ }
if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
* nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
* must be zero.
*/
- if (is_smm(vcpu) ? kvm_state->flags : kvm_state->hdr.vmx.smm.flags)
+ if (is_smm(vcpu) ?
+ (kvm_state->flags &
+ (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
+ : kvm_state->hdr.vmx.smm.flags)
return -EINVAL;
if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
!(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
- vmx_leave_nested(vcpu);
- if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
- if (!nested_vmx_allowed(vcpu))
+ if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
+ (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
return -EINVAL;
- nested_enable_evmcs(vcpu, NULL);
- }
+ vmx_leave_nested(vcpu);
if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
return 0;
* Sync eVMCS upon entry as we may not have
* HV_X64_MSR_VP_ASSIST_PAGE set up yet.
*/
- vmx->nested.need_vmcs12_sync = true;
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
} else {
return -EINVAL;
}
void nested_vmx_vcpu_setup(void)
{
if (enable_shadow_vmcs) {
- /*
- * At vCPU creation, "VMWRITE to any supported field
- * in the VMCS" is supported, so use the more
- * permissive vmx_vmread_bitmap to specify both read
- * and write permissions for the shadow VMCS.
- */
vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
- vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
}
}
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
+ SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_WBINVD_EXITING;
+ SECONDARY_EXEC_RDRAND_EXITING |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_XSAVES;
/*
* We can emulate "VMCS shadowing," even if the hardware
{
int i;
- /*
- * Without EPT it is not possible to restore L1's CR3 and PDPTR on
- * VMfail, because they are not available in vmcs01. Just always
- * use hardware checks.
- */
- if (!enable_ept)
- nested_early_check = 1;
-
if (!cpu_has_vmx_shadow_vmcs())
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs) {
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/intel_pt.h>
+#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
gfn_t gfn;
int r;
- if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
+ if (!is_pae_paging(vcpu))
return false;
if (!test_bit(VCPU_EXREG_PDPTR,
if (is_long_mode(vcpu) &&
(cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
return 1;
- else if (is_pae(vcpu) && is_paging(vcpu) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ else if (is_pae_paging(vcpu) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
MSR_AMD64_VIRT_SPEC_CTRL,
MSR_IA32_POWER_CTL,
+ /*
+ * The following list leaves out MSRs whose values are determined
+ * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
+ * We always support the "true" VMX control MSRs, even if the host
+	 * processor does not, so these registers are placed here rather
+	 * than in msrs_to_save.
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
MSR_K7_HWCR,
+ MSR_KVM_POLL_CONTROL,
};
static unsigned num_emulated_msrs;
static unsigned int num_msr_based_features;
- u64 kvm_get_arch_capabilities(void)
+ static u64 kvm_get_arch_capabilities(void)
{
- u64 data;
+ u64 data = 0;
- rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
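+	/* IA32_ARCH_CAPABILITIES reads as zero if the CPU does not enumerate it. */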
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
/*
* If we're doing cache flushes (either "always" or "cond")
return data;
}
- EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
{
vcpu->arch.tsc_always_catchup = 1;
return 0;
} else {
- WARN(1, "user requested TSC rate below hardware speed\n");
+ pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
return -1;
}
}
user_tsc_khz, tsc_khz);
if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
- WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
- user_tsc_khz);
+ pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
return -1;
}
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = ktime_get_boot_ns();
+ ns = ktime_get_boottime_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
spin_unlock(&ka->pvclock_gtod_sync_lock);
- return ktime_get_boot_ns() + ka->kvmclock_offset;
+ return ktime_get_boottime_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
&hv_clock.tsc_to_system_mul);
ret = __pvclock_read_cycles(&hv_clock, rdtsc());
} else
- ret = ktime_get_boot_ns() + ka->kvmclock_offset;
+ ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
put_cpu();
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = ktime_get_boot_ns();
+ kernel_ns = ktime_get_boottime_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
}
break;
case MSR_IA32_MISC_ENABLE:
- vcpu->arch.ia32_misc_enable_msr = data;
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
+ ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
+ return 1;
+ vcpu->arch.ia32_misc_enable_msr = data;
+ kvm_update_cpuid(vcpu);
+ } else {
+ vcpu->arch.ia32_misc_enable_msr = data;
+ }
break;
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
return 1;
vcpu->arch.smbase = data;
break;
+ case MSR_IA32_POWER_CTL:
+ vcpu->arch.msr_ia32_power_ctl = data;
+ break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
return 1;
break;
+ case MSR_KVM_POLL_CONTROL:
+ /* only enable bit supported */
+ if (data & (-1ULL << 1))
+ return 1;
+
+ vcpu->arch.msr_kvm_poll_control = data;
+ break;
+
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return 1;
msr_info->data = vcpu->arch.arch_capabilities;
break;
+ case MSR_IA32_POWER_CTL:
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
+ break;
case MSR_IA32_TSC:
msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
break;
case MSR_KVM_PV_EOI_EN:
msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
+ case MSR_KVM_POLL_CONTROL:
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_PMU_EVENT_FILTER:
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
r = KVM_CLOCK_TSC_STABLE;
break;
case KVM_CAP_X86_DISABLE_EXITS:
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_CSTATE;
if(kvm_can_mwait_in_guest())
r |= KVM_X86_DISABLE_EXITS_MWAIT;
break;
kvm->arch.hlt_in_guest = true;
if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
kvm->arch.pause_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+ kvm->arch.cstate_in_guest = true;
r = 0;
break;
case KVM_CAP_MSR_PLATFORM_INFO:
r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
break;
}
+ case KVM_SET_PMU_EVENT_FILTER:
+ r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+ break;
default:
r = -ENOTTY;
}
vcpu->arch.db);
if (dr6 != 0) {
- vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
vcpu->arch.dr6 |= dr6 | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
*r = EMULATE_DONE;
struct kvm_vcpu *vcpu;
int cpu;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_make_mclock_inprogress_request(kvm);
spin_unlock(&ka->pvclock_gtod_sync_lock);
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
}
#endif
smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != cpu)
continue;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- if (vcpu->cpu != smp_processor_id())
+ if (vcpu->cpu != raw_smp_processor_id())
send_ipi = 1;
}
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
.handle_intel_pt_intr = kvm_handle_intel_pt_intr,
};
- static void kvm_set_mmio_spte_mask(void)
- {
- u64 mask;
- int maxphyaddr = boot_cpu_data.x86_phys_bits;
-
- /*
- * Set the reserved bits and the present bit of an paging-structure
- * entry to generate page fault with PFER.RSV = 1.
- */
-
- /*
- * Mask the uppermost physical address bit, which would be reserved as
- * long as the supported physical address width is less than 52.
- */
- mask = 1ull << 51;
-
- /* Set the present bit. */
- mask |= 1ull;
-
- /*
- * If reserved bit is not supported, clear the present bit to disable
- * mmio page fault.
- */
- if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52)
- mask &= ~1ull;
-
- kvm_mmu_set_mmio_spte_mask(mask, mask);
- }
-
#ifdef CONFIG_X86_64
static void pvclock_gtod_update_fn(struct work_struct *work)
{
struct kvm_vcpu *vcpu;
int i;
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
atomic_set(&kvm_guest_has_master_clock, 0);
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
}
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
if (r)
goto out_free_percpu;
- kvm_set_mmio_spte_mask();
-
kvm_x86_ops = ops;
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
}
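+ /*
+  * Look up the vCPU with the given destination APIC ID in the APIC map
+  * and yield to it, if found.
+  */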
+ static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+ {
+ struct kvm_vcpu *target = NULL;
+ struct kvm_apic_map *map;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
+ target = map->phys_map[dest_id]->vcpu;
+
+ rcu_read_unlock();
+
+ if (target)
+ kvm_vcpu_yield_to(target);
+ }
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
case KVM_HC_SEND_IPI:
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
+ case KVM_HC_SCHED_YIELD:
+ kvm_sched_yield(vcpu->kvm, a0);
+ ret = 0;
+ break;
default:
ret = -KVM_ENOSYS;
break;
}
trace_kvm_entry(vcpu->vcpu_id);
- if (lapic_in_kernel(vcpu) &&
- vcpu->arch.apic->lapic_timer.timer_advance_ns)
- wait_lapic_expire(vcpu);
guest_enter_irqoff();
fpregs_assert_state_consistent();
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_before_interrupt(vcpu);
- kvm_x86_ops->handle_external_intr(vcpu);
- kvm_after_interrupt(vcpu);
+ kvm_x86_ops->handle_exit_irqoff(vcpu);
+ /*
+ * Consume any pending interrupts, including the possible source of
+ * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
+ * An instruction is required after local_irq_enable() to fully unblock
+	 * interrupts on processors that implement an interrupt shadow; the
+	 * stat.exits increment will do nicely.
+ */
+ kvm_before_interrupt(vcpu);
+ local_irq_enable();
++vcpu->stat.exits;
+ local_irq_disable();
+ kvm_after_interrupt(vcpu);
guest_exit_irqoff();
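+	/*
+	 * If the lapic timer advance logic recorded how far the actual
+	 * expiration was from the target, trace the delta and reset it.
+	 */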
+ if (lapic_in_kernel(vcpu)) {
+ s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
+ if (delta != S64_MIN) {
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
+ vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
+ }
+ }
local_irq_enable();
preempt_enable();
kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
+ if (is_pae_paging(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
msr.host_initiated = true;
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+
+ /* poll control enabled by default */
+ vcpu->arch.msr_kvm_poll_control = 1;
+
mutex_unlock(&vcpu->mutex);
if (!kvmclock_periodic_sync)
* before any KVM threads can be running. Unfortunately, we can't
* bring the TSCs fully up to date with real time, as we aren't yet far
* enough into CPU bringup that we know how much real time has actually
- * elapsed; our helper function, ktime_get_boot_ns() will be using boot
* elapsed; our helper function, ktime_get_boottime_ns(), will be using boot
* variables that haven't been updated yet.
*
* So we simply find the maximum observed TSC above, then record the
kvm_x86_ops->hardware_unsetup();
}
- void kvm_arch_check_processor_compat(void *rtn)
+ int kvm_arch_check_processor_compat(void)
{
- kvm_x86_ops->check_processor_compatibility(rtn);
+ return kvm_x86_ops->check_processor_compatibility();
}
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
+ kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
pvclock_update_vm_gtod_copy(kvm);
kvm->arch.guest_can_read_msr_platform_info = true;
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
kvm_hv_destroy_vm(kvm);
sizeof(u32));
}
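+ /*
+  * An async "page not present" fault can be delivered only if the guest
+  * has enabled async PF via its MSR; for L2 it also requires delivery
+  * as a #PF VM-exit, and with send_user_only set it is suppressed at
+  * CPL 0.
+  */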
+ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
+ {
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+ return false;
+
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+ (vcpu->arch.apf.send_user_only &&
+ kvm_x86_ops->get_cpl(vcpu) == 0))
+ return false;
+
+ return true;
+ }
+
+ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+ {
+ if (unlikely(!lapic_in_kernel(vcpu) ||
+ kvm_event_needs_reinjection(vcpu) ||
+ vcpu->arch.exception.pending))
+ return false;
+
+ if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
+ return false;
+
+ /*
+ * If interrupts are off we cannot even use an artificial
+ * halt state.
+ */
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+ }
+
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
trace_kvm_async_pf_not_present(work->arch.token, work->gva);
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
- (vcpu->arch.apf.send_user_only &&
- kvm_x86_ops->get_cpl(vcpu) == 0))
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ if (kvm_can_deliver_async_pf(vcpu) &&
+ !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
fault.vector = PF_VECTOR;
fault.error_code_valid = true;
fault.error_code = 0;
fault.address = work->arch.token;
fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
+ } else {
+ /*
+ * It is not possible to deliver a paravirtualized asynchronous
+ * page fault, but putting the guest in an artificial halt state
+ * can be beneficial nevertheless: if an interrupt arrives, we
+		 * can deliver it promptly and perhaps the guest will schedule
+ * another process. When the instruction that triggered a page
+ * fault is retried, hopefully the page will be ready in the host.
+ */
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
}
}
}
EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
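+ /* Halt polling is disabled when the guest clears bit 0 of MSR_KVM_POLL_CONTROL. */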
+ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
+ {
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
/* Restore state in a new VM. */
kvm_vm_restart(vm, O_RDWR);
- vm_vcpu_add(vm, VCPU_ID, 0, 0);
+ vm_vcpu_add(vm, VCPU_ID);
vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap);
vcpu_load_state(vm, VCPU_ID, state);
run = vcpu_state(vm, VCPU_ID);
free(state);