u64 smi_count;
bool at_instruction_boundary;
bool tpr_access_reporting;
- bool xsaves_enabled;
bool xfd_no_write_intercept;
u64 ia32_xss;
u64 microcode_version;
struct kvm_cpuid_entry2 *cpuid_entries;
struct kvm_hypervisor_cpuid kvm_cpuid;
+ /*
+ * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly
+ * when "struct kvm_vcpu_arch" is no longer defined in an
+ * arch/x86/include/asm header. The max is mostly arbitrary, i.e.
+ * can be increased as necessary.
+ */
+ #define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG
+
+ /*
+ * Track whether or not the guest is allowed to use features that are
+ * governed by KVM, where "governed" means KVM needs to manage state
+ * and/or explicitly enable the feature in hardware. Typically, but
+ * not always, governed features can be used by the guest if and only
+ * if both KVM and userspace want to expose the feature to the guest.
+ */
+ struct {
+ DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES);
+ } governed_features;
+
u64 reserved_gpa_bits;
int maxphyaddr;
void (*set_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
- void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
- void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu);
+ void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
/*
* Retrieve somewhat arbitrary exit information. Intended to
#define __KVM_HAVE_ARCH_VM_FREE
void kvm_arch_free_vm(struct kvm *kvm);
-#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
-static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
if (kvm_x86_ops.flush_remote_tlbs &&
!static_call(kvm_x86_flush_remote_tlbs)(kvm))
return -ENOTSUPP;
}
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+
#define kvm_arch_pmi_in_guest(vcpu) \
((vcpu) && (vcpu)->arch.handling_intr_from_guest)
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+ #include "linux/lockdep.h"
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
struct kvm_cpuid_entry2 *e;
int i;
+ /*
+ * KVM has a semi-arbitrary rule that querying the guest's CPUID model
+ * with IRQs disabled is disallowed. The CPUID model can legitimately
+ * have over one hundred entries, i.e. the lookup is slow, and IRQs are
+ * typically disabled in KVM only when KVM is in a performance critical
+ * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break
+ * if this rule is violated, this assertion is purely to flag potential
+ * performance issues. If this fires, consider moving the lookup out
+ * of the hotpath, e.g. by caching information during CPUID updates.
+ */
+ lockdep_assert_irqs_enabled();
+
for (i = 0; i < nent; i++) {
e = &entries[i];
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
+ bool allow_gbpages;
+
+ BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES);
+ bitmap_zero(vcpu->arch.governed_features.enabled,
+ KVM_MAX_NR_GOVERNED_FEATURES);
+
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ * If TDP is disabled, honor *only* guest CPUID as KVM has full control
+ * and can install smaller shadow pages if the host lacks 1GiB support.
+ */
+ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
+ if (allow_gbpages)
+ kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES);
best = kvm_find_cpuid_entry(vcpu, 1);
if (best && apic) {
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
- F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
+ F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
+ F(AMX_COMPLEX)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
);
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
+ kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+
kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
F(PERFMON_V2)
);
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
cpuid_entry_override(entry, CPUID_8000_0001_ECX);
break;
+ case 0x80000005:
+ /* Pass host L1 cache and TLB info. */
+ break;
case 0x80000006:
/* Drop reserved bits, pass host L2 cache and TLB info. */
entry->edx &= ~GENMASK(17, 16);
struct kvm_vcpu *vcpu;
unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
- bool xapic_id_mismatch = false;
+ bool xapic_id_mismatch;
+ int r;
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
"Dirty APIC map without an in-kernel local APIC");
mutex_lock(&kvm->arch.apic_map_lock);
+
+ retry:
/*
- * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
- * (if clean) or the APIC registers (if dirty).
+ * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
+ * or the APIC registers (if dirty). Note, on retry the map may not yet
+ * have been marked dirty by whatever task changed a vCPU's x2APIC
+ * ID, i.e. the map may still show up as in-progress. In that case
+ * this task still needs to retry and complete its calculation.
*/
if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
return;
}
+ /*
+ * Reset the mismatch flag between attempts so that KVM does the right
+ * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
+ * keep max_id strictly increasing. Disallowing max_id from shrinking
+ * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU
+ * with the highest x2APIC ID is toggling its APIC on and off.
+ */
+ xapic_id_mismatch = false;
+
kvm_for_each_vcpu(i, vcpu, kvm)
if (kvm_apic_present(vcpu))
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
if (!kvm_apic_present(vcpu))
continue;
- if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+ r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
+ if (r) {
kvfree(new);
new = NULL;
+ if (r == -E2BIG) {
+ cond_resched();
+ goto retry;
+ }
+
goto out;
}
*max_irr = -1;
for (i = vec = 0; i <= 7; i++, vec += 32) {
+ u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
+
+ irr_val = *p_irr;
pir_val = READ_ONCE(pir[i]);
- irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
+
if (pir_val) {
+ pir_val = xchg(&pir[i], 0);
+
prev_irr_val = irr_val;
- irr_val |= xchg(&pir[i], 0);
- *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
- if (prev_irr_val != irr_val) {
- max_updated_irr =
- __fls(irr_val ^ prev_irr_val) + vec;
- }
+ do {
+ irr_val = prev_irr_val | pir_val;
+ } while (prev_irr_val != irr_val &&
+ !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
+
+ if (prev_irr_val != irr_val)
+ max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
}
if (irr_val)
*max_irr = __fls(irr_val) + vec;
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
- return __kvm_apic_update_irr(pir, apic->regs, max_irr);
+ if (unlikely(!apic->apicv_active && irr_updated))
+ apic->irr_pending = true;
+ return irr_updated;
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
return kvm_x86_ops.flush_remote_tlbs_range;
}
-void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
- gfn_t nr_pages)
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
- int ret = -EOPNOTSUPP;
+ if (!kvm_x86_ops.flush_remote_tlbs_range)
+ return -EOPNOTSUPP;
- if (kvm_x86_ops.flush_remote_tlbs_range)
- ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
- nr_pages);
- if (ret)
- kvm_flush_remote_tlbs(kvm);
+ return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
}
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
range->start, range->end - 1, &iterator)
ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- iterator.level, range->pte);
+ iterator.level, range->arg.pte);
return ret;
}
}
}
- static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
- {
- /*
- * If TDP is enabled, let the guest use GBPAGES if they're supported in
- * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
- * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
- * walk for performance and complexity reasons. Not to mention KVM
- * _can't_ solve the problem because GVA->GPA walks aren't visible to
- * KVM once a TDP translation is installed. Mimic hardware behavior so
- * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
- */
- return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
- }
-
static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
__reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
context->cpu_role.base.level, is_efer_nx(context),
- guest_can_use_gbpages(vcpu),
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
guest_cpuid_is_amd_or_hygon(vcpu));
}
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->root_role.level,
context->root_role.efer_nx,
- guest_can_use_gbpages(vcpu), is_pse, is_amd);
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ is_pse, is_amd);
if (!shadow_me_mask)
return;
*/
if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
}
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
}
}
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- const struct kvm_memory_slot *memslot)
-{
- /*
- * All current use cases for flushing the TLBs for a specific memslot
- * related to dirty logging, and many do the TLB flush out of mmu_lock.
- * The interaction between the various operations on memslot must be
- * serialized by slots_locks to ensure the TLB flush from one operation
- * is observed by any other operation on the same memslot.
- */
- lockdep_assert_held(&kvm->slots_lock);
- kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
-}
-
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
{
if (nx_hugepage_mitigation_hard_disabled)
- return sprintf(buffer, "never\n");
+ return sysfs_emit(buffer, "never\n");
return param_get_bool(buffer, kp);
}
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
+ #include <asm/reboot.h>
#include <asm/fpu/api.h>
- #include <asm/virtext.h>
-
#include <trace/events/ipi.h>
#include "trace.h"
module_param(nested, int, S_IRUGO);
/* enable/disable Next RIP Save */
-static int nrips = true;
+int nrips = true;
module_param(nrips, int, 0444);
/* enable/disable Virtual VMLOAD VMSAVE */
svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
}
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
bool commit_side_effects)
}
if (!svm->next_rip) {
+ /*
+ * FIXME: Drop this when kvm_emulate_instruction() does the
+ * right thing and treats "can't emulate" as outright failure
+ * for EMULTYPE_SKIP.
+ */
+ if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
+ return 0;
+
if (unlikely(!commit_side_effects))
old_rflags = svm->vmcb->save.rflags;
vcpu->arch.osvw.status |= 1;
}
- static bool kvm_is_svm_supported(void)
+ static bool __kvm_is_svm_supported(void)
{
- int cpu = raw_smp_processor_id();
- const char *msg;
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
u64 vm_cr;
- if (!cpu_has_svm(&msg)) {
- pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ if (c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON) {
+ pr_err("CPU %d isn't AMD or Hygon\n", cpu);
+ return false;
+ }
+
+ if (!cpu_has(c, X86_FEATURE_SVM)) {
+ pr_err("SVM not supported by CPU %d\n", cpu);
return false;
}
return true;
}
+ /*
+  * Report whether SVM is usable, as determined by __kvm_is_svm_supported().
+  *
+  * The helper looks up the current CPU with smp_processor_id(), so migration
+  * is disabled for the duration of the call and re-enabled before returning.
+  */
+ static bool kvm_is_svm_supported(void)
+ {
+ 	bool ret;
+
+ 	migrate_disable();
+ 	ret = __kvm_is_svm_supported();
+ 	migrate_enable();
+
+ 	return ret;
+ }
+
static int svm_check_processor_compat(void)
{
- if (!kvm_is_svm_supported())
+ if (!__kvm_is_svm_supported())
return -EIO;
return 0;
}
- void __svm_write_tsc_multiplier(u64 multiplier)
+ static void __svm_write_tsc_multiplier(u64 multiplier)
{
- preempt_disable();
-
if (multiplier == __this_cpu_read(current_tsc_ratio))
- goto out;
+ return;
wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
__this_cpu_write(current_tsc_ratio, multiplier);
- out:
- preempt_enable();
+ }
+
+ /*
+  * Disable SVM: zero MSR_VM_HSAVE_PA and, if EFER.SVME is currently set,
+  * execute STGI and then clear EFER.SVME.  Nothing further is done when
+  * EFER.SVME is already clear.
+  */
+ static inline void kvm_cpu_svm_disable(void)
+ {
+ 	uint64_t efer;
+
+ 	wrmsrl(MSR_VM_HSAVE_PA, 0);
+
+ 	rdmsrl(MSR_EFER, efer);
+ 	if (!(efer & EFER_SVME))
+ 		return;
+
+ 	/*
+ 	 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
+ 	 * NMI aren't blocked.
+ 	 */
+ 	stgi();
+ 	wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ }
+
+ /*
+  * Emergency callback, registered from svm_init() through
+  * cpu_emergency_register_virt_callback(): set kvm_rebooting and then
+  * disable SVM via kvm_cpu_svm_disable().
+  */
+ static void svm_emergency_disable(void)
+ {
+ kvm_rebooting = true;
+
+ kvm_cpu_svm_disable();
}
static void svm_hardware_disable(void)
if (tsc_scaling)
__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
- cpu_svm_disable();
+ kvm_cpu_svm_disable();
amd_pmu_disable_virt();
}
}
+/*
+ * Set the read and write intercept bits for DR0-DR7 in vmcb01's control
+ * area, then call recalc_intercepts().
+ */
+static void set_dr_intercepts(struct vcpu_svm *svm)
+{
+	struct vmcb_control_area *control = &svm->vmcb01.ptr->control;
+
+	/* DR0-DR6 reads. */
+	vmcb_set_intercept(control, INTERCEPT_DR0_READ);
+	vmcb_set_intercept(control, INTERCEPT_DR1_READ);
+	vmcb_set_intercept(control, INTERCEPT_DR2_READ);
+	vmcb_set_intercept(control, INTERCEPT_DR3_READ);
+	vmcb_set_intercept(control, INTERCEPT_DR4_READ);
+	vmcb_set_intercept(control, INTERCEPT_DR5_READ);
+	vmcb_set_intercept(control, INTERCEPT_DR6_READ);
+
+	/* DR0-DR6 writes. */
+	vmcb_set_intercept(control, INTERCEPT_DR0_WRITE);
+	vmcb_set_intercept(control, INTERCEPT_DR1_WRITE);
+	vmcb_set_intercept(control, INTERCEPT_DR2_WRITE);
+	vmcb_set_intercept(control, INTERCEPT_DR3_WRITE);
+	vmcb_set_intercept(control, INTERCEPT_DR4_WRITE);
+	vmcb_set_intercept(control, INTERCEPT_DR5_WRITE);
+	vmcb_set_intercept(control, INTERCEPT_DR6_WRITE);
+
+	/* DR7 read and write. */
+	vmcb_set_intercept(control, INTERCEPT_DR7_READ);
+	vmcb_set_intercept(control, INTERCEPT_DR7_WRITE);
+
+	recalc_intercepts(svm);
+}
+
+/*
+ * Zero the INTERCEPT_DR word of vmcb01's intercept array, then call
+ * recalc_intercepts().
+ */
+static void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+	struct vmcb_control_area *control = &svm->vmcb01.ptr->control;
+
+	control->intercepts[INTERCEPT_DR] = 0;
+
+	recalc_intercepts(svm);
+}
+
static int direct_access_msr_slot(u32 msr)
{
u32 i;
svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}
-static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
{
/*
- * If the LBR virtualization is disabled, the LBR msrs are always
- * kept in the vmcb01 to avoid copying them on nested guest entries.
- *
- * If nested, and the LBR virtualization is enabled/disabled, the msrs
- * are moved between the vmcb01 and vmcb02 as needed.
+ * If LBR virtualization is disabled, the LBR MSRs are always kept in
+ * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
+ * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
*/
- struct vmcb *vmcb =
- (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
- svm->vmcb : svm->vmcb01.ptr;
-
- switch (index) {
- case MSR_IA32_DEBUGCTLMSR:
- return vmcb->save.dbgctl;
- case MSR_IA32_LASTBRANCHFROMIP:
- return vmcb->save.br_from;
- case MSR_IA32_LASTBRANCHTOIP:
- return vmcb->save.br_to;
- case MSR_IA32_LASTINTFROMIP:
- return vmcb->save.last_excp_from;
- case MSR_IA32_LASTINTTOIP:
- return vmcb->save.last_excp_to;
- default:
- KVM_BUG(false, svm->vcpu.kvm,
- "%s: Unknown MSR 0x%x", __func__, index);
- return 0;
- }
+ return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
+ svm->vmcb01.ptr;
}
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
-
- bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
- DEBUGCTLMSR_LBR;
-
- bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
- LBR_CTL_ENABLE_MASK);
-
- if (unlikely(is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV)))
- if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
- enable_lbrv = true;
+ bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+ bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
- (is_guest_mode(vcpu) && svm->lbrv_enabled &&
++ (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
if (enable_lbrv == current_enable_lbrv)
return;
return svm->tsc_ratio_msr;
}
- static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
- svm->vmcb->control.tsc_offset = offset;
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
- static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
- __svm_write_tsc_multiplier(multiplier);
+ preempt_disable();
+ if (to_svm(vcpu)->guest_state_loaded)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ preempt_enable();
}
-
/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
-
- svm->v_vmload_vmsave_enabled = false;
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
- * as VMware does. Don't intercept #GP for SEV guests as KVM can't
- * decrypt guest memory to decode the faulting instruction.
+ * as VMware does.
*/
- if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
+ if (enable_vmware_backdoor)
set_exception_intercept(svm, GP_VECTOR);
svm_set_intercept(svm, INTERCEPT_INTR);
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
- indirect_branch_prediction_barrier();
+
+ if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+ indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
avic_vcpu_load(vcpu, cpu);
}
}
+/*
+ * SVM implementation of the kvm_x86_ops.is_valid_cr0 hook.  It performs no
+ * checks of its own and reports every CR0 value as valid.
+ */
+static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ return true;
+}
+
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (vcpu->arch.guest_state_protected)
+ if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
return;
get_debugreg(vcpu->arch.db[0], 0);
{
struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
+
++vcpu->stat.nmi_window_exits;
svm->awaiting_iret_completion = true;
svm_clr_iret_intercept(svm);
- if (!sev_es_guest(vcpu->kvm))
- svm->nmi_iret_rip = kvm_rip_read(vcpu);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
unsigned long val;
int err = 0;
+ /*
+ * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
+ * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return 1;
+
if (vcpu->guest_debug == 0) {
/*
* No more DR vmexits; force a reload of the debug registers
switch (msr_info->index) {
case MSR_AMD64_TSC_RATIO:
- if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+ if (!msr_info->host_initiated &&
+ !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
return 1;
msr_info->data = svm->tsc_ratio_msr;
break;
msr_info->data = svm->tsc_aux;
break;
case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
+ break;
case MSR_IA32_LASTBRANCHFROMIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
+ break;
case MSR_IA32_LASTBRANCHTOIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
+ break;
case MSR_IA32_LASTINTFROMIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
+ break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
- if (!svm->tsc_scaling_enabled) {
+ if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
if (!msr->host_initiated)
return 1;
svm->tsc_ratio_msr = data;
- if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+ if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ is_guest_mode(vcpu))
nested_svm_update_tsc_ratio_msr(vcpu);
break;
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
- svm->vmcb->save.dbgctl = data;
- else
- svm->vmcb01.ptr->save.dbgctl = data;
-
+ svm_get_lbr_vmcb(svm)->save.dbgctl = data;
svm_update_lbrv(vcpu);
-
break;
case MSR_VM_HSAVE_PA:
/*
if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
return; /* IRET will cause a vm exit */
+ /*
+ * SEV-ES guests are responsible for signaling when a vCPU is ready to
+ * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
+ * KVM can't intercept and single-step IRET to detect when NMIs are
+ * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE.
+ *
+ * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
+ * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
+ * supported NAEs in the GHCB protocol.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
if (!gif_set(svm)) {
if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
svm->soft_int_injected = false;
/*
- * If we've made progress since setting HF_IRET_MASK, we've
+ * If we've made progress since setting awaiting_iret_completion, we've
* executed an IRET and can allow NMI injection.
*/
if (svm->awaiting_iret_completion &&
- (sev_es_guest(vcpu->kvm) ||
- kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
svm->awaiting_iret_completion = false;
svm->nmi_masked = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
-
- /*
- * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
- * can't read guest memory (dereference memslots) to decode the WRMSR.
- */
- if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
- nrips && control->next_rip)
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);
return EXIT_FASTPATH_NONE;
guest_state_enter_irqoff();
+ amd_clear_divider();
+
if (sev_es_guest(vcpu->kvm))
__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
else
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_cpuid_entry2 *best;
- vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVES);
-
- /* Update nrips enabled cache */
- svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
- guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
-
- svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
- svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
-
- svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
-
- svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+ /*
+ * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
+ * can only disable all variants of XSAVE by disallowing CR4.OSXSAVE from
+ * being set. As a result, if the host has XSAVE and XSAVES, and the
+ * guest has XSAVE enabled, the guest can execute XSAVES without
+ * faulting. Treat XSAVES as enabled in this case regardless of
+ * whether it's advertised to the guest so that KVM context switches
+ * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
+ * the guest read/write access to the host's XSS.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
- svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
- svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+ /*
+ * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+ * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
+ * SVM on Intel is bonkers and extremely unlikely to work).
+ */
+ if (!guest_cpuid_is_intel(vcpu))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
svm_recalc_instruction_intercepts(vcpu, svm);
* and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
* decode garbage.
*
- * Inject #UD if KVM reached this point without an instruction buffer.
- * In practice, this path should never be hit by a well-behaved guest,
- * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
- * is still theoretically reachable, e.g. via unaccelerated fault-like
- * AVIC access, and needs to be handled by KVM to avoid putting the
- * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
- * but its the least awful option given lack of insight into the guest.
+ * If KVM is NOT trying to simply skip an instruction, inject #UD if
+ * KVM reached this point without an instruction buffer. In practice,
+ * this path should never be hit by a well-behaved guest, e.g. KVM
+ * doesn't intercept #UD or #GP for SEV guests, but this path is still
+ * theoretically reachable, e.g. via unaccelerated fault-like AVIC
+ * access, and needs to be handled by KVM to avoid putting the guest
+ * into an infinite loop. Injecting #UD is somewhat arbitrary, but
+ * it's the least awful option given lack of insight into the guest.
+ *
+ * If KVM is trying to skip an instruction, simply resume the guest.
+ * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
+ * will attempt to re-inject the INT3/INTO and skip the instruction.
+ * In that scenario, retrying the INT3/INTO and hoping the guest will
+ * make forward progress is the only option that has a chance of
+ * success (and in practice it will work the vast majority of the time).
*/
if (unlikely(!insn)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (!(emul_type & EMULTYPE_SKIP))
+ kvm_queue_exception(vcpu, UD_VECTOR);
return false;
}
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = svm_get_cs_db_l_bits,
+ .is_valid_cr0 = svm_is_valid_cr0,
.set_cr0 = svm_set_cr0,
.post_set_cr3 = sev_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
svm_adjust_mmio_mask();
+ nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+
/*
* Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
- * may be modified by svm_adjust_mmio_mask()).
+ * may be modified by svm_adjust_mmio_mask()), as well as nrips.
*/
sev_hardware_setup();
goto err;
}
- if (nrips) {
- if (!boot_cpu_has(X86_FEATURE_NRIPS))
- nrips = false;
- }
-
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
.pmu_ops = &amd_pmu_ops,
};
+ /*
+  * Teardown shared by svm_exit() and the svm_init() error path: call
+  * kvm_x86_vendor_exit(), then unregister the svm_emergency_disable()
+  * callback that svm_init() registered.
+  */
+ static void __svm_exit(void)
+ {
+ kvm_x86_vendor_exit();
+
+ cpu_emergency_unregister_virt_callback(svm_emergency_disable);
+ }
+
static int __init svm_init(void)
{
int r;
if (r)
return r;
+ cpu_emergency_register_virt_callback(svm_emergency_disable);
+
/*
* Common KVM initialization _must_ come last, after this, /dev/kvm is
* exposed to userspace!
return 0;
err_kvm_init:
- kvm_x86_vendor_exit();
+ __svm_exit();
return r;
}
static void __exit svm_exit(void)
{
kvm_exit();
- kvm_x86_vendor_exit();
+ __svm_exit();
}
module_init(svm_init)
#include <asm/svm.h>
#include <asm/sev-common.h>
+ #include "cpuid.h"
#include "kvm_cache_regs.h"
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
#define MSRPM_OFFSETS 32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
+extern int nrips;
extern int vgif;
extern bool intercept_smi;
extern bool x2avic_enabled;
/* SEV-ES support */
struct sev_es_save_area *vmsa;
struct ghcb *ghcb;
+ u8 valid_bitmap[16];
struct kvm_host_map ghcb_map;
bool received_first_sipi;
/* SEV-ES scratch area support */
+ u64 sw_scratch;
void *ghcb_sa;
u32 ghcb_sa_len;
bool ghcb_sa_sync;
unsigned long soft_int_next_rip;
bool soft_int_injected;
- /* optional nested SVM features that are enabled for this guest */
- bool nrips_enabled : 1;
- bool tsc_scaling_enabled : 1;
- bool v_vmload_vmsave_enabled : 1;
- bool lbrv_enabled : 1;
- bool pause_filter_enabled : 1;
- bool pause_threshold_enabled : 1;
- bool vgif_enabled : 1;
- bool vnmi_enabled : 1;
-
u32 ldr_reg;
u32 dfr_reg;
struct page *avic_backing_page;
return test_bit(bit, (unsigned long *)&control->intercepts);
}
-static inline void set_dr_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb *vmcb = svm->vmcb01.ptr;
-
- if (!sev_es_guest(svm->vcpu.kvm)) {
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
- }
-
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_dr_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb *vmcb = svm->vmcb01.ptr;
-
- vmcb->control.intercepts[INTERCEPT_DR] = 0;
-
- /* DR7 access must remain intercepted for an SEV-ES guest */
- if (sev_es_guest(svm->vcpu.kvm)) {
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
- }
-
- recalc_intercepts(svm);
-}
-
static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{
- return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+ return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
}
static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
{
- return svm->vnmi_enabled &&
+ return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) &&
(svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
}
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
- void __svm_write_tsc_multiplier(u64 multiplier);
+ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu);
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
struct vmcb_control_area *control);
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+#define DEFINE_KVM_GHCB_ACCESSORS(field) \
+ static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&svm->sev_es.valid_bitmap); \
+ } \
+ \
+ static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \
+ { \
+ return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \
+ } \
+
+DEFINE_KVM_GHCB_ACCESSORS(cpl)
+DEFINE_KVM_GHCB_ACCESSORS(rax)
+DEFINE_KVM_GHCB_ACCESSORS(rcx)
+DEFINE_KVM_GHCB_ACCESSORS(rdx)
+DEFINE_KVM_GHCB_ACCESSORS(rbx)
+DEFINE_KVM_GHCB_ACCESSORS(rsi)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
+DEFINE_KVM_GHCB_ACCESSORS(xcr0)
+
#endif
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
- #include <asm/kexec.h>
+ #include <asm/reboot.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
- #include <asm/virtext.h>
#include <asm/vmx.h>
#include "capabilities.h"
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
- /* Control for disabling CPU Fill buffer clear */
- static bool __read_mostly vmx_fb_clear_ctrl_available;
-
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
return 0;
}
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
- u64 msr;
-
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
- if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
- return 0;
- }
+ if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
}
/* If set to auto use the default l1tf mitigation method */
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
- return sprintf(s, "???\n");
+ return sysfs_emit(s, "???\n");
- return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
- }
-
- static void vmx_setup_fb_clear_ctrl(void)
- {
- u64 msr;
-
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
- !boot_cpu_has_bug(X86_BUG_MDS) &&
- !boot_cpu_has_bug(X86_BUG_TAA)) {
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
- if (msr & ARCH_CAP_FB_CLEAR_CTRL)
- vmx_fb_clear_ctrl_available = true;
- }
+ return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
- vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+ vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA);
/*
* If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
pr_warn_ratelimited(fmt); \
} while (0)
-void vmread_error(unsigned long field, bool fault)
+noinline void vmread_error(unsigned long field)
{
- if (fault)
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
+}
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
+{
+ if (fault) {
kvm_spurious_fault();
- else
- vmx_insn_failed("vmread failed: field=%lx\n", field);
+ } else {
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ }
}
+#endif
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
return ret;
}
- #ifdef CONFIG_KEXEC_CORE
- static void crash_vmclear_local_loaded_vmcss(void)
+ /*
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults, as all other faults on VMXOFF are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
+ */
+ static int kvm_cpu_vmxoff(void)
+ {
+ asm_volatile_goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
+
+ fault:
+ cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
+ }
+
+ static void vmx_emergency_disable(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
+ kvm_rebooting = true;
+
+ /*
+ * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+ * set in task context. If this races with VMX being disabled by an NMI,
+ * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
+ * kvm_rebooting being set.
+ */
+ if (!(__read_cr4() & X86_CR4_VMXE))
+ return;
+
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
+
+ kvm_cpu_vmxoff();
}
- #endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long old_rflags;
+ /*
+ * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+ * is an unrestricted guest in order to mark L2 as needing emulation
+ * if L1 runs L2 as a restricted guest.
+ */
if (is_unrestricted_guest(vcpu)) {
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
vmx->rflags = rflags;
return kvm_caps.default_tsc_scaling_ratio;
}
- static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
- {
- vmcs_write64(TSC_OFFSET, offset);
- }
-
- static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
- vmcs_write64(TSC_MULTIPLIER, multiplier);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
- /*
- * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
- * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
- * all guests if the "nested" module option is off, and can also be disabled
- * for a single guest by disabling its VMX cpuid bit.
- */
- bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+ static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
- return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
/*
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
- if (!nested_vmx_allowed(vcpu))
+ if (!guest_can_use(vcpu, X86_FEATURE_VMX))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
- if (!nested_vmx_allowed(vcpu))
+ if (!guest_can_use(vcpu, X86_FEATURE_VMX))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
return 0;
}
- static bool kvm_is_vmx_supported(void)
+ static bool __kvm_is_vmx_supported(void)
{
- int cpu = raw_smp_processor_id();
+ int cpu = smp_processor_id();
- if (!cpu_has_vmx()) {
+ if (!(cpuid_ecx(1) & feature_bit(VMX))) {
pr_err("VMX not supported by CPU %d\n", cpu);
return false;
}
return true;
}
+ static bool kvm_is_vmx_supported(void)
+ {
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_vmx_supported();
+ migrate_enable();
+
+ return supported;
+ }
+
static int vmx_check_processor_compat(void)
{
int cpu = raw_smp_processor_id();
struct vmcs_config vmcs_conf;
struct vmx_capability vmx_cap;
- if (!kvm_is_vmx_supported())
+ if (!__kvm_is_vmx_supported())
return -EIO;
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
{
vmclear_local_loaded_vmcss();
- if (cpu_vmxoff())
+ if (kvm_cpu_vmxoff())
kvm_spurious_fault();
hv_reset_evmcs();
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+ /*
+ * KVM should never use VM86 to virtualize Real Mode when L2 is active,
+ * as using VM86 is unnecessary if unrestricted guest is enabled, and
+ * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
+ * should VM-Fail and KVM should reject userspace attempts to stuff
+ * CR0.PG=0 when L2 is active.
+ */
+ WARN_ON_ONCE(is_guest_mode(vcpu));
+
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
vmx->rmode.vm86_active = 1;
- /*
- * Very old userspace does not call KVM_SET_TSS_ADDR before entering
- * vcpu. Warn the user that an update is overdue.
- */
- if (!kvm_vmx->tss_addr)
- pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
-
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
CPU_BASED_CR3_STORE_EXITING)
+static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_guest_mode(vcpu))
+ return nested_guest_cr0_valid(vcpu, cr0);
+
+ if (to_vmx(vcpu)->nested.vmxon)
+ return nested_host_cr0_valid(vcpu, cr0);
+
+ return true;
+}
+
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
- if (is_unrestricted_guest(vcpu))
+ if (enable_unrestricted_guest)
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
}
#endif
- if (enable_ept && !is_unrestricted_guest(vcpu)) {
+ if (enable_ept && !enable_unrestricted_guest) {
/*
* Ensure KVM has an up-to-date snapshot of the guest's CR3. If
* the below code _enables_ CR3 exiting, vmx_cache_reg() will
vmx->emulation_required = vmx_emulation_required(vcpu);
}
-static int vmx_get_max_tdp_level(void)
+static int vmx_get_max_ept_level(void)
{
if (cpu_has_vmx_ept_5levels())
return 5;
* this bit, even if host CR4.MCE == 0.
*/
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
- if (is_unrestricted_guest(vcpu))
+ if (enable_unrestricted_guest)
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
- if (!is_unrestricted_guest(vcpu)) {
+ if (!enable_unrestricted_guest) {
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
* based on a single guest CPUID bit, with a dedicated feature bit. This also
* verifies that the control is actually supported by KVM and hardware.
*/
- #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
- ({ \
- bool __enabled; \
- \
- if (cpu_has_vmx_##name()) { \
- __enabled = guest_cpuid_has(&(vmx)->vcpu, \
- X86_FEATURE_##feat_name); \
- vmx_adjust_secondary_exec_control(vmx, exec_control, \
- SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
- } \
+ #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+ ({ \
+ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \
+ __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \
+ else \
+ __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+ __enabled, exiting); \
+ } \
})
/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (cpu_has_vmx_xsaves()) {
- /* Exposing XSAVES only when XSAVE is exposed */
- bool xsaves_enabled =
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
-
- vcpu->arch.xsaves_enabled = xsaves_enabled;
-
- vmx_adjust_secondary_exec_control(vmx, &exec_control,
- SECONDARY_EXEC_XSAVES,
- xsaves_enabled, false);
- }
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
/*
* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
SECONDARY_EXEC_ENABLE_RDTSCP,
rdpid_or_rdtscp_enabled, false);
}
+
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
if (kvm_vmx->pid_table)
return 0;
- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
+ pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ vmx_get_pid_table_order(kvm));
if (!pages)
return -ENOMEM;
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_guest_cr0_valid(vcpu, val))
- return 1;
-
if (kvm_set_cr0(vcpu, val))
return 1;
vmcs_writel(CR0_READ_SHADOW, orig_val);
return 0;
} else {
- if (to_vmx(vcpu)->nested.vmxon &&
- !nested_host_cr0_valid(vcpu, val))
- return 1;
-
return kvm_set_cr0(vcpu, val);
}
}
vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
read_unlock(&vcpu->kvm->mmu_lock);
- vmx_flush_tlb_current(vcpu);
-
+ /*
+ * No need for a manual TLB flush at this point, KVM has already done a
+ * flush if there were SPTEs pointing at the previous page.
+ */
out:
/*
* Do not pin apic access page in memory, the MMU notifier
flags);
vcpu->arch.cr2 = native_read_cr2();
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+
+ vmx->idt_vectoring_info = 0;
vmx_enable_fb_clear(vmx);
- if (unlikely(vmx->fail))
+ if (unlikely(vmx->fail)) {
vmx->exit_reason.full = 0xdead;
- else
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ goto out;
+ }
+
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx->exit_reason.failed_vmentry))
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
is_nmi(vmx_get_intr_info(vcpu))) {
kvm_after_interrupt(vcpu);
}
+out:
guest_state_exit_irqoff();
}
loadsegment(es, __USER_DS);
#endif
- vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
-
pt_guest_exit(vmx);
kvm_load_host_xsave_state(vcpu);
vmx->nested.nested_run_pending = 0;
}
- vmx->idt_vectoring_info = 0;
-
if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
- if (likely(!vmx->exit_reason.failed_vmentry))
- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(vmx->exit_reason.failed_vmentry))
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
- vcpu->arch.xsaves_enabled = false;
+ /*
+ * XSAVES is effectively enabled if and only if XSAVE is also exposed
+ * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+ * set if and only if XSAVE is supported.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
+
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
vmx_setup_uret_msrs(vmx);
vmcs_set_secondary_exec_control(vmx,
vmx_secondary_exec_control(vmx));
- if (nested_vmx_allowed(vcpu))
+ if (guest_can_use(vcpu, X86_FEATURE_VMX))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (nested_vmx_allowed(vcpu))
+ if (guest_can_use(vcpu, X86_FEATURE_VMX))
nested_vmx_cr_fixed1_bits_update(vcpu);
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
.set_segment = vmx_set_segment,
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .is_valid_cr0 = vmx_is_valid_cr0,
.set_cr0 = vmx_set_cr0,
.is_valid_cr4 = vmx_is_valid_cr4,
.set_cr4 = vmx_set_cr4,
*/
vmx_setup_me_spte_mask();
- kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
ept_caps_to_lpage_level(vmx_capability.ept));
/*
{
allow_smaller_maxphyaddr = false;
- #ifdef CONFIG_KEXEC_CORE
- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
- synchronize_rcu();
- #endif
+ cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
+
vmx_cleanup_l1d_flush();
}
if (r)
goto err_l1d_flush;
- vmx_setup_fb_clear_ctrl();
-
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
pi_init_cpu(cpu);
}
- #ifdef CONFIG_KEXEC_CORE
- rcu_assign_pointer(crash_vmclear_loaded_vmcss,
- crash_vmclear_local_loaded_vmcss);
- #endif
+ cpu_emergency_register_virt_callback(vmx_emergency_disable);
+
vmx_check_vmcs12_offsets();
/*
u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);
+ u64 __read_mostly host_arch_capabilities;
+ EXPORT_SYMBOL_GPL(host_arch_capabilities);
+
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
}
EXPORT_SYMBOL_GPL(load_pdptrs);
+static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL)
+ return false;
+#endif
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return false;
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return false;
+
+ return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
+}
+
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
/*
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- cr0 |= X86_CR0_ET;
-
-#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL)
+ if (!kvm_is_valid_cr0(vcpu, cr0))
return 1;
-#endif
-
- cr0 &= ~CR0_RESERVED_BITS;
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
- return 1;
+ cr0 |= X86_CR0_ET;
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
- return 1;
+ /* Writes to CR0 reserved bits are ignored, even on Intel. */
+ cr0 &= ~CR0_RESERVED_BITS;
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- if (vcpu->arch.xsaves_enabled &&
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- if (vcpu->arch.xsaves_enabled &&
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, host_xss);
}
ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
static u64 kvm_get_arch_capabilities(void)
{
- u64 data = 0;
-
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
- data &= KVM_SUPPORTED_ARCH_CAP;
- }
+ u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
*/
}
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
+
return data;
}
u64 data;
fastpath_t ret = EXIT_FASTPATH_NONE;
+ kvm_vcpu_srcu_read_lock(vcpu);
+
switch (msr) {
case APIC_BASE_MSR + (APIC_ICR >> 4):
data = kvm_read_edx_eax(vcpu);
if (ret != EXIT_FASTPATH_NONE)
trace_kvm_msr_write(msr, data);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
else
vcpu->arch.tsc_offset = l1_offset;
- static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
+ static_call(kvm_x86_write_tsc_offset)(vcpu);
}
static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
vcpu->arch.tsc_scaling_ratio = l1_multiplier;
if (kvm_caps.has_tsc_control)
- static_call(kvm_x86_write_tsc_multiplier)(
- vcpu, vcpu->arch.tsc_scaling_ratio);
+ static_call(kvm_x86_write_tsc_multiplier)(vcpu);
}
static inline bool kvm_check_tsc_unstable(void)
return 0;
default:
return -ENXIO;
- break;
}
}
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range)
{
- unsigned long *bitmap = NULL;
+ unsigned long *bitmap;
size_t bitmap_size;
if (!user_range->nmsrs)
return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
}
- static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
- {
- return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
- }
-
static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
{
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
.fix_hypercall = emulator_fix_hypercall,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
- .guest_has_long_mode = emulator_guest_has_long_mode,
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
static void tsc_khz_changed(void *data)
{
struct cpufreq_freqs *freq = data;
- unsigned long khz = 0;
+ unsigned long khz;
WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
kvm_init_pmu_capability(ops->pmu_ops);
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+
r = ops->hardware_setup();
if (r != 0)
goto out_mmu_exit;
if (r < 0)
goto out;
if (r) {
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- static_call(kvm_x86_inject_irq)(vcpu, false);
- WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ int irq = kvm_cpu_get_interrupt(vcpu);
+
+ if (!WARN_ON_ONCE(irq == -1)) {
+ kvm_queue_interrupt(vcpu, irq, false);
+ static_call(kvm_x86_inject_irq)(vcpu, false);
+ WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ }
}
if (kvm_cpu_has_injectable_intr(vcpu))
static_call(kvm_x86_enable_irq_window)(vcpu);
r = -EINTR;
goto out;
}
+
/*
- * It should be impossible for the hypervisor timer to be in
- * use before KVM has ever run the vCPU.
+ * Don't bother switching APIC timer emulation from the
+ * hypervisor timer to the software timer, the only way for the
+ * APIC timer to be active is if userspace stuffed vCPU state,
+ * i.e. put the vCPU into a nonsensical state. Only an INIT
+ * will transition the vCPU out of UNINITIALIZED (without more
+ * state stuffing from userspace), which will reset the local
+ * APIC and thus cancel the timer or drop the IRQ (if the timer
+ * already expired).
*/
- WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
-
kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_block(vcpu);
kvm_vcpu_srcu_read_lock(vcpu);
return false;
}
- return kvm_is_valid_cr4(vcpu, sregs->cr4);
+ return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
+ kvm_is_valid_cr0(vcpu, sregs->cr0);
}
static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
__set_regs(vcpu, &vcpu->run->s.regs.regs);
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
}
+
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
- if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
+ struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
+
+ if (__set_sregs(vcpu, &sregs))
return -EINVAL;
+
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
}
+
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
- if (kvm_vcpu_ioctl_x86_set_vcpu_events(
- vcpu, &vcpu->run->s.regs.events))
+ struct kvm_vcpu_events events = vcpu->run->s.regs.events;
+
+ if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events))
return -EINVAL;
+
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
}
* See is_writable_pte() for more details (the case involving
* access-tracked SPTEs is particularly relevant).
*/
- kvm_arch_flush_remote_tlbs_memslot(kvm, new);
+ kvm_flush_remote_tlbs_memslot(kvm, new);
}
}
bool kvm_arch_has_irq_bypass(void)
{
- return true;
+ return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
}
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,