* to make a definition optional, but in this case the default will
* be __static_call_return0.
*/
+KVM_X86_OP(check_processor_compatibility)
KVM_X86_OP(hardware_enable)
KVM_X86_OP(hardware_disable)
KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(enable_nmi_window)
KVM_X86_OP(enable_irq_window)
KVM_X86_OP_OPTIONAL(update_cr8_intercept)
- KVM_X86_OP(check_apicv_inhibit_reasons)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP_OPTIONAL(hwapic_irr_update)
KVM_X86_OP_OPTIONAL(hwapic_isr_update)
};
/*
- * We use as the mode the number of bits allocated in the LDR for the
- * logical processor ID. It happens that these are all powers of two.
- * This makes it is very easy to detect cases where the APICs are
- * configured for multiple modes; in that case, we cannot use the map and
- * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ * Track the mode of the optimized logical map, as the rules for decoding the
+ * destination vary per mode. Enabling the optimized logical map requires all
+ * software-enabled local APIs to be in the same mode, each addressable APIC to
+ * be mapped to only one MDA, and each MDA to map to at most one APIC.
*/
- #define KVM_APIC_MODE_XAPIC_CLUSTER 4
- #define KVM_APIC_MODE_XAPIC_FLAT 8
- #define KVM_APIC_MODE_X2APIC 16
+ enum kvm_apic_logical_mode {
+ /* All local APICs are software disabled. */
+ KVM_APIC_MODE_SW_DISABLED,
+ /* All software enabled local APICs in xAPIC cluster addressing mode. */
+ KVM_APIC_MODE_XAPIC_CLUSTER,
+ /* All software enabled local APICs in xAPIC flat addressing mode. */
+ KVM_APIC_MODE_XAPIC_FLAT,
+ /* All software enabled local APICs in x2APIC mode. */
+ KVM_APIC_MODE_X2APIC,
+ /*
+ * Optimized map disabled, e.g. not all local APICs in the same logical
+ * mode, same logical ID assigned to multiple APICs, etc.
+ */
+ KVM_APIC_MODE_MAP_DISABLED,
+ };
struct kvm_apic_map {
struct rcu_head rcu;
- u8 mode;
+ enum kvm_apic_logical_mode logical_mode;
u32 max_apic_id;
union {
struct kvm_lapic *xapic_flat_map[8];
/* Xen emulation context */
struct kvm_xen {
+ struct mutex xen_lock;
u32 xen_version;
bool long_mode;
bool runstate_update_flag;
*/
APICV_INHIBIT_REASON_BLOCKIRQ,
+ /*
+ * APICv is disabled because not all vCPUs have a 1:1 mapping between
+ * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
+
/*
* For simplicity, the APIC acceleration is inhibited
* first time either APIC ID or APIC base are changed by the guest
* AVIC is disabled because SEV doesn't support it.
*/
APICV_INHIBIT_REASON_SEV,
+
+ /*
+ * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
+ * mapping between logical ID and vCPU.
+ */
+ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
};
struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
- /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
- struct rw_semaphore apicv_update_lock;
-
bool apic_access_memslot_enabled;
+ bool apic_access_memslot_inhibited;
+
+ /* Protects apicv_inhibit_reasons */
+ struct rw_semaphore apicv_update_lock;
unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
struct kvm_x86_ops {
const char *name;
+ int (*check_processor_compatibility)(void);
+
int (*hardware_enable)(void);
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+ const unsigned long required_apicv_inhibits;
+ bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(int isr);
};
struct kvm_x86_init_ops {
- int (*cpu_has_kvm_support)(void);
- int (*disabled_by_bios)(void);
- int (*check_processor_compatibility)(void);
int (*hardware_setup)(void);
unsigned int (*handle_intel_pt_intr)(void);
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
+void kvm_x86_vendor_exit(void);
+
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
bool kvm_apicv_activated(struct kvm *kvm);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
- void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
+ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set);
void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
*
* Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
}
+ static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+ {
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+ }
+
static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
- switch (map->mode) {
+ switch (map->logical_mode) {
+ case KVM_APIC_MODE_SW_DISABLED:
+ /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+ *cluster = map->xapic_flat_map;
+ *mask = 0;
+ return true;
case KVM_APIC_MODE_X2APIC: {
u32 offset = (dest_id >> 16) * 16;
u32 max_apic_id = map->max_apic_id;
*cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
*mask = dest_id & 0xf;
return true;
+ case KVM_APIC_MODE_MAP_DISABLED:
+ return false;
default:
- /* Not optimized. */
+ WARN_ON_ONCE(1);
return false;
}
}
kvfree(map);
}
+ static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu,
+ bool *xapic_id_mismatch)
+ {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 x2apic_id = kvm_x2apic_id(apic);
+ u32 xapic_id = kvm_xapic_id(apic);
+ u32 physical_id;
+
+ /*
+ * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+ * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+ * 32-bit value. Any unwanted aliasing due to truncation results will
+ * be detected below.
+ */
+ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+ *xapic_id_mismatch = true;
+
+ /*
+ * Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs.
+ * Allow sending events to vCPUs by their x2APIC ID even if the target
+ * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+ * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+ * and collide).
+ *
+ * Honor the architectural (and KVM's non-optimized) behavior if
+ * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
+ * to process messages independently. If multiple vCPUs have the same
+ * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+ * manually modified its xAPIC IDs, events targeting that ID are
+ * supposed to be recognized by all vCPUs with said ID.
+ */
+ if (vcpu->kvm->arch.x2apic_format) {
+ /* See also kvm_apic_match_physical_addr(). */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+ } else {
+ /*
+ * Disable the optimized map if the physical APIC ID is already
+ * mapped, i.e. is aliased to multiple vCPUs. The optimized
+ * map requires a strict 1:1 mapping between IDs and vCPUs.
+ */
+ if (apic_x2apic_mode(apic))
+ physical_id = x2apic_id;
+ else
+ physical_id = xapic_id;
+
+ if (new->phys_map[physical_id])
+ return -EINVAL;
+
+ new->phys_map[physical_id] = apic;
+ }
+
+ return 0;
+ }
+
+ static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu)
+ {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ enum kvm_apic_logical_mode logical_mode;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+
+ if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ return;
+
+ if (!kvm_apic_sw_enabled(apic))
+ return;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ if (!ldr)
+ return;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_mode = KVM_APIC_MODE_X2APIC;
+ } else {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ /*
+ * To optimize logical mode delivery, all software-enabled APICs must
+ * be configured for the same mode.
+ */
+ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
+ new->logical_mode = logical_mode;
+ } else if (new->logical_mode != logical_mode) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is read-only and derived directly from the
+ * x2APIC ID, thus is guaranteed to be addressable. KVM reuses
+ * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
+ * reversing the LDR calculation to get cluster of APICs, i.e. no
+ * additional work is required.
+ */
+ if (apic_x2apic_mode(apic)) {
+ WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+ return;
+ }
+
+ if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
+ &cluster, &mask))) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ if (!mask)
+ return;
+
+ ldr = ffs(mask) - 1;
+ if (!is_power_of_2(mask) || cluster[ldr])
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ else
+ cluster[ldr] = apic;
+ }
+
/*
* CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
*
struct kvm_vcpu *vcpu;
unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
+ bool xapic_id_mismatch = false;
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
goto out;
new->max_apic_id = max_id;
+ new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
kvm_for_each_vcpu(i, vcpu, kvm) {
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct kvm_lapic **cluster;
- u16 mask;
- u32 ldr;
- u8 xapic_id;
- u32 x2apic_id;
-
if (!kvm_apic_present(vcpu))
continue;
- xapic_id = kvm_xapic_id(apic);
- x2apic_id = kvm_x2apic_id(apic);
-
- /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
- if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
- x2apic_id <= new->max_apic_id)
- new->phys_map[x2apic_id] = apic;
- /*
- * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
- * prevent them from masking VCPUs with APIC ID <= 0xff.
- */
- if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
- new->phys_map[xapic_id] = apic;
-
- if (!kvm_apic_sw_enabled(apic))
- continue;
-
- ldr = kvm_lapic_get_reg(apic, APIC_LDR);
-
- if (apic_x2apic_mode(apic)) {
- new->mode |= KVM_APIC_MODE_X2APIC;
- } else if (ldr) {
- ldr = GET_APIC_LOGICAL_ID(ldr);
- if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
- new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
- else
- new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+ if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+ kvfree(new);
+ new = NULL;
+ goto out;
}
- if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
- continue;
-
- if (mask)
- cluster[ffs(mask) - 1] = apic;
+ kvm_recalculate_logical_map(new, vcpu);
}
out:
+ /*
+ * The optimized map is effectively KVM's internal version of APICv,
+ * and all unwanted aliasing that results in disabling the optimized
+ * map also applies to APICv.
+ */
+ if (!new)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+
+ if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+
+ if (xapic_id_mismatch)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+
old = rcu_dereference_protected(kvm->arch.apic_map,
lockdep_is_held(&kvm->arch.apic_map_lock));
rcu_assign_pointer(kvm->arch.apic_map, new);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
- static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
- {
- return ((id >> 4) << 16) | (1 << (id & 0xf));
- }
-
static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = kvm_apic_calc_x2apic_ldr(id);
{
if (!kvm->arch.disabled_lapic_found) {
kvm->arch.disabled_lapic_found = true;
- printk(KERN_INFO
- "Disabled LAPIC found during irq injection\n");
+ pr_info("Disabled LAPIC found during irq injection\n");
}
}
{
if (kvm->arch.x2apic_broadcast_quirk_disabled) {
if ((irq->dest_id == APIC_BROADCAST &&
- map->mode != KVM_APIC_MODE_X2APIC))
+ map->logical_mode != KVM_APIC_MODE_X2APIC))
return true;
if (irq->dest_id == X2APIC_BROADCAST)
return true;
if (apic->lapic_timer.period < min_period) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
+ "vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
apic->lapic_timer.period, min_period);
deadline = apic->lapic_timer.period;
else if (unlikely(deadline > apic->lapic_timer.period)) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested lapic timer restore with "
+ "vcpu %i: requested lapic timer restore with "
"starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
"Using initial count to start timer.\n",
apic->vcpu->vcpu_id,
}
}
- static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
- {
- struct kvm *kvm = apic->vcpu->kvm;
-
- if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
- return;
-
- if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
- return;
-
- kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
- }
-
static int get_lvt_index(u32 reg)
{
if (reg == APIC_LVTCMCI)
case APIC_ID: /* Local APIC ID */
if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
- kvm_lapic_xapic_id_updated(apic);
} else {
ret = 1;
}
struct kvm_lapic *apic = vcpu->arch.apic;
u64 val;
- if (apic_x2apic_mode(apic)) {
- if (KVM_BUG_ON(kvm_lapic_msr_read(apic, offset, &val), vcpu->kvm))
- return;
- } else {
- val = kvm_lapic_get_reg(apic, offset);
- }
-
/*
* ICR is a single 64-bit register when x2APIC is enabled. For legacy
* xAPIC, ICR writes need to go down the common (slightly slower) path
* to get the upper half from ICR2.
*/
if (apic_x2apic_mode(apic) && offset == APIC_ICR) {
+ val = kvm_lapic_get_reg64(apic, APIC_ICR);
kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32));
trace_kvm_apic_write(APIC_ICR, val);
} else {
/* TODO: optimize to just emulate side effect w/o one more write */
+ val = kvm_lapic_get_reg(apic, offset);
kvm_lapic_reg_write(apic, offset, (u32)val);
}
}
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
- kvm_vcpu_update_apicv(vcpu);
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
}
*/
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
+ apic->highest_isr_cache = -1;
+ }
+
+ int kvm_alloc_apic_access_page(struct kvm *kvm)
+ {
+ struct page *page;
+ void __user *hva;
+ int ret = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_memslot_enabled ||
+ kvm->arch.apic_access_memslot_inhibited)
+ goto out;
+
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva)) {
+ ret = PTR_ERR(hva);
+ goto out;
+ }
+
+ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Do not pin the page in memory, so that memory hot-unplug
+ * is able to migrate it.
+ */
+ put_page(page);
+ kvm->arch.apic_access_memslot_enabled = true;
+ out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
+ }
+ EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page);
+
+ void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
+ {
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.apic_access_memslot_enabled)
+ return;
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled) {
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ /*
+ * Clear "enabled" after the memslot is deleted so that a
+ * different vCPU doesn't get a false negative when checking
+ * the flag out of slots_lock. No additional memory barrier is
+ * needed as modifying memslots requires waiting other vCPUs to
+ * drop SRCU (see above), and false positives are ok as the
+ * flag is rechecked after acquiring slots_lock.
+ */
+ kvm->arch.apic_access_memslot_enabled = false;
+
+ /*
+ * Mark the memslot as inhibited to prevent reallocating the
+ * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
+ */
+ kvm->arch.apic_access_memslot_inhibited = true;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_vcpu_srcu_read_lock(vcpu);
}
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
kvm_apic_update_apicv(vcpu);
- apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
}
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
- if (!apic_x2apic_mode(apic))
- kvm_lapic_xapic_id_updated(apic);
-
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);
__start_apic_timer(apic, APIC_TMCCT);
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
- apic->highest_isr_cache = -1;
if (apic->apicv_active) {
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/hashtable.h>
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
- enum avic_modes avic_mode;
+ bool x2avic_enabled;
/*
* This is a wrapper of struct amd_iommu_ir_data.
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- /* Note:
- * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
- * MSR accesses, while interrupt injection to a running vCPU
- * can be achieved using AVIC doorbell. The AVIC hardware still
- * accelerate MMIO accesses, but this does not cause any harm
- * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
+ /*
+ * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+ * accesses, while interrupt injection to a running vCPU can be
+ * achieved using AVIC doorbell. KVM disables the APIC access page
+ * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
+ * AVIC in hybrid mode activates only the doorbell mechanism.
*/
- if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
- avic_mode == AVIC_MODE_X2) {
+ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
+ /*
+ * Flush the TLB, the guest may have inserted a non-APIC
+ * mapping into the TLB while AVIC was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
/* For xAVIC and hybrid-xAVIC modes */
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
/* Enabling MSR intercept for x2APIC registers */
u64 *avic_physical_id_table;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) ||
- (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID))
+ if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
+ (index > X2AVIC_MAX_PHYSICAL_ID))
return NULL;
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
return &avic_physical_id_table[index];
}
- /*
- * Note:
- * AVIC hardware walks the nested page table to check permissions,
- * but does not use the SPA address specified in the leaf page
- * table entry since it uses address in the AVIC_BACKING_PAGE pointer
- * field of the VMCB. Therefore, we set up the
- * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
- */
- static int avic_alloc_access_page(struct kvm *kvm)
- {
- void __user *ret;
- int r = 0;
-
- mutex_lock(&kvm->slots_lock);
-
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
-
- ret = __x86_set_memory_region(kvm,
- APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE,
- PAGE_SIZE);
- if (IS_ERR(ret)) {
- r = PTR_ERR(ret);
- goto out;
- }
-
- kvm->arch.apic_access_memslot_enabled = true;
- out:
- mutex_unlock(&kvm->slots_lock);
- return r;
- }
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
u64 *entry, new_entry;
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
- if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) ||
- (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID))
+ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
+ (id > X2AVIC_MAX_PHYSICAL_ID))
return -EINVAL;
if (!vcpu->arch.apic->regs)
if (kvm_apicv_activated(vcpu->kvm)) {
int ret;
- ret = avic_alloc_access_page(vcpu->kvm);
+ /*
+ * Note, AVIC hardware walks the nested page table to check
+ * permissions, but does not use the SPA address specified in
+ * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
+ * pointer field of the VMCB.
+ */
+ ret = kvm_alloc_apic_access_page(vcpu->kvm);
if (ret)
return ret;
}
put_cpu();
}
+
+ static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
+ {
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ }
+
+ static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
+ u32 icrl)
+ {
+ /*
+ * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID,
+ * i.e. APIC ID == vCPU ID.
+ */
+ struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
+
+ /* Once again, nothing to do if the target vCPU doesn't exist. */
+ if (unlikely(!target_vcpu))
+ return;
+
+ avic_kick_vcpu(target_vcpu, icrl);
+ }
+
+ static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
+ u32 logid_index, u32 icrl)
+ {
+ u32 physical_id;
+
+ if (avic_logical_id_table) {
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ /* Nothing to do if the logical destination is invalid. */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return;
+
+ physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC, the logical APIC ID is a read-only value that is
+ * derived from the x2APIC ID, thus the x2APIC ID can be found
+ * by reversing the calculation (stored in logid_index). Note,
+ * bits 31:20 of the x2APIC ID aren't propagated to the logical
+ * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS.
+ */
+ physical_id = logid_index;
+ }
+
+ avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
+ }
+
/*
* A fast-path version of avic_kick_target_vcpus(), which attempts to match
* destination APIC ID to vCPU without looping through all vCPUs.
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
- u32 l1_physical_id, dest;
- struct kvm_vcpu *target_vcpu;
int dest_mode = icrl & APIC_DEST_MASK;
int shorthand = icrl & APIC_SHORT_MASK;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ u32 dest;
if (shorthand != APIC_DEST_NOSHORT)
return -EINVAL;
if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
return -EINVAL;
- l1_physical_id = dest;
-
- if (WARN_ON_ONCE(l1_physical_id != index))
+ if (WARN_ON_ONCE(dest != index))
return -EINVAL;
+ avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
} else {
- u32 bitmap, cluster;
- int logid_index;
+ u32 *avic_logical_id_table;
+ unsigned long bitmap, i;
+ u32 cluster;
if (apic_x2apic_mode(source)) {
/* 16 bit dest mask, 16 bit cluster id */
- bitmap = dest & 0xFFFF0000;
+ bitmap = dest & 0xFFFF;
cluster = (dest >> 16) << 4;
} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
/* 8 bit dest mask*/
cluster = (dest >> 4) << 2;
}
+ /* Nothing to do if there are no destinations in the cluster. */
if (unlikely(!bitmap))
- /* guest bug: nobody to send the logical interrupt to */
return 0;
- if (!is_power_of_2(bitmap))
- /* multiple logical destinations, use slow path */
- return -EINVAL;
-
- logid_index = cluster + __ffs(bitmap);
-
- if (!apic_x2apic_mode(source)) {
- u32 *avic_logical_id_table =
- page_address(kvm_svm->avic_logical_id_table_page);
-
- u32 logid_entry = avic_logical_id_table[logid_index];
-
- if (WARN_ON_ONCE(index != logid_index))
- return -EINVAL;
-
- /* guest bug: non existing/reserved logical destination */
- if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
- return 0;
-
- l1_physical_id = logid_entry &
- AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
- } else {
- /*
- * For x2APIC logical mode, cannot leverage the index.
- * Instead, calculate physical ID from logical ID in ICRH.
- */
- int cluster = (icrh & 0xffff0000) >> 16;
- int apic = ffs(icrh & 0xffff) - 1;
-
- /*
- * If the x2APIC logical ID sub-field (i.e. icrh[15:0])
- * contains anything but a single bit, we cannot use the
- * fast path, because it is limited to a single vCPU.
- */
- if (apic < 0 || icrh != (1 << apic))
- return -EINVAL;
+ if (apic_x2apic_mode(source))
+ avic_logical_id_table = NULL;
+ else
+ avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
- l1_physical_id = (cluster << 4) + apic;
- }
+ /*
+ * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
+ * IDs, thus each bit in the destination is guaranteed to map
+ * to at most one vCPU.
+ */
+ for_each_set_bit(i, &bitmap, 16)
+ avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
+ cluster + i, icrl);
}
- target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
- if (unlikely(!target_vcpu))
- /* guest bug: non existing vCPU is a target of this IPI*/
- return 0;
-
- target_vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(target_vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
return 0;
}
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
+ u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
unsigned long i;
struct kvm_vcpu *vcpu;
* since entered the guest will have processed pending IRQs at VMRUN.
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
- u32 dest;
-
- if (apic_x2apic_mode(vcpu->arch.apic))
- dest = icrh;
- else
- dest = GET_XAPIC_DEST_FIELD(icrh);
-
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
- dest, icrl & APIC_DEST_MASK)) {
- vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
- }
+ dest, icrl & APIC_DEST_MASK))
+ avic_kick_vcpu(vcpu, icrl);
}
}
trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
/*
* Emulate IPIs that are not handled by AVIC hardware, which
- * only virtualizes Fixed, Edge-Triggered INTRs. The exit is
- * a trap, e.g. ICR holds the correct value and RIP has been
- * advanced, KVM is responsible only for emulating the IPI.
- * Sadly, hardware may sometimes leave the BUSY flag set, in
- * which case KVM needs to emulate the ICR write as well in
+ * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+ * if _any_ targets are invalid, e.g. if the logical mode mask
+ * is a superset of running vCPUs.
+ *
+ * The exit is a trap, e.g. ICR holds the correct value and RIP
+ * has been advanced, KVM is responsible only for emulating the
+ * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
+ * in which case KVM needs to emulate the ICR write as well in
* order to clear the BUSY flag.
*/
if (icrl & APIC_ICR_BUSY)
*/
avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
break;
- case AVIC_IPI_FAILURE_INVALID_TARGET:
- break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
break;
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- int index;
u32 *logical_apic_id_table;
- int dlid = GET_APIC_LOGICAL_ID(ldr);
-
- if (!dlid)
- return NULL;
+ u32 cluster, index;
- if (flat) { /* flat */
- index = ffs(dlid) - 1;
- if (index > 7)
- return NULL;
- } else { /* cluster */
- int cluster = (dlid & 0xf0) >> 4;
- int apic = ffs(dlid & 0x0f) - 1;
+ ldr = GET_APIC_LOGICAL_ID(ldr);
- if ((apic < 0) || (apic > 7) ||
- (cluster >= 0xf))
+ if (flat) {
+ cluster = 0;
+ } else {
+ cluster = (ldr >> 4);
+ if (cluster >= 0xf)
return NULL;
- index = (cluster << 2) + apic;
+ ldr &= 0xf;
}
+ if (!ldr || !is_power_of_2(ldr))
+ return NULL;
+
+ index = __ffs(ldr);
+ if (WARN_ON_ONCE(index > 7))
+ return NULL;
+ index += (cluster << 2);
logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
return &logical_apic_id_table[index];
}
- static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+ static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
bool flat;
u32 *entry, new_entry;
flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
entry = avic_get_logical_id_entry(vcpu, ldr, flat);
if (!entry)
- return -EINVAL;
+ return;
new_entry = READ_ONCE(*entry);
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
WRITE_ONCE(*entry, new_entry);
-
- return 0;
}
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}
- static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+ static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
- int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
u32 id = kvm_xapic_id(vcpu->arch.apic);
/* AVIC does not support LDR update for x2APIC */
if (apic_x2apic_mode(vcpu->arch.apic))
- return 0;
+ return;
if (ldr == svm->ldr_reg)
- return 0;
+ return;
avic_invalidate_logical_id_entry(vcpu);
- if (ldr)
- ret = avic_ldr_write(vcpu, id, ldr);
-
- if (!ret)
- svm->ldr_reg = ldr;
-
- return ret;
+ svm->ldr_reg = ldr;
+ avic_ldr_write(vcpu, id, ldr);
}
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
switch (offset) {
case APIC_LDR:
- if (avic_handle_ldr_update(vcpu))
- return 0;
+ avic_handle_ldr_update(vcpu);
break;
case APIC_DFR:
avic_handle_dfr_update(vcpu);
break;
+ case APIC_RRR:
+ /* Ignore writes to Read Remote Data, it's read-only. */
+ return 1;
default:
break;
}
avic_handle_ldr_update(vcpu);
}
- void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
- {
- if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE)
- return;
-
- if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) {
- WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id);
- return;
- }
- avic_refresh_apicv_exec_ctrl(vcpu);
- }
-
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
return ret;
}
- bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
- {
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_ABSENT) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_NESTED) |
- BIT(APICV_INHIBIT_REASON_IRQWIN) |
- BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_SEV) |
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
-
- return supported & BIT(reason);
- }
-
-
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-
- void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb01.ptr;
- bool activated = kvm_vcpu_apicv_active(vcpu);
- if (!enable_apicv)
+ if (!lapic_in_kernel(vcpu) || !enable_apicv)
return;
- if (activated) {
+ if (kvm_vcpu_apicv_active(vcpu)) {
/**
* During AVIC temporary deactivation, guest could update
* APIC ID, DFR and LDR registers, which would not be trapped
avic_deactivate_vmcb(svm);
}
vmcb_mark_dirty(vmcb, VMCB_AVIC);
+ }
+
+ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+ {
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ avic_refresh_virtual_apic_mode(vcpu);
if (activated)
avic_vcpu_load(vcpu, vcpu->cpu);
if (!npt_enabled)
return false;
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
+ if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
+ pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
+ pr_warn(FW_BUG "Try enable AVIC using force_avic option");
+ }
+ return false;
+ }
+
if (boot_cpu_has(X86_FEATURE_AVIC)) {
- avic_mode = AVIC_MODE_X1;
pr_info("AVIC enabled\n");
} else if (force_avic) {
/*
* Some older systems does not advertise AVIC support.
* See Revision Guide for specific AMD processor for more detail.
*/
- avic_mode = AVIC_MODE_X1;
pr_warn("AVIC is not supported in CPUID but force enabled");
pr_warn("Your system might crash and burn");
}
/* AVIC is a prerequisite for x2AVIC. */
- if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
- if (avic_mode == AVIC_MODE_X1) {
- avic_mode = AVIC_MODE_X2;
- pr_info("x2AVIC enabled\n");
- } else {
- pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
- pr_warn(FW_BUG "Try enable AVIC using force_avic option");
- }
- }
+ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
+ if (x2avic_enabled)
+ pr_info("x2AVIC enabled\n");
- if (avic_mode != AVIC_MODE_NONE)
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- return !!avic_mode;
+ return true;
}
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
c->intercepts[i] = h->intercepts[i];
if (g->int_ctl & V_INTR_MASKING_MASK) {
- /* We only want the cr8 intercept bits of L1 */
- vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
- vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
-
/*
- * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
- * affect any interrupt we may want to inject; therefore,
- * interrupt window vmexits are irrelevant to L0.
+ * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
+ * does not affect any interrupt we may want to inject;
+ * therefore, writes to CR8 are irrelevant to L0, as are
+ * interrupt window vmexits.
*/
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
* to benefit from it right away.
*/
if (kvm_apicv_activated(vcpu->kvm))
- kvm_vcpu_update_apicv(vcpu);
+ __kvm_vcpu_update_apicv(vcpu);
return 0;
}
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
vcpu->arch.osvw.status |= 1;
}
-static int has_svm(void)
+static bool kvm_is_svm_supported(void)
{
+ int cpu = raw_smp_processor_id();
const char *msg;
+ u64 vm_cr;
if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svm: %s\n", msg);
- return 0;
+ pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ return false;
}
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
- return 0;
+ return false;
}
- return 1;
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
+ pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int svm_check_processor_compat(void)
+{
+ if (!kvm_is_svm_supported())
+ return -EIO;
+
+ return 0;
}
void __svm_write_tsc_multiplier(u64 multiplier)
if (efer & EFER_SVME)
return -EBUSY;
- if (!has_svm()) {
- pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
- return -EINVAL;
- }
sd = per_cpu_ptr(&svm_data, me);
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
if (intercept == svm->x2avic_msrs_intercepted)
return;
- if (avic_mode != AVIC_MODE_X2 ||
+ if (!x2avic_enabled ||
!apic_x2apic_mode(svm->vcpu.arch.apic))
return;
* Erratum 383 triggered. Guest state is corrupt so kill the
* guest.
*/
- pr_err("KVM: Guest triggered AMD Erratum 383\n");
+ pr_err("Guest triggered AMD Erratum 383\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
msr->data = 0;
switch (msr->index) {
- case MSR_F10H_DECFG:
- if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
- msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
default:
return KVM_MSR_RET_INVALID;
msr_info->data = 0x1E;
}
break;
- case MSR_F10H_DECFG:
+ case MSR_AMD64_DE_CFG:
msr_info->data = svm->msr_decfg;
break;
default:
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
- case MSR_F10H_DECFG: {
+ case MSR_AMD64_DE_CFG: {
struct kvm_msr_entry msr_entry;
msr_entry.index = msr->index;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
hypercall[2] = 0xd9;
}
-static int __init svm_check_processor_compat(void)
-{
- return 0;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
- pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
+ pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
/*
* If the fault occurred in userspace, arbitrarily inject #GP
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
- .name = "kvm_amd",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
- .set_virtual_apic_mode = avic_set_virtual_apic_mode,
+ .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
- .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
.apicv_post_state_restore = avic_apicv_post_state_restore,
+ .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
.get_exit_info = svm_get_exit_info,
}
if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
- pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Setup shadow_me_value and shadow_me_mask */
kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+ } else if (!x2avic_enabled) {
+ svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
}
if (vls) {
static struct kvm_x86_init_ops svm_init_ops __initdata = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
- .check_processor_compatibility = svm_check_processor_compat,
.runtime_ops = &svm_x86_ops,
.pmu_ops = &amd_pmu_ops,
static int __init svm_init(void)
{
+ int r;
+
__unused_size_checks();
- return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
- __alignof__(struct vcpu_svm), THIS_MODULE);
+ if (!kvm_is_svm_supported())
+ return -EOPNOTSUPP;
+
+ r = kvm_x86_vendor_init(&svm_init_ops);
+ if (r)
+ return r;
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ kvm_x86_vendor_exit();
+ return r;
}
static void __exit svm_exit(void)
{
kvm_exit();
+ kvm_x86_vendor_exit();
}
module_init(svm_init)
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/highmem.h>
#include <linux/hrtimer.h>
if (fault)
kvm_spurious_fault();
else
- vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
}
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
- vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
ext, vpid, gva);
}
noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
- vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
ext, eptp, gpa);
}
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
-struct vmcs_config vmcs_config;
-struct vmx_capability vmx_capability;
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
+static struct kvm_x86_ops vmx_x86_ops __initdata;
+
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
return 0;
}
+static __init void hv_init_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above.
+ */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
+
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+
+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!static_branch_unlikely(&enable_evmcs))
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ * page, and should reject CPU onlining if eVMCS is enabled the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
if (!instr_len)
goto rip_updated;
- WARN(exit_reason.enclave_mode,
- "KVM: skipping instruction after SGX enclave VM-Exit");
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + instr_len;
}
}
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !boot_cpu_has(X86_FEATURE_VMX);
-}
-
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
- u64 msr;
-
- cr4_set_bits(X86_CR4_VMXE);
-
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- : : [vmxon_pointer] "m"(vmxon_pointer)
- : : fault);
- return 0;
-
-fault:
- WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- cr4_clear_bits(X86_CR4_VMXE);
-
- return -EFAULT;
-}
-
-static int vmx_hardware_enable(void)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- int r;
-
- if (cr4_read_shadow() & X86_CR4_VMXE)
- return -EBUSY;
-
- /*
- * This can happen if we hot-added a CPU but failed to allocate
- * VP assist page for it.
- */
- if (static_branch_unlikely(&enable_evmcs) &&
- !hv_get_vp_assist_page(cpu))
- return -EFAULT;
-
- intel_pt_handle_vmx(1);
-
- r = kvm_cpu_vmxon(phys_addr);
- if (r) {
- intel_pt_handle_vmx(0);
- return r;
- }
-
- if (enable_ept)
- ept_sync_global();
-
- return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
- int cpu = raw_smp_processor_id();
- struct loaded_vmcs *v, *n;
-
- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
- __loaded_vmcs_clear(v);
-}
-
-static void vmx_hardware_disable(void)
-{
- vmclear_local_loaded_vmcss();
-
- if (cpu_vmxoff())
- kvm_spurious_fault();
-
- intel_pt_handle_vmx(0);
-}
-
/*
* There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
* directly instead of going through cpu_has(), to ensure KVM is trapping
return false;
}
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
return 0;
}
-static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
u64 allowed;
return ctl_opt & allowed;
}
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
- struct vmx_capability *vmx_cap)
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
{
u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
return 0;
}
+static bool kvm_is_vmx_supported(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!cpu_has_vmx()) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
+ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+static int vmx_hardware_enable(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ if (enable_ept)
+ ept_sync_global();
+
+ return 0;
+}
+
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+static void vmx_hardware_disable(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not "
- "paragraph aligned when entering "
- "protected mode (seg=%d)", seg);
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
* vcpu. Warn the user that an update is overdue.
*/
if (!kvm_vmx->tss_addr)
- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- "called before entering vcpu\n");
+ pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
vmx_segment_cache_clear(vmx);
vmcs_write32(sf->ar_bytes, ar);
}
- static int alloc_apic_access_page(struct kvm *kvm)
- {
- struct page *page;
- void __user *hva;
- int ret = 0;
-
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
- hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
- if (IS_ERR(hva)) {
- ret = PTR_ERR(hva);
- goto out;
- }
-
- page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page)) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * Do not pin the page in memory, so that memory hot-unplug
- * is able to migrate it.
- */
- put_page(page);
- kvm->arch.apic_access_memslot_enabled = true;
- out:
- mutex_unlock(&kvm->slots_lock);
- return ret;
- }
-
int allocate_vpid(void)
{
int vpid;
gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
- "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
- err = alloc_apic_access_page(vcpu->kvm);
+ err = kvm_alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
return 0;
}
-static int __init vmx_check_processor_compat(void)
-{
- struct vmcs_config vmcs_conf;
- struct vmx_capability vmx_cap;
-
- if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !this_cpu_has(X86_FEATURE_VMX)) {
- pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
- return -EIO;
- }
-
- if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
- return -EIO;
- if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- return -EIO;
- }
- return 0;
-}
-
static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
free_kvm_area();
}
- static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
- {
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_ABSENT) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
-
- return supported & BIT(reason);
- }
+ #define VMX_REQUIRED_APICV_INHIBITS \
+ ( \
+ BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
+ )
static void vmx_vm_destroy(struct kvm *kvm)
{
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = "kvm_intel",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
.hardware_unsetup = vmx_hardware_unsetup,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.apicv_post_state_restore = vmx_apicv_post_state_restore,
- .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
return -EIO;
if (cpu_has_perf_global_ctrl_bug())
- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
if (boot_cpu_has(X86_FEATURE_NX))
if (boot_cpu_has(X86_FEATURE_MPX)) {
rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
- WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
if (!cpu_has_vmx_mpx())
/* NX support is required for shadow paging. */
if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
- pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
}
static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
.handle_intel_pt_intr = NULL,
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
+ kvm_x86_vendor_exit();
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->nested_control.features.directhypercall = 0;
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
{
int r, cpu;
-#if IS_ENABLED(CONFIG_HYPERV)
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
+
/*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
+ * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
+ * to unwind if a later step fails.
*/
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
+ hv_init_evmcs();
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
-
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
-
- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
- = hv_enable_l2_tlb_flush;
-
- } else {
- enlightened_vmcs = false;
- }
-#endif
-
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_x86_vendor_init(&vmx_init_ops);
if (r)
return r;
/*
- * Must be called after kvm_init() so enable_ept is properly set
+ * Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
* the pre module init parser. If no parameter was given, it will
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ if (r)
+ goto err_l1d_flush;
vmx_setup_fb_clear_ctrl();
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
+
+err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
}
module_init(vmx_init);
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "irq.h"
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static DEFINE_MUTEX(vendor_module_lock);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
#define KVM_X86_OP(func) \
MSR_IA32_VMX_EPT_VPID_CAP,
MSR_IA32_VMX_VMFUNC,
- MSR_F10H_DECFG,
+ MSR_AMD64_DE_CFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
!guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
return kvm_handle_invalid_op(vcpu);
- pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
+ pr_warn_once("%s instruction emulated as NOP!\n", insn);
return kvm_emulate_as_nop(vcpu);
}
int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
- pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+ pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+ user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
return X86EMUL_CONTINUE;
emul_write:
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+ pr_warn_once("emulating exchange as write\n");
return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
if (!ctxt) {
- pr_err("kvm: failed to allocate vcpu's emulator\n");
+ pr_err("failed to allocate vcpu's emulator\n");
return NULL;
}
};
#endif
-int kvm_arch_init(void *opaque)
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
+static int kvm_x86_check_processor_compatibility(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Compatibility checks are done when loading KVM and when enabling
+ * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+ * compatible, i.e. KVM should never perform a compatibility check on
+ * an offline CPU.
+ */
+ WARN_ON(!cpu_online(cpu));
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return static_call(kvm_x86_check_processor_compatibility)();
+}
+
+static void kvm_x86_check_cpu_compat(void *ret)
+{
+ *(int *)ret = kvm_x86_check_processor_compatibility();
+}
+
+static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
- struct kvm_x86_init_ops *ops = opaque;
u64 host_pat;
- int r;
+ int r, cpu;
if (kvm_x86_ops.hardware_enable) {
- pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
+ pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
- if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support for '%s'\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
-
/*
* KVM explicitly assumes that the guest has an FPU and
* FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
* vCPU's FPU state as a fxregs_state struct.
*/
if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
- printk(KERN_ERR "kvm: inadequate fpu\n");
+ pr_err("inadequate fpu\n");
return -EOPNOTSUPP;
}
*/
if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
(host_pat & GENMASK(2, 0)) != 6) {
- pr_err("kvm: host PAT[0] is not WB\n");
+ pr_err("host PAT[0] is not WB\n");
return -EIO;
}
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
- pr_err("kvm: failed to allocate cache for x86 emulator\n");
+ pr_err("failed to allocate cache for x86 emulator\n");
return -ENOMEM;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
if (!user_return_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ pr_err("failed to allocate percpu kvm_user_return_msrs\n");
r = -ENOMEM;
goto out_free_x86_emulator_cache;
}
if (r)
goto out_free_percpu;
- kvm_timer_init();
-
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ kvm_init_pmu_capability();
+
+ r = ops->hardware_setup();
+ if (r != 0)
+ goto out_mmu_exit;
+
+ kvm_ops_update(ops);
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+ if (r < 0)
+ goto out_unwind_ops;
+ }
+
+ /*
+ * Point of no return! DO NOT add error paths below this point unless
+ * absolutely necessary, as most operations from this point forward
+ * require unwinding.
+ */
+ kvm_timer_init();
+
if (pi_inject_timer == -1)
pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
#ifdef CONFIG_X86_64
set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ kvm_caps.supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
+
+ if (kvm_caps.has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ kvm_caps.max_guest_tsc_khz = max;
+ }
+ kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+ kvm_init_msr_list();
return 0;
+out_unwind_ops:
+ kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
+out_mmu_exit:
+ kvm_mmu_vendor_module_exit();
out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
return r;
}
-void kvm_arch_exit(void)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+{
+ int r;
+
+ mutex_lock(&vendor_module_lock);
+ r = __kvm_x86_vendor_init(ops);
+ mutex_unlock(&vendor_module_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+
+void kvm_x86_vendor_exit(void)
{
+ kvm_unregister_perf_callbacks();
+
#ifdef CONFIG_X86_64
if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
clear_hv_tscchange_cb();
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
static_key_deferred_flush(&kvm_xen_enabled);
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
+ mutex_lock(&vendor_module_lock);
+ kvm_x86_ops.hardware_enable = NULL;
+ mutex_unlock(&vendor_module_lock);
}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
- void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
bool activate;
preempt_enable();
up_read(&vcpu->kvm->arch.apicv_update_lock);
}
- EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+ EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
+
+ static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+ {
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * Due to sharing page tables across vCPUs, the xAPIC memslot must be
+ * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
+ * and hardware doesn't support x2APIC virtualization. E.g. some AMD
+ * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in
+ * this case so that KVM can the AVIC doorbell to inject interrupts to
+ * running vCPUs, but KVM must not create SPTEs for the APIC base as
+ * the vCPU would incorrectly be able to access the vAPIC page via MMIO
+ * despite being in x2APIC mode. For simplicity, inhibiting the APIC
+ * access page is sticky.
+ */
+ if (apic_x2apic_mode(vcpu->arch.apic) &&
+ kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
+ kvm_inhibit_apic_access_page(vcpu);
+
+ __kvm_vcpu_update_apicv(vcpu);
+ }
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set)
lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
- if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
+ if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
return;
old = new = kvm->arch.apicv_inhibit_reasons;
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && kvm->created_vcpus)
- pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ pr_warn_once("SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
if (!kvm->arch.max_vcpu_ids)
goto free_wbinvd_dirty_mask;
if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
- pr_err("kvm: failed to allocate vcpu's fpu\n");
+ pr_err("failed to allocate vcpu's fpu\n");
goto free_emulate_ctxt;
}
bool stable, backwards_tsc = false;
kvm_user_return_msr_cpu_online();
+
+ ret = kvm_x86_check_processor_compatibility();
+ if (ret)
+ return ret;
+
ret = static_call(kvm_x86_hardware_enable)();
if (ret != 0)
return ret;
drop_user_return_notifiers();
}
-static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
-{
- memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
-
-#define __KVM_X86_OP(func) \
- static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
- WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
- static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
- (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-
- kvm_pmu_ops_update(ops->pmu_ops);
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
- struct kvm_x86_init_ops *ops = opaque;
- int r;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
-
- kvm_init_pmu_capability();
-
- r = ops->hardware_setup();
- if (r != 0)
- return r;
-
- kvm_ops_update(ops);
-
- kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
-
- if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
- kvm_caps.supported_xss = 0;
-
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
- cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
-
- if (kvm_caps.has_tsc_control) {
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated because it will always
- * be 1 on all machines.
- */
- u64 max = min(0x7fffffffULL,
- __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
- kvm_caps.max_guest_tsc_khz = max;
- }
- kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
- kvm_init_msr_list();
- return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- kvm_unregister_perf_callbacks();
-
- static_call(kvm_x86_hardware_unsetup)();
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- struct kvm_x86_init_ops *ops = opaque;
-
- WARN_ON(!irqs_disabled());
-
- if (__cr4_reserved_bits(cpu_has, c) !=
- __cr4_reserved_bits(cpu_has, &boot_cpu_data))
- return -EIO;
-
- return ops->check_processor_compatibility();
-}
-
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;