Merge branch 'kvm-lapic-fix-and-cleanup' into HEAD

author Paolo Bonzini <[email protected]>

Tue, 27 Dec 2022 12:56:16 +0000 (07:56 -0500)

committer Paolo Bonzini <[email protected]>

Tue, 24 Jan 2023 11:08:01 +0000 (06:08 -0500)
author Paolo Bonzini <[email protected]>
Tue, 27 Dec 2022 12:56:16 +0000 (07:56 -0500)
committer Paolo Bonzini <[email protected]>
Tue, 24 Jan 2023 11:08:01 +0000 (06:08 -0500)
diff --combined arch/x86/include/asm/kvm-x86-ops.h

index dba2909e5ae2a12658f94e54398da251b9bbf439,84f43caef9b7eaa73b178f009389ed6dbac49e87..8dc345cc63188b4f338317dc18752b1bfd0954e4
--- 1/arch/x86/include/asm/kvm-x86-ops.h
--- 2/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@@ -14,7 -14,6 +14,7 @@@ BUILD_BUG_ON(1
    * to make a definition optional, but in this case the default will
    * be __static_call_return0.
    */
+ +KVM_X86_OP(check_processor_compatibility)
   KVM_X86_OP(hardware_enable)
   KVM_X86_OP(hardware_disable)
   KVM_X86_OP(hardware_unsetup)
@@@ -77,7 -76,6 +77,6 @@@ KVM_X86_OP(set_nmi_mask
   KVM_X86_OP(enable_nmi_window)
   KVM_X86_OP(enable_irq_window)
   KVM_X86_OP_OPTIONAL(update_cr8_intercept)
- KVM_X86_OP(check_apicv_inhibit_reasons)
   KVM_X86_OP(refresh_apicv_exec_ctrl)
   KVM_X86_OP_OPTIONAL(hwapic_irr_update)
   KVM_X86_OP_OPTIONAL(hwapic_isr_update)
diff --combined arch/x86/include/asm/kvm_host.h

index 8d0a0a7c34fc399286569acb94c1d8d9e9fb4524,7ca854714ccda12af7767aed5d93a55f8e271117..4d2bc08794e45c6ef11cd551ebd9a52f2e8dfc5b
--- 1/arch/x86/include/asm/kvm_host.h
--- 2/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@@ -1022,19 -1022,30 +1022,30 @@@ struct kvm_arch_memory_slot 
   };
   
   /*
-  * We use as the mode the number of bits allocated in the LDR for the
-  * logical processor ID.  It happens that these are all powers of two.
-  * This makes it is very easy to detect cases where the APICs are
-  * configured for multiple modes; in that case, we cannot use the map and
-  * hence cannot use kvm_irq_delivery_to_apic_fast either.
+  * Track the mode of the optimized logical map, as the rules for decoding the
+  * destination vary per mode.  Enabling the optimized logical map requires all
+  * software-enabled local APICs to be in the same mode, each addressable APIC to
+  * be mapped to only one MDA, and each MDA to map to at most one APIC.
    */
- #define KVM_APIC_MODE_XAPIC_CLUSTER          4
- #define KVM_APIC_MODE_XAPIC_FLAT             8
- #define KVM_APIC_MODE_X2APIC                16
+ enum kvm_apic_logical_mode {
+       /* All local APICs are software disabled. */
+       KVM_APIC_MODE_SW_DISABLED,
+       /* All software enabled local APICs in xAPIC cluster addressing mode. */
+       KVM_APIC_MODE_XAPIC_CLUSTER,
+       /* All software enabled local APICs in xAPIC flat addressing mode. */
+       KVM_APIC_MODE_XAPIC_FLAT,
+       /* All software enabled local APICs in x2APIC mode. */
+       KVM_APIC_MODE_X2APIC,
+       /*
+        * Optimized map disabled, e.g. not all local APICs in the same logical
+        * mode, same logical ID assigned to multiple APICs, etc.
+        */
+       KVM_APIC_MODE_MAP_DISABLED,
+ };
   
   struct kvm_apic_map {
         struct rcu_head rcu;
-       u8 mode;
+       enum kvm_apic_logical_mode logical_mode;
         u32 max_apic_id;
         union {
                 struct kvm_lapic *xapic_flat_map[8];
@@@ -1112,7 -1123,6 +1123,7 @@@ struct msr_bitmap_range 
   
   /* Xen emulation context */
   struct kvm_xen {
+ +      struct mutex xen_lock;
         u32 xen_version;
         bool long_mode;
         bool runstate_update_flag;
@@@ -1164,6 -1174,12 +1175,12 @@@ enum kvm_apicv_inhibit 
          */
         APICV_INHIBIT_REASON_BLOCKIRQ,
   
+       /*
+        * APICv is disabled because not all vCPUs have a 1:1 mapping between
+        * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
+        */
+       APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
+ 
         /*
          * For simplicity, the APIC acceleration is inhibited
          * first time either APIC ID or APIC base are changed by the guest
@@@ -1202,6 -1218,12 +1219,12 @@@
          * AVIC is disabled because SEV doesn't support it.
          */
         APICV_INHIBIT_REASON_SEV,
+ 
+       /*
+        * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
+        * mapping between logical ID and vCPU.
+        */
+       APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
   };
   
   struct kvm_arch {
@@@ -1250,10 -1272,11 +1273,11 @@@
         struct kvm_apic_map __rcu *apic_map;
         atomic_t apic_map_dirty;
   
-       /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
-       struct rw_semaphore apicv_update_lock;
- 
         bool apic_access_memslot_enabled;
+       bool apic_access_memslot_inhibited;
+ 
+       /* Protects apicv_inhibit_reasons */
+       struct rw_semaphore apicv_update_lock;
         unsigned long apicv_inhibit_reasons;
   
         gpa_t wall_clock;
@@@ -1512,8 -1535,6 +1536,8 @@@ static inline u16 kvm_lapic_irq_dest_mo
   struct kvm_x86_ops {
         const char *name;
   
+ +      int (*check_processor_compatibility)(void);
+ +
         int (*hardware_enable)(void);
         void (*hardware_disable)(void);
         void (*hardware_unsetup)(void);
@@@ -1602,6 -1623,8 +1626,8 @@@
         void (*enable_irq_window)(struct kvm_vcpu *vcpu);
         void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
         bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+       const unsigned long required_apicv_inhibits;
+       bool allow_apicv_in_x2apic_without_x2apic_virtualization;
         void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
         void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
         void (*hwapic_isr_update)(int isr);
@@@ -1725,6 -1748,9 +1751,6 @@@ struct kvm_x86_nested_ops 
   };
   
   struct kvm_x86_init_ops {
- -      int (*cpu_has_kvm_support)(void);
- -      int (*disabled_by_bios)(void);
- -      int (*check_processor_compatibility)(void);
         int (*hardware_setup)(void);
         unsigned int (*handle_intel_pt_intr)(void);
   
@@@ -1751,9 -1777,6 +1777,9 @@@ extern struct kvm_x86_ops kvm_x86_ops
   #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
   #include <asm/kvm-x86-ops.h>
   
+ +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
+ +void kvm_x86_vendor_exit(void);
+ +
   #define __KVM_HAVE_ARCH_VM_ALLOC
   static inline struct kvm *kvm_arch_alloc_vm(void)
   {
@@@ -1976,7 -1999,7 +2002,7 @@@ gpa_t kvm_mmu_gva_to_gpa_system(struct 
   
   bool kvm_apicv_activated(struct kvm *kvm);
   bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
- void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
+ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
   void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
                                       enum kvm_apicv_inhibit reason, bool set);
   void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
diff --combined arch/x86/kvm/lapic.c

index cfaf1d8c64ca902330fbccf3ed318c5f646960d6,669ea125b7e229f5454e3f087ab5ec11ec81aa1b..7cf4eebc9bcc8ec3c06f029b8463b388f7c58cfc
--- 1/arch/x86/kvm/lapic.c
--- 2/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@@ -15,7 -15,6 +15,7 @@@
    *
    * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
    */
+ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   
   #include <linux/kvm_host.h>
   #include <linux/kvm.h>
@@@ -167,9 -166,19 +167,19 @@@ static bool kvm_use_posted_timer_interr
         return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
   }
   
+ static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+ {
+       return ((id >> 4) << 16) | (1 << (id & 0xf));
+ }
+ 
   static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
                 u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
-       switch (map->mode) {
+       switch (map->logical_mode) {
+       case KVM_APIC_MODE_SW_DISABLED:
+               /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+               *cluster = map->xapic_flat_map;
+               *mask = 0;
+               return true;
         case KVM_APIC_MODE_X2APIC: {
                 u32 offset = (dest_id >> 16) * 16;
                 u32 max_apic_id = map->max_apic_id;
@@@ -194,8 -203,10 +204,10 @@@
                 *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
                 *mask = dest_id & 0xf;
                 return true;
+       case KVM_APIC_MODE_MAP_DISABLED:
+               return false;
         default:
-               /* Not optimized. */
+               WARN_ON_ONCE(1);
                 return false;
         }
   }
@@@ -207,6 -218,134 +219,134 @@@ static void kvm_apic_map_free(struct rc
         kvfree(map);
   }
   
+ static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+                                   struct kvm_vcpu *vcpu,
+                                   bool *xapic_id_mismatch)
+ {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       u32 x2apic_id = kvm_x2apic_id(apic);
+       u32 xapic_id = kvm_xapic_id(apic);
+       u32 physical_id;
+ 
+       /*
+        * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+        * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+        * 32-bit value.  Any unwanted aliasing that results from truncation
+        * will be detected below.
+        */
+       if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+               *xapic_id_mismatch = true;
+ 
+       /*
+        * Apply KVM's hotplug hack if userspace has enabled 32-bit APIC IDs.
+        * Allow sending events to vCPUs by their x2APIC ID even if the target
+        * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+        * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+        * and collide).
+        *
+        * Honor the architectural (and KVM's non-optimized) behavior if
+        * userspace has not enabled 32-bit x2APIC IDs.  Each APIC is supposed
+        * to process messages independently.  If multiple vCPUs have the same
+        * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+        * manually modified its xAPIC IDs, events targeting that ID are
+        * supposed to be recognized by all vCPUs with said ID.
+        */
+       if (vcpu->kvm->arch.x2apic_format) {
+               /* See also kvm_apic_match_physical_addr(). */
+               if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+                       x2apic_id <= new->max_apic_id)
+                       new->phys_map[x2apic_id] = apic;
+ 
+               if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+                       new->phys_map[xapic_id] = apic;
+       } else {
+               /*
+                * Disable the optimized map if the physical APIC ID is already
+                * mapped, i.e. is aliased to multiple vCPUs.  The optimized
+                * map requires a strict 1:1 mapping between IDs and vCPUs.
+                */
+               if (apic_x2apic_mode(apic))
+                       physical_id = x2apic_id;
+               else
+                       physical_id = xapic_id;
+ 
+               if (new->phys_map[physical_id])
+                       return -EINVAL;
+ 
+               new->phys_map[physical_id] = apic;
+       }
+ 
+       return 0;
+ }
+ 
+ static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
+                                       struct kvm_vcpu *vcpu)
+ {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       enum kvm_apic_logical_mode logical_mode;
+       struct kvm_lapic **cluster;
+       u16 mask;
+       u32 ldr;
+ 
+       if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+               return;
+ 
+       if (!kvm_apic_sw_enabled(apic))
+               return;
+ 
+       ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+       if (!ldr)
+               return;
+ 
+       if (apic_x2apic_mode(apic)) {
+               logical_mode = KVM_APIC_MODE_X2APIC;
+       } else {
+               ldr = GET_APIC_LOGICAL_ID(ldr);
+               if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+                       logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
+               else
+                       logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
+       }
+ 
+       /*
+        * To optimize logical mode delivery, all software-enabled APICs must
+        * be configured for the same mode.
+        */
+       if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
+               new->logical_mode = logical_mode;
+       } else if (new->logical_mode != logical_mode) {
+               new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+               return;
+       }
+ 
+       /*
+        * In x2APIC mode, the LDR is read-only and derived directly from the
+        * x2APIC ID, thus is guaranteed to be addressable.  KVM reuses
+        * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
+        * reversing the LDR calculation to get the cluster of APICs, i.e. no
+        * additional work is required.
+        */
+       if (apic_x2apic_mode(apic)) {
+               WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+               return;
+       }
+ 
+       if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
+                                                       &cluster, &mask))) {
+               new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+               return;
+       }
+ 
+       if (!mask)
+               return;
+ 
+       ldr = ffs(mask) - 1;
+       if (!is_power_of_2(mask) || cluster[ldr])
+               new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+       else
+               cluster[ldr] = apic;
+ }
+ 
   /*
    * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
    *
@@@ -225,6 -364,7 +365,7 @@@ void kvm_recalculate_apic_map(struct kv
         struct kvm_vcpu *vcpu;
         unsigned long i;
         u32 max_id = 255; /* enough space for any xAPIC ID */
+       bool xapic_id_mismatch = false;
   
         /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map.  */
         if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
@@@ -257,54 -397,41 +398,41 @@@
                 goto out;
   
         new->max_apic_id = max_id;
+       new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
   
         kvm_for_each_vcpu(i, vcpu, kvm) {
-               struct kvm_lapic *apic = vcpu->arch.apic;
-               struct kvm_lapic **cluster;
-               u16 mask;
-               u32 ldr;
-               u8 xapic_id;
-               u32 x2apic_id;
- 
                 if (!kvm_apic_present(vcpu))
                         continue;
   
-               xapic_id = kvm_xapic_id(apic);
-               x2apic_id = kvm_x2apic_id(apic);
- 
-               /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
-               if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
-                               x2apic_id <= new->max_apic_id)
-                       new->phys_map[x2apic_id] = apic;
-               /*
-                * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
-                * prevent them from masking VCPUs with APIC ID <= 0xff.
-                */
-               if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
-                       new->phys_map[xapic_id] = apic;
- 
-               if (!kvm_apic_sw_enabled(apic))
-                       continue;
- 
-               ldr = kvm_lapic_get_reg(apic, APIC_LDR);
- 
-               if (apic_x2apic_mode(apic)) {
-                       new->mode |= KVM_APIC_MODE_X2APIC;
-               } else if (ldr) {
-                       ldr = GET_APIC_LOGICAL_ID(ldr);
-                       if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
-                               new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
-                       else
-                               new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+               if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+                       kvfree(new);
+                       new = NULL;
+                       goto out;
                 }
   
-               if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
-                       continue;
- 
-               if (mask)
-                       cluster[ffs(mask) - 1] = apic;
+               kvm_recalculate_logical_map(new, vcpu);
         }
   out:
+       /*
+        * The optimized map is effectively KVM's internal version of APICv,
+        * and all unwanted aliasing that results in disabling the optimized
+        * map also applies to APICv.
+        */
+       if (!new)
+               kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+       else
+               kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+ 
+       if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+               kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+       else
+               kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+ 
+       if (xapic_id_mismatch)
+               kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+       else
+               kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+ 
         old = rcu_dereference_protected(kvm->arch.apic_map,
                         lockdep_is_held(&kvm->arch.apic_map_lock));
         rcu_assign_pointer(kvm->arch.apic_map, new);
@@@ -361,11 -488,6 +489,6 @@@ static inline void kvm_apic_set_dfr(str
         atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
   }
   
- static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
- {
-       return ((id >> 4) << 16) | (1 << (id & 0xf));
- }
- 
   static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
   {
         u32 ldr = kvm_apic_calc_x2apic_ldr(id);
@@@ -942,7 -1064,8 +1065,7 @@@ static void kvm_apic_disabled_lapic_fou
   {
         if (!kvm->arch.disabled_lapic_found) {
                 kvm->arch.disabled_lapic_found = true;
- -              printk(KERN_INFO
- -                     "Disabled LAPIC found during irq injection\n");
+ +              pr_info("Disabled LAPIC found during irq injection\n");
         }
   }
   
@@@ -951,7 -1074,7 +1074,7 @@@ static bool kvm_apic_is_broadcast_dest(
   {
         if (kvm->arch.x2apic_broadcast_quirk_disabled) {
                 if ((irq->dest_id == APIC_BROADCAST &&
-                               map->mode != KVM_APIC_MODE_X2APIC))
+                    map->logical_mode != KVM_APIC_MODE_X2APIC))
                         return true;
                 if (irq->dest_id == X2APIC_BROADCAST)
                         return true;
@@@ -1560,7 -1683,7 +1683,7 @@@ static void limit_periodic_timer_freque
   
                 if (apic->lapic_timer.period < min_period) {
                         pr_info_ratelimited(
- -                          "kvm: vcpu %i: requested %lld ns "
+ +                          "vcpu %i: requested %lld ns "
                             "lapic timer period limited to %lld ns\n",
                             apic->vcpu->vcpu_id,
                             apic->lapic_timer.period, min_period);
@@@ -1845,7 -1968,7 +1968,7 @@@ static bool set_target_expiration(struc
                                 deadline = apic->lapic_timer.period;
                         else if (unlikely(deadline > apic->lapic_timer.period)) {
                                 pr_info_ratelimited(
- -                                  "kvm: vcpu %i: requested lapic timer restore with "
+ +                                  "vcpu %i: requested lapic timer restore with "
                                     "starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
                                     "Using initial count to start timer.\n",
                                     apic->vcpu->vcpu_id,
@@@ -2068,19 -2191,6 +2191,6 @@@ static void apic_manage_nmi_watchdog(st
         }
   }
   
- static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
- {
-       struct kvm *kvm = apic->vcpu->kvm;
- 
-       if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
-               return;
- 
-       if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
-               return;
- 
-       kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
- }
- 
   static int get_lvt_index(u32 reg)
   {
         if (reg == APIC_LVTCMCI)
@@@ -2101,7 -2211,6 +2211,6 @@@ static int kvm_lapic_reg_write(struct k
         case APIC_ID:           /* Local APIC ID */
                 if (!apic_x2apic_mode(apic)) {
                         kvm_apic_set_xapic_id(apic, val >> 24);
-                       kvm_lapic_xapic_id_updated(apic);
                 } else {
                         ret = 1;
                 }
@@@ -2284,23 -2393,18 +2393,18 @@@ void kvm_apic_write_nodecode(struct kvm
         struct kvm_lapic *apic = vcpu->arch.apic;
         u64 val;
   
-       if (apic_x2apic_mode(apic)) {
-               if (KVM_BUG_ON(kvm_lapic_msr_read(apic, offset, &val), vcpu->kvm))
-                       return;
-       } else {
-               val = kvm_lapic_get_reg(apic, offset);
-       }
- 
         /*
          * ICR is a single 64-bit register when x2APIC is enabled.  For legacy
          * xAPIC, ICR writes need to go down the common (slightly slower) path
          * to get the upper half from ICR2.
          */
         if (apic_x2apic_mode(apic) && offset == APIC_ICR) {
+               val = kvm_lapic_get_reg64(apic, APIC_ICR);
                 kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32));
                 trace_kvm_apic_write(APIC_ICR, val);
         } else {
                 /* TODO: optimize to just emulate side effect w/o one more write */
+               val = kvm_lapic_get_reg(apic, offset);
                 kvm_lapic_reg_write(apic, offset, (u32)val);
         }
   }
@@@ -2398,7 -2502,7 +2502,7 @@@ void kvm_lapic_set_base(struct kvm_vcp
                 kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
   
         if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
-               kvm_vcpu_update_apicv(vcpu);
+               kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
                 static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
         }
   
@@@ -2429,6 -2533,78 +2533,78 @@@ void kvm_apic_update_apicv(struct kvm_v
                  */
                 apic->isr_count = count_vectors(apic->regs + APIC_ISR);
         }
+       apic->highest_isr_cache = -1;
+ }
+ 
+ int kvm_alloc_apic_access_page(struct kvm *kvm)
+ {
+       struct page *page;
+       void __user *hva;
+       int ret = 0;
+ 
+       mutex_lock(&kvm->slots_lock);
+       if (kvm->arch.apic_access_memslot_enabled ||
+           kvm->arch.apic_access_memslot_inhibited)
+               goto out;
+ 
+       hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+                                     APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+       if (IS_ERR(hva)) {
+               ret = PTR_ERR(hva);
+               goto out;
+       }
+ 
+       page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+       if (is_error_page(page)) {
+               ret = -EFAULT;
+               goto out;
+       }
+ 
+       /*
+        * Do not pin the page in memory, so that memory hot-unplug
+        * is able to migrate it.
+        */
+       put_page(page);
+       kvm->arch.apic_access_memslot_enabled = true;
+ out:
+       mutex_unlock(&kvm->slots_lock);
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page);
+ 
+ void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
+ {
+       struct kvm *kvm = vcpu->kvm;
+ 
+       if (!kvm->arch.apic_access_memslot_enabled)
+               return;
+ 
+       kvm_vcpu_srcu_read_unlock(vcpu);
+ 
+       mutex_lock(&kvm->slots_lock);
+ 
+       if (kvm->arch.apic_access_memslot_enabled) {
+               __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+               /*
+                * Clear "enabled" after the memslot is deleted so that a
+                * different vCPU doesn't get a false negative when checking
+                * the flag out of slots_lock.  No additional memory barrier is
+                * needed as modifying memslots requires waiting other vCPUs to
+                * drop SRCU (see above), and false positives are ok as the
+                * flag is rechecked after acquiring slots_lock.
+                */
+               kvm->arch.apic_access_memslot_enabled = false;
+ 
+               /*
+                * Mark the memslot as inhibited to prevent reallocating the
+                * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
+                */
+               kvm->arch.apic_access_memslot_inhibited = true;
+       }
+ 
+       mutex_unlock(&kvm->slots_lock);
+ 
+       kvm_vcpu_srcu_read_lock(vcpu);
   }
   
   void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
@@@ -2484,7 -2660,6 +2660,6 @@@
                 kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
         }
         kvm_apic_update_apicv(vcpu);
-       apic->highest_isr_cache = -1;
         update_divide_count(apic);
         atomic_set(&apic->lapic_timer.pending, 0);
   
@@@ -2756,9 -2931,6 +2931,6 @@@ int kvm_apic_set_state(struct kvm_vcpu 
         }
         memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
   
-       if (!apic_x2apic_mode(apic))
-               kvm_lapic_xapic_id_updated(apic);
- 
         atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
         kvm_recalculate_apic_map(vcpu->kvm);
         kvm_apic_set_version(vcpu);
@@@ -2772,7 -2944,6 +2944,6 @@@
         __start_apic_timer(apic, APIC_TMCCT);
         kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
         kvm_apic_update_apicv(vcpu);
-       apic->highest_isr_cache = -1;
         if (apic->apicv_active) {
                 static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
                 static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
diff --combined arch/x86/kvm/svm/avic.c

index f52f5e0dd4658fc15ad430d9e4a9953f2c8f99d3,14677bc31b83afb7f5b76a62fac2178656e0d14e..b3928150a37ce8d5a1d93bba10fc314c8c5e9e11
--- 1/arch/x86/kvm/svm/avic.c
--- 2/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@@ -12,7 -12,7 +12,7 @@@
    *   Avi Kivity   <[email protected]>
    */
   
- -#define pr_fmt(fmt) "SVM: " fmt
+ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   
   #include <linux/kvm_types.h>
   #include <linux/hashtable.h>
@@@ -53,7 -53,7 +53,7 @@@ static DEFINE_HASHTABLE(svm_vm_data_has
   static u32 next_vm_id = 0;
   static bool next_vm_id_wrapped = 0;
   static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
- enum avic_modes avic_mode;
+ bool x2avic_enabled;
   
   /*
    * This is a wrapper of struct amd_iommu_ir_data.
@@@ -72,20 -72,25 +72,25 @@@ static void avic_activate_vmcb(struct v
   
         vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
   
-       /* Note:
-        * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
-        * MSR accesses, while interrupt injection to a running vCPU
-        * can be achieved using AVIC doorbell. The AVIC hardware still
-        * accelerate MMIO accesses, but this does not cause any harm
-        * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
+       /*
+        * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+        * accesses, while interrupt injection to a running vCPU can be
+        * achieved using AVIC doorbell.  KVM disables the APIC access page
+        * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
+        * AVIC in hybrid mode activates only the doorbell mechanism.
          */
-       if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
-           avic_mode == AVIC_MODE_X2) {
+       if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
                 vmcb->control.int_ctl |= X2APIC_MODE_MASK;
                 vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
                 /* Disabling MSR intercept for x2APIC registers */
                 svm_set_x2apic_msr_interception(svm, false);
         } else {
+               /*
+                * Flush the TLB, the guest may have inserted a non-APIC
+                * mapping into the TLB while AVIC was disabled.
+                */
+               kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+ 
                 /* For xAVIC and hybrid-xAVIC modes */
                 vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
                 /* Enabling MSR intercept for x2APIC registers */
@@@ -241,8 -246,8 +246,8 @@@ static u64 *avic_get_physical_id_entry(
         u64 *avic_physical_id_table;
         struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
   
-       if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) ||
-           (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID))
+       if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
+           (index > X2AVIC_MAX_PHYSICAL_ID))
                 return NULL;
   
         avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
@@@ -250,47 -255,14 +255,14 @@@
         return &avic_physical_id_table[index];
   }
   
- /*
-  * Note:
-  * AVIC hardware walks the nested page table to check permissions,
-  * but does not use the SPA address specified in the leaf page
-  * table entry since it uses  address in the AVIC_BACKING_PAGE pointer
-  * field of the VMCB. Therefore, we set up the
-  * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
-  */
- static int avic_alloc_access_page(struct kvm *kvm)
- {
-       void __user *ret;
-       int r = 0;
- 
-       mutex_lock(&kvm->slots_lock);
- 
-       if (kvm->arch.apic_access_memslot_enabled)
-               goto out;
- 
-       ret = __x86_set_memory_region(kvm,
-                                     APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
-                                     APIC_DEFAULT_PHYS_BASE,
-                                     PAGE_SIZE);
-       if (IS_ERR(ret)) {
-               r = PTR_ERR(ret);
-               goto out;
-       }
- 
-       kvm->arch.apic_access_memslot_enabled = true;
- out:
-       mutex_unlock(&kvm->slots_lock);
-       return r;
- }
- 
   static int avic_init_backing_page(struct kvm_vcpu *vcpu)
   {
         u64 *entry, new_entry;
         int id = vcpu->vcpu_id;
         struct vcpu_svm *svm = to_svm(vcpu);
   
-       if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) ||
-           (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID))
+       if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
+           (id > X2AVIC_MAX_PHYSICAL_ID))
                 return -EINVAL;
   
         if (!vcpu->arch.apic->regs)
@@@ -299,7 -271,13 +271,13 @@@
         if (kvm_apicv_activated(vcpu->kvm)) {
                 int ret;
   
-               ret = avic_alloc_access_page(vcpu->kvm);
+               /*
+                * Note, AVIC hardware walks the nested page table to check
+                * permissions, but does not use the SPA address specified in
+                * the leaf SPTE since it uses the address in the
+                * AVIC_BACKING_PAGE pointer field of the VMCB.
+                */
+               ret = kvm_alloc_apic_access_page(vcpu->kvm);
                 if (ret)
                         return ret;
         }
@@@ -339,6 -317,60 +317,60 @@@ void avic_ring_doorbell(struct kvm_vcp
         put_cpu();
   }
   
+ 
+ static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
+ {
+       vcpu->arch.apic->irr_pending = true;
+       svm_complete_interrupt_delivery(vcpu,
+                                       icrl & APIC_MODE_MASK,
+                                       icrl & APIC_INT_LEVELTRIG,
+                                       icrl & APIC_VECTOR_MASK);
+ }
+ 
+ static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
+                                         u32 icrl)
+ {
+       /*
+        * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
+        * i.e. APIC ID == vCPU ID whenever AVIC is not inhibited.
+        */
+       struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
+ 
+       /* Once again, nothing to do if the target vCPU doesn't exist. */
+       if (unlikely(!target_vcpu))
+               return;
+ 
+       avic_kick_vcpu(target_vcpu, icrl);
+ }
+ 
+ static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
+                                        u32 logid_index, u32 icrl)
+ {
+       u32 physical_id;
+ 
+       if (avic_logical_id_table) {
+               u32 logid_entry = avic_logical_id_table[logid_index];
+ 
+               /* Nothing to do if the logical destination is invalid. */
+               if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+                       return;
+ 
+               physical_id = logid_entry &
+                             AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+       } else {
+               /*
+                * For x2APIC, the logical APIC ID is a read-only value that is
+                * derived from the x2APIC ID, thus the x2APIC ID can be found
+                * by reversing the calculation (stored in logid_index).  Note,
+                * bits 31:20 of the x2APIC ID aren't propagated to the logical
+                * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
+                */
+               physical_id = logid_index;
+       }
+ 
+       avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
+ }
+ 
   /*
    * A fast-path version of avic_kick_target_vcpus(), which attempts to match
    * destination APIC ID to vCPU without looping through all vCPUs.
@@@ -346,11 -378,10 +378,10 @@@
   static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
                                        u32 icrl, u32 icrh, u32 index)
   {
-       u32 l1_physical_id, dest;
-       struct kvm_vcpu *target_vcpu;
         int dest_mode = icrl & APIC_DEST_MASK;
         int shorthand = icrl & APIC_SHORT_MASK;
         struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+       u32 dest;
   
         if (shorthand != APIC_DEST_NOSHORT)
                 return -EINVAL;
@@@ -367,18 -398,18 +398,18 @@@
                 if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
                         return -EINVAL;
   
-               l1_physical_id = dest;
- 
-               if (WARN_ON_ONCE(l1_physical_id != index))
+               if (WARN_ON_ONCE(dest != index))
                         return -EINVAL;
   
+               avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
         } else {
-               u32 bitmap, cluster;
-               int logid_index;
+               u32 *avic_logical_id_table;
+               unsigned long bitmap, i;
+               u32 cluster;
   
                 if (apic_x2apic_mode(source)) {
                         /* 16 bit dest mask, 16 bit cluster id */
-                       bitmap = dest & 0xFFFF0000;
+                       bitmap = dest & 0xFFFF;
                         cluster = (dest >> 16) << 4;
                 } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
                         /* 8 bit dest mask*/
@@@ -390,67 -421,32 +421,32 @@@
                         cluster = (dest >> 4) << 2;
                 }
   
+               /* Nothing to do if there are no destinations in the cluster. */
                 if (unlikely(!bitmap))
-                       /* guest bug: nobody to send the logical interrupt to */
                         return 0;
   
-               if (!is_power_of_2(bitmap))
-                       /* multiple logical destinations, use slow path */
-                       return -EINVAL;
- 
-               logid_index = cluster + __ffs(bitmap);
- 
-               if (!apic_x2apic_mode(source)) {
-                       u32 *avic_logical_id_table =
-                               page_address(kvm_svm->avic_logical_id_table_page);
- 
-                       u32 logid_entry = avic_logical_id_table[logid_index];
- 
-                       if (WARN_ON_ONCE(index != logid_index))
-                               return -EINVAL;
- 
-                       /* guest bug: non existing/reserved logical destination */
-                       if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
-                               return 0;
- 
-                       l1_physical_id = logid_entry &
-                                        AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
-               } else {
-                       /*
-                        * For x2APIC logical mode, cannot leverage the index.
-                        * Instead, calculate physical ID from logical ID in ICRH.
-                        */
-                       int cluster = (icrh & 0xffff0000) >> 16;
-                       int apic = ffs(icrh & 0xffff) - 1;
- 
-                       /*
-                        * If the x2APIC logical ID sub-field (i.e. icrh[15:0])
-                        * contains anything but a single bit, we cannot use the
-                        * fast path, because it is limited to a single vCPU.
-                        */
-                       if (apic < 0 || icrh != (1 << apic))
-                               return -EINVAL;
+               if (apic_x2apic_mode(source))
+                       avic_logical_id_table = NULL;
+               else
+                       avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
   
-                       l1_physical_id = (cluster << 4) + apic;
-               }
+               /*
+                * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
+                * IDs, thus each bit in the destination is guaranteed to map
+                * to at most one vCPU.
+                */
+               for_each_set_bit(i, &bitmap, 16)
+                       avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
+                                                    cluster + i, icrl);
         }
   
-       target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
-       if (unlikely(!target_vcpu))
-               /* guest bug: non existing vCPU is a target of this IPI*/
-               return 0;
- 
-       target_vcpu->arch.apic->irr_pending = true;
-       svm_complete_interrupt_delivery(target_vcpu,
-                                       icrl & APIC_MODE_MASK,
-                                       icrl & APIC_INT_LEVELTRIG,
-                                       icrl & APIC_VECTOR_MASK);
         return 0;
   }
   
   static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
                                    u32 icrl, u32 icrh, u32 index)
   {
+       u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
         unsigned long i;
         struct kvm_vcpu *vcpu;
   
@@@ -466,21 -462,9 +462,9 @@@
          * since entered the guest will have processed pending IRQs at VMRUN.
          */
         kvm_for_each_vcpu(i, vcpu, kvm) {
-               u32 dest;
- 
-               if (apic_x2apic_mode(vcpu->arch.apic))
-                       dest = icrh;
-               else
-                       dest = GET_XAPIC_DEST_FIELD(icrh);
- 
                 if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
-                                       dest, icrl & APIC_DEST_MASK)) {
-                       vcpu->arch.apic->irr_pending = true;
-                       svm_complete_interrupt_delivery(vcpu,
-                                                       icrl & APIC_MODE_MASK,
-                                                       icrl & APIC_INT_LEVELTRIG,
-                                                       icrl & APIC_VECTOR_MASK);
-               }
+                                       dest, icrl & APIC_DEST_MASK))
+                       avic_kick_vcpu(vcpu, icrl);
         }
   }
   
@@@ -496,14 -480,18 +480,18 @@@ int avic_incomplete_ipi_interception(st
         trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
   
         switch (id) {
+       case AVIC_IPI_FAILURE_INVALID_TARGET:
         case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
                 /*
                  * Emulate IPIs that are not handled by AVIC hardware, which
-                * only virtualizes Fixed, Edge-Triggered INTRs.  The exit is
-                * a trap, e.g. ICR holds the correct value and RIP has been
-                * advanced, KVM is responsible only for emulating the IPI.
-                * Sadly, hardware may sometimes leave the BUSY flag set, in
-                * which case KVM needs to emulate the ICR write as well in
+                * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+                * if _any_ targets are invalid, e.g. if the logical mode mask
+                * is a superset of running vCPUs.
+                *
+                * The exit is a trap, e.g. ICR holds the correct value and RIP
+                * has been advanced, KVM is responsible only for emulating the
+                * IPI.  Sadly, hardware may sometimes leave the BUSY flag set,
+                * in which case KVM needs to emulate the ICR write as well in
                  * order to clear the BUSY flag.
                  */
                 if (icrl & APIC_ICR_BUSY)
@@@ -519,8 -507,6 +507,6 @@@
                  */
                 avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
                 break;
-       case AVIC_IPI_FAILURE_INVALID_TARGET:
-               break;
         case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
                 WARN_ONCE(1, "Invalid backing page\n");
                 break;
@@@ -541,33 -527,33 +527,33 @@@ unsigned long avic_vcpu_get_apicv_inhib
   static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
   {
         struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
-       int index;
         u32 *logical_apic_id_table;
-       int dlid = GET_APIC_LOGICAL_ID(ldr);
- 
-       if (!dlid)
-               return NULL;
+       u32 cluster, index;
   
-       if (flat) { /* flat */
-               index = ffs(dlid) - 1;
-               if (index > 7)
-                       return NULL;
-       } else { /* cluster */
-               int cluster = (dlid & 0xf0) >> 4;
-               int apic = ffs(dlid & 0x0f) - 1;
+       ldr = GET_APIC_LOGICAL_ID(ldr);
   
-               if ((apic < 0) || (apic > 7) ||
-                   (cluster >= 0xf))
+       if (flat) {
+               cluster = 0;
+       } else {
+               cluster = (ldr >> 4);
+               if (cluster >= 0xf)
                         return NULL;
-               index = (cluster << 2) + apic;
+               ldr &= 0xf;
         }
+       if (!ldr || !is_power_of_2(ldr))
+               return NULL;
+ 
+       index = __ffs(ldr);
+       if (WARN_ON_ONCE(index > 7))
+               return NULL;
+       index += (cluster << 2);
   
         logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
   
         return &logical_apic_id_table[index];
   }
   
- static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+ static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
   {
         bool flat;
         u32 *entry, new_entry;
@@@ -575,15 -561,13 +561,13 @@@
         flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
         entry = avic_get_logical_id_entry(vcpu, ldr, flat);
         if (!entry)
-               return -EINVAL;
+               return;
   
         new_entry = READ_ONCE(*entry);
         new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
         new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
         new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
         WRITE_ONCE(*entry, new_entry);
- 
-       return 0;
   }
   
   static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
@@@ -601,29 -585,23 +585,23 @@@
                 clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
   }
   
- static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+ static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
   {
-       int ret = 0;
         struct vcpu_svm *svm = to_svm(vcpu);
         u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
         u32 id = kvm_xapic_id(vcpu->arch.apic);
   
         /* AVIC does not support LDR update for x2APIC */
         if (apic_x2apic_mode(vcpu->arch.apic))
-               return 0;
+               return;
   
         if (ldr == svm->ldr_reg)
-               return 0;
+               return;
   
         avic_invalidate_logical_id_entry(vcpu);
   
-       if (ldr)
-               ret = avic_ldr_write(vcpu, id, ldr);
- 
-       if (!ret)
-               svm->ldr_reg = ldr;
- 
-       return ret;
+       svm->ldr_reg = ldr;
+       avic_ldr_write(vcpu, id, ldr);
   }
   
   static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
@@@ -645,12 -623,14 +623,14 @@@ static int avic_unaccel_trap_write(stru
   
         switch (offset) {
         case APIC_LDR:
-               if (avic_handle_ldr_update(vcpu))
-                       return 0;
+               avic_handle_ldr_update(vcpu);
                 break;
         case APIC_DFR:
                 avic_handle_dfr_update(vcpu);
                 break;
+       case APIC_RRR:
+               /* Ignore writes to Read Remote Data, it's read-only. */
+               return 1;
         default:
                 break;
         }
@@@ -739,18 -719,6 +719,6 @@@ void avic_apicv_post_state_restore(stru
         avic_handle_ldr_update(vcpu);
   }
   
- void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
- {
-       if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE)
-               return;
- 
-       if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) {
-               WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id);
-               return;
-       }
-       avic_refresh_apicv_exec_ctrl(vcpu);
- }
- 
   static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
   {
         int ret = 0;
@@@ -995,23 -963,6 +963,6 @@@ out
         return ret;
   }
   
- bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
- {
-       ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
-                         BIT(APICV_INHIBIT_REASON_ABSENT) |
-                         BIT(APICV_INHIBIT_REASON_HYPERV) |
-                         BIT(APICV_INHIBIT_REASON_NESTED) |
-                         BIT(APICV_INHIBIT_REASON_IRQWIN) |
-                         BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
-                         BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
-                         BIT(APICV_INHIBIT_REASON_SEV)      |
-                         BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
-                         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
- 
-       return supported & BIT(reason);
- }
- 
- 
   static inline int
   avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
   {
@@@ -1064,6 -1015,7 +1015,7 @@@ void avic_vcpu_load(struct kvm_vcpu *vc
                 return;
   
         entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
   
         entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
         entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
@@@ -1092,17 -1044,15 +1044,15 @@@ void avic_vcpu_put(struct kvm_vcpu *vcp
         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
   }
   
- 
- void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
   {
         struct vcpu_svm *svm = to_svm(vcpu);
         struct vmcb *vmcb = svm->vmcb01.ptr;
-       bool activated = kvm_vcpu_apicv_active(vcpu);
   
-       if (!enable_apicv)
+       if (!lapic_in_kernel(vcpu) || !enable_apicv)
                 return;
   
-       if (activated) {
+       if (kvm_vcpu_apicv_active(vcpu)) {
                 /**
                  * During AVIC temporary deactivation, guest could update
                  * APIC ID, DFR and LDR registers, which would not be trapped
@@@ -1116,6 -1066,16 +1066,16 @@@
                 avic_deactivate_vmcb(svm);
         }
         vmcb_mark_dirty(vmcb, VMCB_AVIC);
+ }
+ 
+ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+ {
+       bool activated = kvm_vcpu_apicv_active(vcpu);
+ 
+       if (!enable_apicv)
+               return;
+ 
+       avic_refresh_virtual_apic_mode(vcpu);
   
         if (activated)
                 avic_vcpu_load(vcpu, vcpu->cpu);
@@@ -1165,32 -1125,32 +1125,32 @@@ bool avic_hardware_setup(struct kvm_x86
         if (!npt_enabled)
                 return false;
   
+       /* AVIC is a prerequisite for x2AVIC. */
+       if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
+               if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
+                       pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
+                       pr_warn(FW_BUG "Try enable AVIC using force_avic option");
+               }
+               return false;
+       }
+ 
         if (boot_cpu_has(X86_FEATURE_AVIC)) {
-               avic_mode = AVIC_MODE_X1;
                 pr_info("AVIC enabled\n");
         } else if (force_avic) {
                 /*
                  * Some older systems does not advertise AVIC support.
                  * See Revision Guide for specific AMD processor for more detail.
                  */
-               avic_mode = AVIC_MODE_X1;
                 pr_warn("AVIC is not supported in CPUID but force enabled");
                 pr_warn("Your system might crash and burn");
         }
   
         /* AVIC is a prerequisite for x2AVIC. */
-       if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
-               if (avic_mode == AVIC_MODE_X1) {
-                       avic_mode = AVIC_MODE_X2;
-                       pr_info("x2AVIC enabled\n");
-               } else {
-                       pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
-                       pr_warn(FW_BUG "Try enable AVIC using force_avic option");
-               }
-       }
+       x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
+       if (x2avic_enabled)
+               pr_info("x2AVIC enabled\n");
   
-       if (avic_mode != AVIC_MODE_NONE)
-               amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+       amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
   
-       return !!avic_mode;
+       return true;
   }
diff --combined arch/x86/kvm/svm/nested.c

index 500da957e5908ca28f9f49c1061e2fdd0c1f68e6,34ac03969f28d39766ac5073faa45697c3722ad9..700df66d23c745c4b3facef6b051ca7ae0edfbf8
--- 1/arch/x86/kvm/svm/nested.c
--- 2/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@@ -12,7 -12,7 +12,7 @@@
    *   Avi Kivity   <[email protected]>
    */
   
- -#define pr_fmt(fmt) "SVM: " fmt
+ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   
   #include <linux/kvm_types.h>
   #include <linux/kvm_host.h>
@@@ -138,13 -138,15 +138,13 @@@ void recalc_intercepts(struct vcpu_svm 
                 c->intercepts[i] = h->intercepts[i];
   
         if (g->int_ctl & V_INTR_MASKING_MASK) {
- -              /* We only want the cr8 intercept bits of L1 */
- -              vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
- -              vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
- -
                 /*
- -               * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
- -               * affect any interrupt we may want to inject; therefore,
- -               * interrupt window vmexits are irrelevant to L0.
+ +               * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
+ +               * do not affect any interrupt we may want to inject;
+ +               * therefore, writes to CR8 are irrelevant to L0, as are
+ +               * interrupt window vmexits.
                  */
+ +              vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
                 vmcb_clr_intercept(c, INTERCEPT_VINTR);
         }
   
@@@ -1104,7 -1106,7 +1104,7 @@@ int nested_svm_vmexit(struct vcpu_svm *
          * to benefit from it right away.
          */
         if (kvm_apicv_activated(vcpu->kvm))
-               kvm_vcpu_update_apicv(vcpu);
+               __kvm_vcpu_update_apicv(vcpu);
   
         return 0;
   }
diff --combined arch/x86/kvm/svm/svm.c

index 799b24801d310134903920c4bdbee78384659207,f2453df77727e9b704ea1c88edf54d56525fd9af..d13cf53e739067485f64b303b2f945b93117ae67
--- 1/arch/x86/kvm/svm/svm.c
--- 2/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@@ -1,4 -1,4 +1,4 @@@
- -#define pr_fmt(fmt) "SVM: " fmt
+ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   
   #include <linux/kvm_host.h>
   
@@@ -519,37 -519,21 +519,37 @@@ static void svm_init_osvw(struct kvm_vc
                 vcpu->arch.osvw.status |= 1;
   }
   
- -static int has_svm(void)
+ +static bool kvm_is_svm_supported(void)
   {
+ +      int cpu = raw_smp_processor_id();
         const char *msg;
+ +      u64 vm_cr;
   
         if (!cpu_has_svm(&msg)) {
- -              printk(KERN_INFO "has_svm: %s\n", msg);
- -              return 0;
+ +              pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ +              return false;
         }
   
         if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
                 pr_info("KVM is unsupported when running as an SEV guest\n");
- -              return 0;
+ +              return false;
         }
   
- -      return 1;
+ +      rdmsrl(MSR_VM_CR, vm_cr);
+ +      if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
+ +              pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
+ +              return false;
+ +      }
+ +
+ +      return true;
+ +}
+ +
+ +static int svm_check_processor_compat(void)
+ +{
+ +      if (!kvm_is_svm_supported())
+ +              return -EIO;
+ +
+ +      return 0;
   }
   
   void __svm_write_tsc_multiplier(u64 multiplier)
@@@ -588,6 -572,10 +588,6 @@@ static int svm_hardware_enable(void
         if (efer & EFER_SVME)
                 return -EBUSY;
   
- -      if (!has_svm()) {
- -              pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
- -              return -EINVAL;
- -      }
         sd = per_cpu_ptr(&svm_data, me);
         sd->asid_generation = 1;
         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
@@@ -825,7 -813,7 +825,7 @@@ void svm_set_x2apic_msr_interception(st
         if (intercept == svm->x2avic_msrs_intercepted)
                 return;
   
-       if (avic_mode != AVIC_MODE_X2 ||
+       if (!x2avic_enabled ||
             !apic_x2apic_mode(svm->vcpu.arch.apic))
                 return;
   
@@@ -2088,7 -2076,7 +2088,7 @@@ static void svm_handle_mce(struct kvm_v
                  * Erratum 383 triggered. Guest state is corrupt so kill the
                  * guest.
                  */
- -              pr_err("KVM: Guest triggered AMD Erratum 383\n");
+ +              pr_err("Guest triggered AMD Erratum 383\n");
   
                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
   
@@@ -2717,9 -2705,9 +2717,9 @@@ static int svm_get_msr_feature(struct k
         msr->data = 0;
   
         switch (msr->index) {
- -      case MSR_F10H_DECFG:
- -              if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
- -                      msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+ +      case MSR_AMD64_DE_CFG:
+ +              if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ +                      msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
                 break;
         default:
                 return KVM_MSR_RET_INVALID;
@@@ -2818,7 -2806,7 +2818,7 @@@ static int svm_get_msr(struct kvm_vcpu 
                         msr_info->data = 0x1E;
                 }
                 break;
- -      case MSR_F10H_DECFG:
+ +      case MSR_AMD64_DE_CFG:
                 msr_info->data = svm->msr_decfg;
                 break;
         default:
@@@ -3047,7 -3035,7 +3047,7 @@@ static int svm_set_msr(struct kvm_vcpu 
         case MSR_VM_IGNNE:
                 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
                 break;
- -      case MSR_F10H_DECFG: {
+ +      case MSR_AMD64_DE_CFG: {
                 struct kvm_msr_entry msr_entry;
   
                 msr_entry.index = msr->index;
@@@ -4088,6 -4076,17 +4088,6 @@@ static void svm_load_mmu_pgd(struct kvm
         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
   }
   
- -static int is_disabled(void)
- -{
- -      u64 vm_cr;
- -
- -      rdmsrl(MSR_VM_CR, vm_cr);
- -      if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- -              return 1;
- -
- -      return 0;
- -}
- -
   static void
   svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
   {
@@@ -4099,6 -4098,11 +4099,6 @@@
         hypercall[2] = 0xd9;
   }
   
- -static int __init svm_check_processor_compat(void)
- -{
- -      return 0;
- -}
- -
   /*
    * The kvm parameter can be NULL (module initialization, or invocation before
    * VM creation). Be sure to check the kvm parameter before using it.
@@@ -4625,7 -4629,7 +4625,7 @@@ static bool svm_can_emulate_instruction
         smap = cr4 & X86_CR4_SMAP;
         is_user = svm_get_cpl(vcpu) == 3;
         if (smap && (!smep || is_user)) {
- -              pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
+ +              pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
   
                 /*
                  * If the fault occurred in userspace, arbitrarily inject #GP
@@@ -4697,9 -4701,7 +4697,9 @@@ static int svm_vm_init(struct kvm *kvm
   }
   
   static struct kvm_x86_ops svm_x86_ops __initdata = {
- -      .name = "kvm_amd",
+ +      .name = KBUILD_MODNAME,
+ +
+ +      .check_processor_compatibility = svm_check_processor_compat,
   
         .hardware_unsetup = svm_hardware_unsetup,
         .hardware_enable = svm_hardware_enable,
@@@ -4769,10 -4771,10 +4769,10 @@@
         .enable_nmi_window = svm_enable_nmi_window,
         .enable_irq_window = svm_enable_irq_window,
         .update_cr8_intercept = svm_update_cr8_intercept,
-       .set_virtual_apic_mode = avic_set_virtual_apic_mode,
+       .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
         .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
-       .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
         .apicv_post_state_restore = avic_apicv_post_state_restore,
+       .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
   
         .get_exit_info = svm_get_exit_info,
   
@@@ -4976,7 -4978,7 +4976,7 @@@ static __init int svm_hardware_setup(vo
         }
   
         if (nested) {
- -              printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ +              pr_info("Nested Virtualization enabled\n");
                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
         }
   
@@@ -4994,7 -4996,7 +4994,7 @@@
         /* Force VM NPT level equal to the host's paging level */
         kvm_configure_mmu(npt_enabled, get_npt_level(),
                           get_npt_level(), PG_LEVEL_1G);
- -      pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ +      pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
   
         /* Setup shadow_me_value and shadow_me_mask */
         kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
@@@ -5026,6 -5028,8 +5026,8 @@@
                 svm_x86_ops.vcpu_blocking = NULL;
                 svm_x86_ops.vcpu_unblocking = NULL;
                 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+       } else if (!x2avic_enabled) {
+               svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
         }
   
         if (vls) {
@@@ -5084,7 -5088,10 +5086,7 @@@ err
   
   
   static struct kvm_x86_init_ops svm_init_ops __initdata = {
- -      .cpu_has_kvm_support = has_svm,
- -      .disabled_by_bios = is_disabled,
         .hardware_setup = svm_hardware_setup,
- -      .check_processor_compatibility = svm_check_processor_compat,
   
         .runtime_ops = &svm_x86_ops,
         .pmu_ops = &amd_pmu_ops,
@@@ -5092,37 -5099,15 +5094,37 @@@
   
   static int __init svm_init(void)
   {
+ +      int r;
+ +
         __unused_size_checks();
   
- -      return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
- -                      __alignof__(struct vcpu_svm), THIS_MODULE);
+ +      if (!kvm_is_svm_supported())
+ +              return -EOPNOTSUPP;
+ +
+ +      r = kvm_x86_vendor_init(&svm_init_ops);
+ +      if (r)
+ +              return r;
+ +
+ +      /*
+ +       * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ +       * exposed to userspace!
+ +       */
+ +      r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
+ +                   THIS_MODULE);
+ +      if (r)
+ +              goto err_kvm_init;
+ +
+ +      return 0;
+ +
+ +err_kvm_init:
+ +      kvm_x86_vendor_exit();
+ +      return r;
   }
   
   static void __exit svm_exit(void)
   {
         kvm_exit();
+ +      kvm_x86_vendor_exit();
   }
   
   module_init(svm_init)
diff --combined arch/x86/kvm/vmx/vmx.c

index 73005d7e4e43c29bb03973bae40cc2bbe2ad1936,ad2ac66ef32e271b148692826c79866d829bee1c..c788aa3826119fe051dcdd8dab7c0ae5857c4211
--- 1/arch/x86/kvm/vmx/vmx.c
--- 2/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@@ -12,7 -12,6 +12,7 @@@
    *   Avi Kivity   <[email protected]>
    *   Yaniv Kamay  <[email protected]>
    */
+ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   
   #include <linux/highmem.h>
   #include <linux/hrtimer.h>
@@@ -445,36 -444,36 +445,36 @@@ void vmread_error(unsigned long field, 
         if (fault)
                 kvm_spurious_fault();
         else
- -              vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+ +              vmx_insn_failed("vmread failed: field=%lx\n", field);
   }
   
   noinline void vmwrite_error(unsigned long field, unsigned long value)
   {
- -      vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
+ +      vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
                         field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
   }
   
   noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
   {
- -      vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+ +      vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
   }
   
   noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
   {
- -      vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+ +      vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
   }
   
   noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
   {
- -      vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ +      vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
                         ext, vpid, gva);
   }
   
   noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
   {
- -      vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+ +      vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
                         ext, eptp, gpa);
   }
   
@@@ -489,8 -488,8 +489,8 @@@ static DEFINE_PER_CPU(struct list_head
   static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
   static DEFINE_SPINLOCK(vmx_vpid_lock);
   
- -struct vmcs_config vmcs_config;
- -struct vmx_capability vmx_capability;
+ +struct vmcs_config vmcs_config __ro_after_init;
+ +struct vmx_capability vmx_capability __ro_after_init;
   
   #define VMX_SEGMENT_FIELD(seg)                                        \
         [VCPU_SREG_##seg] = {                                   \
@@@ -524,8 -523,6 +524,8 @@@ static inline void vmx_segment_cache_cl
   static unsigned long host_idt_base;
   
   #if IS_ENABLED(CONFIG_HYPERV)
+ +static struct kvm_x86_ops vmx_x86_ops __initdata;
+ +
   static bool __read_mostly enlightened_vmcs = true;
   module_param(enlightened_vmcs, bool, 0444);
   
@@@ -554,71 -551,6 +554,71 @@@ static int hv_enable_l2_tlb_flush(struc
         return 0;
   }
   
+ +static __init void hv_init_evmcs(void)
+ +{
+ +      int cpu;
+ +
+ +      if (!enlightened_vmcs)
+ +              return;
+ +
+ +      /*
+ +       * Enlightened VMCS usage should be recommended and the host needs
+ +       * to support eVMCS v1 or above.
+ +       */
+ +      if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ +          (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ +           KVM_EVMCS_VERSION) {
+ +
+ +              /* Check that we have assist pages on all online CPUs */
+ +              for_each_online_cpu(cpu) {
+ +                      if (!hv_get_vp_assist_page(cpu)) {
+ +                              enlightened_vmcs = false;
+ +                              break;
+ +                      }
+ +              }
+ +
+ +              if (enlightened_vmcs) {
+ +                      pr_info("Using Hyper-V Enlightened VMCS\n");
+ +                      static_branch_enable(&enable_evmcs);
+ +              }
+ +
+ +              if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ +                      vmx_x86_ops.enable_l2_tlb_flush
+ +                              = hv_enable_l2_tlb_flush;
+ +
+ +      } else {
+ +              enlightened_vmcs = false;
+ +      }
+ +}
+ +
+ +static void hv_reset_evmcs(void)
+ +{
+ +      struct hv_vp_assist_page *vp_ap;
+ +
+ +      if (!static_branch_unlikely(&enable_evmcs))
+ +              return;
+ +
+ +      /*
+ +       * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ +       * page, and should reject CPU onlining if eVMCS is enabled the CPU
+ +       * doesn't have a VP assist page allocated.
+ +       */
+ +      vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ +      if (WARN_ON_ONCE(!vp_ap))
+ +              return;
+ +
+ +      /*
+ +       * Reset everything to support using non-enlightened VMCS access later
+ +       * (e.g. when we reload the module with enlightened_vmcs=0)
+ +       */
+ +      vp_ap->nested_control.features.directhypercall = 0;
+ +      vp_ap->current_nested_vmcs = 0;
+ +      vp_ap->enlighten_vmentry = 0;
+ +}
+ +
+ +#else /* IS_ENABLED(CONFIG_HYPERV) */
+ +static void hv_init_evmcs(void) {}
+ +static void hv_reset_evmcs(void) {}
   #endif /* IS_ENABLED(CONFIG_HYPERV) */
   
   /*
@@@ -1681,8 -1613,8 +1681,8 @@@ static int skip_emulated_instruction(st
                 if (!instr_len)
                         goto rip_updated;
   
- -              WARN(exit_reason.enclave_mode,
- -                   "KVM: skipping instruction after SGX enclave VM-Exit");
+ +              WARN_ONCE(exit_reason.enclave_mode,
+ +                        "skipping instruction after SGX enclave VM-Exit");
   
                 orig_rip = kvm_rip_read(vcpu);
                 rip = orig_rip + instr_len;
@@@ -2516,6 -2448,88 +2516,6 @@@ static void vmx_cache_reg(struct kvm_vc
         }
   }
   
- -static __init int cpu_has_kvm_support(void)
- -{
- -      return cpu_has_vmx();
- -}
- -
- -static __init int vmx_disabled_by_bios(void)
- -{
- -      return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- -             !boot_cpu_has(X86_FEATURE_VMX);
- -}
- -
- -static int kvm_cpu_vmxon(u64 vmxon_pointer)
- -{
- -      u64 msr;
- -
- -      cr4_set_bits(X86_CR4_VMXE);
- -
- -      asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
- -                        _ASM_EXTABLE(1b, %l[fault])
- -                        : : [vmxon_pointer] "m"(vmxon_pointer)
- -                        : : fault);
- -      return 0;
- -
- -fault:
- -      WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- -                rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- -      cr4_clear_bits(X86_CR4_VMXE);
- -
- -      return -EFAULT;
- -}
- -
- -static int vmx_hardware_enable(void)
- -{
- -      int cpu = raw_smp_processor_id();
- -      u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- -      int r;
- -
- -      if (cr4_read_shadow() & X86_CR4_VMXE)
- -              return -EBUSY;
- -
- -      /*
- -       * This can happen if we hot-added a CPU but failed to allocate
- -       * VP assist page for it.
- -       */
- -      if (static_branch_unlikely(&enable_evmcs) &&
- -          !hv_get_vp_assist_page(cpu))
- -              return -EFAULT;
- -
- -      intel_pt_handle_vmx(1);
- -
- -      r = kvm_cpu_vmxon(phys_addr);
- -      if (r) {
- -              intel_pt_handle_vmx(0);
- -              return r;
- -      }
- -
- -      if (enable_ept)
- -              ept_sync_global();
- -
- -      return 0;
- -}
- -
- -static void vmclear_local_loaded_vmcss(void)
- -{
- -      int cpu = raw_smp_processor_id();
- -      struct loaded_vmcs *v, *n;
- -
- -      list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- -                               loaded_vmcss_on_cpu_link)
- -              __loaded_vmcs_clear(v);
- -}
- -
- -static void vmx_hardware_disable(void)
- -{
- -      vmclear_local_loaded_vmcss();
- -
- -      if (cpu_vmxoff())
- -              kvm_spurious_fault();
- -
- -      intel_pt_handle_vmx(0);
- -}
- -
   /*
    * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
    * directly instead of going through cpu_has(), to ensure KVM is trapping
@@@ -2551,7 -2565,8 +2551,7 @@@ static bool cpu_has_perf_global_ctrl_bu
         return false;
   }
   
- -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- -                                    u32 msr, u32 *result)
+ +static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
   {
         u32 vmx_msr_low, vmx_msr_high;
         u32 ctl = ctl_min | ctl_opt;
@@@ -2569,7 -2584,7 +2569,7 @@@
         return 0;
   }
   
- -static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+ +static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
   {
         u64 allowed;
   
@@@ -2578,8 -2593,8 +2578,8 @@@
         return  ctl_opt & allowed;
   }
   
- -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
- -                                  struct vmx_capability *vmx_cap)
+ +static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ +                           struct vmx_capability *vmx_cap)
   {
         u32 vmx_msr_low, vmx_msr_high;
         u32 _pin_based_exec_control = 0;
@@@ -2745,119 -2760,6 +2745,119 @@@
         return 0;
   }
   
+ +static bool kvm_is_vmx_supported(void)
+ +{
+ +      int cpu = raw_smp_processor_id();
+ +
+ +      if (!cpu_has_vmx()) {
+ +              pr_err("VMX not supported by CPU %d\n", cpu);
+ +              return false;
+ +      }
+ +
+ +      if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ +          !this_cpu_has(X86_FEATURE_VMX)) {
+ +              pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ +              return false;
+ +      }
+ +
+ +      return true;
+ +}
+ +
+ +static int vmx_check_processor_compat(void)
+ +{
+ +      int cpu = raw_smp_processor_id();
+ +      struct vmcs_config vmcs_conf;
+ +      struct vmx_capability vmx_cap;
+ +
+ +      if (!kvm_is_vmx_supported())
+ +              return -EIO;
+ +
+ +      if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ +              pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ +              return -EIO;
+ +      }
+ +      if (nested)
+ +              nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ +      if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ +              pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ +              return -EIO;
+ +      }
+ +      return 0;
+ +}
+ +
+ +static int kvm_cpu_vmxon(u64 vmxon_pointer)
+ +{
+ +      u64 msr;
+ +
+ +      cr4_set_bits(X86_CR4_VMXE);
+ +
+ +      asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ +                        _ASM_EXTABLE(1b, %l[fault])
+ +                        : : [vmxon_pointer] "m"(vmxon_pointer)
+ +                        : : fault);
+ +      return 0;
+ +
+ +fault:
+ +      WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ +                rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ +      cr4_clear_bits(X86_CR4_VMXE);
+ +
+ +      return -EFAULT;
+ +}
+ +
+ +static int vmx_hardware_enable(void)
+ +{
+ +      int cpu = raw_smp_processor_id();
+ +      u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ +      int r;
+ +
+ +      if (cr4_read_shadow() & X86_CR4_VMXE)
+ +              return -EBUSY;
+ +
+ +      /*
+ +       * This can happen if we hot-added a CPU but failed to allocate
+ +       * VP assist page for it.
+ +       */
+ +      if (static_branch_unlikely(&enable_evmcs) &&
+ +          !hv_get_vp_assist_page(cpu))
+ +              return -EFAULT;
+ +
+ +      intel_pt_handle_vmx(1);
+ +
+ +      r = kvm_cpu_vmxon(phys_addr);
+ +      if (r) {
+ +              intel_pt_handle_vmx(0);
+ +              return r;
+ +      }
+ +
+ +      if (enable_ept)
+ +              ept_sync_global();
+ +
+ +      return 0;
+ +}
+ +
+ +static void vmclear_local_loaded_vmcss(void)
+ +{
+ +      int cpu = raw_smp_processor_id();
+ +      struct loaded_vmcs *v, *n;
+ +
+ +      list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ +                               loaded_vmcss_on_cpu_link)
+ +              __loaded_vmcs_clear(v);
+ +}
+ +
+ +static void vmx_hardware_disable(void)
+ +{
+ +      vmclear_local_loaded_vmcss();
+ +
+ +      if (cpu_vmxoff())
+ +              kvm_spurious_fault();
+ +
+ +      hv_reset_evmcs();
+ +
+ +      intel_pt_handle_vmx(0);
+ +}
+ +
   struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
   {
         int node = cpu_to_node(cpu);
@@@ -3053,8 -2955,9 +3053,8 @@@ static void fix_rmode_seg(int seg, stru
                 var.type = 0x3;
                 var.avl = 0;
                 if (save->base & 0xf)
- -                      printk_once(KERN_WARNING "kvm: segment base is not "
- -                                      "paragraph aligned when entering "
- -                                      "protected mode (seg=%d)", seg);
+ +                      pr_warn_once("segment base is not paragraph aligned "
+ +                                   "when entering protected mode (seg=%d)", seg);
         }
   
         vmcs_write16(sf->selector, var.selector);
@@@ -3084,7 -2987,8 +3084,7 @@@ static void enter_rmode(struct kvm_vcp
          * vcpu. Warn the user that an update is overdue.
          */
         if (!kvm_vmx->tss_addr)
- -              printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- -                           "called before entering vcpu\n");
+ +              pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
   
         vmx_segment_cache_clear(vmx);
   
@@@ -3904,39 -3808,6 +3904,6 @@@ static void seg_setup(int seg
         vmcs_write32(sf->ar_bytes, ar);
   }
   
- static int alloc_apic_access_page(struct kvm *kvm)
- {
-       struct page *page;
-       void __user *hva;
-       int ret = 0;
- 
-       mutex_lock(&kvm->slots_lock);
-       if (kvm->arch.apic_access_memslot_enabled)
-               goto out;
-       hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
-                                     APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
-       if (IS_ERR(hva)) {
-               ret = PTR_ERR(hva);
-               goto out;
-       }
- 
-       page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-       if (is_error_page(page)) {
-               ret = -EFAULT;
-               goto out;
-       }
- 
-       /*
-        * Do not pin the page in memory, so that memory hot-unplug
-        * is able to migrate it.
-        */
-       put_page(page);
-       kvm->arch.apic_access_memslot_enabled = true;
- out:
-       mutex_unlock(&kvm->slots_lock);
-       return ret;
- }
- 
   int allocate_vpid(void)
   {
         int vpid;
@@@ -6952,7 -6823,7 +6919,7 @@@ static void handle_external_interrupt_i
         gate_desc *desc = (gate_desc *)host_idt_base + vector;
   
         if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
- -          "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ +          "unexpected VM-Exit interrupt info: 0x%x", intr_info))
                 return;
   
         handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
@@@ -7490,7 -7361,7 +7457,7 @@@ static int vmx_vcpu_create(struct kvm_v
         vmx->loaded_vmcs = &vmx->vmcs01;
   
         if (cpu_need_virtualize_apic_accesses(vcpu)) {
-               err = alloc_apic_access_page(vcpu->kvm);
+               err = kvm_alloc_apic_access_page(vcpu->kvm);
                 if (err)
                         goto free_vmcs;
         }
@@@ -7550,6 -7421,29 +7517,6 @@@ static int vmx_vm_init(struct kvm *kvm
         return 0;
   }
   
- -static int __init vmx_check_processor_compat(void)
- -{
- -      struct vmcs_config vmcs_conf;
- -      struct vmx_capability vmx_cap;
- -
- -      if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- -          !this_cpu_has(X86_FEATURE_VMX)) {
- -              pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
- -              return -EIO;
- -      }
- -
- -      if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
- -              return -EIO;
- -      if (nested)
- -              nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
- -      if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- -              printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- -                              smp_processor_id());
- -              return -EIO;
- -      }
- -      return 0;
- -}
- -
   static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
   {
         u8 cache;
@@@ -8129,17 -8023,16 +8096,16 @@@ static void vmx_hardware_unsetup(void
         free_kvm_area();
   }
   
- static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
- {
-       ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
-                         BIT(APICV_INHIBIT_REASON_ABSENT) |
-                         BIT(APICV_INHIBIT_REASON_HYPERV) |
-                         BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
-                         BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
-                         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
- 
-       return supported & BIT(reason);
- }
+ #define VMX_REQUIRED_APICV_INHIBITS                   \
+ (                                                     \
+       BIT(APICV_INHIBIT_REASON_DISABLE)|              \
+       BIT(APICV_INHIBIT_REASON_ABSENT) |              \
+       BIT(APICV_INHIBIT_REASON_HYPERV) |              \
+       BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |            \
+       BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+       BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |    \
+       BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)    \
+ )
   
   static void vmx_vm_destroy(struct kvm *kvm)
   {
@@@ -8149,9 -8042,7 +8115,9 @@@
   }
   
   static struct kvm_x86_ops vmx_x86_ops __initdata = {
- -      .name = "kvm_intel",
+ +      .name = KBUILD_MODNAME,
+ +
+ +      .check_processor_compatibility = vmx_check_processor_compat,
   
         .hardware_unsetup = vmx_hardware_unsetup,
   
@@@ -8225,7 -8116,7 +8191,7 @@@
         .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
         .load_eoi_exitmap = vmx_load_eoi_exitmap,
         .apicv_post_state_restore = vmx_apicv_post_state_restore,
-       .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
+       .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
         .hwapic_irr_update = vmx_hwapic_irr_update,
         .hwapic_isr_update = vmx_hwapic_isr_update,
         .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
@@@ -8371,7 -8262,7 +8337,7 @@@ static __init int hardware_setup(void
                 return -EIO;
   
         if (cpu_has_perf_global_ctrl_bug())
- -              pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ +              pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
                              "does not work properly. Using workaround\n");
   
         if (boot_cpu_has(X86_FEATURE_NX))
@@@ -8379,7 -8270,7 +8345,7 @@@
   
         if (boot_cpu_has(X86_FEATURE_MPX)) {
                 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
- -              WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ +              WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
         }
   
         if (!cpu_has_vmx_mpx())
@@@ -8398,7 -8289,7 +8364,7 @@@
   
         /* NX support is required for shadow paging. */
         if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
- -              pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ +              pr_err_ratelimited("NX (Execute Disable) not supported\n");
                 return -EOPNOTSUPP;
         }
   
@@@ -8550,6 -8441,9 +8516,6 @@@
   }
   
   static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- -      .cpu_has_kvm_support = cpu_has_kvm_support,
- -      .disabled_by_bios = vmx_disabled_by_bios,
- -      .check_processor_compatibility = vmx_check_processor_compat,
         .hardware_setup = hardware_setup,
         .handle_intel_pt_intr = NULL,
   
@@@ -8567,23 -8461,41 +8533,23 @@@ static void vmx_cleanup_l1d_flush(void
         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
   }
   
- -static void vmx_exit(void)
+ +static void __vmx_exit(void)
   {
+ +      allow_smaller_maxphyaddr = false;
+ +
   #ifdef CONFIG_KEXEC_CORE
         RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
         synchronize_rcu();
   #endif
+ +      vmx_cleanup_l1d_flush();
+ +}
   
+ +static void vmx_exit(void)
+ +{
         kvm_exit();
+ +      kvm_x86_vendor_exit();
   
- -#if IS_ENABLED(CONFIG_HYPERV)
- -      if (static_branch_unlikely(&enable_evmcs)) {
- -              int cpu;
- -              struct hv_vp_assist_page *vp_ap;
- -              /*
- -               * Reset everything to support using non-enlightened VMCS
- -               * access later (e.g. when we reload the module with
- -               * enlightened_vmcs=0)
- -               */
- -              for_each_online_cpu(cpu) {
- -                      vp_ap = hv_get_vp_assist_page(cpu);
- -
- -                      if (!vp_ap)
- -                              continue;
- -
- -                      vp_ap->nested_control.features.directhypercall = 0;
- -                      vp_ap->current_nested_vmcs = 0;
- -                      vp_ap->enlighten_vmentry = 0;
- -              }
- -
- -              static_branch_disable(&enable_evmcs);
- -      }
- -#endif
- -      vmx_cleanup_l1d_flush();
- -
- -      allow_smaller_maxphyaddr = false;
+ +      __vmx_exit();
   }
   module_exit(vmx_exit);
   
@@@ -8591,29 -8503,56 +8557,29 @@@ static int __init vmx_init(void
   {
         int r, cpu;
   
- -#if IS_ENABLED(CONFIG_HYPERV)
+ +      if (!kvm_is_vmx_supported())
+ +              return -EOPNOTSUPP;
+ +
         /*
- -       * Enlightened VMCS usage should be recommended and the host needs
- -       * to support eVMCS v1 or above. We can also disable eVMCS support
- -       * with module parameter.
+ +       * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
+ +       * to unwind if a later step fails.
          */
- -      if (enlightened_vmcs &&
- -          ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- -          (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- -          KVM_EVMCS_VERSION) {
+ +      hv_init_evmcs();
   
- -              /* Check that we have assist pages on all online CPUs */
- -              for_each_online_cpu(cpu) {
- -                      if (!hv_get_vp_assist_page(cpu)) {
- -                              enlightened_vmcs = false;
- -                              break;
- -                      }
- -              }
- -
- -              if (enlightened_vmcs) {
- -                      pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- -                      static_branch_enable(&enable_evmcs);
- -              }
- -
- -              if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- -                      vmx_x86_ops.enable_l2_tlb_flush
- -                              = hv_enable_l2_tlb_flush;
- -
- -      } else {
- -              enlightened_vmcs = false;
- -      }
- -#endif
- -
- -      r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- -                   __alignof__(struct vcpu_vmx), THIS_MODULE);
+ +      r = kvm_x86_vendor_init(&vmx_init_ops);
         if (r)
                 return r;
   
         /*
- -       * Must be called after kvm_init() so enable_ept is properly set
+ +       * Must be called after common x86 init so enable_ept is properly set
          * up. Hand the parameter mitigation value in which was stored in
          * the pre module init parser. If no parameter was given, it will
          * contain 'auto' which will be turned into the default 'cond'
          * mitigation mode.
          */
         r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- -      if (r) {
- -              vmx_exit();
- -              return r;
- -      }
+ +      if (r)
+ +              goto err_l1d_flush;
   
         vmx_setup_fb_clear_ctrl();
   
@@@ -8637,21 -8576,6 +8603,21 @@@
         if (!enable_ept)
                 allow_smaller_maxphyaddr = true;
   
+ +      /*
+ +       * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ +       * exposed to userspace!
+ +       */
+ +      r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
+ +                   THIS_MODULE);
+ +      if (r)
+ +              goto err_kvm_init;
+ +
         return 0;
+ +
+ +err_kvm_init:
+ +      __vmx_exit();
+ +err_l1d_flush:
+ +      kvm_x86_vendor_exit();
+ +      return r;
   }
   module_init(vmx_init);
diff --combined arch/x86/kvm/x86.c

index c3ac88036b522c2833c2768541153af6992c9527,5becce5bd45a40d7f622692069a2afbc49c600ee..508074e47bc0ebd4535de4bc548f57abdffe863d
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -15,7 -15,6 +15,7 @@@
    *   Amit Shah    <[email protected]>
    *   Ben-Ami Yassour <[email protected]>
    */
+ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   
   #include <linux/kvm_host.h>
   #include "irq.h"
@@@ -129,7 -128,6 +129,7 @@@ static int kvm_vcpu_do_singlestep(struc
   static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
   static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
   
+ +static DEFINE_MUTEX(vendor_module_lock);
   struct kvm_x86_ops kvm_x86_ops __read_mostly;
   
   #define KVM_X86_OP(func)                                           \
@@@ -1559,7 -1557,7 +1559,7 @@@ static const u32 msr_based_features_all
         MSR_IA32_VMX_EPT_VPID_CAP,
         MSR_IA32_VMX_VMFUNC,
   
- -      MSR_F10H_DECFG,
+ +      MSR_AMD64_DE_CFG,
         MSR_IA32_UCODE_REV,
         MSR_IA32_ARCH_CAPABILITIES,
         MSR_IA32_PERF_CAPABILITIES,
@@@ -2088,7 -2086,7 +2088,7 @@@ static int kvm_emulate_monitor_mwait(st
             !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
                 return kvm_handle_invalid_op(vcpu);
   
- -      pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
+ +      pr_warn_once("%s instruction emulated as NOP!\n", insn);
         return kvm_emulate_as_nop(vcpu);
   }
   int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
@@@ -2435,8 -2433,7 +2435,8 @@@ static int kvm_set_tsc_khz(struct kvm_v
         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
         if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
- -              pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+ +              pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+ +                       user_tsc_khz, thresh_lo, thresh_hi);
                 use_scaling = 1;
         }
         return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
@@@ -7704,7 -7701,7 +7704,7 @@@ static int emulator_cmpxchg_emulated(st
         return X86EMUL_CONTINUE;
   
   emul_write:
- -      printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+ +      pr_warn_once("emulating exchange as write\n");
   
         return emulator_write_emulated(ctxt, addr, new, bytes, exception);
   }
@@@ -8265,7 -8262,7 +8265,7 @@@ static struct x86_emulate_ctxt *alloc_e
   
         ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
         if (!ctxt) {
- -              pr_err("kvm: failed to allocate vcpu's emulator\n");
+ +              pr_err("failed to allocate vcpu's emulator\n");
                 return NULL;
         }
   
@@@ -9276,66 -9273,35 +9276,66 @@@ static struct notifier_block pvclock_gt
   };
   #endif
   
- -int kvm_arch_init(void *opaque)
+ +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+ +{
+ +      memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+ +
+ +#define __KVM_X86_OP(func) \
+ +      static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+ +#define KVM_X86_OP(func) \
+ +      WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+ +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+ +#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ +      static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ +                                         (void *)__static_call_return0);
+ +#include <asm/kvm-x86-ops.h>
+ +#undef __KVM_X86_OP
+ +
+ +      kvm_pmu_ops_update(ops->pmu_ops);
+ +}
+ +
+ +static int kvm_x86_check_processor_compatibility(void)
+ +{
+ +      int cpu = smp_processor_id();
+ +      struct cpuinfo_x86 *c = &cpu_data(cpu);
+ +
+ +      /*
+ +       * Compatibility checks are done when loading KVM and when enabling
+ +       * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+ +       * compatible, i.e. KVM should never perform a compatibility check on
+ +       * an offline CPU.
+ +       */
+ +      WARN_ON(!cpu_online(cpu));
+ +
+ +      if (__cr4_reserved_bits(cpu_has, c) !=
+ +          __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ +              return -EIO;
+ +
+ +      return static_call(kvm_x86_check_processor_compatibility)();
+ +}
+ +
+ +static void kvm_x86_check_cpu_compat(void *ret)
+ +{
+ +      *(int *)ret = kvm_x86_check_processor_compatibility();
+ +}
+ +
+ +static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
   {
- -      struct kvm_x86_init_ops *ops = opaque;
         u64 host_pat;
- -      int r;
+ +      int r, cpu;
   
         if (kvm_x86_ops.hardware_enable) {
- -              pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
+ +              pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
                 return -EEXIST;
         }
   
- -      if (!ops->cpu_has_kvm_support()) {
- -              pr_err_ratelimited("kvm: no hardware support for '%s'\n",
- -                                 ops->runtime_ops->name);
- -              return -EOPNOTSUPP;
- -      }
- -      if (ops->disabled_by_bios()) {
- -              pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
- -                                 ops->runtime_ops->name);
- -              return -EOPNOTSUPP;
- -      }
- -
         /*
          * KVM explicitly assumes that the guest has an FPU and
          * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
          * vCPU's FPU state as a fxregs_state struct.
          */
         if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
- -              printk(KERN_ERR "kvm: inadequate fpu\n");
+ +              pr_err("inadequate fpu\n");
                 return -EOPNOTSUPP;
         }
   
@@@ -9353,19 -9319,19 +9353,19 @@@
          */
         if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
             (host_pat & GENMASK(2, 0)) != 6) {
- -              pr_err("kvm: host PAT[0] is not WB\n");
+ +              pr_err("host PAT[0] is not WB\n");
                 return -EIO;
         }
   
         x86_emulator_cache = kvm_alloc_emulator_cache();
         if (!x86_emulator_cache) {
- -              pr_err("kvm: failed to allocate cache for x86 emulator\n");
+ +              pr_err("failed to allocate cache for x86 emulator\n");
                 return -ENOMEM;
         }
   
         user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
         if (!user_return_msrs) {
- -              printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ +              pr_err("failed to allocate percpu kvm_user_return_msrs\n");
                 r = -ENOMEM;
                 goto out_free_x86_emulator_cache;
         }
@@@ -9375,37 -9341,13 +9375,37 @@@
         if (r)
                 goto out_free_percpu;
   
- -      kvm_timer_init();
- -
         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
                 kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
         }
   
+ +      rdmsrl_safe(MSR_EFER, &host_efer);
+ +
+ +      if (boot_cpu_has(X86_FEATURE_XSAVES))
+ +              rdmsrl(MSR_IA32_XSS, host_xss);
+ +
+ +      kvm_init_pmu_capability();
+ +
+ +      r = ops->hardware_setup();
+ +      if (r != 0)
+ +              goto out_mmu_exit;
+ +
+ +      kvm_ops_update(ops);
+ +
+ +      for_each_online_cpu(cpu) {
+ +              smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+ +              if (r < 0)
+ +                      goto out_unwind_ops;
+ +      }
+ +
+ +      /*
+ +       * Point of no return!  DO NOT add error paths below this point unless
+ +       * absolutely necessary, as most operations from this point forward
+ +       * require unwinding.
+ +       */
+ +      kvm_timer_init();
+ +
         if (pi_inject_timer == -1)
                 pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
   #ifdef CONFIG_X86_64
@@@ -9415,35 -9357,8 +9415,35 @@@
                 set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
   #endif
   
+ +      kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+ +
+ +      if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ +              kvm_caps.supported_xss = 0;
+ +
+ +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ +      cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+ +#undef __kvm_cpu_cap_has
+ +
+ +      if (kvm_caps.has_tsc_control) {
+ +              /*
+ +               * Make sure the user can only configure tsc_khz values that
+ +               * fit into a signed integer.
+ +               * A min value is not calculated because it will always
+ +               * be 1 on all machines.
+ +               */
+ +              u64 max = min(0x7fffffffULL,
+ +                            __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ +              kvm_caps.max_guest_tsc_khz = max;
+ +      }
+ +      kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+ +      kvm_init_msr_list();
         return 0;
   
+ +out_unwind_ops:
+ +      kvm_x86_ops.hardware_enable = NULL;
+ +      static_call(kvm_x86_hardware_unsetup)();
+ +out_mmu_exit:
+ +      kvm_mmu_vendor_module_exit();
   out_free_percpu:
         free_percpu(user_return_msrs);
   out_free_x86_emulator_cache:
@@@ -9451,22 -9366,8 +9451,22 @@@
         return r;
   }
   
- -void kvm_arch_exit(void)
+ +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+ +{
+ +      int r;
+ +
+ +      mutex_lock(&vendor_module_lock);
+ +      r = __kvm_x86_vendor_init(ops);
+ +      mutex_unlock(&vendor_module_lock);
+ +
+ +      return r;
+ +}
+ +EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+ +
+ +void kvm_x86_vendor_exit(void)
   {
+ +      kvm_unregister_perf_callbacks();
+ +
   #ifdef CONFIG_X86_64
         if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
                 clear_hv_tscchange_cb();
@@@ -9483,7 -9384,7 +9483,7 @@@
         irq_work_sync(&pvclock_irq_work);
         cancel_work_sync(&pvclock_gtod_work);
   #endif
- -      kvm_x86_ops.hardware_enable = NULL;
+ +      static_call(kvm_x86_hardware_unsetup)();
         kvm_mmu_vendor_module_exit();
         free_percpu(user_return_msrs);
         kmem_cache_destroy(x86_emulator_cache);
@@@ -9491,11 -9392,7 +9491,11 @@@
         static_key_deferred_flush(&kvm_xen_enabled);
         WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
   #endif
+ +      mutex_lock(&vendor_module_lock);
+ +      kvm_x86_ops.hardware_enable = NULL;
+ +      mutex_unlock(&vendor_module_lock);
   }
+ +EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
   
   static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
   {
@@@ -10148,7 -10045,7 +10148,7 @@@ void kvm_make_scan_ioapic_request(struc
         kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
   }
   
- void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
   {
         struct kvm_lapic *apic = vcpu->arch.apic;
         bool activate;
@@@ -10183,7 -10080,30 +10183,30 @@@ out
         preempt_enable();
         up_read(&vcpu->kvm->arch.apicv_update_lock);
   }
- EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+ EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
+ 
+ static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+ {
+       if (!lapic_in_kernel(vcpu))
+               return;
+ 
+       /*
+        * Due to sharing page tables across vCPUs, the xAPIC memslot must be
+        * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
+        * hardware doesn't support x2APIC virtualization.  E.g. some AMD
+        * CPUs support AVIC but not x2APIC.  KVM still allows enabling AVIC in
+        * this case so that KVM can use the AVIC doorbell to inject interrupts to
+        * running vCPUs, but KVM must not create SPTEs for the APIC base as
+        * the vCPU would incorrectly be able to access the vAPIC page via MMIO
+        * despite being in x2APIC mode.  For simplicity, inhibiting the APIC
+        * access page is sticky.
+        */
+       if (apic_x2apic_mode(vcpu->arch.apic) &&
+           kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
+               kvm_inhibit_apic_access_page(vcpu);
+ 
+       __kvm_vcpu_update_apicv(vcpu);
+ }
   
   void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
                                       enum kvm_apicv_inhibit reason, bool set)
@@@ -10192,7 -10112,7 +10215,7 @@@
   
         lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
   
-       if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
+       if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
                 return;
   
         old = new = kvm->arch.apicv_inhibit_reasons;
@@@ -11636,7 -11556,7 +11659,7 @@@ static int sync_regs(struct kvm_vcpu *v
   int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
   {
         if (kvm_check_tsc_unstable() && kvm->created_vcpus)
- -              pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ +              pr_warn_once("SMP vm created on host with unstable TSC; "
                              "guest TSC will not be reliable\n");
   
         if (!kvm->arch.max_vcpu_ids)
@@@ -11713,7 -11633,7 +11736,7 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
                 goto free_wbinvd_dirty_mask;
   
         if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
- -              pr_err("kvm: failed to allocate vcpu's fpu\n");
+ +              pr_err("failed to allocate vcpu's fpu\n");
                 goto free_emulate_ctxt;
         }
   
@@@ -11987,11 -11907,6 +12010,11 @@@ int kvm_arch_hardware_enable(void
         bool stable, backwards_tsc = false;
   
         kvm_user_return_msr_cpu_online();
+ +
+ +      ret = kvm_x86_check_processor_compatibility();
+ +      if (ret)
+ +              return ret;
+ +
         ret = static_call(kvm_x86_hardware_enable)();
         if (ret != 0)
                 return ret;
@@@ -12078,6 -11993,88 +12101,6 @@@ void kvm_arch_hardware_disable(void
         drop_user_return_notifiers();
   }
   
- -static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
- -{
- -      memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
- -
- -#define __KVM_X86_OP(func) \
- -      static_call_update(kvm_x86_##func, kvm_x86_ops.func);
- -#define KVM_X86_OP(func) \
- -      WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
- -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
- -#define KVM_X86_OP_OPTIONAL_RET0(func) \
- -      static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
- -                                         (void *)__static_call_return0);
- -#include <asm/kvm-x86-ops.h>
- -#undef __KVM_X86_OP
- -
- -      kvm_pmu_ops_update(ops->pmu_ops);
- -}
- -
- -int kvm_arch_hardware_setup(void *opaque)
- -{
- -      struct kvm_x86_init_ops *ops = opaque;
- -      int r;
- -
- -      rdmsrl_safe(MSR_EFER, &host_efer);
- -
- -      if (boot_cpu_has(X86_FEATURE_XSAVES))
- -              rdmsrl(MSR_IA32_XSS, host_xss);
- -
- -      kvm_init_pmu_capability();
- -
- -      r = ops->hardware_setup();
- -      if (r != 0)
- -              return r;
- -
- -      kvm_ops_update(ops);
- -
- -      kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
- -
- -      if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
- -              kvm_caps.supported_xss = 0;
- -
- -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
- -      cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
- -#undef __kvm_cpu_cap_has
- -
- -      if (kvm_caps.has_tsc_control) {
- -              /*
- -               * Make sure the user can only configure tsc_khz values that
- -               * fit into a signed integer.
- -               * A min value is not calculated because it will always
- -               * be 1 on all machines.
- -               */
- -              u64 max = min(0x7fffffffULL,
- -                            __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
- -              kvm_caps.max_guest_tsc_khz = max;
- -      }
- -      kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
- -      kvm_init_msr_list();
- -      return 0;
- -}
- -
- -void kvm_arch_hardware_unsetup(void)
- -{
- -      kvm_unregister_perf_callbacks();
- -
- -      static_call(kvm_x86_hardware_unsetup)();
- -}
- -
- -int kvm_arch_check_processor_compat(void *opaque)
- -{
- -      struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- -      struct kvm_x86_init_ops *ops = opaque;
- -
- -      WARN_ON(!irqs_disabled());
- -
- -      if (__cr4_reserved_bits(cpu_has, c) !=
- -          __cr4_reserved_bits(cpu_has, &boot_cpu_data))
- -              return -EIO;
- -
- -      return ops->check_processor_compatibility();
- -}
- -
   bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
   {
         return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
author	Paolo Bonzini <[email protected]>
	Tue, 27 Dec 2022 12:56:16 +0000 (07:56 -0500)
committer	Paolo Bonzini <[email protected]>
	Tue, 24 Jan 2023 11:08:01 +0000 (06:08 -0500)
		1	2
arch/x86/include/asm/kvm-x86-ops.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/lapic.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/svm/avic.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/svm/nested.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/svm/svm.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history