Merge tag 'kvm-x86-misc-6.6' of https://github.com/kvm-x86/linux into HEAD

author Paolo Bonzini <[email protected]>
Thu, 31 Aug 2023 17:36:33 +0000 (13:36 -0400)
committer Paolo Bonzini <[email protected]>
Thu, 31 Aug 2023 17:36:33 +0000 (13:36 -0400)
diff --combined arch/x86/include/asm/kvm_host.h

index 6523f5494cb20f2dc43474798a88f5a7e54f29ca,771adf2438bc283156400dc3f9838f6b426e26d2..e3c9ff4146fca3a6ce3da86cce36c5e690c872cd
--- 1/arch/x86/include/asm/kvm_host.h
--- 2/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@@ -746,7 -746,6 +746,6 @@@ struct kvm_vcpu_arch 
         u64 smi_count;
         bool at_instruction_boundary;
         bool tpr_access_reporting;
-       bool xsaves_enabled;
         bool xfd_no_write_intercept;
         u64 ia32_xss;
         u64 microcode_version;
@@@ -831,6 -830,25 +830,25 @@@
         struct kvm_cpuid_entry2 *cpuid_entries;
         struct kvm_hypervisor_cpuid kvm_cpuid;
   
+       /*
+        * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly
+        * when "struct kvm_vcpu_arch" is no longer defined in an
+        * arch/x86/include/asm header.  The max is mostly arbitrary, i.e.
+        * can be increased as necessary.
+        */
+ #define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG
+ 
+       /*
+        * Track whether or not the guest is allowed to use features that are
+        * governed by KVM, where "governed" means KVM needs to manage state
+        * and/or explicitly enable the feature in hardware.  Typically, but
+        * not always, governed features can be used by the guest if and only
+        * if both KVM and userspace want to expose the feature to the guest.
+        */
+       struct {
+               DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES);
+       } governed_features;
+ 
         u64 reserved_gpa_bits;
         int maxphyaddr;
   
@@@ -1566,10 -1584,9 +1584,10 @@@ struct kvm_x86_ops 
         void (*set_segment)(struct kvm_vcpu *vcpu,
                             struct kvm_segment *var, int seg);
         void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ +      bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
         void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
         void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- -      bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ +      bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
         void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
         int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
         void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@@ -1655,8 -1672,8 +1673,8 @@@
   
         u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
         u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
-       void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
-       void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
+       void (*write_tsc_offset)(struct kvm_vcpu *vcpu);
+       void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
   
         /*
          * Retrieve somewhat arbitrary exit information.  Intended to
@@@ -1795,8 -1812,8 +1813,8 @@@ static inline struct kvm *kvm_arch_allo
   #define __KVM_HAVE_ARCH_VM_FREE
   void kvm_arch_free_vm(struct kvm *kvm);
   
- -#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
- -static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+ +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+ +static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
   {
         if (kvm_x86_ops.flush_remote_tlbs &&
             !static_call(kvm_x86_flush_remote_tlbs)(kvm))
@@@ -1805,8 -1822,6 +1823,8 @@@
                 return -ENOTSUPP;
   }
   
+ +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+ +
   #define kvm_arch_pmi_in_guest(vcpu) \
         ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
   
diff --combined arch/x86/kvm/cpuid.c

index d3432687c9e6315d0a521a5babce201a527a3a0b,e961e9a058477c4b66991a129128748073a97bcf..0544e30b4946d1e5fb40f6737335ac00d58a84b2
--- 1/arch/x86/kvm/cpuid.c
--- 2/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@@ -11,6 -11,7 +11,7 @@@
   #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   
   #include <linux/kvm_host.h>
+ #include "linux/lockdep.h"
   #include <linux/export.h>
   #include <linux/vmalloc.h>
   #include <linux/uaccess.h>
@@@ -84,6 -85,18 +85,18 @@@ static inline struct kvm_cpuid_entry2 *
         struct kvm_cpuid_entry2 *e;
         int i;
   
+       /*
+        * KVM has a semi-arbitrary rule that querying the guest's CPUID model
+        * with IRQs disabled is disallowed.  The CPUID model can legitimately
+        * have over one hundred entries, i.e. the lookup is slow, and IRQs are
+        * typically disabled in KVM only when KVM is in a performance critical
+        * path, e.g. the core VM-Enter/VM-Exit run loop.  Nothing will break
+        * if this rule is violated, this assertion is purely to flag potential
+        * performance issues.  If this fires, consider moving the lookup out
+        * of the hotpath, e.g. by caching information during CPUID updates.
+        */
+       lockdep_assert_irqs_enabled();
+ 
         for (i = 0; i < nent; i++) {
                 e = &entries[i];
   
@@@ -312,6 -325,27 +325,27 @@@ static void kvm_vcpu_after_set_cpuid(st
   {
         struct kvm_lapic *apic = vcpu->arch.apic;
         struct kvm_cpuid_entry2 *best;
+       bool allow_gbpages;
+ 
+       BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES);
+       bitmap_zero(vcpu->arch.governed_features.enabled,
+                   KVM_MAX_NR_GOVERNED_FEATURES);
+ 
+       /*
+        * If TDP is enabled, let the guest use GBPAGES if they're supported in
+        * hardware.  The hardware page walker doesn't let KVM disable GBPAGES,
+        * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+        * walk for performance and complexity reasons.  Not to mention KVM
+        * _can't_ solve the problem because GVA->GPA walks aren't visible to
+        * KVM once a TDP translation is installed.  Mimic hardware behavior so
+        * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+        * If TDP is disabled, honor *only* guest CPUID as KVM has full control
+        * and can install smaller shadow pages if the host lacks 1GiB support.
+        */
+       allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+                                     guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
+       if (allow_gbpages)
+               kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES);
   
         best = kvm_find_cpuid_entry(vcpu, 1);
         if (best && apic) {
@@@ -647,7 -681,8 +681,8 @@@ void kvm_set_cpu_caps(void
         );
   
         kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
-               F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
+               F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
+               F(AMX_COMPLEX)
         );
   
         kvm_cpu_cap_mask(CPUID_D_1_EAX,
@@@ -729,9 -764,6 +764,9 @@@
                 F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
         );
   
+ +      if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
+ +              kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+ +
         kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
                 F(PERFMON_V2)
         );
@@@ -1154,6 -1186,9 +1189,9 @@@ static inline int __do_cpuid_func(struc
                 cpuid_entry_override(entry, CPUID_8000_0001_EDX);
                 cpuid_entry_override(entry, CPUID_8000_0001_ECX);
                 break;
+       case 0x80000005:
+               /*  Pass host L1 cache and TLB info. */
+               break;
         case 0x80000006:
                 /* Drop reserved bits, pass host L2 cache and TLB info. */
                 entry->edx &= ~GENMASK(17, 16);
diff --combined arch/x86/kvm/lapic.c

index a983a16163b137524e8295a38d0c3709e788edde,673880bc0762c7dfd5e70b026b613229a47c3b6f..dcd60b39e794d95ab4b1b4046a69ec5cf1a9dffc
--- 1/arch/x86/kvm/lapic.c
--- 2/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@@ -376,7 -376,8 +376,8 @@@ void kvm_recalculate_apic_map(struct kv
         struct kvm_vcpu *vcpu;
         unsigned long i;
         u32 max_id = 255; /* enough space for any xAPIC ID */
-       bool xapic_id_mismatch = false;
+       bool xapic_id_mismatch;
+       int r;
   
         /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map.  */
         if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
@@@ -386,9 -387,14 +387,14 @@@
                   "Dirty APIC map without an in-kernel local APIC");
   
         mutex_lock(&kvm->arch.apic_map_lock);
+ 
+ retry:
         /*
-        * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
-        * (if clean) or the APIC registers (if dirty).
+        * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
+        * or the APIC registers (if dirty).  Note, on retry the map may have
+        * not yet been marked dirty by whatever task changed a vCPU's x2APIC
+        * ID, i.e. the map may still show up as in-progress.  In that case
+        * this task still needs to retry and complete its calculation.
          */
         if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
                                    DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
@@@ -397,6 -403,15 +403,15 @@@
                 return;
         }
   
+       /*
+        * Reset the mismatch flag between attempts so that KVM does the right
+        * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
+        * keep max_id strictly increasing.  Disallowing max_id from shrinking
+        * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU
+        * with the highest x2APIC ID is toggling its APIC on and off.
+        */
+       xapic_id_mismatch = false;
+ 
         kvm_for_each_vcpu(i, vcpu, kvm)
                 if (kvm_apic_present(vcpu))
                         max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
@@@ -415,9 -430,15 +430,15 @@@
                 if (!kvm_apic_present(vcpu))
                         continue;
   
-               if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+               r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
+               if (r) {
                         kvfree(new);
                         new = NULL;
+                       if (r == -E2BIG) {
+                               cond_resched();
+                               goto retry;
+                       }
+ 
                         goto out;
                 }
   
@@@ -637,22 -658,16 +658,22 @@@ bool __kvm_apic_update_irr(u32 *pir, vo
         *max_irr = -1;
   
         for (i = vec = 0; i <= 7; i++, vec += 32) {
+ +              u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
+ +
+ +              irr_val = *p_irr;
                 pir_val = READ_ONCE(pir[i]);
- -              irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
+ +
                 if (pir_val) {
+ +                      pir_val = xchg(&pir[i], 0);
+ +
                         prev_irr_val = irr_val;
- -                      irr_val |= xchg(&pir[i], 0);
- -                      *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
- -                      if (prev_irr_val != irr_val) {
- -                              max_updated_irr =
- -                                      __fls(irr_val ^ prev_irr_val) + vec;
- -                      }
+ +                      do {
+ +                              irr_val = prev_irr_val | pir_val;
+ +                      } while (prev_irr_val != irr_val &&
+ +                               !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
+ +
+ +                      if (prev_irr_val != irr_val)
+ +                              max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
                 }
                 if (irr_val)
                         *max_irr = __fls(irr_val) + vec;
@@@ -666,11 -681,8 +687,11 @@@ EXPORT_SYMBOL_GPL(__kvm_apic_update_irr
   bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
   {
         struct kvm_lapic *apic = vcpu->arch.apic;
+ +      bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
   
- -      return __kvm_apic_update_irr(pir, apic->regs, max_irr);
+ +      if (unlikely(!apic->apicv_active && irr_updated))
+ +              apic->irr_pending = true;
+ +      return irr_updated;
   }
   EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
   
diff --combined arch/x86/kvm/mmu/mmu.c

index 7b52e31f1151ba48634fda5240c6ca0184929319,a0c2acb323ebe73439333fe58f2424cccd3cd9f4..276157f8496cc6775e137b39936d4726131b6c93
--- 1/arch/x86/kvm/mmu/mmu.c
--- 2/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@@ -278,12 -278,16 +278,12 @@@ static inline bool kvm_available_flush_
         return kvm_x86_ops.flush_remote_tlbs_range;
   }
   
- -void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
- -                               gfn_t nr_pages)
+ +int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
   {
- -      int ret = -EOPNOTSUPP;
+ +      if (!kvm_x86_ops.flush_remote_tlbs_range)
+ +              return -EOPNOTSUPP;
   
- -      if (kvm_x86_ops.flush_remote_tlbs_range)
- -              ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
- -                                                                 nr_pages);
- -      if (ret)
- -              kvm_flush_remote_tlbs(kvm);
+ +      return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
   }
   
   static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
@@@ -1584,7 -1588,7 +1584,7 @@@ static __always_inline bool kvm_handle_
         for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
                                  range->start, range->end - 1, &iterator)
                 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- -                             iterator.level, range->pte);
+ +                             iterator.level, range->arg.pte);
   
         return ret;
   }
@@@ -4804,28 -4808,13 +4804,13 @@@ static void __reset_rsvds_bits_mask(str
         }
   }
   
- static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
- {
-       /*
-        * If TDP is enabled, let the guest use GBPAGES if they're supported in
-        * hardware.  The hardware page walker doesn't let KVM disable GBPAGES,
-        * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
-        * walk for performance and complexity reasons.  Not to mention KVM
-        * _can't_ solve the problem because GVA->GPA walks aren't visible to
-        * KVM once a TDP translation is installed.  Mimic hardware behavior so
-        * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
-        */
-       return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
-                            guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
- }
- 
   static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                                         struct kvm_mmu *context)
   {
         __reset_rsvds_bits_mask(&context->guest_rsvd_check,
                                 vcpu->arch.reserved_gpa_bits,
                                 context->cpu_role.base.level, is_efer_nx(context),
-                               guest_can_use_gbpages(vcpu),
+                               guest_can_use(vcpu, X86_FEATURE_GBPAGES),
                                 is_cr4_pse(context),
                                 guest_cpuid_is_amd_or_hygon(vcpu));
   }
@@@ -4902,7 -4891,8 +4887,8 @@@ static void reset_shadow_zero_bits_mask
         __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
                                 context->root_role.level,
                                 context->root_role.efer_nx,
-                               guest_can_use_gbpages(vcpu), is_pse, is_amd);
+                               guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+                               is_pse, is_amd);
   
         if (!shadow_me_mask)
                 return;
@@@ -6666,7 -6656,7 +6652,7 @@@ static void kvm_rmap_zap_collapsible_sp
          */
         if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
                             PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
- -              kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ +              kvm_flush_remote_tlbs_memslot(kvm, slot);
   }
   
   void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
@@@ -6685,6 -6675,20 +6671,6 @@@
         }
   }
   
- -void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- -                                      const struct kvm_memory_slot *memslot)
- -{
- -      /*
- -       * All current use cases for flushing the TLBs for a specific memslot
- -       * related to dirty logging, and many do the TLB flush out of mmu_lock.
- -       * The interaction between the various operations on memslot must be
- -       * serialized by slots_locks to ensure the TLB flush from one operation
- -       * is observed by any other operation on the same memslot.
- -       */
- -      lockdep_assert_held(&kvm->slots_lock);
- -      kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
- -}
- -
   void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                    const struct kvm_memory_slot *memslot)
   {
@@@ -6844,7 -6848,7 +6830,7 @@@ static void mmu_destroy_caches(void
   static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
   {
         if (nx_hugepage_mitigation_hard_disabled)
-               return sprintf(buffer, "never\n");
+               return sysfs_emit(buffer, "never\n");
   
         return param_get_bool(buffer, kp);
   }
diff --combined arch/x86/kvm/svm/svm.c

index 488814e919ca0ef3b51574e1a5a51ec320c6170f,226b3a780d0fcb20120040da0fb4e1c494c01e95..f283eb47f6acec7d34453c2047536bfaa14b66c5
--- 1/arch/x86/kvm/svm/svm.c
--- 2/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@@ -39,10 -39,9 +39,9 @@@
   #include <asm/spec-ctrl.h>
   #include <asm/cpu_device_id.h>
   #include <asm/traps.h>
+ #include <asm/reboot.h>
   #include <asm/fpu/api.h>
   
- #include <asm/virtext.h>
- 
   #include <trace/events/ipi.h>
   
   #include "trace.h"
@@@ -203,7 -202,7 +202,7 @@@ static int nested = true
   module_param(nested, int, S_IRUGO);
   
   /* enable/disable Next RIP Save */
- -static int nrips = true;
+ +int nrips = true;
   module_param(nrips, int, 0444);
   
   /* enable/disable Virtual VMLOAD VMSAVE */
@@@ -365,8 -364,6 +364,8 @@@ static void svm_set_interrupt_shadow(st
                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
   
   }
+ +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ +                                      void *insn, int insn_len);
   
   static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
                                            bool commit_side_effects)
@@@ -387,14 -384,6 +386,14 @@@
         }
   
         if (!svm->next_rip) {
+ +              /*
+ +               * FIXME: Drop this when kvm_emulate_instruction() does the
+ +               * right thing and treats "can't emulate" as outright failure
+ +               * for EMULTYPE_SKIP.
+ +               */
+ +              if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
+ +                      return 0;
+ +
                 if (unlikely(!commit_side_effects))
                         old_rflags = svm->vmcb->save.rflags;
   
@@@ -527,14 -516,21 +526,21 @@@ static void svm_init_osvw(struct kvm_vc
                 vcpu->arch.osvw.status |= 1;
   }
   
- static bool kvm_is_svm_supported(void)
+ static bool __kvm_is_svm_supported(void)
   {
-       int cpu = raw_smp_processor_id();
-       const char *msg;
+       int cpu = smp_processor_id();
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+ 
         u64 vm_cr;
   
-       if (!cpu_has_svm(&msg)) {
-               pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+       if (c->x86_vendor != X86_VENDOR_AMD &&
+           c->x86_vendor != X86_VENDOR_HYGON) {
+               pr_err("CPU %d isn't AMD or Hygon\n", cpu);
+               return false;
+       }
+ 
+       if (!cpu_has(c, X86_FEATURE_SVM)) {
+               pr_err("SVM not supported by CPU %d\n", cpu);
                 return false;
         }
   
@@@ -552,25 -548,55 +558,55 @@@
         return true;
   }
   
+ static bool kvm_is_svm_supported(void)
+ {
+       bool supported;
+ 
+       migrate_disable();
+       supported = __kvm_is_svm_supported();
+       migrate_enable();
+ 
+       return supported;
+ }
+ 
   static int svm_check_processor_compat(void)
   {
-       if (!kvm_is_svm_supported())
+       if (!__kvm_is_svm_supported())
                 return -EIO;
   
         return 0;
   }
   
- void __svm_write_tsc_multiplier(u64 multiplier)
+ static void __svm_write_tsc_multiplier(u64 multiplier)
   {
-       preempt_disable();
- 
         if (multiplier == __this_cpu_read(current_tsc_ratio))
-               goto out;
+               return;
   
         wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
         __this_cpu_write(current_tsc_ratio, multiplier);
- out:
-       preempt_enable();
+ }
+ 
+ static inline void kvm_cpu_svm_disable(void)
+ {
+       uint64_t efer;
+ 
+       wrmsrl(MSR_VM_HSAVE_PA, 0);
+       rdmsrl(MSR_EFER, efer);
+       if (efer & EFER_SVME) {
+               /*
+                * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
+                * NMI aren't blocked.
+                */
+               stgi();
+               wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+       }
+ }
+ 
+ static void svm_emergency_disable(void)
+ {
+       kvm_rebooting = true;
+ 
+       kvm_cpu_svm_disable();
   }
   
   static void svm_hardware_disable(void)
@@@ -579,7 -605,7 +615,7 @@@
         if (tsc_scaling)
                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
   
-       cpu_svm_disable();
+       kvm_cpu_svm_disable();
   
         amd_pmu_disable_virt();
   }
@@@ -687,39 -713,6 +723,39 @@@ free_save_area
   
   }
   
+ +static void set_dr_intercepts(struct vcpu_svm *svm)
+ +{
+ +      struct vmcb *vmcb = svm->vmcb01.ptr;
+ +
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ +      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+ +
+ +      recalc_intercepts(svm);
+ +}
+ +
+ +static void clr_dr_intercepts(struct vcpu_svm *svm)
+ +{
+ +      struct vmcb *vmcb = svm->vmcb01.ptr;
+ +
+ +      vmcb->control.intercepts[INTERCEPT_DR] = 0;
+ +
+ +      recalc_intercepts(svm);
+ +}
+ +
   static int direct_access_msr_slot(u32 msr)
   {
         u32 i;
@@@ -990,24 -983,50 +1026,24 @@@ static void svm_disable_lbrv(struct kvm
                 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
   }
   
- -static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+ +static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
   {
         /*
- -       * If the LBR virtualization is disabled, the LBR msrs are always
- -       * kept in the vmcb01 to avoid copying them on nested guest entries.
- -       *
- -       * If nested, and the LBR virtualization is enabled/disabled, the msrs
- -       * are moved between the vmcb01 and vmcb02 as needed.
+ +       * If LBR virtualization is disabled, the LBR MSRs are always kept in
+ +       * vmcb01.  If LBR virtualization is enabled and L1 is running VMs of
+ +       * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
          */
- -      struct vmcb *vmcb =
- -              (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
- -                      svm->vmcb : svm->vmcb01.ptr;
- -
- -      switch (index) {
- -      case MSR_IA32_DEBUGCTLMSR:
- -              return vmcb->save.dbgctl;
- -      case MSR_IA32_LASTBRANCHFROMIP:
- -              return vmcb->save.br_from;
- -      case MSR_IA32_LASTBRANCHTOIP:
- -              return vmcb->save.br_to;
- -      case MSR_IA32_LASTINTFROMIP:
- -              return vmcb->save.last_excp_from;
- -      case MSR_IA32_LASTINTTOIP:
- -              return vmcb->save.last_excp_to;
- -      default:
- -              KVM_BUG(false, svm->vcpu.kvm,
- -                      "%s: Unknown MSR 0x%x", __func__, index);
- -              return 0;
- -      }
+ +      return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
+ +                                                                 svm->vmcb01.ptr;
   }
   
   void svm_update_lbrv(struct kvm_vcpu *vcpu)
   {
         struct vcpu_svm *svm = to_svm(vcpu);
- -
- -      bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
- -                                         DEBUGCTLMSR_LBR;
- -
- -      bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
- -                                    LBR_CTL_ENABLE_MASK);
- -
- -      if (unlikely(is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV)))
- -              if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
- -                      enable_lbrv = true;
+ +      bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+ +      bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
-                          (is_guest_mode(vcpu) && svm->lbrv_enabled &&
++                          (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ +                          (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
   
         if (enable_lbrv == current_enable_lbrv)
                 return;
@@@ -1118,21 -1137,23 +1154,23 @@@ static u64 svm_get_l2_tsc_multiplier(st
         return svm->tsc_ratio_msr;
   }
   
- static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
   {
         struct vcpu_svm *svm = to_svm(vcpu);
   
         svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
-       svm->vmcb->control.tsc_offset = offset;
+       svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
   }
   
- static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
   {
-       __svm_write_tsc_multiplier(multiplier);
+       preempt_disable();
+       if (to_svm(vcpu)->guest_state_loaded)
+               __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+       preempt_enable();
   }
   
- 
   /* Evaluate instruction intercepts that depend on guest CPUID features. */
   static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
                                               struct vcpu_svm *svm)
@@@ -1173,8 -1194,6 +1211,6 @@@ static inline void init_vmcb_after_set_
   
                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
- 
-               svm->v_vmload_vmsave_enabled = false;
         } else {
                 /*
                  * If hardware supports Virtual VMLOAD VMSAVE then enable it
@@@ -1218,9 -1237,10 +1254,9 @@@ static void init_vmcb(struct kvm_vcpu *
          * Guest access to VMware backdoor ports could legitimately
          * trigger #GP because of TSS I/O permission bitmap.
          * We intercept those #GP and allow access to them anyway
- -       * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
- -       * decrypt guest memory to decode the faulting instruction.
+ +       * as VMware does.
          */
- -      if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
+ +      if (enable_vmware_backdoor)
                 set_exception_intercept(svm, GP_VECTOR);
   
         svm_set_intercept(svm, INTERCEPT_INTR);
@@@ -1514,9 -1534,7 +1550,9 @@@ static void svm_vcpu_load(struct kvm_vc
   
         if (sd->current_vmcb != svm->vmcb) {
                 sd->current_vmcb = svm->vmcb;
- -              indirect_branch_prediction_barrier();
+ +
+ +              if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+ +                      indirect_branch_prediction_barrier();
         }
         if (kvm_vcpu_apicv_active(vcpu))
                 avic_vcpu_load(vcpu, cpu);
@@@ -1804,11 -1822,6 +1840,11 @@@ static void sev_post_set_cr3(struct kvm
         }
   }
   
+ +static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+ +{
+ +      return true;
+ +}
+ +
   void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
   {
         struct vcpu_svm *svm = to_svm(vcpu);
@@@ -1965,7 -1978,7 +2001,7 @@@ static void svm_sync_dirty_debug_regs(s
   {
         struct vcpu_svm *svm = to_svm(vcpu);
   
- -      if (vcpu->arch.guest_state_protected)
+ +      if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
                 return;
   
         get_debugreg(vcpu->arch.db[0], 0);
@@@ -2526,13 -2539,12 +2562,13 @@@ static int iret_interception(struct kvm
   {
         struct vcpu_svm *svm = to_svm(vcpu);
   
+ +      WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
+ +
         ++vcpu->stat.nmi_window_exits;
         svm->awaiting_iret_completion = true;
   
         svm_clr_iret_intercept(svm);
- -      if (!sev_es_guest(vcpu->kvm))
- -              svm->nmi_iret_rip = kvm_rip_read(vcpu);
+ +      svm->nmi_iret_rip = kvm_rip_read(vcpu);
   
         kvm_make_request(KVM_REQ_EVENT, vcpu);
         return 1;
@@@ -2697,13 -2709,6 +2733,13 @@@ static int dr_interception(struct kvm_v
         unsigned long val;
         int err = 0;
   
+ +      /*
+ +       * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
+ +       * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
+ +       */
+ +      if (sev_es_guest(vcpu->kvm))
+ +              return 1;
+ +
         if (vcpu->guest_debug == 0) {
                 /*
                  * No more DR vmexits; force a reload of the debug registers
@@@ -2788,7 -2793,8 +2824,8 @@@ static int svm_get_msr(struct kvm_vcpu 
   
         switch (msr_info->index) {
         case MSR_AMD64_TSC_RATIO:
-               if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+               if (!msr_info->host_initiated &&
+                   !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
                         return 1;
                 msr_info->data = svm->tsc_ratio_msr;
                 break;
@@@ -2826,19 -2832,11 +2863,19 @@@
                 msr_info->data = svm->tsc_aux;
                 break;
         case MSR_IA32_DEBUGCTLMSR:
+ +              msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
+ +              break;
         case MSR_IA32_LASTBRANCHFROMIP:
+ +              msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
+ +              break;
         case MSR_IA32_LASTBRANCHTOIP:
+ +              msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
+ +              break;
         case MSR_IA32_LASTINTFROMIP:
+ +              msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
+ +              break;
         case MSR_IA32_LASTINTTOIP:
- -              msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
+ +              msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
                 break;
         case MSR_VM_HSAVE_PA:
                 msr_info->data = svm->nested.hsave_msr;
@@@ -2938,7 -2936,7 +2975,7 @@@ static int svm_set_msr(struct kvm_vcpu 
         switch (ecx) {
         case MSR_AMD64_TSC_RATIO:
   
-               if (!svm->tsc_scaling_enabled) {
+               if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
   
                         if (!msr->host_initiated)
                                 return 1;
@@@ -2960,7 -2958,8 +2997,8 @@@
   
                 svm->tsc_ratio_msr = data;
   
-               if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+               if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+                   is_guest_mode(vcpu))
                         nested_svm_update_tsc_ratio_msr(vcpu);
   
                 break;
@@@ -3069,8 -3068,13 +3107,8 @@@
                 if (data & DEBUGCTL_RESERVED_BITS)
                         return 1;
   
- -              if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
- -                      svm->vmcb->save.dbgctl = data;
- -              else
- -                      svm->vmcb01.ptr->save.dbgctl = data;
- -
+ +              svm_get_lbr_vmcb(svm)->save.dbgctl = data;
                 svm_update_lbrv(vcpu);
- -
                 break;
         case MSR_VM_HSAVE_PA:
                 /*
@@@ -3796,19 -3800,6 +3834,19 @@@ static void svm_enable_nmi_window(struc
         if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
                 return; /* IRET will cause a vm exit */
   
+ +      /*
+ +       * SEV-ES guests are responsible for signaling when a vCPU is ready to
+ +       * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
+ +       * KVM can't intercept and single-step IRET to detect when NMIs are
+ +       * unblocked (architecturally speaking).  See SVM_VMGEXIT_NMI_COMPLETE.
+ +       *
+ +       * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
+ +       * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
+ +       * supported NAEs in the GHCB protocol.
+ +       */
+ +      if (sev_es_guest(vcpu->kvm))
+ +              return;
+ +
         if (!gif_set(svm)) {
                 if (vgif)
                         svm_set_intercept(svm, INTERCEPT_STGI);
@@@ -3958,11 -3949,12 +3996,11 @@@ static void svm_complete_interrupts(str
         svm->soft_int_injected = false;
   
         /*
- -       * If we've made progress since setting HF_IRET_MASK, we've
+ +       * If we've made progress since setting awaiting_iret_completion, we've
          * executed an IRET and can allow NMI injection.
          */
         if (svm->awaiting_iret_completion &&
- -          (sev_es_guest(vcpu->kvm) ||
- -           kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+ +          kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
                 svm->awaiting_iret_completion = false;
                 svm->nmi_masked = false;
                 kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -4032,8 -4024,14 +4070,8 @@@ static int svm_vcpu_pre_run(struct kvm_
   
   static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
   {
- -      struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
- -
- -      /*
- -       * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
- -       * can't read guest memory (dereference memslots) to decode the WRMSR.
- -       */
- -      if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
- -          nrips && control->next_rip)
+ +      if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ +          to_svm(vcpu)->vmcb->control.exit_info_1)
                 return handle_fastpath_set_msr_irqoff(vcpu);
   
         return EXIT_FASTPATH_NONE;
@@@ -4045,8 -4043,6 +4083,8 @@@ static noinstr void svm_vcpu_enter_exit
   
         guest_state_enter_irqoff();
   
+ +      amd_clear_divider();
+ +
         if (sev_es_guest(vcpu->kvm))
                 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
         else
@@@ -4248,28 -4244,37 +4286,37 @@@ static void svm_vcpu_after_set_cpuid(st
         struct vcpu_svm *svm = to_svm(vcpu);
         struct kvm_cpuid_entry2 *best;
   
-       vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
-                                   boot_cpu_has(X86_FEATURE_XSAVE) &&
-                                   boot_cpu_has(X86_FEATURE_XSAVES);
- 
-       /* Update nrips enabled cache */
-       svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
-                            guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
- 
-       svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
-       svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
- 
-       svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- 
-       svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
-                       guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+       /*
+        * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
+        * can only disable all variants of XSAVE by disallowing CR4.OSXSAVE from
+        * being set.  As a result, if the host has XSAVE and XSAVES, and the
+        * guest has XSAVE enabled, the guest can execute XSAVES without
+        * faulting.  Treat XSAVES as enabled in this case regardless of
+        * whether it's advertised to the guest so that KVM context switches
+        * XSS on VM-Enter/VM-Exit.  Failure to do so would effectively give
+        * the guest read/write access to the host's XSS.
+        */
+       if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+           boot_cpu_has(X86_FEATURE_XSAVES) &&
+           guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+               kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
   
-       svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
-                       guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
   
-       svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+       /*
+        * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+        * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
+        * SVM on Intel is bonkers and extremely unlikely to work).
+        */
+       if (!guest_cpuid_is_intel(vcpu))
+               kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
   
-       svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
   
         svm_recalc_instruction_intercepts(vcpu, svm);
   
@@@ -4690,25 -4695,16 +4737,25 @@@ static bool svm_can_emulate_instruction
          * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
          * decode garbage.
          *
- -       * Inject #UD if KVM reached this point without an instruction buffer.
- -       * In practice, this path should never be hit by a well-behaved guest,
- -       * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
- -       * is still theoretically reachable, e.g. via unaccelerated fault-like
- -       * AVIC access, and needs to be handled by KVM to avoid putting the
- -       * guest into an infinite loop.   Injecting #UD is somewhat arbitrary,
- -       * but its the least awful option given lack of insight into the guest.
+ +       * If KVM is NOT trying to simply skip an instruction, inject #UD if
+ +       * KVM reached this point without an instruction buffer.  In practice,
+ +       * this path should never be hit by a well-behaved guest, e.g. KVM
+ +       * doesn't intercept #UD or #GP for SEV guests, but this path is still
+ +       * theoretically reachable, e.g. via unaccelerated fault-like AVIC
+ +       * access, and needs to be handled by KVM to avoid putting the guest
+ +       * into an infinite loop.   Injecting #UD is somewhat arbitrary, but
+ +       * it's the least awful option given lack of insight into the guest.
+ +       *
+ +       * If KVM is trying to skip an instruction, simply resume the guest.
+ +       * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
+ +       * will attempt to re-inject the INT3/INTO and skip the instruction.
+ +       * In that scenario, retrying the INT3/INTO and hoping the guest will
+ +       * make forward progress is the only option that has a chance of
+ +       * success (and in practice it will work the vast majority of the time).
          */
         if (unlikely(!insn)) {
- -              kvm_queue_exception(vcpu, UD_VECTOR);
+ +              if (!(emul_type & EMULTYPE_SKIP))
+ +                      kvm_queue_exception(vcpu, UD_VECTOR);
                 return false;
         }
   
@@@ -4866,7 -4862,6 +4913,7 @@@ static struct kvm_x86_ops svm_x86_ops _
         .set_segment = svm_set_segment,
         .get_cpl = svm_get_cpl,
         .get_cs_db_l_bits = svm_get_cs_db_l_bits,
+ +      .is_valid_cr0 = svm_is_valid_cr0,
         .set_cr0 = svm_set_cr0,
         .post_set_cr3 = sev_post_set_cr3,
         .is_valid_cr4 = svm_is_valid_cr4,
@@@ -5160,11 -5155,9 +5207,11 @@@ static __init int svm_hardware_setup(vo
   
         svm_adjust_mmio_mask();
   
+ +      nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+ +
         /*
          * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
- -       * may be modified by svm_adjust_mmio_mask()).
+ +       * may be modified by svm_adjust_mmio_mask()), as well as nrips.
          */
         sev_hardware_setup();
   
@@@ -5176,6 -5169,11 +5223,6 @@@
                         goto err;
         }
   
- -      if (nrips) {
- -              if (!boot_cpu_has(X86_FEATURE_NRIPS))
- -                      nrips = false;
- -      }
- -
         enable_apicv = avic = avic && avic_hardware_setup();
   
         if (!enable_apicv) {
@@@ -5258,6 -5256,13 +5305,13 @@@ static struct kvm_x86_init_ops svm_init
         .pmu_ops = &amd_pmu_ops,
   };
   
+ static void __svm_exit(void)
+ {
+       kvm_x86_vendor_exit();
+ 
+       cpu_emergency_unregister_virt_callback(svm_emergency_disable);
+ }
+ 
   static int __init svm_init(void)
   {
         int r;
@@@ -5271,6 -5276,8 +5325,8 @@@
         if (r)
                 return r;
   
+       cpu_emergency_register_virt_callback(svm_emergency_disable);
+ 
         /*
          * Common KVM initialization _must_ come last, after this, /dev/kvm is
          * exposed to userspace!
@@@ -5283,14 -5290,14 +5339,14 @@@
         return 0;
   
   err_kvm_init:
-       kvm_x86_vendor_exit();
+       __svm_exit();
         return r;
   }
   
   static void __exit svm_exit(void)
   {
         kvm_exit();
-       kvm_x86_vendor_exit();
+       __svm_exit();
   }
   
   module_init(svm_init)
diff --combined arch/x86/kvm/svm/svm.h

index ef508042a5536aedd6f45d48ab2bf3651eb9f67e,06400cfe2244b01a6736c28030f777387859d116..f41253958357b7730b280fe17e990c91e0b23a05
--- 1/arch/x86/kvm/svm/svm.h
--- 2/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@@ -22,6 -22,7 +22,7 @@@
   #include <asm/svm.h>
   #include <asm/sev-common.h>
   
+ #include "cpuid.h"
   #include "kvm_cache_regs.h"
   
   #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
@@@ -33,7 -34,6 +34,7 @@@
   #define MSRPM_OFFSETS 32
   extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
   extern bool npt_enabled;
+ +extern int nrips;
   extern int vgif;
   extern bool intercept_smi;
   extern bool x2avic_enabled;
@@@ -191,12 -191,10 +192,12 @@@ struct vcpu_sev_es_state 
         /* SEV-ES support */
         struct sev_es_save_area *vmsa;
         struct ghcb *ghcb;
+ +      u8 valid_bitmap[16];
         struct kvm_host_map ghcb_map;
         bool received_first_sipi;
   
         /* SEV-ES scratch area support */
+ +      u64 sw_scratch;
         void *ghcb_sa;
         u32 ghcb_sa_len;
         bool ghcb_sa_sync;
@@@ -261,16 -259,6 +262,6 @@@ struct vcpu_svm 
         unsigned long soft_int_next_rip;
         bool soft_int_injected;
   
-       /* optional nested SVM features that are enabled for this guest  */
-       bool nrips_enabled                : 1;
-       bool tsc_scaling_enabled          : 1;
-       bool v_vmload_vmsave_enabled      : 1;
-       bool lbrv_enabled                 : 1;
-       bool pause_filter_enabled         : 1;
-       bool pause_threshold_enabled      : 1;
-       bool vgif_enabled                 : 1;
-       bool vnmi_enabled                 : 1;
- 
         u32 ldr_reg;
         u32 dfr_reg;
         struct page *avic_backing_page;
@@@ -407,6 -395,48 +398,6 @@@ static inline bool vmcb12_is_intercept(
         return test_bit(bit, (unsigned long *)&control->intercepts);
   }
   
- -static inline void set_dr_intercepts(struct vcpu_svm *svm)
- -{
- -      struct vmcb *vmcb = svm->vmcb01.ptr;
- -
- -      if (!sev_es_guest(svm->vcpu.kvm)) {
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
- -      }
- -
- -      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
- -      vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
- -
- -      recalc_intercepts(svm);
- -}
- -
- -static inline void clr_dr_intercepts(struct vcpu_svm *svm)
- -{
- -      struct vmcb *vmcb = svm->vmcb01.ptr;
- -
- -      vmcb->control.intercepts[INTERCEPT_DR] = 0;
- -
- -      /* DR7 access must remain intercepted for an SEV-ES guest */
- -      if (sev_es_guest(svm->vcpu.kvm)) {
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
- -              vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
- -      }
- -
- -      recalc_intercepts(svm);
- -}
- -
   static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
   {
         struct vmcb *vmcb = svm->vmcb01.ptr;
@@@ -452,7 -482,8 +443,8 @@@ static inline bool svm_is_intercept(str
   
   static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
   {
-       return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+       return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
+              (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
   }
   
   static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
@@@ -503,7 -534,7 +495,7 @@@ static inline bool nested_npt_enabled(s
   
   static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
   {
-       return svm->vnmi_enabled &&
+       return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) &&
                (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
   }
   
@@@ -619,7 -650,7 +611,7 @@@ int nested_svm_check_exception(struct v
                                bool has_error_code, u32 error_code);
   int nested_svm_exit_special(struct vcpu_svm *svm);
   void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
- void __svm_write_tsc_multiplier(u64 multiplier);
+ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu);
   void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
                                        struct vmcb_control_area *control);
   void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
@@@ -705,28 -736,4 +697,28 @@@ void sev_es_unmap_ghcb(struct vcpu_svm 
   void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
   void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
   
+ +#define DEFINE_KVM_GHCB_ACCESSORS(field)                                              \
+ +      static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
+ +      {                                                                       \
+ +              return test_bit(GHCB_BITMAP_IDX(field),                         \
+ +                              (unsigned long *)&svm->sev_es.valid_bitmap);    \
+ +      }                                                                       \
+ +                                                                              \
+ +      static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \
+ +      {                                                                       \
+ +              return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \
+ +      }                                                                       \
+ +
+ +DEFINE_KVM_GHCB_ACCESSORS(cpl)
+ +DEFINE_KVM_GHCB_ACCESSORS(rax)
+ +DEFINE_KVM_GHCB_ACCESSORS(rcx)
+ +DEFINE_KVM_GHCB_ACCESSORS(rdx)
+ +DEFINE_KVM_GHCB_ACCESSORS(rbx)
+ +DEFINE_KVM_GHCB_ACCESSORS(rsi)
+ +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
+ +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
+ +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
+ +DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
+ +DEFINE_KVM_GHCB_ACCESSORS(xcr0)
+ +
   #endif
diff --combined arch/x86/kvm/vmx/vmx.c

index c0236dd4d8924665dc0cb4c923d2bd1982369248,e9386afd1521fc250ebb14085b16d22004f4f7b4..af73d5d54ec8e8a6a07e8e5f700ce1ee94da9a39
--- 1/arch/x86/kvm/vmx/vmx.c
--- 2/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@@ -41,13 -41,12 +41,12 @@@
   #include <asm/idtentry.h>
   #include <asm/io.h>
   #include <asm/irq_remapping.h>
- #include <asm/kexec.h>
+ #include <asm/reboot.h>
   #include <asm/perf_event.h>
   #include <asm/mmu_context.h>
   #include <asm/mshyperv.h>
   #include <asm/mwait.h>
   #include <asm/spec-ctrl.h>
- #include <asm/virtext.h>
   #include <asm/vmx.h>
   
   #include "capabilities.h"
@@@ -237,9 -236,6 +236,6 @@@ static const struct 
   #define L1D_CACHE_ORDER 4
   static void *vmx_l1d_flush_pages;
   
- /* Control for disabling CPU Fill buffer clear */
- static bool __read_mostly vmx_fb_clear_ctrl_available;
- 
   static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
   {
         struct page *page;
@@@ -255,14 -251,9 +251,9 @@@
                 return 0;
         }
   
-       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
-               u64 msr;
- 
-               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
-               if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
-                       l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
-                       return 0;
-               }
+       if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+               l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+               return 0;
         }
   
         /* If set to auto use the default l1tf mitigation method */
@@@ -366,22 -357,9 +357,9 @@@ static int vmentry_l1d_flush_set(const 
   static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
   {
         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
-               return sprintf(s, "???\n");
+               return sysfs_emit(s, "???\n");
   
-       return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
- }
- 
- static void vmx_setup_fb_clear_ctrl(void)
- {
-       u64 msr;
- 
-       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
-           !boot_cpu_has_bug(X86_BUG_MDS) &&
-           !boot_cpu_has_bug(X86_BUG_TAA)) {
-               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
-               if (msr & ARCH_CAP_FB_CLEAR_CTRL)
-                       vmx_fb_clear_ctrl_available = true;
-       }
+       return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
   }
   
   static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
@@@ -409,7 -387,9 +387,9 @@@ static __always_inline void vmx_enable_
   
   static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
   {
-       vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+       vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+                               !boot_cpu_has_bug(X86_BUG_MDS) &&
+                               !boot_cpu_has_bug(X86_BUG_TAA);
   
         /*
          * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
@@@ -441,23 -421,13 +421,23 @@@ do {                                    
         pr_warn_ratelimited(fmt);       \
   } while (0)
   
- -void vmread_error(unsigned long field, bool fault)
+ +noinline void vmread_error(unsigned long field)
   {
- -      if (fault)
+ +      vmx_insn_failed("vmread failed: field=%lx\n", field);
+ +}
+ +
+ +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+ +noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
+ +{
+ +      if (fault) {
                 kvm_spurious_fault();
- -      else
- -              vmx_insn_failed("vmread failed: field=%lx\n", field);
+ +      } else {
+ +              instrumentation_begin();
+ +              vmread_error(field);
+ +              instrumentation_end();
+ +      }
   }
+ +#endif
   
   noinline void vmwrite_error(unsigned long field, unsigned long value)
   {
@@@ -754,17 -724,51 +734,51 @@@ static int vmx_set_guest_uret_msr(struc
         return ret;
   }
   
- #ifdef CONFIG_KEXEC_CORE
- static void crash_vmclear_local_loaded_vmcss(void)
+ /*
+  * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+  *
+  * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+  * atomically track post-VMXON state, e.g. this may be called in NMI context.
+  * Eat all faults as all other faults on VMXOFF are mode related, i.e.
+  * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+  * magically in RM, VM86, compat mode, or at CPL>0.
+  */
+ static int kvm_cpu_vmxoff(void)
+ {
+       asm_volatile_goto("1: vmxoff\n\t"
+                         _ASM_EXTABLE(1b, %l[fault])
+                         ::: "cc", "memory" : fault);
+ 
+       cr4_clear_bits(X86_CR4_VMXE);
+       return 0;
+ 
+ fault:
+       cr4_clear_bits(X86_CR4_VMXE);
+       return -EIO;
+ }
+ 
+ static void vmx_emergency_disable(void)
   {
         int cpu = raw_smp_processor_id();
         struct loaded_vmcs *v;
   
+       kvm_rebooting = true;
+ 
+       /*
+        * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+        * set in task context.  If this races with VMX being disabled by an NMI,
+        * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
+        * kvm_rebooting being set.
+        */
+       if (!(__read_cr4() & X86_CR4_VMXE))
+               return;
+ 
         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                             loaded_vmcss_on_cpu_link)
                 vmcs_clear(v->vmcs);
+ 
+       kvm_cpu_vmxoff();
   }
- #endif /* CONFIG_KEXEC_CORE */
   
   static void __loaded_vmcs_clear(void *arg)
   {
@@@ -1513,11 -1517,6 +1527,11 @@@ void vmx_set_rflags(struct kvm_vcpu *vc
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         unsigned long old_rflags;
   
+ +      /*
+ +       * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+ +       * is an unrestricted guest in order to mark L2 as needing emulation
+ +       * if L1 runs L2 as a restricted guest.
+ +       */
         if (is_unrestricted_guest(vcpu)) {
                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
                 vmx->rflags = rflags;
@@@ -1899,25 -1898,14 +1913,14 @@@ u64 vmx_get_l2_tsc_multiplier(struct kv
         return kvm_caps.default_tsc_scaling_ratio;
   }
   
- static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
- {
-       vmcs_write64(TSC_OFFSET, offset);
- }
- 
- static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
   {
-       vmcs_write64(TSC_MULTIPLIER, multiplier);
+       vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
   }
   
- /*
-  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
-  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
-  * all guests if the "nested" module option is off, and can also be disabled
-  * for a single guest by disabling its VMX cpuid bit.
-  */
- bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+ static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
   {
-       return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
+       vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
   }
   
   /*
@@@ -2047,7 -2035,7 +2050,7 @@@ static int vmx_get_msr(struct kvm_vcpu 
                         [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
                 break;
         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
-               if (!nested_vmx_allowed(vcpu))
+               if (!guest_can_use(vcpu, X86_FEATURE_VMX))
                         return 1;
                 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
                                     &msr_info->data))
@@@ -2355,7 -2343,7 +2358,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                 if (!msr_info->host_initiated)
                         return 1; /* they are read-only */
-               if (!nested_vmx_allowed(vcpu))
+               if (!guest_can_use(vcpu, X86_FEATURE_VMX))
                         return 1;
                 return vmx_set_vmx_msr(vcpu, msr_index, data);
         case MSR_IA32_RTIT_CTL:
@@@ -2729,11 -2717,11 +2732,11 @@@ static int setup_vmcs_config(struct vmc
         return 0;
   }
   
- static bool kvm_is_vmx_supported(void)
+ static bool __kvm_is_vmx_supported(void)
   {
-       int cpu = raw_smp_processor_id();
+       int cpu = smp_processor_id();
   
-       if (!cpu_has_vmx()) {
+       if (!(cpuid_ecx(1) & feature_bit(VMX))) {
                 pr_err("VMX not supported by CPU %d\n", cpu);
                 return false;
         }
@@@ -2747,13 -2735,24 +2750,24 @@@
         return true;
   }
   
+ static bool kvm_is_vmx_supported(void)
+ {
+       bool supported;
+ 
+       migrate_disable();
+       supported = __kvm_is_vmx_supported();
+       migrate_enable();
+ 
+       return supported;
+ }
+ 
   static int vmx_check_processor_compat(void)
   {
         int cpu = raw_smp_processor_id();
         struct vmcs_config vmcs_conf;
         struct vmx_capability vmx_cap;
   
-       if (!kvm_is_vmx_supported())
+       if (!__kvm_is_vmx_supported())
                 return -EIO;
   
         if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
@@@ -2833,7 -2832,7 +2847,7 @@@ static void vmx_hardware_disable(void
   {
         vmclear_local_loaded_vmcss();
   
-       if (cpu_vmxoff())
+       if (kvm_cpu_vmxoff())
                 kvm_spurious_fault();
   
         hv_reset_evmcs();
@@@ -3052,15 -3051,6 +3066,15 @@@ static void enter_rmode(struct kvm_vcp
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
   
+ +      /*
+ +       * KVM should never use VM86 to virtualize Real Mode when L2 is active,
+ +       * as using VM86 is unnecessary if unrestricted guest is enabled, and
+ +       * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
+ +       * should VM-Fail and KVM should reject userspace attempts to stuff
+ +       * CR0.PG=0 when L2 is active.
+ +       */
+ +      WARN_ON_ONCE(is_guest_mode(vcpu));
+ +
         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
@@@ -3071,6 -3061,13 +3085,6 @@@
   
         vmx->rmode.vm86_active = 1;
   
- -      /*
- -       * Very old userspace does not call KVM_SET_TSS_ADDR before entering
- -       * vcpu. Warn the user that an update is overdue.
- -       */
- -      if (!kvm_vmx->tss_addr)
- -              pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
- -
         vmx_segment_cache_clear(vmx);
   
         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
@@@ -3243,17 -3240,6 +3257,17 @@@ void ept_save_pdptrs(struct kvm_vcpu *v
   #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
                           CPU_BASED_CR3_STORE_EXITING)
   
+ +static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+ +{
+ +      if (is_guest_mode(vcpu))
+ +              return nested_guest_cr0_valid(vcpu, cr0);
+ +
+ +      if (to_vmx(vcpu)->nested.vmxon)
+ +              return nested_host_cr0_valid(vcpu, cr0);
+ +
+ +      return true;
+ +}
+ +
   void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -3263,7 -3249,7 +3277,7 @@@
         old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
   
         hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
- -      if (is_unrestricted_guest(vcpu))
+ +      if (enable_unrestricted_guest)
                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
         else {
                 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
@@@ -3291,7 -3277,7 +3305,7 @@@
         }
   #endif
   
- -      if (enable_ept && !is_unrestricted_guest(vcpu)) {
+ +      if (enable_ept && !enable_unrestricted_guest) {
                 /*
                  * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
                  * the below code _enables_ CR3 exiting, vmx_cache_reg() will
@@@ -3343,7 -3329,7 +3357,7 @@@
         vmx->emulation_required = vmx_emulation_required(vcpu);
   }
   
- -static int vmx_get_max_tdp_level(void)
+ +static int vmx_get_max_ept_level(void)
   {
         if (cpu_has_vmx_ept_5levels())
                 return 5;
@@@ -3422,7 -3408,7 +3436,7 @@@ void vmx_set_cr4(struct kvm_vcpu *vcpu
          * this bit, even if host CR4.MCE == 0.
          */
         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
- -      if (is_unrestricted_guest(vcpu))
+ +      if (enable_unrestricted_guest)
                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
         else if (vmx->rmode.vm86_active)
                 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@@ -3442,7 -3428,7 +3456,7 @@@
         vcpu->arch.cr4 = cr4;
         kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
   
- -      if (!is_unrestricted_guest(vcpu)) {
+ +      if (!enable_unrestricted_guest) {
                 if (enable_ept) {
                         if (!is_paging(vcpu)) {
                                 hw_cr4 &= ~X86_CR4_PAE;
@@@ -4546,16 -4532,19 +4560,19 @@@ vmx_adjust_secondary_exec_control(struc
    * based on a single guest CPUID bit, with a dedicated feature bit.  This also
    * verifies that the control is actually supported by KVM and hardware.
    */
- #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
- ({                                                                     \
-       bool __enabled;                                                  \
-                                                                        \
-       if (cpu_has_vmx_##name()) {                                      \
-               __enabled = guest_cpuid_has(&(vmx)->vcpu,                \
-                                           X86_FEATURE_##feat_name);    \
-               vmx_adjust_secondary_exec_control(vmx, exec_control,     \
-                       SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
-       }                                                                \
+ #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting)   \
+ ({                                                                                            \
+       struct kvm_vcpu *__vcpu = &(vmx)->vcpu;                                                 \
+       bool __enabled;                                                                         \
+                                                                                               \
+       if (cpu_has_vmx_##name()) {                                                             \
+               if (kvm_is_governed_feature(X86_FEATURE_##feat_name))                           \
+                       __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name);             \
+               else                                                                            \
+                       __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name);           \
+               vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+                                                 __enabled, exiting);                          \
+       }                                                                                       \
   })
   
   /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
@@@ -4615,19 -4604,7 +4632,7 @@@ static u32 vmx_secondary_exec_control(s
         if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
   
-       if (cpu_has_vmx_xsaves()) {
-               /* Exposing XSAVES only when XSAVE is exposed */
-               bool xsaves_enabled =
-                       boot_cpu_has(X86_FEATURE_XSAVE) &&
-                       guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
-                       guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
- 
-               vcpu->arch.xsaves_enabled = xsaves_enabled;
- 
-               vmx_adjust_secondary_exec_control(vmx, &exec_control,
-                                                 SECONDARY_EXEC_XSAVES,
-                                                 xsaves_enabled, false);
-       }
+       vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
   
         /*
          * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
@@@ -4646,6 -4623,7 +4651,7 @@@
                                                   SECONDARY_EXEC_ENABLE_RDTSCP,
                                                   rdpid_or_rdtscp_enabled, false);
         }
+ 
         vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
   
         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@@ -4679,8 -4657,7 +4685,8 @@@ static int vmx_alloc_ipiv_pid_table(str
         if (kvm_vmx->pid_table)
                 return 0;
   
- -      pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
+ +      pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ +                          vmx_get_pid_table_order(kvm));
         if (!pages)
                 return -ENOMEM;
   
@@@ -5393,11 -5370,18 +5399,11 @@@ static int handle_set_cr0(struct kvm_vc
                 val = (val & ~vmcs12->cr0_guest_host_mask) |
                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
   
- -              if (!nested_guest_cr0_valid(vcpu, val))
- -                      return 1;
- -
                 if (kvm_set_cr0(vcpu, val))
                         return 1;
                 vmcs_writel(CR0_READ_SHADOW, orig_val);
                 return 0;
         } else {
- -              if (to_vmx(vcpu)->nested.vmxon &&
- -                  !nested_host_cr0_valid(vcpu, val))
- -                      return 1;
- -
                 return kvm_set_cr0(vcpu, val);
         }
   }
@@@ -6789,10 -6773,8 +6795,10 @@@ static void vmx_set_apic_access_page_ad
         vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
         read_unlock(&vcpu->kvm->mmu_lock);
   
- -      vmx_flush_tlb_current(vcpu);
- -
+ +      /*
+ +       * No need for a manual TLB flush at this point, KVM has already done a
+ +       * flush if there were SPTEs pointing at the previous page.
+ +       */
   out:
         /*
          * Do not pin apic access page in memory, the MMU notifier
@@@ -7238,20 -7220,13 +7244,20 @@@ static noinstr void vmx_vcpu_enter_exit
                                    flags);
   
         vcpu->arch.cr2 = native_read_cr2();
+ +      vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+ +
+ +      vmx->idt_vectoring_info = 0;
   
         vmx_enable_fb_clear(vmx);
   
- -      if (unlikely(vmx->fail))
+ +      if (unlikely(vmx->fail)) {
                 vmx->exit_reason.full = 0xdead;
- -      else
- -              vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ +              goto out;
+ +      }
+ +
+ +      vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ +      if (likely(!vmx->exit_reason.failed_vmentry))
+ +              vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
   
         if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
             is_nmi(vmx_get_intr_info(vcpu))) {
@@@ -7260,7 -7235,6 +7266,7 @@@
                 kvm_after_interrupt(vcpu);
         }
   
+ +out:
         guest_state_exit_irqoff();
   }
   
@@@ -7382,6 -7356,8 +7388,6 @@@ static fastpath_t vmx_vcpu_run(struct k
         loadsegment(es, __USER_DS);
   #endif
   
- -      vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
- -
         pt_guest_exit(vmx);
   
         kvm_load_host_xsave_state(vcpu);
@@@ -7398,12 -7374,17 +7404,12 @@@
                 vmx->nested.nested_run_pending = 0;
         }
   
- -      vmx->idt_vectoring_info = 0;
- -
         if (unlikely(vmx->fail))
                 return EXIT_FASTPATH_NONE;
   
         if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
                 kvm_machine_check();
   
- -      if (likely(!vmx->exit_reason.failed_vmentry))
- -              vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- -
         trace_kvm_exit(vcpu, KVM_ISA_VMX);
   
         if (unlikely(vmx->exit_reason.failed_vmentry))
@@@ -7747,8 -7728,16 +7753,16 @@@ static void vmx_vcpu_after_set_cpuid(st
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
   
-       /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
-       vcpu->arch.xsaves_enabled = false;
+       /*
+        * XSAVES is effectively enabled if and only if XSAVE is also exposed
+        * to the guest.  XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+        * set if and only if XSAVE is supported.
+        */
+       if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+           guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+               kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
+ 
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
   
         vmx_setup_uret_msrs(vmx);
   
@@@ -7756,7 -7745,7 +7770,7 @@@
                 vmcs_set_secondary_exec_control(vmx,
                                                 vmx_secondary_exec_control(vmx));
   
-       if (nested_vmx_allowed(vcpu))
+       if (guest_can_use(vcpu, X86_FEATURE_VMX))
                 vmx->msr_ia32_feature_control_valid_bits |=
                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
@@@ -7765,7 -7754,7 +7779,7 @@@
                         ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
                           FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
   
-       if (nested_vmx_allowed(vcpu))
+       if (guest_can_use(vcpu, X86_FEATURE_VMX))
                 nested_vmx_cr_fixed1_bits_update(vcpu);
   
         if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
@@@ -8228,7 -8217,6 +8242,7 @@@ static struct kvm_x86_ops vmx_x86_ops _
         .set_segment = vmx_set_segment,
         .get_cpl = vmx_get_cpl,
         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ +      .is_valid_cr0 = vmx_is_valid_cr0,
         .set_cr0 = vmx_set_cr0,
         .is_valid_cr4 = vmx_is_valid_cr4,
         .set_cr4 = vmx_set_cr4,
@@@ -8522,7 -8510,7 +8536,7 @@@ static __init int hardware_setup(void
          */
         vmx_setup_me_spte_mask();
   
- -      kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+ +      kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
                           ept_caps_to_lpage_level(vmx_capability.ept));
   
         /*
@@@ -8618,10 -8606,8 +8632,8 @@@ static void __vmx_exit(void
   {
         allow_smaller_maxphyaddr = false;
   
- #ifdef CONFIG_KEXEC_CORE
-       RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
-       synchronize_rcu();
- #endif
+       cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
+ 
         vmx_cleanup_l1d_flush();
   }
   
@@@ -8662,18 -8648,14 +8674,14 @@@ static int __init vmx_init(void
         if (r)
                 goto err_l1d_flush;
   
-       vmx_setup_fb_clear_ctrl();
- 
         for_each_possible_cpu(cpu) {
                 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
   
                 pi_init_cpu(cpu);
         }
   
- #ifdef CONFIG_KEXEC_CORE
-       rcu_assign_pointer(crash_vmclear_loaded_vmcss,
-                          crash_vmclear_local_loaded_vmcss);
- #endif
+       cpu_emergency_register_virt_callback(vmx_emergency_disable);
+ 
         vmx_check_vmcs12_offsets();
   
         /*
diff --combined arch/x86/kvm/x86.c

index 94fa36ee073c2434159d91c40aef9a9a5b9c7ba9,7849ea0b0bf7ba9dbb5b1a784c6e60ec2845e914..0b38a046690eb6bfe21db7fddd8ba76322aec843
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -237,6 -237,9 +237,9 @@@ EXPORT_SYMBOL_GPL(enable_apicv)
   u64 __read_mostly host_xss;
   EXPORT_SYMBOL_GPL(host_xss);
   
+ u64 __read_mostly host_arch_capabilities;
+ EXPORT_SYMBOL_GPL(host_arch_capabilities);
+ 
   const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
         KVM_GENERIC_VM_STATS(),
         STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@@ -906,22 -909,6 +909,22 @@@ int load_pdptrs(struct kvm_vcpu *vcpu, 
   }
   EXPORT_SYMBOL_GPL(load_pdptrs);
   
+ +static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+ +{
+ +#ifdef CONFIG_X86_64
+ +      if (cr0 & 0xffffffff00000000UL)
+ +              return false;
+ +#endif
+ +
+ +      if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ +              return false;
+ +
+ +      if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ +              return false;
+ +
+ +      return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
+ +}
+ +
   void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
   {
         /*
@@@ -968,13 -955,20 +971,13 @@@ int kvm_set_cr0(struct kvm_vcpu *vcpu, 
   {
         unsigned long old_cr0 = kvm_read_cr0(vcpu);
   
- -      cr0 |= X86_CR0_ET;
- -
- -#ifdef CONFIG_X86_64
- -      if (cr0 & 0xffffffff00000000UL)
+ +      if (!kvm_is_valid_cr0(vcpu, cr0))
                 return 1;
- -#endif
- -
- -      cr0 &= ~CR0_RESERVED_BITS;
   
- -      if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
- -              return 1;
+ +      cr0 |= X86_CR0_ET;
   
- -      if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
- -              return 1;
+ +      /* Writes to CR0 reserved bits are ignored, even on Intel. */
+ +      cr0 &= ~CR0_RESERVED_BITS;
   
   #ifdef CONFIG_X86_64
         if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
@@@ -1021,7 -1015,7 +1024,7 @@@ void kvm_load_guest_xsave_state(struct 
                 if (vcpu->arch.xcr0 != host_xcr0)
                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
   
-               if (vcpu->arch.xsaves_enabled &&
+               if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
                     vcpu->arch.ia32_xss != host_xss)
                         wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
         }
@@@ -1052,7 -1046,7 +1055,7 @@@ void kvm_load_host_xsave_state(struct k
                 if (vcpu->arch.xcr0 != host_xcr0)
                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
   
-               if (vcpu->arch.xsaves_enabled &&
+               if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
                     vcpu->arch.ia32_xss != host_xss)
                         wrmsrl(MSR_IA32_XSS, host_xss);
         }
@@@ -1616,16 -1610,11 +1619,11 @@@ static bool kvm_is_immutable_feature_ms
          ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
          ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
          ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- -       ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+ +       ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
   
   static u64 kvm_get_arch_capabilities(void)
   {
-       u64 data = 0;
- 
-       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
-               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
-               data &= KVM_SUPPORTED_ARCH_CAP;
-       }
+       u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
   
         /*
          * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@@ -1673,9 -1662,6 +1671,9 @@@
                  */
         }
   
+ +      if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ +              data |= ARCH_CAP_GDS_NO;
+ +
         return data;
   }
   
@@@ -2184,8 -2170,6 +2182,8 @@@ fastpath_t handle_fastpath_set_msr_irqo
         u64 data;
         fastpath_t ret = EXIT_FASTPATH_NONE;
   
+ +      kvm_vcpu_srcu_read_lock(vcpu);
+ +
         switch (msr) {
         case APIC_BASE_MSR + (APIC_ICR >> 4):
                 data = kvm_read_edx_eax(vcpu);
@@@ -2208,8 -2192,6 +2206,8 @@@
         if (ret != EXIT_FASTPATH_NONE)
                 trace_kvm_msr_write(msr, data);
   
+ +      kvm_vcpu_srcu_read_unlock(vcpu);
+ +
         return ret;
   }
   EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
@@@ -2631,7 -2613,7 +2629,7 @@@ static void kvm_vcpu_write_tsc_offset(s
         else
                 vcpu->arch.tsc_offset = l1_offset;
   
-       static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
+       static_call(kvm_x86_write_tsc_offset)(vcpu);
   }
   
   static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
@@@ -2647,8 -2629,7 +2645,7 @@@
                 vcpu->arch.tsc_scaling_ratio = l1_multiplier;
   
         if (kvm_caps.has_tsc_control)
-               static_call(kvm_x86_write_tsc_multiplier)(
-                       vcpu, vcpu->arch.tsc_scaling_ratio);
+               static_call(kvm_x86_write_tsc_multiplier)(vcpu);
   }
   
   static inline bool kvm_check_tsc_unstable(void)
@@@ -4665,7 -4646,6 +4662,6 @@@ static int kvm_x86_dev_get_attr(struct 
                 return 0;
         default:
                 return -ENXIO;
-               break;
         }
   }
   
@@@ -6532,7 -6512,7 +6528,7 @@@ static void kvm_free_msr_filter(struct 
   static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
                               struct kvm_msr_filter_range *user_range)
   {
-       unsigned long *bitmap = NULL;
+       unsigned long *bitmap;
         size_t bitmap_size;
   
         if (!user_range->nmsrs)
@@@ -8245,11 -8225,6 +8241,6 @@@ static bool emulator_get_cpuid(struct x
         return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
   }
   
- static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
- {
-       return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
- }
- 
   static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
   {
         return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
@@@ -8351,7 -8326,6 +8342,6 @@@ static const struct x86_emulate_ops emu
         .fix_hypercall       = emulator_fix_hypercall,
         .intercept           = emulator_intercept,
         .get_cpuid           = emulator_get_cpuid,
-       .guest_has_long_mode = emulator_guest_has_long_mode,
         .guest_has_movbe     = emulator_guest_has_movbe,
         .guest_has_fxsr      = emulator_guest_has_fxsr,
         .guest_has_rdpid     = emulator_guest_has_rdpid,
@@@ -9172,7 -9146,7 +9162,7 @@@ static int kvmclock_cpu_down_prep(unsig
   static void tsc_khz_changed(void *data)
   {
         struct cpufreq_freqs *freq = data;
-       unsigned long khz = 0;
+       unsigned long khz;
   
         WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
   
@@@ -9512,6 -9486,9 +9502,9 @@@ static int __kvm_x86_vendor_init(struc
   
         kvm_init_pmu_capability(ops->pmu_ops);
   
+       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+ 
         r = ops->hardware_setup();
         if (r != 0)
                 goto out_mmu_exit;
@@@ -10219,13 -10196,9 +10212,13 @@@ static int kvm_check_and_inject_events(
                 if (r < 0)
                         goto out;
                 if (r) {
- -                      kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- -                      static_call(kvm_x86_inject_irq)(vcpu, false);
- -                      WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ +                      int irq = kvm_cpu_get_interrupt(vcpu);
+ +
+ +                      if (!WARN_ON_ONCE(irq == -1)) {
+ +                              kvm_queue_interrupt(vcpu, irq, false);
+ +                              static_call(kvm_x86_inject_irq)(vcpu, false);
+ +                              WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ +                      }
                 }
                 if (kvm_cpu_has_injectable_intr(vcpu))
                         static_call(kvm_x86_enable_irq_window)(vcpu);
@@@ -11111,12 -11084,17 +11104,17 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                         r = -EINTR;
                         goto out;
                 }
+ 
                 /*
-                * It should be impossible for the hypervisor timer to be in
-                * use before KVM has ever run the vCPU.
+                * Don't bother switching APIC timer emulation from the
+                * hypervisor timer to the software timer, the only way for the
+                * APIC timer to be active is if userspace stuffed vCPU state,
+                * i.e. put the vCPU into a nonsensical state.  Only an INIT
+                * will transition the vCPU out of UNINITIALIZED (without more
+                * state stuffing from userspace), which will reset the local
+                * APIC and thus cancel the timer or drop the IRQ (if the timer
+                * already expired).
                  */
-               WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
- 
                 kvm_vcpu_srcu_read_unlock(vcpu);
                 kvm_vcpu_block(vcpu);
                 kvm_vcpu_srcu_read_lock(vcpu);
@@@ -11480,8 -11458,7 +11478,8 @@@ static bool kvm_is_valid_sregs(struct k
                         return false;
         }
   
- -      return kvm_is_valid_cr4(vcpu, sregs->cr4);
+ +      return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
+ +             kvm_is_valid_cr0(vcpu, sregs->cr0);
   }
   
   static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
@@@ -11798,22 -11775,15 +11796,22 @@@ static int sync_regs(struct kvm_vcpu *v
                 __set_regs(vcpu, &vcpu->run->s.regs.regs);
                 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
         }
+ +
         if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
- -              if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
+ +              struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
+ +
+ +              if (__set_sregs(vcpu, &sregs))
                         return -EINVAL;
+ +
                 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
         }
+ +
         if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
- -              if (kvm_vcpu_ioctl_x86_set_vcpu_events(
- -                              vcpu, &vcpu->run->s.regs.events))
+ +              struct kvm_vcpu_events events = vcpu->run->s.regs.events;
+ +
+ +              if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events))
                         return -EINVAL;
+ +
                 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
         }
   
@@@ -12779,7 -12749,7 +12777,7 @@@ static void kvm_mmu_slot_apply_flags(st
                  * See is_writable_pte() for more details (the case involving
                  * access-tracked SPTEs is particularly relevant).
                  */
- -              kvm_arch_flush_remote_tlbs_memslot(kvm, new);
+ +              kvm_flush_remote_tlbs_memslot(kvm, new);
         }
   }
   
@@@ -13213,7 -13183,7 +13211,7 @@@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoher
   
   bool kvm_arch_has_irq_bypass(void)
   {
- -      return true;
+ +      return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
   }
   
   int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
author	Paolo Bonzini <[email protected]>
	Thu, 31 Aug 2023 17:36:33 +0000 (13:36 -0400)
committer	Paolo Bonzini <[email protected]>
	Thu, 31 Aug 2023 17:36:33 +0000 (13:36 -0400)
		1	2
arch/x86/include/asm/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/cpuid.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/lapic.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/mmu/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/svm/svm.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/svm/svm.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history