Merge tag 'kvm-x86-vmx-6.12' of https://github.com/kvm-x86/linux into HEAD
author Paolo Bonzini <[email protected]>
Sat, 14 Sep 2024 13:56:06 +0000 (09:56 -0400)
committer Paolo Bonzini <[email protected]>
Tue, 17 Sep 2024 16:41:23 +0000 (12:41 -0400)
KVM VMX changes for 6.12:

 - Set FINAL/PAGE in the page fault error code for EPT Violations if and only
   if the GVA is valid.  If the GVA is NOT valid, there is no guest-side page
   table walk and so stuffing paging-related metadata is nonsensical (see the
   sketch of the resulting error-code logic just after this list).

 - Fix a bug where KVM would incorrectly synthesize a nested VM-Exit instead of
   emulating posted interrupt delivery to L2.

 - Add a lockdep assertion to detect unsafe accesses of vmcs12 structures.

 - Harden eVMCS loading against an impossible NULL pointer deref (really truly
   should be impossible).

 - Minor SGX fix and a cleanup.
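
For reference, a minimal standalone sketch of the error-code derivation described
in the first item is below.  The EPT_VIOLATION_* and PFERR_* values mirror the
KVM/VMX header definitions as I read them and are illustrative only; the helper
and main() are demonstration code, not KVM's.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative copies of the relevant bit definitions (assumed values). */
    #define EPT_VIOLATION_GVA_IS_VALID    (1ULL << 7)  /* exit qual: a GVA was involved    */
    #define EPT_VIOLATION_GVA_TRANSLATED  (1ULL << 8)  /* fault hit the final GPA, not walk */
    #define PFERR_PRESENT_MASK            (1ULL << 0)
    #define PFERR_GUEST_FINAL_MASK        (1ULL << 32)
    #define PFERR_GUEST_PAGE_MASK         (1ULL << 33)

    /* Stuff FINAL/PAGE into the error code only when the GVA is actually valid. */
    static uint64_t ept_violation_to_pferr(uint64_t exit_qual, uint64_t error_code)
    {
            if (exit_qual & EPT_VIOLATION_GVA_IS_VALID)
                    error_code |= (exit_qual & EPT_VIOLATION_GVA_TRANSLATED) ?
                                  PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
            return error_code;
    }

    int main(void)
    {
            /* No valid GVA: only the permission-derived bits survive. */
            printf("0x%llx\n",
                   (unsigned long long)ept_violation_to_pferr(0, PFERR_PRESENT_MASK));
            /* Valid GVA, fault on the final translation: FINAL is set as well. */
            printf("0x%llx\n",
                   (unsigned long long)ept_violation_to_pferr(EPT_VIOLATION_GVA_IS_VALID |
                                                              EPT_VIOLATION_GVA_TRANSLATED,
                                                              PFERR_PRESENT_MASK));
            return 0;
    }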

Documentation/virt/kvm/api.rst
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/nested.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c

index b4d1cf2e4628855d4c76219fd71cc1f4f985a865,a4b7dc4a9ddaa63c4302f8721473493c34fed1a3..e32471977d0a23933698952ae3d9057fa6aeea3b
@@@ -4214,7 -4214,9 +4214,9 @@@ whether or not KVM_CAP_X86_USER_SPACE_M
  enabled.  If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace
  on denied accesses, i.e. userspace effectively intercepts the MSR access.  If
  KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest
- on denied accesses.
+ on denied accesses.  Note, if an MSR access is denied during emulation of MSR
+ load/stores during VMX transitions, KVM ignores KVM_MSR_EXIT_REASON_FILTER.
+ See the below warning for full details.
  
  If an MSR access is allowed by userspace, KVM will emulate and/or virtualize
  the access in accordance with the vCPU model.  Note, KVM may still ultimately
@@@ -4229,9 -4231,22 +4231,22 @@@ filtering. In that mode, ``KVM_MSR_FILT
  an error.
  
  .. warning::
-    MSR accesses as part of nested VM-Enter/VM-Exit are not filtered.
-    This includes both writes to individual VMCS fields and reads/writes
-    through the MSR lists pointed to by the VMCS.
+    MSR accesses that are side effects of instruction execution (emulated or
+    native) are not filtered as hardware does not honor MSR bitmaps outside of
+    RDMSR and WRMSR, and KVM mimics that behavior when emulating instructions
+    to avoid pointless divergence from hardware.  E.g. RDPID reads MSR_TSC_AUX,
+    SYSENTER reads the SYSENTER MSRs, etc.
+    MSRs that are loaded/stored via dedicated VMCS fields are not filtered as
+    part of VM-Enter/VM-Exit emulation.
+    MSRs that are loaded/stored via VMX's load/store lists _are_ filtered as part
+    of VM-Enter/VM-Exit emulation.  If an MSR access is denied on VM-Enter, KVM
+    synthesizes a consistency check VM-Exit (EXIT_REASON_MSR_LOAD_FAIL).  If an
+    MSR access is denied on VM-Exit, KVM synthesizes a VM-Abort.  In short, KVM
+    extends Intel's architectural list of MSRs that cannot be loaded/saved via
+    the VM-Enter/VM-Exit MSR list.  It is the platform owner's responsibility
+    to communicate any such restrictions to their end users.
  
     x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that
     cover any x2APIC MSRs).
@@@ -8082,14 -8097,6 +8097,14 @@@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By 
                                      guest CPUID on writes to MISC_ENABLE if
                                      KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
                                      disabled.
 +
 +KVM_X86_QUIRK_SLOT_ZAP_ALL          By default, KVM invalidates all SPTEs in
 +                                    a fast way for memslot deletion when VM type
 +                                    is KVM_X86_DEFAULT_VM.
 +                                    When this quirk is disabled or when VM type
 +                                    is other than KVM_X86_DEFAULT_VM, KVM zaps
 +                                    only leaf SPTEs that are within the range of
 +                                    the memslot being deleted.
  =================================== ============================================
  
  7.32 KVM_CAP_MAX_VCPU_ID
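
The warning above is backed by the nested VMX changes later in this diff, which
switch the MSR load/store list walkers from kvm_get_msr()/kvm_set_msr() to the
new *_with_filter() variants.  A toy model of the documented outcomes follows;
all names and types here are illustrative, not KVM's.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct msr_entry {
            uint32_t index;
            uint64_t value;
    };

    enum nested_msr_result {
            MSR_LIST_OK,
            MSR_LIST_VMENTER_CONSISTENCY_FAIL,  /* -> synthesized EXIT_REASON_MSR_LOAD_FAIL */
            MSR_LIST_VMEXIT_ABORT,              /* -> synthesized VM-Abort                  */
    };

    typedef bool (*msr_allowed_fn)(uint32_t index);

    /* Walk a VM-Enter/VM-Exit MSR list and apply the userspace filter per entry. */
    static enum nested_msr_result
    process_msr_list(const struct msr_entry *list, size_t n, bool vmenter,
                     msr_allowed_fn allowed)
    {
            for (size_t i = 0; i < n; i++) {
                    if (!allowed(list[i].index))
                            return vmenter ? MSR_LIST_VMENTER_CONSISTENCY_FAIL
                                           : MSR_LIST_VMEXIT_ABORT;
            }
            return MSR_LIST_OK;
    }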
index 5f794814226fd98b0492ec313c62f36a49232d08,aa31c4b94977469af40351a94252f1030befcfaa..6d9f763a7bb9d5db422ea5625b2c28420bd14f26
@@@ -36,7 -36,6 +36,7 @@@
  #include <asm/kvm_page_track.h>
  #include <asm/kvm_vcpu_regs.h>
  #include <asm/hyperv-tlfs.h>
 +#include <asm/reboot.h>
  
  #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
  
@@@ -212,7 -211,6 +212,7 @@@ enum exit_fastpath_completion 
        EXIT_FASTPATH_NONE,
        EXIT_FASTPATH_REENTER_GUEST,
        EXIT_FASTPATH_EXIT_HANDLED,
 +      EXIT_FASTPATH_EXIT_USERSPACE,
  };
  typedef enum exit_fastpath_completion fastpath_t;
  
@@@ -282,6 -280,10 +282,6 @@@ enum x86_intercept_stage
  #define PFERR_PRIVATE_ACCESS   BIT_ULL(49)
  #define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
  
 -#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK |      \
 -                               PFERR_WRITE_MASK |             \
 -                               PFERR_PRESENT_MASK)
 -
  /* apic attention bits */
  #define KVM_APIC_CHECK_VAPIC  0
  /*
@@@ -1627,10 -1629,8 +1627,10 @@@ struct kvm_x86_ops 
  
        int (*check_processor_compatibility)(void);
  
 -      int (*hardware_enable)(void);
 -      void (*hardware_disable)(void);
 +      int (*enable_virtualization_cpu)(void);
 +      void (*disable_virtualization_cpu)(void);
 +      cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
 +
        void (*hardware_unsetup)(void);
        bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
        void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
        void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 +
 +      const bool x2apic_icr_is_split;
        const unsigned long required_apicv_inhibits;
        bool allow_apicv_in_x2apic_without_x2apic_virtualization;
        void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
        int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
        void (*guest_memory_reclaimed)(struct kvm *kvm);
  
 -      int (*get_msr_feature)(struct kvm_msr_entry *entry);
 +      int (*get_feature_msr)(u32 msr, u64 *data);
  
        int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
                                         void *insn, int insn_len);
@@@ -2062,6 -2060,8 +2062,8 @@@ void kvm_prepare_emulation_failure_exit
  
  void kvm_enable_efer_bits(u64);
  bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+ int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+ int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data);
  int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated);
  int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
  int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
@@@ -2138,15 -2138,7 +2140,15 @@@ int kvm_get_nr_pending_nmis(struct kvm_
  
  void kvm_update_dr7(struct kvm_vcpu *vcpu);
  
 -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 +                                     bool always_retry);
 +
 +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
 +                                                 gpa_t cr2_or_gpa)
 +{
 +      return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
 +}
 +
  void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
                        ulong roots_to_free);
  void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
@@@ -2264,6 -2256,7 +2266,7 @@@ int kvm_cpu_has_injectable_intr(struct 
  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
  int kvm_cpu_has_extint(struct kvm_vcpu *v);
  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+ int kvm_cpu_get_extint(struct kvm_vcpu *v);
  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
  
@@@ -2355,8 -2348,7 +2358,8 @@@ int memslot_rmap_alloc(struct kvm_memor
         KVM_X86_QUIRK_OUT_7E_INC_RIP |         \
         KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT |   \
         KVM_X86_QUIRK_FIX_HYPERCALL_INSN |     \
 -       KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
 +       KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS |  \
 +       KVM_X86_QUIRK_SLOT_ZAP_ALL)
  
  /*
   * KVM previously used a u32 field in kvm_run to indicate the hypercall was
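
One practical effect of the kvm_x86_ops changes above is that the
emergency-disable hook is now a member of the ops table rather than a callback
each vendor module registers and unregisters by hand.  A toy sketch of that
shape (names are illustrative, not KVM's):

    struct toy_virt_ops {
            int  (*enable_virtualization_cpu)(void);
            void (*disable_virtualization_cpu)(void);
            void (*emergency_disable_virtualization_cpu)(void);
    };

    static void toy_emergency_reboot(const struct toy_virt_ops *ops)
    {
            /* Common code invokes the vendor callback through the table; the
             * vendor module no longer registers it during init/exit. */
            if (ops->emergency_disable_virtualization_cpu)
                    ops->emergency_disable_virtualization_cpu();
    }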
diff --combined arch/x86/kvm/lapic.c
index c7180cb5f4640dd7cb42c41edc54fefb9fcff882,63e67b6301ec6bc5ba833fedaef798e2085b2fb6..2098dc689088bbd4d53353d68fe6e0fb8a6b47a4
@@@ -1944,7 -1944,7 +1944,7 @@@ static void start_sw_tscdeadline(struc
        u64 ns = 0;
        ktime_t expire;
        struct kvm_vcpu *vcpu = apic->vcpu;
 -      unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 +      u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
        unsigned long flags;
        ktime_t now;
  
@@@ -2453,43 -2453,6 +2453,43 @@@ void kvm_lapic_set_eoi(struct kvm_vcpu 
  }
  EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
  
 +#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
 +
 +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 +{
 +      if (data & X2APIC_ICR_RESERVED_BITS)
 +              return 1;
 +
 +      /*
 +       * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
 +       * only AMD requires it to be zero, Intel essentially just ignores the
 +       * bit.  And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
 +       * the CPU performs the reserved bits checks, i.e. the underlying CPU
 +       * behavior will "win".  Arbitrarily clear the BUSY bit, as there is no
 +       * sane way to provide consistent behavior with respect to hardware.
 +       */
 +      data &= ~APIC_ICR_BUSY;
 +
 +      kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
 +      if (kvm_x86_ops.x2apic_icr_is_split) {
 +              kvm_lapic_set_reg(apic, APIC_ICR, data);
 +              kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
 +      } else {
 +              kvm_lapic_set_reg64(apic, APIC_ICR, data);
 +      }
 +      trace_kvm_apic_write(APIC_ICR, data);
 +      return 0;
 +}
 +
 +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
 +{
 +      if (kvm_x86_ops.x2apic_icr_is_split)
 +              return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
 +                     (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
 +
 +      return kvm_lapic_get_reg64(apic, APIC_ICR);
 +}
 +
  /* emulate APIC access in a trap manner */
  void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
  {
         * maybe-unnecessary write, and both are in the noise anyways.
         */
        if (apic_x2apic_mode(apic) && offset == APIC_ICR)
 -              kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR));
 +              WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
        else
                kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
  }
@@@ -2959,14 -2922,13 +2959,13 @@@ void kvm_inject_apic_timer_irqs(struct 
        }
  }
  
- int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+ void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
  {
-       int vector = kvm_apic_has_interrupt(vcpu);
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 ppr;
  
-       if (vector == -1)
-               return -1;
+       if (WARN_ON_ONCE(vector < 0 || !apic))
+               return;
  
        /*
         * We get here even with APIC virtualization enabled, if doing
                __apic_update_ppr(apic, &ppr);
        }
  
-       return vector;
  }
+ EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt);
  
  static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
                struct kvm_lapic_state *s, bool set)
  
                /*
                 * In x2APIC mode, the LDR is fixed and based on the id.  And
 -               * ICR is internally a single 64-bit register, but needs to be
 -               * split to ICR+ICR2 in userspace for backwards compatibility.
 +               * if the ICR is _not_ split, ICR is internally a single 64-bit
 +               * register, but needs to be split to ICR+ICR2 in userspace for
 +               * backwards compatibility.
                 */
 -              if (set) {
 +              if (set)
                        *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
  
 -                      icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
 -                            (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
 -                      __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
 -              } else {
 -                      icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
 -                      __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
 +              if (!kvm_x86_ops.x2apic_icr_is_split) {
 +                      if (set) {
 +                              icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
 +                                    (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
 +                              __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
 +                      } else {
 +                              icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
 +                              __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
 +                      }
                }
        }
  
@@@ -3235,12 -3193,22 +3234,12 @@@ int kvm_lapic_set_vapic_addr(struct kvm
        return 0;
  }
  
 -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 -{
 -      data &= ~APIC_ICR_BUSY;
 -
 -      kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
 -      kvm_lapic_set_reg64(apic, APIC_ICR, data);
 -      trace_kvm_apic_write(APIC_ICR, data);
 -      return 0;
 -}
 -
  static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
  {
        u32 low;
  
        if (reg == APIC_ICR) {
 -              *data = kvm_lapic_get_reg64(apic, APIC_ICR);
 +              *data = kvm_x2apic_icr_read(apic);
                return 0;
        }
  
diff --combined arch/x86/kvm/lapic.h
index 7c95eedd771e75c1371beaf97ad5b0564d593ca1,8310ff74be29768ee9af77edb1c9238ac0e5a90f..1b8ef9856422a45d246cfd60f05de6dc70ec88b3
@@@ -88,14 -88,15 +88,14 @@@ int kvm_create_lapic(struct kvm_vcpu *v
  void kvm_free_lapic(struct kvm_vcpu *vcpu);
  
  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+ void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector);
  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
- int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
  int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
  void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
  void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
  void kvm_recalculate_apic_map(struct kvm *kvm);
  void kvm_apic_set_version(struct kvm_vcpu *vcpu);
  void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
index 97ecc2722c8fd19ceaf8102f21781fcf265f6313,fc3d2ba036f6002a592178339b63b26998cb9935..a8e7bc04d9bf365277332f0174b617da5747da45
@@@ -981,7 -981,7 +981,7 @@@ static u32 nested_vmx_load_msr(struct k
                                __func__, i, e.index, e.reserved);
                        goto fail;
                }
-               if (kvm_set_msr(vcpu, e.index, e.value)) {
+               if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
                        pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, e.value);
@@@ -1017,7 -1017,7 +1017,7 @@@ static bool nested_vmx_get_vmexit_msr_v
                }
        }
  
-       if (kvm_get_msr(vcpu, msr_index, data)) {
+       if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
                pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
                        msr_index);
                return false;
@@@ -1112,9 -1112,9 +1112,9 @@@ static void prepare_vmx_msr_autostore_l
                        /*
                         * Emulated VMEntry does not fail here.  Instead a less
                         * accurate value will be returned by
-                        * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
-                        * instead of reading the value from the vmcs02 VMExit
-                        * MSR-store area.
+                        * nested_vmx_get_vmexit_msr_value() by reading KVM's
+                        * internal MSR state instead of reading the value from
+                        * the vmcs02 VMExit MSR-store area.
                         */
                        pr_warn_ratelimited(
                                "Not enough msr entries in msr_autostore.  Can't add msr %x\n",
@@@ -1251,32 -1251,21 +1251,32 @@@ static bool is_bitwise_subset(u64 super
  
  static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
  {
 -      const u64 feature_and_reserved =
 -              /* feature (except bit 48; see below) */
 -              BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
 -              /* reserved */
 -              BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
 +      const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
 +                               VMX_BASIC_INOUT |
 +                               VMX_BASIC_TRUE_CTLS;
 +
 +      const u64 reserved_bits = GENMASK_ULL(63, 56) |
 +                                GENMASK_ULL(47, 45) |
 +                                BIT_ULL(31);
 +
        u64 vmx_basic = vmcs_config.nested.basic;
  
 -      if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
 +      BUILD_BUG_ON(feature_bits & reserved_bits);
 +
 +      /*
 +       * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
 +       * inverted polarity), the incoming value must not set feature bits or
 +       * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
 +       * multi-bit values, are explicitly checked below.
 +       */
 +      if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
                return -EINVAL;
  
        /*
         * KVM does not emulate a version of VMX that constrains physical
         * addresses of VMX structures (e.g. VMCS) to 32-bits.
         */
 -      if (data & BIT_ULL(48))
 +      if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
                return -EINVAL;
  
        if (vmx_basic_vmcs_revision_id(vmx_basic) !=
@@@ -1345,29 -1334,16 +1345,29 @@@ vmx_restore_control_msr(struct vcpu_vm
  
  static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
  {
 -      const u64 feature_and_reserved_bits =
 -              /* feature */
 -              BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
 -              BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
 -              /* reserved */
 -              GENMASK_ULL(13, 9) | BIT_ULL(31);
 +      const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
 +                               VMX_MISC_ACTIVITY_HLT |
 +                               VMX_MISC_ACTIVITY_SHUTDOWN |
 +                               VMX_MISC_ACTIVITY_WAIT_SIPI |
 +                               VMX_MISC_INTEL_PT |
 +                               VMX_MISC_RDMSR_IN_SMM |
 +                               VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
 +                               VMX_MISC_VMXOFF_BLOCK_SMI |
 +                               VMX_MISC_ZERO_LEN_INS;
 +
 +      const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
 +
        u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
                                       vmcs_config.nested.misc_high);
  
 -      if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
 +      BUILD_BUG_ON(feature_bits & reserved_bits);
 +
 +      /*
 +       * The incoming value must not set feature bits or reserved bits that
 +       * aren't allowed/supported by KVM.  Fields, i.e. multi-bit values, are
 +       * explicitly checked below.
 +       */
 +      if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
                return -EINVAL;
  
        if ((vmx->nested.msrs.pinbased_ctls_high &
@@@ -2341,10 -2317,12 +2341,12 @@@ static void prepare_vmcs02_early(struc
  
        /* Posted interrupts setting is only taken from vmcs12.  */
        vmx->nested.pi_pending = false;
-       if (nested_cpu_has_posted_intr(vmcs12))
+       if (nested_cpu_has_posted_intr(vmcs12)) {
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
-       else
+       } else {
+               vmx->nested.posted_intr_nv = -1;
                exec_control &= ~PIN_BASED_POSTED_INTR;
+       }
        pin_controls_set(vmx, exec_control);
  
        /*
@@@ -2494,6 -2472,7 +2496,7 @@@ static void prepare_vmcs02_rare(struct 
  
        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
                vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
                vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
                vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
                vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
                vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
  
-               vmx->segment_cache.bitmask = 0;
+               vmx_segment_cache_clear(vmx);
        }
  
        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@@ -4308,11 -4287,52 +4311,52 @@@ static int vmx_check_nested_events(stru
        }
  
        if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+               int irq;
                if (block_nested_events)
                        return -EBUSY;
                if (!nested_exit_on_intr(vcpu))
                        goto no_vmexit;
-               nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+               if (!nested_exit_intr_ack_set(vcpu)) {
+                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+                       return 0;
+               }
+               irq = kvm_cpu_get_extint(vcpu);
+               if (irq != -1) {
+                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+                                         INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+                       return 0;
+               }
+               irq = kvm_apic_has_interrupt(vcpu);
+               if (WARN_ON_ONCE(irq < 0))
+                       goto no_vmexit;
+               /*
+                * If the IRQ is L2's PI notification vector, process posted
+                * interrupts for L2 instead of injecting VM-Exit, as the
+                * detection/morphing architecturally occurs when the IRQ is
+                * delivered to the CPU.  Note, only interrupts that are routed
+                * through the local APIC trigger posted interrupt processing,
+                * and enabling posted interrupts requires ACK-on-exit.
+                */
+               if (irq == vmx->nested.posted_intr_nv) {
+                       vmx->nested.pi_pending = true;
+                       kvm_apic_clear_irr(vcpu, irq);
+                       goto no_vmexit;
+               }
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+                                 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+               /*
+                * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+                * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+                * if APICv is active.
+                */
+               kvm_apic_ack_interrupt(vcpu, irq);
                return 0;
        }
  
@@@ -4830,7 -4850,7 +4874,7 @@@ static void nested_vmx_restore_host_sta
                                goto vmabort;
                        }
  
-                       if (kvm_set_msr(vcpu, h.index, h.value)) {
+                       if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
                                pr_debug_ratelimited(
                                        "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
                                        __func__, j, h.index, h.value);
@@@ -4993,14 -5013,6 +5037,6 @@@ void nested_vmx_vmexit(struct kvm_vcpu 
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  
        if (likely(!vmx->fail)) {
-               if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
-                   nested_exit_intr_ack_set(vcpu)) {
-                       int irq = kvm_cpu_get_interrupt(vcpu);
-                       WARN_ON(irq < 0);
-                       vmcs12->vm_exit_intr_info = irq |
-                               INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
-               }
                if (vm_exit_reason != -1)
                        trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
                                                       vmcs12->exit_qualification,
@@@ -7075,7 -7087,7 +7111,7 @@@ static void nested_vmx_setup_misc_data(
  {
        msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
        msrs->misc_low |=
 -              MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
 +              VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
                VMX_MISC_ACTIVITY_HLT |
                VMX_MISC_ACTIVITY_WAIT_SIPI;
@@@ -7090,10 -7102,12 +7126,10 @@@ static void nested_vmx_setup_basic(stru
         * guest, and the VMCS structure we give it - not about the
         * VMX support of the underlying hardware.
         */
 -      msrs->basic =
 -              VMCS12_REVISION |
 -              VMX_BASIC_TRUE_CTLS |
 -              ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
 -              (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
 +      msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
 +                                               X86_MEMTYPE_WB);
  
 +      msrs->basic |= VMX_BASIC_TRUE_CTLS;
        if (cpu_has_vmx_basic_inout())
                msrs->basic |= VMX_BASIC_INOUT;
  }
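
The check_nested_events() hunk above moves the ack-on-exit handling, and the
posted-interrupt special case, to the point where the external-interrupt
VM-Exit is synthesized.  A toy decision function capturing the same routing
(names and types are illustrative, not KVM's):

    #include <stdbool.h>

    enum l2_irq_action {
            VMEXIT_NO_INTR_INFO,        /* L1 did not request ack-on-exit                  */
            VMEXIT_WITH_VECTOR,         /* vector acked and stuffed into vm_exit_intr_info */
            PROCESS_POSTED_INTR_FOR_L2, /* IRQ is L2's notification vector: no VM-Exit     */
    };

    struct pending_irq {
            int  vector;
            bool from_pic;              /* ExtINT from the PIC rather than the local APIC  */
    };

    static enum l2_irq_action
    route_irq_while_l2_active(struct pending_irq irq, bool ack_on_exit,
                              int l2_pi_notification_vector)
    {
            if (!ack_on_exit)
                    return VMEXIT_NO_INTR_INFO;
            /* Only APIC-routed interrupts can trigger posted-interrupt processing. */
            if (!irq.from_pic && irq.vector == l2_pi_notification_vector)
                    return PROCESS_POSTED_INTR_FOR_L2;
            return VMEXIT_WITH_VECTOR;
    }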
index 0782fe599757aeb73c0f749e01e431038b3a162f,668b6c83a373c312957109d5f4953b454c42ffff..2c296b6abb8ccf0ed56459d479fd8eac281b06fe
@@@ -39,11 -39,17 +39,17 @@@ bool nested_vmx_check_io_bitmaps(struc
  
  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
  {
+       lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+                           !refcount_read(&vcpu->kvm->users_count));
        return to_vmx(vcpu)->nested.cached_vmcs12;
  }
  
  static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
  {
+       lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+                           !refcount_read(&vcpu->kvm->users_count));
        return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
  }
  
@@@ -109,7 -115,7 +115,7 @@@ static inline unsigned nested_cpu_vmx_m
  static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
  {
        return to_vmx(vcpu)->nested.msrs.misc_low &
 -              MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
 +              VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
  }
  
  static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
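
A userspace analogue of the vmcs12 accessor guard added above, assuming a plain
flag stands in for lockdep's ownership tracking; everything here is
illustrative, not kernel code.

    #include <assert.h>
    #include <stdatomic.h>

    struct toy_vcpu {
            int         mutex_held;     /* toy stand-in for lockdep_is_held(&vcpu->mutex)   */
            atomic_int  vm_users_count; /* drops to zero only once the VM is being destroyed */
            void       *cached_vmcs12;
    };

    static void *toy_get_vmcs12(struct toy_vcpu *vcpu)
    {
            /* vmcs12 is allocated and freed under vcpu->mutex, so dereferencing the
             * pointer is safe only while holding the mutex or during VM teardown. */
            assert(vcpu->mutex_held ||
                   atomic_load(&vcpu->vm_users_count) == 0);
            return vcpu->cached_vmcs12;
    }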
diff --combined arch/x86/kvm/vmx/vmx.c
index 9cfcfebd5f99bd08867d2f2f186899d1b8170dc4,ec1aee1f9057e99f33f980bd902c46fce2669388..c67e448c6ebd709fcd367e387723b7046e36800b
@@@ -525,10 -525,6 +525,6 @@@ static const struct kvm_vmx_segment_fie
        VMX_SEGMENT_FIELD(LDTR),
  };
  
- static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
- {
-       vmx->segment_cache.bitmask = 0;
- }
  
  static unsigned long host_idt_base;
  
@@@ -755,7 -751,7 +751,7 @@@ fault
        return -EIO;
  }
  
 -static void vmx_emergency_disable(void)
 +void vmx_emergency_disable_virtualization_cpu(void)
  {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
@@@ -1998,15 -1994,15 +1994,15 @@@ static inline bool is_vmx_feature_contr
        return !(msr->data & ~valid_bits);
  }
  
 -int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 +int vmx_get_feature_msr(u32 msr, u64 *data)
  {
 -      switch (msr->index) {
 +      switch (msr) {
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                if (!nested)
                        return 1;
 -              return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
 +              return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
        default:
 -              return KVM_MSR_RET_INVALID;
 +              return KVM_MSR_RET_UNSUPPORTED;
        }
  }
  
@@@ -2605,13 -2601,13 +2601,13 @@@ static u64 adjust_vmx_controls64(u64 ct
  static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                             struct vmx_capability *vmx_cap)
  {
 -      u32 vmx_msr_low, vmx_msr_high;
        u32 _pin_based_exec_control = 0;
        u32 _cpu_based_exec_control = 0;
        u32 _cpu_based_2nd_exec_control = 0;
        u64 _cpu_based_3rd_exec_control = 0;
        u32 _vmexit_control = 0;
        u32 _vmentry_control = 0;
 +      u64 basic_msr;
        u64 misc_msr;
        int i;
  
                _vmexit_control &= ~x_ctrl;
        }
  
 -      rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
 +      rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
  
        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
 -      if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
 +      if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
                return -EIO;
  
  #ifdef CONFIG_X86_64
 -      /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
 -      if (vmx_msr_high & (1u<<16))
 +      /*
 +       * KVM expects to be able to shove all legal physical addresses into
 +       * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
 +       * 0 for processors that support Intel 64 architecture".
 +       */
 +      if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
                return -EIO;
  #endif
  
        /* Require Write-Back (WB) memory type for VMCS accesses. */
 -      if (((vmx_msr_high >> 18) & 15) != 6)
 +      if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
                return -EIO;
  
        rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
  
 -      vmcs_conf->size = vmx_msr_high & 0x1fff;
 -      vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
 -
 -      vmcs_conf->revision_id = vmx_msr_low;
 -
 +      vmcs_conf->basic = basic_msr;
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
@@@ -2844,7 -2840,7 +2840,7 @@@ fault
        return -EFAULT;
  }
  
 -int vmx_hardware_enable(void)
 +int vmx_enable_virtualization_cpu(void)
  {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@@ -2881,7 -2877,7 +2877,7 @@@ static void vmclear_local_loaded_vmcss(
                __loaded_vmcs_clear(v);
  }
  
 -void vmx_hardware_disable(void)
 +void vmx_disable_virtualization_cpu(void)
  {
        vmclear_local_loaded_vmcss();
  
@@@ -2903,13 -2899,13 +2899,13 @@@ struct vmcs *alloc_vmcs_cpu(bool shadow
        if (!pages)
                return NULL;
        vmcs = page_address(pages);
 -      memset(vmcs, 0, vmcs_config.size);
 +      memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
  
        /* KVM supports Enlightened VMCS v1 only */
        if (kvm_is_using_evmcs())
                vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
        else
 -              vmcs->hdr.revision_id = vmcs_config.revision_id;
 +              vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
  
        if (shadow)
                vmcs->hdr.shadow_vmcs = 1;
@@@ -3002,7 -2998,7 +2998,7 @@@ static __init int alloc_kvm_area(void
                 * physical CPU.
                 */
                if (kvm_is_using_evmcs())
 -                      vmcs->hdr.revision_id = vmcs_config.revision_id;
 +                      vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
  
                per_cpu(vmxarea, cpu) = vmcs;
        }
@@@ -4219,6 -4215,13 +4215,13 @@@ static int vmx_deliver_nested_posted_in
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
+       /*
+        * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+        * and freed, and must not be accessed outside of vcpu->mutex.  The
+        * vCPU's cached PI NV is valid if and only if posted interrupts
+        * enabled in its vmcs12, i.e. checking the vector also checks that
+        * L1 has enabled posted interrupts for L2.
+        */
        if (is_guest_mode(vcpu) &&
            vector == vmx->nested.posted_intr_nv) {
                /*
@@@ -5804,8 -5807,9 +5807,9 @@@ static int handle_ept_violation(struct 
        error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
                      ? PFERR_PRESENT_MASK : 0;
  
-       error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
-              PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+       if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+               error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+                             PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
  
        /*
         * Check that the GPA doesn't exceed physical memory limits, as that is
@@@ -7265,8 -7269,6 +7269,8 @@@ static fastpath_t vmx_exit_handlers_fas
                return handle_fastpath_set_msr_irqoff(vcpu);
        case EXIT_REASON_PREEMPTION_TIMER:
                return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
 +      case EXIT_REASON_HLT:
 +              return handle_fastpath_hlt(vcpu);
        default:
                return EXIT_FASTPATH_NONE;
        }
@@@ -7969,6 -7971,7 +7973,7 @@@ static __init void vmx_set_cpu_caps(voi
                kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
                kvm_cpu_cap_clear(X86_FEATURE_SGX1);
                kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
        }
  
        if (vmx_umip_emulated())
@@@ -8519,7 -8522,7 +8524,7 @@@ __init int vmx_hardware_setup(void
                u64 use_timer_freq = 5000ULL * 1000 * 1000;
  
                cpu_preemption_timer_multi =
 -                      vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
 +                      vmx_misc_preemption_timer_rate(vmcs_config.misc);
  
                if (tsc_khz)
                        use_timer_freq = (u64)tsc_khz * 1000;
@@@ -8586,6 -8589,8 +8591,6 @@@ static void __vmx_exit(void
  {
        allow_smaller_maxphyaddr = false;
  
 -      cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
 -
        vmx_cleanup_l1d_flush();
  }
  
@@@ -8632,6 -8637,8 +8637,6 @@@ static int __init vmx_init(void
                pi_init_cpu(cpu);
        }
  
 -      cpu_emergency_register_virt_callback(vmx_emergency_disable);
 -
        vmx_check_vmcs12_offsets();
  
        /*
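
The setup_vmcs_config() hunk above replaces open-coded masks with accessors for
MSR_IA32_VMX_BASIC.  A standalone sketch of those decoders and of the three
sanity checks is below; the field positions follow the SDM's layout of the MSR
as I read it, and the helpers are illustrative rather than the kernel's.

    #include <stdint.h>

    static uint32_t basic_vmcs_revision_id(uint64_t basic) { return basic & 0x7fffffffu; }
    static uint32_t basic_vmcs_size(uint64_t basic)        { return (basic >> 32) & 0x1fff; }
    static uint32_t basic_vmcs_mem_type(uint64_t basic)    { return (basic >> 50) & 0xf; }

    #define BASIC_32BIT_PHYS_ADDR_ONLY (1ULL << 48)
    #define MEMTYPE_WB 6u

    /* Reject configurations where the VMCS is larger than a page, physical
     * addresses are limited to 32 bits on a 64-bit kernel, or the required
     * memory type for VMCS accesses is not write-back. */
    static int basic_msr_is_usable(uint64_t basic, int is_64bit_kernel)
    {
            if (basic_vmcs_size(basic) > 4096)
                    return 0;
            if (is_64bit_kernel && (basic & BASIC_32BIT_PHYS_ADDR_ONLY))
                    return 0;
            if (basic_vmcs_mem_type(basic) != MEMTYPE_WB)
                    return 0;
            return 1;
    }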
diff --combined arch/x86/kvm/vmx/vmx.h
index 3839afb921e220a88dc8a8608f06bb46ba58d86a,11b1b70faef29f7081d0e576870d3ce5951e6667..2325f773a20be0ea8068675a17c634c15984f41d
  #include "run_flags.h"
  #include "../mmu.h"
  
 -#define MSR_TYPE_R    1
 -#define MSR_TYPE_W    2
 -#define MSR_TYPE_RW   3
 -
  #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
  
  #ifdef CONFIG_X86_64
@@@ -752,4 -756,9 +752,9 @@@ static inline bool vmx_can_use_ipiv(str
        return  lapic_in_kernel(vcpu) && enable_ipiv;
  }
  
+ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+ {
+       vmx->segment_cache.bitmask = 0;
+ }
  #endif /* __KVM_X86_VMX_H */
diff --combined arch/x86/kvm/x86.c
index 0c1d54d9ef4574d874feb0a0b82380549bf3257a,34b52b49f5e689a9ac0c3d62656cfd16aa670c8a..83fe0a78146fc198115aba0e76ba57ecfb1dd8d9
@@@ -305,237 -305,24 +305,237 @@@ const struct kvm_stats_header kvm_vcpu_
  static struct kmem_cache *x86_emulator_cache;
  
  /*
 - * When called, it means the previous get/set msr reached an invalid msr.
 - * Return true if we want to ignore/silent this failed msr access.
 + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
 + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
 + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.  msrs_to_save holds MSRs that
 + * require host support, i.e. should be probed via RDMSR.  emulated_msrs holds
 + * MSRs that KVM emulates without strictly requiring host support.
 + * msr_based_features holds MSRs that enumerate features, i.e. are effectively
 + * CPUID leafs.  Note, msr_based_features isn't mutually exclusive with
 + * msrs_to_save and emulated_msrs.
   */
 -static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
 +
 +static const u32 msrs_to_save_base[] = {
 +      MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 +      MSR_STAR,
 +#ifdef CONFIG_X86_64
 +      MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 +#endif
 +      MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 +      MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
 +      MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
 +      MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
 +      MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
 +      MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
 +      MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
 +      MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 +      MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
 +      MSR_IA32_UMWAIT_CONTROL,
 +
 +      MSR_IA32_XFD, MSR_IA32_XFD_ERR,
 +};
 +
 +static const u32 msrs_to_save_pmu[] = {
 +      MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
 +      MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
 +      MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
 +      MSR_CORE_PERF_GLOBAL_CTRL,
 +      MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
 +
 +      /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
 +      MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
 +      MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
 +      MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
 +      MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
 +      MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
 +      MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
 +      MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
 +      MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
 +
 +      MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
 +      MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
 +
 +      /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
 +      MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
 +      MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
 +      MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
 +      MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
 +
 +      MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
 +      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
 +      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
 +};
 +
 +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
 +                      ARRAY_SIZE(msrs_to_save_pmu)];
 +static unsigned num_msrs_to_save;
 +
 +static const u32 emulated_msrs_all[] = {
 +      MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 +      MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 +
 +#ifdef CONFIG_KVM_HYPERV
 +      HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 +      HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 +      HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
 +      HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 +      HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
 +      HV_X64_MSR_RESET,
 +      HV_X64_MSR_VP_INDEX,
 +      HV_X64_MSR_VP_RUNTIME,
 +      HV_X64_MSR_SCONTROL,
 +      HV_X64_MSR_STIMER0_CONFIG,
 +      HV_X64_MSR_VP_ASSIST_PAGE,
 +      HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
 +      HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
 +      HV_X64_MSR_SYNDBG_OPTIONS,
 +      HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
 +      HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
 +      HV_X64_MSR_SYNDBG_PENDING_BUFFER,
 +#endif
 +
 +      MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 +      MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
 +
 +      MSR_IA32_TSC_ADJUST,
 +      MSR_IA32_TSC_DEADLINE,
 +      MSR_IA32_ARCH_CAPABILITIES,
 +      MSR_IA32_PERF_CAPABILITIES,
 +      MSR_IA32_MISC_ENABLE,
 +      MSR_IA32_MCG_STATUS,
 +      MSR_IA32_MCG_CTL,
 +      MSR_IA32_MCG_EXT_CTL,
 +      MSR_IA32_SMBASE,
 +      MSR_SMI_COUNT,
 +      MSR_PLATFORM_INFO,
 +      MSR_MISC_FEATURES_ENABLES,
 +      MSR_AMD64_VIRT_SPEC_CTRL,
 +      MSR_AMD64_TSC_RATIO,
 +      MSR_IA32_POWER_CTL,
 +      MSR_IA32_UCODE_REV,
 +
 +      /*
 +       * KVM always supports the "true" VMX control MSRs, even if the host
 +       * does not.  The VMX MSRs as a whole are considered "emulated" as KVM
 +       * doesn't strictly require them to exist in the host (ignoring that
 +       * KVM would refuse to load in the first place if the core set of MSRs
 +       * aren't supported).
 +       */
 +      MSR_IA32_VMX_BASIC,
 +      MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 +      MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
 +      MSR_IA32_VMX_TRUE_EXIT_CTLS,
 +      MSR_IA32_VMX_TRUE_ENTRY_CTLS,
 +      MSR_IA32_VMX_MISC,
 +      MSR_IA32_VMX_CR0_FIXED0,
 +      MSR_IA32_VMX_CR4_FIXED0,
 +      MSR_IA32_VMX_VMCS_ENUM,
 +      MSR_IA32_VMX_PROCBASED_CTLS2,
 +      MSR_IA32_VMX_EPT_VPID_CAP,
 +      MSR_IA32_VMX_VMFUNC,
 +
 +      MSR_K7_HWCR,
 +      MSR_KVM_POLL_CONTROL,
 +};
 +
 +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
 +static unsigned num_emulated_msrs;
 +
 +/*
 + * List of MSRs that control the existence of MSR-based features, i.e. MSRs
 + * that are effectively CPUID leafs.  VMX MSRs are also included in the set of
 + * feature MSRs, but are handled separately to allow expedited lookups.
 + */
 +static const u32 msr_based_features_all_except_vmx[] = {
 +      MSR_AMD64_DE_CFG,
 +      MSR_IA32_UCODE_REV,
 +      MSR_IA32_ARCH_CAPABILITIES,
 +      MSR_IA32_PERF_CAPABILITIES,
 +};
 +
 +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
 +                            (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
 +static unsigned int num_msr_based_features;
 +
 +/*
 + * All feature MSRs except uCode revID, which tracks the currently loaded uCode
 + * patch, are immutable once the vCPU model is defined.
 + */
 +static bool kvm_is_immutable_feature_msr(u32 msr)
  {
 -      const char *op = write ? "wrmsr" : "rdmsr";
 +      int i;
  
 -      if (ignore_msrs) {
 -              if (report_ignored_msrs)
 -                      kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
 -                                    op, msr, data);
 -              /* Mask the error */
 +      if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
                return true;
 -      } else {
 +
 +      for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
 +              if (msr == msr_based_features_all_except_vmx[i])
 +                      return msr != MSR_IA32_UCODE_REV;
 +      }
 +
 +      return false;
 +}
 +
 +static bool kvm_is_advertised_msr(u32 msr_index)
 +{
 +      unsigned int i;
 +
 +      for (i = 0; i < num_msrs_to_save; i++) {
 +              if (msrs_to_save[i] == msr_index)
 +                      return true;
 +      }
 +
 +      for (i = 0; i < num_emulated_msrs; i++) {
 +              if (emulated_msrs[i] == msr_index)
 +                      return true;
 +      }
 +
 +      return false;
 +}
 +
 +typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
 +                          bool host_initiated);
 +
 +static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr,
 +                                           u64 *data, bool host_initiated,
 +                                           enum kvm_msr_access rw,
 +                                           msr_access_t msr_access_fn)
 +{
 +      const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr";
 +      int ret;
 +
 +      BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W);
 +
 +      /*
 +       * Zero the data on read failures to avoid leaking stack data to the
 +       * guest and/or userspace, e.g. if the failure is ignored below.
 +       */
 +      ret = msr_access_fn(vcpu, msr, data, host_initiated);
 +      if (ret && rw == MSR_TYPE_R)
 +              *data = 0;
 +
 +      if (ret != KVM_MSR_RET_UNSUPPORTED)
 +              return ret;
 +
 +      /*
 +       * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM
 +       * advertises to userspace, even if an MSR isn't fully supported.
 +       * Simply check that @data is '0', which covers both the write '0' case
 +       * and all reads (in which case @data is zeroed on failure; see above).
 +       */
 +      if (host_initiated && !*data && kvm_is_advertised_msr(msr))
 +              return 0;
 +
 +      if (!ignore_msrs) {
                kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
 -                                    op, msr, data);
 -              return false;
 +                                    op, msr, *data);
 +              return ret;
        }
 +
 +      if (report_ignored_msrs)
 +              kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data);
 +
 +      return 0;
  }
  
  static struct kmem_cache *kvm_alloc_emulator_cache(void)
@@@ -568,7 -355,7 +568,7 @@@ static void kvm_on_user_return(struct u
  
        /*
         * Disabling irqs at this point since the following code could be
 -       * interrupted and executed through kvm_arch_hardware_disable()
 +       * interrupted and executed through kvm_arch_disable_virtualization_cpu()
         */
        local_irq_save(flags);
        if (msrs->registered) {
@@@ -626,7 -413,8 +626,7 @@@ EXPORT_SYMBOL_GPL(kvm_find_user_return_
  
  static void kvm_user_return_msr_cpu_online(void)
  {
 -      unsigned int cpu = smp_processor_id();
 -      struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 +      struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
        u64 value;
        int i;
  
@@@ -833,6 -621,12 +833,6 @@@ static void kvm_queue_exception_vmexit(
        ex->payload = payload;
  }
  
 -/* Forcibly leave the nested mode in cases like a vCPU reset */
 -static void kvm_leave_nested(struct kvm_vcpu *vcpu)
 -{
 -      kvm_x86_ops.nested_ops->leave_nested(vcpu);
 -}
 -
  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                unsigned nr, bool has_error, u32 error_code,
                bool has_payload, unsigned long payload, bool reinject)
@@@ -1550,72 -1344,244 +1550,72 @@@ static u64 kvm_dr6_fixed(struct kvm_vcp
        if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
                fixed |= DR6_RTM;
  
 -      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
 -              fixed |= DR6_BUS_LOCK;
 -      return fixed;
 -}
 -
 -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 -{
 -      size_t size = ARRAY_SIZE(vcpu->arch.db);
 -
 -      switch (dr) {
 -      case 0 ... 3:
 -              vcpu->arch.db[array_index_nospec(dr, size)] = val;
 -              if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 -                      vcpu->arch.eff_db[dr] = val;
 -              break;
 -      case 4:
 -      case 6:
 -              if (!kvm_dr6_valid(val))
 -                      return 1; /* #GP */
 -              vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
 -              break;
 -      case 5:
 -      default: /* 7 */
 -              if (!kvm_dr7_valid(val))
 -                      return 1; /* #GP */
 -              vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 -              kvm_update_dr7(vcpu);
 -              break;
 -      }
 -
 -      return 0;
 -}
 -EXPORT_SYMBOL_GPL(kvm_set_dr);
 -
 -unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
 -{
 -      size_t size = ARRAY_SIZE(vcpu->arch.db);
 -
 -      switch (dr) {
 -      case 0 ... 3:
 -              return vcpu->arch.db[array_index_nospec(dr, size)];
 -      case 4:
 -      case 6:
 -              return vcpu->arch.dr6;
 -      case 5:
 -      default: /* 7 */
 -              return vcpu->arch.dr7;
 -      }
 -}
 -EXPORT_SYMBOL_GPL(kvm_get_dr);
 -
 -int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 -{
 -      u32 ecx = kvm_rcx_read(vcpu);
 -      u64 data;
 -
 -      if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
 -              kvm_inject_gp(vcpu, 0);
 -              return 1;
 -      }
 -
 -      kvm_rax_write(vcpu, (u32)data);
 -      kvm_rdx_write(vcpu, data >> 32);
 -      return kvm_skip_emulated_instruction(vcpu);
 -}
 -EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
 -
 -/*
 - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
 - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
 - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.  msrs_to_save holds MSRs that
 - * require host support, i.e. should be probed via RDMSR.  emulated_msrs holds
 - * MSRs that KVM emulates without strictly requiring host support.
 - * msr_based_features holds MSRs that enumerate features, i.e. are effectively
 - * CPUID leafs.  Note, msr_based_features isn't mutually exclusive with
 - * msrs_to_save and emulated_msrs.
 - */
 -
 -static const u32 msrs_to_save_base[] = {
 -      MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 -      MSR_STAR,
 -#ifdef CONFIG_X86_64
 -      MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 -#endif
 -      MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 -      MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
 -      MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
 -      MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
 -      MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
 -      MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
 -      MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
 -      MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 -      MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
 -      MSR_IA32_UMWAIT_CONTROL,
 -
 -      MSR_IA32_XFD, MSR_IA32_XFD_ERR,
 -};
 -
 -static const u32 msrs_to_save_pmu[] = {
 -      MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
 -      MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
 -      MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
 -      MSR_CORE_PERF_GLOBAL_CTRL,
 -      MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
 -
 -      /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
 -      MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
 -      MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
 -      MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
 -      MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
 -      MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
 -      MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
 -      MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
 -      MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
 -
 -      MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
 -      MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
 -
 -      /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
 -      MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
 -      MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
 -      MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
 -      MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
 -
 -      MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
 -      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
 -      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
 -};
 -
 -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
 -                      ARRAY_SIZE(msrs_to_save_pmu)];
 -static unsigned num_msrs_to_save;
 -
 -static const u32 emulated_msrs_all[] = {
 -      MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 -      MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 -
 -#ifdef CONFIG_KVM_HYPERV
 -      HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 -      HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 -      HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
 -      HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 -      HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
 -      HV_X64_MSR_RESET,
 -      HV_X64_MSR_VP_INDEX,
 -      HV_X64_MSR_VP_RUNTIME,
 -      HV_X64_MSR_SCONTROL,
 -      HV_X64_MSR_STIMER0_CONFIG,
 -      HV_X64_MSR_VP_ASSIST_PAGE,
 -      HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
 -      HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
 -      HV_X64_MSR_SYNDBG_OPTIONS,
 -      HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
 -      HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
 -      HV_X64_MSR_SYNDBG_PENDING_BUFFER,
 -#endif
 -
 -      MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 -      MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
 -
 -      MSR_IA32_TSC_ADJUST,
 -      MSR_IA32_TSC_DEADLINE,
 -      MSR_IA32_ARCH_CAPABILITIES,
 -      MSR_IA32_PERF_CAPABILITIES,
 -      MSR_IA32_MISC_ENABLE,
 -      MSR_IA32_MCG_STATUS,
 -      MSR_IA32_MCG_CTL,
 -      MSR_IA32_MCG_EXT_CTL,
 -      MSR_IA32_SMBASE,
 -      MSR_SMI_COUNT,
 -      MSR_PLATFORM_INFO,
 -      MSR_MISC_FEATURES_ENABLES,
 -      MSR_AMD64_VIRT_SPEC_CTRL,
 -      MSR_AMD64_TSC_RATIO,
 -      MSR_IA32_POWER_CTL,
 -      MSR_IA32_UCODE_REV,
 -
 -      /*
 -       * KVM always supports the "true" VMX control MSRs, even if the host
 -       * does not.  The VMX MSRs as a whole are considered "emulated" as KVM
 -       * doesn't strictly require them to exist in the host (ignoring that
 -       * KVM would refuse to load in the first place if the core set of MSRs
 -       * aren't supported).
 -       */
 -      MSR_IA32_VMX_BASIC,
 -      MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 -      MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
 -      MSR_IA32_VMX_TRUE_EXIT_CTLS,
 -      MSR_IA32_VMX_TRUE_ENTRY_CTLS,
 -      MSR_IA32_VMX_MISC,
 -      MSR_IA32_VMX_CR0_FIXED0,
 -      MSR_IA32_VMX_CR4_FIXED0,
 -      MSR_IA32_VMX_VMCS_ENUM,
 -      MSR_IA32_VMX_PROCBASED_CTLS2,
 -      MSR_IA32_VMX_EPT_VPID_CAP,
 -      MSR_IA32_VMX_VMFUNC,
 -
 -      MSR_K7_HWCR,
 -      MSR_KVM_POLL_CONTROL,
 -};
 +      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
 +              fixed |= DR6_BUS_LOCK;
 +      return fixed;
 +}
  
 -static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
 -static unsigned num_emulated_msrs;
 +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 +{
 +      size_t size = ARRAY_SIZE(vcpu->arch.db);
  
 -/*
 - * List of MSRs that control the existence of MSR-based features, i.e. MSRs
 - * that are effectively CPUID leafs.  VMX MSRs are also included in the set of
 - * feature MSRs, but are handled separately to allow expedited lookups.
 - */
 -static const u32 msr_based_features_all_except_vmx[] = {
 -      MSR_AMD64_DE_CFG,
 -      MSR_IA32_UCODE_REV,
 -      MSR_IA32_ARCH_CAPABILITIES,
 -      MSR_IA32_PERF_CAPABILITIES,
 -};
 +      switch (dr) {
 +      case 0 ... 3:
 +              vcpu->arch.db[array_index_nospec(dr, size)] = val;
 +              if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 +                      vcpu->arch.eff_db[dr] = val;
 +              break;
 +      case 4:
 +      case 6:
 +              if (!kvm_dr6_valid(val))
 +                      return 1; /* #GP */
 +              vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
 +              break;
 +      case 5:
 +      default: /* 7 */
 +              if (!kvm_dr7_valid(val))
 +                      return 1; /* #GP */
 +              vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 +              kvm_update_dr7(vcpu);
 +              break;
 +      }
  
 -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
 -                            (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
 -static unsigned int num_msr_based_features;
 +      return 0;
 +}
 +EXPORT_SYMBOL_GPL(kvm_set_dr);
  
 -/*
 - * All feature MSRs except uCode revID, which tracks the currently loaded uCode
 - * patch, are immutable once the vCPU model is defined.
 - */
 -static bool kvm_is_immutable_feature_msr(u32 msr)
 +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
  {
 -      int i;
 +      size_t size = ARRAY_SIZE(vcpu->arch.db);
  
 -      if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
 -              return true;
 +      switch (dr) {
 +      case 0 ... 3:
 +              return vcpu->arch.db[array_index_nospec(dr, size)];
 +      case 4:
 +      case 6:
 +              return vcpu->arch.dr6;
 +      case 5:
 +      default: /* 7 */
 +              return vcpu->arch.dr7;
 +      }
 +}
 +EXPORT_SYMBOL_GPL(kvm_get_dr);
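Both DR accessors above fold the legacy debug-register aliases onto their real targets: index 4 is handled as DR6 and index 5 as DR7, with the CR4.DE/#UD side of that aliasing assumed to have been dealt with before these helpers run. A stand-alone sketch of just that index folding (not KVM code; the helper name dr_effective_index() is invented for illustration):

    /*
     * Illustration only: mirrors the DR4->DR6 / DR5->DR7 folding encoded by
     * the switch statements in kvm_set_dr() and kvm_get_dr() above.  The
     * CR4.DE check that turns DR4/DR5 accesses into #UD is assumed to have
     * happened earlier in the exit path.
     */
    #include <assert.h>

    static int dr_effective_index(int dr)
    {
    	switch (dr) {
    	case 0 ... 3:		/* DR0-DR3 map to themselves */
    		return dr;
    	case 4:
    	case 6:			/* DR4 aliases DR6 */
    		return 6;
    	case 5:
    	default:		/* DR5 aliases DR7; 7 is the only other index */
    		return 7;
    	}
    }

    int main(void)
    {
    	assert(dr_effective_index(4) == 6);
    	assert(dr_effective_index(5) == 7);
    	assert(dr_effective_index(2) == 2);
    	return 0;
    }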
  
 -      for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
 -              if (msr == msr_based_features_all_except_vmx[i])
 -                      return msr != MSR_IA32_UCODE_REV;
 +int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 +{
 +      u32 ecx = kvm_rcx_read(vcpu);
 +      u64 data;
 +
 +      if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
 +              kvm_inject_gp(vcpu, 0);
 +              return 1;
        }
  
 -      return false;
 +      kvm_rax_write(vcpu, (u32)data);
 +      kvm_rdx_write(vcpu, data >> 32);
 +      return kvm_skip_emulated_instruction(vcpu);
  }
 +EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
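For context on the register writes above: RDPMC returns a 64-bit counter split across EDX:EAX, which is why the emulation stores the truncated low half via kvm_rax_write() and the shifted high half via kvm_rdx_write(). A minimal user-space sketch of that split (illustrative only, not KVM code):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t data = 0x1122334455667788ULL;	/* pretend counter value */
    	uint32_t eax = (uint32_t)data;		/* low half, as in kvm_rax_write() */
    	uint32_t edx = (uint32_t)(data >> 32);	/* high half, as in kvm_rdx_write() */

    	printf("eax=0x%08" PRIx32 " edx=0x%08" PRIx32 "\n", eax, edx);
    	return 0;
    }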
  
  /*
   * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
@@@ -1694,31 -1660,40 +1694,31 @@@ static u64 kvm_get_arch_capabilities(vo
        return data;
  }
  
 -static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 +static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
 +                             bool host_initiated)
  {
 -      switch (msr->index) {
 +      WARN_ON_ONCE(!host_initiated);
 +
 +      switch (index) {
        case MSR_IA32_ARCH_CAPABILITIES:
 -              msr->data = kvm_get_arch_capabilities();
 +              *data = kvm_get_arch_capabilities();
                break;
        case MSR_IA32_PERF_CAPABILITIES:
 -              msr->data = kvm_caps.supported_perf_cap;
 +              *data = kvm_caps.supported_perf_cap;
                break;
        case MSR_IA32_UCODE_REV:
 -              rdmsrl_safe(msr->index, &msr->data);
 +              rdmsrl_safe(index, data);
                break;
        default:
 -              return kvm_x86_call(get_msr_feature)(msr);
 +              return kvm_x86_call(get_feature_msr)(index, data);
        }
        return 0;
  }
  
 -static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 +static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
  {
 -      struct kvm_msr_entry msr;
 -      int r;
 -
 -      /* Unconditionally clear the output for simplicity */
 -      msr.data = 0;
 -      msr.index = index;
 -      r = kvm_get_msr_feature(&msr);
 -
 -      if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
 -              r = 0;
 -
 -      *data = msr.data;
 -
 -      return r;
 +      return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R,
 +                               kvm_get_feature_msr);
  }
  
  static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
@@@ -1905,17 -1880,16 +1905,17 @@@ static int __kvm_set_msr(struct kvm_vcp
        return kvm_x86_call(set_msr)(vcpu, &msr);
  }
  
 +static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
 +                      bool host_initiated)
 +{
 +      return __kvm_set_msr(vcpu, index, *data, host_initiated);
 +}
 +
  static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
                                     u32 index, u64 data, bool host_initiated)
  {
 -      int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
 -
 -      if (ret == KVM_MSR_RET_INVALID)
 -              if (kvm_msr_ignored_check(index, data, true))
 -                      ret = 0;
 -
 -      return ret;
 +      return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W,
 +                               _kvm_set_msr);
  }
  
  /*
@@@ -1954,23 -1928,33 +1954,25 @@@ int __kvm_get_msr(struct kvm_vcpu *vcpu
  static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
                                     u32 index, u64 *data, bool host_initiated)
  {
 -      int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
 -
 -      if (ret == KVM_MSR_RET_INVALID) {
 -              /* Unconditionally clear *data for simplicity */
 -              *data = 0;
 -              if (kvm_msr_ignored_check(index, 0, false))
 -                      ret = 0;
 -      }
 -
 -      return ret;
 +      return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R,
 +                               __kvm_get_msr);
  }
  
- static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+ int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
  {
        if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
                return KVM_MSR_RET_FILTERED;
        return kvm_get_msr_ignored_check(vcpu, index, data, false);
  }
+ EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter);
  
- static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+ int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
  {
        if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
                return KVM_MSR_RET_FILTERED;
        return kvm_set_msr_ignored_check(vcpu, index, data, false);
  }
+ EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter);
  
  int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
  {
@@@ -2017,7 -2001,7 +2019,7 @@@ static int complete_fast_rdmsr(struct k
  static u64 kvm_msr_reason(int r)
  {
        switch (r) {
 -      case KVM_MSR_RET_INVALID:
 +      case KVM_MSR_RET_UNSUPPORTED:
                return KVM_MSR_EXIT_REASON_UNKNOWN;
        case KVM_MSR_RET_FILTERED:
                return KVM_MSR_EXIT_REASON_FILTER;
@@@ -2180,34 -2164,31 +2182,34 @@@ fastpath_t handle_fastpath_set_msr_irqo
  {
        u32 msr = kvm_rcx_read(vcpu);
        u64 data;
 -      fastpath_t ret = EXIT_FASTPATH_NONE;
 +      fastpath_t ret;
 +      bool handled;
  
        kvm_vcpu_srcu_read_lock(vcpu);
  
        switch (msr) {
        case APIC_BASE_MSR + (APIC_ICR >> 4):
                data = kvm_read_edx_eax(vcpu);
 -              if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
 -                      kvm_skip_emulated_instruction(vcpu);
 -                      ret = EXIT_FASTPATH_EXIT_HANDLED;
 -              }
 +              handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
                break;
        case MSR_IA32_TSC_DEADLINE:
                data = kvm_read_edx_eax(vcpu);
 -              if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
 -                      kvm_skip_emulated_instruction(vcpu);
 -                      ret = EXIT_FASTPATH_REENTER_GUEST;
 -              }
 +              handled = !handle_fastpath_set_tscdeadline(vcpu, data);
                break;
        default:
 +              handled = false;
                break;
        }
  
 -      if (ret != EXIT_FASTPATH_NONE)
 +      if (handled) {
 +              if (!kvm_skip_emulated_instruction(vcpu))
 +                      ret = EXIT_FASTPATH_EXIT_USERSPACE;
 +              else
 +                      ret = EXIT_FASTPATH_REENTER_GUEST;
                trace_kvm_msr_write(msr, data);
 +      } else {
 +              ret = EXIT_FASTPATH_NONE;
 +      }
  
        kvm_vcpu_srcu_read_unlock(vcpu);
  
@@@ -3767,6 -3748,18 +3769,6 @@@ static void record_steal_time(struct kv
        mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
  }
  
 -static bool kvm_is_msr_to_save(u32 msr_index)
 -{
 -      unsigned int i;
 -
 -      for (i = 0; i < num_msrs_to_save; i++) {
 -              if (msrs_to_save[i] == msr_index)
 -                      return true;
 -      }
 -
 -      return false;
 -}
 -
  int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  {
        u32 msr = msr_info->index;
                if (kvm_pmu_is_valid_msr(vcpu, msr))
                        return kvm_pmu_set_msr(vcpu, msr_info);
  
 -              /*
 -               * Userspace is allowed to write '0' to MSRs that KVM reports
 -               * as to-be-saved, even if an MSRs isn't fully supported.
 -               */
 -              if (msr_info->host_initiated && !data &&
 -                  kvm_is_msr_to_save(msr))
 -                      break;
 -
 -              return KVM_MSR_RET_INVALID;
 +              return KVM_MSR_RET_UNSUPPORTED;
        }
        return 0;
  }
@@@ -4499,7 -4500,17 +4501,7 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
                        return kvm_pmu_get_msr(vcpu, msr_info);
  
 -              /*
 -               * Userspace is allowed to read MSRs that KVM reports as
 -               * to-be-saved, even if an MSR isn't fully supported.
 -               */
 -              if (msr_info->host_initiated &&
 -                  kvm_is_msr_to_save(msr_info->index)) {
 -                      msr_info->data = 0;
 -                      break;
 -              }
 -
 -              return KVM_MSR_RET_INVALID;
 +              return KVM_MSR_RET_UNSUPPORTED;
        }
        return 0;
  }
@@@ -4647,6 -4658,7 +4649,6 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_ASYNC_PF_INT:
        case KVM_CAP_GET_TSC_KHZ:
        case KVM_CAP_KVMCLOCK_CTRL:
 -      case KVM_CAP_READONLY_MEM:
        case KVM_CAP_IOAPIC_POLARITY_IGNORED:
        case KVM_CAP_TSC_DEADLINE_TIMER:
        case KVM_CAP_DISABLE_QUIRKS:
        case KVM_CAP_VM_TYPES:
                r = kvm_caps.supported_vm_types;
                break;
 +      case KVM_CAP_READONLY_MEM:
 +              r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1;
 +              break;
        default:
                break;
        }
@@@ -4937,7 -4946,7 +4939,7 @@@ long kvm_arch_dev_ioctl(struct file *fi
                break;
        }
        case KVM_GET_MSRS:
 -              r = msr_io(NULL, argp, do_get_msr_feature, 1);
 +              r = msr_io(NULL, argp, do_get_feature_msr, 1);
                break;
  #ifdef CONFIG_KVM_HYPERV
        case KVM_GET_SUPPORTED_HV_CPUID:
@@@ -6033,9 -6042,7 +6035,9 @@@ long kvm_arch_vcpu_ioctl(struct file *f
                if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
                        break;
  
 +              kvm_vcpu_srcu_read_lock(vcpu);
                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
 +              kvm_vcpu_srcu_read_unlock(vcpu);
                break;
        }
        case KVM_GET_DEBUGREGS: {
@@@ -7374,9 -7381,11 +7376,9 @@@ out
  
  static void kvm_probe_feature_msr(u32 msr_index)
  {
 -      struct kvm_msr_entry msr = {
 -              .index = msr_index,
 -      };
 +      u64 data;
  
 -      if (kvm_get_msr_feature(&msr))
 +      if (kvm_get_feature_msr(NULL, msr_index, &data, true))
                return;
  
        msr_based_features[num_msr_based_features++] = msr_index;
@@@ -8854,13 -8863,60 +8856,13 @@@ static int handle_emulation_failure(str
        return 1;
  }
  
 -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 -                                int emulation_type)
 +static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu,
 +                                             gpa_t cr2_or_gpa,
 +                                             int emulation_type)
  {
 -      gpa_t gpa = cr2_or_gpa;
 -      kvm_pfn_t pfn;
 -
        if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
  
 -      if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 -          WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
 -              return false;
 -
 -      if (!vcpu->arch.mmu->root_role.direct) {
 -              /*
 -               * Write permission should be allowed since only
 -               * write access need to be emulated.
 -               */
 -              gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
 -
 -              /*
 -               * If the mapping is invalid in guest, let cpu retry
 -               * it to generate fault.
 -               */
 -              if (gpa == INVALID_GPA)
 -                      return true;
 -      }
 -
 -      /*
 -       * Do not retry the unhandleable instruction if it faults on the
 -       * readonly host memory, otherwise it will goto a infinite loop:
 -       * retry instruction -> write #PF -> emulation fail -> retry
 -       * instruction -> ...
 -       */
 -      pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
 -
 -      /*
 -       * If the instruction failed on the error pfn, it can not be fixed,
 -       * report the error to userspace.
 -       */
 -      if (is_error_noslot_pfn(pfn))
 -              return false;
 -
 -      kvm_release_pfn_clean(pfn);
 -
 -      /*
 -       * If emulation may have been triggered by a write to a shadowed page
 -       * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
 -       * guest to let the CPU re-execute the instruction in the hope that the
 -       * CPU can cleanly execute the instruction that KVM failed to emulate.
 -       */
 -      if (vcpu->kvm->arch.indirect_shadow_pages)
 -              kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 -
        /*
         * If the failed instruction faulted on an access to page tables that
         * are used to translate any part of the instruction, KVM can't resolve
         * then zap the SPTE to unprotect the gfn, and then do it all over
         * again.  Report the error to userspace.
         */
 -      return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 -}
 -
 -static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 -                            gpa_t cr2_or_gpa,  int emulation_type)
 -{
 -      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 -      unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
 -
 -      last_retry_eip = vcpu->arch.last_retry_eip;
 -      last_retry_addr = vcpu->arch.last_retry_addr;
 +      if (emulation_type & EMULTYPE_WRITE_PF_TO_SP)
 +              return false;
  
        /*
 -       * If the emulation is caused by #PF and it is non-page_table
 -       * writing instruction, it means the VM-EXIT is caused by shadow
 -       * page protected, we can zap the shadow page and retry this
 -       * instruction directly.
 -       *
 -       * Note: if the guest uses a non-page-table modifying instruction
 -       * on the PDE that points to the instruction, then we will unmap
 -       * the instruction and go to an infinite loop. So, we cache the
 -       * last retried eip and the last fault address, if we meet the eip
 -       * and the address again, we can break out of the potential infinite
 -       * loop.
 +       * If emulation may have been triggered by a write to a shadowed page
 +       * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
 +       * guest to let the CPU re-execute the instruction in the hope that the
 +       * CPU can cleanly execute the instruction that KVM failed to emulate.
         */
 -      vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
 -
 -      if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
 -              return false;
 -
 -      if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 -          WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
 -              return false;
 -
 -      if (x86_page_table_writing_insn(ctxt))
 -              return false;
 -
 -      if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
 -              return false;
 -
 -      vcpu->arch.last_retry_eip = ctxt->eip;
 -      vcpu->arch.last_retry_addr = cr2_or_gpa;
 -
 -      if (!vcpu->arch.mmu->root_role.direct)
 -              gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
 -
 -      kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 +      __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true);
  
 +      /*
 +       * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible
 +       * all SPTEs were already zapped by a different task.  The alternative
 +       * is to report the error to userspace and likely terminate the guest,
 +       * and the last_retry_{eip,addr} checks will prevent retrying the page
 +       * fault indefinitely, i.e. there's nothing to lose by retrying.
 +       */
        return true;
  }
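The comment above leans on the last_retry_{eip,addr} bookkeeping that used to live in retry_instruction() (removed further down in this hunk) and now sits behind the MMU retry helpers, whose bodies are not part of this diff. As a reminder of what that guard does, here is a stand-alone sketch with invented names (not KVM code): allow at most one retry per (RIP, fault address) pair so an unfixable fault cannot loop forever.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct retry_state {
    	uint64_t last_eip;	/* RIP of the last retried instruction */
    	uint64_t last_addr;	/* faulting address of the last retry */
    };

    static bool should_retry(struct retry_state *s, uint64_t eip, uint64_t addr)
    {
    	if (s->last_eip == eip && s->last_addr == addr)
    		return false;	/* same fault again: give up instead of looping */

    	s->last_eip = eip;	/* remember it and allow exactly one retry */
    	s->last_addr = addr;
    	return true;
    }

    int main(void)
    {
    	struct retry_state s = { 0, 0 };

    	assert(should_retry(&s, 0x1000, 0xdead000));	/* first hit: retry */
    	assert(!should_retry(&s, 0x1000, 0xdead000));	/* repeat: bail out */
    	return 0;
    }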
  
@@@ -9088,11 -9174,6 +9090,11 @@@ int x86_emulate_instruction(struct kvm_
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
  
 +      if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
 +          (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 +           WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))))
 +              emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF;
 +
        r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
        if (r != X86EMUL_CONTINUE) {
                if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
                                kvm_queue_exception(vcpu, UD_VECTOR);
                                return 1;
                        }
 -                      if (reexecute_instruction(vcpu, cr2_or_gpa,
 -                                                emulation_type))
 +                      if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
 +                                                             emulation_type))
                                return 1;
  
                        if (ctxt->have_exception &&
                return 1;
        }
  
 -      if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
 +      /*
 +       * If emulation was caused by a write-protection #PF on a non-page_table
 +       * writing instruction, try to unprotect the gfn, i.e. zap shadow pages,
 +       * and retry the instruction, as the vCPU is likely no longer using the
 +       * gfn as a page table.
 +       */
 +      if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
 +          !x86_page_table_writing_insn(ctxt) &&
 +          kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
                return 1;
  
        /* this is needed for vmware backdoor interface to work since it
@@@ -9210,8 -9283,7 +9212,8 @@@ restart
                return 1;
  
        if (r == EMULATION_FAILED) {
 -              if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
 +              if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
 +                                                     emulation_type))
                        return 1;
  
                return handle_emulation_failure(vcpu, emulation_type);
@@@ -9679,7 -9751,7 +9681,7 @@@ int kvm_x86_vendor_init(struct kvm_x86_
  
        guard(mutex)(&vendor_module_lock);
  
 -      if (kvm_x86_ops.hardware_enable) {
 +      if (kvm_x86_ops.enable_virtualization_cpu) {
                pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
                return -EEXIST;
        }
        return 0;
  
  out_unwind_ops:
 -      kvm_x86_ops.hardware_enable = NULL;
 +      kvm_x86_ops.enable_virtualization_cpu = NULL;
        kvm_x86_call(hardware_unsetup)();
  out_mmu_exit:
        kvm_mmu_vendor_module_exit();
@@@ -9830,27 -9902,72 +9832,27 @@@ void kvm_x86_vendor_exit(void
  
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 -                                          CPUFREQ_TRANSITION_NOTIFIER);
 -              cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
 -      }
 -#ifdef CONFIG_X86_64
 -      pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
 -      irq_work_sync(&pvclock_irq_work);
 -      cancel_work_sync(&pvclock_gtod_work);
 -#endif
 -      kvm_x86_call(hardware_unsetup)();
 -      kvm_mmu_vendor_module_exit();
 -      free_percpu(user_return_msrs);
 -      kmem_cache_destroy(x86_emulator_cache);
 -#ifdef CONFIG_KVM_XEN
 -      static_key_deferred_flush(&kvm_xen_enabled);
 -      WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 -#endif
 -      mutex_lock(&vendor_module_lock);
 -      kvm_x86_ops.hardware_enable = NULL;
 -      mutex_unlock(&vendor_module_lock);
 -}
 -EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
 -
 -static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
 -{
 -      /*
 -       * The vCPU has halted, e.g. executed HLT.  Update the run state if the
 -       * local APIC is in-kernel, the run loop will detect the non-runnable
 -       * state and halt the vCPU.  Exit to userspace if the local APIC is
 -       * managed by userspace, in which case userspace is responsible for
 -       * handling wake events.
 -       */
 -      ++vcpu->stat.halt_exits;
 -      if (lapic_in_kernel(vcpu)) {
 -              vcpu->arch.mp_state = state;
 -              return 1;
 -      } else {
 -              vcpu->run->exit_reason = reason;
 -              return 0;
 -      }
 -}
 -
 -int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
 -{
 -      return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
 -}
 -EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
 -
 -int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 -{
 -      int ret = kvm_skip_emulated_instruction(vcpu);
 -      /*
 -       * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
 -       * KVM_EXIT_DEBUG here.
 -       */
 -      return kvm_emulate_halt_noskip(vcpu) && ret;
 -}
 -EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 -
 -int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
 -{
 -      int ret = kvm_skip_emulated_instruction(vcpu);
 -
 -      return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
 -                                      KVM_EXIT_AP_RESET_HOLD) && ret;
 +                                          CPUFREQ_TRANSITION_NOTIFIER);
 +              cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
 +      }
 +#ifdef CONFIG_X86_64
 +      pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
 +      irq_work_sync(&pvclock_irq_work);
 +      cancel_work_sync(&pvclock_gtod_work);
 +#endif
 +      kvm_x86_call(hardware_unsetup)();
 +      kvm_mmu_vendor_module_exit();
 +      free_percpu(user_return_msrs);
 +      kmem_cache_destroy(x86_emulator_cache);
 +#ifdef CONFIG_KVM_XEN
 +      static_key_deferred_flush(&kvm_xen_enabled);
 +      WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 +#endif
 +      mutex_lock(&vendor_module_lock);
 +      kvm_x86_ops.enable_virtualization_cpu = NULL;
 +      mutex_unlock(&vendor_module_lock);
  }
 -EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
 +EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
  
  #ifdef CONFIG_X86_64
  static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
@@@ -11088,9 -11205,6 +11090,9 @@@ static int vcpu_enter_guest(struct kvm_
        if (vcpu->arch.apic_attention)
                kvm_lapic_sync_from_vapic(vcpu);
  
 +      if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
 +              return 0;
 +
        r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
        return r;
  
        return r;
  }
  
 +static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 +{
 +      return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 +              !vcpu->arch.apf.halted);
 +}
 +
 +static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 +{
 +      if (!list_empty_careful(&vcpu->async_pf.done))
 +              return true;
 +
 +      if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
 +          kvm_apic_init_sipi_allowed(vcpu))
 +              return true;
 +
 +      if (vcpu->arch.pv.pv_unhalted)
 +              return true;
 +
 +      if (kvm_is_exception_pending(vcpu))
 +              return true;
 +
 +      if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 +          (vcpu->arch.nmi_pending &&
 +           kvm_x86_call(nmi_allowed)(vcpu, false)))
 +              return true;
 +
 +#ifdef CONFIG_KVM_SMM
 +      if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
 +          (vcpu->arch.smi_pending &&
 +           kvm_x86_call(smi_allowed)(vcpu, false)))
 +              return true;
 +#endif
 +
 +      if (kvm_test_request(KVM_REQ_PMI, vcpu))
 +              return true;
 +
 +      if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
 +              return true;
 +
 +      if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
 +              return true;
 +
 +      if (kvm_hv_has_stimer_pending(vcpu))
 +              return true;
 +
 +      if (is_guest_mode(vcpu) &&
 +          kvm_x86_ops.nested_ops->has_events &&
 +          kvm_x86_ops.nested_ops->has_events(vcpu, false))
 +              return true;
 +
 +      if (kvm_xen_has_pending_events(vcpu))
 +              return true;
 +
 +      return false;
 +}
 +
 +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 +{
 +      return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
 +}
 +
  /* Called within kvm->srcu read side.  */
  static inline int vcpu_block(struct kvm_vcpu *vcpu)
  {
        return 1;
  }
  
 -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 -{
 -      return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 -              !vcpu->arch.apf.halted);
 -}
 -
  /* Called within kvm->srcu read side.  */
  static int vcpu_run(struct kvm_vcpu *vcpu)
  {
        return r;
  }
  
 +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
 +{
 +      /*
 +       * The vCPU has halted, e.g. executed HLT.  Update the run state if the
 +       * local APIC is in-kernel, the run loop will detect the non-runnable
 +       * state and halt the vCPU.  Exit to userspace if the local APIC is
 +       * managed by userspace, in which case userspace is responsible for
 +       * handling wake events.
 +       */
 +      ++vcpu->stat.halt_exits;
 +      if (lapic_in_kernel(vcpu)) {
 +              if (kvm_vcpu_has_events(vcpu))
 +                      vcpu->arch.pv.pv_unhalted = false;
 +              else
 +                      vcpu->arch.mp_state = state;
 +              return 1;
 +      } else {
 +              vcpu->run->exit_reason = reason;
 +              return 0;
 +      }
 +}
 +
 +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
 +{
 +      return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
 +}
 +EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
 +
 +int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 +{
 +      int ret = kvm_skip_emulated_instruction(vcpu);
 +      /*
 +       * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
 +       * KVM_EXIT_DEBUG here.
 +       */
 +      return kvm_emulate_halt_noskip(vcpu) && ret;
 +}
 +EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 +
 +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
 +{
 +      int ret;
 +
 +      kvm_vcpu_srcu_read_lock(vcpu);
 +      ret = kvm_emulate_halt(vcpu);
 +      kvm_vcpu_srcu_read_unlock(vcpu);
 +
 +      if (!ret)
 +              return EXIT_FASTPATH_EXIT_USERSPACE;
 +
 +      if (kvm_vcpu_running(vcpu))
 +              return EXIT_FASTPATH_REENTER_GUEST;
 +
 +      return EXIT_FASTPATH_EXIT_HANDLED;
 +}
 +EXPORT_SYMBOL_GPL(handle_fastpath_hlt);
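handle_fastpath_hlt() above collapses to three outcomes: bounce to userspace when kvm_emulate_halt() could not finish the exit in the kernel, re-enter the guest when an already-pending event kept the vCPU runnable, and otherwise report the exit as handled so the run loop can block. A stand-alone decision sketch of that mapping (illustrative enum and helper names, not KVM code):

    #include <stdbool.h>
    #include <stdio.h>

    enum fastpath {
    	FASTPATH_EXIT_USERSPACE,	/* halt must be completed by userspace */
    	FASTPATH_REENTER_GUEST,		/* vCPU stayed runnable, go straight back in */
    	FASTPATH_EXIT_HANDLED,		/* genuinely halted, let the run loop block */
    };

    static enum fastpath hlt_fastpath(bool halt_handled_in_kernel, bool still_runnable)
    {
    	if (!halt_handled_in_kernel)
    		return FASTPATH_EXIT_USERSPACE;
    	if (still_runnable)
    		return FASTPATH_REENTER_GUEST;
    	return FASTPATH_EXIT_HANDLED;
    }

    int main(void)
    {
    	printf("%d %d %d\n",
    	       hlt_fastpath(false, false),	/* -> EXIT_USERSPACE */
    	       hlt_fastpath(true, true),	/* -> REENTER_GUEST */
    	       hlt_fastpath(true, false));	/* -> EXIT_HANDLED */
    	return 0;
    }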
 +
 +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
 +{
 +      int ret = kvm_skip_emulated_instruction(vcpu);
 +
 +      return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
 +                                      KVM_EXIT_AP_RESET_HOLD) && ret;
 +}
 +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
 +
 +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 +{
 +      return kvm_vcpu_apicv_active(vcpu) &&
 +             kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
 +}
 +
 +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
 +{
 +      return vcpu->arch.preempted_in_kernel;
 +}
 +
 +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 +{
 +      if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
 +              return true;
 +
 +      if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 +#ifdef CONFIG_KVM_SMM
 +              kvm_test_request(KVM_REQ_SMI, vcpu) ||
 +#endif
 +               kvm_test_request(KVM_REQ_EVENT, vcpu))
 +              return true;
 +
 +      return kvm_arch_dy_has_pending_interrupt(vcpu);
 +}
 +
  static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
  {
        return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
@@@ -12295,6 -12262,8 +12297,6 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
        vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
  
 -      vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
 -
        kvm_async_pf_hash_reset(vcpu);
  
        vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
@@@ -12460,8 -12429,6 +12462,8 @@@ void kvm_vcpu_reset(struct kvm_vcpu *vc
        if (!init_event) {
                vcpu->arch.smbase = 0x30000;
  
 +              vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
 +
                vcpu->arch.msr_misc_features_enables = 0;
                vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
                                                  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
@@@ -12547,17 -12514,7 +12549,17 @@@ void kvm_vcpu_deliver_sipi_vector(struc
  }
  EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
  
 -int kvm_arch_hardware_enable(void)
 +void kvm_arch_enable_virtualization(void)
 +{
 +      cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
 +}
 +
 +void kvm_arch_disable_virtualization(void)
 +{
 +      cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
 +}
 +
 +int kvm_arch_enable_virtualization_cpu(void)
  {
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        if (ret)
                return ret;
  
 -      ret = kvm_x86_call(hardware_enable)();
 +      ret = kvm_x86_call(enable_virtualization_cpu)();
        if (ret != 0)
                return ret;
  
        return 0;
  }
  
 -void kvm_arch_hardware_disable(void)
 +void kvm_arch_disable_virtualization_cpu(void)
  {
 -      kvm_x86_call(hardware_disable)();
 +      kvm_x86_call(disable_virtualization_cpu)();
        drop_user_return_notifiers();
  }
  
@@@ -13203,6 -13160,87 +13205,6 @@@ void kvm_arch_commit_memory_region(stru
                kvm_arch_free_memslot(kvm, old);
  }
  
 -static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 -{
 -      if (!list_empty_careful(&vcpu->async_pf.done))
 -              return true;
 -
 -      if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
 -          kvm_apic_init_sipi_allowed(vcpu))
 -              return true;
 -
 -      if (vcpu->arch.pv.pv_unhalted)
 -              return true;
 -
 -      if (kvm_is_exception_pending(vcpu))
 -              return true;
 -
 -      if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 -          (vcpu->arch.nmi_pending &&
 -           kvm_x86_call(nmi_allowed)(vcpu, false)))
 -              return true;
 -
 -#ifdef CONFIG_KVM_SMM
 -      if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
 -          (vcpu->arch.smi_pending &&
 -           kvm_x86_call(smi_allowed)(vcpu, false)))
 -              return true;
 -#endif
 -
 -      if (kvm_test_request(KVM_REQ_PMI, vcpu))
 -              return true;
 -
 -      if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
 -              return true;
 -
 -      if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
 -              return true;
 -
 -      if (kvm_hv_has_stimer_pending(vcpu))
 -              return true;
 -
 -      if (is_guest_mode(vcpu) &&
 -          kvm_x86_ops.nested_ops->has_events &&
 -          kvm_x86_ops.nested_ops->has_events(vcpu, false))
 -              return true;
 -
 -      if (kvm_xen_has_pending_events(vcpu))
 -              return true;
 -
 -      return false;
 -}
 -
 -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 -{
 -      return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
 -}
 -
 -bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 -{
 -      return kvm_vcpu_apicv_active(vcpu) &&
 -             kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
 -}
 -
 -bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
 -{
 -      return vcpu->arch.preempted_in_kernel;
 -}
 -
 -bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 -{
 -      if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
 -              return true;
 -
 -      if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 -#ifdef CONFIG_KVM_SMM
 -              kvm_test_request(KVM_REQ_SMI, vcpu) ||
 -#endif
 -               kvm_test_request(KVM_REQ_EVENT, vcpu))
 -              return true;
 -
 -      return kvm_arch_dy_has_pending_interrupt(vcpu);
 -}
 -
  bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
  {
        if (vcpu->arch.guest_state_protected)