Merge tag 'kvm-x86-vmx-6.12' of https://github.com/kvm-x86/linux into HEAD
author Paolo Bonzini <[email protected]>
Sat, 14 Sep 2024 13:56:06 +0000 (09:56 -0400)
committer Paolo Bonzini <[email protected]>
Tue, 17 Sep 2024 16:41:23 +0000 (12:41 -0400)
KVM VMX changes for 6.12:

 - Set FINAL/PAGE in the page fault error code for EPT Violations if and only
   if the GVA is valid.  If the GVA is NOT valid, there is no guest-side page
   table walk and so stuffing paging-related metadata is nonsensical (see the
   sketch of the resulting error-code logic just after this list).

 - Fix a bug where KVM would incorrectly synthesize a nested VM-Exit instead of
   emulating posted interrupt delivery to L2.

 - Add a lockdep assertion to detect unsafe accesses of vmcs12 structures.

 - Harden eVMCS loading against an impossible NULL pointer deref (really truly
   should be impossible).

 - Minor SGX fix and a cleanup.
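
For reference, a minimal standalone sketch of the error-code derivation described
in the first item is below.  The EPT_VIOLATION_* and PFERR_* values mirror the
KVM/VMX header definitions as I read them and are illustrative only; the helper
and main() are demonstration code, not KVM's.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative copies of the relevant bit definitions (assumed values). */
    #define EPT_VIOLATION_GVA_IS_VALID    (1ULL << 7)  /* exit qual: a GVA was involved    */
    #define EPT_VIOLATION_GVA_TRANSLATED  (1ULL << 8)  /* fault hit the final GPA, not walk */
    #define PFERR_PRESENT_MASK            (1ULL << 0)
    #define PFERR_GUEST_FINAL_MASK        (1ULL << 32)
    #define PFERR_GUEST_PAGE_MASK         (1ULL << 33)

    /* Stuff FINAL/PAGE into the error code only when the GVA is actually valid. */
    static uint64_t ept_violation_to_pferr(uint64_t exit_qual, uint64_t error_code)
    {
            if (exit_qual & EPT_VIOLATION_GVA_IS_VALID)
                    error_code |= (exit_qual & EPT_VIOLATION_GVA_TRANSLATED) ?
                                  PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
            return error_code;
    }

    int main(void)
    {
            /* No valid GVA: only the permission-derived bits survive. */
            printf("0x%llx\n",
                   (unsigned long long)ept_violation_to_pferr(0, PFERR_PRESENT_MASK));
            /* Valid GVA, fault on the final translation: FINAL is set as well. */
            printf("0x%llx\n",
                   (unsigned long long)ept_violation_to_pferr(EPT_VIOLATION_GVA_IS_VALID |
                                                              EPT_VIOLATION_GVA_TRANSLATED,
                                                              PFERR_PRESENT_MASK));
            return 0;
    }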

Documentation/virt/kvm/api.rst
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/nested.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c

index b4d1cf2e4628855d4c76219fd71cc1f4f985a865,a4b7dc4a9ddaa63c4302f8721473493c34fed1a3..e32471977d0a23933698952ae3d9057fa6aeea3b
@@@ -4214,7 -4214,9 +4214,9 @@@ whether or not KVM_CAP_X86_USER_SPACE_M
  enabled.  If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace
  on denied accesses, i.e. userspace effectively intercepts the MSR access.  If
  KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest
- on denied accesses.
+ on denied accesses.  Note, if an MSR access is denied during emulation of MSR
+ load/stores during VMX transitions, KVM ignores KVM_MSR_EXIT_REASON_FILTER.
+ See the below warning for full details.
  
  If an MSR access is allowed by userspace, KVM will emulate and/or virtualize
  the access in accordance with the vCPU model.  Note, KVM may still ultimately
@@@ -4229,9 -4231,22 +4231,22 @@@ filtering. In that mode, ``KVM_MSR_FILT
  an error.
  
  .. warning::
-    MSR accesses as part of nested VM-Enter/VM-Exit are not filtered.
-    This includes both writes to individual VMCS fields and reads/writes
-    through the MSR lists pointed to by the VMCS.
+    MSR accesses that are side effects of instruction execution (emulated or
+    native) are not filtered as hardware does not honor MSR bitmaps outside of
+    RDMSR and WRMSR, and KVM mimics that behavior when emulating instructions
+    to avoid pointless divergence from hardware.  E.g. RDPID reads MSR_TSC_AUX,
+    SYSENTER reads the SYSENTER MSRs, etc.
+    MSRs that are loaded/stored via dedicated VMCS fields are not filtered as
+    part of VM-Enter/VM-Exit emulation.
+    MSRs that are loaded/stored via VMX's load/store lists _are_ filtered as part
+    of VM-Enter/VM-Exit emulation.  If an MSR access is denied on VM-Enter, KVM
+    synthesizes a consistency check VM-Exit (EXIT_REASON_MSR_LOAD_FAIL).  If an
+    MSR access is denied on VM-Exit, KVM synthesizes a VM-Abort.  In short, KVM
+    extends Intel's architectural list of MSRs that cannot be loaded/saved via
+    the VM-Enter/VM-Exit MSR list.  It is the platform owner's responsibility
+    to communicate any such restrictions to their end users.
  
     x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that
     cover any x2APIC MSRs).
@@@ -8082,14 -8097,6 +8097,14 @@@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By 
                                      guest CPUID on writes to MISC_ENABLE if
                                      KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
                                      disabled.
 +
 +KVM_X86_QUIRK_SLOT_ZAP_ALL          By default, KVM invalidates all SPTEs in
 +                                    a fast way for memslot deletion when VM type
 +                                    is KVM_X86_DEFAULT_VM.
 +                                    When this quirk is disabled or when VM type
 +                                    is other than KVM_X86_DEFAULT_VM, KVM zaps
 +                                    only leaf SPTEs that are within the range of
 +                                    the memslot being deleted.
  =================================== ============================================
  
  7.32 KVM_CAP_MAX_VCPU_ID
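
The warning above is backed by the nested VMX changes later in this diff, which
switch the MSR load/store list walkers from kvm_get_msr()/kvm_set_msr() to the
new *_with_filter() variants.  A toy model of the documented outcomes follows;
all names and types here are illustrative, not KVM's.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct msr_entry {
            uint32_t index;
            uint64_t value;
    };

    enum nested_msr_result {
            MSR_LIST_OK,
            MSR_LIST_VMENTER_CONSISTENCY_FAIL,  /* -> synthesized EXIT_REASON_MSR_LOAD_FAIL */
            MSR_LIST_VMEXIT_ABORT,              /* -> synthesized VM-Abort                  */
    };

    typedef bool (*msr_allowed_fn)(uint32_t index);

    /* Walk a VM-Enter/VM-Exit MSR list and apply the userspace filter per entry. */
    static enum nested_msr_result
    process_msr_list(const struct msr_entry *list, size_t n, bool vmenter,
                     msr_allowed_fn allowed)
    {
            for (size_t i = 0; i < n; i++) {
                    if (!allowed(list[i].index))
                            return vmenter ? MSR_LIST_VMENTER_CONSISTENCY_FAIL
                                           : MSR_LIST_VMEXIT_ABORT;
            }
            return MSR_LIST_OK;
    }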
index 5f794814226fd98b0492ec313c62f36a49232d08,aa31c4b94977469af40351a94252f1030befcfaa..6d9f763a7bb9d5db422ea5625b2c28420bd14f26
@@@ -36,7 -36,6 +36,7 @@@
  #include <asm/kvm_page_track.h>
  #include <asm/kvm_vcpu_regs.h>
  #include <asm/hyperv-tlfs.h>
 +#include <asm/reboot.h>
  
  #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
  
@@@ -212,7 -211,6 +212,7 @@@ enum exit_fastpath_completion 
        EXIT_FASTPATH_NONE,
        EXIT_FASTPATH_REENTER_GUEST,
        EXIT_FASTPATH_EXIT_HANDLED,
 +      EXIT_FASTPATH_EXIT_USERSPACE,
  };
  typedef enum exit_fastpath_completion fastpath_t;
  
@@@ -282,6 -280,10 +282,6 @@@ enum x86_intercept_stage
  #define PFERR_PRIVATE_ACCESS   BIT_ULL(49)
  #define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
  
 -#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK |      \
 -                               PFERR_WRITE_MASK |             \
 -                               PFERR_PRESENT_MASK)
 -
  /* apic attention bits */
  #define KVM_APIC_CHECK_VAPIC  0
  /*
@@@ -1627,10 -1629,8 +1627,10 @@@ struct kvm_x86_ops 
  
        int (*check_processor_compatibility)(void);
  
 -      int (*hardware_enable)(void);
 -      void (*hardware_disable)(void);
 +      int (*enable_virtualization_cpu)(void);
 +      void (*disable_virtualization_cpu)(void);
 +      cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
 +
        void (*hardware_unsetup)(void);
        bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
        void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
        void (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 +
 +      const bool x2apic_icr_is_split;
        const unsigned long required_apicv_inhibits;
        bool allow_apicv_in_x2apic_without_x2apic_virtualization;
        void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
        int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
        void (*guest_memory_reclaimed)(struct kvm *kvm);
  
 -      int (*get_msr_feature)(struct kvm_msr_entry *entry);
 +      int (*get_feature_msr)(u32 msr, u64 *data);
  
        int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
                                         void *insn, int insn_len);
@@@ -2062,6 -2060,8 +2062,8 @@@ void kvm_prepare_emulation_failure_exit
  
  void kvm_enable_efer_bits(u64);
  bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+ int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+ int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data);
  int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated);
  int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
  int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
@@@ -2138,15 -2138,7 +2140,15 @@@ int kvm_get_nr_pending_nmis(struct kvm_
  
  void kvm_update_dr7(struct kvm_vcpu *vcpu);
  
 -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 +                                     bool always_retry);
 +
 +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
 +                                                 gpa_t cr2_or_gpa)
 +{
 +      return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
 +}
 +
  void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
                        ulong roots_to_free);
  void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
@@@ -2264,6 -2256,7 +2266,7 @@@ int kvm_cpu_has_injectable_intr(struct 
  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
  int kvm_cpu_has_extint(struct kvm_vcpu *v);
  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+ int kvm_cpu_get_extint(struct kvm_vcpu *v);
  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
  
@@@ -2355,8 -2348,7 +2358,8 @@@ int memslot_rmap_alloc(struct kvm_memor
         KVM_X86_QUIRK_OUT_7E_INC_RIP |         \
         KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT |   \
         KVM_X86_QUIRK_FIX_HYPERCALL_INSN |     \
 -       KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
 +       KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS |  \
 +       KVM_X86_QUIRK_SLOT_ZAP_ALL)
  
  /*
   * KVM previously used a u32 field in kvm_run to indicate the hypercall was
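
One practical effect of the kvm_x86_ops changes above is that the
emergency-disable hook is now a member of the ops table rather than a callback
each vendor module registers and unregisters by hand.  A toy sketch of that
shape (names are illustrative, not KVM's):

    struct toy_virt_ops {
            int  (*enable_virtualization_cpu)(void);
            void (*disable_virtualization_cpu)(void);
            void (*emergency_disable_virtualization_cpu)(void);
    };

    static void toy_emergency_reboot(const struct toy_virt_ops *ops)
    {
            /* Common code invokes the vendor callback through the table; the
             * vendor module no longer registers it during init/exit. */
            if (ops->emergency_disable_virtualization_cpu)
                    ops->emergency_disable_virtualization_cpu();
    }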
diff --combined arch/x86/kvm/lapic.c
index c7180cb5f4640dd7cb42c41edc54fefb9fcff882,63e67b6301ec6bc5ba833fedaef798e2085b2fb6..2098dc689088bbd4d53353d68fe6e0fb8a6b47a4
@@@ -1944,7 -1944,7 +1944,7 @@@ static void start_sw_tscdeadline(struc
        u64 ns = 0;
        ktime_t expire;
        struct kvm_vcpu *vcpu = apic->vcpu;
 -      unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 +      u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
        unsigned long flags;
        ktime_t now;
  
@@@ -2453,43 -2453,6 +2453,43 @@@ void kvm_lapic_set_eoi(struct kvm_vcpu 
  }
  EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
  
 +#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
 +
 +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 +{
 +      if (data & X2APIC_ICR_RESERVED_BITS)
 +              return 1;
 +
 +      /*
 +       * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
 +       * only AMD requires it to be zero, Intel essentially just ignores the
 +       * bit.  And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
 +       * the CPU performs the reserved bits checks, i.e. the underlying CPU
 +       * behavior will "win".  Arbitrarily clear the BUSY bit, as there is no
 +       * sane way to provide consistent behavior with respect to hardware.
 +       */
 +      data &= ~APIC_ICR_BUSY;
 +
 +      kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
 +      if (kvm_x86_ops.x2apic_icr_is_split) {
 +              kvm_lapic_set_reg(apic, APIC_ICR, data);
 +              kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
 +      } else {
 +              kvm_lapic_set_reg64(apic, APIC_ICR, data);
 +      }
 +      trace_kvm_apic_write(APIC_ICR, data);
 +      return 0;
 +}
 +
 +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
 +{
 +      if (kvm_x86_ops.x2apic_icr_is_split)
 +              return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
 +                     (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
 +
 +      return kvm_lapic_get_reg64(apic, APIC_ICR);
 +}
 +
  /* emulate APIC access in a trap manner */
  void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
  {
         * maybe-unnecessary write, and both are in the noise anyways.
         */
        if (apic_x2apic_mode(apic) && offset == APIC_ICR)
 -              kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR));
 +              WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
        else
                kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
  }
@@@ -2959,14 -2922,13 +2959,13 @@@ void kvm_inject_apic_timer_irqs(struct 
        }
  }
  
- int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+ void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
  {
-       int vector = kvm_apic_has_interrupt(vcpu);
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 ppr;
  
-       if (vector == -1)
-               return -1;
+       if (WARN_ON_ONCE(vector < 0 || !apic))
+               return;
  
        /*
         * We get here even with APIC virtualization enabled, if doing
                __apic_update_ppr(apic, &ppr);
        }
  
-       return vector;
  }
+ EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt);
  
  static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
                struct kvm_lapic_state *s, bool set)
  
                /*
                 * In x2APIC mode, the LDR is fixed and based on the id.  And
 -               * ICR is internally a single 64-bit register, but needs to be
 -               * split to ICR+ICR2 in userspace for backwards compatibility.
 +               * if the ICR is _not_ split, ICR is internally a single 64-bit
 +               * register, but needs to be split to ICR+ICR2 in userspace for
 +               * backwards compatibility.
                 */
 -              if (set) {
 +              if (set)
                        *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
  
 -                      icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
 -                            (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
 -                      __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
 -              } else {
 -                      icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
 -                      __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
 +              if (!kvm_x86_ops.x2apic_icr_is_split) {
 +                      if (set) {
 +                              icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
 +                                    (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
 +                              __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
 +                      } else {
 +                              icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
 +                              __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
 +                      }
                }
        }
  
@@@ -3235,12 -3193,22 +3234,12 @@@ int kvm_lapic_set_vapic_addr(struct kvm
        return 0;
  }
  
 -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 -{
 -      data &= ~APIC_ICR_BUSY;
 -
 -      kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
 -      kvm_lapic_set_reg64(apic, APIC_ICR, data);
 -      trace_kvm_apic_write(APIC_ICR, data);
 -      return 0;
 -}
 -
  static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
  {
        u32 low;
  
        if (reg == APIC_ICR) {
 -              *data = kvm_lapic_get_reg64(apic, APIC_ICR);
 +              *data = kvm_x2apic_icr_read(apic);
                return 0;
        }
  
diff --combined arch/x86/kvm/lapic.h
index 7c95eedd771e75c1371beaf97ad5b0564d593ca1,8310ff74be29768ee9af77edb1c9238ac0e5a90f..1b8ef9856422a45d246cfd60f05de6dc70ec88b3
@@@ -88,14 -88,15 +88,14 @@@ int kvm_create_lapic(struct kvm_vcpu *v
  void kvm_free_lapic(struct kvm_vcpu *vcpu);
  
  int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+ void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector);
  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
- int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
  int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
  void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
  void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
  void kvm_recalculate_apic_map(struct kvm *kvm);
  void kvm_apic_set_version(struct kvm_vcpu *vcpu);
  void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
index 97ecc2722c8fd19ceaf8102f21781fcf265f6313,fc3d2ba036f6002a592178339b63b26998cb9935..a8e7bc04d9bf365277332f0174b617da5747da45
@@@ -981,7 -981,7 +981,7 @@@ static u32 nested_vmx_load_msr(struct k
                                __func__, i, e.index, e.reserved);
                        goto fail;
                }
-               if (kvm_set_msr(vcpu, e.index, e.value)) {
+               if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
                        pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, e.value);
@@@ -1017,7 -1017,7 +1017,7 @@@ static bool nested_vmx_get_vmexit_msr_v
                }
        }
  
-       if (kvm_get_msr(vcpu, msr_index, data)) {
+       if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
                pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
                        msr_index);
                return false;
@@@ -1112,9 -1112,9 +1112,9 @@@ static void prepare_vmx_msr_autostore_l
                        /*
                         * Emulated VMEntry does not fail here.  Instead a less
                         * accurate value will be returned by
-                        * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
-                        * instead of reading the value from the vmcs02 VMExit
-                        * MSR-store area.
+                        * nested_vmx_get_vmexit_msr_value() by reading KVM's
+                        * internal MSR state instead of reading the value from
+                        * the vmcs02 VMExit MSR-store area.
                         */
                        pr_warn_ratelimited(
                                "Not enough msr entries in msr_autostore.  Can't add msr %x\n",
@@@ -1251,32 -1251,21 +1251,32 @@@ static bool is_bitwise_subset(u64 super
  
  static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
  {
 -      const u64 feature_and_reserved =
 -              /* feature (except bit 48; see below) */
 -              BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
 -              /* reserved */
 -              BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
 +      const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
 +                               VMX_BASIC_INOUT |
 +                               VMX_BASIC_TRUE_CTLS;
 +
 +      const u64 reserved_bits = GENMASK_ULL(63, 56) |
 +                                GENMASK_ULL(47, 45) |
 +                                BIT_ULL(31);
 +
        u64 vmx_basic = vmcs_config.nested.basic;
  
 -      if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
 +      BUILD_BUG_ON(feature_bits & reserved_bits);
 +
 +      /*
 +       * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
 +       * inverted polarity), the incoming value must not set feature bits or
 +       * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
 +       * multi-bit values, are explicitly checked below.
 +       */
 +      if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
                return -EINVAL;
  
        /*
         * KVM does not emulate a version of VMX that constrains physical
         * addresses of VMX structures (e.g. VMCS) to 32-bits.
         */
 -      if (data & BIT_ULL(48))
 +      if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
                return -EINVAL;
  
        if (vmx_basic_vmcs_revision_id(vmx_basic) !=
@@@ -1345,29 -1334,16 +1345,29 @@@ vmx_restore_control_msr(struct vcpu_vm
  
  static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
  {
 -      const u64 feature_and_reserved_bits =
 -              /* feature */
 -              BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
 -              BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
 -              /* reserved */
 -              GENMASK_ULL(13, 9) | BIT_ULL(31);
 +      const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
 +                               VMX_MISC_ACTIVITY_HLT |
 +                               VMX_MISC_ACTIVITY_SHUTDOWN |
 +                               VMX_MISC_ACTIVITY_WAIT_SIPI |
 +                               VMX_MISC_INTEL_PT |
 +                               VMX_MISC_RDMSR_IN_SMM |
 +                               VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
 +                               VMX_MISC_VMXOFF_BLOCK_SMI |
 +                               VMX_MISC_ZERO_LEN_INS;
 +
 +      const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
 +
        u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
                                       vmcs_config.nested.misc_high);
  
 -      if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
 +      BUILD_BUG_ON(feature_bits & reserved_bits);
 +
 +      /*
 +       * The incoming value must not set feature bits or reserved bits that
 +       * aren't allowed/supported by KVM.  Fields, i.e. multi-bit values, are
 +       * explicitly checked below.
 +       */
 +      if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
                return -EINVAL;
  
        if ((vmx->nested.msrs.pinbased_ctls_high &
@@@ -2341,10 -2317,12 +2341,12 @@@ static void prepare_vmcs02_early(struc
  
        /* Posted interrupts setting is only taken from vmcs12.  */
        vmx->nested.pi_pending = false;
-       if (nested_cpu_has_posted_intr(vmcs12))
+       if (nested_cpu_has_posted_intr(vmcs12)) {
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
-       else
+       } else {
+               vmx->nested.posted_intr_nv = -1;
                exec_control &= ~PIN_BASED_POSTED_INTR;
+       }
        pin_controls_set(vmx, exec_control);
  
        /*
@@@ -2494,6 -2472,7 +2496,7 @@@ static void prepare_vmcs02_rare(struct 
  
        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
                vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
                vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
                vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
                vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
                vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
  
-               vmx->segment_cache.bitmask = 0;
+               vmx_segment_cache_clear(vmx);
        }
  
        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@@ -4308,11 -4287,52 +4311,52 @@@ static int vmx_check_nested_events(stru
        }
  
        if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+               int irq;
                if (block_nested_events)
                        return -EBUSY;
                if (!nested_exit_on_intr(vcpu))
                        goto no_vmexit;
-               nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+               if (!nested_exit_intr_ack_set(vcpu)) {
+                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+                       return 0;
+               }
+               irq = kvm_cpu_get_extint(vcpu);
+               if (irq != -1) {
+                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+                                         INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+                       return 0;
+               }
+               irq = kvm_apic_has_interrupt(vcpu);
+               if (WARN_ON_ONCE(irq < 0))
+                       goto no_vmexit;
+               /*
+                * If the IRQ is L2's PI notification vector, process posted
+                * interrupts for L2 instead of injecting VM-Exit, as the
+                * detection/morphing architecturally occurs when the IRQ is
+                * delivered to the CPU.  Note, only interrupts that are routed
+                * through the local APIC trigger posted interrupt processing,
+                * and enabling posted interrupts requires ACK-on-exit.
+                */
+               if (irq == vmx->nested.posted_intr_nv) {
+                       vmx->nested.pi_pending = true;
+                       kvm_apic_clear_irr(vcpu, irq);
+                       goto no_vmexit;
+               }
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+                                 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+               /*
+                * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+                * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+                * if APICv is active.
+                */
+               kvm_apic_ack_interrupt(vcpu, irq);
                return 0;
        }
  
@@@ -4830,7 -4850,7 +4874,7 @@@ static void nested_vmx_restore_host_sta
                                goto vmabort;
                        }
  
-                       if (kvm_set_msr(vcpu, h.index, h.value)) {
+                       if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
                                pr_debug_ratelimited(
                                        "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
                                        __func__, j, h.index, h.value);
@@@ -4993,14 -5013,6 +5037,6 @@@ void nested_vmx_vmexit(struct kvm_vcpu 
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  
        if (likely(!vmx->fail)) {
-               if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
-                   nested_exit_intr_ack_set(vcpu)) {
-                       int irq = kvm_cpu_get_interrupt(vcpu);
-                       WARN_ON(irq < 0);
-                       vmcs12->vm_exit_intr_info = irq |
-                               INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
-               }
                if (vm_exit_reason != -1)
                        trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
                                                       vmcs12->exit_qualification,
@@@ -7075,7 -7087,7 +7111,7 @@@ static void nested_vmx_setup_misc_data(
  {
        msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
        msrs->misc_low |=
 -              MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
 +              VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
                VMX_MISC_ACTIVITY_HLT |
                VMX_MISC_ACTIVITY_WAIT_SIPI;
@@@ -7090,10 -7102,12 +7126,10 @@@ static void nested_vmx_setup_basic(stru
         * guest, and the VMCS structure we give it - not about the
         * VMX support of the underlying hardware.
         */
 -      msrs->basic =
 -              VMCS12_REVISION |
 -              VMX_BASIC_TRUE_CTLS |
 -              ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
 -              (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
 +      msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
 +                                               X86_MEMTYPE_WB);
  
 +      msrs->basic |= VMX_BASIC_TRUE_CTLS;
        if (cpu_has_vmx_basic_inout())
                msrs->basic |= VMX_BASIC_INOUT;
  }
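
The check_nested_events() hunk above moves the ack-on-exit handling, and the
posted-interrupt special case, to the point where the external-interrupt
VM-Exit is synthesized.  A toy decision function capturing the same routing
(names and types are illustrative, not KVM's):

    #include <stdbool.h>

    enum l2_irq_action {
            VMEXIT_NO_INTR_INFO,        /* L1 did not request ack-on-exit                  */
            VMEXIT_WITH_VECTOR,         /* vector acked and stuffed into vm_exit_intr_info */
            PROCESS_POSTED_INTR_FOR_L2, /* IRQ is L2's notification vector: no VM-Exit     */
    };

    struct pending_irq {
            int  vector;
            bool from_pic;              /* ExtINT from the PIC rather than the local APIC  */
    };

    static enum l2_irq_action
    route_irq_while_l2_active(struct pending_irq irq, bool ack_on_exit,
                              int l2_pi_notification_vector)
    {
            if (!ack_on_exit)
                    return VMEXIT_NO_INTR_INFO;
            /* Only APIC-routed interrupts can trigger posted-interrupt processing. */
            if (!irq.from_pic && irq.vector == l2_pi_notification_vector)
                    return PROCESS_POSTED_INTR_FOR_L2;
            return VMEXIT_WITH_VECTOR;
    }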
index 0782fe599757aeb73c0f749e01e431038b3a162f,668b6c83a373c312957109d5f4953b454c42ffff..2c296b6abb8ccf0ed56459d479fd8eac281b06fe
@@@ -39,11 -39,17 +39,17 @@@ bool nested_vmx_check_io_bitmaps(struc
  
  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
  {
+       lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+                           !refcount_read(&vcpu->kvm->users_count));
        return to_vmx(vcpu)->nested.cached_vmcs12;
  }
  
  static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
  {
+       lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+                           !refcount_read(&vcpu->kvm->users_count));
        return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
  }
  
@@@ -109,7 -115,7 +115,7 @@@ static inline unsigned nested_cpu_vmx_m
  static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
  {
        return to_vmx(vcpu)->nested.msrs.misc_low &
 -              MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
 +              VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
  }
  
  static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
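
A userspace analogue of the vmcs12 accessor guard added above, assuming a plain
flag stands in for lockdep's ownership tracking; everything here is
illustrative, not kernel code.

    #include <assert.h>
    #include <stdatomic.h>

    struct toy_vcpu {
            int         mutex_held;     /* toy stand-in for lockdep_is_held(&vcpu->mutex)   */
            atomic_int  vm_users_count; /* drops to zero only once the VM is being destroyed */
            void       *cached_vmcs12;
    };

    static void *toy_get_vmcs12(struct toy_vcpu *vcpu)
    {
            /* vmcs12 is allocated and freed under vcpu->mutex, so dereferencing the
             * pointer is safe only while holding the mutex or during VM teardown. */
            assert(vcpu->mutex_held ||
                   atomic_load(&vcpu->vm_users_count) == 0);
            return vcpu->cached_vmcs12;
    }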
diff --combined arch/x86/kvm/vmx/vmx.c
index 9cfcfebd5f99bd08867d2f2f186899d1b8170dc4,ec1aee1f9057e99f33f980bd902c46fce2669388..c67e448c6ebd709fcd367e387723b7046e36800b
@@@ -525,10 -525,6 +525,6 @@@ static const struct kvm_vmx_segment_fie
        VMX_SEGMENT_FIELD(LDTR),
  };
  
- static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
- {
-       vmx->segment_cache.bitmask = 0;
- }
  
  static unsigned long host_idt_base;
  
@@@ -755,7 -751,7 +751,7 @@@ fault
        return -EIO;
  }
  
 -static void vmx_emergency_disable(void)
 +void vmx_emergency_disable_virtualization_cpu(void)
  {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
@@@ -1998,15 -1994,15 +1994,15 @@@ static inline bool is_vmx_feature_contr
        return !(msr->data & ~valid_bits);
  }
  
 -int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 +int vmx_get_feature_msr(u32 msr, u64 *data)
  {
 -      switch (msr->index) {
 +      switch (msr) {
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                if (!nested)
                        return 1;
 -              return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
 +              return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
        default:
 -              return KVM_MSR_RET_INVALID;
 +              return KVM_MSR_RET_UNSUPPORTED;
        }
  }
  
@@@ -2605,13 -2601,13 +2601,13 @@@ static u64 adjust_vmx_controls64(u64 ct
  static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                             struct vmx_capability *vmx_cap)
  {
 -      u32 vmx_msr_low, vmx_msr_high;
        u32 _pin_based_exec_control = 0;
        u32 _cpu_based_exec_control = 0;
        u32 _cpu_based_2nd_exec_control = 0;
        u64 _cpu_based_3rd_exec_control = 0;
        u32 _vmexit_control = 0;
        u32 _vmentry_control = 0;
 +      u64 basic_msr;
        u64 misc_msr;
        int i;
  
                _vmexit_control &= ~x_ctrl;
        }
  
 -      rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
 +      rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
  
        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
 -      if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
 +      if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
                return -EIO;
  
  #ifdef CONFIG_X86_64
 -      /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
 -      if (vmx_msr_high & (1u<<16))
 +      /*
 +       * KVM expects to be able to shove all legal physical addresses into
 +       * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
 +       * 0 for processors that support Intel 64 architecture".
 +       */
 +      if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
                return -EIO;
  #endif
  
        /* Require Write-Back (WB) memory type for VMCS accesses. */
 -      if (((vmx_msr_high >> 18) & 15) != 6)
 +      if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
                return -EIO;
  
        rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
  
 -      vmcs_conf->size = vmx_msr_high & 0x1fff;
 -      vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
 -
 -      vmcs_conf->revision_id = vmx_msr_low;
 -
 +      vmcs_conf->basic = basic_msr;
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
@@@ -2844,7 -2840,7 +2840,7 @@@ fault
        return -EFAULT;
  }
  
 -int vmx_hardware_enable(void)
 +int vmx_enable_virtualization_cpu(void)
  {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@@ -2881,7 -2877,7 +2877,7 @@@ static void vmclear_local_loaded_vmcss(
                __loaded_vmcs_clear(v);
  }
  
 -void vmx_hardware_disable(void)
 +void vmx_disable_virtualization_cpu(void)
  {
        vmclear_local_loaded_vmcss();
  
@@@ -2903,13 -2899,13 +2899,13 @@@ struct vmcs *alloc_vmcs_cpu(bool shadow
        if (!pages)
                return NULL;
        vmcs = page_address(pages);
 -      memset(vmcs, 0, vmcs_config.size);
 +      memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
  
        /* KVM supports Enlightened VMCS v1 only */
        if (kvm_is_using_evmcs())
                vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
        else
 -              vmcs->hdr.revision_id = vmcs_config.revision_id;
 +              vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
  
        if (shadow)
                vmcs->hdr.shadow_vmcs = 1;
@@@ -3002,7 -2998,7 +2998,7 @@@ static __init int alloc_kvm_area(void
                 * physical CPU.
                 */
                if (kvm_is_using_evmcs())
 -                      vmcs->hdr.revision_id = vmcs_config.revision_id;
 +                      vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
  
                per_cpu(vmxarea, cpu) = vmcs;
        }
@@@ -4219,6 -4215,13 +4215,13 @@@ static int vmx_deliver_nested_posted_in
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
+       /*
+        * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+        * and freed, and must not be accessed outside of vcpu->mutex.  The
+        * vCPU's cached PI NV is valid if and only if posted interrupts
+        * enabled in its vmcs12, i.e. checking the vector also checks that
+        * L1 has enabled posted interrupts for L2.
+        */
        if (is_guest_mode(vcpu) &&
            vector == vmx->nested.posted_intr_nv) {
                /*
@@@ -5804,8 -5807,9 +5807,9 @@@ static int handle_ept_violation(struct 
        error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
                      ? PFERR_PRESENT_MASK : 0;
  
-       error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
-              PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+       if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+               error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+                             PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
  
        /*
         * Check that the GPA doesn't exceed physical memory limits, as that is
@@@ -7265,8 -7269,6 +7269,8 @@@ static fastpath_t vmx_exit_handlers_fas
                return handle_fastpath_set_msr_irqoff(vcpu);
        case EXIT_REASON_PREEMPTION_TIMER:
                return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
 +      case EXIT_REASON_HLT:
 +              return handle_fastpath_hlt(vcpu);
        default:
                return EXIT_FASTPATH_NONE;
        }
@@@ -7969,6 -7971,7 +7973,7 @@@ static __init void vmx_set_cpu_caps(voi
                kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
                kvm_cpu_cap_clear(X86_FEATURE_SGX1);
                kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
        }
  
        if (vmx_umip_emulated())
@@@ -8519,7 -8522,7 +8524,7 @@@ __init int vmx_hardware_setup(void
                u64 use_timer_freq = 5000ULL * 1000 * 1000;
  
                cpu_preemption_timer_multi =
 -                      vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
 +                      vmx_misc_preemption_timer_rate(vmcs_config.misc);
  
                if (tsc_khz)
                        use_timer_freq = (u64)tsc_khz * 1000;
@@@ -8586,6 -8589,8 +8591,6 @@@ static void __vmx_exit(void
  {
        allow_smaller_maxphyaddr = false;
  
 -      cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
 -
        vmx_cleanup_l1d_flush();
  }
  
@@@ -8632,6 -8637,8 +8637,6 @@@ static int __init vmx_init(void
                pi_init_cpu(cpu);
        }
  
 -      cpu_emergency_register_virt_callback(vmx_emergency_disable);
 -
        vmx_check_vmcs12_offsets();
  
        /*
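
The setup_vmcs_config() hunk above replaces open-coded masks with accessors for
MSR_IA32_VMX_BASIC.  A standalone sketch of those decoders and of the three
sanity checks is below; the field positions follow the SDM's layout of the MSR
as I read it, and the helpers are illustrative rather than the kernel's.

    #include <stdint.h>

    static uint32_t basic_vmcs_revision_id(uint64_t basic) { return basic & 0x7fffffffu; }
    static uint32_t basic_vmcs_size(uint64_t basic)        { return (basic >> 32) & 0x1fff; }
    static uint32_t basic_vmcs_mem_type(uint64_t basic)    { return (basic >> 50) & 0xf; }

    #define BASIC_32BIT_PHYS_ADDR_ONLY (1ULL << 48)
    #define MEMTYPE_WB 6u

    /* Reject configurations where the VMCS is larger than a page, physical
     * addresses are limited to 32 bits on a 64-bit kernel, or the required
     * memory type for VMCS accesses is not write-back. */
    static int basic_msr_is_usable(uint64_t basic, int is_64bit_kernel)
    {
            if (basic_vmcs_size(basic) > 4096)
                    return 0;
            if (is_64bit_kernel && (basic & BASIC_32BIT_PHYS_ADDR_ONLY))
                    return 0;
            if (basic_vmcs_mem_type(basic) != MEMTYPE_WB)
                    return 0;
            return 1;
    }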
diff --combined arch/x86/kvm/vmx/vmx.h
index 3839afb921e220a88dc8a8608f06bb46ba58d86a,11b1b70faef29f7081d0e576870d3ce5951e6667..2325f773a20be0ea8068675a17c634c15984f41d
  #include "run_flags.h"
  #include "../mmu.h"
  
 -#define MSR_TYPE_R    1
 -#define MSR_TYPE_W    2
 -#define MSR_TYPE_RW   3
 -
  #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
  
  #ifdef CONFIG_X86_64
@@@ -752,4 -756,9 +752,9 @@@ static inline bool vmx_can_use_ipiv(str
        return  lapic_in_kernel(vcpu) && enable_ipiv;
  }
  
+ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+ {
+       vmx->segment_cache.bitmask = 0;
+ }
  #endif /* __KVM_X86_VMX_H */
diff --combined arch/x86/kvm/x86.c
index 0c1d54d9ef4574d874feb0a0b82380549bf3257a,34b52b49f5e689a9ac0c3d62656cfd16aa670c8a..83fe0a78146fc198115aba0e76ba57ecfb1dd8d9
@@@ -305,237 -305,24 +305,237 @@@ const struct kvm_stats_header kvm_vcpu_
  static struct kmem_cache *x86_emulator_cache;
  
  /*
 - * When called, it means the previous get/set msr reached an invalid msr.
 - * Return true if we want to ignore/silent this failed msr access.
 + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
 + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
 + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.  msrs_to_save holds MSRs that
 + * require host support, i.e. should be probed via RDMSR.  emulated_msrs holds
 + * MSRs that KVM emulates without strictly requiring host support.
 + * msr_based_features holds MSRs that enumerate features, i.e. are effectively
 + * CPUID leafs.  Note, msr_based_features isn't mutually exclusive with
 + * msrs_to_save and emulated_msrs.
   */
 -static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
 +
 +static const u32 msrs_to_save_base[] = {
 +      MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 +      MSR_STAR,
 +#ifdef CONFIG_X86_64
 +      MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 +#endif
 +      MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 +      MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
 +      MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
 +      MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
 +      MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
 +      MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
 +      MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
 +      MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 +      MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
 +      MSR_IA32_UMWAIT_CONTROL,
 +
 +      MSR_IA32_XFD, MSR_IA32_XFD_ERR,
 +};
 +
 +static const u32 msrs_to_save_pmu[] = {
 +      MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
 +      MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
 +      MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
 +      MSR_CORE_PERF_GLOBAL_CTRL,
 +      MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
 +
 +      /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
 +      MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
 +      MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
 +      MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
 +      MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
 +      MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
 +      MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
 +      MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
 +      MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
 +
 +      MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
 +      MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
 +
 +      /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
 +      MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
 +      MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
 +      MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
 +      MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
 +
 +      MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
 +      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
 +      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
 +};
 +
 +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
 +                      ARRAY_SIZE(msrs_to_save_pmu)];
 +static unsigned num_msrs_to_save;
 +
 +static const u32 emulated_msrs_all[] = {
 +      MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 +      MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 +
 +#ifdef CONFIG_KVM_HYPERV
 +      HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 +      HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 +      HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
 +      HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 +      HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
 +      HV_X64_MSR_RESET,
 +      HV_X64_MSR_VP_INDEX,
 +      HV_X64_MSR_VP_RUNTIME,
 +      HV_X64_MSR_SCONTROL,
 +      HV_X64_MSR_STIMER0_CONFIG,
 +      HV_X64_MSR_VP_ASSIST_PAGE,
 +      HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
 +      HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
 +      HV_X64_MSR_SYNDBG_OPTIONS,
 +      HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
 +      HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
 +      HV_X64_MSR_SYNDBG_PENDING_BUFFER,
 +#endif
 +
 +      MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 +      MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
 +
 +      MSR_IA32_TSC_ADJUST,
 +      MSR_IA32_TSC_DEADLINE,
 +      MSR_IA32_ARCH_CAPABILITIES,
 +      MSR_IA32_PERF_CAPABILITIES,
 +      MSR_IA32_MISC_ENABLE,
 +      MSR_IA32_MCG_STATUS,
 +      MSR_IA32_MCG_CTL,
 +      MSR_IA32_MCG_EXT_CTL,
 +      MSR_IA32_SMBASE,
 +      MSR_SMI_COUNT,
 +      MSR_PLATFORM_INFO,
 +      MSR_MISC_FEATURES_ENABLES,
 +      MSR_AMD64_VIRT_SPEC_CTRL,
 +      MSR_AMD64_TSC_RATIO,
 +      MSR_IA32_POWER_CTL,
 +      MSR_IA32_UCODE_REV,
 +
 +      /*
 +       * KVM always supports the "true" VMX control MSRs, even if the host
 +       * does not.  The VMX MSRs as a whole are considered "emulated" as KVM
 +       * doesn't strictly require them to exist in the host (ignoring that
 +       * KVM would refuse to load in the first place if the core set of MSRs
 +       * aren't supported).
 +       */
 +      MSR_IA32_VMX_BASIC,
 +      MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 +      MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
 +      MSR_IA32_VMX_TRUE_EXIT_CTLS,
 +      MSR_IA32_VMX_TRUE_ENTRY_CTLS,
 +      MSR_IA32_VMX_MISC,
 +      MSR_IA32_VMX_CR0_FIXED0,
 +      MSR_IA32_VMX_CR4_FIXED0,
 +      MSR_IA32_VMX_VMCS_ENUM,
 +      MSR_IA32_VMX_PROCBASED_CTLS2,
 +      MSR_IA32_VMX_EPT_VPID_CAP,
 +      MSR_IA32_VMX_VMFUNC,
 +
 +      MSR_K7_HWCR,
 +      MSR_KVM_POLL_CONTROL,
 +};
 +
 +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
 +static unsigned num_emulated_msrs;
 +
 +/*
 + * List of MSRs that control the existence of MSR-based features, i.e. MSRs
 + * that are effectively CPUID leafs.  VMX MSRs are also included in the set of
 + * feature MSRs, but are handled separately to allow expedited lookups.
 + */
 +static const u32 msr_based_features_all_except_vmx[] = {
 +      MSR_AMD64_DE_CFG,
 +      MSR_IA32_UCODE_REV,
 +      MSR_IA32_ARCH_CAPABILITIES,
 +      MSR_IA32_PERF_CAPABILITIES,
 +};
 +
 +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
 +                            (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
 +static unsigned int num_msr_based_features;
 +
 +/*
 + * All feature MSRs except uCode revID, which tracks the currently loaded uCode
 + * patch, are immutable once the vCPU model is defined.
 + */
 +static bool kvm_is_immutable_feature_msr(u32 msr)
  {
 -      const char *op = write ? "wrmsr" : "rdmsr";
 +      int i;
  
 -      if (ignore_msrs) {
 -              if (report_ignored_msrs)
 -                      kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
 -                                    op, msr, data);
 -              /* Mask the error */
 +      if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
                return true;
 -      } else {
 +
 +      for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
 +              if (msr == msr_based_features_all_except_vmx[i])
 +                      return msr != MSR_IA32_UCODE_REV;
 +      }
 +
 +      return false;
 +}
 +
 +static bool kvm_is_advertised_msr(u32 msr_index)
 +{
 +      unsigned int i;
 +
 +      for (i = 0; i < num_msrs_to_save; i++) {
 +              if (msrs_to_save[i] == msr_index)
 +                      return true;
 +      }
 +
 +      for (i = 0; i < num_emulated_msrs; i++) {
 +              if (emulated_msrs[i] == msr_index)
 +                      return true;
 +      }
 +
 +      return false;
 +}
 +
 +typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
 +                          bool host_initiated);
 +
 +static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr,
 +                                           u64 *data, bool host_initiated,
 +                                           enum kvm_msr_access rw,
 +                                           msr_access_t msr_access_fn)
 +{
 +      const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr";
 +      int ret;
 +
 +      BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W);
 +
 +      /*
 +       * Zero the data on read failures to avoid leaking stack data to the
 +       * guest and/or userspace, e.g. if the failure is ignored below.
 +       */
 +      ret = msr_access_fn(vcpu, msr, data, host_initiated);
 +      if (ret && rw == MSR_TYPE_R)
 +              *data = 0;
 +
 +      if (ret != KVM_MSR_RET_UNSUPPORTED)
 +              return ret;
 +
 +      /*
 +       * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM
 +       * advertises to userspace, even if an MSR isn't fully supported.
 +       * Simply check that @data is '0', which covers both the write '0' case
 +       * and all reads (in which case @data is zeroed on failure; see above).
 +       */
 +      if (host_initiated && !*data && kvm_is_advertised_msr(msr))
 +              return 0;
 +
 +      if (!ignore_msrs) {
                kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
 -                                    op, msr, data);
 -              return false;
 +                                    op, msr, *data);
 +              return ret;
        }
 +
 +      if (report_ignored_msrs)
 +              kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data);
 +
 +      return 0;
  }
  
  static struct kmem_cache *kvm_alloc_emulator_cache(void)
@@@ -568,7 -355,7 +568,7 @@@ static void kvm_on_user_return(struct u
  
        /*
         * Disabling irqs at this point since the following code could be
 -       * interrupted and executed through kvm_arch_hardware_disable()
 +       * interrupted and executed through kvm_arch_disable_virtualization_cpu()
         */
        local_irq_save(flags);
        if (msrs->registered) {
@@@ -626,7 -413,8 +626,7 @@@ EXPORT_SYMBOL_GPL(kvm_find_user_return_
  
  static void kvm_user_return_msr_cpu_online(void)
  {
 -      unsigned int cpu = smp_processor_id();
 -      struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 +      struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
        u64 value;
        int i;
  
@@@ -833,6 -621,12 +833,6 @@@ static void kvm_queue_exception_vmexit(
        ex->payload = payload;
  }
  
 -/* Forcibly leave the nested mode in cases like a vCPU reset */
 -static void kvm_leave_nested(struct kvm_vcpu *vcpu)
 -{
 -      kvm_x86_ops.nested_ops->leave_nested(vcpu);
 -}
 -
  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                unsigned nr, bool has_error, u32 error_code,
                bool has_payload, unsigned long payload, bool reinject)
@@@ -1550,72 -1344,244 +1550,72 @@@ static u64 kvm_dr6_fixed(struct kvm_vcp
        if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
                fixed |= DR6_RTM;
  
 -      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
 -              fixed |= DR6_BUS_LOCK;
 -      return fixed;
 -}
 -
 -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 -{
 -      size_t size = ARRAY_SIZE(vcpu->arch.db);
 -
 -      switch (dr) {
 -      case 0 ... 3:
 -              vcpu->arch.db[array_index_nospec(dr, size)] = val;
 -              if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 -                      vcpu->arch.eff_db[dr] = val;
 -              break;
 -      case 4:
 -      case 6:
 -              if (!kvm_dr6_valid(val))
 -                      return 1; /* #GP */
 -              vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
 -              break;
 -      case 5:
 -      default: /* 7 */
 -              if (!kvm_dr7_valid(val))
 -                      return 1; /* #GP */
 -              vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 -              kvm_update_dr7(vcpu);
 -              break;
 -      }
 -
 -      return 0;
 -}
 -EXPORT_SYMBOL_GPL(kvm_set_dr);
 -
 -unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
 -{
 -      size_t size = ARRAY_SIZE(vcpu->arch.db);
 -
 -      switch (dr) {
 -      case 0 ... 3:
 -              return vcpu->arch.db[array_index_nospec(dr, size)];
 -      case 4:
 -      case 6:
 -              return vcpu->arch.dr6;
 -      case 5:
 -      default: /* 7 */
 -              return vcpu->arch.dr7;
 -      }
 -}
 -EXPORT_SYMBOL_GPL(kvm_get_dr);
 -
 -int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 -{
 -      u32 ecx = kvm_rcx_read(vcpu);
 -      u64 data;
 -
 -      if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
 -              kvm_inject_gp(vcpu, 0);
 -              return 1;
 -      }
 -
 -      kvm_rax_write(vcpu, (u32)data);
 -      kvm_rdx_write(vcpu, data >> 32);
 -      return kvm_skip_emulated_instruction(vcpu);
 -}
 -EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
 -
 -/*
 - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
 - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
 - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.  msrs_to_save holds MSRs that
 - * require host support, i.e. should be probed via RDMSR.  emulated_msrs holds
 - * MSRs that KVM emulates without strictly requiring host support.
 - * msr_based_features holds MSRs that enumerate features, i.e. are effectively
 - * CPUID leafs.  Note, msr_based_features isn't mutually exclusive with
 - * msrs_to_save and emulated_msrs.
 - */
 -
 -static const u32 msrs_to_save_base[] = {
 -      MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 -      MSR_STAR,
 -#ifdef CONFIG_X86_64
 -      MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 -#endif
 -      MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 -      MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
 -      MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
 -      MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
 -      MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
 -      MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
 -      MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
 -      MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 -      MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
 -      MSR_IA32_UMWAIT_CONTROL,
 -
 -      MSR_IA32_XFD, MSR_IA32_XFD_ERR,
 -};
 -
 -static const u32 msrs_to_save_pmu[] = {
 -      MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
 -      MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
 -      MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
 -      MSR_CORE_PERF_GLOBAL_CTRL,
 -      MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
 -
 -      /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
 -      MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
 -      MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
 -      MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
 -      MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
 -      MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
 -      MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
 -      MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
 -      MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
 -
 -      MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
 -      MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
 -
 -      /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
 -      MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
 -      MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
 -      MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
 -      MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
 -
 -      MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
 -      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
 -      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
 -};
 -
 -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
 -                      ARRAY_SIZE(msrs_to_save_pmu)];
 -static unsigned num_msrs_to_save;
 -
 -static const u32 emulated_msrs_all[] = {
 -      MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 -      MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 -
 -#ifdef CONFIG_KVM_HYPERV
 -      HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 -      HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 -      HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
 -      HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 -      HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
 -      HV_X64_MSR_RESET,
 -      HV_X64_MSR_VP_INDEX,
 -      HV_X64_MSR_VP_RUNTIME,
 -      HV_X64_MSR_SCONTROL,
 -      HV_X64_MSR_STIMER0_CONFIG,
 -      HV_X64_MSR_VP_ASSIST_PAGE,
 -      HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
 -      HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
 -      HV_X64_MSR_SYNDBG_OPTIONS,
 -      HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
 -      HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
 -      HV_X64_MSR_SYNDBG_PENDING_BUFFER,
 -#endif
 -
 -      MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 -      MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
 -
 -      MSR_IA32_TSC_ADJUST,
 -      MSR_IA32_TSC_DEADLINE,
 -      MSR_IA32_ARCH_CAPABILITIES,
 -      MSR_IA32_PERF_CAPABILITIES,
 -      MSR_IA32_MISC_ENABLE,
 -      MSR_IA32_MCG_STATUS,
 -      MSR_IA32_MCG_CTL,
 -      MSR_IA32_MCG_EXT_CTL,
 -      MSR_IA32_SMBASE,
 -      MSR_SMI_COUNT,
 -      MSR_PLATFORM_INFO,
 -      MSR_MISC_FEATURES_ENABLES,
 -      MSR_AMD64_VIRT_SPEC_CTRL,
 -      MSR_AMD64_TSC_RATIO,
 -      MSR_IA32_POWER_CTL,
 -      MSR_IA32_UCODE_REV,
 -
 -      /*
 -       * KVM always supports the "true" VMX control MSRs, even if the host
 -       * does not.  The VMX MSRs as a whole are considered "emulated" as KVM
 -       * doesn't strictly require them to exist in the host (ignoring that
 -       * KVM would refuse to load in the first place if the core set of MSRs
 -       * aren't supported).
 -       */
 -      MSR_IA32_VMX_BASIC,
 -      MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 -      MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
 -      MSR_IA32_VMX_TRUE_EXIT_CTLS,
 -      MSR_IA32_VMX_TRUE_ENTRY_CTLS,
 -      MSR_IA32_VMX_MISC,
 -      MSR_IA32_VMX_CR0_FIXED0,
 -      MSR_IA32_VMX_CR4_FIXED0,
 -      MSR_IA32_VMX_VMCS_ENUM,
 -      MSR_IA32_VMX_PROCBASED_CTLS2,
 -      MSR_IA32_VMX_EPT_VPID_CAP,
 -      MSR_IA32_VMX_VMFUNC,
 -
 -      MSR_K7_HWCR,
 -      MSR_KVM_POLL_CONTROL,
 -};
 +      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
 +              fixed |= DR6_BUS_LOCK;
 +      return fixed;
 +}
  
 -static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
 -static unsigned num_emulated_msrs;
 +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 +{
 +      size_t size = ARRAY_SIZE(vcpu->arch.db);
  
 -/*
 - * List of MSRs that control the existence of MSR-based features, i.e. MSRs
 - * that are effectively CPUID leafs.  VMX MSRs are also included in the set of
 - * feature MSRs, but are handled separately to allow expedited lookups.
 - */
 -static const u32 msr_based_features_all_except_vmx[] = {
 -      MSR_AMD64_DE_CFG,
 -      MSR_IA32_UCODE_REV,
 -      MSR_IA32_ARCH_CAPABILITIES,
 -      MSR_IA32_PERF_CAPABILITIES,
 -};
 +      switch (dr) {
 +      case 0 ... 3:
 +              vcpu->arch.db[array_index_nospec(dr, size)] = val;
 +              if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 +                      vcpu->arch.eff_db[dr] = val;
 +              break;
 +      case 4:
 +      case 6:
 +              if (!kvm_dr6_valid(val))
 +                      return 1; /* #GP */
 +              vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
 +              break;
 +      case 5:
 +      default: /* 7 */
 +              if (!kvm_dr7_valid(val))
 +                      return 1; /* #GP */
 +              vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 +              kvm_update_dr7(vcpu);
 +              break;
 +      }
  
 -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
 -                            (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
 -static unsigned int num_msr_based_features;
 +      return 0;
 +}
 +EXPORT_SYMBOL_GPL(kvm_set_dr);
  
 -/*
 - * All feature MSRs except uCode revID, which tracks the currently loaded uCode
 - * patch, are immutable once the vCPU model is defined.
 - */
 -static bool kvm_is_immutable_feature_msr(u32 msr)
 +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
  {
 -      int i;
 +      size_t size = ARRAY_SIZE(vcpu->arch.db);
  
 -      if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
 -              return true;
 +      switch (dr) {
 +      case 0 ... 3:
 +              return vcpu->arch.db[array_index_nospec(dr, size)];
 +      case 4:
 +      case 6:
 +              return vcpu->arch.dr6;
 +      case 5:
 +      default: /* 7 */
 +              return vcpu->arch.dr7;
 +      }
 +}
 +EXPORT_SYMBOL_GPL(kvm_get_dr);
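Both DR accessors above fold the legacy debug-register aliases onto their real targets: index 4 is handled as DR6 and index 5 as DR7, with the CR4.DE/#UD side of that aliasing assumed to have been dealt with before these helpers run. A stand-alone sketch of just that index folding (not KVM code; the helper name dr_effective_index() is invented for illustration):

    /*
     * Illustration only: mirrors the DR4->DR6 / DR5->DR7 folding encoded by
     * the switch statements in kvm_set_dr() and kvm_get_dr() above.  The
     * CR4.DE check that turns DR4/DR5 accesses into #UD is assumed to have
     * happened earlier in the exit path.
     */
    #include <assert.h>

    static int dr_effective_index(int dr)
    {
    	switch (dr) {
    	case 0 ... 3:		/* DR0-DR3 map to themselves */
    		return dr;
    	case 4:
    	case 6:			/* DR4 aliases DR6 */
    		return 6;
    	case 5:
    	default:		/* DR5 aliases DR7; 7 is the only other index */
    		return 7;
    	}
    }

    int main(void)
    {
    	assert(dr_effective_index(4) == 6);
    	assert(dr_effective_index(5) == 7);
    	assert(dr_effective_index(2) == 2);
    	return 0;
    }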
  
 -      for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
 -              if (msr == msr_based_features_all_except_vmx[i])
 -                      return msr != MSR_IA32_UCODE_REV;
 +int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 +{
 +      u32 ecx = kvm_rcx_read(vcpu);
 +      u64 data;
 +
 +      if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
 +              kvm_inject_gp(vcpu, 0);
 +              return 1;
        }
  
 -      return false;
 +      kvm_rax_write(vcpu, (u32)data);
 +      kvm_rdx_write(vcpu, data >> 32);
 +      return kvm_skip_emulated_instruction(vcpu);
  }
 +EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
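For context on the register writes above: RDPMC returns a 64-bit counter split across EDX:EAX, which is why the emulation stores the truncated low half via kvm_rax_write() and the shifted high half via kvm_rdx_write(). A minimal user-space sketch of that split (illustrative only, not KVM code):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t data = 0x1122334455667788ULL;	/* pretend counter value */
    	uint32_t eax = (uint32_t)data;		/* low half, as in kvm_rax_write() */
    	uint32_t edx = (uint32_t)(data >> 32);	/* high half, as in kvm_rdx_write() */

    	printf("eax=0x%08" PRIx32 " edx=0x%08" PRIx32 "\n", eax, edx);
    	return 0;
    }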
  
  /*
   * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
@@@ -1694,31 -1660,40 +1694,31 @@@ static u64 kvm_get_arch_capabilities(vo
        return data;
  }
  
 -static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 +static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
 +                             bool host_initiated)
  {
 -      switch (msr->index) {
 +      WARN_ON_ONCE(!host_initiated);
 +
 +      switch (index) {
        case MSR_IA32_ARCH_CAPABILITIES:
 -              msr->data = kvm_get_arch_capabilities();
 +              *data = kvm_get_arch_capabilities();
                break;
        case MSR_IA32_PERF_CAPABILITIES:
 -              msr->data = kvm_caps.supported_perf_cap;
 +              *data = kvm_caps.supported_perf_cap;
                break;
        case MSR_IA32_UCODE_REV:
 -              rdmsrl_safe(msr->index, &msr->data);
 +              rdmsrl_safe(index, data);
                break;
        default:
 -              return kvm_x86_call(get_msr_feature)(msr);
 +              return kvm_x86_call(get_feature_msr)(index, data);
        }
        return 0;
  }
  
 -static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 +static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
  {
 -      struct kvm_msr_entry msr;
 -      int r;
 -
 -      /* Unconditionally clear the output for simplicity */
 -      msr.data = 0;
 -      msr.index = index;
 -      r = kvm_get_msr_feature(&msr);
 -
 -      if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
 -              r = 0;
 -
 -      *data = msr.data;
 -
 -      return r;
 +      return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R,
 +                               kvm_get_feature_msr);
  }
  
  static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
@@@ -1905,17 -1880,16 +1905,17 @@@ static int __kvm_set_msr(struct kvm_vcp
        return kvm_x86_call(set_msr)(vcpu, &msr);
  }
  
 +static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
 +                      bool host_initiated)
 +{
 +      return __kvm_set_msr(vcpu, index, *data, host_initiated);
 +}
 +
  static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
                                     u32 index, u64 data, bool host_initiated)
  {
 -      int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
 -
 -      if (ret == KVM_MSR_RET_INVALID)
 -              if (kvm_msr_ignored_check(index, data, true))
 -                      ret = 0;
 -
 -      return ret;
 +      return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W,
 +                               _kvm_set_msr);
  }
  
  /*
@@@ -1954,23 -1928,33 +1954,25 @@@ int __kvm_get_msr(struct kvm_vcpu *vcpu
  static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
                                     u32 index, u64 *data, bool host_initiated)
  {
 -      int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
 -
 -      if (ret == KVM_MSR_RET_INVALID) {
 -              /* Unconditionally clear *data for simplicity */
 -              *data = 0;
 -              if (kvm_msr_ignored_check(index, 0, false))
 -                      ret = 0;
 -      }
 -
 -      return ret;
 +      return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R,
 +                               __kvm_get_msr);
  }
  
- static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+ int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
  {
        if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
                return KVM_MSR_RET_FILTERED;
        return kvm_get_msr_ignored_check(vcpu, index, data, false);
  }
+ EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter);
  
- static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+ int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
  {
        if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
                return KVM_MSR_RET_FILTERED;
        return kvm_set_msr_ignored_check(vcpu, index, data, false);
  }
+ EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter);
  
  int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
  {
@@@ -2017,7 -2001,7 +2019,7 @@@ static int complete_fast_rdmsr(struct k
  static u64 kvm_msr_reason(int r)
  {
        switch (r) {
 -      case KVM_MSR_RET_INVALID:
 +      case KVM_MSR_RET_UNSUPPORTED:
                return KVM_MSR_EXIT_REASON_UNKNOWN;
        case KVM_MSR_RET_FILTERED:
                return KVM_MSR_EXIT_REASON_FILTER;
@@@ -2180,34 -2164,31 +2182,34 @@@ fastpath_t handle_fastpath_set_msr_irqo
  {
        u32 msr = kvm_rcx_read(vcpu);
        u64 data;
 -      fastpath_t ret = EXIT_FASTPATH_NONE;
 +      fastpath_t ret;
 +      bool handled;
  
        kvm_vcpu_srcu_read_lock(vcpu);
  
        switch (msr) {
        case APIC_BASE_MSR + (APIC_ICR >> 4):
                data = kvm_read_edx_eax(vcpu);
 -              if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
 -                      kvm_skip_emulated_instruction(vcpu);
 -                      ret = EXIT_FASTPATH_EXIT_HANDLED;
 -              }
 +              handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
                break;
        case MSR_IA32_TSC_DEADLINE:
                data = kvm_read_edx_eax(vcpu);
 -              if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
 -                      kvm_skip_emulated_instruction(vcpu);
 -                      ret = EXIT_FASTPATH_REENTER_GUEST;
 -              }
 +              handled = !handle_fastpath_set_tscdeadline(vcpu, data);
                break;
        default:
 +              handled = false;
                break;
        }
  
 -      if (ret != EXIT_FASTPATH_NONE)
 +      if (handled) {
 +              if (!kvm_skip_emulated_instruction(vcpu))
 +                      ret = EXIT_FASTPATH_EXIT_USERSPACE;
 +              else
 +                      ret = EXIT_FASTPATH_REENTER_GUEST;
                trace_kvm_msr_write(msr, data);
 +      } else {
 +              ret = EXIT_FASTPATH_NONE;
 +      }
  
        kvm_vcpu_srcu_read_unlock(vcpu);
  
@@@ -3767,6 -3748,18 +3769,6 @@@ static void record_steal_time(struct kv
        mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
  }
  
 -static bool kvm_is_msr_to_save(u32 msr_index)
 -{
 -      unsigned int i;
 -
 -      for (i = 0; i < num_msrs_to_save; i++) {
 -              if (msrs_to_save[i] == msr_index)
 -                      return true;
 -      }
 -
 -      return false;
 -}
 -
  int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  {
        u32 msr = msr_info->index;
                if (kvm_pmu_is_valid_msr(vcpu, msr))
                        return kvm_pmu_set_msr(vcpu, msr_info);
  
 -              /*
 -               * Userspace is allowed to write '0' to MSRs that KVM reports
 -               * as to-be-saved, even if an MSRs isn't fully supported.
 -               */
 -              if (msr_info->host_initiated && !data &&
 -                  kvm_is_msr_to_save(msr))
 -                      break;
 -
 -              return KVM_MSR_RET_INVALID;
 +              return KVM_MSR_RET_UNSUPPORTED;
        }
        return 0;
  }
@@@ -4499,7 -4500,17 +4501,7 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
                        return kvm_pmu_get_msr(vcpu, msr_info);
  
 -              /*
 -               * Userspace is allowed to read MSRs that KVM reports as
 -               * to-be-saved, even if an MSR isn't fully supported.
 -               */
 -              if (msr_info->host_initiated &&
 -                  kvm_is_msr_to_save(msr_info->index)) {
 -                      msr_info->data = 0;
 -                      break;
 -              }
 -
 -              return KVM_MSR_RET_INVALID;
 +              return KVM_MSR_RET_UNSUPPORTED;
        }
        return 0;
  }
@@@ -4647,6 -4658,7 +4649,6 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_ASYNC_PF_INT:
        case KVM_CAP_GET_TSC_KHZ:
        case KVM_CAP_KVMCLOCK_CTRL:
 -      case KVM_CAP_READONLY_MEM:
        case KVM_CAP_IOAPIC_POLARITY_IGNORED:
        case KVM_CAP_TSC_DEADLINE_TIMER:
        case KVM_CAP_DISABLE_QUIRKS:
        case KVM_CAP_VM_TYPES:
                r = kvm_caps.supported_vm_types;
                break;
 +      case KVM_CAP_READONLY_MEM:
 +              r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1;
 +              break;
        default:
                break;
        }
@@@ -4937,7 -4946,7 +4939,7 @@@ long kvm_arch_dev_ioctl(struct file *fi
                break;
        }
        case KVM_GET_MSRS:
 -              r = msr_io(NULL, argp, do_get_msr_feature, 1);
 +              r = msr_io(NULL, argp, do_get_feature_msr, 1);
                break;
  #ifdef CONFIG_KVM_HYPERV
        case KVM_GET_SUPPORTED_HV_CPUID:
@@@ -6033,9 -6042,7 +6035,9 @@@ long kvm_arch_vcpu_ioctl(struct file *f
                if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
                        break;
  
 +              kvm_vcpu_srcu_read_lock(vcpu);
                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
 +              kvm_vcpu_srcu_read_unlock(vcpu);
                break;
        }
        case KVM_GET_DEBUGREGS: {
@@@ -7374,9 -7381,11 +7376,9 @@@ out
  
  static void kvm_probe_feature_msr(u32 msr_index)
  {
 -      struct kvm_msr_entry msr = {
 -              .index = msr_index,
 -      };
 +      u64 data;
  
 -      if (kvm_get_msr_feature(&msr))
 +      if (kvm_get_feature_msr(NULL, msr_index, &data, true))
                return;
  
        msr_based_features[num_msr_based_features++] = msr_index;
@@@ -8854,13 -8863,60 +8856,13 @@@ static int handle_emulation_failure(str
        return 1;
  }
  
 -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 -                                int emulation_type)
 +static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu,
 +                                             gpa_t cr2_or_gpa,
 +                                             int emulation_type)
  {
 -      gpa_t gpa = cr2_or_gpa;
 -      kvm_pfn_t pfn;
 -
        if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
                return false;
  
 -      if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 -          WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
 -              return false;
 -
 -      if (!vcpu->arch.mmu->root_role.direct) {
 -              /*
 -               * Write permission should be allowed since only
 -               * write access need to be emulated.
 -               */
 -              gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
 -
 -              /*
 -               * If the mapping is invalid in guest, let cpu retry
 -               * it to generate fault.
 -               */
 -              if (gpa == INVALID_GPA)
 -                      return true;
 -      }
 -
 -      /*
 -       * Do not retry the unhandleable instruction if it faults on the
 -       * readonly host memory, otherwise it will goto a infinite loop:
 -       * retry instruction -> write #PF -> emulation fail -> retry
 -       * instruction -> ...
 -       */
 -      pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
 -
 -      /*
 -       * If the instruction failed on the error pfn, it can not be fixed,
 -       * report the error to userspace.
 -       */
 -      if (is_error_noslot_pfn(pfn))
 -              return false;
 -
 -      kvm_release_pfn_clean(pfn);
 -
 -      /*
 -       * If emulation may have been triggered by a write to a shadowed page
 -       * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
 -       * guest to let the CPU re-execute the instruction in the hope that the
 -       * CPU can cleanly execute the instruction that KVM failed to emulate.
 -       */
 -      if (vcpu->kvm->arch.indirect_shadow_pages)
 -              kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 -
        /*
         * If the failed instruction faulted on an access to page tables that
         * are used to translate any part of the instruction, KVM can't resolve
         * then zap the SPTE to unprotect the gfn, and then do it all over
         * again.  Report the error to userspace.
         */
 -      return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 -}
 -
 -static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 -                            gpa_t cr2_or_gpa,  int emulation_type)
 -{
 -      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 -      unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
 -
 -      last_retry_eip = vcpu->arch.last_retry_eip;
 -      last_retry_addr = vcpu->arch.last_retry_addr;
 +      if (emulation_type & EMULTYPE_WRITE_PF_TO_SP)
 +              return false;
  
        /*
 -       * If the emulation is caused by #PF and it is non-page_table
 -       * writing instruction, it means the VM-EXIT is caused by shadow
 -       * page protected, we can zap the shadow page and retry this
 -       * instruction directly.
 -       *
 -       * Note: if the guest uses a non-page-table modifying instruction
 -       * on the PDE that points to the instruction, then we will unmap
 -       * the instruction and go to an infinite loop. So, we cache the
 -       * last retried eip and the last fault address, if we meet the eip
 -       * and the address again, we can break out of the potential infinite
 -       * loop.
 +       * If emulation may have been triggered by a write to a shadowed page
 +       * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
 +       * guest to let the CPU re-execute the instruction in the hope that the
 +       * CPU can cleanly execute the instruction that KVM failed to emulate.
         */
 -      vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
 -
 -      if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
 -              return false;
 -
 -      if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 -          WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
 -              return false;
 -
 -      if (x86_page_table_writing_insn(ctxt))
 -              return false;
 -
 -      if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
 -              return false;
 -
 -      vcpu->arch.last_retry_eip = ctxt->eip;
 -      vcpu->arch.last_retry_addr = cr2_or_gpa;
 -
 -      if (!vcpu->arch.mmu->root_role.direct)
 -              gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
 -
 -      kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 +      __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true);
  
 +      /*
 +       * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible
 +       * all SPTEs were already zapped by a different task.  The alternative
 +       * is to report the error to userspace and likely terminate the guest,
 +       * and the last_retry_{eip,addr} checks will prevent retrying the page
 +       * fault indefinitely, i.e. there's nothing to lose by retrying.
 +       */
        return true;
  }
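The comment above leans on the last_retry_{eip,addr} bookkeeping that used to live in retry_instruction() (removed further down in this hunk) and now sits behind the MMU retry helpers, whose bodies are not part of this diff. As a reminder of what that guard does, here is a stand-alone sketch with invented names (not KVM code): allow at most one retry per (RIP, fault address) pair so an unfixable fault cannot loop forever.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct retry_state {
    	uint64_t last_eip;	/* RIP of the last retried instruction */
    	uint64_t last_addr;	/* faulting address of the last retry */
    };

    static bool should_retry(struct retry_state *s, uint64_t eip, uint64_t addr)
    {
    	if (s->last_eip == eip && s->last_addr == addr)
    		return false;	/* same fault again: give up instead of looping */

    	s->last_eip = eip;	/* remember it and allow exactly one retry */
    	s->last_addr = addr;
    	return true;
    }

    int main(void)
    {
    	struct retry_state s = { 0, 0 };

    	assert(should_retry(&s, 0x1000, 0xdead000));	/* first hit: retry */
    	assert(!should_retry(&s, 0x1000, 0xdead000));	/* repeat: bail out */
    	return 0;
    }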
  
@@@ -9088,11 -9174,6 +9090,11 @@@ int x86_emulate_instruction(struct kvm_
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
  
 +      if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
 +          (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
 +           WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))))
 +              emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF;
 +
        r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
        if (r != X86EMUL_CONTINUE) {
                if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
                                kvm_queue_exception(vcpu, UD_VECTOR);
                                return 1;
                        }
 -                      if (reexecute_instruction(vcpu, cr2_or_gpa,
 -                                                emulation_type))
 +                      if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
 +                                                             emulation_type))
                                return 1;
  
                        if (ctxt->have_exception &&
                return 1;
        }
  
 -      if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
 +      /*
 +       * If emulation was caused by a write-protection #PF on a non-page_table
 +       * writing instruction, try to unprotect the gfn, i.e. zap shadow pages,
 +       * and retry the instruction, as the vCPU is likely no longer using the
 +       * gfn as a page table.
 +       */
 +      if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
 +          !x86_page_table_writing_insn(ctxt) &&
 +          kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
                return 1;
  
        /* this is needed for vmware backdoor interface to work since it
@@@ -9210,8 -9283,7 +9212,8 @@@ restart
                return 1;
  
        if (r == EMULATION_FAILED) {
 -              if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
 +              if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
 +                                                     emulation_type))
                        return 1;
  
                return handle_emulation_failure(vcpu, emulation_type);
@@@ -9679,7 -9751,7 +9681,7 @@@ int kvm_x86_vendor_init(struct kvm_x86_
  
        guard(mutex)(&vendor_module_lock);
  
 -      if (kvm_x86_ops.hardware_enable) {
 +      if (kvm_x86_ops.enable_virtualization_cpu) {
                pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
                return -EEXIST;
        }
        return 0;
  
  out_unwind_ops:
 -      kvm_x86_ops.hardware_enable = NULL;
 +      kvm_x86_ops.enable_virtualization_cpu = NULL;
        kvm_x86_call(hardware_unsetup)();
  out_mmu_exit:
        kvm_mmu_vendor_module_exit();
@@@ -9830,27 -9902,72 +9832,27 @@@ void kvm_x86_vendor_exit(void
  
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 -                                          CPUFREQ_TRANSITION_NOTIFIER);
 -              cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
 -      }
 -#ifdef CONFIG_X86_64
 -      pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
 -      irq_work_sync(&pvclock_irq_work);
 -      cancel_work_sync(&pvclock_gtod_work);
 -#endif
 -      kvm_x86_call(hardware_unsetup)();
 -      kvm_mmu_vendor_module_exit();
 -      free_percpu(user_return_msrs);
 -      kmem_cache_destroy(x86_emulator_cache);
 -#ifdef CONFIG_KVM_XEN
 -      static_key_deferred_flush(&kvm_xen_enabled);
 -      WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 -#endif
 -      mutex_lock(&vendor_module_lock);
 -      kvm_x86_ops.hardware_enable = NULL;
 -      mutex_unlock(&vendor_module_lock);
 -}
 -EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
 -
 -static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
 -{
 -      /*
 -       * The vCPU has halted, e.g. executed HLT.  Update the run state if the
 -       * local APIC is in-kernel, the run loop will detect the non-runnable
 -       * state and halt the vCPU.  Exit to userspace if the local APIC is
 -       * managed by userspace, in which case userspace is responsible for
 -       * handling wake events.
 -       */
 -      ++vcpu->stat.halt_exits;
 -      if (lapic_in_kernel(vcpu)) {
 -              vcpu->arch.mp_state = state;
 -              return 1;
 -      } else {
 -              vcpu->run->exit_reason = reason;
 -              return 0;
 -      }
 -}
 -
 -int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
 -{
 -      return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
 -}
 -EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
 -
 -int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 -{
 -      int ret = kvm_skip_emulated_instruction(vcpu);
 -      /*
 -       * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
 -       * KVM_EXIT_DEBUG here.
 -       */
 -      return kvm_emulate_halt_noskip(vcpu) && ret;
 -}
 -EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 -
 -int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
 -{
 -      int ret = kvm_skip_emulated_instruction(vcpu);
 -
 -      return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
 -                                      KVM_EXIT_AP_RESET_HOLD) && ret;
 +                                          CPUFREQ_TRANSITION_NOTIFIER);
 +              cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
 +      }
 +#ifdef CONFIG_X86_64
 +      pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
 +      irq_work_sync(&pvclock_irq_work);
 +      cancel_work_sync(&pvclock_gtod_work);
 +#endif
 +      kvm_x86_call(hardware_unsetup)();
 +      kvm_mmu_vendor_module_exit();
 +      free_percpu(user_return_msrs);
 +      kmem_cache_destroy(x86_emulator_cache);
 +#ifdef CONFIG_KVM_XEN
 +      static_key_deferred_flush(&kvm_xen_enabled);
 +      WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 +#endif
 +      mutex_lock(&vendor_module_lock);
 +      kvm_x86_ops.enable_virtualization_cpu = NULL;
 +      mutex_unlock(&vendor_module_lock);
  }
 -EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
 +EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
  
  #ifdef CONFIG_X86_64
  static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
@@@ -11088,9 -11205,6 +11090,9 @@@ static int vcpu_enter_guest(struct kvm_
        if (vcpu->arch.apic_attention)
                kvm_lapic_sync_from_vapic(vcpu);
  
 +      if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
 +              return 0;
 +
        r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
        return r;
  
        return r;
  }
  
 +static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 +{
 +      return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 +              !vcpu->arch.apf.halted);
 +}
 +
 +static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 +{
 +      if (!list_empty_careful(&vcpu->async_pf.done))
 +              return true;
 +
 +      if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
 +          kvm_apic_init_sipi_allowed(vcpu))
 +              return true;
 +
 +      if (vcpu->arch.pv.pv_unhalted)
 +              return true;
 +
 +      if (kvm_is_exception_pending(vcpu))
 +              return true;
 +
 +      if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 +          (vcpu->arch.nmi_pending &&
 +           kvm_x86_call(nmi_allowed)(vcpu, false)))
 +              return true;
 +
 +#ifdef CONFIG_KVM_SMM
 +      if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
 +          (vcpu->arch.smi_pending &&
 +           kvm_x86_call(smi_allowed)(vcpu, false)))
 +              return true;
 +#endif
 +
 +      if (kvm_test_request(KVM_REQ_PMI, vcpu))
 +              return true;
 +
 +      if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
 +              return true;
 +
 +      if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
 +              return true;
 +
 +      if (kvm_hv_has_stimer_pending(vcpu))
 +              return true;
 +
 +      if (is_guest_mode(vcpu) &&
 +          kvm_x86_ops.nested_ops->has_events &&
 +          kvm_x86_ops.nested_ops->has_events(vcpu, false))
 +              return true;
 +
 +      if (kvm_xen_has_pending_events(vcpu))
 +              return true;
 +
 +      return false;
 +}
 +
 +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 +{
 +      return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
 +}
 +
  /* Called within kvm->srcu read side.  */
  static inline int vcpu_block(struct kvm_vcpu *vcpu)
  {
        return 1;
  }
  
 -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 -{
 -      return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 -              !vcpu->arch.apf.halted);
 -}
 -
  /* Called within kvm->srcu read side.  */
  static int vcpu_run(struct kvm_vcpu *vcpu)
  {
        return r;
  }
  
 +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
 +{
 +      /*
 +       * The vCPU has halted, e.g. executed HLT.  Update the run state if the
 +       * local APIC is in-kernel, the run loop will detect the non-runnable
 +       * state and halt the vCPU.  Exit to userspace if the local APIC is
 +       * managed by userspace, in which case userspace is responsible for
 +       * handling wake events.
 +       */
 +      ++vcpu->stat.halt_exits;
 +      if (lapic_in_kernel(vcpu)) {
 +              if (kvm_vcpu_has_events(vcpu))
 +                      vcpu->arch.pv.pv_unhalted = false;
 +              else
 +                      vcpu->arch.mp_state = state;
 +              return 1;
 +      } else {
 +              vcpu->run->exit_reason = reason;
 +              return 0;
 +      }
 +}
 +
 +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
 +{
 +      return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
 +}
 +EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
 +
 +int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 +{
 +      int ret = kvm_skip_emulated_instruction(vcpu);
 +      /*
 +       * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
 +       * KVM_EXIT_DEBUG here.
 +       */
 +      return kvm_emulate_halt_noskip(vcpu) && ret;
 +}
 +EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 +
 +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
 +{
 +      int ret;
 +
 +      kvm_vcpu_srcu_read_lock(vcpu);
 +      ret = kvm_emulate_halt(vcpu);
 +      kvm_vcpu_srcu_read_unlock(vcpu);
 +
 +      if (!ret)
 +              return EXIT_FASTPATH_EXIT_USERSPACE;
 +
 +      if (kvm_vcpu_running(vcpu))
 +              return EXIT_FASTPATH_REENTER_GUEST;
 +
 +      return EXIT_FASTPATH_EXIT_HANDLED;
 +}
 +EXPORT_SYMBOL_GPL(handle_fastpath_hlt);
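handle_fastpath_hlt() above collapses to three outcomes: bounce to userspace when kvm_emulate_halt() could not finish the exit in the kernel, re-enter the guest when an already-pending event kept the vCPU runnable, and otherwise report the exit as handled so the run loop can block. A stand-alone decision sketch of that mapping (illustrative enum and helper names, not KVM code):

    #include <stdbool.h>
    #include <stdio.h>

    enum fastpath {
    	FASTPATH_EXIT_USERSPACE,	/* halt must be completed by userspace */
    	FASTPATH_REENTER_GUEST,		/* vCPU stayed runnable, go straight back in */
    	FASTPATH_EXIT_HANDLED,		/* genuinely halted, let the run loop block */
    };

    static enum fastpath hlt_fastpath(bool halt_handled_in_kernel, bool still_runnable)
    {
    	if (!halt_handled_in_kernel)
    		return FASTPATH_EXIT_USERSPACE;
    	if (still_runnable)
    		return FASTPATH_REENTER_GUEST;
    	return FASTPATH_EXIT_HANDLED;
    }

    int main(void)
    {
    	printf("%d %d %d\n",
    	       hlt_fastpath(false, false),	/* -> EXIT_USERSPACE */
    	       hlt_fastpath(true, true),	/* -> REENTER_GUEST */
    	       hlt_fastpath(true, false));	/* -> EXIT_HANDLED */
    	return 0;
    }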
 +
 +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
 +{
 +      int ret = kvm_skip_emulated_instruction(vcpu);
 +
 +      return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
 +                                      KVM_EXIT_AP_RESET_HOLD) && ret;
 +}
 +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
 +
 +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 +{
 +      return kvm_vcpu_apicv_active(vcpu) &&
 +             kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
 +}
 +
 +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
 +{
 +      return vcpu->arch.preempted_in_kernel;
 +}
 +
 +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 +{
 +      if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
 +              return true;
 +
 +      if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 +#ifdef CONFIG_KVM_SMM
 +              kvm_test_request(KVM_REQ_SMI, vcpu) ||
 +#endif
 +               kvm_test_request(KVM_REQ_EVENT, vcpu))
 +              return true;
 +
 +      return kvm_arch_dy_has_pending_interrupt(vcpu);
 +}
 +
  static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
  {
        return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
@@@ -12295,6 -12262,8 +12297,6 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
        vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
  
 -      vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
 -
        kvm_async_pf_hash_reset(vcpu);
  
        vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
@@@ -12460,8 -12429,6 +12462,8 @@@ void kvm_vcpu_reset(struct kvm_vcpu *vc
        if (!init_event) {
                vcpu->arch.smbase = 0x30000;
  
 +              vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
 +
                vcpu->arch.msr_misc_features_enables = 0;
                vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
                                                  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
@@@ -12547,17 -12514,7 +12549,17 @@@ void kvm_vcpu_deliver_sipi_vector(struc
  }
  EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
  
 -int kvm_arch_hardware_enable(void)
 +void kvm_arch_enable_virtualization(void)
 +{
 +      cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
 +}
 +
 +void kvm_arch_disable_virtualization(void)
 +{
 +      cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
 +}
 +
 +int kvm_arch_enable_virtualization_cpu(void)
  {
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        if (ret)
                return ret;
  
 -      ret = kvm_x86_call(hardware_enable)();
 +      ret = kvm_x86_call(enable_virtualization_cpu)();
        if (ret != 0)
                return ret;
  
        return 0;
  }
  
 -void kvm_arch_hardware_disable(void)
 +void kvm_arch_disable_virtualization_cpu(void)
  {
 -      kvm_x86_call(hardware_disable)();
 +      kvm_x86_call(disable_virtualization_cpu)();
        drop_user_return_notifiers();
  }
  
@@@ -13203,6 -13160,87 +13205,6 @@@ void kvm_arch_commit_memory_region(stru
                kvm_arch_free_memslot(kvm, old);
  }
  
 -static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 -{
 -      if (!list_empty_careful(&vcpu->async_pf.done))
 -              return true;
 -
 -      if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
 -          kvm_apic_init_sipi_allowed(vcpu))
 -              return true;
 -
 -      if (vcpu->arch.pv.pv_unhalted)
 -              return true;
 -
 -      if (kvm_is_exception_pending(vcpu))
 -              return true;
 -
 -      if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 -          (vcpu->arch.nmi_pending &&
 -           kvm_x86_call(nmi_allowed)(vcpu, false)))
 -              return true;
 -
 -#ifdef CONFIG_KVM_SMM
 -      if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
 -          (vcpu->arch.smi_pending &&
 -           kvm_x86_call(smi_allowed)(vcpu, false)))
 -              return true;
 -#endif
 -
 -      if (kvm_test_request(KVM_REQ_PMI, vcpu))
 -              return true;
 -
 -      if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
 -              return true;
 -
 -      if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
 -              return true;
 -
 -      if (kvm_hv_has_stimer_pending(vcpu))
 -              return true;
 -
 -      if (is_guest_mode(vcpu) &&
 -          kvm_x86_ops.nested_ops->has_events &&
 -          kvm_x86_ops.nested_ops->has_events(vcpu, false))
 -              return true;
 -
 -      if (kvm_xen_has_pending_events(vcpu))
 -              return true;
 -
 -      return false;
 -}
 -
 -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 -{
 -      return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
 -}
 -
 -bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 -{
 -      return kvm_vcpu_apicv_active(vcpu) &&
 -             kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
 -}
 -
 -bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
 -{
 -      return vcpu->arch.preempted_in_kernel;
 -}
 -
 -bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 -{
 -      if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
 -              return true;
 -
 -      if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 -#ifdef CONFIG_KVM_SMM
 -              kvm_test_request(KVM_REQ_SMI, vcpu) ||
 -#endif
 -               kvm_test_request(KVM_REQ_EVENT, vcpu))
 -              return true;
 -
 -      return kvm_arch_dy_has_pending_interrupt(vcpu);
 -}
 -
  bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
  {
        if (vcpu->arch.guest_state_protected)