Merge tag 'kvm-x86-vmx-6.5' of https://github.com/kvm-x86/linux into HEAD

author Paolo Bonzini <[email protected]>

Sat, 1 Jul 2023 11:20:04 +0000 (07:20 -0400)

committer Paolo Bonzini <[email protected]>

Sat, 1 Jul 2023 11:20:04 +0000 (07:20 -0400)
author Paolo Bonzini <[email protected]>
Sat, 1 Jul 2023 11:20:04 +0000 (07:20 -0400)
committer Paolo Bonzini <[email protected]>
Sat, 1 Jul 2023 11:20:04 +0000 (07:20 -0400)
diff --combined arch/x86/kvm/mmu/mmu.c

index 03ff06cd65b33867af1885bc32bf3ecf4137ea09,beb507d82adfd9092ba92c6741d8c7e042186071..ec169f5c7dce21d5f730638ba86ebc99f3050146
--- 1/arch/x86/kvm/mmu/mmu.c
--- 2/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@@ -58,8 -58,6 +58,8 @@@
   
   extern bool itlb_multihit_kvm_mitigation;
   
+ +static bool nx_hugepage_mitigation_hard_disabled;
+ +
   int __read_mostly nx_huge_pages = -1;
   static uint __read_mostly nx_huge_pages_recovery_period_ms;
   #ifdef CONFIG_PREEMPT_RT
@@@ -69,13 -67,12 +69,13 @@@ static uint __read_mostly nx_huge_pages
   static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
   #endif
   
+ +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
   static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
   static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
   
   static const struct kernel_param_ops nx_huge_pages_ops = {
         .set = set_nx_huge_pages,
- -      .get = param_get_bool,
+ +      .get = get_nx_huge_pages,
   };
   
   static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
@@@ -1603,6 -1600,10 +1603,10 @@@ bool kvm_unmap_gfn_range(struct kvm *kv
         if (tdp_mmu_enabled)
                 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
   
+       if (kvm_x86_ops.set_apic_access_page_addr &&
+           range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
+               kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+ 
         return flush;
   }
   
@@@ -5800,14 -5801,6 +5804,14 @@@ static void __kvm_mmu_invalidate_addr(s
   
         vcpu_clear_mmio_info(vcpu, addr);
   
+ +      /*
+ +       * Walking and synchronizing SPTEs both assume they are operating in
+ +       * the context of the current MMU, and would need to be reworked if
+ +       * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
+ +       */
+ +      if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
+ +              return;
+ +
         if (!VALID_PAGE(root_hpa))
                 return;
   
@@@ -6855,14 -6848,6 +6859,14 @@@ static void mmu_destroy_caches(void
         kmem_cache_destroy(mmu_page_header_cache);
   }
   
+ +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
+ +{
+ +      if (nx_hugepage_mitigation_hard_disabled)
+ +              return sprintf(buffer, "never\n");
+ +
+ +      return param_get_bool(buffer, kp);
+ +}
+ +
   static bool get_nx_auto_mode(void)
   {
         /* Return true when CPU has the bug, and mitigations are ON */
@@@ -6879,29 -6864,15 +6883,29 @@@ static int set_nx_huge_pages(const cha
         bool old_val = nx_huge_pages;
         bool new_val;
   
+ +      if (nx_hugepage_mitigation_hard_disabled)
+ +              return -EPERM;
+ +
         /* In "auto" mode deploy workaround only if CPU has the bug. */
- -      if (sysfs_streq(val, "off"))
+ +      if (sysfs_streq(val, "off")) {
                 new_val = 0;
- -      else if (sysfs_streq(val, "force"))
+ +      } else if (sysfs_streq(val, "force")) {
                 new_val = 1;
- -      else if (sysfs_streq(val, "auto"))
+ +      } else if (sysfs_streq(val, "auto")) {
                 new_val = get_nx_auto_mode();
- -      else if (kstrtobool(val, &new_val) < 0)
+ +      } else if (sysfs_streq(val, "never")) {
+ +              new_val = 0;
+ +
+ +              mutex_lock(&kvm_lock);
+ +              if (!list_empty(&vm_list)) {
+ +                      mutex_unlock(&kvm_lock);
+ +                      return -EBUSY;
+ +              }
+ +              nx_hugepage_mitigation_hard_disabled = true;
+ +              mutex_unlock(&kvm_lock);
+ +      } else if (kstrtobool(val, &new_val) < 0) {
                 return -EINVAL;
+ +      }
   
         __set_nx_huge_pages(new_val);
   
@@@ -7039,9 -7010,6 +7043,9 @@@ static int set_nx_huge_pages_recovery_p
         uint old_period, new_period;
         int err;
   
+ +      if (nx_hugepage_mitigation_hard_disabled)
+ +              return -EPERM;
+ +
         was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
   
         err = param_set_uint(val, kp);
@@@ -7127,10 -7095,7 +7131,10 @@@ static void kvm_recover_nx_huge_pages(s
                  */
                 slot = NULL;
                 if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
- -                      slot = gfn_to_memslot(kvm, sp->gfn);
+ +                      struct kvm_memslots *slots;
+ +
+ +                      slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ +                      slot = __gfn_to_memslot(slots, sp->gfn);
                         WARN_ON_ONCE(!slot);
                 }
   
@@@ -7200,9 -7165,6 +7204,9 @@@ int kvm_mmu_post_init_vm(struct kvm *kv
   {
         int err;
   
+ +      if (nx_hugepage_mitigation_hard_disabled)
+ +              return 0;
+ +
         err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
                                           "kvm-nx-lpage-recovery",
                                           &kvm->arch.nx_huge_page_recovery_thread);
diff --combined arch/x86/kvm/vmx/nested.c

index ba2ed6d87364512ebc00c4e059695e9c8925b509,368a43e3b40e6a132a29ceb110a57d06647ca645..516391cc0d64fb9689d17fea0d04903c13abe66a
--- 1/arch/x86/kvm/vmx/nested.c
--- 2/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@@ -2328,8 -2328,7 +2328,7 @@@ static void prepare_vmcs02_early(struc
                  * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
                  * will not have to rewrite the controls just for this bit.
                  */
-               if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
-                   (vmcs12->guest_cr4 & X86_CR4_UMIP))
+               if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
                         exec_control |= SECONDARY_EXEC_DESC;
   
                 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
@@@ -2649,7 -2648,7 +2648,7 @@@ static int prepare_vmcs02(struct kvm_vc
         }
   
         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
- -          intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
+ +          kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
             WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
                                      vmcs12->guest_ia32_perf_global_ctrl))) {
                 *entry_failure_code = ENTRY_FAIL_DEFAULT;
@@@ -4524,7 -4523,7 +4523,7 @@@ static void load_vmcs12_host_state(stru
                 vcpu->arch.pat = vmcs12->host_ia32_pat;
         }
         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
- -          intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
+ +          kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
                 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
                                          vmcs12->host_ia32_perf_global_ctrl));
   
diff --combined arch/x86/kvm/vmx/pmu_intel.c

index 30ec9ccdea47252af85095631048b366c6790d32,84be32d9f365d4763796be2b7ba182556df1e476..80c769c58a876530674e823434bcbf627d19d3c6
--- 1/arch/x86/kvm/vmx/pmu_intel.c
--- 2/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@@ -73,6 -73,18 +73,6 @@@ static struct kvm_pmc *intel_pmc_idx_to
         }
   }
   
- -static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
- -{
- -      int bit;
- -
- -      if (!diff)
- -              return;
- -
- -      for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
- -              set_bit(bit, pmu->reprogram_pmi);
- -      kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
- -}
- -
   static bool intel_hw_event_available(struct kvm_pmc *pmc)
   {
         struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@@ -95,6 -107,17 +95,6 @@@
         return true;
   }
   
- -/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
- -static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
- -{
- -      struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- -
- -      if (!intel_pmu_has_perf_global_ctrl(pmu))
- -              return true;
- -
- -      return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
- -}
- -
   static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
   {
         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@@ -175,7 -198,11 +175,7 @@@ static bool intel_is_valid_msr(struct k
   
         switch (msr) {
         case MSR_CORE_PERF_FIXED_CTR_CTRL:
- -      case MSR_CORE_PERF_GLOBAL_STATUS:
- -      case MSR_CORE_PERF_GLOBAL_CTRL:
- -      case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- -              return intel_pmu_has_perf_global_ctrl(pmu);
- -              break;
+ +              return kvm_pmu_has_perf_global_ctrl(pmu);
         case MSR_IA32_PEBS_ENABLE:
                 ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
                 break;
@@@ -325,6 -352,15 +325,6 @@@ static int intel_pmu_get_msr(struct kvm
         case MSR_CORE_PERF_FIXED_CTR_CTRL:
                 msr_info->data = pmu->fixed_ctr_ctrl;
                 break;
- -      case MSR_CORE_PERF_GLOBAL_STATUS:
- -              msr_info->data = pmu->global_status;
- -              break;
- -      case MSR_CORE_PERF_GLOBAL_CTRL:
- -              msr_info->data = pmu->global_ctrl;
- -              break;
- -      case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- -              msr_info->data = 0;
- -              break;
         case MSR_IA32_PEBS_ENABLE:
                 msr_info->data = pmu->pebs_enable;
                 break;
@@@ -374,6 -410,29 +374,6 @@@ static int intel_pmu_set_msr(struct kvm
                 if (pmu->fixed_ctr_ctrl != data)
                         reprogram_fixed_counters(pmu, data);
                 break;
- -      case MSR_CORE_PERF_GLOBAL_STATUS:
- -              if (!msr_info->host_initiated)
- -                      return 1; /* RO MSR */
- -
- -              pmu->global_status = data;
- -              break;
- -      case MSR_CORE_PERF_GLOBAL_CTRL:
- -              if (!kvm_valid_perf_global_ctrl(pmu, data))
- -                      return 1;
- -
- -              if (pmu->global_ctrl != data) {
- -                      diff = pmu->global_ctrl ^ data;
- -                      pmu->global_ctrl = data;
- -                      reprogram_counters(pmu, diff);
- -              }
- -              break;
- -      case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- -              if (data & pmu->global_ovf_ctrl_mask)
- -                      return 1;
- -
- -              if (!msr_info->host_initiated)
- -                      pmu->global_status &= ~data;
- -              break;
         case MSR_IA32_PEBS_ENABLE:
                 if (data & pmu->pebs_enable_mask)
                         return 1;
@@@ -385,8 -444,6 +385,6 @@@
                 }
                 break;
         case MSR_IA32_DS_AREA:
-               if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS))
-                       return 1;
                 if (is_noncanonical_address(data, vcpu))
                         return 1;
   
@@@ -472,7 -529,7 +470,7 @@@ static void intel_pmu_refresh(struct kv
         pmu->reserved_bits = 0xffffffff00200000ull;
         pmu->raw_event_mask = X86_RAW_EVENT_MASK;
         pmu->global_ctrl_mask = ~0ull;
- -      pmu->global_ovf_ctrl_mask = ~0ull;
+ +      pmu->global_status_mask = ~0ull;
         pmu->fixed_ctr_ctrl_mask = ~0ull;
         pmu->pebs_enable_mask = ~0ull;
         pmu->pebs_data_cfg_mask = ~0ull;
@@@ -526,17 -583,11 +524,17 @@@
         counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
                 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
         pmu->global_ctrl_mask = counter_mask;
- -      pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
+ +
+ +      /*
+ +       * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
+ +       * share reserved bit definitions.  The kernel just happens to use
+ +       * OVF_CTRL for the names.
+ +       */
+ +      pmu->global_status_mask = pmu->global_ctrl_mask
                         & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
                             MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
         if (vmx_pt_mode_is_host_guest())
- -              pmu->global_ovf_ctrl_mask &=
+ +              pmu->global_status_mask &=
                                 ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
   
         entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
@@@ -748,7 -799,7 +746,7 @@@ void intel_pmu_cross_mapped_check(struc
                 pmc = intel_pmc_idx_to_pmc(pmu, bit);
   
                 if (!pmc || !pmc_speculative_in_use(pmc) ||
- -                  !intel_pmc_is_enabled(pmc) || !pmc->perf_event)
+ +                  !pmc_is_globally_enabled(pmc) || !pmc->perf_event)
                         continue;
   
                 /*
@@@ -763,6 -814,7 +761,6 @@@
   
   struct kvm_pmu_ops intel_pmu_ops __initdata = {
         .hw_event_available = intel_hw_event_available,
- -      .pmc_is_enabled = intel_pmc_is_enabled,
         .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
         .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
         .msr_idx_to_pmc = intel_msr_idx_to_pmc,
@@@ -777,5 -829,4 +775,5 @@@
         .cleanup = intel_pmu_cleanup,
         .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
         .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
+ +      .MIN_NR_GP_COUNTERS = 1,
   };
diff --combined arch/x86/kvm/vmx/vmx.c

index 2d9d155691a7f5e20374066902bc60b7be9e13d4,9ea4a5dfe62a1e724fd44dadf2641dee1a183913..0ecf4be2c6af0e11de6b2ea993c29c3b1b00d7c0
--- 1/arch/x86/kvm/vmx/vmx.c
--- 2/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@@ -2287,16 -2287,19 +2287,16 @@@ static int vmx_set_msr(struct kvm_vcpu 
                         return 1;
                 goto find_uret_msr;
         case MSR_IA32_CR_PAT:
- -              if (!kvm_pat_valid(data))
- -                      return 1;
+ +              ret = kvm_set_msr_common(vcpu, msr_info);
+ +              if (ret)
+ +                      break;
   
                 if (is_guest_mode(vcpu) &&
                     get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
                         get_vmcs12(vcpu)->guest_ia32_pat = data;
   
- -              if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ +              if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                         vmcs_write64(GUEST_IA32_PAT, data);
- -                      vcpu->arch.pat = data;
- -                      break;
- -              }
- -              ret = kvm_set_msr_common(vcpu, msr_info);
                 break;
         case MSR_IA32_MCG_EXT_CTL:
                 if ((!msr_info->host_initiated &&
@@@ -3384,15 -3387,15 +3384,15 @@@ static bool vmx_is_valid_cr4(struct kvm
   
   void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
   {
-       unsigned long old_cr4 = vcpu->arch.cr4;
+       unsigned long old_cr4 = kvm_read_cr4(vcpu);
         struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long hw_cr4;
+ 
         /*
          * Pass through host's Machine Check Enable value to hw_cr4, which
          * is in force while we are in guest mode.  Do not let guests control
          * this bit, even if host CR4.MCE == 0.
          */
-       unsigned long hw_cr4;
- 
         hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
         if (is_unrestricted_guest(vcpu))
                 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
@@@ -3401,7 -3404,7 +3401,7 @@@
         else
                 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
   
-       if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+       if (vmx_umip_emulated()) {
                 if (cr4 & X86_CR4_UMIP) {
                         secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
                         hw_cr4 &= ~X86_CR4_UMIP;
@@@ -5399,7 -5402,13 +5399,13 @@@ static int handle_set_cr4(struct kvm_vc
   
   static int handle_desc(struct kvm_vcpu *vcpu)
   {
-       WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
+       /*
+        * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
+        * and other code needs to be updated if UMIP can be guest owned.
+        */
+       BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
+ 
+       WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
         return kvm_emulate_instruction(vcpu, 0);
   }
   
@@@ -6705,7 -6714,12 +6711,12 @@@ void vmx_set_virtual_apic_mode(struct k
   
   static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
   {
-       struct page *page;
+       const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_memslots *slots = kvm_memslots(kvm);
+       struct kvm_memory_slot *slot;
+       unsigned long mmu_seq;
+       kvm_pfn_t pfn;
   
         /* Defer reload until vmcs01 is the current VMCS. */
         if (is_guest_mode(vcpu)) {
@@@ -6717,18 -6731,53 +6728,53 @@@
             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                 return;
   
-       page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-       if (is_error_page(page))
+       /*
+        * Grab the memslot so that the hva lookup for the mmu_notifier retry
+        * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
+        * on the pfn lookup's validation of the memslot to ensure a valid hva
+        * is used for the retry check.
+        */
+       slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
+       if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
                 return;
   
-       vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+       /*
+        * Ensure that the mmu_notifier sequence count is read before KVM
+        * retrieves the pfn from the primary MMU.  Note, the memslot is
+        * protected by SRCU, not the mmu_notifier.  Pairs with the smp_wmb()
+        * in kvm_mmu_invalidate_end().
+        */
+       mmu_seq = kvm->mmu_invalidate_seq;
+       smp_rmb();
+ 
+       /*
+        * No need to retry if the memslot does not exist or is invalid.  KVM
+        * controls the APIC-access page memslot, and only deletes the memslot
+        * if APICv is permanently inhibited, i.e. the memslot won't reappear.
+        */
+       pfn = gfn_to_pfn_memslot(slot, gfn);
+       if (is_error_noslot_pfn(pfn))
+               return;
+ 
+       read_lock(&vcpu->kvm->mmu_lock);
+       if (mmu_invalidate_retry_hva(kvm, mmu_seq,
+                                    gfn_to_hva_memslot(slot, gfn))) {
+               kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+               read_unlock(&vcpu->kvm->mmu_lock);
+               goto out;
+       }
+ 
+       vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
+       read_unlock(&vcpu->kvm->mmu_lock);
+ 
         vmx_flush_tlb_current(vcpu);
   
+ out:
         /*
          * Do not pin apic access page in memory, the MMU notifier
          * will call us again if it is migrated or swapped out.
          */
-       put_page(page);
+       kvm_release_pfn_clean(pfn);
   }
   
   static void vmx_hwapic_isr_update(int max_isr)
diff --combined arch/x86/kvm/x86.c

index 7d6e044504482d9f2e136606fe4a4db13c684b57,f962b7e3487ef6d1812ae3f1affc6f2fc3ac6f54..8bca4d2405f8c06047d537dd52a7430397018ca0
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -1017,11 -1017,13 +1017,11 @@@ void kvm_load_guest_xsave_state(struct 
                         wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
         }
   
- -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
- -      if (static_cpu_has(X86_FEATURE_PKU) &&
+ +      if (cpu_feature_enabled(X86_FEATURE_PKU) &&
             vcpu->arch.pkru != vcpu->arch.host_pkru &&
             ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
              kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
                 write_pkru(vcpu->arch.pkru);
- -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
   }
   EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
   
@@@ -1030,13 -1032,15 +1030,13 @@@ void kvm_load_host_xsave_state(struct k
         if (vcpu->arch.guest_state_protected)
                 return;
   
- -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
- -      if (static_cpu_has(X86_FEATURE_PKU) &&
+ +      if (cpu_feature_enabled(X86_FEATURE_PKU) &&
             ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
              kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
                 vcpu->arch.pkru = rdpkru();
                 if (vcpu->arch.pkru != vcpu->arch.host_pkru)
                         write_pkru(vcpu->arch.host_pkru);
         }
- -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
   
         if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
   
@@@ -1423,14 -1427,15 +1423,14 @@@ int kvm_emulate_rdpmc(struct kvm_vcpu *
   EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
   
   /*
- - * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- - *
- - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
- - * extract the supported MSRs from the related const lists.
- - * msrs_to_save is selected from the msrs_to_save_all to reflect the
- - * capabilities of the host cpu. This capabilities test skips MSRs that are
- - * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
- - * may depend on host virtualization features rather than host cpu features.
+ + * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
+ + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
+ + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.  msrs_to_save holds MSRs that
+ + * require host support, i.e. should be probed via RDMSR.  emulated_msrs holds
+ + * MSRs that KVM emulates without strictly requiring host support.
+ + * msr_based_features holds MSRs that enumerate features, i.e. are effectively
+ + * CPUID leafs.  Note, msr_based_features isn't mutually exclusive with
+ + * msrs_to_save and emulated_msrs.
    */
   
   static const u32 msrs_to_save_base[] = {
@@@ -1478,10 -1483,6 +1478,10 @@@ static const u32 msrs_to_save_pmu[] = 
         MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
         MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
         MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+ +
+ +      MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+ +      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+ +      MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
   };
   
   static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
@@@ -1530,11 -1531,11 +1530,11 @@@ static const u32 emulated_msrs_all[] = 
         MSR_IA32_UCODE_REV,
   
         /*
- -       * The following list leaves out MSRs whose values are determined
- -       * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
- -       * We always support the "true" VMX control MSRs, even if the host
- -       * processor does not, so I am putting these registers here rather
- -       * than in msrs_to_save_all.
+ +       * KVM always supports the "true" VMX control MSRs, even if the host
+ +       * does not.  The VMX MSRs as a whole are considered "emulated" as KVM
+ +       * doesn't strictly require them to exist in the host (ignoring that
+ +       * KVM would refuse to load in the first place if the core set of MSRs
+ +       * aren't supported).
          */
         MSR_IA32_VMX_BASIC,
         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
@@@ -1630,7 -1631,7 +1630,7 @@@ static u64 kvm_get_arch_capabilities(vo
          * If we're doing cache flushes (either "always" or "cond")
          * we will do one whenever the guest does a vmlaunch/vmresume.
          * If an outer hypervisor is doing the cache flush for us
- -       * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
+ +       * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that
          * capability to the guest too, and if EPT is disabled we're not
          * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
          * require a nested hypervisor to do a flush of its own.
@@@ -1808,7 -1809,7 +1808,7 @@@ bool kvm_msr_allowed(struct kvm_vcpu *v
                 unsigned long *bitmap = ranges[i].bitmap;
   
                 if ((index >= start) && (index < end) && (flags & type)) {
- -                      allowed = !!test_bit(index - start, bitmap);
+ +                      allowed = test_bit(index - start, bitmap);
                         break;
                 }
         }
@@@ -3701,14 -3702,8 +3701,14 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                         return 1;
                 }
                 break;
- -      case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
- -      case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
+ +      case MSR_IA32_CR_PAT:
+ +              if (!kvm_pat_valid(data))
+ +                      return 1;
+ +
+ +              vcpu->arch.pat = data;
+ +              break;
+ +      case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ +      case MSR_MTRRdefType:
                 return kvm_mtrr_set_msr(vcpu, msr, data);
         case MSR_IA32_APICBASE:
                 return kvm_set_apic_base(vcpu, msr_info);
@@@ -4115,12 -4110,9 +4115,12 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                 msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
                 break;
         }
+ +      case MSR_IA32_CR_PAT:
+ +              msr_info->data = vcpu->arch.pat;
+ +              break;
         case MSR_MTRRcap:
- -      case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
- -      case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
+ +      case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ +      case MSR_MTRRdefType:
                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
         case 0xcd: /* fsb frequency */
                 msr_info->data = 3;
@@@ -7158,12 -7150,6 +7158,12 @@@ static void kvm_probe_msr_to_save(u32 m
                     kvm_pmu_cap.num_counters_fixed)
                         return;
                 break;
+ +      case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ +      case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ +      case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ +              if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
+ +                      return;
+ +              break;
         case MSR_IA32_XFD:
         case MSR_IA32_XFD_ERR:
                 if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
@@@ -10449,20 -10435,6 +10449,6 @@@ static void vcpu_load_eoi_exitmap(struc
                 vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
   }
   
- void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-                                           unsigned long start, unsigned long end)
- {
-       unsigned long apic_address;
- 
-       /*
-        * The physical address of apic access page is stored in the VMCS.
-        * Update it when it becomes invalid.
-        */
-       apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-       if (start <= apic_address && apic_address < end)
-               kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
- }
- 
   void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
   {
         static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
@@@ -10772,9 -10744,6 +10758,9 @@@ static int vcpu_enter_guest(struct kvm_
                         exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
                         break;
                 }
+ +
+ +              /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ +              ++vcpu->stat.exits;
         }
   
         /*
diff --combined include/linux/kvm_host.h

index 84ba21c8093f8f30f0ea76ee1c517cbd60027026,cb66f4100be7487005e43edd2cccda352e76ca16..9d3ac7720da9f46a2de764bbd0508da040355f59
--- 1/include/linux/kvm_host.h
--- 2/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@@ -849,7 -849,7 +849,7 @@@ static inline void kvm_vm_bugged(struc
   
   #define KVM_BUG(cond, kvm, fmt...)                            \
   ({                                                            \
- -      int __ret = (cond);                                     \
+ +      bool __ret = !!(cond);                                  \
                                                                 \
         if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt))         \
                 kvm_vm_bugged(kvm);                             \
@@@ -858,7 -858,7 +858,7 @@@
   
   #define KVM_BUG_ON(cond, kvm)                                 \
   ({                                                            \
- -      int __ret = (cond);                                     \
+ +      bool __ret = !!(cond);                                  \
                                                                 \
         if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))           \
                 kvm_vm_bugged(kvm);                             \
@@@ -991,8 -991,6 +991,8 @@@ static inline bool kvm_memslots_empty(s
         return RB_EMPTY_ROOT(&slots->gfn_tree);
   }
   
+ +bool kvm_are_all_memslots_empty(struct kvm *kvm);
+ +
   #define kvm_for_each_memslot(memslot, bkt, slots)                           \
         hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \
                 if (WARN_ON_ONCE(!memslot->npages)) {                         \
@@@ -2239,9 -2237,6 +2239,6 @@@ static inline long kvm_arch_vcpu_async_
   }
   #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
   
- void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-                                           unsigned long start, unsigned long end);
- 
   void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);
   
   #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
diff --combined virt/kvm/kvm_main.c

index ab8c8eb9fd624275fdb485bf1c06c9d1fbe7b260,f3c7c3c901615e2be1b929d5610891ce39e19e8d..b838c8f71349e078986d1fda6b5a9a68c96cc415
--- 1/virt/kvm/kvm_main.c
--- 2/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@@ -154,11 -154,6 +154,6 @@@ static unsigned long long kvm_active_vm
   
   static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
   
- __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-                                                  unsigned long start, unsigned long end)
- {
- }
- 
   __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
   {
   }
@@@ -521,18 -516,6 +516,6 @@@ static inline struct kvm *mmu_notifier_
         return container_of(mn, struct kvm, mmu_notifier);
   }
   
- static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
-                                             struct mm_struct *mm,
-                                             unsigned long start, unsigned long end)
- {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int idx;
- 
-       idx = srcu_read_lock(&kvm->srcu);
-       kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
-       srcu_read_unlock(&kvm->srcu, idx);
- }
- 
   typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
   
   typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
@@@ -686,24 -669,6 +669,24 @@@ static __always_inline int kvm_handle_h
   
         return __kvm_handle_hva_range(kvm, &range);
   }
+ +
+ +static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+ +{
+ +      /*
+ +       * Skipping invalid memslots is correct if and only if change_pte() is
+ +       * surrounded by invalidate_range_{start,end}(), which is currently
+ +       * guaranteed by the primary MMU.  If that ever changes, KVM needs to
+ +       * unmap the memslot instead of skipping the memslot to ensure that KVM
+ +       * doesn't hold references to the old PFN.
+ +       */
+ +      WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+ +
+ +      if (range->slot->flags & KVM_MEMSLOT_INVALID)
+ +              return false;
+ +
+ +      return kvm_set_spte_gfn(kvm, range);
+ +}
+ +
   static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
                                         struct mm_struct *mm,
                                         unsigned long address,
@@@ -725,7 -690,7 +708,7 @@@
         if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
                 return;
   
- -      kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+ +      kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
   }
   
   void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
@@@ -910,7 -875,6 +893,6 @@@ static void kvm_mmu_notifier_release(st
   }
   
   static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
-       .invalidate_range       = kvm_mmu_notifier_invalidate_range,
         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
@@@ -3888,10 -3852,7 +3870,10 @@@ static int create_vcpu_fd(struct kvm_vc
   static int vcpu_get_pid(void *data, u64 *val)
   {
         struct kvm_vcpu *vcpu = data;
- -      *val = pid_nr(rcu_access_pointer(vcpu->pid));
+ +
+ +      rcu_read_lock();
+ +      *val = pid_nr(rcu_dereference(vcpu->pid));
+ +      rcu_read_unlock();
         return 0;
   }
   
@@@ -3993,7 -3954,7 +3975,7 @@@ static int kvm_vm_ioctl_create_vcpu(str
         if (r < 0)
                 goto kvm_put_xa_release;
   
- -      if (KVM_BUG_ON(!!xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
+ +      if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
                 r = -EINVAL;
                 goto kvm_put_xa_release;
         }
@@@ -4623,7 -4584,7 +4605,7 @@@ int __attribute__((weak)) kvm_vm_ioctl_
         return -EINVAL;
   }
   
- -static bool kvm_are_all_memslots_empty(struct kvm *kvm)
+ +bool kvm_are_all_memslots_empty(struct kvm *kvm)
   {
         int i;
   
@@@ -4636,7 -4597,6 +4618,7 @@@
   
         return true;
   }
+ +EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
   
   static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
                                            struct kvm_enable_cap *cap)
@@@ -5316,12 -5276,6 +5298,12 @@@ static void hardware_disable_all(void
   }
   #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
   
+ +static void kvm_iodevice_destructor(struct kvm_io_device *dev)
+ +{
+ +      if (dev->ops->destructor)
+ +              dev->ops->destructor(dev);
+ +}
+ +
   static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
   {
         int i;
@@@ -5545,7 -5499,7 +5527,7 @@@ int kvm_io_bus_register_dev(struct kvm 
   int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                               struct kvm_io_device *dev)
   {
- -      int i, j;
+ +      int i;
         struct kvm_io_bus *new_bus, *bus;
   
         lockdep_assert_held(&kvm->slots_lock);
@@@ -5575,19 -5529,18 +5557,19 @@@
         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
         synchronize_srcu_expedited(&kvm->srcu);
   
- -      /* Destroy the old bus _after_ installing the (null) bus. */
+ +      /*
+ +       * If NULL bus is installed, destroy the old bus, including all the
+ +       * attached devices. Otherwise, destroy the caller's device only.
+ +       */
         if (!new_bus) {
                 pr_err("kvm: failed to shrink bus, removing it completely\n");
- -              for (j = 0; j < bus->dev_count; j++) {
- -                      if (j == i)
- -                              continue;
- -                      kvm_iodevice_destructor(bus->range[j].dev);
- -              }
+ +              kvm_io_bus_destroy(bus);
+ +              return -ENOMEM;
         }
   
+ +      kvm_iodevice_destructor(dev);
         kfree(bus);
- -      return new_bus ? 0 : -ENOMEM;
+ +      return 0;
   }
   
   struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
author	Paolo Bonzini <[email protected]>
	Sat, 1 Jul 2023 11:20:04 +0000 (07:20 -0400)
committer	Paolo Bonzini <[email protected]>
	Sat, 1 Jul 2023 11:20:04 +0000 (07:20 -0400)
		1	2
arch/x86/kvm/mmu/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx/nested.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx/pmu_intel.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
virt/kvm/kvm_main.c	patch \|	diff1 \|	diff2 \|	blob \| history