Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <[email protected]>

Sat, 22 Jan 2022 07:40:01 +0000 (09:40 +0200)

committer Linus Torvalds <[email protected]>

Sat, 22 Jan 2022 07:40:01 +0000 (09:40 +0200)
diff --combined arch/x86/include/asm/kvm_host.h

index 0677b9ea01c901c68ea763e7330d904701fa8617,682ad02a4e585bb91dcbd0c1308f186ed7d97d0c..1384517d77093b6ec2ba471f9136f67d5176507e
--- 1/arch/x86/include/asm/kvm_host.h
--- 2/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@@ -782,7 -782,6 +782,7 @@@ struct kvm_vcpu_arch 
         unsigned nmi_pending; /* NMI queued after currently running handler */
         bool nmi_injected;    /* Trying to inject an NMI this entry */
         bool smi_pending;    /* SMI queued after currently running handler */
+ +      u8 handling_intr_from_guest;
   
         struct kvm_mtrr mtrr_state;
         u64 pat;
@@@ -1381,6 -1380,7 +1381,7 @@@ struct kvm_x86_ops 
          */
         void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
   
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
         enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
         int (*handle_exit)(struct kvm_vcpu *vcpu,
                 enum exit_fastpath_completion exit_fastpath);
@@@ -1454,18 -1454,6 +1455,6 @@@
         const struct kvm_pmu_ops *pmu_ops;
         const struct kvm_x86_nested_ops *nested_ops;
   
-       /*
-        * Architecture specific hooks for vCPU blocking due to
-        * HLT instruction.
-        * Returns for .pre_block():
-        *    - 0 means continue to block the vCPU.
-        *    - 1 means we cannot block the vCPU since some event
-        *        happens during this period, such as, 'ON' bit in
-        *        posted-interrupts descriptor is set.
-        */
-       int (*pre_block)(struct kvm_vcpu *vcpu);
-       void (*post_block)(struct kvm_vcpu *vcpu);
- 
         void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
         void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
   
@@@ -1529,7 -1517,6 +1518,7 @@@ struct kvm_x86_init_ops 
         int (*disabled_by_bios)(void);
         int (*check_processor_compatibility)(void);
         int (*hardware_setup)(void);
+ +      unsigned int (*handle_intel_pt_intr)(void);
   
         struct kvm_x86_ops *runtime_ops;
   };
@@@ -1579,9 -1566,6 +1568,9 @@@ static inline int kvm_arch_flush_remote
                 return -ENOTSUPP;
   }
   
+ +#define kvm_arch_pmi_in_guest(vcpu) \
+ +      ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
+ +
   int kvm_mmu_module_init(void);
   void kvm_mmu_module_exit(void);
   
@@@ -1913,6 -1897,8 +1902,6 @@@ int kvm_skip_emulated_instruction(struc
   int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
   void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
   
- -int kvm_is_in_guest(void);
- -
   void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
                                      u32 size);
   bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
diff --combined arch/x86/kvm/mmu/spte.c

index 351b04ad62a18f10b11962b7b4db433171a8f5d9,f8677404c93cb8431d6b597f85d7b5b12b9e4846..73cfe62fdad1fd4ac55e0dba7e04466abf9bc33d
--- 1/arch/x86/kvm/mmu/spte.c
--- 2/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@@ -16,7 -16,6 +16,7 @@@
   #include "spte.h"
   
   #include <asm/e820/api.h>
+ +#include <asm/memtype.h>
   #include <asm/vmx.h>
   
   static bool __read_mostly enable_mmio_caching = true;
@@@ -216,6 -215,7 +216,7 @@@ u64 kvm_mmu_changed_pte_notifier_make_s
   
         new_spte &= ~PT_WRITABLE_MASK;
         new_spte &= ~shadow_host_writable_mask;
+       new_spte &= ~shadow_mmu_writable_mask;
   
         new_spte = mark_spte_for_access_track(new_spte);
   
diff --combined arch/x86/kvm/pmu.c

index 261b39cbef6ea52c77473a097839cf1f08438387,2c98f3ee8df402c6bdbcec8d1bb9ccc381584d1a..f614f95acc6b3e38b0a928e42cdd0f43c9180e02
--- 1/arch/x86/kvm/pmu.c
--- 2/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@@ -13,6 -13,8 +13,8 @@@
   #include <linux/types.h>
   #include <linux/kvm_host.h>
   #include <linux/perf_event.h>
+ #include <linux/bsearch.h>
+ #include <linux/sort.h>
   #include <asm/perf_event.h>
   #include "x86.h"
   #include "cpuid.h"
@@@ -77,7 -79,7 +79,7 @@@ static inline void __kvm_perf_overflow(
          * woken up. So we should wake it, but this is impossible from
          * NMI context. Do it from irq work instead.
          */
- -      if (in_pmi && !kvm_is_in_guest())
+ +      if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
                 irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
         else
                 kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
@@@ -109,6 -111,9 +111,9 @@@ static void pmc_reprogram_counter(struc
                 .config = config,
         };
   
+       if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
+               return;
+ 
         attr.sample_period = get_sample_period(pmc, pmc->counter);
   
         if (in_tx)
@@@ -169,12 -174,16 +174,16 @@@ static bool pmc_resume_counter(struct k
         return true;
   }
   
+ /*
+  * sort()/bsearch() comparator for __u64 keys.  Returns a negative, zero or
+  * positive value when *a is less than, equal to or greater than *b.
+  *
+  * Compare instead of subtracting: a 64-bit difference converted to int is
+  * truncated, so unequal keys could compare as equal (or with the wrong
+  * sign), breaking the sorted order and making bsearch() miss entries.
+  */
+ static int cmp_u64(const void *a, const void *b)
+ {
+       __u64 lhs = *(const __u64 *)a;
+       __u64 rhs = *(const __u64 *)b;
+ 
+       return (lhs > rhs) - (lhs < rhs);
+ }
+ 
   void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
   {
         unsigned config, type = PERF_TYPE_RAW;
         struct kvm *kvm = pmc->vcpu->kvm;
         struct kvm_pmu_event_filter *filter;
-       int i;
         bool allow_event = true;
   
         if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@@ -189,16 -198,13 +198,13 @@@
   
         filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
         if (filter) {
-               for (i = 0; i < filter->nevents; i++)
-                       if (filter->events[i] ==
-                           (eventsel & AMD64_RAW_EVENT_MASK_NB))
-                               break;
-               if (filter->action == KVM_PMU_EVENT_ALLOW &&
-                   i == filter->nevents)
-                       allow_event = false;
-               if (filter->action == KVM_PMU_EVENT_DENY &&
-                   i < filter->nevents)
-                       allow_event = false;
+               __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+ 
+               if (bsearch(&key, filter->events, filter->nevents,
+                           sizeof(__u64), cmp_u64))
+                       allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
+               else
+                       allow_event = filter->action == KVM_PMU_EVENT_DENY;
         }
         if (!allow_event)
                 return;
@@@ -573,6 -579,11 +579,11 @@@ int kvm_vm_ioctl_set_pmu_event_filter(s
         /* Ensure nevents can't be changed between the user copies. */
         *filter = tmp;
   
+       /*
+        * Sort the in-kernel list so that we can search it with bsearch.
+        */
+       sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+ 
         mutex_lock(&kvm->lock);
         filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
                                      mutex_is_locked(&kvm->lock));
diff --combined arch/x86/kvm/svm/svm.c

index 46bcc706f25740b3f0a1956a4d2124fda095306d,6d31d357a83b9fdc88a68d01130ca68db8b9c06f..2c99b18d76c0f9b5157b3999a733a6e46cbc8561
--- 1/arch/x86/kvm/svm/svm.c
--- 2/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@@ -192,10 -192,6 +192,6 @@@ module_param(vgif, int, 0444)
   static int lbrv = true;
   module_param(lbrv, int, 0444);
   
- /* enable/disable PMU virtualization */
- bool pmu = true;
- module_param(pmu, bool, 0444);
- 
   static int tsc_scaling = true;
   module_param(tsc_scaling, int, 0444);
   
@@@ -873,47 -869,6 +869,6 @@@ static void shrink_ple_window(struct kv
         }
   }
   
- /*
-  * The default MMIO mask is a single bit (excluding the present bit),
-  * which could conflict with the memory encryption bit. Check for
-  * memory encryption support and override the default MMIO mask if
-  * memory encryption is enabled.
-  */
- static __init void svm_adjust_mmio_mask(void)
- {
-       unsigned int enc_bit, mask_bit;
-       u64 msr, mask;
- 
-       /* If there is no memory encryption support, use existing mask */
-       if (cpuid_eax(0x80000000) < 0x8000001f)
-               return;
- 
-       /* If memory encryption is not enabled, use existing mask */
-       rdmsrl(MSR_AMD64_SYSCFG, msr);
-       if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
-               return;
- 
-       enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
-       mask_bit = boot_cpu_data.x86_phys_bits;
- 
-       /* Increment the mask bit if it is the same as the encryption bit */
-       if (enc_bit == mask_bit)
-               mask_bit++;
- 
-       /*
-        * If the mask bit location is below 52, then some bits above the
-        * physical addressing limit will always be reserved, so use the
-        * rsvd_bits() function to generate the mask. This mask, along with
-        * the present bit, will be used to generate a page fault with
-        * PFER.RSV = 1.
-        *
-        * If the mask bit location is 52 (or above), then clear the mask.
-        */
-       mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
- 
-       kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
- }
- 
   static void svm_hardware_teardown(void)
   {
         int cpu;
@@@ -928,198 -883,6 +883,6 @@@
         iopm_base = 0;
   }
   
- static __init void svm_set_cpu_caps(void)
- {
-       kvm_set_cpu_caps();
- 
-       supported_xss = 0;
- 
-       /* CPUID 0x80000001 and 0x8000000A (SVM features) */
-       if (nested) {
-               kvm_cpu_cap_set(X86_FEATURE_SVM);
- 
-               if (nrips)
-                       kvm_cpu_cap_set(X86_FEATURE_NRIPS);
- 
-               if (npt_enabled)
-                       kvm_cpu_cap_set(X86_FEATURE_NPT);
- 
-               if (tsc_scaling)
-                       kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
- 
-               /* Nested VM can receive #VMEXIT instead of triggering #GP */
-               kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
-       }
- 
-       /* CPUID 0x80000008 */
-       if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
-           boot_cpu_has(X86_FEATURE_AMD_SSBD))
-               kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
- 
-       /* AMD PMU PERFCTR_CORE CPUID */
-       if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-               kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
- 
-       /* CPUID 0x8000001F (SME/SEV features) */
-       sev_set_cpu_caps();
- }
- 
- static __init int svm_hardware_setup(void)
- {
-       int cpu;
-       struct page *iopm_pages;
-       void *iopm_va;
-       int r;
-       unsigned int order = get_order(IOPM_SIZE);
- 
-       /*
-        * NX is required for shadow paging and for NPT if the NX huge pages
-        * mitigation is enabled.
-        */
-       if (!boot_cpu_has(X86_FEATURE_NX)) {
-               pr_err_ratelimited("NX (Execute Disable) not supported\n");
-               return -EOPNOTSUPP;
-       }
-       kvm_enable_efer_bits(EFER_NX);
- 
-       iopm_pages = alloc_pages(GFP_KERNEL, order);
- 
-       if (!iopm_pages)
-               return -ENOMEM;
- 
-       iopm_va = page_address(iopm_pages);
-       memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
-       iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
- 
-       init_msrpm_offsets();
- 
-       supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
- 
-       if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
-               kvm_enable_efer_bits(EFER_FFXSR);
- 
-       if (tsc_scaling) {
-               if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
-                       tsc_scaling = false;
-               } else {
-                       pr_info("TSC scaling supported\n");
-                       kvm_has_tsc_control = true;
-                       kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
-                       kvm_tsc_scaling_ratio_frac_bits = 32;
-               }
-       }
- 
-       tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
- 
-       /* Check for pause filtering support */
-       if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
-               pause_filter_count = 0;
-               pause_filter_thresh = 0;
-       } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
-               pause_filter_thresh = 0;
-       }
- 
-       if (nested) {
-               printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
-               kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
-       }
- 
-       /*
-        * KVM's MMU doesn't support using 2-level paging for itself, and thus
-        * NPT isn't supported if the host is using 2-level paging since host
-        * CR4 is unchanged on VMRUN.
-        */
-       if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
-               npt_enabled = false;
- 
-       if (!boot_cpu_has(X86_FEATURE_NPT))
-               npt_enabled = false;
- 
-       /* Force VM NPT level equal to the host's paging level */
-       kvm_configure_mmu(npt_enabled, get_npt_level(),
-                         get_npt_level(), PG_LEVEL_1G);
-       pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
- 
-       /* Note, SEV setup consumes npt_enabled. */
-       sev_hardware_setup();
- 
-       svm_hv_hardware_setup();
- 
-       svm_adjust_mmio_mask();
- 
-       for_each_possible_cpu(cpu) {
-               r = svm_cpu_init(cpu);
-               if (r)
-                       goto err;
-       }
- 
-       if (nrips) {
-               if (!boot_cpu_has(X86_FEATURE_NRIPS))
-                       nrips = false;
-       }
- 
-       enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
- 
-       if (enable_apicv) {
-               pr_info("AVIC enabled\n");
- 
-               amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
-       }
- 
-       if (vls) {
-               if (!npt_enabled ||
-                   !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
-                   !IS_ENABLED(CONFIG_X86_64)) {
-                       vls = false;
-               } else {
-                       pr_info("Virtual VMLOAD VMSAVE supported\n");
-               }
-       }
- 
-       if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
-               svm_gp_erratum_intercept = false;
- 
-       if (vgif) {
-               if (!boot_cpu_has(X86_FEATURE_VGIF))
-                       vgif = false;
-               else
-                       pr_info("Virtual GIF supported\n");
-       }
- 
-       if (lbrv) {
-               if (!boot_cpu_has(X86_FEATURE_LBRV))
-                       lbrv = false;
-               else
-                       pr_info("LBR virtualization supported\n");
-       }
- 
-       if (!pmu)
-               pr_info("PMU virtualization is disabled\n");
- 
-       svm_set_cpu_caps();
- 
-       /*
-        * It seems that on AMD processors PTE's accessed bit is
-        * being set by the CPU hardware before the NPF vmexit.
-        * This is not expected behaviour and our tests fail because
-        * of it.
-        * A workaround here is to disable support for
-        * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
-        * In this case userspace can know if there is support using
-        * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
-        * it
-        * If future AMD CPU models change the behaviour described above,
-        * this variable can be changed accordingly
-        */
-       allow_smaller_maxphyaddr = !npt_enabled;
- 
-       return 0;
- 
- err:
-       svm_hardware_teardown();
-       return r;
- }
- 
   static void init_seg(struct vmcb_seg *seg)
   {
         seg->selector = 0;
@@@ -1444,12 -1207,6 +1207,6 @@@ static int svm_create_vcpu(struct kvm_v
         if (err)
                 goto error_free_vmsa_page;
   
-       /* We initialize this flag to true to make sure that the is_running
-        * bit would be set the first time the vcpu is loaded.
-        */
-       if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
-               svm->avic_is_running = true;
- 
         svm->msrpm = svm_vcpu_alloc_msrpm();
         if (!svm->msrpm) {
                 err = -ENOMEM;
@@@ -3833,6 -3590,11 +3590,11 @@@ static void svm_cancel_injection(struc
         svm_complete_interrupts(vcpu);
   }
   
+ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+ {
+       return 1;
+ }
+ 
   static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
   {
         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
@@@ -3967,7 -3729,7 +3729,7 @@@ static __no_kcsan fastpath_t svm_vcpu_r
         vcpu->arch.regs_dirty = 0;
   
         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- -              kvm_before_interrupt(vcpu);
+ +              kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
   
         kvm_load_host_xsave_state(vcpu);
         stgi();
@@@ -4629,8 -4391,8 +4391,8 @@@ static struct kvm_x86_ops svm_x86_ops _
         .prepare_guest_switch = svm_prepare_guest_switch,
         .vcpu_load = svm_vcpu_load,
         .vcpu_put = svm_vcpu_put,
-       .vcpu_blocking = svm_vcpu_blocking,
-       .vcpu_unblocking = svm_vcpu_unblocking,
+       .vcpu_blocking = avic_vcpu_blocking,
+       .vcpu_unblocking = avic_vcpu_unblocking,
   
         .update_exception_bitmap = svm_update_exception_bitmap,
         .get_msr_feature = svm_get_msr_feature,
@@@ -4662,6 -4424,7 +4424,7 @@@
         .tlb_flush_gva = svm_flush_tlb_gva,
         .tlb_flush_guest = svm_flush_tlb,
   
+       .vcpu_pre_run = svm_vcpu_pre_run,
         .run = svm_vcpu_run,
         .handle_exit = handle_exit,
         .skip_emulated_instruction = skip_emulated_instruction,
@@@ -4742,6 -4505,243 +4505,243 @@@
         .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
   };
   
+ /*
+  * The default MMIO mask is a single bit (excluding the present bit),
+  * which could conflict with the memory encryption bit. Check for
+  * memory encryption support and override the default MMIO mask if
+  * memory encryption is enabled.
+  */
+ static __init void svm_adjust_mmio_mask(void)
+ {
+       unsigned int enc_bit, mask_bit;
+       u64 msr, mask;
+ 
+       /* If there is no memory encryption support, use existing mask */
+       if (cpuid_eax(0x80000000) < 0x8000001f)
+               return;
+ 
+       /* If memory encryption is not enabled, use existing mask */
+       rdmsrl(MSR_AMD64_SYSCFG, msr);
+       if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+               return;
+ 
+       enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+       mask_bit = boot_cpu_data.x86_phys_bits;
+ 
+       /* Increment the mask bit if it is the same as the encryption bit */
+       if (enc_bit == mask_bit)
+               mask_bit++;
+ 
+       /*
+        * If the mask bit location is below 52, then some bits above the
+        * physical addressing limit will always be reserved, so use the
+        * rsvd_bits() function to generate the mask. This mask, along with
+        * the present bit, will be used to generate a page fault with
+        * PFER.RSV = 1.
+        *
+        * If the mask bit location is 52 (or above), then clear the mask.
+        */
+       mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+ 
+       kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+ }
+ 
+ static __init void svm_set_cpu_caps(void)
+ {
+       kvm_set_cpu_caps();
+ 
+       supported_xss = 0;
+ 
+       /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+       if (nested) {
+               kvm_cpu_cap_set(X86_FEATURE_SVM);
+ 
+               if (nrips)
+                       kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+ 
+               if (npt_enabled)
+                       kvm_cpu_cap_set(X86_FEATURE_NPT);
+ 
+               if (tsc_scaling)
+                       kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+ 
+               /* Nested VM can receive #VMEXIT instead of triggering #GP */
+               kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
+       }
+ 
+       /* CPUID 0x80000008 */
+       if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+           boot_cpu_has(X86_FEATURE_AMD_SSBD))
+               kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+ 
+       /* AMD PMU PERFCTR_CORE CPUID */
+       if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+               kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
+ 
+       /* CPUID 0x8000001F (SME/SEV features) */
+       sev_set_cpu_caps();
+ }
+ 
+ static __init int svm_hardware_setup(void)
+ {
+       int cpu;
+       struct page *iopm_pages;
+       void *iopm_va;
+       int r;
+       unsigned int order = get_order(IOPM_SIZE);
+ 
+       /*
+        * NX is required for shadow paging and for NPT if the NX huge pages
+        * mitigation is enabled.
+        */
+       if (!boot_cpu_has(X86_FEATURE_NX)) {
+               pr_err_ratelimited("NX (Execute Disable) not supported\n");
+               return -EOPNOTSUPP;
+       }
+       kvm_enable_efer_bits(EFER_NX);
+ 
+       iopm_pages = alloc_pages(GFP_KERNEL, order);
+ 
+       if (!iopm_pages)
+               return -ENOMEM;
+ 
+       iopm_va = page_address(iopm_pages);
+       memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
+       iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ 
+       init_msrpm_offsets();
+ 
+       supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+ 
+       if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+               kvm_enable_efer_bits(EFER_FFXSR);
+ 
+       if (tsc_scaling) {
+               if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+                       tsc_scaling = false;
+               } else {
+                       pr_info("TSC scaling supported\n");
+                       kvm_has_tsc_control = true;
+                       kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+                       kvm_tsc_scaling_ratio_frac_bits = 32;
+               }
+       }
+ 
+       tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+ 
+       /* Check for pause filtering support */
+       if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+               pause_filter_count = 0;
+               pause_filter_thresh = 0;
+       } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+               pause_filter_thresh = 0;
+       }
+ 
+       if (nested) {
+               printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+               kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+       }
+ 
+       /*
+        * KVM's MMU doesn't support using 2-level paging for itself, and thus
+        * NPT isn't supported if the host is using 2-level paging since host
+        * CR4 is unchanged on VMRUN.
+        */
+       if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+               npt_enabled = false;
+ 
+       if (!boot_cpu_has(X86_FEATURE_NPT))
+               npt_enabled = false;
+ 
+       /* Force VM NPT level equal to the host's paging level */
+       kvm_configure_mmu(npt_enabled, get_npt_level(),
+                         get_npt_level(), PG_LEVEL_1G);
+       pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ 
+       /* Note, SEV setup consumes npt_enabled. */
+       sev_hardware_setup();
+ 
+       svm_hv_hardware_setup();
+ 
+       svm_adjust_mmio_mask();
+ 
+       for_each_possible_cpu(cpu) {
+               r = svm_cpu_init(cpu);
+               if (r)
+                       goto err;
+       }
+ 
+       if (nrips) {
+               if (!boot_cpu_has(X86_FEATURE_NRIPS))
+                       nrips = false;
+       }
+ 
+       enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
+ 
+       if (enable_apicv) {
+               pr_info("AVIC enabled\n");
+ 
+               amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+       } else {
+               svm_x86_ops.vcpu_blocking = NULL;
+               svm_x86_ops.vcpu_unblocking = NULL;
+       }
+ 
+       if (vls) {
+               if (!npt_enabled ||
+                   !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+                   !IS_ENABLED(CONFIG_X86_64)) {
+                       vls = false;
+               } else {
+                       pr_info("Virtual VMLOAD VMSAVE supported\n");
+               }
+       }
+ 
+       if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+               svm_gp_erratum_intercept = false;
+ 
+       if (vgif) {
+               if (!boot_cpu_has(X86_FEATURE_VGIF))
+                       vgif = false;
+               else
+                       pr_info("Virtual GIF supported\n");
+       }
+ 
+       if (lbrv) {
+               if (!boot_cpu_has(X86_FEATURE_LBRV))
+                       lbrv = false;
+               else
+                       pr_info("LBR virtualization supported\n");
+       }
+ 
+       if (!enable_pmu)
+               pr_info("PMU virtualization is disabled\n");
+ 
+       svm_set_cpu_caps();
+ 
+       /*
+        * It seems that on AMD processors PTE's accessed bit is
+        * being set by the CPU hardware before the NPF vmexit.
+        * This is not expected behaviour and our tests fail because
+        * of it.
+        * A workaround here is to disable support for
+        * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+        * In this case userspace can know if there is support using
+        * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+        * it
+        * If future AMD CPU models change the behaviour described above,
+        * this variable can be changed accordingly
+        */
+       allow_smaller_maxphyaddr = !npt_enabled;
+ 
+       return 0;
+ 
+ err:
+       svm_hardware_teardown();
+       return r;
+ }
+ 
+ 
   static struct kvm_x86_init_ops svm_init_ops __initdata = {
         .cpu_has_kvm_support = has_svm,
         .disabled_by_bios = is_disabled,
diff --combined arch/x86/kvm/vmx/vmx.c

index 1b2e9d8c5cc9b0c0743ec034156869d83a9a1f0f,a02a28ce7cc34f95fe6ab7a77bc7c990587e45c9..4ac676066d6079b5d918d2daf54af045fe14c796
--- 1/arch/x86/kvm/vmx/vmx.c
--- 2/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@@ -3931,12 -3931,10 +3931,10 @@@ static void vmx_msr_filter_changed(stru
         pt_update_intercept_for_msr(vcpu);
   }
   
- static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
-                                                    bool nested)
+ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+                                                    int pi_vec)
   {
   #ifdef CONFIG_SMP
-       int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
- 
         if (vcpu->mode == IN_GUEST_MODE) {
                 /*
                  * The vector of interrupt to be delivered to vcpu had
@@@ -3964,10 -3962,15 +3962,15 @@@
                  */
   
                 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
-               return true;
+               return;
         }
   #endif
-       return false;
+       /*
+        * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+        * otherwise do nothing as KVM will grab the highest priority pending
+        * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+        */
+       kvm_vcpu_wake_up(vcpu);
   }
   
   static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@@ -3997,8 -4000,7 +4000,7 @@@
                 smp_mb__after_atomic();
   
                 /* the PIR and ON have been set by L1. */
-               if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
-                       kvm_vcpu_kick(vcpu);
+               kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
                 return 0;
         }
         return -1;
@@@ -4035,9 -4037,7 +4037,7 @@@ static int vmx_deliver_posted_interrupt
          * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
          * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
          */
-       if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
-               kvm_vcpu_kick(vcpu);
- 
+       kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
         return 0;
   }
   
@@@ -5426,6 -5426,14 +5426,14 @@@ static int handle_nmi_window(struct kvm
         return 1;
   }
   
+ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+ 
+       return vmx->emulation_required && !vmx->rmode.vm86_active &&
+              vcpu->arch.exception.pending;
+ }
+ 
   static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -5445,8 -5453,7 +5453,7 @@@
                 if (!kvm_emulate_instruction(vcpu, 0))
                         return 0;
   
-               if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-                   vcpu->arch.exception.pending) {
+               if (vmx_emulation_required_with_pending_exception(vcpu)) {
                         kvm_prepare_emulation_failure_exit(vcpu);
                         return 0;
                 }
@@@ -5468,6 -5475,16 +5475,16 @@@
         return 1;
   }
   
+ static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+ {
+       if (vmx_emulation_required_with_pending_exception(vcpu)) {
+               kvm_prepare_emulation_failure_exit(vcpu);
+               return 0;
+       }
+ 
+       return 1;
+ }
+ 
   static void grow_ple_window(struct kvm_vcpu *vcpu)
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -6434,9 -6451,7 +6451,9 @@@ void vmx_do_interrupt_nmi_irqoff(unsign
   static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
                                         unsigned long entry)
   {
- -      kvm_before_interrupt(vcpu);
+ +      bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
+ +
+ +      kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
         vmx_do_interrupt_nmi_irqoff(entry);
         kvm_after_interrupt(vcpu);
   }
@@@ -6928,6 -6943,8 +6945,8 @@@ static int vmx_create_vcpu(struct kvm_v
         BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
         vmx = to_vmx(vcpu);
   
+       INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+ 
         err = -ENOMEM;
   
         vmx->vpid = allocate_vpid();
@@@ -7549,25 -7566,6 +7568,6 @@@ void vmx_update_cpu_dirty_logging(struc
                 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
   }
   
- static int vmx_pre_block(struct kvm_vcpu *vcpu)
- {
-       if (pi_pre_block(vcpu))
-               return 1;
- 
-       if (kvm_lapic_hv_timer_in_use(vcpu))
-               kvm_lapic_switch_to_sw_timer(vcpu);
- 
-       return 0;
- }
- 
- static void vmx_post_block(struct kvm_vcpu *vcpu)
- {
-       if (kvm_x86_ops.set_hv_timer)
-               kvm_lapic_switch_to_hv_timer(vcpu);
- 
-       pi_post_block(vcpu);
- }
- 
   static void vmx_setup_mce(struct kvm_vcpu *vcpu)
   {
         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@@ -7710,6 -7708,7 +7710,7 @@@ static struct kvm_x86_ops vmx_x86_ops _
         .tlb_flush_gva = vmx_flush_tlb_gva,
         .tlb_flush_guest = vmx_flush_tlb_guest,
   
+       .vcpu_pre_run = vmx_vcpu_pre_run,
         .run = vmx_vcpu_run,
         .handle_exit = vmx_handle_exit,
         .skip_emulated_instruction = vmx_skip_emulated_instruction,
@@@ -7768,9 -7767,6 +7769,6 @@@
         .cpu_dirty_log_size = PML_ENTITY_NUM,
         .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
   
-       .pre_block = vmx_pre_block,
-       .post_block = vmx_post_block,
- 
         .pmu_ops = &intel_pmu_ops,
         .nested_ops = &vmx_nested_ops,
   
@@@ -7799,20 -7795,6 +7797,20 @@@
         .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
   };
   
+ +static unsigned int vmx_handle_intel_pt_intr(void)
+ +{
+ +      struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+ +
+ +      /* '0' on failure so that the !PT case can use a RET0 static call. */
+ +      if (!kvm_arch_pmi_in_guest(vcpu))
+ +              return 0;
+ +
+ +      kvm_make_request(KVM_REQ_PMI, vcpu);
+ +      __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ +                (unsigned long *)&vcpu->arch.pmu.global_status);
+ +      return 1;
+ +}
+ +
   static __init void vmx_setup_user_return_msrs(void)
   {
   
@@@ -7839,8 -7821,6 +7837,8 @@@
                 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
   }
   
+ +static struct kvm_x86_init_ops vmx_init_ops __initdata;
+ +
   static __init int hardware_setup(void)
   {
         unsigned long host_bndcfgs;
@@@ -7991,10 -7971,6 +7989,10 @@@
                 return -EINVAL;
         if (!enable_ept || !cpu_has_vmx_intel_pt())
                 pt_mode = PT_MODE_SYSTEM;
+ +      if (pt_mode == PT_MODE_HOST_GUEST)
+ +              vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ +      else
+ +              vmx_init_ops.handle_intel_pt_intr = NULL;
   
         setup_default_sgx_lepubkeyhash();
   
@@@ -8023,7 -7999,6 +8021,7 @@@ static struct kvm_x86_init_ops vmx_init
         .disabled_by_bios = vmx_disabled_by_bios,
         .check_processor_compatibility = vmx_check_processor_compat,
         .hardware_setup = hardware_setup,
+ +      .handle_intel_pt_intr = NULL,
   
         .runtime_ops = &vmx_x86_ops,
   };
diff --combined arch/x86/kvm/x86.c

index 76b4803dd3bdd0584bf01371df0f1652afdb8416,55518b7d3b964f5c2e44e3fbe7590fa6c7152a42..9e43d756312f17063dbf7a68adb3b6d1066f9080
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -187,6 -187,11 +187,11 @@@ module_param(force_emulation_prefix, bo
   int __read_mostly pi_inject_timer = -1;
   module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
   
+ /* Enable/disable PMU virtualization */
+ bool __read_mostly enable_pmu = true;
+ EXPORT_SYMBOL_GPL(enable_pmu);
+ module_param(enable_pmu, bool, 0444);
+ 
   /*
    * Restoring the host value for MSRs that are only consumed when running in
    * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@@ -5230,17 -5235,6 +5235,6 @@@ long kvm_arch_vcpu_ioctl(struct file *f
                 struct kvm_cpuid __user *cpuid_arg = argp;
                 struct kvm_cpuid cpuid;
   
-               /*
-                * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
-                * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
-                * tracked in kvm_mmu_page_role.  As a result, KVM may miss guest page
-                * faults due to reusing SPs/SPTEs.  In practice no sane VMM mucks with
-                * the core vCPU model on the fly, so fail.
-                */
-               r = -EINVAL;
-               if (vcpu->arch.last_vmentry_cpu != -1)
-                       goto out;
- 
                 r = -EFAULT;
                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
                         goto out;
@@@ -5251,14 -5245,6 +5245,6 @@@
                 struct kvm_cpuid2 __user *cpuid_arg = argp;
                 struct kvm_cpuid2 cpuid;
   
-               /*
-                * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
-                * KVM_SET_CPUID case above.
-                */
-               r = -EINVAL;
-               if (vcpu->arch.last_vmentry_cpu != -1)
-                       goto out;
- 
                 r = -EFAULT;
                 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
                         goto out;
@@@ -8665,6 -8651,50 +8651,6 @@@ static void kvm_timer_init(void
                           kvmclock_cpu_online, kvmclock_cpu_down_prep);
   }
   
- -DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
- -EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
- -
- -int kvm_is_in_guest(void)
- -{
- -      return __this_cpu_read(current_vcpu) != NULL;
- -}
- -
- -static int kvm_is_user_mode(void)
- -{
- -      int user_mode = 3;
- -
- -      if (__this_cpu_read(current_vcpu))
- -              user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
- -
- -      return user_mode != 0;
- -}
- -
- -static unsigned long kvm_get_guest_ip(void)
- -{
- -      unsigned long ip = 0;
- -
- -      if (__this_cpu_read(current_vcpu))
- -              ip = kvm_rip_read(__this_cpu_read(current_vcpu));
- -
- -      return ip;
- -}
- -
- -static void kvm_handle_intel_pt_intr(void)
- -{
- -      struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
- -
- -      kvm_make_request(KVM_REQ_PMI, vcpu);
- -      __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
- -                      (unsigned long *)&vcpu->arch.pmu.global_status);
- -}
- -
- -static struct perf_guest_info_callbacks kvm_guest_cbs = {
- -      .is_in_guest            = kvm_is_in_guest,
- -      .is_user_mode           = kvm_is_user_mode,
- -      .get_guest_ip           = kvm_get_guest_ip,
- -      .handle_intel_pt_intr   = kvm_handle_intel_pt_intr,
- -};
- -
   #ifdef CONFIG_X86_64
   static void pvclock_gtod_update_fn(struct work_struct *work)
   {
@@@ -8777,6 -8807,8 +8763,6 @@@ int kvm_arch_init(void *opaque
   
         kvm_timer_init();
   
- -      perf_register_guest_info_callbacks(&kvm_guest_cbs);
- -
         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
                 supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@@ -8808,6 -8840,7 +8794,6 @@@ void kvm_arch_exit(void
                 clear_hv_tscchange_cb();
   #endif
         kvm_lapic_exit();
- -      perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
   
         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
@@@ -9945,10 -9978,11 +9931,11 @@@ static int vcpu_enter_guest(struct kvm_
         smp_mb__after_srcu_read_unlock();
   
         /*
-        * This handles the case where a posted interrupt was
-        * notified with kvm_vcpu_kick.  Assigned devices can
-        * use the POSTED_INTR_VECTOR even if APICv is disabled,
-        * so do it even if APICv is disabled on this vCPU.
+        * Process pending posted interrupts to handle the case where the
+        * notification IRQ arrived in the host, or was never sent (because the
+        * target vCPU wasn't running).  Do this regardless of the vCPU's APICv
+        * status, KVM doesn't update assigned devices when APICv is inhibited,
+        * i.e. they can post interrupts even if APICv is temporarily disabled.
          */
         if (kvm_lapic_enabled(vcpu))
                 static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
@@@ -10056,7 -10090,7 +10043,7 @@@
          * interrupts on processors that implement an interrupt shadow, the
          * stat.exits increment will do nicely.
          */
- -      kvm_before_interrupt(vcpu);
+ +      kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
         local_irq_enable();
         ++vcpu->stat.exits;
         local_irq_disable();
@@@ -10113,8 -10147,20 +10100,20 @@@ out
   
   static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
   {
-       if (!kvm_arch_vcpu_runnable(vcpu) &&
-           (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
+       bool hv_timer;
+ 
+       if (!kvm_arch_vcpu_runnable(vcpu)) {
+               /*
+                * Switch to the software timer before halt-polling/blocking as
+                * the guest's timer may be a break event for the vCPU, and the
+                * hypervisor timer runs only when the CPU is in guest mode.
+                * Switch before halt-polling so that KVM recognizes an expired
+                * timer before blocking.
+                */
+               hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
+               if (hv_timer)
+                       kvm_lapic_switch_to_sw_timer(vcpu);
+ 
                 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
                 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
                         kvm_vcpu_halt(vcpu);
@@@ -10122,8 -10168,8 +10121,8 @@@
                         kvm_vcpu_block(vcpu);
                 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
   
-               if (kvm_x86_ops.post_block)
-                       static_call(kvm_x86_post_block)(vcpu);
+               if (hv_timer)
+                       kvm_lapic_switch_to_hv_timer(vcpu);
   
                 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
                         return 1;
@@@ -10316,6 -10362,11 +10315,11 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                         r = -EINTR;
                         goto out;
                 }
+               /*
+                * It should be impossible for the hypervisor timer to be in
+                * use before KVM has ever run the vCPU.
+                */
+               WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
                 kvm_vcpu_block(vcpu);
                 if (kvm_apic_accept_events(vcpu) < 0) {
                         r = 0;
@@@ -10360,10 -10411,16 +10364,16 @@@
         } else
                 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
   
-       if (kvm_run->immediate_exit)
+       if (kvm_run->immediate_exit) {
                 r = -EINTR;
-       else
-               r = vcpu_run(vcpu);
+               goto out;
+       }
+ 
+       r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+       if (r <= 0)
+               goto out;
+ 
+       r = vcpu_run(vcpu);
   
   out:
         kvm_put_guest_fpu(vcpu);
@@@ -11393,8 -11450,6 +11403,8 @@@ int kvm_arch_hardware_setup(void *opaqu
         memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
         kvm_ops_static_call_update();
   
+ +      kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+ +
         if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
                 supported_xss = 0;
   
@@@ -11422,8 -11477,6 +11432,8 @@@
   
   void kvm_arch_hardware_unsetup(void)
   {
+ +      kvm_unregister_perf_callbacks();
+ +
         static_call(kvm_x86_hardware_unsetup)();
   }
   
@@@ -12017,11 -12070,6 +12027,11 @@@ bool kvm_arch_vcpu_in_kernel(struct kvm
         return vcpu->arch.preempted_in_kernel;
   }
   
+ +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
+ +{
+ +      return kvm_rip_read(vcpu);
+ +}
+ +
   int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
   {
         return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
diff --combined arch/x86/kvm/x86.h

index bec8ed090abcef044e78ec61a7ec7a404a9beee0,1ebd5a7594da74dbf536ae6737de8c874c73630c..635b75f9e14540aff2aceb6e4b1c5c3221c444ab
--- 1/arch/x86/kvm/x86.h
--- 2/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@@ -336,6 -336,7 +336,7 @@@ extern u64 host_xcr0
   extern u64 supported_xcr0;
   extern u64 host_xss;
   extern u64 supported_xss;
+ extern bool enable_pmu;
   
   static inline bool kvm_mpx_supported(void)
   {
@@@ -391,27 -392,18 +392,27 @@@ static inline bool kvm_cstate_in_guest(
         return kvm->arch.cstate_in_guest;
   }
   
- -DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+ +enum kvm_intr_type {
+ +      /* Values are arbitrary, but must be non-zero. */
+ +      KVM_HANDLING_IRQ = 1,
+ +      KVM_HANDLING_NMI,
+ +};
   
- -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+ +static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ +                                      enum kvm_intr_type intr)
   {
- -      __this_cpu_write(current_vcpu, vcpu);
+ +      WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
   }
   
   static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
   {
- -      __this_cpu_write(current_vcpu, NULL);
+ +      WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
   }
   
+ +static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu)
+ +{
+ +      return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI;
+ +}
   
   static inline bool kvm_pat_valid(u64 data)
   {
diff --combined include/linux/kvm_host.h

index d89d564f7c19a198677177e800e919aac927872d,f079820f52b50c7785174b3c36c6b421b64ff74d..06912d6b39d051013b731ae04b3b26c9f68a5efb
--- 1/include/linux/kvm_host.h
--- 2/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@@ -309,9 -309,6 +309,6 @@@ struct kvm_vcpu 
         u64 requests;
         unsigned long guest_debug;
   
-       int pre_pcpu;
-       struct list_head blocked_vcpu_list;
- 
         struct mutex mutex;
         struct kvm_run *run;
   
@@@ -1424,16 -1421,6 +1421,16 @@@ static inline bool kvm_arch_intc_initia
   }
   #endif
   
+ +#ifdef CONFIG_GUEST_PERF_EVENTS
+ +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu);
+ +
+ +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void));
+ +void kvm_unregister_perf_callbacks(void);
+ +#else
+ +static inline void kvm_register_perf_callbacks(void *ign) {}
+ +static inline void kvm_unregister_perf_callbacks(void) {}
+ +#endif /* CONFIG_GUEST_PERF_EVENTS */
+ +
   int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
   void kvm_arch_destroy_vm(struct kvm *kvm);
   void kvm_arch_sync_events(struct kvm *kvm);
diff --combined virt/kvm/kvm_main.c

index 504158f0e1314a3566427aa4ad3c6dbfbd3426b1,5a1164483e6c990b1c98b716615c3ef3f323ea53..9a20f2299386eb0385746725fb73c3f5041faee6
--- 1/virt/kvm/kvm_main.c
--- 2/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@@ -427,9 -427,6 +427,6 @@@ static void kvm_vcpu_init(struct kvm_vc
   #endif
         kvm_async_pf_vcpu_init(vcpu);
   
-       vcpu->pre_pcpu = -1;
-       INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
- 
         kvm_vcpu_set_in_spin_loop(vcpu, false);
         kvm_vcpu_set_dy_eligible(vcpu, false);
         vcpu->preempted = false;
@@@ -3163,8 -3160,10 +3160,10 @@@ void mark_page_dirty_in_slot(struct kv
   {
         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
   
+ #ifdef CONFIG_HAVE_KVM_DIRTY_RING
         if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
                 return;
+ #endif
   
         if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
                 unsigned long rel_gfn = gfn - memslot->base_gfn;
@@@ -5603,50 -5602,6 +5602,50 @@@ struct kvm_vcpu * __percpu *kvm_get_run
           return &kvm_running_vcpu;
   }
   
+ +#ifdef CONFIG_GUEST_PERF_EVENTS
+ +static unsigned int kvm_guest_state(void)
+ +{
+ +      struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+ +      unsigned int state;
+ +
+ +      if (!kvm_arch_pmi_in_guest(vcpu))
+ +              return 0;
+ +
+ +      state = PERF_GUEST_ACTIVE;
+ +      if (!kvm_arch_vcpu_in_kernel(vcpu))
+ +              state |= PERF_GUEST_USER;
+ +
+ +      return state;
+ +}
+ +
+ +static unsigned long kvm_guest_get_ip(void)
+ +{
+ +      struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+ +
+ +      /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
+ +      if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
+ +              return 0;
+ +
+ +      return kvm_arch_vcpu_get_ip(vcpu);
+ +}
+ +
+ +static struct perf_guest_info_callbacks kvm_guest_cbs = {
+ +      .state                  = kvm_guest_state,
+ +      .get_ip                 = kvm_guest_get_ip,
+ +      .handle_intel_pt_intr   = NULL,
+ +};
+ +
+ +void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
+ +{
+ +      kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
+ +      perf_register_guest_info_callbacks(&kvm_guest_cbs);
+ +}
+ +void kvm_unregister_perf_callbacks(void)
+ +{
+ +      perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+ +}
+ +#endif
+ +
   struct kvm_cpu_compat_check {
         void *opaque;
         int *ret;
author	Linus Torvalds <[email protected]>
	Sat, 22 Jan 2022 07:40:01 +0000 (09:40 +0200)
committer	Linus Torvalds <[email protected]>
	Sat, 22 Jan 2022 07:40:01 +0000 (09:40 +0200)
		1	2
arch/x86/include/asm/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/mmu/spte.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/pmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/svm/svm.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
virt/kvm/kvm_main.c	patch \|	diff1 \|	diff2 \|	blob \| history