Merge tag 'kvmarm-fixes-5.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Paolo Bonzini <[email protected]>

Mon, 17 May 2021 07:55:12 +0000 (09:55 +0200)

committer Paolo Bonzini <[email protected]>

Mon, 17 May 2021 07:55:12 +0000 (09:55 +0200)
author Paolo Bonzini <[email protected]>
Mon, 17 May 2021 07:55:12 +0000 (09:55 +0200)
committer Paolo Bonzini <[email protected]>
Mon, 17 May 2021 07:55:12 +0000 (09:55 +0200)
diff --combined arch/x86/kvm/svm/svm.c

index 4dd9b7856e5b125b3ae59a6d27e4b1d45f1dd4d7,b649f92287a2e53d6e018ba33066ec2b281bf930..dfa351e605dec38882e2fafd47aebe6e646e66fd
--- 1/arch/x86/kvm/svm/svm.c
--- 2/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@@ -212,7 -212,7 +212,7 @@@ DEFINE_PER_CPU(struct svm_cpu_data *, s
    * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
    * defer the restoration of TSC_AUX until the CPU returns to userspace.
    */
- -#define TSC_AUX_URET_SLOT     0
+ +static int tsc_aux_uret_slot __read_mostly = -1;
   
   static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
   
@@@ -447,11 -447,6 +447,11 @@@ static int has_svm(void
                 return 0;
         }
   
+ +      if (pgtable_l5_enabled()) {
+ +              pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
+ +              return 0;
+ +      }
+ +
         return 1;
   }
   
@@@ -964,7 -959,8 +964,7 @@@ static __init int svm_hardware_setup(vo
                 kvm_tsc_scaling_ratio_frac_bits = 32;
         }
   
- -      if (boot_cpu_has(X86_FEATURE_RDTSCP))
- -              kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
+ +      tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
   
         /* Check for pause filtering support */
         if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
@@@ -1104,9 -1100,7 +1104,9 @@@ static u64 svm_write_l1_tsc_offset(stru
         return svm->vmcb->control.tsc_offset;
   }
   
- -static void svm_check_invpcid(struct vcpu_svm *svm)
+ +/* Evaluate instruction intercepts that depend on guest CPUID features. */
+ +static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
+ +                                            struct vcpu_svm *svm)
   {
         /*
          * Intercept INVPCID if shadow paging is enabled to sync/free shadow
@@@ -1119,13 -1113,6 +1119,13 @@@
                 else
                         svm_clr_intercept(svm, INTERCEPT_INVPCID);
         }
+ +
+ +      if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+ +              if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ +                      svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ +              else
+ +                      svm_set_intercept(svm, INTERCEPT_RDTSCP);
+ +      }
   }
   
   static void init_vmcb(struct kvm_vcpu *vcpu)
@@@ -1248,8 -1235,8 +1248,8 @@@
         svm->current_vmcb->asid_generation = 0;
         svm->asid = 0;
   
- -      svm->nested.vmcb12_gpa = 0;
- -      svm->nested.last_vmcb12_gpa = 0;
+ +      svm->nested.vmcb12_gpa = INVALID_GPA;
+ +      svm->nested.last_vmcb12_gpa = INVALID_GPA;
         vcpu->arch.hflags = 0;
   
         if (!kvm_pause_in_guest(vcpu->kvm)) {
@@@ -1261,7 -1248,7 +1261,7 @@@
                 svm_clr_intercept(svm, INTERCEPT_PAUSE);
         }
   
- -      svm_check_invpcid(svm);
+ +      svm_recalc_instruction_intercepts(vcpu, svm);
   
         /*
          * If the host supports V_SPEC_CTRL then disable the interception
@@@ -1437,9 -1424,6 +1437,9 @@@ static void svm_prepare_guest_switch(st
         struct vcpu_svm *svm = to_svm(vcpu);
         struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
   
+ +      if (sev_es_guest(vcpu->kvm))
+ +              sev_es_unmap_ghcb(svm);
+ +
         if (svm->guest_state_loaded)
                 return;
   
@@@ -1461,8 -1445,8 +1461,8 @@@
                 }
         }
   
- -      if (static_cpu_has(X86_FEATURE_RDTSCP))
- -              kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
+ +      if (likely(tsc_aux_uret_slot >= 0))
+ +              kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
   
         svm->guest_state_loaded = true;
   }
@@@ -2671,6 -2655,11 +2671,6 @@@ static int svm_get_msr(struct kvm_vcpu 
                         msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
                 break;
         case MSR_TSC_AUX:
- -              if (!boot_cpu_has(X86_FEATURE_RDTSCP))
- -                      return 1;
- -              if (!msr_info->host_initiated &&
- -                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- -                      return 1;
                 msr_info->data = svm->tsc_aux;
                 break;
         /*
@@@ -2887,13 -2876,30 +2887,13 @@@ static int svm_set_msr(struct kvm_vcpu 
                 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                 break;
         case MSR_TSC_AUX:
- -              if (!boot_cpu_has(X86_FEATURE_RDTSCP))
- -                      return 1;
- -
- -              if (!msr->host_initiated &&
- -                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- -                      return 1;
- -
- -              /*
- -               * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
- -               * incomplete and conflicting architectural behavior.  Current
- -               * AMD CPUs completely ignore bits 63:32, i.e. they aren't
- -               * reserved and always read as zeros.  Emulate AMD CPU behavior
- -               * to avoid explosions if the vCPU is migrated from an AMD host
- -               * to an Intel host.
- -               */
- -              data = (u32)data;
- -
                 /*
                  * TSC_AUX is usually changed only during boot and never read
                  * directly.  Intercept TSC_AUX instead of exposing it to the
                  * guest via direct_access_msrs, and switch it via user return.
                  */
                 preempt_disable();
- -              r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
+ +              r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
                 preempt_enable();
                 if (r)
                         return 1;
@@@ -3078,7 -3084,6 +3078,7 @@@ static int (*const svm_exit_handlers[])
         [SVM_EXIT_STGI]                         = stgi_interception,
         [SVM_EXIT_CLGI]                         = clgi_interception,
         [SVM_EXIT_SKINIT]                       = skinit_interception,
+ +      [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
         [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
         [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
         [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
@@@ -3705,25 -3710,7 +3705,7 @@@ static noinstr void svm_vcpu_enter_exit
         struct vcpu_svm *svm = to_svm(vcpu);
         unsigned long vmcb_pa = svm->current_vmcb->pa;
   
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
- 
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
+       kvm_guest_enter_irqoff();
   
         if (sev_es_guest(vcpu->kvm)) {
                 __svm_sev_es_vcpu_run(vmcb_pa);
@@@ -3743,24 -3730,7 +3725,7 @@@
                 vmload(__sme_page_pa(sd->save_area));
         }
   
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * guest_exit_irqoff() restores host context and reinstates RCU if
-        * enabled and required.
-        *
-        * This needs to be done before the below as native_read_msr()
-        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-        * into world and some more.
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       guest_exit_irqoff();
- 
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
+       kvm_guest_exit_irqoff();
   }
   
   static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
@@@ -4002,7 -3972,8 +3967,7 @@@ static void svm_vcpu_after_set_cpuid(st
         svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
                              guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
   
- -      /* Check again if INVPCID interception if required */
- -      svm_check_invpcid(svm);
+ +      svm_recalc_instruction_intercepts(vcpu, svm);
   
         /* For sev guests, the memory encryption bit is not reserved in CR3.  */
         if (sev_guest(vcpu->kvm)) {
diff --combined arch/x86/kvm/vmx/vmx.c

index f2fd447eed459a2e8c91da44845b7ecec153ee33,d000cddbd7349dc81aa0bf4731d971dda0e95b92..4bceb5ca3a8997e9d6addc24e10a4d359598d3ca
--- 1/arch/x86/kvm/vmx/vmx.c
--- 2/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@@ -36,6 -36,7 +36,7 @@@
   #include <asm/debugreg.h>
   #include <asm/desc.h>
   #include <asm/fpu/internal.h>
+ #include <asm/idtentry.h>
   #include <asm/io.h>
   #include <asm/irq_remapping.h>
   #include <asm/kexec.h>
@@@ -454,6 -455,21 +455,6 @@@ static inline void vmx_segment_cache_cl
   
   static unsigned long host_idt_base;
   
- -/*
- - * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
- - * will emulate SYSCALL in legacy mode if the vendor string in guest
- - * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
- - * support this emulation, IA32_STAR must always be included in
- - * vmx_uret_msrs_list[], even in i386 builds.
- - */
- -static const u32 vmx_uret_msrs_list[] = {
- -#ifdef CONFIG_X86_64
- -      MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
- -#endif
- -      MSR_EFER, MSR_TSC_AUX, MSR_STAR,
- -      MSR_IA32_TSX_CTRL,
- -};
- -
   #if IS_ENABLED(CONFIG_HYPERV)
   static bool __read_mostly enlightened_vmcs = true;
   module_param(enlightened_vmcs, bool, 0444);
@@@ -681,11 -697,21 +682,11 @@@ static bool is_valid_passthrough_msr(u3
         return r;
   }
   
- -static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
- -{
- -      int i;
- -
- -      for (i = 0; i < vmx->nr_uret_msrs; ++i)
- -              if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
- -                      return i;
- -      return -1;
- -}
- -
   struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
   {
         int i;
   
- -      i = __vmx_find_uret_msr(vmx, msr);
+ +      i = kvm_find_user_return_msr(msr);
         if (i >= 0)
                 return &vmx->guest_uret_msrs[i];
         return NULL;
@@@ -694,14 -720,13 +695,14 @@@
   static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
                                   struct vmx_uret_msr *msr, u64 data)
   {
+ +      unsigned int slot = msr - vmx->guest_uret_msrs;
         int ret = 0;
   
         u64 old_msr_data = msr->data;
         msr->data = data;
- -      if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
+ +      if (msr->load_into_hardware) {
                 preempt_disable();
- -              ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
+ +              ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
                 preempt_enable();
                 if (ret)
                         msr->data = old_msr_data;
@@@ -1053,7 -1078,7 +1054,7 @@@ static bool update_transition_efer(stru
                 return false;
         }
   
- -      i = __vmx_find_uret_msr(vmx, MSR_EFER);
+ +      i = kvm_find_user_return_msr(MSR_EFER);
         if (i < 0)
                 return false;
   
@@@ -1215,14 -1240,11 +1216,14 @@@ void vmx_prepare_switch_to_guest(struc
          */
         if (!vmx->guest_uret_msrs_loaded) {
                 vmx->guest_uret_msrs_loaded = true;
- -              for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
- -                      kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+ +              for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ +                      if (!vmx->guest_uret_msrs[i].load_into_hardware)
+ +                              continue;
+ +
+ +                      kvm_set_user_return_msr(i,
                                                 vmx->guest_uret_msrs[i].data,
                                                 vmx->guest_uret_msrs[i].mask);
- -
+ +              }
         }
   
         if (vmx->nested.need_vmcs12_to_shadow_sync)
@@@ -1729,16 -1751,19 +1730,16 @@@ static void vmx_queue_exception(struct 
         vmx_clear_hlt(vcpu);
   }
   
- -static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
+ +static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+ +                             bool load_into_hardware)
   {
- -      struct vmx_uret_msr tmp;
- -      int from, to;
+ +      struct vmx_uret_msr *uret_msr;
   
- -      from = __vmx_find_uret_msr(vmx, msr);
- -      if (from < 0)
+ +      uret_msr = vmx_find_uret_msr(vmx, msr);
+ +      if (!uret_msr)
                 return;
- -      to = vmx->nr_active_uret_msrs++;
   
- -      tmp = vmx->guest_uret_msrs[to];
- -      vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
- -      vmx->guest_uret_msrs[from] = tmp;
+ +      uret_msr->load_into_hardware = load_into_hardware;
   }
   
   /*
@@@ -1748,42 -1773,29 +1749,42 @@@
    */
   static void setup_msrs(struct vcpu_vmx *vmx)
   {
- -      vmx->guest_uret_msrs_loaded = false;
- -      vmx->nr_active_uret_msrs = 0;
   #ifdef CONFIG_X86_64
+ +      bool load_syscall_msrs;
+ +
         /*
          * The SYSCALL MSRs are only needed on long mode guests, and only
          * when EFER.SCE is set.
          */
- -      if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- -              vmx_setup_uret_msr(vmx, MSR_STAR);
- -              vmx_setup_uret_msr(vmx, MSR_LSTAR);
- -              vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
- -      }
+ +      load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+ +                          (vmx->vcpu.arch.efer & EFER_SCE);
+ +
+ +      vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+ +      vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+ +      vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
   #endif
- -      if (update_transition_efer(vmx))
- -              vmx_setup_uret_msr(vmx, MSR_EFER);
+ +      vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
   
- -      if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- -              vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+ +      vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+ +                         guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ +                         guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
   
- -      vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ +      /*
+ +       * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+ +       * kernel and old userspace.  If those guests run on a tsx=off host, do
+ +       * allow guests to use TSX_CTRL, but don't change the value in hardware
+ +       * so that TSX remains always disabled.
+ +       */
+ +      vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
   
         if (cpu_has_vmx_msr_bitmap())
                 vmx_update_msr_bitmap(&vmx->vcpu);
+ +
+ +      /*
+ +       * The set of MSRs to load may have changed, reload MSRs before the
+ +       * next VM-Enter.
+ +       */
+ +      vmx->guest_uret_msrs_loaded = false;
   }
   
   static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@@ -1981,6 -1993,11 +1982,6 @@@ static int vmx_get_msr(struct kvm_vcpu 
                 else
                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
                 break;
- -      case MSR_TSC_AUX:
- -              if (!msr_info->host_initiated &&
- -                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- -                      return 1;
- -              goto find_uret_msr;
         case MSR_IA32_DEBUGCTLMSR:
                 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
                 break;
@@@ -2014,9 -2031,6 +2015,9 @@@ static u64 vcpu_supported_debugctl(stru
         if (!intel_pmu_lbr_is_enabled(vcpu))
                 debugctl &= ~DEBUGCTLMSR_LBR_MASK;
   
+ +      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ +              debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ +
         return debugctl;
   }
   
@@@ -2299,6 -2313,14 +2300,6 @@@ static int vmx_set_msr(struct kvm_vcpu 
                 else
                         vmx->pt_desc.guest.addr_a[index / 2] = data;
                 break;
- -      case MSR_TSC_AUX:
- -              if (!msr_info->host_initiated &&
- -                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- -                      return 1;
- -              /* Check reserved bit, higher 32 bits should be zero */
- -              if ((data >> 32) != 0)
- -                      return 1;
- -              goto find_uret_msr;
         case MSR_IA32_PERF_CAPABILITIES:
                 if (data && !vcpu_to_pmu(vcpu)->version)
                         return 1;
@@@ -4347,23 -4369,7 +4348,23 @@@ static void vmx_compute_secondary_exec_
                                                   xsaves_enabled, false);
         }
   
- -      vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
+ +      /*
+ +       * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+ +       * feature is exposed to the guest.  This creates a virtualization hole
+ +       * if both are supported in hardware but only one is exposed to the
+ +       * guest, but letting the guest execute RDTSCP or RDPID when either one
+ +       * is advertised is preferable to emulating the advertised instruction
+ +       * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+ +       */
+ +      if (cpu_has_vmx_rdtscp()) {
+ +              bool rdpid_or_rdtscp_enabled =
+ +                      guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ +                      guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+ +
+ +              vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ +                                                SECONDARY_EXEC_ENABLE_RDTSCP,
+ +                                                rdpid_or_rdtscp_enabled, false);
+ +      }
         vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
   
         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@@ -6410,18 -6416,17 +6411,17 @@@ static void vmx_apicv_post_state_restor
   
   void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
   
- static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+ static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+                                       unsigned long entry)
   {
-       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
-       gate_desc *desc = (gate_desc *)host_idt_base + vector;
- 
         kvm_before_interrupt(vcpu);
-       vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+       vmx_do_interrupt_nmi_irqoff(entry);
         kvm_after_interrupt(vcpu);
   }
   
   static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
   {
+       const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
         u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
   
         /* if exit due to PF check for async PF */
@@@ -6432,18 -6437,20 +6432,20 @@@
                 kvm_machine_check();
         /* We need to handle NMIs before interrupts are enabled */
         else if (is_nmi(intr_info))
-               handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+               handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
   }
   
   static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
   {
         u32 intr_info = vmx_get_intr_info(vcpu);
+       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+       gate_desc *desc = (gate_desc *)host_idt_base + vector;
   
         if (WARN_ONCE(!is_external_intr(intr_info),
             "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                 return;
   
-       handle_interrupt_nmi_irqoff(vcpu, intr_info);
+       handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
   }
   
   static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@@ -6657,25 -6664,7 +6659,7 @@@ static fastpath_t vmx_exit_handlers_fas
   static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                         struct vcpu_vmx *vmx)
   {
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
- 
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
+       kvm_guest_enter_irqoff();
   
         /* L1D Flush includes CPU buffer clear to mitigate MDS */
         if (static_branch_unlikely(&vmx_l1d_should_flush))
@@@ -6691,24 -6680,7 +6675,7 @@@
   
         vcpu->arch.cr2 = native_read_cr2();
   
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * guest_exit_irqoff() restores host context and reinstates RCU if
-        * enabled and required.
-        *
-        * This needs to be done before the below as native_read_msr()
-        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-        * into world and some more.
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       guest_exit_irqoff();
- 
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
+       kvm_guest_exit_irqoff();
   }
   
   static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@@ -6883,7 -6855,6 +6850,7 @@@ static void vmx_free_vcpu(struct kvm_vc
   
   static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
   {
+ +      struct vmx_uret_msr *tsx_ctrl;
         struct vcpu_vmx *vmx;
         int i, cpu, err;
   
@@@ -6906,19 -6877,43 +6873,19 @@@
                         goto free_vpid;
         }
   
- -      BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
- -
- -      for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
- -              u32 index = vmx_uret_msrs_list[i];
- -              u32 data_low, data_high;
- -              int j = vmx->nr_uret_msrs;
- -
- -              if (rdmsr_safe(index, &data_low, &data_high) < 0)
- -                      continue;
- -              if (wrmsr_safe(index, data_low, data_high) < 0)
- -                      continue;
- -
- -              vmx->guest_uret_msrs[j].slot = i;
- -              vmx->guest_uret_msrs[j].data = 0;
- -              switch (index) {
- -              case MSR_IA32_TSX_CTRL:
- -                      /*
- -                       * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
- -                       * interception.  Keep the host value unchanged to avoid
- -                       * changing CPUID bits under the host kernel's feet.
- -                       *
- -                       * hle=0, rtm=0, tsx_ctrl=1 can be found with some
- -                       * combinations of new kernel and old userspace.  If
- -                       * those guests run on a tsx=off host, do allow guests
- -                       * to use TSX_CTRL, but do not change the value on the
- -                       * host so that TSX remains always disabled.
- -                       */
- -                      if (boot_cpu_has(X86_FEATURE_RTM))
- -                              vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
- -                      else
- -                              vmx->guest_uret_msrs[j].mask = 0;
- -                      break;
- -              default:
- -                      vmx->guest_uret_msrs[j].mask = -1ull;
- -                      break;
- -              }
- -              ++vmx->nr_uret_msrs;
+ +      for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ +              vmx->guest_uret_msrs[i].data = 0;
+ +              vmx->guest_uret_msrs[i].mask = -1ull;
+ +      }
+ +      if (boot_cpu_has(X86_FEATURE_RTM)) {
+ +              /*
+ +               * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+ +               * Keep the host value unchanged to avoid changing CPUID bits
+ +               * under the host kernel's feet.
+ +               */
+ +              tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ +              if (tsx_ctrl)
+ +                      tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
         }
   
         err = alloc_loaded_vmcs(&vmx->vmcs01);
@@@ -7349,11 -7344,9 +7316,11 @@@ static __init void vmx_set_cpu_caps(voi
         if (!cpu_has_vmx_xsaves())
                 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
   
- -      /* CPUID 0x80000001 */
- -      if (!cpu_has_vmx_rdtscp())
+ +      /* CPUID 0x80000001 and 0x7 (RDPID) */
+ +      if (!cpu_has_vmx_rdtscp()) {
                 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ +              kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ +      }
   
         if (cpu_has_vmx_waitpkg())
                 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@@@ -7409,9 -7402,8 +7376,9 @@@ static int vmx_check_intercept(struct k
         /*
          * RDPID causes #UD if disabled through secondary execution controls.
          * Because it is marked as EmulateOnUD, we need to intercept it here.
+ +       * Note, RDPID is hidden behind ENABLE_RDTSCP.
          */
- -      case x86_intercept_rdtscp:
+ +      case x86_intercept_rdpid:
                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
                         exception->vector = UD_VECTOR;
                         exception->error_code_valid = false;
@@@ -7777,42 -7769,17 +7744,42 @@@ static struct kvm_x86_ops vmx_x86_ops _
         .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
   };
   
+ +static __init void vmx_setup_user_return_msrs(void)
+ +{
+ +
+ +      /*
+ +       * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+ +       * will emulate SYSCALL in legacy mode if the vendor string in guest
+ +       * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
+ +       * support this emulation, MSR_STAR is included in the list for i386,
+ +       * but is never loaded into hardware.  MSR_CSTAR is also never loaded
+ +       * into hardware and is here purely for emulation purposes.
+ +       */
+ +      const u32 vmx_uret_msrs_list[] = {
+ +      #ifdef CONFIG_X86_64
+ +              MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+ +      #endif
+ +              MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ +              MSR_IA32_TSX_CTRL,
+ +      };
+ +      int i;
+ +
+ +      BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+ +
+ +      for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ +              kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
+ +}
+ +
   static __init int hardware_setup(void)
   {
         unsigned long host_bndcfgs;
         struct desc_ptr dt;
- -      int r, i, ept_lpage_level;
+ +      int r, ept_lpage_level;
   
         store_idt(&dt);
         host_idt_base = dt.address;
   
- -      for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
- -              kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
+ +      vmx_setup_user_return_msrs();
   
         if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
                 return -EIO;
diff --combined arch/x86/kvm/x86.c

index 5bd550eaf683317b9558275b013469b247c2f90d,6eda2834fc05ef07bef81d873aa80692169dcce9..9b6bca61692912099fcc7eaa9ab8d49d512be67d
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -184,6 -184,11 +184,6 @@@ module_param(pi_inject_timer, bint, S_I
    */
   #define KVM_MAX_NR_USER_RETURN_MSRS 16
   
- -struct kvm_user_return_msrs_global {
- -      int nr;
- -      u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
- -};
- -
   struct kvm_user_return_msrs {
         struct user_return_notifier urn;
         bool registered;
@@@ -193,9 -198,7 +193,9 @@@
         } values[KVM_MAX_NR_USER_RETURN_MSRS];
   };
   
- -static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+ +u32 __read_mostly kvm_nr_uret_msrs;
+ +EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
+ +static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
   static struct kvm_user_return_msrs __percpu *user_return_msrs;
   
   #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@@@ -327,53 -330,23 +327,53 @@@ static void kvm_on_user_return(struct u
                 user_return_notifier_unregister(urn);
         }
         local_irq_restore(flags);
- -      for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ +      for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
                 values = &msrs->values[slot];
                 if (values->host != values->curr) {
- -                      wrmsrl(user_return_msrs_global.msrs[slot], values->host);
+ +                      wrmsrl(kvm_uret_msrs_list[slot], values->host);
                         values->curr = values->host;
                 }
         }
   }
   
- -void kvm_define_user_return_msr(unsigned slot, u32 msr)
+ +static int kvm_probe_user_return_msr(u32 msr)
+ +{
+ +      u64 val;
+ +      int ret;
+ +
+ +      preempt_disable();
+ +      ret = rdmsrl_safe(msr, &val);
+ +      if (ret)
+ +              goto out;
+ +      ret = wrmsrl_safe(msr, val);
+ +out:
+ +      preempt_enable();
+ +      return ret;
+ +}
+ +
+ +int kvm_add_user_return_msr(u32 msr)
+ +{
+ +      BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+ +
+ +      if (kvm_probe_user_return_msr(msr))
+ +              return -1;
+ +
+ +      kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+ +      return kvm_nr_uret_msrs++;
+ +}
+ +EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
+ +
+ +int kvm_find_user_return_msr(u32 msr)
   {
- -      BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
- -      user_return_msrs_global.msrs[slot] = msr;
- -      if (slot >= user_return_msrs_global.nr)
- -              user_return_msrs_global.nr = slot + 1;
+ +      int i;
+ +
+ +      for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ +              if (kvm_uret_msrs_list[i] == msr)
+ +                      return i;
+ +      }
+ +      return -1;
   }
- -EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+ +EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
   
   static void kvm_user_return_msr_cpu_online(void)
   {
@@@ -382,8 -355,8 +382,8 @@@
         u64 value;
         int i;
   
- -      for (i = 0; i < user_return_msrs_global.nr; ++i) {
- -              rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ +      for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ +              rdmsrl_safe(kvm_uret_msrs_list[i], &value);
                 msrs->values[i].host = value;
                 msrs->values[i].curr = value;
         }
@@@ -398,7 -371,7 +398,7 @@@ int kvm_set_user_return_msr(unsigned sl
         value = (value & mask) | (msrs->values[slot].host & ~mask);
         if (value == msrs->values[slot].curr)
                 return 0;
- -      err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
+ +      err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
         if (err)
                 return 1;
   
@@@ -1176,9 -1149,6 +1176,9 @@@ static u64 kvm_dr6_fixed(struct kvm_vcp
   
         if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
                 fixed |= DR6_RTM;
+ +
+ +      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ +              fixed |= DR6_BUS_LOCK;
         return fixed;
   }
   
@@@ -1645,30 -1615,6 +1645,30 @@@ static int __kvm_set_msr(struct kvm_vcp
                  * invokes 64-bit SYSENTER.
                  */
                 data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+ +              break;
+ +      case MSR_TSC_AUX:
+ +              if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ +                      return 1;
+ +
+ +              if (!host_initiated &&
+ +                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ +                  !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ +                      return 1;
+ +
+ +              /*
+ +               * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+ +               * incomplete and conflicting architectural behavior.  Current
+ +               * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+ +               * reserved and always read as zeros.  Enforce Intel's reserved
+ +               * bits check if and only if the guest CPU is Intel, and clear
+ +               * the bits in all other cases.  This ensures cross-vendor
+ +               * migration will provide consistent behavior for the guest.
+ +               */
+ +              if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ +                      return 1;
+ +
+ +              data = (u32)data;
+ +              break;
         }
   
         msr.data = data;
@@@ -1705,18 -1651,6 +1705,18 @@@ int __kvm_get_msr(struct kvm_vcpu *vcpu
         if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
                 return KVM_MSR_RET_FILTERED;
   
+ +      switch (index) {
+ +      case MSR_TSC_AUX:
+ +              if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ +                      return 1;
+ +
+ +              if (!host_initiated &&
+ +                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ +                  !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ +                      return 1;
+ +              break;
+ +      }
+ +
         msr.index = index;
         msr.host_initiated = host_initiated;
   
@@@ -5534,18 -5468,14 +5534,18 @@@ static void kvm_free_msr_filter(struct 
   static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
                               struct kvm_msr_filter_range *user_range)
   {
- -      struct msr_bitmap_range range;
         unsigned long *bitmap = NULL;
         size_t bitmap_size;
- -      int r;
   
         if (!user_range->nmsrs)
                 return 0;
   
+ +      if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+ +              return -EINVAL;
+ +
+ +      if (!user_range->flags)
+ +              return -EINVAL;
+ +
         bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
         if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
                 return -EINVAL;
@@@ -5554,15 -5484,31 +5554,15 @@@
         if (IS_ERR(bitmap))
                 return PTR_ERR(bitmap);
   
- -      range = (struct msr_bitmap_range) {
+ +      msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
                 .flags = user_range->flags,
                 .base = user_range->base,
                 .nmsrs = user_range->nmsrs,
                 .bitmap = bitmap,
         };
   
- -      if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
- -              r = -EINVAL;
- -              goto err;
- -      }
- -
- -      if (!range.flags) {
- -              r = -EINVAL;
- -              goto err;
- -      }
- -
- -      /* Everything ok, add this range identifier. */
- -      msr_filter->ranges[msr_filter->count] = range;
         msr_filter->count++;
- -
         return 0;
- -err:
- -      kfree(bitmap);
- -      return r;
   }
   
   static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
@@@ -5991,8 -5937,7 +5991,8 @@@ static void kvm_init_msr_list(void
                                 continue;
                         break;
                 case MSR_TSC_AUX:
- -                      if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ +                      if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ +                          !kvm_cpu_cap_has(X86_FEATURE_RDPID))
                                 continue;
                         break;
                 case MSR_IA32_UMWAIT_CONTROL:
@@@ -8094,18 -8039,6 +8094,18 @@@ static void pvclock_gtod_update_fn(stru
   
   static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
   
+ +/*
+ + * Indirection to move queue_work() out of the tk_core.seq write held
+ + * region to prevent possible deadlocks against time accessors which
+ + * are invoked with work related locks held.
+ + */
+ +static void pvclock_irq_work_fn(struct irq_work *w)
+ +{
+ +      queue_work(system_long_wq, &pvclock_gtod_work);
+ +}
+ +
+ +static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+ +
   /*
    * Notification about pvclock gtod data update.
    */
@@@ -8117,14 -8050,13 +8117,14 @@@ static int pvclock_gtod_notify(struct n
   
         update_pvclock_gtod(tk);
   
- -      /* disable master clock if host does not trust, or does not
- -       * use, TSC based clocksource.
+ +      /*
+ +       * Disable master clock if host does not trust, or does not use,
+ +       * TSC based clocksource. Delegate queue_work() to irq_work as
+ +       * this is invoked with tk_core.seq write held.
          */
         if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
             atomic_read(&kvm_guest_has_master_clock) != 0)
- -              queue_work(system_long_wq, &pvclock_gtod_work);
- -
+ +              irq_work_queue(&pvclock_irq_work);
         return 0;
   }
   
@@@ -8186,7 -8118,6 +8186,7 @@@ int kvm_arch_init(void *opaque
                 printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
                 goto out_free_x86_emulator_cache;
         }
+ +      kvm_nr_uret_msrs = 0;
   
         r = kvm_mmu_module_init();
         if (r)
@@@ -8237,8 -8168,6 +8237,8 @@@ void kvm_arch_exit(void
         cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
   #ifdef CONFIG_X86_64
         pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ +      irq_work_sync(&pvclock_irq_work);
+ +      cancel_work_sync(&pvclock_gtod_work);
   #endif
         kvm_x86_ops.hardware_enable = NULL;
         kvm_mmu_module_exit();
@@@ -9386,6 -9315,15 +9386,15 @@@ static int vcpu_enter_guest(struct kvm_
         local_irq_disable();
         kvm_after_interrupt(vcpu);
   
+       /*
+        * Wait until after servicing IRQs to account guest time so that any
+        * ticks that occurred while running the guest are properly accounted
+        * to the guest.  Waiting until IRQs are enabled degrades the accuracy
+        * of accounting via context tracking, but the loss of accuracy is
+        * acceptable for all known use cases.
+        */
+       vtime_account_guest_exit();
+ 
         if (lapic_in_kernel(vcpu)) {
                 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
                 if (delta != S64_MIN) {
author	Paolo Bonzini <[email protected]>
	Mon, 17 May 2021 07:55:12 +0000 (09:55 +0200)
committer	Paolo Bonzini <[email protected]>
	Mon, 17 May 2021 07:55:12 +0000 (09:55 +0200)
		1	2
arch/x86/kvm/svm/svm.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history