Merge tag 'kvmarm-fixes-5.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Paolo Bonzini <[email protected]>
Mon, 17 May 2021 07:55:12 +0000 (09:55 +0200)
committer Paolo Bonzini <[email protected]>
Mon, 17 May 2021 07:55:12 +0000 (09:55 +0200)
KVM/arm64 fixes for 5.13, take #1

- Fix regression with irqbypass not restarting the guest on failed connect
- Fix regression with debug register decoding resulting in overlapping access
- Commit exception state on exit to userspace
- Fix the MMU notifier return values
- Add missing 'static' qualifiers in the new host stage-2 code

arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c

diff --combined arch/x86/kvm/svm/svm.c
index 4dd9b7856e5b125b3ae59a6d27e4b1d45f1dd4d7,b649f92287a2e53d6e018ba33066ec2b281bf930..dfa351e605dec38882e2fafd47aebe6e646e66fd
@@@ -212,7 -212,7 +212,7 @@@ DEFINE_PER_CPU(struct svm_cpu_data *, s
   * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
   * defer the restoration of TSC_AUX until the CPU returns to userspace.
   */
 -#define TSC_AUX_URET_SLOT     0
 +static int tsc_aux_uret_slot __read_mostly = -1;
  
  static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
  
@@@ -447,11 -447,6 +447,11 @@@ static int has_svm(void
                return 0;
        }
  
 +      if (pgtable_l5_enabled()) {
 +              pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
 +              return 0;
 +      }
 +
        return 1;
  }
  
@@@ -964,7 -959,8 +964,7 @@@ static __init int svm_hardware_setup(vo
                kvm_tsc_scaling_ratio_frac_bits = 32;
        }
  
 -      if (boot_cpu_has(X86_FEATURE_RDTSCP))
 -              kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
 +      tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
  
        /* Check for pause filtering support */
        if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
@@@ -1104,9 -1100,7 +1104,9 @@@ static u64 svm_write_l1_tsc_offset(stru
        return svm->vmcb->control.tsc_offset;
  }
  
 -static void svm_check_invpcid(struct vcpu_svm *svm)
 +/* Evaluate instruction intercepts that depend on guest CPUID features. */
 +static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
 +                                            struct vcpu_svm *svm)
  {
        /*
         * Intercept INVPCID if shadow paging is enabled to sync/free shadow
                else
                        svm_clr_intercept(svm, INTERCEPT_INVPCID);
        }
 +
 +      if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
 +              if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 +                      svm_clr_intercept(svm, INTERCEPT_RDTSCP);
 +              else
 +                      svm_set_intercept(svm, INTERCEPT_RDTSCP);
 +      }
  }
  
  static void init_vmcb(struct kvm_vcpu *vcpu)
        svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
  
 -      svm->nested.vmcb12_gpa = 0;
 -      svm->nested.last_vmcb12_gpa = 0;
 +      svm->nested.vmcb12_gpa = INVALID_GPA;
 +      svm->nested.last_vmcb12_gpa = INVALID_GPA;
        vcpu->arch.hflags = 0;
  
        if (!kvm_pause_in_guest(vcpu->kvm)) {
                svm_clr_intercept(svm, INTERCEPT_PAUSE);
        }
  
 -      svm_check_invpcid(svm);
 +      svm_recalc_instruction_intercepts(vcpu, svm);
  
        /*
         * If the host supports V_SPEC_CTRL then disable the interception
@@@ -1437,9 -1424,6 +1437,9 @@@ static void svm_prepare_guest_switch(st
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
  
 +      if (sev_es_guest(vcpu->kvm))
 +              sev_es_unmap_ghcb(svm);
 +
        if (svm->guest_state_loaded)
                return;
  
                }
        }
  
 -      if (static_cpu_has(X86_FEATURE_RDTSCP))
 -              kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
 +      if (likely(tsc_aux_uret_slot >= 0))
 +              kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
  
        svm->guest_state_loaded = true;
  }
@@@ -2671,6 -2655,11 +2671,6 @@@ static int svm_get_msr(struct kvm_vcpu 
                        msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
                break;
        case MSR_TSC_AUX:
 -              if (!boot_cpu_has(X86_FEATURE_RDTSCP))
 -                      return 1;
 -              if (!msr_info->host_initiated &&
 -                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 -                      return 1;
                msr_info->data = svm->tsc_aux;
                break;
        /*
@@@ -2887,13 -2876,30 +2887,13 @@@ static int svm_set_msr(struct kvm_vcpu 
                svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_TSC_AUX:
 -              if (!boot_cpu_has(X86_FEATURE_RDTSCP))
 -                      return 1;
 -
 -              if (!msr->host_initiated &&
 -                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 -                      return 1;
 -
 -              /*
 -               * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
 -               * incomplete and conflicting architectural behavior.  Current
 -               * AMD CPUs completely ignore bits 63:32, i.e. they aren't
 -               * reserved and always read as zeros.  Emulate AMD CPU behavior
 -               * to avoid explosions if the vCPU is migrated from an AMD host
 -               * to an Intel host.
 -               */
 -              data = (u32)data;
 -
                /*
                 * TSC_AUX is usually changed only during boot and never read
                 * directly.  Intercept TSC_AUX instead of exposing it to the
                 * guest via direct_access_msrs, and switch it via user return.
                 */
                preempt_disable();
 -              r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
 +              r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
                preempt_enable();
                if (r)
                        return 1;
@@@ -3078,7 -3084,6 +3078,7 @@@ static int (*const svm_exit_handlers[])
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
 +      [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
        [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
        [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
        [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
@@@ -3705,25 -3710,7 +3705,7 @@@ static noinstr void svm_vcpu_enter_exit
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long vmcb_pa = svm->current_vmcb->pa;
  
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
+       kvm_guest_enter_irqoff();
  
        if (sev_es_guest(vcpu->kvm)) {
                __svm_sev_es_vcpu_run(vmcb_pa);
                vmload(__sme_page_pa(sd->save_area));
        }
  
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * guest_exit_irqoff() restores host context and reinstates RCU if
-        * enabled and required.
-        *
-        * This needs to be done before the below as native_read_msr()
-        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-        * into world and some more.
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       guest_exit_irqoff();
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
+       kvm_guest_exit_irqoff();
  }
  
  static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
@@@ -4002,7 -3972,8 +3967,7 @@@ static void svm_vcpu_after_set_cpuid(st
        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
                             guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
  
 -      /* Check again if INVPCID interception if required */
 -      svm_check_invpcid(svm);
 +      svm_recalc_instruction_intercepts(vcpu, svm);
  
        /* For sev guests, the memory encryption bit is not reserved in CR3.  */
        if (sev_guest(vcpu->kvm)) {
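
The svm.c hunks above replace the compile-time TSC_AUX_URET_SLOT constant with a slot index returned by kvm_add_user_return_msr() at hardware-setup time, and guard its use at runtime with tsc_aux_uret_slot >= 0. As a rough illustration of that register-once, look-up-by-index pattern, here is a small self-contained C sketch; the demo_* names, the fixed-size table, and main() are invented for illustration and are not part of the kernel API.

/* Illustrative sketch of the slot-based user-return-MSR pattern; not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_SLOTS 16                       /* mirrors KVM_MAX_NR_USER_RETURN_MSRS */

static uint32_t demo_msr_list[DEMO_MAX_SLOTS];
static int demo_nr_msrs;                        /* mirrors kvm_nr_uret_msrs */

/* Register an MSR once at setup; the caller caches the returned slot. */
static int demo_add_user_return_msr(uint32_t msr)
{
	if (demo_nr_msrs >= DEMO_MAX_SLOTS)
		return -1;
	demo_msr_list[demo_nr_msrs] = msr;
	return demo_nr_msrs++;
}

/* Later lookups go by MSR number instead of a compile-time slot constant. */
static int demo_find_user_return_msr(uint32_t msr)
{
	for (int i = 0; i < demo_nr_msrs; i++)
		if (demo_msr_list[i] == msr)
			return i;
	return -1;
}

int main(void)
{
	const uint32_t DEMO_MSR_TSC_AUX = 0xc0000103;   /* MSR_TSC_AUX */
	int tsc_aux_slot = demo_add_user_return_msr(DEMO_MSR_TSC_AUX);

	/* A negative slot means "not available"; callers guard on it, just as
	 * the new svm.c code checks tsc_aux_uret_slot >= 0. */
	if (tsc_aux_slot >= 0)
		printf("TSC_AUX registered in slot %d (lookup: %d)\n",
		       tsc_aux_slot, demo_find_user_return_msr(DEMO_MSR_TSC_AUX));
	return 0;
}
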
diff --combined arch/x86/kvm/vmx/vmx.c
index f2fd447eed459a2e8c91da44845b7ecec153ee33,d000cddbd7349dc81aa0bf4731d971dda0e95b92..4bceb5ca3a8997e9d6addc24e10a4d359598d3ca
@@@ -36,6 -36,7 +36,7 @@@
  #include <asm/debugreg.h>
  #include <asm/desc.h>
  #include <asm/fpu/internal.h>
+ #include <asm/idtentry.h>
  #include <asm/io.h>
  #include <asm/irq_remapping.h>
  #include <asm/kexec.h>
@@@ -454,6 -455,21 +455,6 @@@ static inline void vmx_segment_cache_cl
  
  static unsigned long host_idt_base;
  
 -/*
 - * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
 - * will emulate SYSCALL in legacy mode if the vendor string in guest
 - * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
 - * support this emulation, IA32_STAR must always be included in
 - * vmx_uret_msrs_list[], even in i386 builds.
 - */
 -static const u32 vmx_uret_msrs_list[] = {
 -#ifdef CONFIG_X86_64
 -      MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 -#endif
 -      MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 -      MSR_IA32_TSX_CTRL,
 -};
 -
  #if IS_ENABLED(CONFIG_HYPERV)
  static bool __read_mostly enlightened_vmcs = true;
  module_param(enlightened_vmcs, bool, 0444);
@@@ -681,11 -697,21 +682,11 @@@ static bool is_valid_passthrough_msr(u3
        return r;
  }
  
 -static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 -{
 -      int i;
 -
 -      for (i = 0; i < vmx->nr_uret_msrs; ++i)
 -              if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
 -                      return i;
 -      return -1;
 -}
 -
  struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
  {
        int i;
  
 -      i = __vmx_find_uret_msr(vmx, msr);
 +      i = kvm_find_user_return_msr(msr);
        if (i >= 0)
                return &vmx->guest_uret_msrs[i];
        return NULL;
  static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
                                  struct vmx_uret_msr *msr, u64 data)
  {
 +      unsigned int slot = msr - vmx->guest_uret_msrs;
        int ret = 0;
  
        u64 old_msr_data = msr->data;
        msr->data = data;
 -      if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
 +      if (msr->load_into_hardware) {
                preempt_disable();
 -              ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
 +              ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
                preempt_enable();
                if (ret)
                        msr->data = old_msr_data;
@@@ -1053,7 -1078,7 +1054,7 @@@ static bool update_transition_efer(stru
                return false;
        }
  
 -      i = __vmx_find_uret_msr(vmx, MSR_EFER);
 +      i = kvm_find_user_return_msr(MSR_EFER);
        if (i < 0)
                return false;
  
@@@ -1215,14 -1240,11 +1216,14 @@@ void vmx_prepare_switch_to_guest(struc
         */
        if (!vmx->guest_uret_msrs_loaded) {
                vmx->guest_uret_msrs_loaded = true;
 -              for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
 -                      kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
 +              for (i = 0; i < kvm_nr_uret_msrs; ++i) {
 +                      if (!vmx->guest_uret_msrs[i].load_into_hardware)
 +                              continue;
 +
 +                      kvm_set_user_return_msr(i,
                                                vmx->guest_uret_msrs[i].data,
                                                vmx->guest_uret_msrs[i].mask);
 -
 +              }
        }
  
        if (vmx->nested.need_vmcs12_to_shadow_sync)
@@@ -1729,16 -1751,19 +1730,16 @@@ static void vmx_queue_exception(struct 
        vmx_clear_hlt(vcpu);
  }
  
 -static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
 +static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
 +                             bool load_into_hardware)
  {
 -      struct vmx_uret_msr tmp;
 -      int from, to;
 +      struct vmx_uret_msr *uret_msr;
  
 -      from = __vmx_find_uret_msr(vmx, msr);
 -      if (from < 0)
 +      uret_msr = vmx_find_uret_msr(vmx, msr);
 +      if (!uret_msr)
                return;
 -      to = vmx->nr_active_uret_msrs++;
  
 -      tmp = vmx->guest_uret_msrs[to];
 -      vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
 -      vmx->guest_uret_msrs[from] = tmp;
 +      uret_msr->load_into_hardware = load_into_hardware;
  }
  
  /*
   */
  static void setup_msrs(struct vcpu_vmx *vmx)
  {
 -      vmx->guest_uret_msrs_loaded = false;
 -      vmx->nr_active_uret_msrs = 0;
  #ifdef CONFIG_X86_64
 +      bool load_syscall_msrs;
 +
        /*
         * The SYSCALL MSRs are only needed on long mode guests, and only
         * when EFER.SCE is set.
         */
 -      if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
 -              vmx_setup_uret_msr(vmx, MSR_STAR);
 -              vmx_setup_uret_msr(vmx, MSR_LSTAR);
 -              vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
 -      }
 +      load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
 +                          (vmx->vcpu.arch.efer & EFER_SCE);
 +
 +      vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
 +      vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
 +      vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
  #endif
 -      if (update_transition_efer(vmx))
 -              vmx_setup_uret_msr(vmx, MSR_EFER);
 +      vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
  
 -      if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
 -              vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
 +      vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
 +                         guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
 +                         guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
  
 -      vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
 +      /*
 +       * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
 +       * kernel and old userspace.  If those guests run on a tsx=off host, do
 +       * allow guests to use TSX_CTRL, but don't change the value in hardware
 +       * so that TSX remains always disabled.
 +       */
 +      vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
  
        if (cpu_has_vmx_msr_bitmap())
                vmx_update_msr_bitmap(&vmx->vcpu);
 +
 +      /*
 +       * The set of MSRs to load may have changed, reload MSRs before the
 +       * next VM-Enter.
 +       */
 +      vmx->guest_uret_msrs_loaded = false;
  }
  
  static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@@ -1981,6 -1993,11 +1982,6 @@@ static int vmx_get_msr(struct kvm_vcpu 
                else
                        msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
                break;
 -      case MSR_TSC_AUX:
 -              if (!msr_info->host_initiated &&
 -                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 -                      return 1;
 -              goto find_uret_msr;
        case MSR_IA32_DEBUGCTLMSR:
                msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
                break;
@@@ -2014,9 -2031,6 +2015,9 @@@ static u64 vcpu_supported_debugctl(stru
        if (!intel_pmu_lbr_is_enabled(vcpu))
                debugctl &= ~DEBUGCTLMSR_LBR_MASK;
  
 +      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
 +              debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
 +
        return debugctl;
  }
  
@@@ -2299,6 -2313,14 +2300,6 @@@ static int vmx_set_msr(struct kvm_vcpu 
                else
                        vmx->pt_desc.guest.addr_a[index / 2] = data;
                break;
 -      case MSR_TSC_AUX:
 -              if (!msr_info->host_initiated &&
 -                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 -                      return 1;
 -              /* Check reserved bit, higher 32 bits should be zero */
 -              if ((data >> 32) != 0)
 -                      return 1;
 -              goto find_uret_msr;
        case MSR_IA32_PERF_CAPABILITIES:
                if (data && !vcpu_to_pmu(vcpu)->version)
                        return 1;
@@@ -4347,23 -4369,7 +4348,23 @@@ static void vmx_compute_secondary_exec_
                                                  xsaves_enabled, false);
        }
  
 -      vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
 +      /*
 +       * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
 +       * feature is exposed to the guest.  This creates a virtualization hole
 +       * if both are supported in hardware but only one is exposed to the
 +       * guest, but letting the guest execute RDTSCP or RDPID when either one
 +       * is advertised is preferable to emulating the advertised instruction
 +       * in KVM on #UD, and obviously better than incorrectly injecting #UD.
 +       */
 +      if (cpu_has_vmx_rdtscp()) {
 +              bool rdpid_or_rdtscp_enabled =
 +                      guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
 +                      guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
 +
 +              vmx_adjust_secondary_exec_control(vmx, &exec_control,
 +                                                SECONDARY_EXEC_ENABLE_RDTSCP,
 +                                                rdpid_or_rdtscp_enabled, false);
 +      }
        vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
  
        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@@ -6410,18 -6416,17 +6411,17 @@@ static void vmx_apicv_post_state_restor
  
  void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
  
- static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+ static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+                                       unsigned long entry)
  {
-       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
-       gate_desc *desc = (gate_desc *)host_idt_base + vector;
        kvm_before_interrupt(vcpu);
-       vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+       vmx_do_interrupt_nmi_irqoff(entry);
        kvm_after_interrupt(vcpu);
  }
  
  static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
  {
+       const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
        u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
  
        /* if exit due to PF check for async PF */
                kvm_machine_check();
        /* We need to handle NMIs before interrupts are enabled */
        else if (is_nmi(intr_info))
-               handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+               handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
  }
  
  static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
  {
        u32 intr_info = vmx_get_intr_info(vcpu);
+       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+       gate_desc *desc = (gate_desc *)host_idt_base + vector;
  
        if (WARN_ONCE(!is_external_intr(intr_info),
            "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                return;
  
-       handle_interrupt_nmi_irqoff(vcpu, intr_info);
+       handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
  }
  
  static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@@ -6657,25 -6664,7 +6659,7 @@@ static fastpath_t vmx_exit_handlers_fas
  static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                        struct vcpu_vmx *vmx)
  {
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
+       kvm_guest_enter_irqoff();
  
        /* L1D Flush includes CPU buffer clear to mitigate MDS */
        if (static_branch_unlikely(&vmx_l1d_should_flush))
  
        vcpu->arch.cr2 = native_read_cr2();
  
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * guest_exit_irqoff() restores host context and reinstates RCU if
-        * enabled and required.
-        *
-        * This needs to be done before the below as native_read_msr()
-        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-        * into world and some more.
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       guest_exit_irqoff();
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
+       kvm_guest_exit_irqoff();
  }
  
  static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@@ -6883,7 -6855,6 +6850,7 @@@ static void vmx_free_vcpu(struct kvm_vc
  
  static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
  {
 +      struct vmx_uret_msr *tsx_ctrl;
        struct vcpu_vmx *vmx;
        int i, cpu, err;
  
                        goto free_vpid;
        }
  
 -      BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
 -
 -      for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
 -              u32 index = vmx_uret_msrs_list[i];
 -              u32 data_low, data_high;
 -              int j = vmx->nr_uret_msrs;
 -
 -              if (rdmsr_safe(index, &data_low, &data_high) < 0)
 -                      continue;
 -              if (wrmsr_safe(index, data_low, data_high) < 0)
 -                      continue;
 -
 -              vmx->guest_uret_msrs[j].slot = i;
 -              vmx->guest_uret_msrs[j].data = 0;
 -              switch (index) {
 -              case MSR_IA32_TSX_CTRL:
 -                      /*
 -                       * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
 -                       * interception.  Keep the host value unchanged to avoid
 -                       * changing CPUID bits under the host kernel's feet.
 -                       *
 -                       * hle=0, rtm=0, tsx_ctrl=1 can be found with some
 -                       * combinations of new kernel and old userspace.  If
 -                       * those guests run on a tsx=off host, do allow guests
 -                       * to use TSX_CTRL, but do not change the value on the
 -                       * host so that TSX remains always disabled.
 -                       */
 -                      if (boot_cpu_has(X86_FEATURE_RTM))
 -                              vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
 -                      else
 -                              vmx->guest_uret_msrs[j].mask = 0;
 -                      break;
 -              default:
 -                      vmx->guest_uret_msrs[j].mask = -1ull;
 -                      break;
 -              }
 -              ++vmx->nr_uret_msrs;
 +      for (i = 0; i < kvm_nr_uret_msrs; ++i) {
 +              vmx->guest_uret_msrs[i].data = 0;
 +              vmx->guest_uret_msrs[i].mask = -1ull;
 +      }
 +      if (boot_cpu_has(X86_FEATURE_RTM)) {
 +              /*
 +               * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
 +               * Keep the host value unchanged to avoid changing CPUID bits
 +               * under the host kernel's feet.
 +               */
 +              tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
 +              if (tsx_ctrl)
 +                      vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
        }
  
        err = alloc_loaded_vmcs(&vmx->vmcs01);
@@@ -7349,11 -7344,9 +7316,11 @@@ static __init void vmx_set_cpu_caps(voi
        if (!cpu_has_vmx_xsaves())
                kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
  
 -      /* CPUID 0x80000001 */
 -      if (!cpu_has_vmx_rdtscp())
 +      /* CPUID 0x80000001 and 0x7 (RDPID) */
 +      if (!cpu_has_vmx_rdtscp()) {
                kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
 +              kvm_cpu_cap_clear(X86_FEATURE_RDPID);
 +      }
  
        if (cpu_has_vmx_waitpkg())
                kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@@@ -7409,9 -7402,8 +7376,9 @@@ static int vmx_check_intercept(struct k
        /*
         * RDPID causes #UD if disabled through secondary execution controls.
         * Because it is marked as EmulateOnUD, we need to intercept it here.
 +       * Note, RDPID is hidden behind ENABLE_RDTSCP.
         */
 -      case x86_intercept_rdtscp:
 +      case x86_intercept_rdpid:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
                        exception->vector = UD_VECTOR;
                        exception->error_code_valid = false;
@@@ -7777,42 -7769,17 +7744,42 @@@ static struct kvm_x86_ops vmx_x86_ops _
        .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
  };
  
 +static __init void vmx_setup_user_return_msrs(void)
 +{
 +
 +      /*
 +       * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
 +       * will emulate SYSCALL in legacy mode if the vendor string in guest
 +       * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
 +       * support this emulation, MSR_STAR is included in the list for i386,
 +       * but is never loaded into hardware.  MSR_CSTAR is also never loaded
 +       * into hardware and is here purely for emulation purposes.
 +       */
 +      const u32 vmx_uret_msrs_list[] = {
 +      #ifdef CONFIG_X86_64
 +              MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 +      #endif
 +              MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 +              MSR_IA32_TSX_CTRL,
 +      };
 +      int i;
 +
 +      BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
 +
 +      for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
 +              kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
 +}
 +
  static __init int hardware_setup(void)
  {
        unsigned long host_bndcfgs;
        struct desc_ptr dt;
 -      int r, i, ept_lpage_level;
 +      int r, ept_lpage_level;
  
        store_idt(&dt);
        host_idt_base = dt.address;
  
 -      for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
 -              kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
 +      vmx_setup_user_return_msrs();
  
        if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
                return -EIO;
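
The vmx.c hunks drop the "swap active entries to the front of guest_uret_msrs" bookkeeping in favor of a per-entry load_into_hardware flag that setup_msrs() sets and vmx_prepare_switch_to_guest() checks before calling kvm_set_user_return_msr(). A minimal stand-alone sketch of that flag-based selection follows; the demo_* names and the two-entry table are illustrative assumptions, not kernel code.

/* Illustrative sketch of the load_into_hardware selection pattern; not kernel code. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_uret_msr {
	uint32_t msr;
	uint64_t data;
	bool load_into_hardware;        /* replaces the old "first N entries are active" rule */
};

static struct demo_uret_msr demo_msrs[] = {
	{ 0xc0000080 /* MSR_EFER    */, 0, false },
	{ 0xc0000103 /* MSR_TSC_AUX */, 0, false },
};

/* setup_msrs() analogue: flag, per MSR, whether hardware should carry the guest value. */
static void demo_setup(bool guest_has_rdtscp_or_rdpid)
{
	demo_msrs[1].load_into_hardware = guest_has_rdtscp_or_rdpid;
}

/* vmx_prepare_switch_to_guest() analogue: walk every slot, skip inactive entries. */
static void demo_load_guest_msrs(void)
{
	for (size_t i = 0; i < sizeof(demo_msrs) / sizeof(demo_msrs[0]); i++) {
		if (!demo_msrs[i].load_into_hardware)
			continue;
		printf("would load MSR 0x%x = 0x%llx\n",
		       (unsigned)demo_msrs[i].msr,
		       (unsigned long long)demo_msrs[i].data);
	}
}

int main(void)
{
	demo_setup(true);
	demo_load_guest_msrs();
	return 0;
}
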
diff --combined arch/x86/kvm/x86.c
index 5bd550eaf683317b9558275b013469b247c2f90d,6eda2834fc05ef07bef81d873aa80692169dcce9..9b6bca61692912099fcc7eaa9ab8d49d512be67d
@@@ -184,6 -184,11 +184,6 @@@ module_param(pi_inject_timer, bint, S_I
   */
  #define KVM_MAX_NR_USER_RETURN_MSRS 16
  
 -struct kvm_user_return_msrs_global {
 -      int nr;
 -      u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
 -};
 -
  struct kvm_user_return_msrs {
        struct user_return_notifier urn;
        bool registered;
        } values[KVM_MAX_NR_USER_RETURN_MSRS];
  };
  
 -static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
 +u32 __read_mostly kvm_nr_uret_msrs;
 +EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
 +static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
  static struct kvm_user_return_msrs __percpu *user_return_msrs;
  
  #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@@@ -327,53 -330,23 +327,53 @@@ static void kvm_on_user_return(struct u
                user_return_notifier_unregister(urn);
        }
        local_irq_restore(flags);
 -      for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
 +      for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
                values = &msrs->values[slot];
                if (values->host != values->curr) {
 -                      wrmsrl(user_return_msrs_global.msrs[slot], values->host);
 +                      wrmsrl(kvm_uret_msrs_list[slot], values->host);
                        values->curr = values->host;
                }
        }
  }
  
 -void kvm_define_user_return_msr(unsigned slot, u32 msr)
 +static int kvm_probe_user_return_msr(u32 msr)
 +{
 +      u64 val;
 +      int ret;
 +
 +      preempt_disable();
 +      ret = rdmsrl_safe(msr, &val);
 +      if (ret)
 +              goto out;
 +      ret = wrmsrl_safe(msr, val);
 +out:
 +      preempt_enable();
 +      return ret;
 +}
 +
 +int kvm_add_user_return_msr(u32 msr)
 +{
 +      BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
 +
 +      if (kvm_probe_user_return_msr(msr))
 +              return -1;
 +
 +      kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
 +      return kvm_nr_uret_msrs++;
 +}
 +EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
 +
 +int kvm_find_user_return_msr(u32 msr)
  {
 -      BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
 -      user_return_msrs_global.msrs[slot] = msr;
 -      if (slot >= user_return_msrs_global.nr)
 -              user_return_msrs_global.nr = slot + 1;
 +      int i;
 +
 +      for (i = 0; i < kvm_nr_uret_msrs; ++i) {
 +              if (kvm_uret_msrs_list[i] == msr)
 +                      return i;
 +      }
 +      return -1;
  }
 -EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
 +EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
  
  static void kvm_user_return_msr_cpu_online(void)
  {
        u64 value;
        int i;
  
 -      for (i = 0; i < user_return_msrs_global.nr; ++i) {
 -              rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
 +      for (i = 0; i < kvm_nr_uret_msrs; ++i) {
 +              rdmsrl_safe(kvm_uret_msrs_list[i], &value);
                msrs->values[i].host = value;
                msrs->values[i].curr = value;
        }
@@@ -398,7 -371,7 +398,7 @@@ int kvm_set_user_return_msr(unsigned sl
        value = (value & mask) | (msrs->values[slot].host & ~mask);
        if (value == msrs->values[slot].curr)
                return 0;
 -      err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
 +      err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
        if (err)
                return 1;
  
@@@ -1176,9 -1149,6 +1176,9 @@@ static u64 kvm_dr6_fixed(struct kvm_vcp
  
        if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
                fixed |= DR6_RTM;
 +
 +      if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
 +              fixed |= DR6_BUS_LOCK;
        return fixed;
  }
  
@@@ -1645,30 -1615,6 +1645,30 @@@ static int __kvm_set_msr(struct kvm_vcp
                 * invokes 64-bit SYSENTER.
                 */
                data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
 +              break;
 +      case MSR_TSC_AUX:
 +              if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
 +                      return 1;
 +
 +              if (!host_initiated &&
 +                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
 +                  !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
 +                      return 1;
 +
 +              /*
 +               * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
 +               * incomplete and conflicting architectural behavior.  Current
 +               * AMD CPUs completely ignore bits 63:32, i.e. they aren't
 +               * reserved and always read as zeros.  Enforce Intel's reserved
 +               * bits check if and only if the guest CPU is Intel, and clear
 +               * the bits in all other cases.  This ensures cross-vendor
 +               * migration will provide consistent behavior for the guest.
 +               */
 +              if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
 +                      return 1;
 +
 +              data = (u32)data;
 +              break;
        }
  
        msr.data = data;
@@@ -1705,18 -1651,6 +1705,18 @@@ int __kvm_get_msr(struct kvm_vcpu *vcpu
        if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
                return KVM_MSR_RET_FILTERED;
  
 +      switch (index) {
 +      case MSR_TSC_AUX:
 +              if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
 +                      return 1;
 +
 +              if (!host_initiated &&
 +                  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
 +                  !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
 +                      return 1;
 +              break;
 +      }
 +
        msr.index = index;
        msr.host_initiated = host_initiated;
  
@@@ -5534,18 -5468,14 +5534,18 @@@ static void kvm_free_msr_filter(struct 
  static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
                              struct kvm_msr_filter_range *user_range)
  {
 -      struct msr_bitmap_range range;
        unsigned long *bitmap = NULL;
        size_t bitmap_size;
 -      int r;
  
        if (!user_range->nmsrs)
                return 0;
  
 +      if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
 +              return -EINVAL;
 +
 +      if (!user_range->flags)
 +              return -EINVAL;
 +
        bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
        if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
                return -EINVAL;
        if (IS_ERR(bitmap))
                return PTR_ERR(bitmap);
  
 -      range = (struct msr_bitmap_range) {
 +      msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
                .flags = user_range->flags,
                .base = user_range->base,
                .nmsrs = user_range->nmsrs,
                .bitmap = bitmap,
        };
  
 -      if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
 -              r = -EINVAL;
 -              goto err;
 -      }
 -
 -      if (!range.flags) {
 -              r = -EINVAL;
 -              goto err;
 -      }
 -
 -      /* Everything ok, add this range identifier. */
 -      msr_filter->ranges[msr_filter->count] = range;
        msr_filter->count++;
 -
        return 0;
 -err:
 -      kfree(bitmap);
 -      return r;
  }
  
  static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
@@@ -5991,8 -5937,7 +5991,8 @@@ static void kvm_init_msr_list(void
                                continue;
                        break;
                case MSR_TSC_AUX:
 -                      if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
 +                      if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
 +                          !kvm_cpu_cap_has(X86_FEATURE_RDPID))
                                continue;
                        break;
                case MSR_IA32_UMWAIT_CONTROL:
@@@ -8094,18 -8039,6 +8094,18 @@@ static void pvclock_gtod_update_fn(stru
  
  static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
  
 +/*
 + * Indirection to move queue_work() out of the tk_core.seq write held
 + * region to prevent possible deadlocks against time accessors which
 + * are invoked with work related locks held.
 + */
 +static void pvclock_irq_work_fn(struct irq_work *w)
 +{
 +      queue_work(system_long_wq, &pvclock_gtod_work);
 +}
 +
 +static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
 +
  /*
   * Notification about pvclock gtod data update.
   */
@@@ -8117,14 -8050,13 +8117,14 @@@ static int pvclock_gtod_notify(struct n
  
        update_pvclock_gtod(tk);
  
 -      /* disable master clock if host does not trust, or does not
 -       * use, TSC based clocksource.
 +      /*
 +       * Disable master clock if host does not trust, or does not use,
 +       * TSC based clocksource. Delegate queue_work() to irq_work as
 +       * this is invoked with tk_core.seq write held.
         */
        if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
            atomic_read(&kvm_guest_has_master_clock) != 0)
 -              queue_work(system_long_wq, &pvclock_gtod_work);
 -
 +              irq_work_queue(&pvclock_irq_work);
        return 0;
  }
  
@@@ -8186,7 -8118,6 +8186,7 @@@ int kvm_arch_init(void *opaque
                printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
                goto out_free_x86_emulator_cache;
        }
 +      kvm_nr_uret_msrs = 0;
  
        r = kvm_mmu_module_init();
        if (r)
@@@ -8237,8 -8168,6 +8237,8 @@@ void kvm_arch_exit(void
        cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
  #ifdef CONFIG_X86_64
        pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
 +      irq_work_sync(&pvclock_irq_work);
 +      cancel_work_sync(&pvclock_gtod_work);
  #endif
        kvm_x86_ops.hardware_enable = NULL;
        kvm_mmu_module_exit();
@@@ -9386,6 -9315,15 +9386,15 @@@ static int vcpu_enter_guest(struct kvm_
        local_irq_disable();
        kvm_after_interrupt(vcpu);
  
+       /*
+        * Wait until after servicing IRQs to account guest time so that any
+        * ticks that occurred while running the guest are properly accounted
+        * to the guest.  Waiting until IRQs are enabled degrades the accuracy
+        * of accounting via context tracking, but the loss of accuracy is
+        * acceptable for all known use cases.
+        */
+       vtime_account_guest_exit();
        if (lapic_in_kernel(vcpu)) {
                s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
                if (delta != S64_MIN) {