Git Repo - linux.git/commitdiff
Merge tag 'kvm-x86-svm-6.7' of https://github.com/kvm-x86/linux into HEAD
author Paolo Bonzini <[email protected]>
Tue, 31 Oct 2023 14:22:43 +0000 (10:22 -0400)
committer Paolo Bonzini <[email protected]>
Tue, 31 Oct 2023 14:22:43 +0000 (10:22 -0400)
KVM SVM changes for 6.7:

 - Report KVM_EXIT_SHUTDOWN instead of EINVAL if KVM intercepts SHUTDOWN while
   running an SEV-ES guest.

 - Clean up handling "failures" when KVM detects it can't emulate the "skip"
   action for an instruction that has already been partially emulated.  Drop a
   hack in the SVM code that was fudging around the emulator code not giving
   SVM enough information to do the right thing.
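
To make the second item concrete, here is a minimal, self-contained sketch of the new contract: the vendor hook now returns an X86EMUL_* action code instead of a bool, and the common emulation entry point maps each code to an outcome. The enum values mirror the X86EMUL_* codes used in the diff below, but the helper names, condition flags, and printed messages are illustrative stand-ins, not the kernel's actual definitions or signatures.

/*
 * Simplified model of the check_emulate_instruction() contract introduced
 * by this merge.  Constants and helpers are illustrative stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

enum x86emul_ret {
        X86EMUL_CONTINUE,               /* emulation may proceed */
        X86EMUL_UNHANDLEABLE,           /* cannot emulate; report failure */
        X86EMUL_PROPAGATE_FAULT,        /* an exception was queued for the guest */
        X86EMUL_RETRY_INSTR,            /* resume the guest and retry the instruction */
};

/* Toy stand-in for the vendor hook (cf. svm_check_emulate_instruction()). */
static enum x86emul_ret check_emulate(bool sev_es_guest, bool have_insn_bytes,
                                      bool skip_only)
{
        if (sev_es_guest)
                return X86EMUL_RETRY_INSTR;     /* no access to guest state */
        if (!have_insn_bytes) {
                if (skip_only)
                        return X86EMUL_UNHANDLEABLE;
                /* The real code queues a #UD for the guest at this point. */
                return X86EMUL_PROPAGATE_FAULT;
        }
        return X86EMUL_CONTINUE;
}

/* Toy stand-in for the common entry point (cf. x86_emulate_instruction()). */
static int emulate(bool sev_es_guest, bool have_insn_bytes, bool skip_only)
{
        switch (check_emulate(sev_es_guest, have_insn_bytes, skip_only)) {
        case X86EMUL_CONTINUE:
                printf("decode and emulate the instruction\n");
                return 1;
        case X86EMUL_RETRY_INSTR:
        case X86EMUL_PROPAGATE_FAULT:
                printf("re-enter the guest\n");
                return 1;
        case X86EMUL_UNHANDLEABLE:
        default:
                printf("emulation failure, exit to userspace\n");
                return 0;
        }
}

int main(void)
{
        /* EMULTYPE_SKIP with no instruction bytes is now an outright failure. */
        return emulate(false, false, true);
}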

arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c

diff --combined arch/x86/include/asm/kvm-x86-ops.h
index 9b419f0de713cc720ad348598ac3c5a615426c58,ee2404a559af1ad78c8b279ea7f402c44d6a99ef..26b628d84594b93fea349b0a738768cfb9a9a15b
@@@ -108,7 -108,6 +108,7 @@@ KVM_X86_OP_OPTIONAL(vcpu_blocking
  KVM_X86_OP_OPTIONAL(vcpu_unblocking)
  KVM_X86_OP_OPTIONAL(pi_update_irte)
  KVM_X86_OP_OPTIONAL(pi_start_assignment)
 +KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
  KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
  KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
  KVM_X86_OP_OPTIONAL(set_hv_timer)
@@@ -127,7 -126,7 +127,7 @@@ KVM_X86_OP_OPTIONAL(vm_copy_enc_context
  KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
  KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
  KVM_X86_OP(get_msr_feature)
- KVM_X86_OP(can_emulate_instruction)
+ KVM_X86_OP(check_emulate_instruction)
  KVM_X86_OP(apic_init_signal_blocked)
  KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
  KVM_X86_OP_OPTIONAL(migrate_timers)
diff --combined arch/x86/include/asm/kvm_host.h
index ae48fd8cb27899a8a51efd63b4a27d55aafebda1,89583b410527eb3767e68ef07e1901d11607d6bf..d7036982332e33d7b858379785b9071ad4992451
  
  #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
  
 +/*
 + * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if
 + * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
 + */
 +#ifdef CONFIG_KVM_MAX_NR_VCPUS
 +#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
 +#else
  #define KVM_MAX_VCPUS 1024
 +#endif
  
  /*
   * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
@@@ -536,6 -528,7 +536,6 @@@ struct kvm_pmu 
        u64 raw_event_mask;
        struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
        struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
 -      struct irq_work irq_work;
  
        /*
         * Overlay the bitmap with a 64-bit atomic so that all bits can be
@@@ -687,7 -680,6 +687,7 @@@ struct kvm_hypervisor_cpuid 
        u32 limit;
  };
  
 +#ifdef CONFIG_KVM_XEN
  /* Xen HVM per vcpu emulation context */
  struct kvm_vcpu_xen {
        u64 hypercall_rip;
        struct timer_list poll_timer;
        struct kvm_hypervisor_cpuid cpuid;
  };
 +#endif
  
  struct kvm_queued_exception {
        bool pending;
@@@ -939,9 -930,8 +939,9 @@@ struct kvm_vcpu_arch 
  
        bool hyperv_enabled;
        struct kvm_vcpu_hv *hyperv;
 +#ifdef CONFIG_KVM_XEN
        struct kvm_vcpu_xen xen;
 -
 +#endif
        cpumask_var_t wbinvd_dirty_mask;
  
        unsigned long last_retry_eip;
@@@ -1286,6 -1276,7 +1286,6 @@@ struct kvm_arch 
         */
        spinlock_t mmu_unsync_pages_lock;
  
 -      struct list_head assigned_dev_head;
        struct iommu_domain *iommu_domain;
        bool iommu_noncoherent;
  #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
        int nr_vcpus_matched_tsc;
  
        u32 default_tsc_khz;
 +      bool user_set_tsc;
  
        seqcount_raw_spinlock_t pvclock_sc;
        bool use_master_clock;
@@@ -1702,7 -1692,7 +1702,7 @@@ struct kvm_x86_ops 
  
        void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
  
 -      void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
 +      void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
  
        /*
         * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer.  A zero
        int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
                              uint32_t guest_irq, bool set);
        void (*pi_start_assignment)(struct kvm *kvm);
 +      void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
        void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
        bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
  
  
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
  
-       bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
-                                       void *insn, int insn_len);
+       int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
+                                        void *insn, int insn_len);
  
        bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
        int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
diff --combined arch/x86/kvm/svm/svm.c
index eb234cdd370b647d3186e6fac3f8758af057c12c,b7472ad183b9d361128cc76dd2d81c221394652b..1855a6d7c976ad2fcff44a83a41589c6c8d45e59
@@@ -199,7 -199,7 +199,7 @@@ module_param_named(npt, npt_enabled, bo
  
  /* allow nested virtualization in KVM/SVM */
  static int nested = true;
 -module_param(nested, int, S_IRUGO);
 +module_param(nested, int, 0444);
  
  /* enable/disable Next RIP Save */
  int nrips = true;
@@@ -364,8 -364,6 +364,6 @@@ static void svm_set_interrupt_shadow(st
                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
  
  }
- static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
-                                       void *insn, int insn_len);
  
  static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
                                           bool commit_side_effects)
        }
  
        if (!svm->next_rip) {
-               /*
-                * FIXME: Drop this when kvm_emulate_instruction() does the
-                * right thing and treats "can't emulate" as outright failure
-                * for EMULTYPE_SKIP.
-                */
-               if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
-                       return 0;
                if (unlikely(!commit_side_effects))
                        old_rflags = svm->vmcb->save.rflags;
  
@@@ -691,7 -681,7 +681,7 @@@ static int svm_hardware_enable(void
         */
        if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
                struct sev_es_save_area *hostsa;
 -              u32 msr_hi;
 +              u32 __maybe_unused msr_hi;
  
                hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
  
@@@ -913,7 -903,8 +903,7 @@@ void svm_set_x2apic_msr_interception(st
        if (intercept == svm->x2avic_msrs_intercepted)
                return;
  
 -      if (!x2avic_enabled ||
 -          !apic_x2apic_mode(svm->vcpu.arch.apic))
 +      if (!x2avic_enabled)
                return;
  
        for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
@@@ -2202,12 -2193,6 +2192,6 @@@ static int shutdown_interception(struc
        struct kvm_run *kvm_run = vcpu->run;
        struct vcpu_svm *svm = to_svm(vcpu);
  
-       /*
-        * The VM save area has already been encrypted so it
-        * cannot be reinitialized - just terminate.
-        */
-       if (sev_es_guest(vcpu->kvm))
-               return -EINVAL;
  
        /*
         * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
         * userspace.  At a platform view, INIT is acceptable behavior as
         * there exist bare metal platforms that automatically INIT the CPU
         * in response to shutdown.
+        *
+        * The VM save area for SEV-ES guests has already been encrypted so it
+        * cannot be reinitialized, i.e. synthesizing INIT is futile.
         */
-       clear_page(svm->vmcb);
-       kvm_vcpu_reset(vcpu, true);
+       if (!sev_es_guest(vcpu->kvm)) {
+               clear_page(svm->vmcb);
+               kvm_vcpu_reset(vcpu, true);
+       }
  
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
@@@ -4727,15 -4717,15 +4716,15 @@@ static void svm_enable_smi_window(struc
  }
  #endif
  
- static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
-                                       void *insn, int insn_len)
+ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+                                        void *insn, int insn_len)
  {
        bool smep, smap, is_user;
        u64 error_code;
  
        /* Emulation is always possible when KVM has access to all guest state. */
        if (!sev_guest(vcpu->kvm))
-               return true;
+               return X86EMUL_CONTINUE;
  
        /* #UD and #GP should never be intercepted for SEV guests. */
        WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
         * to guest register state.
         */
        if (sev_es_guest(vcpu->kvm))
-               return false;
+               return X86EMUL_RETRY_INSTR;
  
        /*
         * Emulation is possible if the instruction is already decoded, e.g.
         * when completing I/O after returning from userspace.
         */
        if (emul_type & EMULTYPE_NO_DECODE)
-               return true;
+               return X86EMUL_CONTINUE;
  
        /*
         * Emulation is possible for SEV guests if and only if a prefilled
         * success (and in practice it will work the vast majority of the time).
         */
        if (unlikely(!insn)) {
-               if (!(emul_type & EMULTYPE_SKIP))
-                       kvm_queue_exception(vcpu, UD_VECTOR);
-               return false;
+               if (emul_type & EMULTYPE_SKIP)
+                       return X86EMUL_UNHANDLEABLE;
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return X86EMUL_PROPAGATE_FAULT;
        }
  
        /*
         * table used to translate CS:RIP resides in emulated MMIO.
         */
        if (likely(insn_len))
-               return true;
+               return X86EMUL_CONTINUE;
  
        /*
         * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
                        kvm_inject_gp(vcpu, 0);
                else
                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               return X86EMUL_PROPAGATE_FAULT;
        }
  
  resume_guest:
         * doesn't explicitly define "ignored", i.e. doing nothing and letting
         * the guest spin is technically "ignoring" the access.
         */
-       return false;
+       return X86EMUL_RETRY_INSTR;
  }
  
  static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
@@@ -5028,7 -5021,7 +5020,7 @@@ static struct kvm_x86_ops svm_x86_ops _
        .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
        .vm_move_enc_context_from = sev_vm_move_enc_context_from,
  
-       .can_emulate_instruction = svm_can_emulate_instruction,
+       .check_emulate_instruction = svm_check_emulate_instruction,
  
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
  
diff --combined arch/x86/kvm/vmx/vmx.c
index 610e37e215be598ea8432b2c018733126ddf0f0c,4e453ba283207714c748ea31d3284fd140c938f4..be20a60047b1f2930a621409f97d7f668b1ae29a
@@@ -82,28 -82,28 +82,28 @@@ bool __read_mostly enable_vpid = 1
  module_param_named(vpid, enable_vpid, bool, 0444);
  
  static bool __read_mostly enable_vnmi = 1;
 -module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
 +module_param_named(vnmi, enable_vnmi, bool, 0444);
  
  bool __read_mostly flexpriority_enabled = 1;
 -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 +module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
  
  bool __read_mostly enable_ept = 1;
 -module_param_named(ept, enable_ept, bool, S_IRUGO);
 +module_param_named(ept, enable_ept, bool, 0444);
  
  bool __read_mostly enable_unrestricted_guest = 1;
  module_param_named(unrestricted_guest,
 -                      enable_unrestricted_guest, bool, S_IRUGO);
 +                      enable_unrestricted_guest, bool, 0444);
  
  bool __read_mostly enable_ept_ad_bits = 1;
 -module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
 +module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
  
  static bool __read_mostly emulate_invalid_guest_state = true;
 -module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 +module_param(emulate_invalid_guest_state, bool, 0444);
  
  static bool __read_mostly fasteoi = 1;
 -module_param(fasteoi, bool, S_IRUGO);
 +module_param(fasteoi, bool, 0444);
  
 -module_param(enable_apicv, bool, S_IRUGO);
 +module_param(enable_apicv, bool, 0444);
  
  bool __read_mostly enable_ipiv = true;
  module_param(enable_ipiv, bool, 0444);
   * use VMX instructions.
   */
  static bool __read_mostly nested = 1;
 -module_param(nested, bool, S_IRUGO);
 +module_param(nested, bool, 0444);
  
  bool __read_mostly enable_pml = 1;
 -module_param_named(pml, enable_pml, bool, S_IRUGO);
 +module_param_named(pml, enable_pml, bool, 0444);
  
  static bool __read_mostly error_on_inconsistent_vmcs_config = true;
  module_param(error_on_inconsistent_vmcs_config, bool, 0444);
@@@ -1657,8 -1657,8 +1657,8 @@@ static int vmx_rtit_ctl_check(struct kv
        return 0;
  }
  
- static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
-                                       void *insn, int insn_len)
+ static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+                                        void *insn, int insn_len)
  {
        /*
         * Emulation of instructions in SGX enclaves is impossible as RIP does
         */
        if (to_vmx(vcpu)->exit_reason.enclave_mode) {
                kvm_queue_exception(vcpu, UD_VECTOR);
-               return false;
+               return X86EMUL_PROPAGATE_FAULT;
        }
-       return true;
+       return X86EMUL_CONTINUE;
  }
  
  static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@@ -5792,7 -5792,7 +5792,7 @@@ static int handle_ept_misconfig(struct 
  {
        gpa_t gpa;
  
-       if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
+       if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
                return 1;
  
        /*
@@@ -6912,7 -6912,7 +6912,7 @@@ static void vmx_load_eoi_exitmap(struc
        vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
  }
  
 -static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 +static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
@@@ -7579,6 -7579,8 +7579,6 @@@ static int vmx_vm_init(struct kvm *kvm
  
  static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
  {
 -      u8 cache;
 -
        /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
         * memory aliases with conflicting memory types and sometimes MCEs.
         * We have to be careful as to what are honored and when.
  
        if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
                if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
 -                      cache = MTRR_TYPE_WRBACK;
 +                      return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
                else
 -                      cache = MTRR_TYPE_UNCACHABLE;
 -
 -              return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
 +                      return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
 +                              VMX_EPT_IPAT_BIT;
        }
  
        return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
@@@ -8283,7 -8286,7 +8283,7 @@@ static struct kvm_x86_ops vmx_x86_ops _
        .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
        .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
        .load_eoi_exitmap = vmx_load_eoi_exitmap,
 -      .apicv_post_state_restore = vmx_apicv_post_state_restore,
 +      .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
        .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
        .hwapic_irr_update = vmx_hwapic_irr_update,
        .hwapic_isr_update = vmx_hwapic_isr_update,
        .enable_smi_window = vmx_enable_smi_window,
  #endif
  
-       .can_emulate_instruction = vmx_can_emulate_instruction,
+       .check_emulate_instruction = vmx_check_emulate_instruction,
        .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
        .migrate_timers = vmx_migrate_timers,
  
diff --combined arch/x86/kvm/x86.c
index 761e0b3c9be01d21aba7a807c70a114cc5fff40e,cc7d29e9104ba5d42c8559ea930f1b7ba3c089ee..2c924075f6f112a594c0a4390bb7dcc2d7e8fabf
@@@ -145,21 -145,21 +145,21 @@@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_d
  EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
  
  static bool __read_mostly ignore_msrs = 0;
 -module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 +module_param(ignore_msrs, bool, 0644);
  
  bool __read_mostly report_ignored_msrs = true;
 -module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
 +module_param(report_ignored_msrs, bool, 0644);
  EXPORT_SYMBOL_GPL(report_ignored_msrs);
  
  unsigned int min_timer_period_us = 200;
 -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 +module_param(min_timer_period_us, uint, 0644);
  
  static bool __read_mostly kvmclock_periodic_sync = true;
 -module_param(kvmclock_periodic_sync, bool, S_IRUGO);
 +module_param(kvmclock_periodic_sync, bool, 0444);
  
  /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
  static u32 __read_mostly tsc_tolerance_ppm = 250;
 -module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 +module_param(tsc_tolerance_ppm, uint, 0644);
  
  /*
   * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
   * tuning, i.e. allows privileged userspace to set an exact advancement time.
   */
  static int __read_mostly lapic_timer_advance_ns = -1;
 -module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
 +module_param(lapic_timer_advance_ns, int, 0644);
  
  static bool __read_mostly vector_hashing = true;
 -module_param(vector_hashing, bool, S_IRUGO);
 +module_param(vector_hashing, bool, 0444);
  
  bool __read_mostly enable_vmware_backdoor = false;
 -module_param(enable_vmware_backdoor, bool, S_IRUGO);
 +module_param(enable_vmware_backdoor, bool, 0444);
  EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
  
  /*
@@@ -186,7 -186,7 +186,7 @@@ static int __read_mostly force_emulatio
  module_param(force_emulation_prefix, int, 0644);
  
  int __read_mostly pi_inject_timer = -1;
 -module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 +module_param(pi_inject_timer, bint, 0644);
  
  /* Enable/disable PMU virtualization */
  bool __read_mostly enable_pmu = true;
@@@ -962,7 -962,7 +962,7 @@@ void kvm_post_set_cr0(struct kvm_vcpu *
                kvm_mmu_reset_context(vcpu);
  
        if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
 -          kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
 +          kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
            !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
                kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
  }
@@@ -2331,9 -2331,14 +2331,9 @@@ static void kvm_write_wall_clock(struc
        if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
                return;
  
 -      /*
 -       * The guest calculates current wall clock time by adding
 -       * system time (updated by kvm_guest_time_update below) to the
 -       * wall clock specified here.  We do the reverse here.
 -       */
 -      wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
 +      wall_nsec = kvm_get_wall_clock_epoch(kvm);
  
 -      wc.nsec = do_div(wall_nsec, 1000000000);
 +      wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
        wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
        wc.version = version;
  
@@@ -2709,9 -2714,8 +2709,9 @@@ static void __kvm_synchronize_tsc(struc
        kvm_track_tsc_matching(vcpu);
  }
  
 -static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
 +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
  {
 +      u64 data = user_value ? *user_value : 0;
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
        if (vcpu->arch.virtual_tsc_khz) {
                if (data == 0) {
                        /*
 -                       * detection of vcpu initialization -- need to sync
 -                       * with other vCPUs. This particularly helps to keep
 -                       * kvm_clock stable after CPU hotplug
 +                       * Force synchronization when creating a vCPU, or when
 +                       * userspace explicitly writes a zero value.
                         */
                        synchronizing = true;
 -              } else {
 +              } else if (kvm->arch.user_set_tsc) {
                        u64 tsc_exp = kvm->arch.last_tsc_write +
                                                nsec_to_cycles(vcpu, elapsed);
                        u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
                        /*
 -                       * Special case: TSC write with a small delta (1 second)
 -                       * of virtual cycle time against real time is
 -                       * interpreted as an attempt to synchronize the CPU.
 +                       * Here lies UAPI baggage: when a user-initiated TSC write has
 +                       * a small delta (1 second) of virtual cycle time against the
 +                       * previously set vCPU, we assume that they were intended to be
 +                       * in sync and the delta was only due to the racy nature of the
 +                       * legacy API.
 +                       *
 +                       * This trick falls down when restoring a guest which genuinely
 +                       * has been running for less time than the 1 second of imprecision
 +                       * which we allow for in the legacy API. In this case, the first
 +                       * value written by userspace (on any vCPU) should not be subject
 +                       * to this 'correction' to make it sync up with values that only
 +                       * come from the kernel's default vCPU creation. Make the 1-second
 +                       * slop hack only trigger if the user_set_tsc flag is already set.
                         */
                        synchronizing = data < tsc_exp + tsc_hz &&
                                        data + tsc_hz > tsc_exp;
                }
        }
  
 +      if (user_value)
 +              kvm->arch.user_set_tsc = true;
 +
        /*
         * For a reliable TSC, we can match TSC offsets, and for an unstable
         * TSC, we add elapsed time in this computation.  We could let the
@@@ -3240,93 -3232,15 +3240,93 @@@ static int kvm_guest_time_update(struc
  
        if (vcpu->pv_time.active)
                kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
 +#ifdef CONFIG_KVM_XEN
        if (vcpu->xen.vcpu_info_cache.active)
                kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
                                        offsetof(struct compat_vcpu_info, time));
        if (vcpu->xen.vcpu_time_info_cache.active)
                kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
 +#endif
        kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
  }
  
 +/*
 + * The pvclock_wall_clock ABI tells the guest the wall clock time at
 + * which it started (i.e. its epoch, when its kvmclock was zero).
 + *
 + * In fact those clocks are subtly different; wall clock frequency is
 + * adjusted by NTP and has leap seconds, while the kvmclock is a
 + * simple function of the TSC without any such adjustment.
 + *
 + * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
 + * that and kvmclock, but even that would be subject to change over
 + * time.
 + *
 + * Attempt to calculate the epoch at a given moment using the *same*
 + * TSC reading via kvm_get_walltime_and_clockread() to obtain both
 + * wallclock and kvmclock times, and subtracting one from the other.
 + *
 + * Fall back to using their values at slightly different moments by
 + * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
 + */
 +uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
 +{
 +#ifdef CONFIG_X86_64
 +      struct pvclock_vcpu_time_info hv_clock;
 +      struct kvm_arch *ka = &kvm->arch;
 +      unsigned long seq, local_tsc_khz;
 +      struct timespec64 ts;
 +      uint64_t host_tsc;
 +
 +      do {
 +              seq = read_seqcount_begin(&ka->pvclock_sc);
 +
 +              local_tsc_khz = 0;
 +              if (!ka->use_master_clock)
 +                      break;
 +
 +              /*
 +               * The TSC read and the call to get_cpu_tsc_khz() must happen
 +               * on the same CPU.
 +               */
 +              get_cpu();
 +
 +              local_tsc_khz = get_cpu_tsc_khz();
 +
 +              if (local_tsc_khz &&
 +                  !kvm_get_walltime_and_clockread(&ts, &host_tsc))
 +                      local_tsc_khz = 0; /* Fall back to old method */
 +
 +              put_cpu();
 +
 +              /*
 +               * These values must be snapshotted within the seqcount loop.
 +               * After that, it's just mathematics which can happen on any
 +               * CPU at any time.
 +               */
 +              hv_clock.tsc_timestamp = ka->master_cycle_now;
 +              hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
 +
 +      } while (read_seqcount_retry(&ka->pvclock_sc, seq));
 +
 +      /*
 +       * If the conditions were right, and obtaining the wallclock+TSC was
 +       * successful, calculate the KVM clock at the corresponding time and
 +       * subtract one from the other to get the guest's epoch in nanoseconds
 +       * since 1970-01-01.
 +       */
 +      if (local_tsc_khz) {
 +              kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
 +                                 &hv_clock.tsc_shift,
 +                                 &hv_clock.tsc_to_system_mul);
 +              return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
 +                      __pvclock_read_cycles(&hv_clock, host_tsc);
 +      }
 +#endif
 +      return ktime_get_real_ns() - get_kvmclock_ns(kvm);
 +}
 +
  /*
   * kvmclock updates which are isolated to a given vcpu, such as
   * vcpu->cpu migration, should not allow system_timestamp from
@@@ -3376,6 -3290,9 +3376,6 @@@ static void kvmclock_sync_fn(struct wor
                                           kvmclock_sync_work);
        struct kvm *kvm = container_of(ka, struct kvm, arch);
  
 -      if (!kvmclock_periodic_sync)
 -              return;
 -
        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
                                        KVMCLOCK_SYNC_PERIOD);
@@@ -3724,7 -3641,6 +3724,7 @@@ int kvm_set_msr_common(struct kvm_vcpu 
        case MSR_AMD64_PATCH_LOADER:
        case MSR_AMD64_BU_CFG2:
        case MSR_AMD64_DC_CFG:
 +      case MSR_AMD64_TW_CFG:
        case MSR_F15H_EX_CFG:
                break;
  
                vcpu->arch.perf_capabilities = data;
                kvm_pmu_refresh(vcpu);
                break;
 -      case MSR_IA32_PRED_CMD:
 -              if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
 -                      return 1;
 +      case MSR_IA32_PRED_CMD: {
 +              u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
 +
 +              if (!msr_info->host_initiated) {
 +                      if ((!guest_has_pred_cmd_msr(vcpu)))
 +                              return 1;
 +
 +                      if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
 +                          !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
 +                              reserved_bits |= PRED_CMD_IBPB;
 +
 +                      if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
 +                              reserved_bits |= PRED_CMD_SBPB;
 +              }
 +
 +              if (!boot_cpu_has(X86_FEATURE_IBPB))
 +                      reserved_bits |= PRED_CMD_IBPB;
 +
 +              if (!boot_cpu_has(X86_FEATURE_SBPB))
 +                      reserved_bits |= PRED_CMD_SBPB;
  
 -              if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
 +              if (data & reserved_bits)
                        return 1;
 +
                if (!data)
                        break;
  
 -              wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
 +              wrmsrl(MSR_IA32_PRED_CMD, data);
                break;
 +      }
        case MSR_IA32_FLUSH_CMD:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
                data &= ~(u64)0x8;      /* ignore TLB cache disable */
  
 -              /* Handle McStatusWrEn */
 -              if (data == BIT_ULL(18)) {
 -                      vcpu->arch.msr_hwcr = data;
 -              } else if (data != 0) {
 +              /*
 +               * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
 +               * through at least v6.6 whine if TscFreqSel is clear,
 +               * depending on F/M/S.
 +               */
 +              if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
                        kvm_pr_unimpl_wrmsr(vcpu, msr, data);
                        return 1;
                }
 +              vcpu->arch.msr_hwcr = data;
                break;
        case MSR_FAM10H_MMIO_CONF_BASE:
                if (data != 0) {
                break;
        case MSR_IA32_TSC:
                if (msr_info->host_initiated) {
 -                      kvm_synchronize_tsc(vcpu, data);
 +                      kvm_synchronize_tsc(vcpu, &data);
                } else {
                        u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
                        adjust_tsc_offset_guest(vcpu, adj);
@@@ -4171,7 -4065,6 +4171,7 @@@ int kvm_get_msr_common(struct kvm_vcpu 
        case MSR_AMD64_BU_CFG2:
        case MSR_IA32_PERF_CTL:
        case MSR_AMD64_DC_CFG:
 +      case MSR_AMD64_TW_CFG:
        case MSR_F15H_EX_CFG:
        /*
         * Intel Sandy Bridge CPUs must support the RAPL (running average power
@@@ -5489,37 -5382,26 +5489,37 @@@ static int kvm_vcpu_ioctl_x86_set_debug
        return 0;
  }
  
 -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 -                                       struct kvm_xsave *guest_xsave)
 -{
 -      if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
 -              return;
 -
 -      fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
 -                                     guest_xsave->region,
 -                                     sizeof(guest_xsave->region),
 -                                     vcpu->arch.pkru);
 -}
  
  static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
                                          u8 *state, unsigned int size)
  {
 +      /*
 +       * Only copy state for features that are enabled for the guest.  The
 +       * state itself isn't problematic, but setting bits in the header for
 +       * features that are supported in *this* host but not exposed to the
 +       * guest can result in KVM_SET_XSAVE failing when live migrating to a
 +       * compatible host without the features that are NOT exposed to the
 +       * guest.
 +       *
 +       * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
 +       * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't
 +       * supported by the host.
 +       */
 +      u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
 +                           XFEATURE_MASK_FPSSE;
 +
        if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return;
  
 -      fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
 -                                     state, size, vcpu->arch.pkru);
 +      fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
 +                                     supported_xcr0, vcpu->arch.pkru);
 +}
 +
 +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 +                                       struct kvm_xsave *guest_xsave)
 +{
 +      return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
 +                                           sizeof(guest_xsave->region));
  }
  
  static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
@@@ -5654,7 -5536,6 +5654,7 @@@ static int kvm_arch_tsc_set_attr(struc
                tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
                ns = get_kvmclock_base_ns();
  
 +              kvm->arch.user_set_tsc = true;
                __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
                raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
  
@@@ -6367,9 -6248,6 +6367,9 @@@ void kvm_arch_sync_dirty_log(struct kv
        struct kvm_vcpu *vcpu;
        unsigned long i;
  
 +      if (!kvm_x86_ops.cpu_dirty_log_size)
 +              return;
 +
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vcpu_kick(vcpu);
  }
@@@ -7596,11 -7474,11 +7596,11 @@@ int kvm_write_guest_virt_system(struct 
  }
  EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
  
- static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
-                               void *insn, int insn_len)
+ static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+                                 void *insn, int insn_len)
  {
-       return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
-                                                           insn, insn_len);
+       return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
+                                                             insn, insn_len);
  }
  
  int handle_ud(struct kvm_vcpu *vcpu)
        int emul_type = EMULTYPE_TRAP_UD;
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
+       int r;
  
-       if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
+       r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
+       if (r != X86EMUL_CONTINUE)
                return 1;
  
        if (fep_flags &&
@@@ -8993,8 -8873,14 +8995,14 @@@ int x86_emulate_instruction(struct kvm_
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
  
-       if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
-               return 1;
+       r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
+       if (r != X86EMUL_CONTINUE) {
+               if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
+                       return 1;
+               WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
+               return handle_emulation_failure(vcpu, emulation_type);
+       }
  
        vcpu->arch.l1tf_flush_l1d = true;
  
@@@ -10698,16 -10584,16 +10706,16 @@@ static int vcpu_enter_guest(struct kvm_
                }
                if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
                        record_steal_time(vcpu);
 +              if (kvm_check_request(KVM_REQ_PMU, vcpu))
 +                      kvm_pmu_handle_event(vcpu);
 +              if (kvm_check_request(KVM_REQ_PMI, vcpu))
 +                      kvm_pmu_deliver_pmi(vcpu);
  #ifdef CONFIG_KVM_SMM
                if (kvm_check_request(KVM_REQ_SMI, vcpu))
                        process_smi(vcpu);
  #endif
                if (kvm_check_request(KVM_REQ_NMI, vcpu))
                        process_nmi(vcpu);
 -              if (kvm_check_request(KVM_REQ_PMU, vcpu))
 -                      kvm_pmu_handle_event(vcpu);
 -              if (kvm_check_request(KVM_REQ_PMI, vcpu))
 -                      kvm_pmu_deliver_pmi(vcpu);
                if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
                        BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
                        if (test_bit(vcpu->arch.pending_ioapic_eoi,
@@@ -11643,6 -11529,7 +11651,6 @@@ static int __set_sregs_common(struct kv
  
        *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
 -      vcpu->arch.cr0 = sregs->cr0;
  
        *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@@ -11686,10 -11573,8 +11694,10 @@@ static int __set_sregs(struct kvm_vcpu 
        if (ret)
                return ret;
  
 -      if (mmu_reset_needed)
 +      if (mmu_reset_needed) {
                kvm_mmu_reset_context(vcpu);
 +              kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 +      }
  
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
@@@ -11730,10 -11615,8 +11738,10 @@@ static int __set_sregs2(struct kvm_vcp
                mmu_reset_needed = 1;
                vcpu->arch.pdptrs_from_userspace = true;
        }
 -      if (mmu_reset_needed)
 +      if (mmu_reset_needed) {
                kvm_mmu_reset_context(vcpu);
 +              kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
 +      }
        return 0;
  }
  
@@@ -12084,7 -11967,7 +12092,7 @@@ void kvm_arch_vcpu_postcreate(struct kv
        if (mutex_lock_killable(&vcpu->mutex))
                return;
        vcpu_load(vcpu);
 -      kvm_synchronize_tsc(vcpu, 0);
 +      kvm_synchronize_tsc(vcpu, NULL);
        vcpu_put(vcpu);
  
        /* poll control enabled by default */
@@@ -12440,6 -12323,7 +12448,6 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
                goto out_uninit_mmu;
  
        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 -      INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
  
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@@ -12967,9 -12851,6 +12975,9 @@@ static inline bool kvm_vcpu_has_events(
                return true;
  #endif
  
 +      if (kvm_test_request(KVM_REQ_PMI, vcpu))
 +              return true;
 +
        if (kvm_arch_interrupt_allowed(vcpu) &&
            (kvm_cpu_has_interrupt(vcpu) ||
            kvm_guest_apic_has_interrupt(vcpu)))
@@@ -13315,30 -13196,15 +13323,30 @@@ bool noinstr kvm_arch_has_assigned_devi
  }
  EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
  
 +static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
 +{
 +      /*
 +       * Non-coherent DMA assignment and de-assignment will affect
 +       * whether KVM honors guest MTRRs and cause changes in memtypes
 +       * in TDP.
 +       * So, pass %true unconditionally to indicate non-coherent DMA was,
 +       * or will be involved, and that zapping SPTEs might be necessary.
 +       */
 +      if (__kvm_mmu_honors_guest_mtrrs(true))
 +              kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
 +}
 +
  void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
  {
 -      atomic_inc(&kvm->arch.noncoherent_dma_count);
 +      if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
 +              kvm_noncoherent_dma_assignment_start_or_stop(kvm);
  }
  EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
  
  void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
  {
 -      atomic_dec(&kvm->arch.noncoherent_dma_count);
 +      if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
 +              kvm_noncoherent_dma_assignment_start_or_stop(kvm);
  }
  EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
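
For completeness, a minimal userspace-side sketch (not part of this diff) of the behavior change described in the first item of the commit message: a SHUTDOWN intercept in an SEV-ES guest now surfaces as a normal KVM_RUN exit with exit_reason == KVM_EXIT_SHUTDOWN, rather than as an -EINVAL error from the ioctl. The vcpu_fd and the mmap()ed kvm_run pointer are assumed to have been set up via the usual KVM_CREATE_VM / KVM_CREATE_VCPU / mmap sequence.

/* Minimal illustration of observing KVM_EXIT_SHUTDOWN from userspace. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_fd: from KVM_CREATE_VCPU; run: the mmap()ed struct kvm_run. */
static int run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
                /* Previously, SHUTDOWN in an SEV-ES guest landed here as -EINVAL. */
                fprintf(stderr, "KVM_RUN failed: %s\n", strerror(errno));
                return -1;
        }

        switch (run->exit_reason) {
        case KVM_EXIT_SHUTDOWN:
                /* Now reported for SEV-ES SHUTDOWN intercepts as well. */
                printf("guest shut down (e.g. triple fault)\n");
                return 0;
        default:
                printf("unhandled exit reason %u\n", run->exit_reason);
                return 0;
        }
}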
  