Merge tag 'kvm-x86-selftests-6.4' of https://github.com/kvm-x86/linux into HEAD

[linux.git] / arch / x86 / kvm / x86.c
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 7713420abab093b19d6c19e41d9ca9454c006a08..6a41bdb7f599805168762336c13f37c18cda8333 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -194,7 +194,7 @@ bool __read_mostly eager_page_split = true;
  module_param(eager_page_split, bool, 0644);
  
  /* Enable/disable SMT_RSB bug mitigation */
-bool __read_mostly mitigate_smt_rsb;
+static bool __read_mostly mitigate_smt_rsb;
  module_param(mitigate_smt_rsb, bool, 0444);
  
  /*
@@ -802,8 +802,8 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
          */
         if ((fault->error_code & PFERR_PRESENT_MASK) &&
             !(fault->error_code & PFERR_RSVD_MASK))
-               kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
-                                      fault_mmu->root.hpa);
+               kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+                                       KVM_MMU_ROOT_CURRENT);
  
         fault_mmu->inject_page_fault(vcpu, fault);
  }
@@ -841,7 +841,7 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
  
  bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
  {
-       if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+       if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
                 return true;
  
         kvm_queue_exception(vcpu, UD_VECTOR);
@@ -906,6 +906,24 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
  
  void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
  {
+       /*
+        * CR0.WP is incorporated into the MMU role, but only for non-nested,
+        * indirect shadow MMUs.  If paging is disabled, no updates are needed
+        * as there are no permission bits to emulate.  If TDP is enabled, the
+        * MMU's metadata needs to be updated, e.g. so that emulating guest
+        * translations does the right thing, but there's no need to unload the
+        * root as CR0.WP doesn't affect SPTEs.
+        */
+       if ((cr0 ^ old_cr0) == X86_CR0_WP) {
+               if (!(cr0 & X86_CR0_PG))
+                       return;
+
+               if (tdp_enabled) {
+                       kvm_init_mmu(vcpu);
+                       return;
+               }
+       }
+
         if ((cr0 ^ old_cr0) & X86_CR0_PG) {
                 kvm_clear_async_pf_completion_queue(vcpu);
                 kvm_async_pf_hash_reset(vcpu);
@@ -965,7 +983,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                 return 1;
  
         if (!(cr0 & X86_CR0_PG) &&
-           (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
+           (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
                 return 1;
  
         static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -987,7 +1005,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
         if (vcpu->arch.guest_state_protected)
                 return;
  
-       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+       if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
  
                 if (vcpu->arch.xcr0 != host_xcr0)
                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
@@ -1001,7 +1019,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
         if (static_cpu_has(X86_FEATURE_PKU) &&
             vcpu->arch.pkru != vcpu->arch.host_pkru &&
             ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
-            kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
+            kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
                 write_pkru(vcpu->arch.pkru);
  #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
  }
@@ -1015,14 +1033,14 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
  #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
         if (static_cpu_has(X86_FEATURE_PKU) &&
             ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
-            kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
+            kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
                 vcpu->arch.pkru = rdpkru();
                 if (vcpu->arch.pkru != vcpu->arch.host_pkru)
                         write_pkru(vcpu->arch.host_pkru);
         }
  #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
  
-       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+       if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
  
                 if (vcpu->arch.xcr0 != host_xcr0)
                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
@@ -1178,9 +1196,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                 return 1;
  
         if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
-               if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
-                       return 1;
-
                 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
                 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
                         return 1;
@@ -1227,7 +1242,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
          * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
          * with PCIDE=0.
          */
-       if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+       if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
                 return;
  
         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
@@ -1242,9 +1257,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
         bool skip_tlb_flush = false;
         unsigned long pcid = 0;
  #ifdef CONFIG_X86_64
-       bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
-       if (pcid_enabled) {
+       if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
                 skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
                 cr3 &= ~X86_CR3_PCID_NOFLUSH;
                 pcid = cr3 & X86_CR3_PCID_MASK;
@@ -1543,38 +1556,40 @@ static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
  static unsigned num_emulated_msrs;
  
  /*
- * List of msr numbers which are used to expose MSR-based features that
- * can be used by a hypervisor to validate requested CPU features.
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs.  VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
   */
-static const u32 msr_based_features_all[] = {
-       MSR_IA32_VMX_BASIC,
-       MSR_IA32_VMX_TRUE_PINBASED_CTLS,
-       MSR_IA32_VMX_PINBASED_CTLS,
-       MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
-       MSR_IA32_VMX_PROCBASED_CTLS,
-       MSR_IA32_VMX_TRUE_EXIT_CTLS,
-       MSR_IA32_VMX_EXIT_CTLS,
-       MSR_IA32_VMX_TRUE_ENTRY_CTLS,
-       MSR_IA32_VMX_ENTRY_CTLS,
-       MSR_IA32_VMX_MISC,
-       MSR_IA32_VMX_CR0_FIXED0,
-       MSR_IA32_VMX_CR0_FIXED1,
-       MSR_IA32_VMX_CR4_FIXED0,
-       MSR_IA32_VMX_CR4_FIXED1,
-       MSR_IA32_VMX_VMCS_ENUM,
-       MSR_IA32_VMX_PROCBASED_CTLS2,
-       MSR_IA32_VMX_EPT_VPID_CAP,
-       MSR_IA32_VMX_VMFUNC,
-
+static const u32 msr_based_features_all_except_vmx[] = {
         MSR_AMD64_DE_CFG,
         MSR_IA32_UCODE_REV,
         MSR_IA32_ARCH_CAPABILITIES,
         MSR_IA32_PERF_CAPABILITIES,
  };
  
-static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+                             (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
  static unsigned int num_msr_based_features;
  
+/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
+ */
+static bool kvm_is_immutable_feature_msr(u32 msr)
+{
+       int i;
+
+       if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
+               return true;
+
+       for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+               if (msr == msr_based_features_all_except_vmx[i])
+                       return msr != MSR_IA32_UCODE_REV;
+       }
+
+       return false;
+}
+
  /*
   * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
   * does not yet virtualize. These include:
@@ -2192,6 +2207,22 @@ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
  
  static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
  {
+       u64 val;
+
+       /*
+        * Disallow writes to immutable feature MSRs after KVM_RUN.  KVM does
+        * not support modifying the guest vCPU model on the fly, e.g. changing
+        * the nVMX capabilities while L2 is running is nonsensical.  Ignore
+        * writes of the same value, e.g. to allow userspace to blindly stuff
+        * all MSRs when emulating RESET.
+        */
+       if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
+               if (do_get_msr(vcpu, index, &val) || *data != val)
+                       return -EINVAL;
+
+               return 0;
+       }
+
         return kvm_set_msr_ignored_check(vcpu, index, *data, true);
  }
  
@@ -3614,9 +3645,40 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 if (data & ~kvm_caps.supported_perf_cap)
                         return 1;
  
+               /*
+                * Note, this is not just a performance optimization!  KVM
+                * disallows changing feature MSRs after the vCPU has run; PMU
+                * refresh will bug the VM if called after the vCPU has run.
+                */
+               if (vcpu->arch.perf_capabilities == data)
+                       break;
+
                 vcpu->arch.perf_capabilities = data;
                 kvm_pmu_refresh(vcpu);
-               return 0;
+               break;
+       case MSR_IA32_PRED_CMD:
+               if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
+                       return 1;
+
+               if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+                       return 1;
+               if (!data)
+                       break;
+
+               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+               break;
+       case MSR_IA32_FLUSH_CMD:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+                       return 1;
+
+               if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+                       return 1;
+               if (!data)
+                       break;
+
+               wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+               break;
         case MSR_EFER:
                 return set_efer(vcpu, msr_info);
         case MSR_K7_HWCR:
@@ -4531,9 +4593,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                         r = 0;
                 break;
         case KVM_CAP_XSAVE2: {
-               u64 guest_perm = xstate_get_guest_group_perm();
-
-               r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
+               r = xstate_required_size(kvm_get_filtered_xcr0(), false);
                 if (r < sizeof(struct kvm_xsave))
                         r = sizeof(struct kvm_xsave);
                 break;
@@ -5033,7 +5093,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
                 return 0;
         if (mce->status & MCI_STATUS_UC) {
                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
-                   !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
+                   !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) {
                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                         return 0;
                 }
@@ -6021,11 +6081,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
         return 0;
  }
  
-static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
-{
-       return kvm->arch.n_max_mmu_pages;
-}
-
  static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
  {
         struct kvm_pic *pic = kvm->arch.vpic;
@@ -6672,8 +6727,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
         return 0;
  }
  
-long kvm_arch_vm_ioctl(struct file *filp,
-                      unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
  {
         struct kvm *kvm = filp->private_data;
         void __user *argp = (void __user *)arg;
@@ -6711,9 +6765,6 @@ set_identity_unlock:
         case KVM_SET_NR_MMU_PAGES:
                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
                 break;
-       case KVM_GET_NR_MMU_PAGES:
-               r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
-               break;
         case KVM_CREATE_IRQCHIP: {
                 mutex_lock(&kvm->lock);
  
@@ -7018,6 +7069,18 @@ out:
         return r;
  }
  
+static void kvm_probe_feature_msr(u32 msr_index)
+{
+       struct kvm_msr_entry msr = {
+               .index = msr_index,
+       };
+
+       if (kvm_get_msr_feature(&msr))
+               return;
+
+       msr_based_features[num_msr_based_features++] = msr_index;
+}
+
  static void kvm_probe_msr_to_save(u32 msr_index)
  {
         u32 dummy[2];
@@ -7093,7 +7156,7 @@ static void kvm_probe_msr_to_save(u32 msr_index)
         msrs_to_save[num_msrs_to_save++] = msr_index;
  }
  
-static void kvm_init_msr_list(void)
+static void kvm_init_msr_lists(void)
  {
         unsigned i;
  
@@ -7119,15 +7182,11 @@ static void kvm_init_msr_list(void)
                 emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
         }
  
-       for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
-               struct kvm_msr_entry msr;
-
-               msr.index = msr_based_features_all[i];
-               if (kvm_get_msr_feature(&msr))
-                       continue;
+       for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++)
+               kvm_probe_feature_msr(i);
  
-               msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
-       }
+       for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++)
+               kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]);
  }
  
  static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -8463,7 +8522,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
  }
  
  static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                                 bool write_fault_to_shadow_pgtable,
                                   int emulation_type)
  {
         gpa_t gpa = cr2_or_gpa;
@@ -8534,7 +8592,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
          * be fixed by unprotecting shadow page and it should
          * be reported to userspace.
          */
-       return !write_fault_to_shadow_pgtable;
+       return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
  }
  
  static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -8782,20 +8840,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         int r;
         struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
         bool writeback = true;
-       bool write_fault_to_spt;
  
         if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
                 return 1;
  
         vcpu->arch.l1tf_flush_l1d = true;
  
-       /*
-        * Clear write_fault_to_shadow_pgtable here to ensure it is
-        * never reused.
-        */
-       write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
-       vcpu->arch.write_fault_to_shadow_pgtable = false;
-
         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                 kvm_clear_exception_queue(vcpu);
  
@@ -8816,7 +8866,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                                 return 1;
                         }
                         if (reexecute_instruction(vcpu, cr2_or_gpa,
-                                                 write_fault_to_spt,
                                                   emulation_type))
                                 return 1;
  
@@ -8895,8 +8944,7 @@ restart:
                 return 1;
  
         if (r == EMULATION_FAILED) {
-               if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
-                                       emulation_type))
+               if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
                         return 1;
  
                 return handle_emulation_failure(vcpu, emulation_type);
@@ -9472,7 +9520,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
                 kvm_caps.max_guest_tsc_khz = max;
         }
         kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
-       kvm_init_msr_list();
+       kvm_init_msr_lists();
         return 0;
  
  out_unwind_ops:
@@ -9803,7 +9851,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                 vcpu->run->hypercall.args[0]  = gpa;
                 vcpu->run->hypercall.args[1]  = npages;
                 vcpu->run->hypercall.args[2]  = attrs;
-               vcpu->run->hypercall.longmode = op_64_bit;
+               vcpu->run->hypercall.flags    = 0;
+               if (op_64_bit)
+                       vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
+
+               WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
                 vcpu->arch.complete_userspace_io = complete_hypercall_exit;
                 return 0;
         }
@@ -13256,7 +13308,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
                 return 1;
         }
  
-       pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+       pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE);
  
         switch (type) {
         case INVPCID_TYPE_INDIV_ADDR: