Merge tag 'kvm-x86-mtrrs-6.11' of https://github.com/kvm-x86/linux into HEAD

author Paolo Bonzini <[email protected]>

Tue, 16 Jul 2024 13:54:57 +0000 (09:54 -0400)

committer Paolo Bonzini <[email protected]>

Tue, 16 Jul 2024 13:54:57 +0000 (09:54 -0400)
author Paolo Bonzini <[email protected]>
Tue, 16 Jul 2024 13:54:57 +0000 (09:54 -0400)
committer Paolo Bonzini <[email protected]>
Tue, 16 Jul 2024 13:54:57 +0000 (09:54 -0400)
diff --combined Documentation/virt/kvm/api.rst

index 798ad65f4feee56d52eeff3ae4c4615b9834fda6,2981673637069d650ed347a2c48a521ea5844ef9..8e5dad80b337ab69a6f13a5b6fd34fc1ca69b19d
--- 1/Documentation/virt/kvm/api.rst
--- 2/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@@ -891,12 -891,12 +891,12 @@@ like this:
   
   The irq_type field has the following values:
   
- -- irq_type[0]:
+ +- KVM_ARM_IRQ_TYPE_CPU:
                out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ
- -- irq_type[1]:
+ +- KVM_ARM_IRQ_TYPE_SPI:
                in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.)
                  (the vcpu_index field is ignored)
- -- irq_type[2]:
+ +- KVM_ARM_IRQ_TYPE_PPI:
                in-kernel GIC: PPI, irq_id between 16 and 31 (incl.)
   
   (The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
@@@ -1403,12 -1403,6 +1403,12 @@@ Instead, an abort (data abort if the ca
   was a load or a store, instruction abort if it was an instruction
   fetch) is injected in the guest.
   
+ +S390:
+ +^^^^^
+ +
+ +Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
+ +Returns -EINVAL if called on a protected VM.
+ +
   4.36 KVM_SET_TSS_ADDR
   ---------------------
   
@@@ -1927,7 -1921,7 +1927,7 @@@ flags
   
   If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier
   for the device that wrote the MSI message.  For PCI, this is usually a
- -BFD identifier in the lower 16 bits.
+ +BDF identifier in the lower 16 bits.
   
   On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
   feature of KVM_CAP_X2APIC_API capability is enabled.  If it is enabled,
@@@ -2992,7 -2986,7 +2992,7 @@@ flags
   
   If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier
   for the device that wrote the MSI message.  For PCI, this is usually a
- -BFD identifier in the lower 16 bits.
+ +BDF identifier in the lower 16 bits.
   
   On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
   feature of KVM_CAP_X2APIC_API capability is enabled.  If it is enabled,
@@@ -6279,12 -6273,6 +6279,12 @@@ state.  At VM creation time, all memor
   is '0' for all gfns.  Userspace can control whether memory is shared/private by
   toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed.
   
+ +S390:
+ +^^^^^
+ +
+ +Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
+ +Returns -EINVAL if called on a protected VM.
+ +
   4.141 KVM_SET_MEMORY_ATTRIBUTES
   -------------------------------
   
@@@ -6364,61 -6352,6 +6364,61 @@@ a single guest_memfd file, but the boun
   
   See KVM_SET_USER_MEMORY_REGION2 for additional details.
   
+ +4.143 KVM_PRE_FAULT_MEMORY
+ +--------------------------
+ +
+ +:Capability: KVM_CAP_PRE_FAULT_MEMORY
+ +:Architectures: none
+ +:Type: vcpu ioctl
+ +:Parameters: struct kvm_pre_fault_memory (in/out)
+ +:Returns: 0 if at least one page is processed, < 0 on error
+ +
+ +Errors:
+ +
+ +  ========== ===============================================================
+ +  EINVAL     The specified `gpa` and `size` were invalid (e.g. not
+ +             page aligned, causes an overflow, or size is zero).
+ +  ENOENT     The specified `gpa` is outside defined memslots.
+ +  EINTR      An unmasked signal is pending and no page was processed.
+ +  EFAULT     The parameter address was invalid.
+ +  EOPNOTSUPP Mapping memory for a GPA is unsupported by the
+ +             hypervisor, and/or for the current vCPU state/mode.
+ +  EIO        unexpected error conditions (also causes a WARN)
+ +  ========== ===============================================================
+ +
+ +::
+ +
+ +  struct kvm_pre_fault_memory {
+ +      /* in/out */
+ +      __u64 gpa;
+ +      __u64 size;
+ +      /* in */
+ +      __u64 flags;
+ +      __u64 padding[5];
+ +  };
+ +
+ +KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
+ +for the current vCPU state.  KVM maps memory as if the vCPU generated a
+ +stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
+ +CoW.  However, KVM does not mark any newly created stage-2 PTE as Accessed.
+ +
+ +In some cases, multiple vCPUs might share the page tables.  In this
+ +case, the ioctl can be called in parallel.
+ +
+ +When the ioctl returns, the input values are updated to point to the
+ +remaining range.  If `size` > 0 on return, the caller can just issue
+ +the ioctl again with the same `struct kvm_pre_fault_memory` argument.
+ +
+ +Shadow page tables cannot support this ioctl because they
+ +are indexed by virtual address or nested guest physical address.
+ +Calling this ioctl when the guest is using shadow page tables (for
+ +example because it is running a nested guest with nested page tables)
+ +will fail with `EOPNOTSUPP` even if `KVM_CHECK_EXTENSION` reports
+ +the capability to be present.
+ +
+ +`flags` must currently be zero.
+ +
+ +
   5. The kvm_run structure
   ========================
   
@@@ -6483,12 -6416,9 +6483,12 @@@ More architecture-specific flags detail
   affect the device's behavior. Current defined flags::
   
     /* x86, set if the VCPU is in system management mode */
- -  #define KVM_RUN_X86_SMM     (1 << 0)
+ +  #define KVM_RUN_X86_SMM          (1 << 0)
     /* x86, set if bus lock detected in VM */
- -  #define KVM_RUN_BUS_LOCK    (1 << 1)
+ +  #define KVM_RUN_X86_BUS_LOCK     (1 << 1)
+ +  /* x86, set if the VCPU is executing a nested (L2) guest */
+ +  #define KVM_RUN_X86_GUEST_MODE   (1 << 2)
+ +
     /* arm64, set for KVM_EXIT_DEBUG */
     #define KVM_DEBUG_ARCH_HSR_HIGH_VALID  (1 << 0)
   
@@@ -7834,31 -7764,29 +7834,31 @@@ Valid bits in args[0] are:
     #define KVM_BUS_LOCK_DETECTION_OFF      (1 << 0)
     #define KVM_BUS_LOCK_DETECTION_EXIT     (1 << 1)
   
- -Enabling this capability on a VM provides userspace with a way to select
- -a policy to handle the bus locks detected in guest. Userspace can obtain
- -the supported modes from the result of KVM_CHECK_EXTENSION and define it
- -through the KVM_ENABLE_CAP.
+ +Enabling this capability on a VM provides userspace with a way to select a
+ +policy to handle the bus locks detected in guest. Userspace can obtain the
+ +supported modes from the result of KVM_CHECK_EXTENSION and define it through
+ +the KVM_ENABLE_CAP. The supported modes are mutually-exclusive.
   
- -KVM_BUS_LOCK_DETECTION_OFF and KVM_BUS_LOCK_DETECTION_EXIT are supported
- -currently and mutually exclusive with each other. More bits can be added in
- -the future.
+ +This capability allows userspace to force VM exits on bus locks detected in the
+ +guest, irrespective of whether or not the host has enabled split-lock detection
+ +(which triggers an #AC exception that KVM intercepts). This capability is
+ +intended to mitigate attacks where a malicious/buggy guest can exploit bus
+ +locks to degrade the performance of the whole system.
   
- -With KVM_BUS_LOCK_DETECTION_OFF set, bus locks in guest will not cause vm exits
- -so that no additional actions are needed. This is the default mode.
+ +If KVM_BUS_LOCK_DETECTION_OFF is set, KVM doesn't force guest bus locks to VM
+ +exit, although the host kernel's split-lock #AC detection still applies, if
+ +enabled.
   
- -With KVM_BUS_LOCK_DETECTION_EXIT set, vm exits happen when bus lock detected
- -in VM. KVM just exits to userspace when handling them. Userspace can enforce
- -its own throttling or other policy based mitigations.
+ +If KVM_BUS_LOCK_DETECTION_EXIT is set, KVM enables a CPU feature that ensures
+ +bus locks in the guest trigger a VM exit, and KVM exits to userspace for all
+ +such VM exits, e.g. to allow userspace to throttle the offending guest and/or
+ +apply some other policy-based mitigation. When exiting to userspace, KVM sets
+ +KVM_RUN_X86_BUS_LOCK in vcpu-run->flags, and conditionally sets the exit_reason
+ +to KVM_EXIT_X86_BUS_LOCK.
   
- -This capability is aimed to address the thread that VM can exploit bus locks to
- -degree the performance of the whole system. Once the userspace enable this
- -capability and select the KVM_BUS_LOCK_DETECTION_EXIT mode, KVM will set the
- -KVM_RUN_BUS_LOCK flag in vcpu-run->flags field and exit to userspace. Concerning
- -the bus lock vm exit can be preempted by a higher priority VM exit, the exit
- -notifications to userspace can be KVM_EXIT_BUS_LOCK or other reasons.
- -KVM_RUN_BUS_LOCK flag is used to distinguish between them.
+ +Note! Detected bus locks may be coincident with other exits to userspace, i.e.
+ +KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if
+ +userspace wants to take action on all detected bus locks.
   
   7.23 KVM_CAP_PPC_DAWR1
   ----------------------
@@@ -7974,10 -7902,10 +7974,10 @@@ perform a bulk copy of tags to/from th
   7.29 KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM
   -------------------------------------
   
- -Architectures: x86 SEV enabled
- -Type: vm
- -Parameters: args[0] is the fd of the source vm
- -Returns: 0 on success
+ +:Architectures: x86 SEV enabled
+ +:Type: vm
+ +:Parameters: args[0] is the fd of the source vm
+ +:Returns: 0 on success
   
   This capability enables userspace to migrate the encryption context from the VM
   indicated by the fd to the VM this is called on.
@@@ -8025,7 -7953,11 +8025,11 @@@ The valid bits in cap.args[0] are
                                       When this quirk is disabled, the reset value
                                       is 0x10000 (APIC_LVT_MASKED).
   
-  KVM_X86_QUIRK_CD_NW_CLEARED        By default, KVM clears CR0.CD and CR0.NW.
+  KVM_X86_QUIRK_CD_NW_CLEARED        By default, KVM clears CR0.CD and CR0.NW on
+                                     AMD CPUs to workaround buggy guest firmware
+                                     that runs in perpetuity with CR0.CD, i.e.
+                                     with caches in "no fill" mode.
+ 
                                       When this quirk is disabled, KVM does not
                                       change the value of CR0.CD and CR0.NW.
   
@@@ -8142,37 -8074,6 +8146,37 @@@ error/annotated fault
   
   See KVM_EXIT_MEMORY_FAULT for more information.
   
+ +7.35 KVM_CAP_X86_APIC_BUS_CYCLES_NS
+ +-----------------------------------
+ +
+ +:Architectures: x86
+ +:Target: VM
+ +:Parameters: args[0] is the desired APIC bus clock rate, in nanoseconds
+ +:Returns: 0 on success, -EINVAL if args[0] contains an invalid value for the
+ +          frequency or if any vCPUs have been created, -ENXIO if a virtual
+ +          local APIC has not been created using KVM_CREATE_IRQCHIP.
+ +
+ +This capability sets the VM's APIC bus clock frequency, used by KVM's in-kernel
+ +virtual APIC when emulating APIC timers.  KVM's default value can be retrieved
+ +by KVM_CHECK_EXTENSION.
+ +
+ +Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the
+ +core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest.
+ +
+ +7.36 KVM_CAP_X86_GUEST_MODE
+ +------------------------------
+ +
+ +:Architectures: x86
+ +:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
+ +
+ +The presence of this capability indicates that KVM_RUN will update the
+ +KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the
+ +vCPU was executing nested guest code when it exited.
+ +
+ +KVM exits with the register state of either the L1 or L2 guest
+ +depending on which executed at the time of an exit. Userspace must
+ +take care to differentiate between these cases.
+ +
   8. Other capabilities.
   ======================
   
diff --combined arch/x86/include/asm/kvm_host.h

index d5101f52e76cf6c9ba3298375c8a326ad8a38076,e919b121cd3f59cd1303f38b03d923a9010dbe10..210408361e9a244a589fd7838a36ce4c21fff339
--- 1/arch/x86/include/asm/kvm_host.h
--- 2/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@@ -121,7 -121,6 +121,7 @@@
         KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
   #define KVM_REQ_HV_TLB_FLUSH \
         KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+ +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE  KVM_ARCH_REQ(34)
   
   #define CR0_RESERVED_BITS                                               \
         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@@ -160,7 -159,6 +160,6 @@@
   #define KVM_MIN_FREE_MMU_PAGES 5
   #define KVM_REFILL_PAGES 25
   #define KVM_MAX_CPUID_ENTRIES 256
- #define KVM_NR_FIXED_MTRR_REGION 88
   #define KVM_NR_VAR_MTRR 8
   
   #define ASYNC_PF_PER_VCPU 64
@@@ -605,18 -603,12 +604,12 @@@ enum 
         KVM_DEBUGREG_WONT_EXIT = 2,
   };
   
- struct kvm_mtrr_range {
-       u64 base;
-       u64 mask;
-       struct list_head node;
- };
- 
   struct kvm_mtrr {
-       struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
-       mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+       u64 var[KVM_NR_VAR_MTRR * 2];
+       u64 fixed_64k;
+       u64 fixed_16k[2];
+       u64 fixed_4k[8];
         u64 deftype;
- 
-       struct list_head head;
   };
   
   /* Hyper-V SynIC timer */
@@@ -1208,7 -1200,7 +1201,7 @@@ enum kvm_apicv_inhibit 
          * APIC acceleration is disabled by a module parameter
          * and/or not supported in hardware.
          */
- -      APICV_INHIBIT_REASON_DISABLE,
+ +      APICV_INHIBIT_REASON_DISABLED,
   
         /*
          * APIC acceleration is inhibited because AutoEOI feature is
@@@ -1278,27 -1270,8 +1271,27 @@@
          * mapping between logical ID and vCPU.
          */
         APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+ +
+ +      NR_APICV_INHIBIT_REASONS,
   };
   
+ +#define __APICV_INHIBIT_REASON(reason)                        \
+ +      { BIT(APICV_INHIBIT_REASON_##reason), #reason }
+ +
+ +#define APICV_INHIBIT_REASONS                         \
+ +      __APICV_INHIBIT_REASON(DISABLED),               \
+ +      __APICV_INHIBIT_REASON(HYPERV),                 \
+ +      __APICV_INHIBIT_REASON(ABSENT),                 \
+ +      __APICV_INHIBIT_REASON(BLOCKIRQ),               \
+ +      __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED),    \
+ +      __APICV_INHIBIT_REASON(APIC_ID_MODIFIED),       \
+ +      __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED),     \
+ +      __APICV_INHIBIT_REASON(NESTED),                 \
+ +      __APICV_INHIBIT_REASON(IRQWIN),                 \
+ +      __APICV_INHIBIT_REASON(PIT_REINJ),              \
+ +      __APICV_INHIBIT_REASON(SEV),                    \
+ +      __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+ +
   struct kvm_arch {
         unsigned long n_used_mmu_pages;
         unsigned long n_requested_mmu_pages;
@@@ -1384,7 -1357,6 +1377,7 @@@
   
         u32 default_tsc_khz;
         bool user_set_tsc;
+ +      u64 apic_bus_cycle_ns;
   
         seqcount_raw_spinlock_t pvclock_sc;
         bool use_master_clock;
@@@ -1729,6 -1701,7 +1722,6 @@@ struct kvm_x86_ops 
         void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
         void (*enable_irq_window)(struct kvm_vcpu *vcpu);
         void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- -      bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
         const unsigned long required_apicv_inhibits;
         bool allow_apicv_in_x2apic_without_x2apic_virtualization;
         void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
@@@ -1769,6 -1742,8 +1762,6 @@@
                                struct x86_exception *exception);
         void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
   
- -      void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
- -
         /*
          * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer.  A zero
          * value indicates CPU dirty logging is unsupported or disabled.
@@@ -1830,9 -1805,6 +1823,9 @@@
   
         gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
         void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
+ +      int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+ +      void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
+ +      int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
   };
   
   struct kvm_x86_nested_ops {
@@@ -1874,6 -1846,7 +1867,6 @@@ struct kvm_arch_async_pf 
   };
   
   extern u32 __read_mostly kvm_nr_uret_msrs;
- -extern u64 __read_mostly host_efer;
   extern bool __read_mostly allow_smaller_maxphyaddr;
   extern bool __read_mostly enable_apicv;
   extern struct kvm_x86_ops kvm_x86_ops;
@@@ -1959,7 -1932,6 +1952,7 @@@ void kvm_mmu_slot_leaf_clear_dirty(stru
                                    const struct kvm_memory_slot *memslot);
   void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
   void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
+ +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
   
   int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
   
@@@ -2175,7 -2147,6 +2168,7 @@@ int kvm_emulate_hypercall(struct kvm_vc
   
   int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                        void *insn, int insn_len);
+ +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
   void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
   void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                              u64 addr, unsigned long roots);
diff --combined arch/x86/kvm/mmu.h

index f2e7e5c9b9efd045a7cc2df70f4998e26ab4e8c2,eb2de8eb6e4691d7ee226531247e86ec0ac0bdb6..24ea7183d7b4f836df6725efde1a07d5ed3718a8
--- 1/arch/x86/kvm/mmu.h
--- 2/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@@ -57,6 -57,12 +57,6 @@@ static __always_inline u64 rsvd_bits(in
         return ((2ULL << (e - s)) - 1) << s;
   }
   
- -/*
- - * The number of non-reserved physical address bits irrespective of features
- - * that repurpose legal bits, e.g. MKTME.
- - */
- -extern u8 __read_mostly shadow_phys_bits;
- -
   static inline gfn_t kvm_mmu_max_gfn(void)
   {
         /*
@@@ -70,11 -76,30 +70,11 @@@
          * than hardware's real MAXPHYADDR.  Using the host MAXPHYADDR
          * disallows such SPTEs entirely and simplifies the TDP MMU.
          */
- -      int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+ +      int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52;
   
         return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
   }
   
- -static inline u8 kvm_get_shadow_phys_bits(void)
- -{
- -      /*
- -       * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
- -       * in CPU detection code, but the processor treats those reduced bits as
- -       * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
- -       * the physical address bits reported by CPUID.
- -       */
- -      if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
- -              return cpuid_eax(0x80000008) & 0xff;
- -
- -      /*
- -       * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
- -       * custom CPUID.  Proceed with whatever the kernel found since these features
- -       * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
- -       */
- -      return boot_cpu_data.x86_phys_bits;
- -}
- -
   u8 kvm_mmu_get_max_tdp_level(void);
   
   void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
@@@ -221,13 -246,10 +221,8 @@@ static inline u8 permission_fault(struc
         return -(u32)fault & errcode;
   }
   
- bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
- 
- static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
- {
-       return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
- }
+ bool kvm_mmu_may_ignore_guest_pat(void);
   
- -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
- -
   int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
   
   int kvm_mmu_post_init_vm(struct kvm *kvm);
diff --combined arch/x86/kvm/mmu/mmu.c

index d3b8e4fad924fb7c5ecaf95b616b605b5beb8bf6,77ca81e13722d48757649ae23595bbd00a7a7729..ee13fc7b0e27387254d2c540010aec4103337859
--- 1/arch/x86/kvm/mmu/mmu.c
--- 2/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@@ -336,19 -336,16 +336,19 @@@ static int is_cpuid_PSE36(void
   #ifdef CONFIG_X86_64
   static void __set_spte(u64 *sptep, u64 spte)
   {
+ +      KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
         WRITE_ONCE(*sptep, spte);
   }
   
   static void __update_clear_spte_fast(u64 *sptep, u64 spte)
   {
+ +      KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
         WRITE_ONCE(*sptep, spte);
   }
   
   static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
   {
+ +      KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
         return xchg(sptep, spte);
   }
   
@@@ -722,7 -719,7 +722,7 @@@ static gfn_t kvm_mmu_page_get_gfn(struc
         if (sp->role.passthrough)
                 return sp->gfn;
   
- -      if (!sp->role.direct)
+ +      if (sp->shadowed_translation)
                 return sp->shadowed_translation[index] >> PAGE_SHIFT;
   
         return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
@@@ -736,7 -733,7 +736,7 @@@
    */
   static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
   {
- -      if (sp_has_gptes(sp))
+ +      if (sp->shadowed_translation)
                 return sp->shadowed_translation[index] & ACC_ALL;
   
         /*
@@@ -757,7 -754,7 +757,7 @@@
   static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
                                          gfn_t gfn, unsigned int access)
   {
- -      if (sp_has_gptes(sp)) {
+ +      if (sp->shadowed_translation) {
                 sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
                 return;
         }
@@@ -1700,7 -1697,8 +1700,7 @@@ static void kvm_mmu_free_shadow_page(st
         hlist_del(&sp->hash_link);
         list_del(&sp->link);
         free_page((unsigned long)sp->spt);
- -      if (!sp->role.direct)
- -              free_page((unsigned long)sp->shadowed_translation);
+ +      free_page((unsigned long)sp->shadowed_translation);
         kmem_cache_free(mmu_page_header_cache, sp);
   }
   
@@@ -2202,7 -2200,7 +2202,7 @@@ static struct kvm_mmu_page *kvm_mmu_all
   
         sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
         sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
- -      if (!role.direct)
+ +      if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
                 sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
   
         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
@@@ -3307,7 -3305,7 +3307,7 @@@ static int kvm_handle_noslot_fault(stru
         return RET_PF_CONTINUE;
   }
   
- -static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
+ +static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
   {
         /*
          * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
@@@ -3318,26 -3316,6 +3318,26 @@@
         if (fault->rsvd)
                 return false;
   
+ +      /*
+ +       * For hardware-protected VMs, certain conditions like attempting to
+ +       * perform a write to a page which is not in the state that the guest
+ +       * expects it to be in can result in a nested/extended #PF. In this
+ +       * case, the below code might misconstrue this situation as being the
+ +       * result of a write-protected access, and treat it as a spurious case
+ +       * rather than taking any action to satisfy the real source of the #PF
+ +       * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
+ +       * guest spinning on a #PF indefinitely, so don't attempt the fast path
+ +       * in this case.
+ +       *
+ +       * Note that the kvm_mem_is_private() check might race with an
+ +       * attribute update, but this will either result in the guest spinning
+ +       * on RET_PF_SPURIOUS until the update completes, or an actual spurious
+ +       * case might go down the slow path. Either case will resolve itself.
+ +       */
+ +      if (kvm->arch.has_private_mem &&
+ +          fault->is_private != kvm_mem_is_private(kvm, fault->gfn))
+ +              return false;
+ +
         /*
          * #PF can be fast if:
          *
@@@ -3438,7 -3416,7 +3438,7 @@@ static int fast_page_fault(struct kvm_v
         u64 *sptep;
         uint retry_count = 0;
   
- -      if (!page_fault_can_be_fast(fault))
+ +      if (!page_fault_can_be_fast(vcpu->kvm, fault))
                 return ret;
   
         walk_shadow_page_lockless_begin(vcpu);
@@@ -3447,7 -3425,7 +3447,7 @@@
                 u64 new_spte;
   
                 if (tdp_mmu_enabled)
- -                      sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ +                      sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte);
                 else
                         sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
   
@@@ -3457,7 -3435,7 +3457,7 @@@
                  * available as the vCPU holds a reference to its root(s).
                  */
                 if (WARN_ON_ONCE(!sptep))
- -                      spte = REMOVED_SPTE;
+ +                      spte = FROZEN_SPTE;
   
                 if (!is_shadow_present_pte(spte))
                         break;
@@@ -4123,31 -4101,23 +4123,31 @@@ static int get_walk(struct kvm_vcpu *vc
         return leaf;
   }
   
- -/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
- -static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+ +static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ +                            int *root_level)
   {
- -      u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
- -      struct rsvd_bits_validate *rsvd_check;
- -      int root, leaf, level;
- -      bool reserved = false;
+ +      int leaf;
   
         walk_shadow_page_lockless_begin(vcpu);
   
         if (is_tdp_mmu_active(vcpu))
- -              leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
+ +              leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level);
         else
- -              leaf = get_walk(vcpu, addr, sptes, &root);
+ +              leaf = get_walk(vcpu, addr, sptes, root_level);
   
         walk_shadow_page_lockless_end(vcpu);
+ +      return leaf;
+ +}
+ +
+ +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
+ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+ +{
+ +      u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ +      struct rsvd_bits_validate *rsvd_check;
+ +      int root, leaf, level;
+ +      bool reserved = false;
   
+ +      leaf = get_sptes_lockless(vcpu, addr, sptes, &root);
         if (unlikely(leaf < 0)) {
                 *sptep = 0ull;
                 return reserved;
@@@ -4290,16 -4260,7 +4290,16 @@@ void kvm_arch_async_page_ready(struct k
               work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
                 return;
   
- -      kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
+ +      r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
+ +                                true, NULL, NULL);
+ +
+ +      /*
+ +       * Account fixed page faults, otherwise they'll never be counted, but
+ +       * ignore stats for all other return types.  Page-ready "faults" aren't
+ +       * truly spurious and never trigger emulation.
+ +       */
+ +      if (r == RET_PF_FIXED)
+ +              vcpu->stat.pf_fixed++;
   }
   
   static inline u8 kvm_max_level_for_order(int order)
@@@ -4319,25 -4280,6 +4319,25 @@@
         return PG_LEVEL_4K;
   }
   
+ +/*
+ + * Compute the maximum mapping level allowed for a private mapping of @pfn.
+ + *
+ + * The caller's @max_level is first clamped to the level implied by
+ + * @gmem_order, and then to the limit reported by the vendor hook.  A zero
+ + * return from the hook is treated as "no additional restriction" and is
+ + * ignored.  The result is always the clamped @max_level, never the raw
+ + * value returned by the hook.
+ + */
+ +static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
+ +                                      u8 max_level, int gmem_order)
+ +{
+ +      u8 req_max_level;
+ +
+ +      if (max_level == PG_LEVEL_4K)
+ +              return PG_LEVEL_4K;
+ +
+ +      max_level = min(kvm_max_level_for_order(gmem_order), max_level);
+ +      if (max_level == PG_LEVEL_4K)
+ +              return PG_LEVEL_4K;
+ +
+ +      /* Only honor the vendor limit when the hook actually imposes one. */
+ +      req_max_level = static_call(kvm_x86_private_max_mapping_level)(kvm, pfn);
+ +      if (req_max_level)
+ +              max_level = min(max_level, req_max_level);
+ +
+ +      return max_level;
+ +}
+ +
   static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
                                    struct kvm_page_fault *fault)
   {
@@@ -4355,9 -4297,9 +4355,9 @@@
                 return r;
         }
   
- -      fault->max_level = min(kvm_max_level_for_order(max_order),
- -                             fault->max_level);
         fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+ +      fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
+ +                                                       fault->max_level, max_order);
   
         return RET_PF_CONTINUE;
   }
@@@ -4458,6 -4400,9 +4458,6 @@@ static int kvm_faultin_pfn(struct kvm_v
                         return RET_PF_EMULATE;
         }
   
- -      fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
- -      smp_rmb();
- -
         /*
          * Check for a relevant mmu_notifier invalidation event before getting
          * the pfn from the primary MMU, and before acquiring mmu_lock.
@@@ -4608,10 -4553,7 +4608,10 @@@ int kvm_handle_page_fault(struct kvm_vc
         if (WARN_ON_ONCE(error_code >> 32))
                 error_code = lower_32_bits(error_code);
   
- -      /* Ensure the above sanity check also covers KVM-defined flags. */
+ +      /*
+ +       * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
+ +       * them to conflict with #PF error codes, which are limited to 32 bits.
+ +       */
         BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
   
         vcpu->arch.l1tf_flush_l1d = true;
@@@ -4671,38 -4613,23 +4671,23 @@@ out_unlock
   }
   #endif
   
- bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
+ bool kvm_mmu_may_ignore_guest_pat(void)
   {
         /*
-        * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
-        * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
-        * to honor the memtype from the guest's MTRRs so that guest accesses
-        * to memory that is DMA'd aren't cached against the guest's wishes.
-        *
-        * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
-        * e.g. KVM will force UC memtype for host MMIO.
+        * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
+        * not support self-snoop (or is affected by an erratum), and the VM
+        * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
+        * honor the memtype from the guest's PAT so that guest accesses to
+        * memory that is DMA'd aren't cached against the guest's wishes.  As a
+        * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
+        * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
+        * bits in response to non-coherent device (un)registration.
          */
-       return vm_has_noncoherent_dma && shadow_memtype_mask;
+       return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
   }
   
   int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   {
-       /*
-        * If the guest's MTRRs may be used to compute the "real" memtype,
-        * restrict the mapping level to ensure KVM uses a consistent memtype
-        * across the entire mapping.
-        */
-       if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
-               for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
-                       int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
-                       gfn_t base = gfn_round_for_level(fault->gfn,
-                                                        fault->max_level);
- 
-                       if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
-                               break;
-               }
-       }
- 
   #ifdef CONFIG_X86_64
         if (tdp_mmu_enabled)
                 return kvm_tdp_mmu_page_fault(vcpu, fault);
@@@ -4711,79 -4638,6 +4696,79 @@@
         return direct_page_fault(vcpu, fault);
   }
   
+ +static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+ +                          u8 *level)
+ +{
+ +      int r;
+ +
+ +      /*
+ +       * Restrict to TDP page fault, since that's the only case where the MMU
+ +       * is indexed by GPA.
+ +       */
+ +      if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+ +              return -EOPNOTSUPP;
+ +
+ +      do {
+ +              if (signal_pending(current))
+ +                      return -EINTR;
+ +              cond_resched();
+ +              r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
+ +      } while (r == RET_PF_RETRY);
+ +
+ +      if (r < 0)
+ +              return r;
+ +
+ +      switch (r) {
+ +      case RET_PF_FIXED:
+ +      case RET_PF_SPURIOUS:
+ +              return 0;
+ +
+ +      case RET_PF_EMULATE:
+ +              return -ENOENT;
+ +
+ +      case RET_PF_RETRY:
+ +      case RET_PF_CONTINUE:
+ +      case RET_PF_INVALID:
+ +      default:
+ +              WARN_ONCE(1, "could not fix page fault during prefault");
+ +              return -EIO;
+ +      }
+ +}
+ +
+ +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ +                                  struct kvm_pre_fault_memory *range)
+ +{
+ +      u64 error_code = PFERR_GUEST_FINAL_MASK;
+ +      u8 level = PG_LEVEL_4K;
+ +      u64 end;
+ +      int r;
+ +
+ +      /*
+ +       * reload is efficient when called repeatedly, so we can do it on
+ +       * every iteration.
+ +       */
+ +      kvm_mmu_reload(vcpu);
+ +
+ +      if (kvm_arch_has_private_mem(vcpu->kvm) &&
+ +          kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ +              error_code |= PFERR_PRIVATE_ACCESS;
+ +
+ +      /*
+ +       * Shadow paging uses GVA for kvm page fault, so restrict to
+ +       * two-dimensional paging.
+ +       */
+ +      r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ +      if (r < 0)
+ +              return r;
+ +
+ +      /*
+ +       * If the mapping that covers range->gpa can use a huge page, it
+ +       * may start below it or end after range->gpa + range->size.
+ +       */
+ +      end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
+ +      return min(range->size, end - range->gpa);
+ +}
+ +
   static void nonpaging_init_context(struct kvm_mmu *context)
   {
         context->page_fault = nonpaging_page_fault;
@@@ -5111,7 -4965,7 +5096,7 @@@ static void reset_rsvds_bits_mask_ept(s
   
   static inline u64 reserved_hpa_bits(void)
   {
- -      return rsvd_bits(shadow_phys_bits, 63);
+ +      return rsvd_bits(kvm_host.maxphyaddr, 63);
   }
   
   /*
@@@ -6009,24 -5863,14 +5994,24 @@@ int noinline kvm_mmu_page_fault(struct 
         }
   
         if (r == RET_PF_INVALID) {
+ +              vcpu->stat.pf_taken++;
+ +
                 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
- -                                        &emulation_type);
+ +                                        &emulation_type, NULL);
                 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
                         return -EIO;
         }
   
         if (r < 0)
                 return r;
+ +
+ +      if (r == RET_PF_FIXED)
+ +              vcpu->stat.pf_fixed++;
+ +      else if (r == RET_PF_EMULATE)
+ +              vcpu->stat.pf_emulate++;
+ +      else if (r == RET_PF_SPURIOUS)
+ +              vcpu->stat.pf_spurious++;
+ +
         if (r != RET_PF_EMULATE)
                 return 1;
   
@@@ -6062,22 -5906,6 +6047,22 @@@ emulate
   }
   EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
   
+ +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg)
+ +{
+ +      u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ +      int root_level, leaf, level;
+ +
+ +      leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level);
+ +      if (unlikely(leaf < 0))
+ +              return;
+ +
+ +      pr_err("%s %llx", msg, gpa);
+ +      for (level = root_level; level >= leaf; level--)
+ +              pr_cont(", spte[%d] = 0x%llx", level, sptes[level]);
+ +      pr_cont("\n");
+ +}
+ +EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes);
+ +
   static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                                       u64 addr, hpa_t root_hpa)
   {
@@@ -6920,7 -6748,6 +6905,7 @@@ restart
   
         return need_tlb_flush;
   }
+ +EXPORT_SYMBOL_GPL(kvm_zap_gfn_range);
   
   static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
                                            const struct kvm_memory_slot *slot)
@@@ -7051,6 -6878,7 +7036,6 @@@ static unsigned long mmu_shrink_scan(st
   
         list_for_each_entry(kvm, &vm_list, vm_list) {
                 int idx;
- -              LIST_HEAD(invalid_list);
   
                 /*
                  * Never scan more than sc->nr_to_scan VM instances.
diff --combined arch/x86/kvm/vmx/vmx.c

index bedb9ba96918f81f0d17688621671d9825af5bd2,e97e1ad79bf4a2e3f5885c23dd10ee13d9b860d2..13a6b0281e37cd16ead2ba46e5c88fbbcc230bba
--- 1/arch/x86/kvm/vmx/vmx.c
--- 2/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@@ -74,7 -74,6 +74,7 @@@
   #include "posted_intr.h"
   
   MODULE_AUTHOR("Qumranet");
+ +MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
   MODULE_LICENSE("GPL");
   
   #ifdef MODULE
@@@ -260,7 -259,7 +260,7 @@@ static int vmx_setup_l1d_flush(enum vmx
                 return 0;
         }
   
- -      if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ +      if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
                 return 0;
         }
@@@ -405,7 -404,7 +405,7 @@@ static void vmx_update_fb_clear_dis(str
          * and VM-Exit.
          */
         vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
- -                              (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ +                              (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
                                 !boot_cpu_has_bug(X86_BUG_MDS) &&
                                 !boot_cpu_has_bug(X86_BUG_TAA);
   
@@@ -1124,12 -1123,12 +1124,12 @@@ static bool update_transition_efer(stru
          * atomically, since it's faster than switching it manually.
          */
         if (cpu_has_load_ia32_efer() ||
- -          (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ +          (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
                 if (!(guest_efer & EFER_LMA))
                         guest_efer &= ~EFER_LME;
- -              if (guest_efer != host_efer)
+ +              if (guest_efer != kvm_host.efer)
                         add_atomic_switch_msr(vmx, MSR_EFER,
- -                                            guest_efer, host_efer, false);
+ +                                            guest_efer, kvm_host.efer, false);
                 else
                         clear_atomic_switch_msr(vmx, MSR_EFER);
                 return false;
@@@ -1142,7 -1141,7 +1142,7 @@@
         clear_atomic_switch_msr(vmx, MSR_EFER);
   
         guest_efer &= ~ignore_bits;
- -      guest_efer |= host_efer & ignore_bits;
+ +      guest_efer |= kvm_host.efer & ignore_bits;
   
         vmx->guest_uret_msrs[i].data = guest_efer;
         vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@@ -1412,38 -1411,6 +1412,38 @@@ static void vmx_write_guest_kernel_gs_b
   }
   #endif
   
+ +static void grow_ple_window(struct kvm_vcpu *vcpu)
+ +{
+ +      struct vcpu_vmx *vmx = to_vmx(vcpu);
+ +      unsigned int old = vmx->ple_window;
+ +
+ +      vmx->ple_window = __grow_ple_window(old, ple_window,
+ +                                          ple_window_grow,
+ +                                          ple_window_max);
+ +
+ +      if (vmx->ple_window != old) {
+ +              vmx->ple_window_dirty = true;
+ +              trace_kvm_ple_window_update(vcpu->vcpu_id,
+ +                                          vmx->ple_window, old);
+ +      }
+ +}
+ +
+ +static void shrink_ple_window(struct kvm_vcpu *vcpu)
+ +{
+ +      struct vcpu_vmx *vmx = to_vmx(vcpu);
+ +      unsigned int old = vmx->ple_window;
+ +
+ +      vmx->ple_window = __shrink_ple_window(old, ple_window,
+ +                                            ple_window_shrink,
+ +                                            ple_window);
+ +
+ +      if (vmx->ple_window != old) {
+ +              vmx->ple_window_dirty = true;
+ +              trace_kvm_ple_window_update(vcpu->vcpu_id,
+ +                                          vmx->ple_window, old);
+ +      }
+ +}
+ +
   void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
                         struct loaded_vmcs *buddy)
   {
@@@ -1519,9 -1486,6 +1519,9 @@@ void vmx_vcpu_load(struct kvm_vcpu *vcp
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
   
+ +      if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ +              shrink_ple_window(vcpu);
+ +
         vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
   
         vmx_vcpu_pi_load(vcpu, cpu);
@@@ -4393,7 -4357,7 +4393,7 @@@ void vmx_set_constant_host_state(struc
         }
   
         if (cpu_has_load_ia32_efer())
- -              vmcs_write64(HOST_IA32_EFER, host_efer);
+ +              vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
   }
   
   void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@@ -5254,15 -5218,8 +5254,15 @@@ static int handle_exception_nmi(struct 
         if (is_invalid_opcode(intr_info))
                 return handle_ud(vcpu);
   
- -      if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
- -              return -EIO;
+ +      if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
+ +              struct vmx_ve_information *ve_info = vmx->ve_info;
+ +
+ +              WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
+ +                        "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
+ +              dump_vmcs(vcpu);
+ +              kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
+ +              return 1;
+ +      }
   
         error_code = 0;
         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
@@@ -5933,6 -5890,38 +5933,6 @@@ int vmx_vcpu_pre_run(struct kvm_vcpu *v
         return 1;
   }
   
- -static void grow_ple_window(struct kvm_vcpu *vcpu)
- -{
- -      struct vcpu_vmx *vmx = to_vmx(vcpu);
- -      unsigned int old = vmx->ple_window;
- -
- -      vmx->ple_window = __grow_ple_window(old, ple_window,
- -                                          ple_window_grow,
- -                                          ple_window_max);
- -
- -      if (vmx->ple_window != old) {
- -              vmx->ple_window_dirty = true;
- -              trace_kvm_ple_window_update(vcpu->vcpu_id,
- -                                          vmx->ple_window, old);
- -      }
- -}
- -
- -static void shrink_ple_window(struct kvm_vcpu *vcpu)
- -{
- -      struct vcpu_vmx *vmx = to_vmx(vcpu);
- -      unsigned int old = vmx->ple_window;
- -
- -      vmx->ple_window = __shrink_ple_window(old, ple_window,
- -                                            ple_window_shrink,
- -                                            ple_window);
- -
- -      if (vmx->ple_window != old) {
- -              vmx->ple_window_dirty = true;
- -              trace_kvm_ple_window_update(vcpu->vcpu_id,
- -                                          vmx->ple_window, old);
- -      }
- -}
- -
   /*
    * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
    * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@@ -6681,10 -6670,9 +6681,10 @@@ static noinstr void vmx_l1d_flush(struc
                 bool flush_l1d;
   
                 /*
- -               * Clear the per-vcpu flush bit, it gets set again
- -               * either from vcpu_run() or from one of the unsafe
- -               * VMEXIT handlers.
+ +               * Clear the per-vcpu flush bit, it gets set again if the vCPU
+ +               * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ +               * exits to userspace, or if KVM reaches one of the unsafe
+ +               * VMEXIT handlers, e.g. if KVM calls into the emulator.
                  */
                 flush_l1d = vcpu->arch.l1tf_flush_l1d;
                 vcpu->arch.l1tf_flush_l1d = false;
@@@ -7670,39 -7658,25 +7670,25 @@@ int vmx_vm_init(struct kvm *kvm
   
   u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
   {
-       /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
-        * memory aliases with conflicting memory types and sometimes MCEs.
-        * We have to be careful as to what are honored and when.
-        *
-        * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
-        * UC.  The effective memory type is UC or WC depending on guest PAT.
-        * This was historically the source of MCEs and we want to be
-        * conservative.
-        *
-        * When there is no need to deal with noncoherent DMA (e.g., no VT-d
-        * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
-        * EPT memory type is set to WB.  The effective memory type is forced
-        * WB.
-        *
-        * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
-        * EPT memory type is used to emulate guest CD/MTRR.
+       /*
+        * Force UC for host MMIO regions, as allowing the guest to access MMIO
+        * with cacheable accesses will result in Machine Checks.
          */
- 
         if (is_mmio)
                 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
   
-       if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+       /*
+        * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
+        * device attached and the CPU doesn't support self-snoop.  Letting the
+        * guest control memory types on Intel CPUs without self-snoop may
+        * result in unexpected behavior, and so KVM's (historical) ABI is to
+        * trust the guest to behave only as a last resort.
+        */
+       if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
+           !kvm_arch_has_noncoherent_dma(vcpu->kvm))
                 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
   
-       if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
-               if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-                       return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
-               else
-                       return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
-                               VMX_EPT_IPAT_BIT;
-       }
- 
-       return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+       return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
   }
   
   static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@@ -8184,6 -8158,12 +8170,6 @@@ void vmx_cancel_hv_timer(struct kvm_vcp
   }
   #endif
   
- -void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
- -{
- -      if (!kvm_pause_in_guest(vcpu->kvm))
- -              shrink_ple_window(vcpu);
- -}
- -
   void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -8395,16 -8375,18 +8381,16 @@@ static void __init vmx_setup_me_spte_ma
         u64 me_mask = 0;
   
         /*
- -       * kvm_get_shadow_phys_bits() returns shadow_phys_bits.  Use
- -       * the former to avoid exposing shadow_phys_bits.
- -       *
          * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
- -       * shadow_phys_bits.  On MKTME and/or TDX capable systems,
+ +       * kvm_host.maxphyaddr.  On MKTME and/or TDX capable systems,
          * boot_cpu_data.x86_phys_bits holds the actual physical address
- -       * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
- -       * reported by CPUID.  Those bits between are KeyID bits.
+ +       * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ +       * MAXPHYADDR reported by CPUID.  Those bits between are KeyID bits.
          */
- -      if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ +      if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
                 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
- -                      kvm_get_shadow_phys_bits() - 1);
+ +                                  kvm_host.maxphyaddr - 1);
+ +
         /*
          * Unlike SME, host kernel doesn't support setting up any
          * MKTME KeyID on Intel platforms.  No memory encryption
diff --combined arch/x86/kvm/x86.c

index e90e1a74564ebfbf29f4016b3c394985dcf8f2d6,121907049190f72e8ea183362b3a72c6ad066518..281edbbfc83d37bf21913ed33992091b1e4cd05a
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -100,9 -100,6 +100,9 @@@
   struct kvm_caps kvm_caps __read_mostly;
   EXPORT_SYMBOL_GPL(kvm_caps);
   
+ +struct kvm_host_values kvm_host __read_mostly;
+ +EXPORT_SYMBOL_GPL(kvm_host);
+ +
   #define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
   
   #define emul_to_vcpu(ctxt) \
@@@ -167,6 -164,15 +167,6 @@@ module_param(kvmclock_periodic_sync, bo
   static u32 __read_mostly tsc_tolerance_ppm = 250;
   module_param(tsc_tolerance_ppm, uint, 0644);
   
- -/*
- - * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
- - * adaptive tuning starting from default advancement of 1000ns.  '0' disables
- - * advancement entirely.  Any other value is used as-is and disables adaptive
- - * tuning, i.e. allows privileged userspace to set an exact advancement time.
- - */
- -static int __read_mostly lapic_timer_advance_ns = -1;
- -module_param(lapic_timer_advance_ns, int, 0644);
- -
   static bool __read_mostly vector_hashing = true;
   module_param(vector_hashing, bool, 0444);
   
@@@ -223,12 -229,21 +223,12 @@@ static struct kvm_user_return_msrs __pe
                                 | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
                                 | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
   
- -u64 __read_mostly host_efer;
- -EXPORT_SYMBOL_GPL(host_efer);
- -
   bool __read_mostly allow_smaller_maxphyaddr = 0;
   EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
   
   bool __read_mostly enable_apicv = true;
   EXPORT_SYMBOL_GPL(enable_apicv);
   
- -u64 __read_mostly host_xss;
- -EXPORT_SYMBOL_GPL(host_xss);
- -
- -u64 __read_mostly host_arch_capabilities;
- -EXPORT_SYMBOL_GPL(host_arch_capabilities);
- -
   const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
         KVM_GENERIC_VM_STATS(),
         STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@@ -302,6 -317,8 +302,6 @@@ const struct kvm_stats_header kvm_vcpu_
                        sizeof(kvm_vcpu_stats_desc),
   };
   
- -u64 __read_mostly host_xcr0;
- -
   static struct kmem_cache *x86_emulator_cache;
   
   /*
@@@ -946,11 -963,6 +946,6 @@@ void kvm_post_set_cr0(struct kvm_vcpu *
   
         if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
                 kvm_mmu_reset_context(vcpu);
- 
-       if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
-           kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
-           !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-               kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
   }
   EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
   
@@@ -1008,11 -1020,11 +1003,11 @@@ void kvm_load_guest_xsave_state(struct 
   
         if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
   
- -              if (vcpu->arch.xcr0 != host_xcr0)
+ +              if (vcpu->arch.xcr0 != kvm_host.xcr0)
                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
   
                 if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- -                  vcpu->arch.ia32_xss != host_xss)
+ +                  vcpu->arch.ia32_xss != kvm_host.xss)
                         wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
         }
   
@@@ -1039,12 -1051,12 +1034,12 @@@ void kvm_load_host_xsave_state(struct k
   
         if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
   
- -              if (vcpu->arch.xcr0 != host_xcr0)
- -                      xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ +              if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ +                      xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
   
                 if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- -                  vcpu->arch.ia32_xss != host_xss)
- -                      wrmsrl(MSR_IA32_XSS, host_xss);
+ +                  vcpu->arch.ia32_xss != kvm_host.xss)
+ +                      wrmsrl(MSR_IA32_XSS, kvm_host.xss);
         }
   
   }
@@@ -1611,7 -1623,7 +1606,7 @@@ static bool kvm_is_immutable_feature_ms
   
   static u64 kvm_get_arch_capabilities(void)
   {
- -      u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
+ +      u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
   
         /*
          * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@@ -1869,11 -1881,11 +1864,11 @@@ static int __kvm_set_msr(struct kvm_vcp
                  * incomplete and conflicting architectural behavior.  Current
                  * AMD CPUs completely ignore bits 63:32, i.e. they aren't
                  * reserved and always read as zeros.  Enforce Intel's reserved
- -               * bits check if and only if the guest CPU is Intel, and clear
- -               * the bits in all other cases.  This ensures cross-vendor
- -               * migration will provide consistent behavior for the guest.
+ +               * bits check if the guest CPU is Intel compatible, otherwise
+ +               * clear the bits.  This ensures cross-vendor migration will
+ +               * provide consistent behavior for the guest.
                  */
- -              if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ +              if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0)
                         return 1;
   
                 data = (u32)data;
@@@ -4695,15 -4707,8 +4690,15 @@@ int kvm_vm_ioctl_check_extension(struc
         case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
         case KVM_CAP_IRQFD_RESAMPLE:
         case KVM_CAP_MEMORY_FAULT_INFO:
+ +      case KVM_CAP_X86_GUEST_MODE:
                 r = 1;
                 break;
+ +      case KVM_CAP_PRE_FAULT_MEMORY:
+ +              r = tdp_enabled;
+ +              break;
+ +      case KVM_CAP_X86_APIC_BUS_CYCLES_NS:
+ +              r = APIC_BUS_CYCLE_NS_DEFAULT;
+ +              break;
         case KVM_CAP_EXIT_HYPERCALL:
                 r = KVM_EXIT_HYPERCALL_VALID_MASK;
                 break;
@@@ -4994,15 -4999,6 +4989,15 @@@ static bool need_emulate_wbinvd(struct 
   
   void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
   {
+ +      struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ +
+ +      vcpu->arch.l1tf_flush_l1d = true;
+ +
+ +      if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
+ +              pmu->need_cleanup = true;
+ +              kvm_make_request(KVM_REQ_PMU, vcpu);
+ +      }
+ +
         /* Address WBINVD may be executed by guest */
         if (need_emulate_wbinvd(vcpu)) {
                 if (static_call(kvm_x86_has_wbinvd_exit)())
@@@ -5887,7 -5883,8 +5882,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
                 r = -EINVAL;
                 if (!lapic_in_kernel(vcpu))
                         goto out;
- -              u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
- -                              GFP_KERNEL_ACCOUNT);
+ +              u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
   
                 r = -ENOMEM;
                 if (!u.lapic)
@@@ -6080,7 -6077,7 +6075,7 @@@
                 if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
                         break;
   
- -              u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
+ +              u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
                 r = -ENOMEM;
                 if (!u.xsave)
                         break;
@@@ -6111,7 -6108,7 +6106,7 @@@
         case KVM_GET_XSAVE2: {
                 int size = vcpu->arch.guest_fpu.uabi_size;
   
- -              u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ +              u.xsave = kzalloc(size, GFP_KERNEL);
                 r = -ENOMEM;
                 if (!u.xsave)
                         break;
@@@ -6129,7 -6126,7 +6124,7 @@@
         }
   
         case KVM_GET_XCRS: {
- -              u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
+ +              u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
                 r = -ENOMEM;
                 if (!u.xcrs)
                         break;
@@@ -6550,6 -6547,9 +6545,6 @@@ int kvm_vm_ioctl_enable_cap(struct kvm 
                         goto split_irqchip_unlock;
                 if (kvm->created_vcpus)
                         goto split_irqchip_unlock;
- -              r = kvm_setup_empty_irq_routing(kvm);
- -              if (r)
- -                      goto split_irqchip_unlock;
                 /* Pairs with irqchip_in_kernel. */
                 smp_wmb();
                 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
@@@ -6696,9 -6696,7 +6691,9 @@@ split_irqchip_unlock
                         break;
   
                 mutex_lock(&kvm->lock);
- -              if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+ +              if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
+ +                      ;
+ +              } else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
                         r = 0;
                 } else if (!kvm->arch.max_vcpu_ids) {
                         kvm->arch.max_vcpu_ids = cap->args[0];
@@@ -6751,30 -6749,6 +6746,30 @@@
                 }
                 mutex_unlock(&kvm->lock);
                 break;
+ +      case KVM_CAP_X86_APIC_BUS_CYCLES_NS: {
+ +              u64 bus_cycle_ns = cap->args[0];
+ +              u64 unused;
+ +
+ +              /*
+ +               * Guard against overflow in tmict_to_ns(). 128 is the highest
+ +               * divide value that can be programmed in APIC_TDCR.
+ +               */
+ +              r = -EINVAL;
+ +              if (!bus_cycle_ns ||
+ +                  check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused))
+ +                      break;
+ +
+ +              r = 0;
+ +              mutex_lock(&kvm->lock);
+ +              if (!irqchip_in_kernel(kvm))
+ +                      r = -ENXIO;
+ +              else if (kvm->created_vcpus)
+ +                      r = -EINVAL;
+ +              else
+ +                      kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
+ +              mutex_unlock(&kvm->lock);
+ +              break;
+ +      }
         default:
                 r = -EINVAL;
                 break;
@@@ -7243,9 -7217,6 +7238,9 @@@ set_pit2_out
                 mutex_lock(&kvm->lock);
                 if (kvm->created_vcpus)
                         r = -EBUSY;
+ +              else if (arg > KVM_MAX_VCPU_IDS ||
+ +                       (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
+ +                      r = -EINVAL;
                 else
                         kvm->arch.bsp_vcpu_id = arg;
                 mutex_unlock(&kvm->lock);
@@@ -8554,11 -8525,6 +8549,11 @@@ static bool emulator_guest_has_rdpid(st
         return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
   }
   
+ +static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt)
+ +{
+ +      return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt));
+ +}
+ +
   static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
   {
         return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
@@@ -8657,7 -8623,6 +8652,7 @@@ static const struct x86_emulate_ops emu
         .guest_has_movbe     = emulator_guest_has_movbe,
         .guest_has_fxsr      = emulator_guest_has_fxsr,
         .guest_has_rdpid     = emulator_guest_has_rdpid,
+ +      .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible,
         .set_nmi_mask        = emulator_set_nmi_mask,
         .is_smm              = emulator_is_smm,
         .is_guest_mode       = emulator_is_guest_mode,
@@@ -9039,17 -9004,19 +9034,17 @@@ EXPORT_SYMBOL_GPL(kvm_skip_emulated_ins
   
   static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
   {
- -      u32 shadow;
- -
         if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
                 return true;
   
         /*
- -       * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
- -       * but AMD CPUs do not.  MOV/POP SS blocking is rare, check that first
- -       * to avoid the relatively expensive CPUID lookup.
+ +       * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is
+ +       * active, but AMD compatible CPUs do not.
          */
- -      shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
- -      return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
- -             guest_cpuid_is_intel(vcpu);
+ +      if (!guest_cpuid_is_intel_compatible(vcpu))
+ +              return false;
+ +
+ +      return static_call(kvm_x86_get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS;
   }
   
   static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
@@@ -9809,19 -9776,19 +9804,19 @@@ int kvm_x86_vendor_init(struct kvm_x86_
         kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
   
         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- -              host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- -              kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ +              kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ +              kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
         }
   
- -      rdmsrl_safe(MSR_EFER, &host_efer);
+ +      rdmsrl_safe(MSR_EFER, &kvm_host.efer);
   
         if (boot_cpu_has(X86_FEATURE_XSAVES))
- -              rdmsrl(MSR_IA32_XSS, host_xss);
+ +              rdmsrl(MSR_IA32_XSS, kvm_host.xss);
   
         kvm_init_pmu_capability(ops->pmu_ops);
   
         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- -              rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+ +              rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
   
         r = ops->hardware_setup();
         if (r != 0)
@@@ -10046,10 -10013,6 +10041,10 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activa
   static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
                                        enum kvm_apicv_inhibit reason, bool set)
   {
+ +      const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS };
+ +
+ +      BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS);
+ +
         if (set)
                 __set_bit(reason, inhibits);
         else
@@@ -10061,7 -10024,7 +10056,7 @@@
   static void kvm_apicv_init(struct kvm *kvm)
   {
         enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT :
- -                                                     APICV_INHIBIT_REASON_DISABLE;
+ +                                                     APICV_INHIBIT_REASON_DISABLED;
   
         set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
   
@@@ -10282,8 -10245,6 +10277,8 @@@ static void post_kvm_run_save(struct kv
   
         if (is_smm(vcpu))
                 kvm_run->flags |= KVM_RUN_X86_SMM;
+ +      if (is_guest_mode(vcpu))
+ +              kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
   }
   
   static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@@ -10761,12 -10722,13 +10756,12 @@@ static void vcpu_scan_ioapic(struct kvm
   
         bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
   
+ +      static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ +
         if (irqchip_split(vcpu->kvm))
                 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
- -      else {
- -              static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
- -              if (ioapic_in_kernel(vcpu->kvm))
- -                      kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
- -      }
+ +      else if (ioapic_in_kernel(vcpu->kvm))
+ +              kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
   
         if (is_guest_mode(vcpu))
                 vcpu->arch.load_eoi_exitmap_pending = true;
@@@ -10972,14 -10934,6 +10967,14 @@@ static int vcpu_enter_guest(struct kvm_
   
                 if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
                         static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+ +
+ +              if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) {
+ +                      kvm_vcpu_reset(vcpu, true);
+ +                      if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
+ +                              r = 1;
+ +                              goto out;
+ +                      }
+ +              }
         }
   
         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@@ -11181,6 -11135,12 +11176,12 @@@
   
         kvm_vcpu_srcu_read_lock(vcpu);
   
+       /*
+        * Call this to ensure WC buffers in guest are evicted after each VM
+        * Exit, so that the evicted WC writes can be snooped across all cpus
+        */
+       smp_mb__after_srcu_read_lock();
+ 
         /*
          * Profile KVM exit RIPs:
          */
@@@ -11288,6 -11248,7 +11289,6 @@@ static int vcpu_run(struct kvm_vcpu *vc
         int r;
   
         vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- -      vcpu->arch.l1tf_flush_l1d = true;
   
         for (;;) {
                 /*
@@@ -11437,7 -11398,7 +11438,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
   
         kvm_vcpu_srcu_read_lock(vcpu);
         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
- -              if (kvm_run->immediate_exit) {
+ +              if (!vcpu->wants_to_run) {
                         r = -EINTR;
                         goto out;
                 }
@@@ -11515,7 -11476,7 +11516,7 @@@
                 WARN_ON_ONCE(vcpu->mmio_needed);
         }
   
- -      if (kvm_run->immediate_exit) {
+ +      if (!vcpu->wants_to_run) {
                 r = -EINTR;
                 goto out;
         }
@@@ -12209,7 -12170,7 +12210,7 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
         if (r < 0)
                 return r;
   
- -      r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ +      r = kvm_create_lapic(vcpu);
         if (r < 0)
                 goto fail_mmu_destroy;
   
@@@ -12264,7 -12225,6 +12265,6 @@@
         vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
         vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
         kvm_xen_init_vcpu(vcpu);
-       kvm_vcpu_mtrr_init(vcpu);
         vcpu_load(vcpu);
         kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
         kvm_vcpu_reset(vcpu, false);
@@@ -12608,6 -12568,18 +12608,6 @@@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *v
         return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
   }
   
- -void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
- -{
- -      struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- -
- -      vcpu->arch.l1tf_flush_l1d = true;
- -      if (pmu->version && unlikely(pmu->event_count)) {
- -              pmu->need_cleanup = true;
- -              kvm_make_request(KVM_REQ_PMU, vcpu);
- -      }
- -      static_call(kvm_x86_sched_in)(vcpu, cpu);
- -}
- -
   void kvm_arch_free_vm(struct kvm *kvm)
   {
   #if IS_ENABLED(CONFIG_HYPERV)
@@@ -12658,7 -12630,6 +12658,7 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
   
         kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
+ +      kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
         kvm->arch.guest_can_read_msr_platform_info = true;
         kvm->arch.enable_pmu = enable_pmu;
   
@@@ -13175,9 -13146,6 +13175,9 @@@ static inline bool kvm_vcpu_has_events(
         if (kvm_test_request(KVM_REQ_PMI, vcpu))
                 return true;
   
+ +      if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
+ +              return true;
+ +
         if (kvm_arch_interrupt_allowed(vcpu) &&
             (kvm_cpu_has_interrupt(vcpu) ||
             kvm_guest_apic_has_interrupt(vcpu)))
@@@ -13528,13 -13496,13 +13528,13 @@@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned
   static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
   {
         /*
-        * Non-coherent DMA assignment and de-assignment will affect
-        * whether KVM honors guest MTRRs and cause changes in memtypes
-        * in TDP.
-        * So, pass %true unconditionally to indicate non-coherent DMA was,
-        * or will be involved, and that zapping SPTEs might be necessary.
+        * Non-coherent DMA assignment and de-assignment may affect whether or
+        * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
+        * due to toggling the "ignore PAT" bit.  Zap all SPTEs when the first
+        * (or last) non-coherent device is (un)registered so that new SPTEs
+        * with the correct "ignore guest PAT" setting are created.
          */
-       if (__kvm_mmu_honors_guest_mtrrs(true))
+       if (kvm_mmu_may_ignore_guest_pat())
                 kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
   }
   
@@@ -13631,24 -13599,6 +13631,24 @@@ bool kvm_arch_no_poll(struct kvm_vcpu *
   }
   EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
   
+ +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+ +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm)
+ +{
+ +      return kvm->arch.vm_type == KVM_X86_SNP_VM;
+ +}
+ +
+ +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
+ +{
+ +      return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order);
+ +}
+ +#endif
+ +
+ +#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+ +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+ +{
+ +      static_call_cond(kvm_x86_gmem_invalidate)(start, end);
+ +}
+ +#endif
   
   int kvm_spec_ctrl_test_value(u64 value)
   {
@@@ -14034,7 -13984,6 +14034,7 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexi
   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+ +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
   
   static int __init kvm_x86_init(void)
   {
diff --combined arch/x86/kvm/x86.h

index a88c65d3ea26a9c89997a98a45c32bec79e7796c,c8b0f0c7cdf6252207a9b830fa55b6c70889d642..5da5b869a991c42b29f050a01514db0c98315c89
--- 1/arch/x86/kvm/x86.h
--- 2/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@@ -33,20 -33,6 +33,20 @@@ struct kvm_caps 
         u64 supported_perf_cap;
   };
   
+ +struct kvm_host_values {
+ +      /*
+ +       * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical
+ +       * address bits irrespective of features that repurpose legal bits,
+ +       * e.g. MKTME.
+ +       */
+ +      u8 maxphyaddr;
+ +
+ +      u64 efer;
+ +      u64 xcr0;
+ +      u64 xss;
+ +      u64 arch_capabilities;
+ +};
+ +
   void kvm_spurious_fault(void);
   
   #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)               \
@@@ -325,12 -311,8 +325,8 @@@ int handle_ud(struct kvm_vcpu *vcpu)
   void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
                                    struct kvm_queued_exception *ex);
   
- void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
- u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
   int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
   int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
- bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
-                                         int page_num);
   bool kvm_vector_hashing_enabled(void);
   void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
   int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
@@@ -339,8 -321,11 +335,8 @@@ int x86_emulate_instruction(struct kvm_
                             int emulation_type, void *insn, int insn_len);
   fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
   
- -extern u64 host_xcr0;
- -extern u64 host_xss;
- -extern u64 host_arch_capabilities;
- -
   extern struct kvm_caps kvm_caps;
+ +extern struct kvm_host_values kvm_host;
   
   extern bool enable_pmu;
author	Paolo Bonzini <[email protected]>
	Tue, 16 Jul 2024 13:54:57 +0000 (09:54 -0400)
committer	Paolo Bonzini <[email protected]>
	Tue, 16 Jul 2024 13:54:57 +0000 (09:54 -0400)
		1	2
Documentation/virt/kvm/api.rst	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/mmu.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/mmu/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.h	patch \|	diff1 \|	diff2 \|	blob \| history