Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <[email protected]>
Thu, 6 Aug 2020 19:59:31 +0000 (12:59 -0700)
committer Linus Torvalds <[email protected]>
Thu, 6 Aug 2020 19:59:31 +0000 (12:59 -0700)
Pull KVM updates from Paolo Bonzini:
 "s390:
   - implement diag318

  x86:
   - Report last CPU for debugging
   - Emulate smaller MAXPHYADDR in the guest than in the host
   - .noinstr and tracing fixes from Thomas
   - nested SVM page table switching optimization and fixes

  Generic:
   - Unify shadow MMU cache data structures across architectures"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (127 commits)
  KVM: SVM: Fix sev_pin_memory() error handling
  KVM: LAPIC: Set the TDCR settable bits
  KVM: x86: Specify max TDP level via kvm_configure_mmu()
  KVM: x86/mmu: Rename max_page_level to max_huge_page_level
  KVM: x86: Dynamically calculate TDP level from max level and MAXPHYADDR
  KVM: VMX: Remove temporary WARN on expected vs. actual EPTP level mismatch
  KVM: x86: Pull the PGD's level from the MMU instead of recalculating it
  KVM: VMX: Make vmx_load_mmu_pgd() static
  KVM: x86/mmu: Add separate helper for shadow NPT root page role calc
  KVM: VMX: Drop a duplicate declaration of construct_eptp()
  KVM: nSVM: Correctly set the shadow NPT root level in its MMU role
  KVM: Using macros instead of magic values
  MIPS: KVM: Fix build error caused by 'kvm_run' cleanup
  KVM: nSVM: remove nonsensical EXITINFO1 adjustment on nested NPF
  KVM: x86: Add a capability for GUEST_MAXPHYADDR < HOST_MAXPHYADDR support
  KVM: VMX: optimize #PF injection when MAXPHYADDR does not match
  KVM: VMX: Add guest physical address check in EPT violation and misconfig
  KVM: VMX: introduce vmx_need_pf_intercept
  KVM: x86: update exception bitmap on CPUID changes
  KVM: x86: rename update_bp_intercept to update_exception_bitmap
  ...

20 files changed:
Documentation/admin-guide/kernel-parameters.txt
Documentation/virt/kvm/api.rst
arch/arm64/include/asm/kvm_host.h
arch/arm64/kvm/mmu.c
arch/arm64/kvm/sys_regs.c
arch/mips/Kconfig
arch/mips/kvm/emulate.c
arch/mips/kvm/vz.c
arch/s390/kernel/setup.c
arch/x86/Kconfig
arch/x86/include/asm/idtentry.h
arch/x86/kernel/kvm.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
include/linux/kvm_host.h

diff --combined Documentation/admin-guide/kernel-parameters.txt
index 254bfafdcbcdb26031e0d876239ef58b097dac1f,4740d0d9aaa366767dc2eceb2991f2fb81449a2e..ef66b3c45ba221041b0bcf0b58afa337392945a4
        cpufreq.off=1   [CPU_FREQ]
                        disable the cpufreq sub-system
  
 +      cpufreq.default_governor=
 +                      [CPU_FREQ] Name of the default cpufreq governor or
 +                      policy to use. This governor must be registered in the
 +                      kernel before the cpufreq driver probes.
 +
        cpu_init_udelay=N
                        [X86] Delay for N microsec between assert and de-assert
                        of APIC INIT to start processors.  This delay occurs
                        useful to also enable the page_owner functionality.
                        on: enable the feature
  
 +      debugfs=        [KNL] This parameter enables what is exposed to userspace
 +                      and debugfs internal clients.
 +                      Format: { on, no-mount, off }
 +                      on:     All functions are enabled.
 +                      no-mount:
 +                              Filesystem is not registered but kernel clients can
 +                              access APIs and a crashkernel can be used to read
 +                              its content. There is nothing to mount.
 +                      off:    Filesystem is not registered and clients
 +                              get a -EPERM as result when trying to register files
 +                              or directories within debugfs.
 +                              This is equivalent of the runtime functionality if
 +                              debugfs was not enabled in the kernel at all.
 +                      Default value is set in build-time with a kernel configuration.
 +
        debugpat        [X86] Enable PAT debugging
  
        decnet.addr=    [HW,NET]
                        Format: {"off" | "on" | "skip[mbr]"}
  
        efi=            [EFI]
 -                      Format: { "old_map", "nochunk", "noruntime", "debug",
 -                                "nosoftreserve", "disable_early_pci_dma",
 -                                "no_disable_early_pci_dma" }
 -                      old_map [X86-64]: switch to the old ioremap-based EFI
 -                      runtime services mapping. [Needs CONFIG_X86_UV=y]
 +                      Format: { "debug", "disable_early_pci_dma",
 +                                "nochunk", "noruntime", "nosoftreserve",
 +                                "novamap", "no_disable_early_pci_dma",
 +                                "old_map" }
 +                      debug: enable misc debug output.
 +                      disable_early_pci_dma: disable the busmaster bit on all
 +                      PCI bridges while in the EFI boot stub.
                        nochunk: disable reading files in "chunks" in the EFI
                        boot stub, as chunking can cause problems with some
                        firmware implementations.
                        noruntime : disable EFI runtime services support
 -                      debug: enable misc debug output
                        nosoftreserve: The EFI_MEMORY_SP (Specific Purpose)
                        attribute may cause the kernel to reserve the
                        memory range for a memory mapping driver to
                        claim. Specify efi=nosoftreserve to disable this
                        reservation and treat the memory by its base type
                        (i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
 -                      disable_early_pci_dma: Disable the busmaster bit on all
 -                      PCI bridges while in the EFI boot stub
 +                      novamap: do not call SetVirtualAddressMap().
                        no_disable_early_pci_dma: Leave the busmaster bit set
                        on all PCI bridges while in the EFI boot stub
 +                      old_map [X86-64]: switch to the old ioremap-based EFI
 +                      runtime services mapping. [Needs CONFIG_X86_UV=y]
  
        efi_no_storage_paranoia [EFI; X86]
                        Using this parameter you can use more than 50% of
                        touchscreen support is not enabled in the mainstream
                        kernel as of 2.6.30, a preliminary port can be found
                        in the "bleeding edge" mini2440 support kernel at
 -                      http://repo.or.cz/w/linux-2.6/mini2440.git
 +                      https://repo.or.cz/w/linux-2.6/mini2440.git
  
        mitigations=
                        [X86,PPC,S390,ARM64] Control optional mitigations for
        no5lvl          [X86-64] Disable 5-level paging mode. Forces
                        kernel to use 4-level paging instead.
  
 +      nofsgsbase      [X86] Disables FSGSBASE instructions.
 +
        no_console_suspend
                        [HW] Never suspend the console
                        Disable suspending of consoles during suspend and
                        latencies, which will choose a value aligned
                        with the appropriate hardware boundaries.
  
 +      rcutree.rcu_min_cached_objs= [KNL]
 +                      Minimum number of objects which are cached and
 +                      maintained per one CPU. Object size is equal
 +                      to PAGE_SIZE. The cache allows to reduce the
 +                      pressure to page allocator, also it makes the
 +                      whole algorithm to behave better in low memory
 +                      condition.
 +
        rcutree.jiffies_till_first_fqs= [KNL]
                        Set delay from grace-period initialization to
                        first attempt to force quiescent states.
                        Set time (jiffies) between CPU-hotplug operations,
                        or zero to disable CPU-hotplug testing.
  
 +      rcutorture.read_exit= [KNL]
 +                      Set the number of read-then-exit kthreads used
 +                      to test the interaction of RCU updaters and
 +                      task-exit processing.
 +
 +      rcutorture.read_exit_burst= [KNL]
 +                      The number of times in a given read-then-exit
 +                      episode that a set of read-then-exit kthreads
 +                      is spawned.
 +
 +      rcutorture.read_exit_delay= [KNL]
 +                      The delay, in seconds, between successive
 +                      read-then-exit testing episodes.
 +
        rcutorture.shuffle_interval= [KNL]
                        Set task-shuffle interval (s).  Shuffling tasks
                        allows some CPUs to go into dyntick-idle mode
                              reboot_cpu is s[mp]#### with #### being the processor
                                        to be used for rebooting.
  
 +      refscale.holdoff= [KNL]
 +                      Set test-start holdoff period.  The purpose of
 +                      this parameter is to delay the start of the
 +                      test until boot completes in order to avoid
 +                      interference.
 +
 +      refscale.loops= [KNL]
 +                      Set the number of loops over the synchronization
 +                      primitive under test.  Increasing this number
 +                      reduces noise due to loop start/end overhead,
 +                      but the default has already reduced the per-pass
 +                      noise to a handful of picoseconds on ca. 2020
 +                      x86 laptops.
 +
 +      refscale.nreaders= [KNL]
 +                      Set number of readers.  The default value of -1
 +                      selects N, where N is roughly 75% of the number
 +                      of CPUs.  A value of zero is an interesting choice.
 +
 +      refscale.nruns= [KNL]
 +                      Set number of runs, each of which is dumped onto
 +                      the console log.
 +
 +      refscale.readdelay= [KNL]
 +                      Set the read-side critical-section duration,
 +                      measured in microseconds.
 +
 +      refscale.scale_type= [KNL]
 +                      Specify the read-protection implementation to test.
 +
 +      refscale.shutdown= [KNL]
 +                      Shut down the system at the end of the performance
 +                      test.  This defaults to 1 (shut it down) when
 +                      rcuperf is built into the kernel and to 0 (leave
 +                      it running) when rcuperf is built as a module.
 +
 +      refscale.verbose= [KNL]
 +                      Enable additional printk() statements.
 +
        relax_domain_level=
                        [KNL, SMP] Set scheduler's default relax_domain_level.
                        See Documentation/admin-guide/cgroup-v1/cpusets.rst.
                        Prevent the CPU-hotplug component of torturing
                        until after init has spawned.
  
 +      torture.ftrace_dump_at_shutdown= [KNL]
 +                      Dump the ftrace buffer at torture-test shutdown,
 +                      even if there were no errors.  This can be a
 +                      very costly operation when many torture tests
 +                      are running concurrently, especially on systems
 +                      with rotating-rust storage.
 +
        tp720=          [HW,PS2]
  
        tpm_suspend_pcr=[HW,TPM]
                        panic() code such as dumping handler.
  
        xen_nopvspin    [X86,XEN]
-                       Disables the ticketlock slowpath using Xen PV
-                       optimizations.
+                       Disables the qspinlock slowpath using Xen PV optimizations.
+                       This parameter is obsoleted by "nopvspin" parameter, which
+                       has equivalent effect for XEN platform.
  
        xen_nopv        [X86]
                        Disables the PV optimizations forcing the HVM guest to
                        as generic guest with no PV drivers. Currently support
                        XEN HVM, KVM, HYPER_V and VMWARE guest.
  
+       nopvspin        [X86,XEN,KVM]
+                       Disables the qspinlock slow path using PV optimizations
+                       which allow the hypervisor to 'idle' the guest on lock
+                       contention.
        xirc2ps_cs=     [NET,PCMCIA]
                        Format:
                        <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
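
The kernel-parameters.txt hunks above add several new boot options: cpufreq.default_governor=, debugfs=, nofsgsbase, nopvspin, and a set of rcutree/rcutorture/refscale/torture tuning knobs. As a rough illustration of how a few of them might be combined on the kernel command line (the governor name and the particular combination are illustrative, not taken from this commit):

      cpufreq.default_governor=schedutil debugfs=no-mount nopvspin

These are appended to the boot loader's kernel command line like any other parameter documented in this file.
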
diff --combined Documentation/virt/kvm/api.rst
index fe05201e17bec6b6106c0d42414517db5f91242a,644e5326aa50d36a005ca974319f0b0009656074..cdfd98155311164e93326fee27d5e08fdff4a86d
@@@ -65,7 -65,7 +65,7 @@@ not be freed until both the parent (ori
  put their references to the VM's file descriptor.
  
  Because a VM's resources are not freed until the last reference to its
 -file descriptor is released, creating additional references to a VM via
 +file descriptor is released, creating additional references to a VM
  via fork(), dup(), etc... without careful consideration is strongly
  discouraged and may have unwanted side effects, e.g. memory allocated
  by and on behalf of the VM's process may not be freed/unaccounted when
@@@ -536,7 -536,7 +536,7 @@@ X86
        ========= ===================================
          0       on success,
         -EEXIST  if an interrupt is already enqueued
 -       -EINVAL  the the irq number is invalid
 +       -EINVAL  the irq number is invalid
         -ENXIO   if the PIC is in the kernel
         -EFAULT  if the pointer is invalid
        ========= ===================================
@@@ -669,6 -669,10 +669,10 @@@ MSRs that have been set successfully
  Defines the vcpu responses to the cpuid instruction.  Applications
  should use the KVM_SET_CPUID2 ioctl if available.
  
+ Note, when this IOCTL fails, KVM gives no guarantees that previous valid CPUID
+ configuration (if there is) is not corrupted. Userspace can get a copy of the
+ resulting CPUID configuration through KVM_GET_CPUID2 in case.
  ::
  
    struct kvm_cpuid_entry {
@@@ -3147,7 -3151,7 +3151,7 @@@ Possible features
  :Capability: basic
  :Architectures: arm, arm64
  :Type: vm ioctl
 -:Parameters: struct struct kvm_vcpu_init (out)
 +:Parameters: struct kvm_vcpu_init (out)
  :Returns: 0 on success; -1 on error
  
  Errors:
@@@ -3167,7 -3171,7 +3171,7 @@@ not mandatory
  
  The information returned by this ioctl can be used to prepare an instance
  of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in
 -in VCPU matching underlying host.
 +VCPU matching underlying host.
  
  
  4.84 KVM_GET_REG_LIST
@@@ -4339,15 -4343,14 +4343,15 @@@ Errors
  #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
  
    struct kvm_vmx_nested_state_hdr {
 -      __u32 flags;
        __u64 vmxon_pa;
        __u64 vmcs12_pa;
 -      __u64 preemption_timer_deadline;
  
        struct {
                __u16 flags;
        } smm;
 +
 +      __u32 flags;
 +      __u64 preemption_timer_deadline;
    };
  
    struct kvm_vmx_nested_state_data {
@@@ -4795,6 -4798,7 +4799,7 @@@ hardware_exit_reason
                /* KVM_EXIT_FAIL_ENTRY */
                struct {
                        __u64 hardware_entry_failure_reason;
+                       __u32 cpu; /* if KVM_LAST_CPU */
                } fail_entry;
  
  If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due
@@@ -5856,7 -5860,7 +5861,7 @@@ features of the KVM implementation
  :Architectures: ppc
  
  This capability, if KVM_CHECK_EXTENSION indicates that it is
 -available, means that that the kernel has an implementation of the
 +available, means that the kernel has an implementation of the
  H_RANDOM hypercall backed by a hardware random-number generator.
  If present, the kernel H_RANDOM handler can be enabled for guest use
  with the KVM_CAP_PPC_ENABLE_HCALL capability.
  :Architectures: x86
  
  This capability, if KVM_CHECK_EXTENSION indicates that it is
 -available, means that that the kernel has an implementation of the
 +available, means that the kernel has an implementation of the
  Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is
  used to support Windows Hyper-V based guest paravirt drivers(VMBus).
  
@@@ -5882,7 -5886,7 +5887,7 @@@ by the CPU, as it's incompatible with S
  :Architectures: ppc
  
  This capability, if KVM_CHECK_EXTENSION indicates that it is
 -available, means that that the kernel can support guests using the
 +available, means that the kernel can support guests using the
  radix MMU defined in Power ISA V3.00 (as implemented in the POWER9
  processor).
  
  :Architectures: ppc
  
  This capability, if KVM_CHECK_EXTENSION indicates that it is
 -available, means that that the kernel can support guests using the
 +available, means that the kernel can support guests using the
  hashed page table MMU defined in Power ISA V3.00 (as implemented in
  the POWER9 processor), including in-memory segment tables.
  
@@@ -5997,7 -6001,7 +6002,7 @@@ run->kvm_valid_regs or run->kvm_dirty_r
  
  If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a
  number larger than 0 indicating the version of this capability is implemented
 -and thereby which bits in in run->s.regs.device_irq_level can signal values.
 +and thereby which bits in run->s.regs.device_irq_level can signal values.
  
  Currently the following bits are defined for the device_irq_level bitmap::
  
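
The api.rst hunks above document two userspace-visible changes: a failed KVM_SET_CPUID2 may leave the previous CPUID configuration in an unspecified state (KVM_GET_CPUID2 can be used to read back what is actually in effect), and KVM_EXIT_FAIL_ENTRY now carries the physical CPU on which the entry failed when last-CPU reporting is available. A minimal userspace sketch of consuming the new field; the helper and variable names are made up for illustration, and the exact capability constant should be taken from the installed <linux/kvm.h> (the header comment above refers to it as KVM_LAST_CPU):

    #include <linux/kvm.h>
    #include <stdio.h>

    /* 'run' is the mmap()ed struct kvm_run of the vCPU, inspected after
     * ioctl(vcpu_fd, KVM_RUN, 0) returns; 'has_last_cpu' is the result of
     * KVM_CHECK_EXTENSION for the last-CPU reporting capability. */
    static void report_fail_entry(const struct kvm_run *run, int has_last_cpu)
    {
            if (run->exit_reason != KVM_EXIT_FAIL_ENTRY)
                    return;

            fprintf(stderr, "hardware entry failure reason: 0x%llx\n",
                    (unsigned long long)run->fail_entry.hardware_entry_failure_reason);
            if (has_last_cpu)
                    fprintf(stderr, "failed on physical CPU %u\n",
                            run->fail_entry.cpu);
    }
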
diff --combined arch/arm64/include/asm/kvm_host.h
index e21d4a01372fe7413392a866c7f1df6c66354257,ad337d3162fe11fca559aa712a13dc5822a74cf7..f81151ad3d3cd4f8394147d25de02d082b4a48eb
@@@ -97,17 -97,6 +97,6 @@@ struct kvm_arch 
        bool return_nisv_io_abort_to_user;
  };
  
- #define KVM_NR_MEM_OBJS     40
- /*
-  * We don't want allocation failures within the mmu code, so we preallocate
-  * enough memory for a single page fault in a cache.
-  */
- struct kvm_mmu_memory_cache {
-       int nobjs;
-       void *objects[KVM_NR_MEM_OBJS];
- };
  struct kvm_vcpu_fault_info {
        u32 esr_el2;            /* Hyp Syndrom Register */
        u64 far_el2;            /* Hyp Fault Address Register */
@@@ -380,14 -369,9 +369,14 @@@ struct kvm_vcpu_arch 
  #define vcpu_has_sve(vcpu) (system_supports_sve() && \
                            ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE))
  
 -#define vcpu_has_ptrauth(vcpu)        ((system_supports_address_auth() || \
 -                                system_supports_generic_auth()) && \
 -                               ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH))
 +#ifdef CONFIG_ARM64_PTR_AUTH
 +#define vcpu_has_ptrauth(vcpu)                                                \
 +      ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) ||                \
 +        cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) &&               \
 +       (vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH)
 +#else
 +#define vcpu_has_ptrauth(vcpu)                false
 +#endif
  
  #define vcpu_gp_regs(v)               (&(v)->arch.ctxt.gp_regs)
  
@@@ -486,18 -470,15 +475,15 @@@ u64 __kvm_call_hyp(void *hypfn, ...)
  void force_vm_exit(const cpumask_t *mask);
  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
  
- int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
-               int exception_index);
- void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                      int exception_index);
+ int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
+ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
  
  /* MMIO helpers */
  void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
  unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
  
- int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
- int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                phys_addr_t fault_ipa);
+ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu);
+ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa);
  
  int kvm_perf_init(void);
  int kvm_perf_teardown(void);
diff --combined arch/arm64/kvm/mmu.c
index 31058e6e7c2a3a42445df11705c66f2b664b1da9,838aad520f1c0a7aa608de3708d4f3754500d600..7a7ddc4558a7697a69ca42b581f11a82e02788dd
@@@ -124,38 -124,6 +124,6 @@@ static void stage2_dissolve_pud(struct 
        put_page(virt_to_page(pudp));
  }
  
- static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-                                 int min, int max)
- {
-       void *page;
-       BUG_ON(max > KVM_NR_MEM_OBJS);
-       if (cache->nobjs >= min)
-               return 0;
-       while (cache->nobjs < max) {
-               page = (void *)__get_free_page(GFP_PGTABLE_USER);
-               if (!page)
-                       return -ENOMEM;
-               cache->objects[cache->nobjs++] = page;
-       }
-       return 0;
- }
- static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
- {
-       while (mc->nobjs)
-               free_page((unsigned long)mc->objects[--mc->nobjs]);
- }
- static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
- {
-       void *p;
-       BUG_ON(!mc || !mc->nobjs);
-       p = mc->objects[--mc->nobjs];
-       return p;
- }
  static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
  {
        p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
@@@ -1132,7 -1100,7 +1100,7 @@@ static p4d_t *stage2_get_p4d(struct kv
        if (stage2_pgd_none(kvm, *pgd)) {
                if (!cache)
                        return NULL;
-               p4d = mmu_memory_cache_alloc(cache);
+               p4d = kvm_mmu_memory_cache_alloc(cache);
                stage2_pgd_populate(kvm, pgd, p4d);
                get_page(virt_to_page(pgd));
        }
@@@ -1150,7 -1118,7 +1118,7 @@@ static pud_t *stage2_get_pud(struct kv
        if (stage2_p4d_none(kvm, *p4d)) {
                if (!cache)
                        return NULL;
-               pud = mmu_memory_cache_alloc(cache);
+               pud = kvm_mmu_memory_cache_alloc(cache);
                stage2_p4d_populate(kvm, p4d, pud);
                get_page(virt_to_page(p4d));
        }
@@@ -1171,7 -1139,7 +1139,7 @@@ static pmd_t *stage2_get_pmd(struct kv
        if (stage2_pud_none(kvm, *pud)) {
                if (!cache)
                        return NULL;
-               pmd = mmu_memory_cache_alloc(cache);
+               pmd = kvm_mmu_memory_cache_alloc(cache);
                stage2_pud_populate(kvm, pud, pmd);
                get_page(virt_to_page(pud));
        }
@@@ -1326,7 -1294,7 +1294,7 @@@ static bool stage2_get_leaf_entry(struc
        return true;
  }
  
 -static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
 +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr, unsigned long sz)
  {
        pud_t *pudp;
        pmd_t *pmdp;
                return false;
  
        if (pudp)
 -              return kvm_s2pud_exec(pudp);
 +              return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
        else if (pmdp)
 -              return kvm_s2pmd_exec(pmdp);
 +              return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
        else
 -              return kvm_s2pte_exec(ptep);
 +              return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
  }
  
  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
        if (stage2_pud_none(kvm, *pud)) {
                if (!cache)
                        return 0; /* ignore calls from kvm_set_spte_hva */
-               pmd = mmu_memory_cache_alloc(cache);
+               pmd = kvm_mmu_memory_cache_alloc(cache);
                stage2_pud_populate(kvm, pud, pmd);
                get_page(virt_to_page(pud));
        }
        if (pmd_none(*pmd)) {
                if (!cache)
                        return 0; /* ignore calls from kvm_set_spte_hva */
-               pte = mmu_memory_cache_alloc(cache);
+               pte = kvm_mmu_memory_cache_alloc(cache);
                kvm_pmd_populate(pmd, pte);
                get_page(virt_to_page(pmd));
        }
@@@ -1469,7 -1437,7 +1437,7 @@@ int kvm_phys_addr_ioremap(struct kvm *k
        phys_addr_t addr, end;
        int ret = 0;
        unsigned long pfn;
-       struct kvm_mmu_memory_cache cache = { 0, };
+       struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
  
        end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
        pfn = __phys_to_pfn(pa);
                if (writable)
                        pte = kvm_s2pte_mkwrite(pte);
  
-               ret = mmu_topup_memory_cache(&cache,
-                                            kvm_mmu_cache_min_pages(kvm),
-                                            KVM_NR_MEM_OBJS);
+               ret = kvm_mmu_topup_memory_cache(&cache,
+                                                kvm_mmu_cache_min_pages(kvm));
                if (ret)
                        goto out;
                spin_lock(&kvm->mmu_lock);
        }
  
  out:
-       mmu_free_memory_cache(&cache);
+       kvm_mmu_free_memory_cache(&cache);
        return ret;
  }
  
@@@ -1882,8 -1849,7 +1849,7 @@@ static int user_mem_abort(struct kvm_vc
        mmap_read_unlock(current->mm);
  
        /* We need minimum second+third level pages */
-       ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
-                                    KVM_NR_MEM_OBJS);
+       ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
        if (ret)
                return ret;
  
         * execute permissions, and we preserve whatever we have.
         */
        needs_exec = exec_fault ||
 -              (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
 +              (fault_status == FSC_PERM &&
 +               stage2_is_exec(kvm, fault_ipa, vma_pagesize));
  
        if (vma_pagesize == PUD_SIZE) {
                pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
@@@ -2050,7 -2015,6 +2016,6 @@@ out
  /**
   * kvm_handle_guest_abort - handles all 2nd stage aborts
   * @vcpu:     the VCPU pointer
-  * @run:      the kvm_run structure
   *
   * Any abort that gets to the host is almost guaranteed to be caused by a
   * missing second stage translation table entry, which can mean that either the
   * space. The distinction is based on the IPA causing the fault and whether this
   * memory region has been registered as standard RAM by user space.
   */
- int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
  {
        unsigned long fault_status;
        phys_addr_t fault_ipa;
                 * of the page size.
                 */
                fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
-               ret = io_mem_abort(vcpu, run, fault_ipa);
+               ret = io_mem_abort(vcpu, fault_ipa);
                goto out_unlock;
        }
  
@@@ -2307,7 -2271,7 +2272,7 @@@ int kvm_test_age_hva(struct kvm *kvm, u
  
  void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
  {
-       mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
  }
  
  phys_addr_t kvm_mmu_get_httbr(void)
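
The mmu.c hunks above move arm64 onto the generic MMU memory-cache helpers that this merge unifies across architectures: kvm_mmu_topup_memory_cache(), kvm_mmu_memory_cache_alloc() and kvm_mmu_free_memory_cache() replace the arch-local variants deleted at the top of the file. A sketch of the usage pattern those call sites follow, not code from this commit (example_install_table is a made-up caller): the cache is filled in a context that may sleep, then consumed while mmu_lock is held.

    /* Illustrative only; mirrors the pattern in the hunks above. */
    static int example_install_table(struct kvm *kvm,
                                     struct kvm_mmu_memory_cache *cache)
    {
            void *table;
            int ret;

            /* May sleep: top up the per-vCPU cache before taking mmu_lock. */
            ret = kvm_mmu_topup_memory_cache(cache, kvm_mmu_cache_min_pages(kvm));
            if (ret)
                    return ret;

            spin_lock(&kvm->mmu_lock);
            table = kvm_mmu_memory_cache_alloc(cache);  /* draws from the cache */
            /* ... populate a stage-2 table entry with 'table' ... */
            spin_unlock(&kvm->mmu_lock);
            return 0;
    }

kvm_mmu_free_memory_cache() releases anything left over, as kvm_phys_addr_ioremap() and kvm_mmu_free_memory_caches() do above.
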
diff --combined arch/arm64/kvm/sys_regs.c
index d3196671c590759d8bbad6a4f6b22eb845750034,c7a856913de8c5503d72d088285081934e61125a..138961d7ebe391e2f22453ad2520cfe82fd42b52
@@@ -1024,9 -1024,9 +1024,9 @@@ static bool access_amu(struct kvm_vcpu 
  
  /* Macro to expand the AMU counter and type registers*/
  #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), access_amu }
 -#define AMU_AMEVTYPE0_EL0(n) { SYS_DESC(SYS_AMEVTYPE0_EL0(n)), access_amu }
 +#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), access_amu }
  #define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), access_amu }
 -#define AMU_AMEVTYPE1_EL0(n) { SYS_DESC(SYS_AMEVTYPE1_EL0(n)), access_amu }
 +#define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), access_amu }
  
  static bool trap_ptrauth(struct kvm_vcpu *vcpu,
                         struct sys_reg_params *p,
@@@ -1629,22 -1629,22 +1629,22 @@@ static const struct sys_reg_desc sys_re
        AMU_AMEVCNTR0_EL0(13),
        AMU_AMEVCNTR0_EL0(14),
        AMU_AMEVCNTR0_EL0(15),
 -      AMU_AMEVTYPE0_EL0(0),
 -      AMU_AMEVTYPE0_EL0(1),
 -      AMU_AMEVTYPE0_EL0(2),
 -      AMU_AMEVTYPE0_EL0(3),
 -      AMU_AMEVTYPE0_EL0(4),
 -      AMU_AMEVTYPE0_EL0(5),
 -      AMU_AMEVTYPE0_EL0(6),
 -      AMU_AMEVTYPE0_EL0(7),
 -      AMU_AMEVTYPE0_EL0(8),
 -      AMU_AMEVTYPE0_EL0(9),
 -      AMU_AMEVTYPE0_EL0(10),
 -      AMU_AMEVTYPE0_EL0(11),
 -      AMU_AMEVTYPE0_EL0(12),
 -      AMU_AMEVTYPE0_EL0(13),
 -      AMU_AMEVTYPE0_EL0(14),
 -      AMU_AMEVTYPE0_EL0(15),
 +      AMU_AMEVTYPER0_EL0(0),
 +      AMU_AMEVTYPER0_EL0(1),
 +      AMU_AMEVTYPER0_EL0(2),
 +      AMU_AMEVTYPER0_EL0(3),
 +      AMU_AMEVTYPER0_EL0(4),
 +      AMU_AMEVTYPER0_EL0(5),
 +      AMU_AMEVTYPER0_EL0(6),
 +      AMU_AMEVTYPER0_EL0(7),
 +      AMU_AMEVTYPER0_EL0(8),
 +      AMU_AMEVTYPER0_EL0(9),
 +      AMU_AMEVTYPER0_EL0(10),
 +      AMU_AMEVTYPER0_EL0(11),
 +      AMU_AMEVTYPER0_EL0(12),
 +      AMU_AMEVTYPER0_EL0(13),
 +      AMU_AMEVTYPER0_EL0(14),
 +      AMU_AMEVTYPER0_EL0(15),
        AMU_AMEVCNTR1_EL0(0),
        AMU_AMEVCNTR1_EL0(1),
        AMU_AMEVCNTR1_EL0(2),
        AMU_AMEVCNTR1_EL0(13),
        AMU_AMEVCNTR1_EL0(14),
        AMU_AMEVCNTR1_EL0(15),
 -      AMU_AMEVTYPE1_EL0(0),
 -      AMU_AMEVTYPE1_EL0(1),
 -      AMU_AMEVTYPE1_EL0(2),
 -      AMU_AMEVTYPE1_EL0(3),
 -      AMU_AMEVTYPE1_EL0(4),
 -      AMU_AMEVTYPE1_EL0(5),
 -      AMU_AMEVTYPE1_EL0(6),
 -      AMU_AMEVTYPE1_EL0(7),
 -      AMU_AMEVTYPE1_EL0(8),
 -      AMU_AMEVTYPE1_EL0(9),
 -      AMU_AMEVTYPE1_EL0(10),
 -      AMU_AMEVTYPE1_EL0(11),
 -      AMU_AMEVTYPE1_EL0(12),
 -      AMU_AMEVTYPE1_EL0(13),
 -      AMU_AMEVTYPE1_EL0(14),
 -      AMU_AMEVTYPE1_EL0(15),
 +      AMU_AMEVTYPER1_EL0(0),
 +      AMU_AMEVTYPER1_EL0(1),
 +      AMU_AMEVTYPER1_EL0(2),
 +      AMU_AMEVTYPER1_EL0(3),
 +      AMU_AMEVTYPER1_EL0(4),
 +      AMU_AMEVTYPER1_EL0(5),
 +      AMU_AMEVTYPER1_EL0(6),
 +      AMU_AMEVTYPER1_EL0(7),
 +      AMU_AMEVTYPER1_EL0(8),
 +      AMU_AMEVTYPER1_EL0(9),
 +      AMU_AMEVTYPER1_EL0(10),
 +      AMU_AMEVTYPER1_EL0(11),
 +      AMU_AMEVTYPER1_EL0(12),
 +      AMU_AMEVTYPER1_EL0(13),
 +      AMU_AMEVTYPER1_EL0(14),
 +      AMU_AMEVTYPER1_EL0(15),
  
        { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },
@@@ -2156,7 -2156,7 +2156,7 @@@ static const struct sys_reg_desc *find_
        return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
  }
  
- int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu)
  {
        kvm_inject_undefined(vcpu);
        return 1;
@@@ -2335,7 -2335,7 +2335,7 @@@ static int kvm_handle_cp_32(struct kvm_
        return 1;
  }
  
- int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu)
  {
        const struct sys_reg_desc *target_specific;
        size_t num;
                                target_specific, num);
  }
  
- int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu)
  {
        const struct sys_reg_desc *target_specific;
        size_t num;
                                target_specific, num);
  }
  
- int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu)
  {
        return kvm_handle_cp_64(vcpu,
                                cp14_64_regs, ARRAY_SIZE(cp14_64_regs),
                                NULL, 0);
  }
  
- int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp14_32(struct kvm_vcpu *vcpu)
  {
        return kvm_handle_cp_32(vcpu,
                                cp14_regs, ARRAY_SIZE(cp14_regs),
@@@ -2416,9 -2416,8 +2416,8 @@@ static void reset_sys_reg_descs(struct 
  /**
   * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access
   * @vcpu: The VCPU pointer
-  * @run:  The kvm_run struct
   */
- int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
  {
        struct sys_reg_params params;
        unsigned long esr = kvm_vcpu_get_hsr(vcpu);
diff --combined arch/mips/Kconfig
index dd36cba078bc81d401fd23eb5bee710c057bbfdd,2efc34ed94ebeb69abaa875839457666cdca1b1b..c95fa3a2484cf056c9dc4238980acd896ef13ca6
@@@ -51,6 -51,7 +51,6 @@@ config MIP
        select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS
        select HAVE_CONTEXT_TRACKING
        select HAVE_TIF_NOHZ
 -      select HAVE_COPY_THREAD_TLS
        select HAVE_C_RECORDMCOUNT
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DEBUG_STACKOVERFLOW
@@@ -366,7 -367,6 +366,7 @@@ config MACH_JAZ
        select ARC_PROMLIB
        select ARCH_MIGHT_HAVE_PC_PARPORT
        select ARCH_MIGHT_HAVE_PC_SERIO
 +      select DMA_OPS
        select FW_ARC
        select FW_ARC32
        select ARCH_MAY_HAVE_PC_FDC
@@@ -478,7 -478,6 +478,7 @@@ config MACH_LOONGSON6
        select COMMON_CLK
        select USE_OF
        select BUILTIN_DTB
 +      select PCI_HOST_GENERIC
        help
          This enables the support of Loongson-2/3 family of machines.
  
@@@ -679,7 -678,6 +679,7 @@@ config SGI_IP2
        select SYS_SUPPORTS_NUMA
        select SYS_SUPPORTS_SMP
        select MIPS_L1_CACHE_SHIFT_7
 +      select NUMA
        help
          This are the SGI Origin 200, Origin 2000 and Onyx 2 Graphics
          workstations.  To compile a Linux kernel that runs on these, say Y
@@@ -1015,6 -1013,24 +1015,6 @@@ config NLM_XLP_BOAR
          This board is based on Netlogic XLP Processor.
          Say Y here if you have a XLP based board.
  
 -config MIPS_PARAVIRT
 -      bool "Para-Virtualized guest system"
 -      select CEVT_R4K
 -      select CSRC_R4K
 -      select SYS_SUPPORTS_64BIT_KERNEL
 -      select SYS_SUPPORTS_32BIT_KERNEL
 -      select SYS_SUPPORTS_BIG_ENDIAN
 -      select SYS_SUPPORTS_SMP
 -      select NR_CPUS_DEFAULT_4
 -      select SYS_HAS_EARLY_PRINTK
 -      select SYS_HAS_CPU_MIPS32_R2
 -      select SYS_HAS_CPU_MIPS64_R2
 -      select SYS_HAS_CPU_CAVIUM_OCTEON
 -      select HAVE_PCI
 -      select SWAP_IO_SPACE
 -      help
 -        This option supports guest running under ????
 -
  endchoice
  
  source "arch/mips/alchemy/Kconfig"
@@@ -1039,6 -1055,7 +1039,6 @@@ source "arch/mips/loongson2ef/Kconfig
  source "arch/mips/loongson32/Kconfig"
  source "arch/mips/loongson64/Kconfig"
  source "arch/mips/netlogic/Kconfig"
 -source "arch/mips/paravirt/Kconfig"
  
  endmenu
  
@@@ -1152,6 -1169,9 +1152,6 @@@ config MIPS_MS
  config SYNC_R4K
        bool
  
 -config MIPS_MACHINE
 -      def_bool n
 -
  config NO_IOPORT_MAP
        def_bool n
  
@@@ -2182,6 -2202,7 +2182,7 @@@ endchoic
  
  config KVM_GUEST
        bool "KVM Guest Kernel"
+       depends on CPU_MIPS32_R2
        depends on BROKEN_ON_SMP
        help
          Select this option if building a guest kernel for KVM (Trap & Emulate)
@@@ -2805,7 -2826,7 +2806,7 @@@ config SM
          Y to "Enhanced Real Time Clock Support", below.
  
          See also the SMP-HOWTO available at
 -        <http://www.tldp.org/docs.html#howto>.
 +        <https://www.tldp.org/docs.html#howto>.
  
          If you don't know what to do here, say N.
  
diff --combined arch/mips/kvm/emulate.c
index d242300cacc04371038bf59d954ccc026a59c3dc,8018e92ffd4b7251c2ad96c2745958e6dbc4e4b0..703782355318f1e2209c7a9ce08d8e7135f6a2f0
@@@ -1262,7 -1262,6 +1262,6 @@@ unsigned int kvm_mips_config5_wrmask(st
  
  enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
                                           u32 *opc, u32 cause,
-                                          struct kvm_run *run,
                                           struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
@@@ -1597,12 -1596,12 +1596,12 @@@ dont_update_pc
  
  enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
                                             u32 cause,
-                                            struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
  {
        int r;
        enum emulation_result er;
        u32 rt;
+       struct kvm_run *run = vcpu->run;
        void *data = run->mmio.data;
        unsigned int imme;
        unsigned long curr_pc;
                          vcpu->arch.gprs[rt], *(u32 *)data);
                break;
  
 +#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
        case sdl_op:
                run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
                                        vcpu->arch.host_cp0_badvaddr) & (~0x7);
                          vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
                          vcpu->arch.gprs[rt], *(u64 *)data);
                break;
 +#endif
  
  #ifdef CONFIG_CPU_LOONGSON64
        case sdc2_op:
                                  vcpu->arch.gprs[rt], *(u64 *)data);
                        break;
                default:
-                       kvm_err("Godson Exteneded GS-Store not yet supported (inst=0x%08x)\n",
+                       kvm_err("Godson Extended GS-Store not yet supported (inst=0x%08x)\n",
                                inst.word);
                        break;
                }
@@@ -1896,9 -1893,9 +1895,9 @@@ out_fail
  }
  
  enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
-                                           u32 cause, struct kvm_run *run,
-                                           struct kvm_vcpu *vcpu)
+                                           u32 cause, struct kvm_vcpu *vcpu)
  {
+       struct kvm_run *run = vcpu->run;
        int r;
        enum emulation_result er;
        unsigned long curr_pc;
                }
                break;
  
 +#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
        case ldl_op:
                run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
                                        vcpu->arch.host_cp0_badvaddr) & (~0x7);
                        break;
                }
                break;
 +#endif
  
  #ifdef CONFIG_CPU_LOONGSON64
        case ldc2_op:
                        vcpu->mmio_needed = 30; /* signed */
                        break;
                default:
-                       kvm_err("Godson Exteneded GS-Load for float not yet supported (inst=0x%08x)\n",
+                       kvm_err("Godson Extended GS-Load for float not yet supported (inst=0x%08x)\n",
                                inst.word);
                        break;
                }
                        run->mmio.phys_addr, run->mmio.len, run->mmio.data);
  
        if (!r) {
-               kvm_mips_complete_mmio_load(vcpu, run);
+               kvm_mips_complete_mmio_load(vcpu);
                vcpu->mmio_needed = 0;
                return EMULATE_DONE;
        }
  static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long),
                                                     unsigned long curr_pc,
                                                     unsigned long addr,
-                                                    struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu,
                                                     u32 cause)
  {
                        /* no matching guest TLB */
                        vcpu->arch.host_cp0_badvaddr = addr;
                        vcpu->arch.pc = curr_pc;
-                       kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu);
+                       kvm_mips_emulate_tlbmiss_ld(cause, NULL, vcpu);
                        return EMULATE_EXCEPT;
                case KVM_MIPS_TLBINV:
                        /* invalid matching guest TLB */
                        vcpu->arch.host_cp0_badvaddr = addr;
                        vcpu->arch.pc = curr_pc;
-                       kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu);
+                       kvm_mips_emulate_tlbinv_ld(cause, NULL, vcpu);
                        return EMULATE_EXCEPT;
                default:
                        break;
  
  enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
                                             u32 *opc, u32 cause,
-                                            struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
  {
        enum emulation_result er = EMULATE_DONE;
                 * guest's behalf.
                 */
                er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
-                                            curr_pc, va, run, vcpu, cause);
+                                            curr_pc, va, vcpu, cause);
                if (er != EMULATE_DONE)
                        goto done;
  #ifdef CONFIG_KVM_MIPS_DYN_TRANS
        } else if (op_inst == Hit_Invalidate_I) {
                /* Perform the icache synchronisation on the guest's behalf */
                er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
-                                            curr_pc, va, run, vcpu, cause);
+                                            curr_pc, va, vcpu, cause);
                if (er != EMULATE_DONE)
                        goto done;
                er = kvm_mips_guest_cache_op(protected_flush_icache_line,
-                                            curr_pc, va, run, vcpu, cause);
+                                            curr_pc, va, vcpu, cause);
                if (er != EMULATE_DONE)
                        goto done;
  
@@@ -2317,7 -2310,6 +2314,6 @@@ done
  }
  
  enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
-                                           struct kvm_run *run,
                                            struct kvm_vcpu *vcpu)
  {
        union mips_instruction inst;
  
        switch (inst.r_format.opcode) {
        case cop0_op:
-               er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
+               er = kvm_mips_emulate_CP0(inst, opc, cause, vcpu);
                break;
  
  #ifndef CONFIG_CPU_MIPSR6
        case cache_op:
                ++vcpu->stat.cache_exits;
                trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
-               er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu);
+               er = kvm_mips_emulate_cache(inst, opc, cause, vcpu);
                break;
  #else
        case spec3_op:
                case cache6_op:
                        ++vcpu->stat.cache_exits;
                        trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
-                       er = kvm_mips_emulate_cache(inst, opc, cause, run,
+                       er = kvm_mips_emulate_cache(inst, opc, cause,
                                                    vcpu);
                        break;
                default:
@@@ -2388,7 -2380,6 +2384,6 @@@ long kvm_mips_guest_exception_base(stru
  
  enum emulation_result kvm_mips_emulate_syscall(u32 cause,
                                               u32 *opc,
-                                              struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
                                                  u32 *opc,
-                                                 struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
                                                 u32 *opc,
-                                                struct kvm_run *run,
                                                 struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
                                                  u32 *opc,
-                                                 struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
                                                 u32 *opc,
-                                                struct kvm_run *run,
                                                 struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
                                              u32 *opc,
-                                             struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
                                               u32 *opc,
-                                              struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
                                              u32 *opc,
-                                             struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
                                              u32 *opc,
-                                             struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
                                                u32 *opc,
-                                               struct kvm_run *run,
                                                struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
                                                  u32 *opc,
-                                                 struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
                                               u32 *opc,
-                                              struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
                                                  u32 *opc,
-                                                 struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  }
  
  enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
-                                        struct kvm_run *run,
                                         struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
@@@ -2959,12 -2937,12 +2941,12 @@@ emulate_ri
         * branch target), and pass the RI exception to the guest OS.
         */
        vcpu->arch.pc = curr_pc;
-       return kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
+       return kvm_mips_emulate_ri_exc(cause, opc, vcpu);
  }
  
- enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
-                                                 struct kvm_run *run)
+ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu)
  {
+       struct kvm_run *run = vcpu->run;
        unsigned long *gpr = &vcpu->arch.gprs[vcpu->arch.io_gpr];
        enum emulation_result er = EMULATE_DONE;
  
@@@ -3107,7 -3085,6 +3089,6 @@@ done
  
  static enum emulation_result kvm_mips_emulate_exc(u32 cause,
                                                  u32 *opc,
-                                                 struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
  {
        u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
  
  enum emulation_result kvm_mips_check_privilege(u32 cause,
                                               u32 *opc,
-                                              struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
  {
        enum emulation_result er = EMULATE_DONE;
        }
  
        if (er == EMULATE_PRIV_FAIL)
-               kvm_mips_emulate_exc(cause, opc, run, vcpu);
+               kvm_mips_emulate_exc(cause, opc, vcpu);
  
        return er;
  }
   */
  enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
                                              u32 *opc,
-                                             struct kvm_run *run,
                                              struct kvm_vcpu *vcpu,
                                              bool write_fault)
  {
                       KVM_ENTRYHI_ASID));
        if (index < 0) {
                if (exccode == EXCCODE_TLBL) {
-                       er = kvm_mips_emulate_tlbmiss_ld(cause, opc, run, vcpu);
+                       er = kvm_mips_emulate_tlbmiss_ld(cause, opc, vcpu);
                } else if (exccode == EXCCODE_TLBS) {
-                       er = kvm_mips_emulate_tlbmiss_st(cause, opc, run, vcpu);
+                       er = kvm_mips_emulate_tlbmiss_st(cause, opc, vcpu);
                } else {
                        kvm_err("%s: invalid exc code: %d\n", __func__,
                                exccode);
                 */
                if (!TLB_IS_VALID(*tlb, va)) {
                        if (exccode == EXCCODE_TLBL) {
-                               er = kvm_mips_emulate_tlbinv_ld(cause, opc, run,
+                               er = kvm_mips_emulate_tlbinv_ld(cause, opc,
                                                                vcpu);
                        } else if (exccode == EXCCODE_TLBS) {
-                               er = kvm_mips_emulate_tlbinv_st(cause, opc, run,
+                               er = kvm_mips_emulate_tlbinv_st(cause, opc,
                                                                vcpu);
                        } else {
                                kvm_err("%s: invalid exc code: %d\n", __func__,
diff --combined arch/mips/kvm/vz.c
index 9d03bd0a604acbdb26d44c7924e090a5387b3f63,9e58c479ee202ea5b8d751689bc0d19e4fb4860c..3932f767e93867e0358b33717554143c0c35b8d5
@@@ -129,7 -129,7 +129,7 @@@ static inline unsigned int kvm_vz_confi
  
  static inline unsigned int kvm_vz_config6_guest_wrmask(struct kvm_vcpu *vcpu)
  {
 -      return MIPS_CONF6_LOONGSON_INTIMER | MIPS_CONF6_LOONGSON_EXTIMER;
 +      return LOONGSON_CONF6_INTIMER | LOONGSON_CONF6_EXTIMER;
  }
  
  /*
@@@ -189,7 -189,7 +189,7 @@@ static inline unsigned int kvm_vz_confi
  static inline unsigned int kvm_vz_config6_user_wrmask(struct kvm_vcpu *vcpu)
  {
        return kvm_vz_config6_guest_wrmask(vcpu) |
 -              MIPS_CONF6_LOONGSON_SFBEN | MIPS_CONF6_LOONGSON_FTLBDIS;
 +              LOONGSON_CONF6_SFBEN | LOONGSON_CONF6_FTLBDIS;
  }
  
  static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva)
@@@ -874,7 -874,6 +874,6 @@@ static void kvm_write_maari(struct kvm_
  
  static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
                                              u32 *opc, u32 cause,
-                                             struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
  {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
  
  static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst,
                                               u32 *opc, u32 cause,
-                                              struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
  {
        enum emulation_result er = EMULATE_DONE;
@@@ -1217,7 -1215,6 +1215,6 @@@ static enum emulation_result kvm_trap_v
  {
        enum emulation_result er = EMULATE_DONE;
        struct kvm_vcpu_arch *arch = &vcpu->arch;
-       struct kvm_run *run = vcpu->run;
        union mips_instruction inst;
        int rd, rt, sel;
        int err;
  
        switch (inst.r_format.opcode) {
        case cop0_op:
-               er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu);
+               er = kvm_vz_gpsi_cop0(inst, opc, cause, vcpu);
                break;
  #ifndef CONFIG_CPU_MIPSR6
        case cache_op:
                trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
-               er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+               er = kvm_vz_gpsi_cache(inst, opc, cause, vcpu);
                break;
  #endif
  #ifdef CONFIG_CPU_LOONGSON64
  #ifdef CONFIG_CPU_MIPSR6
                case cache6_op:
                        trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
-                       er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+                       er = kvm_vz_gpsi_cache(inst, opc, cause, vcpu);
                        break;
  #endif
                case rdhwr_op:
@@@ -1553,7 -1550,6 +1550,6 @@@ static int kvm_trap_vz_handle_guest_exi
   */
  static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu)
  {
-       struct kvm_run *run = vcpu->run;
        u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_FAIL;
        int ret = RESUME_GUEST;
                break;
  
        case EMULATE_FAIL:
-               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                ret = RESUME_HOST;
                break;
  
   */
  static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu)
  {
-       struct kvm_run *run = vcpu->run;
        /*
         * If MSA not present or not exposed to guest or FR=0, the MSA operation
         * should have been treated as a reserved instruction!
            (read_gc0_status() & (ST0_CU1 | ST0_FR)) == ST0_CU1 ||
            !(read_gc0_config5() & MIPS_CONF5_MSAEN) ||
            vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
-               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                return RESUME_HOST;
        }
  
@@@ -1648,7 -1642,7 +1642,7 @@@ static int kvm_trap_vz_handle_tlb_ld_mi
                }
  
                /* Treat as MMIO */
-               er = kvm_mips_emulate_load(inst, cause, run, vcpu);
+               er = kvm_mips_emulate_load(inst, cause, vcpu);
                if (er == EMULATE_FAIL) {
                        kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n",
                                opc, badvaddr);
@@@ -1695,7 -1689,7 +1689,7 @@@ static int kvm_trap_vz_handle_tlb_st_mi
                }
  
                /* Treat as MMIO */
-               er = kvm_mips_emulate_store(inst, cause, run, vcpu);
+               er = kvm_mips_emulate_store(inst, cause, vcpu);
                if (er == EMULATE_FAIL) {
                        kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n",
                                opc, badvaddr);
@@@ -3242,7 -3236,7 +3236,7 @@@ static void kvm_vz_flush_shadow_memslot
        kvm_vz_flush_shadow_all(kvm);
  }
  
- static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu)
+ static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu)
  {
        int cpu = smp_processor_id();
        int preserve_guest_tlb;
                kvm_vz_vcpu_load_wired(vcpu);
  }
  
- static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+ static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu)
  {
        int cpu = smp_processor_id();
        int r;
        kvm_vz_vcpu_load_tlb(vcpu, cpu);
        kvm_vz_vcpu_load_wired(vcpu);
  
-       r = vcpu->arch.vcpu_run(run, vcpu);
+       r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
  
        kvm_vz_vcpu_save_wired(vcpu);
  
diff --combined arch/s390/kernel/setup.c
index 0c4194d407aca255e86af4f0df1f10a8c3ead271,878cacfc9c3ea9b98651c14ca2da967a6ba60788..e600f6953d7ceff50264f79e1f99d8d3a163c665
@@@ -1021,8 -1021,7 +1021,7 @@@ static void __init setup_control_progra
  {
        union diag318_info diag318_info = {
                .cpnc = CPNC_LINUX,
-               .cpvc_linux = 0,
-               .cpvc_distro = {0},
+               .cpvc = 0,
        };
  
        if (!sclp.has_diag318)
@@@ -1100,7 -1099,6 +1099,7 @@@ void __init setup_arch(char **cmdline_p
        if (IS_ENABLED(CONFIG_EXPOLINE_AUTO))
                nospec_auto_detect();
  
 +      jump_label_init();
        parse_early_param();
  #ifdef CONFIG_CRASH_DUMP
        /* Deactivate elfcorehdr= kernel parameter */
        free_mem_detect_info();
        remove_oldmem();
  
 -      /*
 -       * Make sure all chunks are MAX_ORDER aligned so we don't need the
 -       * extra checks that HOLES_IN_ZONE would require.
 -       *
 -       * Is this still required?
 -       */
 -      memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT));
 -
        if (is_prot_virt_host())
                setup_uv();
        setup_memory_end();
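
The diag318_info initializer above collapses the cpvc_linux/cpvc_distro members into a single cpvc field. A standalone sketch of the union-with-bitfields idiom behind it; the field names echo the diff, but the 8/56 bit split is an assumption made purely for illustration:

#include <stdio.h>

/* Hypothetical layout mirroring the union-of-bitfields idiom. */
union info {
        unsigned long val;
        struct {
                unsigned long cpnc : 8;   /* control program name code */
                unsigned long cpvc : 56;  /* control program version code */
        };
};

int main(void)
{
        union info i = { .cpnc = 0x4, .cpvc = 0 };

        printf("raw value: %#lx\n", i.val);
        return 0;
}
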
diff --combined arch/x86/Kconfig
index fd03cefabd34de8b2327893852a8325f9f7e1663,aba928581226203bd53633ec06fe2f9417669fc5..9a2849527dd753dca5e5404bb04edac377d025f5
@@@ -67,7 -67,7 +67,7 @@@ config X8
        select ARCH_HAS_FILTER_PGPROT
        select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
 -      select ARCH_HAS_KCOV                    if X86_64
 +      select ARCH_HAS_KCOV                    if X86_64 && STACK_VALIDATION
        select ARCH_HAS_MEM_ENCRYPT
        select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        select GENERIC_CPU_AUTOPROBE
        select GENERIC_CPU_VULNERABILITIES
        select GENERIC_EARLY_IOREMAP
 +      select GENERIC_ENTRY
        select GENERIC_FIND_FIRST_BIT
        select GENERIC_IOMAP
        select GENERIC_IRQ_EFFECTIVE_AFF_MASK   if SMP
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
        select HAVE_CONTEXT_TRACKING            if X86_64
 -      select HAVE_COPY_THREAD_TLS
        select HAVE_C_RECORDMCOUNT
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DMA_CONTIGUOUS
        select HAVE_KERNEL_LZMA
        select HAVE_KERNEL_LZO
        select HAVE_KERNEL_XZ
 +      select HAVE_KERNEL_ZSTD
        select HAVE_KPROBES
        select HAVE_KPROBES_ON_FTRACE
        select HAVE_FUNCTION_ERROR_INJECTION
@@@ -803,6 -802,7 +803,7 @@@ config KVM_GUES
        depends on PARAVIRT
        select PARAVIRT_CLOCK
        select ARCH_CPUIDLE_HALTPOLL
+       select X86_HV_CALLBACK_VECTOR
        default y
        help
          This option enables various optimizations for running under the KVM
@@@ -910,7 -910,6 +911,7 @@@ config DM
  
  config GART_IOMMU
        bool "Old AMD GART IOMMU support"
 +      select DMA_OPS
        select IOMMU_HELPER
        select SWIOTLB
        depends on X86_64 && PCI && AMD_NB
@@@ -1294,6 -1293,7 +1295,6 @@@ config MICROCOD
        bool "CPU microcode loading support"
        default y
        depends on CPU_SUP_AMD || CPU_SUP_INTEL
 -      select FW_LOADER
        help
          If you say Y here, you will be able to update the microcode on
          Intel and AMD processors. The Intel support is for the IA32 family,
@@@ -1315,6 -1315,7 +1316,6 @@@ config MICROCODE_INTE
        bool "Intel microcode loading support"
        depends on MICROCODE
        default MICROCODE
 -      select FW_LOADER
        help
          This options enables microcode patch loading support for Intel
          processors.
  config MICROCODE_AMD
        bool "AMD microcode loading support"
        depends on MICROCODE
 -      select FW_LOADER
        help
          If you select this option, microcode patch loading support for AMD
          processors will be enabled.
diff --combined arch/x86/include/asm/idtentry.h
index ff198fc2495ee5a48dfd8d0b062c305279dc9dec,cecc603eed3599cb39ac86d54b0d7493a681cee1..a4336619121208135c18fad12be42d90ba9d603e
@@@ -6,13 -6,15 +6,13 @@@
  #include <asm/trapnr.h>
  
  #ifndef __ASSEMBLY__
 +#include <linux/entry-common.h>
  #include <linux/hardirq.h>
  
  #include <asm/irq_stack.h>
  
 -void idtentry_enter_user(struct pt_regs *regs);
 -void idtentry_exit_user(struct pt_regs *regs);
 -
 -bool idtentry_enter_cond_rcu(struct pt_regs *regs);
 -void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit);
 +bool idtentry_enter_nmi(struct pt_regs *regs);
 +void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state);
  
  /**
   * DECLARE_IDTENTRY - Declare functions for simple IDT entry points
@@@ -43,8 -45,8 +43,8 @@@
   * The macro is written so it acts as function definition. Append the
   * body with a pair of curly brackets.
   *
 - * idtentry_enter() contains common code which has to be invoked before
 - * arbitrary code in the body. idtentry_exit() contains common code
 + * irqentry_enter() contains common code which has to be invoked before
 + * arbitrary code in the body. irqentry_exit() contains common code
   * which has to run before returning to the low level assembly code.
   */
  #define DEFINE_IDTENTRY(func)                                         \
@@@ -52,12 -54,12 +52,12 @@@ static __always_inline void __##func(st
                                                                        \
  __visible noinstr void func(struct pt_regs *regs)                     \
  {                                                                     \
 -      bool rcu_exit = idtentry_enter_cond_rcu(regs);                  \
 +      irqentry_state_t state = irqentry_enter(regs);                  \
                                                                        \
        instrumentation_begin();                                        \
        __##func (regs);                                                \
        instrumentation_end();                                          \
 -      idtentry_exit_cond_rcu(regs, rcu_exit);                         \
 +      irqentry_exit(regs, state);                                     \
  }                                                                     \
                                                                        \
  static __always_inline void __##func(struct pt_regs *regs)
@@@ -99,12 -101,12 +99,12 @@@ static __always_inline void __##func(st
  __visible noinstr void func(struct pt_regs *regs,                     \
                            unsigned long error_code)                   \
  {                                                                     \
 -      bool rcu_exit = idtentry_enter_cond_rcu(regs);                  \
 +      irqentry_state_t state = irqentry_enter(regs);                  \
                                                                        \
        instrumentation_begin();                                        \
        __##func (regs, error_code);                                    \
        instrumentation_end();                                          \
 -      idtentry_exit_cond_rcu(regs, rcu_exit);                         \
 +      irqentry_exit(regs, state);                                     \
  }                                                                     \
                                                                        \
  static __always_inline void __##func(struct pt_regs *regs,            \
@@@ -159,7 -161,7 +159,7 @@@ __visible noinstr void func(struct pt_r
   * body with a pair of curly brackets.
   *
   * Contrary to DEFINE_IDTENTRY_ERRORCODE() this does not invoke the
 - * idtentry_enter/exit() helpers before and after the body invocation. This
 + * irqentry_enter/exit() helpers before and after the body invocation. This
   * needs to be done in the body itself if applicable. Use if extra work
   * is required before the enter/exit() helpers are invoked.
   */
@@@ -185,9 -187,11 +185,9 @@@ __visible noinstr void func(struct pt_r
   * to the function as error_code argument which needs to be truncated
   * to an u8 because the push is sign extending.
   *
 - * On 64-bit idtentry_enter/exit() are invoked in the ASM entry code before
 - * and after switching to the interrupt stack. On 32-bit this happens in C.
 - *
   * irq_enter/exit_rcu() are invoked before the function body and the
 - * KVM L1D flush request is set.
 + * KVM L1D flush request is set. Stack switching to the interrupt stack
 + * has to be done in the function body if necessary.
   */
  #define DEFINE_IDTENTRY_IRQ(func)                                     \
  static __always_inline void __##func(struct pt_regs *regs, u8 vector);        \
  __visible noinstr void func(struct pt_regs *regs,                     \
                            unsigned long error_code)                   \
  {                                                                     \
 -      bool rcu_exit = idtentry_enter_cond_rcu(regs);                  \
 +      irqentry_state_t state = irqentry_enter(regs);                  \
                                                                        \
        instrumentation_begin();                                        \
        irq_enter_rcu();                                                \
        __##func (regs, (u8)error_code);                                \
        irq_exit_rcu();                                                 \
        instrumentation_end();                                          \
 -      idtentry_exit_cond_rcu(regs, rcu_exit);                         \
 +      irqentry_exit(regs, state);                                     \
  }                                                                     \
                                                                        \
  static __always_inline void __##func(struct pt_regs *regs, u8 vector)
   * DEFINE_IDTENTRY_SYSVEC - Emit code for system vector IDT entry points
   * @func:     Function name of the entry point
   *
 - * idtentry_enter/exit() and irq_enter/exit_rcu() are invoked before the
 + * irqentry_enter/exit() and irq_enter/exit_rcu() are invoked before the
   * function body. KVM L1D flush request is set.
   *
   * Runs the function on the interrupt stack if the entry hit kernel mode
@@@ -237,7 -241,7 +237,7 @@@ static void __##func(struct pt_regs *re
                                                                        \
  __visible noinstr void func(struct pt_regs *regs)                     \
  {                                                                     \
 -      bool rcu_exit = idtentry_enter_cond_rcu(regs);                  \
 +      irqentry_state_t state = irqentry_enter(regs);                  \
                                                                        \
        instrumentation_begin();                                        \
        irq_enter_rcu();                                                \
        run_on_irqstack_cond(__##func, regs, regs);                     \
        irq_exit_rcu();                                                 \
        instrumentation_end();                                          \
 -      idtentry_exit_cond_rcu(regs, rcu_exit);                         \
 +      irqentry_exit(regs, state);                                     \
  }                                                                     \
                                                                        \
  static noinline void __##func(struct pt_regs *regs)
@@@ -266,7 -270,7 +266,7 @@@ static __always_inline void __##func(st
                                                                        \
  __visible noinstr void func(struct pt_regs *regs)                     \
  {                                                                     \
 -      bool rcu_exit = idtentry_enter_cond_rcu(regs);                  \
 +      irqentry_state_t state = irqentry_enter(regs);                  \
                                                                        \
        instrumentation_begin();                                        \
        __irq_enter_raw();                                              \
        __##func (regs);                                                \
        __irq_exit_raw();                                               \
        instrumentation_end();                                          \
 -      idtentry_exit_cond_rcu(regs, rcu_exit);                         \
 +      irqentry_exit(regs, state);                                     \
  }                                                                     \
                                                                        \
  static __always_inline void __##func(struct pt_regs *regs)
  
  #else /* CONFIG_X86_64 */
  
 -/* Maps to a regular IDTENTRY on 32bit for now */
 -# define DECLARE_IDTENTRY_IST         DECLARE_IDTENTRY
 -# define DEFINE_IDTENTRY_IST          DEFINE_IDTENTRY
 -
  /**
   * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant
   * @vector:   Vector number (ignored for C)
@@@ -379,18 -387,28 +379,18 @@@ __visible noinstr void func(struct pt_r
  #endif        /* !CONFIG_X86_64 */
  
  /* C-Code mapping */
 +#define DECLARE_IDTENTRY_NMI          DECLARE_IDTENTRY_RAW
 +#define DEFINE_IDTENTRY_NMI           DEFINE_IDTENTRY_RAW
 +
 +#ifdef CONFIG_X86_64
  #define DECLARE_IDTENTRY_MCE          DECLARE_IDTENTRY_IST
  #define DEFINE_IDTENTRY_MCE           DEFINE_IDTENTRY_IST
  #define DEFINE_IDTENTRY_MCE_USER      DEFINE_IDTENTRY_NOIST
  
 -#define DECLARE_IDTENTRY_NMI          DECLARE_IDTENTRY_RAW
 -#define DEFINE_IDTENTRY_NMI           DEFINE_IDTENTRY_RAW
 -
  #define DECLARE_IDTENTRY_DEBUG                DECLARE_IDTENTRY_IST
  #define DEFINE_IDTENTRY_DEBUG         DEFINE_IDTENTRY_IST
  #define DEFINE_IDTENTRY_DEBUG_USER    DEFINE_IDTENTRY_NOIST
 -
 -/**
 - * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points
 - * @vector:   Vector number (ignored for C)
 - * @func:     Function name of the entry point
 - *
 - * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM
 - * indirection magic.
 - */
 -#define DECLARE_IDTENTRY_XEN(vector, func)                            \
 -      asmlinkage void xen_asm_exc_xen##func(void);                    \
 -      asmlinkage void asm_exc_xen##func(void)
 +#endif
  
  #else /* !__ASSEMBLY__ */
  
  # define DECLARE_IDTENTRY_MCE(vector, func)                           \
        DECLARE_IDTENTRY(vector, func)
  
 -# define DECLARE_IDTENTRY_DEBUG(vector, func)                         \
 -      DECLARE_IDTENTRY(vector, func)
 -
  /* No ASM emitted for DF as this goes through a C shim */
  # define DECLARE_IDTENTRY_DF(vector, func)
  
  /* No ASM code emitted for NMI */
  #define DECLARE_IDTENTRY_NMI(vector, func)
  
 -/* XEN NMI and DB wrapper */
 -#define DECLARE_IDTENTRY_XEN(vector, func)                            \
 -      idtentry vector asm_exc_xen##func exc_##func has_error_code=0
 -
  /*
   * ASM code to emit the common vector entry stubs where each stub is
   * packed into 8 bytes.
        .align 8
  SYM_CODE_START(irq_entries_start)
      vector=FIRST_EXTERNAL_VECTOR
 -    pos = .
      .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
        UNWIND_HINT_IRET_REGS
 +0 :
        .byte   0x6a, vector
        jmp     asm_common_interrupt
        nop
        /* Ensure that the above is 8 bytes max */
 -      . = pos + 8
 -    pos=pos+8
 -    vector=vector+1
 +      . = 0b + 8
 +      vector = vector+1
      .endr
  SYM_CODE_END(irq_entries_start)
  
        .align 8
  SYM_CODE_START(spurious_entries_start)
      vector=FIRST_SYSTEM_VECTOR
 -    pos = .
      .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
        UNWIND_HINT_IRET_REGS
 +0 :
        .byte   0x6a, vector
        jmp     asm_spurious_interrupt
        nop
        /* Ensure that the above is 8 bytes max */
 -      . = pos + 8
 -    pos=pos+8
 -    vector=vector+1
 +      . = 0b + 8
 +      vector = vector+1
      .endr
  SYM_CODE_END(spurious_entries_start)
  #endif
@@@ -538,28 -565,16 +538,28 @@@ DECLARE_IDTENTRY_RAW(X86_TRAP_BP,               exc_
  DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF,   exc_page_fault);
  
  #ifdef CONFIG_X86_MCE
 +#ifdef CONFIG_X86_64
  DECLARE_IDTENTRY_MCE(X86_TRAP_MC,     exc_machine_check);
 +#else
 +DECLARE_IDTENTRY_RAW(X86_TRAP_MC,     exc_machine_check);
 +#endif
  #endif
  
  /* NMI */
  DECLARE_IDTENTRY_NMI(X86_TRAP_NMI,    exc_nmi);
 -DECLARE_IDTENTRY_XEN(X86_TRAP_NMI,    nmi);
 +#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64)
 +DECLARE_IDTENTRY_RAW(X86_TRAP_NMI,    xenpv_exc_nmi);
 +#endif
  
  /* #DB */
 +#ifdef CONFIG_X86_64
  DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB,   exc_debug);
 -DECLARE_IDTENTRY_XEN(X86_TRAP_DB,     debug);
 +#else
 +DECLARE_IDTENTRY_RAW(X86_TRAP_DB,     exc_debug);
 +#endif
 +#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64)
 +DECLARE_IDTENTRY_RAW(X86_TRAP_DB,     xenpv_exc_debug);
 +#endif
  
  /* #DF */
  DECLARE_IDTENTRY_DF(X86_TRAP_DF,      exc_double_fault);
@@@ -620,8 -635,8 +620,8 @@@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NES
  
  #if IS_ENABLED(CONFIG_HYPERV)
  DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR,   sysvec_hyperv_callback);
 -DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_REENLIGHTENMENT_VECTOR,    sysvec_hyperv_reenlightenment);
 -DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_STIMER0_VECTOR,    sysvec_hyperv_stimer0);
 +DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR,        sysvec_hyperv_reenlightenment);
 +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR,        sysvec_hyperv_stimer0);
  #endif
  
  #if IS_ENABLED(CONFIG_ACRN_GUEST)
@@@ -632,6 -647,10 +632,10 @@@ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALL
  DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR,   sysvec_xen_hvm_callback);
  #endif
  
+ #ifdef CONFIG_KVM_GUEST
+ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR,   sysvec_kvm_asyncpf_interrupt);
+ #endif
  #undef X86_TRAP_OTHER
  
  #endif
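
Every DEFINE_IDTENTRY* variant above follows one shape: a visible outer entry function that brackets a token-pasted inner __##func body with enter/exit bookkeeping (irqentry_enter()/irqentry_exit() after this merge). A standalone sketch of that macro pattern, with toy enter/exit hooks standing in for the kernel's:

#include <stdio.h>

struct regs { int dummy; };
typedef int entry_state_t;

static entry_state_t entry_enter(struct regs *regs)
{
        (void)regs;
        puts("enter");
        return 1;
}

static void entry_exit(struct regs *regs, entry_state_t state)
{
        (void)regs;
        printf("exit (state=%d)\n", state);
}

/* Outer wrapper + inner body, generated from a single macro invocation. */
#define DEFINE_ENTRY(func)                                \
static void __##func(struct regs *regs);                  \
                                                          \
void func(struct regs *regs)                              \
{                                                         \
        entry_state_t state = entry_enter(regs);          \
                                                          \
        __##func(regs);                                   \
        entry_exit(regs, state);                          \
}                                                         \
                                                          \
static void __##func(struct regs *regs)

DEFINE_ENTRY(exc_example)
{
        (void)regs;
        puts("body of exc_example");
}

int main(void)
{
        struct regs r = { 0 };

        exc_example(&r);
        return 0;
}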
diff --combined arch/x86/kernel/kvm.c
index 233c77d056c9f38cad32d696d08499cfc02d5aa1,d9995931ea18458a0bdd525ae9c6e8613fc2fda0..08320b0b2b276f0ceb82e28b07812d12dfdd0465
@@@ -7,8 -7,11 +7,11 @@@
   *   Authors: Anthony Liguori <[email protected]>
   */
  
+ #define pr_fmt(fmt) "kvm-guest: " fmt
  #include <linux/context_tracking.h>
  #include <linux/init.h>
+ #include <linux/irq.h>
  #include <linux/kernel.h>
  #include <linux/kvm_para.h>
  #include <linux/cpu.h>
@@@ -232,18 -235,13 +235,13 @@@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_ap
  
  noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
  {
-       u32 reason = kvm_read_and_reset_apf_flags();
+       u32 flags = kvm_read_and_reset_apf_flags();
 -      bool rcu_exit;
 +      irqentry_state_t state;
  
-       switch (reason) {
-       case KVM_PV_REASON_PAGE_NOT_PRESENT:
-       case KVM_PV_REASON_PAGE_READY:
-               break;
-       default:
+       if (!flags)
                return false;
-       }
  
 -      rcu_exit = idtentry_enter_cond_rcu(regs);
 +      state = irqentry_enter(regs);
        instrumentation_begin();
  
        /*
        if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
                panic("Host injected async #PF in interrupt disabled region\n");
  
-       if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
+       if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
                if (unlikely(!(user_mode(regs))))
                        panic("Host injected async #PF in kernel mode\n");
                /* Page is swapped out by the host. */
                kvm_async_pf_task_wait_schedule(token);
        } else {
-               kvm_async_pf_task_wake(token);
+               WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
        }
  
        instrumentation_end();
 -      idtentry_exit_cond_rcu(regs, rcu_exit);
 +      irqentry_exit(regs, state);
        return true;
  }
  
 -      bool rcu_exit;
+ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+ {
+       struct pt_regs *old_regs = set_irq_regs(regs);
+       u32 token;
 -      rcu_exit = idtentry_enter_cond_rcu(regs);
++      irqentry_state_t state;
 -      idtentry_exit_cond_rcu(regs, rcu_exit);
++      state = irqentry_enter(regs);
+       inc_irq_stat(irq_hv_callback_count);
+       if (__this_cpu_read(apf_reason.enabled)) {
+               token = __this_cpu_read(apf_reason.token);
+               kvm_async_pf_task_wake(token);
+               __this_cpu_write(apf_reason.token, 0);
+               wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
+       }
++      irqentry_exit(regs, state);
+       set_irq_regs(old_regs);
+ }
  static void __init paravirt_ops_setup(void)
  {
        pv_info.name = "KVM";
@@@ -289,8 -308,8 +308,8 @@@ static void kvm_register_steal_time(voi
                return;
  
        wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
-       pr_info("kvm-stealtime: cpu %d, msr %llx\n",
-               cpu, (unsigned long long) slow_virt_to_phys(st));
+       pr_info("stealtime: cpu %d, msr %llx\n", cpu,
+               (unsigned long long) slow_virt_to_phys(st));
  }
  
  static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@@ -311,17 -330,19 +330,19 @@@ static notrace void kvm_guest_apic_eoi_
  
  static void kvm_guest_cpu_init(void)
  {
-       if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
-               u64 pa;
+       if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+               u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
  
                WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
  
                pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
-               pa |= KVM_ASYNC_PF_ENABLED;
+               pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
  
                if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
                        pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
  
+               wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
                wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
                __this_cpu_write(apf_reason.enabled, 1);
                pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
@@@ -493,7 -514,8 +514,8 @@@ static void __send_ipi_mask(const struc
                } else {
                        ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
                                (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
-                       WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
+                       WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+                                 ret);
                        min = max = apic_id;
                        ipi_bitmap = 0;
                }
        if (ipi_bitmap) {
                ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
                        (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
-               WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
+               WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+                         ret);
        }
  
        local_irq_restore(flags);
@@@ -533,7 -556,7 +556,7 @@@ static void kvm_setup_pv_ipi(void
  {
        apic->send_IPI_mask = kvm_send_ipi_mask;
        apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
-       pr_info("KVM setup pv IPIs\n");
+       pr_info("setup PV IPIs\n");
  }
  
  static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
        }
  }
  
- static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
- {
-       native_smp_prepare_cpus(max_cpus);
-       if (kvm_para_has_hint(KVM_HINTS_REALTIME))
-               static_branch_disable(&virt_spin_lock_key);
- }
  static void __init kvm_smp_prepare_boot_cpu(void)
  {
        /*
@@@ -646,19 -662,20 +662,20 @@@ static void __init kvm_guest_init(void
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_set_eoi_write(kvm_guest_apic_eoi_write);
  
-       if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
+       if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
                static_branch_enable(&kvm_async_pf_enabled);
+               alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
+       }
  
  #ifdef CONFIG_SMP
-       smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        if (pv_sched_yield_supported()) {
                smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
-               pr_info("KVM setup pv sched yield\n");
+               pr_info("setup PV sched yield\n");
        }
        if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
                                      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
-               pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
+               pr_err("failed to install cpu hotplug callbacks\n");
  #else
        sev_map_percpu_data();
        kvm_guest_cpu_init();
@@@ -854,16 -871,36 +871,36 @@@ asm
   */
  void __init kvm_spinlock_init(void)
  {
-       /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
-       if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+       /*
+        * Even if the host doesn't support KVM_FEATURE_PV_UNHALT, there is still
+        * an advantage in keeping virt_spin_lock_key enabled: virt_spin_lock() is
+        * preferred over the native qspinlock when the vCPU is preempted.
+        */
+       if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
+               pr_info("PV spinlocks disabled, no host support\n");
                return;
+       }
  
-       if (kvm_para_has_hint(KVM_HINTS_REALTIME))
-               return;
+       /*
+        * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
+        * are available.
+        */
+       if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
+               pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
+               goto out;
+       }
  
-       /* Don't use the pvqspinlock code if there is only 1 vCPU. */
-       if (num_possible_cpus() == 1)
-               return;
+       if (num_possible_cpus() == 1) {
+               pr_info("PV spinlocks disabled, single CPU\n");
+               goto out;
+       }
+       if (nopvspin) {
+               pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
+               goto out;
+       }
+       pr_info("PV spinlocks enabled\n");
  
        __pv_init_lock_hash();
        pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
                pv_ops.lock.vcpu_is_preempted =
                        PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
        }
+       /*
+        * When PV spinlocks are enabled they are preferred over
+        * virt_spin_lock(), so virt_spin_lock_key's value is meaningless.
+        * Just disable it anyway.
+        */
+ out:
+       static_branch_disable(&virt_spin_lock_key);
  }
  
  #endif        /* CONFIG_PARAVIRT_SPINLOCKS */
@@@ -895,8 -939,8 +939,8 @@@ static void kvm_enable_host_haltpoll(vo
  void arch_haltpoll_enable(unsigned int cpu)
  {
        if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
-               pr_err_once("kvm: host does not support poll control\n");
-               pr_err_once("kvm: host upgrade recommended\n");
+               pr_err_once("host does not support poll control\n");
+               pr_err_once("host upgrade recommended\n");
                return;
        }
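
The __kvm_handle_async_pf() rework above stops switching on an exact reason value and instead tests bits in a flags word: "page ready" notifications now arrive through the new sysvec_kvm_asyncpf_interrupt handler, so any unexpected bit on this path triggers a warning. A standalone sketch of that flag-testing shape; the constants here are local stand-ins, not the kernel's UAPI values:

#include <stdio.h>

/* Local stand-in flag bits, not taken from the kernel UAPI headers. */
#define PF_FLAG_PAGE_NOT_PRESENT (1u << 0)
#define PF_FLAG_PAGE_READY       (1u << 1)

static int handle_async_pf(unsigned int flags)
{
        if (!flags)
                return 0;                       /* not an async PF, not for us */

        if (flags & PF_FLAG_PAGE_NOT_PRESENT)
                puts("suspend the task until the page is resident");
        else
                printf("unexpected async PF flags: %#x\n", flags);

        return 1;                               /* handled */
}

int main(void)
{
        handle_async_pf(0);
        handle_async_pf(PF_FLAG_PAGE_NOT_PRESENT);
        handle_async_pf(PF_FLAG_PAGE_READY);    /* 'ready' now arrives via interrupt */
        return 0;
}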
  
diff --combined arch/x86/kvm/lapic.c
index 4ce2ddd26c0b735d7100a68059f3680f37303af4,bd16e31009325ad666eeefa6df3120beae3d470d..5ccbee7165a215f31744073adcb74364939a956d
@@@ -354,7 -354,6 +354,6 @@@ static inline int apic_lvt_nmi_mode(u3
  void kvm_apic_set_version(struct kvm_vcpu *vcpu)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
-       struct kvm_cpuid_entry2 *feat;
        u32 v = APIC_VERSION;
  
        if (!lapic_in_kernel(vcpu))
         * version first and level-triggered interrupts never get EOIed in
         * IOAPIC.
         */
-       feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
-       if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
+       if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) &&
            !ioapic_in_kernel(vcpu->kvm))
                v |= APIC_LVR_DIRECTED_EOI;
        kvm_lapic_set_reg(apic, APIC_LVR, v);
@@@ -2068,7 -2066,7 +2066,7 @@@ int kvm_lapic_reg_write(struct kvm_lapi
        case APIC_TDCR: {
                uint32_t old_divisor = apic->divide_count;
  
-               kvm_lapic_set_reg(apic, APIC_TDCR, val);
+               kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
                update_divide_count(apic);
                if (apic->divide_count != old_divisor &&
                                apic->lapic_timer.period) {
  
        case APIC_SELF_IPI:
                if (apic_x2apic_mode(apic)) {
-                       kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+                       kvm_lapic_reg_write(apic, APIC_ICR,
+                                           APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
                } else
                        ret = 1;
                break;
@@@ -2195,7 -2194,7 +2194,7 @@@ void kvm_set_lapic_tscdeadline_msr(stru
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
  
 -      if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
 +      if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return;
  
@@@ -2232,7 -2231,7 +2231,7 @@@ void kvm_lapic_set_base(struct kvm_vcp
        vcpu->arch.apic_base = value;
  
        if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
-               kvm_update_cpuid(vcpu);
+               kvm_update_cpuid_runtime(vcpu);
  
        if (!apic)
                return;
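
The APIC_TDCR write above now masks the guest value to the settable bits (0, 1 and 3, i.e. 0xb) before storing it. A standalone sketch of how a timer divide count is conventionally decoded from that register; the decode follows the usual local-APIC encoding and is shown purely for illustration:

#include <stdio.h>

/* Bits 0, 1 and 3 of the timer divide configuration register are settable. */
#define TDCR_SETTABLE_BITS 0xbu

/* Conventional local-APIC decode, for illustration only. */
static unsigned int decode_divide_count(unsigned int tdcr)
{
        unsigned int low  = tdcr & 0x3;
        unsigned int high = (tdcr & 0x8) >> 1;
        unsigned int v    = low | high;         /* 3-bit divide value */

        return 1u << ((v + 1) & 0x7);           /* value 0b111 means divide by 1 */
}

int main(void)
{
        unsigned int guest_val = 0xff;          /* guest writes junk in reserved bits */
        unsigned int stored    = guest_val & TDCR_SETTABLE_BITS;

        printf("stored TDCR %#x -> divide by %u\n",
               stored, decode_divide_count(stored));
        return 0;
}
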
diff --combined arch/x86/kvm/mmu/mmu.c
index e2e2e5cc7dc6878f707dcbd0b81d55a8e9f999d8,862bf418214e21a585e2d100e7c693cdc18e0c74..4e03841f053dec97081d421a7720fab88253946d
@@@ -18,6 -18,7 +18,7 @@@
  #include "irq.h"
  #include "ioapic.h"
  #include "mmu.h"
+ #include "mmu_internal.h"
  #include "x86.h"
  #include "kvm_cache_regs.h"
  #include "kvm_emulate.h"
@@@ -91,7 -92,8 +92,8 @@@ module_param_named(flush_on_reuse, forc
   */
  bool tdp_enabled = false;
  
- static int max_page_level __read_mostly;
+ static int max_huge_page_level __read_mostly;
+ static int max_tdp_level __read_mostly;
  
  enum {
        AUDIT_PRE_PAGE_FAULT,
@@@ -515,6 -517,18 +517,18 @@@ static bool check_mmio_spte(struct kvm_
        return likely(kvm_gen == spte_gen);
  }
  
+ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+                                   struct x86_exception *exception)
+ {
+       /* Check if guest physical address doesn't exceed guest maximum */
+       if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+               exception->error_code |= PFERR_RSVD_MASK;
+               return UNMAPPED_GVA;
+       }
+       return gpa;
+ }
  /*
   * Sets the shadow PTE masks used by the MMU.
   *
@@@ -676,7 -690,7 +690,7 @@@ union split_spte 
  
  static void count_spte_clear(u64 *sptep, u64 spte)
  {
-       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+       struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
  
        if (is_shadow_present_pte(spte))
                return;
@@@ -760,7 -774,7 +774,7 @@@ static u64 __update_clear_spte_slow(u6
   */
  static u64 __get_spte_lockless(u64 *sptep)
  {
-       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+       struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
        union split_spte spte, *orig = (union split_spte *)sptep;
        int count;
  
@@@ -1060,94 -1074,40 +1074,40 @@@ static void walk_shadow_page_lockless_e
        local_irq_enable();
  }
  
- static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-                                 struct kmem_cache *base_cache, int min)
- {
-       void *obj;
-       if (cache->nobjs >= min)
-               return 0;
-       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-               obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
-               if (!obj)
-                       return cache->nobjs >= min ? 0 : -ENOMEM;
-               cache->objects[cache->nobjs++] = obj;
-       }
-       return 0;
- }
- static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
- {
-       return cache->nobjs;
- }
- static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
-                                 struct kmem_cache *cache)
- {
-       while (mc->nobjs)
-               kmem_cache_free(cache, mc->objects[--mc->nobjs]);
- }
- static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
-                                      int min)
- {
-       void *page;
-       if (cache->nobjs >= min)
-               return 0;
-       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-               page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
-               if (!page)
-                       return cache->nobjs >= min ? 0 : -ENOMEM;
-               cache->objects[cache->nobjs++] = page;
-       }
-       return 0;
- }
- static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
- {
-       while (mc->nobjs)
-               free_page((unsigned long)mc->objects[--mc->nobjs]);
- }
- static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
  {
        int r;
  
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
-                                  pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
+       /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
+       r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+                                      1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
        if (r)
-               goto out;
-       r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+               return r;
+       r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+                                      PT64_ROOT_MAX_LEVEL);
        if (r)
-               goto out;
-       r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
-                                  mmu_page_header_cache, 4);
- out:
-       return r;
+               return r;
+       if (maybe_indirect) {
+               r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
+                                              PT64_ROOT_MAX_LEVEL);
+               if (r)
+                       return r;
+       }
+       return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+                                         PT64_ROOT_MAX_LEVEL);
  }
  
  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
  {
-       mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
-                               pte_list_desc_cache);
-       mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
-       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
-                               mmu_page_header_cache);
- }
- static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
- {
-       void *p;
-       BUG_ON(!mc->nobjs);
-       p = mc->objects[--mc->nobjs];
-       return p;
+       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
+       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
+       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
  }
  
  static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
  {
-       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+       return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
  }
  
  static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
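
The hunk above drops the x86-local cache helpers in favour of the generic kvm_mmu_*_memory_cache API. The underlying shape is a small fixed-capacity object cache that is topped up before taking the MMU lock and popped from while the lock is held; a standalone sketch of that shape, with plain calloc() standing in for the kernel allocators:

#include <stdio.h>
#include <stdlib.h>

#define CACHE_CAPACITY 40

struct obj_cache {
        int nobjs;
        void *objects[CACHE_CAPACITY];
};

/* Fill the cache up to 'min' objects; may allocate, so call it early. */
static int cache_topup(struct obj_cache *mc, int min, size_t size)
{
        while (mc->nobjs < min) {
                void *obj = calloc(1, size);

                if (!obj)
                        return -1;
                mc->objects[mc->nobjs++] = obj;
        }
        return 0;
}

/* Pop a pre-allocated object; never fails once the topup succeeded. */
static void *cache_alloc(struct obj_cache *mc)
{
        return mc->objects[--mc->nobjs];
}

static void cache_free_all(struct obj_cache *mc)
{
        while (mc->nobjs)
                free(mc->objects[--mc->nobjs]);
}

int main(void)
{
        struct obj_cache mc = { 0 };

        if (cache_topup(&mc, 8, 64) == 0) {
                void *p = cache_alloc(&mc);

                printf("got object %p, %d left\n", p, mc.nobjs);
                free(p);
        }
        cache_free_all(&mc);
        return 0;
}
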
@@@ -1415,10 -1375,10 +1375,10 @@@ static struct kvm_rmap_head *gfn_to_rma
  
  static bool rmap_can_add(struct kvm_vcpu *vcpu)
  {
-       struct kvm_mmu_memory_cache *cache;
+       struct kvm_mmu_memory_cache *mc;
  
-       cache = &vcpu->arch.mmu_pte_list_desc_cache;
-       return mmu_memory_cache_free_objects(cache);
+       mc = &vcpu->arch.mmu_pte_list_desc_cache;
+       return kvm_mmu_memory_cache_nr_free_objects(mc);
  }
  
  static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
        struct kvm_mmu_page *sp;
        struct kvm_rmap_head *rmap_head;
  
-       sp = page_header(__pa(spte));
+       sp = sptep_to_sp(spte);
        kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
        rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
        return pte_list_add(vcpu, spte, rmap_head);
@@@ -1438,7 -1398,7 +1398,7 @@@ static void rmap_remove(struct kvm *kvm
        gfn_t gfn;
        struct kvm_rmap_head *rmap_head;
  
-       sp = page_header(__pa(spte));
+       sp = sptep_to_sp(spte);
        gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
        rmap_head = gfn_to_rmap(kvm, gfn, sp);
        __pte_list_remove(spte, rmap_head);
@@@ -1530,7 -1490,7 +1490,7 @@@ static void drop_spte(struct kvm *kvm, 
  static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
  {
        if (is_large_pte(*sptep)) {
-               WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K);
+               WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
                drop_spte(kvm, sptep);
                --kvm->stat.lpages;
                return true;
  static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
  {
        if (__drop_large_spte(vcpu->kvm, sptep)) {
-               struct kvm_mmu_page *sp = page_header(__pa(sptep));
+               struct kvm_mmu_page *sp = sptep_to_sp(sptep);
  
                kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
                        KVM_PAGES_PER_HPAGE(sp->role.level));
@@@ -1738,21 -1698,6 +1698,6 @@@ void kvm_arch_mmu_enable_log_dirty_pt_m
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
  }
  
- /**
-  * kvm_arch_write_log_dirty - emulate dirty page logging
-  * @vcpu: Guest mode vcpu
-  *
-  * Emulate arch specific page modification logging for the
-  * nested hypervisor
-  */
- int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
- {
-       if (kvm_x86_ops.write_log_dirty)
-               return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa);
-       return 0;
- }
  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, u64 gfn)
  {
@@@ -1986,7 -1931,7 +1931,7 @@@ static int kvm_age_rmapp(struct kvm *kv
                         unsigned long data)
  {
        u64 *sptep;
 -      struct rmap_iterator uninitialized_var(iter);
 +      struct rmap_iterator iter;
        int young = 0;
  
        for_each_rmap_spte(rmap_head, &iter, sptep)
@@@ -2016,7 -1961,7 +1961,7 @@@ static void rmap_recycle(struct kvm_vcp
        struct kvm_rmap_head *rmap_head;
        struct kvm_mmu_page *sp;
  
-       sp = page_header(__pa(spte));
+       sp = sptep_to_sp(spte);
  
        rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
  
@@@ -2105,10 -2050,10 +2050,10 @@@ static struct kvm_mmu_page *kvm_mmu_all
  {
        struct kvm_mmu_page *sp;
  
-       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
-       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+       sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+       sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
        if (!direct)
-               sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+               sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
  
        /*
@@@ -2138,7 -2083,7 +2083,7 @@@ static void mark_unsync(u64 *spte
        struct kvm_mmu_page *sp;
        unsigned int index;
  
-       sp = page_header(__pa(spte));
+       sp = sptep_to_sp(spte);
        index = spte - sp->spt;
        if (__test_and_set_bit(index, sp->unsync_child_bitmap))
                return;
@@@ -2207,7 -2152,7 +2152,7 @@@ static int __mmu_unsync_walk(struct kvm
                        continue;
                }
  
-               child = page_header(ent & PT64_BASE_ADDR_MASK);
+               child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
  
                if (child->unsync_children) {
                        if (mmu_pages_add(pvec, child, i))
@@@ -2258,15 -2203,14 +2203,14 @@@ static bool kvm_mmu_prepare_zap_page(st
  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list);
  
- #define for_each_valid_sp(_kvm, _sp, _gfn)                            \
-       hlist_for_each_entry(_sp,                                       \
-         &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+ #define for_each_valid_sp(_kvm, _sp, _list)                           \
+       hlist_for_each_entry(_sp, _list, hash_link)                     \
                if (is_obsolete_sp((_kvm), (_sp))) {                    \
                } else
  
  #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                       \
-       for_each_valid_sp(_kvm, _sp, _gfn)                              \
+       for_each_valid_sp(_kvm, _sp,                                    \
+         &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
                if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
  
  static inline bool is_ept_sp(struct kvm_mmu_page *sp)
@@@ -2464,9 -2408,7 +2408,7 @@@ static void __clear_sp_write_flooding_c
  
  static void clear_sp_write_flooding_count(u64 *spte)
  {
-       struct kvm_mmu_page *sp =  page_header(__pa(spte));
-       __clear_sp_write_flooding_count(sp);
+       __clear_sp_write_flooding_count(sptep_to_sp(spte));
  }
  
  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                             int direct,
                                             unsigned int access)
  {
+       bool direct_mmu = vcpu->arch.mmu->direct_map;
        union kvm_mmu_page_role role;
+       struct hlist_head *sp_list;
        unsigned quadrant;
        struct kvm_mmu_page *sp;
        bool need_sync = false;
        if (role.direct)
                role.gpte_is_8_bytes = true;
        role.access = access;
-       if (!vcpu->arch.mmu->direct_map
-           && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+       if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
        }
-       for_each_valid_sp(vcpu->kvm, sp, gfn) {
+       sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+       for_each_valid_sp(vcpu->kvm, sp, sp_list) {
                if (sp->gfn != gfn) {
                        collisions++;
                        continue;
                if (sp->role.word != role.word)
                        continue;
  
+               if (direct_mmu)
+                       goto trace_get_page;
                if (sp->unsync) {
                        /* The page is good, but __kvm_sync_page might still end
                         * up zapping it.  If so, break in order to rebuild it.
                        kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
  
                __clear_sp_write_flooding_count(sp);
+ trace_get_page:
                trace_kvm_mmu_get_page(sp, false);
                goto out;
        }
  
        sp->gfn = gfn;
        sp->role = role;
-       hlist_add_head(&sp->hash_link,
-               &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+       hlist_add_head(&sp->hash_link, sp_list);
        if (!direct) {
                /*
                 * we should do write protection before syncing pages
                if (level > PG_LEVEL_4K && need_sync)
                        flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
        }
-       clear_page(sp->spt);
        trace_kvm_mmu_get_page(sp, true);
  
        kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
@@@ -2657,7 -2605,7 +2605,7 @@@ static void validate_direct_spte(struc
                 * so we should update the spte at this point to get
                 * a new sp with the correct access.
                 */
-               child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+               child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
                if (child->role.access == direct_access)
                        return;
  
@@@ -2679,7 -2627,7 +2627,7 @@@ static bool mmu_page_zap_pte(struct kv
                        if (is_large_pte(pte))
                                --kvm->stat.lpages;
                } else {
-                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
                        drop_parent_pte(child, spte);
                }
                return true;
@@@ -2757,10 -2705,23 +2705,23 @@@ static bool __kvm_mmu_prepare_zap_page(
        if (!sp->root_count) {
                /* Count self */
                (*nr_zapped)++;
-               list_move(&sp->link, invalid_list);
+               /*
+                * Already invalid pages (previously active roots) are not on
+                * the active page list.  See list_del() in the "else" case of
+                * !sp->root_count.
+                */
+               if (sp->role.invalid)
+                       list_add(&sp->link, invalid_list);
+               else
+                       list_move(&sp->link, invalid_list);
                kvm_mod_used_mmu_pages(kvm, -1);
        } else {
-               list_move(&sp->link, &kvm->arch.active_mmu_pages);
+               /*
+                * Remove the active root from the active page list, the root
+                * will be explicitly freed when the root_count hits zero.
+                */
+               list_del(&sp->link);
  
                /*
                 * Obsolete pages cannot be used on any vCPUs, see the comment
@@@ -2812,33 -2773,60 +2773,60 @@@ static void kvm_mmu_commit_zap_page(str
        }
  }
  
- static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
-                                       struct list_head *invalid_list)
+ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
+                                                 unsigned long nr_to_zap)
  {
-       struct kvm_mmu_page *sp;
+       unsigned long total_zapped = 0;
+       struct kvm_mmu_page *sp, *tmp;
+       LIST_HEAD(invalid_list);
+       bool unstable;
+       int nr_zapped;
  
        if (list_empty(&kvm->arch.active_mmu_pages))
-               return false;
+               return 0;
+ restart:
+       list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+               /*
+                * Don't zap active root pages, the page itself can't be freed
+                * and zapping it will just force vCPUs to realloc and reload.
+                */
+               if (sp->root_count)
+                       continue;
+               unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+                                                     &nr_zapped);
+               total_zapped += nr_zapped;
+               if (total_zapped >= nr_to_zap)
+                       break;
+               if (unstable)
+                       goto restart;
+       }
  
-       sp = list_last_entry(&kvm->arch.active_mmu_pages,
-                            struct kvm_mmu_page, link);
-       return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+       kvm->stat.mmu_recycled += total_zapped;
+       return total_zapped;
+ }
+ static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+ {
+       if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+               return kvm->arch.n_max_mmu_pages -
+                       kvm->arch.n_used_mmu_pages;
+       return 0;
  }
  
  static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
  {
-       LIST_HEAD(invalid_list);
+       unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
  
-       if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+       if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
                return 0;
  
-       while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
-               if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
-                       break;
-               ++vcpu->kvm->stat.mmu_recycled;
-       }
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
  
        if (!kvm_mmu_available_pages(vcpu->kvm))
                return -ENOSPC;
   */
  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
  {
-       LIST_HEAD(invalid_list);
        spin_lock(&kvm->mmu_lock);
  
        if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
-               /* Need to free some mmu pages to achieve the goal. */
-               while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
-                       if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
-                               break;
+               kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
+                                                 goal_nr_mmu_pages);
  
-               kvm_mmu_commit_zap_page(kvm, &invalid_list);
                goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
        }
  
@@@ -2999,7 -2982,7 +2982,7 @@@ static int set_spte(struct kvm_vcpu *vc
        if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
                return 0;
  
-       sp = page_header(__pa(sptep));
+       sp = sptep_to_sp(sptep);
        if (sp_ad_disabled(sp))
                spte |= SPTE_AD_DISABLED_MASK;
        else if (kvm_vcpu_ad_need_write_protect(vcpu))
@@@ -3102,7 -3085,7 +3085,7 @@@ static int mmu_set_spte(struct kvm_vcp
                        struct kvm_mmu_page *child;
                        u64 pte = *sptep;
  
-                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
                        drop_parent_pte(child, sptep);
                        flush = true;
                } else if (pfn != spte_to_pfn(*sptep)) {
@@@ -3212,7 -3195,7 +3195,7 @@@ static void direct_pte_prefetch(struct 
  {
        struct kvm_mmu_page *sp;
  
-       sp = page_header(__pa(sptep));
+       sp = sptep_to_sp(sptep);
  
        /*
         * Without accessed bits, there's no way to distinguish between
@@@ -3274,7 -3257,7 +3257,7 @@@ static int kvm_mmu_hugepage_adjust(stru
        if (!slot)
                return PG_LEVEL_4K;
  
-       max_level = min(max_level, max_page_level);
+       max_level = min(max_level, max_huge_page_level);
        for ( ; max_level > PG_LEVEL_4K; max_level--) {
                linfo = lpage_info_slot(gfn, slot, max_level);
                if (!linfo->disallow_lpage)
@@@ -3520,7 -3503,7 +3503,7 @@@ static bool fast_page_fault(struct kvm_
                        if (!is_shadow_present_pte(spte))
                                break;
  
-               sp = page_header(__pa(iterator.sptep));
+               sp = sptep_to_sp(iterator.sptep);
                if (!is_last_spte(spte, sp->role.level))
                        break;
  
@@@ -3607,7 -3590,7 +3590,7 @@@ static void mmu_free_root_page(struct k
        if (!VALID_PAGE(*root_hpa))
                return;
  
-       sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
+       sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
        --sp->root_count;
        if (!sp->root_count && sp->role.invalid)
                kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@@ -3668,7 -3651,7 +3651,7 @@@ static int mmu_check_root(struct kvm_vc
  {
        int ret = 0;
  
-       if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+       if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                ret = 1;
        }
@@@ -3837,7 -3820,7 +3820,7 @@@ void kvm_mmu_sync_roots(struct kvm_vcp
  
        if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
                hpa_t root = vcpu->arch.mmu->root_hpa;
-               sp = page_header(root);
+               sp = to_shadow_page(root);
  
                /*
                 * Even if another CPU was marking the SP as unsync-ed
  
                if (root && VALID_PAGE(root)) {
                        root &= PT64_BASE_ADDR_MASK;
-                       sp = page_header(root);
+                       sp = to_shadow_page(root);
                        mmu_sync_children(vcpu, sp);
                }
        }
@@@ -4045,8 -4028,8 +4028,8 @@@ static void shadow_page_table_clear_flo
        walk_shadow_page_lockless_end(vcpu);
  }
  
- static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                                  gfn_t gfn)
+ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                   gfn_t gfn)
  {
        struct kvm_arch_async_pf arch;
  
@@@ -4108,16 -4091,16 +4091,16 @@@ static int direct_page_fault(struct kvm
        if (page_fault_handle_page_track(vcpu, error_code, gfn))
                return RET_PF_EMULATE;
  
-       r = mmu_topup_memory_caches(vcpu);
+       if (fast_page_fault(vcpu, gpa, error_code))
+               return RET_PF_RETRY;
+       r = mmu_topup_memory_caches(vcpu, false);
        if (r)
                return r;
  
        if (lpage_disallowed)
                max_level = PG_LEVEL_4K;
  
-       if (fast_page_fault(vcpu, gpa, error_code))
-               return RET_PF_RETRY;
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
  
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
-       if (make_mmu_pages_available(vcpu) < 0)
+       r = make_mmu_pages_available(vcpu);
+       if (r)
                goto out_unlock;
        r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
                         prefault, is_tdp && lpage_disallowed);
@@@ -4156,6 -4140,7 +4140,7 @@@ int kvm_handle_page_fault(struct kvm_vc
                                u64 fault_address, char *insn, int insn_len)
  {
        int r = 1;
+       u32 flags = vcpu->arch.apf.host_apf_flags;
  
  #ifndef CONFIG_X86_64
        /* A 64-bit CR2 should be impossible on 32-bit KVM. */
  #endif
  
        vcpu->arch.l1tf_flush_l1d = true;
-       switch (vcpu->arch.apf.host_apf_flags) {
-       default:
+       if (!flags) {
                trace_kvm_page_fault(fault_address, error_code);
  
                if (kvm_event_needs_reinjection(vcpu))
                        kvm_mmu_unprotect_page_virt(vcpu, fault_address);
                r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
                                insn_len);
-               break;
-       case KVM_PV_REASON_PAGE_NOT_PRESENT:
+       } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
                vcpu->arch.apf.host_apf_flags = 0;
                local_irq_disable();
                kvm_async_pf_task_wait_schedule(fault_address);
                local_irq_enable();
-               break;
-       case KVM_PV_REASON_PAGE_READY:
-               vcpu->arch.apf.host_apf_flags = 0;
-               local_irq_disable();
-               kvm_async_pf_task_wake(fault_address);
-               local_irq_enable();
-               break;
+       } else {
+               WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
        }
        return r;
  }
  EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
@@@ -4227,8 -4206,8 +4206,8 @@@ static inline bool is_root_usable(struc
                                  union kvm_mmu_page_role role)
  {
        return (role.direct || pgd == root->pgd) &&
-              VALID_PAGE(root->hpa) && page_header(root->hpa) &&
-              role.word == page_header(root->hpa)->role.word;
+              VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
+              role.word == to_shadow_page(root->hpa)->role.word;
  }
  
  /*
@@@ -4277,8 -4256,7 +4256,7 @@@ static bool fast_pgd_switch(struct kvm_
         */
        if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
            mmu->root_level >= PT64_ROOT_4LEVEL)
-               return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) &&
-                      cached_root_available(vcpu, new_pgd, new_role);
+               return cached_root_available(vcpu, new_pgd, new_role);
  
        return false;
  }
@@@ -4313,7 -4291,7 +4291,7 @@@ static void __kvm_mmu_new_pgd(struct kv
         */
        vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
  
-       __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
+       __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa));
  }
  
  void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
@@@ -4869,13 -4847,22 +4847,22 @@@ static union kvm_mmu_role kvm_calc_mmu_
        return role;
  }
  
+ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
+ {
+       /* Use 5-level TDP if and only if it's useful/necessary. */
+       if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
+               return 4;
+       return max_tdp_level;
+ }
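
For illustration only, and not part of the patch, the level selection above can be mirrored as a standalone helper; example_tdp_level() and its arguments are hypothetical names:

static int example_tdp_level(int max_tdp_level, int guest_maxphyaddr)
{
        /* 4-level paging already covers guest physical addresses up to
         * 48 bits, so a 5-level TDP root only pays off beyond that. */
        if (max_tdp_level == 5 && guest_maxphyaddr <= 48)
                return 4;
        return max_tdp_level;
}

/* e.g. example_tdp_level(5, 36) == 4, example_tdp_level(5, 52) == 5,
 *      example_tdp_level(4, 52) == 4 */
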
  static union kvm_mmu_role
  kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
  {
        union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
  
        role.base.ad_disabled = (shadow_accessed_mask == 0);
-       role.base.level = vcpu->arch.tdp_level;
+       role.base.level = kvm_mmu_get_tdp_level(vcpu);
        role.base.direct = true;
        role.base.gpte_is_8_bytes = true;
  
  
  static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
  {
-       struct kvm_mmu *context = vcpu->arch.mmu;
+       struct kvm_mmu *context = &vcpu->arch.root_mmu;
        union kvm_mmu_role new_role =
                kvm_calc_tdp_mmu_root_page_role(vcpu, false);
  
        context->sync_page = nonpaging_sync_page;
        context->invlpg = NULL;
        context->update_pte = nonpaging_update_pte;
-       context->shadow_root_level = vcpu->arch.tdp_level;
+       context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
        context->direct_map = true;
        context->get_guest_pgd = get_cr3;
        context->get_pdptr = kvm_pdptr_read;
  }
  
  static union kvm_mmu_role
- kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
  {
        union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
  
                !is_write_protection(vcpu);
        role.base.smap_andnot_wp = role.ext.cr4_smap &&
                !is_write_protection(vcpu);
-       role.base.direct = !is_paging(vcpu);
        role.base.gpte_is_8_bytes = !!is_pae(vcpu);
  
+       return role;
+ }
+ static union kvm_mmu_role
+ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+ {
+       union kvm_mmu_role role =
+               kvm_calc_shadow_root_page_role_common(vcpu, base_only);
+       role.base.direct = !is_paging(vcpu);
        if (!is_long_mode(vcpu))
                role.base.level = PT32E_ROOT_LEVEL;
        else if (is_la57_mode(vcpu))
        return role;
  }
  
- void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+                                   u32 cr0, u32 cr4, u32 efer,
+                                   union kvm_mmu_role new_role)
  {
-       struct kvm_mmu *context = vcpu->arch.mmu;
-       union kvm_mmu_role new_role =
-               kvm_calc_shadow_mmu_root_page_role(vcpu, false);
-       if (new_role.as_u64 == context->mmu_role.as_u64)
-               return;
        if (!(cr0 & X86_CR0_PG))
                nonpaging_init_context(vcpu, context);
        else if (efer & EFER_LMA)
        context->mmu_role.as_u64 = new_role.as_u64;
        reset_shadow_zero_bits_mask(vcpu, context);
  }
- EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+ {
+       struct kvm_mmu *context = &vcpu->arch.root_mmu;
+       union kvm_mmu_role new_role =
+               kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+       if (new_role.as_u64 != context->mmu_role.as_u64)
+               shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ }
+ static union kvm_mmu_role
+ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
+ {
+       union kvm_mmu_role role =
+               kvm_calc_shadow_root_page_role_common(vcpu, false);
+       role.base.direct = false;
+       role.base.level = kvm_mmu_get_tdp_level(vcpu);
+       return role;
+ }
+ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
+                            gpa_t nested_cr3)
+ {
+       struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+       union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
+       context->shadow_root_level = new_role.base.level;
+       __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
+       if (new_role.as_u64 != context->mmu_role.as_u64)
+               shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ }
+ EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
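
kvm_init_shadow_npt_mmu() is the new entry point nested SVM uses when switching to shadowed nested page tables. A minimal caller sketch under stated assumptions (the wrapper name and where the register values come from are illustrative; only the exported signature is taken from this diff):

static void example_switch_to_shadow_npt(struct kvm_vcpu *vcpu, u32 cr0,
                                         u32 cr4, u32 efer, gpa_t nested_cr3)
{
        /* Run on the separate guest_mmu that the helper initializes... */
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        /* ...and rebuild its role and paging context for L1's nCR3. */
        kvm_init_shadow_npt_mmu(vcpu, cr0, cr4, efer, nested_cr3);
}
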
  
  static union kvm_mmu_role
  kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
  void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
                             bool accessed_dirty, gpa_t new_eptp)
  {
-       struct kvm_mmu *context = vcpu->arch.mmu;
+       struct kvm_mmu *context = &vcpu->arch.guest_mmu;
        u8 level = vmx_eptp_page_walk_level(new_eptp);
        union kvm_mmu_role new_role =
                kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
@@@ -5041,7 -5069,7 +5069,7 @@@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_m
  
  static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
  {
-       struct kvm_mmu *context = vcpu->arch.mmu;
+       struct kvm_mmu *context = &vcpu->arch.root_mmu;
  
        kvm_init_shadow_mmu(vcpu,
                            kvm_read_cr0_bits(vcpu, X86_CR0_PG),
@@@ -5151,7 -5179,7 +5179,7 @@@ int kvm_mmu_load(struct kvm_vcpu *vcpu
  {
        int r;
  
-       r = mmu_topup_memory_caches(vcpu);
+       r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
        if (r)
                goto out;
        r = mmu_alloc_roots(vcpu);
@@@ -5345,7 -5373,7 +5373,7 @@@ static void kvm_mmu_pte_write(struct kv
         * or not since pte prefetch is skipped if it does not have
         * enough objects in the cache.
         */
-       mmu_topup_memory_caches(vcpu);
+       mmu_topup_memory_caches(vcpu, true);
  
        spin_lock(&vcpu->kvm->mmu_lock);
  
@@@ -5553,23 -5581,25 +5581,25 @@@ void kvm_mmu_invpcid_gva(struct kvm_vcp
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
  
- void kvm_configure_mmu(bool enable_tdp, int tdp_page_level)
+ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
+                      int tdp_huge_page_level)
  {
        tdp_enabled = enable_tdp;
+       max_tdp_level = tdp_max_root_level;
  
        /*
-        * max_page_level reflects the capabilities of KVM's MMU irrespective
+        * max_huge_page_level reflects KVM's MMU capabilities irrespective
         * of kernel support, e.g. KVM may be capable of using 1GB pages when
         * the kernel is not.  But, KVM never creates a page size greater than
         * what is used by the kernel for any given HVA, i.e. the kernel's
         * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
         */
        if (tdp_enabled)
-               max_page_level = tdp_page_level;
+               max_huge_page_level = tdp_huge_page_level;
        else if (boot_cpu_has(X86_FEATURE_GBPAGES))
-               max_page_level = PG_LEVEL_1G;
+               max_huge_page_level = PG_LEVEL_1G;
        else
-               max_page_level = PG_LEVEL_2M;
+               max_huge_page_level = PG_LEVEL_2M;
  }
  EXPORT_SYMBOL_GPL(kvm_configure_mmu);
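
The comment above spells out the contract for the new three-argument kvm_configure_mmu(). For reference, the SVM conversion appears in the svm.c hunk later in this diff, and a VMX-style call would look similar; ept_lpage_level below is an assumed local holding the largest EPT page size the CPU supports:

        /* SVM (taken from the svm_hardware_setup() hunk below): */
        kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);

        /* VMX-style sketch (vmx_get_max_tdp_level() is shown later in this
         * diff; ept_lpage_level is an assumption for illustration): */
        kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
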
  
@@@ -5665,7 -5695,7 +5695,7 @@@ static int alloc_mmu_pages(struct kvm_v
         * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
         * skip allocating the PDP table.
         */
-       if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL)
+       if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
                return 0;
  
        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
@@@ -5684,6 -5714,14 +5714,14 @@@ int kvm_mmu_create(struct kvm_vcpu *vcp
        uint i;
        int ret;
  
+       vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
+       vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
+       vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
+       vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
+       vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
        vcpu->arch.mmu = &vcpu->arch.root_mmu;
        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
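
These gfp_zero/kmem_cache assignments rely on the unified kvm_mmu_memory_cache introduced by the generic MMU-cache consolidation in this pull. A rough sketch of the structure's assumed shape (field order and the capacity constant are assumptions; see include/linux/kvm_types.h for the real definition):

struct kvm_mmu_memory_cache {
        int nobjs;                      /* objects currently stashed */
        gfp_t gfp_zero;                 /* __GFP_ZERO if objects must be zeroed */
        struct kmem_cache *kmem_cache;  /* NULL means whole-page allocations */
        void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
};
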
  
@@@ -5732,12 -5770,11 +5770,11 @@@ restart
                        break;
  
                /*
-                * Skip invalid pages with a non-zero root count, zapping pages
-                * with a non-zero root count will never succeed, i.e. the page
-                * will get thrown back on active_mmu_pages and we'll get stuck
-                * in an infinite loop.
+                * Invalid pages should never land back on the list of active
+                * pages.  Skip the bogus page, otherwise we'll get stuck in an
+                * infinite loop if the page gets put back on the list (again).
                 */
-               if (sp->role.invalid && sp->root_count)
+               if (WARN_ON(sp->role.invalid))
                        continue;
  
                /*
@@@ -5904,7 -5941,7 +5941,7 @@@ static bool kvm_mmu_zap_collapsible_spt
  
  restart:
        for_each_rmap_spte(rmap_head, &iter, sptep) {
-               sp = page_header(__pa(sptep));
+               sp = sptep_to_sp(sptep);
                pfn = spte_to_pfn(*sptep);
  
                /*
@@@ -6015,7 -6052,7 +6052,7 @@@ void kvm_mmu_zap_all(struct kvm *kvm
        spin_lock(&kvm->mmu_lock);
  restart:
        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-               if (sp->role.invalid && sp->root_count)
+               if (WARN_ON(sp->role.invalid))
                        continue;
                if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
                        goto restart;
@@@ -6092,9 -6129,7 +6129,7 @@@ mmu_shrink_scan(struct shrinker *shrink
                        goto unlock;
                }
  
-               if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
-                       freed++;
-               kvm_mmu_commit_zap_page(kvm, &invalid_list);
+               freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
  
  unlock:
                spin_unlock(&kvm->mmu_lock);
index 275564a0ebdb764e726a569e6a7e7937bb5e27b3,0172a949f6a75129723f03c1a30e83814db8026f..4dd6b1e5b8cf7238c038c2842b2a0fe4edcb18dc
@@@ -260,7 -260,7 +260,7 @@@ static int FNAME(update_accessed_dirty_
                                !(pte & PT_GUEST_DIRTY_MASK)) {
                        trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
  #if PTTYPE == PTTYPE_EPT
-                       if (kvm_arch_write_log_dirty(vcpu, addr))
+                       if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
                                return -EINVAL;
  #endif
                        pte |= PT_GUEST_DIRTY_MASK;
@@@ -314,7 -314,7 +314,7 @@@ static int FNAME(walk_addr_generic)(str
  {
        int ret;
        pt_element_t pte;
 -      pt_element_t __user *uninitialized_var(ptep_user);
 +      pt_element_t __user *ptep_user;
        gfn_t table_gfn;
        u64 pt_access, pte_access;
        unsigned index, accessed_dirty, pte_pkey;
@@@ -596,7 -596,7 +596,7 @@@ static void FNAME(pte_prefetch)(struct 
        u64 *spte;
        int i;
  
-       sp = page_header(__pa(sptep));
+       sp = sptep_to_sp(sptep);
  
        if (sp->role.level > PG_LEVEL_4K)
                return;
@@@ -789,10 -789,6 +789,6 @@@ static int FNAME(page_fault)(struct kvm
  
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
  
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
        /*
         * If PFEC.RSVD is set, this is a shadow page fault.
         * The bit needs to be cleared before walking guest page tables.
                return RET_PF_EMULATE;
        }
  
+       r = mmu_topup_memory_caches(vcpu, true);
+       if (r)
+               return r;
        vcpu->arch.write_fault_to_shadow_pgtable = false;
  
        is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
                goto out_unlock;
  
        kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
-       if (make_mmu_pages_available(vcpu) < 0)
+       r = make_mmu_pages_available(vcpu);
+       if (r)
                goto out_unlock;
        r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
                         map_writable, prefault, lpage_disallowed);
@@@ -903,7 -904,7 +904,7 @@@ static void FNAME(invlpg)(struct kvm_vc
         * No need to check the return value here; rmap_can_add() lets
         * us skip pte prefetch later.
         */
-       mmu_topup_memory_caches(vcpu);
+       mmu_topup_memory_caches(vcpu, true);
  
        if (!VALID_PAGE(root_hpa)) {
                WARN_ON(1);
                level = iterator.level;
                sptep = iterator.sptep;
  
-               sp = page_header(__pa(sptep));
+               sp = sptep_to_sp(sptep);
                if (is_last_spte(*sptep, level)) {
                        pt_element_t gpte;
                        gpa_t pte_gpa;
diff --combined arch/x86/kvm/svm/svm.c
index 5bbf76189afa4e554187ec8d8acc5bc5560b3513,5f47b44c5c324ac024df816d4e85783fb1a5a602..03dd7bac80348857b378be2edff0382302ed6e2a
@@@ -254,7 -254,7 +254,7 @@@ static inline void invlpga(unsigned lon
        asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
  }
  
- static int get_npt_level(struct kvm_vcpu *vcpu)
+ static int get_max_npt_level(void)
  {
  #ifdef CONFIG_X86_64
        return PT64_ROOT_4LEVEL;
@@@ -282,7 -282,7 +282,7 @@@ void svm_set_efer(struct kvm_vcpu *vcpu
        }
  
        svm->vmcb->save.efer = efer | EFER_SVME;
-       mark_dirty(svm->vmcb, VMCB_CR);
+       vmcb_mark_dirty(svm->vmcb, VMCB_CR);
  }
  
  static int is_external_interrupt(u32 info)
@@@ -713,7 -713,7 +713,7 @@@ static void grow_ple_window(struct kvm_
                                                        pause_filter_count_max);
  
        if (control->pause_filter_count != old) {
-               mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+               vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
                trace_kvm_ple_window_update(vcpu->vcpu_id,
                                            control->pause_filter_count, old);
        }
@@@ -731,7 -731,7 +731,7 @@@ static void shrink_ple_window(struct kv
                                                    pause_filter_count_shrink,
                                                    pause_filter_count);
        if (control->pause_filter_count != old) {
-               mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+               vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
                trace_kvm_ple_window_update(vcpu->vcpu_id,
                                            control->pause_filter_count, old);
        }
@@@ -885,7 -885,7 +885,7 @@@ static __init int svm_hardware_setup(vo
        if (npt_enabled && !npt)
                npt_enabled = false;
  
-       kvm_configure_mmu(npt_enabled, PG_LEVEL_1G);
+       kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
  
        if (nrips) {
  
        svm_set_cpu_caps();
  
+       /*
+        * It seems that on AMD processors the PTE's accessed bit is
+        * set by the CPU hardware before the NPF vmexit.  This is not
+        * the expected behaviour, and our tests fail because of it.
+        * As a workaround, disable support for
+        * GUEST_MAXPHYADDR < HOST_MAXPHYADDR when NPT is enabled.  In
+        * that case userspace can query the KVM_CAP_SMALLER_MAXPHYADDR
+        * extension to learn whether the support is present and decide
+        * how to handle its absence.
+        * If future AMD CPU models change the behaviour described
+        * above, this variable can be changed accordingly.
+        */
+       allow_smaller_maxphyaddr = !npt_enabled;
        return 0;
  
  err:
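
As the workaround comment above says, userspace can probe for GUEST_MAXPHYADDR < HOST_MAXPHYADDR support before relying on it. A minimal sketch, assuming kernel headers that already carry the KVM_CAP_SMALLER_MAXPHYADDR definition added by this series:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);

        if (kvm < 0)
                return 1;
        /* KVM_CHECK_EXTENSION returns a positive value when the capability
         * is supported and 0 otherwise. */
        if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_SMALLER_MAXPHYADDR) > 0)
                printf("guest MAXPHYADDR may be smaller than the host's\n");
        else
                printf("guest MAXPHYADDR must match the host's\n");
        return 0;
}
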
@@@ -966,7 -981,7 +981,7 @@@ static u64 svm_write_l1_tsc_offset(stru
  
        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
  
-       mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+       vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
        return svm->vmcb->control.tsc_offset;
  }
  
@@@ -1002,38 -1017,38 +1017,38 @@@ static void init_vmcb(struct vcpu_svm *
        if (enable_vmware_backdoor)
                set_exception_intercept(svm, GP_VECTOR);
  
-       set_intercept(svm, INTERCEPT_INTR);
-       set_intercept(svm, INTERCEPT_NMI);
-       set_intercept(svm, INTERCEPT_SMI);
-       set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
-       set_intercept(svm, INTERCEPT_RDPMC);
-       set_intercept(svm, INTERCEPT_CPUID);
-       set_intercept(svm, INTERCEPT_INVD);
-       set_intercept(svm, INTERCEPT_INVLPG);
-       set_intercept(svm, INTERCEPT_INVLPGA);
-       set_intercept(svm, INTERCEPT_IOIO_PROT);
-       set_intercept(svm, INTERCEPT_MSR_PROT);
-       set_intercept(svm, INTERCEPT_TASK_SWITCH);
-       set_intercept(svm, INTERCEPT_SHUTDOWN);
-       set_intercept(svm, INTERCEPT_VMRUN);
-       set_intercept(svm, INTERCEPT_VMMCALL);
-       set_intercept(svm, INTERCEPT_VMLOAD);
-       set_intercept(svm, INTERCEPT_VMSAVE);
-       set_intercept(svm, INTERCEPT_STGI);
-       set_intercept(svm, INTERCEPT_CLGI);
-       set_intercept(svm, INTERCEPT_SKINIT);
-       set_intercept(svm, INTERCEPT_WBINVD);
-       set_intercept(svm, INTERCEPT_XSETBV);
-       set_intercept(svm, INTERCEPT_RDPRU);
-       set_intercept(svm, INTERCEPT_RSM);
+       svm_set_intercept(svm, INTERCEPT_INTR);
+       svm_set_intercept(svm, INTERCEPT_NMI);
+       svm_set_intercept(svm, INTERCEPT_SMI);
+       svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+       svm_set_intercept(svm, INTERCEPT_RDPMC);
+       svm_set_intercept(svm, INTERCEPT_CPUID);
+       svm_set_intercept(svm, INTERCEPT_INVD);
+       svm_set_intercept(svm, INTERCEPT_INVLPG);
+       svm_set_intercept(svm, INTERCEPT_INVLPGA);
+       svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
+       svm_set_intercept(svm, INTERCEPT_MSR_PROT);
+       svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
+       svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
+       svm_set_intercept(svm, INTERCEPT_VMRUN);
+       svm_set_intercept(svm, INTERCEPT_VMMCALL);
+       svm_set_intercept(svm, INTERCEPT_VMLOAD);
+       svm_set_intercept(svm, INTERCEPT_VMSAVE);
+       svm_set_intercept(svm, INTERCEPT_STGI);
+       svm_set_intercept(svm, INTERCEPT_CLGI);
+       svm_set_intercept(svm, INTERCEPT_SKINIT);
+       svm_set_intercept(svm, INTERCEPT_WBINVD);
+       svm_set_intercept(svm, INTERCEPT_XSETBV);
+       svm_set_intercept(svm, INTERCEPT_RDPRU);
+       svm_set_intercept(svm, INTERCEPT_RSM);
  
        if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
-               set_intercept(svm, INTERCEPT_MONITOR);
-               set_intercept(svm, INTERCEPT_MWAIT);
+               svm_set_intercept(svm, INTERCEPT_MONITOR);
+               svm_set_intercept(svm, INTERCEPT_MWAIT);
        }
  
        if (!kvm_hlt_in_guest(svm->vcpu.kvm))
-               set_intercept(svm, INTERCEPT_HLT);
+               svm_set_intercept(svm, INTERCEPT_HLT);
  
        control->iopm_base_pa = __sme_set(iopm_base);
        control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
-               clr_intercept(svm, INTERCEPT_INVLPG);
+               svm_clr_intercept(svm, INTERCEPT_INVLPG);
                clr_exception_intercept(svm, PF_VECTOR);
                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
                clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
        svm->nested.vmcb = 0;
        svm->vcpu.arch.hflags = 0;
  
 -      if (pause_filter_count) {
 +      if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
                control->pause_filter_count = pause_filter_count;
                if (pause_filter_thresh)
                        control->pause_filter_thresh = pause_filter_thresh;
-               set_intercept(svm, INTERCEPT_PAUSE);
+               svm_set_intercept(svm, INTERCEPT_PAUSE);
        } else {
-               clr_intercept(svm, INTERCEPT_PAUSE);
+               svm_clr_intercept(svm, INTERCEPT_PAUSE);
        }
  
        if (kvm_vcpu_apicv_active(&svm->vcpu))
         * in VMCB and clear intercepts to avoid #VMEXIT.
         */
        if (vls) {
-               clr_intercept(svm, INTERCEPT_VMLOAD);
-               clr_intercept(svm, INTERCEPT_VMSAVE);
+               svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+               svm_clr_intercept(svm, INTERCEPT_VMSAVE);
                svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
        }
  
        if (vgif) {
-               clr_intercept(svm, INTERCEPT_STGI);
-               clr_intercept(svm, INTERCEPT_CLGI);
+               svm_clr_intercept(svm, INTERCEPT_STGI);
+               svm_clr_intercept(svm, INTERCEPT_CLGI);
                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
        }
  
                clr_exception_intercept(svm, UD_VECTOR);
        }
  
-       mark_all_dirty(svm->vmcb);
+       vmcb_mark_all_dirty(svm->vmcb);
  
        enable_gif(svm);
  
@@@ -1257,7 -1272,7 +1272,7 @@@ static void svm_vcpu_load(struct kvm_vc
  
        if (unlikely(cpu != vcpu->cpu)) {
                svm->asid_generation = 0;
-               mark_all_dirty(svm->vmcb);
+               vmcb_mark_all_dirty(svm->vmcb);
        }
  
  #ifdef CONFIG_X86_64
@@@ -1356,7 -1371,7 +1371,7 @@@ static void svm_set_vintr(struct vcpu_s
  
        /* The following fields are ignored when AVIC is enabled */
        WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
-       set_intercept(svm, INTERCEPT_VINTR);
+       svm_set_intercept(svm, INTERCEPT_VINTR);
  
        /*
         * This is just a dummy VINTR to actually cause a vmexit to happen.
        control->int_ctl &= ~V_INTR_PRIO_MASK;
        control->int_ctl |= V_IRQ_MASK |
                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-       mark_dirty(svm->vmcb, VMCB_INTR);
+       vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
  }
  
  static void svm_clear_vintr(struct vcpu_svm *svm)
  {
        const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK;
-       clr_intercept(svm, INTERCEPT_VINTR);
+       svm_clr_intercept(svm, INTERCEPT_VINTR);
  
        /* Drop int_ctl fields related to VINTR injection.  */
        svm->vmcb->control.int_ctl &= mask;
                svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
        }
  
-       mark_dirty(svm->vmcb, VMCB_INTR);
+       vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
  }
  
  static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@@ -1503,7 -1518,7 +1518,7 @@@ static void svm_set_idt(struct kvm_vcp
  
        svm->vmcb->save.idtr.limit = dt->size;
        svm->vmcb->save.idtr.base = dt->address ;
-       mark_dirty(svm->vmcb, VMCB_DT);
+       vmcb_mark_dirty(svm->vmcb, VMCB_DT);
  }
  
  static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@@ -1520,7 -1535,7 +1535,7 @@@ static void svm_set_gdt(struct kvm_vcp
  
        svm->vmcb->save.gdtr.limit = dt->size;
        svm->vmcb->save.gdtr.base = dt->address ;
-       mark_dirty(svm->vmcb, VMCB_DT);
+       vmcb_mark_dirty(svm->vmcb, VMCB_DT);
  }
  
  static void update_cr0_intercept(struct vcpu_svm *svm)
        *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
                | (gcr0 & SVM_CR0_SELECTIVE_MASK);
  
-       mark_dirty(svm->vmcb, VMCB_CR);
+       vmcb_mark_dirty(svm->vmcb, VMCB_CR);
  
        if (gcr0 == *hcr0) {
                clr_cr_intercept(svm, INTERCEPT_CR0_READ);
@@@ -1572,7 -1587,7 +1587,7 @@@ void svm_set_cr0(struct kvm_vcpu *vcpu
        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
                cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
        svm->vmcb->save.cr0 = cr0;
-       mark_dirty(svm->vmcb, VMCB_CR);
+       vmcb_mark_dirty(svm->vmcb, VMCB_CR);
        update_cr0_intercept(svm);
  }
  
@@@ -1592,7 -1607,7 +1607,7 @@@ int svm_set_cr4(struct kvm_vcpu *vcpu, 
                cr4 |= X86_CR4_PAE;
        cr4 |= host_cr4_mce;
        to_svm(vcpu)->vmcb->save.cr4 = cr4;
-       mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+       vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
        return 0;
  }
  
@@@ -1624,10 -1639,10 +1639,10 @@@ static void svm_set_segment(struct kvm_
                /* This is symmetric with svm_get_segment() */
                svm->vmcb->save.cpl = (var->dpl & 3);
  
-       mark_dirty(svm->vmcb, VMCB_SEG);
+       vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
  }
  
- static void update_bp_intercept(struct kvm_vcpu *vcpu)
+ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
                        set_exception_intercept(svm, BP_VECTOR);
-       } else
-               vcpu->guest_debug = 0;
+       }
  }
  
  static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
        svm->asid_generation = sd->asid_generation;
        svm->vmcb->control.asid = sd->next_asid++;
  
-       mark_dirty(svm->vmcb, VMCB_ASID);
+       vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
  }
  
  static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
  
        if (unlikely(value != vmcb->save.dr6)) {
                vmcb->save.dr6 = value;
-               mark_dirty(vmcb, VMCB_DR);
+               vmcb_mark_dirty(vmcb, VMCB_DR);
        }
  }
  
@@@ -1687,7 -1701,7 +1701,7 @@@ static void svm_set_dr7(struct kvm_vcp
        struct vcpu_svm *svm = to_svm(vcpu);
  
        svm->vmcb->save.dr7 = value;
-       mark_dirty(svm->vmcb, VMCB_DR);
+       vmcb_mark_dirty(svm->vmcb, VMCB_DR);
  }
  
  static int pf_interception(struct vcpu_svm *svm)
@@@ -2000,8 -2014,8 +2014,8 @@@ void svm_set_gif(struct vcpu_svm *svm, 
                 * again while processing KVM_REQ_EVENT if needed.
                 */
                if (vgif_enabled(svm))
-                       clr_intercept(svm, INTERCEPT_STGI);
-               if (is_intercept(svm, INTERCEPT_VINTR))
+                       svm_clr_intercept(svm, INTERCEPT_STGI);
+               if (svm_is_intercept(svm, INTERCEPT_VINTR))
                        svm_clear_vintr(svm);
  
                enable_gif(svm);
@@@ -2162,7 -2176,7 +2176,7 @@@ static int cpuid_interception(struct vc
  static int iret_interception(struct vcpu_svm *svm)
  {
        ++svm->vcpu.stat.nmi_window_exits;
-       clr_intercept(svm, INTERCEPT_IRET);
+       svm_clr_intercept(svm, INTERCEPT_IRET);
        svm->vcpu.arch.hflags |= HF_IRET_MASK;
        svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
@@@ -2358,8 -2372,10 +2372,10 @@@ static int svm_get_msr_feature(struct k
                if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
                        msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
                break;
+       case MSR_IA32_PERF_CAPABILITIES:
+               return 0;
        default:
-               return 1;
+               return KVM_MSR_RET_INVALID;
        }
  
        return 0;
@@@ -2512,7 -2528,7 +2528,7 @@@ static int svm_set_msr(struct kvm_vcpu 
                        return 1;
                vcpu->arch.pat = data;
                svm->vmcb->save.g_pat = data;
-               mark_dirty(svm->vmcb, VMCB_NPT);
+               vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
                break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
                        return 1;
  
-               if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+               if (kvm_spec_ctrl_test_value(data))
                        return 1;
  
                svm->spec_ctrl = data;
                        return 1;
  
                svm->vmcb->save.dbgctl = data;
-               mark_dirty(svm->vmcb, VMCB_LBR);
+               vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
                if (data & (1ULL<<0))
                        svm_enable_lbrv(svm);
                else
@@@ -2693,7 -2709,7 +2709,7 @@@ static int pause_interception(struct vc
        struct kvm_vcpu *vcpu = &svm->vcpu;
        bool in_kernel = (svm_get_cpl(vcpu) == 0);
  
 -      if (pause_filter_thresh)
 +      if (!kvm_pause_in_guest(vcpu->kvm))
                grow_ple_window(vcpu);
  
        kvm_vcpu_on_spin(vcpu, in_kernel);
@@@ -2947,6 -2963,7 +2963,7 @@@ static int handle_exit(struct kvm_vcpu 
                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                kvm_run->fail_entry.hardware_entry_failure_reason
                        = svm->vmcb->control.exit_code;
+               kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                dump_vmcb(vcpu);
                return 0;
        }
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror =
                        KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
-               vcpu->run->internal.ndata = 1;
+               vcpu->run->internal.ndata = 2;
                vcpu->run->internal.data[0] = exit_code;
+               vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
                return 0;
        }
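
The extra data word carries the physical CPU of the last VM-entry, matching the "Report last CPU for debugging" item in this pull. A hedged sketch of how a VMM's run loop might surface it; the helper below is hypothetical:

#include <stdio.h>
#include <linux/kvm.h>

/* Hypothetical helper, called after KVM_RUN returns with an error exit. */
static void example_report_internal_error(const struct kvm_run *run)
{
        if (run->exit_reason != KVM_EXIT_INTERNAL_ERROR)
                return;
        fprintf(stderr, "KVM internal error, suberror %u\n",
                run->internal.suberror);
        /* With this change, data[1] holds the last pCPU the vCPU ran on. */
        if (run->internal.ndata >= 2)
                fprintf(stderr, "exit code 0x%llx on pCPU %llu\n",
                        (unsigned long long)run->internal.data[0],
                        (unsigned long long)run->internal.data[1]);
}
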
  
  
  static void reload_tss(struct kvm_vcpu *vcpu)
  {
-       int cpu = raw_smp_processor_id();
+       struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
  
-       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
        sd->tss_desc->type = 9; /* available 32/64-bit TSS */
        load_TR_desc();
  }
  
  static void pre_svm_run(struct vcpu_svm *svm)
  {
-       int cpu = raw_smp_processor_id();
-       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+       struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
  
        if (sev_guest(svm->vcpu.kvm))
-               return pre_sev_run(svm, cpu);
+               return pre_sev_run(svm, svm->vcpu.cpu);
  
        /* FIXME: handle wraparound of asid_generation */
        if (svm->asid_generation != sd->asid_generation)
@@@ -3019,7 -3034,7 +3034,7 @@@ static void svm_inject_nmi(struct kvm_v
  
        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
        vcpu->arch.hflags |= HF_NMI_MASK;
-       set_intercept(svm, INTERCEPT_IRET);
+       svm_set_intercept(svm, INTERCEPT_IRET);
        ++vcpu->stat.nmi_injections;
  }
  
@@@ -3040,7 -3055,7 +3055,7 @@@ static void update_cr8_intercept(struc
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
-       if (svm_nested_virtualize_tpr(vcpu))
+       if (nested_svm_virtualize_tpr(vcpu))
                return;
  
        clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
@@@ -3096,10 -3111,10 +3111,10 @@@ static void svm_set_nmi_mask(struct kvm
  
        if (masked) {
                svm->vcpu.arch.hflags |= HF_NMI_MASK;
-               set_intercept(svm, INTERCEPT_IRET);
+               svm_set_intercept(svm, INTERCEPT_IRET);
        } else {
                svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-               clr_intercept(svm, INTERCEPT_IRET);
+               svm_clr_intercept(svm, INTERCEPT_IRET);
        }
  }
  
@@@ -3179,7 -3194,7 +3194,7 @@@ static void enable_nmi_window(struct kv
  
        if (!gif_set(svm)) {
                if (vgif_enabled(svm))
-                       set_intercept(svm, INTERCEPT_STGI);
+                       svm_set_intercept(svm, INTERCEPT_STGI);
                return; /* STGI will cause a vm exit */
        }
  
@@@ -3234,7 -3249,7 +3249,7 @@@ static inline void sync_cr8_to_lapic(st
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
-       if (svm_nested_virtualize_tpr(vcpu))
+       if (nested_svm_virtualize_tpr(vcpu))
                return;
  
        if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
@@@ -3248,7 -3263,7 +3263,7 @@@ static inline void sync_lapic_to_cr8(st
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 cr8;
  
-       if (svm_nested_virtualize_tpr(vcpu) ||
+       if (nested_svm_virtualize_tpr(vcpu) ||
            kvm_vcpu_apicv_active(vcpu))
                return;
  
@@@ -3344,6 -3359,60 +3359,60 @@@ static fastpath_t svm_exit_handlers_fas
  
  void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
  
+ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+                                       struct vcpu_svm *svm)
+ {
+       /*
+        * VMENTER enables interrupts (host state), but interrupts are
+        * disabled in the kernel state when this is invoked. Also tell
+        * RCU about it. This is the same logic as for exit_to_user_mode().
+        *
+        * This ensures that e.g. latency analysis on the host observes
+        * guest mode as interrupt enabled.
+        *
+        * guest_enter_irqoff() informs context tracking about the
+        * transition to guest mode and if enabled adjusts RCU state
+        * accordingly.
+        */
+       instrumentation_begin();
+       trace_hardirqs_on_prepare();
+       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+       instrumentation_end();
+       guest_enter_irqoff();
+       lockdep_hardirqs_on(CALLER_ADDR0);
+       __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+ #ifdef CONFIG_X86_64
+       native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+ #else
+       loadsegment(fs, svm->host.fs);
+ #ifndef CONFIG_X86_32_LAZY_GS
+       loadsegment(gs, svm->host.gs);
+ #endif
+ #endif
+       /*
+        * VMEXIT disables interrupts (host state), but tracing and lockdep
+        * have them in state 'on' as recorded before entering guest mode.
+        * Same as enter_from_user_mode().
+        *
+        * guest_exit_irqoff() restores host context and reinstates RCU if
+        * enabled and required.
+        *
+        * This needs to be done before the below as native_read_msr()
+        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
+        * into other instrumentable code as well.
+        */
+       lockdep_hardirqs_off(CALLER_ADDR0);
+       guest_exit_irqoff();
+       instrumentation_begin();
+       trace_hardirqs_off_finish();
+       instrumentation_end();
+ }
  static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
  {
        fastpath_t exit_fastpath;
         */
        x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
  
-       __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
- #ifdef CONFIG_X86_64
-       wrmsrl(MSR_GS_BASE, svm->host.gs_base);
- #else
-       loadsegment(fs, svm->host.fs);
- #ifndef CONFIG_X86_32_LAZY_GS
-       loadsegment(gs, svm->host.gs);
- #endif
- #endif
+       svm_vcpu_enter_exit(vcpu, svm);
  
        /*
         * We do not use IBRS in the kernel. If this vCPU has used the
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
                svm_handle_mce(svm);
  
-       mark_all_clean(svm->vmcb);
+       vmcb_mark_all_clean(svm->vmcb);
        return exit_fastpath;
  }
  
- static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
+ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+                            int root_level)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr3;
        cr3 = __sme_set(root);
        if (npt_enabled) {
                svm->vmcb->control.nested_cr3 = cr3;
-               mark_dirty(svm->vmcb, VMCB_NPT);
+               vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
  
                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
        }
  
        svm->vmcb->save.cr3 = cr3;
-       mark_dirty(svm->vmcb, VMCB_CR);
+       vmcb_mark_dirty(svm->vmcb, VMCB_CR);
  }
  
  static int is_disabled(void)
@@@ -3551,7 -3612,7 +3612,7 @@@ static u64 svm_get_mt_mask(struct kvm_v
        return 0;
  }
  
- static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
@@@ -3780,7 -3841,7 +3841,7 @@@ static void svm_handle_exit_irqoff(stru
  
  static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
  {
 -      if (pause_filter_thresh)
 +      if (!kvm_pause_in_guest(vcpu->kvm))
                shrink_ple_window(vcpu);
  }
  
@@@ -3843,6 -3904,7 +3904,7 @@@ static int svm_pre_leave_smm(struct kvm
        struct kvm_host_map map;
        u64 guest;
        u64 vmcb;
+       int ret = 0;
  
        guest = GET_SMSTATE(u64, smstate, 0x7ed8);
        vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
                if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
                        return 1;
                nested_vmcb = map.hva;
-               enter_svm_guest_mode(svm, vmcb, nested_vmcb);
+               ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb);
                kvm_vcpu_unmap(&svm->vcpu, &map, true);
        }
-       return 0;
+       return ret;
  }
  
  static void enable_smi_window(struct kvm_vcpu *vcpu)
  
        if (!gif_set(svm)) {
                if (vgif_enabled(svm))
-                       set_intercept(svm, INTERCEPT_STGI);
+                       svm_set_intercept(svm, INTERCEPT_STGI);
                /* STGI will cause a vm exit */
        } else {
                /* We must be in SMM; RSM will cause a vmexit anyway.  */
@@@ -3958,9 -4021,6 +4021,9 @@@ static void svm_vm_destroy(struct kvm *
  
  static int svm_vm_init(struct kvm *kvm)
  {
 +      if (!pause_filter_count || !pause_filter_thresh)
 +              kvm->arch.pause_in_guest = true;
 +
        if (avic) {
                int ret = avic_vm_init(kvm);
                if (ret)
@@@ -3992,7 -4052,7 +4055,7 @@@ static struct kvm_x86_ops svm_x86_ops _
        .vcpu_blocking = svm_vcpu_blocking,
        .vcpu_unblocking = svm_vcpu_unblocking,
  
-       .update_bp_intercept = update_bp_intercept,
+       .update_exception_bitmap = update_exception_bitmap,
        .get_msr_feature = svm_get_msr_feature,
        .get_msr = svm_get_msr,
        .set_msr = svm_set_msr,
  
        .set_tss_addr = svm_set_tss_addr,
        .set_identity_map_addr = svm_set_identity_map_addr,
-       .get_tdp_level = get_npt_level,
        .get_mt_mask = svm_get_mt_mask,
  
        .get_exit_info = svm_get_exit_info,
  
-       .cpuid_update = svm_cpuid_update,
+       .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
  
        .has_wbinvd_exit = svm_has_wbinvd_exit,
  
index 11e4df5600183af206a0f5fab4adb93108bef6e4,e405e754b592be58c911bbe9cd16839f150cabc2..23b58c28a1c926f461cb87c9ee1f03a310f23e25
@@@ -171,15 -171,6 +171,6 @@@ static int nested_vmx_failInvalid(struc
  static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
                                u32 vm_instruction_error)
  {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       /*
-        * failValid writes the error number to the current VMCS, which
-        * can't be done if there isn't a current VMCS.
-        */
-       if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
-               return nested_vmx_failInvalid(vcpu);
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
        return kvm_skip_emulated_instruction(vcpu);
  }
  
+ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       /*
+        * failValid writes the error number to the current VMCS, which
+        * can't be done if there isn't a current VMCS.
+        */
+       if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+               return nested_vmx_failInvalid(vcpu);
+       return nested_vmx_failValid(vcpu, vm_instruction_error);
+ }
  static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
  {
        /* TODO: not to reset guest simply here. */
@@@ -2157,7 -2162,8 +2162,8 @@@ static void prepare_vmcs02_constant_sta
         * consistency checks.
         */
        if (enable_ept && nested_early_check)
-               vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
+               vmcs_write64(EPT_POINTER,
+                            construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
  
        /* All VMFUNCs are currently emulated through L0 vmexits.  */
        if (cpu_has_vmx_vmfunc())
@@@ -2433,22 -2439,28 +2439,28 @@@ static void prepare_vmcs02_rare(struct 
  
        /*
         * Whether page-faults are trapped is determined by a combination of
-        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-        * If enable_ept, L0 doesn't care about page faults and we should
-        * set all of these to L1's desires. However, if !enable_ept, L0 does
-        * care about (at least some) page faults, and because it is not easy
-        * (if at all possible?) to merge L0 and L1's desires, we simply ask
-        * to exit on each and every L2 page fault. This is done by setting
-        * MASK=MATCH=0 and (see below) EB.PF=1.
+        * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
+        * doesn't care about page faults then we should set all of these to
+        * L1's desires. However, if L0 does care about (some) page faults, it
+        * is not easy (if at all possible?) to merge L0 and L1's desires, so we
+        * simply ask to exit on each and every L2 page fault. This is done by
+        * setting MASK=MATCH=0 and (see below) EB.PF=1.
         * Note that below we don't need special code to set EB.PF beyond the
         * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
         * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
         * !enable_ept, EB.PF is 1, so the "or" will always be 1.
         */
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
-               enable_ept ? vmcs12->page_fault_error_code_mask : 0);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
-               enable_ept ? vmcs12->page_fault_error_code_match : 0);
+       if (vmx_need_pf_intercept(&vmx->vcpu)) {
+               /*
+                * TODO: if both L0 and L1 need the same MASK and MATCH,
+                * go ahead and use it?
+                */
+               vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+               vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+       } else {
+               vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
+               vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
+       }
  
        if (cpu_has_vmx_apicv()) {
                vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
@@@ -3205,6 -3217,43 +3217,43 @@@ static bool nested_get_vmcs12_pages(str
        return true;
  }
  
+ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
+ {
+       struct vmcs12 *vmcs12;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       gpa_t dst;
+       if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+               return 0;
+       if (WARN_ON_ONCE(vmx->nested.pml_full))
+               return 1;
+       /*
+        * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
+        * set is already checked as part of A/D emulation.
+        */
+       vmcs12 = get_vmcs12(vcpu);
+       if (!nested_cpu_has_pml(vmcs12))
+               return 0;
+       if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+               vmx->nested.pml_full = true;
+               return 1;
+       }
+       gpa &= ~0xFFFull;
+       dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
+       if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+                                offset_in_page(dst), sizeof(gpa)))
+               return 0;
+       vmcs12->guest_pml_index--;
+       return 0;
+ }
  /*
   * Intel's VMX Instruction Reference specifies a common set of prerequisites
   * for running VMX instructions (except VMXON, whose prerequisites are
@@@ -3456,19 -3505,18 +3505,18 @@@ static int nested_vmx_run(struct kvm_vc
         * when using the merged vmcs02.
         */
        if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+               return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
  
        if (vmcs12->launch_state == launch)
-               return nested_vmx_failValid(vcpu,
+               return nested_vmx_fail(vcpu,
                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
  
        if (nested_vmx_check_controls(vcpu, vmcs12))
-               return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  
        if (nested_vmx_check_host_state(vcpu, vmcs12))
-               return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+               return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
  
        /*
         * We're finally done with prerequisite checking, and can start with
@@@ -3517,7 -3565,7 +3565,7 @@@ vmentry_failed
        if (status == NVMX_VMENTRY_VMEXIT)
                return 1;
        WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
-       return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+       return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  }
  
  /*
@@@ -4460,7 -4508,7 +4508,7 @@@ void nested_vmx_vmexit(struct kvm_vcpu 
         * flag and the VM-instruction error field of the VMCS
         * accordingly, and skip the emulated instruction.
         */
-       (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+       (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
  
        /*
         * Restore L1's host state to KVM's software model.  We're here
@@@ -4760,8 -4808,7 +4808,7 @@@ static int handle_vmon(struct kvm_vcpu 
        }
  
        if (vmx->nested.vmxon)
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+               return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
  
        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
@@@ -4852,12 -4899,10 +4899,10 @@@ static int handle_vmclear(struct kvm_vc
                return r;
  
        if (!page_address_valid(vcpu, vmptr))
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_VMCLEAR_INVALID_ADDRESS);
+               return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
  
        if (vmptr == vmx->nested.vmxon_ptr)
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_VMCLEAR_VMXON_POINTER);
+               return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
  
        /*
         * When Enlightened VMEntry is enabled on the calling CPU we treat
@@@ -4927,8 -4972,7 +4972,7 @@@ static int handle_vmread(struct kvm_vcp
  
        offset = vmcs_field_to_offset(field);
        if (offset < 0)
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+               return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
  
        if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
                copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
@@@ -5031,8 -5075,7 +5075,7 @@@ static int handle_vmwrite(struct kvm_vc
  
        offset = vmcs_field_to_offset(field);
        if (offset < 0)
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+               return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
  
        /*
         * If the vCPU supports "VMWRITE to any supported field in the
         */
        if (vmcs_field_readonly(field) &&
            !nested_cpu_has_vmwrite_any_field(vcpu))
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+               return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
  
        /*
         * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
@@@ -5116,12 -5158,10 +5158,10 @@@ static int handle_vmptrld(struct kvm_vc
                return r;
  
        if (!page_address_valid(vcpu, vmptr))
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_VMPTRLD_INVALID_ADDRESS);
+               return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
  
        if (vmptr == vmx->nested.vmxon_ptr)
-               return nested_vmx_failValid(vcpu,
-                       VMXERR_VMPTRLD_VMXON_POINTER);
+               return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
  
        /* Forbid normal VMPTRLD if Enlightened version was used */
        if (vmx->nested.hv_evmcs)
                         * given physical address won't match the required
                         * VMCS12_REVISION identifier.
                         */
-                       return nested_vmx_failValid(vcpu,
+                       return nested_vmx_fail(vcpu,
                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
                }
  
                    (new_vmcs12->hdr.shadow_vmcs &&
                     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
                        kvm_vcpu_unmap(vcpu, &map, false);
-                       return nested_vmx_failValid(vcpu,
+                       return nested_vmx_fail(vcpu,
                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
                }
  
@@@ -5233,8 -5273,7 +5273,7 @@@ static int handle_invept(struct kvm_vcp
        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
  
        if (type >= 32 || !(types & (1 << type)))
-               return nested_vmx_failValid(vcpu,
-                               VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  
        /* According to the Intel VMX instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
        switch (type) {
        case VMX_EPT_EXTENT_CONTEXT:
                if (!nested_vmx_check_eptp(vcpu, operand.eptp))
-                       return nested_vmx_failValid(vcpu,
+                       return nested_vmx_fail(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  
                roots_to_free = 0;
@@@ -5315,7 -5354,7 +5354,7 @@@ static int handle_invvpid(struct kvm_vc
                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
  
        if (type >= 32 || !(types & (1 << type)))
-               return nested_vmx_failValid(vcpu,
+               return nested_vmx_fail(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  
        /* according to the intel vmx instruction reference, the memory
                return vmx_handle_memory_failure(vcpu, r, &e);
  
        if (operand.vpid >> 16)
-               return nested_vmx_failValid(vcpu,
+               return nested_vmx_fail(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
  
        vpid02 = nested_get_vpid02(vcpu);
        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
                if (!operand.vpid ||
                    is_noncanonical_address(operand.gla, vcpu))
-                       return nested_vmx_failValid(vcpu,
+                       return nested_vmx_fail(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
                vpid_sync_vcpu_addr(vpid02, operand.gla);
                break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
                if (!operand.vpid)
-                       return nested_vmx_failValid(vcpu,
+                       return nested_vmx_fail(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
                vpid_sync_context(vpid02);
                break;
@@@ -6079,9 -6118,6 +6118,9 @@@ static int vmx_set_nested_state(struct 
            ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
                return -EINVAL;
  
 +      if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
 +              return -EINVAL;
 +
        /*
         * SMM temporarily disables VMX, so we cannot be in guest mode,
         * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
        if (ret)
                return ret;
  
 -      /* Empty 'VMXON' state is permitted */
 -      if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
 -              return 0;
 +      /* Empty 'VMXON' state is permitted if no VMCS loaded */
 +      if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
 +              /* See vmx_has_valid_vmcs12.  */
 +              if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
 +                  (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
 +                  (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
 +                      return -EINVAL;
 +              else
 +                      return 0;
 +      }
  
        if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
                if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
                        goto error_guest_mode;
        }
  
 +      vmx->nested.has_preemption_timer_deadline = false;
        if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
                vmx->nested.has_preemption_timer_deadline = true;
                vmx->nested.preemption_timer_deadline =
@@@ -6333,7 -6361,8 +6372,8 @@@ void nested_vmx_setup_ctls_msrs(struct 
  
        /*
         * secondary cpu-based controls.  Do not include those that
-        * depend on CPUID bits, they are added later by vmx_cpuid_update.
+        * depend on CPUID bits, they are added later by
+        * vmx_vcpu_after_set_cpuid.
         */
        if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
                rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
@@@ -6514,6 -6543,7 +6554,7 @@@ struct kvm_x86_nested_ops vmx_nested_op
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
        .get_vmcs12_pages = nested_get_vmcs12_pages,
+       .write_log_dirty = nested_vmx_write_pml_buffer,
        .enable_evmcs = nested_enable_evmcs,
        .get_evmcs_version = nested_get_evmcs_version,
  };
diff --combined arch/x86/kvm/vmx/vmx.c
index 559634b59d2a25c0a0e48eebfb2464ead9b6fdba,a70d8f6d8aba77fabebcf4fa4b171e46648f206d..46ba2e03a8926d7a3d47463b5661ecd857aa76fb
@@@ -27,7 -27,6 +27,7 @@@
  #include <linux/slab.h>
  #include <linux/tboot.h>
  #include <linux/trace_events.h>
 +#include <linux/entry-kvm.h>
  
  #include <asm/apic.h>
  #include <asm/asm.h>
@@@ -781,7 -780,7 +781,7 @@@ void update_exception_bitmap(struct kvm
                eb |= 1u << BP_VECTOR;
        if (to_vmx(vcpu)->rmode.vm86_active)
                eb = ~0;
-       if (enable_ept)
+       if (!vmx_need_pf_intercept(vcpu))
                eb &= ~(1u << PF_VECTOR);
  
        /* When we are running a nested L2 guest and L1 specified for it a
@@@ -1170,7 -1169,7 +1170,7 @@@ void vmx_prepare_switch_to_guest(struc
  
        gs_base = cpu_kernelmode_gs_base(cpu);
        if (likely(is_64bit_mm(current->mm))) {
 -              save_fsgs_for_kvm();
 +              current_save_fsgs();
                fs_sel = current->thread.fsindex;
                gs_sel = current->thread.gsindex;
                fs_base = current->thread.fsbase;
@@@ -1816,7 -1815,7 +1816,7 @@@ static int vmx_get_msr_feature(struct k
                msr->data = vmx_get_perf_capabilities();
                return 0;
        default:
-               return 1;
+               return KVM_MSR_RET_INVALID;
        }
  }
  
@@@ -2063,7 -2062,7 +2063,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                        return 1;
  
-               if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+               if (kvm_spec_ctrl_test_value(data))
                        return 1;
  
                vmx->spec_ctrl = data;
@@@ -2934,14 -2933,16 +2934,16 @@@ static void vmx_flush_tlb_all(struct kv
  
  static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
  {
-       u64 root_hpa = vcpu->arch.mmu->root_hpa;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 root_hpa = mmu->root_hpa;
  
        /* No flush required if the current context is invalid. */
        if (!VALID_PAGE(root_hpa))
                return;
  
        if (enable_ept)
-               ept_sync_context(construct_eptp(vcpu, root_hpa));
+               ept_sync_context(construct_eptp(vcpu, root_hpa,
+                                               mmu->shadow_root_level));
        else if (!is_guest_mode(vcpu))
                vpid_sync_context(to_vmx(vcpu)->vpid);
        else
@@@ -3064,26 -3065,19 +3066,19 @@@ void vmx_set_cr0(struct kvm_vcpu *vcpu
        vmx->emulation_required = emulation_required(vcpu);
  }
  
- static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
+ static int vmx_get_max_tdp_level(void)
  {
-       if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+       if (cpu_has_vmx_ept_5levels())
                return 5;
        return 4;
  }
  
- static int get_ept_level(struct kvm_vcpu *vcpu)
- {
-       if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
-               return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
-       return vmx_get_tdp_level(vcpu);
- }
- u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
+ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
+                  int root_level)
  {
        u64 eptp = VMX_EPTP_MT_WB;
  
-       eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+       eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
  
        if (enable_ept_ad_bits &&
            (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
        return eptp;
  }
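construct_eptp() now takes the root level from the caller rather than recomputing it from CPUID or vmcs12. As a worked example of the encoding it builds, here is a standalone sketch using the architectural bit positions (constants are written out locally, not taken from kernel headers):

#include <stdint.h>
#include <stdio.h>

#define EPTP_MT_WB	0x6ull		/* memory type: write-back */
#define EPTP_PWL_4	(3ull << 3)	/* page-walk length field = levels - 1 */
#define EPTP_PWL_5	(4ull << 3)
#define EPTP_AD_ENABLE	(1ull << 6)	/* accessed/dirty flags enable */

int main(void)
{
	uint64_t root_hpa = 0x123456000ull;	/* page-aligned EPT root */
	int root_level = 4;			/* now supplied by the caller */
	uint64_t eptp = EPTP_MT_WB;

	eptp |= (root_level == 5) ? EPTP_PWL_5 : EPTP_PWL_4;
	eptp |= EPTP_AD_ENABLE;			/* only when A/D bits are in use */
	eptp |= root_hpa;			/* bits 12..MAXPHYADDR-1 */

	printf("EPTP = %#llx\n", (unsigned long long)eptp);
	return 0;
}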
  
- void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
+ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
+                            int pgd_level)
  {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
        u64 eptp;
  
        if (enable_ept) {
-               eptp = construct_eptp(vcpu, pgd);
+               eptp = construct_eptp(vcpu, pgd, pgd_level);
                vmcs_write64(EPT_POINTER, eptp);
  
                if (kvm_x86_ops.tlb_remote_flush) {
@@@ -4356,6 -4351,16 +4352,16 @@@ static void init_vmcs(struct vcpu_vmx *
                vmx->pt_desc.guest.output_mask = 0x7F;
                vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
        }
+       /*
+        * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+        * between guest and host.  In that case we only care about present
+        * faults.
+        */
+       if (enable_ept) {
+               vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
+               vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
+       }
  }
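The PFEC mask/match programming relies on the architectural rule for conditional #PF exits: with the #PF bit set in the exception bitmap, a guest page fault causes a VM exit only if (error_code & PFEC_MASK) == PFEC_MATCH, and the comparison is inverted when the bit is clear. Setting both fields to the present bit therefore lets an enabled intercept (used for the MAXPHYADDR emulation) see only faults on present pages. A standalone statement of the rule:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT	(1u << 0)

static bool pf_causes_vmexit(uint32_t error_code, uint32_t pfec_mask,
			     uint32_t pfec_match, bool pf_intercepted)
{
	bool match = (error_code & pfec_mask) == pfec_match;

	/* The sense of the comparison flips when the #PF bit is clear. */
	return pf_intercepted ? match : !match;
}

int main(void)
{
	/* MASK = MATCH = P: not-present faults stay in the guest. */
	printf("not-present fault exits: %d\n",
	       pf_causes_vmexit(0, PFERR_PRESENT, PFERR_PRESENT, true));
	printf("present fault exits: %d\n",
	       pf_causes_vmexit(PFERR_PRESENT, PFERR_PRESENT, PFERR_PRESENT, true));
	return 0;
}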
  
  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@@ -4782,18 -4787,25 +4788,25 @@@ static int handle_exception_nmi(struct 
            !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-               vcpu->run->internal.ndata = 3;
+               vcpu->run->internal.ndata = 4;
                vcpu->run->internal.data[0] = vect_info;
                vcpu->run->internal.data[1] = intr_info;
                vcpu->run->internal.data[2] = error_code;
+               vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
                return 0;
        }
  
        if (is_page_fault(intr_info)) {
                cr2 = vmx_get_exit_qual(vcpu);
-               /* EPT won't cause page fault directly */
-               WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept);
-               return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+               if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+                       /*
+                        * EPT will cause page fault only if we need to
+                        * detect illegal GPAs.
+                        */
+                       kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+                       return 1;
+               } else
+                       return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
        }
  
        ex_no = intr_info & INTR_INFO_VECTOR_MASK;
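Both this hunk and the update_exception_bitmap() change earlier in the file hinge on vmx_need_pf_intercept(). In this series the helper is roughly the following (a sketch of the vmx.h definition, quoted from memory):

static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
{
	/* Intercept #PF under shadow paging, or when the guest MAXPHYADDR is
	 * smaller than the host's and illegal GPAs must be caught in software. */
	return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
}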
@@@ -5309,6 -5321,18 +5322,18 @@@ static int handle_ept_violation(struct 
               PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
  
        vcpu->arch.exit_qualification = exit_qualification;
+       /*
+        * Check that the GPA doesn't exceed physical memory limits, as that is
+        * a guest page fault.  We have to emulate the instruction here, because
+        * if the illegal address is that of a paging structure, the
+        * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported,
+        * we could use advanced VM-exit information for EPT violations to
+        * reconstruct the page fault error code.
+        */
+       if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+               return kvm_emulate_instruction(vcpu, 0);
        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
  }
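kvm_mmu_is_illegal_gpa() is the small helper introduced for the GUEST_MAXPHYADDR < HOST_MAXPHYADDR support; approximately (a sketch of the mmu.h helper):

static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	/* Anything at or above 2^guest_MAXPHYADDR cannot be a valid GPA. */
	return gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu));
}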
  
@@@ -5374,12 -5398,14 +5399,12 @@@ static int handle_invalid_guest_state(s
                }
  
                /*
 -               * Note, return 1 and not 0, vcpu_run() is responsible for
 -               * morphing the pending signal into the proper return code.
 +               * Note, return 1 and not 0, vcpu_run() will invoke
 +               * xfer_to_guest_mode() which will create a proper return
 +               * code.
                 */
 -              if (signal_pending(current))
 +              if (__xfer_to_guest_mode_work_pending())
                        return 1;
 -
 -              if (need_resched())
 -                      schedule();
        }
  
        return 1;
@@@ -6005,6 -6031,7 +6030,7 @@@ static int vmx_handle_exit(struct kvm_v
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = exit_reason;
+               vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                return 0;
        }
  
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
+               vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                return 0;
        }
  
                        vcpu->run->internal.data[3] =
                                vmcs_read64(GUEST_PHYSICAL_ADDRESS);
                }
+               vcpu->run->internal.data[vcpu->run->internal.ndata++] =
+                       vcpu->arch.last_vmentry_cpu;
                return 0;
        }
  
@@@ -6094,8 -6124,9 +6123,9 @@@ unexpected_vmexit
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        vcpu->run->internal.suberror =
                        KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
-       vcpu->run->internal.ndata = 1;
+       vcpu->run->internal.ndata = 2;
        vcpu->run->internal.data[0] = exit_reason;
+       vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
        return 0;
  }
  
   * information but as all relevant affected CPUs have 32KiB L1D cache size
   * there is no point in doing so.
   */
- static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
  {
        int size = PAGE_SIZE << L1D_CACHE_ORDER;
  
        vcpu->stat.l1d_flush++;
  
        if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
-               wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+               native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
                return;
        }
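vmx_l1d_flush() now runs inside the noinstr VM-entry path, so the MSR write uses the raw accessor: the regular wrmsrl() wrapper can go through tracepoints and paravirt indirection, neither of which is allowed in a noinstr section. The pattern, as a sketch:

/* Allowed in a noinstr section: no tracing, no paravirt indirection. */
static noinstr void l1d_flush_msr_sketch(void)
{
	native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
}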
  
@@@ -6628,7 -6659,7 +6658,7 @@@ static void vmx_update_hv_timer(struct 
        }
  }
  
- void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
  {
        if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
                vmx->loaded_vmcs->host_state.rsp = host_rsp;
@@@ -6650,6 -6681,63 +6680,63 @@@ static fastpath_t vmx_exit_handlers_fas
  
  bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
  
+ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+                                       struct vcpu_vmx *vmx)
+ {
+       /*
+        * VMENTER enables interrupts (host state), but interrupts are
+        * disabled in the kernel when this is invoked. Also tell RCU about
+        * it. This is the same logic as for exit_to_user_mode().
+        *
+        * This ensures that e.g. latency analysis on the host observes
+        * guest mode as interrupt enabled.
+        *
+        * guest_enter_irqoff() informs context tracking about the
+        * transition to guest mode and if enabled adjusts RCU state
+        * accordingly.
+        */
+       instrumentation_begin();
+       trace_hardirqs_on_prepare();
+       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+       instrumentation_end();
+       guest_enter_irqoff();
+       lockdep_hardirqs_on(CALLER_ADDR0);
+       /* L1D Flush includes CPU buffer clear to mitigate MDS */
+       if (static_branch_unlikely(&vmx_l1d_should_flush))
+               vmx_l1d_flush(vcpu);
+       else if (static_branch_unlikely(&mds_user_clear))
+               mds_clear_cpu_buffers();
+       if (vcpu->arch.cr2 != native_read_cr2())
+               native_write_cr2(vcpu->arch.cr2);
+       vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+                                  vmx->loaded_vmcs->launched);
+       vcpu->arch.cr2 = native_read_cr2();
+       /*
+        * VMEXIT disables interrupts (host state), but tracing and lockdep
+        * have them in state 'on' as recorded before entering guest mode.
+        * Same as enter_from_user_mode().
+        *
+        * guest_exit_irqoff() restores host context and reinstates RCU if
+        * enabled and required.
+        *
+        * This needs to be done before the code below, as native_read_msr()
+        * contains a tracepoint and x86_spec_ctrl_restore_host() calls into
+        * further instrumentable code.
+        */
+       lockdep_hardirqs_off(CALLER_ADDR0);
+       guest_exit_irqoff();
+       instrumentation_begin();
+       trace_hardirqs_off_finish();
+       instrumentation_end();
+ }
  static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
  {
        fastpath_t exit_fastpath;
@@@ -6724,19 -6812,8 +6811,8 @@@ reenter_guest
         */
        x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
  
-       /* L1D Flush includes CPU buffer clear to mitigate MDS */
-       if (static_branch_unlikely(&vmx_l1d_should_flush))
-               vmx_l1d_flush(vcpu);
-       else if (static_branch_unlikely(&mds_user_clear))
-               mds_clear_cpu_buffers();
-       if (vcpu->arch.cr2 != read_cr2())
-               write_cr2(vcpu->arch.cr2);
-       vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
-                                  vmx->loaded_vmcs->launched);
-       vcpu->arch.cr2 = read_cr2();
+       /* The actual VMENTER/EXIT is in the .noinstr.text section. */
+       vmx_vcpu_enter_exit(vcpu, vmx);
  
        /*
         * We do not use IBRS in the kernel. If this vCPU has used the
@@@ -7229,7 -7306,7 +7305,7 @@@ static void update_intel_pt_cfg(struct 
                vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
  }
  
- static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
@@@ -7478,42 -7555,6 +7554,6 @@@ static void vmx_flush_log_dirty(struct 
        kvm_flush_pml_buffers(kvm);
  }
  
- static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
- {
-       struct vmcs12 *vmcs12;
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       gpa_t dst;
-       if (is_guest_mode(vcpu)) {
-               WARN_ON_ONCE(vmx->nested.pml_full);
-               /*
-                * Check if PML is enabled for the nested guest.
-                * Whether eptp bit 6 is set is already checked
-                * as part of A/D emulation.
-                */
-               vmcs12 = get_vmcs12(vcpu);
-               if (!nested_cpu_has_pml(vmcs12))
-                       return 0;
-               if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
-                       vmx->nested.pml_full = true;
-                       return 1;
-               }
-               gpa &= ~0xFFFull;
-               dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
-               if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
-                                        offset_in_page(dst), sizeof(gpa)))
-                       return 0;
-               vmcs12->guest_pml_index--;
-       }
-       return 0;
- }
  static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
                                           struct kvm_memory_slot *memslot,
                                           gfn_t offset, unsigned long mask)
@@@ -7858,7 -7899,7 +7898,7 @@@ static struct kvm_x86_ops vmx_x86_ops _
        .vcpu_load = vmx_vcpu_load,
        .vcpu_put = vmx_vcpu_put,
  
-       .update_bp_intercept = update_exception_bitmap,
+       .update_exception_bitmap = update_exception_bitmap,
        .get_msr_feature = vmx_get_msr_feature,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,
  
        .set_tss_addr = vmx_set_tss_addr,
        .set_identity_map_addr = vmx_set_identity_map_addr,
-       .get_tdp_level = vmx_get_tdp_level,
        .get_mt_mask = vmx_get_mt_mask,
  
        .get_exit_info = vmx_get_exit_info,
  
-       .cpuid_update = vmx_cpuid_update,
+       .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
  
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
  
        .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
        .flush_log_dirty = vmx_flush_log_dirty,
        .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
-       .write_log_dirty = vmx_write_pml_buffer,
  
        .pre_block = vmx_pre_block,
        .post_block = vmx_post_block,
@@@ -8070,7 -8109,7 +8108,7 @@@ static __init int hardware_setup(void
                ept_lpage_level = PG_LEVEL_2M;
        else
                ept_lpage_level = PG_LEVEL_4K;
-       kvm_configure_mmu(enable_ept, ept_lpage_level);
+       kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
  
        /*
         * Only enable PML when hardware supports PML feature, and both EPT
@@@ -8265,6 -8304,13 +8303,13 @@@ static int __init vmx_init(void
  #endif
        vmx_check_vmcs12_offsets();
  
+       /*
+        * Intel processors don't have problems with
+        * GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable
+        * it for VMX by default.
+        */
+       allow_smaller_maxphyaddr = true;
        return 0;
  }
  module_init(vmx_init);
diff --combined arch/x86/kvm/x86.c
index 21d5e7a7ffd0015819182e0fef28b809c9b69e64,dc4370394ab8e897d24e202766100cffcb7fe9e7..12ea77f99ff350d48c2b61d68687cc4cacfe7183
@@@ -56,7 -56,6 +56,7 @@@
  #include <linux/sched/stat.h>
  #include <linux/sched/isolation.h>
  #include <linux/mem_encrypt.h>
 +#include <linux/entry-kvm.h>
  
  #include <trace/events/kvm.h>
  
@@@ -188,6 -187,9 +188,9 @@@ static struct kvm_shared_msrs __percpu 
  u64 __read_mostly host_efer;
  EXPORT_SYMBOL_GPL(host_efer);
  
+ bool __read_mostly allow_smaller_maxphyaddr;
+ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
  static u64 __read_mostly host_xss;
  u64 __read_mostly supported_xss;
  EXPORT_SYMBOL_GPL(supported_xss);
@@@ -244,6 -246,29 +247,29 @@@ static struct kmem_cache *x86_fpu_cache
  
  static struct kmem_cache *x86_emulator_cache;
  
+ /*
+  * Called when the previous get/set of an MSR reached an invalid (unknown) MSR.
+  * Return 0 if the failed MSR access should be ignored/silenced, or 1 if the
+  * failure should be propagated to the caller.
+  */
+ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+                                u64 data, bool write)
+ {
+       const char *op = write ? "wrmsr" : "rdmsr";
+       if (ignore_msrs) {
+               if (report_ignored_msrs)
+                       vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n",
+                                   op, msr, data);
+               /* Mask the error */
+               return 0;
+       } else {
+               vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
+                                      op, msr, data);
+               return 1;
+       }
+ }
  static struct kmem_cache *kvm_alloc_emulator_cache(void)
  {
        unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
@@@ -380,7 -405,7 +406,7 @@@ int kvm_set_apic_base(struct kvm_vcpu *
  }
  EXPORT_SYMBOL_GPL(kvm_set_apic_base);
  
- asmlinkage __visible void kvm_spurious_fault(void)
+ asmlinkage __visible noinstr void kvm_spurious_fault(void)
  {
        /* Fault while not rebooting.  We want the trace. */
        BUG_ON(!kvm_rebooting);
@@@ -776,6 -801,7 +802,7 @@@ EXPORT_SYMBOL_GPL(pdptrs_changed)
  int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
  {
        unsigned long old_cr0 = kvm_read_cr0(vcpu);
+       unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
        unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
  
        cr0 |= X86_CR0_ET;
        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
                return 1;
  
-       if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+       if (cr0 & X86_CR0_PG) {
  #ifdef CONFIG_X86_64
-               if ((vcpu->arch.efer & EFER_LME)) {
+               if (!is_paging(vcpu) && (vcpu->arch.efer & EFER_LME)) {
                        int cs_db, cs_l;
  
                        if (!is_pae(vcpu))
                                return 1;
                } else
  #endif
-               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
-                                                kvm_read_cr3(vcpu)))
+               if (is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
+                   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
                        return 1;
        }
  
@@@ -917,7 -943,7 +944,7 @@@ static int __kvm_set_xcr(struct kvm_vcp
        vcpu->arch.xcr0 = xcr0;
  
        if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
-               kvm_update_cpuid(vcpu);
+               kvm_update_cpuid_runtime(vcpu);
        return 0;
  }
  
@@@ -932,37 -958,17 +959,17 @@@ int kvm_set_xcr(struct kvm_vcpu *vcpu, 
  }
  EXPORT_SYMBOL_GPL(kvm_set_xcr);
  
- #define __cr4_reserved_bits(__cpu_has, __c)           \
- ({                                                    \
-       u64 __reserved_bits = CR4_RESERVED_BITS;        \
-                                                       \
-       if (!__cpu_has(__c, X86_FEATURE_XSAVE))         \
-               __reserved_bits |= X86_CR4_OSXSAVE;     \
-       if (!__cpu_has(__c, X86_FEATURE_SMEP))          \
-               __reserved_bits |= X86_CR4_SMEP;        \
-       if (!__cpu_has(__c, X86_FEATURE_SMAP))          \
-               __reserved_bits |= X86_CR4_SMAP;        \
-       if (!__cpu_has(__c, X86_FEATURE_FSGSBASE))      \
-               __reserved_bits |= X86_CR4_FSGSBASE;    \
-       if (!__cpu_has(__c, X86_FEATURE_PKU))           \
-               __reserved_bits |= X86_CR4_PKE;         \
-       if (!__cpu_has(__c, X86_FEATURE_LA57))          \
-               __reserved_bits |= X86_CR4_LA57;        \
-       if (!__cpu_has(__c, X86_FEATURE_UMIP))          \
-               __reserved_bits |= X86_CR4_UMIP;        \
-       __reserved_bits;                                \
- })
- static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+ int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  {
        if (cr4 & cr4_reserved_bits)
                return -EINVAL;
  
-       if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu))
+       if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
                return -EINVAL;
  
        return 0;
  }
+ EXPORT_SYMBOL_GPL(kvm_valid_cr4);
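The per-guest reserved-bit test now reads a mask cached on the vCPU instead of re-deriving it from CPUID on every CR4 write. The cache is filled from the CPUID update path in cpuid.c, roughly as follows (an approximation; __cr4_reserved_bits() is the macro removed above, evaluated once with guest_cpuid_has() as its predicate):

static void example_update_cr4_rsvd_bits(struct kvm_vcpu *vcpu)
{
	vcpu->arch.cr4_guest_rsvd_bits =
		__cr4_reserved_bits(guest_cpuid_has, vcpu);
}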
  
  int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  {
                kvm_mmu_reset_context(vcpu);
  
        if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
-               kvm_update_cpuid(vcpu);
+               kvm_update_cpuid_runtime(vcpu);
  
        return 0;
  }
@@@ -1111,7 -1117,7 +1118,7 @@@ static int __kvm_set_dr(struct kvm_vcp
        case 4:
                /* fall through */
        case 6:
-               if (val & 0xffffffff00000000ULL)
+               if (!kvm_dr6_valid(val))
                        return -1; /* #GP */
                vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
                break;
@@@ -1390,8 -1396,7 +1397,7 @@@ static int kvm_get_msr_feature(struct k
                rdmsrl_safe(msr->index, &msr->data);
                break;
        default:
-               if (kvm_x86_ops.get_msr_feature(msr))
-                       return 1;
+               return kvm_x86_ops.get_msr_feature(msr);
        }
        return 0;
  }
@@@ -1403,6 -1408,13 +1409,13 @@@ static int do_get_msr_feature(struct kv
  
        msr.index = index;
        r = kvm_get_msr_feature(&msr);
+       if (r == KVM_MSR_RET_INVALID) {
+               /* Unconditionally clear the output for simplicity */
+               *data = 0;
+               r = kvm_msr_ignored_check(vcpu, index, 0, false);
+       }
        if (r)
                return r;
  
@@@ -1517,6 -1529,17 +1530,17 @@@ static int __kvm_set_msr(struct kvm_vcp
        return kvm_x86_ops.set_msr(vcpu, &msr);
  }
  
+ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+                                    u32 index, u64 data, bool host_initiated)
+ {
+       int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
+       if (ret == KVM_MSR_RET_INVALID)
+               ret = kvm_msr_ignored_check(vcpu, index, data, true);
+       return ret;
+ }
  /*
   * Read the MSR specified by @index into @data.  Select MSR specific fault
   * checks are bypassed if @host_initiated is %true.
@@@ -1538,15 -1561,29 +1562,29 @@@ int __kvm_get_msr(struct kvm_vcpu *vcpu
        return ret;
  }
  
+ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+                                    u32 index, u64 *data, bool host_initiated)
+ {
+       int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
+       if (ret == KVM_MSR_RET_INVALID) {
+               /* Unconditionally clear *data for simplicity */
+               *data = 0;
+               ret = kvm_msr_ignored_check(vcpu, index, 0, false);
+       }
+       return ret;
+ }
  int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
  {
-       return __kvm_get_msr(vcpu, index, data, false);
+       return kvm_get_msr_ignored_check(vcpu, index, data, false);
  }
  EXPORT_SYMBOL_GPL(kvm_get_msr);
  
  int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
  {
-       return __kvm_set_msr(vcpu, index, data, false);
+       return kvm_set_msr_ignored_check(vcpu, index, data, false);
  }
  EXPORT_SYMBOL_GPL(kvm_set_msr);
  
@@@ -1588,7 -1625,7 +1626,7 @@@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr)
  bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
  {
        return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
 -              need_resched() || signal_pending(current);
 +              xfer_to_guest_mode_work_pending();
  }
  EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
  
@@@ -1666,12 -1703,12 +1704,12 @@@ EXPORT_SYMBOL_GPL(handle_fastpath_set_m
   */
  static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
  {
-       return __kvm_get_msr(vcpu, index, data, true);
+       return kvm_get_msr_ignored_check(vcpu, index, data, true);
  }
  
  static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
  {
-       return __kvm_set_msr(vcpu, index, *data, true);
+       return kvm_set_msr_ignored_check(vcpu, index, *data, true);
  }
  
  #ifdef CONFIG_X86_64
@@@ -2823,6 -2860,20 +2861,20 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                        return 1;
                vcpu->arch.arch_capabilities = data;
                break;
+       case MSR_IA32_PERF_CAPABILITIES: {
+               struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
+               if (!msr_info->host_initiated)
+                       return 1;
+               if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+                       return 1;
+               if (data & ~msr_ent.data)
+                       return 1;
+               vcpu->arch.perf_capabilities = data;
+               return 0;
+               }
        case MSR_EFER:
                return set_efer(vcpu, msr_info);
        case MSR_K7_HWCR:
                        if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
                                return 1;
                        vcpu->arch.ia32_misc_enable_msr = data;
-                       kvm_update_cpuid(vcpu);
+                       kvm_update_cpuid_runtime(vcpu);
                } else {
                        vcpu->arch.ia32_misc_enable_msr = data;
                }
                        return xen_hvm_config(vcpu, data);
                if (kvm_pmu_is_valid_msr(vcpu, msr))
                        return kvm_pmu_set_msr(vcpu, msr_info);
-               if (!ignore_msrs) {
-                       vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
-                                   msr, data);
-                       return 1;
-               } else {
-                       if (report_ignored_msrs)
-                               vcpu_unimpl(vcpu,
-                                       "ignored wrmsr: 0x%x data 0x%llx\n",
-                                       msr, data);
-                       break;
-               }
+               return KVM_MSR_RET_INVALID;
        }
        return 0;
  }
@@@ -3173,6 -3214,12 +3215,12 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                        return 1;
                msr_info->data = vcpu->arch.arch_capabilities;
                break;
+       case MSR_IA32_PERF_CAPABILITIES:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+                       return 1;
+               msr_info->data = vcpu->arch.perf_capabilities;
+               break;
        case MSR_IA32_POWER_CTL:
                msr_info->data = vcpu->arch.msr_ia32_power_ctl;
                break;
        default:
                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
                        return kvm_pmu_get_msr(vcpu, msr_info);
-               if (!ignore_msrs) {
-                       vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
-                                              msr_info->index);
-                       return 1;
-               } else {
-                       if (report_ignored_msrs)
-                               vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
-                                       msr_info->index);
-                       msr_info->data = 0;
-               }
-               break;
+               return KVM_MSR_RET_INVALID;
        }
        return 0;
  }
@@@ -3477,6 -3514,7 +3515,7 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_MSR_PLATFORM_INFO:
        case KVM_CAP_EXCEPTION_PAYLOAD:
        case KVM_CAP_SET_GUEST_DEBUG:
+       case KVM_CAP_LAST_CPU:
                r = 1;
                break;
        case KVM_CAP_SYNC_REGS:
        case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
                r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
                break;
+       case KVM_CAP_SMALLER_MAXPHYADDR:
+               r = (int) allow_smaller_maxphyaddr;
+               break;
        default:
                break;
        }
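Userspace can probe the new capability with KVM_CHECK_EXTENSION. A minimal standalone check (hypothetical example program; requires uapi headers from a kernel containing this merge):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int cap;

	if (kvm < 0)
		return 1;

	/* Non-zero means the VMM may advertise a guest MAXPHYADDR smaller than
	 * the host's and rely on KVM to emulate the resulting faults. */
	cap = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_SMALLER_MAXPHYADDR);
	printf("KVM_CAP_SMALLER_MAXPHYADDR: %d\n", cap);
	return 0;
}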
@@@ -8155,7 -8196,7 +8197,7 @@@ static void enter_smm(struct kvm_vcpu *
                kvm_x86_ops.set_efer(vcpu, 0);
  #endif
  
-       kvm_update_cpuid(vcpu);
+       kvm_update_cpuid_runtime(vcpu);
        kvm_mmu_reset_context(vcpu);
  }
  
@@@ -8507,7 -8548,6 +8549,6 @@@ static int vcpu_enter_guest(struct kvm_
        }
  
        trace_kvm_entry(vcpu->vcpu_id);
-       guest_enter_irqoff();
  
        fpregs_assert_state_consistent();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
        if (hw_breakpoint_active())
                hw_breakpoint_restore();
  
+       vcpu->arch.last_vmentry_cpu = vcpu->cpu;
        vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
  
        vcpu->mode = OUTSIDE_GUEST_MODE;
        local_irq_disable();
        kvm_after_interrupt(vcpu);
  
-       guest_exit_irqoff();
        if (lapic_in_kernel(vcpu)) {
                s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
                if (delta != S64_MIN) {
@@@ -8682,11 -8722,15 +8723,11 @@@ static int vcpu_run(struct kvm_vcpu *vc
                        break;
                }
  
 -              if (signal_pending(current)) {
 -                      r = -EINTR;
 -                      vcpu->run->exit_reason = KVM_EXIT_INTR;
 -                      ++vcpu->stat.signal_exits;
 -                      break;
 -              }
 -              if (need_resched()) {
 +              if (__xfer_to_guest_mode_work_pending()) {
                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 -                      cond_resched();
 +                      r = xfer_to_guest_mode_handle_work(vcpu);
 +                      if (r)
 +                              return r;
                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
                }
        }
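xfer_to_guest_mode_handle_work() is the generic-entry replacement for the open-coded signal and reschedule handling removed here. Its core loop is approximately the following (paraphrased from kernel/entry/kvm.c; the arch hook and loop condition are simplified):

static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
{
	do {
		int ret;

		if (ti_work & _TIF_SIGPENDING) {
			kvm_handle_signal_exit(vcpu);	/* sets KVM_EXIT_INTR */
			return -EINTR;
		}

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(NULL);
		}

		ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
		if (ret)
			return ret;

		ti_work = READ_ONCE(current_thread_info()->flags);
	} while (ti_work & XFER_TO_GUEST_MODE_WORK);

	return 0;
}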
@@@ -9174,7 -9218,7 +9215,7 @@@ static int __set_sregs(struct kvm_vcpu 
                                (X86_CR4_OSXSAVE | X86_CR4_PKE));
        kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
        if (cpuid_update_needed)
-               kvm_update_cpuid(vcpu);
+               kvm_update_cpuid_runtime(vcpu);
  
        idx = srcu_read_lock(&vcpu->kvm->srcu);
        if (is_pae_paging(vcpu)) {
@@@ -9278,7 -9322,7 +9319,7 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
         */
        kvm_set_rflags(vcpu, rflags);
  
-       kvm_x86_ops.update_bp_intercept(vcpu);
+       kvm_x86_ops.update_exception_bitmap(vcpu);
  
        r = 0;
  
@@@ -9476,7 -9520,6 +9517,6 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        fx_init(vcpu);
  
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-       vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
  
        vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
  
@@@ -9929,7 -9972,7 +9969,7 @@@ void kvm_arch_sync_events(struct kvm *k
  int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
  {
        int i, r;
 -      unsigned long hva, uninitialized_var(old_npages);
 +      unsigned long hva, old_npages;
        struct kvm_memslots *slots = kvm_memslots(kvm);
        struct kvm_memory_slot *slot;
  
@@@ -10673,28 -10716,53 +10713,53 @@@ bool kvm_arch_no_poll(struct kvm_vcpu *
  }
  EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
  
- u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+ int kvm_spec_ctrl_test_value(u64 value)
  {
-       uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
+       /*
+        * Test that setting IA32_SPEC_CTRL to the given value
+        * is allowed by the host processor.
+        */
  
-       /* The STIBP bit doesn't fault even if it's not advertised */
-       if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-           !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
-               bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
-       if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
-           !boot_cpu_has(X86_FEATURE_AMD_IBRS))
-               bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+       u64 saved_value;
+       unsigned long flags;
+       int ret = 0;
  
-       if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
-           !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
-               bits &= ~SPEC_CTRL_SSBD;
-       if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
-           !boot_cpu_has(X86_FEATURE_AMD_SSBD))
-               bits &= ~SPEC_CTRL_SSBD;
+       local_irq_save(flags);
+       if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+               ret = 1;
+       else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
+               ret = 1;
+       else
+               wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
+       local_irq_restore(flags);
  
-       return bits;
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
+ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
+ {
+       struct x86_exception fault;
+       if (!(error_code & PFERR_PRESENT_MASK) ||
+           vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) {
+               /*
+                * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+                * tables probably do not match the TLB.  Just proceed
+                * with the error code that the processor gave.
+                */
+               fault.vector = PF_VECTOR;
+               fault.error_code_valid = true;
+               fault.error_code = error_code;
+               fault.nested_page_fault = false;
+               fault.address = gva;
+       }
+       vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
  }
- EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
+ EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
  
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
diff --combined include/linux/kvm_host.h
index ac83e9c1d82c76a9752989f7cd3aefc6c98ca1c9,989afcbe642fb3bd582d145787b42f6628d30db3..a23076765b4cc26040bd6b3110f06c1d14724e7b
@@@ -211,8 -211,8 +211,8 @@@ struct kvm_async_pf 
  
  void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
  void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
- int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-                      unsigned long hva, struct kvm_arch_async_pf *arch);
+ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                       unsigned long hva, struct kvm_arch_async_pf *arch);
  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
  #endif
  
@@@ -409,7 -409,7 +409,7 @@@ struct kvm_irq_routing_table 
         * Array indexed by gsi. Each entry contains list of irq chips
         * the gsi is connected to.
         */
 -      struct hlist_head map[0];
 +      struct hlist_head map[];
  };
  #endif
  
@@@ -774,6 -774,7 +774,7 @@@ int kvm_clear_guest_page(struct kvm *kv
  int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
  struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
  bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
  unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
  void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
  
@@@ -816,6 -817,13 +817,13 @@@ void kvm_vcpu_on_spin(struct kvm_vcpu *
  void kvm_flush_remote_tlbs(struct kvm *kvm);
  void kvm_reload_remote_mmus(struct kvm *kvm);
  
+ #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
+ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
+ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
+ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
+ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
+ #endif
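These prototypes expose the MMU memory caches that were unified across architectures in this merge. The usage pattern is the usual two-phase one: top up the cache while sleeping is still allowed, then allocate from it under the MMU lock where allocation must not fail. A sketch (the cache field name is illustrative; each architecture keeps its own caches in vcpu->arch):

static int example_map_page(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_memory_cache *mc = &vcpu->arch.example_pte_cache;
	void *pte_page;
	int r;

	/* Phase 1: may sleep and allocate. */
	r = kvm_mmu_topup_memory_cache(mc, 4);
	if (r)
		return r;

	/* Phase 2: under mmu_lock; cannot fail after a successful topup. */
	pte_page = kvm_mmu_memory_cache_alloc(mc);
	/* ... install the new table page ... */
	(void)pte_page;
	return 0;
}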
  bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
                                 struct kvm_vcpu *except,
                                 unsigned long *vcpu_bitmap, cpumask_var_t tmp);
@@@ -1439,12 -1447,4 +1447,12 @@@ int kvm_vm_create_worker_thread(struct 
                                uintptr_t data, const char *name,
                                struct task_struct **thread_ptr);
  
 +#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
 +static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 +{
 +      vcpu->run->exit_reason = KVM_EXIT_INTR;
 +      vcpu->stat.signal_exits++;
 +}
 +#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
 +
  #endif