Merge tag 'x86-fpu-2021-07-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author Linus Torvalds <[email protected]>
Wed, 7 Jul 2021 18:12:01 +0000 (11:12 -0700)
committer Linus Torvalds <[email protected]>
Wed, 7 Jul 2021 18:12:01 +0000 (11:12 -0700)
Pull x86 fpu updates from Thomas Gleixner:
 "Fixes and improvements for FPU handling on x86:

   - Prevent sigaltstack out of bounds writes.

     The kernel unconditionally writes the FPU state to the alternate
     stack without checking whether the stack is large enough to
     accommodate it.

     Check the alternate stack size before writing the state and, if the
     stack is too small, force a SIGSEGV instead of silently corrupting
     user space data.

   - MINSIGSTKSZ and SIGSTKSZ are constants in signal.h and have never
     been updated, even though the FPU state stored on the signal stack
     has grown over time; this causes trouble in the field when AVX512
     is available on a CPU. The kernel does not expose the minimum
     requirements for the alternate stack size depending on the
     available and enabled CPU features.

     ARM already added an aux vector, AT_MINSIGSTKSZ, for the same
     reason. Add it to x86 as well (a userspace usage sketch follows
     this message).

   - A major cleanup of the x86 FPU code. The recent discoveries of
     XSTATE-related issues unearthed quite a few inconsistencies,
     duplicated code and other problems.

     The fine-grained overhaul addresses these, makes the code more
     robust and maintainable, and allows upcoming XSTATE-related
     features to be integrated in sane ways"
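
A minimal illustration (not taken from the quoted message or the diff
below) of how userspace is expected to consume AT_MINSIGSTKSZ: query the
aux vector, fall back to the legacy constant, and hand the result to
sigaltstack(). It assumes a libc that provides getauxval(); if the libc
headers lack AT_MINSIGSTKSZ, it is aux vector entry 51 in the kernel's
uapi auxvec.h.

#include <signal.h>
#include <stdlib.h>
#include <sys/auxv.h>

#ifndef AT_MINSIGSTKSZ
#define AT_MINSIGSTKSZ	51	/* include/uapi/linux/auxvec.h */
#endif

static int setup_altstack(void)
{
	stack_t ss = { 0 };
	/* getauxval() returns 0 if the kernel does not provide the entry. */
	unsigned long minsz = getauxval(AT_MINSIGSTKSZ);

	if (minsz < MINSIGSTKSZ)
		minsz = MINSIGSTKSZ;

	/* Headroom for the handler's own stack usage on top of the
	 * kernel-reported signal frame size. */
	ss.ss_size = minsz + SIGSTKSZ;
	ss.ss_sp = malloc(ss.ss_size);
	if (!ss.ss_sp)
		return -1;

	return sigaltstack(&ss, NULL);
}

Handlers registered with SA_ONSTACK then run on this stack; with the size
check added by this merge, a stack that is still too small gets a SIGSEGV
instead of silent corruption of adjacent user memory.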

* tag 'x86-fpu-2021-07-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (74 commits)
  x86/fpu/xstate: Clear xstate header in copy_xstate_to_uabi_buf() again
  x86/fpu/signal: Let xrstor handle the features to init
  x86/fpu/signal: Handle #PF in the direct restore path
  x86/fpu: Return proper error codes from user access functions
  x86/fpu/signal: Split out the direct restore code
  x86/fpu/signal: Sanitize copy_user_to_fpregs_zeroing()
  x86/fpu/signal: Sanitize the xstate check on sigframe
  x86/fpu/signal: Remove the legacy alignment check
  x86/fpu/signal: Move initial checks into fpu__restore_sig()
  x86/fpu: Mark init_fpstate __ro_after_init
  x86/pkru: Remove xstate fiddling from write_pkru()
  x86/fpu: Don't store PKRU in xstate in fpu_reset_fpstate()
  x86/fpu: Remove PKRU handling from switch_fpu_finish()
  x86/fpu: Mask PKRU from kernel XRSTOR[S] operations
  x86/fpu: Hook up PKRU into ptrace()
  x86/fpu: Add PKRU storage outside of task XSAVE buffer
  x86/fpu: Dont restore PKRU in fpregs_restore_userspace()
  x86/fpu: Rename xfeatures_mask_user() to xfeatures_mask_uabi()
  x86/fpu: Move FXSAVE_LEAK quirk info __copy_kernel_to_fpregs()
  x86/fpu: Rename __fpregs_load_activate() to fpregs_restore_userregs()
  ...

Documentation/x86/index.rst
arch/x86/events/intel/lbr.c
arch/x86/include/asm/processor.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/process.c
arch/x86/kernel/signal.c
arch/x86/kernel/traps.c
arch/x86/kvm/x86.c
arch/x86/mm/fault.c
arch/x86/mm/pkeys.c
include/linux/sched/signal.h

diff --combined Documentation/x86/index.rst
index 0004f5d2283ee0e4c9da4514df0fc9e040e67f78,d58614d5cde6b88ca267a8323b8fb2868f978e19..383048396336fde5c66f7d3b96a54f796f2f6d76
@@@ -29,10 -29,10 +29,11 @@@ x86-specific Documentatio
     microcode
     resctrl
     tsx_async_abort
 +   buslock
     usb-legacy-support
     i386/index
     x86_64/index
     sva
     sgx
     features
+    elf_auxvec
diff --combined arch/x86/events/intel/lbr.c
index e8453de7a96485700e308f340ecd16fd284472c1,f338645071c8959c60cef2f644076713d33d95f8..9e6d6eaeb4cb6037c13e259cf98fdab37c74b1e0
@@@ -491,7 -491,7 +491,7 @@@ static void intel_pmu_arch_lbr_xrstors(
  {
        struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
  
-       copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR);
+       xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR);
  }
  
  static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
@@@ -576,7 -576,7 +576,7 @@@ static void intel_pmu_arch_lbr_xsaves(v
  {
        struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
  
-       copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR);
+       xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR);
  }
  
  static void __intel_pmu_lbr_save(void *ctx)
@@@ -731,8 -731,7 +731,8 @@@ void reserve_lbr_buffers(void
                if (!kmem_cache || cpuc->lbr_xsave)
                        continue;
  
 -              cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, GFP_KERNEL,
 +              cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache,
 +                                                      GFP_KERNEL | __GFP_ZERO,
                                                        cpu_to_node(cpu));
        }
  }
@@@ -993,7 -992,7 +993,7 @@@ static void intel_pmu_arch_lbr_read_xsa
                intel_pmu_store_lbr(cpuc, NULL);
                return;
        }
-       copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR);
+       xsaves(&xsave->xsave, XFEATURE_MASK_LBR);
  
        intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
  }
diff --combined arch/x86/include/asm/processor.h
index 364d0e42e28000f9ffd0fe2fac1725ab4bc64c58,91946fc3c006ec99649fb3b71cba4b26ded4bc13..f3020c54e2cb3db1347a3e497770be8648f71b8a
@@@ -518,6 -518,15 +518,15 @@@ struct thread_struct 
  
        unsigned int            sig_on_uaccess_err:1;
  
+       /*
+        * Protection Keys Register for Userspace.  Loaded immediately on
+        * context switch. Store it in thread_struct to avoid a lookup in
+        * the tasks's FPU xstate buffer. This value is only valid when a
+        * task is scheduled out. For 'current' the authoritative source of
+        * PKRU is the hardware itself.
+        */
+       u32                     pkru;
        /* Floating point and extended processor state */
        struct fpu              fpu;
        /*
@@@ -663,7 -672,6 +672,7 @@@ extern void load_direct_gdt(int)
  extern void load_fixmap_gdt(int);
  extern void load_percpu_segment(int);
  extern void cpu_init(void);
 +extern void cpu_init_secondary(void);
  extern void cpu_init_exception_handling(void);
  extern void cr4_init(void);
  
diff --combined arch/x86/kernel/cpu/common.c
index a99d00393206b9edbbcd645e16e1d4c15fc2d6bc,ca668efa4c81e2d47eafba930ab4a6fb62a4e5fe..64b805bd6a542ba2ad2a18f5fdb9e911f6dadb65
@@@ -58,6 -58,7 +58,7 @@@
  #include <asm/intel-family.h>
  #include <asm/cpu_device_id.h>
  #include <asm/uv/uv.h>
+ #include <asm/sigframe.h>
  
  #include "cpu.h"
  
@@@ -465,27 -466,22 +466,22 @@@ static bool pku_disabled
  
  static __always_inline void setup_pku(struct cpuinfo_x86 *c)
  {
-       struct pkru_state *pk;
+       if (c == &boot_cpu_data) {
+               if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU))
+                       return;
+               /*
+                * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid
+                * bit to be set.  Enforce it.
+                */
+               setup_force_cpu_cap(X86_FEATURE_OSPKE);
  
-       /* check the boot processor, plus compile options for PKU: */
-       if (!cpu_feature_enabled(X86_FEATURE_PKU))
-               return;
-       /* checks the actual processor's cpuid bits: */
-       if (!cpu_has(c, X86_FEATURE_PKU))
-               return;
-       if (pku_disabled)
+       } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                return;
+       }
  
        cr4_set_bits(X86_CR4_PKE);
-       pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
-       if (pk)
-               pk->pkru = init_pkru_value;
-       /*
-        * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
-        * cpuid bit to be set.  We need to ensure that we
-        * update that bit in this CPU's "cpu_info".
-        */
-       set_cpu_cap(c, X86_FEATURE_OSPKE);
+       /* Load the default PKRU value */
+       pkru_write_default();
  }
  
  #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@@ -1332,6 -1328,8 +1328,8 @@@ static void __init early_identify_cpu(s
  
        fpu__init_system(c);
  
+       init_sigframe_size();
  #ifdef CONFIG_X86_32
        /*
         * Regardless of whether PCID is enumerated, the SDM says
@@@ -1717,9 -1715,8 +1715,8 @@@ void print_cpu_info(struct cpuinfo_x86 
  }
  
  /*
-  * clearcpuid= was already parsed in fpu__init_parse_early_param.
-  * But we need to keep a dummy __setup around otherwise it would
-  * show up as an environment variable for init.
+  * clearcpuid= was already parsed in cpu_parse_early_param().  This dummy
+  * function prevents it from becoming an environment variable for init.
   */
  static __init int setup_clearcpuid(char *arg)
  {
@@@ -1773,16 -1770,10 +1770,16 @@@ void syscall_init(void
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
  #endif
  
 -      /* Flags to clear on syscall */
 +      /*
 +       * Flags to clear on syscall; clear as much as possible
 +       * to minimize user space-kernel interference.
 +       */
        wrmsrl(MSR_SYSCALL_MASK,
 -             X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
 -             X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
 +             X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
 +             X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
 +             X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
 +             X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
 +             X86_EFLAGS_AC|X86_EFLAGS_ID);
  }
  
  #else /* CONFIG_X86_64 */
@@@ -1944,12 -1935,13 +1941,12 @@@ void cpu_init_exception_handling(void
  
  /*
   * cpu_init() initializes state that is per-CPU. Some data is already
 - * initialized (naturally) in the bootstrap process, such as the GDT
 - * and IDT. We reload them nevertheless, this function acts as a
 - * 'CPU state barrier', nothing should get across.
 + * initialized (naturally) in the bootstrap process, such as the GDT.  We
 + * reload it nevertheless, this function acts as a 'CPU state barrier',
 + * nothing should get across.
   */
  void cpu_init(void)
  {
 -      struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
        struct task_struct *cur = current;
        int cpu = raw_smp_processor_id();
  
            early_cpu_to_node(cpu) != NUMA_NO_NODE)
                set_numa_node(early_cpu_to_node(cpu));
  #endif
 -      setup_getcpu(cpu);
 -
        pr_debug("Initializing CPU#%d\n", cpu);
  
        if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
         * and set up the GDT descriptor:
         */
        switch_to_new_gdt(cpu);
 -      load_current_idt();
  
        if (IS_ENABLED(CONFIG_X86_64)) {
                loadsegment(fs, 0);
        initialize_tlbstate_and_flush();
        enter_lazy_tlb(&init_mm, cur);
  
 -      /* Initialize the TSS. */
 -      tss_setup_ist(tss);
 -      tss_setup_io_bitmap(tss);
 -      set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 -
 -      load_TR_desc();
        /*
         * sp0 points to the entry trampoline stack regardless of what task
         * is running.
        load_fixmap_gdt(cpu);
  }
  
 +#ifdef CONFIG_SMP
 +void cpu_init_secondary(void)
 +{
 +      /*
 +       * Relies on the BP having set-up the IDT tables, which are loaded
 +       * on this CPU in cpu_init_exception_handling().
 +       */
 +      cpu_init_exception_handling();
 +      cpu_init();
 +}
 +#endif
 +
  /*
   * The microcode loader calls this upon late microcode load to recheck features,
   * only when microcode has been updated. Caller holds microcode_mutex and CPU
diff --combined arch/x86/kernel/process.c
index e52b208b4641b50537c52d05da353ad9cf8cd64b,fa6c8fa0f7788484389bdf3159269da6758a3770..1d9463e3096b68307e96445026e19cd01d98464e
@@@ -87,8 -87,7 +87,7 @@@ int arch_dup_task_struct(struct task_st
  #ifdef CONFIG_VM86
        dst->thread.vm86 = NULL;
  #endif
-       return fpu__copy(dst, src);
+       return fpu_clone(dst);
  }
  
  /*
@@@ -157,11 -156,18 +156,18 @@@ int copy_thread(unsigned long clone_fla
  
        /* Kernel thread ? */
        if (unlikely(p->flags & PF_KTHREAD)) {
+               p->thread.pkru = pkru_get_init_value();
                memset(childregs, 0, sizeof(struct pt_regs));
                kthread_frame_init(frame, sp, arg);
                return 0;
        }
  
+       /*
+        * Clone current's PKRU value from hardware. tsk->thread.pkru
+        * is only valid when scheduled out.
+        */
+       p->thread.pkru = read_pkru();
        frame->bx = 0;
        *childregs = *current_pt_regs();
        childregs->ax = 0;
        return ret;
  }
  
+ static void pkru_flush_thread(void)
+ {
+       /*
+        * If PKRU is enabled the default PKRU value has to be loaded into
+        * the hardware right here (similar to context switch).
+        */
+       pkru_write_default();
+ }
  void flush_thread(void)
  {
        struct task_struct *tsk = current;
        flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
  
-       fpu__clear_all(&tsk->thread.fpu);
+       fpu_flush_thread();
+       pkru_flush_thread();
  }
  
  void disable_TSC(void)
@@@ -931,7 -947,7 +947,7 @@@ unsigned long get_wchan(struct task_str
        unsigned long start, bottom, top, sp, fp, ip, ret = 0;
        int count = 0;
  
 -      if (p == current || p->state == TASK_RUNNING)
 +      if (p == current || task_is_running(p))
                return 0;
  
        if (!try_get_task_stack(p))
                        goto out;
                }
                fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
 -      } while (count++ < 16 && p->state != TASK_RUNNING);
 +      } while (count++ < 16 && !task_is_running(p));
  
  out:
        put_task_stack(p);
diff --combined arch/x86/kernel/signal.c
index e12779a2714dce0a141aab191f032964bd57c1bc,2ddcf2165bcb87bede34d9fcf980a362619ba031..f4d21e47008355a11aa81b02b5a94eb2b44c20a4
@@@ -212,6 -212,11 +212,11 @@@ do {                                                                     
   * Set up a signal frame.
   */
  
+ /* x86 ABI requires 16-byte alignment */
+ #define FRAME_ALIGNMENT       16UL
+ #define MAX_FRAME_PADDING     (FRAME_ALIGNMENT - 1)
  /*
   * Determine which stack to use..
   */
@@@ -222,9 -227,9 +227,9 @@@ static unsigned long align_sigframe(uns
         * Align the stack pointer according to the i386 ABI,
         * i.e. so that on function entry ((sp + 4) & 15) == 0.
         */
-       sp = ((sp + 4) & -16ul) - 4;
+       sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
  #else /* !CONFIG_X86_32 */
-       sp = round_down(sp, 16) - 8;
+       sp = round_down(sp, FRAME_ALIGNMENT) - 8;
  #endif
        return sp;
  }
@@@ -234,10 -239,11 +239,11 @@@ get_sigframe(struct k_sigaction *ka, st
             void __user **fpstate)
  {
        /* Default to using normal stack */
+       bool nested_altstack = on_sig_stack(regs->sp);
+       bool entering_altstack = false;
        unsigned long math_size = 0;
        unsigned long sp = regs->sp;
        unsigned long buf_fx = 0;
-       int onsigstack = on_sig_stack(sp);
        int ret;
  
        /* redzone */
  
        /* This is the X/Open sanctioned signal stack switching.  */
        if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(sp) == 0)
+               /*
+                * This checks nested_altstack via sas_ss_flags(). Sensible
+                * programs use SS_AUTODISARM, which disables that check, and
+                * programs that don't use SS_AUTODISARM get compatible.
+                */
+               if (sas_ss_flags(sp) == 0) {
                        sp = current->sas_ss_sp + current->sas_ss_size;
+                       entering_altstack = true;
+               }
        } else if (IS_ENABLED(CONFIG_X86_32) &&
-                  !onsigstack &&
+                  !nested_altstack &&
                   regs->ss != __USER_DS &&
                   !(ka->sa.sa_flags & SA_RESTORER) &&
                   ka->sa.sa_restorer) {
                /* This is the legacy signal stack switching. */
                sp = (unsigned long) ka->sa.sa_restorer;
+               entering_altstack = true;
        }
  
        sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
         * If we are on the alternate signal stack and would overflow it, don't.
         * Return an always-bogus address instead so we will die with SIGSEGV.
         */
-       if (onsigstack && !likely(on_sig_stack(sp)))
+       if (unlikely((nested_altstack || entering_altstack) &&
+                    !__on_sig_stack(sp))) {
+               if (show_unhandled_signals && printk_ratelimit())
+                       pr_info("%s[%d] overflowed sigaltstack\n",
+                               current->comm, task_pid_nr(current));
                return (void __user *)-1L;
+       }
  
        /* save i387 and extended state */
        ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size);
@@@ -663,6 -684,61 +684,61 @@@ badframe
        return 0;
  }
  
+ /*
+  * There are four different struct types for signal frame: sigframe_ia32,
+  * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+  * -- the largest size. It means the size for 64-bit apps is a bit more
+  * than needed, but this keeps the code simple.
+  */
+ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32)
+ #else
+ # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe)
+ #endif
+ /*
+  * The FP state frame contains an XSAVE buffer which must be 64-byte aligned.
+  * If a signal frame starts at an unaligned address, extra space is required.
+  * This is the max alignment padding, conservatively.
+  */
+ #define MAX_XSAVE_PADDING     63UL
+ /*
+  * The frame data is composed of the following areas and laid out as:
+  *
+  * -------------------------
+  * | alignment padding     |
+  * -------------------------
+  * | (f)xsave frame        |
+  * -------------------------
+  * | fsave header          |
+  * -------------------------
+  * | alignment padding     |
+  * -------------------------
+  * | siginfo + ucontext    |
+  * -------------------------
+  */
+ /* max_frame_size tells userspace the worst case signal stack size. */
+ static unsigned long __ro_after_init max_frame_size;
+ void __init init_sigframe_size(void)
+ {
+       max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
+       max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING;
+       /* Userspace expects an aligned size. */
+       max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
+       pr_info("max sigframe size: %lu\n", max_frame_size);
+ }
+ unsigned long get_sigframe_size(void)
+ {
+       return max_frame_size;
+ }
  static inline int is_ia32_compat_frame(struct ksignal *ksig)
  {
        return IS_ENABLED(CONFIG_IA32_EMULATION) &&
@@@ -713,7 -789,7 +789,7 @@@ handle_signal(struct ksignal *ksig, str
                save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
  
        /* Are we from a system call? */
 -      if (syscall_get_nr(current, regs) >= 0) {
 +      if (syscall_get_nr(current, regs) != -1) {
                /* If so, check system call restarting.. */
                switch (syscall_get_error(current, regs)) {
                case -ERESTART_RESTARTBLOCK:
@@@ -793,7 -869,7 +869,7 @@@ void arch_do_signal_or_restart(struct p
        }
  
        /* Did we come from a system call? */
 -      if (syscall_get_nr(current, regs) >= 0) {
 +      if (syscall_get_nr(current, regs) != -1) {
                /* Restart the system call - no handlers present */
                switch (syscall_get_error(current, regs)) {
                case -ERESTARTNOHAND:
diff --combined arch/x86/kernel/traps.c
index ed540e09a399a577a5fc31d32ea2e57ffde0480f,4c9c4aa8321681e4b4482e152c9594a6beaf072b..a58800973aed3a16cbb8d97500bae51eb8306716
@@@ -1046,9 -1046,10 +1046,10 @@@ static void math_error(struct pt_regs *
        }
  
        /*
-        * Save the info for the exception handler and clear the error.
+        * Synchronize the FPU register state to the memory register state
+        * if necessary. This allows the exception handler to inspect it.
         */
-       fpu__save(fpu);
+       fpu_sync_fpstate(fpu);
  
        task->thread.trap_nr    = trapnr;
        task->thread.error_code = 0;
@@@ -1160,9 -1161,12 +1161,9 @@@ void __init trap_init(void
        /* Init GHCB memory pages when running as an SEV-ES guest */
        sev_es_init_vc_handling();
  
 +      /* Initialize TSS before setting up traps so ISTs work */
 +      cpu_init_exception_handling();
 +      /* Setup traps as cpu_init() might #GP */
        idt_setup_traps();
 -
 -      /*
 -       * Should be a barrier for any external CPU state:
 -       */
        cpu_init();
 -
 -      idt_setup_ist_traps();
  }
diff --combined arch/x86/kvm/x86.c
index 17468d983fbd57d48150ae4474db24d6dccb9209,8ee7add0e7631a8619c2796cca902c2990eb840e..c6dc1b44523156e292fd01647b4c8ba761cb5042
@@@ -58,7 -58,6 +58,7 @@@
  #include <linux/sched/isolation.h>
  #include <linux/mem_encrypt.h>
  #include <linux/entry-kvm.h>
 +#include <linux/suspend.h>
  
  #include <trace/events/kvm.h>
  
@@@ -66,6 -65,7 +66,7 @@@
  #include <asm/msr.h>
  #include <asm/desc.h>
  #include <asm/mce.h>
+ #include <asm/pkru.h>
  #include <linux/kernel_stat.h>
  #include <asm/fpu/internal.h> /* Ugh! */
  #include <asm/pvclock.h>
@@@ -103,8 -103,6 +104,8 @@@ static u64 __read_mostly efer_reserved_
  
  static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
  
 +#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
 +
  #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                      KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
  
@@@ -116,9 -114,6 +117,9 @@@ static void __kvm_set_rflags(struct kvm
  static void store_regs(struct kvm_vcpu *vcpu);
  static int sync_regs(struct kvm_vcpu *vcpu);
  
 +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 +
  struct kvm_x86_ops kvm_x86_ops __read_mostly;
  EXPORT_SYMBOL_GPL(kvm_x86_ops);
  
@@@ -215,78 -210,55 +216,78 @@@ EXPORT_SYMBOL_GPL(host_efer)
  bool __read_mostly allow_smaller_maxphyaddr = 0;
  EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
  
 +bool __read_mostly enable_apicv = true;
 +EXPORT_SYMBOL_GPL(enable_apicv);
 +
  u64 __read_mostly host_xss;
  EXPORT_SYMBOL_GPL(host_xss);
  u64 __read_mostly supported_xss;
  EXPORT_SYMBOL_GPL(supported_xss);
  
 -struct kvm_stats_debugfs_item debugfs_entries[] = {
 -      VCPU_STAT("pf_fixed", pf_fixed),
 -      VCPU_STAT("pf_guest", pf_guest),
 -      VCPU_STAT("tlb_flush", tlb_flush),
 -      VCPU_STAT("invlpg", invlpg),
 -      VCPU_STAT("exits", exits),
 -      VCPU_STAT("io_exits", io_exits),
 -      VCPU_STAT("mmio_exits", mmio_exits),
 -      VCPU_STAT("signal_exits", signal_exits),
 -      VCPU_STAT("irq_window", irq_window_exits),
 -      VCPU_STAT("nmi_window", nmi_window_exits),
 -      VCPU_STAT("halt_exits", halt_exits),
 -      VCPU_STAT("halt_successful_poll", halt_successful_poll),
 -      VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
 -      VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
 -      VCPU_STAT("halt_wakeup", halt_wakeup),
 -      VCPU_STAT("hypercalls", hypercalls),
 -      VCPU_STAT("request_irq", request_irq_exits),
 -      VCPU_STAT("irq_exits", irq_exits),
 -      VCPU_STAT("host_state_reload", host_state_reload),
 -      VCPU_STAT("fpu_reload", fpu_reload),
 -      VCPU_STAT("insn_emulation", insn_emulation),
 -      VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
 -      VCPU_STAT("irq_injections", irq_injections),
 -      VCPU_STAT("nmi_injections", nmi_injections),
 -      VCPU_STAT("req_event", req_event),
 -      VCPU_STAT("l1d_flush", l1d_flush),
 -      VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
 -      VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
 -      VCPU_STAT("nested_run", nested_run),
 -      VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
 -      VCPU_STAT("directed_yield_successful", directed_yield_successful),
 -      VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
 -      VM_STAT("mmu_pte_write", mmu_pte_write),
 -      VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
 -      VM_STAT("mmu_flooded", mmu_flooded),
 -      VM_STAT("mmu_recycled", mmu_recycled),
 -      VM_STAT("mmu_cache_miss", mmu_cache_miss),
 -      VM_STAT("mmu_unsync", mmu_unsync),
 -      VM_STAT("remote_tlb_flush", remote_tlb_flush),
 -      VM_STAT("largepages", lpages, .mode = 0444),
 -      VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
 -      VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
 -      { NULL }
 +const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
 +      KVM_GENERIC_VM_STATS(),
 +      STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
 +      STATS_DESC_COUNTER(VM, mmu_pte_write),
 +      STATS_DESC_COUNTER(VM, mmu_pde_zapped),
 +      STATS_DESC_COUNTER(VM, mmu_flooded),
 +      STATS_DESC_COUNTER(VM, mmu_recycled),
 +      STATS_DESC_COUNTER(VM, mmu_cache_miss),
 +      STATS_DESC_ICOUNTER(VM, mmu_unsync),
 +      STATS_DESC_ICOUNTER(VM, lpages),
 +      STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
 +      STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
 +};
 +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
 +              sizeof(struct kvm_vm_stat) / sizeof(u64));
 +
 +const struct kvm_stats_header kvm_vm_stats_header = {
 +      .name_size = KVM_STATS_NAME_SIZE,
 +      .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
 +      .id_offset = sizeof(struct kvm_stats_header),
 +      .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
 +      .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
 +                     sizeof(kvm_vm_stats_desc),
 +};
 +
 +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
 +      KVM_GENERIC_VCPU_STATS(),
 +      STATS_DESC_COUNTER(VCPU, pf_fixed),
 +      STATS_DESC_COUNTER(VCPU, pf_guest),
 +      STATS_DESC_COUNTER(VCPU, tlb_flush),
 +      STATS_DESC_COUNTER(VCPU, invlpg),
 +      STATS_DESC_COUNTER(VCPU, exits),
 +      STATS_DESC_COUNTER(VCPU, io_exits),
 +      STATS_DESC_COUNTER(VCPU, mmio_exits),
 +      STATS_DESC_COUNTER(VCPU, signal_exits),
 +      STATS_DESC_COUNTER(VCPU, irq_window_exits),
 +      STATS_DESC_COUNTER(VCPU, nmi_window_exits),
 +      STATS_DESC_COUNTER(VCPU, l1d_flush),
 +      STATS_DESC_COUNTER(VCPU, halt_exits),
 +      STATS_DESC_COUNTER(VCPU, request_irq_exits),
 +      STATS_DESC_COUNTER(VCPU, irq_exits),
 +      STATS_DESC_COUNTER(VCPU, host_state_reload),
 +      STATS_DESC_COUNTER(VCPU, fpu_reload),
 +      STATS_DESC_COUNTER(VCPU, insn_emulation),
 +      STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
 +      STATS_DESC_COUNTER(VCPU, hypercalls),
 +      STATS_DESC_COUNTER(VCPU, irq_injections),
 +      STATS_DESC_COUNTER(VCPU, nmi_injections),
 +      STATS_DESC_COUNTER(VCPU, req_event),
 +      STATS_DESC_COUNTER(VCPU, nested_run),
 +      STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
 +      STATS_DESC_COUNTER(VCPU, directed_yield_successful),
 +      STATS_DESC_ICOUNTER(VCPU, guest_mode)
 +};
 +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
 +              sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 +
 +const struct kvm_stats_header kvm_vcpu_stats_header = {
 +      .name_size = KVM_STATS_NAME_SIZE,
 +      .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
 +      .id_offset = sizeof(struct kvm_stats_header),
 +      .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
 +      .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
 +                     sizeof(kvm_vcpu_stats_desc),
  };
  
  u64 __read_mostly host_xcr0;
@@@ -807,6 -779,13 +808,6 @@@ int kvm_read_guest_page_mmu(struct kvm_
  }
  EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
  
 -static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 -                             void *data, int offset, int len, u32 access)
 -{
 -      return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
 -                                     data, offset, len, access);
 -}
 -
  static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
  {
        return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@@ -841,7 -820,6 +842,7 @@@ int load_pdptrs(struct kvm_vcpu *vcpu, 
  
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 +      vcpu->arch.pdptrs_from_userspace = false;
  
  out:
  
  }
  EXPORT_SYMBOL_GPL(load_pdptrs);
  
 -bool pdptrs_changed(struct kvm_vcpu *vcpu)
 -{
 -      u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 -      int offset;
 -      gfn_t gfn;
 -      int r;
 -
 -      if (!is_pae_paging(vcpu))
 -              return false;
 -
 -      if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
 -              return true;
 -
 -      gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
 -      offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
 -      r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 -                                     PFERR_USER_MASK | PFERR_WRITE_MASK);
 -      if (r < 0)
 -              return true;
 -
 -      return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 -}
 -EXPORT_SYMBOL_GPL(pdptrs_changed);
 -
  void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
  {
 -      unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 -
        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
                kvm_clear_async_pf_completion_queue(vcpu);
                kvm_async_pf_hash_reset(vcpu);
        }
  
 -      if ((cr0 ^ old_cr0) & update_bits)
 +      if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
                kvm_mmu_reset_context(vcpu);
  
        if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
@@@ -939,7 -943,7 +940,7 @@@ void kvm_load_guest_xsave_state(struct 
            (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
            vcpu->arch.pkru != vcpu->arch.host_pkru)
-               __write_pkru(vcpu->arch.pkru);
+               write_pkru(vcpu->arch.pkru);
  }
  EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
  
@@@ -953,7 -957,7 +954,7 @@@ void kvm_load_host_xsave_state(struct k
             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
                vcpu->arch.pkru = rdpkru();
                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
-                       __write_pkru(vcpu->arch.host_pkru);
+                       write_pkru(vcpu->arch.host_pkru);
        }
  
        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
@@@ -1035,7 -1039,10 +1036,7 @@@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4)
  
  void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
  {
 -      unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
 -                                    X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
 -
 -      if (((cr4 ^ old_cr4) & mmu_role_bits) ||
 +      if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
  }
@@@ -1078,46 -1085,25 +1079,46 @@@ int kvm_set_cr4(struct kvm_vcpu *vcpu, 
  }
  EXPORT_SYMBOL_GPL(kvm_set_cr4);
  
 +static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
 +{
 +      struct kvm_mmu *mmu = vcpu->arch.mmu;
 +      unsigned long roots_to_free = 0;
 +      int i;
 +
 +      /*
 +       * If neither the current CR3 nor any of the prev_roots use the given
 +       * PCID, then nothing needs to be done here because a resync will
 +       * happen anyway before switching to any other CR3.
 +       */
 +      if (kvm_get_active_pcid(vcpu) == pcid) {
 +              kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
 +              kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 +      }
 +
 +      for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 +              if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
 +                      roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 +
 +      kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
 +}
 +
  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
        bool skip_tlb_flush = false;
 +      unsigned long pcid = 0;
  #ifdef CONFIG_X86_64
        bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
  
        if (pcid_enabled) {
                skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
                cr3 &= ~X86_CR3_PCID_NOFLUSH;
 +              pcid = cr3 & X86_CR3_PCID_MASK;
        }
  #endif
  
 -      if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
 -              if (!skip_tlb_flush) {
 -                      kvm_mmu_sync_roots(vcpu);
 -                      kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 -              }
 -              return 0;
 -      }
 +      /* PDPTRs are always reloaded for PAE paging. */
 +      if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
 +              goto handle_tlb_flush;
  
        /*
         * Do not condition the GPA check on long mode, this helper is used to
        if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
                return 1;
  
 -      kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
 +      if (cr3 != kvm_read_cr3(vcpu))
 +              kvm_mmu_new_pgd(vcpu, cr3);
 +
        vcpu->arch.cr3 = cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
  
 +handle_tlb_flush:
 +      /*
 +       * A load of CR3 that flushes the TLB flushes only the current PCID,
 +       * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
 +       * moot point in the end because _disabling_ PCID will flush all PCIDs,
 +       * and it's impossible to use a non-zero PCID when PCID is disabled,
 +       * i.e. only PCID=0 can be relevant.
 +       */
 +      if (!skip_tlb_flush)
 +              kvm_invalidate_pcid(vcpu, pcid);
 +
        return 0;
  }
  EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@@ -2207,15 -2180,13 +2208,15 @@@ static u32 adjust_tsc_khz(u32 khz, s32 
        return v;
  }
  
 +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
 +
  static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
  {
        u64 ratio;
  
        /* Guest TSC same frequency as host TSC? */
        if (!scale) {
 -              vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
 +              kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return 0;
        }
  
                return -1;
        }
  
 -      vcpu->arch.tsc_scaling_ratio = ratio;
 +      kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
        return 0;
  }
  
@@@ -2253,7 -2224,7 +2254,7 @@@ static int kvm_set_tsc_khz(struct kvm_v
        /* tsc_khz can be zero if TSC calibration fails */
        if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
 -              vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
 +              kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return -1;
        }
  
@@@ -2335,9 -2306,10 +2336,9 @@@ static inline u64 __scale_tsc(u64 ratio
        return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
  }
  
 -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
  {
        u64 _tsc = tsc;
 -      u64 ratio = vcpu->arch.tsc_scaling_ratio;
  
        if (ratio != kvm_default_tsc_scaling_ratio)
                _tsc = __scale_tsc(ratio, tsc);
  }
  EXPORT_SYMBOL_GPL(kvm_scale_tsc);
  
 -static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 +static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
  {
        u64 tsc;
  
 -      tsc = kvm_scale_tsc(vcpu, rdtsc());
 +      tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
  
        return target_tsc - tsc;
  }
  
  u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
  {
 -      return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 +      return vcpu->arch.l1_tsc_offset +
 +              kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
  }
  EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
  
 -static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
  {
 -      vcpu->arch.l1_tsc_offset = offset;
 -      vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
 +      u64 nested_offset;
 +
 +      if (l2_multiplier == kvm_default_tsc_scaling_ratio)
 +              nested_offset = l1_offset;
 +      else
 +              nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
 +                                              kvm_tsc_scaling_ratio_frac_bits);
 +
 +      nested_offset += l2_offset;
 +      return nested_offset;
 +}
 +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
 +
 +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
 +{
 +      if (l2_multiplier != kvm_default_tsc_scaling_ratio)
 +              return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
 +                                     kvm_tsc_scaling_ratio_frac_bits);
 +
 +      return l1_multiplier;
 +}
 +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
 +
 +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
 +{
 +      trace_kvm_write_tsc_offset(vcpu->vcpu_id,
 +                                 vcpu->arch.l1_tsc_offset,
 +                                 l1_offset);
 +
 +      vcpu->arch.l1_tsc_offset = l1_offset;
 +
 +      /*
 +       * If we are here because L1 chose not to trap WRMSR to TSC then
 +       * according to the spec this should set L1's TSC (as opposed to
 +       * setting L1's offset for L2).
 +       */
 +      if (is_guest_mode(vcpu))
 +              vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
 +                      l1_offset,
 +                      static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
 +                      static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
 +      else
 +              vcpu->arch.tsc_offset = l1_offset;
 +
 +      static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
 +}
 +
 +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
 +{
 +      vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
 +
 +      /* Userspace is changing the multiplier while L2 is active */
 +      if (is_guest_mode(vcpu))
 +              vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
 +                      l1_multiplier,
 +                      static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
 +      else
 +              vcpu->arch.tsc_scaling_ratio = l1_multiplier;
 +
 +      if (kvm_has_tsc_control)
 +              static_call(kvm_x86_write_tsc_multiplier)(
 +                      vcpu, vcpu->arch.tsc_scaling_ratio);
  }
  
  static inline bool kvm_check_tsc_unstable(void)
@@@ -2451,7 -2362,7 +2452,7 @@@ static void kvm_synchronize_tsc(struct 
        bool synchronizing = false;
  
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 -      offset = kvm_compute_tsc_offset(vcpu, data);
 +      offset = kvm_compute_l1_tsc_offset(vcpu, data);
        ns = get_kvmclock_base_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
  
                } else {
                        u64 delta = nsec_to_cycles(vcpu, elapsed);
                        data += delta;
 -                      offset = kvm_compute_tsc_offset(vcpu, data);
 +                      offset = kvm_compute_l1_tsc_offset(vcpu, data);
                }
                matched = true;
                already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
@@@ -2549,10 -2460,9 +2550,10 @@@ static inline void adjust_tsc_offset_gu
  
  static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
  {
 -      if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
 +      if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
                WARN_ON(adjustment < 0);
 -      adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
 +      adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
 +                                 vcpu->arch.l1_tsc_scaling_ratio);
        adjust_tsc_offset_guest(vcpu, adjustment);
  }
  
@@@ -2935,8 -2845,7 +2936,8 @@@ static int kvm_guest_time_update(struc
        /* With all the info we got, fill in the values */
  
        if (kvm_has_tsc_control)
 -              tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
 +              tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
 +                                          v->arch.l1_tsc_scaling_ratio);
  
        if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
                kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@@ -3342,7 -3251,7 +3343,7 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                if (msr_info->host_initiated) {
                        kvm_synchronize_tsc(vcpu, data);
                } else {
 -                      u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
 +                      u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
                        adjust_tsc_offset_guest(vcpu, adj);
                        vcpu->arch.ia32_tsc_adjust_msr += adj;
                }
@@@ -3644,17 -3553,10 +3645,17 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                 * return L1's TSC value to ensure backwards-compatible
                 * behavior for migration.
                 */
 -              u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
 -                                                          vcpu->arch.tsc_offset;
 +              u64 offset, ratio;
  
 -              msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
 +              if (msr_info->host_initiated) {
 +                      offset = vcpu->arch.l1_tsc_offset;
 +                      ratio = vcpu->arch.l1_tsc_scaling_ratio;
 +              } else {
 +                      offset = vcpu->arch.tsc_offset;
 +                      ratio = vcpu->arch.tsc_scaling_ratio;
 +              }
 +
 +              msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
                break;
        }
        case MSR_MTRRcap:
@@@ -3978,7 -3880,6 +3979,7 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_HYPERV_TLBFLUSH:
        case KVM_CAP_HYPERV_SEND_IPI:
        case KVM_CAP_HYPERV_CPUID:
 +      case KVM_CAP_HYPERV_ENFORCE_CPUID:
        case KVM_CAP_SYS_HYPERV_CPUID:
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
        case KVM_CAP_SGX_ATTRIBUTE:
  #endif
        case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
 +      case KVM_CAP_SREGS2:
 +      case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
                r = 1;
                break;
 +      case KVM_CAP_EXIT_HYPERCALL:
 +              r = KVM_EXIT_HYPERCALL_VALID_MASK;
 +              break;
        case KVM_CAP_SET_GUEST_DEBUG2:
                return KVM_GUESTDBG_VALID_MASK;
  #ifdef CONFIG_KVM_XEN
@@@ -4243,7 -4139,7 +4244,7 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
                        mark_tsc_unstable("KVM discovered backwards TSC");
  
                if (kvm_check_tsc_unstable()) {
 -                      u64 offset = kvm_compute_tsc_offset(vcpu,
 +                      u64 offset = kvm_compute_l1_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
                        kvm_vcpu_write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
@@@ -4562,7 -4458,7 +4563,7 @@@ static void kvm_vcpu_ioctl_x86_get_vcpu
        memset(&events->reserved, 0, sizeof(events->reserved));
  }
  
 -static void kvm_smm_changed(struct kvm_vcpu *vcpu);
 +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
  
  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
  
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
 -              if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
 -                      if (events->smi.smm)
 -                              vcpu->arch.hflags |= HF_SMM_MASK;
 -                      else
 -                              vcpu->arch.hflags &= ~HF_SMM_MASK;
 -                      kvm_smm_changed(vcpu);
 -              }
 +              if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
 +                      kvm_smm_changed(vcpu, events->smi.smm);
  
                vcpu->arch.smi_pending = events->smi.pending;
  
@@@ -4704,20 -4605,21 +4705,21 @@@ static void fill_xsave(u8 *dest, struc
         */
        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
        while (valid) {
+               u32 size, offset, ecx, edx;
                u64 xfeature_mask = valid & -valid;
                int xfeature_nr = fls64(xfeature_mask) - 1;
-               void *src = get_xsave_addr(xsave, xfeature_nr);
-               if (src) {
-                       u32 size, offset, ecx, edx;
-                       cpuid_count(XSTATE_CPUID, xfeature_nr,
-                                   &size, &offset, &ecx, &edx);
-                       if (xfeature_nr == XFEATURE_PKRU)
-                               memcpy(dest + offset, &vcpu->arch.pkru,
-                                      sizeof(vcpu->arch.pkru));
-                       else
-                               memcpy(dest + offset, src, size);
+               void *src;
+               cpuid_count(XSTATE_CPUID, xfeature_nr,
+                           &size, &offset, &ecx, &edx);
  
+               if (xfeature_nr == XFEATURE_PKRU) {
+                       memcpy(dest + offset, &vcpu->arch.pkru,
+                              sizeof(vcpu->arch.pkru));
+               } else {
+                       src = get_xsave_addr(xsave, xfeature_nr);
+                       if (src)
+                               memcpy(dest + offset, src, size);
                }
  
                valid -= xfeature_mask;
@@@ -4747,18 -4649,20 +4749,20 @@@ static void load_xsave(struct kvm_vcpu 
         */
        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
        while (valid) {
+               u32 size, offset, ecx, edx;
                u64 xfeature_mask = valid & -valid;
                int xfeature_nr = fls64(xfeature_mask) - 1;
-               void *dest = get_xsave_addr(xsave, xfeature_nr);
-               if (dest) {
-                       u32 size, offset, ecx, edx;
-                       cpuid_count(XSTATE_CPUID, xfeature_nr,
-                                   &size, &offset, &ecx, &edx);
-                       if (xfeature_nr == XFEATURE_PKRU)
-                               memcpy(&vcpu->arch.pkru, src + offset,
-                                      sizeof(vcpu->arch.pkru));
-                       else
+               cpuid_count(XSTATE_CPUID, xfeature_nr,
+                           &size, &offset, &ecx, &edx);
+               if (xfeature_nr == XFEATURE_PKRU) {
+                       memcpy(&vcpu->arch.pkru, src + offset,
+                              sizeof(vcpu->arch.pkru));
+               } else {
+                       void *dest = get_xsave_addr(xsave, xfeature_nr);
+                       if (dest)
                                memcpy(dest, src + offset, size);
                }
  
@@@ -4907,9 -4811,6 +4911,9 @@@ static int kvm_vcpu_ioctl_enable_cap(st
  
                return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
  
 +      case KVM_CAP_HYPERV_ENFORCE_CPUID:
 +              return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
 +
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                vcpu->arch.pv_cpuid.enforce = cap->args[0];
                if (vcpu->arch.pv_cpuid.enforce)
@@@ -4928,7 -4829,6 +4932,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
        void __user *argp = (void __user *)arg;
        int r;
        union {
 +              struct kvm_sregs2 *sregs2;
                struct kvm_lapic_state *lapic;
                struct kvm_xsave *xsave;
                struct kvm_xcrs *xcrs;
                break;
        }
  #endif
 +      case KVM_GET_SREGS2: {
 +              u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
 +              r = -ENOMEM;
 +              if (!u.sregs2)
 +                      goto out;
 +              __get_sregs2(vcpu, u.sregs2);
 +              r = -EFAULT;
 +              if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
 +                      goto out;
 +              r = 0;
 +              break;
 +      }
 +      case KVM_SET_SREGS2: {
 +              u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
 +              if (IS_ERR(u.sregs2)) {
 +                      r = PTR_ERR(u.sregs2);
 +                      u.sregs2 = NULL;
 +                      goto out;
 +              }
 +              r = __set_sregs2(vcpu, u.sregs2);
 +              break;
 +      }
        default:
                r = -EINVAL;
        }
@@@ -5642,21 -5520,6 +5646,21 @@@ split_irqchip_unlock
                if (kvm_x86_ops.vm_copy_enc_context_from)
                        r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
                return r;
 +      case KVM_CAP_EXIT_HYPERCALL:
 +              if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
 +                      r = -EINVAL;
 +                      break;
 +              }
 +              kvm->arch.hypercall_exit_enabled = cap->args[0];
 +              r = 0;
 +              break;
 +      case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
 +              r = -EINVAL;
 +              if (cap->args[0] & ~1)
 +                      break;
 +              kvm->arch.exit_on_emulation_error = cap->args[0];
 +              r = 0;
 +              break;
        default:
                r = -EINVAL;
                break;
@@@ -5771,41 -5634,6 +5775,41 @@@ static int kvm_vm_ioctl_set_msr_filter(
        return 0;
  }
  
 +#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 +static int kvm_arch_suspend_notifier(struct kvm *kvm)
 +{
 +      struct kvm_vcpu *vcpu;
 +      int i, ret = 0;
 +
 +      mutex_lock(&kvm->lock);
 +      kvm_for_each_vcpu(i, vcpu, kvm) {
 +              if (!vcpu->arch.pv_time_enabled)
 +                      continue;
 +
 +              ret = kvm_set_guest_paused(vcpu);
 +              if (ret) {
 +                      kvm_err("Failed to pause guest VCPU%d: %d\n",
 +                              vcpu->vcpu_id, ret);
 +                      break;
 +              }
 +      }
 +      mutex_unlock(&kvm->lock);
 +
 +      return ret ? NOTIFY_BAD : NOTIFY_DONE;
 +}
 +
 +int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
 +{
 +      switch (state) {
 +      case PM_HIBERNATION_PREPARE:
 +      case PM_SUSPEND_PREPARE:
 +              return kvm_arch_suspend_notifier(kvm);
 +      }
 +
 +      return NOTIFY_DONE;
 +}
 +#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 +
  long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
  {
@@@ -7280,22 -7108,23 +7284,22 @@@ static unsigned emulator_get_hflags(str
        return emul_to_vcpu(ctxt)->arch.hflags;
  }
  
 -static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
 +static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
  {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
  
 -      vcpu->arch.hflags = emul_flags;
 -      kvm_mmu_reset_context(vcpu);
 +      kvm_smm_changed(vcpu, false);
  }
  
 -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
 +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
                                  const char *smstate)
  {
 -      return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
 +      return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
  }
  
 -static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
 +static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
  {
 -      kvm_smm_changed(emul_to_vcpu(ctxt));
 +      kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
  }
  
  static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
@@@ -7344,9 -7173,9 +7348,9 @@@ static const struct x86_emulate_ops emu
        .guest_has_fxsr      = emulator_guest_has_fxsr,
        .set_nmi_mask        = emulator_set_nmi_mask,
        .get_hflags          = emulator_get_hflags,
 -      .set_hflags          = emulator_set_hflags,
 -      .pre_leave_smm       = emulator_pre_leave_smm,
 -      .post_leave_smm      = emulator_post_leave_smm,
 +      .exiting_smm         = emulator_exiting_smm,
 +      .leave_smm           = emulator_leave_smm,
 +      .triple_fault        = emulator_triple_fault,
        .set_xcr             = emulator_set_xcr,
  };
  
@@@ -7452,33 -7281,8 +7456,33 @@@ void kvm_inject_realmode_interrupt(stru
  }
  EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
  
 +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
 +{
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 +      u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
 +      struct kvm_run *run = vcpu->run;
 +
 +      run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 +      run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
 +      run->emulation_failure.ndata = 0;
 +      run->emulation_failure.flags = 0;
 +
 +      if (insn_size) {
 +              run->emulation_failure.ndata = 3;
 +              run->emulation_failure.flags |=
 +                      KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
 +              run->emulation_failure.insn_size = insn_size;
 +              memset(run->emulation_failure.insn_bytes, 0x90,
 +                     sizeof(run->emulation_failure.insn_bytes));
 +              memcpy(run->emulation_failure.insn_bytes,
 +                     ctxt->fetch.data, insn_size);
 +      }
 +}
 +
  static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
  {
 +      struct kvm *kvm = vcpu->kvm;
 +
        ++vcpu->stat.insn_emulation_fail;
        trace_kvm_emulate_insn_failed(vcpu);
  
                return 1;
        }
  
 -      if (emulation_type & EMULTYPE_SKIP) {
 -              vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 -              vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 -              vcpu->run->internal.ndata = 0;
 +      if (kvm->arch.exit_on_emulation_error ||
 +          (emulation_type & EMULTYPE_SKIP)) {
 +              prepare_emulation_failure_exit(vcpu);
                return 0;
        }
  
@@@ -7631,14 -7436,11 +7635,14 @@@ static bool retry_instruction(struct x8
  static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
  static int complete_emulated_pio(struct kvm_vcpu *vcpu);
  
 -static void kvm_smm_changed(struct kvm_vcpu *vcpu)
 +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
  {
 -      if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
 -              /* This is a good place to trace that we are exiting SMM.  */
 -              trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
 +      trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
 +
 +      if (entering_smm) {
 +              vcpu->arch.hflags |= HF_SMM_MASK;
 +      } else {
 +              vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
  
                /* Process a latched INIT or SMI, if any.  */
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -8563,15 -8365,16 +8567,15 @@@ bool kvm_apicv_activated(struct kvm *kv
  }
  EXPORT_SYMBOL_GPL(kvm_apicv_activated);
  
 -void kvm_apicv_init(struct kvm *kvm, bool enable)
 +static void kvm_apicv_init(struct kvm *kvm)
  {
 -      if (enable)
 +      if (enable_apicv)
                clear_bit(APICV_INHIBIT_REASON_DISABLE,
                          &kvm->arch.apicv_inhibit_reasons);
        else
                set_bit(APICV_INHIBIT_REASON_DISABLE,
                        &kvm->arch.apicv_inhibit_reasons);
  }
 -EXPORT_SYMBOL_GPL(kvm_apicv_init);
  
  static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
  {
@@@ -8607,17 -8410,6 +8611,17 @@@ no_yield
        return;
  }
  
 +static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
 +{
 +      u64 ret = vcpu->run->hypercall.ret;
 +
 +      if (!is_64_bit_mode(vcpu))
 +              ret = (u32)ret;
 +      kvm_rax_write(vcpu, ret);
 +      ++vcpu->stat.hypercalls;
 +      return kvm_skip_emulated_instruction(vcpu);
 +}
 +
  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
  {
        unsigned long nr, a0, a1, a2, a3, ret;
                kvm_sched_yield(vcpu, a0);
                ret = 0;
                break;
 +      case KVM_HC_MAP_GPA_RANGE: {
 +              u64 gpa = a0, npages = a1, attrs = a2;
 +
 +              ret = -KVM_ENOSYS;
 +              if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
 +                      break;
 +
 +              if (!PAGE_ALIGNED(gpa) || !npages ||
 +                  gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
 +                      ret = -KVM_EINVAL;
 +                      break;
 +              }
 +
 +              vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
 +              vcpu->run->hypercall.nr       = KVM_HC_MAP_GPA_RANGE;
 +              vcpu->run->hypercall.args[0]  = gpa;
 +              vcpu->run->hypercall.args[1]  = npages;
 +              vcpu->run->hypercall.args[2]  = attrs;
 +              vcpu->run->hypercall.longmode = op_64_bit;
 +              vcpu->arch.complete_userspace_io = complete_hypercall_exit;
 +              return 0;
 +      }
        default:
                ret = -KVM_ENOSYS;
                break;
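Editor's note: from userspace this exit is completed by writing a status into run->hypercall.ret and re-entering the vCPU, at which point complete_hypercall_exit() above copies it into the guest's RAX. A minimal sketch under those assumptions; convert_gpa_range() is a made-up placeholder for the VMM's own page-state handling, not a real API.

    #include <stdint.h>
    #include <linux/kvm.h>        /* struct kvm_run, KVM_EXIT_HYPERCALL */
    #include <linux/kvm_para.h>   /* KVM_HC_MAP_GPA_RANGE */

    /* Placeholder for the VMM's shared<->private conversion logic. */
    static uint64_t convert_gpa_range(uint64_t gpa, uint64_t npages, uint64_t attrs)
    {
            (void)gpa; (void)npages; (void)attrs;
            return 0;                       /* 0 == success, by convention here */
    }

    static void handle_map_gpa_range(struct kvm_run *run)
    {
            if (run->exit_reason != KVM_EXIT_HYPERCALL ||
                run->hypercall.nr != KVM_HC_MAP_GPA_RANGE)
                    return;

            /* Whatever lands in hypercall.ret is written to guest RAX by
             * complete_hypercall_exit() on the next KVM_RUN. */
            run->hypercall.ret = convert_gpa_range(run->hypercall.args[0],
                                                   run->hypercall.args[1],
                                                   run->hypercall.args[2]);
    }
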
@@@ -8788,6 -8558,9 +8792,6 @@@ static void update_cr8_intercept(struc
  
  int kvm_check_nested_events(struct kvm_vcpu *vcpu)
  {
 -      if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
 -              return -EIO;
 -
        if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
                kvm_x86_ops.nested_ops->triple_fault(vcpu);
                return 1;
@@@ -8803,7 -8576,7 +8807,7 @@@ static void kvm_inject_exception(struc
        static_call(kvm_x86_queue_exception)(vcpu);
  }
  
 -static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 +static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
  {
        int r;
        bool can_inject = true;
        if (is_guest_mode(vcpu)) {
                r = kvm_check_nested_events(vcpu);
                if (r < 0)
 -                      goto busy;
 +                      goto out;
        }
  
        /* try to inject new event if pending */
        if (vcpu->arch.smi_pending) {
                r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
 -                      goto busy;
 +                      goto out;
                if (r) {
                        vcpu->arch.smi_pending = false;
                        ++vcpu->arch.smi_count;
        if (vcpu->arch.nmi_pending) {
                r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
 -                      goto busy;
 +                      goto out;
                if (r) {
                        --vcpu->arch.nmi_pending;
                        vcpu->arch.nmi_injected = true;
        if (kvm_cpu_has_injectable_intr(vcpu)) {
                r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
 -                      goto busy;
 +                      goto out;
                if (r) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
                        static_call(kvm_x86_set_irq)(vcpu);
                *req_immediate_exit = true;
  
        WARN_ON(vcpu->arch.exception.pending);
 -      return;
 +      return 0;
  
 -busy:
 -      *req_immediate_exit = true;
 -      return;
 +out:
 +      if (r == -EBUSY) {
 +              *req_immediate_exit = true;
 +              r = 0;
 +      }
 +      return r;
  }
  
  static void process_nmi(struct kvm_vcpu *vcpu)
@@@ -9122,9 -8892,10 +9126,9 @@@ static void enter_smm(struct kvm_vcpu *
  {
        struct kvm_segment cs, ds;
        struct desc_ptr dt;
 +      unsigned long cr0;
        char buf[512];
 -      u32 cr0;
  
 -      trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        memset(buf, 0, 512);
  #ifdef CONFIG_X86_64
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
                enter_smm_save_state_32(vcpu, buf);
  
        /*
 -       * Give pre_enter_smm() a chance to make ISA-specific changes to the
 -       * vCPU state (e.g. leave guest mode) after we've saved the state into
 -       * the SMM state-save area.
 +       * Give enter_smm() a chance to make ISA-specific changes to the vCPU
 +       * state (e.g. leave guest mode) after we've saved the state into the
 +       * SMM state-save area.
         */
 -      static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
 +      static_call(kvm_x86_enter_smm)(vcpu, buf);
  
 -      vcpu->arch.hflags |= HF_SMM_MASK;
 +      kvm_smm_changed(vcpu, true);
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
  
        if (static_call(kvm_x86_get_nmi_mask)(vcpu))
@@@ -9229,15 -9000,6 +9233,15 @@@ void kvm_vcpu_update_apicv(struct kvm_v
        vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
        kvm_apic_update_apicv(vcpu);
        static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
 +
 +      /*
 +       * When APICv gets disabled, we may still have injected interrupts
 +       * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
 +       * still active when the interrupt got accepted. Make sure
 +       * inject_pending_event() is called to check for that.
 +       */
 +      if (!vcpu->arch.apicv_active)
 +              kvm_make_request(KVM_REQ_EVENT, vcpu);
  }
  EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
  
@@@ -9413,7 -9175,7 +9417,7 @@@ static int vcpu_enter_guest(struct kvm_
                }
                if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
                        kvm_vcpu_flush_tlb_current(vcpu);
 -              if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
 +              if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
                        kvm_vcpu_flush_tlb_guest(vcpu);
  
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
            kvm_xen_has_interrupt(vcpu)) {
                ++vcpu->stat.req_event;
 -              kvm_apic_accept_events(vcpu);
 +              r = kvm_apic_accept_events(vcpu);
 +              if (r < 0) {
 +                      r = 0;
 +                      goto out;
 +              }
                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        r = 1;
                        goto out;
                }
  
 -              inject_pending_event(vcpu, &req_immediate_exit);
 +              r = inject_pending_event(vcpu, &req_immediate_exit);
 +              if (r < 0) {
 +                      r = 0;
 +                      goto out;
 +              }
                if (req_int_win)
                        static_call(kvm_x86_enable_irq_window)(vcpu);
  
@@@ -9722,8 -9476,7 +9726,8 @@@ static inline int vcpu_block(struct kv
                        return 1;
        }
  
 -      kvm_apic_accept_events(vcpu);
 +      if (kvm_apic_accept_events(vcpu) < 0)
 +              return 0;
        switch(vcpu->arch.mp_state) {
        case KVM_MP_STATE_HALTED:
        case KVM_MP_STATE_AP_RESET_HOLD:
@@@ -9885,7 -9638,7 +9889,7 @@@ static void kvm_save_current_fpu(struc
                memcpy(&fpu->state, &current->thread.fpu.state,
                       fpu_kernel_xstate_size);
        else
-               copy_fpregs_to_fpstate(fpu);
+               save_fpregs_to_fpstate(fpu);
  }
  
  /* Swap (qemu) user FPU context for the guest FPU context. */
@@@ -9901,7 -9654,7 +9905,7 @@@ static void kvm_load_guest_fpu(struct k
         */
        if (vcpu->arch.guest_fpu)
                /* PKRU is separately restored in kvm_x86_ops.run. */
-               __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+               __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
                                        ~XFEATURE_MASK_PKRU);
  
        fpregs_mark_activate();
@@@ -9922,7 -9675,7 +9926,7 @@@ static void kvm_put_guest_fpu(struct kv
        if (vcpu->arch.guest_fpu)
                kvm_save_current_fpu(vcpu->arch.guest_fpu);
  
-       copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
+       restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
  
        fpregs_mark_activate();
        fpregs_unlock();
@@@ -9947,10 -9700,7 +9951,10 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                        goto out;
                }
                kvm_vcpu_block(vcpu);
 -              kvm_apic_accept_events(vcpu);
 +              if (kvm_apic_accept_events(vcpu) < 0) {
 +                      r = 0;
 +                      goto out;
 +              }
                kvm_clear_request(KVM_REQ_UNHALT, vcpu);
                r = -EAGAIN;
                if (signal_pending(current)) {
@@@ -10099,7 -9849,7 +10103,7 @@@ void kvm_get_cs_db_l_bits(struct kvm_vc
  }
  EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
  
 -static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 +static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
  {
        struct desc_ptr dt;
  
@@@ -10132,36 -9882,14 +10136,36 @@@ skip_protected_regs
        sregs->cr8 = kvm_get_cr8(vcpu);
        sregs->efer = vcpu->arch.efer;
        sregs->apic_base = kvm_get_apic_base(vcpu);
 +}
 +
 +static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 +{
 +      __get_sregs_common(vcpu, sregs);
  
 -      memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
 +      if (vcpu->arch.guest_state_protected)
 +              return;
  
        if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
                set_bit(vcpu->arch.interrupt.nr,
                        (unsigned long *)sregs->interrupt_bitmap);
  }
  
 +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
 +{
 +      int i;
 +
 +      __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
 +
 +      if (vcpu->arch.guest_state_protected)
 +              return;
 +
 +      if (is_pae_paging(vcpu)) {
 +              for (i = 0 ; i < 4 ; i++)
 +                      sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
 +              sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
 +      }
 +}
 +
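Editor's note: the new layout is what the SREGS2 vCPU ioctls added in this series expose to userspace. A hedged VMM-side sketch of reading the PDPTRs; KVM_GET_SREGS2 and the flag name are assumed from the uapi, and vcpu_fd is a hypothetical vCPU file descriptor.

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void dump_pdptrs(int vcpu_fd)
    {
            struct kvm_sregs2 sregs2;

            if (ioctl(vcpu_fd, KVM_GET_SREGS2, &sregs2) < 0) {
                    perror("KVM_GET_SREGS2");
                    return;
            }

            if (!(sregs2.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID))
                    return;         /* guest is not using PAE paging */

            for (int i = 0; i < 4; i++)
                    printf("pdptr[%d] = 0x%llx\n", i,
                           (unsigned long long)sregs2.pdptrs[i]);
    }
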
  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
  {
  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
 +      int r;
 +
        vcpu_load(vcpu);
        if (kvm_mpx_supported())
                kvm_load_guest_fpu(vcpu);
  
 -      kvm_apic_accept_events(vcpu);
 +      r = kvm_apic_accept_events(vcpu);
 +      if (r < 0)
 +              goto out;
 +      r = 0;
 +
        if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
             vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
            vcpu->arch.pv.pv_unhalted)
        else
                mp_state->mp_state = vcpu->arch.mp_state;
  
 +out:
        if (kvm_mpx_supported())
                kvm_put_guest_fpu(vcpu);
        vcpu_put(vcpu);
 -      return 0;
 +      return r;
  }
  
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
@@@ -10280,23 -10001,24 +10284,23 @@@ static bool kvm_is_valid_sregs(struct k
        return kvm_is_valid_cr4(vcpu, sregs->cr4);
  }
  
 -static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 +static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
 +              int *mmu_reset_needed, bool update_pdptrs)
  {
        struct msr_data apic_base_msr;
 -      int mmu_reset_needed = 0;
 -      int pending_vec, max_bits, idx;
 +      int idx;
        struct desc_ptr dt;
 -      int ret = -EINVAL;
  
        if (!kvm_is_valid_sregs(vcpu, sregs))
 -              goto out;
 +              return -EINVAL;
  
        apic_base_msr.data = sregs->apic_base;
        apic_base_msr.host_initiated = true;
        if (kvm_set_apic_base(vcpu, &apic_base_msr))
 -              goto out;
 +              return -EINVAL;
  
        if (vcpu->arch.guest_state_protected)
 -              goto skip_protected_regs;
 +              return 0;
  
        dt.size = sregs->idt.limit;
        dt.address = sregs->idt.base;
        static_call(kvm_x86_set_gdt)(vcpu, &dt);
  
        vcpu->arch.cr2 = sregs->cr2;
 -      mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
 +      *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
  
        kvm_set_cr8(vcpu, sregs->cr8);
  
 -      mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
 +      *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
        static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
  
 -      mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
 +      *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
        vcpu->arch.cr0 = sregs->cr0;
  
 -      mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 +      *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
  
 -      idx = srcu_read_lock(&vcpu->kvm->srcu);
 -      if (is_pae_paging(vcpu)) {
 -              load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
 -              mmu_reset_needed = 1;
 +      if (update_pdptrs) {
 +              idx = srcu_read_lock(&vcpu->kvm->srcu);
 +              if (is_pae_paging(vcpu)) {
 +                      load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
 +                      *mmu_reset_needed = 1;
 +              }
 +              srcu_read_unlock(&vcpu->kvm->srcu, idx);
        }
 -      srcu_read_unlock(&vcpu->kvm->srcu, idx);
 -
 -      if (mmu_reset_needed)
 -              kvm_mmu_reset_context(vcpu);
  
        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
            !is_protmode(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  
 -skip_protected_regs:
 +      return 0;
 +}
 +
 +static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 +{
 +      int pending_vec, max_bits;
 +      int mmu_reset_needed = 0;
 +      int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
 +
 +      if (ret)
 +              return ret;
 +
 +      if (mmu_reset_needed)
 +              kvm_mmu_reset_context(vcpu);
 +
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
 +
        if (pending_vec < max_bits) {
                kvm_queue_interrupt(vcpu, pending_vec, false);
                pr_debug("Set back pending irq %d\n", pending_vec);
 +              kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 +      return 0;
 +}
  
 -      kvm_make_request(KVM_REQ_EVENT, vcpu);
 +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
 +{
 +      int mmu_reset_needed = 0;
 +      bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
 +      bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
 +              !(sregs2->efer & EFER_LMA);
 +      int i, ret;
  
 -      ret = 0;
 -out:
 -      return ret;
 +      if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
 +              return -EINVAL;
 +
 +      if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
 +              return -EINVAL;
 +
 +      ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
 +                               &mmu_reset_needed, !valid_pdptrs);
 +      if (ret)
 +              return ret;
 +
 +      if (valid_pdptrs) {
 +              for (i = 0; i < 4 ; i++)
 +                      kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
 +
 +              kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 +              mmu_reset_needed = 1;
 +              vcpu->arch.pdptrs_from_userspace = true;
 +      }
 +      if (mmu_reset_needed)
 +              kvm_mmu_reset_context(vcpu);
 +      return 0;
  }
  
  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@@ -10629,13 -10309,13 +10633,13 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        struct page *page;
        int r;
  
 +      vcpu->arch.last_vmentry_cpu = -1;
 +
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
  
 -      kvm_set_tsc_khz(vcpu, max_tsc_khz);
 -
        r = kvm_mmu_create(vcpu);
        if (r < 0)
                return r;
        vcpu->arch.pending_external_vector = -1;
        vcpu->arch.preempted_in_kernel = false;
  
 +#if IS_ENABLED(CONFIG_HYPERV)
 +      vcpu->arch.hv_root_tdp = INVALID_PAGE;
 +#endif
 +
        r = static_call(kvm_x86_vcpu_create)(vcpu);
        if (r)
                goto free_guest_fpu;
        vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
        kvm_vcpu_mtrr_init(vcpu);
        vcpu_load(vcpu);
 +      kvm_set_tsc_khz(vcpu, max_tsc_khz);
        kvm_vcpu_reset(vcpu, false);
 -      kvm_init_mmu(vcpu, false);
 +      kvm_init_mmu(vcpu);
        vcpu_put(vcpu);
        return 0;
  
@@@ -10783,8 -10458,6 +10787,8 @@@ void kvm_arch_vcpu_destroy(struct kvm_v
  
  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  {
 +      unsigned long old_cr0 = kvm_read_cr0(vcpu);
 +
        kvm_lapic_reset(vcpu, init_event);
  
        vcpu->arch.hflags = 0;
        vcpu->arch.ia32_xss = 0;
  
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
 +
 +      /*
 +       * Reset the MMU context if paging was enabled prior to INIT (which is
 +       * implied if CR0.PG=1 as CR0 will be '0' prior to RESET).  Unlike the
 +       * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
 +       * checked because it is unconditionally cleared on INIT and all other
 +       * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
 +       * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
 +       */
 +      if (old_cr0 & X86_CR0_PG)
 +              kvm_mmu_reset_context(vcpu);
  }
  
  void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@@ -10981,9 -10643,6 +10985,9 @@@ int kvm_arch_hardware_setup(void *opaqu
        int r;
  
        rdmsrl_safe(MSR_EFER, &host_efer);
 +      if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) &&
 +                       !(host_efer & EFER_NX)))
 +              return -EIO;
  
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
@@@ -11099,15 -10758,9 +11103,15 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
  
        kvm->arch.guest_can_read_msr_platform_info = true;
  
 +#if IS_ENABLED(CONFIG_HYPERV)
 +      spin_lock_init(&kvm->arch.hv_root_tdp_lock);
 +      kvm->arch.hv_root_tdp = INVALID_PAGE;
 +#endif
 +
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
  
 +      kvm_apicv_init(kvm);
        kvm_hv_init_vm(kvm);
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
@@@ -11268,23 -10921,17 +11272,23 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kvm_hv_destroy_vm(kvm);
  }
  
 -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 +static void memslot_rmap_free(struct kvm_memory_slot *slot)
  {
        int i;
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.rmap[i]);
                slot->arch.rmap[i] = NULL;
 +      }
 +}
  
 -              if (i == 0)
 -                      continue;
 +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 +{
 +      int i;
  
 +      memslot_rmap_free(slot);
 +
 +      for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
        kvm_page_track_free_memslot(slot);
  }
  
 -static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
 -                                    unsigned long npages)
 +static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
 +                            unsigned long npages)
  {
 +      const int sz = sizeof(*slot->arch.rmap[0]);
        int i;
  
 +      for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 +              int level = i + 1;
 +              int lpages = gfn_to_index(slot->base_gfn + npages - 1,
 +                                        slot->base_gfn, level) + 1;
 +
 +              WARN_ON(slot->arch.rmap[i]);
 +
 +              slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
 +              if (!slot->arch.rmap[i]) {
 +                      memslot_rmap_free(slot);
 +                      return -ENOMEM;
 +              }
 +      }
 +
 +      return 0;
 +}
 +
 +int alloc_all_memslots_rmaps(struct kvm *kvm)
 +{
 +      struct kvm_memslots *slots;
 +      struct kvm_memory_slot *slot;
 +      int r, i;
 +
 +      /*
 +       * Check if memslots already have rmaps early before acquiring
 +       * the slots_arch_lock below.
 +       */
 +      if (kvm_memslots_have_rmaps(kvm))
 +              return 0;
 +
 +      mutex_lock(&kvm->slots_arch_lock);
 +
 +      /*
 +       * Read memslots_have_rmaps again, under the slots arch lock,
 +       * before allocating the rmaps
 +       * before allocating the rmaps.
 +      if (kvm_memslots_have_rmaps(kvm)) {
 +              mutex_unlock(&kvm->slots_arch_lock);
 +              return 0;
 +      }
 +
 +      for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 +              slots = __kvm_memslots(kvm, i);
 +              kvm_for_each_memslot(slot, slots) {
 +                      r = memslot_rmap_alloc(slot, slot->npages);
 +                      if (r) {
 +                              mutex_unlock(&kvm->slots_arch_lock);
 +                              return r;
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * Ensure that memslots_have_rmaps becomes true strictly after
 +       * all the rmap pointers are set.
 +       */
 +      smp_store_release(&kvm->arch.memslots_have_rmaps, true);
 +      mutex_unlock(&kvm->slots_arch_lock);
 +      return 0;
 +}
 +
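Editor's note: alloc_all_memslots_rmaps() is a double-checked publish: a racy early check, the lock, a re-check under the lock, and finally smp_store_release() so that anyone who observes memslots_have_rmaps as true also sees the populated rmap pointers (readers are expected to pair this with an acquire load). A stand-alone user-space analogue of the same pattern with C11 atomics, purely for illustration:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    static _Atomic int have_rmaps;
    static int *rmaps;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Allocate 'rmaps' at most once; the release store publishes the
     * allocation, the acquire load makes it visible to lock-free readers. */
    static int alloc_rmaps_once(size_t n)
    {
            if (atomic_load_explicit(&have_rmaps, memory_order_acquire))
                    return 0;               /* fast path: already allocated */

            pthread_mutex_lock(&lock);
            if (!atomic_load_explicit(&have_rmaps, memory_order_relaxed)) {
                    rmaps = calloc(n, sizeof(*rmaps));
                    if (!rmaps) {
                            pthread_mutex_unlock(&lock);
                            return -1;
                    }
                    /* set the flag only after the pointer is fully set up */
                    atomic_store_explicit(&have_rmaps, 1, memory_order_release);
            }
            pthread_mutex_unlock(&lock);
            return 0;
    }
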
 +static int kvm_alloc_memslot_metadata(struct kvm *kvm,
 +                                    struct kvm_memory_slot *slot,
 +                                    unsigned long npages)
 +{
 +      int i, r;
 +
        /*
         * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
         * old arrays will be freed by __kvm_set_memory_region() if installing
         */
        memset(&slot->arch, 0, sizeof(slot->arch));
  
 -      for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 +      if (kvm_memslots_have_rmaps(kvm)) {
 +              r = memslot_rmap_alloc(slot, npages);
 +              if (r)
 +                      return r;
 +      }
 +
 +      for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
                lpages = gfn_to_index(slot->base_gfn + npages - 1,
                                      slot->base_gfn, level) + 1;
  
 -              slot->arch.rmap[i] =
 -                      kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
 -                               GFP_KERNEL_ACCOUNT);
 -              if (!slot->arch.rmap[i])
 -                      goto out_free;
 -              if (i == 0)
 -                      continue;
 -
                linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
                if (!linfo)
                        goto out_free;
        return 0;
  
  out_free:
 -      for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 -              kvfree(slot->arch.rmap[i]);
 -              slot->arch.rmap[i] = NULL;
 -              if (i == 0)
 -                      continue;
 +      memslot_rmap_free(slot);
  
 +      for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
@@@ -11447,7 -11031,7 +11451,7 @@@ int kvm_arch_prepare_memory_region(stru
                                enum kvm_mr_change change)
  {
        if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
 -              return kvm_alloc_memslot_metadata(memslot,
 +              return kvm_alloc_memslot_metadata(kvm, memslot,
                                                  mem->memory_size >> PAGE_SHIFT);
        return 0;
  }
@@@ -11523,19 -11107,36 +11527,19 @@@ static void kvm_mmu_slot_apply_flags(st
                 */
                kvm_mmu_zap_collapsible_sptes(kvm, new);
        } else {
 -              /* By default, write-protect everything to log writes. */
 -              int level = PG_LEVEL_4K;
 +              /*
 +               * Initially-all-set does not require write protecting any page,
 +               * because all pages are assumed to be dirty.
 +               */
 +              if (kvm_dirty_log_manual_protect_and_init_set(kvm))
 +                      return;
  
                if (kvm_x86_ops.cpu_dirty_log_size) {
 -                      /*
 -                       * Clear all dirty bits, unless pages are treated as
 -                       * dirty from the get-go.
 -                       */
 -                      if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
 -                              kvm_mmu_slot_leaf_clear_dirty(kvm, new);
 -
 -                      /*
 -                       * Write-protect large pages on write so that dirty
 -                       * logging happens at 4k granularity.  No need to
 -                       * write-protect small SPTEs since write accesses are
 -                       * logged by the CPU via dirty bits.
 -                       */
 -                      level = PG_LEVEL_2M;
 -              } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
 -                      /*
 -                       * If we're with initial-all-set, we don't need
 -                       * to write protect any small page because
 -                       * they're reported as dirty already.  However
 -                       * we still need to write-protect huge pages
 -                       * so that the page split can happen lazily on
 -                       * the first write to the huge page.
 -                       */
 -                      level = PG_LEVEL_2M;
 +                      kvm_mmu_slot_leaf_clear_dirty(kvm, new);
 +                      kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
 +              } else {
 +                      kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
                }
 -              kvm_mmu_slot_remove_write_access(kvm, new, level);
        }
  }
  
@@@ -12104,6 -11705,8 +12108,6 @@@ int kvm_handle_invpcid(struct kvm_vcpu 
  {
        bool pcid_enabled;
        struct x86_exception e;
 -      unsigned i;
 -      unsigned long roots_to_free = 0;
        struct {
                u64 pcid;
                u64 gla;
                        return 1;
                }
  
 -              if (kvm_get_active_pcid(vcpu) == operand.pcid) {
 -                      kvm_mmu_sync_roots(vcpu);
 -                      kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 -              }
 -
 -              for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 -                      if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
 -                          == operand.pcid)
 -                              roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 -
 -              kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
 -              /*
 -               * If neither the current cr3 nor any of the prev_roots use the
 -               * given PCID, then nothing needs to be done here because a
 -               * resync will happen anyway before switching to any other CR3.
 -               */
 -
 +              kvm_invalidate_pcid(vcpu, operand.pcid);
                return kvm_skip_emulated_instruction(vcpu);
  
        case INVPCID_TYPE_ALL_NON_GLOBAL:
  
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
 -              kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 +              kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
  
        default:
diff --combined arch/x86/mm/fault.c
index 2d27932c9ac7b2a4a0827ac1b7e259519e6e7edf,f33a61a432ce4feff89f376f5e66837bfa7ff526..b2eefdefc1083316ea27e6bfd8bb1ddbcea715ab
@@@ -875,7 -875,7 +875,7 @@@ static inline bool bad_area_access_from
        /* This code is always called on the current mm */
        bool foreign = false;
  
-       if (!boot_cpu_has(X86_FEATURE_OSPKE))
+       if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return false;
        if (error_code & X86_PF_PK)
                return true;
@@@ -1186,7 -1186,7 +1186,7 @@@ do_kern_addr_fault(struct pt_regs *regs
                return;
  
        /* kprobes don't want to hook the spurious faults: */
 -      if (kprobe_page_fault(regs, X86_TRAP_PF))
 +      if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;
  
        /*
@@@ -1239,7 -1239,7 +1239,7 @@@ void do_user_addr_fault(struct pt_regs 
        }
  
        /* kprobes don't want to hook the spurious faults: */
 -      if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
 +      if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;
  
        /*
diff --combined arch/x86/mm/pkeys.c
index 4a67b922bce1ea52ecd115d649e608900ef317eb,fb171a5d7f339dff1e69da24034fec44de49b478..e44e938885b709f267d135e9530c7c88d9566cbb
@@@ -10,7 -10,6 +10,6 @@@
  
  #include <asm/cpufeature.h>             /* boot_cpu_has, ...            */
  #include <asm/mmu_context.h>            /* vma_pkey()                   */
- #include <asm/fpu/internal.h>         /* init_fpstate                 */
  
  int __execute_only_pkey(struct mm_struct *mm)
  {
@@@ -125,22 -124,6 +124,6 @@@ u32 init_pkru_value = PKRU_AD_KEY( 1) 
                      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
                      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
  
- /*
-  * Called from the FPU code when creating a fresh set of FPU
-  * registers.  This is called from a very specific context where
-  * we know the FPU registers are safe for use and we can use PKRU
-  * directly.
-  */
- void copy_init_pkru_to_fpregs(void)
- {
-       u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
-       /*
-        * Override the PKRU state that came from 'init_fpstate'
-        * with the baseline from the process.
-        */
-       write_pkru(init_pkru_value_snapshot);
- }
  static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
                             size_t count, loff_t *ppos)
  {
  static ssize_t init_pkru_write_file(struct file *file,
                 const char __user *user_buf, size_t count, loff_t *ppos)
  {
-       struct pkru_state *pk;
        char buf[32];
        ssize_t len;
        u32 new_init_pkru;
                return -EINVAL;
  
        WRITE_ONCE(init_pkru_value, new_init_pkru);
-       pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
-       if (!pk)
-               return -EINVAL;
-       pk->pkru = new_init_pkru;
        return count;
  }
  
@@@ -192,10 -170,6 +170,10 @@@ static const struct file_operations fop
  
  static int __init create_init_pkru_value(void)
  {
 +      /* Do not expose the file if pkeys are not supported. */
 +      if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
 +              return 0;
 +
        debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
                        arch_debugfs_dir, NULL, &fops_init_pkru);
        return 0;
diff --combined include/linux/sched/signal.h
index c9cf678c347dc778c67235965fdacb7133724327,0d7fec79d28ffb00ff6178a0c7c20ccf07f8994f..b9126fe06c3fce036050bcf32698ca8a587cbc86
@@@ -382,7 -382,7 +382,7 @@@ static inline int fatal_signal_pending(
        return task_sigpending(p) && __fatal_signal_pending(p);
  }
  
 -static inline int signal_pending_state(long state, struct task_struct *p)
 +static inline int signal_pending_state(unsigned int state, struct task_struct *p)
  {
        if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                return 0;
@@@ -538,6 -538,17 +538,17 @@@ static inline int kill_cad_pid(int sig
  #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
  #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1)
  
+ static inline int __on_sig_stack(unsigned long sp)
+ {
+ #ifdef CONFIG_STACK_GROWSUP
+       return sp >= current->sas_ss_sp &&
+               sp - current->sas_ss_sp < current->sas_ss_size;
+ #else
+       return sp > current->sas_ss_sp &&
+               sp - current->sas_ss_sp <= current->sas_ss_size;
+ #endif
+ }
  /*
   * True if we are on the alternate signal stack.
   */
@@@ -555,13 -566,7 +566,7 @@@ static inline int on_sig_stack(unsigne
        if (current->sas_ss_flags & SS_AUTODISARM)
                return 0;
  
- #ifdef CONFIG_STACK_GROWSUP
-       return sp >= current->sas_ss_sp &&
-               sp - current->sas_ss_sp < current->sas_ss_size;
- #else
-       return sp > current->sas_ss_sp &&
-               sp - current->sas_ss_sp <= current->sas_ss_size;
- #endif
+       return __on_sig_stack(sp);
  }
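Editor's note: the bounds are deliberately asymmetric. On the common downward-growing stack, sas_ss_sp itself does not count as being on the alternate stack, while sas_ss_sp + sas_ss_size does, because a freshly switched stack pointer starts at the top of the area and moves down. A tiny stand-alone check of the !CONFIG_STACK_GROWSUP arithmetic, not kernel code:

    #include <assert.h>

    /* Mirrors the downward-growing branch of __on_sig_stack() above. */
    static int on_alt_stack(unsigned long sp, unsigned long ss_sp,
                            unsigned long ss_size)
    {
            return sp > ss_sp && sp - ss_sp <= ss_size;
    }

    int main(void)
    {
            assert(on_alt_stack(0x2000, 0x1000, 0x1000));  /* top of the area */
            assert(on_alt_stack(0x1001, 0x1000, 0x1000));  /* last usable byte */
            assert(!on_alt_stack(0x1000, 0x1000, 0x1000)); /* base address itself */
            assert(!on_alt_stack(0x2001, 0x1000, 0x1000)); /* just above the area */
            return 0;
    }
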
  
  static inline int sas_ss_flags(unsigned long sp)