Merge tag 'x86-fpu-2021-07-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author Linus Torvalds <[email protected]>
Wed, 7 Jul 2021 18:12:01 +0000 (11:12 -0700)
committer Linus Torvalds <[email protected]>
Wed, 7 Jul 2021 18:12:01 +0000 (11:12 -0700)
Pull x86 fpu updates from Thomas Gleixner:
 "Fixes and improvements for FPU handling on x86:

   - Prevent sigaltstack out of bounds writes.

     The kernel unconditionally writes the FPU state to the alternate
     stack without checking whether the stack is large enough to
     accommodate it.

     Check the alternate stack size before writing the state and, if the
     stack is too small, force a SIGSEGV instead of silently corrupting
     user space data.

   - MINSIGSTKSZ and SIGSTKSZ are constants in signal.h and have never
     been updated, even though the FPU state stored on the signal stack
     has grown over time; this causes trouble in the field when AVX512
     is available on a CPU. The kernel does not expose the minimum
     requirements for the alternate stack size depending on the
     available and enabled CPU features.

     ARM already added an aux vector, AT_MINSIGSTKSZ, for the same
     reason. Add it to x86 as well (a userspace usage sketch follows
     this message).

   - A major cleanup of the x86 FPU code. The recent discoveries of
     XSTATE-related issues unearthed quite a few inconsistencies,
     duplicated code and other problems.

     The fine-grained overhaul addresses these, makes the code more
     robust and maintainable, and allows upcoming XSTATE-related
     features to be integrated in sane ways"
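
A minimal illustration (not taken from the quoted message or the diff
below) of how userspace is expected to consume AT_MINSIGSTKSZ: query the
aux vector, fall back to the legacy constant, and hand the result to
sigaltstack(). It assumes a libc that provides getauxval(); if the libc
headers lack AT_MINSIGSTKSZ, it is aux vector entry 51 in the kernel's
uapi auxvec.h.

#include <signal.h>
#include <stdlib.h>
#include <sys/auxv.h>

#ifndef AT_MINSIGSTKSZ
#define AT_MINSIGSTKSZ	51	/* include/uapi/linux/auxvec.h */
#endif

static int setup_altstack(void)
{
	stack_t ss = { 0 };
	/* getauxval() returns 0 if the kernel does not provide the entry. */
	unsigned long minsz = getauxval(AT_MINSIGSTKSZ);

	if (minsz < MINSIGSTKSZ)
		minsz = MINSIGSTKSZ;

	/* Headroom for the handler's own stack usage on top of the
	 * kernel-reported signal frame size. */
	ss.ss_size = minsz + SIGSTKSZ;
	ss.ss_sp = malloc(ss.ss_size);
	if (!ss.ss_sp)
		return -1;

	return sigaltstack(&ss, NULL);
}

Handlers registered with SA_ONSTACK then run on this stack; with the size
check added by this merge, a stack that is still too small gets a SIGSEGV
instead of silent corruption of adjacent user memory.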

* tag 'x86-fpu-2021-07-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (74 commits)
  x86/fpu/xstate: Clear xstate header in copy_xstate_to_uabi_buf() again
  x86/fpu/signal: Let xrstor handle the features to init
  x86/fpu/signal: Handle #PF in the direct restore path
  x86/fpu: Return proper error codes from user access functions
  x86/fpu/signal: Split out the direct restore code
  x86/fpu/signal: Sanitize copy_user_to_fpregs_zeroing()
  x86/fpu/signal: Sanitize the xstate check on sigframe
  x86/fpu/signal: Remove the legacy alignment check
  x86/fpu/signal: Move initial checks into fpu__restore_sig()
  x86/fpu: Mark init_fpstate __ro_after_init
  x86/pkru: Remove xstate fiddling from write_pkru()
  x86/fpu: Don't store PKRU in xstate in fpu_reset_fpstate()
  x86/fpu: Remove PKRU handling from switch_fpu_finish()
  x86/fpu: Mask PKRU from kernel XRSTOR[S] operations
  x86/fpu: Hook up PKRU into ptrace()
  x86/fpu: Add PKRU storage outside of task XSAVE buffer
  x86/fpu: Dont restore PKRU in fpregs_restore_userspace()
  x86/fpu: Rename xfeatures_mask_user() to xfeatures_mask_uabi()
  x86/fpu: Move FXSAVE_LEAK quirk info __copy_kernel_to_fpregs()
  x86/fpu: Rename __fpregs_load_activate() to fpregs_restore_userregs()
  ...

Documentation/x86/index.rst
arch/x86/events/intel/lbr.c
arch/x86/include/asm/processor.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/process.c
arch/x86/kernel/signal.c
arch/x86/kernel/traps.c
arch/x86/kvm/x86.c
arch/x86/mm/fault.c
arch/x86/mm/pkeys.c
include/linux/sched/signal.h

diff --combined Documentation/x86/index.rst
index 0004f5d2283ee0e4c9da4514df0fc9e040e67f78,d58614d5cde6b88ca267a8323b8fb2868f978e19..383048396336fde5c66f7d3b96a54f796f2f6d76
@@@ -29,10 -29,10 +29,11 @@@ x86-specific Documentatio
     microcode
     resctrl
     tsx_async_abort
 +   buslock
     usb-legacy-support
     i386/index
     x86_64/index
     sva
     sgx
     features
+    elf_auxvec
diff --combined arch/x86/events/intel/lbr.c
index e8453de7a96485700e308f340ecd16fd284472c1,f338645071c8959c60cef2f644076713d33d95f8..9e6d6eaeb4cb6037c13e259cf98fdab37c74b1e0
@@@ -491,7 -491,7 +491,7 @@@ static void intel_pmu_arch_lbr_xrstors(
  {
        struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
  
-       copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR);
+       xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR);
  }
  
  static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
@@@ -576,7 -576,7 +576,7 @@@ static void intel_pmu_arch_lbr_xsaves(v
  {
        struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
  
-       copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR);
+       xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR);
  }
  
  static void __intel_pmu_lbr_save(void *ctx)
@@@ -731,8 -731,7 +731,8 @@@ void reserve_lbr_buffers(void
                if (!kmem_cache || cpuc->lbr_xsave)
                        continue;
  
 -              cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, GFP_KERNEL,
 +              cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache,
 +                                                      GFP_KERNEL | __GFP_ZERO,
                                                        cpu_to_node(cpu));
        }
  }
@@@ -993,7 -992,7 +993,7 @@@ static void intel_pmu_arch_lbr_read_xsa
                intel_pmu_store_lbr(cpuc, NULL);
                return;
        }
-       copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR);
+       xsaves(&xsave->xsave, XFEATURE_MASK_LBR);
  
        intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
  }
diff --combined arch/x86/include/asm/processor.h
index 364d0e42e28000f9ffd0fe2fac1725ab4bc64c58,91946fc3c006ec99649fb3b71cba4b26ded4bc13..f3020c54e2cb3db1347a3e497770be8648f71b8a
@@@ -518,6 -518,15 +518,15 @@@ struct thread_struct 
  
        unsigned int            sig_on_uaccess_err:1;
  
+       /*
+        * Protection Keys Register for Userspace.  Loaded immediately on
+        * context switch. Store it in thread_struct to avoid a lookup in
+        * the tasks's FPU xstate buffer. This value is only valid when a
+        * task is scheduled out. For 'current' the authoritative source of
+        * PKRU is the hardware itself.
+        */
+       u32                     pkru;
        /* Floating point and extended processor state */
        struct fpu              fpu;
        /*
@@@ -663,7 -672,6 +672,7 @@@ extern void load_direct_gdt(int)
  extern void load_fixmap_gdt(int);
  extern void load_percpu_segment(int);
  extern void cpu_init(void);
 +extern void cpu_init_secondary(void);
  extern void cpu_init_exception_handling(void);
  extern void cr4_init(void);
  
diff --combined arch/x86/kernel/cpu/common.c
index a99d00393206b9edbbcd645e16e1d4c15fc2d6bc,ca668efa4c81e2d47eafba930ab4a6fb62a4e5fe..64b805bd6a542ba2ad2a18f5fdb9e911f6dadb65
@@@ -58,6 -58,7 +58,7 @@@
  #include <asm/intel-family.h>
  #include <asm/cpu_device_id.h>
  #include <asm/uv/uv.h>
+ #include <asm/sigframe.h>
  
  #include "cpu.h"
  
@@@ -465,27 -466,22 +466,22 @@@ static bool pku_disabled
  
  static __always_inline void setup_pku(struct cpuinfo_x86 *c)
  {
-       struct pkru_state *pk;
+       if (c == &boot_cpu_data) {
+               if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU))
+                       return;
+               /*
+                * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid
+                * bit to be set.  Enforce it.
+                */
+               setup_force_cpu_cap(X86_FEATURE_OSPKE);
  
-       /* check the boot processor, plus compile options for PKU: */
-       if (!cpu_feature_enabled(X86_FEATURE_PKU))
-               return;
-       /* checks the actual processor's cpuid bits: */
-       if (!cpu_has(c, X86_FEATURE_PKU))
-               return;
-       if (pku_disabled)
+       } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                return;
+       }
  
        cr4_set_bits(X86_CR4_PKE);
-       pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
-       if (pk)
-               pk->pkru = init_pkru_value;
-       /*
-        * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
-        * cpuid bit to be set.  We need to ensure that we
-        * update that bit in this CPU's "cpu_info".
-        */
-       set_cpu_cap(c, X86_FEATURE_OSPKE);
+       /* Load the default PKRU value */
+       pkru_write_default();
  }
  
  #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@@ -1332,6 -1328,8 +1328,8 @@@ static void __init early_identify_cpu(s
  
        fpu__init_system(c);
  
+       init_sigframe_size();
  #ifdef CONFIG_X86_32
        /*
         * Regardless of whether PCID is enumerated, the SDM says
@@@ -1717,9 -1715,8 +1715,8 @@@ void print_cpu_info(struct cpuinfo_x86 
  }
  
  /*
-  * clearcpuid= was already parsed in fpu__init_parse_early_param.
-  * But we need to keep a dummy __setup around otherwise it would
-  * show up as an environment variable for init.
+  * clearcpuid= was already parsed in cpu_parse_early_param().  This dummy
+  * function prevents it from becoming an environment variable for init.
   */
  static __init int setup_clearcpuid(char *arg)
  {
@@@ -1773,16 -1770,10 +1770,16 @@@ void syscall_init(void
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
  #endif
  
 -      /* Flags to clear on syscall */
 +      /*
 +       * Flags to clear on syscall; clear as much as possible
 +       * to minimize user space-kernel interference.
 +       */
        wrmsrl(MSR_SYSCALL_MASK,
 -             X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
 -             X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
 +             X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
 +             X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
 +             X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
 +             X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
 +             X86_EFLAGS_AC|X86_EFLAGS_ID);
  }
  
  #else /* CONFIG_X86_64 */
@@@ -1944,12 -1935,13 +1941,12 @@@ void cpu_init_exception_handling(void
  
  /*
   * cpu_init() initializes state that is per-CPU. Some data is already
 - * initialized (naturally) in the bootstrap process, such as the GDT
 - * and IDT. We reload them nevertheless, this function acts as a
 - * 'CPU state barrier', nothing should get across.
 + * initialized (naturally) in the bootstrap process, such as the GDT.  We
 + * reload it nevertheless, this function acts as a 'CPU state barrier',
 + * nothing should get across.
   */
  void cpu_init(void)
  {
 -      struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
        struct task_struct *cur = current;
        int cpu = raw_smp_processor_id();
  
            early_cpu_to_node(cpu) != NUMA_NO_NODE)
                set_numa_node(early_cpu_to_node(cpu));
  #endif
 -      setup_getcpu(cpu);
 -
        pr_debug("Initializing CPU#%d\n", cpu);
  
        if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
         * and set up the GDT descriptor:
         */
        switch_to_new_gdt(cpu);
 -      load_current_idt();
  
        if (IS_ENABLED(CONFIG_X86_64)) {
                loadsegment(fs, 0);
        initialize_tlbstate_and_flush();
        enter_lazy_tlb(&init_mm, cur);
  
 -      /* Initialize the TSS. */
 -      tss_setup_ist(tss);
 -      tss_setup_io_bitmap(tss);
 -      set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 -
 -      load_TR_desc();
        /*
         * sp0 points to the entry trampoline stack regardless of what task
         * is running.
        load_fixmap_gdt(cpu);
  }
  
 +#ifdef CONFIG_SMP
 +void cpu_init_secondary(void)
 +{
 +      /*
 +       * Relies on the BP having set-up the IDT tables, which are loaded
 +       * on this CPU in cpu_init_exception_handling().
 +       */
 +      cpu_init_exception_handling();
 +      cpu_init();
 +}
 +#endif
 +
  /*
   * The microcode loader calls this upon late microcode load to recheck features,
   * only when microcode has been updated. Caller holds microcode_mutex and CPU
diff --combined arch/x86/kernel/process.c
index e52b208b4641b50537c52d05da353ad9cf8cd64b,fa6c8fa0f7788484389bdf3159269da6758a3770..1d9463e3096b68307e96445026e19cd01d98464e
@@@ -87,8 -87,7 +87,7 @@@ int arch_dup_task_struct(struct task_st
  #ifdef CONFIG_VM86
        dst->thread.vm86 = NULL;
  #endif
-       return fpu__copy(dst, src);
+       return fpu_clone(dst);
  }
  
  /*
@@@ -157,11 -156,18 +156,18 @@@ int copy_thread(unsigned long clone_fla
  
        /* Kernel thread ? */
        if (unlikely(p->flags & PF_KTHREAD)) {
+               p->thread.pkru = pkru_get_init_value();
                memset(childregs, 0, sizeof(struct pt_regs));
                kthread_frame_init(frame, sp, arg);
                return 0;
        }
  
+       /*
+        * Clone current's PKRU value from hardware. tsk->thread.pkru
+        * is only valid when scheduled out.
+        */
+       p->thread.pkru = read_pkru();
        frame->bx = 0;
        *childregs = *current_pt_regs();
        childregs->ax = 0;
        return ret;
  }
  
+ static void pkru_flush_thread(void)
+ {
+       /*
+        * If PKRU is enabled the default PKRU value has to be loaded into
+        * the hardware right here (similar to context switch).
+        */
+       pkru_write_default();
+ }
  void flush_thread(void)
  {
        struct task_struct *tsk = current;
        flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
  
-       fpu__clear_all(&tsk->thread.fpu);
+       fpu_flush_thread();
+       pkru_flush_thread();
  }
  
  void disable_TSC(void)
@@@ -931,7 -947,7 +947,7 @@@ unsigned long get_wchan(struct task_str
        unsigned long start, bottom, top, sp, fp, ip, ret = 0;
        int count = 0;
  
 -      if (p == current || p->state == TASK_RUNNING)
 +      if (p == current || task_is_running(p))
                return 0;
  
        if (!try_get_task_stack(p))
                        goto out;
                }
                fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
 -      } while (count++ < 16 && p->state != TASK_RUNNING);
 +      } while (count++ < 16 && !task_is_running(p));
  
  out:
        put_task_stack(p);
diff --combined arch/x86/kernel/signal.c
index e12779a2714dce0a141aab191f032964bd57c1bc,2ddcf2165bcb87bede34d9fcf980a362619ba031..f4d21e47008355a11aa81b02b5a94eb2b44c20a4
@@@ -212,6 -212,11 +212,11 @@@ do {                                                                     
   * Set up a signal frame.
   */
  
+ /* x86 ABI requires 16-byte alignment */
+ #define FRAME_ALIGNMENT       16UL
+ #define MAX_FRAME_PADDING     (FRAME_ALIGNMENT - 1)
  /*
   * Determine which stack to use..
   */
@@@ -222,9 -227,9 +227,9 @@@ static unsigned long align_sigframe(uns
         * Align the stack pointer according to the i386 ABI,
         * i.e. so that on function entry ((sp + 4) & 15) == 0.
         */
-       sp = ((sp + 4) & -16ul) - 4;
+       sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
  #else /* !CONFIG_X86_32 */
-       sp = round_down(sp, 16) - 8;
+       sp = round_down(sp, FRAME_ALIGNMENT) - 8;
  #endif
        return sp;
  }
@@@ -234,10 -239,11 +239,11 @@@ get_sigframe(struct k_sigaction *ka, st
             void __user **fpstate)
  {
        /* Default to using normal stack */
+       bool nested_altstack = on_sig_stack(regs->sp);
+       bool entering_altstack = false;
        unsigned long math_size = 0;
        unsigned long sp = regs->sp;
        unsigned long buf_fx = 0;
-       int onsigstack = on_sig_stack(sp);
        int ret;
  
        /* redzone */
  
        /* This is the X/Open sanctioned signal stack switching.  */
        if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(sp) == 0)
+               /*
+                * This checks nested_altstack via sas_ss_flags(). Sensible
+                * programs use SS_AUTODISARM, which disables that check, and
+                * programs that don't use SS_AUTODISARM get compatible.
+                */
+               if (sas_ss_flags(sp) == 0) {
                        sp = current->sas_ss_sp + current->sas_ss_size;
+                       entering_altstack = true;
+               }
        } else if (IS_ENABLED(CONFIG_X86_32) &&
-                  !onsigstack &&
+                  !nested_altstack &&
                   regs->ss != __USER_DS &&
                   !(ka->sa.sa_flags & SA_RESTORER) &&
                   ka->sa.sa_restorer) {
                /* This is the legacy signal stack switching. */
                sp = (unsigned long) ka->sa.sa_restorer;
+               entering_altstack = true;
        }
  
        sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
         * If we are on the alternate signal stack and would overflow it, don't.
         * Return an always-bogus address instead so we will die with SIGSEGV.
         */
-       if (onsigstack && !likely(on_sig_stack(sp)))
+       if (unlikely((nested_altstack || entering_altstack) &&
+                    !__on_sig_stack(sp))) {
+               if (show_unhandled_signals && printk_ratelimit())
+                       pr_info("%s[%d] overflowed sigaltstack\n",
+                               current->comm, task_pid_nr(current));
                return (void __user *)-1L;
+       }
  
        /* save i387 and extended state */
        ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size);
@@@ -663,6 -684,61 +684,61 @@@ badframe
        return 0;
  }
  
+ /*
+  * There are four different struct types for signal frame: sigframe_ia32,
+  * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+  * -- the largest size. It means the size for 64-bit apps is a bit more
+  * than needed, but this keeps the code simple.
+  */
+ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32)
+ #else
+ # define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe)
+ #endif
+ /*
+  * The FP state frame contains an XSAVE buffer which must be 64-byte aligned.
+  * If a signal frame starts at an unaligned address, extra space is required.
+  * This is the max alignment padding, conservatively.
+  */
+ #define MAX_XSAVE_PADDING     63UL
+ /*
+  * The frame data is composed of the following areas and laid out as:
+  *
+  * -------------------------
+  * | alignment padding     |
+  * -------------------------
+  * | (f)xsave frame        |
+  * -------------------------
+  * | fsave header          |
+  * -------------------------
+  * | alignment padding     |
+  * -------------------------
+  * | siginfo + ucontext    |
+  * -------------------------
+  */
+ /* max_frame_size tells userspace the worst case signal stack size. */
+ static unsigned long __ro_after_init max_frame_size;
+ void __init init_sigframe_size(void)
+ {
+       max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
+       max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING;
+       /* Userspace expects an aligned size. */
+       max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
+       pr_info("max sigframe size: %lu\n", max_frame_size);
+ }
+ unsigned long get_sigframe_size(void)
+ {
+       return max_frame_size;
+ }
  static inline int is_ia32_compat_frame(struct ksignal *ksig)
  {
        return IS_ENABLED(CONFIG_IA32_EMULATION) &&
@@@ -713,7 -789,7 +789,7 @@@ handle_signal(struct ksignal *ksig, str
                save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
  
        /* Are we from a system call? */
 -      if (syscall_get_nr(current, regs) >= 0) {
 +      if (syscall_get_nr(current, regs) != -1) {
                /* If so, check system call restarting.. */
                switch (syscall_get_error(current, regs)) {
                case -ERESTART_RESTARTBLOCK:
@@@ -793,7 -869,7 +869,7 @@@ void arch_do_signal_or_restart(struct p
        }
  
        /* Did we come from a system call? */
 -      if (syscall_get_nr(current, regs) >= 0) {
 +      if (syscall_get_nr(current, regs) != -1) {
                /* Restart the system call - no handlers present */
                switch (syscall_get_error(current, regs)) {
                case -ERESTARTNOHAND:
diff --combined arch/x86/kernel/traps.c
index ed540e09a399a577a5fc31d32ea2e57ffde0480f,4c9c4aa8321681e4b4482e152c9594a6beaf072b..a58800973aed3a16cbb8d97500bae51eb8306716
@@@ -1046,9 -1046,10 +1046,10 @@@ static void math_error(struct pt_regs *
        }
  
        /*
-        * Save the info for the exception handler and clear the error.
+        * Synchronize the FPU register state to the memory register state
+        * if necessary. This allows the exception handler to inspect it.
         */
-       fpu__save(fpu);
+       fpu_sync_fpstate(fpu);
  
        task->thread.trap_nr    = trapnr;
        task->thread.error_code = 0;
@@@ -1160,9 -1161,12 +1161,9 @@@ void __init trap_init(void
        /* Init GHCB memory pages when running as an SEV-ES guest */
        sev_es_init_vc_handling();
  
 +      /* Initialize TSS before setting up traps so ISTs work */
 +      cpu_init_exception_handling();
 +      /* Setup traps as cpu_init() might #GP */
        idt_setup_traps();
 -
 -      /*
 -       * Should be a barrier for any external CPU state:
 -       */
        cpu_init();
 -
 -      idt_setup_ist_traps();
  }
diff --combined arch/x86/kvm/x86.c
index 17468d983fbd57d48150ae4474db24d6dccb9209,8ee7add0e7631a8619c2796cca902c2990eb840e..c6dc1b44523156e292fd01647b4c8ba761cb5042
@@@ -58,7 -58,6 +58,7 @@@
  #include <linux/sched/isolation.h>
  #include <linux/mem_encrypt.h>
  #include <linux/entry-kvm.h>
 +#include <linux/suspend.h>
  
  #include <trace/events/kvm.h>
  
@@@ -66,6 -65,7 +66,7 @@@
  #include <asm/msr.h>
  #include <asm/desc.h>
  #include <asm/mce.h>
+ #include <asm/pkru.h>
  #include <linux/kernel_stat.h>
  #include <asm/fpu/internal.h> /* Ugh! */
  #include <asm/pvclock.h>
@@@ -103,8 -103,6 +104,8 @@@ static u64 __read_mostly efer_reserved_
  
  static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
  
 +#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
 +
  #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                      KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
  
@@@ -116,9 -114,6 +117,9 @@@ static void __kvm_set_rflags(struct kvm
  static void store_regs(struct kvm_vcpu *vcpu);
  static int sync_regs(struct kvm_vcpu *vcpu);
  
 +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 +
  struct kvm_x86_ops kvm_x86_ops __read_mostly;
  EXPORT_SYMBOL_GPL(kvm_x86_ops);
  
@@@ -215,78 -210,55 +216,78 @@@ EXPORT_SYMBOL_GPL(host_efer)
  bool __read_mostly allow_smaller_maxphyaddr = 0;
  EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
  
 +bool __read_mostly enable_apicv = true;
 +EXPORT_SYMBOL_GPL(enable_apicv);
 +
  u64 __read_mostly host_xss;
  EXPORT_SYMBOL_GPL(host_xss);
  u64 __read_mostly supported_xss;
  EXPORT_SYMBOL_GPL(supported_xss);
  
 -struct kvm_stats_debugfs_item debugfs_entries[] = {
 -      VCPU_STAT("pf_fixed", pf_fixed),
 -      VCPU_STAT("pf_guest", pf_guest),
 -      VCPU_STAT("tlb_flush", tlb_flush),
 -      VCPU_STAT("invlpg", invlpg),
 -      VCPU_STAT("exits", exits),
 -      VCPU_STAT("io_exits", io_exits),
 -      VCPU_STAT("mmio_exits", mmio_exits),
 -      VCPU_STAT("signal_exits", signal_exits),
 -      VCPU_STAT("irq_window", irq_window_exits),
 -      VCPU_STAT("nmi_window", nmi_window_exits),
 -      VCPU_STAT("halt_exits", halt_exits),
 -      VCPU_STAT("halt_successful_poll", halt_successful_poll),
 -      VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
 -      VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
 -      VCPU_STAT("halt_wakeup", halt_wakeup),
 -      VCPU_STAT("hypercalls", hypercalls),
 -      VCPU_STAT("request_irq", request_irq_exits),
 -      VCPU_STAT("irq_exits", irq_exits),
 -      VCPU_STAT("host_state_reload", host_state_reload),
 -      VCPU_STAT("fpu_reload", fpu_reload),
 -      VCPU_STAT("insn_emulation", insn_emulation),
 -      VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
 -      VCPU_STAT("irq_injections", irq_injections),
 -      VCPU_STAT("nmi_injections", nmi_injections),
 -      VCPU_STAT("req_event", req_event),
 -      VCPU_STAT("l1d_flush", l1d_flush),
 -      VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
 -      VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
 -      VCPU_STAT("nested_run", nested_run),
 -      VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
 -      VCPU_STAT("directed_yield_successful", directed_yield_successful),
 -      VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
 -      VM_STAT("mmu_pte_write", mmu_pte_write),
 -      VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
 -      VM_STAT("mmu_flooded", mmu_flooded),
 -      VM_STAT("mmu_recycled", mmu_recycled),
 -      VM_STAT("mmu_cache_miss", mmu_cache_miss),
 -      VM_STAT("mmu_unsync", mmu_unsync),
 -      VM_STAT("remote_tlb_flush", remote_tlb_flush),
 -      VM_STAT("largepages", lpages, .mode = 0444),
 -      VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
 -      VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
 -      { NULL }
 +const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
 +      KVM_GENERIC_VM_STATS(),
 +      STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
 +      STATS_DESC_COUNTER(VM, mmu_pte_write),
 +      STATS_DESC_COUNTER(VM, mmu_pde_zapped),
 +      STATS_DESC_COUNTER(VM, mmu_flooded),
 +      STATS_DESC_COUNTER(VM, mmu_recycled),
 +      STATS_DESC_COUNTER(VM, mmu_cache_miss),
 +      STATS_DESC_ICOUNTER(VM, mmu_unsync),
 +      STATS_DESC_ICOUNTER(VM, lpages),
 +      STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
 +      STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
 +};
 +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
 +              sizeof(struct kvm_vm_stat) / sizeof(u64));
 +
 +const struct kvm_stats_header kvm_vm_stats_header = {
 +      .name_size = KVM_STATS_NAME_SIZE,
 +      .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
 +      .id_offset = sizeof(struct kvm_stats_header),
 +      .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
 +      .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
 +                     sizeof(kvm_vm_stats_desc),
 +};
 +
 +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
 +      KVM_GENERIC_VCPU_STATS(),
 +      STATS_DESC_COUNTER(VCPU, pf_fixed),
 +      STATS_DESC_COUNTER(VCPU, pf_guest),
 +      STATS_DESC_COUNTER(VCPU, tlb_flush),
 +      STATS_DESC_COUNTER(VCPU, invlpg),
 +      STATS_DESC_COUNTER(VCPU, exits),
 +      STATS_DESC_COUNTER(VCPU, io_exits),
 +      STATS_DESC_COUNTER(VCPU, mmio_exits),
 +      STATS_DESC_COUNTER(VCPU, signal_exits),
 +      STATS_DESC_COUNTER(VCPU, irq_window_exits),
 +      STATS_DESC_COUNTER(VCPU, nmi_window_exits),
 +      STATS_DESC_COUNTER(VCPU, l1d_flush),
 +      STATS_DESC_COUNTER(VCPU, halt_exits),
 +      STATS_DESC_COUNTER(VCPU, request_irq_exits),
 +      STATS_DESC_COUNTER(VCPU, irq_exits),
 +      STATS_DESC_COUNTER(VCPU, host_state_reload),
 +      STATS_DESC_COUNTER(VCPU, fpu_reload),
 +      STATS_DESC_COUNTER(VCPU, insn_emulation),
 +      STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
 +      STATS_DESC_COUNTER(VCPU, hypercalls),
 +      STATS_DESC_COUNTER(VCPU, irq_injections),
 +      STATS_DESC_COUNTER(VCPU, nmi_injections),
 +      STATS_DESC_COUNTER(VCPU, req_event),
 +      STATS_DESC_COUNTER(VCPU, nested_run),
 +      STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
 +      STATS_DESC_COUNTER(VCPU, directed_yield_successful),
 +      STATS_DESC_ICOUNTER(VCPU, guest_mode)
 +};
 +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
 +              sizeof(struct kvm_vcpu_stat) / sizeof(u64));
 +
 +const struct kvm_stats_header kvm_vcpu_stats_header = {
 +      .name_size = KVM_STATS_NAME_SIZE,
 +      .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
 +      .id_offset = sizeof(struct kvm_stats_header),
 +      .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
 +      .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
 +                     sizeof(kvm_vcpu_stats_desc),
  };
  
  u64 __read_mostly host_xcr0;
@@@ -807,6 -779,13 +808,6 @@@ int kvm_read_guest_page_mmu(struct kvm_
  }
  EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
  
 -static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 -                             void *data, int offset, int len, u32 access)
 -{
 -      return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
 -                                     data, offset, len, access);
 -}
 -
  static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
  {
        return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@@ -841,7 -820,6 +842,7 @@@ int load_pdptrs(struct kvm_vcpu *vcpu, 
  
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 +      vcpu->arch.pdptrs_from_userspace = false;
  
  out:
  
  }
  EXPORT_SYMBOL_GPL(load_pdptrs);
  
 -bool pdptrs_changed(struct kvm_vcpu *vcpu)
 -{
 -      u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 -      int offset;
 -      gfn_t gfn;
 -      int r;
 -
 -      if (!is_pae_paging(vcpu))
 -              return false;
 -
 -      if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
 -              return true;
 -
 -      gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
 -      offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
 -      r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 -                                     PFERR_USER_MASK | PFERR_WRITE_MASK);
 -      if (r < 0)
 -              return true;
 -
 -      return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 -}
 -EXPORT_SYMBOL_GPL(pdptrs_changed);
 -
  void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
  {
 -      unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 -
        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
                kvm_clear_async_pf_completion_queue(vcpu);
                kvm_async_pf_hash_reset(vcpu);
        }
  
 -      if ((cr0 ^ old_cr0) & update_bits)
 +      if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
                kvm_mmu_reset_context(vcpu);
  
        if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
@@@ -939,7 -943,7 +940,7 @@@ void kvm_load_guest_xsave_state(struct 
            (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
            vcpu->arch.pkru != vcpu->arch.host_pkru)
-               __write_pkru(vcpu->arch.pkru);
+               write_pkru(vcpu->arch.pkru);
  }
  EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
  
@@@ -953,7 -957,7 +954,7 @@@ void kvm_load_host_xsave_state(struct k
             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
                vcpu->arch.pkru = rdpkru();
                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
-                       __write_pkru(vcpu->arch.host_pkru);
+                       write_pkru(vcpu->arch.host_pkru);
        }
  
        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
@@@ -1035,7 -1039,10 +1036,7 @@@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4)
  
  void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
  {
 -      unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
 -                                    X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
 -
 -      if (((cr4 ^ old_cr4) & mmu_role_bits) ||
 +      if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
  }
@@@ -1078,46 -1085,25 +1079,46 @@@ int kvm_set_cr4(struct kvm_vcpu *vcpu, 
  }
  EXPORT_SYMBOL_GPL(kvm_set_cr4);
  
 +static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
 +{
 +      struct kvm_mmu *mmu = vcpu->arch.mmu;
 +      unsigned long roots_to_free = 0;
 +      int i;
 +
 +      /*
 +       * If neither the current CR3 nor any of the prev_roots use the given
 +       * PCID, then nothing needs to be done here because a resync will
 +       * happen anyway before switching to any other CR3.
 +       */
 +      if (kvm_get_active_pcid(vcpu) == pcid) {
 +              kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
 +              kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 +      }
 +
 +      for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 +              if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
 +                      roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 +
 +      kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
 +}
 +
  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
        bool skip_tlb_flush = false;
 +      unsigned long pcid = 0;
  #ifdef CONFIG_X86_64
        bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
  
        if (pcid_enabled) {
                skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
                cr3 &= ~X86_CR3_PCID_NOFLUSH;
 +              pcid = cr3 & X86_CR3_PCID_MASK;
        }
  #endif
  
 -      if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
 -              if (!skip_tlb_flush) {
 -                      kvm_mmu_sync_roots(vcpu);
 -                      kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 -              }
 -              return 0;
 -      }
 +      /* PDPTRs are always reloaded for PAE paging. */
 +      if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
 +              goto handle_tlb_flush;
  
        /*
         * Do not condition the GPA check on long mode, this helper is used to
        if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
                return 1;
  
 -      kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
 +      if (cr3 != kvm_read_cr3(vcpu))
 +              kvm_mmu_new_pgd(vcpu, cr3);
 +
        vcpu->arch.cr3 = cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
  
 +handle_tlb_flush:
 +      /*
 +       * A load of CR3 that flushes the TLB flushes only the current PCID,
 +       * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
 +       * moot point in the end because _disabling_ PCID will flush all PCIDs,
 +       * and it's impossible to use a non-zero PCID when PCID is disabled,
 +       * i.e. only PCID=0 can be relevant.
 +       */
 +      if (!skip_tlb_flush)
 +              kvm_invalidate_pcid(vcpu, pcid);
 +
        return 0;
  }
  EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@@ -2207,15 -2180,13 +2208,15 @@@ static u32 adjust_tsc_khz(u32 khz, s32 
        return v;
  }
  
 +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
 +
  static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
  {
        u64 ratio;
  
        /* Guest TSC same frequency as host TSC? */
        if (!scale) {
 -              vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
 +              kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return 0;
        }
  
                return -1;
        }
  
 -      vcpu->arch.tsc_scaling_ratio = ratio;
 +      kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
        return 0;
  }
  
@@@ -2253,7 -2224,7 +2254,7 @@@ static int kvm_set_tsc_khz(struct kvm_v
        /* tsc_khz can be zero if TSC calibration fails */
        if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
 -              vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
 +              kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return -1;
        }
  
@@@ -2335,9 -2306,10 +2336,9 @@@ static inline u64 __scale_tsc(u64 ratio
        return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
  }
  
 -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
  {
        u64 _tsc = tsc;
 -      u64 ratio = vcpu->arch.tsc_scaling_ratio;
  
        if (ratio != kvm_default_tsc_scaling_ratio)
                _tsc = __scale_tsc(ratio, tsc);
  }
  EXPORT_SYMBOL_GPL(kvm_scale_tsc);
  
 -static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 +static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
  {
        u64 tsc;
  
 -      tsc = kvm_scale_tsc(vcpu, rdtsc());
 +      tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
  
        return target_tsc - tsc;
  }
  
  u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
  {
 -      return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 +      return vcpu->arch.l1_tsc_offset +
 +              kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
  }
  EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
  
 -static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
  {
 -      vcpu->arch.l1_tsc_offset = offset;
 -      vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
 +      u64 nested_offset;
 +
 +      if (l2_multiplier == kvm_default_tsc_scaling_ratio)
 +              nested_offset = l1_offset;
 +      else
 +              nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
 +                                              kvm_tsc_scaling_ratio_frac_bits);
 +
 +      nested_offset += l2_offset;
 +      return nested_offset;
 +}
 +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
 +
 +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
 +{
 +      if (l2_multiplier != kvm_default_tsc_scaling_ratio)
 +              return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
 +                                     kvm_tsc_scaling_ratio_frac_bits);
 +
 +      return l1_multiplier;
 +}
 +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
 +
 +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
 +{
 +      trace_kvm_write_tsc_offset(vcpu->vcpu_id,
 +                                 vcpu->arch.l1_tsc_offset,
 +                                 l1_offset);
 +
 +      vcpu->arch.l1_tsc_offset = l1_offset;
 +
 +      /*
 +       * If we are here because L1 chose not to trap WRMSR to TSC then
 +       * according to the spec this should set L1's TSC (as opposed to
 +       * setting L1's offset for L2).
 +       */
 +      if (is_guest_mode(vcpu))
 +              vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
 +                      l1_offset,
 +                      static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
 +                      static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
 +      else
 +              vcpu->arch.tsc_offset = l1_offset;
 +
 +      static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
 +}
 +
 +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
 +{
 +      vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
 +
 +      /* Userspace is changing the multiplier while L2 is active */
 +      if (is_guest_mode(vcpu))
 +              vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
 +                      l1_multiplier,
 +                      static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
 +      else
 +              vcpu->arch.tsc_scaling_ratio = l1_multiplier;
 +
 +      if (kvm_has_tsc_control)
 +              static_call(kvm_x86_write_tsc_multiplier)(
 +                      vcpu, vcpu->arch.tsc_scaling_ratio);
  }
  
  static inline bool kvm_check_tsc_unstable(void)
@@@ -2451,7 -2362,7 +2452,7 @@@ static void kvm_synchronize_tsc(struct 
        bool synchronizing = false;
  
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 -      offset = kvm_compute_tsc_offset(vcpu, data);
 +      offset = kvm_compute_l1_tsc_offset(vcpu, data);
        ns = get_kvmclock_base_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
  
                } else {
                        u64 delta = nsec_to_cycles(vcpu, elapsed);
                        data += delta;
 -                      offset = kvm_compute_tsc_offset(vcpu, data);
 +                      offset = kvm_compute_l1_tsc_offset(vcpu, data);
                }
                matched = true;
                already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
@@@ -2549,10 -2460,9 +2550,10 @@@ static inline void adjust_tsc_offset_gu
  
  static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
  {
 -      if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
 +      if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
                WARN_ON(adjustment < 0);
 -      adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
 +      adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
 +                                 vcpu->arch.l1_tsc_scaling_ratio);
        adjust_tsc_offset_guest(vcpu, adjustment);
  }
  
@@@ -2935,8 -2845,7 +2936,8 @@@ static int kvm_guest_time_update(struc
        /* With all the info we got, fill in the values */
  
        if (kvm_has_tsc_control)
 -              tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
 +              tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
 +                                          v->arch.l1_tsc_scaling_ratio);
  
        if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
                kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@@ -3342,7 -3251,7 +3343,7 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                if (msr_info->host_initiated) {
                        kvm_synchronize_tsc(vcpu, data);
                } else {
 -                      u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
 +                      u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
                        adjust_tsc_offset_guest(vcpu, adj);
                        vcpu->arch.ia32_tsc_adjust_msr += adj;
                }
@@@ -3644,17 -3553,10 +3645,17 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                 * return L1's TSC value to ensure backwards-compatible
                 * behavior for migration.
                 */
 -              u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
 -                                                          vcpu->arch.tsc_offset;
 +              u64 offset, ratio;
  
 -              msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
 +              if (msr_info->host_initiated) {
 +                      offset = vcpu->arch.l1_tsc_offset;
 +                      ratio = vcpu->arch.l1_tsc_scaling_ratio;
 +              } else {
 +                      offset = vcpu->arch.tsc_offset;
 +                      ratio = vcpu->arch.tsc_scaling_ratio;
 +              }
 +
 +              msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
                break;
        }
        case MSR_MTRRcap:
@@@ -3978,7 -3880,6 +3979,7 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_HYPERV_TLBFLUSH:
        case KVM_CAP_HYPERV_SEND_IPI:
        case KVM_CAP_HYPERV_CPUID:
 +      case KVM_CAP_HYPERV_ENFORCE_CPUID:
        case KVM_CAP_SYS_HYPERV_CPUID:
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
        case KVM_CAP_SGX_ATTRIBUTE:
  #endif
        case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
 +      case KVM_CAP_SREGS2:
 +      case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
                r = 1;
                break;
 +      case KVM_CAP_EXIT_HYPERCALL:
 +              r = KVM_EXIT_HYPERCALL_VALID_MASK;
 +              break;
        case KVM_CAP_SET_GUEST_DEBUG2:
                return KVM_GUESTDBG_VALID_MASK;
  #ifdef CONFIG_KVM_XEN
@@@ -4243,7 -4139,7 +4244,7 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
                        mark_tsc_unstable("KVM discovered backwards TSC");
  
                if (kvm_check_tsc_unstable()) {
 -                      u64 offset = kvm_compute_tsc_offset(vcpu,
 +                      u64 offset = kvm_compute_l1_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
                        kvm_vcpu_write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
@@@ -4562,7 -4458,7 +4563,7 @@@ static void kvm_vcpu_ioctl_x86_get_vcpu
        memset(&events->reserved, 0, sizeof(events->reserved));
  }
  
 -static void kvm_smm_changed(struct kvm_vcpu *vcpu);
 +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
  
  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
  
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
 -              if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
 -                      if (events->smi.smm)
 -                              vcpu->arch.hflags |= HF_SMM_MASK;
 -                      else
 -                              vcpu->arch.hflags &= ~HF_SMM_MASK;
 -                      kvm_smm_changed(vcpu);
 -              }
 +              if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
 +                      kvm_smm_changed(vcpu, events->smi.smm);
  
                vcpu->arch.smi_pending = events->smi.pending;
  
@@@ -4704,20 -4605,21 +4705,21 @@@ static void fill_xsave(u8 *dest, struc
         */
        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
        while (valid) {
+               u32 size, offset, ecx, edx;
                u64 xfeature_mask = valid & -valid;
                int xfeature_nr = fls64(xfeature_mask) - 1;
-               void *src = get_xsave_addr(xsave, xfeature_nr);
-               if (src) {
-                       u32 size, offset, ecx, edx;
-                       cpuid_count(XSTATE_CPUID, xfeature_nr,
-                                   &size, &offset, &ecx, &edx);
-                       if (xfeature_nr == XFEATURE_PKRU)
-                               memcpy(dest + offset, &vcpu->arch.pkru,
-                                      sizeof(vcpu->arch.pkru));
-                       else
-                               memcpy(dest + offset, src, size);
+               void *src;
+               cpuid_count(XSTATE_CPUID, xfeature_nr,
+                           &size, &offset, &ecx, &edx);
  
+               if (xfeature_nr == XFEATURE_PKRU) {
+                       memcpy(dest + offset, &vcpu->arch.pkru,
+                              sizeof(vcpu->arch.pkru));
+               } else {
+                       src = get_xsave_addr(xsave, xfeature_nr);
+                       if (src)
+                               memcpy(dest + offset, src, size);
                }
  
                valid -= xfeature_mask;
@@@ -4747,18 -4649,20 +4749,20 @@@ static void load_xsave(struct kvm_vcpu 
         */
        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
        while (valid) {
+               u32 size, offset, ecx, edx;
                u64 xfeature_mask = valid & -valid;
                int xfeature_nr = fls64(xfeature_mask) - 1;
-               void *dest = get_xsave_addr(xsave, xfeature_nr);
-               if (dest) {
-                       u32 size, offset, ecx, edx;
-                       cpuid_count(XSTATE_CPUID, xfeature_nr,
-                                   &size, &offset, &ecx, &edx);
-                       if (xfeature_nr == XFEATURE_PKRU)
-                               memcpy(&vcpu->arch.pkru, src + offset,
-                                      sizeof(vcpu->arch.pkru));
-                       else
+               cpuid_count(XSTATE_CPUID, xfeature_nr,
+                           &size, &offset, &ecx, &edx);
+               if (xfeature_nr == XFEATURE_PKRU) {
+                       memcpy(&vcpu->arch.pkru, src + offset,
+                              sizeof(vcpu->arch.pkru));
+               } else {
+                       void *dest = get_xsave_addr(xsave, xfeature_nr);
+                       if (dest)
                                memcpy(dest, src + offset, size);
                }
  
@@@ -4907,9 -4811,6 +4911,9 @@@ static int kvm_vcpu_ioctl_enable_cap(st
  
                return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
  
 +      case KVM_CAP_HYPERV_ENFORCE_CPUID:
 +              return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
 +
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                vcpu->arch.pv_cpuid.enforce = cap->args[0];
                if (vcpu->arch.pv_cpuid.enforce)
@@@ -4928,7 -4829,6 +4932,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
        void __user *argp = (void __user *)arg;
        int r;
        union {
 +              struct kvm_sregs2 *sregs2;
                struct kvm_lapic_state *lapic;
                struct kvm_xsave *xsave;
                struct kvm_xcrs *xcrs;
                break;
        }
  #endif
 +      case KVM_GET_SREGS2: {
 +              u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
 +              r = -ENOMEM;
 +              if (!u.sregs2)
 +                      goto out;
 +              __get_sregs2(vcpu, u.sregs2);
 +              r = -EFAULT;
 +              if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
 +                      goto out;
 +              r = 0;
 +              break;
 +      }
 +      case KVM_SET_SREGS2: {
 +              u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
 +              if (IS_ERR(u.sregs2)) {
 +                      r = PTR_ERR(u.sregs2);
 +                      u.sregs2 = NULL;
 +                      goto out;
 +              }
 +              r = __set_sregs2(vcpu, u.sregs2);
 +              break;
 +      }
        default:
                r = -EINVAL;
        }
@@@ -5642,21 -5520,6 +5646,21 @@@ split_irqchip_unlock
                if (kvm_x86_ops.vm_copy_enc_context_from)
                        r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
                return r;
 +      case KVM_CAP_EXIT_HYPERCALL:
 +              if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
 +                      r = -EINVAL;
 +                      break;
 +              }
 +              kvm->arch.hypercall_exit_enabled = cap->args[0];
 +              r = 0;
 +              break;
 +      case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
 +              r = -EINVAL;
 +              if (cap->args[0] & ~1)
 +                      break;
 +              kvm->arch.exit_on_emulation_error = cap->args[0];
 +              r = 0;
 +              break;
        default:
                r = -EINVAL;
                break;
@@@ -5771,41 -5634,6 +5775,41 @@@ static int kvm_vm_ioctl_set_msr_filter(
        return 0;
  }
  
 +#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 +static int kvm_arch_suspend_notifier(struct kvm *kvm)
 +{
 +      struct kvm_vcpu *vcpu;
 +      int i, ret = 0;
 +
 +      mutex_lock(&kvm->lock);
 +      kvm_for_each_vcpu(i, vcpu, kvm) {
 +              if (!vcpu->arch.pv_time_enabled)
 +                      continue;
 +
 +              ret = kvm_set_guest_paused(vcpu);
 +              if (ret) {
 +                      kvm_err("Failed to pause guest VCPU%d: %d\n",
 +                              vcpu->vcpu_id, ret);
 +                      break;
 +              }
 +      }
 +      mutex_unlock(&kvm->lock);
 +
 +      return ret ? NOTIFY_BAD : NOTIFY_DONE;
 +}
 +
 +int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
 +{
 +      switch (state) {
 +      case PM_HIBERNATION_PREPARE:
 +      case PM_SUSPEND_PREPARE:
 +              return kvm_arch_suspend_notifier(kvm);
 +      }
 +
 +      return NOTIFY_DONE;
 +}
 +#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 +
  long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
  {
@@@ -7280,22 -7108,23 +7284,22 @@@ static unsigned emulator_get_hflags(str
        return emul_to_vcpu(ctxt)->arch.hflags;
  }
  
 -static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
 +static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
  {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
  
 -      vcpu->arch.hflags = emul_flags;
 -      kvm_mmu_reset_context(vcpu);
 +      kvm_smm_changed(vcpu, false);
  }
  
 -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
 +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
                                  const char *smstate)
  {
 -      return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
 +      return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
  }
  
 -static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
 +static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
  {
 -      kvm_smm_changed(emul_to_vcpu(ctxt));
 +      kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
  }
  
  static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
@@@ -7344,9 -7173,9 +7348,9 @@@ static const struct x86_emulate_ops emu
        .guest_has_fxsr      = emulator_guest_has_fxsr,
        .set_nmi_mask        = emulator_set_nmi_mask,
        .get_hflags          = emulator_get_hflags,
 -      .set_hflags          = emulator_set_hflags,
 -      .pre_leave_smm       = emulator_pre_leave_smm,
 -      .post_leave_smm      = emulator_post_leave_smm,
 +      .exiting_smm         = emulator_exiting_smm,
 +      .leave_smm           = emulator_leave_smm,
 +      .triple_fault        = emulator_triple_fault,
        .set_xcr             = emulator_set_xcr,
  };
  
@@@ -7452,33 -7281,8 +7456,33 @@@ void kvm_inject_realmode_interrupt(stru
  }
  EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
  
 +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
 +{
 +      struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 +      u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
 +      struct kvm_run *run = vcpu->run;
 +
 +      run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 +      run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
 +      run->emulation_failure.ndata = 0;
 +      run->emulation_failure.flags = 0;
 +
 +      if (insn_size) {
 +              run->emulation_failure.ndata = 3;
 +              run->emulation_failure.flags |=
 +                      KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
 +              run->emulation_failure.insn_size = insn_size;
 +              memset(run->emulation_failure.insn_bytes, 0x90,
 +                     sizeof(run->emulation_failure.insn_bytes));
 +              memcpy(run->emulation_failure.insn_bytes,
 +                     ctxt->fetch.data, insn_size);
 +      }
 +}
 +
  static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
  {
 +      struct kvm *kvm = vcpu->kvm;
 +
        ++vcpu->stat.insn_emulation_fail;
        trace_kvm_emulate_insn_failed(vcpu);
  
                return 1;
        }
  
 -      if (emulation_type & EMULTYPE_SKIP) {
 -              vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 -              vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 -              vcpu->run->internal.ndata = 0;
 +      if (kvm->arch.exit_on_emulation_error ||
 +          (emulation_type & EMULTYPE_SKIP)) {
 +              prepare_emulation_failure_exit(vcpu);
                return 0;
        }
  
@@@ -7631,14 -7436,11 +7635,14 @@@ static bool retry_instruction(struct x8
  static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
  static int complete_emulated_pio(struct kvm_vcpu *vcpu);
  
 -static void kvm_smm_changed(struct kvm_vcpu *vcpu)
 +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
  {
 -      if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
 -              /* This is a good place to trace that we are exiting SMM.  */
 -              trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
 +      trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
 +
 +      if (entering_smm) {
 +              vcpu->arch.hflags |= HF_SMM_MASK;
 +      } else {
 +              vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
  
                /* Process a latched INIT or SMI, if any.  */
                kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -8563,15 -8365,16 +8567,15 @@@ bool kvm_apicv_activated(struct kvm *kv
  }
  EXPORT_SYMBOL_GPL(kvm_apicv_activated);
  
 -void kvm_apicv_init(struct kvm *kvm, bool enable)
 +static void kvm_apicv_init(struct kvm *kvm)
  {
 -      if (enable)
 +      if (enable_apicv)
                clear_bit(APICV_INHIBIT_REASON_DISABLE,
                          &kvm->arch.apicv_inhibit_reasons);
        else
                set_bit(APICV_INHIBIT_REASON_DISABLE,
                        &kvm->arch.apicv_inhibit_reasons);
  }
 -EXPORT_SYMBOL_GPL(kvm_apicv_init);
  
  static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
  {
@@@ -8607,17 -8410,6 +8611,17 @@@ no_yield
        return;
  }
  
 +static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
 +{
 +      u64 ret = vcpu->run->hypercall.ret;
 +
 +      if (!is_64_bit_mode(vcpu))
 +              ret = (u32)ret;
 +      kvm_rax_write(vcpu, ret);
 +      ++vcpu->stat.hypercalls;
 +      return kvm_skip_emulated_instruction(vcpu);
 +}
 +
  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
  {
        unsigned long nr, a0, a1, a2, a3, ret;
                kvm_sched_yield(vcpu, a0);
                ret = 0;
                break;
 +      case KVM_HC_MAP_GPA_RANGE: {
 +              u64 gpa = a0, npages = a1, attrs = a2;
 +
 +              ret = -KVM_ENOSYS;
 +              if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
 +                      break;
 +
 +              if (!PAGE_ALIGNED(gpa) || !npages ||
 +                  gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
 +                      ret = -KVM_EINVAL;
 +                      break;
 +              }
 +
 +              vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
 +              vcpu->run->hypercall.nr       = KVM_HC_MAP_GPA_RANGE;
 +              vcpu->run->hypercall.args[0]  = gpa;
 +              vcpu->run->hypercall.args[1]  = npages;
 +              vcpu->run->hypercall.args[2]  = attrs;
 +              vcpu->run->hypercall.longmode = op_64_bit;
 +              vcpu->arch.complete_userspace_io = complete_hypercall_exit;
 +              return 0;
 +      }
        default:
                ret = -KVM_ENOSYS;
                break;
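Editor's note: from userspace this exit is completed by writing a status into run->hypercall.ret and re-entering the vCPU, at which point complete_hypercall_exit() above copies it into the guest's RAX. A minimal sketch under those assumptions; convert_gpa_range() is a made-up placeholder for the VMM's own page-state handling, not a real API.

    #include <stdint.h>
    #include <linux/kvm.h>        /* struct kvm_run, KVM_EXIT_HYPERCALL */
    #include <linux/kvm_para.h>   /* KVM_HC_MAP_GPA_RANGE */

    /* Placeholder for the VMM's shared<->private conversion logic. */
    static uint64_t convert_gpa_range(uint64_t gpa, uint64_t npages, uint64_t attrs)
    {
            (void)gpa; (void)npages; (void)attrs;
            return 0;                       /* 0 == success, by convention here */
    }

    static void handle_map_gpa_range(struct kvm_run *run)
    {
            if (run->exit_reason != KVM_EXIT_HYPERCALL ||
                run->hypercall.nr != KVM_HC_MAP_GPA_RANGE)
                    return;

            /* Whatever lands in hypercall.ret is written to guest RAX by
             * complete_hypercall_exit() on the next KVM_RUN. */
            run->hypercall.ret = convert_gpa_range(run->hypercall.args[0],
                                                   run->hypercall.args[1],
                                                   run->hypercall.args[2]);
    }
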
@@@ -8788,6 -8558,9 +8792,6 @@@ static void update_cr8_intercept(struc
  
  int kvm_check_nested_events(struct kvm_vcpu *vcpu)
  {
 -      if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
 -              return -EIO;
 -
        if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
                kvm_x86_ops.nested_ops->triple_fault(vcpu);
                return 1;
@@@ -8803,7 -8576,7 +8807,7 @@@ static void kvm_inject_exception(struc
        static_call(kvm_x86_queue_exception)(vcpu);
  }
  
 -static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 +static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
  {
        int r;
        bool can_inject = true;
        if (is_guest_mode(vcpu)) {
                r = kvm_check_nested_events(vcpu);
                if (r < 0)
 -                      goto busy;
 +                      goto out;
        }
  
        /* try to inject new event if pending */
        if (vcpu->arch.smi_pending) {
                r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
 -                      goto busy;
 +                      goto out;
                if (r) {
                        vcpu->arch.smi_pending = false;
                        ++vcpu->arch.smi_count;
        if (vcpu->arch.nmi_pending) {
                r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
 -                      goto busy;
 +                      goto out;
                if (r) {
                        --vcpu->arch.nmi_pending;
                        vcpu->arch.nmi_injected = true;
        if (kvm_cpu_has_injectable_intr(vcpu)) {
                r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
 -                      goto busy;
 +                      goto out;
                if (r) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
                        static_call(kvm_x86_set_irq)(vcpu);
                *req_immediate_exit = true;
  
        WARN_ON(vcpu->arch.exception.pending);
 -      return;
 +      return 0;
  
 -busy:
 -      *req_immediate_exit = true;
 -      return;
 +out:
 +      if (r == -EBUSY) {
 +              *req_immediate_exit = true;
 +              r = 0;
 +      }
 +      return r;
  }
  
  static void process_nmi(struct kvm_vcpu *vcpu)
@@@ -9122,9 -8892,10 +9126,9 @@@ static void enter_smm(struct kvm_vcpu *
  {
        struct kvm_segment cs, ds;
        struct desc_ptr dt;
 +      unsigned long cr0;
        char buf[512];
 -      u32 cr0;
  
 -      trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        memset(buf, 0, 512);
  #ifdef CONFIG_X86_64
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
                enter_smm_save_state_32(vcpu, buf);
  
        /*
 -       * Give pre_enter_smm() a chance to make ISA-specific changes to the
 -       * vCPU state (e.g. leave guest mode) after we've saved the state into
 -       * the SMM state-save area.
 +       * Give enter_smm() a chance to make ISA-specific changes to the vCPU
 +       * state (e.g. leave guest mode) after we've saved the state into the
 +       * SMM state-save area.
         */
 -      static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
 +      static_call(kvm_x86_enter_smm)(vcpu, buf);
  
 -      vcpu->arch.hflags |= HF_SMM_MASK;
 +      kvm_smm_changed(vcpu, true);
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
  
        if (static_call(kvm_x86_get_nmi_mask)(vcpu))
@@@ -9229,15 -9000,6 +9233,15 @@@ void kvm_vcpu_update_apicv(struct kvm_v
        vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
        kvm_apic_update_apicv(vcpu);
        static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
 +
 +      /*
 +       * When APICv gets disabled, we may still have injected interrupts
 +       * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
 +       * still active when the interrupt got accepted. Make sure
 +       * inject_pending_event() is called to check for that.
 +       */
 +      if (!vcpu->arch.apicv_active)
 +              kvm_make_request(KVM_REQ_EVENT, vcpu);
  }
  EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
  
@@@ -9413,7 -9175,7 +9417,7 @@@ static int vcpu_enter_guest(struct kvm_
                }
                if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
                        kvm_vcpu_flush_tlb_current(vcpu);
 -              if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
 +              if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
                        kvm_vcpu_flush_tlb_guest(vcpu);
  
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
            kvm_xen_has_interrupt(vcpu)) {
                ++vcpu->stat.req_event;
 -              kvm_apic_accept_events(vcpu);
 +              r = kvm_apic_accept_events(vcpu);
 +              if (r < 0) {
 +                      r = 0;
 +                      goto out;
 +              }
                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        r = 1;
                        goto out;
                }
  
 -              inject_pending_event(vcpu, &req_immediate_exit);
 +              r = inject_pending_event(vcpu, &req_immediate_exit);
 +              if (r < 0) {
 +                      r = 0;
 +                      goto out;
 +              }
                if (req_int_win)
                        static_call(kvm_x86_enable_irq_window)(vcpu);
  
@@@ -9722,8 -9476,7 +9726,8 @@@ static inline int vcpu_block(struct kv
                        return 1;
        }
  
 -      kvm_apic_accept_events(vcpu);
 +      if (kvm_apic_accept_events(vcpu) < 0)
 +              return 0;
        switch(vcpu->arch.mp_state) {
        case KVM_MP_STATE_HALTED:
        case KVM_MP_STATE_AP_RESET_HOLD:
@@@ -9885,7 -9638,7 +9889,7 @@@ static void kvm_save_current_fpu(struc
                memcpy(&fpu->state, &current->thread.fpu.state,
                       fpu_kernel_xstate_size);
        else
-               copy_fpregs_to_fpstate(fpu);
+               save_fpregs_to_fpstate(fpu);
  }
  
  /* Swap (qemu) user FPU context for the guest FPU context. */
@@@ -9901,7 -9654,7 +9905,7 @@@ static void kvm_load_guest_fpu(struct k
         */
        if (vcpu->arch.guest_fpu)
                /* PKRU is separately restored in kvm_x86_ops.run. */
-               __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+               __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
                                        ~XFEATURE_MASK_PKRU);
  
        fpregs_mark_activate();
@@@ -9922,7 -9675,7 +9926,7 @@@ static void kvm_put_guest_fpu(struct kv
        if (vcpu->arch.guest_fpu)
                kvm_save_current_fpu(vcpu->arch.guest_fpu);
  
-       copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
+       restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
  
        fpregs_mark_activate();
        fpregs_unlock();
@@@ -9947,10 -9700,7 +9951,10 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                        goto out;
                }
                kvm_vcpu_block(vcpu);
 -              kvm_apic_accept_events(vcpu);
 +              if (kvm_apic_accept_events(vcpu) < 0) {
 +                      r = 0;
 +                      goto out;
 +              }
                kvm_clear_request(KVM_REQ_UNHALT, vcpu);
                r = -EAGAIN;
                if (signal_pending(current)) {
@@@ -10099,7 -9849,7 +10103,7 @@@ void kvm_get_cs_db_l_bits(struct kvm_vc
  }
  EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
  
 -static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 +static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
  {
        struct desc_ptr dt;
  
@@@ -10132,36 -9882,14 +10136,36 @@@ skip_protected_regs
        sregs->cr8 = kvm_get_cr8(vcpu);
        sregs->efer = vcpu->arch.efer;
        sregs->apic_base = kvm_get_apic_base(vcpu);
 +}
 +
 +static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 +{
 +      __get_sregs_common(vcpu, sregs);
  
 -      memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
 +      if (vcpu->arch.guest_state_protected)
 +              return;
  
        if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
                set_bit(vcpu->arch.interrupt.nr,
                        (unsigned long *)sregs->interrupt_bitmap);
  }
  
 +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
 +{
 +      int i;
 +
 +      __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
 +
 +      if (vcpu->arch.guest_state_protected)
 +              return;
 +
 +      if (is_pae_paging(vcpu)) {
 +              for (i = 0 ; i < 4 ; i++)
 +                      sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
 +              sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
 +      }
 +}
 +
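Editor's note: the new layout is what the SREGS2 vCPU ioctls added in this series expose to userspace. A hedged VMM-side sketch of reading the PDPTRs; KVM_GET_SREGS2 and the flag name are assumed from the uapi, and vcpu_fd is a hypothetical vCPU file descriptor.

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void dump_pdptrs(int vcpu_fd)
    {
            struct kvm_sregs2 sregs2;

            if (ioctl(vcpu_fd, KVM_GET_SREGS2, &sregs2) < 0) {
                    perror("KVM_GET_SREGS2");
                    return;
            }

            if (!(sregs2.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID))
                    return;         /* guest is not using PAE paging */

            for (int i = 0; i < 4; i++)
                    printf("pdptr[%d] = 0x%llx\n", i,
                           (unsigned long long)sregs2.pdptrs[i]);
    }
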
  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
  {
  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
 +      int r;
 +
        vcpu_load(vcpu);
        if (kvm_mpx_supported())
                kvm_load_guest_fpu(vcpu);
  
 -      kvm_apic_accept_events(vcpu);
 +      r = kvm_apic_accept_events(vcpu);
 +      if (r < 0)
 +              goto out;
 +      r = 0;
 +
        if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
             vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
            vcpu->arch.pv.pv_unhalted)
        else
                mp_state->mp_state = vcpu->arch.mp_state;
  
 +out:
        if (kvm_mpx_supported())
                kvm_put_guest_fpu(vcpu);
        vcpu_put(vcpu);
 -      return 0;
 +      return r;
  }
  
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
@@@ -10280,23 -10001,24 +10284,23 @@@ static bool kvm_is_valid_sregs(struct k
        return kvm_is_valid_cr4(vcpu, sregs->cr4);
  }
  
 -static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 +static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
 +              int *mmu_reset_needed, bool update_pdptrs)
  {
        struct msr_data apic_base_msr;
 -      int mmu_reset_needed = 0;
 -      int pending_vec, max_bits, idx;
 +      int idx;
        struct desc_ptr dt;
 -      int ret = -EINVAL;
  
        if (!kvm_is_valid_sregs(vcpu, sregs))
 -              goto out;
 +              return -EINVAL;
  
        apic_base_msr.data = sregs->apic_base;
        apic_base_msr.host_initiated = true;
        if (kvm_set_apic_base(vcpu, &apic_base_msr))
 -              goto out;
 +              return -EINVAL;
  
        if (vcpu->arch.guest_state_protected)
 -              goto skip_protected_regs;
 +              return 0;
  
        dt.size = sregs->idt.limit;
        dt.address = sregs->idt.base;
        static_call(kvm_x86_set_gdt)(vcpu, &dt);
  
        vcpu->arch.cr2 = sregs->cr2;
 -      mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
 +      *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
  
        kvm_set_cr8(vcpu, sregs->cr8);
  
 -      mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
 +      *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
        static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
  
 -      mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
 +      *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
        vcpu->arch.cr0 = sregs->cr0;
  
 -      mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 +      *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
  
 -      idx = srcu_read_lock(&vcpu->kvm->srcu);
 -      if (is_pae_paging(vcpu)) {
 -              load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
 -              mmu_reset_needed = 1;
 +      if (update_pdptrs) {
 +              idx = srcu_read_lock(&vcpu->kvm->srcu);
 +              if (is_pae_paging(vcpu)) {
 +                      load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
 +                      *mmu_reset_needed = 1;
 +              }
 +              srcu_read_unlock(&vcpu->kvm->srcu, idx);
        }
 -      srcu_read_unlock(&vcpu->kvm->srcu, idx);
 -
 -      if (mmu_reset_needed)
 -              kvm_mmu_reset_context(vcpu);
  
        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
            !is_protmode(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  
 -skip_protected_regs:
 +      return 0;
 +}
 +
 +static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 +{
 +      int pending_vec, max_bits;
 +      int mmu_reset_needed = 0;
 +      int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
 +
 +      if (ret)
 +              return ret;
 +
 +      if (mmu_reset_needed)
 +              kvm_mmu_reset_context(vcpu);
 +
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
 +
        if (pending_vec < max_bits) {
                kvm_queue_interrupt(vcpu, pending_vec, false);
                pr_debug("Set back pending irq %d\n", pending_vec);
 +              kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 +      return 0;
 +}
  
 -      kvm_make_request(KVM_REQ_EVENT, vcpu);
 +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
 +{
 +      int mmu_reset_needed = 0;
 +      bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
 +      bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
 +              !(sregs2->efer & EFER_LMA);
 +      int i, ret;
  
 -      ret = 0;
 -out:
 -      return ret;
 +      if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
 +              return -EINVAL;
 +
 +      if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
 +              return -EINVAL;
 +
 +      ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
 +                               &mmu_reset_needed, !valid_pdptrs);
 +      if (ret)
 +              return ret;
 +
 +      if (valid_pdptrs) {
 +              for (i = 0; i < 4 ; i++)
 +                      kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
 +
 +              kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 +              mmu_reset_needed = 1;
 +              vcpu->arch.pdptrs_from_userspace = true;
 +      }
 +      if (mmu_reset_needed)
 +              kvm_mmu_reset_context(vcpu);
 +      return 0;
  }
  
  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@@ -10629,13 -10309,13 +10633,13 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        struct page *page;
        int r;
  
 +      vcpu->arch.last_vmentry_cpu = -1;
 +
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
  
 -      kvm_set_tsc_khz(vcpu, max_tsc_khz);
 -
        r = kvm_mmu_create(vcpu);
        if (r < 0)
                return r;
        vcpu->arch.pending_external_vector = -1;
        vcpu->arch.preempted_in_kernel = false;
  
 +#if IS_ENABLED(CONFIG_HYPERV)
 +      vcpu->arch.hv_root_tdp = INVALID_PAGE;
 +#endif
 +
        r = static_call(kvm_x86_vcpu_create)(vcpu);
        if (r)
                goto free_guest_fpu;
        vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
        kvm_vcpu_mtrr_init(vcpu);
        vcpu_load(vcpu);
 +      kvm_set_tsc_khz(vcpu, max_tsc_khz);
        kvm_vcpu_reset(vcpu, false);
 -      kvm_init_mmu(vcpu, false);
 +      kvm_init_mmu(vcpu);
        vcpu_put(vcpu);
        return 0;
  
@@@ -10783,8 -10458,6 +10787,8 @@@ void kvm_arch_vcpu_destroy(struct kvm_v
  
  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  {
 +      unsigned long old_cr0 = kvm_read_cr0(vcpu);
 +
        kvm_lapic_reset(vcpu, init_event);
  
        vcpu->arch.hflags = 0;
        vcpu->arch.ia32_xss = 0;
  
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
 +
 +      /*
 +       * Reset the MMU context if paging was enabled prior to INIT (which is
 +       * implied if CR0.PG=1 as CR0 will be '0' prior to RESET).  Unlike the
 +       * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
 +       * checked because it is unconditionally cleared on INIT and all other
 +       * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
 +       * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
 +       */
 +      if (old_cr0 & X86_CR0_PG)
 +              kvm_mmu_reset_context(vcpu);
  }
  
  void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@@ -10981,9 -10643,6 +10985,9 @@@ int kvm_arch_hardware_setup(void *opaqu
        int r;
  
        rdmsrl_safe(MSR_EFER, &host_efer);
 +      if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) &&
 +                       !(host_efer & EFER_NX)))
 +              return -EIO;
  
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
@@@ -11099,15 -10758,9 +11103,15 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
  
        kvm->arch.guest_can_read_msr_platform_info = true;
  
 +#if IS_ENABLED(CONFIG_HYPERV)
 +      spin_lock_init(&kvm->arch.hv_root_tdp_lock);
 +      kvm->arch.hv_root_tdp = INVALID_PAGE;
 +#endif
 +
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
  
 +      kvm_apicv_init(kvm);
        kvm_hv_init_vm(kvm);
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
@@@ -11268,23 -10921,17 +11272,23 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kvm_hv_destroy_vm(kvm);
  }
  
 -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 +static void memslot_rmap_free(struct kvm_memory_slot *slot)
  {
        int i;
  
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.rmap[i]);
                slot->arch.rmap[i] = NULL;
 +      }
 +}
  
 -              if (i == 0)
 -                      continue;
 +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 +{
 +      int i;
  
 +      memslot_rmap_free(slot);
 +
 +      for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
        kvm_page_track_free_memslot(slot);
  }
  
 -static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
 -                                    unsigned long npages)
 +static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
 +                            unsigned long npages)
  {
 +      const int sz = sizeof(*slot->arch.rmap[0]);
        int i;
  
 +      for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 +              int level = i + 1;
 +              int lpages = gfn_to_index(slot->base_gfn + npages - 1,
 +                                        slot->base_gfn, level) + 1;
 +
 +              WARN_ON(slot->arch.rmap[i]);
 +
 +              slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
 +              if (!slot->arch.rmap[i]) {
 +                      memslot_rmap_free(slot);
 +                      return -ENOMEM;
 +              }
 +      }
 +
 +      return 0;
 +}
 +
 +int alloc_all_memslots_rmaps(struct kvm *kvm)
 +{
 +      struct kvm_memslots *slots;
 +      struct kvm_memory_slot *slot;
 +      int r, i;
 +
 +      /*
 +       * Check if memslots already have rmaps early before acquiring
 +       * the slots_arch_lock below.
 +       */
 +      if (kvm_memslots_have_rmaps(kvm))
 +              return 0;
 +
 +      mutex_lock(&kvm->slots_arch_lock);
 +
 +      /*
 +       * Read memslots_have_rmaps again, under the slots arch lock,
 +       * before allocating the rmaps
 +       * before allocating the rmaps.
 +      if (kvm_memslots_have_rmaps(kvm)) {
 +              mutex_unlock(&kvm->slots_arch_lock);
 +              return 0;
 +      }
 +
 +      for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 +              slots = __kvm_memslots(kvm, i);
 +              kvm_for_each_memslot(slot, slots) {
 +                      r = memslot_rmap_alloc(slot, slot->npages);
 +                      if (r) {
 +                              mutex_unlock(&kvm->slots_arch_lock);
 +                              return r;
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * Ensure that memslots_have_rmaps becomes true strictly after
 +       * all the rmap pointers are set.
 +       */
 +      smp_store_release(&kvm->arch.memslots_have_rmaps, true);
 +      mutex_unlock(&kvm->slots_arch_lock);
 +      return 0;
 +}
 +
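Editor's note: alloc_all_memslots_rmaps() is a double-checked publish: a racy early check, the lock, a re-check under the lock, and finally smp_store_release() so that anyone who observes memslots_have_rmaps as true also sees the populated rmap pointers (readers are expected to pair this with an acquire load). A stand-alone user-space analogue of the same pattern with C11 atomics, purely for illustration:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    static _Atomic int have_rmaps;
    static int *rmaps;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Allocate 'rmaps' at most once; the release store publishes the
     * allocation, the acquire load makes it visible to lock-free readers. */
    static int alloc_rmaps_once(size_t n)
    {
            if (atomic_load_explicit(&have_rmaps, memory_order_acquire))
                    return 0;               /* fast path: already allocated */

            pthread_mutex_lock(&lock);
            if (!atomic_load_explicit(&have_rmaps, memory_order_relaxed)) {
                    rmaps = calloc(n, sizeof(*rmaps));
                    if (!rmaps) {
                            pthread_mutex_unlock(&lock);
                            return -1;
                    }
                    /* set the flag only after the pointer is fully set up */
                    atomic_store_explicit(&have_rmaps, 1, memory_order_release);
            }
            pthread_mutex_unlock(&lock);
            return 0;
    }
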
 +static int kvm_alloc_memslot_metadata(struct kvm *kvm,
 +                                    struct kvm_memory_slot *slot,
 +                                    unsigned long npages)
 +{
 +      int i, r;
 +
        /*
         * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
         * old arrays will be freed by __kvm_set_memory_region() if installing
         */
        memset(&slot->arch, 0, sizeof(slot->arch));
  
 -      for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 +      if (kvm_memslots_have_rmaps(kvm)) {
 +              r = memslot_rmap_alloc(slot, npages);
 +              if (r)
 +                      return r;
 +      }
 +
 +      for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
                lpages = gfn_to_index(slot->base_gfn + npages - 1,
                                      slot->base_gfn, level) + 1;
  
 -              slot->arch.rmap[i] =
 -                      kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
 -                               GFP_KERNEL_ACCOUNT);
 -              if (!slot->arch.rmap[i])
 -                      goto out_free;
 -              if (i == 0)
 -                      continue;
 -
                linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
                if (!linfo)
                        goto out_free;
        return 0;
  
  out_free:
 -      for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 -              kvfree(slot->arch.rmap[i]);
 -              slot->arch.rmap[i] = NULL;
 -              if (i == 0)
 -                      continue;
 +      memslot_rmap_free(slot);
  
 +      for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
@@@ -11447,7 -11031,7 +11451,7 @@@ int kvm_arch_prepare_memory_region(stru
                                enum kvm_mr_change change)
  {
        if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
 -              return kvm_alloc_memslot_metadata(memslot,
 +              return kvm_alloc_memslot_metadata(kvm, memslot,
                                                  mem->memory_size >> PAGE_SHIFT);
        return 0;
  }
@@@ -11523,19 -11107,36 +11527,19 @@@ static void kvm_mmu_slot_apply_flags(st
                 */
                kvm_mmu_zap_collapsible_sptes(kvm, new);
        } else {
 -              /* By default, write-protect everything to log writes. */
 -              int level = PG_LEVEL_4K;
 +              /*
 +               * Initially-all-set does not require write protecting any page,
 +               * because all pages are assumed to be dirty.
 +               */
 +              if (kvm_dirty_log_manual_protect_and_init_set(kvm))
 +                      return;
  
                if (kvm_x86_ops.cpu_dirty_log_size) {
 -                      /*
 -                       * Clear all dirty bits, unless pages are treated as
 -                       * dirty from the get-go.
 -                       */
 -                      if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
 -                              kvm_mmu_slot_leaf_clear_dirty(kvm, new);
 -
 -                      /*
 -                       * Write-protect large pages on write so that dirty
 -                       * logging happens at 4k granularity.  No need to
 -                       * write-protect small SPTEs since write accesses are
 -                       * logged by the CPU via dirty bits.
 -                       */
 -                      level = PG_LEVEL_2M;
 -              } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
 -                      /*
 -                       * If we're with initial-all-set, we don't need
 -                       * to write protect any small page because
 -                       * they're reported as dirty already.  However
 -                       * we still need to write-protect huge pages
 -                       * so that the page split can happen lazily on
 -                       * the first write to the huge page.
 -                       */
 -                      level = PG_LEVEL_2M;
 +                      kvm_mmu_slot_leaf_clear_dirty(kvm, new);
 +                      kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
 +              } else {
 +                      kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
                }
 -              kvm_mmu_slot_remove_write_access(kvm, new, level);
        }
  }
  
@@@ -12104,6 -11705,8 +12108,6 @@@ int kvm_handle_invpcid(struct kvm_vcpu 
  {
        bool pcid_enabled;
        struct x86_exception e;
 -      unsigned i;
 -      unsigned long roots_to_free = 0;
        struct {
                u64 pcid;
                u64 gla;
                        return 1;
                }
  
 -              if (kvm_get_active_pcid(vcpu) == operand.pcid) {
 -                      kvm_mmu_sync_roots(vcpu);
 -                      kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 -              }
 -
 -              for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 -                      if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
 -                          == operand.pcid)
 -                              roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 -
 -              kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
 -              /*
 -               * If neither the current cr3 nor any of the prev_roots use the
 -               * given PCID, then nothing needs to be done here because a
 -               * resync will happen anyway before switching to any other CR3.
 -               */
 -
 +              kvm_invalidate_pcid(vcpu, operand.pcid);
                return kvm_skip_emulated_instruction(vcpu);
  
        case INVPCID_TYPE_ALL_NON_GLOBAL:
  
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
 -              kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 +              kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
  
        default:
diff --combined arch/x86/mm/fault.c
index 2d27932c9ac7b2a4a0827ac1b7e259519e6e7edf,f33a61a432ce4feff89f376f5e66837bfa7ff526..b2eefdefc1083316ea27e6bfd8bb1ddbcea715ab
@@@ -875,7 -875,7 +875,7 @@@ static inline bool bad_area_access_from
        /* This code is always called on the current mm */
        bool foreign = false;
  
-       if (!boot_cpu_has(X86_FEATURE_OSPKE))
+       if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return false;
        if (error_code & X86_PF_PK)
                return true;
@@@ -1186,7 -1186,7 +1186,7 @@@ do_kern_addr_fault(struct pt_regs *regs
                return;
  
        /* kprobes don't want to hook the spurious faults: */
 -      if (kprobe_page_fault(regs, X86_TRAP_PF))
 +      if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;
  
        /*
@@@ -1239,7 -1239,7 +1239,7 @@@ void do_user_addr_fault(struct pt_regs 
        }
  
        /* kprobes don't want to hook the spurious faults: */
 -      if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
 +      if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;
  
        /*
diff --combined arch/x86/mm/pkeys.c
index 4a67b922bce1ea52ecd115d649e608900ef317eb,fb171a5d7f339dff1e69da24034fec44de49b478..e44e938885b709f267d135e9530c7c88d9566cbb
@@@ -10,7 -10,6 +10,6 @@@
  
  #include <asm/cpufeature.h>             /* boot_cpu_has, ...            */
  #include <asm/mmu_context.h>            /* vma_pkey()                   */
- #include <asm/fpu/internal.h>         /* init_fpstate                 */
  
  int __execute_only_pkey(struct mm_struct *mm)
  {
@@@ -125,22 -124,6 +124,6 @@@ u32 init_pkru_value = PKRU_AD_KEY( 1) 
                      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
                      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
  
- /*
-  * Called from the FPU code when creating a fresh set of FPU
-  * registers.  This is called from a very specific context where
-  * we know the FPU registers are safe for use and we can use PKRU
-  * directly.
-  */
- void copy_init_pkru_to_fpregs(void)
- {
-       u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
-       /*
-        * Override the PKRU state that came from 'init_fpstate'
-        * with the baseline from the process.
-        */
-       write_pkru(init_pkru_value_snapshot);
- }
  static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
                             size_t count, loff_t *ppos)
  {
  static ssize_t init_pkru_write_file(struct file *file,
                 const char __user *user_buf, size_t count, loff_t *ppos)
  {
-       struct pkru_state *pk;
        char buf[32];
        ssize_t len;
        u32 new_init_pkru;
                return -EINVAL;
  
        WRITE_ONCE(init_pkru_value, new_init_pkru);
-       pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
-       if (!pk)
-               return -EINVAL;
-       pk->pkru = new_init_pkru;
        return count;
  }
  
@@@ -192,10 -170,6 +170,10 @@@ static const struct file_operations fop
  
  static int __init create_init_pkru_value(void)
  {
 +      /* Do not expose the file if pkeys are not supported. */
 +      if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
 +              return 0;
 +
        debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
                        arch_debugfs_dir, NULL, &fops_init_pkru);
        return 0;
diff --combined include/linux/sched/signal.h
index c9cf678c347dc778c67235965fdacb7133724327,0d7fec79d28ffb00ff6178a0c7c20ccf07f8994f..b9126fe06c3fce036050bcf32698ca8a587cbc86
@@@ -382,7 -382,7 +382,7 @@@ static inline int fatal_signal_pending(
        return task_sigpending(p) && __fatal_signal_pending(p);
  }
  
 -static inline int signal_pending_state(long state, struct task_struct *p)
 +static inline int signal_pending_state(unsigned int state, struct task_struct *p)
  {
        if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                return 0;
@@@ -538,6 -538,17 +538,17 @@@ static inline int kill_cad_pid(int sig
  #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
  #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1)
  
+ static inline int __on_sig_stack(unsigned long sp)
+ {
+ #ifdef CONFIG_STACK_GROWSUP
+       return sp >= current->sas_ss_sp &&
+               sp - current->sas_ss_sp < current->sas_ss_size;
+ #else
+       return sp > current->sas_ss_sp &&
+               sp - current->sas_ss_sp <= current->sas_ss_size;
+ #endif
+ }
  /*
   * True if we are on the alternate signal stack.
   */
@@@ -555,13 -566,7 +566,7 @@@ static inline int on_sig_stack(unsigne
        if (current->sas_ss_flags & SS_AUTODISARM)
                return 0;
  
- #ifdef CONFIG_STACK_GROWSUP
-       return sp >= current->sas_ss_sp &&
-               sp - current->sas_ss_sp < current->sas_ss_size;
- #else
-       return sp > current->sas_ss_sp &&
-               sp - current->sas_ss_sp <= current->sas_ss_size;
- #endif
+       return __on_sig_stack(sp);
  }
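Editor's note: the bounds are deliberately asymmetric. On the common downward-growing stack, sas_ss_sp itself does not count as being on the alternate stack, while sas_ss_sp + sas_ss_size does, because a freshly switched stack pointer starts at the top of the area and moves down. A tiny stand-alone check of the !CONFIG_STACK_GROWSUP arithmetic, not kernel code:

    #include <assert.h>

    /* Mirrors the downward-growing branch of __on_sig_stack() above. */
    static int on_alt_stack(unsigned long sp, unsigned long ss_sp,
                            unsigned long ss_size)
    {
            return sp > ss_sp && sp - ss_sp <= ss_size;
    }

    int main(void)
    {
            assert(on_alt_stack(0x2000, 0x1000, 0x1000));  /* top of the area */
            assert(on_alt_stack(0x1001, 0x1000, 0x1000));  /* last usable byte */
            assert(!on_alt_stack(0x1000, 0x1000, 0x1000)); /* base address itself */
            assert(!on_alt_stack(0x2001, 0x1000, 0x1000)); /* just above the area */
            return 0;
    }
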
  
  static inline int sas_ss_flags(unsigned long sp)