Git Repo - linux.git/commitdiff
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <[email protected]>
Sat, 28 Sep 2024 16:20:14 +0000 (09:20 -0700)
committer Linus Torvalds <[email protected]>
Sat, 28 Sep 2024 16:20:14 +0000 (09:20 -0700)
Pull x86 kvm updates from Paolo Bonzini:
 "x86:

   - KVM currently invalidates the entirety of the page tables, not just
     those for the memslot being touched, when a memslot is moved or
     deleted.

     This does not traditionally have particularly noticeable overhead,
     but Intel's TDX will require the guest to re-accept private pages
     if they are dropped from the secure EPT, which is a non-starter.

     Actually, the only reason why this is not already being done is a
     bug which was never fully investigated and caused VM instability
     with assigned GeForce GPUs, so allow userspace to opt into the new
     behavior.

   - Advertise AVX10.1 to userspace (effectively prep work for the
     "real" AVX10 functionality that is on the horizon)

   - Rework common MSR handling code to suppress errors on userspace
     accesses to unsupported-but-advertised MSRs

     This will allow removing (almost?) all of KVM's exemptions for
     userspace access to MSRs that shouldn't exist based on the vCPU
     model (the actual cleanup is non-trivial future work)
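
     A toy sketch of that policy (every name below is invented for
     illustration and is not KVM's MSR plumbing): an access to an
     unsupported MSR from the guest still fails, while the same access
     from userspace is quietly treated as zero / dropped.

         #include <stdbool.h>
         #include <stdint.h>

         static int demo_get_msr(bool host_initiated, bool msr_supported,
                                 uint64_t *data)
         {
                 if (!msr_supported) {
                         if (!host_initiated)
                                 return -1;  /* guest access: inject #GP */
                         *data = 0;          /* userspace access: read as zero */
                 }
                 return 0;
         }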

   - Rework KVM's handling of x2APIC ICR, again, because AMD (x2AVIC)
     splits the 64-bit value into the legacy ICR and ICR2 storage,
     whereas Intel (APICv) stores the entire 64-bit value at the ICR
     offset
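
     As a rough illustration of the two layouts (0x300 and 0x310 are the
     architectural xAPIC ICR and ICR2 register offsets; the helpers below
     are made up for illustration, not KVM code):

         #include <stdint.h>

         /* AMD x2AVIC: the 64-bit ICR stays split across ICR and ICR2. */
         static uint64_t read_icr_amd(const uint8_t *apic_page)
         {
                 uint32_t lo = *(const uint32_t *)(apic_page + 0x300);
                 uint32_t hi = *(const uint32_t *)(apic_page + 0x310);

                 return ((uint64_t)hi << 32) | lo;
         }

         /* Intel APICv: the full 64-bit value lives at the ICR offset. */
         static uint64_t read_icr_intel(const uint8_t *apic_page)
         {
                 return *(const uint64_t *)(apic_page + 0x300);
         }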

   - Fix a bug where KVM would fail to exit to userspace if one was
     triggered by a fastpath exit handler

   - Add fastpath handling of HLT VM-Exit to expedite re-entering the
     guest when there's already a pending wake event at the time of the
     exit

   - Fix a WARN caused by RSM entering a nested guest from SMM with
     invalid guest state, by forcing the vCPU out of guest mode prior to
     signalling SHUTDOWN (the SHUTDOWN hits the VM altogether, not the
     nested guest)

   - Overhaul the "unprotect and retry" logic to more precisely identify
     cases where retrying is actually helpful, and to harden all retry
     paths against putting the guest into an infinite retry loop

   - Add support for yielding, e.g. to honor NEED_RESCHED, when zapping
     rmaps in the shadow MMU

   - Refactor pieces of the shadow MMU related to aging SPTEs in
     preparation for adding multi-generation LRU support in KVM

   - Don't stuff the RSB after VM-Exit when RETPOLINE=y and AutoIBRS is
     enabled, i.e. when the CPU has already flushed the RSB

   - Trace the per-CPU host save area as a VMCB pointer to improve
     readability and clean up the retrieval of the SEV-ES host save area

   - Remove unnecessary accounting of temporary nested VMCB related
     allocations

   - Set FINAL/PAGE in the page fault error code for EPT violations if
     and only if the GVA is valid. If the GVA is NOT valid, there is no
     guest-side page table walk and so stuffing paging related metadata
     is nonsensical
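
     A hedged sketch of that rule, assuming the exit-qualification layout
     from the SDM (bit 7 = guest linear address is valid, bit 8 = the access
     was to the final translation) and error-code bit positions 32/33 for
     FINAL/PAGE; the names are illustrative:

         #include <stdint.h>

         #define EPT_GVA_VALID       (1ull << 7)
         #define EPT_GVA_TRANSLATED  (1ull << 8)
         #define PFERR_GUEST_FINAL   (1ull << 32)
         #define PFERR_GUEST_PAGE    (1ull << 33)

         static uint64_t ept_paging_metadata(uint64_t exit_qual)
         {
                 /* No valid GVA => no guest page walk => no paging metadata. */
                 if (!(exit_qual & EPT_GVA_VALID))
                         return 0;

                 return (exit_qual & EPT_GVA_TRANSLATED) ? PFERR_GUEST_FINAL
                                                         : PFERR_GUEST_PAGE;
         }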

   - Fix a bug where KVM would incorrectly synthesize a nested VM-Exit
     instead of emulating posted interrupt delivery to L2

   - Add a lockdep assertion to detect unsafe accesses of vmcs12
     structures

   - Harden eVMCS loading against an impossible NULL pointer deref
     (really truly should be impossible)

   - Minor SGX fix and a cleanup

   - Misc cleanups

  Generic:

   - Register KVM's cpuhp and syscore callbacks when enabling
     virtualization in hardware, as the sole purpose of said callbacks
     is to disable and re-enable virtualization as needed

   - Enable virtualization when KVM is loaded, not right before the
     first VM is created

     Together with the previous change, this greatly simplifies the logic
     of the callbacks, because their very existence implies
     virtualization is enabled
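
     The on-demand scheme reduces to refcounting VMs; a simplified sketch,
     with locking omitted and the enable/disable helpers left as assumed
     prototypes:

         static int kvm_usage_count;

         int hardware_enable_all(void);   /* assumed; serialized across CPUs */
         void hardware_disable_all(void); /* assumed */

         static int on_vm_create(void)
         {
                 /* 0 => 1 transition: enable virtualization in hardware. */
                 if (kvm_usage_count++ == 0)
                         return hardware_enable_all();
                 return 0;
         }

         static void on_vm_destroy(void)
         {
                 /* 1 => 0 transition: disable virtualization again. */
                 if (--kvm_usage_count == 0)
                         hardware_disable_all();
         }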

   - Fix a bug that results in KVM prematurely exiting to userspace for
     coalesced MMIO/PIO in many cases, clean up the related code, and
     add a testcase

   - Fix a bug in kvm_clear_guest() where it would trigger a buffer
     overflow _if_ the gpa+len crosses a page boundary, which thankfully
     is guaranteed to not happen in the current code base. Add WARNs in
     more helpers that read/write guest memory to detect similar bugs
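
     For illustration, a page-safe clear clamps each chunk to the end of the
     current page (a userspace-style sketch, not the kernel helper itself):

         #include <stddef.h>
         #include <stdint.h>
         #include <string.h>

         #define PAGE_SIZE 4096ul

         static void clear_guest_range(uint8_t *guest_base, uint64_t gpa, size_t len)
         {
                 while (len) {
                         size_t in_page = PAGE_SIZE - (gpa & (PAGE_SIZE - 1));
                         size_t chunk = len < in_page ? len : in_page;

                         memset(guest_base + gpa, 0, chunk); /* never crosses a page */
                         gpa += chunk;
                         len -= chunk;
                 }
         }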

  Selftests:

   - Fix a goof that caused some Hyper-V tests to be skipped when run on
     bare metal, i.e. NOT in a VM

   - Add a regression test for KVM's handling of SHUTDOWN for an SEV-ES
     guest

   - Explicitly include one-off assets in .gitignore. Past Sean was
     completely wrong about not being able to detect missing .gitignore
     entries

   - Verify userspace single-stepping works when KVM happens to handle a
     VM-Exit in its fastpath

   - Misc cleanups"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (127 commits)
  Documentation: KVM: fix warning in "make htmldocs"
  s390: Enable KVM_S390_UCONTROL config in debug_defconfig
  selftests: kvm: s390: Add VM run test case
  KVM: SVM: let alternatives handle the cases when RSB filling is required
  KVM: VMX: Set PFERR_GUEST_{FINAL,PAGE}_MASK if and only if the GVA is valid
  KVM: x86/mmu: Use KVM_PAGES_PER_HPAGE() instead of an open coded equivalent
  KVM: x86/mmu: Add KVM_RMAP_MANY to replace open coded '1' and '1ul' literals
  KVM: x86/mmu: Fold mmu_spte_age() into kvm_rmap_age_gfn_range()
  KVM: x86/mmu: Morph kvm_handle_gfn_range() into an aging specific helper
  KVM: x86/mmu: Honor NEED_RESCHED when zapping rmaps and blocking is allowed
  KVM: x86/mmu: Add a helper to walk and zap rmaps for a memslot
  KVM: x86/mmu: Plumb a @can_yield parameter into __walk_slot_rmaps()
  KVM: x86/mmu: Move walk_slot_rmaps() up near for_each_slot_rmap_range()
  KVM: x86/mmu: WARN on MMIO cache hit when emulating write-protected gfn
  KVM: x86/mmu: Detect if unprotect will do anything based on invalid_list
  KVM: x86/mmu: Subsume kvm_mmu_unprotect_page() into the and_retry() version
  KVM: x86: Rename reexecute_instruction()=>kvm_unprotect_and_retry_on_failure()
  KVM: x86: Update retry protection fields when forcing retry on emulation failure
  KVM: x86: Apply retry protection to "unprotect on failure" path
  KVM: x86: Check EMULTYPE_WRITE_PF_TO_SP before unprotecting gfn
  ...

Documentation/admin-guide/kernel-parameters.txt
arch/s390/configs/debug_defconfig
arch/x86/include/asm/cpuid.h
arch/x86/include/asm/msr-index.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/vmx/vmx.c
arch/x86/mm/pat/memtype.c
virt/kvm/kvm_main.c

index bb48ae24ae69fa4d73ec91fadeddc3da0215e123,1b52b1b7bbc4abc4868b197793cea847404ce971..1518343bbe2237f1d577df5656339d6224b769be
                                          allowed anymore to lift isolation
                                          requirements as needed. This option
                                          does not override iommu=pt
 -                      force_enable - Force enable the IOMMU on platforms known
 -                                     to be buggy with IOMMU enabled. Use this
 -                                     option with care.
 -                      pgtbl_v1     - Use v1 page table for DMA-API (Default).
 -                      pgtbl_v2     - Use v2 page table for DMA-API.
 -                      irtcachedis  - Disable Interrupt Remapping Table (IRT) caching.
 +                      force_enable    - Force enable the IOMMU on platforms known
 +                                        to be buggy with IOMMU enabled. Use this
 +                                        option with care.
 +                      pgtbl_v1        - Use v1 page table for DMA-API (Default).
 +                      pgtbl_v2        - Use v2 page table for DMA-API.
 +                      irtcachedis     - Disable Interrupt Remapping Table (IRT) caching.
 +                      nohugepages     - Limit page-sizes used for v1 page-tables
 +                                        to 4 KiB.
 +                      v2_pgsizes_only - Limit page-sizes used for v1 page-tables
 +                                        to 4KiB/2MiB/1GiB.
 +
  
        amd_iommu_dump= [HW,X86-64]
                        Enable AMD IOMMU driver option to dump the ACPI table
                        Format: <io>,<irq>,<mode>
                        See header of drivers/net/hamradio/baycom_ser_hdx.c.
  
 +      bdev_allow_write_mounted=
 +                      Format: <bool>
 +                      Control the ability to open a mounted block device
 +                      for writing, i.e., allow / disallow writes that bypass
 +                      the FS. This was implemented as a means to prevent
 +                      fuzzers from crashing the kernel by overwriting the
 +                      metadata underneath a mounted FS without its awareness.
 +                      This also prevents destructive formatting of mounted
 +                      filesystems by naive storage tooling that doesn't use
 +                      O_EXCL. Default is Y and can be changed through the
 +                      Kconfig option CONFIG_BLK_DEV_WRITE_MOUNTED.
 +
        bert_disable    [ACPI]
                        Disable BERT OS support on buggy BIOSes.
  
        ipcmni_extend   [KNL,EARLY] Extend the maximum number of unique System V
                        IPC identifiers from 32,768 to 16,777,216.
  
 +      ipe.enforce=    [IPE]
 +                      Format: <bool>
 +                      Determine whether IPE starts in permissive (0) or
 +                      enforce (1) mode. The default is enforce.
 +
 +      ipe.success_audit=
 +                      [IPE]
 +                      Format: <bool>
 +                      Start IPE with success auditing enabled, emitting
 +                      an audit event when a binary is allowed. The default
 +                      is 0.
 +
        irqaffinity=    [SMP] Set the default irq affinity mask
                        The argument is a cpu list, as described above.
  
  
                        Default is Y (on).
  
+       kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86]
+                       If enabled, KVM will enable virtualization in hardware
+                       when KVM is loaded, and disable virtualization when KVM
+                       is unloaded (if KVM is built as a module).
+                       If disabled, KVM will dynamically enable and disable
+                       virtualization on-demand when creating and destroying
+                       VMs, i.e. on the 0=>1 and 1=>0 transitions of the
+                       number of VMs.
+                       Enabling virtualization at module load avoids potential
+                       latency for creation of the 0=>1 VM, as KVM serializes
+                       virtualization enabling across all online CPUs.  The
+                       "cost" of enabling virtualization when KVM is loaded
+                       is that doing so may interfere with using out-of-tree
+                       hypervisors that want to "own" virtualization hardware.
        kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
                                   Default is false (don't support).
  
                        Disable NUMA, Only set up a single NUMA node
                        spanning all memory.
  
 +      numa=fake=<size>[MG]
 +                      [KNL, ARM64, RISCV, X86, EARLY]
 +                      If given as a memory unit, fills all system RAM with
 +                      nodes of that size, interleaved over physical nodes.
 +
 +      numa=fake=<N>
 +                      [KNL, ARM64, RISCV, X86, EARLY]
 +                      If given as an integer, fills all system RAM with N
 +                      fake nodes interleaved over physical nodes.
 +
 +      numa=fake=<N>U
 +                      [KNL, ARM64, RISCV, X86, EARLY]
 +                      If given as an integer followed by 'U', it will
 +                      divide each physical node into N emulated nodes.
 +
        numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
                        NUMA balancing.
                        Allowed values are enable and disable
        printk.time=    Show timing data prefixed to each printk message line
                        Format: <bool>  (1/Y/y=enable, 0/N/n=disable)
  
 +      proc_mem.force_override= [KNL]
 +                      Format: {always | ptrace | never}
 +                      Traditionally /proc/pid/mem allows memory permissions to be
 +                      overridden without restrictions. This option may be set to
 +                      restrict that. Can be one of:
 +                      - 'always': traditional behavior always allows mem overrides.
 +                      - 'ptrace': only allow mem overrides for active ptracers.
 +                      - 'never':  never allow mem overrides.
 +                      If not specified, default is the CONFIG_PROC_MEM_* choice.
 +
        processor.max_cstate=   [HW,ACPI]
                        Limit processor to maximum C-state
                        max_cstate=9 overrides any DMI blacklist limit.
                        Set maximum number of finished RCU callbacks to
                        process in one batch.
  
 +      rcutree.csd_lock_suppress_rcu_stall=    [KNL]
 +                      Do only a one-line RCU CPU stall warning when
 +                      there is an ongoing too-long CSD-lock wait.
 +
        rcutree.do_rcu_barrier= [KNL]
                        Request a call to rcu_barrier().  This is
                        throttled so that userspace tests can safely
                        Time to wait (s) after boot before inducing stall.
  
        rcutorture.stall_cpu_irqsoff= [KNL]
 -                      Disable interrupts while stalling if set.
 +                      Disable interrupts while stalling if set, but only
 +                      on the first stall in the set.
 +
 +      rcutorture.stall_cpu_repeat= [KNL]
 +                      Number of times to repeat the stall sequence,
 +                      so that rcutorture.stall_cpu_repeat=3 will result
 +                      in four stall sequences.
  
        rcutorture.stall_gp_kthread= [KNL]
                        Duration (s) of forced sleep within RCU
                        of zero will disable batching.  Batching is
                        always disabled for synchronize_rcu_tasks().
  
 -      rcupdate.rcu_tasks_rude_lazy_ms= [KNL]
 -                      Set timeout in milliseconds RCU Tasks
 -                      Rude asynchronous callback batching for
 -                      call_rcu_tasks_rude().  A negative value
 -                      will take the default.  A value of zero will
 -                      disable batching.  Batching is always disabled
 -                      for synchronize_rcu_tasks_rude().
 -
        rcupdate.rcu_tasks_trace_lazy_ms= [KNL]
                        Set timeout in milliseconds RCU Tasks
                        Trace asynchronous callback batching for
                        <deci-seconds>: poll all this frequency
                        0: no polling (default)
  
 +      thp_anon=       [KNL]
 +                      Format: <size>,<size>[KMG]:<state>;<size>-<size>[KMG]:<state>
 +                      state is one of "always", "madvise", "never" or "inherit".
 +                      Control the default behavior of the system with respect
 +                      to anonymous transparent hugepages.
 +                      Can be used multiple times for multiple anon THP sizes.
 +                      See Documentation/admin-guide/mm/transhuge.rst for more
 +                      details.
 +
        threadirqs      [KNL,EARLY]
                        Force threading of all interrupt handlers except those
                        marked explicitly IRQF_NO_THREAD.
                        the same thing would happen if it was left off). The irq_handler_entry
                        event, and all events under the "initcall" system.
  
 +                      Flags can be added to the instance to modify its behavior when it is
 +                      created. The flags are separated by '^'.
 +
 +                      The available flags are:
 +
 +                          traceoff    - Have the tracing instance tracing disabled after it is created.
 +                          traceprintk - Have trace_printk() write into this trace instance
 +                                        (note, "printk" and "trace_printk" can also be used)
 +
 +                              trace_instance=foo^traceoff^traceprintk,sched,irq
 +
 +                      The flags must come before the defined events.
 +
 +                      If memory has been reserved (see memmap for x86), the instance
 +                      can use that memory:
 +
 +                              memmap=12M$0x284500000 trace_instance=boot_map@0x284500000:12M
 +
 +                      The above will create a "boot_map" instance that uses the 12 MB of
 +                      physical memory at 0x284500000. The per-CPU buffers of that
 +                      instance will be split up accordingly.
 +
 +                      Alternatively, the memory can be reserved by the reserve_mem option:
 +
 +                              reserve_mem=12M:4096:trace trace_instance=boot_map@trace
 +
 +                      This will reserve 12 megabytes at boot up with a 4096 byte alignment
 +                      and place the ring buffer in this memory. Note that due to KASLR, the
 +                      memory may not be at the same location each time, in which case the
 +                      buffer content will not be preserved.
 +
 +                      Also note that the layout of the ring buffer data may change between
 +                      kernel versions; the validator will then fail and reset the ring buffer
 +                      if the layout is not the same as that of the previous kernel.
 +
 +                      If the ring buffer is used for persistent bootups and has events enabled,
 +                      it is recommended to disable tracing so that events from a previous boot
 +                      do not mix with events of the current boot (unless you are debugging a
 +                      random crash at boot up).
 +
 +                              reserve_mem=12M:4096:trace trace_instance=boot_map^traceoff^traceprintk@trace,sched,irq
 +
 +                      See also Documentation/trace/debugging.rst
 +
 +
        trace_options=[option-list]
                        [FTRACE] Enable or disable tracer options at boot.
                        The option-list is a comma delimited list of options
                        it can be updated at runtime by writing to the
                        corresponding sysfs file.
  
 +      workqueue.panic_on_stall=<uint>
 +                      Panic when a workqueue stall is detected by
 +                      CONFIG_WQ_WATCHDOG. It sets the number of stalls that
 +                      must be detected before triggering a panic.
 +
 +                      The default is 0, which disables the panic on stall.
 +
        workqueue.cpu_intensive_thresh_us=
                        Per-cpu work items which run for longer than this
                        threshold are automatically considered CPU intensive
index 7ec1b8cd0de9b18ae2fa6bbaeb872bf7be708c94,0c989caed19af839f87c9c76312ce6bba7e44ce8..9b57add02cd5c4c4ec4fb391a8fd1915a0d3ed00
@@@ -59,6 -59,7 +59,7 @@@ CONFIG_CMM=
  CONFIG_APPLDATA_BASE=y
  CONFIG_S390_HYPFS_FS=y
  CONFIG_KVM=m
+ CONFIG_KVM_S390_UCONTROL=y
  CONFIG_S390_UNWIND_SELFTEST=m
  CONFIG_S390_KPROBES_SANITY_TEST=m
  CONFIG_S390_MODULES_SANITY_TEST=m
@@@ -794,12 -795,8 +795,12 @@@ CONFIG_CRYPTO_GHASH_S390=
  CONFIG_CRYPTO_AES_S390=m
  CONFIG_CRYPTO_DES_S390=m
  CONFIG_CRYPTO_CHACHA_S390=m
 +CONFIG_CRYPTO_HMAC_S390=m
  CONFIG_ZCRYPT=m
  CONFIG_PKEY=m
 +CONFIG_PKEY_CCA=m
 +CONFIG_PKEY_EP11=m
 +CONFIG_PKEY_PCKMO=m
  CONFIG_CRYPTO_PAES_S390=m
  CONFIG_CRYPTO_DEV_VIRTIO=m
  CONFIG_SYSTEM_BLACKLIST_KEYRING=y
index 80cc6386d7b13336564acea9d2148298e075f480,aa21c105eef1451849a0b2e0e7e4df3e2a84f515..ca4243318aadc4c42156942e874a8d8796765a43
@@@ -179,6 -179,7 +179,7 @@@ static __always_inline bool cpuid_funct
        case 0x1d:
        case 0x1e:
        case 0x1f:
+       case 0x24:
        case 0x8000001d:
                return true;
        }
@@@ -196,12 -197,7 +197,12 @@@ static inline uint32_t hypervisor_cpuid
        for_each_possible_hypervisor_cpuid_base(base) {
                cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
  
 -              if (!memcmp(sig, signature, 12) &&
 +              /*
 +               * This must not compile to "call memcmp" because it's called
 +               * from PVH early boot code before instrumentation is set up
 +               * and memcmp() itself may be instrumented.
 +               */
 +              if (!__builtin_memcmp(sig, signature, 12) &&
                    (leaves == 0 || ((eax - base) >= leaves)))
                        return base;
        }
index a7c06a46fb767d4aa77929a2964e71e4c31b6cbd,72c2c0ecb62c8af405ac14d86ddf5874350f8b79..3ae84c3b8e6dba73f0f44d3e25a1948a68daea91
  #define EFER_FFXSR            (1<<_EFER_FFXSR)
  #define EFER_AUTOIBRS         (1<<_EFER_AUTOIBRS)
  
+ /*
+  * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc.
+  * Most MSRs support/allow only a subset of memory types, but the values
+  * themselves are common across all relevant MSRs.
+  */
+ #define X86_MEMTYPE_UC                0ull    /* Uncacheable, a.k.a. Strong Uncacheable */
+ #define X86_MEMTYPE_WC                1ull    /* Write Combining */
+ /* RESERVED                   2 */
+ /* RESERVED                   3 */
+ #define X86_MEMTYPE_WT                4ull    /* Write Through */
+ #define X86_MEMTYPE_WP                5ull    /* Write Protected */
+ #define X86_MEMTYPE_WB                6ull    /* Write Back */
+ #define X86_MEMTYPE_UC_MINUS  7ull    /* Weak Uncacheable (PAT only) */
  /* FRED MSRs */
  #define MSR_IA32_FRED_RSP0    0x1cc                   /* Level 0 stack pointer */
  #define MSR_IA32_FRED_RSP1    0x1cd                   /* Level 1 stack pointer */
  #define MSR_INTEGRITY_CAPS_ARRAY_BIST          BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
  #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT  4
  #define MSR_INTEGRITY_CAPS_PERIODIC_BIST      BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
 +#define MSR_INTEGRITY_CAPS_SBAF_BIT           8
 +#define MSR_INTEGRITY_CAPS_SBAF                       BIT(MSR_INTEGRITY_CAPS_SBAF_BIT)
  #define MSR_INTEGRITY_CAPS_SAF_GEN_MASK       GENMASK_ULL(10, 9)
  
  #define MSR_LBR_NHM_FROM              0x00000680
  
  #define MSR_IA32_CR_PAT                       0x00000277
  
+ #define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7)                     \
+       ((X86_MEMTYPE_ ## p0)      | (X86_MEMTYPE_ ## p1 << 8)  |       \
+       (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) |       \
+       (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) |       \
+       (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))
  #define MSR_IA32_DEBUGCTLMSR          0x000001d9
  #define MSR_IA32_LASTBRANCHFROMIP     0x000001db
  #define MSR_IA32_LASTBRANCHTOIP               0x000001dc
  #define MSR_IA32_VMX_VMFUNC             0x00000491
  #define MSR_IA32_VMX_PROCBASED_CTLS3  0x00000492
  
- /* VMX_BASIC bits and bitmasks */
- #define VMX_BASIC_VMCS_SIZE_SHIFT     32
- #define VMX_BASIC_TRUE_CTLS           (1ULL << 55)
- #define VMX_BASIC_64          0x0001000000000000LLU
- #define VMX_BASIC_MEM_TYPE_SHIFT      50
- #define VMX_BASIC_MEM_TYPE_MASK       0x003c000000000000LLU
- #define VMX_BASIC_MEM_TYPE_WB 6LLU
- #define VMX_BASIC_INOUT               0x0040000000000000LLU
  /* Resctrl MSRs: */
  /* - Intel: */
  #define MSR_IA32_L3_QOS_CFG           0xc81
  #define MSR_IA32_SMBA_BW_BASE         0xc0000280
  #define MSR_IA32_EVT_CFG_BASE         0xc0000400
  
- /* MSR_IA32_VMX_MISC bits */
- #define MSR_IA32_VMX_MISC_INTEL_PT                 (1ULL << 14)
- #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
- #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
  /* AMD-V MSRs */
  #define MSR_VM_CR                       0xc0010114
  #define MSR_VM_IGNNE                    0xc0010115
diff --combined arch/x86/kvm/mmu/mmu.c
index 7813d28b082f2f014dece5572bbfc0d0a49a6525,e081f785fb230c8f40641a2e461ce259e314b7f4..e52f990548df6370a403ac5a19b69fa74caca090
@@@ -614,32 -614,6 +614,6 @@@ static u64 mmu_spte_get_lockless(u64 *s
        return __get_spte_lockless(sptep);
  }
  
- /* Returns the Accessed status of the PTE and resets it at the same time. */
- static bool mmu_spte_age(u64 *sptep)
- {
-       u64 spte = mmu_spte_get_lockless(sptep);
-       if (!is_accessed_spte(spte))
-               return false;
-       if (spte_ad_enabled(spte)) {
-               clear_bit((ffs(shadow_accessed_mask) - 1),
-                         (unsigned long *)sptep);
-       } else {
-               /*
-                * Capture the dirty status of the page, so that it doesn't get
-                * lost when the SPTE is marked for access tracking.
-                */
-               if (is_writable_pte(spte))
-                       kvm_set_pfn_dirty(spte_to_pfn(spte));
-               spte = mark_spte_for_access_track(spte);
-               mmu_spte_update_no_track(sptep, spte);
-       }
-       return true;
- }
  static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
  {
        return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
@@@ -938,6 -912,7 +912,7 @@@ static struct kvm_memory_slot *gfn_to_m
   * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
   * pte_list_desc containing more mappings.
   */
+ #define KVM_RMAP_MANY BIT(0)
  
  /*
   * Returns the number of pointers in the rmap chain, not counting the new one.
@@@ -950,16 -925,16 +925,16 @@@ static int pte_list_add(struct kvm_mmu_
  
        if (!rmap_head->val) {
                rmap_head->val = (unsigned long)spte;
-       } else if (!(rmap_head->val & 1)) {
+       } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
                desc = kvm_mmu_memory_cache_alloc(cache);
                desc->sptes[0] = (u64 *)rmap_head->val;
                desc->sptes[1] = spte;
                desc->spte_count = 2;
                desc->tail_count = 0;
-               rmap_head->val = (unsigned long)desc | 1;
+               rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
                ++count;
        } else {
-               desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+               desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
                count = desc->tail_count + desc->spte_count;
  
                /*
                 */
                if (desc->spte_count == PTE_LIST_EXT) {
                        desc = kvm_mmu_memory_cache_alloc(cache);
-                       desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+                       desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
                        desc->spte_count = 0;
                        desc->tail_count = count;
-                       rmap_head->val = (unsigned long)desc | 1;
+                       rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
                }
                desc->sptes[desc->spte_count++] = spte;
        }
@@@ -982,7 -957,7 +957,7 @@@ static void pte_list_desc_remove_entry(
                                       struct kvm_rmap_head *rmap_head,
                                       struct pte_list_desc *desc, int i)
  {
-       struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+       struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
        int j = head_desc->spte_count - 1;
  
        /*
        if (!head_desc->more)
                rmap_head->val = 0;
        else
-               rmap_head->val = (unsigned long)head_desc->more | 1;
+               rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
        mmu_free_pte_list_desc(head_desc);
  }
  
@@@ -1024,13 -999,13 +999,13 @@@ static void pte_list_remove(struct kvm 
        if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
                return;
  
-       if (!(rmap_head->val & 1)) {
+       if (!(rmap_head->val & KVM_RMAP_MANY)) {
                if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
                        return;
  
                rmap_head->val = 0;
        } else {
-               desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+               desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
                while (desc) {
                        for (i = 0; i < desc->spte_count; ++i) {
                                if (desc->sptes[i] == spte) {
@@@ -1063,12 -1038,12 +1038,12 @@@ static bool kvm_zap_all_rmap_sptes(stru
        if (!rmap_head->val)
                return false;
  
-       if (!(rmap_head->val & 1)) {
+       if (!(rmap_head->val & KVM_RMAP_MANY)) {
                mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
                goto out;
        }
  
-       desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+       desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
  
        for (; desc; desc = next) {
                for (i = 0; i < desc->spte_count; i++)
@@@ -1088,10 -1063,10 +1063,10 @@@ unsigned int pte_list_count(struct kvm_
  
        if (!rmap_head->val)
                return 0;
-       else if (!(rmap_head->val & 1))
+       else if (!(rmap_head->val & KVM_RMAP_MANY))
                return 1;
  
-       desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+       desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
        return desc->tail_count + desc->spte_count;
  }
  
@@@ -1153,13 -1128,13 +1128,13 @@@ static u64 *rmap_get_first(struct kvm_r
        if (!rmap_head->val)
                return NULL;
  
-       if (!(rmap_head->val & 1)) {
+       if (!(rmap_head->val & KVM_RMAP_MANY)) {
                iter->desc = NULL;
                sptep = (u64 *)rmap_head->val;
                goto out;
        }
  
-       iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+       iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
        iter->pos = 0;
        sptep = iter->desc->sptes[iter->pos];
  out:
@@@ -1307,15 -1282,6 +1282,6 @@@ static bool __rmap_clear_dirty(struct k
        return flush;
  }
  
- /**
-  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
-  * @kvm: kvm instance
-  * @slot: slot to protect
-  * @gfn_offset: start of the BITS_PER_LONG pages we care about
-  * @mask: indicates which pages we should protect
-  *
-  * Used when we do not need to care about huge page mappings.
-  */
  static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     gfn_t gfn_offset, unsigned long mask)
        }
  }
  
- /**
-  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
-  * protect the page if the D-bit isn't supported.
-  * @kvm: kvm instance
-  * @slot: slot to clear D-bit
-  * @gfn_offset: start of the BITS_PER_LONG pages we care about
-  * @mask: indicates which pages we should clear D-bit
-  *
-  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
-  */
  static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                         struct kvm_memory_slot *slot,
                                         gfn_t gfn_offset, unsigned long mask)
        }
  }
  
- /**
-  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
-  * PT level pages.
-  *
-  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
-  * enable dirty logging for them.
-  *
-  * We need to care about huge page mappings: e.g. during dirty logging we may
-  * have such mappings.
-  */
  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                                struct kvm_memory_slot *slot,
                                gfn_t gfn_offset, unsigned long mask)
  {
        /*
-        * Huge pages are NOT write protected when we start dirty logging in
-        * initially-all-set mode; must write protect them here so that they
-        * are split to 4K on the first write.
+        * If the slot was assumed to be "initially all dirty", write-protect
+        * huge pages to ensure they are split to 4KiB on the first write (KVM
+        * dirty logs at 4KiB granularity). If eager page splitting is enabled,
+        * immediately try to split huge pages, e.g. so that vCPUs don't get
+        * saddled with the cost of splitting.
         *
         * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
         * of memslot has no such restriction, so the range can cross two large
                                                       PG_LEVEL_2M);
        }
  
-       /* Now handle 4K PTEs.  */
+       /*
+        * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
+        * mask.  If PML is enabled and the GFN doesn't need to be write-
+        * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
+        * Otherwise clear the Writable bit.
+        *
+        * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
+        * enabled, but it chooses between clearing the Dirty bit and Writable
+        * bit based on the context.
+        */
        if (kvm_x86_ops.cpu_dirty_log_size)
                kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
        else
@@@ -1453,16 -1410,10 +1410,10 @@@ static bool kvm_vcpu_write_protect_gfn(
        return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
  }
  
- static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                          const struct kvm_memory_slot *slot)
- {
-       return kvm_zap_all_rmap_sptes(kvm, rmap_head);
- }
  static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                        struct kvm_memory_slot *slot, gfn_t gfn, int level)
+                        const struct kvm_memory_slot *slot)
  {
-       return __kvm_zap_rmap(kvm, rmap_head, slot);
+       return kvm_zap_all_rmap_sptes(kvm, rmap_head);
  }
  
  struct slot_rmap_walk_iterator {
@@@ -1513,7 -1464,7 +1464,7 @@@ static bool slot_rmap_walk_okay(struct 
  static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
  {
        while (++iterator->rmap <= iterator->end_rmap) {
-               iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+               iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
  
                if (iterator->rmap->val)
                        return;
             slot_rmap_walk_okay(_iter_);                               \
             slot_rmap_walk_next(_iter_))
  
- typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                              struct kvm_memory_slot *slot, gfn_t gfn,
-                              int level);
+ /* The return value indicates if tlb flush on all vcpus is needed. */
+ typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
+                                   struct kvm_rmap_head *rmap_head,
+                                   const struct kvm_memory_slot *slot);
  
- static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
-                                                struct kvm_gfn_range *range,
-                                                rmap_handler_t handler)
+ static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+                                             const struct kvm_memory_slot *slot,
+                                             slot_rmaps_handler fn,
+                                             int start_level, int end_level,
+                                             gfn_t start_gfn, gfn_t end_gfn,
+                                             bool can_yield, bool flush_on_yield,
+                                             bool flush)
  {
        struct slot_rmap_walk_iterator iterator;
-       bool ret = false;
  
-       for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
-                                range->start, range->end - 1, &iterator)
-               ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
-                              iterator.level);
+       lockdep_assert_held_write(&kvm->mmu_lock);
  
-       return ret;
+       for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
+                       end_gfn, &iterator) {
+               if (iterator.rmap)
+                       flush |= fn(kvm, iterator.rmap, slot);
+               if (!can_yield)
+                       continue;
+               if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+                       if (flush && flush_on_yield) {
+                               kvm_flush_remote_tlbs_range(kvm, start_gfn,
+                                                           iterator.gfn - start_gfn + 1);
+                               flush = false;
+                       }
+                       cond_resched_rwlock_write(&kvm->mmu_lock);
+               }
+       }
+       return flush;
+ }
+ static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+                                           const struct kvm_memory_slot *slot,
+                                           slot_rmaps_handler fn,
+                                           int start_level, int end_level,
+                                           bool flush_on_yield)
+ {
+       return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+                                slot->base_gfn, slot->base_gfn + slot->npages - 1,
+                                true, flush_on_yield, false);
+ }
+ static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+                                              const struct kvm_memory_slot *slot,
+                                              slot_rmaps_handler fn,
+                                              bool flush_on_yield)
+ {
+       return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
+ }
+ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
+                                    const struct kvm_memory_slot *slot,
+                                    gfn_t start, gfn_t end, bool can_yield,
+                                    bool flush)
+ {
+       return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap,
+                                PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+                                start, end - 1, can_yield, true, flush);
  }
  
  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
        bool flush = false;
  
        if (kvm_memslots_have_rmaps(kvm))
-               flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
+               flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
+                                                range->start, range->end,
+                                                range->may_block, flush);
  
        if (tdp_mmu_enabled)
                flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
        return flush;
  }
  
- static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                        struct kvm_memory_slot *slot, gfn_t gfn, int level)
- {
-       u64 *sptep;
-       struct rmap_iterator iter;
-       int young = 0;
-       for_each_rmap_spte(rmap_head, &iter, sptep)
-               young |= mmu_spte_age(sptep);
-       return young;
- }
- static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                             struct kvm_memory_slot *slot, gfn_t gfn, int level)
- {
-       u64 *sptep;
-       struct rmap_iterator iter;
-       for_each_rmap_spte(rmap_head, &iter, sptep)
-               if (is_accessed_spte(*sptep))
-                       return true;
-       return false;
- }
  #define RMAP_RECYCLE_THRESHOLD 1000
  
  static void __rmap_add(struct kvm *kvm,
@@@ -1629,12 -1605,52 +1605,52 @@@ static void rmap_add(struct kvm_vcpu *v
        __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
  }
  
+ static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
+                                  struct kvm_gfn_range *range, bool test_only)
+ {
+       struct slot_rmap_walk_iterator iterator;
+       struct rmap_iterator iter;
+       bool young = false;
+       u64 *sptep;
+       for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+                                range->start, range->end - 1, &iterator) {
+               for_each_rmap_spte(iterator.rmap, &iter, sptep) {
+                       u64 spte = *sptep;
+                       if (!is_accessed_spte(spte))
+                               continue;
+                       if (test_only)
+                               return true;
+                       if (spte_ad_enabled(spte)) {
+                               clear_bit((ffs(shadow_accessed_mask) - 1),
+                                       (unsigned long *)sptep);
+                       } else {
+                               /*
+                                * Capture the dirty status of the page, so that
+                                * it doesn't get lost when the SPTE is marked
+                                * for access tracking.
+                                */
+                               if (is_writable_pte(spte))
+                                       kvm_set_pfn_dirty(spte_to_pfn(spte));
+                               spte = mark_spte_for_access_track(spte);
+                               mmu_spte_update_no_track(sptep, spte);
+                       }
+                       young = true;
+               }
+       }
+       return young;
+ }
  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
  {
        bool young = false;
  
        if (kvm_memslots_have_rmaps(kvm))
-               young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
+               young = kvm_rmap_age_gfn_range(kvm, range, false);
  
        if (tdp_mmu_enabled)
                young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@@ -1647,7 -1663,7 +1663,7 @@@ bool kvm_test_age_gfn(struct kvm *kvm, 
        bool young = false;
  
        if (kvm_memslots_have_rmaps(kvm))
-               young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
+               young = kvm_rmap_age_gfn_range(kvm, range, true);
  
        if (tdp_mmu_enabled)
                young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
@@@ -2713,36 -2729,49 +2729,49 @@@ void kvm_mmu_change_mmu_pages(struct kv
        write_unlock(&kvm->mmu_lock);
  }
  
- int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+ bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                      bool always_retry)
  {
-       struct kvm_mmu_page *sp;
+       struct kvm *kvm = vcpu->kvm;
        LIST_HEAD(invalid_list);
-       int r;
+       struct kvm_mmu_page *sp;
+       gpa_t gpa = cr2_or_gpa;
+       bool r = false;
+       /*
+        * Bail early if there aren't any write-protected shadow pages to avoid
+        * unnecessarily taking mmu_lock, e.g. if the gfn is write-tracked
+        * by a third party.  Reading indirect_shadow_pages without holding
+        * mmu_lock is safe, as this is purely an optimization, i.e. a false
+        * positive is benign, and a false negative will simply result in KVM
+        * skipping the unprotect+retry path, which is also an optimization.
+        */
+       if (!READ_ONCE(kvm->arch.indirect_shadow_pages))
+               goto out;
+       if (!vcpu->arch.mmu->root_role.direct) {
+               gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+               if (gpa == INVALID_GPA)
+                       goto out;
+       }
  
-       r = 0;
        write_lock(&kvm->mmu_lock);
-       for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
-               r = 1;
+       for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa))
                kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-       }
+       /*
+        * Snapshot the result before zapping, as zapping will remove all list
+        * entries, i.e. checking the list later would yield a false negative.
+        */
+       r = !list_empty(&invalid_list);
        kvm_mmu_commit_zap_page(kvm, &invalid_list);
        write_unlock(&kvm->mmu_lock);
  
-       return r;
- }
- static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
- {
-       gpa_t gpa;
-       int r;
-       if (vcpu->arch.mmu->root_role.direct)
-               return 0;
-       gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-       r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ out:
+       if (r || always_retry) {
+               vcpu->arch.last_retry_eip = kvm_rip_read(vcpu);
+               vcpu->arch.last_retry_addr = cr2_or_gpa;
+       }
        return r;
  }
  
@@@ -2914,10 -2943,8 +2943,8 @@@ static int mmu_set_spte(struct kvm_vcp
                trace_kvm_mmu_set_spte(level, gfn, sptep);
        }
  
-       if (wrprot) {
-               if (write_fault)
-                       ret = RET_PF_EMULATE;
-       }
+       if (wrprot && write_fault)
+               ret = RET_PF_WRITE_PROTECTED;
  
        if (flush)
                kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
@@@ -4549,7 -4576,7 +4576,7 @@@ static int direct_page_fault(struct kvm
                return RET_PF_RETRY;
  
        if (page_fault_handle_page_track(vcpu, fault))
-               return RET_PF_EMULATE;
+               return RET_PF_WRITE_PROTECTED;
  
        r = fast_page_fault(vcpu, fault);
        if (r != RET_PF_INVALID)
@@@ -4618,8 -4645,6 +4645,6 @@@ int kvm_handle_page_fault(struct kvm_vc
        if (!flags) {
                trace_kvm_page_fault(vcpu, fault_address, error_code);
  
-               if (kvm_event_needs_reinjection(vcpu))
-                       kvm_mmu_unprotect_page_virt(vcpu, fault_address);
                r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
                                insn_len);
        } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
@@@ -4642,7 -4667,7 +4667,7 @@@ static int kvm_tdp_mmu_page_fault(struc
        int r;
  
        if (page_fault_handle_page_track(vcpu, fault))
-               return RET_PF_EMULATE;
+               return RET_PF_WRITE_PROTECTED;
  
        r = fast_page_fault(vcpu, fault);
        if (r != RET_PF_INVALID)
@@@ -4674,14 -4699,16 +4699,14 @@@ out_unlock
  bool kvm_mmu_may_ignore_guest_pat(void)
  {
        /*
 -       * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
 -       * not support self-snoop (or is affected by an erratum), and the VM
 +       * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
         * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
         * honor the memtype from the guest's PAT so that guest accesses to
         * memory that is DMA'd aren't cached against the guest's wishes.  As a
         * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
 -       * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
 -       * bits in response to non-coherent device (un)registration.
 +       * KVM _always_ ignores guest PAT (when EPT is enabled).
         */
 -      return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
 +      return shadow_memtype_mask;
  }
  
  int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@@ -4719,6 -4746,7 +4744,7 @@@ static int kvm_tdp_map_page(struct kvm_
        switch (r) {
        case RET_PF_FIXED:
        case RET_PF_SPURIOUS:
+       case RET_PF_WRITE_PROTECTED:
                return 0;
  
        case RET_PF_EMULATE:
@@@ -5963,6 -5991,106 +5989,106 @@@ void kvm_mmu_track_write(struct kvm_vcp
        write_unlock(&vcpu->kvm->mmu_lock);
  }
  
+ static bool is_write_to_guest_page_table(u64 error_code)
+ {
+       const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK;
+       return (error_code & mask) == mask;
+ }
+ static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                      u64 error_code, int *emulation_type)
+ {
+       bool direct = vcpu->arch.mmu->root_role.direct;
+       /*
+        * Do not try to unprotect and retry if the vCPU re-faulted on the same
+        * RIP with the same address that was previously unprotected, as doing
+        * so will likely put the vCPU into an infinite loop.  E.g. if the vCPU uses
+        * a non-page-table modifying instruction on the PDE that points to the
+        * instruction, then unprotecting the gfn will unmap the instruction's
+        * code, i.e. make it impossible for the instruction to ever complete.
+        */
+       if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
+           vcpu->arch.last_retry_addr == cr2_or_gpa)
+               return RET_PF_EMULATE;
+       /*
+        * Reset the unprotect+retry values that guard against infinite loops.
+        * The values will be refreshed if KVM explicitly unprotects a gfn and
+        * retries, in all other cases it's safe to retry in the future even if
+        * the next page fault happens on the same RIP+address.
+        */
+       vcpu->arch.last_retry_eip = 0;
+       vcpu->arch.last_retry_addr = 0;
+       /*
+        * It should be impossible to reach this point with an MMIO cache hit,
+        * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
+        * writable memslot, and creating a memslot should invalidate the MMIO
+        * cache by way of changing the memslot generation.  WARN and disallow
+        * retry if MMIO is detected, as retrying MMIO emulation is pointless
+        * and could put the vCPU into an infinite loop because the processor
+        * will keep faulting on the non-existent MMIO address.
+        */
+       if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
+               return RET_PF_EMULATE;
+       /*
+        * Before emulating the instruction, check to see if the access was due
+        * to a read-only violation while the CPU was walking non-nested NPT
+        * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
+        * If L1 is sharing (a subset of) its page tables with L2, e.g. by
+        * having nCR3 share lower level page tables with hCR3, then when KVM
+        * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
+        * unknowingly write-protecting L1's guest page tables, which KVM isn't
+        * shadowing.
+        *
+        * Because the CPU (by default) walks NPT page tables using a write
+        * access (to ensure the CPU can do A/D updates), page walks in L1 can
+        * trigger write faults for the above case even when L1 isn't modifying
+        * PTEs.  As a result, KVM will unnecessarily emulate (or at least, try
+        * to emulate) an excessive number of L1 instructions; because L1's MMU
+        * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
+        * and thus no need to emulate in order to guarantee forward progress.
+        *
+        * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
+        * proceed without triggering emulation.  If one or more shadow pages
+        * was zapped, skip emulation and resume L1 to let it natively execute
+        * the instruction.  If no shadow pages were zapped, then the write-
+        * fault is due to something else entirely, i.e. KVM needs to emulate,
+        * as resuming the guest will put it into an infinite loop.
+        *
+        * Note, this code also applies to Intel CPUs, even though it is *very*
+        * unlikely that an L1 will share its page tables (IA32/PAE/paging64
+        * format) with L2's page tables (EPT format).
+        *
+        * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
+        * unprotect the gfn and retry if an event is awaiting reinjection.  If
+        * KVM emulates multiple instructions before completing event injection,
+        * the event could be delayed beyond what is architecturally allowed,
+        * e.g. KVM could inject an IRQ after the TPR has been raised.
+        */
+       if (((direct && is_write_to_guest_page_table(error_code)) ||
+            (!direct && kvm_event_needs_reinjection(vcpu))) &&
+           kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
+               return RET_PF_RETRY;
+       /*
+        * The gfn is write-protected, but if KVM detects it's emulating an
+        * instruction that is unlikely to be used to modify page tables, or if
+        * emulation fails, KVM can try to unprotect the gfn and let the CPU
+        * re-execute the instruction that caused the page fault.  Do not allow
+        * retrying an instruction from a nested guest as KVM is only explicitly
+        * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
+        * going to magically fix whatever issue caused L2 to fail.
+        */
+       if (!is_guest_mode(vcpu))
+               *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+       return RET_PF_EMULATE;
+ }
  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len)
  {
        if (r < 0)
                return r;
  
+       if (r == RET_PF_WRITE_PROTECTED)
+               r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
+                                               &emulation_type);
        if (r == RET_PF_FIXED)
                vcpu->stat.pf_fixed++;
        else if (r == RET_PF_EMULATE)
        if (r != RET_PF_EMULATE)
                return 1;
  
-       /*
-        * Before emulating the instruction, check if the error code
-        * was due to a RO violation while translating the guest page.
-        * This can occur when using nested virtualization with nested
-        * paging in both guests. If true, we simply unprotect the page
-        * and resume the guest.
-        */
-       if (vcpu->arch.mmu->root_role.direct &&
-           (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
-               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
-               return 1;
-       }
-       /*
-        * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
-        * optimistically try to just unprotect the page and let the processor
-        * re-execute the instruction that caused the page fault.  Do not allow
-        * retrying MMIO emulation, as it's not only pointless but could also
-        * cause us to enter an infinite loop because the processor will keep
-        * faulting on the non-existent MMIO address.  Retrying an instruction
-        * from a nested guest is also pointless and dangerous as we are only
-        * explicitly shadowing L1's page tables, i.e. unprotecting something
-        * for L1 isn't going to magically fix whatever issue cause L2 to fail.
-        */
-       if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
-               emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
  emulate:
        return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
                                       insn_len);
@@@ -6202,59 -6308,6 +6306,6 @@@ void kvm_configure_mmu(bool enable_tdp
  }
  EXPORT_SYMBOL_GPL(kvm_configure_mmu);
  
- /* The return value indicates if tlb flush on all vcpus is needed. */
- typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
-                                   struct kvm_rmap_head *rmap_head,
-                                   const struct kvm_memory_slot *slot);
- static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
-                                             const struct kvm_memory_slot *slot,
-                                             slot_rmaps_handler fn,
-                                             int start_level, int end_level,
-                                             gfn_t start_gfn, gfn_t end_gfn,
-                                             bool flush_on_yield, bool flush)
- {
-       struct slot_rmap_walk_iterator iterator;
-       lockdep_assert_held_write(&kvm->mmu_lock);
-       for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
-                       end_gfn, &iterator) {
-               if (iterator.rmap)
-                       flush |= fn(kvm, iterator.rmap, slot);
-               if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
-                       if (flush && flush_on_yield) {
-                               kvm_flush_remote_tlbs_range(kvm, start_gfn,
-                                                           iterator.gfn - start_gfn + 1);
-                               flush = false;
-                       }
-                       cond_resched_rwlock_write(&kvm->mmu_lock);
-               }
-       }
-       return flush;
- }
- static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
-                                           const struct kvm_memory_slot *slot,
-                                           slot_rmaps_handler fn,
-                                           int start_level, int end_level,
-                                           bool flush_on_yield)
- {
-       return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
-                                slot->base_gfn, slot->base_gfn + slot->npages - 1,
-                                flush_on_yield, false);
- }
- static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
-                                              const struct kvm_memory_slot *slot,
-                                              slot_rmaps_handler fn,
-                                              bool flush_on_yield)
- {
-       return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
- }
  static void free_mmu_pages(struct kvm_mmu *mmu)
  {
        if (!tdp_enabled && mmu->pae_root)
@@@ -6528,9 -6581,8 +6579,8 @@@ static bool kvm_rmap_zap_gfn_range(stru
                        if (WARN_ON_ONCE(start >= end))
                                continue;
  
-                       flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
-                                                 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
-                                                 start, end - 1, true, flush);
+                       flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start,
+                                                        end, true, flush);
                }
        }
  
@@@ -6818,7 -6870,7 +6868,7 @@@ static void kvm_shadow_mmu_try_split_hu
         */
        for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
                __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
-                                 level, level, start, end - 1, true, false);
+                                 level, level, start, end - 1, true, true, false);
  }
  
  /* Must be called with the mmu_lock held in write-mode. */
@@@ -6997,10 -7049,42 +7047,42 @@@ void kvm_arch_flush_shadow_all(struct k
        kvm_mmu_zap_all(kvm);
  }
  
+ /*
+  * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted.
+  *
+  * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
+  * case scenario we'll have unused shadow pages lying around until they
+  * are recycled due to age or when the VM is destroyed.
+  */
+ static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot)
+ {
+       struct kvm_gfn_range range = {
+               .slot = slot,
+               .start = slot->base_gfn,
+               .end = slot->base_gfn + slot->npages,
+               .may_block = true,
+       };
+       write_lock(&kvm->mmu_lock);
+       if (kvm_unmap_gfn_range(kvm, &range))
+               kvm_flush_remote_tlbs_memslot(kvm, slot);
+       write_unlock(&kvm->mmu_lock);
+ }
+ static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
+ {
+       return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
+              kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+ }
  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *slot)
  {
-       kvm_mmu_zap_all_fast(kvm);
+       if (kvm_memslot_flush_zap_all(kvm))
+               kvm_mmu_zap_all_fast(kvm);
+       else
+               kvm_mmu_zap_memslot_leafs(kvm, slot);
  }
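The quirk check above is the userspace opt-in point: disabling KVM_X86_QUIRK_SLOT_ZAP_ALL switches memslot deletion from a full zap to the targeted leaf zap added here. A sketch of how a VMM might do that through the existing KVM_ENABLE_CAP / KVM_CAP_DISABLE_QUIRKS2 interface (availability should first be probed with KVM_CHECK_EXTENSION on KVM_CAP_DISABLE_QUIRKS2; error handling omitted):

       #include <linux/kvm.h>
       #include <sys/ioctl.h>

       /* Opt out of zapping all SPTEs when a memslot is moved or deleted. */
       static int disable_slot_zap_all_quirk(int vm_fd)
       {
               struct kvm_enable_cap cap = {
                       .cap = KVM_CAP_DISABLE_QUIRKS2,
                       .args[0] = KVM_X86_QUIRK_SLOT_ZAP_ALL, /* quirk bit introduced by this series */
               };

               return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
       }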
  
  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
diff --combined arch/x86/kvm/vmx/vmx.c
index 733a0c45d1a6122a08e6f4590b4ba6ef1620ff21,c67e448c6ebd709fcd367e387723b7046e36800b..1a4438358c5e38ef09d5d306f026d887c14331cd
@@@ -525,10 -525,6 +525,6 @@@ static const struct kvm_vmx_segment_fie
        VMX_SEGMENT_FIELD(LDTR),
  };
  
- static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
- {
-       vmx->segment_cache.bitmask = 0;
- }
  
  static unsigned long host_idt_base;
  
@@@ -755,7 -751,7 +751,7 @@@ fault
        return -EIO;
  }
  
- static void vmx_emergency_disable(void)
+ void vmx_emergency_disable_virtualization_cpu(void)
  {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
@@@ -1998,15 -1994,15 +1994,15 @@@ static inline bool is_vmx_feature_contr
        return !(msr->data & ~valid_bits);
  }
  
- int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+ int vmx_get_feature_msr(u32 msr, u64 *data)
  {
-       switch (msr->index) {
+       switch (msr) {
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                if (!nested)
                        return 1;
-               return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+               return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
        default:
-               return KVM_MSR_RET_INVALID;
+               return KVM_MSR_RET_UNSUPPORTED;
        }
  }
  
@@@ -2605,13 -2601,13 +2601,13 @@@ static u64 adjust_vmx_controls64(u64 ct
  static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                             struct vmx_capability *vmx_cap)
  {
-       u32 vmx_msr_low, vmx_msr_high;
        u32 _pin_based_exec_control = 0;
        u32 _cpu_based_exec_control = 0;
        u32 _cpu_based_2nd_exec_control = 0;
        u64 _cpu_based_3rd_exec_control = 0;
        u32 _vmexit_control = 0;
        u32 _vmentry_control = 0;
+       u64 basic_msr;
        u64 misc_msr;
        int i;
  
                _vmexit_control &= ~x_ctrl;
        }
  
-       rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+       rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
  
        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
-       if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+       if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
                return -EIO;
  
  #ifdef CONFIG_X86_64
-       /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
-       if (vmx_msr_high & (1u<<16))
+       /*
+        * KVM expects to be able to shove all legal physical addresses into
+        * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+        * 0 for processors that support Intel 64 architecture".
+        */
+       if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
                return -EIO;
  #endif
  
        /* Require Write-Back (WB) memory type for VMCS accesses. */
-       if (((vmx_msr_high >> 18) & 15) != 6)
+       if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
                return -EIO;
  
        rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
  
-       vmcs_conf->size = vmx_msr_high & 0x1fff;
-       vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-       vmcs_conf->revision_id = vmx_msr_low;
+       vmcs_conf->basic = basic_msr;
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
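The open-coded masks on the split low/high halves are replaced by accessors over the raw 64-bit IA32_VMX_BASIC value now stored in vmcs_conf->basic. Per the SDM layout (revision ID in bits 30:0, VMCS region size in bits 44:32, memory type in bits 53:50), the helpers presumably reduce to something like this sketch (not the verbatim upstream definitions):

       /* Sketch of IA32_VMX_BASIC field extraction, following the SDM layout. */
       static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
       {
               return vmx_basic & GENMASK_ULL(30, 0);
       }

       static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
       {
               return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
       }

       static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic)
       {
               return (vmx_basic & GENMASK_ULL(53, 50)) >> 50;
       }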
@@@ -2844,7 -2840,7 +2840,7 @@@ fault
        return -EFAULT;
  }
  
- int vmx_hardware_enable(void)
+ int vmx_enable_virtualization_cpu(void)
  {
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@@ -2881,7 -2877,7 +2877,7 @@@ static void vmclear_local_loaded_vmcss(
                __loaded_vmcs_clear(v);
  }
  
- void vmx_hardware_disable(void)
+ void vmx_disable_virtualization_cpu(void)
  {
        vmclear_local_loaded_vmcss();
  
@@@ -2903,13 -2899,13 +2899,13 @@@ struct vmcs *alloc_vmcs_cpu(bool shadow
        if (!pages)
                return NULL;
        vmcs = page_address(pages);
-       memset(vmcs, 0, vmcs_config.size);
+       memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
  
        /* KVM supports Enlightened VMCS v1 only */
        if (kvm_is_using_evmcs())
                vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
        else
-               vmcs->hdr.revision_id = vmcs_config.revision_id;
+               vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
  
        if (shadow)
                vmcs->hdr.shadow_vmcs = 1;
@@@ -3002,7 -2998,7 +2998,7 @@@ static __init int alloc_kvm_area(void
                 * physical CPU.
                 */
                if (kvm_is_using_evmcs())
-                       vmcs->hdr.revision_id = vmcs_config.revision_id;
+                       vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
  
                per_cpu(vmxarea, cpu) = vmcs;
        }
@@@ -4219,6 -4215,13 +4215,13 @@@ static int vmx_deliver_nested_posted_in
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
+       /*
+        * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+        * and freed, and must not be accessed outside of vcpu->mutex.  The
+        * vCPU's cached PI NV is valid if and only if posted interrupts
+        * enabled in its vmcs12, i.e. checking the vector also checks that
+        * L1 has enabled posted interrupts for L2.
+        */
        if (is_guest_mode(vcpu) &&
            vector == vmx->nested.posted_intr_nv) {
                /*
@@@ -5804,8 -5807,9 +5807,9 @@@ static int handle_ept_violation(struct 
        error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
                      ? PFERR_PRESENT_MASK : 0;
  
-       error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
-              PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+       if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
+               error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+                             PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
  
        /*
         * Check that the GPA doesn't exceed physical memory limits, as that is
@@@ -7265,6 -7269,8 +7269,8 @@@ static fastpath_t vmx_exit_handlers_fas
                return handle_fastpath_set_msr_irqoff(vcpu);
        case EXIT_REASON_PREEMPTION_TIMER:
                return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+       case EXIT_REASON_HLT:
+               return handle_fastpath_hlt(vcpu);
        default:
                return EXIT_FASTPATH_NONE;
        }
@@@ -7659,11 -7665,13 +7665,11 @@@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcp
  
        /*
         * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
 -       * device attached and the CPU doesn't support self-snoop.  Letting the
 -       * guest control memory types on Intel CPUs without self-snoop may
 -       * result in unexpected behavior, and so KVM's (historical) ABI is to
 -       * trust the guest to behave only as a last resort.
 +       * device attached.  Letting the guest control memory types on Intel
 +       * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
 +       * the guest to behave only as a last resort.
         */
 -      if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
 -          !kvm_arch_has_noncoherent_dma(vcpu->kvm))
 +      if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
                return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
  
        return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
@@@ -7965,6 -7973,7 +7971,7 @@@ static __init void vmx_set_cpu_caps(voi
                kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
                kvm_cpu_cap_clear(X86_FEATURE_SGX1);
                kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
        }
  
        if (vmx_umip_emulated())
@@@ -8515,7 -8524,7 +8522,7 @@@ __init int vmx_hardware_setup(void
                u64 use_timer_freq = 5000ULL * 1000 * 1000;
  
                cpu_preemption_timer_multi =
-                       vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+                       vmx_misc_preemption_timer_rate(vmcs_config.misc);
  
                if (tsc_khz)
                        use_timer_freq = (u64)tsc_khz * 1000;
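The preemption timer rate lives in bits 4:0 of IA32_VMX_MISC (the field previously masked out with VMX_MISC_PREEMPTION_TIMER_RATE_MASK), so the new accessor is presumably a thin wrapper along these lines (a sketch, not the verbatim upstream helper):

       /* IA32_VMX_MISC bits 4:0: the timer counts down once every 2^N TSC cycles. */
       static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
       {
               return vmx_misc & GENMASK_ULL(4, 0);
       }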
@@@ -8582,8 -8591,6 +8589,6 @@@ static void __vmx_exit(void
  {
        allow_smaller_maxphyaddr = false;
  
-       cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
        vmx_cleanup_l1d_flush();
  }
  
@@@ -8630,8 -8637,6 +8635,6 @@@ static int __init vmx_init(void
                pi_init_cpu(cpu);
        }
  
-       cpu_emergency_register_virt_callback(vmx_emergency_disable);
        vmx_check_vmcs12_offsets();
  
        /*
diff --combined arch/x86/mm/pat/memtype.c
index f73b5ce270b3c7185bd6c31e1d449c742789a729,6c4e29457c1071dcfbacd4135a159dcea08a6ec4..feb8cc6a12bf23a53ad9d248e940218c876745dd
@@@ -104,7 -104,7 +104,7 @@@ __setup("debugpat", pat_debug_setup)
  
  #ifdef CONFIG_X86_PAT
  /*
 - * X86 PAT uses page flags arch_1 and uncached together to keep track of
 + * X86 PAT uses page flags arch_1 and arch_2 together to keep track of
   * memory type of pages that have backing page struct.
   *
   * X86 PAT supports 4 different memory types:
  
  #define _PGMT_WB              0
  #define _PGMT_WC              (1UL << PG_arch_1)
 -#define _PGMT_UC_MINUS                (1UL << PG_uncached)
 -#define _PGMT_WT              (1UL << PG_uncached | 1UL << PG_arch_1)
 -#define _PGMT_MASK            (1UL << PG_uncached | 1UL << PG_arch_1)
 +#define _PGMT_UC_MINUS                (1UL << PG_arch_2)
 +#define _PGMT_WT              (1UL << PG_arch_2 | 1UL << PG_arch_1)
 +#define _PGMT_MASK            (1UL << PG_arch_2 | 1UL << PG_arch_1)
  #define _PGMT_CLEAR_MASK      (~_PGMT_MASK)
  
  static inline enum page_cache_mode get_page_memtype(struct page *pg)
@@@ -176,15 -176,6 +176,6 @@@ static inline void set_page_memtype(str
  }
  #endif
  
- enum {
-       PAT_UC = 0,             /* uncached */
-       PAT_WC = 1,             /* Write combining */
-       PAT_WT = 4,             /* Write Through */
-       PAT_WP = 5,             /* Write Protected */
-       PAT_WB = 6,             /* Write Back (default) */
-       PAT_UC_MINUS = 7,       /* UC, but can be overridden by MTRR */
- };
  #define CM(c) (_PAGE_CACHE_MODE_ ## c)
  
  static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
        char *cache_mode;
  
        switch (pat_val) {
-       case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
-       case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
-       case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
-       case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
-       case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
-       case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
-       default:           cache = CM(WB);       cache_mode = "WB  "; break;
+       case X86_MEMTYPE_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
+       case X86_MEMTYPE_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
+       case X86_MEMTYPE_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
+       case X86_MEMTYPE_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
+       case X86_MEMTYPE_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
+       case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+       default:                   cache = CM(WB);       cache_mode = "WB  "; break;
        }
  
        memcpy(msg, cache_mode, 4);
@@@ -257,12 -248,6 +248,6 @@@ void pat_cpu_init(void
  void __init pat_bp_init(void)
  {
        struct cpuinfo_x86 *c = &boot_cpu_data;
- #define PAT(p0, p1, p2, p3, p4, p5, p6, p7)                   \
-       (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) |           \
-       ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) |     \
-       ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) |     \
-       ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56))
  
        if (!IS_ENABLED(CONFIG_X86_PAT))
                pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
                 * NOTE: When WC or WP is used, it is redirected to UC- per
                 * the default setup in __cachemode2pte_tbl[].
                 */
-               pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
+               pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
        }
  
        /*
                 * NOTE: When WT or WP is used, it is redirected to UC- per
                 * the default setup in __cachemode2pte_tbl[].
                 */
-               pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
+               pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
        } else {
                /*
                 * Full PAT support.  We put WT in slot 7 to improve
                 * The reserved slots are unused, but mapped to their
                 * corresponding types in the presence of PAT errata.
                 */
-               pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
+               pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
        }
  
        memory_caching_control |= CACHE_PAT;
  
        init_cache_modes(pat_msr_val);
- #undef PAT
  }
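The file-local PAT() macro is replaced by a shared PAT_VALUE() built on the common X86_MEMTYPE_* constants. Given the removed macro above, PAT_VALUE() presumably packs the eight entries into the 64-bit IA32_PAT value the same way, just with the new names (sketch under that assumption):

       /* Sketch: pack eight memory types, 8 bits each, into the IA32_PAT MSR value. */
       #define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7)                         \
               (((u64)X86_MEMTYPE_ ## p0)       | ((u64)X86_MEMTYPE_ ## p1 << 8)  | \
                ((u64)X86_MEMTYPE_ ## p2 << 16) | ((u64)X86_MEMTYPE_ ## p3 << 24) | \
                ((u64)X86_MEMTYPE_ ## p4 << 32) | ((u64)X86_MEMTYPE_ ## p5 << 40) | \
                ((u64)X86_MEMTYPE_ ## p6 << 48) | ((u64)X86_MEMTYPE_ ## p7 << 56))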
  
  static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
@@@ -951,20 -935,23 +935,20 @@@ static void free_pfn_range(u64 paddr, u
  static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
                resource_size_t *phys)
  {
 -      pte_t *ptep, pte;
 -      spinlock_t *ptl;
 +      struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
  
 -      if (follow_pte(vma, vma->vm_start, &ptep, &ptl))
 +      if (follow_pfnmap_start(&args))
                return -EINVAL;
  
 -      pte = ptep_get(ptep);
 -
        /* Never return PFNs of anon folios in COW mappings. */
 -      if (vm_normal_folio(vma, vma->vm_start, pte)) {
 -              pte_unmap_unlock(ptep, ptl);
 +      if (!args.special) {
 +              follow_pfnmap_end(&args);
                return -EINVAL;
        }
  
 -      *prot = pgprot_val(pte_pgprot(pte));
 -      *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
 -      pte_unmap_unlock(ptep, ptl);
 +      *prot = pgprot_val(args.pgprot);
 +      *phys = (resource_size_t)args.pfn << PAGE_SHIFT;
 +      follow_pfnmap_end(&args);
        return 0;
  }
  
diff --combined virt/kvm/kvm_main.c
index 4f81366f8b619b42f2277207859cc4ed7fe6e29b,8f04e628dd86256829cb70d111d1a6425a876d5e..05cbb2548d999bd5acdb329381cdfaa21409dd90
@@@ -136,8 -136,8 +136,8 @@@ static int kvm_no_compat_open(struct in
  #define KVM_COMPAT(c) .compat_ioctl   = kvm_no_compat_ioctl,  \
                        .open           = kvm_no_compat_open
  #endif
- static int hardware_enable_all(void);
- static void hardware_disable_all(void);
+ static int kvm_enable_virtualization(void);
+ static void kvm_disable_virtualization(void);
  
  static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
  
@@@ -1220,7 -1220,7 +1220,7 @@@ static struct kvm *kvm_create_vm(unsign
        if (r)
                goto out_err_no_arch_destroy_vm;
  
-       r = hardware_enable_all();
+       r = kvm_enable_virtualization();
        if (r)
                goto out_err_no_disable;
  
@@@ -1263,7 -1263,7 +1263,7 @@@ out_no_coalesced_mmio
                mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
  #endif
  out_err_no_mmu_notifier:
-       hardware_disable_all();
+       kvm_disable_virtualization();
  out_err_no_disable:
        kvm_arch_destroy_vm(kvm);
  out_err_no_arch_destroy_vm:
@@@ -1360,7 -1360,7 +1360,7 @@@ static void kvm_destroy_vm(struct kvm *
  #endif
        kvm_arch_free_vm(kvm);
        preempt_notifier_dec();
-       hardware_disable_all();
+       kvm_disable_virtualization();
        mmdrop(mm);
  }
  
@@@ -2860,11 -2860,13 +2860,11 @@@ static int hva_to_pfn_remapped(struct v
                               unsigned long addr, bool write_fault,
                               bool *writable, kvm_pfn_t *p_pfn)
  {
 +      struct follow_pfnmap_args args = { .vma = vma, .address = addr };
        kvm_pfn_t pfn;
 -      pte_t *ptep;
 -      pte_t pte;
 -      spinlock_t *ptl;
        int r;
  
 -      r = follow_pte(vma, addr, &ptep, &ptl);
 +      r = follow_pfnmap_start(&args);
        if (r) {
                /*
                 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
                if (r)
                        return r;
  
 -              r = follow_pte(vma, addr, &ptep, &ptl);
 +              r = follow_pfnmap_start(&args);
                if (r)
                        return r;
        }
  
 -      pte = ptep_get(ptep);
 -
 -      if (write_fault && !pte_write(pte)) {
 +      if (write_fault && !args.writable) {
                pfn = KVM_PFN_ERR_RO_FAULT;
                goto out;
        }
  
        if (writable)
 -              *writable = pte_write(pte);
 -      pfn = pte_pfn(pte);
 +              *writable = args.writable;
 +      pfn = args.pfn;
  
        /*
         * Get a reference here because callers of *hva_to_pfn* and
         */
        if (!kvm_try_get_pfn(pfn))
                r = -EFAULT;
 -
  out:
 -      pte_unmap_unlock(ptep, ptl);
 +      follow_pfnmap_end(&args);
        *p_pfn = pfn;
  
        return r;
@@@ -3270,6 -3275,9 +3270,9 @@@ static int __kvm_read_guest_page(struc
        int r;
        unsigned long addr;
  
+       if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+               return -EFAULT;
        addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
@@@ -3343,6 -3351,9 +3346,9 @@@ static int __kvm_read_guest_atomic(stru
        int r;
        unsigned long addr;
  
+       if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+               return -EFAULT;
        addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
@@@ -3373,6 -3384,9 +3379,9 @@@ static int __kvm_write_guest_page(struc
        int r;
        unsigned long addr;
  
+       if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+               return -EFAULT;
        addr = gfn_to_hva_memslot(memslot, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
@@@ -3576,7 -3590,7 +3585,7 @@@ int kvm_clear_guest(struct kvm *kvm, gp
        int ret;
  
        while ((seg = next_segment(len, offset)) != 0) {
-               ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
+               ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
                if (ret < 0)
                        return ret;
                offset = 0;
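The kvm_clear_guest() change is a correctness fix: each loop iteration may only clear the chunk that fits in the current guest page, i.e. seg, not the full remaining len (which, combined with the new page-bounds checks above, would fail with -EFAULT on any request crossing a page). The per-page chunk comes from next_segment(), whose long-standing definition in kvm_main.c is essentially:

       /* Bytes of the request that fit in the current page. */
       static int next_segment(unsigned long len, int offset)
       {
               if (len > PAGE_SIZE - offset)
                       return PAGE_SIZE - offset;
               else
                       return len;
       }

For example, clearing 0x1800 bytes starting at page offset 0xc00 is split into writes of 0x400, 0x1000, and 0x400 bytes against three consecutive gfns.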
@@@ -5566,137 -5580,67 +5575,67 @@@ static struct miscdevice kvm_dev = 
  };
  
  #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+ static bool enable_virt_at_load = true;
+ module_param(enable_virt_at_load, bool, 0444);
  __visible bool kvm_rebooting;
  EXPORT_SYMBOL_GPL(kvm_rebooting);
  
- static DEFINE_PER_CPU(bool, hardware_enabled);
+ static DEFINE_PER_CPU(bool, virtualization_enabled);
+ static DEFINE_MUTEX(kvm_usage_lock);
  static int kvm_usage_count;
  
- static int __hardware_enable_nolock(void)
+ __weak void kvm_arch_enable_virtualization(void)
+ {
+ }
+ __weak void kvm_arch_disable_virtualization(void)
+ {
+ }
+ static int kvm_enable_virtualization_cpu(void)
  {
-       if (__this_cpu_read(hardware_enabled))
+       if (__this_cpu_read(virtualization_enabled))
                return 0;
  
-       if (kvm_arch_hardware_enable()) {
+       if (kvm_arch_enable_virtualization_cpu()) {
                pr_info("kvm: enabling virtualization on CPU%d failed\n",
                        raw_smp_processor_id());
                return -EIO;
        }
  
-       __this_cpu_write(hardware_enabled, true);
+       __this_cpu_write(virtualization_enabled, true);
        return 0;
  }
  
- static void hardware_enable_nolock(void *failed)
- {
-       if (__hardware_enable_nolock())
-               atomic_inc(failed);
- }
  static int kvm_online_cpu(unsigned int cpu)
  {
-       int ret = 0;
        /*
         * Abort the CPU online process if hardware virtualization cannot
         * be enabled. Otherwise running VMs would encounter unrecoverable
         * errors when scheduled to this CPU.
         */
-       mutex_lock(&kvm_lock);
-       if (kvm_usage_count)
-               ret = __hardware_enable_nolock();
-       mutex_unlock(&kvm_lock);
-       return ret;
+       return kvm_enable_virtualization_cpu();
  }
  
- static void hardware_disable_nolock(void *junk)
+ static void kvm_disable_virtualization_cpu(void *ign)
  {
-       /*
-        * Note, hardware_disable_all_nolock() tells all online CPUs to disable
-        * hardware, not just CPUs that successfully enabled hardware!
-        */
-       if (!__this_cpu_read(hardware_enabled))
+       if (!__this_cpu_read(virtualization_enabled))
                return;
  
-       kvm_arch_hardware_disable();
+       kvm_arch_disable_virtualization_cpu();
  
-       __this_cpu_write(hardware_enabled, false);
+       __this_cpu_write(virtualization_enabled, false);
  }
  
  static int kvm_offline_cpu(unsigned int cpu)
  {
-       mutex_lock(&kvm_lock);
-       if (kvm_usage_count)
-               hardware_disable_nolock(NULL);
-       mutex_unlock(&kvm_lock);
+       kvm_disable_virtualization_cpu(NULL);
        return 0;
  }
  
- static void hardware_disable_all_nolock(void)
- {
-       BUG_ON(!kvm_usage_count);
-       kvm_usage_count--;
-       if (!kvm_usage_count)
-               on_each_cpu(hardware_disable_nolock, NULL, 1);
- }
- static void hardware_disable_all(void)
- {
-       cpus_read_lock();
-       mutex_lock(&kvm_lock);
-       hardware_disable_all_nolock();
-       mutex_unlock(&kvm_lock);
-       cpus_read_unlock();
- }
- static int hardware_enable_all(void)
- {
-       atomic_t failed = ATOMIC_INIT(0);
-       int r;
-       /*
-        * Do not enable hardware virtualization if the system is going down.
-        * If userspace initiated a forced reboot, e.g. reboot -f, then it's
-        * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
-        * after kvm_reboot() is called.  Note, this relies on system_state
-        * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
-        * hook instead of registering a dedicated reboot notifier (the latter
-        * runs before system_state is updated).
-        */
-       if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
-           system_state == SYSTEM_RESTART)
-               return -EBUSY;
-       /*
-        * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
-        * is called, and so on_each_cpu() between them includes the CPU that
-        * is being onlined.  As a result, hardware_enable_nolock() may get
-        * invoked before kvm_online_cpu(), which also enables hardware if the
-        * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
-        * enable hardware multiple times.
-        */
-       cpus_read_lock();
-       mutex_lock(&kvm_lock);
-       r = 0;
-       kvm_usage_count++;
-       if (kvm_usage_count == 1) {
-               on_each_cpu(hardware_enable_nolock, &failed, 1);
-               if (atomic_read(&failed)) {
-                       hardware_disable_all_nolock();
-                       r = -EBUSY;
-               }
-       }
-       mutex_unlock(&kvm_lock);
-       cpus_read_unlock();
-       return r;
- }
  static void kvm_shutdown(void)
  {
        /*
         */
        pr_info("kvm: exiting hardware virtualization\n");
        kvm_rebooting = true;
-       on_each_cpu(hardware_disable_nolock, NULL, 1);
+       on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
  }
  
  static int kvm_suspend(void)
  {
        /*
         * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
-        * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
-        * is stable.  Assert that kvm_lock is not held to ensure the system
-        * isn't suspended while KVM is enabling hardware.  Hardware enabling
-        * can be preempted, but the task cannot be frozen until it has dropped
-        * all locks (userspace tasks are frozen via a fake signal).
+        * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
+        * count is stable.  Assert that kvm_usage_lock is not held to ensure
+        * the system isn't suspended while KVM is enabling hardware.  Hardware
+        * enabling can be preempted, but the task cannot be frozen until it has
+        * dropped all locks (userspace tasks are frozen via a fake signal).
         */
-       lockdep_assert_not_held(&kvm_lock);
+       lockdep_assert_not_held(&kvm_usage_lock);
        lockdep_assert_irqs_disabled();
  
-       if (kvm_usage_count)
-               hardware_disable_nolock(NULL);
+       kvm_disable_virtualization_cpu(NULL);
        return 0;
  }
  
  static void kvm_resume(void)
  {
-       lockdep_assert_not_held(&kvm_lock);
+       lockdep_assert_not_held(&kvm_usage_lock);
        lockdep_assert_irqs_disabled();
  
-       if (kvm_usage_count)
-               WARN_ON_ONCE(__hardware_enable_nolock());
+       WARN_ON_ONCE(kvm_enable_virtualization_cpu());
  }
  
  static struct syscore_ops kvm_syscore_ops = {
        .resume = kvm_resume,
        .shutdown = kvm_shutdown,
  };
+ static int kvm_enable_virtualization(void)
+ {
+       int r;
+       guard(mutex)(&kvm_usage_lock);
+       if (kvm_usage_count++)
+               return 0;
+       kvm_arch_enable_virtualization();
+       r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+                             kvm_online_cpu, kvm_offline_cpu);
+       if (r)
+               goto err_cpuhp;
+       register_syscore_ops(&kvm_syscore_ops);
+       /*
+        * Undo virtualization enabling and bail if the system is going down.
+        * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+        * possible for an in-flight operation to enable virtualization after
+        * syscore_shutdown() is called, i.e. without kvm_shutdown() being
+        * invoked.  Note, this relies on system_state being set _before_
+        * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
+        * or this CPU observes the impending shutdown.  Which is why KVM uses
+        * a syscore ops hook instead of registering a dedicated reboot
+        * notifier (the latter runs before system_state is updated).
+        */
+       if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+           system_state == SYSTEM_RESTART) {
+               r = -EBUSY;
+               goto err_rebooting;
+       }
+       return 0;
+ err_rebooting:
+       unregister_syscore_ops(&kvm_syscore_ops);
+       cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+ err_cpuhp:
+       kvm_arch_disable_virtualization();
+       --kvm_usage_count;
+       return r;
+ }
+ static void kvm_disable_virtualization(void)
+ {
+       guard(mutex)(&kvm_usage_lock);
+       if (--kvm_usage_count)
+               return;
+       unregister_syscore_ops(&kvm_syscore_ops);
+       cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+       kvm_arch_disable_virtualization();
+ }
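Both helpers rely on guard(mutex) from <linux/cleanup.h>, which drops kvm_usage_lock automatically when the scope exits, including on the early returns. For readers unfamiliar with the cleanup helpers, kvm_disable_virtualization() is roughly equivalent to this open-coded form (hypothetical name, shown only to illustrate the lock lifetime):

       static void kvm_disable_virtualization_open_coded(void)
       {
               mutex_lock(&kvm_usage_lock);
               if (--kvm_usage_count) {
                       mutex_unlock(&kvm_usage_lock);
                       return;
               }
               unregister_syscore_ops(&kvm_syscore_ops);
               cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
               kvm_arch_disable_virtualization();
               mutex_unlock(&kvm_usage_lock);
       }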
+ static int kvm_init_virtualization(void)
+ {
+       if (enable_virt_at_load)
+               return kvm_enable_virtualization();
+       return 0;
+ }
+ static void kvm_uninit_virtualization(void)
+ {
+       if (enable_virt_at_load)
+               kvm_disable_virtualization();
+ }
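The new enable_virt_at_load parameter (mode 0444, so settable only at load time) defaults to enabling virtualization, and registering the cpuhp/syscore hooks, as soon as the module loads; booting with kvm.enable_virt_at_load=0 on the kernel command line (or loading with modprobe kvm enable_virt_at_load=0) instead defers that work to kvm_enable_virtualization() on first VM creation and tears it back down via kvm_disable_virtualization() when the last VM is destroyed.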
  #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
- static int hardware_enable_all(void)
+ static int kvm_enable_virtualization(void)
+ {
+       return 0;
+ }
+ static int kvm_init_virtualization(void)
  {
        return 0;
  }
  
- static void hardware_disable_all(void)
+ static void kvm_disable_virtualization(void)
+ {
+ }
+ static void kvm_uninit_virtualization(void)
  {
  
  }
@@@ -6186,6 -6210,7 +6205,6 @@@ static const struct file_operations sta
        .release = kvm_debugfs_release,
        .read = simple_attr_read,
        .write = simple_attr_write,
 -      .llseek = no_llseek,
  };
  
  static int vm_stat_get(void *_offset, u64 *val)
@@@ -6454,15 -6479,6 +6473,6 @@@ int kvm_init(unsigned vcpu_size, unsign
        int r;
        int cpu;
  
- #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-       r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
-                                     kvm_online_cpu, kvm_offline_cpu);
-       if (r)
-               return r;
-       register_syscore_ops(&kvm_syscore_ops);
- #endif
        /* A kmem cache lets us meet the alignment requirements of fx_save. */
        if (!vcpu_align)
                vcpu_align = __alignof__(struct kvm_vcpu);
                                           offsetofend(struct kvm_vcpu, stats_id)
                                           - offsetof(struct kvm_vcpu, arch),
                                           NULL);
-       if (!kvm_vcpu_cache) {
-               r = -ENOMEM;
-               goto err_vcpu_cache;
-       }
+       if (!kvm_vcpu_cache)
+               return -ENOMEM;
  
        for_each_possible_cpu(cpu) {
                if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
  
        kvm_gmem_init(module);
  
+       r = kvm_init_virtualization();
+       if (r)
+               goto err_virt;
        /*
         * Registration _must_ be the very last thing done, as this exposes
         * /dev/kvm to userspace, i.e. all infrastructure must be setup!
        return 0;
  
  err_register:
+       kvm_uninit_virtualization();
+ err_virt:
        kvm_vfio_ops_exit();
  err_vfio:
        kvm_async_pf_deinit();
@@@ -6533,11 -6553,6 +6547,6 @@@ err_cpu_kick_mask
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
        kmem_cache_destroy(kvm_vcpu_cache);
- err_vcpu_cache:
- #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-       unregister_syscore_ops(&kvm_syscore_ops);
-       cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
- #endif
        return r;
  }
  EXPORT_SYMBOL_GPL(kvm_init);
@@@ -6553,16 -6568,14 +6562,14 @@@ void kvm_exit(void
         */
        misc_deregister(&kvm_dev);
  
+       kvm_uninit_virtualization();
        debugfs_remove_recursive(kvm_debugfs_dir);
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
        kmem_cache_destroy(kvm_vcpu_cache);
        kvm_vfio_ops_exit();
        kvm_async_pf_deinit();
- #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-       unregister_syscore_ops(&kvm_syscore_ops);
-       cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
- #endif
        kvm_irqfd_exit();
  }
  EXPORT_SYMBOL_GPL(kvm_exit);